Commit 54e07db

Anaphora resolution, bleu score

Stephen Bly committed Apr 16, 2013
1 parent 535a7f5 commit 54e07db
Showing 8 changed files with 819 additions and 69 deletions.
171 changes: 171 additions & 0 deletions Question_Answer_Dataset_v1.1/S08/data/set1/a8.osent

Large diffs are not rendered by default.

171 changes: 171 additions & 0 deletions Question_Answer_Dataset_v1.1/S08/data/set1/a8.parse

Large diffs are not rendered by default.

171 changes: 171 additions & 0 deletions Question_Answer_Dataset_v1.1/S08/data/set1/a8.sst

Large diffs are not rendered by default.

171 changes: 171 additions & 0 deletions Question_Answer_Dataset_v1.1/S08/data/set1/a8.tagged

Large diffs are not rendered by default.

Empty file.
Empty file.
77 changes: 39 additions & 38 deletions answer
@@ -26,32 +26,35 @@ def contains_negative(sent):
return "no" in sent or "not" in sent or "n't" in sent

# the set of pronouns, used for anaphora resolution
pronouns = set(["he", "she", "it", "him", "her", "his","they","their","we",
"our","i","you","your","my","mine","yours","ours"])
pronouns = set(["he", "she", "it", "its", "it's", "him", "her", "his","they",
"their","we", "our","i","you","your","my","mine","yours","ours"])

resolved_articles = {}

# Runs coreference resolution on the article using arkref.
# This still needs to be implemented.
def coref(path_to_article):
if path_to_article in resolved_articles:
return resolved_articles[path_to_article]

subprocess.call(["./arkref.sh", "-input", path_to_article])
print open(path_to_article).read()
tagged_article = open(path_to_article.replace("txt", "tagged")).read()
tagged_article = "<root>"+tagged_article+"</root>" # trick arkref into doing entire doc
#print tagged_article
soup = bs4.BeautifulSoup(tagged_article, "html.parser").root
for entity in soup.find_all(True):
if entity.string != None and entity.string.strip().lower() in pronouns:
antecedent_id = entity["entityid"].split("_")[0]
antecedent = soup.find(mentionid=antecedent_id)
string = re.sub('<.*?>',' ',str(antecedent))
tok = nltk.word_tokenize(string)
ants = [(x,y) for x,y in nltk.pos_tag(tok) if y in {'NNP','NN'}]
entity.string.replace_with(' '.join(map(lambda (x,y):x,ants)))
#print 'entity is: '+entity.string
#entity.unwrap()
string2 = re.sub('<.*?>',' ',str(soup))
print string2
antecedent = str(antecedent).split(">", 1)[1].split("<", 1)[0]
#string = re.sub('<.*?>',' ',str(antecedent))
#tok = nltk.word_tokenize(string)
#ants = [(x,y) for x,y in nltk.pos_tag(tok) if y in {'NNP','NN'}]
entity.string.replace_with(antecedent)
#print 'entity is: '+entity.string
resolved = re.sub("<.*?>", "", str(soup))
resolved_articles[path_to_article] = resolved

return open(path_to_article).read()
return resolved
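
For context, here is a minimal, self-contained sketch of the substitution step above, run on a hand-made arkref-style fragment. The <mention> markup, the ids, and the "first component of entityid is the antecedent's mentionid" convention are assumptions for illustration, mirroring what coref() expects rather than arkref's documented output:

import re
import bs4

pronouns = set(["he", "she", "it"])  # trimmed set, just for the example
sample = ('<root><mention mentionid="1" entityid="1_1">John</mention> said '
          '<mention mentionid="2" entityid="1_2">he</mention> was tired.</root>')
soup = bs4.BeautifulSoup(sample, "html.parser").root
for entity in soup.find_all(True):
    if entity.string != None and entity.string.strip().lower() in pronouns:
        antecedent_id = entity["entityid"].split("_")[0]
        antecedent = soup.find(mentionid=antecedent_id)
        text = str(antecedent).split(">", 1)[1].split("<", 1)[0]
        entity.string.replace_with(text)
print re.sub("<.*?>", "", str(soup))  # -> John said John was tired.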

# Answers a question from the information in article.
# Ranks all the sentences and then returns the top choice.
@@ -65,35 +68,33 @@ def answer(question, article):
if contains_negative(top): return "No"
else: return "Yes"
else:
return top
return top[0]

# The main script
if __name__ == '__main__':
article_name = sys.argv[1]

# for year in ("S08", "S09", "S10"):
# print "Year:", year
# prefix = "Question_Answer_Dataset_v1.1/"+year+"/"
# question_answer_pairs = open(prefix+"question_answer_pairs.txt").readlines()
# question_answer_pairs.pop(0)
# for line in question_answer_pairs:
# if not line.startswith(article_name): continue
# line = line.lstrip(article_name)
# end = line.find("?")
# if end == -1: continue
# question = line[:end+1].strip()
# line = line[end+1:].split()
# path_to_article = prefix+line.pop()+".txt"
# difficulty_answerer = line.pop()
# difficulty_questioner = line.pop()
# correct_answer = " ".join(line)
for year in ("S08", "S09", "S10"):
print "Year:", year
prefix = "Question_Answer_Dataset_v1.1/"+year+"/"
question_answer_pairs = open(prefix+"question_answer_pairs.txt").readlines()
question_answer_pairs.pop(0)
for line in question_answer_pairs:
if not line.startswith(article_name): continue
line = line.lstrip(article_name)
end = line.find("?")
if end == -1: continue
question = line[:end+1].strip()
line = line[end+1:].split()
path_to_article = prefix+line.pop()+".txt"
difficulty_answerer = line.pop()
difficulty_questioner = line.pop()
correct_answer = " ".join(line)

#print "Question:", question
#print "Difficulty from answerer:", difficulty_answerer
#print "Difficulty from questioner:", difficulty_questioner
print "Question:", question
print "Difficulty from answerer:", difficulty_answerer
print "Difficulty from questioner:", difficulty_questioner

# Open the question file and start answering questions.
article = coref('corefTest.txt')
print article
#print "Our answer:", answer(question, article)
#print "Correct answer:", correct_answer
article = coref(path_to_article)
print "Our answer:", answer(question, article)
print "Correct answer:", correct_answer
127 changes: 96 additions & 31 deletions modules/sourceContentSelector.py
@@ -6,6 +6,8 @@
import sys, os, string, re
import nltk
from nltk.stem import PorterStemmer
import collections
import numpy as np
sys.path.append("modules")
import lemma

@@ -17,6 +19,8 @@
# we should probably change this to the WordNet lemmatizer, but this is ok for now
ps = PorterStemmer()

MAX_NGRAMS = 5

# Given a question, returns a list of keywords
def getKeywords(question):
tagged = nltk.tag.pos_tag(question)
@@ -37,18 +41,24 @@ def getScoredSentences(question, article):
scored_sent = []
sentences = nltk.tokenize.sent_tokenize(article)
for sent in sentences:
if sent.strip() == "": continue
sentence = nltk.tokenize.word_tokenize(sent)
sentence = map(ps.stem, sentence)
s = score(question, sentence)
scored_sent.append((sent, s))
return scored_sent
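
A hedged sketch of how a caller might consume these (sentence, score) pairs, consistent with the switch to top[0] in answer above (question and article are placeholders for a tokenized question and the resolved article text):

scored = getScoredSentences(question, article)
top = max(scored, key=lambda pair: pair[1])  # highest-scoring (sentence, score) pair
print top[0]                                 # the sentence text itself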

# Scores a sentence based on how well we think it answers the question
def score(question, sentence):
score = 0
score += ngramWeight(question, sentence)
sentence = map(ps.stem, sentence)
keywords = getKeywords(question)
question = map(ps.stem, question)
score += proximity(keywords, sentence)
question_ngrams = count_ngrams(question, MAX_NGRAMS, True)
sentence_ngrams = count_ngrams(sentence, MAX_NGRAMS, True)
precision, recall = bleu_score(question_ngrams, len(question), sentence_ngrams, len(sentence), 5)
f1 = (2*precision*recall)/(precision+recall)
score += 2*f1
return score
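
A hedged usage sketch of the new scoring path (sentences invented; note that if bleu_score takes one of its 0.0 early returns, the f1 division above would raise ZeroDivisionError):

question = nltk.tokenize.word_tokenize("When was the university founded?")
sentence = nltk.tokenize.word_tokenize("The university was founded in 1891.")
print score(question, sentence)  # proximity term plus 2 * BLEU f1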

# Finds the shortest window in the target sentence
@@ -59,39 +69,94 @@ def proximity(keywords, sentence):
for j in range(length+1-i):
words = set(sentence[j:i+j])
if keywords <= words:
return max(20-i, 0)
return 1 - i/length
return 0
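
A hedged call sketch (the hunk above hides proximity's outer loop over window sizes i, so only the calling convention is shown; note that with Python 2 integers, i/length truncates, so 1 - i/length is 1 for any window smaller than the whole sentence):

keywords = set(map(ps.stem, ["capital", "france"]))  # stem keywords the same way the sentence is stemmed
sentence = map(ps.stem, "the capital of france is paris".split())
print proximity(keywords, sentence)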

# From YC
def count_ngrams(tokens, n, all_smaller=False):
"""Counts the frequency of n-grams in the given list of tokens.
:param tokens: list of tokens to compute ngrams for.
:param n: number of grams to count.
:param all_smaller: set to True to include all n-grams from n=1 to n.
"""

counts = collections.Counter()
for k in xrange(1 if all_smaller else n, n+1):
for i in xrange(len(tokens)-k+1):
counts[tuple(tokens[i:i+k])] += 1

return counts
#end def
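
A quick, hedged demonstration of count_ngrams on a toy token list (Counter display order ignored):

print count_ngrams(["the", "cat", "sat"], 2, all_smaller=True)
# Counter({('the',): 1, ('cat',): 1, ('sat',): 1,
#          ('the', 'cat'): 1, ('cat', 'sat'): 1})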

def bleu_score(ref_ngrams, ref_len, pred_ngrams, pred_len, n):
"""Calculate the BLEU precision and recall from ngram counts.
:param ref_ngrams: reference sentence ngrams.
:param ref_len: reference sentence length.
:param pred_ngrams: predicted sentence ngrams.
:param pred_len: predicted sentence length.
:param n: the maximum number of ngrams to consider.
"""

if not ref_len or not pred_len: return 0.0, 0.0
if not len(ref_ngrams) or not len(pred_ngrams): return 0.0, 0.0

ngram_score = np.zeros(n, dtype=np.float32) + 0.1

# compute the ngram intersections
for ngram, c in ref_ngrams.iteritems():
if len(ngram) > n: continue

k = min(c, pred_ngrams[ngram])
ngram_score[len(ngram) - 1] += k
#end for

# compute the geometric mean of the ngrams precision/recall
precision = np.mean(np.log(ngram_score / len(pred_ngrams)))
recall = np.mean(np.log(ngram_score / len(ref_ngrams)))

# apply the brevity penalty
if pred_len <= ref_len: precision += 1.0 - (float(ref_len) / pred_len)
if ref_len <= pred_len: recall += 1.0 - (float(pred_len) / ref_len)

precision = np.exp(precision)
recall = np.exp(recall)

return precision, recall
#end def
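
And a hedged end-to-end check of the two helpers together; the exact numbers depend on the 0.1 smoothing constant, so only the call pattern is meant literally:

ref = "what is the capital of france".split()
pred = "the capital of france is paris".split()
p, r = bleu_score(count_ngrams(ref, MAX_NGRAMS, True), len(ref),
                  count_ngrams(pred, MAX_NGRAMS, True), len(pred), MAX_NGRAMS)
print "precision=%.3f, recall=%.3f" % (p, r)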


# Compare the overlap of two sentences using ngrams
# (up to trigrams). This is similar to the BLEU score.
def ngramWeight(question, sentence):
#stem and take set intersections for unigrams
uniQ = map(ps.stem, question)
uniS = sentence
unigram = set(uniQ).intersection(set(uniS))


#get all bigram overlaps, rolls around end of sentence
if len(uniQ) > 1 and len(uniS) > 1:
bigramQ = set([uniQ[i-1]+uniQ[i] for i,word in enumerate(uniQ)])
bigramS = set([uniS[i-1]+uniS[i] for i,word in enumerate(uniS)])
bigram = bigramQ.intersection(bigramS)
else:
bigram = {}

if len(uniQ) > 2 and len(uniS) > 2:
trigramQ = set([uniQ[i-2]+uniQ[i-1]+uniQ[i] for i,word in enumerate(uniQ)])
trigramS = set([uniS[i-2]+uniS[i-1]+uniS[i] for i,word in enumerate(uniS)])
trigram = trigramQ.intersection(trigramS)
else:
trigram = {}

lam1 = 0.2
lam2 = 0.3
lam3 = 0.5

return lam1*len(unigram) + lam2*len(bigram) + lam3*len(trigram)
#def ngramWeight(question, sentence):
# #stem and take set intersections for unigrams
# uniQ = map(ps.stem, question)
# uniS = sentence
# unigram = set(uniQ).intersection(set(uniS))
#
#
# #get all bigram overlaps, rolls around end of sentence
# if len(uniQ) > 1 and len(uniS) > 1:
# bigramQ = set([uniQ[i-1]+uniQ[i] for i,word in enumerate(uniQ)])
# bigramS = set([uniS[i-1]+uniS[i] for i,word in enumerate(uniS)])
# bigram = bigramQ.intersection(bigramS)
# else:
# bigram = {}
#
# if len(uniQ) > 2 and len(uniS) > 2:
# trigramQ = set([uniQ[i-2]+uniQ[i-1]+uniQ[i] for i,word in enumerate(uniQ)])
# trigramS = set([uniS[i-2]+uniS[i-1]+uniS[i] for i,word in enumerate(uniS)])
# trigram = trigramQ.intersection(trigramS)
# else:
# trigram = {}
#
#
# lam1 = 0.2
# lam2 = 0.3
# lam3 = 0.5
#
# return lam1*len(unigram) + lam2*len(bigram) + lam3*len(trigram)

# for testing
if __name__ == '__main__':
