Commit 54e07db

Anaphora resolution, bleu score

Stephen Bly committed Apr 16, 2013
1 parent 535a7f5 commit 54e07db
Showing 8 changed files with 819 additions and 69 deletions.
171 changes: 171 additions & 0 deletions Question_Answer_Dataset_v1.1/S08/data/set1/a8.osent

Large diffs are not rendered by default.

171 changes: 171 additions & 0 deletions Question_Answer_Dataset_v1.1/S08/data/set1/a8.parse

Large diffs are not rendered by default.

171 changes: 171 additions & 0 deletions Question_Answer_Dataset_v1.1/S08/data/set1/a8.sst

Large diffs are not rendered by default.

171 changes: 171 additions & 0 deletions Question_Answer_Dataset_v1.1/S08/data/set1/a8.tagged

Large diffs are not rendered by default.

Empty file.
Empty file.
77 changes: 39 additions & 38 deletions answer
@@ -26,32 +26,35 @@ def contains_negative(sent):
return "no" in sent or "not" in sent or "n't" in sent

# the set of pronouns, used for anaphora resolution
pronouns = set(["he", "she", "it", "him", "her", "his","they","their","we",
"our","i","you","your","my","mine","yours","ours"])
pronouns = set(["he", "she", "it", "its", "it's", "him", "her", "his","they",
"their","we", "our","i","you","your","my","mine","yours","ours"])

resolved_articles = {}

# Runs coreference resolution on the article using arkref.
# This still needs to be implemented.
def coref(path_to_article):
if path_to_article in resolved_articles:
return resolved_articles[path_to_article]

subprocess.call(["./arkref.sh", "-input", path_to_article])
print open(path_to_article).read()
tagged_article = open(path_to_article.replace("txt", "tagged")).read()
tagged_article = "<root>"+tagged_article+"</root>" # trick arkref into doing entire doc
#print tagged_article
soup = bs4.BeautifulSoup(tagged_article, "html.parser").root
for entity in soup.find_all(True):
if entity.string != None and entity.string.strip().lower() in pronouns:
antecedent_id = entity["entityid"].split("_")[0]
antecedent = soup.find(mentionid=antecedent_id)
string = re.sub('<.*?>',' ',str(antecedent))
tok = nltk.word_tokenize(string)
ants = [(x,y) for x,y in nltk.pos_tag(tok) if y in {'NNP','NN'}]
entity.string.replace_with(' '.join(map(lambda (x,y):x,ants)))
#print 'entity is: '+entity.string
#entity.unwrap()
string2 = re.sub('<.*?>',' ',str(soup))
print string2
antecedent = str(antecedent).split(">", 1)[1].split("<", 1)[0]
#string = re.sub('<.*?>',' ',str(antecedent))
#tok = nltk.word_tokenize(string)
#ants = [(x,y) for x,y in nltk.pos_tag(tok) if y in {'NNP','NN'}]
entity.string.replace_with(antecedent)
#print 'entity is: '+entity.string
resolved = re.sub("<.*?>", "", str(soup))
resolved_articles[path_to_article] = resolved

return open(path_to_article).read()
return resolved
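
For context, here is a minimal, self-contained sketch of the substitution step above, run on a hand-made arkref-style fragment. The <mention> markup, the ids, and the "first component of entityid is the antecedent's mentionid" convention are assumptions for illustration, mirroring what coref() expects rather than arkref's documented output:

import re
import bs4

pronouns = set(["he", "she", "it"])  # trimmed set, just for the example
sample = ('<root><mention mentionid="1" entityid="1_1">John</mention> said '
          '<mention mentionid="2" entityid="1_2">he</mention> was tired.</root>')
soup = bs4.BeautifulSoup(sample, "html.parser").root
for entity in soup.find_all(True):
    if entity.string != None and entity.string.strip().lower() in pronouns:
        antecedent_id = entity["entityid"].split("_")[0]
        antecedent = soup.find(mentionid=antecedent_id)
        text = str(antecedent).split(">", 1)[1].split("<", 1)[0]
        entity.string.replace_with(text)
print re.sub("<.*?>", "", str(soup))  # -> John said John was tired.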

# Answers a question from the information in article.
# Ranks all the sentences and then returns the top choice.
@@ -65,35 +68,33 @@ def answer(question, article):
if contains_negative(top): return "No"
else: return "Yes"
else:
return top
return top[0]

# The main script
if __name__ == '__main__':
article_name = sys.argv[1]

# for year in ("S08", "S09", "S10"):
# print "Year:", year
# prefix = "Question_Answer_Dataset_v1.1/"+year+"/"
# question_answer_pairs = open(prefix+"question_answer_pairs.txt").readlines()
# question_answer_pairs.pop(0)
# for line in question_answer_pairs:
# if not line.startswith(article_name): continue
# line = line.lstrip(article_name)
# end = line.find("?")
# if end == -1: continue
# question = line[:end+1].strip()
# line = line[end+1:].split()
# path_to_article = prefix+line.pop()+".txt"
# difficulty_answerer = line.pop()
# difficulty_questioner = line.pop()
# correct_answer = " ".join(line)
for year in ("S08", "S09", "S10"):
print "Year:", year
prefix = "Question_Answer_Dataset_v1.1/"+year+"/"
question_answer_pairs = open(prefix+"question_answer_pairs.txt").readlines()
question_answer_pairs.pop(0)
for line in question_answer_pairs:
if not line.startswith(article_name): continue
line = line.lstrip(article_name)
end = line.find("?")
if end == -1: continue
question = line[:end+1].strip()
line = line[end+1:].split()
path_to_article = prefix+line.pop()+".txt"
difficulty_answerer = line.pop()
difficulty_questioner = line.pop()
correct_answer = " ".join(line)

#print "Question:", question
#print "Difficulty from answerer:", difficulty_answerer
#print "Difficulty from questioner:", difficulty_questioner
print "Question:", question
print "Difficulty from answerer:", difficulty_answerer
print "Difficulty from questioner:", difficulty_questioner

# Open the question file and start answering questions.
article = coref('corefTest.txt')
print article
#print "Our answer:", answer(question, article)
#print "Correct answer:", correct_answer
article = coref(path_to_article)
print "Our answer:", answer(question, article)
print "Correct answer:", correct_answer
127 changes: 96 additions & 31 deletions modules/sourceContentSelector.py
@@ -6,6 +6,8 @@
import sys, os, string, re
import nltk
from nltk.stem import PorterStemmer
import collections
import numpy as np
sys.path.append("modules")
import lemma

@@ -17,6 +19,8 @@
# we should probably change this to the WordNet lemmatizer, but this is ok for now
ps = PorterStemmer()

MAX_NGRAMS = 5

# Given a question, returns a list of keywords
def getKeywords(question):
tagged = nltk.tag.pos_tag(question)
@@ -37,18 +41,24 @@ def getScoredSentences(question, article):
scored_sent = []
sentences = nltk.tokenize.sent_tokenize(article)
for sent in sentences:
if sent.strip() == "": continue
sentence = nltk.tokenize.word_tokenize(sent)
sentence = map(ps.stem, sentence)
s = score(question, sentence)
scored_sent.append((sent, s))
return scored_sent
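
A hedged sketch of how a caller might consume these (sentence, score) pairs, consistent with the switch to top[0] in answer above (question and article are placeholders for a tokenized question and the resolved article text):

scored = getScoredSentences(question, article)
top = max(scored, key=lambda pair: pair[1])  # highest-scoring (sentence, score) pair
print top[0]                                 # the sentence text itself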

# Scores a sentence based on how well we think it answers the question
def score(question, sentence):
score = 0
score += ngramWeight(question, sentence)
sentence = map(ps.stem, sentence)
keywords = getKeywords(question)
question = map(ps.stem, question)
score += proximity(keywords, sentence)
question_ngrams = count_ngrams(question, MAX_NGRAMS, True)
sentence_ngrams = count_ngrams(sentence, MAX_NGRAMS, True)
precision, recall = bleu_score(question_ngrams, len(question), sentence_ngrams, len(sentence), 5)
f1 = (2*precision*recall)/(precision+recall)
score += 2*f1
return score
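
A hedged usage sketch of the new scoring path (sentences invented; note that if bleu_score takes one of its 0.0 early returns, the f1 division above would raise ZeroDivisionError):

question = nltk.tokenize.word_tokenize("When was the university founded?")
sentence = nltk.tokenize.word_tokenize("The university was founded in 1891.")
print score(question, sentence)  # proximity term plus 2 * BLEU f1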

# Finds the shortest window in the target sentence
@@ -59,39 +69,94 @@ def proximity(keywords, sentence):
for j in range(length+1-i):
words = set(sentence[j:i+j])
if keywords <= words:
return max(20-i, 0)
return 1 - i/length
return 0
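
A hedged call sketch (the hunk above hides proximity's outer loop over window sizes i, so only the calling convention is shown; note that with Python 2 integers, i/length truncates, so 1 - i/length is 1 for any window smaller than the whole sentence):

keywords = set(map(ps.stem, ["capital", "france"]))  # stem keywords the same way the sentence is stemmed
sentence = map(ps.stem, "the capital of france is paris".split())
print proximity(keywords, sentence)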

# From YC
def count_ngrams(tokens, n, all_smaller=False):
"""Counts the frequency of n-grams in the given list of tokens.
:param tokens: list of tokens to compute ngrams for.
:param n: number of grams to count.
:param all_smaller: set to True to include all n-grams from n=1 to n.
"""

counts = collections.Counter()
for k in xrange(1 if all_smaller else n, n+1):
for i in xrange(len(tokens)-k+1):
counts[tuple(tokens[i:i+k])] += 1

return counts
#end def
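
A quick, hedged demonstration of count_ngrams on a toy token list (Counter display order ignored):

print count_ngrams(["the", "cat", "sat"], 2, all_smaller=True)
# Counter({('the',): 1, ('cat',): 1, ('sat',): 1,
#          ('the', 'cat'): 1, ('cat', 'sat'): 1})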

def bleu_score(ref_ngrams, ref_len, pred_ngrams, pred_len, n):
"""Calculate the BLEU precision and recall from ngram counts.
:param ref_ngrams: reference sentence ngrams.
:param ref_len: reference sentence length.
:param pred_ngrams: predicted sentence ngrams.
:param pred_len: predicted sentence length.
:param n: the maximum number of ngrams to consider.
"""

if not ref_len or not pred_len: return 0.0, 0.0
if not len(ref_ngrams) or not len(pred_ngrams): return 0.0, 0.0

ngram_score = np.zeros(n, dtype=np.float32) + 0.1

# compute the ngram intersections
for ngram, c in ref_ngrams.iteritems():
if len(ngram) > n: continue

k = min(c, pred_ngrams[ngram])
ngram_score[len(ngram) - 1] += k
#end for

# compute the geometric mean of the ngrams precision/recall
precision = np.mean(np.log(ngram_score / len(pred_ngrams)))
recall = np.mean(np.log(ngram_score / len(ref_ngrams)))

# apply the brevity penalty
if pred_len <= ref_len: precision += 1.0 - (float(ref_len) / pred_len)
if ref_len <= pred_len: recall += 1.0 - (float(pred_len) / ref_len)

precision = np.exp(precision)
recall = np.exp(recall)

return precision, recall
#end def
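
And a hedged end-to-end check of the two helpers together; the exact numbers depend on the 0.1 smoothing constant, so only the call pattern is meant literally:

ref = "what is the capital of france".split()
pred = "the capital of france is paris".split()
p, r = bleu_score(count_ngrams(ref, MAX_NGRAMS, True), len(ref),
                  count_ngrams(pred, MAX_NGRAMS, True), len(pred), MAX_NGRAMS)
print "precision=%.3f, recall=%.3f" % (p, r)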


# Compare the overlap of two sentences using ngrams
# (up to trigrams). This is similar to the BLEU score.
def ngramWeight(question, sentence):
#stem and take set intersections for unigrams
uniQ = map(ps.stem, question)
uniS = sentence
unigram = set(uniQ).intersection(set(uniS))


#get all bigram overlaps, rolls around end of sentence
if len(uniQ) > 1 and len(uniS) > 1:
bigramQ = set([uniQ[i-1]+uniQ[i] for i,word in enumerate(uniQ)])
bigramS = set([uniS[i-1]+uniS[i] for i,word in enumerate(uniS)])
bigram = bigramQ.intersection(bigramS)
else:
bigram = {}

if len(uniQ) > 2 and len(uniS) > 2:
trigramQ = set([uniQ[i-2]+uniQ[i-1]+uniQ[i] for i,word in enumerate(uniQ)])
trigramS = set([uniS[i-2]+uniS[i-1]+uniS[i] for i,word in enumerate(uniS)])
trigram = trigramQ.intersection(trigramS)
else:
trigram = {}

lam1 = 0.2
lam2 = 0.3
lam3 = 0.5

return lam1*len(unigram) + lam2*len(bigram) + lam3*len(trigram)
#def ngramWeight(question, sentence):
# #stem and take set intersections for unigrams
# uniQ = map(ps.stem, question)
# uniS = sentence
# unigram = set(uniQ).intersection(set(uniS))
#
#
# #get all bigram overlaps, rolls around end of sentence
# if len(uniQ) > 1 and len(uniS) > 1:
# bigramQ = set([uniQ[i-1]+uniQ[i] for i,word in enumerate(uniQ)])
# bigramS = set([uniS[i-1]+uniS[i] for i,word in enumerate(uniS)])
# bigram = bigramQ.intersection(bigramS)
# else:
# bigram = {}
#
# if len(uniQ) > 2 and len(uniS) > 2:
# trigramQ = set([uniQ[i-2]+uniQ[i-1]+uniQ[i] for i,word in enumerate(uniQ)])
# trigramS = set([uniS[i-2]+uniS[i-1]+uniS[i] for i,word in enumerate(uniS)])
# trigram = trigramQ.intersection(trigramS)
# else:
# trigram = {}
#
#
# lam1 = 0.2
# lam2 = 0.3
# lam3 = 0.5
#
# return lam1*len(unigram) + lam2*len(bigram) + lam3*len(trigram)

# for testing
if __name__ == '__main__':
