From e339d9adf0c83452a15a49f42b292bb0412234c0 Mon Sep 17 00:00:00 2001
From: Stephen Bly
Date: Mon, 15 Apr 2013 22:18:11 -0400
Subject: [PATCH] Cleaned up code!!!!

---
 answer                           | 77 ++++++++++++++++++++++----------
 modules/extractor.py             |  1 -
 modules/lemma.py                 |  2 -
 modules/sourceContentSelector.py | 69 +++++-----------------------
 4 files changed, 64 insertions(+), 85 deletions(-)

diff --git a/answer b/answer
index f0de860..4d8de42 100755
--- a/answer
+++ b/answer
@@ -13,8 +13,7 @@ import re
 import itertools
 import nltk
 from nltk.stem import PorterStemmer
-import bs4
-
+from bs4 import BeautifulSoup
 # Import our modules from /modules
 sys.path.append("modules")
 import questionClassifier
@@ -27,6 +26,33 @@ import coref
 def contains_negative(sent):
   return "no" in sent or "not" in sent or "n't" in sent
 
+# the set of pronouns, used for anaphora resolution
+pronouns = set(["he", "she", "it", "its", "it's", "him", "her", "his", "they",
+                "their", "we", "our", "i", "you", "your", "my", "mine", "yours", "ours"])
+
+resolved_articles = {}
+
+# Runs coreference resolution on the article using arkref.
+# This still needs to be implemented.
+def coref(path_to_article):
+  if path_to_article in resolved_articles:
+    return resolved_articles[path_to_article]
+
+  subprocess.call(["./arkref.sh", "-input", path_to_article])
+  tagged_article = open(path_to_article.replace("txt", "tagged")).read()
+  tagged_article = "<root>"+tagged_article+"</root>" # trick arkref into doing entire doc
+  soup = BeautifulSoup(tagged_article, "html.parser").root
+  for entity in soup.find_all(True):
+    if entity.string != None and entity.string.strip().lower() in pronouns:
+      antecedent_id = entity["entityid"].split("_")[0]
+      antecedent = soup.find(mentionid=antecedent_id)
+      antecedent = str(antecedent).split(">", 1)[1].split("<", 1)[0]
+      entity.string.replace_with(antecedent)
+  resolved = re.sub("<.*?>", "", str(soup))
+  resolved_articles[path_to_article] = resolved
+
+  return resolved
+
 # Answers a question from the information in article.
 # Ranks all the sentences and then returns the top choice.
 def answer(question, article):
@@ -38,34 +64,39 @@ def answer(question, article):
   if question_type == "BOOLEAN":
     if contains_negative(top): return "No"
     else: return "Yes"
-  else:
-    return top[0]
+  else: return top[0]
 
 # The main script
 if __name__ == '__main__':
   article_name = sys.argv[1]
-for year in ("S08", "S09", "S10"):
-  print "Year:", year
-  prefix = "Question_Answer_Dataset_v1.1/"+year+"/"
-  question_answer_pairs = open(prefix+"question_answer_pairs.txt").readlines()
-  question_answer_pairs.pop(0)
-  for line in question_answer_pairs:
-    if not line.startswith(article_name): continue
-    line = line.lstrip(article_name)
-    end = line.find("?")
-    if end == -1: continue
-    question = line[:end+1].strip()
-    line = line[end+1:].split()
-    path_to_article = prefix+line.pop()+".txt"
-    difficulty_answerer = line.pop()
-    difficulty_questioner = line.pop()
-    correct_answer = " ".join(line)
+  for year in ("S08", "S09", "S10"):
+    print "Year:", year
+    prefix = "Question_Answer_Dataset_v1.1/"+year+"/"
+    question_answer_pairs = open(prefix+"question_answer_pairs.txt").readlines()
+    question_answer_pairs.pop(0)
+    for line in question_answer_pairs:
+      if not line.startswith(article_name): continue
+      line = line.lstrip(article_name)
+      end = line.find("?")
+      if end == -1: continue
+      question = line[:end+1].strip()
+      line = line[end+1:].split()
+      path_to_article = prefix+line.pop()+".txt"
+      difficulty_answerer = line.pop()
+      difficulty_questioner = line.pop()
+      correct_answer = " ".join(line)
 
-    print "Question:", question
-    print "Difficulty from answerer:", difficulty_answerer
-    print "Difficulty from questioner:", difficulty_questioner
+      print "Question:", question
+      print "Difficulty from answerer:", difficulty_answerer
+      print "Difficulty from questioner:", difficulty_questioner
+<<<<<<< HEAD
     article = coref.process(path_to_article)
     print "Our answer:", answer(question, article)
     print "Correct answer:", correct_answer
+=======
+      article = coref(path_to_article)
+      print "Our answer:", answer(question, article)
+      print "Correct answer:", correct_answer
+>>>>>>> Cleaned up code!!!!
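A note on the new coref() helper added above: arkref writes a .tagged copy of the article whose mentions carry mentionid/entityid attributes, and the helper swaps each pronoun mention for the surface text of its antecedent before stripping the tags. Below is a minimal standalone sketch of that substitution step; the sample markup, the reduced pronoun set, and the assumption that the first component of entityid names the antecedent mention are illustrative simplifications, not arkref's exact output format.

# Standalone sketch of the pronoun-substitution step used in coref() above.
# The <mention> markup here is invented for illustration.
import re
from bs4 import BeautifulSoup

pronouns = set(["he", "she", "it", "him", "her", "his", "they", "their"])

tagged = ('<root>'
          '<mention mentionid="1" entityid="1_7">Lincoln</mention> was president. '
          '<mention mentionid="2" entityid="1_7">He</mention> was tall.'
          '</root>')

soup = BeautifulSoup(tagged, "html.parser").root
for mention in soup.find_all("mention"):
  if mention.string is not None and mention.string.strip().lower() in pronouns:
    # mirror the patch's assumption: the first piece of entityid is the
    # mentionid of the antecedent mention
    antecedent_id = mention["entityid"].split("_")[0]
    antecedent = soup.find(mentionid=antecedent_id)
    mention.string.replace_with(antecedent.get_text())

print(re.sub("<.*?>", "", str(soup)))  # Lincoln was president. Lincoln was tall.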
diff --git a/modules/extractor.py b/modules/extractor.py
index de77a2b..36b86f2 100644
--- a/modules/extractor.py
+++ b/modules/extractor.py
@@ -3,7 +3,6 @@
 import nltk
 
-#sample = open("para.txt").read()
 sample = 'I like Harvard and partitions'
 sentences = nltk.sent_tokenize(sample)
 tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
diff --git a/modules/lemma.py b/modules/lemma.py
index 7d8f5ef..50f5b17 100644
--- a/modules/lemma.py
+++ b/modules/lemma.py
@@ -11,14 +11,12 @@ def lem(words):
   for word, tag in pos_tagged:
     temp = set({word})
-
     if tag[0] == 'N':
       for ss in wn.synsets(word, pos=wn.NOUN):
         temp.update(set(lemma.name for lemma in ss.lemmas))
     elif tag[0] == 'V':
       for ss in wn.synsets(word, pos=wn.VERB):
         temp.update(set(lemma.name for lemma in ss.lemmas))
-
     lemmaList.append(temp)
   return lemmaList
\ No newline at end of file
diff --git a/modules/sourceContentSelector.py b/modules/sourceContentSelector.py
index 8a31a5c..0e0258a 100644
--- a/modules/sourceContentSelector.py
+++ b/modules/sourceContentSelector.py
@@ -8,44 +8,34 @@ from nltk.stem import PorterStemmer
 import collections
 import numpy as np
-sys.path.append("modules")
-import lemma
-# Any keywords in the sentence will be these parts of speech
+# Ignore words that don't have these parts of speech when computing keywords
 key_POS = set(["CD","FW","NN","NNS","NNP","NPS","VB","VBD","VBG","VBN","VBP","VBZ"])
 # auxiliary verbs we should ignore
 aux = set(["is", "was", "did", "does", "do", "were", "are"])
-# we should probably change this to the WordNet lemmatizer, but this is ok for now
+# the porter stemmer! yay!
 ps = PorterStemmer()
+# check up to 5-grams for the bleu score
 MAX_NGRAMS = 5
 
 # Given a question, returns a list of keywords
 def getKeywords(question):
   tagged = nltk.tag.pos_tag(question)
-  #nltk.ne_chunk(tagged)
   tagged = [pair for pair in tagged if pair[1] in key_POS and pair[0].lower() not in aux]
-  result = []
-  for tag in tagged:
-    if tag[1] == "NNP":
-      # named entities aren't that helpful until we implement coreference resolution
-      result.append(tag[0])
-    else:
-      result.append(ps.stem(tag[0]))
-  return set(result)
+  return {ps.stem(tag[0]) for tag in tagged}
 
 # Given a question, return a list of each sentence in the article
 # with a score attached to it
 def getScoredSentences(question, article):
-  scored_sent = []
+  scored_sentences = []
   sentences = nltk.tokenize.sent_tokenize(article)
-  for sent in sentences:
-    if sent.strip() == "": continue
-    sentence = nltk.tokenize.word_tokenize(sent)
-    s = score(question, sentence)
-    scored_sent.append((sent, s))
-  return scored_sent
+  for sentence in sentences:
+    if sentence.strip() == "": continue
+    s = score(question, nltk.word_tokenize(sentence))
+    scored_sentences.append((sentence, s))
+  return scored_sentences
 
 # Scores a sentence based on how well we think it answers the question
 def score(question, sentence):
@@ -87,7 +77,6 @@ def count_ngrams(tokens, n, all_smaller=False):
       counts[tuple(tokens[i:i+k])] += 1
   return counts
 
-#end def
 
 def bleu_score(ref_ngrams, ref_len, pred_ngrams, pred_len, n):
   """Calculate the BLEU precision and recall from ngram counts.
@@ -110,7 +99,6 @@ def bleu_score(ref_ngrams, ref_len, pred_ngrams, pred_len, n):
       k = min(c, pred_ngrams[ngram])
       ngram_score[len(ngram) - 1] += k
-  #end for
 
   # compute the geometric mean of the ngrams precision/recall
   precision = np.mean(np.log(ngram_score / len(pred_ngrams)))
@@ -124,40 +112,3 @@ def bleu_score(ref_ngrams, ref_len, pred_ngrams, pred_len, n):
   recall = np.exp(recall)
 
   return precision, recall
-#end def
-
-
-# Compare the overlap of two sentences using ngrams
-# (up to trigrams). This is similar to the BLEU score.
-#def ngramWeight(question, sentence):
-#  #stem and take set intersections for unigrams
-#  uniQ = map(ps.stem, question)
-#  uniS = sentence
-#  unigram = set(uniQ).intersection(set(uniS))
-#
-#
-#  #get all bigram overlaps, rolls around end of sentence
-#  if len(uniQ) > 1 and len(uniS) > 1:
-#    bigramQ = set([uniQ[i-1]+uniQ[i] for i,word in enumerate(uniQ)])
-#    bigramS = set([uniS[i-1]+uniS[i] for i,word in enumerate(uniS)])
-#    bigram = bigramQ.intersection(bigramS)
-#  else:
-#    bigram = {}
-#
-#  if len(uniQ) > 2 and len(uniS) > 2:
-#    trigramQ = set([uniQ[i-2]+uniQ[i-1]+uniQ[i] for i,word in enumerate(uniQ)])
-#    trigramS = set([uniS[i-2]+uniS[i-1]+uniS[i] for i,word in enumerate(uniS)])
-#    trigram = trigramQ.intersection(trigramS)
-#  else:
-#    trigram = {}
-#
-#
-#  lam1 = 0.2
-#  lam2 = 0.3
-#  lam3 = 0.5
-#
-#  return lam1*len(unigram) + lam2*len(bigram) + lam3*len(trigram)
-
-# for testing
-if __name__ == '__main__':
-  print proximity(set(["the", "moon", "stroke"]), ["I", "want", "to", "see", "the", "moon", "stroke"])
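The hunks above only touch the edges of count_ngrams() and bleu_score(), so for orientation here is a rough, self-contained sketch of the clipped n-gram precision/recall idea those functions are built around. The helper names and the smoothing-free arithmetic are assumptions made for this note, not the repository's implementation, which also takes a geometric mean over n-gram orders up to MAX_NGRAMS.

# Rough sketch of clipped n-gram precision/recall between a question and a
# candidate sentence; helper names are invented for this note.
import collections

def ngram_counts(tokens, n):
  # count every k-gram for k = 1..n, keyed by the token tuple
  counts = collections.Counter()
  for k in range(1, n + 1):
    for i in range(len(tokens) - k + 1):
      counts[tuple(tokens[i:i + k])] += 1
  return counts

def overlap_precision_recall(question_tokens, sentence_tokens, n=2):
  ref = ngram_counts(question_tokens, n)
  pred = ngram_counts(sentence_tokens, n)
  # clipped overlap: an n-gram counts at most as often as it appears in the question
  overlap = sum(min(count, ref[gram]) for gram, count in pred.items())
  precision = float(overlap) / sum(pred.values())
  recall = float(overlap) / sum(ref.values())
  return precision, recall

print(overlap_precision_recall(["when", "was", "lincoln", "born"],
                               ["lincoln", "was", "born", "in", "1809"]))

With the toy question and sentence above this prints roughly (0.33, 0.43): three of the nine sentence n-grams also occur in the question, and those three cover three of the question's seven n-grams.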