Created coref module

ryhan · Apr 16, 2013 · b352782 · b352782
1 parent 54e07db
commit b352782
Show file tree

Hide file tree

Showing 2 changed files with 46 additions and 32 deletions.
diff --git a/answer b/answer
@@ -14,48 +14,19 @@ import itertools
 import nltk
 from nltk.stem import PorterStemmer
 import bs4
+
 # Import our modules from /modules
 sys.path.append("modules")
 import questionClassifier
 import sourceContentSelector
+import coref
 
 # To answer yes/no question, we want to just answer yes or no,
 # and not return a whole sentence. We do this by checking for
 # any negatives in the sentence.
 def contains_negative(sent):  # True when "no", "not" or "n't" is found in sent via the `in` operator
   return "no" in sent or "not" in sent or "n't" in sent  # NOTE(review): if sent is a str these are substring tests (e.g. "know" contains "no"); if it is a token list they are exact-token tests -- confirm against callers
 
-# the set of pronouns, used for anaphora resolution
-pronouns = set(["he", "she", "it", "its", "it's", "him", "her", "his","they",
-                "their","we", "our","i","you","your","my","mine","yours","ours"])
-
-resolved_articles = {}
-
-# Runs coreference resolution on the article using arkref.
-# This still needs to be implemented.
-def coref(path_to_article):
-  if path_to_article in resolved_articles:
-    return resolved_articles[path_to_article]
-
-  subprocess.call(["./arkref.sh", "-input", path_to_article])
-  tagged_article = open(path_to_article.replace("txt", "tagged")).read()
-  tagged_article = "<root>"+tagged_article+"</root>" # trick arkref into doing entire doc
-  soup = bs4.BeautifulSoup(tagged_article, "html.parser").root
-  for entity in soup.find_all(True):
-    if entity.string != None and entity.string.strip().lower() in pronouns:
-      antecedent_id = entity["entityid"].split("_")[0]
-      antecedent = soup.find(mentionid=antecedent_id)
-      antecedent = str(antecedent).split(">", 1)[1].split("<", 1)[0]
-      #string = re.sub('<.*?>',' ',str(antecedent))
-      #tok = nltk.word_tokenize(string)
-      #ants = [(x,y) for x,y in nltk.pos_tag(tok) if y in {'NNP','NN'}]
-      entity.string.replace_with(antecedent)
-      #print 'entity is: '+entity.string
-  resolved = re.sub("<.*?>", "", str(soup))
-  resolved_articles[path_to_article] = resolved
-
-  return resolved
-
 # Answers a question from the information in article.
 # Ranks all the sentences and then returns the top choice.
 def answer(question, article):
@@ -95,6 +66,6 @@ for year in ("S08", "S09", "S10"):
     print "Difficulty from answerer:", difficulty_answerer
     print "Difficulty from questioner:", difficulty_questioner
 
-    article = coref(path_to_article)
+    article = coref.process(path_to_article)
     print "Our answer:", answer(question, article)
     print "Correct answer:", correct_answer
diff --git a/modules/coref.py b/modules/coref.py
@@ -0,0 +1,43 @@
+#!/usr/bin/python
+
+# coref.py
+
+# Useful tools which should be pre-installed
+import os, sys, errno
+import subprocess
+import re
+import itertools
+import nltk
+from nltk.stem import PorterStemmer
+import bs4
+
+# the set of pronouns, used for anaphora resolution
+pronouns = set(["he", "she", "it", "its", "it's", "him", "her", "his","they",
+                "their","we", "our","i","you","your","my","mine","yours","ours"])
+
+resolved_articles = {}
+
+# Runs coreference resolution on the article using arkref.
+# This still needs to be implemented.
+def process(path_to_article):  # returns the article text with pronoun mentions replaced by antecedent text and all tags stripped
+  if path_to_article in resolved_articles:  # already resolved in this run: reuse the cached text
+    return resolved_articles[path_to_article]
+
+  subprocess.call(["./arkref.sh", "-input", path_to_article])  # runs the arkref script relative to the current directory; its exit status is not checked
+  tagged_article = open(path_to_article.replace("txt", "tagged")).read()  # NOTE(review): replace() rewrites every "txt" in the path, not only the extension, and the file handle is never closed explicitly
+  tagged_article = "<root>"+tagged_article+"</root>" # wrap everything in one <root> element so the parse below exposes the entire doc through .root
+  soup = bs4.BeautifulSoup(tagged_article, "html.parser").root  # from here on, soup is the <root> element rather than the BeautifulSoup object
+  for entity in soup.find_all(True):  # find_all(True) yields every tag under <root>
+    if entity.string != None and entity.string.strip().lower() in pronouns:  # .string is None unless the tag has a single string child, so only tags whose whole text is a pronoun pass
+      antecedent_id = entity["entityid"].split("_")[0]  # part of the entityid attribute before the first "_"
+      antecedent = soup.find(mentionid=antecedent_id)  # first tag whose mentionid attribute equals that id; NOTE(review): None when nothing matches, which makes the next line raise IndexError
+      antecedent = str(antecedent).split(">", 1)[1].split("<", 1)[0]  # text between the tag's first ">" and the following "<", i.e. its leading text up to any nested tag
+      #string = re.sub('<.*?>',' ',str(antecedent))
+      #tok = nltk.word_tokenize(string)
+      #ants = [(x,y) for x,y in nltk.pos_tag(tok) if y in {'NNP','NN'}]
+      entity.string.replace_with(antecedent)  # swap the pronoun text node for the antecedent text, in place in the tree
+      #print 'entity is: '+entity.string
+  resolved = re.sub("<.*?>", "", str(soup))  # serialize the modified tree and drop every remaining <...> tag
+  resolved_articles[path_to_article] = resolved  # cache under the article path for later calls
+
+  return resolved