
Commit b5c0f4b

Handles coref errors
ryhan committed Apr 16, 2013
1 parent a4abfb8 commit b5c0f4b
Showing 2 changed files with 32 additions and 20 deletions.
4 changes: 3 additions & 1 deletion .gitignore
@@ -39,4 +39,6 @@ nosetests.xml
 
 *.parse
 *.osent
-*.sst
+*.sst
+NUL
+arkref_out.txt
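
(The two new ignore entries follow from the modules/coref.py change below: the code opens a file literally named "NUL" to swallow arkref's console output, which on non-Windows systems leaves a regular NUL file in the working tree, and arkref_out.txt is the commented-out alternative log target.)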
48 changes: 29 additions & 19 deletions modules/coref.py
@@ -3,10 +3,13 @@
 # coref.py
 
 # Useful tools which should be pre-installed
+import os, sys, errno
 import subprocess
 import re
+import itertools
 import nltk
-from bs4 import BeautifulSoup
+from nltk.stem import PorterStemmer
+import bs4
 
 # the set of pronouns, used for anaphora resolution
 pronouns = set(["he", "she", "it", "its", "it's", "him", "her", "his","they",
@@ -17,24 +20,31 @@
 # Runs coreference resolution on the article using arkref.
 # This still needs to be implemented.
 def process(path_to_article):
-    if path_to_article in resolved_articles:
-        return resolved_articles[path_to_article]
+    original_path = path_to_article
+    try:
+        if path_to_article in resolved_articles:
+            return resolved_articles[path_to_article]
+        # arkref_out = open("arkref_out.txt", "w")
+        fh = open("NUL","w")
+        subprocess.call(["./arkref.sh", "-input", path_to_article], stdout = fh, stderr = fh)
+        fh.close()
 
-    subprocess.call(["./arkref.sh", "-input", path_to_article])
-    tagged_article = open(path_to_article.replace("txt", "tagged")).read()
-    tagged_article = "<root>"+tagged_article+"</root>" # trick arkref into doing entire doc
-    soup = BeautifulSoup(tagged_article, "html.parser").root
-    for entity in soup.find_all(True):
-        if entity.string != None and entity.string.strip().lower() in pronouns:
-            antecedent_id = entity["entityid"].split("_")[0]
-            antecedent = soup.find(mentionid=antecedent_id)
-            antecedent = str(antecedent).split(">", 1)[1].split("<", 1)[0]
-            #string = re.sub('<.*?>',' ',str(antecedent))
-            #tok = nltk.word_tokenize(string)
-            #ants = [(x,y) for x,y in nltk.pos_tag(tok) if y in {'NNP','NN'}]
-            entity.string.replace_with(antecedent)
-            #print 'entity is: '+entity.string
-    resolved = re.sub("<.*?>", "", str(soup))
-    resolved_articles[path_to_article] = resolved
+        tagged_article = open(path_to_article.replace("txt", "tagged")).read()
+        tagged_article = "<root>"+tagged_article+"</root>" # trick arkref into doing entire doc
+        soup = bs4.BeautifulSoup(tagged_article, "html.parser").root
+        for entity in soup.find_all(True):
+            if entity.string != None and entity.string.strip().lower() in pronouns:
+                antecedent_id = entity["entityid"].split("_")[0]
+                antecedent = soup.find(mentionid=antecedent_id)
+                antecedent = str(antecedent).split(">", 1)[1].split("<", 1)[0]
+                #string = re.sub('<.*?>',' ',str(antecedent))
+                #tok = nltk.word_tokenize(string)
+                #ants = [(x,y) for x,y in nltk.pos_tag(tok) if y in {'NNP','NN'}]
+                entity.string.replace_with(antecedent)
+                #print 'entity is: '+entity.string
+        resolved = re.sub("<.*?>", "", str(soup))
+    except:
+        resolved = open(original_path).read()
 
+    resolved_articles[path_to_article] = resolved
     return resolved
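
The heart of this commit is a silence-and-fall-back pattern: arkref's stdout/stderr are redirected away, and any failure during resolution (a missing .tagged file, a malformed tag, an arkref crash) yields the raw article text instead of an exception. A minimal sketch of that pattern, assuming a hypothetical resolve() callable in place of the BeautifulSoup pass above, and using os.devnull rather than the Windows-only "NUL":

import os
import subprocess

def resolve_or_fallback(path_to_article, resolve):
    # `resolve` is a hypothetical stand-in for the arkref + BeautifulSoup
    # pass above: it takes the article path and returns the
    # pronoun-substituted text.
    try:
        # os.devnull is the portable name for the bit bucket ("NUL" on
        # Windows, "/dev/null" elsewhere), so no stray file is created.
        with open(os.devnull, "w") as sink:
            subprocess.call(["./arkref.sh", "-input", path_to_article],
                            stdout=sink, stderr=sink)
        return resolve(path_to_article)
    except Exception:
        # Same fallback as the commit: hand back the unresolved article
        # text rather than letting coref errors kill the pipeline.
        with open(path_to_article) as f:
            return f.read()

Catching Exception, rather than using the bare except: seen in the diff, lets KeyboardInterrupt and SystemExit still propagate, which is usually preferable in a batch pipeline.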
