
Commit b5c0f4b

Handles coref errors
ryhan committed Apr 16, 2013
1 parent a4abfb8 commit b5c0f4b
Showing 2 changed files with 32 additions and 20 deletions.
4 changes: 3 additions & 1 deletion .gitignore
@@ -39,4 +39,6 @@ nosetests.xml
 
 *.parse
 *.osent
-*.sst
+*.sst
+NUL
+arkref_out.txt
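
(The two new ignore entries follow from the modules/coref.py change below: the code opens a file literally named "NUL" to swallow arkref's console output, which on non-Windows systems leaves a regular NUL file in the working tree, and arkref_out.txt is the commented-out alternative log target.)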
48 changes: 29 additions & 19 deletions modules/coref.py
@@ -3,10 +3,13 @@
 # coref.py
 
 # Useful tools which should be pre-installed
+import os, sys, errno
 import subprocess
 import re
+import itertools
 import nltk
-from bs4 import BeautifulSoup
+from nltk.stem import PorterStemmer
+import bs4
 
 # the set of pronouns, used for anaphora resolution
 pronouns = set(["he", "she", "it", "its", "it's", "him", "her", "his","they",
@@ -17,24 +20,31 @@
 # Runs coreference resolution on the article using arkref.
 # This still needs to be implemented.
 def process(path_to_article):
-    if path_to_article in resolved_articles:
-        return resolved_articles[path_to_article]
+    original_path = path_to_article
+    try:
+        if path_to_article in resolved_articles:
+            return resolved_articles[path_to_article]
+        # arkref_out = open("arkref_out.txt", "w")
+        fh = open("NUL","w")
+        subprocess.call(["./arkref.sh", "-input", path_to_article], stdout = fh, stderr = fh)
+        fh.close()
 
-    subprocess.call(["./arkref.sh", "-input", path_to_article])
-    tagged_article = open(path_to_article.replace("txt", "tagged")).read()
-    tagged_article = "<root>"+tagged_article+"</root>" # trick arkref into doing entire doc
-    soup = BeautifulSoup(tagged_article, "html.parser").root
-    for entity in soup.find_all(True):
-        if entity.string != None and entity.string.strip().lower() in pronouns:
-            antecedent_id = entity["entityid"].split("_")[0]
-            antecedent = soup.find(mentionid=antecedent_id)
-            antecedent = str(antecedent).split(">", 1)[1].split("<", 1)[0]
-            #string = re.sub('<.*?>',' ',str(antecedent))
-            #tok = nltk.word_tokenize(string)
-            #ants = [(x,y) for x,y in nltk.pos_tag(tok) if y in {'NNP','NN'}]
-            entity.string.replace_with(antecedent)
-            #print 'entity is: '+entity.string
-    resolved = re.sub("<.*?>", "", str(soup))
-    resolved_articles[path_to_article] = resolved
+        tagged_article = open(path_to_article.replace("txt", "tagged")).read()
+        tagged_article = "<root>"+tagged_article+"</root>" # trick arkref into doing entire doc
+        soup = bs4.BeautifulSoup(tagged_article, "html.parser").root
+        for entity in soup.find_all(True):
+            if entity.string != None and entity.string.strip().lower() in pronouns:
+                antecedent_id = entity["entityid"].split("_")[0]
+                antecedent = soup.find(mentionid=antecedent_id)
+                antecedent = str(antecedent).split(">", 1)[1].split("<", 1)[0]
+                #string = re.sub('<.*?>',' ',str(antecedent))
+                #tok = nltk.word_tokenize(string)
+                #ants = [(x,y) for x,y in nltk.pos_tag(tok) if y in {'NNP','NN'}]
+                entity.string.replace_with(antecedent)
+                #print 'entity is: '+entity.string
+        resolved = re.sub("<.*?>", "", str(soup))
+    except:
+        resolved = open(original_path).read()
 
+    resolved_articles[path_to_article] = resolved
     return resolved
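
The heart of this commit is a silence-and-fall-back pattern: arkref's stdout/stderr are redirected away, and any failure during resolution (a missing .tagged file, a malformed tag, an arkref crash) yields the raw article text instead of an exception. A minimal sketch of that pattern, assuming a hypothetical resolve() callable in place of the BeautifulSoup pass above, and using os.devnull rather than the Windows-only "NUL":

import os
import subprocess

def resolve_or_fallback(path_to_article, resolve):
    # `resolve` is a hypothetical stand-in for the arkref + BeautifulSoup
    # pass above: it takes the article path and returns the
    # pronoun-substituted text.
    try:
        # os.devnull is the portable name for the bit bucket ("NUL" on
        # Windows, "/dev/null" elsewhere), so no stray file is created.
        with open(os.devnull, "w") as sink:
            subprocess.call(["./arkref.sh", "-input", path_to_article],
                            stdout=sink, stderr=sink)
        return resolve(path_to_article)
    except Exception:
        # Same fallback as the commit: hand back the unresolved article
        # text rather than letting coref errors kill the pipeline.
        with open(path_to_article) as f:
            return f.read()

Catching Exception, rather than using the bare except: seen in the diff, lets KeyboardInterrupt and SystemExit still propagate, which is usually preferable in a batch pipeline.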
