From b5c0f4bbe640789099edb2915a8c460a18de3375 Mon Sep 17 00:00:00 2001
From: Ryhan
Date: Mon, 15 Apr 2013 22:44:35 -0400
Subject: [PATCH] Handles coref errors

---
 .gitignore       |  4 +++-
 modules/coref.py | 48 +++++++++++++++++++++++++++++-------------------
 2 files changed, 32 insertions(+), 20 deletions(-)

diff --git a/.gitignore b/.gitignore
index 4af7271..8050fdc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -39,4 +39,6 @@
 nosetests.xml
 *.parse
 *.osent
-*.sst
\ No newline at end of file
+*.sst
+NUL
+arkref_out.txt
\ No newline at end of file
diff --git a/modules/coref.py b/modules/coref.py
index acf695c..d5c253f 100644
--- a/modules/coref.py
+++ b/modules/coref.py
@@ -3,10 +3,13 @@
 # coref.py

 # Useful tools which should be pre-installed
+import os, sys, errno
 import subprocess
 import re
+import itertools
 import nltk
-from bs4 import BeautifulSoup
+from nltk.stem import PorterStemmer
+import bs4

 # the set of pronouns, used for anaphora resolution
 pronouns = set(["he", "she", "it", "its", "it's", "him", "her", "his","they",
@@ -17,24 +20,31 @@
 # Runs coreference resolution on the article using arkref.
 # This still needs to be implemented.
 def process(path_to_article):
-    if path_to_article in resolved_articles:
-        return resolved_articles[path_to_article]
+    original_path = path_to_article
+    try:
+        if path_to_article in resolved_articles:
+            return resolved_articles[path_to_article]
+        # arkref_out = open("arkref_out.txt", "w")
+        fh = open("NUL","w")
+        subprocess.call(["./arkref.sh", "-input", path_to_article], stdout = fh, stderr = fh)
+        fh.close()

-    subprocess.call(["./arkref.sh", "-input", path_to_article])
-    tagged_article = open(path_to_article.replace("txt", "tagged")).read()
-    tagged_article = "<root>"+tagged_article+"</root>" # trick arkref into doing entire doc
-    soup = BeautifulSoup(tagged_article, "html.parser").root
-    for entity in soup.find_all(True):
-        if entity.string != None and entity.string.strip().lower() in pronouns:
-            antecedent_id = entity["entityid"].split("_")[0]
-            antecedent = soup.find(mentionid=antecedent_id)
-            antecedent = str(antecedent).split(">", 1)[1].split("<", 1)[0]
-            #string = re.sub('<.*?>',' ',str(antecedent))
-            #tok = nltk.word_tokenize(string)
-            #ants = [(x,y) for x,y in nltk.pos_tag(tok) if y in {'NNP','NN'}]
-            entity.string.replace_with(antecedent)
-            #print 'entity is: '+entity.string
-    resolved = re.sub("<.*?>", "", str(soup))
-    resolved_articles[path_to_article] = resolved
+        tagged_article = open(path_to_article.replace("txt", "tagged")).read()
+        tagged_article = "<root>"+tagged_article+"</root>" # trick arkref into doing entire doc
+        soup = bs4.BeautifulSoup(tagged_article, "html.parser").root
+        for entity in soup.find_all(True):
+            if entity.string != None and entity.string.strip().lower() in pronouns:
+                antecedent_id = entity["entityid"].split("_")[0]
+                antecedent = soup.find(mentionid=antecedent_id)
+                antecedent = str(antecedent).split(">", 1)[1].split("<", 1)[0]
+                #string = re.sub('<.*?>',' ',str(antecedent))
+                #tok = nltk.word_tokenize(string)
+                #ants = [(x,y) for x,y in nltk.pos_tag(tok) if y in {'NNP','NN'}]
+                entity.string.replace_with(antecedent)
+                #print 'entity is: '+entity.string
+        resolved = re.sub("<.*?>", "", str(soup))
+    except:
+        resolved = open(original_path).read()
+    resolved_articles[path_to_article] = resolved

     return resolved
\ No newline at end of file
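
Note on the NUL redirection: "NUL" is the Windows null-device name, so on
POSIX systems open("NUL","w") creates an ordinary file named NUL in the
working directory, which is exactly why this commit also gitignores it. A
portable variant of the same redirection would use os.devnull (a sketch, not
part of the commit; run_arkref_quietly is a hypothetical helper name):

    import os
    import subprocess

    def run_arkref_quietly(path_to_article):
        # os.devnull is "NUL" on Windows and "/dev/null" elsewhere,
        # so no stray NUL file is left in the repository.
        with open(os.devnull, "w") as devnull:
            subprocess.call(["./arkref.sh", "-input", path_to_article],
                            stdout=devnull, stderr=devnull)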
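
Note on the bare except: it also swallows KeyboardInterrupt and SystemExit.
If the intent is only to fall back to the unresolved article text when arkref
or the parse fails, except Exception is the narrower guard. A sketch of the
same fallback shape, assuming the arkref.sh setup above (resolve_or_fallback
is a hypothetical name, not the commit's code):

    import subprocess

    def resolve_or_fallback(path_to_article):
        try:
            # check_call raises CalledProcessError on a nonzero exit status.
            subprocess.check_call(["./arkref.sh", "-input", path_to_article])
            return open(path_to_article.replace("txt", "tagged")).read()
        except Exception:  # unlike a bare except, lets KeyboardInterrupt through
            return open(path_to_article).read()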
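
Note on the substitution loop: it relies on the .tagged output carrying an
entityid attribute on every mention, where the prefix before "_" is the
mentionid of the entity's first mention. A toy round-trip with a hand-written
tagged snippet (the tag names here are invented for illustration; only the
attributes match what the code above reads):

    import re
    import bs4

    tagged = ('<root><mention mentionid="1" entityid="1">Alice</mention> said '
              '<mention mentionid="2" entityid="1_2">she</mention> would come.</root>')

    soup = bs4.BeautifulSoup(tagged, "html.parser").root
    for entity in soup.find_all(True):
        if entity.string is not None and entity.string.strip().lower() in {"she"}:
            antecedent_id = entity["entityid"].split("_")[0]
            antecedent = soup.find(mentionid=antecedent_id)
            # copy the antecedent's text so its own node stays in place
            entity.string.replace_with(str(antecedent.string))

    print(re.sub("<.*?>", "", str(soup)))  # Alice said Alice would come.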