From b3527823aedfa61fc659f26b16b9afb21d396db8 Mon Sep 17 00:00:00 2001 From: Ryhan Date: Mon, 15 Apr 2013 21:57:43 -0400 Subject: [PATCH] Created coref module --- answer | 35 +++-------------------------------- modules/coref.py | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 32 deletions(-) create mode 100644 modules/coref.py diff --git a/answer b/answer index c97b49d..f0de860 100755 --- a/answer +++ b/answer @@ -14,10 +14,12 @@ import itertools import nltk from nltk.stem import PorterStemmer import bs4 + # Import our modules from /modules sys.path.append("modules") import questionClassifier import sourceContentSelector +import coref # To answer yes/no question, we want to just answer yes or no, # and not returna whole sentence. We do this by checking for @@ -25,37 +27,6 @@ import sourceContentSelector def contains_negative(sent): return "no" in sent or "not" in sent or "n't" in sent -# the set of pronouns, used for anaphora resolution -pronouns = set(["he", "she", "it", "its", "it's", "him", "her", "his","they", - "their","we", "our","i","you","your","my","mine","yours","ours"]) - -resolved_articles = {} - -# Runs coreference resolution on the article using arkref. -# This still needs to be implemented. 
-def coref(path_to_article): - if path_to_article in resolved_articles: - return resolved_articles[path_to_article] - - subprocess.call(["./arkref.sh", "-input", path_to_article]) - tagged_article = open(path_to_article.replace("txt", "tagged")).read() - tagged_article = ""+tagged_article+"" # trick arkref into doing entire doc - soup = bs4.BeautifulSoup(tagged_article, "html.parser").root - for entity in soup.find_all(True): - if entity.string != None and entity.string.strip().lower() in pronouns: - antecedent_id = entity["entityid"].split("_")[0] - antecedent = soup.find(mentionid=antecedent_id) - antecedent = str(antecedent).split(">", 1)[1].split("<", 1)[0] - #string = re.sub('<.*?>',' ',str(antecedent)) - #tok = nltk.word_tokenize(string) - #ants = [(x,y) for x,y in nltk.pos_tag(tok) if y in {'NNP','NN'}] - entity.string.replace_with(antecedent) - #print 'entity is: '+entity.string - resolved = re.sub("<.*?>", "", str(soup)) - resolved_articles[path_to_article] = resolved - - return resolved - # Answers a question from the information in article. # Ranks all the sentences and then returns the top choice. 
def answer(question, article): @@ -95,6 +66,6 @@ for year in ("S08", "S09", "S10"): print "Difficulty from answerer:", difficulty_answerer print "Difficulty from questioner:", difficulty_questioner - article = coref(path_to_article) + article = coref.process(path_to_article) print "Our answer:", answer(question, article) print "Correct answer:", correct_answer diff --git a/modules/coref.py b/modules/coref.py new file mode 100644 index 0000000..73db59d --- /dev/null +++ b/modules/coref.py @@ -0,0 +1,43 @@ +#!/usr/bin/python + +# coref.py + +# Useful tools which should be pre-installed +import os, sys, errno +import subprocess +import re +import itertools +import nltk +from nltk.stem import PorterStemmer +import bs4 + +# the set of pronouns, used for anaphora resolution +pronouns = set(["he", "she", "it", "its", "it's", "him", "her", "his","they", + "their","we", "our","i","you","your","my","mine","yours","ours"]) + +resolved_articles = {} + +# Runs coreference resolution on the article using arkref. +# This still needs to be implemented. 
def process(path_to_article):
    """Return the article text with pronouns replaced by their antecedents.

    Runs ``./arkref.sh`` on the article, reads the resulting ``.tagged``
    file, and replaces every tag whose sole text is a pronoun (per the
    module-level ``pronouns`` set) with the leading text of the mention its
    ``entityid`` points at.  All markup is then stripped and the plain text
    is memoized in the module-level ``resolved_articles`` dict, so each path
    is only processed once per run.

    Args:
        path_to_article: Path to the plain-text article.  The tagged file
            name is derived by swapping the last "txt" in the path for
            "tagged" (presumably arkref writes ``<name>.tagged`` next to
            ``<name>.txt`` -- confirm against arkref.sh).

    Returns:
        The resolved article as a string with every ``<...>`` tag removed.

    Raises:
        IOError/OSError: if ``./arkref.sh`` cannot be launched or the tagged
            file cannot be opened.
    """
    if path_to_article in resolved_articles:
        return resolved_articles[path_to_article]

    # NOTE: the exit status is not checked (unchanged from the original); a
    # failed run surfaces below when the .tagged file cannot be opened.
    subprocess.call(["./arkref.sh", "-input", path_to_article])

    # Swap only the LAST "txt" (the extension).  A plain str.replace would
    # also rewrite directory names such as "txt_articles/".
    tagged_path = "tagged".join(path_to_article.rsplit("txt", 1))

    # Context manager so the handle is closed even if read() fails.
    with open(tagged_path) as tagged_file:
        tagged_article = tagged_file.read()

    # Wrap the document in a single <root> element so that `.root` below
    # yields one tag containing the whole article; without the wrapper
    # `.root` is None and every later call on it fails.
    tagged_article = "<root>" + tagged_article + "</root>"
    soup = bs4.BeautifulSoup(tagged_article, "html.parser").root

    # find_all(True) returns every tag as a list built up front, so
    # replacing strings inside the loop does not disturb the iteration.
    for entity in soup.find_all(True):
        text = entity.string
        if text is None or text.strip().lower() not in pronouns:
            continue

        # The part of entityid before "_" is used as the mentionid of the
        # antecedent mention.
        entity_id = entity.get("entityid")
        if entity_id is None:
            continue  # pronoun tag without coreference info: leave as is
        antecedent = soup.find(mentionid=entity_id.split("_")[0])
        if antecedent is None:
            continue  # dangling reference: keep the pronoun

        # Text between the antecedent's opening tag and the next tag, i.e.
        # the antecedent's leading text up to any nested markup.
        antecedent_text = str(antecedent).split(">", 1)[1].split("<", 1)[0]
        text.replace_with(antecedent_text)

    # Drop all remaining markup (including the <root> wrapper).
    resolved = re.sub(r"<.*?>", "", str(soup))
    resolved_articles[path_to_article] = resolved

    return resolved