Skip to content

Commit

Permalink
Created coref module
Browse files Browse the repository at this point in the history
  • Loading branch information
ryhan committed Apr 16, 2013
1 parent 54e07db commit b352782
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 32 deletions.
35 changes: 3 additions & 32 deletions answer
Original file line number Diff line number Diff line change
Expand Up @@ -14,48 +14,19 @@ import itertools
import nltk
from nltk.stem import PorterStemmer
import bs4

# Import our modules from /modules
sys.path.append("modules")
import questionClassifier
import sourceContentSelector
import coref

# To answer a yes/no question, we want to just answer yes or no,
# and not return a whole sentence. We do this by checking for
# any negatives in the sentence.
def contains_negative(sent):
    # True when any negation marker is found in `sent` via the `in` operator
    # (membership for a token list, substring match for a plain string).
    negation_markers = ("no", "not", "n't")
    return any(marker in sent for marker in negation_markers)

# Lowercase pronouns that anaphora resolution looks for in the tagged article.
pronouns = set(
    "he she it its it's him her his they their "
    "we our i you your my mine yours ours".split()
)

# Cache of resolved article text, keyed by the article path.
resolved_articles = {}

# Runs coreference resolution on the article using arkref and
# replaces each pronoun with the text of its antecedent mention.
def coref(path_to_article):
    """Return the article text with pronouns replaced by their antecedents.

    Runs ./arkref.sh on the article, reads the tagged output file, and for
    every tag whose text is a known pronoun substitutes the leading text of
    the mention its entity id points to. All markup is then stripped.
    Results are cached in resolved_articles, keyed by path_to_article.

    path_to_article -- path of the plain-text article handed to arkref.

    Raises IOError if the tagged output file cannot be opened.
    """
    if path_to_article in resolved_articles:
        return resolved_articles[path_to_article]

    subprocess.call(["./arkref.sh", "-input", path_to_article])

    # Swap only the trailing extension: replacing every "txt" in the path
    # would also rewrite directory or file names that contain "txt".
    # (arkref is expected to write <basename>.tagged next to the input.)
    if path_to_article.endswith(".txt"):
        tagged_path = path_to_article[:-len(".txt")] + ".tagged"
    else:
        # No .txt extension: keep the legacy substitution unchanged.
        tagged_path = path_to_article.replace("txt", "tagged")

    # Use a context manager so the file handle is always closed.
    with open(tagged_path) as tagged_file:
        tagged_article = tagged_file.read()

    # Wrap in a single root element so the whole document is parsed as one tree.
    tagged_article = "<root>" + tagged_article + "</root>"
    soup = bs4.BeautifulSoup(tagged_article, "html.parser").root

    for entity in soup.find_all(True):
        text = entity.string
        if text is None or text.strip().lower() not in pronouns:
            continue

        # Tags without an entity id cannot be linked to an antecedent.
        entity_id = entity.get("entityid")
        if entity_id is None:
            continue

        # The part of the entity id before the first "_" is the mention id
        # of the antecedent.
        antecedent = soup.find(mentionid=entity_id.split("_")[0])
        if antecedent is None:
            # No matching mention in the document: leave the pronoun as is.
            continue

        # Text between the antecedent's opening tag and the next tag.
        replacement = str(antecedent).split(">", 1)[1].split("<", 1)[0]
        if not replacement.strip():
            # An empty antecedent would erase the pronoun; keep the pronoun.
            continue
        text.replace_with(replacement)

    # Strip all remaining markup to get the plain resolved text.
    resolved = re.sub(r"<.*?>", "", str(soup))
    resolved_articles[path_to_article] = resolved

    return resolved

# Answers a question from the information in article.
# Ranks all the sentences and then returns the top choice.
def answer(question, article):
Expand Down Expand Up @@ -95,6 +66,6 @@ for year in ("S08", "S09", "S10"):
print "Difficulty from answerer:", difficulty_answerer
print "Difficulty from questioner:", difficulty_questioner

article = coref(path_to_article)
article = coref.process(path_to_article)
print "Our answer:", answer(question, article)
print "Correct answer:", correct_answer
43 changes: 43 additions & 0 deletions modules/coref.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/usr/bin/python

# coref.py

# Useful tools which should be pre-installed
import os, sys, errno
import subprocess
import re
import itertools
import nltk
from nltk.stem import PorterStemmer
import bs4

# Lowercase pronouns that anaphora resolution looks for in the tagged article.
pronouns = set(
    "he she it its it's him her his they their "
    "we our i you your my mine yours ours".split()
)

# Cache of resolved article text, keyed by the article path.
resolved_articles = {}

# Runs coreference resolution on the article using arkref and
# replaces each pronoun with the text of its antecedent mention.
def process(path_to_article):
    """Return the article text with pronouns replaced by their antecedents.

    Runs ./arkref.sh on the article, reads the tagged output file, and for
    every tag whose text is a known pronoun substitutes the leading text of
    the mention its entity id points to. All markup is then stripped.
    Results are cached in resolved_articles, keyed by path_to_article.

    path_to_article -- path of the plain-text article handed to arkref.

    Raises IOError if the tagged output file cannot be opened.
    """
    if path_to_article in resolved_articles:
        return resolved_articles[path_to_article]

    subprocess.call(["./arkref.sh", "-input", path_to_article])

    # Swap only the trailing extension: replacing every "txt" in the path
    # would also rewrite directory or file names that contain "txt".
    # (arkref is expected to write <basename>.tagged next to the input.)
    if path_to_article.endswith(".txt"):
        tagged_path = path_to_article[:-len(".txt")] + ".tagged"
    else:
        # No .txt extension: keep the legacy substitution unchanged.
        tagged_path = path_to_article.replace("txt", "tagged")

    # Use a context manager so the file handle is always closed.
    with open(tagged_path) as tagged_file:
        tagged_article = tagged_file.read()

    # Wrap in a single root element so the whole document is parsed as one tree.
    tagged_article = "<root>" + tagged_article + "</root>"
    soup = bs4.BeautifulSoup(tagged_article, "html.parser").root

    for entity in soup.find_all(True):
        text = entity.string
        if text is None or text.strip().lower() not in pronouns:
            continue

        # Tags without an entity id cannot be linked to an antecedent.
        entity_id = entity.get("entityid")
        if entity_id is None:
            continue

        # The part of the entity id before the first "_" is the mention id
        # of the antecedent.
        antecedent = soup.find(mentionid=entity_id.split("_")[0])
        if antecedent is None:
            # No matching mention in the document: leave the pronoun as is.
            continue

        # Text between the antecedent's opening tag and the next tag.
        replacement = str(antecedent).split(">", 1)[1].split("<", 1)[0]
        if not replacement.strip():
            # An empty antecedent would erase the pronoun; keep the pronoun.
            continue
        text.replace_with(replacement)

    # Strip all remaining markup to get the plain resolved text.
    resolved = re.sub(r"<.*?>", "", str(soup))
    resolved_articles[path_to_article] = resolved

    return resolved

0 comments on commit b352782

Please sign in to comment.