
Cleaned up code!!!!
Stephen Bly committed Apr 16, 2013
1 parent ecb7fad commit e339d9a
Showing 4 changed files with 64 additions and 85 deletions.
77 changes: 54 additions & 23 deletions answer
@@ -13,8 +13,7 @@ import re
import itertools
import nltk
from nltk.stem import PorterStemmer
import bs4

from bs4 import BeautifulSoup
# Import our modules from /modules
sys.path.append("modules")
import questionClassifier
@@ -27,6 +26,33 @@ import coref
def contains_negative(sent):
    return "no" in sent or "not" in sent or "n't" in sent

# the set of pronouns, used for anaphora resolution
pronouns = set(["he", "she", "it", "its", "it's", "him", "her", "his","they",
"their","we", "our","i","you","your","my","mine","yours","ours"])

resolved_articles = {}

# Runs coreference resolution on the article using arkref,
# replacing each pronoun with its antecedent. Results are cached per article.
def coref(path_to_article):
    if path_to_article in resolved_articles:
        return resolved_articles[path_to_article]

    subprocess.call(["./arkref.sh", "-input", path_to_article])
    tagged_article = open(path_to_article.replace("txt", "tagged")).read()
    tagged_article = "<root>"+tagged_article+"</root>" # trick arkref into doing entire doc
    soup = BeautifulSoup(tagged_article, "html.parser").root
    for entity in soup.find_all(True):
        if entity.string != None and entity.string.strip().lower() in pronouns:
            antecedent_id = entity["entityid"].split("_")[0]
            antecedent = soup.find(mentionid=antecedent_id)
            antecedent = str(antecedent).split(">", 1)[1].split("<", 1)[0]
            entity.string.replace_with(antecedent)
    resolved = re.sub("<.*?>", "", str(soup))
    resolved_articles[path_to_article] = resolved

    return resolved
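For illustration (not part of the commit), a minimal usage sketch of the function above; the article path is hypothetical, and ./arkref.sh is assumed to be runnable from the project root:

    resolved = coref("Question_Answer_Dataset_v1.1/S08/data/set1/a1.txt")  # hypothetical path
    print resolved[:200]  # article text with pronouns replaced by their antecedents
    resolved = coref("Question_Answer_Dataset_v1.1/S08/data/set1/a1.txt")  # second call is a cache hit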

# Answers a question from the information in article.
# Ranks all the sentences and then returns the top choice.
def answer(question, article):
@@ -38,34 +64,39 @@ def answer(question, article):
    if question_type == "BOOLEAN":
        if contains_negative(top): return "No"
        else: return "Yes"
    else:
        return top[0]
    else: return top[0]

# The main script
if __name__ == '__main__':
    article_name = sys.argv[1]

    for year in ("S08", "S09", "S10"):
        print "Year:", year
        prefix = "Question_Answer_Dataset_v1.1/"+year+"/"
        question_answer_pairs = open(prefix+"question_answer_pairs.txt").readlines()
        question_answer_pairs.pop(0)
        for line in question_answer_pairs:
            if not line.startswith(article_name): continue
            line = line.lstrip(article_name)
            end = line.find("?")
            if end == -1: continue
            question = line[:end+1].strip()
            line = line[end+1:].split()
            path_to_article = prefix+line.pop()+".txt"
            difficulty_answerer = line.pop()
            difficulty_questioner = line.pop()
            correct_answer = " ".join(line)

print "Question:", question
print "Difficulty from answerer:", difficulty_answerer
print "Difficulty from questioner:", difficulty_questioner
print "Question:", question
print "Difficulty from answerer:", difficulty_answerer
print "Difficulty from questioner:", difficulty_questioner

            article = coref(path_to_article)
            print "Our answer:", answer(question, article)
            print "Correct answer:", correct_answer
1 change: 0 additions & 1 deletion modules/extractor.py
@@ -3,7 +3,6 @@

import nltk

#sample = open("para.txt").read()
sample = 'I like Harvard and partitions'
sentences = nltk.sent_tokenize(sample)
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
2 changes: 0 additions & 2 deletions modules/lemma.py
@@ -11,14 +11,12 @@ def lem(words):

    for word, tag in pos_tagged:
        temp = set({word})

        if tag[0] == 'N':
            for ss in wn.synsets(word, pos=wn.NOUN):
                temp.update(set(lemma.name for lemma in ss.lemmas))
        elif tag[0] == 'V':
            for ss in wn.synsets(word, pos=wn.VERB):
                temp.update(set(lemma.name for lemma in ss.lemmas))

        lemmaList.append(temp)

    return lemmaList
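
A hedged usage sketch for lem (assuming it takes a list of word tokens, that modules/ is on sys.path, and the NLTK 2.x WordNet API where Synset.lemmas and Lemma.name are attributes):

    import nltk
    import lemma
    tokens = nltk.word_tokenize("Lincoln delivered a famous speech")
    synonym_sets = lemma.lem(tokens)  # one set per token: the word plus its WordNet lemma names
    print synonym_sets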
69 changes: 10 additions & 59 deletions modules/sourceContentSelector.py
@@ -8,44 +8,34 @@
from nltk.stem import PorterStemmer
import collections
import numpy as np
sys.path.append("modules")
import lemma

# Any keywords in the sentence will be these parts of speech
# Ignore words that don't have these parts of speech when computing keywords
key_POS = set(["CD","FW","NN","NNS","NNP","NPS","VB","VBD","VBG","VBN","VBP","VBZ"])
# auxiliary verbs we should ignore
aux = set(["is", "was", "did", "does", "do", "were", "are"])

# we should probably change this to the WordNet lemmatizer, but this is ok for now
# the porter stemmer! yay!
ps = PorterStemmer()

# check up to 5-grams for the bleu score
MAX_NGRAMS = 5

# Given a question (as a list of tokens), returns the set of its keywords
def getKeywords(question):
    tagged = nltk.tag.pos_tag(question)
    #nltk.ne_chunk(tagged)
    tagged = [pair for pair in tagged if pair[1] in key_POS and pair[0].lower() not in aux]
    result = []
    for tag in tagged:
        if tag[1] == "NNP":
            # named entities aren't that helpful until we implement coreference resolution
            result.append(tag[0])
        else:
            result.append(ps.stem(tag[0]))
    return set(result)
    return {ps.stem(tag[0]) for tag in tagged}
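
A hedged usage sketch (nltk.tag.pos_tag expects a token list, so the question is word-tokenized first; the exact output depends on the Porter stemmer):

    question = nltk.word_tokenize("When did Lincoln deliver the Gettysburg Address?")
    print getKeywords(question)  # roughly: the stems of "Lincoln", "deliver", "Gettysburg", "Address"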

# Given a question and an article, returns a list of
# (sentence, score) pairs, one for each sentence in the article
def getScoredSentences(question, article):
    scored_sent = []
    scored_sentences = []
    sentences = nltk.tokenize.sent_tokenize(article)
    for sent in sentences:
        if sent.strip() == "": continue
        sentence = nltk.tokenize.word_tokenize(sent)
        s = score(question, sentence)
        scored_sent.append((sent, s))
    return scored_sent
    for sentence in sentences:
        if sentence.strip() == "": continue
        s = score(question, nltk.word_tokenize(sentence))
        scored_sentences.append((sentence, s))
    return scored_sentences
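
A hedged sketch of how the two functions above compose; the article path is hypothetical, and the question is assumed to be passed as a token list (the form getKeywords expects):

    article = open("article.txt").read()  # hypothetical path, already coreference-resolved text
    question = nltk.word_tokenize("Who founded the university?")
    ranked = sorted(getScoredSentences(question, article), key=lambda pair: pair[1], reverse=True)
    print ranked[0][0]  # the sentence judged most likely to contain the answer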

# Scores a sentence based on how well we think it answers the question
def score(question, sentence):
@@ -87,7 +77,6 @@ def count_ngrams(tokens, n, all_smaller=False):
counts[tuple(tokens[i:i+k])] += 1

return counts
#end def
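
A hedged example of what count_ngrams is expected to produce, assuming counts maps token tuples to occurrence counts:

    tokens = ["the", "cat", "sat", "on", "the", "mat"]
    bigrams = count_ngrams(tokens, 2)                       # 2-grams only
    everything = count_ngrams(tokens, 2, all_smaller=True)  # unigrams and bigrams
    print bigrams[("the", "cat")]  # expected: 1
    print everything[("the",)]     # expected: 2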

def bleu_score(ref_ngrams, ref_len, pred_ngrams, pred_len, n):
"""Calculate the BLEU precision and recall from ngram counts.
@@ -110,7 +99,6 @@ def bleu_score(ref_ngrams, ref_len, pred_ngrams, pred_len, n):

k = min(c, pred_ngrams[ngram])
ngram_score[len(ngram) - 1] += k
#end for

# compute the geometric mean of the ngrams precision/recall
precision = np.mean(np.log(ngram_score / len(pred_ngrams)))
@@ -124,40 +112,3 @@ def bleu_score(ref_ngrams, ref_len, pred_ngrams, pred_len, n):
recall = np.exp(recall)

return precision, recall
#end def
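
A hedged sketch of how bleu_score might pair with count_ngrams above; building both counters over all n-gram sizes up to MAX_NGRAMS is an assumed calling convention, not confirmed by the diff:

    ref = ["the", "cat", "sat", "on", "the", "mat"]
    pred = ["the", "cat", "is", "on", "the", "mat"]
    ref_counts = count_ngrams(ref, MAX_NGRAMS, all_smaller=True)
    pred_counts = count_ngrams(pred, MAX_NGRAMS, all_smaller=True)
    precision, recall = bleu_score(ref_counts, len(ref), pred_counts, len(pred), MAX_NGRAMS)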


# Compare the overlap of two sentences using ngrams
# (up to trigrams). This is similar to the BLEU score.
#def ngramWeight(question, sentence):
# #stem and take set intersections for unigrams
# uniQ = map(ps.stem, question)
# uniS = sentence
# unigram = set(uniQ).intersection(set(uniS))
#
#
# #get all bigram overlaps, rolls around end of sentence
# if len(uniQ) > 1 and len(uniS) > 1:
# bigramQ = set([uniQ[i-1]+uniQ[i] for i,word in enumerate(uniQ)])
# bigramS = set([uniS[i-1]+uniS[i] for i,word in enumerate(uniS)])
# bigram = bigramQ.intersection(bigramS)
# else:
# bigram = {}
#
# if len(uniQ) > 2 and len(uniS) > 2:
# trigramQ = set([uniQ[i-2]+uniQ[i-1]+uniQ[i] for i,word in enumerate(uniQ)])
# trigramS = set([uniS[i-2]+uniS[i-1]+uniS[i] for i,word in enumerate(uniS)])
# trigram = trigramQ.intersection(trigramS)
# else:
# trigram = {}
#
#
# lam1 = 0.2
# lam2 = 0.3
# lam3 = 0.5
#
# return lam1*len(unigram) + lam2*len(bigram) + lam3*len(trigram)

# for testing
if __name__ == '__main__':
    print proximity(set(["the", "moon", "stroke"]), ["I", "want", "to", "see", "the", "moon", "stroke"])
