From e339d9adf0c83452a15a49f42b292bb0412234c0 Mon Sep 17 00:00:00 2001
From: Stephen Bly
Date: Mon, 15 Apr 2013 22:18:11 -0400
Subject: [PATCH] Cleaned up code!!!!

---
 answer                           | 77 ++++++++++++++++++++++----------
 modules/extractor.py             |  1 -
 modules/lemma.py                 |  2 -
 modules/sourceContentSelector.py | 69 +++++-----------------------
 4 files changed, 64 insertions(+), 85 deletions(-)

diff --git a/answer b/answer
index f0de860..4d8de42 100755
--- a/answer
+++ b/answer
@@ -13,8 +13,7 @@ import re
 import itertools
 import nltk
 from nltk.stem import PorterStemmer
-import bs4
-
+from bs4 import BeautifulSoup
 # Import our modules from /modules
 sys.path.append("modules")
 import questionClassifier
@@ -27,6 +26,33 @@ import coref
 def contains_negative(sent):
   return "no" in sent or "not" in sent or "n't" in sent
 
+# the set of pronouns, used for anaphora resolution
+pronouns = set(["he", "she", "it", "its", "it's", "him", "her", "his", "they",
+                "their", "we", "our", "i", "you", "your", "my", "mine", "yours", "ours"])
+
+resolved_articles = {}
+
+# Runs coreference resolution on the article using arkref.
+# This still needs to be implemented.
+def coref(path_to_article):
+  if path_to_article in resolved_articles:
+    return resolved_articles[path_to_article]
+
+  subprocess.call(["./arkref.sh", "-input", path_to_article])
+  tagged_article = open(path_to_article.replace("txt", "tagged")).read()
+  tagged_article = "<root>"+tagged_article+"</root>" # trick arkref into doing entire doc
+  soup = BeautifulSoup(tagged_article, "html.parser").root
+  for entity in soup.find_all(True):
+    if entity.string != None and entity.string.strip().lower() in pronouns:
+      antecedent_id = entity["entityid"].split("_")[0]
+      antecedent = soup.find(mentionid=antecedent_id)
+      antecedent = str(antecedent).split(">", 1)[1].split("<", 1)[0]
+      entity.string.replace_with(antecedent)
+  resolved = re.sub("<.*?>", "", str(soup))
+  resolved_articles[path_to_article] = resolved
+
+  return resolved
+
 # Answers a question from the information in article.
 # Ranks all the sentences and then returns the top choice.
 def answer(question, article):
@@ -38,34 +64,39 @@ def answer(question, article):
   if question_type == "BOOLEAN":
     if contains_negative(top): return "No"
     else: return "Yes"
-  else:
-    return top[0]
+  else: return top[0]
 
 # The main script
 if __name__ == '__main__':
   article_name = sys.argv[1]
-for year in ("S08", "S09", "S10"):
-  print "Year:", year
-  prefix = "Question_Answer_Dataset_v1.1/"+year+"/"
-  question_answer_pairs = open(prefix+"question_answer_pairs.txt").readlines()
-  question_answer_pairs.pop(0)
-  for line in question_answer_pairs:
-    if not line.startswith(article_name): continue
-    line = line.lstrip(article_name)
-    end = line.find("?")
-    if end == -1: continue
-    question = line[:end+1].strip()
-    line = line[end+1:].split()
-    path_to_article = prefix+line.pop()+".txt"
-    difficulty_answerer = line.pop()
-    difficulty_questioner = line.pop()
-    correct_answer = " ".join(line)
+  for year in ("S08", "S09", "S10"):
+    print "Year:", year
+    prefix = "Question_Answer_Dataset_v1.1/"+year+"/"
+    question_answer_pairs = open(prefix+"question_answer_pairs.txt").readlines()
+    question_answer_pairs.pop(0)
+    for line in question_answer_pairs:
+      if not line.startswith(article_name): continue
+      line = line.lstrip(article_name)
+      end = line.find("?")
+      if end == -1: continue
+      question = line[:end+1].strip()
+      line = line[end+1:].split()
+      path_to_article = prefix+line.pop()+".txt"
+      difficulty_answerer = line.pop()
+      difficulty_questioner = line.pop()
+      correct_answer = " ".join(line)
 
-    print "Question:", question
-    print "Difficulty from answerer:", difficulty_answerer
-    print "Difficulty from questioner:", difficulty_questioner
+      print "Question:", question
+      print "Difficulty from answerer:", difficulty_answerer
+      print "Difficulty from questioner:", difficulty_questioner
+<<<<<<< HEAD
     article = coref.process(path_to_article)
     print "Our answer:", answer(question, article)
     print "Correct answer:", correct_answer
+=======
+      article = coref(path_to_article)
+      print "Our answer:", answer(question, article)
+      print "Correct answer:", correct_answer
+>>>>>>> Cleaned up code!!!!
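A note on the new coref() helper added above: arkref writes a .tagged copy of the article whose mentions carry mentionid/entityid attributes, and the helper swaps each pronoun mention for the surface text of its antecedent before stripping the tags. Below is a minimal standalone sketch of that substitution step; the sample markup, the reduced pronoun set, and the assumption that the first component of entityid names the antecedent mention are illustrative simplifications, not arkref's exact output format.

# Standalone sketch of the pronoun-substitution step used in coref() above.
# The <mention> markup here is invented for illustration.
import re
from bs4 import BeautifulSoup

pronouns = set(["he", "she", "it", "him", "her", "his", "they", "their"])

tagged = ('<root>'
          '<mention mentionid="1" entityid="1_7">Lincoln</mention> was president. '
          '<mention mentionid="2" entityid="1_7">He</mention> was tall.'
          '</root>')

soup = BeautifulSoup(tagged, "html.parser").root
for mention in soup.find_all("mention"):
  if mention.string is not None and mention.string.strip().lower() in pronouns:
    # mirror the patch's assumption: the first piece of entityid is the
    # mentionid of the antecedent mention
    antecedent_id = mention["entityid"].split("_")[0]
    antecedent = soup.find(mentionid=antecedent_id)
    mention.string.replace_with(antecedent.get_text())

print(re.sub("<.*?>", "", str(soup)))  # Lincoln was president. Lincoln was tall.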
diff --git a/modules/extractor.py b/modules/extractor.py
index de77a2b..36b86f2 100644
--- a/modules/extractor.py
+++ b/modules/extractor.py
@@ -3,7 +3,6 @@
 import nltk
 
-#sample = open("para.txt").read()
 sample = 'I like Harvard and partitions'
 sentences = nltk.sent_tokenize(sample)
 tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
diff --git a/modules/lemma.py b/modules/lemma.py
index 7d8f5ef..50f5b17 100644
--- a/modules/lemma.py
+++ b/modules/lemma.py
@@ -11,14 +11,12 @@ def lem(words):
   for word, tag in pos_tagged:
     temp = set({word})
-
     if tag[0] == 'N':
       for ss in wn.synsets(word, pos=wn.NOUN):
         temp.update(set(lemma.name for lemma in ss.lemmas))
     elif tag[0] == 'V':
       for ss in wn.synsets(word, pos=wn.VERB):
         temp.update(set(lemma.name for lemma in ss.lemmas))
-
     lemmaList.append(temp)
   return lemmaList
\ No newline at end of file
diff --git a/modules/sourceContentSelector.py b/modules/sourceContentSelector.py
index 8a31a5c..0e0258a 100644
--- a/modules/sourceContentSelector.py
+++ b/modules/sourceContentSelector.py
@@ -8,44 +8,34 @@ from nltk.stem import PorterStemmer
 import collections
 import numpy as np
-sys.path.append("modules")
-import lemma
-# Any keywords in the sentence will be these parts of speech
+# Ignore words that don't have these parts of speech when computing keywords
 key_POS = set(["CD","FW","NN","NNS","NNP","NPS","VB","VBD","VBG","VBN","VBP","VBZ"])
 # auxiliary verbs we should ignore
 aux = set(["is", "was", "did", "does", "do", "were", "are"])
-# we should probably change this to the WordNet lemmatizer, but this is ok for now
+# the porter stemmer! yay!
 ps = PorterStemmer()
+# check up to 5-grams for the bleu score
 MAX_NGRAMS = 5
 
 # Given a question, returns a list of keywords
 def getKeywords(question):
   tagged = nltk.tag.pos_tag(question)
-  #nltk.ne_chunk(tagged)
   tagged = [pair for pair in tagged if pair[1] in key_POS and pair[0].lower() not in aux]
-  result = []
-  for tag in tagged:
-    if tag[1] == "NNP":
-      # named entities aren't that helpful until we implement coreference resolution
-      result.append(tag[0])
-    else:
-      result.append(ps.stem(tag[0]))
-  return set(result)
+  return {ps.stem(tag[0]) for tag in tagged}
 
 # Given a question, return a list of each sentence in the article
 # with a score attached to it
 def getScoredSentences(question, article):
-  scored_sent = []
+  scored_sentences = []
   sentences = nltk.tokenize.sent_tokenize(article)
-  for sent in sentences:
-    if sent.strip() == "": continue
-    sentence = nltk.tokenize.word_tokenize(sent)
-    s = score(question, sentence)
-    scored_sent.append((sent, s))
-  return scored_sent
+  for sentence in sentences:
+    if sentence.strip() == "": continue
+    s = score(question, nltk.word_tokenize(sentence))
+    scored_sentences.append((sentence, s))
+  return scored_sentences
 
 # Scores a sentence based on how well we think it answers the question
 def score(question, sentence):
@@ -87,7 +77,6 @@ def count_ngrams(tokens, n, all_smaller=False):
       counts[tuple(tokens[i:i+k])] += 1
   return counts
 
-#end def
 
 def bleu_score(ref_ngrams, ref_len, pred_ngrams, pred_len, n):
   """Calculate the BLEU precision and recall from ngram counts.
@@ -110,7 +99,6 @@ def bleu_score(ref_ngrams, ref_len, pred_ngrams, pred_len, n):
       k = min(c, pred_ngrams[ngram])
       ngram_score[len(ngram) - 1] += k
-  #end for
 
   # compute the geometric mean of the ngrams precision/recall
   precision = np.mean(np.log(ngram_score / len(pred_ngrams)))
@@ -124,40 +112,3 @@ def bleu_score(ref_ngrams, ref_len, pred_ngrams, pred_len, n):
   recall = np.exp(recall)
 
   return precision, recall
-#end def
-
-
-# Compare the overlap of two sentences using ngrams
-# (up to trigrams). This is similar to the BLEU score.
-#def ngramWeight(question, sentence):
-#  #stem and take set intersections for unigrams
-#  uniQ = map(ps.stem, question)
-#  uniS = sentence
-#  unigram = set(uniQ).intersection(set(uniS))
-#
-#
-#  #get all bigram overlaps, rolls around end of sentence
-#  if len(uniQ) > 1 and len(uniS) > 1:
-#    bigramQ = set([uniQ[i-1]+uniQ[i] for i,word in enumerate(uniQ)])
-#    bigramS = set([uniS[i-1]+uniS[i] for i,word in enumerate(uniS)])
-#    bigram = bigramQ.intersection(bigramS)
-#  else:
-#    bigram = {}
-#
-#  if len(uniQ) > 2 and len(uniS) > 2:
-#    trigramQ = set([uniQ[i-2]+uniQ[i-1]+uniQ[i] for i,word in enumerate(uniQ)])
-#    trigramS = set([uniS[i-2]+uniS[i-1]+uniS[i] for i,word in enumerate(uniS)])
-#    trigram = trigramQ.intersection(trigramS)
-#  else:
-#    trigram = {}
-#
-#
-#  lam1 = 0.2
-#  lam2 = 0.3
-#  lam3 = 0.5
-#
-#  return lam1*len(unigram) + lam2*len(bigram) + lam3*len(trigram)
-
-# for testing
-if __name__ == '__main__':
-  print proximity(set(["the", "moon", "stroke"]), ["I", "want", "to", "see", "the", "moon", "stroke"])
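The hunks above only touch the edges of count_ngrams() and bleu_score(), so for orientation here is a rough, self-contained sketch of the clipped n-gram precision/recall idea those functions are built around. The helper names and the smoothing-free arithmetic are assumptions made for this note, not the repository's implementation, which also takes a geometric mean over n-gram orders up to MAX_NGRAMS.

# Rough sketch of clipped n-gram precision/recall between a question and a
# candidate sentence; helper names are invented for this note.
import collections

def ngram_counts(tokens, n):
  # count every k-gram for k = 1..n, keyed by the token tuple
  counts = collections.Counter()
  for k in range(1, n + 1):
    for i in range(len(tokens) - k + 1):
      counts[tuple(tokens[i:i + k])] += 1
  return counts

def overlap_precision_recall(question_tokens, sentence_tokens, n=2):
  ref = ngram_counts(question_tokens, n)
  pred = ngram_counts(sentence_tokens, n)
  # clipped overlap: an n-gram counts at most as often as it appears in the question
  overlap = sum(min(count, ref[gram]) for gram, count in pred.items())
  precision = float(overlap) / sum(pred.values())
  recall = float(overlap) / sum(ref.values())
  return precision, recall

print(overlap_precision_recall(["when", "was", "lincoln", "born"],
                               ["lincoln", "was", "born", "in", "1809"]))

With the toy question and sentence above this prints roughly (0.33, 0.43): three of the nine sentence n-grams also occur in the question, and those three cover three of the question's seven n-grams.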