diff --git a/answer b/answer
index a4c34cc..e81f669 100755
--- a/answer
+++ b/answer
@@ -9,49 +9,30 @@
 import re
 import itertools
 import nltk
 from nltk.stem import PorterStemmer
-
 # Import our modules from /modules
 sys.path.append("modules")
-
 import questionClassifier
 import sourceContentSelector
-#from nltk_contrib.coref.resolve import BaselineCorefResolver
-
-#entity_names = []
-#
-#if hasattr(t, 'node') and t.node:
-#  if t.node == 'NE':
-#    entity_names.append(' '.join([child[0] for child in t]))
-#  else:
-#    for child in t:
-#      entity_names.extend(extract_entity_names(child))
 
 def contains_negative(sent):
-  return "no" in sent or "not" in sent or \
-         "didn't" in sent or "did not" in sent
-
-#resolver = BaselineCorefResolver()
+  return "no" in sent or "not" in sent or "n't" in sent
 
 # picks the sentence that has the most keywords in common with the question
 def answer(question, article):
   question = question.strip()
   question_type = questionClassifier.process(question)
   question = nltk.tokenize.word_tokenize(question)
-  relevant = sourceContentSelector.process(question, article)
-
+  relevant = sourceContentSelector.getRelevantSentences(question, article)
   relevant.sort(key = lambda s: s[1], reverse=True)
   top = relevant[0][0]
   if question_type == "BOOLEAN":
-    if contains_negative(top):
-      return "NO"
-    else:
-      return "YES"
+    if contains_negative(top): return "NO"
+    else: return "YES"
   else:
     return top
 
 if __name__ == '__main__':
-<<<<<<< HEAD
   article_name = sys.argv[1]
 
   for year in ("S08", "S09", "S10"):
@@ -74,15 +55,6 @@ if __name__ == '__main__':
     print "Question:", question
     print "Difficulty from answerer:", difficulty_answerer
     print "Difficulty from questioner:", difficulty_questioner
-=======
-
-  if(len(sys.argv)) < 3:
-    print 'Usage:'
-    sys.exit(1)
-
-  path_to_article = sys.argv[1]
-  path_to_questions = sys.argv[2]
->>>>>>> Usefull printout
 
   # Open the question file and start answering questions.
   article = open(prefix+path_to_article+".txt").read()
diff --git a/modules/extractor.py b/modules/extractor.py
index 7224f03..36d81ec 100644
--- a/modules/extractor.py
+++ b/modules/extractor.py
@@ -2,29 +2,27 @@
 # named entity recognition, currently unused.
 
 import nltk
-
-with open('para.txt', 'r') as f:
-  sample = f.read()
-
+
+sample = open("para.txt").read()
 sentences = nltk.sent_tokenize(sample)
 tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
 tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
 chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=True)
-
+
 def extract_entity_names(t):
   entity_names = []
-
+
   if hasattr(t, 'node') and t.node:
     if t.node == 'NE':
       entity_names.append(' '.join([child[0] for child in t]))
     else:
       for child in t:
         entity_names.extend(extract_entity_names(child))
-
+
   return entity_names
-
+
 entity_names = []
 for tree in chunked_sentences:
   # Print results per sentence
@@ -32,10 +30,10 @@ def extract_entity_names(t):
   for node in tree:
     print node
   entity_names.extend(extract_entity_names(tree))
-
+
 # Print all entity names
 #print entity_names
-
+
 # Print unique entity names
 print set(entity_names)
 
@@ -45,6 +43,4 @@ def extract_entity_names(t):
 for (x,y) in tagged_sentences[0]:
   if y in wantset:
     entity_names.append(x)
-
-
 print entity_names
\ No newline at end of file
diff --git a/modules/lemma.py b/modules/lemma.py
index d0d3dfc..431e404 100644
--- a/modules/lemma.py
+++ b/modules/lemma.py
@@ -4,24 +4,20 @@
 import nltk
 from nltk.corpus import wordnet as wn
 
-def lem(list):
-
-  pos = nltk.pos_tag(list)
-
+def lem(words):
+  pos_tagged = nltk.pos_tag(words)
   lemmaList = []
-  for word,tag in pos:
+  for word, tag in pos_tagged:
     temp = set({word})
     if tag[0] == 'N':
       for ss in wn.synsets(word, pos=wn.NOUN):
         temp.update(set(lemma.name for lemma in ss.lemmas))
-      lemmaList.append(temp)
     elif tag[0] == 'V':
       for ss in wn.synsets(word, pos=wn.VERB):
         temp.update(set(lemma.name for lemma in ss.lemmas))
-      lemmaList.append(temp)
-    else:
-      lemmaList.append({word})
-  return lemmaList
+    lemmaList.append(temp)
+
+  return lemmaList
\ No newline at end of file
diff --git a/modules/questionClassifier.py b/modules/questionClassifier.py
index 377c882..037b63c 100644
--- a/modules/questionClassifier.py
+++ b/modules/questionClassifier.py
@@ -21,7 +21,7 @@ def process(question):
     return "PHRASE"
   elif question.startswith("How many "):
     return "NUMERAL"
-  elif question.startswith(("Is ", "Was ", "Will ", "Are ", "Were ", "Do ", "Does ", "Did ")):
+  elif question.startswith(("Is ", "Was ", "Will ", "Are ", "Were ", "Do ", "Does ", "Did ", "Have ", "Has ")):
     return "BOOLEAN"
   else:
     return "UNKOWN"
\ No newline at end of file
diff --git a/modules/sourceContentSelector.py b/modules/sourceContentSelector.py
index ad22297..e84dd6d 100644
--- a/modules/sourceContentSelector.py
+++ b/modules/sourceContentSelector.py
@@ -16,12 +16,6 @@
 # we should probably change this to the WordNet lemmatizer, but this is ok for now
 ps = PorterStemmer()
 
-# Given a question, returns relevant parts of an article
-def process (question, article):
-  keywords = getKeywords(question)
-  relevant = getRelevantSentences(keywords, article)
-  return relevant
-
 # Given a question, returns a list of keywords
 def getKeywords(question):
   tagged = nltk.tag.pos_tag(question)
@@ -34,29 +28,41 @@ def getKeywords(question):
       result.append(tag[0])
     else:
       result.append(ps.stem(tag[0]))
-  return result
+  return set(result)
 
-def getRelevantSentences(keywords, article):
+# Given a question, returns relevant parts of an article
+def getRelevantSentences(question, article):
   relevant = []
   sentences = nltk.tokenize.sent_tokenize(article)
-  print keywords
   for sent in sentences:
-    sentence_set = set(nltk.tokenize.word_tokenize(sent))
-    sentence_set = map(ps.stem, sentence_set)
-    #print keywords
-    #print sentence_set
-    score = 0
-    for word in keywords:
-      if word in sentence_set:
-        score += 1
-    relevant.append((sent, score))
+    sentence = nltk.tokenize.word_tokenize(sent)
+    sentence = map(ps.stem, sentence)
+    s = score(question, sentence)
+    relevant.append((sent, s))
   return relevant
 
+def score(question, sentence):
+  score = 0
+  score += ngramWeight(question, sentence)
+  keywords = getKeywords(question)
+  score += proximity(keywords, sentence)
+  return score
+
+# measures the proximity of the keywords from the original query to each other
+def proximity(keywords, sentence):
+  length = len(sentence)
+  for i in range(len(keywords), length+1):
+    for j in range(length+1-i):
+      words = set(sentence[j:i+j])
+      if keywords <= words:
+        return 10-i
+  return 0
+
 # compare two sentences using ngrams (upto trigram)
-def ngramWeight(question,sentence):
+def ngramWeight(question, sentence):
   #stem and take set intersections for unigrams
-  uniQ = map(ps.stem, nltk.word_tokenize(question))
-  uniS = map(ps.stem, nltk.word_tokenize(sentence))
+  uniQ = map(ps.stem, question)
+  uniS = sentence
   unigram = set(uniQ).intersection(set(uniS))
 
   #get all bigram overlaps, rolls around end of sentence
@@ -67,7 +73,7 @@ def ngramWeight(question, sentence):
   trigramQ = {uniQ[i-2]+uniQ[i-1]+uniQ[i] for i,word in enumerate(uniQ)}
   trigramS = {uniS[i-2]+uniS[i-1]+uniS[i] for i,word in enumerate(uniS)}
   trigram = trigramQ.intersection(trigramS)
-  
+
   lam1 = 0.2
   lam2 = 0.3
   lam3 = 0.5
@@ -75,7 +81,4 @@
   return lam1*len(unigram) + lam2*len(bigram) + lam3*len(trigram)
 
 if __name__ == '__main__':
-  ngramWeight('I like dolphin now','Sam also likes dolphins now')
-
-
-
+  print proximity(set(["the", "moon", "stroke"]), ["I", "want", "to", "see", "the", "moon", "stroke"])
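
Note (not part of the patch): a minimal sketch of how the reworked scoring path in modules/sourceContentSelector.py is meant to be driven, mirroring the call site in ./answer above. It assumes Python 2 and NLTK with the punkt and POS-tagger models installed; "article.txt" and the question text are placeholders chosen only for illustration, not files or data from this repo.

# Sketch only -- mirrors the call pattern in ./answer after this patch.
# Assumptions: Python 2, NLTK data available, hypothetical "article.txt".
import sys
sys.path.append("modules")

import nltk
import sourceContentSelector

article = open("article.txt").read()
question = nltk.tokenize.word_tokenize("Who walked on the moon?")

# getRelevantSentences tokenizes and stems each article sentence, then scores
# it via score(), i.e. ngramWeight(question, sentence) + proximity(keywords, sentence).
relevant = sourceContentSelector.getRelevantSentences(question, article)
relevant.sort(key=lambda s: s[1], reverse=True)
print relevant[0][0]  # highest-scoring sentence, what ./answer returns for non-boolean questions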