Worked on the scoring function! (and some other small things)
Stephen Bly committed Apr 5, 2013
1 parent 3d3b3c8 commit a0d5906
Showing 5 changed files with 48 additions and 81 deletions.
36 changes: 4 additions & 32 deletions answer
@@ -9,49 +9,30 @@ import re
import itertools
import nltk
from nltk.stem import PorterStemmer

# Import our modules from /modules
sys.path.append("modules")

import questionClassifier
import sourceContentSelector
#from nltk_contrib.coref.resolve import BaselineCorefResolver

#entity_names = []
#
#if hasattr(t, 'node') and t.node:
# if t.node == 'NE':
# entity_names.append(' '.join([child[0] for child in t]))
# else:
# for child in t:
# entity_names.extend(extract_entity_names(child))

def contains_negative(sent):
return "no" in sent or "not" in sent or \
"didn't" in sent or "did not" in sent

#resolver = BaselineCorefResolver()
return "no" in sent or "not" in sent or "n't" in sent

# picks the sentence that has the most keywords in common with the question
def answer(question, article):
question = question.strip()
question_type = questionClassifier.process(question)
question = nltk.tokenize.word_tokenize(question)
relevant = sourceContentSelector.process(question, article)

relevant = sourceContentSelector.getRelevantSentences(question, article)
relevant.sort(key = lambda s: s[1], reverse=True)
top = relevant[0][0]
if question_type == "BOOLEAN":
if contains_negative(top):
return "NO"
else:
return "YES"
if contains_negative(top): return "NO"
else: return "YES"
else:
return top


if __name__ == '__main__':
<<<<<<< HEAD
article_name = sys.argv[1]

for year in ("S08", "S09", "S10"):
@@ -74,15 +55,6 @@ if __name__ == '__main__':
print "Question:", question
print "Difficulty from answerer:", difficulty_answerer
print "Difficulty from questioner:", difficulty_questioner
=======

if(len(sys.argv)) < 3:
print 'Usage: <article path> <questions path>'
sys.exit(1)

path_to_article = sys.argv[1]
path_to_questions = sys.argv[2]
>>>>>>> Usefull printout

# Open the question file and start answering questions.
article = open(prefix+path_to_article+".txt").read()
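As a side note on the tightened negative check, here is a minimal sketch of its behaviour in isolation (the sample sentences are made up). Because the test is a plain substring match on the sentence string, "n't" now covers contractions such as "didn't" or "wasn't"; it also means "no" matches inside words like "know".

def contains_negative(sent):
  # plain substring test on the raw sentence string, as in the diff above
  return "no" in sent or "not" in sent or "n't" in sent

print contains_negative("He didn't reach the summit.")    # True, via "n't"
print contains_negative("Yes, the landing succeeded.")    # False
print contains_negative("As far as we know, it worked.")  # True, "no" inside "know"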
20 changes: 8 additions & 12 deletions modules/extractor.py
@@ -2,40 +2,38 @@
# named entity recognition, currently unused.

import nltk

with open('para.txt', 'r') as f:
sample = f.read()


sample = open("para.txt").read()

sentences = nltk.sent_tokenize(sample)
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=True)


def extract_entity_names(t):
entity_names = []

if hasattr(t, 'node') and t.node:
if t.node == 'NE':
entity_names.append(' '.join([child[0] for child in t]))
else:
for child in t:
entity_names.extend(extract_entity_names(child))

return entity_names

entity_names = []
for tree in chunked_sentences:
# Print results per sentence
# print extract_entity_names(tree)
for node in tree:
print node
entity_names.extend(extract_entity_names(tree))

# Print all entity names
#print entity_names

# Print unique entity names
print set(entity_names)

@@ -45,6 +43,4 @@ def extract_entity_names(t):
for (x,y) in tagged_sentences[0]:
if y in wantset:
entity_names.append(x)


print entity_names
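For a rough sense of what the recursion collects, a hypothetical single-sentence run (assuming the NLTK 2.x tree API used above, where chunk subtrees carry t.node == 'NE', and with extract_entity_names in scope; the sentence is made up and the exact entities depend on the installed tagger and chunker models):

import nltk

tagged = nltk.pos_tag(nltk.word_tokenize("Neil Armstrong walked on the Moon in 1969."))
tree = nltk.ne_chunk(tagged, binary=True)  # single-sentence counterpart of batch_ne_chunk
print extract_entity_names(tree)           # typically something like ['Neil Armstrong', 'Moon']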
16 changes: 6 additions & 10 deletions modules/lemma.py
@@ -4,24 +4,20 @@
import nltk
from nltk.corpus import wordnet as wn

def lem(list):

pos = nltk.pos_tag(list)

def lem(words):
pos_tagged = nltk.pos_tag(words)
lemmaList = []

for word,tag in pos:
for word, tag in pos_tagged:
temp = set({word})

if tag[0] == 'N':
for ss in wn.synsets(word, pos=wn.NOUN):
temp.update(set(lemma.name for lemma in ss.lemmas))
lemmaList.append(temp)
elif tag[0] == 'V':
for ss in wn.synsets(word, pos=wn.VERB):
temp.update(set(lemma.name for lemma in ss.lemmas))
lemmaList.append(temp)
else:
lemmaList.append({word})

return lemmaList
lemmaList.append(temp)

return lemmaList
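For reference, a possible usage of the revised lem(), assuming it is run from the repository root, that the WordNet corpus is installed, and that the NLTK 2.x API this module is written against (Synset.lemmas and Lemma.name as attributes) is available:

import sys
sys.path.append("modules")

import nltk
from lemma import lem

words = nltk.word_tokenize("The crew walked on the moon")
for word, alternatives in zip(words, lem(words)):
  print word, alternatives
# nouns and verbs are expanded with the lemma names of their WordNet synsets,
# e.g. 'walked' yields a set containing 'walk' among others; every other token
# stays as a singleton set such as set(['The'])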
2 changes: 1 addition & 1 deletion modules/questionClassifier.py
@@ -21,7 +21,7 @@ def process(question):
return "PHRASE"
elif question.startswith("How many "):
return "NUMERAL"
elif question.startswith(("Is ", "Was ", "Will ", "Are ", "Were ", "Do ", "Does ", "Did ")):
elif question.startswith(("Is ", "Was ", "Will ", "Are ", "Were ", "Do ", "Does ", "Did ", "Have ", "Has ")):
return "BOOLEAN"
else:
return "UNKOWN"
55 changes: 29 additions & 26 deletions modules/sourceContentSelector.py
@@ -16,12 +16,6 @@
# we should probably change this to the WordNet lemmatizer, but this is ok for now
ps = PorterStemmer()

# Given a question, returns relevant parts of an article
def process (question, article):
keywords = getKeywords(question)
relevant = getRelevantSentences(keywords, article)
return relevant

# Given a question, returns a list of keywords
def getKeywords(question):
tagged = nltk.tag.pos_tag(question)
@@ -34,29 +28,41 @@ def getKeywords(question):
result.append(tag[0])
else:
result.append(ps.stem(tag[0]))
return result
return set(result)

def getRelevantSentences(keywords, article):
# Given a question, returns relevant parts of an article
def getRelevantSentences(question, article):
relevant = []
sentences = nltk.tokenize.sent_tokenize(article)
print keywords
for sent in sentences:
sentence_set = set(nltk.tokenize.word_tokenize(sent))
sentence_set = map(ps.stem, sentence_set)
#print keywords
#print sentence_set
score = 0
for word in keywords:
if word in sentence_set:
score += 1
relevant.append((sent, score))
sentence = nltk.tokenize.word_tokenize(sent)
sentence = map(ps.stem, sentence)
s = score(question, sentence)
relevant.append((sent, s))
return relevant

def score(question, sentence):
score = 0
score += ngramWeight(question, sentence)
keywords = getKeywords(question)
score += proximity(keywords, sentence)
return score

# measures the proximity of the keywords from the original query to each other
def proximity(keywords, sentence):
length = len(sentence)
for i in range(len(keywords), length+1):
for j in range(length+1-i):
words = set(sentence[j:i+j])
if keywords <= words:
return 10-i
return 0

# compare two sentences using ngrams (up to trigram)
def ngramWeight(question,sentence):
def ngramWeight(question, sentence):
#stem and take set intersections for unigrams
uniQ = map(ps.stem, nltk.word_tokenize(question))
uniS = map(ps.stem, nltk.word_tokenize(sentence))
uniQ = map(ps.stem, question)
uniS = sentence
unigram = set(uniQ).intersection(set(uniS))

#get all bigram overlaps, rolls around end of sentence
@@ -67,15 +73,12 @@ def ngramWeight(question,sentence):
trigramQ = {uniQ[i-2]+uniQ[i-1]+uniQ[i] for i,word in enumerate(uniQ)}
trigramS = {uniS[i-2]+uniS[i-1]+uniS[i] for i,word in enumerate(uniS)}
trigram = trigramQ.intersection(trigramS)

lam1 = 0.2
lam2 = 0.3
lam3 = 0.5

return lam1*len(unigram) + lam2*len(bigram) + lam3*len(trigram)

if __name__ == '__main__':
  # quick self-test: ngramWeight now expects a tokenized question and a pre-stemmed token list
  print ngramWeight(nltk.word_tokenize('I like dolphin now'),
                    map(ps.stem, nltk.word_tokenize('Sam also likes dolphins now')))
  print proximity(set(["the", "moon", "stroke"]), ["I", "want", "to", "see", "the", "moon", "stroke"])
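
To make the new scoring path concrete, a small illustrative sketch (the question and sentence are made up, and the exact keyword set depends on the POS filter in getKeywords, so treat the outputs as indicative rather than exact). Note that the proximity self-test above prints 7: the smallest window covering all three keywords has size 3, and 10 - 3 = 7.

question = nltk.word_tokenize("Did Neil Armstrong walk on the moon ?")
sentence = map(ps.stem, nltk.word_tokenize("Neil Armstrong walked on the moon in 1969."))

print ngramWeight(question, sentence)             # 0.2*unigram + 0.3*bigram + 0.5*trigram overlap counts
print proximity(getKeywords(question), sentence)  # 10 minus the smallest window holding every keyword, else 0
print score(question, sentence)                   # sum of the two components, as used by getRelevantSentences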
