
Commit 888f605: please don't break

Stephen Bly committed Apr 12, 2013
1 parent eb8156a commit 888f605
Showing 10 changed files with 1,543 additions and 32 deletions.
498 changes: 498 additions & 0 deletions Question_Answer_Dataset_v1.1/S08/data/set1/a5.osent

Large diffs are not rendered by default.

498 changes: 498 additions & 0 deletions Question_Answer_Dataset_v1.1/S08/data/set1/a5.parse

Large diffs are not rendered by default.

498 changes: 498 additions & 0 deletions Question_Answer_Dataset_v1.1/S08/data/set1/a5.sst

Large diffs are not rendered by default.

28 changes: 21 additions & 7 deletions answer
@@ -3,40 +3,54 @@
# answer
# 11-411 NLP Spring 2013, Group 6

# To run, type ./answer article_name
# e.g. ./answer

# Useful tools which should be pre-installed
import os, sys, errno
import subprocess
import re
import itertools
import nltk
from nltk.stem import PorterStemmer
import xml.etree.ElementTree as ET
# Import our modules from /modules
sys.path.append("modules")
import questionClassifier
import sourceContentSelector

# To answer a yes/no question, we want to just answer yes or no,
# and not return a whole sentence. We do this by checking for
# any negatives in the sentence.
def contains_negative(sent):
return "no" in sent or "not" in sent or "n't" in sent

# the set of pronouns, used for anaphora resolution
pronouns = set(["he", "she", "it", "him", "her", "his"])

# Runs coreference resolution on the article using arkref.
# This still needs to be implemented.
def coref(path_to_article):
#subprocess.call(["./arkref.sh", "-input", path_to_article])
subprocess.call(["./arkref.sh", "-input", path_to_article])
path_to_article = path_to_article.replace("txt", "tagged")
return open(path_to_article).read()

# picks the sentence that has the most keywords in common with the question
# Answers a question from the information in article.
# Ranks all the sentences and then returns the top choice.
def answer(question, article):
question = question.strip()
question_type = questionClassifier.process(question)
question = nltk.tokenize.word_tokenize(question)
relevant = sourceContentSelector.getRelevantSentences(question, article)
relevant.sort(key = lambda s: s[1], reverse=True)
top = relevant[0][0]
relevant = sourceContentSelector.getScoredSentences(question, article)
top = max(relevant, key = lambda s: s[1])
if question_type == "BOOLEAN":
if contains_negative(top): return "NO"
else: return "YES"
if contains_negative(top): return "No"
else: return "Yes"
else:
return top


# The main script
if __name__ == '__main__':
article_name = sys.argv[1]

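For illustration, here is a minimal, self-contained sketch of the new control flow in answer(): score every candidate sentence, then take the argmax. toy_answer, overlap, and the sample article are hypothetical stand-ins; the real pipeline scores sentences with sourceContentSelector.getScoredSentences.

# Toy illustration of the score-all-then-take-max pattern now used in answer().
# toy_answer and overlap are hypothetical stubs, not functions from this repo.
def toy_answer(question, article, score):
    sentences = article.split(". ")
    scored = [(s, score(question, s)) for s in sentences]   # like getScoredSentences
    return max(scored, key=lambda pair: pair[1])[0]         # highest-scoring sentence

def overlap(question, sentence):
    # Crude stand-in scorer: count shared lowercase tokens.
    return len(set(question.lower().split()) & set(sentence.lower().split()))

article = "The Moon orbits the Earth. The Earth orbits the Sun"
print(toy_answer("What does the Moon orbit?", article, overlap))  # The Moon orbits the Earth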
7 changes: 1 addition & 6 deletions ask
@@ -24,13 +24,8 @@ if __name__ == '__main__':

# Should probably apply co-reference resolution to article content here.

# Decide how many candidates we want to generate
# I'm thinking we should always generate as many questions as possible
# and just pick the n best
num_cand = num_questions*20

# Fetch sentence candidates that can be converted into questions.
selected_content = questionContentSelector.process(article_content, num_cand)
selected_content = questionContentSelector.process(article_content)

# Use POS Tagging and Transformation rules to generate questions
questions = questionFromSentence.process(selected_content)
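The removed comment above argues for over-generating candidates and keeping only the n best afterwards. A minimal sketch of that idea, with a hypothetical rank function that is not part of this repository:

# Over-generate question candidates, then keep only the n best by a ranking score.
def pick_best(candidates, rank, n):
    return sorted(candidates, key=rank, reverse=True)[:n]

candidates = ["What?", "Is Pittsburgh a city?", "Who founded the university in 1900?"]
rank = lambda q: len(q.split())   # hypothetical scorer: prefer longer questions
print(pick_best(candidates, rank, 2))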
5 changes: 3 additions & 2 deletions modules/lemma.py
@@ -1,5 +1,6 @@
# takes in a word list and returns a list of lists
# each nested list is a list of lemmas for each word
# Takes in a word list (i.e. a sentence) and returns a list of sets.
# That is, each word in the original list is replaced by the set of
# its lemmas, from WordNet.

import nltk
from nltk.corpus import wordnet as wn
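For context, a rough sketch of the behaviour the new comment describes (each word replaced by the set of its WordNet lemmas), assuming a modern NLTK with the WordNet corpus downloaded. lemma_sets is a hypothetical name, not necessarily the module's actual interface.

from nltk.corpus import wordnet as wn

def lemma_sets(words):
    # Replace each word with the set of lemma names drawn from all of its
    # WordNet synsets; fall back to the word itself when WordNet has nothing.
    result = []
    for w in words:
        lemmas = set()
        for syn in wn.synsets(w):
            lemmas.update(syn.lemma_names())
        result.append(lemmas or set([w]))
    return result

print(lemma_sets(["dogs", "ran", "the"]))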
Empty file removed modules/naiveCoref.py
Empty file.
4 changes: 3 additions & 1 deletion modules/questionClassifier.py
@@ -8,6 +8,8 @@
import itertools
import nltk

# Returns what type of answer the question is looking for.
# Turned out not to be too useful.
def process(question):
if question.startswith("Who "):
return "PERSON"
@@ -21,7 +23,7 @@ def process(question):
return "PHRASE"
elif question.startswith("How many "):
return "NUMERAL"
elif question.startswith(("Is ", "Was ", "Will ", "Are ", "Were ", "Do ", "Does ", "Did ", "Have ", "Has ")):
elif question.startswith(("Is ", "Was ", "Will ", "Are ", "Were ", "Do ", "Does ", "Did ", "Have ", "Has ", "Can ")):
return "BOOLEAN"
else:
return "UNKOWN"
11 changes: 5 additions & 6 deletions modules/questionContentSelector.py
@@ -8,7 +8,7 @@
import re
import nltk

# Use part-of-speech tagging to
# score the usefulness of a sentence.
def entity_score(sentence):
tokens = nltk.word_tokenize(sentence)
@@ -17,11 +17,10 @@ def entity_score(sentence):
if ("IS" in tokensU or "WAS" in tokensU or
"WERE" in tokensU or "BEING" in tokensU or
"ARE" in tokensU):

if (nltk.pos_tag([tokens[0]])[0][1] == "PRP"):
return 1.0
else:
return 0.5
return 0.5

#tagged = nltk.pos_tag(tokens)
# entities = nltk.chunk.ne_chunk(tagged)
@@ -37,7 +36,7 @@ def naive_score(sentence):
not weird, # Avoid weird characters
"It is" in sentence, # Look for "It is ..."
" is " in sentence, # Look for "[foo] is [bar]"
4 < word_count < 12,
5 < word_count < 7
]
return float(sum(features))/len(features)
@@ -48,7 +47,7 @@ def sentence_score(sentence):
# GIVEN source_text string and
# GIVEN n integer representing number of candidates to return,
# RETURNS list of candidate strings
def process(source_text, n):
def process(source_text):
sentences = nltk.sent_tokenize(source_text)
sentences = sorted(sentences, key = lambda (x): -sentence_score(x))
return sentences[:int(n)]
return sentences
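To make the feature-averaging idea in naive_score concrete, here is a toy re-computation; the "weird characters" check below is an assumed stand-in, since the real test is outside this hunk.

def toy_naive_score(sentence):
    word_count = len(sentence.split())
    # Assumed stand-in for the repository's "weird characters" test.
    weird = any(not (c.isalnum() or c in " ,.'-") for c in sentence)
    features = [
        not weird,               # avoid weird characters
        "It is" in sentence,     # look for "It is ..."
        " is " in sentence,      # look for "[foo] is [bar]"
        4 < word_count < 12,
        5 < word_count < 7,
    ]
    return float(sum(features)) / len(features)

print(toy_naive_score("It is a city in Pennsylvania."))  # all five features hold -> 1.0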
26 changes: 16 additions & 10 deletions modules/sourceContentSelector.py
@@ -6,10 +6,11 @@
import sys, os, string, re
import nltk
from nltk.stem import PorterStemmer
sys.path.append("modules")
import lemma

# parts of speech of any "important" word
# Any keywords in the sentence will be these parts of speech
key_POS = set(["CD","FW","NN","NNS","NNP","NNPS","VB","VBD","VBG","VBN","VBP","VBZ"])

# auxiliary verbs we should ignore
aux = set(["is", "was", "did", "does", "do", "were", "are"])

@@ -30,35 +31,39 @@ def getKeywords(question):
result.append(ps.stem(tag[0]))
return set(result)

# Given a question, returns relevant parts of an article
def getRelevantSentences(question, article):
relevant = []
# Given a question, return a list of each sentence in the article
# with a score attached to it
def getScoredSentences(question, article):
scored_sent = []
sentences = nltk.tokenize.sent_tokenize(article)
for sent in sentences:
sentence = nltk.tokenize.word_tokenize(sent)
sentence = map(ps.stem, sentence)
s = score(question, sentence)
relevant.append((sent, s))
return relevant
scored_sent.append((sent, s))
return scored_sent

# Scores a sentence based on how well we think it answers the question
def score(question, sentence):
score = 0
score += ngramWeight(question, sentence)
keywords = getKeywords(question)
score += proximity(keywords, sentence)
return score

# measures the proximity of the keywords from the original query to each other
# Finds the shortest window in the target sentence
# in which all keywords appear, and assigns a score.
def proximity(keywords, sentence):
length = len(sentence)
for i in range(len(keywords), length+1):
for j in range(length+1-i):
words = set(sentence[j:i+j])
if keywords <= words:
return 10-i
return max(20-i, 0)
return 0

# compare two sentences using ngrams (upto trigram)
# Compare the overlap of two sentences using ngrams
# (up to trigrams). This is similar to the BLEU score.
def ngramWeight(question, sentence):
#stem and take set intersections for unigrams
uniQ = map(ps.stem, question)
@@ -88,5 +93,6 @@ def ngramWeight(question, sentence):

return lam1*len(unigram) + lam2*len(bigram) + lam3*len(trigram)

# for testing
if __name__ == '__main__':
print proximity(set(["the", "moon", "stroke"]), ["I", "want", "to", "see", "the", "moon", "stroke"])
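Worked example for the test call above: the keywords {"the", "moon", "stroke"} all fit in a window of three tokens at the end of the sentence, so the loop stops at window size 3 and, with the new scoring line, returns max(20 - 3, 0) = 17. Below is a self-contained sketch of the same shortest-window idea plus a BLEU-flavoured n-gram overlap in the spirit of ngramWeight; proximity_sketch, ngram_overlap_sketch and the weights are hypothetical, since the actual lam1, lam2, lam3 values are outside this hunk.

def proximity_sketch(keywords, sentence):
    # Grow the window size from len(keywords) up to the whole sentence and
    # score higher when a small window already covers every keyword.
    for size in range(len(keywords), len(sentence) + 1):
        for start in range(len(sentence) - size + 1):
            if keywords <= set(sentence[start:start + size]):
                return max(20 - size, 0)
    return 0

def ngram_overlap_sketch(question, sentence, weights=(0.5, 0.3, 0.2)):
    # Weighted count of shared unigrams, bigrams and trigrams (weights assumed).
    def grams(tokens, n):
        return set(tuple(tokens[k:k + n]) for k in range(len(tokens) - n + 1))
    return sum(w * len(grams(question, n) & grams(sentence, n))
               for n, w in zip((1, 2, 3), weights))

print(proximity_sketch(set(["the", "moon", "stroke"]),
                       ["I", "want", "to", "see", "the", "moon", "stroke"]))  # 17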
