
Commit 888f605: please don't break

Stephen Bly committed Apr 12, 2013
1 parent eb8156a commit 888f605
Showing 10 changed files with 1,543 additions and 32 deletions.
498 changes: 498 additions & 0 deletions Question_Answer_Dataset_v1.1/S08/data/set1/a5.osent

Large diffs are not rendered by default.

498 changes: 498 additions & 0 deletions Question_Answer_Dataset_v1.1/S08/data/set1/a5.parse

Large diffs are not rendered by default.

498 changes: 498 additions & 0 deletions Question_Answer_Dataset_v1.1/S08/data/set1/a5.sst

Large diffs are not rendered by default.

28 changes: 21 additions & 7 deletions answer
@@ -3,40 +3,54 @@
# answer
# 11-411 NLP Spring 2013, Group 6

# To run, type ./answer article_name
# e.g. ./answer

# Useful tools which should be pre-installed
import os, sys, errno
import subprocess
import re
import itertools
import nltk
from nltk.stem import PorterStemmer
import xml.etree.ElementTree as ET
# Import our modules from /modules
sys.path.append("modules")
import questionClassifier
import sourceContentSelector

# To answer a yes/no question, we want to just answer yes or no,
# and not return a whole sentence. We do this by checking for
# any negatives in the sentence.
def contains_negative(sent):
return "no" in sent or "not" in sent or "n't" in sent

# the set of pronouns, used for anaphora resolution
pronouns = set(["he", "she", "it", "him", "her", "his"])

# Runs coreference resolution on the article using arkref.
# This still needs to be implemented.
def coref(path_to_article):
#subprocess.call(["./arkref.sh", "-input", path_to_article])
subprocess.call(["./arkref.sh", "-input", path_to_article])
path_to_article = path_to_article.replace("txt", "tagged")
return open(path_to_article).read()

# picks the sentence that has the most keywords in common with the question
# Answers a question from the information in article.
# Ranks all the sentences and then returns the top choice.
def answer(question, article):
question = question.strip()
question_type = questionClassifier.process(question)
question = nltk.tokenize.word_tokenize(question)
relevant = sourceContentSelector.getRelevantSentences(question, article)
relevant.sort(key = lambda s: s[1], reverse=True)
top = relevant[0][0]
relevant = sourceContentSelector.getScoredSentences(question, article)
top = max(relevant, key = lambda s: s[1])
if question_type == "BOOLEAN":
if contains_negative(top): return "NO"
else: return "YES"
if contains_negative(top): return "No"
else: return "Yes"
else:
return top


# The main script
if __name__ == '__main__':
article_name = sys.argv[1]

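For illustration, here is a minimal, self-contained sketch of the new control flow in answer(): score every candidate sentence, then take the argmax. toy_answer, overlap, and the sample article are hypothetical stand-ins; the real pipeline scores sentences with sourceContentSelector.getScoredSentences.

# Toy illustration of the score-all-then-take-max pattern now used in answer().
# toy_answer and overlap are hypothetical stubs, not functions from this repo.
def toy_answer(question, article, score):
    sentences = article.split(". ")
    scored = [(s, score(question, s)) for s in sentences]   # like getScoredSentences
    return max(scored, key=lambda pair: pair[1])[0]         # highest-scoring sentence

def overlap(question, sentence):
    # Crude stand-in scorer: count shared lowercase tokens.
    return len(set(question.lower().split()) & set(sentence.lower().split()))

article = "The Moon orbits the Earth. The Earth orbits the Sun"
print(toy_answer("What does the Moon orbit?", article, overlap))  # The Moon orbits the Earth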
7 changes: 1 addition & 6 deletions ask
@@ -24,13 +24,8 @@ if __name__ == '__main__':

# Should probably apply co-reference resolution to article content here.

# Decide how many candidates we want to generate
# I'm thinking we should always generate as many questions as possible
# and just pick the n best
num_cand = num_questions*20

# Fetch sentence candidates that can be converted into questions.
selected_content = questionContentSelector.process(article_content, num_cand)
selected_content = questionContentSelector.process(article_content)

# Use POS Tagging and Transformation rules to generate questions
questions = questionFromSentence.process(selected_content)
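The removed comment above argues for over-generating candidates and keeping only the n best afterwards. A minimal sketch of that idea, with a hypothetical rank function that is not part of this repository:

# Over-generate question candidates, then keep only the n best by a ranking score.
def pick_best(candidates, rank, n):
    return sorted(candidates, key=rank, reverse=True)[:n]

candidates = ["What?", "Is Pittsburgh a city?", "Who founded the university in 1900?"]
rank = lambda q: len(q.split())   # hypothetical scorer: prefer longer questions
print(pick_best(candidates, rank, 2))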
5 changes: 3 additions & 2 deletions modules/lemma.py
@@ -1,5 +1,6 @@
# takes in a word list and returns a list of lists
# each nested list is a list of lemmas for each word
# Takes in a word list (i.e. a sentence) and returns a list of sets.
# That is, each word in the original list is replaced by the set of
# its lemmas, from WordNet.

import nltk
from nltk.corpus import wordnet as wn
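For context, a rough sketch of the behaviour the new comment describes (each word replaced by the set of its WordNet lemmas), assuming a modern NLTK with the WordNet corpus downloaded. lemma_sets is a hypothetical name, not necessarily the module's actual interface.

from nltk.corpus import wordnet as wn

def lemma_sets(words):
    # Replace each word with the set of lemma names drawn from all of its
    # WordNet synsets; fall back to the word itself when WordNet has nothing.
    result = []
    for w in words:
        lemmas = set()
        for syn in wn.synsets(w):
            lemmas.update(syn.lemma_names())
        result.append(lemmas or set([w]))
    return result

print(lemma_sets(["dogs", "ran", "the"]))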
Empty file removed modules/naiveCoref.py
Empty file.
4 changes: 3 additions & 1 deletion modules/questionClassifier.py
@@ -8,6 +8,8 @@
import itertools
import nltk

# Returns what type of answer the question is looking for.
# Turned out not to be too useful.
def process(question):
if question.startswith("Who "):
return "PERSON"
@@ -21,7 +23,7 @@ def process(question):
return "PHRASE"
elif question.startswith("How many "):
return "NUMERAL"
elif question.startswith(("Is ", "Was ", "Will ", "Are ", "Were ", "Do ", "Does ", "Did ", "Have ", "Has ")):
elif question.startswith(("Is ", "Was ", "Will ", "Are ", "Were ", "Do ", "Does ", "Did ", "Have ", "Has ", "Can ")):
return "BOOLEAN"
else:
return "UNKOWN"
11 changes: 5 additions & 6 deletions modules/questionContentSelector.py
@@ -8,7 +8,7 @@
import re
import nltk

# Use part-of-speech tagging to
# score the usefulness of a sentence.
def entity_score(sentence):
tokens = nltk.word_tokenize(sentence)
@@ -17,11 +17,10 @@ def entity_score(sentence):
if ("IS" in tokensU or "WAS" in tokensU or
"WERE" in tokensU or "BEING" in tokensU or
"ARE" in tokensU):

if (nltk.pos_tag([tokens[0]])[0][1] == "PRP"):
return 1.0
else:
return 0.5
return 0.5

#tagged = nltk.pos_tag(tokens)
# entities = nltk.chunk.ne_chunk(tagged)
@@ -37,7 +36,7 @@ def naive_score(sentence):
not weird, # Avoid weird characters
"It is" in sentence, # Look for "It is ..."
" is " in sentence, # Look for "[foo] is [bar]"
4 < word_count < 12,
5 < word_count < 7
]
return float(sum(features))/len(features)
@@ -48,7 +47,7 @@ def sentence_score(sentence):
# GIVEN source_text string and
# GIVEN n integer representing number of candidates to return,
# RETURNS list of candidate strings
def process(source_text, n):
def process(source_text):
sentences = nltk.sent_tokenize(source_text)
sentences = sorted(sentences, key = lambda (x): -sentence_score(x))
return sentences[:int(n)]
return sentences
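To make the feature-averaging idea in naive_score concrete, here is a toy re-computation; the "weird characters" check below is an assumed stand-in, since the real test is outside this hunk.

def toy_naive_score(sentence):
    word_count = len(sentence.split())
    # Assumed stand-in for the repository's "weird characters" test.
    weird = any(not (c.isalnum() or c in " ,.'-") for c in sentence)
    features = [
        not weird,               # avoid weird characters
        "It is" in sentence,     # look for "It is ..."
        " is " in sentence,      # look for "[foo] is [bar]"
        4 < word_count < 12,
        5 < word_count < 7,
    ]
    return float(sum(features)) / len(features)

print(toy_naive_score("It is a city in Pennsylvania."))  # all five features hold -> 1.0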
26 changes: 16 additions & 10 deletions modules/sourceContentSelector.py
@@ -6,10 +6,11 @@
import sys, os, string, re
import nltk
from nltk.stem import PorterStemmer
sys.path.append("modules")
import lemma

# parts of speech of any "important" word
# Any keywords in the sentence will be these parts of speech
key_POS = set(["CD","FW","NN","NNS","NNP","NNPS","VB","VBD","VBG","VBN","VBP","VBZ"])

# auxiliary verbs we should ignore
aux = set(["is", "was", "did", "does", "do", "were", "are"])

@@ -30,35 +31,39 @@ def getKeywords(question):
result.append(ps.stem(tag[0]))
return set(result)

# Given a question, returns relevant parts of an article
def getRelevantSentences(question, article):
relevant = []
# Given a question, return a list of each sentence in the article
# with a score attached to it
def getScoredSentences(question, article):
scored_sent = []
sentences = nltk.tokenize.sent_tokenize(article)
for sent in sentences:
sentence = nltk.tokenize.word_tokenize(sent)
sentence = map(ps.stem, sentence)
s = score(question, sentence)
relevant.append((sent, s))
return relevant
scored_sent.append((sent, s))
return scored_sent

# Scores a sentence based on how well we think it answers the question
def score(question, sentence):
score = 0
score += ngramWeight(question, sentence)
keywords = getKeywords(question)
score += proximity(keywords, sentence)
return score

# measures the proximity of the keywords from the original query to each other
# Finds the shortest window in the target sentence
# in which all keywords appear, and assigns a score.
def proximity(keywords, sentence):
length = len(sentence)
for i in range(len(keywords), length+1):
for j in range(length+1-i):
words = set(sentence[j:i+j])
if keywords <= words:
return 10-i
return max(20-i, 0)
return 0

# compare two sentences using ngrams (upto trigram)
# Compare the overlap of two sentences using ngrams
# (up to trigrams). This is similar to the BLEU score.
def ngramWeight(question, sentence):
#stem and take set intersections for unigrams
uniQ = map(ps.stem, question)
@@ -88,5 +93,6 @@ def ngramWeight(question, sentence):

return lam1*len(unigram) + lam2*len(bigram) + lam3*len(trigram)

# for testing
if __name__ == '__main__':
print proximity(set(["the", "moon", "stroke"]), ["I", "want", "to", "see", "the", "moon", "stroke"])
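Worked example for the test call above: the keywords {"the", "moon", "stroke"} all fit in a window of three tokens at the end of the sentence, so the loop stops at window size 3 and, with the new scoring line, returns max(20 - 3, 0) = 17. Below is a self-contained sketch of the same shortest-window idea plus a BLEU-flavoured n-gram overlap in the spirit of ngramWeight; proximity_sketch, ngram_overlap_sketch and the weights are hypothetical, since the actual lam1, lam2, lam3 values are outside this hunk.

def proximity_sketch(keywords, sentence):
    # Grow the window size from len(keywords) up to the whole sentence and
    # score higher when a small window already covers every keyword.
    for size in range(len(keywords), len(sentence) + 1):
        for start in range(len(sentence) - size + 1):
            if keywords <= set(sentence[start:start + size]):
                return max(20 - size, 0)
    return 0

def ngram_overlap_sketch(question, sentence, weights=(0.5, 0.3, 0.2)):
    # Weighted count of shared unigrams, bigrams and trigrams (weights assumed).
    def grams(tokens, n):
        return set(tuple(tokens[k:k + n]) for k in range(len(tokens) - n + 1))
    return sum(w * len(grams(question, n) & grams(sentence, n))
               for n, w in zip((1, 2, 3), weights))

print(proximity_sketch(set(["the", "moon", "stroke"]),
                       ["I", "want", "to", "see", "the", "moon", "stroke"]))  # 17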
