Now just type in the name of the articles, and it will answer all questions for that article from all years.
ryhan · Mar 28, 2013 · efebb56 · efebb56
1 parent 741672d
commit efebb56
Show file tree

Hide file tree

Showing 13 changed files with 30 additions and 2,046 deletions.
diff --git a/answer b/answer
@@ -15,7 +15,7 @@ sys.path.append("modules")
 
 import questionClassifier
 import sourceContentSelector
-from nltk_contrib.coref.resolve import BaselineCorefResolver
+#from nltk_contrib.coref.resolve import BaselineCorefResolver
 
 #entity_names = []
 #
@@ -30,7 +30,7 @@ def contains_negative(sent):
   return "no" in sent or "not" in sent or \
   "didn't" in sent or "did not" in sent
 
-resolver = BaselineCorefResolver()
+#resolver = BaselineCorefResolver()
 
 # picks the sentence that has the most keywords in common with the question
 def answer(question, article):
@@ -51,14 +51,32 @@ def answer(question, article):
 
 
 if __name__ == '__main__':
-  path_to_article = sys.argv[1]
-  path_to_questions = sys.argv[2]
+  article_name = sys.argv[1]
 
-  # Pre-process article content.
-  article = open(path_to_article).read()
+  for year in ("S08", "S09", "S10"):
+    print "Year:", year
+    prefix = "Question_Answer_Dataset_v1.1/"+year+"/"
+    question_answer_pairs = open(prefix+"question_answer_pairs.txt").readlines()
+    question_answer_pairs.pop(0)
+    for line in question_answer_pairs:
+      if not line.startswith(article_name): continue
+      line = line.lstrip(article_name)
+      end = line.find("?")
+      if end == -1: continue
+      question = line[:end+1].strip()
+      line = line[end+1:].split()
+      path_to_article = line.pop()
+      difficulty_answerer = line.pop()
+      difficulty_questioner = line.pop()
+      correct_answer = " ".join(line)
 
+      print "Question:", question
+      print "Difficulty from answerer:", difficulty_answerer
+      print "Difficulty from questioner:", difficulty_questioner
 
-  # Open the question file and start answering questions.
-  for question in open(path_to_questions):
-    print question
-    print answer(question, article)
+      # Open the question file and start answering questions.
+      article = open(prefix+path_to_article+".txt").read()
+      print "Our answer:", answer(question, article)
+      print "Correct answer:", correct_answer
+
+    print
diff --git a/data/set1/a6.txt → data/David_Beckham.txt b/data/set1/a6.txt → data/David_Beckham.txt
diff --git a/data/set3/a6.txt → data/Latin.txt b/data/set3/a6.txt → data/Latin.txt
diff --git a/data/set5/a6.txt → data/Lisp.txt b/data/set5/a6.txt → data/Lisp.txt
diff --git a/data/set2/a6.txt → data/Orion.txt b/data/set2/a6.txt → data/Orion.txt
diff --git a/data/README.md b/data/README.md
@@ -2,13 +2,5 @@
 
 Saved from https://www.ark.cs.cmu.edu/NLP/S13/data/
 
-Within each `/set[n]` is a
-- `a6.html` containing the html of a wikipedia article, and
-- `a6.txt` containing the plaintext version of a wikipedia article.
-
-The titles of the articles are
-- David Beckham (Set 1)
-- Orion, constellation (Set 2)
-- Latin (Set 3)
-- Star Wars Episode IV: A New Hope (Set 4)
-- Lisp, programming language (Set 5)
+Articles from wikipedia
+Some have corresponding questions
diff --git a/data/set4/a6.txt → data/Star_Wars_Episode_IV.txt b/data/set4/a6.txt → data/Star_Wars_Episode_IV.txt
diff --git a/data/set1/a6.html b/data/set1/a6.html
diff --git a/data/set2/a6.html b/data/set2/a6.html
diff --git a/data/set3/a6.html b/data/set3/a6.html
diff --git a/data/set4/a6.html b/data/set4/a6.html
diff --git a/data/set5/a6.html b/data/set5/a6.html
diff --git a/output b/output