From aeba6302f60d27a8b9e65ad28d2d74e1276c7cd6 Mon Sep 17 00:00:00 2001
From: Daniel Sedra
Date: Mon, 15 Apr 2013 23:45:25 -0400
Subject: [PATCH] ready to try submitting

---
 modules/tdIDF.py | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/modules/tdIDF.py b/modules/tdIDF.py
index e26d997..5a9eedf 100644
--- a/modules/tdIDF.py
+++ b/modules/tdIDF.py
@@ -5,7 +5,8 @@
 # constructs a vector, for each word in question the tf-idf score with article
 def main(question, article):
 
-    ddict = get_counts()
+    ddict = {}
+    counts = get_counts()
 
     for tok in nltk.word_tokenize(article):
         ddict[tok] = ddict.get(tok, 0) + 1
@@ -16,21 +17,27 @@ def main(question, article):
         tf = ddict.get(tok, 0)
         # total articles is 108 / number that have current token
-        idf = math.log(float(108)/len(filter(lambda x:tok in x.keys(),dicts)) + 1)
+        idf = math.log(float(108)/len(filter(lambda x:tok in x.keys(),counts)) + 1)
 
         vec.append(tf*idf)
 
     largest = max(vec)
     normalized = map(lambda y: y/largest, vec)
-    return normalized
+
+    finDic = {}
+    for i, word in enumerate(nltk.word_tokenize(question)):
+        finDic[word] = normalized[i]
+
+    print finDic
+    return finDic
 
 articles_per_set = 9
 
 # goes through sample wiki articles and gets word counts
 def get_counts():
-    counts = {}
+    counts = []
     sets_per_year = 4
     for year in ("S08", "S09", "S10"):
-        for i in xrange(1, num_sets):
+        for i in xrange(1, sets_per_year):
             for j in xrange(1, articles_per_set+1):
                 path = "../Question_Answer_Dataset_v1.1/"+year+"/data/set"+str(i)+"/a"+str(j)+".txt"
                 cfile = open(path).read()
@@ -39,4 +46,7 @@ def get_counts():
                 partic[tok] = partic.get(tok, 0) + 1
             counts.append(partic)
         sets_per_year += 1
-    return counts
\ No newline at end of file
+    return counts
+
+
+main('and kangaroo','and and kangaroo')
\ No newline at end of file
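
Note for reviewers: the score the patch builds is plain tf-idf with a smoothed
inverse document frequency, idf = log(108/df + 1), where df is the number of the
108 corpus articles whose count dict contains the token, and tf is the token's
raw count in the target article. Below is a minimal standalone sketch of that
scoring; the helper name tf_idf and the toy corpus are illustrative only, not
part of the repository, and like the patch it assumes every scored token occurs
in at least one corpus article (df == 0 would divide by zero).

import math

TOTAL_ARTICLES = 108  # hard-coded corpus size, matching the patch

def tf_idf(tok, article_counts, corpus_counts):
    # tf: raw count of tok in the target article's word-count dict
    tf = article_counts.get(tok, 0)
    # df: number of corpus articles whose count dict contains tok
    df = len([doc for doc in corpus_counts if tok in doc])
    # smoothed idf, the same formula main() uses; df == 0 would raise
    idf = math.log(float(TOTAL_ARTICLES) / df + 1)
    return tf * idf

# toy stand-in for get_counts() output (the real corpus has 108 articles)
corpus = [{'kangaroo': 3, 'and': 10}, {'and': 7}, {'kangaroo': 1}]
article = {'and': 2, 'kangaroo': 1}
print(tf_idf('kangaroo', article, corpus))  # 1 * log(108/2 + 1) ~= 4.01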