From aeba6302f60d27a8b9e65ad28d2d74e1276c7cd6 Mon Sep 17 00:00:00 2001
From: Daniel Sedra
Date: Mon, 15 Apr 2013 23:45:25 -0400
Subject: [PATCH] ready to try submitting

---
 modules/tdIDF.py | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/modules/tdIDF.py b/modules/tdIDF.py
index e26d997..5a9eedf 100644
--- a/modules/tdIDF.py
+++ b/modules/tdIDF.py
@@ -5,7 +5,8 @@
 # constructs a vector, for each word in question the tf-idf score with article
 def main(question, article):
 
-    ddict = get_counts()
+    ddict = {}
+    counts = get_counts()
 
     for tok in nltk.word_tokenize(article):
         ddict[tok] = ddict.get(tok, 0) + 1
@@ -16,21 +17,27 @@ def main(question, article):
         tf = ddict.get(tok, 0)
         # total articles is 108 / number that have current token
-        idf = math.log(float(108)/len(filter(lambda x:tok in x.keys(),dicts)) + 1)
+        idf = math.log(float(108)/len(filter(lambda x:tok in x.keys(),counts)) + 1)
 
         vec.append(tf*idf)
 
     largest = max(vec)
     normalized = map(lambda y: y/largest, vec)
-    return normalized
+
+    finDic = {}
+    for i, word in enumerate(nltk.word_tokenize(question)):
+        finDic[word] = normalized[i]
+
+    print finDic
+    return finDic
 
 articles_per_set = 9
 
 # goes through sample wiki articles and gets word counts
 def get_counts():
-    counts = {}
+    counts = []
     sets_per_year = 4
     for year in ("S08", "S09", "S10"):
-        for i in xrange(1, num_sets):
+        for i in xrange(1, sets_per_year):
             for j in xrange(1, articles_per_set+1):
                 path = "../Question_Answer_Dataset_v1.1/"+year+"/data/set"+str(i)+"/a"+str(j)+".txt"
                 cfile = open(path).read()
@@ -39,4 +46,7 @@ def get_counts():
                 partic[tok] = partic.get(tok, 0) + 1
             counts.append(partic)
         sets_per_year += 1
-    return counts
\ No newline at end of file
+    return counts
+
+
+main('and kangaroo','and and kangaroo')
\ No newline at end of file
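
Note for reviewers: the score the patch builds is plain tf-idf with a smoothed
inverse document frequency, idf = log(108/df + 1), where df is the number of the
108 corpus articles whose count dict contains the token, and tf is the token's
raw count in the target article. Below is a minimal standalone sketch of that
scoring; the helper name tf_idf and the toy corpus are illustrative only, not
part of the repository, and like the patch it assumes every scored token occurs
in at least one corpus article (df == 0 would divide by zero).

import math

TOTAL_ARTICLES = 108  # hard-coded corpus size, matching the patch

def tf_idf(tok, article_counts, corpus_counts):
    # tf: raw count of tok in the target article's word-count dict
    tf = article_counts.get(tok, 0)
    # df: number of corpus articles whose count dict contains tok
    df = len([doc for doc in corpus_counts if tok in doc])
    # smoothed idf, the same formula main() uses; df == 0 would raise
    idf = math.log(float(TOTAL_ARTICLES) / df + 1)
    return tf * idf

# toy stand-in for get_counts() output (the real corpus has 108 articles)
corpus = [{'kangaroo': 3, 'and': 10}, {'and': 7}, {'kangaroo': 1}]
article = {'and': 2, 'kangaroo': 1}
print(tf_idf('kangaroo', article, corpus))  # 1 * log(108/2 + 1) ~= 4.01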