diff --git a/modules/tdIDF.py b/modules/tdIDF.py index f95a31e..21a490c 100644 --- a/modules/tdIDF.py +++ b/modules/tdIDF.py @@ -1,34 +1,36 @@ import nltk import math -from nltk.tokenize import word_tokenize dicts = [] -def main(question,article): - files() - ddict = {} - vec = [] +def main(question, article): + ddict = get_counts() for tok in nltk.word_tokenize(article): - ddict[tok] = ddict.get(tok,0) + 1 + ddict[tok] = ddict.get(tok, 0) + 1 + vec = [] for tok in nltk.word_tokenize(question): - tf = ddict.get(tok,0) + tf = ddict.get(tok, 0) idf = math.log(float(108)/len(filter(lambda x:tok in x.keys(),dicts)) + 1) vec.append(tf*idf) largest = max(vec) - print map(lambda y: y/largest,vec) + normalized = map(lambda y: y/largest, vec) + return normalized -def files(): - for year in ("S08", "S09", "S10"): - for i in xrange(1,5): - for j in xrange(1,10): - partic = {} - path = "../Question_Answer_Dataset_v1.1/"+year+"/data/set'+str(i)+'/a'+str(j)+'.txt" - cfile = open(path).read() +articles_per_set = 9 - for tok in nltk.word_tokenize(cfile): - partic[tok] = partic.get(tok,0) + 1 - dicts.append(partic) - -main('and kangaroo','and and kangaroo') +def get_counts(): + counts = {} + sets_per_year = 4 + for year in ("S08", "S09", "S10"): + for i in xrange(1, num_sets): + for j in xrange(1, articles_per_set+1): + path = "../Question_Answer_Dataset_v1.1/"+year+"/data/set"+str(i)+"/a"+str(j)+".txt" + cfile = open(path).read() + partic = {} + for tok in nltk.word_tokenize(cfile): + partic[tok] = partic.get(tok, 0) + 1 + counts.append(partic) + sets_per_year += 1 + return counts \ No newline at end of file