Commit

house cleaning
Stephen Bly committed Apr 16, 2013
1 parent b5c0f4b commit a8ea9ad
Showing 1 changed file with 22 additions and 20 deletions.
42 changes: 22 additions & 20 deletions modules/tdIDF.py
@@ -1,34 +1,36 @@
 import nltk
 import math
 from nltk.tokenize import word_tokenize

-dicts = []
-
-def main(question,article):
-    files()
-    ddict = {}
-    vec = []
+def main(question, article):
+    ddict = get_counts()
     for tok in nltk.word_tokenize(article):
-        ddict[tok] = ddict.get(tok,0) + 1
+        ddict[tok] = ddict.get(tok, 0) + 1

+    vec = []
     for tok in nltk.word_tokenize(question):
-        tf = ddict.get(tok,0)
+        tf = ddict.get(tok, 0)
         idf = math.log(float(108)/len(filter(lambda x:tok in x.keys(),dicts)) + 1)
         vec.append(tf*idf)

     largest = max(vec)
-    print map(lambda y: y/largest,vec)
+    normalized = map(lambda y: y/largest, vec)
+    return normalized

-def files():
-    for year in ("S08", "S09", "S10"):
-        for i in xrange(1,5):
-            for j in xrange(1,10):
-                partic = {}
-                path = "../Question_Answer_Dataset_v1.1/"+year+"/data/set'+str(i)+'/a'+str(j)+'.txt"
-                cfile = open(path).read()
+articles_per_set = 9

-                for tok in nltk.word_tokenize(cfile):
-                    partic[tok] = partic.get(tok,0) + 1
-                dicts.append(partic)

-main('and kangaroo','and and kangaroo')
+def get_counts():
+    counts = {}
+    sets_per_year = 4
+    for year in ("S08", "S09", "S10"):
+        for i in xrange(1, num_sets):
+            for j in xrange(1, articles_per_set+1):
+                path = "../Question_Answer_Dataset_v1.1/"+year+"/data/set"+str(i)+"/a"+str(j)+".txt"
+                cfile = open(path).read()
+                partic = {}
+                for tok in nltk.word_tokenize(cfile):
+                    partic[tok] = partic.get(tok, 0) + 1
+                counts.append(partic)
+                sets_per_year += 1
+    return counts
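
As committed, the new code has a few apparent loose ends: counts is created as a dict but .append is called on it, the loop bound num_sets is never defined (while sets_per_year is set and incremented but never read), and main() still computes idf from the removed global dicts while seeding its term-frequency dict with get_counts(). Below is a minimal sketch of what the module appears intended to compute, tf-idf scores for question tokens against a target article over the 3 years x 4 sets x 9 articles = 108 documents implied by the 108 in the idf term. It keeps one count dict per article in a list and separates those document counts from the article's term frequencies; the names (doc_counts, NUM_ARTICLES) and the zero-division guards are assumptions added here, not code from the repository.

# Sketch only: reinterprets the committed tdIDF.py under the assumptions above.
# Requires NLTK with the 'punkt' tokenizer models installed.
import math
import nltk

YEARS = ("S08", "S09", "S10")
SETS_PER_YEAR = 4
ARTICLES_PER_SET = 9
NUM_ARTICLES = len(YEARS) * SETS_PER_YEAR * ARTICLES_PER_SET  # 108

def get_counts():
    """Return a list with one token-count dict per article in the dataset."""
    doc_counts = []
    for year in YEARS:
        for i in range(1, SETS_PER_YEAR + 1):
            for j in range(1, ARTICLES_PER_SET + 1):
                path = ("../Question_Answer_Dataset_v1.1/" + year +
                        "/data/set" + str(i) + "/a" + str(j) + ".txt")
                text = open(path).read()
                counts = {}
                for tok in nltk.word_tokenize(text):
                    counts[tok] = counts.get(tok, 0) + 1
                doc_counts.append(counts)
    return doc_counts

def main(question, article):
    doc_counts = get_counts()

    # Term frequencies for the article being scored.
    article_counts = {}
    for tok in nltk.word_tokenize(article):
        article_counts[tok] = article_counts.get(tok, 0) + 1

    # tf-idf for each question token; idf = log(N/df + 1) as in the diff,
    # with a guard (added here) for tokens that appear in no document.
    vec = []
    for tok in nltk.word_tokenize(question):
        tf = article_counts.get(tok, 0)
        df = len([c for c in doc_counts if tok in c])
        idf = math.log(float(NUM_ARTICLES) / df + 1) if df else 0.0
        vec.append(tf * idf)

    # Normalize by the largest score, guarding against an all-zero vector.
    largest = max(vec) if vec and max(vec) > 0 else 1.0
    return [v / largest for v in vec]

if __name__ == "__main__":
    # Mirrors the test call removed in this commit.
    print(main("and kangaroo", "and and kangaroo"))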
