Skip to content

Commit

Permalink
ready to try submitting
Browse files Browse the repository at this point in the history
  • Loading branch information
Daniel Sedra committed Apr 16, 2013
1 parent df0efad commit aeba630
Showing 1 changed file with 16 additions and 6 deletions.
22 changes: 16 additions & 6 deletions modules/tdIDF.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@

# constructs a vector, for each word in question the tf-idf score with article
def main(question, article):
ddict = get_counts()
ddict = {}
counts = get_counts()
for tok in nltk.word_tokenize(article):
ddict[tok] = ddict.get(tok, 0) + 1

Expand All @@ -16,21 +17,27 @@ def main(question, article):
tf = ddict.get(tok, 0)

# total articles is 108 / number that have current token
idf = math.log(float(108)/len(filter(lambda x:tok in x.keys(),dicts)) + 1)
idf = math.log(float(108)/len(filter(lambda x:tok in x.keys(),counts)) + 1)
vec.append(tf*idf)

largest = max(vec)
normalized = map(lambda y: y/largest, vec)
return normalized

finDic = {}
for word,i in enumerate(nltk.word_tokenize(question)):
finDic[word] = normalized[i]

print finDic
return finDic

articles_per_set = 9

# goes through sample wiki articles and gets word counts
def get_counts():
    """Return a list of per-article token-count dicts for the sample corpus.

    Walks the Question_Answer_Dataset_v1.1 directories for years S08-S10.
    NOTE(review): xrange(1, sets_per_year) visits 3 + 4 + 5 = 12 sets, i.e.
    12 * 9 = 108 articles — this matches the hard-coded 108 used as the
    corpus size in main(), so it is kept as-is.
    """
    counts = []
    sets_per_year = 4
    for year in ("S08", "S09", "S10"):
        for i in xrange(1, sets_per_year):
            for j in xrange(1, articles_per_set + 1):
                path = ("../Question_Answer_Dataset_v1.1/" + year +
                        "/data/set" + str(i) + "/a" + str(j) + ".txt")
                # `with` closes the handle; the old open(path).read() leaked it
                with open(path) as f:
                    cfile = f.read()
                # token counts for this one article
                partic = {}
                for tok in nltk.word_tokenize(cfile):
                    partic[tok] = partic.get(tok, 0) + 1
                counts.append(partic)
        # each later year has one more question set than the previous one
        sets_per_year += 1
    return counts


main('and kangaroo','and and kangaroo')

0 comments on commit aeba630

Please sign in to comment.