
Commit aeba630

Author: Daniel Sedra (committed)
Commit message: ready to try submitting
1 parent: df0efad

File tree: 1 file changed, +16 -6 lines changed


modules/tdIDF.py

Lines changed: 16 additions & 6 deletions
@@ -5,7 +5,8 @@

 # constructs a vector, for each word in question the tf-idf score with article
 def main(question, article):
-    ddict = get_counts()
+    ddict = {}
+    counts = get_counts()
     for tok in nltk.word_tokenize(article):
         ddict[tok] = ddict.get(tok, 0) + 1

@@ -16,21 +17,27 @@ def main(question, article):
         tf = ddict.get(tok, 0)

         # total articles is 108 / number that have current token
-        idf = math.log(float(108)/len(filter(lambda x:tok in x.keys(),dicts)) + 1)
+        idf = math.log(float(108)/len(filter(lambda x:tok in x.keys(),counts)) + 1)
         vec.append(tf*idf)

     largest = max(vec)
     normalized = map(lambda y: y/largest, vec)
-    return normalized
+
+    finDic = {}
+    for word,i in enumerate(nltk.word_tokenize(question)):
+        finDic[word] = normalized[i]
+
+    print finDic
+    return finDic

 articles_per_set = 9

 # goes through sample wiki articles and gets word counts
 def get_counts():
-    counts = {}
+    counts = []
     sets_per_year = 4
     for year in ("S08", "S09", "S10"):
-        for i in xrange(1, num_sets):
+        for i in xrange(1, sets_per_year):
             for j in xrange(1, articles_per_set+1):
                 path = "../Question_Answer_Dataset_v1.1/"+year+"/data/set"+str(i)+"/a"+str(j)+".txt"
                 cfile = open(path).read()
@@ -39,4 +46,7 @@ def get_counts():
                     partic[tok] = partic.get(tok, 0) + 1
                 counts.append(partic)
         sets_per_year += 1
-    return counts
+    return counts
+
+
+main('and kangaroo','and and kangaroo')
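
For reference, the scoring this change wires up is plain tf-idf over the 108 sample Wikipedia articles: each question token gets its term frequency in the article times log(108 / number of articles containing the token + 1), and the resulting vector is normalized by its largest entry before being keyed by question token. Below is a minimal, self-contained sketch of that idea in modern Python; the name tfidf_scores, the doc_counts parameter, the whitespace tokenizer (standing in for nltk.word_tokenize), and the toy docs list are illustrative assumptions, not code from this repository.

# A sketch of the tf-idf scoring in this commit, not the repository's module.
# Assumptions: str.split() replaces nltk.word_tokenize, and doc_counts replaces
# get_counts()' list of per-article word-count dicts (108 articles in the dataset).
import math

def tfidf_scores(question, article, doc_counts):
    # term frequency: count each token in the article (ddict in the diff)
    article_counts = {}
    for tok in article.split():
        article_counts[tok] = article_counts.get(tok, 0) + 1

    total_docs = len(doc_counts)  # the committed code hard-codes 108
    scores = {}
    for tok in question.split():
        tf = article_counts.get(tok, 0)
        # idf = log(total articles / number of articles containing the token + 1)
        df = sum(1 for counts in doc_counts if tok in counts)
        idf = math.log(total_docs / df + 1) if df else 0.0
        scores[tok] = tf * idf

    # normalize by the largest score, as the diff does with max(vec)
    largest = max(scores.values()) or 1.0
    return {tok: s / largest for tok, s in scores.items()}

# toy stand-in for get_counts(): three "articles" as word-count dicts
docs = [{"kangaroo": 2, "and": 5}, {"wallaby": 3}, {"and": 1}]
print(tfidf_scores("and kangaroo", "and and kangaroo", docs))

The final call mirrors the smoke test added at the bottom of the diff, main('and kangaroo','and and kangaroo'); note that Python's enumerate yields (index, token) pairs, which is the ordering the sketch relies on when mapping question tokens to their normalized scores.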
