@@ -5,7 +5,8 @@
 
 # constructs a vector: for each word in the question, its tf-idf score against the article
 def main(question, article):
-    ddict = get_counts()
+    ddict = {}
+    counts = get_counts()
     for tok in nltk.word_tokenize(article):
         ddict[tok] = ddict.get(tok, 0) + 1
 
@@ -16,21 +17,27 @@ def main(question, article):
         tf = ddict.get(tok, 0)
 
         # idf: total articles (108) divided by the number of articles that contain the token
-        idf = math.log(float(108) / len(filter(lambda x: tok in x.keys(), dicts)) + 1)
+        idf = math.log(float(108) / len(filter(lambda x: tok in x.keys(), counts)) + 1)
         vec.append(tf * idf)
 
     largest = max(vec)
     normalized = map(lambda y: y / largest, vec)
-    return normalized
+
+    finDic = {}
+    for i, word in enumerate(nltk.word_tokenize(question)):
+        finDic[word] = normalized[i]
+
+    print finDic
+    return finDic
 
 articles_per_set = 9
 
 # goes through the sample wiki articles and gets per-article word counts
 def get_counts():
-    counts = {}
+    counts = []
     sets_per_year = 4
     for year in ("S08", "S09", "S10"):
-        for i in xrange(1, num_sets):
+        for i in xrange(1, sets_per_year):
             for j in xrange(1, articles_per_set + 1):
                 path = "../Question_Answer_Dataset_v1.1/" + year + "/data/set" + str(i) + "/a" + str(j) + ".txt"
                 cfile = open(path).read()
@@ -39,4 +46,7 @@ def get_counts():
                 partic[tok] = partic.get(tok, 0) + 1
             counts.append(partic)
         sets_per_year += 1
-        return counts
+    return counts
+
+
+main('and kangaroo', 'and and kangaroo')
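
For reference, a minimal, self-contained sketch of the scoring this patch implements: tf is the token's count in the target article, idf is log(total articles / number of articles containing the token + 1), and the resulting vector is normalized by its largest entry before being keyed by question token. The names score_question, article_counts, and corpus_counts are illustrative, not from the repo, and unlike the patched code the sketch guards against a token that appears in no article (where the patch would divide by zero).

import math

# corpus_counts: one {token: count} dict per article (what get_counts() returns)
# article_counts: the token counts for the single article being scored
def score_question(question_toks, article_counts, corpus_counts):
    total = len(corpus_counts)  # 108 in the patch
    scores = []
    for tok in question_toks:
        tf = article_counts.get(tok, 0)                      # term frequency in the article
        df = sum(1 for c in corpus_counts if tok in c)       # document frequency in the corpus
        idf = math.log(float(total) / df + 1) if df else 0.0
        scores.append(tf * idf)
    largest = max(scores)
    if largest == 0:  # every question token missed the corpus; skip normalization
        return dict(zip(question_toks, scores))
    return dict((tok, s / largest) for tok, s in zip(question_toks, scores))

counts = [{"and": 2, "kangaroo": 1}, {"wallaby": 3}]
print(score_question(["and", "kangaroo"], counts[0], counts))
# -> {'and': 1.0, 'kangaroo': 0.5}

On the toy corpus, "and" and "kangaroo" each appear in one of two articles, so they share the same idf; "and" wins on raw term frequency and normalization pins it to 1.0, mirroring what main('and kangaroo', 'and and kangaroo') computes over the full 108-article dataset.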