Skip to content

Commit

Permalink
commented tf-idf
Browse files Browse the repository at this point in the history
  • Loading branch information
Daniel Sedra committed Apr 16, 2013
1 parent a8ea9ad commit af1abc0
Showing 1 changed file with 7 additions and 1 deletion.
8 changes: 7 additions & 1 deletion modules/tdIDF.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,19 @@

dicts = []

# Builds a vector holding, for each word in the question, its tf-idf score with respect to the article
def main(question, article):
ddict = get_counts()
for tok in nltk.word_tokenize(article):
ddict[tok] = ddict.get(tok, 0) + 1

vec = []
for tok in nltk.word_tokenize(question):
tf = ddict.get(tok, 0)

# count in article
tf = ddict.get(tok, 0)

# total articles is 108 / number that have current token
idf = math.log(float(108)/len(filter(lambda x:tok in x.keys(),dicts)) + 1)
vec.append(tf*idf)

Expand All @@ -20,6 +25,7 @@ def main(question, article):

articles_per_set = 9

# goes through sample wiki articles and gets word counts
def get_counts():
counts = {}
sets_per_year = 4
Expand Down

0 comments on commit af1abc0

Please sign in to comment.