Skip to content

Commit

Permalink
added new example tutorial/smallvoc-tfidf.py
Browse files Browse the repository at this point in the history
  • Loading branch information
William Cohen committed Sep 29, 2015
1 parent 4e1c581 commit b047297
Showing 1 changed file with 48 additions and 0 deletions.
48 changes: 48 additions & 0 deletions tutorial/smallvoc-tfidf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from guineapig import *

# compute TFIDF in Guineapig

import sys
import math

def loadDictView(view):
result = {}
for (key,val) in GPig.rowsOf(view):
result[key] = val
return result

class TFIDF(Planner):

D = GPig.getArgvParams()
data = ReadLines(D.get('corpus','idcorpus.txt')) \
| Map(by=lambda line:line.strip().split("\t")) \
| Map(by=lambda (docid,doc): (docid,doc.lower().split())) \
| FlatMap(by=lambda (docid,words): map(lambda w:(docid,w),words))

#compute document frequency and inverse doc freq
docFreq = Distinct(data) \
| Group(by=lambda (docid,term):term, retaining=lambda(docid,term):docid, reducingTo=ReduceToCount())

ndoc = Map(data, by=lambda (docid,term):docid) \
| Distinct() \
| Group(by=lambda row:'ndoc', reducingTo=ReduceToCount())

inverseDocFreq = Augment(docFreq, sideview=ndoc, loadedBy=lambda v:GPig.onlyRowOf(v)) \
| Map(by=lambda((term,df),(dummy,ndoc)):(term,math.log(ndoc/df)))

#compute unweighted document vectors
udocvec = Augment(data, sideview=inverseDocFreq, loadedBy=loadDictView) \
| Map(by=lambda ((docid,term),idfDict):(docid,term,idfDict[term]))

#normalize
norm = Group( udocvec, by=lambda(docid,term,weight):docid,
retaining=lambda(docid,term,weight):weight*weight,
reducingTo=ReduceToSum() )

docvec = Augment(udocvec, sideview=norm, loadedBy=loadDictView) \
| Map( by=lambda ((docid,term,weight),normDict): (docid,term,weight/math.sqrt(normDict[docid])))

## always end like this
if __name__ == "__main__":
p = TFIDF()
p.main(sys.argv)

0 comments on commit b047297

Please sign in to comment.