diff --git a/tutorial/smallvoc-tfidf.py b/tutorial/smallvoc-tfidf.py new file mode 100644 index 0000000..e384f4e --- /dev/null +++ b/tutorial/smallvoc-tfidf.py @@ -0,0 +1,48 @@ +from guineapig import * + +# compute TFIDF in Guineapig + +import sys +import math + +def loadDictView(view): + result = {} + for (key,val) in GPig.rowsOf(view): + result[key] = val + return result + +class TFIDF(Planner): + + D = GPig.getArgvParams() + data = ReadLines(D.get('corpus','idcorpus.txt')) \ + | Map(by=lambda line:line.strip().split("\t")) \ + | Map(by=lambda (docid,doc): (docid,doc.lower().split())) \ + | FlatMap(by=lambda (docid,words): map(lambda w:(docid,w),words)) + + #compute document frequency and inverse doc freq + docFreq = Distinct(data) \ + | Group(by=lambda (docid,term):term, retaining=lambda(docid,term):docid, reducingTo=ReduceToCount()) + + ndoc = Map(data, by=lambda (docid,term):docid) \ + | Distinct() \ + | Group(by=lambda row:'ndoc', reducingTo=ReduceToCount()) + + inverseDocFreq = Augment(docFreq, sideview=ndoc, loadedBy=lambda v:GPig.onlyRowOf(v)) \ + | Map(by=lambda((term,df),(dummy,ndoc)):(term,math.log(ndoc/df))) + + #compute unweighted document vectors + udocvec = Augment(data, sideview=inverseDocFreq, loadedBy=loadDictView) \ + | Map(by=lambda ((docid,term),idfDict):(docid,term,idfDict[term])) + + #normalize + norm = Group( udocvec, by=lambda(docid,term,weight):docid, + retaining=lambda(docid,term,weight):weight*weight, + reducingTo=ReduceToSum() ) + + docvec = Augment(udocvec, sideview=norm, loadedBy=loadDictView) \ + | Map( by=lambda ((docid,term,weight),normDict): (docid,term,weight/math.sqrt(normDict[docid]))) + +## always end like this +if __name__ == "__main__": + p = TFIDF() + p.main(sys.argv)