diff --git a/mrs_gp.py b/mrs_gp.py
index 6729758..6351cc5 100644
--- a/mrs_gp.py
+++ b/mrs_gp.py
@@ -709,7 +709,7 @@ def setupFiles(indirList,outdir,numReduceTasks):
             indirs.append(inhead)
             infiles.append(intail)
         else:
-            assert False,'illegal input location %s' % dir
+            assert False,'illegal input location %r from working dir %r' % (dir,os.environ['PWD'])
     numActualReduceTasks = len(infiles) if numReduceTasks<0 else numReduceTasks
     if inputsAreFiles and numActualReduceTasks==1 and not GPFileSystem.inGPFS(outdir):

diff --git a/tutorial/phirl-improved.py b/tutorial/phirl-improved.py
index 6eb51a3..4b09bca 100644
--- a/tutorial/phirl-improved.py
+++ b/tutorial/phirl-improved.py
@@ -60,7 +60,9 @@ class Phirl(Planner):
     # applied only to terms in the 'target' relation

     # 1) pick only top terms in each document
-    topTermsInEachDocForRel1 = Group(rel1Docs, by=lambda(rel,doc,term,weight):doc, retaining=lambda(rel,doc,term,weight):(weight,term)) \
+    topTermsInEachDocForRel1 = Group(rel1Docs,
+                                     by=lambda(rel,doc,term,weight):doc,
+                                     retaining=lambda(rel,doc,term,weight):(weight,term)) \
         | ReplaceEach(by=lambda(doc,termList):sorted(termList,reverse=True)[0:NUM_TOP_TERMS]) \
         | Flatten(by=lambda x:x) | ReplaceEach(by=lambda(weight,term):term)

@@ -69,7 +71,8 @@ class Phirl(Planner):
         | ReplaceEach(by=lambda(rel,doc,term,weight):term)

     # 3) pick terms with some maximal DF
-    lowDocFreqTerms = Filter(docFreq,by=lambda(rel,term,df):df<=MAX_TERM_DF) | ReplaceEach(by=lambda(rel,term,df):term)
+    lowDocFreqTerms = Filter(docFreq,by=lambda(rel,term,df):df<=MAX_TERM_DF) \
+        | ReplaceEach(by=lambda(rel,term,df):term)

     # terms we will join on should pass all of the tests above
     usefulTerms = Join( Jin(topTermsInEachDocForRel1), Jin(highWeightTermsForRel1)) | ReplaceEach(by=lambda(term1,term2):term1) \
@@ -79,10 +82,17 @@ class Phirl(Planner):
     # a) since we're not considering all possible index terms, some pairs with non-zero similarity could be missed
     # b) since we're not adding up weight products for all terms, the score for a pair can be under-counted

-    softjoin = Join( Jin(rel1Docs,by=lambda(rel,doc,term,weight):term), Jin(usefulTerms)) | ReplaceEach(by=lambda(rel1doc,term):rel1doc) \
-        | JoinTo( Jin(rel2Docs,by=lambda(rel,doc,term,weight):term), by=lambda(rel,doc,term,weight):term)\
-        | ReplaceEach(by=lambda((rel1,doc1,term,weight1),(rel2,doc2,term_,weight2)): (doc1,doc2,weight1*weight2)) \
-        | Group(by=lambda(doc1,doc2,p):(doc1,doc2), reducingTo=ReduceTo(float,lambda accum,(doc1,doc2,p): accum+p)) \
+    softjoin = Join( Jin(rel1Docs,by=lambda(rel,doc,term,weight):term),
+                     Jin(usefulTerms)) \
+        | ReplaceEach(by=lambda(rel1doc,term):rel1doc) \
+        | JoinTo( Jin(rel2Docs,by=lambda(rel,doc,term,weight):term),
+                  by=lambda(rel,doc,term,weight):term) \
+        | ReplaceEach( \
+            by=lambda((rel1,doc1,term,weight1),(rel2,doc2,term_,weight2)): \
+                (doc1,doc2,weight1*weight2)) \
+        | Group(by=lambda(doc1,doc2,p):(doc1,doc2), \
+                retaining=lambda(doc1,doc2,p):p, \
+                reducingTo=ReduceToSum()) \
         | ReplaceEach(by=lambda((doc1,doc2),sim):(doc1,doc2,sim))

     # get the strongly similar pairs
diff --git a/tutorial/smallvoc-tfidf-simplified.py b/tutorial/smallvoc-tfidf-simplified.py
new file mode 100644
index 0000000..39ab182
--- /dev/null
+++ b/tutorial/smallvoc-tfidf-simplified.py
@@ -0,0 +1,59 @@
+from guineapig import *
+
+# compute TFIDF in Guineapig
+#
+# Optimized to use sideviews for the relations that
+# are only the size of the vocabulary
+#
+# sample invocation:
+# % python smallvoc-tfidf-simplified.py --params input:dbpedia/withIds,output:dbpedia/docvec.gp --store docvec
+#
+
+import sys
+import math
+
+def loadAsDict(view):
+    result = {}
+    for (key,val) in GPig.rowsOf(view):
+        result[key] = val
+    return result
+
+class TFIDF(Planner):
+
+    data = ReadLines('idcorpus.txt') \
+        | Map(by=lambda line:line.strip().split("\t")) \
+        | Map(by=lambda (docid,doc): (docid,doc.lower().split())) \
+        | FlatMap(by=lambda (docid,words): map(lambda w:(docid,w),words))
+
+    #compute document frequency and inverse doc freq
+    docFreq = Distinct(data) \
+        | Group(by=lambda (docid,term):term, \
+                retaining=lambda x:1, \
+                reducingTo=ReduceToSum())
+
+    # definitely use combiners when you aggregate
+    ndoc = Map(data, by=lambda (docid,term):docid) \
+        | Distinct() \
+        | Group(by=lambda row:'ndoc', retaining=lambda x:1, combiningTo=ReduceToSum(), reducingTo=ReduceToSum())
+
+    # convert raw docFreq to idf
+    inverseDocFreq = Augment(docFreq, sideview=ndoc, loadedBy=lambda v:GPig.onlyRowOf(v)) \
+        | Map(by=lambda((term,df),(dummy,ndoc)):(term,math.log(ndoc/float(df))))
+
+    #compute unweighted document vectors with a map-side join
+    udocvec = Augment(data, sideview=inverseDocFreq, loadedBy=loadAsDict) \
+        | Map(by=lambda ((docid,term),idfDict):(docid,term,idfDict[term]))
+
+    #normalize
+    norm = Group(udocvec,
+                 by=lambda(docid,term,weight):docid,
+                 retaining=lambda(docid,term,weight):weight*weight,
+                 reducingTo=ReduceToSum() )
+
+    docvec = Augment(udocvec, sideview=norm, loadedBy=loadAsDict) \
+        | Map( by=lambda ((docid,term,weight),normDict): (docid,term,weight/math.sqrt(normDict[docid])))
+
+## always end like this
+if __name__ == "__main__":
+    p = TFIDF()
+    p.main(sys.argv)
diff --git a/tutorial/tfidf.py b/tutorial/tfidf.py
index 06ab267..dadee59 100644
--- a/tutorial/tfidf.py
+++ b/tutorial/tfidf.py
@@ -16,6 +16,7 @@ class TFIDF(Planner):
     docFreq = Distinct(data) \
         | Group(by=lambda (docid,term):term, retaining=lambda(docid,term):docid, reducingTo=ReduceToCount())

+    # compute the number of documents
    docIds = Map(data, by=lambda (docid,term):docid) | Distinct()
    ndoc = Group(docIds, by=lambda row:'ndoc', reducingTo=ReduceToCount())
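
Note on the Group() rewrite in the phirl-improved.py hunk above: the old pipeline folded rows directly with ReduceTo(float, lambda accum,(doc1,doc2,p):accum+p), while the new form first projects each row with retaining and then sums the projections with ReduceToSum(), a shape that also plays well with combiners (cf. the combiningTo=ReduceToSum() usage in smallvoc-tfidf-simplified.py). Below is a minimal pure-Python 2 sketch of what that Group step computes, using made-up rows; group_sum is a hypothetical stand-in, not part of the GuineaPig API.

    from collections import defaultdict

    def group_sum(rows, by, retaining):
        # emulate Group(by=..., retaining=..., reducingTo=ReduceToSum())
        accum = defaultdict(float)
        for row in rows:
            accum[by(row)] += retaining(row)
        return sorted(accum.items())

    # rows shaped like softjoin's (doc1,doc2,p) triples
    pairs = [('a','x',0.5), ('a','x',0.25), ('a','y',1.0)]
    print(group_sum(pairs,
                    by=lambda (d1,d2,p):(d1,d2),    # group key
                    retaining=lambda (d1,d2,p):p))  # value to sum
    # -> [(('a', 'x'), 0.75), (('a', 'y'), 1.0)]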
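
Similarly, the Augment calls in smallvoc-tfidf-simplified.py are map-side joins: a view that is only vocabulary-sized (inverseDocFreq, norm) is loaded into memory by loadAsDict and consulted while mapping over the big view, instead of shuffling both sides through a reduce-side Join. A rough sketch of the effect of the udocvec step, with invented toy data (the idf and rows values below are not from the repo):

    # stand-in for loadAsDict(inverseDocFreq): term -> idf weight
    idf = {'cat': 0.3, 'dog': 1.2}

    # stand-in for the (docid,term) rows of the data view
    rows = [('d1','cat'), ('d1','dog'), ('d2','cat')]

    # Augment pairs each row with the loaded sideview dict; the Map
    # then looks the term up, yielding (docid,term,weight) triples
    udocvec = [(docid, term, idf[term]) for (docid, term) in rows]
    print(udocvec)
    # -> [('d1', 'cat', 0.3), ('d1', 'dog', 1.2), ('d2', 'cat', 0.3)]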