small changes and docs
wwcohen committed Sep 15, 2017
1 parent 786f1e9 commit 3b051ca
Showing 4 changed files with 77 additions and 7 deletions.
2 changes: 1 addition & 1 deletion mrs_gp.py
@@ -709,7 +709,7 @@ def setupFiles(indirList,outdir,numReduceTasks):
indirs.append(inhead)
infiles.append(intail)
else:
- assert False,'illegal input location %s' % dir
+ assert False,'illegal input location %r from working dir %r' % (dir,os.environ['PWD'])

numActualReduceTasks = len(infiles) if numReduceTasks<0 else numReduceTasks
if inputsAreFiles and numActualReduceTasks==1 and not GPFileSystem.inGPFS(outdir):
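The only change to mrs_gp.py is the assertion message: it now reports both the rejected location and the working directory, which helps because relative input paths are resolved against wherever mrs_gp.py was launched. A minimal sketch of the same error-reporting pattern; check_input_location is a hypothetical stand-in for the checks setupFiles() actually performs, not code from this commit:

    import os

    def check_input_location(loc):
        # hypothetical stand-in for setupFiles()'s input checks: on failure,
        # report the offending location together with the working directory
        assert os.path.exists(loc), \
            'illegal input location %r from working dir %r' % (loc, os.environ.get('PWD', os.getcwd()))
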
22 changes: 16 additions & 6 deletions tutorial/phirl-improved.py
@@ -60,7 +60,9 @@ class Phirl(Planner):
# applied only to terms in the 'target' relation

# 1) pick only top terms in each document
- topTermsInEachDocForRel1 = Group(rel1Docs, by=lambda(rel,doc,term,weight):doc, retaining=lambda(rel,doc,term,weight):(weight,term)) \
+ topTermsInEachDocForRel1 = Group(rel1Docs,
+       by=lambda(rel,doc,term,weight):doc,
+       retaining=lambda(rel,doc,term,weight):(weight,term)) \
| ReplaceEach(by=lambda(doc,termList):sorted(termList,reverse=True)[0:NUM_TOP_TERMS]) \
| Flatten(by=lambda x:x) | ReplaceEach(by=lambda(weight,term):term)

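For reference, step 1 keeps only the NUM_TOP_TERMS highest-weight terms in each rel1 document. A rough plain-Python rendering of that selection; top_terms_per_doc is illustrative and not part of the script:

    def top_terms_per_doc(rows, num_top_terms):
        # rows: iterable of (rel, doc, term, weight) tuples, as in rel1Docs
        by_doc = {}
        for (rel, doc, term, weight) in rows:
            by_doc.setdefault(doc, []).append((weight, term))
        top = []
        for wt_pairs in by_doc.values():
            # highest weights first; keep at most num_top_terms terms per document
            top.extend(term for (weight, term) in sorted(wt_pairs, reverse=True)[0:num_top_terms])
        return top
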
@@ -69,7 +71,8 @@ class Phirl(Planner):
| ReplaceEach(by=lambda(rel,doc,term,weight):term)

# 3) pick terms with some maximal DF
- lowDocFreqTerms = Filter(docFreq,by=lambda(rel,term,df):df<=MAX_TERM_DF) | ReplaceEach(by=lambda(rel,term,df):term)
+ lowDocFreqTerms = Filter(docFreq,by=lambda(rel,term,df):df<=MAX_TERM_DF) \
+     | ReplaceEach(by=lambda(rel,term,df):term)

# terms we will join on should pass all of the tests above
usefulTerms = Join( Jin(topTermsInEachDocForRel1), Jin(highWeightTermsForRel1)) | ReplaceEach(by=lambda(term1,term2):term1) \
@@ -79,10 +82,17 @@ class Phirl(Planner):
# a) since we're not considering all possible index terms, some pairs with non-zero similarity could be missed
# b) since we're not adding up weight products for all terms, the score for a pair can be under-counted

- softjoin = Join( Jin(rel1Docs,by=lambda(rel,doc,term,weight):term), Jin(usefulTerms)) | ReplaceEach(by=lambda(rel1doc,term):rel1doc) \
-     | JoinTo( Jin(rel2Docs,by=lambda(rel,doc,term,weight):term), by=lambda(rel,doc,term,weight):term)\
-     | ReplaceEach(by=lambda((rel1,doc1,term,weight1),(rel2,doc2,term_,weight2)): (doc1,doc2,weight1*weight2)) \
-     | Group(by=lambda(doc1,doc2,p):(doc1,doc2), reducingTo=ReduceTo(float,lambda accum,(doc1,doc2,p): accum+p)) \
+ softjoin = Join( Jin(rel1Docs,by=lambda(rel,doc,term,weight):term),
+                  Jin(usefulTerms)) \
+     | ReplaceEach(by=lambda(rel1doc,term):rel1doc) \
+     | JoinTo( Jin(rel2Docs,by=lambda(rel,doc,term,weight):term),
+               by=lambda(rel,doc,term,weight):term)\
+     | ReplaceEach( \
+         by=lambda((rel1,doc1,term,weight1),(rel2,doc2,term_,weight2)): \
+             (doc1,doc2,weight1*weight2)) \
+     | Group(by=lambda(doc1,doc2,p):(doc1,doc2), \
+             retaining=lambda accum,(doc1,doc2,p):p, \
+             reducingTo=ReduceToSum()) \
| ReplaceEach(by=lambda((doc1,doc2),sim):(doc1,doc2,sim))

# get the strongly similar pairs
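The rewritten softjoin swaps the hand-written reducingTo=ReduceTo(float, ...) for retaining=... plus reducingTo=ReduceToSum(), but it accumulates the same quantity: for each (doc1, doc2) pair produced by the join, the sum of weight1*weight2 over the useful terms the two documents share. A plain-Python picture of that final aggregation; sum_soft_scores is illustrative and not part of the script:

    from collections import defaultdict

    def sum_soft_scores(joined_rows):
        # joined_rows: iterable of (doc1, doc2, weight1*weight2) tuples
        sims = defaultdict(float)
        for (doc1, doc2, p) in joined_rows:
            sims[(doc1, doc2)] += p
        # flatten back to (doc1, doc2, similarity) rows, like the final ReplaceEach
        return [(d1, d2, s) for ((d1, d2), s) in sims.items()]
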
59 changes: 59 additions & 0 deletions tutorial/smallvoc-tfidf-simplified.py
@@ -0,0 +1,59 @@
from guineapig import *

# compute TFIDF in Guineapig
#
# Optimized to use sideviews for the relations that
# are only the size of the vocabulary
#
# sample invocation:
# % python smallvoc-tfidf-simplified.py --params input:dbpedia/withIds,output:dbpedia/docvec.gp --store docvec
#

import sys
import math

def loadAsDict(view):
result = {}
for (key,val) in GPig.rowsOf(view):
result[key] = val
return result

class TFIDF(Planner):

data = ReadLines('idcorpus.txt') \
| Map(by=lambda line:line.strip().split("\t")) \
| Map(by=lambda (docid,doc): (docid,doc.lower().split())) \
| FlatMap(by=lambda (docid,words): map(lambda w:(docid,w),words))

#compute document frequency and inverse doc freq
docFreq = Distinct(data) \
| Group(by=lambda (docid,term):term, \
retaining=lambda x:1, \
reducingTo=ReduceToSum())

# definitely use combiners when you aggregate
ndoc = Map(data, by=lambda (docid,term):docid) \
| Distinct() \
| Group(by=lambda row:'ndoc', retaining=lambda x:1, combiningTo=ReduceToSum(), reducingTo=ReduceToSum())

# convert raw docFreq to idf
inverseDocFreq = Augment(docFreq, sideview=ndoc, loadedBy=lambda v:GPig.onlyRowOf(v)) \
| Map(by=lambda((term,df),(dummy,ndoc)):(term,math.log(ndoc/df)))

#compute unweighted document vectors with a map-side join
udocvec = Augment(data, sideview=inverseDocFreq, loadedBy=loadAsDict) \
| Map(by=lambda ((docid,term),idfDict):(docid,term,idfDict[term]))

#normalize
norm = Group(udocvec,
by=lambda(docid,term,weight):docid,
retaining=lambda(docid,term,weight):weight*weight,
reducingTo=ReduceToSum() )

docvec = Augment(udocvec, sideview=norm, loadedBy=loadAsDict) \
| Map( by=lambda ((docid,term,weight),normDict): (docid,term,weight/math.sqrt(normDict[docid])))

## always end like this
if __name__ == "__main__":
p = TFIDF()
p.main(sys.argv)
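Read end to end, the new script computes standard TFIDF document vectors: idf = log(ndoc/df), every (docid, term) occurrence gets weight idf, and each weight is divided by its document's Euclidean norm. The sideviews (ndoc, inverseDocFreq, norm) are only corpus- or vocabulary-sized, which is why Augment can hand them to every mapper through loadedBy (GPig.onlyRowOf or the loadAsDict helper above). A self-contained plain-Python sketch of the same arithmetic; it is illustrative only, ignores the map-side-join machinery, and assumes float division for the idf:

    import math
    from collections import defaultdict

    def tfidf_rows(pairs):
        # pairs: list of (docid, term) tuples, one per token occurrence, like `data`
        distinct = set(pairs)
        ndoc = len(set(d for (d, _) in distinct))
        df = defaultdict(int)
        for (_, term) in distinct:
            df[term] += 1
        idf = dict((term, math.log(float(ndoc) / n)) for (term, n) in df.items())
        # udocvec: one (docid, term, idf-weight) row per occurrence
        udocvec = [(docid, term, idf[term]) for (docid, term) in pairs]
        # norm: sum of squared weights per document
        norm = defaultdict(float)
        for (docid, _, w) in udocvec:
            norm[docid] += w * w
        # docvec: each weight divided by its document's Euclidean norm
        return [(docid, term, (w / math.sqrt(norm[docid])) if norm[docid] > 0 else 0.0)
                for (docid, term, w) in udocvec]
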
1 change: 1 addition & 0 deletions tutorial/tfidf.py
@@ -16,6 +16,7 @@ class TFIDF(Planner):
docFreq = Distinct(data) \
| Group(by=lambda (docid,term):term, retaining=lambda(docid,term):docid, reducingTo=ReduceToCount())

+ # compute the number of documents
docIds = Map(data, by=lambda (docid,term):docid) | Distinct()
ndoc = Group(docIds, by=lambda row:'ndoc', reducingTo=ReduceToCount())

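The one-line addition to tfidf.py documents the step above it: docIds holds the distinct document ids and ndoc counts them under the single key 'ndoc'. In plain Python; count_docs is illustrative and not part of tfidf.py:

    def count_docs(pairs):
        # pairs: iterable of (docid, term) tuples
        return len(set(docid for (docid, term) in pairs))
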
