small changes and docs
wwcohen committed Sep 15, 2017
1 parent 786f1e9 commit 3b051ca
Showing 4 changed files with 77 additions and 7 deletions.
2 changes: 1 addition & 1 deletion mrs_gp.py
@@ -709,7 +709,7 @@ def setupFiles(indirList,outdir,numReduceTasks):
indirs.append(inhead)
infiles.append(intail)
else:
- assert False,'illegal input location %s' % dir
+ assert False,'illegal input location %r from working dir %r' % (dir,os.environ['PWD'])

numActualReduceTasks = len(infiles) if numReduceTasks<0 else numReduceTasks
if inputsAreFiles and numActualReduceTasks==1 and not GPFileSystem.inGPFS(outdir):
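The only change to mrs_gp.py is the assertion message: it now reports both the rejected location and the working directory, which helps because relative input paths are resolved against wherever mrs_gp.py was launched. A minimal sketch of the same error-reporting pattern; check_input_location is a hypothetical stand-in for the checks setupFiles() actually performs, not code from this commit:

    import os

    def check_input_location(loc):
        # hypothetical stand-in for setupFiles()'s input checks: on failure,
        # report the offending location together with the working directory
        assert os.path.exists(loc), \
            'illegal input location %r from working dir %r' % (loc, os.environ.get('PWD', os.getcwd()))
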
22 changes: 16 additions & 6 deletions tutorial/phirl-improved.py
@@ -60,7 +60,9 @@ class Phirl(Planner):
# applied only to terms in the 'target' relation

# 1) pick only top terms in each document
- topTermsInEachDocForRel1 = Group(rel1Docs, by=lambda(rel,doc,term,weight):doc, retaining=lambda(rel,doc,term,weight):(weight,term)) \
+ topTermsInEachDocForRel1 = Group(rel1Docs,
+       by=lambda(rel,doc,term,weight):doc,
+       retaining=lambda(rel,doc,term,weight):(weight,term)) \
| ReplaceEach(by=lambda(doc,termList):sorted(termList,reverse=True)[0:NUM_TOP_TERMS]) \
| Flatten(by=lambda x:x) | ReplaceEach(by=lambda(weight,term):term)

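For reference, step 1 keeps only the NUM_TOP_TERMS highest-weight terms in each rel1 document. A rough plain-Python rendering of that selection; top_terms_per_doc is illustrative and not part of the script:

    def top_terms_per_doc(rows, num_top_terms):
        # rows: iterable of (rel, doc, term, weight) tuples, as in rel1Docs
        by_doc = {}
        for (rel, doc, term, weight) in rows:
            by_doc.setdefault(doc, []).append((weight, term))
        top = []
        for wt_pairs in by_doc.values():
            # highest weights first; keep at most num_top_terms terms per document
            top.extend(term for (weight, term) in sorted(wt_pairs, reverse=True)[0:num_top_terms])
        return top
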
@@ -69,7 +71,8 @@ class Phirl(Planner):
| ReplaceEach(by=lambda(rel,doc,term,weight):term)

# 3) pick terms with some maximal DF
- lowDocFreqTerms = Filter(docFreq,by=lambda(rel,term,df):df<=MAX_TERM_DF) | ReplaceEach(by=lambda(rel,term,df):term)
+ lowDocFreqTerms = Filter(docFreq,by=lambda(rel,term,df):df<=MAX_TERM_DF) \
+     | ReplaceEach(by=lambda(rel,term,df):term)

# terms we will join on should pass all of the tests above
usefulTerms = Join( Jin(topTermsInEachDocForRel1), Jin(highWeightTermsForRel1)) | ReplaceEach(by=lambda(term1,term2):term1) \
@@ -79,10 +82,17 @@ class Phirl(Planner):
# a) since we're not considering all possible index terms, some pairs with non-zero similarity could be missed
# b) since we're not adding up weight products for all terms, the score for a pair can be under-counted

- softjoin = Join( Jin(rel1Docs,by=lambda(rel,doc,term,weight):term), Jin(usefulTerms)) | ReplaceEach(by=lambda(rel1doc,term):rel1doc) \
-     | JoinTo( Jin(rel2Docs,by=lambda(rel,doc,term,weight):term), by=lambda(rel,doc,term,weight):term)\
-     | ReplaceEach(by=lambda((rel1,doc1,term,weight1),(rel2,doc2,term_,weight2)): (doc1,doc2,weight1*weight2)) \
-     | Group(by=lambda(doc1,doc2,p):(doc1,doc2), reducingTo=ReduceTo(float,lambda accum,(doc1,doc2,p): accum+p)) \
+ softjoin = Join( Jin(rel1Docs,by=lambda(rel,doc,term,weight):term),
+                  Jin(usefulTerms)) \
+     | ReplaceEach(by=lambda(rel1doc,term):rel1doc) \
+     | JoinTo( Jin(rel2Docs,by=lambda(rel,doc,term,weight):term),
+               by=lambda(rel,doc,term,weight):term)\
+     | ReplaceEach( \
+         by=lambda((rel1,doc1,term,weight1),(rel2,doc2,term_,weight2)): \
+             (doc1,doc2,weight1*weight2)) \
+     | Group(by=lambda(doc1,doc2,p):(doc1,doc2), \
+             retaining=lambda accum,(doc1,doc2,p):p, \
+             reducingTo=ReduceToSum()) \
| ReplaceEach(by=lambda((doc1,doc2),sim):(doc1,doc2,sim))

# get the strongly similar pairs
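The rewritten softjoin swaps the hand-written reducingTo=ReduceTo(float, ...) for retaining=... plus reducingTo=ReduceToSum(), but it accumulates the same quantity: for each (doc1, doc2) pair produced by the join, the sum of weight1*weight2 over the useful terms the two documents share. A plain-Python picture of that final aggregation; sum_soft_scores is illustrative and not part of the script:

    from collections import defaultdict

    def sum_soft_scores(joined_rows):
        # joined_rows: iterable of (doc1, doc2, weight1*weight2) tuples
        sims = defaultdict(float)
        for (doc1, doc2, p) in joined_rows:
            sims[(doc1, doc2)] += p
        # flatten back to (doc1, doc2, similarity) rows, like the final ReplaceEach
        return [(d1, d2, s) for ((d1, d2), s) in sims.items()]
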
59 changes: 59 additions & 0 deletions tutorial/smallvoc-tfidf-simplified.py
@@ -0,0 +1,59 @@
from guineapig import *

# compute TFIDF in Guineapig
#
# Optimized to use sideviews for the relations that
# are only the size of the vocabulary
#
# sample invocation:
# % python smallvoc-tfidf-simplified.py --params input:dbpedia/withIds,output:dbpedia/docvec.gp --store docvec
#

import sys
import math

def loadAsDict(view):
result = {}
for (key,val) in GPig.rowsOf(view):
result[key] = val
return result

class TFIDF(Planner):

data = ReadLines('idcorpus.txt') \
| Map(by=lambda line:line.strip().split("\t")) \
| Map(by=lambda (docid,doc): (docid,doc.lower().split())) \
| FlatMap(by=lambda (docid,words): map(lambda w:(docid,w),words))

#compute document frequency and inverse doc freq
docFreq = Distinct(data) \
| Group(by=lambda (docid,term):term, \
retaining=lambda x:1, \
reducingTo=ReduceToSum())

# definitely use combiners when you aggregate
ndoc = Map(data, by=lambda (docid,term):docid) \
| Distinct() \
| Group(by=lambda row:'ndoc', retaining=lambda x:1, combiningTo=ReduceToSum(), reducingTo=ReduceToSum())

# convert raw docFreq to idf
inverseDocFreq = Augment(docFreq, sideview=ndoc, loadedBy=lambda v:GPig.onlyRowOf(v)) \
| Map(by=lambda((term,df),(dummy,ndoc)):(term,math.log(ndoc/df)))

#compute unweighted document vectors with a map-side join
udocvec = Augment(data, sideview=inverseDocFreq, loadedBy=loadAsDict) \
| Map(by=lambda ((docid,term),idfDict):(docid,term,idfDict[term]))

#normalize
norm = Group(udocvec,
by=lambda(docid,term,weight):docid,
retaining=lambda(docid,term,weight):weight*weight,
reducingTo=ReduceToSum() )

docvec = Augment(udocvec, sideview=norm, loadedBy=loadAsDict) \
| Map( by=lambda ((docid,term,weight),normDict): (docid,term,weight/math.sqrt(normDict[docid])))

## always end like this
if __name__ == "__main__":
p = TFIDF()
p.main(sys.argv)
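Read end to end, the new script computes standard TFIDF document vectors: idf = log(ndoc/df), every (docid, term) occurrence gets weight idf, and each weight is divided by its document's Euclidean norm. The sideviews (ndoc, inverseDocFreq, norm) are only corpus- or vocabulary-sized, which is why Augment can hand them to every mapper through loadedBy (GPig.onlyRowOf or the loadAsDict helper above). A self-contained plain-Python sketch of the same arithmetic; it is illustrative only, ignores the map-side-join machinery, and assumes float division for the idf:

    import math
    from collections import defaultdict

    def tfidf_rows(pairs):
        # pairs: list of (docid, term) tuples, one per token occurrence, like `data`
        distinct = set(pairs)
        ndoc = len(set(d for (d, _) in distinct))
        df = defaultdict(int)
        for (_, term) in distinct:
            df[term] += 1
        idf = dict((term, math.log(float(ndoc) / n)) for (term, n) in df.items())
        # udocvec: one (docid, term, idf-weight) row per occurrence
        udocvec = [(docid, term, idf[term]) for (docid, term) in pairs]
        # norm: sum of squared weights per document
        norm = defaultdict(float)
        for (docid, _, w) in udocvec:
            norm[docid] += w * w
        # docvec: each weight divided by its document's Euclidean norm
        return [(docid, term, (w / math.sqrt(norm[docid])) if norm[docid] > 0 else 0.0)
                for (docid, term, w) in udocvec]
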
1 change: 1 addition & 0 deletions tutorial/tfidf.py
@@ -16,6 +16,7 @@ class TFIDF(Planner):
docFreq = Distinct(data) \
| Group(by=lambda (docid,term):term, retaining=lambda(docid,term):docid, reducingTo=ReduceToCount())

+ # compute the number of documents
docIds = Map(data, by=lambda (docid,term):docid) | Distinct()
ndoc = Group(docIds, by=lambda row:'ndoc', reducingTo=ReduceToCount())

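The one-line addition to tfidf.py documents the step above it: docIds holds the distinct document ids and ndoc counts them under the single key 'ndoc'. In plain Python; count_docs is illustrative and not part of tfidf.py:

    def count_docs(pairs):
        # pairs: iterable of (docid, term) tuples
        return len(set(docid for (docid, term) in pairs))
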
