From bdd10d6782e228204bc941d64ca4dd917eb80f8d Mon Sep 17 00:00:00 2001
From: William Cohen
Date: Wed, 15 Jul 2015 10:48:39 -0400
Subject: [PATCH] catching up with changes

---
 gpextras.py                        |   77 ++
 guineapig.py                       |  296 ++++--
 spyk.py                            |  140 +++
 testgp.py                          |   27 +-
 tutorial/Makefile                  |   22 +-
 tutorial/README.txt                |    9 +-
 tutorial/guineapig.py              | 1384 ----------------------------
 tutorial/guineapig1_1.py           | 1244 ------------------------
 tutorial/guineapig1_2.py           | 1284 --------------------------
 tutorial/guineapig1_3.py           | 1384 ----------------------------
 tutorial/instance-wordcount.py     |    2 +-
 tutorial/longer-wordcount.py       |    2 +-
 tutorial/multi-wordcount-hadoop.py |    2 +-
 tutorial/multi-wordcount.py        |    2 +-
 tutorial/ntup-wordcount.py         |    2 +-
 tutorial/param-wordcount.py        |    2 +-
 tutorial/phirl-naive1_3.py         |   32 +-
 tutorial/prefix-count.py           |    2 +-
 tutorial/tfidf.py                  |   40 +
 tutorial/wordcmp.py                |    2 +-
 tutorial/wordcount.py              |    2 +-
 tutorial/wordprob.py               |    2 +-
 22 files changed, 531 insertions(+), 5428 deletions(-)
 create mode 100644 gpextras.py
 create mode 100644 spyk.py
 delete mode 100644 tutorial/guineapig.py
 delete mode 100644 tutorial/guineapig1_1.py
 delete mode 100644 tutorial/guineapig1_2.py
 delete mode 100644 tutorial/guineapig1_3.py
 create mode 100644 tutorial/tfidf.py

diff --git a/gpextras.py b/gpextras.py
new file mode 100644
index 0000000..afdfd3a
--- /dev/null
+++ b/gpextras.py
@@ -0,0 +1,77 @@
+##############################################################################
+# (C) Copyright 2014, 2015 William W. Cohen. All rights reserved.
+##############################################################################
+
+from guineapig import *
+
+class ReadCSV(Reader):
+    """ Returns the lines in a CSV file, converted to Python tuples."""
+
+    def __init__(self,src,**kw):
+        Reader.__init__(self,src)
+        self.kw = kw
+
+    def rowGenerator(self):
+        for tup in csv.reader(sys.stdin,**self.kw):
+            yield tup
+
+    def __str__(self):
+        return 'ReadCSV("%s",%s)' % (self.src,str(self.kw)) + self.showExtras()
+
+class ReadBlocks(Reader):
+    """ Returns blocks of non-empty lines, separated by empty lines"""
+
+    def __init__(self,src,isEndBlock=lambda line:line=="\n"):
+        Reader.__init__(self,src)
+        self.isEndBlock = isEndBlock
+
+    def rowGenerator(self):
+        buf = []
+        for line in sys.stdin:
+            if self.isEndBlock(line):
+                yield buf
+                buf = []
+            else:
+                buf.append(line)
+        if buf:
+            yield buf
+
+    def __str__(self):
+        return 'ReadBlocks("%s")' % self.src + self.showExtras()
+
+class Log(ReplaceEach):
+    """Print logging messages to stderr as data is processed.
+    For every row, the logfun will be called with arguments
+    logfun(rowValue,rowIndex).
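+
+    A minimal usage sketch (hypothetical names: 'p' is a Planner and
+    'input.txt' an input file, following the testgp.py conventions):
+
+        def logit(rowValue,rowIndex): sys.stderr.write('row %d\n' % rowIndex)
+        p.echoed = ReadLines('input.txt') | Log(logfun=logit)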
+    """
+
+    def __init__(self, inner=None, logfun=lambda rowV,rowI:None):
+        self.rowNum = 0
+        self.logfun = logfun
+        def logfunCaller(rowValue):
+            self.rowNum += 1
+            self.logfun(rowValue,self.rowNum)
+            return rowValue
+        ReplaceEach.__init__(self,inner,by=logfunCaller)
+
+    def __str__(self):
+        return 'Log(%s)' % View.asTag(self.inner) + self.showExtras()
+
+class LogEchoFirst(Log):
+
+    """Echo the first N things."""
+
+    def __init__(self, inner=None, first=10):
+        def logfirst(rowValue,rowIndex):
+            if rowIndex<=first:
+                print >> sys.stderr, 'row %d: "%s"' % (rowIndex,rowValue)
+        Log.__init__(self, inner=inner, logfun=logfirst)
+
+class LogProgress(Log):
+
+    """Echo a status message every 'interval' rows."""
+
+    def __init__(self, inner=None, msg="Logging progress", interval=1000):
+        def logprogress(rowValue,rowIndex):
+            if (rowIndex % interval)==0:
+                print >> sys.stderr, "%s: %d rows done" % (msg,rowIndex)
+        Log.__init__(self, inner=inner, logfun=logprogress)
diff --git a/guineapig.py b/guineapig.py
index 4e0b117..05a0fde 100644
--- a/guineapig.py
+++ b/guineapig.py
@@ -1,5 +1,5 @@
 ##############################################################################
-# (C) Copyright 2014 William W. Cohen. All rights reserved.
+# (C) Copyright 2014, 2015 William W. Cohen. All rights reserved.
 ##############################################################################
 
 import sys
@@ -10,6 +10,7 @@
 import os
 import os.path
 import urlparse
+import urllib
 import getopt
 import csv
 
@@ -20,11 +21,16 @@ class GPig(object):
     """Collection of utilities for Guinea Pig."""
 
-    HADOOP_LOC = 'hadoop'    #assume hadoop is on the path at planning time
-    MY_LOC = 'guineapig.py'
+    SORT_COMMAND = 'LC_COLLATE=C sort'  # use standard ascii ordering, not locale-specific one
+    HADOOP_LOC = 'hadoop'               # assume hadoop is on the path at planning time
+    MY_LOC = 'guineapig.py'             # the name of this file
+    VERSION = '1.3.2'
+    COPYRIGHT = '(c) William Cohen 2014,2015'
 
-    #global options for Guinea Pig can be passed in with the --opts
+    #Global options for Guinea Pig can be passed in with the --opts
    #command-line option, and these are the default values
+    #The location of the streaming jar is a special case,
+    #in that it's also settable via an environment variable.
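+    #For example (hypothetical path, adjust to your Hadoop install):
+    #  export GP_STREAMJAR=/usr/lib/hadoop-mapreduce/hadoop-streaming.jar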
    defaultJar = '/usr/lib/hadoop/contrib/streaming/hadoop-streaming-1.2.0.1.3.0.0-107.jar'
     envjar = os.environ.get('GP_STREAMJAR', defaultJar)
     DEFAULT_OPTS = {'streamJar': envjar,
                     'parallel':5,
                     'target':'shell',
                     'echo':0,
                     'viewdir':'gpig_views',
                     }
-    #there are the types of each option that has a non-string value
+    #These are the types of each option that has a non-string value
     DEFAULT_OPT_TYPES = {'parallel':int,'echo':int}
-    #we need to pass non-default options in to mappers and reducers,
-    #but since the remote worker's environment can be different, we
-    #also need to pass in options computed from the environment
+    #We need to pass non-default options in to mappers and reducers,
+    #but since the remote worker's environment can be different from
+    #the environment of this script, we also need to pass in options
+    #computed from the environment
     COMPUTED_OPTION_DEFAULTS = {'streamJar':defaultJar}
 
     @staticmethod
     def getCompiler(target):
+        """Return the compiler object used to convert AbstractMapReduceTasks
+        to executable commands."""
         if target=='shell': return ShellCompiler()
         elif target=='hadoop': return HadoopCompiler()
         else: assert 'illegal compilation target '+target
@@ -67,12 +76,15 @@ def getArgvDict(optname):
         for i,a in enumerate(sys.argv):
             if a==optname:
                 paramString = sys.argv[i+1]
-                return dict(pair.split(":") for pair in paramString.split(","))
+                result = dict(pair.split(":") for pair in paramString.split(","))
+                for key in result:
+                    result[key] = urllib.unquote(result[key])
+                return result
         return {}
 
     @staticmethod
     def rowsOf(view):
-        """Iterate over the rows in a view."""
+        """Iterator over the rows in a view."""
         for line in open(view.distributableFile()):
             yield view.planner._serializer.fromString(line.strip())
 
@@ -103,6 +115,9 @@ def __init__(self,view,by=(lambda x:x),outer=False):
         self.view = view
         self.joinBy = by
         self.outer = outer
+        #To implement the semantics for outer joins, if one Jin input
+        #for a join is outer, then the other inputs will have
+        #_padWithNulls set to True
         self._padWithNulls = False
 
     def __str__(self):
@@ -112,8 +127,11 @@ def __str__(self):
         return "Jin(%s,by=%s%s%s)" % (viewStr,self.joinBy,outerStr,padStr)
 
 class ReduceTo(object):
-    """An object x that can be the argument of a reducingTo=x
-    parameter in a Group view."""
+    """An object x that can be the argument of a reducingTo=x parameter in
+    a Group view.  Here 'baseType' is a function f such that f() returns the
+    initial value of the accumulator, and 'by' is a function that
+    maps one accumulator and a single new value to the next accumulator.
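+    For example, ReduceTo(int,by=lambda accum,val:accum+val) starts each
+    accumulator at int(), i.e. 0, and sums up the values in a group - which
+    is exactly what the ReduceToSum subclass defined below does.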
+ """ def __init__(self,baseType,by=lambda accum,val:accum+val): self.baseType = baseType self.reduceBy = by @@ -124,8 +142,8 @@ def __init__(self): ReduceTo.__init__(self,int,by=lambda accum,val:accum+1) class ReduceToSum(ReduceTo): - """Produce the sum of the objects - which must be numbers - that would - be placed in a group.""" + """Produce the sum of the objects - which must be legal arguments of + the '+' function - that would be placed in a group.""" def __init__(self): ReduceTo.__init__(self,int,by=lambda accum,val:accum+val) @@ -408,7 +426,7 @@ def enforceStorageConstraints(self): inner.storeMe = True def mapPlan(self): - log.error("abstract method not implemented") + logging.error("abstract method not implemented") def doStoreKeyedRows(self,subview,key,index): """Utility method used by concrete map-reduce classes to compute keys @@ -445,6 +463,9 @@ def rowGenerator(self): for line in sys.stdin: yield self.planner._serializer.fromString(line.strip()) + def explanation(self): + return [ 'reuse view %s stored in %s' % (self.reusedViewTag,self.src)] + def __str__(self): return 'ReuseView("%s")' % self.src + self.showExtras() @@ -462,21 +483,6 @@ def rowGenerator(self): def __str__(self): return 'ReadLines("%s")' % self.src + self.showExtras() -class ReadCSV(Reader): - """ Returns the lines in a CSV file, converted to Python tuples.""" - - def __init__(self,src,**kw): - Reader.__init__(self,src) - self.kw = kw - - def rowGenerator(self): - for tup in csv.reader(sys.stdin,**self.kw): - yield tup - - def __str__(self): - return 'ReadCVS("%s",%s)' % (self.src,str(self.kw)) + self.showExtras() - - class ReplaceEach(Transformation): """ In 'by=f'' f is a python function that takes a row and produces its replacement.""" @@ -490,11 +496,14 @@ def rowGenerator(self): yield self.replaceBy(row) def explanation(self): - return self.inner.explanation() + [ 'replaced to %s' % self.tag ] + return self.inner.explanation() + [ 'replace to %s' % self.tag ] def __str__(self): return 'ReplaceEach(%s, by=%s)' % (View.asTag(self.inner),str(self.replaceBy)) + self.showExtras() +class Map(ReplaceEach): + """ Alternate name for ReplaceEach""" + class Augment(Transformation): def __init__(self,inner=None,sideviews=None,sideview=None,loadedBy=lambda v:list(GPig.rowsOf(v))): @@ -566,6 +575,9 @@ def explanation(self): def __str__(self): return 'Flatten(%s, by=%s)' % (View.asTag(self.inner),str(self.flattenBy)) + self.showExtras() +class FlatMap(Flatten): + """ Alternate name for Flatten""" + class Filter(Transformation): """Filter out a subset of rows that match some predicate.""" @@ -762,6 +774,56 @@ def acceptInnerView(self,otherView): self.joinInputs[0].view = otherView self.inners[0] = otherView +class Union(MapReduce): + """Combine two or more relations, also removing duplicates.""" + + def __init__(self,*inners): + #sets self.inners + MapReduce.__init__(self,list(inners),None) + + def acceptInnerView(self,otherView): + assert False, 'Union cannot be RHS of a pipe - use UnionTo instead' + + def mapPlan(self): + plan = Plan() + innerCheckpoints = map(lambda v:v.checkpoint(), self.inners) + step = PrereduceStep(view=self, whatToDo='doUnionMap',srcs=innerCheckpoints,dst=self.checkpoint(),why=self.explanation()) + plan.append(step) + return plan + + def explanation(self): + innerEx = [] + for inner in self.inners: + if innerEx: innerEx += ['CONCAT TO'] + innerEx += inner.explanation() + return innerEx + + def __str__(self): + return "Union(%s)" % ",".join(map(str,self.inners)) + self.showExtras() + + def 
rowGenerator(self):
+        lastLine = None
+        for line in sys.stdin:
+            if line!=lastLine:
+                yield self.planner._serializer.fromString(line.strip())
+            lastLine = line
+
+    def doUnionMap(self,i):
+        # called with argument index, and stdin pointing to innerCheckpoints[index]
+        for row in self.inners[i].rowGenerator():
+            print self.planner._serializer.toString(row)
+
+
+class UnionTo(Union):
+    """Special case of Union which can be used as RHS of a pipe operator."""
+
+    def __init__(self,*moreInners):
+        allInners = [None]+list(moreInners)
+        Union.__init__(self,*allInners)
+
+    def acceptInnerView(self,otherView):
+        self.inners[0] = otherView
+
 ##############################################################################
 #
 # the top-level planner, and its supporting classes
 #
@@ -803,16 +865,12 @@ def compile(self,gp):
         script = []
         taskCompiler = GPig.getCompiler(gp.opts['target'])
         for task in self.tasks:
+            #print 'compiling',task
             script += taskCompiler.compile(task,gp)
         return script
 
-#
-# a single step in a plan produced by the planner
-#
-
 class Step(object):
-    """A single step of the plans produced by the planner, along with the
-    methods to convert the plans into executable shell commands."""
+    """A single 'step' of the plans produced by the planner."""
 
     def __init__(self,view):
         self.view = view
@@ -852,6 +910,7 @@ def __str__(self):
         return "TransformStep("+",".join(map(repr, [self.view.tag,self.whatToDo,self.srcs,self.dst,self.reused]))+")"
 
 class PrereduceStep(Step):
+    """A step that can be followed by a reduce step."""
     def __init__(self,view,whatToDo,srcs,dst,why):
         Step.__init__(self,view)
         self.whatToDo = whatToDo
@@ -862,11 +921,22 @@ def __init__(self,view,whatToDo,srcs,dst,why):
     def __str__(self):
         return "PrereduceStep("+",".join(map(repr, [self.view.tag,self.whatToDo,self.srcs,self.dst,self.reused]))+")"
 
-# combine steps into something executable via hadoop - or shell
-
 class AbstractMapReduceTask(object):
-    """A collection of steps that can be executed as a single map-reduce operation,
-    possibly with some file managements steps to set up the task."""
+    """A collection of steps that can be executed as a single map-reduce
+    operation, possibly with some file management steps to set up the
+    task. More specifically, this consists of
+
+    1a) a maybe-empty sequence of DistributeStep's
+    2a) a single TransformStep or PrereduceStep
+    or else
+
+    1b) a maybe-empty sequence of DistributeStep's
+    2b) a PrereduceStep
+    3b) a TransformStep
+
+    Sequence 1a-2a is a map-only task, and sequence 1b-3b is a
+    map-reduce task.
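+
+    When compiled for the shell (see MRCompiler and ShellCompiler below),
+    a map-only task becomes roughly 'mapCom < src > dst', and a map-reduce
+    task roughly 'mapCom < src | LC_COLLATE=C sort -k1 | reduceCom > dst'.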
+    """
+
     def __init__(self):
         self.distributeSteps = []
@@ -880,16 +950,39 @@ def insert(self,step):
             self.distributeSteps.append(step)
             return True
         elif self.mapStep==None and (isinstance(step,TransformStep) or isinstance(step,PrereduceStep)):
-            #we can only have one map step, so fill up an empty slot if possible
+            #we can only have one map step, so fill up an empty mapstep slot if possible
             self.mapStep = step
             return True
         elif self.mapStep and isinstance(self.mapStep,PrereduceStep) and isinstance(step,TransformStep) and not self.reduceStep:
-            #if the mapstep is a prereduce, then we can also allow a reduce step
+            #if the mapstep is a prereduce, then we can also allow any TransformStep to be used as a reduceStep
             self.reduceStep = step
             return True
         else:
             return False
 
+    def explanation(self):
+        """Concatenate together the explanations for the different steps of
+        this task."""
+        buf = []
+        for step in self.distributeSteps:
+            buf += step.why
+        #reduce explanation copies the map explanation so we don't need both
+        if self.reduceStep:
+            buf += self.reduceStep.why
+        else:
+            buf += self.mapStep.why
+        return buf
+
+    def inputsAndOutputs(self):
+        """Return a string summarizing the source files used as inputs, and
+        the view ultimately created by this task."""
+        buf = ' + '.join(self.mapStep.srcs)
+        if self.reduceStep:
+            buf += ' => ' + self.reduceStep.view.tag
+        else:
+            buf += ' => ' + self.mapStep.view.tag
+        return buf
+
     def __str__(self):
         buf = "mapreduce task:"
         for step in self.distributeSteps:
@@ -900,6 +993,7 @@ def __str__(self):
         return buf
 
 class MRCompiler(object):
+    """Abstract compiler class to convert a task to a list of commands that can be executed by the shell."""
 
     def compile(self,task,gp):
@@ -909,25 +1003,25 @@ def compile(self,task,gp):
             script += ['echo create '+task.mapStep.view.tag + ' via map: ' + task.mapStep.explain()]
         else:
             script += ['echo create '+task.reduceStep.view.tag +' via map/reduce: '+task.reduceStep.explain()]
-        for step in task.distributeSteps:
-            localCopy = step.view.distributableFile()
-            maybeRemoteCopy = step.view.storedFile()
-            echoCom = 'echo distribute %s: making a local copy of %s in %s' % (step.view.tag,maybeRemoteCopy,localCopy)
-            script += [echoCom] + self.distributeCommands(task, gp, maybeRemoteCopy,localCopy)
-        if not task.reduceStep and len(task.mapStep.srcs)==1:
+        if not task.reduceStep and len(task.mapStep.srcs)==1:   #a map-only step
             mapCom = self._coreCommand(task.mapStep,gp)
             script += self.simpleMapCommands(task, gp, mapCom, task.mapStep.srcs[0], task.mapStep.dst)
-        elif task.reduceStep and len(task.mapStep.srcs)==1:
+        elif task.reduceStep and len(task.mapStep.srcs)==1:     #a map-reduce step
             mapCom = self._coreCommand(task.mapStep,gp)
             reduceCom = self._coreCommand(task.reduceStep,gp)
             script += self.simpleMapReduceCommands(task, gp, mapCom, reduceCom, task.mapStep.srcs[0], task.reduceStep.dst)
-        elif task.reduceStep and len(task.mapStep.srcs)>1:
+        elif task.reduceStep and len(task.mapStep.srcs)>1:      #multiple mappers and one reduce
             mapComs = [self._ithCoreCommand(task.mapStep,gp,i) for i in range(len(task.mapStep.srcs))]
             reduceCom = self._coreCommand(task.reduceStep,gp)
             midpoint = gp.opts['viewdir']+'/'+task.mapStep.view.tag+'.gpmo'
             script += self.joinCommands(task, gp, mapComs, reduceCom, task.mapStep.srcs, midpoint, task.reduceStep.dst)
         else:
             assert False,'cannot compile task '+str(task)
+        for step in task.distributeSteps:  #distribute the results, if necessary
+            localCopy = step.view.distributableFile()
+            maybeRemoteCopy = step.view.storedFile()
+            echoCom = 
'echo distribute %s: making a local copy of %s in %s' % (step.view.tag,maybeRemoteCopy,localCopy)
+            script += [echoCom] + self.distributeCommands(task, gp, maybeRemoteCopy,localCopy)
         return script
 
     # abstract routines
 
@@ -969,7 +1063,7 @@ def __coreCommandOptions(self,step,gp):
         for (k,v) in gp.opts.items():
             #pass in non-default options, or options computed from the environment
             if (gp.opts[k] != GPig.DEFAULT_OPTS[k]) or ((k in GPig.COMPUTED_OPTION_DEFAULTS) and (gp.opts[k] != GPig.COMPUTED_OPTION_DEFAULTS[k])):
-                nonDefaults += ["%s:%s" % (k,str(v))]
+                nonDefaults += ["%s:%s" % (k,urllib.quote(str(v)))]
         optsOpts = '' if not nonDefaults else " --opts " + ",".join(nonDefaults)
         reuseOpts = '' if not step.reused else " --reuse "+ " ".join(step.reused)
         return paramOpts + optsOpts + reuseOpts
@@ -989,14 +1083,14 @@ def simpleMapCommands(self,task,gp,mapCom,src,dst):
 
     def simpleMapReduceCommands(self,task,gp,mapCom,reduceCom,src,dst):
         """A map-reduce job with one input."""
-        return [mapCom + ' < ' + src + ' | sort -k1 | '+reduceCom + ' > ' + dst]
+        return [mapCom + ' < ' + src + (' | %s -k1 | ' % GPig.SORT_COMMAND) +reduceCom + ' > ' + dst]
 
     def joinCommands(self,task,gp,mapComs,reduceCom,srcs,midpoint,dst):
         """A map-reduce job with several inputs."""
         subplan = ['rm -f %s' % midpoint]
         for i,ithMapCom in enumerate(mapComs):
             subplan += [ithMapCom + ' < ' + srcs[i] + ' >> ' + midpoint]
-        subplan += [ 'sort -k1,2 < ' + midpoint + ' | ' + reduceCom + ' > ' + dst]
+        subplan += [ ('%s -k1,2 < '% GPig.SORT_COMMAND) + midpoint + ' | ' + reduceCom + ' > ' + dst]
         return subplan
 
 class HadoopCompiler(MRCompiler):
@@ -1078,7 +1172,10 @@ def _hadoopCleanCommand(self,gp,fileName):
 #
 
 class RowSerializer(object):
-    """Saves row objects to disk and retrieves them."""
+    """Saves row objects to disk and retrieves them. A RowSerializer is
+    used internally in a Planner, and by default the one used by a
+    Planner will be an instance of RowSerializer(). A user can
+    override this with planner.setSerializer(). """
     def __init__(self):
         self.evaluator = GPig.SafeEvaluator()
     def toString(self,x):
@@ -1095,38 +1192,47 @@ class Planner(object):
 
     def __init__(self,**kw):
 
-        #parameters are used for programmatically give user-defined
+        #Parameters are used for programmatically giving user-defined
         #config information to a planner, or they can be specified in
-        #the command-line
+        #the command-line. These are usually accessed in user-defined
+        #views.
+
         self.param = kw
         for (key,val) in GPig.getArgvParams().items():
             # don't override non-null values specified in the constructor
             if self.param.get(key)==None:
                 self.param[key] = val
 
-        #opts are used for giving options to the planner from the shell
+        #opts are used for giving options to the planner from the
+        #shell, and are used in code in this file.
+
         self.opts = GPig.getArgvOpts()
         for (key,val) in GPig.DEFAULT_OPTS.items():
             if (not key in self.opts): self.opts[key] = val
         for (key,type) in GPig.DEFAULT_OPT_TYPES.items():
             self.opts[key] = type(self.opts[key])
-        #use appropriate for the target
+
+        #Provide a default serializer
+
         self._serializer = RowSerializer()
 
-        #views that aren't associated with class variable, but are
-        #instead named automatically - ie, inner views with no
+        #These are views that aren't associated with a class variable,
+        #but are instead named automatically - ie, inner views with no
         #user-provided names.
+
         self._autoNamedViews = {}
 
-        #by default, use info-level logging at planning time
+        #By default, use info-level logging at planning time only, not
+        #at view execution time.
if not Planner.partOfPlan(sys.argv): logging.basicConfig(level=logging.INFO) + logging.info('GuineaPig v%s %s' % (GPig.VERSION,GPig.COPYRIGHT)) - #hadoop needs to know where to give the main script file, - #as well as the guineapig.py file it uses + #Hadoop needs to know where to give the main script file, as + #well as the guineapig.py file used here + self._shippedFiles = [] self._gpigSourceFile = sys.argv[0] - self._shippedFiles = [GPig.MY_LOC,self._gpigSourceFile] + self.ship(GPig.MY_LOC) + self.ship(self._gpigSourceFile) def setup(self): """Initialize planner, and views used by the planner. This has to be @@ -1264,9 +1370,16 @@ def _storageSeq(self,view,storedViews): # dealing with the file storage system and related stuff # - def ship(self,*fileNames): + def ship(self,fileName): """Declare a set of inputs to be 'shipped' to the hadoop cluster.""" - self._shippedFiles += fileNames + for d in sys.path: + location = os.path.join(d,fileName) + if os.path.isfile(location): + logging.info('located %s at %s' % (fileName,location)) + self._shippedFiles.append(location) + return + logging.error("didn't locate %s on sys.path: path is %r" % (fileName,sys.path)) + logging.warn("note that the working directory . should always be on your PYTHONPATH") def setSerializer(self,serializer): """Replace the default serializer another RowSerializer object.""" @@ -1294,12 +1407,17 @@ def main(self,argv): self.runMain(argv) def runMain(self,argv): + """Called by main().""" # parse the options and dispatch appropriately - argspec = ["store=", "cat=", "reuse", + argspec = ["store=", "cat=", "reuse", "help", "list", "pprint=", "steps=", "tasks=", "plan=", "params=", "opts=", "do=", "view="] - optlist,args = getopt.getopt(argv[1:], 'x', argspec) + try: + optlist,args = getopt.getopt(argv[1:], 'x', argspec) + except getopt.GetoptError: + logging.fatal('bad option: use "--help" to get help') + sys.exit(-1) optdict = dict(optlist) # decide what views can be re-used, vs which need fresh plans @@ -1329,12 +1447,21 @@ def runMain(self,argv): for s in plan.steps: print ' -',s return - elif '--tasks' in optdict: #print AbstractMapReduceTasks to produce a view + elif '--tasks' in optdict: #print AbstractMapReduceTasks rel = self.getView(optdict['--tasks'],mustExist=True) plan = rel.storagePlan() plan.buildTasks() - for t in plan.tasks: - print t + for k,task in enumerate(plan.tasks): + print '=' * 70 + taskType = 'map-reduce' if task.reduceStep else 'map-only' + print '%s task %d: %s' % (taskType,(k+1),task.inputsAndOutputs()) + print ' - +' + '-' * 20, 'explanation', '-' * 20 + for w in task.explanation(): + print ' - | ',w + print ' - +' + '-' * 20, 'commands', '-' * 20 + for c in GPig.getCompiler(self.opts['target']).compile(task,self): + if not c.startswith("echo"): + print ' - | ',c return elif '--plan' in optdict: #print a storage plan rel = self.getView(optdict['--plan'],mustExist=True) @@ -1373,12 +1500,31 @@ def runMain(self,argv): whatToDoMethod(arg) return else: - print 'usage: --[store|pprint|steps|plan|cat] view [--opts key:val,...] [--params key:val,...] 
--reuse view1 view2 ...]'
-            print ' --[list]'
-            print 'current legal keys for "opts", with default values:'
+            usageHint = {'pprint':'print the data structure associated with the VIEW',
+                         'tasks':'print the abstract map-reduce tasks needed to materialize the VIEW',
+                         'plan':'print the commands that invoke each abstract map-reduce task',
+                         'store':'materialize the named VIEW and store it in the view directory',
+                         'cat': 'store the VIEW and then print each line to stdout'}
+            print 'Guinea Pig',GPig.VERSION,GPig.COPYRIGHT
+            print 'usage: python %s --(store|pprint|tasks|plan|cat) VIEW [OPTIONS] [PARAMS] --reuse VIEW1 VIEW2 ...' % sys.argv[0]
+            print '       python %s --list' % sys.argv[0]
+            print ''
+            print 'Subcommands that take a VIEW as argument:'
+            for a in usageHint:
+                print ' --%s VIEW: %s'% (a,usageHint[a])
+            print 'The --list subcommand lists possible VIEWs defined by this program.'
+            print ''
+            print 'OPTIONS are specified as "--opts key:value,...", where legal keys for "opts", with default values, are:'
             for (key,val) in GPig.DEFAULT_OPTS.items():
                 print '  %s:%s' % (key,str(val))
+            print 'Values in the "opts" key/value pairs are assumed to be URL-escaped.'
+            print ''
+            print 'PARAMS are specified as "--params key:value,..." and the associated dictionary is accessible to'
+            print 'user programs via the function GPig.getArgvParams().'
+            print ''
             print 'There\'s more help at http://curtis.ml.cmu.edu/w/courses/index.php/Guinea_Pig'
 
 if __name__ == "__main__":
+    print 'Guinea Pig',GPig.VERSION,GPig.COPYRIGHT
     print 'There\'s help at http://curtis.ml.cmu.edu/w/courses/index.php/Guinea_Pig'
+
diff --git a/spyk.py b/spyk.py
new file mode 100644
index 0000000..6976dab
--- /dev/null
+++ b/spyk.py
@@ -0,0 +1,140 @@
+##############################################################################
+# (C) Copyright 2014, 2015 William W. Cohen. All rights reserved.
+##############################################################################
+
+import guineapig
+import sys
+import random
+
+class SpykContext(object):
+
+    def __init__(self,**kw):
+        self.planner = guineapig.Planner(**kw)
+        self.tagCodeIndex = 0
+
+    #TODO setSerializer, setEvaluator, ship
+
+    #returns a SpykRDD
+    def textFile(self,fileName):
+        rdd = SpykRDD('textFile', self, guineapig.ReadLines(fileName))
+        return rdd
+
+    def wholeTextFiles(self,dirName):
+        #TODO find this in royals, and make it a gpextra
+        pass
+
+    def finalize(self):
+        """Declare the context and all RDD definitions complete.  This must
+        be called in the __name__=="__main__" part of the code, because when
+        the script is re-invoked to run a substep of a plan, this call is
+        what executes that substep."""
+        self.planner.setup()
+        if guineapig.Planner.partOfPlan(sys.argv):
+            self.planner.main(sys.argv)
+
+    def usermain(self):
+        """Use this in an if statement before any Spyk actions."""
+        return not guineapig.Planner.partOfPlan(sys.argv)
+
+class SpykRDD(object):
+
+    def __init__(self,tag,context,view):
+        self.view = view
+        self.context = context
+        self.view.planner = context.planner
+        self.context.tagCodeIndex += 1
+        self.context.planner._setView("%s__%d" % (tag,self.context.tagCodeIndex), view)
+
+    #TODO this doesn't work, need to use a different mechanism,
+    #maybe with a wrapper around plan/execute
+    def cache(self):
+        self.view = self.view.opts(stored=True)
+        return self
+
+    #transformations, which return new SpykRDD's
+
+    #TODO
+    #union
+    #intersection - gpextra?
+    # ... 
and for keyed views only
+    #cogroup
+
+    def map(self,mapfun):
+        return SpykRDD('map',self.context, guineapig.ReplaceEach(self.view,by=mapfun))
+
+    def flatMap(self,mapfun):
+        return SpykRDD('flatMap',self.context, guineapig.Flatten(self.view,by=mapfun))
+
+    def groupByKey(self):
+        return SpykRDD('groupByKey',
+                       self.context,
+                       guineapig.Group(self.view,
+                                       by=lambda (key,val):key,
+                                       retaining=lambda (key,val):val))
+
+    def reduceByKey(self,initValue,reduceOp):
+        return SpykRDD('reduceByKey',
+                       self.context,
+                       guineapig.Group(self.view,
+                                       by=lambda (key,val):key,
+                                       retaining=lambda (key,val):val,
+                                       reducingTo=guineapig.ReduceTo(initValue,reduceOp)))
+    def filter(self,filterfun):
+        return SpykRDD('filter',self.context, guineapig.Filter(self.view,by=filterfun))
+
+    def sample(self,withReplacement,fraction):
+        assert not withReplacement, 'sampling with replacement is not implemented'
+        return SpykRDD('sample',self.context, guineapig.Filter(self.view,by=lambda x:1 if random.random()<fraction else 0))
diff --git a/testgp.py b/testgp.py
         if len(s)>30: s = s[0:30]+'...'
-        return 'Wrap(%s)' % s + self.showExtras()
+        return 'Wrap("%s")' % s + self.showExtras()
 
 someInts = list(range(10))
 somePairs = [(i/3, i) for i in range(15)]
@@ -78,6 +77,8 @@ def aPlanner():
     p.augA = Augment(p.yA, sideview=p.yD, loadedBy=lambda v:GPig.onlyRowOf(v))
     p.hiLo = p.augA | ReplaceEach(by=lambda(a,d):1 if a>d else -1)
 
+    p.uab = Union(p.yA,p.yD)
+
     p.setup()
     return p
@@ -129,8 +130,8 @@ def testPlanning(self):
         print 'TEST: Planner'
         v = self.p.getView('xMidA')
         #check inferred storage
-        self.assertTrue( self.p.getView('xA').storeMe )
         plan = v.storagePlan()
+        self.assertTrue( self.p.getView('xA').storeMe )
         print 'midA plan:\n',"\n".join(map(str,plan.steps))
         #self.assertTrue(len(plan.steps)==5)
         self.checkEquiv(self.p, 'xMidA', [3,4,6,7])
@@ -139,6 +140,10 @@ def testAugment(self):
         print 'TEST: Augment'
         self.checkExact(self.p, 'hiLo', [-1]*5 + [+1]*4)
 
+    def testUnion(self):
+        print 'TEST: Union'
+        self.checkExact(self.p, 'uab', list(range(10)))
+
     def checkEquiv(self,p,viewName,expected):
         v = p.getView(viewName)
         v.storagePlan().execute(p)
@@ -155,8 +160,8 @@ def checkExact(self,p,viewName,expected):
         v = p.getView(viewName)
         v.storagePlan().execute(p)
         actual = list(self.rowsOf(v))
-        print 'expected:',expected
-        print 'actual:  ',actual
+        print 'exact expected:',expected,'len',len(expected)
+        print 'exact actual:  ',actual,'len',len(actual)
         self.assertTrue(len(actual)==len(expected))
         for i in range(len(actual)):
             self.assertTrue(actual[i]==expected[i])
diff --git a/tutorial/Makefile b/tutorial/Makefile
index fb2b18c..27270bf 100644
--- a/tutorial/Makefile
+++ b/tutorial/Makefile
@@ -1,25 +1,20 @@
 update:
-	cp ../guineapig.py .
-	cp ../guineapig1_1.py .
-	cp ../guineapig1_2.py .
-	cp ../guineapig1_3.py .
-wc:
-	perl -ne 'print if /\S/ && !/^\#/' guineapig.py | wc
+	echo updates no longer needed
 
 clean:
 	rm -rf gpig_views
 	rm -f total.gp
 	rm *.pyc
 
-tar: update
+tar:
+	cp ../guineapig.py . 
echo created on `date` > marker.txt - tar -cvzf tutorial.tgz README.txt marker.txt guineapig.py *corpus.txt id-parks.txt *.py phirl-naive.pig + tar -cvzf tutorial.tgz marker.txt guineapig.py *corpus.txt id-parks.txt *.py phirl-naive.pig upload: tar scp tutorial.tgz raff.ml.cmu.edu:~/afs-home/www/10-605/gpigtut.tgz -all-runs: run-wordcount run-ntup-wordcount run-prefcount run-wordcmp run-wordprob run-multi run-phirl run-reuse check-phirl \ - run-phirl1_3 check-phirl1_3 +all-runs: run-wordcount run-ntup-wordcount run-prefcount run-wordcmp run-wordprob run-multi run-phirl run-reuse check-phirl echo all tests run, did you see any problems\? run-wordcount: @@ -69,10 +64,6 @@ run-phirl: sort -gr gpig_views/look.gp | head sort -g gpig_views/look.gp | head -run-phirl1_3: - python phirl-naive1_3.py --store look - sort -gr gpig_views/look.gp | head - sort -g gpig_views/look.gp | head run-reuse: rm -f gpig_views/*.gp* @@ -83,7 +74,4 @@ check-phirl: run-phirl sort -gr gpig_views/look.gp > phirl-actual-output.txt diff phirl-actual-output.txt phirl-expected-output.txt -check-phirl1_3: run-phirl1_3 - sort -gr gpig_views/look.gp > phirl-actual-output.txt - diff phirl-actual-output.txt phirl-expected-output.txt diff --git a/tutorial/README.txt b/tutorial/README.txt index 47fd1b2..95e94f3 100644 --- a/tutorial/README.txt +++ b/tutorial/README.txt @@ -1,10 +1,9 @@ -Materials for tutorial on Guinea Pig. For more information, see: +For more information, see: http://curtis.ml.cmu.edu/w/courses/index.php/Guinea_Pig -In addition to the tutorial materials, there's also one larger example -here, a soft-join program: phirl-naive.py. +There's also one larger example here, a soft-join program: phirl-naive.py. -The date this version was last modified is stored in the file marker.txt. +The date this version was last modified is stored in the file marker.txt Recent changes: @@ -19,4 +18,4 @@ Recent changes: 10/9: added SafeEvaluator to 1.3. - 10/15: moved source control to git. Extended the documents. + 11/11: added LC_COLLATE=C to sort command diff --git a/tutorial/guineapig.py b/tutorial/guineapig.py deleted file mode 100644 index 4e0b117..0000000 --- a/tutorial/guineapig.py +++ /dev/null @@ -1,1384 +0,0 @@ -############################################################################## -# (C) Copyright 2014 William W. Cohen. All rights reserved. 
-############################################################################## - -import sys -import logging -import copy -import subprocess -import collections -import os -import os.path -import urlparse -import getopt -import csv - -############################################################################### -# helpers functions and data structures -############################################################################### - -class GPig(object): - """Collection of utilities for Guinea Pig.""" - - HADOOP_LOC = 'hadoop' #assume hadoop is on the path at planning time - MY_LOC = 'guineapig.py' - - #global options for Guinea Pig can be passed in with the --opts - #command-line option, and these are the default values - defaultJar = '/usr/lib/hadoop/contrib/streaming/hadoop-streaming-1.2.0.1.3.0.0-107.jar' - envjar = os.environ.get('GP_STREAMJAR', defaultJar) - DEFAULT_OPTS = {'streamJar': envjar, - 'parallel':5, - 'target':'shell', - 'echo':0, - 'viewdir':'gpig_views', - } - #there are the types of each option that has a non-string value - DEFAULT_OPT_TYPES = {'parallel':int,'echo':int} - #we need to pass non-default options in to mappers and reducers, - #but since the remote worker's environment can be different, we - #also need to pass in options computed from the environment - COMPUTED_OPTION_DEFAULTS = {'streamJar':defaultJar} - - @staticmethod - def getCompiler(target): - if target=='shell': return ShellCompiler() - elif target=='hadoop': return HadoopCompiler() - else: assert 'illegal compilation target '+target - - @staticmethod - def getArgvParams(): - """Return a dictionary holding the argument of the --params option in - sys.argv.""" - return GPig.getArgvDict('--params') - - @staticmethod - def getArgvOpts(): - """Return a dictionary holding the argument of the --opts option in - sys.argv.""" - return GPig.getArgvDict('--opts') - - @staticmethod - def getArgvDict(optname): - """Return a dictionary of parameter values that were defined on the command line - view an option like '--params filename:foo.txt,basedir:/tmp/glob/'. 
- """ - assert optname.startswith('--') - for i,a in enumerate(sys.argv): - if a==optname: - paramString = sys.argv[i+1] - return dict(pair.split(":") for pair in paramString.split(",")) - return {} - - @staticmethod - def rowsOf(view): - """Iterate over the rows in a view.""" - for line in open(view.distributableFile()): - yield view.planner._serializer.fromString(line.strip()) - - @staticmethod - def onlyRowOf(view): - """Return the first row in a side view, and raise an error if it - is not the only row of the view.""" - result = None - logging.info('loading '+view.distributableFile()) - for line in open(view.distributableFile()): - assert not result,'multiple rows in stored file for '+view.tag - result = view.planner._serializer.fromString(line.strip()) - return result - - @staticmethod - class SafeEvaluator(object): - """Evaluates expressions that correzpond to serialized guinea pig rows.""" - def __init__(self,restrictedBindings={}): - self.restrictedBindings = restrictedBindings - def eval(self,s): - code = compile(s,'','eval') - return eval(code,self.restrictedBindings) - -class Jin(object): - """"Object to hold description of a single join input.""" - - def __init__(self,view,by=(lambda x:x),outer=False): - self.view = view - self.joinBy = by - self.outer = outer - self._padWithNulls = False - - def __str__(self): - viewStr = View.asTag(self.view) if self.view else '_' - outerStr = ',outer=True' if self.outer else '' - padStr = ',_padWithNulls=True' if self._padWithNulls else '' - return "Jin(%s,by=%s%s%s)" % (viewStr,self.joinBy,outerStr,padStr) - -class ReduceTo(object): - """An object x that can be the argument of a reducingTo=x - parameter in a Group view.""" - def __init__(self,baseType,by=lambda accum,val:accum+val): - self.baseType = baseType - self.reduceBy = by - -class ReduceToCount(ReduceTo): - """Produce the count of the number of objects that would be placed in a group.""" - def __init__(self): - ReduceTo.__init__(self,int,by=lambda accum,val:accum+1) - -class ReduceToSum(ReduceTo): - """Produce the sum of the objects - which must be numbers - that would - be placed in a group.""" - def __init__(self): - ReduceTo.__init__(self,int,by=lambda accum,val:accum+val) - -class ReduceToList(ReduceTo): - """Produce a list of the objects that would be placed in a group.""" - def __init__(self): - ReduceTo.__init__(self,list,by=lambda accum,val:accum+[val]) - -############################################################################### -# abstract views -############################################################################## - -class View(object): - """A definition of a relation for Guinea Pig. A View object can be - produce a storagePlan(), which can then be executed to produce the - contents of the relation. Intutitively, a relation is and - unordered bag of rows, and a row an almost-arbitrary python data - structure. (It must be something that can be stored and retrieved - by the RowSerializer.) - - Steps in the storagePlan are executed by delegation, thru the - planner, to methods of a View class named doFoo. 
- """ - - def __init__(self): - """The planner and tag must be set before this is used.""" - self.planner = None #pointer to planner object - self.tag = None #for naming storedFile and checkpoints - self.storeMe = None #try and store this view if true - self.retainedPart = None #used in map-reduce views only - self.sideviews = [] #non-empty for Augment views only - self.inners = [] #always used - - #self.inner is shortcut for inners[0] - def _getInner(self): return self.inners[0] - def _setInner(self,val): self.inners = [val] - inner = property(_getInner,_setInner) - - # - # ways to modify a view - # - - def opts(self,stored=None): - """Return the same view with options set appropriately. Possible - options include: - - - stored=True - Explicitly store this view on disk whenever - it is used in another view's definition. This might be set - by the user for debugging purposes, or by the planner, - to prevent incorrect optimizations. Generally "inner" - views are not explicitly stored. - - - stored='distributedCache' - Store this view in the working - directory and/or the Hadoop distributed cache. - """ - - self.storeMe = stored - return self - - def showExtras(self): - """Printable representation of the options for a view.""" - result = '' - flagPairs = [] - if self.storeMe: flagPairs += ['stored=%s' % repr(self.storeMe)] - if flagPairs: result += '.opts(' + ",".join(flagPairs) + ')' - return result - - # - # how the view is saved on disk - # - - def storedFile(self): - """The file that will hold the materialized relation.""" - return self.planner.opts['viewdir'] + '/' + self.tag + '.gp' - - def distributableFile(self): - """The file that will hold the materialized relation in the working directory - in preparation to be uploaded to the distributed cache.""" - return self.tag + '.gp' - - @staticmethod - def viewNameFor(fileName): - """The view associated with the given file name""" - vname = os.path.basename(fileName) - if vname.endswith(".gp"): vname = vname[0:-len(".gp")] - return vname - - # - # semantics of the view - # - - def checkpoint(self): - """A checkpoint is an intermediate computation for the view, which is - saved on disk. The rowGenerator() for the view will assume - that the checkpoint is available. 
- """ - assert False, 'abstract method called' - - def checkpointPlan(self): - """A plan to produce checkpoint().""" - assert False, 'abstract method called' - - def rowGenerator(self): - """A generator for the rows in this relation, which assumes existence - of the checkpoint.""" - assert False, 'abstract method called' - - def explanation(self): - """Return an explanation of how rows are generated.""" - assert False, 'abstract method called' - - def storagePlan(self): - """A plan to store the view.""" - return self.planner.buildRecursiveStoragePlan(self) - - def nonrecursiveStoragePlan(self): - """Materialize the relation, assuming that there are no descendent - inner views that need to be materialized first.""" - plan = Plan() - plan.includeStepsOf(self.checkpointPlan()) - plan.append(TransformStep(view=self,whatToDo='doStoreRows',srcs=[self.checkpoint()],dst=self.storedFile(),why=self.explanation())) - return plan - - def applyDict(self,mapping,innerviewsOnly=False): - """Given a mapping from view tags to views, replace every inner view with - the appropriate value from the mapping, and return the result.""" - if self.tag in mapping and not innerviewsOnly: - return mapping[self.tag] - elif not self.inners: - return self - else: - result = copy.copy(self) - result.inners = map(lambda v:v.applyDict(mapping), self.inners) - return result - - def sideviewsNeeded(self): - """Sideviews needed by this view.""" - result = [] - for sv in self.sideviews: - result += [sv] - for v in self.inners: - result += list(v._sideviewsOfDescendants()) - return result - - def _sideviewsOfDescendants(self): - if not self.storeMe: - for sv in self.sideviews: - yield sv - for v in self.inners: - for sv in v._sideviewsOfDescendants(): - yield sv - - def enforceStorageConstraints(self): - """Subclass this, if there are constraints on when one must explicitly - store inner views.""" - pass - - def doStoreRows(self): - """Called by planner at execution time to store the rows of the view.""" - for row in self.rowGenerator(): - print self.planner._serializer.toString(row) - - # - # support the "pipe" syntax: view1 | view2 - # - - def __or__(self,otherView): - """Overload the pipe operator x | y to return with y, with x as its inner view.""" - otherView.acceptInnerView(self) - return otherView - - def acceptInnerView(self,otherView): - """Replace an appropriate input view with otherView. This is subclassed to - implement the the pipe operator.""" - assert not self.inner,'An inner view is defined for '+self.tag+' so you cannot use it as RHS of a pipe' - self.inner = otherView #subclass if needed - - # - # printing views - # - - def pprint(self,depth=0,alreadyPrinted=None,sideview=False): - """Print a readable representation of the view.""" - if alreadyPrinted==None: alreadyPrinted = set() - tabStr = '| ' * depth - tagStr = str(self.tag) - sideviewIndicator = '*' if sideview else '' - if self.tag in alreadyPrinted: - print tabStr + sideviewIndicator + tagStr + ' = ' + '...' 
- else: - sideviewInfo = " sideviews: {"+",".join(map(lambda x:x.tag, self.sideviews))+"}" if self.sideviews else "" - print tabStr + sideviewIndicator + tagStr + ' = ' + str(self) + sideviewInfo - alreadyPrinted.add(self.tag) - for inner in self.inners: - inner.pprint(depth+1,alreadyPrinted) - for inner in self.sideviews: - inner.pprint(depth+1,alreadyPrinted,sideview=True) - - @staticmethod - def asTag(view): - """Helper for printing views.""" - if not view: return '(null view)' - elif view.tag: return view.tag - else: return str(view) - -# -# abstract view types -# - -class Reader(View): - """Read data stored on the file system and make it look like a View.""" - - def __init__(self,src): - View.__init__(self) - self.src = src - self.inners = [] - - def checkpoint(self): - return self.src - - def checkpointPlan(self): - return Plan() #empty plan - - def explanation(self): - return [ 'read %s with %s' % (str(self.src),self.tag) ] - - def acceptInnerView(self,otherView): - assert False, "Reader views cannot be used as RHS of a pipe" - -class Transformation(View): - """Streaming transformation on a single inner view.""" - - def __init__(self,inner=None): - View.__init__(self) - self.inner = inner - - # A transformation will stream on-the-fly through the inner - # relation, and produce a new version, so the checkpoint and plan - # to produce it are delegated to the inner View. - - def checkpoint(self): - return self.inner.checkpoint() - - def checkpointPlan(self): - return self.inner.checkpointPlan() - - def explanation(self): - return self.inner.explanation() + [ 'transform to %s' % self.tag ] - -class MapReduce(View): - """A view that takes an inner relation and processes in a - map-reduce-like way.""" - - def __init__(self,inners,retaining): - View.__init__(self) - self.inners = inners - self.retainedPart = retaining - - def _isReduceInputFile(self,fileName): - return fileName.endswith('.gpri') - - def checkpoint(self): - ## the checkpoint is the reducer input file - return self.planner.opts['viewdir'] + '/' + self.tag + '.gpri' - - def checkpointPlan(self): - plan = Plan() - for inner in self.inners: - plan.includeStepsOf(inner.checkpointPlan()) - plan.includeStepsOf(self.mapPlan()) - return plan - - def enforceStorageConstraints(self): - for inner in self.inners: - innerChkpt = inner.checkpoint() - #optimizations break if you chain two map-reduces together - if innerChkpt and innerChkpt.endswith(".gpri"): - if not inner.storeMe: - logging.info('making %s stored, to make possible a downstream map-reduce view' % inner.tag) - inner.storeMe = True - - def mapPlan(self): - log.error("abstract method not implemented") - - def doStoreKeyedRows(self,subview,key,index): - """Utility method used by concrete map-reduce classes to compute keys - and store key-value pairs. Usually used as the main step in a - mapPlan. 
""" - for row in subview.rowGenerator(): - keyStr = self.planner._serializer.toString(key(row)) - rrow = self.retainedPart(row) if self.retainedPart else row - valStr = self.planner._serializer.toString(rrow) - if index<0: - print "%s\t%s" % (keyStr,valStr) - else: - print "%s\t%d\t%s" % (keyStr,index,valStr) - -############################################################################## -# -# concrete View classes -# -############################################################################## - -class ReuseView(Reader): - """Returns the objects in a previously stored view.""" - - def __init__(self,view): - if isinstance(view,View): - Reader.__init__(self,view.storedFile()) - self.tag = "reuse_"+view.tag - self.reusedViewTag = view.tag - self.planner = view.planner - else: - assert False,'user-defined ReuseView not supported (yet)' - - def rowGenerator(self): - for line in sys.stdin: - yield self.planner._serializer.fromString(line.strip()) - - def __str__(self): - return 'ReuseView("%s")' % self.src + self.showExtras() - - -class ReadLines(Reader): - """ Returns the lines in a file, as python strings.""" - - def __init__(self,src): - Reader.__init__(self,src) - - def rowGenerator(self): - for line in sys.stdin: - yield line - - def __str__(self): - return 'ReadLines("%s")' % self.src + self.showExtras() - -class ReadCSV(Reader): - """ Returns the lines in a CSV file, converted to Python tuples.""" - - def __init__(self,src,**kw): - Reader.__init__(self,src) - self.kw = kw - - def rowGenerator(self): - for tup in csv.reader(sys.stdin,**self.kw): - yield tup - - def __str__(self): - return 'ReadCVS("%s",%s)' % (self.src,str(self.kw)) + self.showExtras() - - -class ReplaceEach(Transformation): - """ In 'by=f'' f is a python function that takes a row and produces - its replacement.""" - - def __init__(self,inner=None,by=lambda x:x): - Transformation.__init__(self,inner) - self.replaceBy = by - - def rowGenerator(self): - for row in self.inner.rowGenerator(): - yield self.replaceBy(row) - - def explanation(self): - return self.inner.explanation() + [ 'replaced to %s' % self.tag ] - - def __str__(self): - return 'ReplaceEach(%s, by=%s)' % (View.asTag(self.inner),str(self.replaceBy)) + self.showExtras() - -class Augment(Transformation): - - def __init__(self,inner=None,sideviews=None,sideview=None,loadedBy=lambda v:list(GPig.rowsOf(v))): - Transformation.__init__(self,inner) - assert not (sideviews and sideview), 'cannot specify both "sideview" and "sideviews"' - self.sideviews = list(sideviews) if sideviews else [sideview] - self.loader = loadedBy - assert self.loader,'must specify a "loadedBy" function for Augment' - - def enforceStorageConstraints(self): - for sv in self.sideviews: - sv.storeMe = 'distributedCache' - - def rowGenerator(self): - augend = self.loader(*self.sideviews) - for row in self.inner.rowGenerator(): - yield (row,augend) - - def checkpointPlan(self): - plan = Plan() - plan.includeStepsOf(self.inner.checkpointPlan()) - #the sideviews should have been stored by the top-level - #planner already, but they will need to be moved to a - #distributable location - for sv in self.sideviews: - plan.append(DistributeStep(sv)) - return plan - - def explanation(self): - return self.inner.explanation() + [ 'augmented to %s' % self.tag ] - - def __str__(self): - sideviewTags = loaderTag = '*UNSPECIFIED*' - if self.sideviews!=None: sideviewTags = ",".join(map(View.asTag,self.sideviews)) - if self.loader!=None: loaderTag = str(self.loader) - return 
'Augment(%s,sideviews=%s,loadedBy=s%s)' % (View.asTag(self.inner),sideviewTags,loaderTag) + self.showExtras() - - -class Format(ReplaceEach): - """ Like ReplaceEach, but output should be a string, and it will be be - stored as strings, ie without using the serializer.""" - - def __init__(self,inner=None,by=lambda x:str(x)): - ReplaceEach.__init__(self,inner,by) - - def __str__(self): - return 'Format(%s, by=%s)' % (View.asTag(self.inner),str(self.replaceBy)) + self.showExtras() - - def doStoreRows(self): - for row in self.rowGenerator(): - print row - -class Flatten(Transformation): - """ Like ReplaceEach, but output of 'by' is an iterable, and all - results will be returned. """ - - def __init__(self,inner=None,by=None): - Transformation.__init__(self,inner) - self.flattenBy = by - - def rowGenerator(self): - for row in self.inner.rowGenerator(): - for flatrow in self.flattenBy(row): - yield flatrow - - def explanation(self): - return self.inner.explanation() + [ 'flatten to %s' % self.tag ] - - def __str__(self): - return 'Flatten(%s, by=%s)' % (View.asTag(self.inner),str(self.flattenBy)) + self.showExtras() - -class Filter(Transformation): - """Filter out a subset of rows that match some predicate.""" - - def __init__(self,inner=None,by=lambda x:x): - Transformation.__init__(self,inner) - self.filterBy = by - - def rowGenerator(self): - for row in self.inner.rowGenerator(): - if self.filterBy(row): - yield row - - def explanation(self): - return self.inner.explanation() + [ 'filtered to %s' % self.tag ] - - def __str__(self): - return 'Filter(%s, by=%s)' % (View.asTag(self.inner),str(self.filterBy)) + self.showExtras() - -class Distinct(MapReduce): - """Remove duplicate rows.""" - - def __init__(self,inner=None,retaining=None): - MapReduce.__init__(self,[inner],retaining) - - def mapPlan(self): - plan = Plan() - plan.append(PrereduceStep(view=self,whatToDo='doDistinctMap',srcs=[self.inner.checkpoint()],dst=self.checkpoint(),why=self.explanation())) - return plan - - def rowGenerator(self): - """Extract distinct elements from a sorted list.""" - lastval = None - for line in sys.stdin: - valStr = line.strip() - val = self.planner._serializer.fromString(valStr) - if val != lastval and lastval: - yield lastval - lastval = val - if lastval: - yield lastval - - def explanation(self): - return self.inner.explanation() + [ 'make distinct to %s' % self.tag] - - def __str__(self): - return 'Distinct(%s)' % (View.asTag(self.inner)) + self.showExtras() - - def doDistinctMap(self): - self.inner.doStoreRows() - - -class Group(MapReduce): - """Group by some property of a row, defined with the 'by' option. 
- Default outputs are tuples (x,[r1,...,rk]) where the ri's are rows - that have 'by' values of x.""" - - def __init__(self,inner=None,by=lambda x:x,reducingTo=ReduceToList(),retaining=None): - MapReduce.__init__(self,[inner],retaining) - self.groupBy = by - self.reducingTo = reducingTo - - def mapPlan(self): - plan = Plan() - plan.append(PrereduceStep(view=self,whatToDo='doGroupMap',srcs=[self.inner.checkpoint()],dst=self.checkpoint(),why=self.explanation())) - return plan - - def rowGenerator(self): - """Group objects from stdin by key, yielding tuples (key,[g1,..,gn]).""" - lastkey = key = None - accum = self.reducingTo.baseType() - for line in sys.stdin: - keyStr,valStr = line.strip().split("\t") - key = self.planner._serializer.fromString(keyStr) - val = self.planner._serializer.fromString(valStr) - if key != lastkey and lastkey!=None: - yield (lastkey,accum) - accum = self.reducingTo.baseType() - accum = self.reducingTo.reduceBy(accum, val) - lastkey = key - if key: - yield (key,accum) - - def explanation(self): - return self.inner.explanation() + ['group to %s' % self.tag] - - def __str__(self): - return 'Group(%s,by=%s,reducingTo=%s)' % (View.asTag(self.inner),str(self.groupBy),str(self.reducingTo)) + self.showExtras() - - def doGroupMap(self): - self.doStoreKeyedRows(self.inner,self.groupBy,-1) - -class Join(MapReduce): - """Outputs tuples of the form (row1,row2,...rowk) where - rowi is from the i-th join input, and the rowi's have the same - value of the property being joined on.""" - - def __init__(self,*joinInputs): - #sets self.inners - MapReduce.__init__(self,map(lambda x:x.view, joinInputs),None) - self.joinInputs = joinInputs - #re-interpret the 'outer' join parameters - semantically - #if jin[i] is outer, then all other inputs must be marked as _padWithNulls - if any(map(lambda jin:jin.outer, self.joinInputs)): - assert len(self.joinInputs)==2,'outer joins are only supported on two-way joins '+str(self.joinInputs) - for i in range(len(self.joinInputs)): - if self.joinInputs[i].outer: - j = 1-i #the other index - self.joinInputs[j]._padWithNulls = True - - def acceptInnerView(self,otherView): - assert False, 'join cannot be RHS of a pipe - use JoinTo instead' - - def mapPlan(self): - plan = Plan() - innerCheckpoints = map(lambda v:v.checkpoint(), self.inners) - step = PrereduceStep(view=self, whatToDo='doJoinMap',srcs=innerCheckpoints,dst=self.checkpoint(),why=self.explanation()) - plan.append(step) - return plan - - def applyDict(self,mapping,innerviewsOnly=False): - result = MapReduce.applyDict(self,mapping,innerviewsOnly=innerviewsOnly) - #also need to map over the join inputs - if isinstance(result,Join): - for i in range(len(result.joinInputs)): - result.joinInputs[i].view = result.inners[i] - return result - - def rowGenerator(self): - """Group objects from stdin by key, yielding tuples (row1,row2,...).""" - lastkey = None - lastIndex = len(self.joinInputs)-1 - somethingProducedForLastKey = False - #accumulate a list of lists of all non-final inputs - accumList = [ [] for i in range(lastIndex) ] - for line in sys.stdin: - keyStr,indexStr,valStr = line.strip().split("\t") - key = self.planner._serializer.fromString(keyStr) - index = int(indexStr) - val = self.planner._serializer.fromString(valStr) - if key != lastkey and lastkey!=None: - #if the final join is marked as _padWithNulls, clear - #the accumulators, since we're doing an outer join - #with the last view - if self.joinInputs[lastIndex]._padWithNulls and not somethingProducedForLastKey: - for tup in 
self._joinAccumulatedValuesTo(accumList,lastIndex,None): - yield tup - #reset the accumulators, since they pertain to the - accumList = [ [] for i in range(lastIndex) ] - somethingProducedForLastKey = False - if index!=lastIndex: - #accumulate values to use in the join - accumList[index] = accumList[index] + [val] - else: - #produce tuples that match the key for the last view - for tup in self._joinAccumulatedValuesTo(accumList,lastIndex,val): - somethingProducedForLastKey = True - yield tup - lastkey = key - - def _joinAccumulatedValuesTo(self,accumList,lastIndex,finalVal): - # _padWithNulls as needed - for i in range(lastIndex): - if self.joinInputs[i]._padWithNulls and not accumList[i]: - accumList[i] = [None] - tupbuf = [ None for i in range(lastIndex+1) ] #holds output - tupbuf[lastIndex] = finalVal - for i in range(lastIndex): - for a in accumList[i]: - tupbuf[i] = a - if i==lastIndex-1 and any(tupbuf): - yield tuple(tupbuf) - - def explanation(self): - innerEx = [] - for inner in self.inners: - if innerEx: innerEx += ['THEN'] - innerEx += inner.explanation() - return innerEx + [ 'FINALLY join to %s' % self.tag ] - - def __str__(self): - return "Join(%s)" % ",".join(map(str,self.joinInputs)) + self.showExtras() - - def doJoinMap(self,i): - # called by joinMapPlan with argument index, and stdin pointing to innerCheckpoints[index] - self.doStoreKeyedRows(self.joinInputs[i].view,self.joinInputs[i].joinBy,i) - -class JoinTo(Join): - """Special case of Join which can be used as the RHS of a pipe operator.""" - - def __init__(self,joinInput,by=None): - Join.__init__(self,Jin(None,by),joinInput) - - def acceptInnerView(self,otherView): - self.joinInputs[0].view = otherView - self.inners[0] = otherView - -############################################################################## -# -# the top-level planner, and its supporting classes -# -############################################################################## - -class Plan(object): - """A plan constructed by a GuineaPig.""" - - def __init__(self): - self.steps = [] - self.tasks = [] - - def append(self,step): - self.steps.append(step) - - def includeStepsOf(self,subplan): - self.steps += subplan.steps - - def execute(self,gp,echo=False): - script = self.compile(gp) - for shellcom in script: - if echo: print 'calling:',shellcom - subprocess.check_call(shellcom,shell=True) - - def buildTasks(self): - """Group the steps into AbstractMapReduceTask's""" - self.tasks = [AbstractMapReduceTask()] - for step in self.steps: - if not self.tasks[-1].insert(step): - self.tasks.append(AbstractMapReduceTask()) - status = self.tasks[-1].insert(step) - assert status, 'failure to insert '+str(step)+' in fresh AbstractMapReduceTask' - - def compile(self,gp): - """Return a list of strings that can be run as shell commands.""" - self.buildTasks() - logging.info("%d steps converted to %d abstract map-reduce tasks" % (len(self.steps),len(self.tasks))) - script = [] - taskCompiler = GPig.getCompiler(gp.opts['target']) - for task in self.tasks: - script += taskCompiler.compile(task,gp) - return script - -# -# a single step in a plan produced by the planner -# - -class Step(object): - """A single step of the plans produced by the planner, along with the - methods to convert the plans into executable shell commands.""" - - def __init__(self,view): - self.view = view - self.reused = [] # list of views reused at this point - self.why = [] - - def setReusedViews(self,views): - self.reused = list(views) - - def explain(self): - """Convert an explanation - which 
-        return "...".join(self.why)
-
-#
-# specific kinds of steps in a plan
-#
-
-class DistributeStep(Step):
-    """Prepare a stored view for the distributed cache."""
-
-    def __init__(self,view):
-        Step.__init__(self,view)
-
-    def __str__(self):
-        return "DistributeStep(%s,reused=%s)" % (repr(self.view.tag),repr(self.reused))
-
-class TransformStep(Step):
-    """Transform input to output."""
-
-    def __init__(self,view,whatToDo,srcs,dst,why):
-        Step.__init__(self,view)
-        self.whatToDo = whatToDo
-        self.srcs = srcs
-        self.dst = dst
-        self.why = why
-
-    def __str__(self):
-        return "TransformStep("+",".join(map(repr, [self.view.tag,self.whatToDo,self.srcs,self.dst,self.reused]))+")"
-
-class PrereduceStep(Step):
-    """Produce the keyed input for a reduce; it can be followed in a task by a
-    TransformStep that acts as the reducer."""
-
-    def __init__(self,view,whatToDo,srcs,dst,why):
-        Step.__init__(self,view)
-        self.whatToDo = whatToDo
-        self.srcs = srcs
-        self.dst = dst
-        self.why = why
-
-    def __str__(self):
-        return "PrereduceStep("+",".join(map(repr, [self.view.tag,self.whatToDo,self.srcs,self.dst,self.reused]))+")"
-
-# combine steps into something executable via hadoop - or the shell
-
-class AbstractMapReduceTask(object):
-    """A collection of steps that can be executed as a single map-reduce operation,
-    possibly with some file management steps to set up the task."""
-
-    def __init__(self):
-        self.distributeSteps = []
-        self.mapStep = None
-        self.reduceStep = None
-
-    def insert(self,step):
-        """Treating the AbstractMapReduceTask as a buffer, add this step to it if possible."""
-        if isinstance(step,DistributeStep):
-            #we can accept any number of distribute steps
-            self.distributeSteps.append(step)
-            return True
-        elif self.mapStep==None and (isinstance(step,TransformStep) or isinstance(step,PrereduceStep)):
-            #we can only have one map step, so fill up an empty slot if possible
-            self.mapStep = step
-            return True
-        elif self.mapStep and isinstance(self.mapStep,PrereduceStep) and isinstance(step,TransformStep) and not self.reduceStep:
-            #if the map step is a prereduce, then we can also allow a reduce step
-            self.reduceStep = step
-            return True
-        else:
-            return False
-
-    def __str__(self):
-        buf = "mapreduce task:"
-        for step in self.distributeSteps:
-            buf += "\n - d "+str(step)
-        buf += "\n - m " + str(self.mapStep)
-        if self.reduceStep:
-            buf += "\n - r " + str(self.reduceStep)
-        return buf
-
-class MRCompiler(object):
-    """Abstract compiler class to convert a task to a list of commands that can be executed by the shell."""
-
-    def compile(self,task,gp):
-        script = []
-        # an explanation/header
-        if not task.reduceStep:
-            script += ['echo create '+task.mapStep.view.tag + ' via map: ' + task.mapStep.explain()]
-        else:
-            script += ['echo create '+task.reduceStep.view.tag +' via map/reduce: '+task.reduceStep.explain()]
-        for step in task.distributeSteps:
-            localCopy = step.view.distributableFile()
-            maybeRemoteCopy = step.view.storedFile()
-            echoCom = 'echo distribute %s: making a local copy of %s in %s' % (step.view.tag,maybeRemoteCopy,localCopy)
-            script += [echoCom] + self.distributeCommands(task, gp, maybeRemoteCopy,localCopy)
-        if not task.reduceStep and len(task.mapStep.srcs)==1:
-            mapCom = self._coreCommand(task.mapStep,gp)
-            script += self.simpleMapCommands(task, gp, mapCom, task.mapStep.srcs[0], task.mapStep.dst)
-        elif task.reduceStep and len(task.mapStep.srcs)==1:
-            mapCom = self._coreCommand(task.mapStep,gp)
-            reduceCom = self._coreCommand(task.reduceStep,gp)
-            script += self.simpleMapReduceCommands(task, gp, mapCom, reduceCom, task.mapStep.srcs[0], task.reduceStep.dst)
-        elif task.reduceStep and len(task.mapStep.srcs)>1:
-            mapComs = [self._ithCoreCommand(task.mapStep,gp,i) for i in range(len(task.mapStep.srcs))]
-            reduceCom = self._coreCommand(task.reduceStep,gp)
-            midpoint = gp.opts['viewdir']+'/'+task.mapStep.view.tag+'.gpmo'
-            script += self.joinCommands(task, gp, mapComs, reduceCom, task.mapStep.srcs, midpoint, task.reduceStep.dst)
-        else:
-            assert False,'cannot compile task '+str(task)
-        return script
-
-    # abstract routines
-
-    def distributeCommands(self,task,gp,maybeRemoteCopy,localCopy):
-        """Distribute the remote copy to the local directory."""
-        assert False, 'abstract method called'
-
-    def simpleMapCommands(self,task,gp,mapCom,src,dst):
-        """A map-only task with zero or one inputs."""
-        assert False, 'abstract method called'
-
-    def simpleMapReduceCommands(self,task,gp,mapCom,reduceCom,src,dst):
-        """A map-reduce task with one input."""
-        assert False, 'abstract method called'
-
-    def joinCommands(self,task,gp,mapComs,reduceCom,srcs,midpoint,dst):
-        """A map-reduce task with several inputs."""
-        assert False, 'abstract method called'
-
-    # utilities
-
-    def _stepSideviewFiles(self,step):
-        files = []
-        for sv in step.view.sideviewsNeeded():
-            files += [sv.distributableFile()]
-        return files
-
-    def _coreCommand(self,step,gp):
-        """Python command to call an individual plan step."""
-        return 'python %s --view=%s --do=%s' % (gp._gpigSourceFile,step.view.tag,step.whatToDo) + self.__coreCommandOptions(step,gp)
-
-    def _ithCoreCommand(self,step,gp,i):
-        """Like _coreCommand but allows an index parameter for the 'do' option."""
-        return 'python %s --view=%s --do=%s.%d' % (gp._gpigSourceFile,step.view.tag,step.whatToDo,i) + self.__coreCommandOptions(step,gp)
-
-    def __coreCommandOptions(self,step,gp):
-        paramOpts = '' if not gp.param else " --params " + ",".join(map(lambda(k,v):k+':'+v, gp.param.items()))
-        nonDefaults = []
-        for (k,v) in gp.opts.items():
-            #pass in non-default options, or options computed from the environment
-            if (gp.opts[k] != GPig.DEFAULT_OPTS[k]) or ((k in GPig.COMPUTED_OPTION_DEFAULTS) and (gp.opts[k] != GPig.COMPUTED_OPTION_DEFAULTS[k])):
-                nonDefaults += ["%s:%s" % (k,str(v))]
-        optsOpts = '' if not nonDefaults else " --opts " + ",".join(nonDefaults)
-        reuseOpts = '' if not step.reused else " --reuse "+ " ".join(step.reused)
-        return paramOpts + optsOpts + reuseOpts
-
-
-class ShellCompiler(MRCompiler):
-    """Compile tasks to commands that are executable in most Unix shells."""
-
-    def distributeCommands(self,task,gp,maybeRemoteCopy,localCopy):
-        """Distribute the remote copy to the local directory."""
-        return ['cp -f %s %s || echo warning: the copy failed!' % (maybeRemoteCopy,localCopy)]
-
-    def simpleMapCommands(self,task,gp,mapCom,src,dst):
-        """A map-only job with zero or one inputs."""
-        if src: return [mapCom + ' < %s > %s' % (src,dst)]
-        else: return [mapCom + (' > %s' % (dst))]
-
-    def simpleMapReduceCommands(self,task,gp,mapCom,reduceCom,src,dst):
-        """A map-reduce job with one input."""
-        return [mapCom + ' < ' + src + ' | sort -k1 | '+reduceCom + ' > ' + dst]
-
-    def joinCommands(self,task,gp,mapComs,reduceCom,srcs,midpoint,dst):
-        """A map-reduce job with several inputs."""
-        subplan = ['rm -f %s' % midpoint]
-        for i,ithMapCom in enumerate(mapComs):
-            subplan += [ithMapCom + ' < ' + srcs[i] + ' >> ' + midpoint]
-        subplan += [ 'sort -k1,2 < ' + midpoint + ' | ' + reduceCom + ' > ' + dst]
-        return subplan
-
-class HadoopCompiler(MRCompiler):
-    """Compile tasks to commands that are executable in most Unix shells
-    after hadoop has been installed."""
-
-    def distributeCommands(self,task,gp,maybeRemoteCopy,localCopy):
-        return ['rm -f %s' % localCopy, '%s fs -getmerge %s %s' % (GPig.HADOOP_LOC,maybeRemoteCopy,localCopy)]
-
-    def simpleMapCommands(self,task,gp,mapCom,src,dst):
-        assert src,'Wrap not supported for hadoop'
-        hcom = self.HadoopCommandBuf(gp,task)
-        hcom.extendDef('-D','mapred.reduce.tasks=0')
-        hcom.extend('-input',src,'-output',dst)
-        hcom.extend("-mapper '%s'" % mapCom)
-        return [ self._hadoopCleanCommand(gp,dst), hcom.asEcho(), hcom.asString() ]
-
-    def simpleMapReduceCommands(self,task,gp,mapCom,reduceCom,src,dst):
-        hcom = self.HadoopCommandBuf(gp,task)
-        hcom.extendDef('-D','mapred.reduce.tasks=%d' % gp.opts['parallel'])
-        hcom.extend('-input',src,'-output',dst)
-        hcom.extend("-mapper '%s'" % mapCom)
-        hcom.extend("-reducer '%s'" % reduceCom)
-        return [ self._hadoopCleanCommand(gp,dst), hcom.asEcho(), hcom.asString() ]
-
-    def joinCommands(self,task,gp,mapComs,reduceCom,srcs,midpoint,dst):
-        def midi(i): return midpoint + '-' + str(i)
-        subplan = []
-        for i in range(len(srcs)):
-            hcom = self.HadoopCommandBuf(gp,task)
-            hcom.extendDef('-D','mapred.reduce.tasks=%d' % gp.opts['parallel'])
-            hcom.extend('-input',srcs[i], '-output',midi(i))
-            hcom.extend("-mapper","'%s'" % mapComs[i])
-            subplan += [ self._hadoopCleanCommand(gp,midi(i)), hcom.asEcho(), hcom.asString() ]
-        hcombineCom = self.HadoopCommandBuf(gp,task)
-        hcombineCom.extendDef('-D','mapred.reduce.tasks=%d' % gp.opts['parallel'])
-        hcombineCom.extendDef('-jobconf','stream.num.map.output.key.fields=3')
-        hcombineCom.extendDef('-jobconf','num.key.fields.for.partition=1')
-        for i in range(len(srcs)):
-            hcombineCom.extend('-input',midi(i))
-        hcombineCom.extend('-output',dst)
-        hcombineCom.extend('-mapper','cat')
-        hcombineCom.extend('-reducer',"'%s'" % reduceCom)
-        hcombineCom.extend('-partitioner','org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner')
-        subplan += [ self._hadoopCleanCommand(gp,dst), hcombineCom.asEcho(), hcombineCom.asString() ]
-        return subplan
-
-    class HadoopCommandBuf(object):
-        """Utility to hold the various pieces of a hadoop command."""
-        def __init__(self,gp,task):
-            logging.debug('building hadoop command for '+str(task.mapStep.view.tag))
-            self.invocation = [GPig.HADOOP_LOC,'jar',gp.opts['streamJar']]
-            self.defs = []
-            self.args = []
-            self.files = []
-            for f in gp._shippedFiles:
-                self.files += ['-file',f]
-            for sv in task.mapStep.view.sideviewsNeeded():
-                self.files += ['-file',sv.distributableFile()]
-            if task.reduceStep:
-                for sv in task.reduceStep.view.sideviewsNeeded():
-                    self.files += ['-file',sv.distributableFile()]
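For concreteness, here is roughly the shape of the single streaming invocation that asString() (below) assembles from the invocation, defs, files, and args pieces, sketched as a Python comment; the jar path, view names, and file names are illustrative, not taken from this patch:

# hadoop jar /usr/lib/hadoop/contrib/streaming/hadoop-streaming.jar \
#   -D mapred.reduce.tasks=5 \
#   -file guineapig.py -file wordcount.py \
#   -input gpig_views/words.gp -output gpig_views/wc.gp \
#   -mapper 'python wordcount.py --view=wc --do=doGroupMap' \
#   -reducer 'python wordcount.py --view=wc --do=doStoreRows'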
-            logging.debug('files: '+str(self.files))
-        def extend(self,*toks):
-            self.args += list(toks)
-        def extendDef(self,*toks):
-            self.defs += list(toks)
-        def asEcho(self):
-            return " ".join(['echo','hadoop'] + self.args + ['...'])
-        def asString(self):
-            return " ".join(self.invocation+self.defs+self.files+self.args)
-
-    def _hadoopCleanCommand(self,gp,fileName):
-        """A command to remove an HDFS directory if it exists."""
-        return '(%s fs -test -e %s && %s fs -rmr %s) || echo no need to remove %s' % (GPig.HADOOP_LOC,fileName, GPig.HADOOP_LOC,fileName, fileName)
-
-#
-# replaceable object to save objects to disk and retrieve them
-#
-
-class RowSerializer(object):
-    """Saves row objects to disk and retrieves them."""
-    def __init__(self):
-        self.evaluator = GPig.SafeEvaluator()
-    def toString(self,x):
-        return repr(x)
-    def fromString(self,s):
-        return self.evaluator.eval(s)
-
-#
-# the planner
-#
-
-class Planner(object):
-    """Can create storage plans for views that are defined as parts of it."""
-
-    def __init__(self,**kw):
-
-        #parameters are used to programmatically give user-defined
-        #config information to a planner, or they can be specified
-        #on the command line
-        self.param = kw
-        for (key,val) in GPig.getArgvParams().items():
-            # don't override non-null values specified in the constructor
-            if self.param.get(key)==None:
-                self.param[key] = val
-
-        #opts are used for giving options to the planner from the shell
-        self.opts = GPig.getArgvOpts()
-        for (key,val) in GPig.DEFAULT_OPTS.items():
-            if (not key in self.opts): self.opts[key] = val
-        for (key,type) in GPig.DEFAULT_OPT_TYPES.items():
-            self.opts[key] = type(self.opts[key])
-
-        #use the appropriate serializer for the target
-        self._serializer = RowSerializer()
-
-        #views that aren't associated with a class variable, but are
-        #instead named automatically - i.e., inner views with no
-        #user-provided names
-        self._autoNamedViews = {}
-
-        #by default, use info-level logging at planning time
-        if not Planner.partOfPlan(sys.argv):
-            logging.basicConfig(level=logging.INFO)
-
-        #hadoop needs to be given the main script file,
-        #as well as the guineapig.py file it uses
-        self._gpigSourceFile = sys.argv[0]
-        self._shippedFiles = [GPig.MY_LOC,self._gpigSourceFile]
-
-    def setup(self):
-        """Initialize the planner and the views it uses. This has to be
-        done after the planner is fully configured by adding views."""
-
-        self.reusableViews = {}
-        # make sure the view directory is valid
-        if self.opts['target']=='shell' and not os.path.exists(self.opts['viewdir']):
-            logging.info('creating view directory ' + self.opts['viewdir'])
-            os.makedirs(self.opts['viewdir'])
-        elif self.opts['target']=='hadoop':
-            p = urlparse.urlparse(self.opts['viewdir'])
-            if not p.path.startswith("/"):
-                logging.warn('hadoop viewdir should be an absolute path: will try prefixing /user/$LOGNAME')
-                username = os.environ.get('LOGNAME','me')
-                self.opts['viewdir'] = '/user/'+username+'/'+self.opts['viewdir']
-                logging.warn('viewdir is set to '+self.opts['viewdir'])
-
-        # Add 'tag' and planner fields to each view
-        for vname in self.listViewNames():
-            v = self.getView(vname)
-            v.tag = vname
-            v.planner = self
-        def tagUnnamedViews(v,basename,index,depth):
-            assert v,'null inner view for '+basename
-            if not v.planner:
-                v.planner = self
-                autoname = '%s_%d_%s' % (basename,depth,index)
-                self._setView(autoname,v)
-                for i,inner in enumerate(v.inners + v.sideviews):
-                    tagUnnamedViews(inner,vname,i,depth+1)
-        for vname in self.listViewNames():
-            v = self.getView(vname)
-            for i,inner in enumerate(v.inners + v.sideviews):
-                tagUnnamedViews(inner,vname,i,1)
-
-        # Add caching options as needed
-        for vname in self.listViewNames():
-            v = self.getView(vname)
-            v.enforceStorageConstraints()
-
-    #
-    # utils
-    #
-
-    def getView(self,str,mustExist=False):
-        """Find the defined relation named str, and if necessary bind its
-        planner and tag appropriately."""
-        v = self.__class__.__dict__.get(str) or self.__dict__.get(str) or self._autoNamedViews.get(str)
-        if mustExist: assert v,'cannot find a view named '+str
-        return v
-
-    def _setView(self,str,view):
-        """Internal use only: allow the view to be retrieved by name later."""
-        view.tag = str
-        self._autoNamedViews[str] = view
-
-    def listViewNames(self):
-        def namedViews(d): return [vname for vname in d.keys() if isinstance(self.getView(vname),View)]
-        userNamedViews = namedViews(self.__class__.__dict__) + namedViews(self.__dict__)
-        return userNamedViews + self._autoNamedViews.keys()
-
-    #
-    # planning
-    #
-
-    def buildRecursiveStoragePlan(self,view):
-        """Called by view.storagePlan."""
-        #figure out what to reuse - starting with what the user specified
-        storedViews = dict(self.reusableViews)
-        #also mark for eager storage anything that's used twice in the
-        #plan---i.e., anything that is consumed by two or more views
-        numParents = collections.defaultdict(int)
-        for dv in self._descendants(view):
-            for inner in dv.inners + dv.sideviews:
-                numParents[inner] += 1
-        for (dv,n) in numParents.items():
-            if n>1 and dv.storeMe==None:
-                logging.info('making %s stored because it is used %d times in creating %s' % (dv,n,view.tag))
-                dv.storeMe = True
-
-        #traverse the view in pre-order and find a linear sequence of
-        #views to store, each of which requires only views earlier in
-        #the sequence
-        storageSeq = self._storageSeq(view,storedViews) + [view.tag]
-        logging.info('storage sequence is: ' + ",".join(storageSeq))
-
-        #splice together plans for each view in the sequence,
-        #after first modifying the view so that nothing is called
-        #directly, but only through the ReuseView proxies
-        plan = Plan()
-        for tag in storageSeq:
-            v = self.getView(tag,mustExist=True)
-            vm = v.applyDict(storedViews,innerviewsOnly=True)
-            subplan = vm.nonrecursiveStoragePlan()
-            #add the correct context of reused views to the subplan,
-            #so that the actual definition of the view will be
-            #rewritten appropriately to include the new ReuseView
-            #proxy for it
-            viewsLocallyReused = self._reuseViewDescendants(vm)
-            for s in subplan.steps:
-                s.setReusedViews(viewsLocallyReused)
-            plan.includeStepsOf(subplan)
-        return plan
-
-    def _reuseViewDescendants(self,view):
-        """Descendant views that are ReuseViews."""
-        result = set()
-        for dv in self._descendants(view):
-            if isinstance(dv,ReuseView):
-                result.add(dv.reusedViewTag)
-        return result
-
-    def _descendants(self,view):
-        """Descendants of a view."""
-        result = set()
-        result.add(view)
-        for inner in view.inners + view.sideviews:
-            result = result.union(self._descendants(inner))
-        return result
-
-    def _storageSeq(self,view,storedViews):
-        """Linear sequence of storage actions to take - as view tags."""
-        seq = []
-        for inner in view.inners + view.sideviews:
-            if not inner.tag in storedViews:
-                seq += self._storageSeq(inner,storedViews)
-                if inner.storeMe:
-                    seq += [inner.tag]
-                    storedViews[inner.tag] = ReuseView(inner)
-        return seq
-
-    #
-    # dealing with the file storage system and related stuff
-    #
-
-    def ship(self,*fileNames):
-        """Declare a set of inputs to be 'shipped' to the hadoop cluster."""
-        self._shippedFiles += fileNames
-
-    def setSerializer(self,serializer):
-        """Replace the default serializer with another RowSerializer object."""
-        self._serializer = serializer
-        return self
-
-    def setEvaluator(self,rowEvaluator):
-        """Specify a function which will deserialize a string that was produced
-        by Python's 'repr' function."""
-        self._serializer.evaluator = rowEvaluator
-        return self
-
-    #
-    # rest of the API for the planner
-    #
-
-    @staticmethod
-    def partOfPlan(argv):
-        """True if the command line was generated as part of a storage plan."""
-        return any(s.startswith("--do") for s in argv)
-
-    def main(self,argv):
-        """Run a main that lets you --store a view, as well as doing a few other things."""
-        self.setup()
-        self.runMain(argv)
-
-    def runMain(self,argv):
-
-        # parse the options and dispatch appropriately
-        argspec = ["store=", "cat=", "reuse",
-                   "list", "pprint=", "steps=", "tasks=", "plan=",
-                   "params=", "opts=", "do=", "view="]
-        optlist,args = getopt.getopt(argv[1:], 'x', argspec)
-        optdict = dict(optlist)
-
-        # decide which views can be re-used, vs which need fresh plans
-        if '--reuse' in optdict: #reuse the views listed in the arguments
-            for a in args:
-                vname = View.viewNameFor(a)
-                v = self.getView(vname)
-                if v:
-                    self.reusableViews[v.tag] = ReuseView(v)
-                    logging.info("re-using data stored for view "+vname+": "+str(v))
-                else:
-                    logging.warn("cannot re-use view "+vname+" since it's not used in this script")
-
-        #choose the main action to take
-        if '--store' in optdict: #store a view
-            rel = self.getView(optdict['--store'],mustExist=True)
-            plan = rel.storagePlan()
-            plan.execute(self, echo=self.opts['echo'])
-            return
-        elif '--pprint' in optdict: #print a view
-            rel = self.getView(optdict['--pprint'],mustExist=True)
-            rel.applyDict(self.reusableViews).pprint()
-            return
-        elif '--steps' in optdict: #print steps to produce a view
-            rel = self.getView(optdict['--steps'],mustExist=True)
-            plan = rel.storagePlan()
-            for s in plan.steps:
-                print ' -',s
-            return
-        elif '--tasks' in optdict: #print AbstractMapReduceTasks to produce a view
-            rel = self.getView(optdict['--tasks'],mustExist=True)
-            plan = rel.storagePlan()
-            plan.buildTasks()
-            for t in plan.tasks:
-                print t
-            return
-        elif '--plan' in optdict: #print a storage plan
-            rel = self.getView(optdict['--plan'],mustExist=True)
-            plan = rel.storagePlan()
-            script = plan.compile(self)
-            print "\n".join(script)
-            return
-        elif '--cat' in optdict: #store and then print a view
-            assert self.opts['target']=='shell','cannot do --cat except in shell mode'
-            rel = self.getView(optdict['--cat'],mustExist=True)
-            plan = rel.storagePlan()
-            plan.execute(self, self.opts['echo'])
-            for line in open(rel.storedFile(),'r'):
-                print line,
-            return
-        elif '--list' in optdict: #list named views
-            for vname in self.listViewNames():
-                print ' ',vname,'\t',self.getView(vname)
-            return
-        elif '--do' in optdict: #run an internally-generated action
-            #recover what should be stored when this action is performed,
-            #work out what view to use, and what routine to call
-            rel = self.getView(optdict['--view'],mustExist=True)
-            rel = rel.applyDict(self.reusableViews)
-            whatToDo = optdict['--do']
-            #work out the method given by 'do' and call it - note it
-            #may have a single integer argument, eg doJoinMap.1
-            k = whatToDo.find(".")
-            if k<0:
-                whatToDoMethod = getattr(rel,whatToDo)
-                whatToDoMethod()
-            else:
-                arg = int(whatToDo[k+1:])
-                whatToDo = whatToDo[:k]
-                whatToDoMethod = getattr(rel,whatToDo)
-                whatToDoMethod(arg)
-            return
-        else:
-            print 'usage: --[store|pprint|steps|tasks|plan|cat] view [--opts key:val,...] [--params key:val,...] [--reuse view1 view2 ...]'
-            print '       --[list]'
-            print 'current legal keys for "opts", with default values:'
-            for (key,val) in GPig.DEFAULT_OPTS.items():
-                print '  %s:%s' % (key,str(val))
-            print 'There\'s more help at http://curtis.ml.cmu.edu/w/courses/index.php/Guinea_Pig'
-
-if __name__ == "__main__":
-    print 'There\'s help at http://curtis.ml.cmu.edu/w/courses/index.php/Guinea_Pig'
diff --git a/tutorial/guineapig1_1.py b/tutorial/guineapig1_1.py
deleted file mode 100644
index 93c5fbf..0000000
--- a/tutorial/guineapig1_1.py
+++ /dev/null
@@ -1,1244 +0,0 @@
-##############################################################################
-# (C) Copyright 2014 William W. Cohen. All rights reserved.
-############################################################################## - -import sys -import logging -import getopt -import os -import os.path -import subprocess -import collections -import urlparse -import csv - -############################################################################### -# helpers and data structures -############################################################################### - -class GPig(object): - """Collection of utilities for Guinea Pig.""" - - HADOOP_LOC = 'hadoop' #assume hadoop is on the path at planning time - - #global options for Guinea Pig can be passed in with the --opts - #command-line option, and these are the default values - envjar = os.environ.get('GP_STREAMJAR') - defaultJar = '/usr/lib/hadoop/contrib/streaming/hadoop-streaming-1.2.0.1.3.0.0-107.jar' - DEFAULT_OPTS = {'streamJar': envjar or defaultJar, - 'parallel':5, - 'target':'shell', - 'echo':0, - 'viewdir':'gpig_views', - } - #there are the types of each option that has a non-string value - DEFAULT_OPT_TYPES = {'parallel':int,'echo':int} - #we need to pass non-default options in to mappers and reducers, - #but since the remote worker's environment can be different, we - #also need to pass in options computed from the environment - COMPUTED_OPTION_DEFAULTS = {'streamJar':defaultJar} - - @staticmethod - def getArgvParams(): return GPig.getArgvDict('--params') - - @staticmethod - def getArgvOpts(): return GPig.getArgvDict('--opts') - - @staticmethod - def getArgvDict(optname): - """Return a dictionary of parameter values that were defined on the command line - view an option like '--params filename:foo.txt,basedir:/tmp/glob/'. - """ - assert optname.startswith('--') - for i,a in enumerate(sys.argv): - if a==optname: - paramString = sys.argv[i+1] - return dict(pair.split(":") for pair in paramString.split(",")) - return {} - - @staticmethod - def rowsOf(view): - """Iterate over the rows in a view.""" - for line in open(view.distributableFile()): - yield view.planner._serializer.fromString(line.strip()) - - @staticmethod - def onlyRowOf(view): - """Return the first row in a side view, and raise an error if it - is not the only row of the view.""" - result = None - logging.info('loading '+view.distributableFile()) - for line in open(view.distributableFile()): - assert not result,'multiple rows in stored file for '+view.tag - result = view.planner._serializer.fromString(line.strip()) - return result - -class Jin(object): - """"Object to hold descripton of a single join input.""" - - def __init__(self,view,by=(lambda x:x),outer=False): - self.view = view - self.joinBy = by - self.outer = outer - self._padWithNulls = False - - def __str__(self): - if self.view: viewStr = View.asTag(self.view) - else: viewStr = '_' - if self.outer: outerStr = ',outer=True' - else: outerStr = '' - if self._padWithNulls: padStr = ',_padWithNulls=True' - else: padStr = '' - return "Jin(%s,by=%s%s%s)" % (viewStr,self.joinBy,outerStr,padStr) - -class ReduceTo(object): - """An object x that can be the argument of a reducingTo=x - parameter in a Group view.""" - def __init__(self,baseType,by=lambda accum,val:accum+val): - self.baseType = baseType - self.reduceBy = by - -class ReduceToCount(ReduceTo): - """Produce the count of the number of objects that would be placed in a group.""" - def __init__(self): - ReduceTo.__init__(self,int,by=lambda accum,val:accum+1) - -class ReduceToSum(ReduceTo): - """Produce the count of the number of objects that would be placed in a group.""" - def __init__(self): - 
ReduceTo.__init__(self,int,by=lambda accum,val:accum+val) - -class ReduceToList(ReduceTo): - """Produce a list of the objects that would be placed in a group.""" - def __init__(self): - ReduceTo.__init__(self,list,by=lambda accum,val:accum+[val]) - -############################################################################### -# abstract views -############################################################################## - -class View(object): - """A relation object for guineaPig. A view - usually abbreviated rel, - r, s, t, - can be "materialized" to produce and unordered bag of - rows - usually abbreviated ro. A row is just an arbitrary python - data structure, which must be something that can be stored and - retrieved by the RowSerializer. A row can be anything, but often - the top-level structure is a python tuple (a,b,c,...) or a dict - mapping small integers 0,1,... to different parts of the row. - - A guineapig "planner" knows how to construct "plans" that store - materialized relations on disk. These plans sometimes include - creating 'checkpoints', which are things places on disk, often - stored materialized relations, or sometimes intermediate outputs - or these.""" - - def __init__(self): - """The planner and tag must be set before this is used.""" - self.planner = None #pointer to planner object - self.tag = None #for naming storedFile and checkpoints - self.storeMe = None #try and store this view if true - self.retainedPart = None #used in map-reduce views only - - # - # ways to modify a view - # - - def opts(self,stored=None): - """Return the same view with options set appropriately.e""" - self.storeMe = stored - return self - - def showExtras(self): - """Printable representation of the options for a view.""" - result = '' - flagPairs = [] - if self.storeMe: flagPairs += ['stored=%s' % repr(self.storeMe)] - if flagPairs: result += '.opts(' + ",".join(flagPairs) + ')' - return result - - # - # how the view is saved on disk - # - - def storedFile(self): - """The file that will hold the materialized relation.""" - return self.planner.opts['viewdir'] + '/' + self.tag + '.gp' - - def distributableFile(self): - """The file that will hold the materialized relation in the working directory - in preparation to be uploaded to the distributed cache.""" - return self.tag + '.gp' - - @staticmethod - def viewNameFor(fileName): - """The view associated with the given file name""" - vname = os.path.basename(fileName) - if vname.endswith(".gp"): vname = vname[0:-len(".gp")] - elif vname.endswith(".done"): vname = vname[0:-len(".done")] - return vname - - # - # semantics of the view - # - - def checkpoint(self): - """A checkpoint is a file that is created in the course of - materializing a view. This function is the latest checkpoint - from which the the relation can be materialized.""" - if self.storeMe: return self.storedFile() - else: return self.unstoredCheckpoint() - - def unstoredCheckpoint(self): - """Checkpoint for this view, if the storeMe flag is not set.""" - assert False, 'abstract method called' - - def checkpointPlan(self): - """A plan to produce checkpoint(). 
Plans are constructed with the - help of the planner, and steps in the plan are executed by - delegation, thru the planner, to methods of this class named - doFoo.""" - if self.storeMe: return self.storagePlan() - else: return self.unstoredCheckpointPlan() - - def unstoredCheckpointPlan(self): - """Plan to produce the checkpoint for this view, if the - storeMe flag is not set.""" - assert False, 'abstract method called' - - def rowGenerator(self): - """A generator for the rows in this relation.""" - if self.storeMe and (self.tag in self.planner.alreadyStored): - for line in sys.stdin: - yield self.planner._serializer.fromString(line.strip()) - else: - for row in self.unstoredRowGenerator(): - yield row - - def explanation(self): - """Return an explanation of how rows are generated.""" - if self.storeMe and (self.tag in self.planner.viewsPlannedToExist): - return ['read %s with %s' % (self.storedFile(),self.tag)] - else: - return self.unstoredExplanation() - - def unstoredExplanation(self): - """Return an explanation of how rows were generated, ignoring caching issues.""" - assert False,'abstract method called' - - def storagePlan(self): - """A plan to materialize the relation. """ - if self.storeMe and self.tag in self.planner.viewsPlannedToExist: - return Plan() - else: - #WARNING: these computations have to be done in the right order, since planning has the side effect of updating - #the filePlannedToExist predicate. - # 1) build the pre-plan, to set up the view's checkpoints - plan = self.unstoredCheckpointPlan() - # 2a) compute the next step of the plan, along with the explanation - step = Step(self,'doStoreRows',self.unstoredCheckpoint(),self.storedFile(), - why=self.explanation(), - existingViews=set(self.planner.viewsPlannedToExist)) #shallow copy of current state - result = plan.extend( step ) - # 2b) if necessary, add a step to upload the - - # 3) Record that this file has been stored for lated calls to explanation() and storagePlan() - logging.debug('marking %s as planned-to-exist' % self.tag) - self.planner.viewsPlannedToExist.add(self.tag) - # 4) return the result - return result - - def doStoreRows(self): - """Called by planner at execution time to store the rows of the view.""" - for row in self.rowGenerator(): - print self.planner._serializer.toString(row) - - # - # traversing and defining views - # - - def innerViews(self): - """List of all views that are used as direct inputs.""" - return [] - - def nonInnerPrereqViews(self): - """List any non-inner views that need to be created before the view is executed.""" - return [] - - def __or__(self,otherView): - """Overload the pipe operator x | y to return with y, with x as its inner view.""" - otherView.acceptInnerView(self) - return otherView - - def acceptInnerView(self,otherView): - """Replace an appropriate input view with otherView. 
To be - used with the pipe operator.""" - assert False,'abstract method called' - - # - # meta plans - sequence of store commands - # - def metaplan(self,previouslyExistingViews): - plannedViews = set(previouslyExistingViews) #copy - return self._metaplanTraverse(plannedViews) + [self.tag] - - def _metaplanTraverse(self,plannedViews): - mplan = [] - try: - sideInnerviews = self.sideviews - except AttributeError: - sideInnerviews = [] - for inner in self.innerViews() + sideInnerviews: - if not inner.tag in plannedViews: - mplan += inner._metaplanTraverse(plannedViews) - if inner.storeMe: - mplan += [inner.tag] - plannedViews.add(inner.tag) - return mplan - - # - # printing views - # - - def pprint(self,depth=0,alreadyPrinted=None,sideview=False): - """Print a readable representation of the view.""" - if alreadyPrinted==None: alreadyPrinted = set() - tabStr = '| ' * depth - tagStr = str(self.tag) - sideviewIndicator = '*' if sideview else '' - if self in alreadyPrinted: - print tabStr + sideviewIndicator + tagStr + ' = ' + '...' - else: - sideViewInfo = " sideviews: {"+",".join(map(lambda x:x.tag, self.nonInnerPrereqViews()))+"}" if self.nonInnerPrereqViews() else "" - print tabStr + sideviewIndicator + tagStr + ' = ' + str(self) + sideViewInfo - alreadyPrinted.add(self) - for inner in self.innerViews(): - inner.pprint(depth+1,alreadyPrinted) - try: - for inner in self.sideviews: - inner.pprint(depth+1,alreadyPrinted,sideview=True) - except AttributeError: - pass - - @staticmethod - def asTag(view): - """Helper for printing views.""" - if not view: return '(null view)' - elif view.tag: return view.tag - else: return str(view) - -# -# abstract view types -# - -class Reader(View): - """Wrapper around a stored file relation.""" - - def __init__(self,src): - View.__init__(self) - self.src = src - - def unstoredCheckpoint(self): - return self.src - - def unstoredCheckpointPlan(self): - return Plan() - - def unstoredExplanation(self): - return [ 'read %s with %s' % (str(self.src),self.tag) ] - -class Transformation(View): - """A relation that takes an inner relation and processes in a - stream-like way, including operators like project, flatten, - select.""" - - def __init__(self,inner=None): - View.__init__(self) - self.inner = inner - - def innerViews(self): - return [self.inner] - - def nonInnerPrereqViews(self): - assert self.inner, 'no inner view defined for ' + str(self) - return self.inner.nonInnerPrereqViews() - - def acceptInnerView(self,otherView): - assert not self.inner,'An inner view is defined for '+self.tag+' so you cannot use it as RHS of a pipe' - self.inner = otherView - - # The transformation will stream through the inner relation, - # and produce a new version, so the latest checkpoint and - # plan to produce it are delegated to the inner View. 
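To make this delegation concrete, here is a minimal planner script in the tutorial's word-count style (the corpus file name is illustrative): the Flatten transformation streams rows straight out of ReadLines, so only the Group at the end introduces a checkpoint of its own.

import sys
from guineapig import *

class WordCount(Planner):
    # each line is split into words, which are then grouped and counted
    wc = ReadLines('corpus.txt') \
        | Flatten(by=lambda line: line.strip().split()) \
        | Group(by=lambda w: w, reducingTo=ReduceToCount())

if __name__ == "__main__":
    WordCount().main(sys.argv)

Running `python wordcount.py --store wc` would then materialize the counts in gpig_views/wc.gp.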
- - def unstoredCheckpoint(self): - return self.inner.checkpoint() - - def unstoredCheckpointPlan(self): - plan = Plan() - plan.append(self.inner.checkpointPlan()) - return plan - - def unstoredExplanation(self): - return self.inner.explanation() + [ 'transform to %s' % self.tag ] - -class MapReduce(View): - """A view that takes an inner relation and processes in a - map-reduce-like way.""" - - def __init__(self,inners,retaining): - View.__init__(self) - self.inners = inners - self.retainedPart = retaining - - def innerViews(self): - return self.inners - - def nonInnerPrereqViews(self): - result = [] - for inner in self.inners: - result += inner.nonInnerPrereqViews() - return result - - def reduceInputFile(self): - ## the checkpoint is the reducer input file - return self.planner.opts['viewdir'] + '/' + self.tag + '.gpri' - - @staticmethod - def isReduceInputFile(fileName): - return fileName.endswith('.gpri') - - def unstoredCheckpoint(self): - return self.reduceInputFile() - - def unstoredCheckpointPlan(self): - plan = Plan() - for inner in self.inners: - plan = plan.append(inner.checkpointPlan()) - return plan.append(self.mapPlan()) - - def innerCheckpoints(self): - result = [] - for inner in self.inners: - result += [inner.checkpoint()] - return result - - def mapPlan(self): - log.error("abstract method not implemented") - - def doStoreKeyedRows(self,subview,key,index): - """Utility method to compute keys and store key-value pairs. Usually - used as the main step in a mapPlan. """ - for row in subview.rowGenerator(): - keyStr = self.planner._serializer.toString(key(row)) - rrow = self.retainedPart(row) if self.retainedPart else row - valStr = self.planner._serializer.toString(rrow) - if index<0: - print "%s\t%s" % (keyStr,valStr) - else: - print "%s\t%d\t%s" % (keyStr,index,valStr) - -############################################################################## -# -# concrete View classes -# -############################################################################## - -class ReadLines(Reader): - """ Returns the lines in a file, as python strings.""" - - def __init__(self,src): - Reader.__init__(self,src) - - def unstoredRowGenerator(self): - for line in sys.stdin: - yield line - - def __str__(self): - return 'ReadLines("%s")' % self.src + self.showExtras() - -class ReadCSV(Reader): - """ Returns the lines in a CSV file, converted to Python tuples.""" - - def __init__(self,src,**kw): - Reader.__init__(self,src) - self.kw = kw - - def unstoredRowGenerator(self): - for tup in csv.reader(sys.stdin,**self.kw): - yield tup - - def __str__(self): - return 'ReadCVS("%s",%s)' % (self.src,str(self.kw)) + self.showExtras() - - -class ReplaceEach(Transformation): - """ In 'by=f'' f is a python function that takes a row and produces - its replacement.""" - - def __init__(self,inner=None,by=lambda x:x): - Transformation.__init__(self,inner) - self.replaceBy = by - - def unstoredRowGenerator(self): - for row in self.inner.rowGenerator(): - yield self.replaceBy(row) - - def unstoredExplanation(self): - return self.inner.explanation() + [ 'replaced to %s' % self.tag ] - - def __str__(self): - return 'ReplaceEach(%s, by=%s)' % (View.asTag(self.inner),str(self.replaceBy)) + self.showExtras() - -class Augment(Transformation): - - def __init__(self,inner=None,sideviews=None,sideview=None,loadedBy=lambda v:list(GPig.rowsOf(v))): - Transformation.__init__(self,inner) - assert not (sideviews and sideview), 'cannot specify both "sideview" and "sideviews"' - self.sideviews = list(sideviews) if sideviews 
else [sideview] - self.loader = loadedBy - assert self.loader,'must specify a "loadedBy" function for Augment' - - def nonInnerPrereqViews(self): - return self.inner.nonInnerPrereqViews() + self.sideviews - - def unstoredRowGenerator(self): - augend = self.loader(*self.sideviews) - for row in self.inner.rowGenerator(): - yield (row,augend) - - def unstoredCheckpointPlan(self): - plan = Plan() - for sv in self.sideviews: - plan = plan.append(sv.storagePlan()) - plan = plan.extend(Step(sv, 'DISTRIBUTE')) - plan.append(self.inner.checkpointPlan()) - return plan - - def unstoredExplanation(self): - return self.inner.explanation() + [ 'augmented to %s' % self.tag ] - - def __str__(self): - sideviewTags = loaderTag = '*UNSPECIFIED*' - if self.sideviews!=None: - sideviewTags = ",".join(map(View.asTag,self.sideviews)) - if self.loader!=None: - loaderTag = str(self.loader) - return 'Augment(%s,sideviews=%s,loadedBy=s%s)' % (View.asTag(self.inner),sideviewTags,loaderTag) + self.showExtras() - - -class Format(ReplaceEach): - """ Like ReplaceEach, but output should be a string, and it will be be - stored as strings, ie without using the serializer.""" - - def __init__(self,inner=None,by=lambda x:str(x)): - ReplaceEach.__init__(self,inner,by) - - def doStoreRows(self): - for row in self.rowGenerator(): - print row - - def __str__(self): - return 'Format(%s, by=%s)' % (View.asTag(self.inner),str(self.replaceBy)) + self.showExtras() - -class Flatten(Transformation): - """ Example: - def idwordGen(row): - for w in row['words']: yield (row['id'],w) - x = gp.Flatten(y, by=idwordGen(row)) - - In 'by=f', f is a python function that takes a row and yields - multiple new rows. """ - - def __init__(self,inner=None,by=None): - Transformation.__init__(self,inner) - self.flattenBy = by - - def unstoredRowGenerator(self): - for row in self.inner.rowGenerator(): - for flatrow in self.flattenBy(row): - yield flatrow - - def unstoredExplanation(self): - return self.inner.explanation() + [ 'flatten to %s' % self.tag ] - - def __str__(self): - return 'Flatten(%s, by=%s)' % (View.asTag(self.inner),str(self.flattenBy)) + self.showExtras() - -class Filter(Transformation): - """Filter out a subset of rows that match some predicate.""" - - def __init__(self,inner=None,by=lambda x:x): - Transformation.__init__(self,inner) - self.filterBy = by - - def unstoredRowGenerator(self): - for row in self.inner.rowGenerator(): - if self.filterBy(row): - yield row - - def unstoredExplanation(self): - return self.inner.explanation() + [ 'filtered to %s' % self.tag ] - - def __str__(self): - return 'Filter(%s, by=%s)' % (View.asTag(self.inner),str(self.filterBy)) + self.showExtras() - -class Distinct(MapReduce): - """Remove duplicate rows.""" - - def __init__(self,inner=None,retaining=None): - MapReduce.__init__(self,[inner],retaining) - self.inner = inner - - def acceptInnerView(self,otherView): - assert not self.inner,'An inner view is defined for '+self.tag+' so you cannot use it as RHS of a pipe' - self.inner = otherView - self.inners = [self.inner] - - def mapPlan(self): - step = Step(self, 'doDistinctMap', self.inner.checkpoint(), self.checkpoint(), prereduce=True, - why=self.explanation(), - existingViews=set(self.planner.viewsPlannedToExist)) #shallow of current state - return Plan().extend(step) - - def doDistinctMap(self): - # called by groupMapAndSortStep - self.inner.doStoreRows() - - def unstoredRowGenerator(self): - """Extract distinct elements from a sorted list.""" - lastval = None - for line in sys.stdin: - valStr = 
line.strip() - val = self.planner._serializer.fromString(valStr) - if val != lastval and lastval: - yield lastval - lastval = val - if lastval: - yield lastval - - def unstoredExplanation(self): - return self.inner.explanation() + [ 'make distinct to %s' % self.tag] - - def __str__(self): - return 'Distinct(%s)' % (View.asTag(self.inner)) + self.showExtras() - -class Group(MapReduce): - """Group by some property of a row, defined with the 'by' option. - Default outputs are tuples (x,[r1,...,rk]) where the ri's are rows - that have property values of x.""" - - def __init__(self,inner=None,by=lambda x:x,reducingTo=ReduceToList(),retaining=None): - MapReduce.__init__(self,[inner],retaining) - self.inner = inner - self.groupBy = by - self.reducingTo = reducingTo - - def acceptInnerView(self,otherView): - assert not self.inner,'An inner view is defined for '+self.tag+' so you cannot use it as RHS of a pipe' - self.inner = otherView - self.inners = [self.inner] - - def mapPlan(self): - step = Step(self, 'doGroupMap',self.inner.checkpoint(),self.checkpoint(),prereduce=True, - why=self.explanation(), - existingViews=set(self.planner.viewsPlannedToExist)) #shallow copy of current state - return Plan().extend(step) - - def doGroupMap(self): - # called by groupMapAndSortStep - self.doStoreKeyedRows(self.inner,self.groupBy,-1) - - def unstoredRowGenerator(self): - """Group objects from stdin by key, yielding tuples (key,[g1,..,gn]).""" - lastkey = key = None - accum = self.reducingTo.baseType() - for line in sys.stdin: - keyStr,valStr = line.strip().split("\t") - key = self.planner._serializer.fromString(keyStr) - val = self.planner._serializer.fromString(valStr) - if key != lastkey and lastkey!=None: - yield (lastkey,accum) - accum = self.reducingTo.baseType() - accum = self.reducingTo.reduceBy(accum, val) - lastkey = key - if key: - yield (key,accum) - - def unstoredExplanation(self): - return self.inner.explanation() + ['group to %s' % self.tag] - - def __str__(self): - return 'Group(%s,by=%s,reducingTo=%s)' % (View.asTag(self.inner),str(self.groupBy),str(self.reducingTo)) + self.showExtras() - -class Join(MapReduce): - """Outputs tuples of the form (row1,row2,...rowk) where - rowi is from the i-th join input, and the rowi's have the same - value of the property being joined on.""" - - def __init__(self,*joinInputs): - #sets self.inners - MapReduce.__init__(self,map(lambda x:x.view, joinInputs),None) - self.joinInputs = joinInputs - #re-interpret the 'outer' join parameters - semantically - #if jin[i] is outer, then all other inputs must be marked as _padWithNulls - if any(map(lambda jin:jin.outer, self.joinInputs)): - assert len(self.joinInputs)==2,'outer joins are only supported on two-way joins '+str(self.joinInputs) - for i in range(len(self.joinInputs)): - if self.joinInputs[i].outer: - j = 1-i #the other index - self.joinInputs[j]._padWithNulls = True - - def acceptInnerView(self,otherView): - assert self.unpairedJoinBy, 'join cannot be RHS of a pipe' - #assert self.unpairedJoinBy, 'join can only be RHS of a pipe if it contains a "by" argument not inside a "Jin" join-input' - #self.joinInputs = joinInputs + [Jin(otherView,by=unpairedJoinBy)] - #self.inners = map(lambda x:x.view, self.joinInputs) - - def mapPlan(self): - previousCheckpoints = self.innerCheckpoints() - midfile = self.planner.opts['viewdir'] + '/' + self.tag+'.gpmo' - step = Step(self, 'doJoinMap', src=previousCheckpoints, dst=self.checkpoint(), prereduce=True, hasIndex=True, mid=midfile, - 
existingViews=set(self.planner.viewsPlannedToExist), #shallow copy of current state - why=self.explanation()) - return Plan().extend(step) - - def doJoinMap(self,i): - # called by joinMapPlan with argument index, and stdin pointing to previousCheckpoints[index] - self.doStoreKeyedRows(self.joinInputs[i].view,self.joinInputs[i].joinBy,i) - - def unstoredRowGenerator(self): - """Group objects from stdin by key, yielding tuples (row1,row2,...).""" - lastkey = None - lastIndex = len(self.joinInputs)-1 - somethingProducedForLastKey = False - #accumulate a list of lists of all non-final inputs - accumList = [ [] for i in range(lastIndex) ] - for line in sys.stdin: - keyStr,indexStr,valStr = line.strip().split("\t") - key = self.planner._serializer.fromString(keyStr) - index = int(indexStr) - val = self.planner._serializer.fromString(valStr) - if key != lastkey and lastkey!=None: - #if the final join is marked as _padWithNulls, clear - #the accumulators, since we're doing an outer join - #with the last view - if self.joinInputs[lastIndex]._padWithNulls and not somethingProducedForLastKey: - for tup in self._joinAccumulatedValuesTo(accumList,lastIndex,None): - yield tup - #reset the accumulators, since they pertain to the - accumList = [ [] for i in range(lastIndex) ] - somethingProducedForLastKey = False - if index!=lastIndex: - #accumulate values to use in the join - accumList[index] = accumList[index] + [val] - else: - #produce tuples that match the key for the last view - for tup in self._joinAccumulatedValuesTo(accumList,lastIndex,val): - somethingProducedForLastKey = True - yield tup - lastkey = key - - def _joinAccumulatedValuesTo(self,accumList,lastIndex,finalVal): - # _padWithNulls as needed - for i in range(lastIndex): - if self.joinInputs[i]._padWithNulls and not accumList[i]: - accumList[i] = [None] - tupbuf = [ None for i in range(lastIndex+1) ] #holds output - tupbuf[lastIndex] = finalVal - for i in range(lastIndex): - for a in accumList[i]: - tupbuf[i] = a - if i==lastIndex-1 and any(tupbuf): - yield tuple(tupbuf) - - def unstoredExplanation(self): - innerEx = [] - for inner in self.inners: - if innerEx: innerEx += ['THEN'] - innerEx += inner.explanation() - return innerEx + [ 'FINALLY join to %s' % self.tag ] - - def __str__(self): - return "Join(%s)" % ",".join(map(str,self.joinInputs)) + self.showExtras() - -class JoinTo(Join): - """Special case of Join which can be used as the RHS of a pipe operator.""" - - def __init__(self,joinInput,by=None): - Join.__init__(self,Jin(None,by),joinInput) - - def acceptInnerView(self,otherView): - self.joinInputs[0].view = otherView - self.inners[0] = otherView - -############################################################################## -# -# the top-level planner, and its supporting classes -# -############################################################################## - -class Plan(object): - """A plan constructed by a GuineaPig.""" - - def __init__(self): self.steps = [] - - def extend(self,step): - self.steps += [step] - return self - - def append(self,subPlan): - self.steps += subPlan.steps - return self - - def execute(self,gp,echo=False): - script = self.compile(gp) - for shellcom in script: - if echo: print 'calling:',shellcom - subprocess.check_call(shellcom,shell=True) - - def compile(self,gp): - """Return a list of strings that can be run as shell commands.""" - script = [] - i = 0 - while (i such that - items should be partitioned by key and sorted by index - - why is documentation/explanation. 
""" - - self.view = view - self.whatToDo = whatToDo - self.existingViews = existingViews - self.src = src - self.dst = dst - self.prereduce = prereduce - self.hasIndex = hasIndex - self.mid = mid - self.why = why - - def __str__(self): - return repr(self) - - def __repr__(self): - return "Step('%s','%s',src=%s,dst='%s',prereduce=%s,mid=%s,why=%s,existingViews=%s)" \ - % (self.view.tag,self.whatToDo,repr(self.src), - self.dst,repr(self.prereduce),repr(self.mid),repr(self.explain()),repr(self.existingViews)) - - def explain(self): - """Convert an explanation - which is a list of strings - into a string""" - return "...".join(self.why) - - # actual code generation for the steps - - class HadoopCommand(object): - def __init__(self,gp,view): - self.invocation = [GPig.HADOOP_LOC,'jar',gp.opts['streamJar']] - self.defs = [] - self.args = [] - self.files = [] - for f in gp._shippedFiles: - self.files += ['-file',f] - for v in view.nonInnerPrereqViews(): - self.files += ['-file',v.distributableFile()] - def append(self,*toks): - self.args += list(toks) - def appendDef(self,*toks): - self.defs += list(toks) - def asEcho(self): - return " ".join(['echo','hadoop'] + self.args + ['...']) - def asString(self): - return " ".join(self.invocation+self.defs+self.files+self.args) - - def subplanHeader(self,reduceStep=None): - """Generate an explanatory header for a step.""" - if not reduceStep: return ['#', 'echo map '+self.view.tag + ': '+self.explain()] - else: return ['#', 'echo map/reduce '+self.view.tag+ '/'+ reduceStep.view.tag + ': '+reduceStep.explain()] - - - def coreCommand(self,gp): - """Python command to call an individual plan step.""" - return 'python %s --view=%s --do=%s' % (gp._gpigSourceFile,self.view.tag,self.whatToDo) + self.coreCommandOptions(gp) - - def ithCoreCommand(self,gp,i): - """Like coreCommand but allows index parameter to 'do' option""" - return 'python %s --view=%s --do=%s.%d' % (gp._gpigSourceFile,self.view.tag,self.whatToDo,i) + self.coreCommandOptions(gp) - - def coreCommandOptions(self,gp): - paramOpts = '' if not gp.param else " --params " + ",".join(map(lambda(k,v):k+':'+v, gp.param.items())) - alreadyStoredOpts = '' if not self.existingViews else " --alreadyStored "+",".join(self.existingViews) - nonDefaults = [] - for (k,v) in gp.opts.items(): - #pass in non-default options, or options computed from the environment - if (gp.opts[k] != GPig.DEFAULT_OPTS[k]) or ((k in GPig.COMPUTED_OPTION_DEFAULTS) and (gp.opts[k] != GPig.COMPUTED_OPTION_DEFAULTS[k])): - nonDefaults += ["%s:%s" % (k,str(v))] - optsOpts = '' if not nonDefaults else " --opts " + ",".join(nonDefaults) - return paramOpts + optsOpts + alreadyStoredOpts - - def hadoopClean(self,gp,fileName): - """A command to remove a hdfs directory if it exists.""" - return '(%s fs -test -e %s && %s fs -rmr %s) || echo no need to remove %s' % (GPig.HADOOP_LOC,fileName, GPig.HADOOP_LOC,fileName, fileName) - - def distribute(self,gp): - """Make a view availablefor use as a side view.""" - localCopy = self.view.distributableFile() - maybeRemoteCopy = self.view.storedFile() - echo = 'echo making a local copy of %s in %s' % (maybeRemoteCopy,localCopy) - if gp.opts['target']=='hadoop': - return [echo, 'rm -f %s' % localCopy, '%s fs -getmerge %s %s' % (GPig.HADOOP_LOC,maybeRemoteCopy, localCopy)] - else: - return [echo, 'cp -f %s %s || echo warning: the copy failed!' 
% (maybeRemoteCopy,localCopy)] - - def mapOnlySubscript(self,gp): - """A subplan for a mapper-only step.""" - if gp.opts['target']=='shell': - command = None - if self.src: command = self.coreCommand(gp) + ' < %s > %s' % (self.src,self.dst) - else: command = self.coreCommand(gp) + (' > %s' % (self.dst)) - return self.subplanHeader() + [command] - elif gp.opts['target']=='hadoop': - assert self.src,'Wrap not supported for hadoop' - hcom = self.HadoopCommand(gp,self.view) - hcom.appendDef('-D','mapred.reduce.tasks=0') - hcom.append('-input',self.src,'-output',self.dst) - hcom.append("-mapper '%s'" % self.coreCommand(gp)) - return self.subplanHeader() + [ hcom.asEcho(), self.hadoopClean(gp,self.dst), hcom.asString() ] - else: - assert False - - def mapReduceSubscript(self,reduceStep,gp): - """A subplan for a map-reduce step followed by a reduce, where the map has one input.""" - if gp.opts['target']=='shell': - command = self.coreCommand(gp) + (' < %s' % self.src) + ' | sort -k1 | '+reduceStep.coreCommand(gp) + (' > %s' % reduceStep.dst) - return self.subplanHeader(reduceStep) + [command] - elif gp.opts['target']=='hadoop': - hcom = self.HadoopCommand(gp,self.view) - hcom.appendDef('-D','mapred.reduce.tasks=%d' % gp.opts['parallel']) - hcom.append('-input',self.src, '-output',reduceStep.dst) - hcom.append("-mapper '%s'" % self.coreCommand(gp)) - hcom.append("-reducer '%s'" % reduceStep.coreCommand(gp)) - return self.subplanHeader(reduceStep) + [ hcom.asEcho(), self.hadoopClean(gp,reduceStep.dst), hcom.asString() ] - else: - assert False - - def multiMapReduceSubscript(self,reduceStep,gp): - """A subplan for a map-reduce step followed by a reduce, where the map has many inputs.""" - if gp.opts['target']=='shell': - subplan = ['rm -f %s' % self.mid] - for i in range(len(self.src)): - subplan += [ self.ithCoreCommand(gp,i) + ' < %s >> %s' % (self.src[i],self.mid) ] - sortOpts = '-k1,2' if self.hasIndex else '-k1' - subplan += [ 'sort ' + sortOpts + ' < ' + self.mid + ' | ' + reduceStep.coreCommand(gp) + (' > %s' % reduceStep.dst)] - return self.subplanHeader(reduceStep) + subplan - elif gp.opts['target']=='hadoop': - def midi(i): return self.mid + '-' + str(i) - subplan = [] - for i in range(len(self.src)): - hcom = self.HadoopCommand(gp,self.view) - hcom.appendDef('-D','mapred.reduce.tasks=%d' % gp.opts['parallel']) - hcom.append('-input',self.src[i], '-output',midi(i)) - hcom.append("-mapper","'%s'" % self.ithCoreCommand(gp,i)) - subplan += [ self.hadoopClean(gp,midi(i)), hcom.asEcho(), hcom.asString() ] - hcombineCom = self.HadoopCommand(gp,self.view) - hcombineCom.appendDef('-D','mapred.reduce.tasks=%d' % gp.opts['parallel']) - if (self.hasIndex): - hcombineCom.appendDef('-jobconf','stream.num.map.output.key.fields=3') - hcombineCom.appendDef('-jobconf','num.key.fields.for.partition=1') - for i in range(len(self.src)): - hcombineCom.append('-input',midi(i)) - hcombineCom.append('-output',reduceStep.dst) - hcombineCom.append('-mapper','cat') - hcombineCom.append('-reducer',"'%s'" % reduceStep.coreCommand(gp)) - if (self.hasIndex): - hcombineCom.append('-partitioner','org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner') - subplan += [ self.hadoopClean(gp,reduceStep.dst), hcombineCom.asEcho(), hcombineCom.asString() ] - return self.subplanHeader(reduceStep) + subplan - else: - assert False - -class RowSerializer(object): - """Saves row objects to disk and retrieves them.""" - def __init__(self,target): - self._target = target - self._reprInverse = None - def toString(self,x): - return 
repr(x) - def fromString(self,s): - if self._reprInverse: return self._reprInverse(s) - else: return eval(s) - -# -# the planner -# - -class Planner(object): - """Can create storage plans for views that are defined as parts of it.""" - - def __init__(self,**kw): - - #parameters are used for programmatically give user-defined - #config information to a planner, or they can be specified in - #the command-line - self.param = kw - for (key,val) in GPig.getArgvParams().items(): - # don't override non-null values specified in the constructor - if self.param.get(key)==None: - self.param[key] = val - - #opts are used for giving options to the planner from the shell - self.opts = GPig.getArgvOpts() - for (key,val) in GPig.DEFAULT_OPTS.items(): - if (not key in self.opts): self.opts[key] = val - for (key,type) in GPig.DEFAULT_OPT_TYPES.items(): - self.opts[key] = type(self.opts[key]) - #the viewsPlannedToExist is set using the "--reuse" option at - #planning time, and incrementally added to as the plan is with - #commands that actually store a view. - self.viewsPlannedToExist = set() - self._serializer = RowSerializer(self.opts['target']) - - #views that aren't associated with class variable, but are - #instead named automatically - ie, inner views with no - #user-provided names. - self._autoNamedViews = {} - - #by default, use info-level logging at planning time - if not Planner.partOfPlan(sys.argv): - logging.basicConfig(level=logging.INFO) - - #hadoop needs to know where to give the main script file, - #as well as the guineapig.py file it uses - self._gpigSourceFile = sys.argv[0] - self._shippedFiles = ['guineapig.py',self._gpigSourceFile] - - def setup(self): - """Initialize planner, and views used by the planner. This has to be - done after the planner is fully configured by adding views.""" - - # make sure view directory is valid - if self.opts['target']=='shell' and not os.path.exists(self.opts['viewdir']): - logging.info('creating view directory ' + self.opts['viewdir']) - os.makedirs(self.opts['viewdir']) - elif self.opts['target']=='hadoop': - p = urlparse.urlparse(self.opts['viewdir']) - if not p.path.startswith("/"): - logging.warn('hadoop viewdir should be absolite path: will try prefixing /user/$LOGNAME') - username = os.environ.get('LOGNAME','me') - self.opts['viewdir'] = '/user/'+username+'/'+self.opts['viewdir'] - logging.warn('viewdir is set to '+self.opts['viewdir']) - - # Add 'tag' and planner fields to each view - for vname in self.listViewNames(): - v = self.getView(vname) - v.tag = vname - v.planner = self - def tagUnnamedViews(v,basename,index,depth): - assert v,'null inner view for '+basename - if not v.planner: - v.planner = self - autoname = '%s_%d_%s' % (basename,depth,index) - self._setView(autoname,v) - for i,inner in enumerate(v.innerViews() + v.nonInnerPrereqViews()): - tagUnnamedViews(inner,vname,i,depth+1) - for vname in self.listViewNames(): - v = self.getView(vname) - for i,inner in enumerate(v.innerViews() + v.nonInnerPrereqViews()): - tagUnnamedViews(inner,vname,i,1) - - # Add caching options as needed - - # a mapreduce step can't use a reduce-input as a checkpoint - # so introduce caching as needed - for vname in self.listViewNames(): - v = self.getView(vname) - if isinstance(v,MapReduce): - for inner in v.inners: - innerCheckpoint = inner.checkpoint() - if innerCheckpoint and MapReduce.isReduceInputFile(innerCheckpoint): - if not inner.storeMe: - logging.info('making %s stored, to make possible a downstream map-reduce view' % inner.tag) - inner.storeMe = True 
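As a sketch of the situation this guards against (the view and file names here are hypothetical), chaining one map-reduce view directly into another would leave the inner view checkpointed only by its .gpri reduce input, so setup() marks it stored first:

import sys
from guineapig import *

class TwoPass(Planner):
    data = ReadLines('events.txt')
    # byUser's natural checkpoint is a .gpri reduce-input file, so the
    # loop above sets byUser.storeMe = True before sizes consumes it
    byUser = Group(data, by=lambda line: line.split()[0])
    sizes = Group(byUser, by=lambda (user, rows): len(rows),
                  reducingTo=ReduceToCount())

if __name__ == "__main__":
    TwoPass().main(sys.argv)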
-
-        # you can't combine computation of an Augment with its inner
-        # view, because then the inner view would also need access to
-        # the Augment's side views, which isn't guaranteed
-        for vname in self.listViewNames():
-            v = self.getView(vname)
-            if isinstance(v,Augment):
-                if not v.inner.storeMe:
-                    logging.info('making %s stored, to make possible a downstream augment view' % v.inner.tag)
-                    v.inner.storeMe = True
-
-        #cache anything used more than twice
-        numberOfTimesUsed = collections.defaultdict(int)
-        for vname in self.listViewNames():
-            v = self.getView(vname)
-            for inner in v.innerViews() + v.nonInnerPrereqViews():
-                numberOfTimesUsed[inner] += 1
-        for (v,n) in numberOfTimesUsed.items():
-            if n>1 and v.storeMe==None:
-                logging.info('making %s stored because it might be used %d times' % (v.tag,n))
-                v.storeMe = True
-
-        #mark non-inner prereq views as storeMe = 'distributedCache'
-        for vname in self.listViewNames():
-            v = self.getView(vname)
-            for inner in v.nonInnerPrereqViews():
-                inner.storeMe = 'distributedCache'
-
-    #
-    # rest of the API for the planner
-    #
-
-    @staticmethod
-    def partOfPlan(argv):
-        """True if the command line was generated as part of a storage plan."""
-        return any(s.startswith("--do") for s in argv)
-
-    def main(self,argv):
-        """Run a main that lets you --store a view, as well as doing a few other things."""
-        self.setup()
-        self.runMain(argv)
-
-    def runMain(self,argv):
-
-        # parse the options and dispatch appropriately
-        argspec = ["list", "pprint=", "plan=", "metaplan=",
-                   "store=", "cat=",
-                   "reuse", "alreadyStored=",
-                   "params=", "opts=", "do=", "view="]
-        optlist,args = getopt.getopt(argv[1:], 'x', argspec)
-        optdict = dict(optlist)
-
-        # decide what views can be re-used, vs which need fresh plans
-        if '--reuse' in optdict: #reuse the views listed in the arguments
-            for a in args:
-                vname = View.viewNameFor(a)
-                v = self.getView(vname)
-                if v:
-                    self.viewsPlannedToExist.add(v.tag)
-                    logging.info("re-using data stored for view "+vname+": "+str(v))
-                else:
-                    logging.warn("cannot re-use view "+vname+" since it's not used in this script")
-
-        #choose the main action to take
-        if '--plan' in optdict: #print a storage plan
-            rel = self.getView(optdict['--plan'],mustExist=True)
-            plan = rel.storagePlan()
-            print "\n".join(plan.compile(self))
-            return
-        elif '--pprint' in optdict: #print a view
-            rel = self.getView(optdict['--pprint'],mustExist=True)
-            rel.pprint()
-            return
-        elif '--metaplan' in optdict:
-            rel = self.getView(optdict['--metaplan'],mustExist=True)
-            print rel.metaplan(self.viewsPlannedToExist)
-            return
-        elif '--store' in optdict: #store a view
-            rel = self.getView(optdict['--store'],mustExist=True)
-            plan = rel.storagePlan()
-            plan.execute(self, echo=self.opts['echo'])
-            return
-        elif '--cat' in optdict: #store and then print a view
-            assert self.opts['target']=='shell','cannot do --cat except in shell mode'
-            rel = self.getView(optdict['--cat'],mustExist=True)
-            plan = rel.storagePlan()
-            plan.execute(self, self.opts['echo'])
-            for line in open(rel.storedFile(),'r'):
-                print line,
-            return
-        elif '--list' in optdict: #list named views
-            for vname in self.listViewNames():
-                print '  ',vname,'\t',self.getView(vname)
-            return
-        elif '--do' in optdict: #run an internally-generated action
-            #recover what should be stored when this action is performed
-            self.alreadyStored = set(optdict.get('--alreadyStored','').split(','))
-            #work out what view to use and what routine to call
-            rel = self.getView(optdict['--view'],mustExist=True)
-            whatToDo = optdict['--do']
-            #work out the method given by 'do' and call it - note it
-            #may have a single integer argument, eg doJoinMap.1
-            k = whatToDo.find(".")
-            if k<0:
-                whatToDoMethod = getattr(rel,whatToDo)
-                whatToDoMethod()
-            else:
-                arg = int(whatToDo[k+1:])
-                whatToDo = whatToDo[:k]
-                whatToDoMethod = getattr(rel,whatToDo)
-                whatToDoMethod(arg)
-            return
-        else:
-            print 'usage: --[store|pprint|plan|cat] view [--opts key:val,...] [--params key:val,...] [--reuse view1 view2 ...]'
-            print '       --[list]'
-            print 'current legal keys for "opts", with default values:'
-            for (key,val) in GPig.DEFAULT_OPTS.items():
-                print '  %s:%s' % (key,str(val))
-            print 'There\'s more help at http://curtis.ml.cmu.edu/w/courses/index.php/Guinea_Pig'
-
-    def getView(self,str,mustExist=False):
-        """Find the defined relation named str, and if necessary bind its
-        planner and tag appropriately."""
-        v = self.__class__.__dict__.get(str) or self.__dict__.get(str) or self._autoNamedViews.get(str)
-        if mustExist: assert v,'cannot find a view named '+str
-        return v
-
-    def _setView(self,str,view):
-        """Internal use only: allow the view to be retrieved by name later."""
-        view.tag = str
-        self._autoNamedViews[str] = view
-
-    def listViewNames(self):
-        def namedViews(d): return [vname for vname in d.keys() if isinstance(self.getView(vname),View)]
-        userNamedViews = namedViews(self.__class__.__dict__) + namedViews(self.__dict__)
-        return userNamedViews + self._autoNamedViews.keys()
-
-    #
-    # dealing with the file storage system and related stuff
-    #
-
-    def ship(self,*fileNames):
-        """Declare a set of inputs to be 'shipped' to the hadoop cluster."""
-        self._shippedFiles += fileNames
-
-    def setSerializer(self,serializer):
-        """Replace the default serializer with another RowSerializer object."""
-        self._serializer = serializer
-        return self
-
-    def setReprInverseFun(self,reprInverse):
-        """Specify a function which will deserialize a string that was produced
-        by Python's 'repr' function."""
-        self._serializer._reprInverse = reprInverse
-        return self
-
-if __name__ == "__main__":
-    print 'There\'s help at http://curtis.ml.cmu.edu/w/courses/index.php/Guinea_Pig'
diff --git a/tutorial/guineapig1_2.py b/tutorial/guineapig1_2.py
deleted file mode 100644
index e4fb776..0000000
--- a/tutorial/guineapig1_2.py
+++ /dev/null
@@ -1,1284 +0,0 @@
-##############################################################################
-# (C) Copyright 2014 William W. Cohen. All rights reserved.
-##############################################################################
-
-import sys
-import logging
-import copy
-import subprocess
-import collections
-import os
-import os.path
-import urlparse
-import getopt
-import csv
-
-###############################################################################
-# helper functions and data structures
-###############################################################################
-
-class GPig(object):
-    """Collection of utilities for Guinea Pig."""
-
-    HADOOP_LOC = 'hadoop' #assume hadoop is on the path at planning time
-    MY_LOC = 'guineapig1_2.py'
-
-    #global options for Guinea Pig can be passed in with the --opts
-    #command-line option, and these are the default values
-    defaultJar = '/usr/lib/hadoop/contrib/streaming/hadoop-streaming-1.2.0.1.3.0.0-107.jar'
-    envjar = os.environ.get('GP_STREAMJAR', defaultJar)
-    DEFAULT_OPTS = {'streamJar': envjar,
-                    'parallel':5,
-                    'target':'shell',
-                    'echo':0,
-                    'viewdir':'gpig_views',
-                    }
-    #these are the types of each option that has a non-string value
-    DEFAULT_OPT_TYPES = {'parallel':int,'echo':int}
-    #we need to pass non-default options in to mappers and reducers,
-    #but since the remote worker's environment can be different, we
-    #also need to pass in options computed from the environment
-    COMPUTED_OPTION_DEFAULTS = {'streamJar':defaultJar}
-
-    @staticmethod
-    def getArgvParams():
-        """Return a dictionary holding the argument of the --params option in
-        sys.argv."""
-        return GPig.getArgvDict('--params')
-
-    @staticmethod
-    def getArgvOpts():
-        """Return a dictionary holding the argument of the --opts option in
-        sys.argv."""
-        return GPig.getArgvDict('--opts')
-
-    @staticmethod
-    def getArgvDict(optname):
-        """Return a dictionary of parameter values that were defined on the command line
-        via an option like '--params filename:foo.txt,basedir:/tmp/glob/'.
-        """
-        assert optname.startswith('--')
-        for i,a in enumerate(sys.argv):
-            if a==optname:
-                paramString = sys.argv[i+1]
-                return dict(pair.split(":") for pair in paramString.split(","))
-        return {}
-
-    @staticmethod
-    def rowsOf(view):
-        """Iterate over the rows in a view."""
-        for line in open(view.distributableFile()):
-            yield view.planner._serializer.fromString(line.strip())
-
-    @staticmethod
-    def onlyRowOf(view):
-        """Return the first row in a side view, and raise an error if it
-        is not the only row of the view."""
-        result = None
-        logging.info('loading '+view.distributableFile())
-        for line in open(view.distributableFile()):
-            assert not result,'multiple rows in stored file for '+view.tag
-            result = view.planner._serializer.fromString(line.strip())
-        return result
-
-class Jin(object):
-    """Object to hold description of a single join input."""
-
-    def __init__(self,view,by=(lambda x:x),outer=False):
-        self.view = view
-        self.joinBy = by
-        self.outer = outer
-        self._padWithNulls = False
-
-    def __str__(self):
-        viewStr = View.asTag(self.view) if self.view else '_'
-        outerStr = ',outer=True' if self.outer else ''
-        padStr = ',_padWithNulls=True' if self._padWithNulls else ''
-        return "Jin(%s,by=%s%s%s)" % (viewStr,self.joinBy,outerStr,padStr)
-
-class ReduceTo(object):
-    """An object x that can be the argument of a reducingTo=x
-    parameter in a Group view."""
-    def __init__(self,baseType,by=lambda accum,val:accum+val):
-        self.baseType = baseType
-        self.reduceBy = by
-
-class ReduceToCount(ReduceTo):
-    """Produce the count of the number of objects that would be placed in a group."""
-    def __init__(self):
-        ReduceTo.__init__(self,int,by=lambda accum,val:accum+1)
-
-class ReduceToSum(ReduceTo):
-    """Produce the sum of the objects - which must be numbers - that would
-    be placed in a group."""
-    def __init__(self):
-        ReduceTo.__init__(self,int,by=lambda accum,val:accum+val)
-
-class ReduceToList(ReduceTo):
-    """Produce a list of the objects that would be placed in a group."""
-    def __init__(self):
-        ReduceTo.__init__(self,list,by=lambda accum,val:accum+[val])
-
-###############################################################################
-# abstract views
-##############################################################################
-
-class View(object):
-    """A definition of a relation for Guinea Pig. A View object can
-    produce a storagePlan(), which can then be executed to produce the
-    contents of the relation. Intuitively, a relation is an
-    unordered bag of rows, and a row is an almost-arbitrary python data
-    structure. (It must be something that can be stored and retrieved
-    by the RowSerializer.)
-
-    Steps in the storagePlan are executed by delegation, thru the
-    planner, to methods of a View class named doFoo.
-    """
-
-    def __init__(self):
-        """The planner and tag must be set before this is used."""
-        self.planner = None #pointer to planner object
-        self.tag = None #for naming storedFile and checkpoints
-        self.storeMe = None #try and store this view if true
-        self.retainedPart = None #used in map-reduce views only
-        self.sideviews = [] #non-empty for Augment views only
-        self.inners = [] #always used
-
-    #self.inner is shortcut for inners[0]
-    def _getInner(self): return self.inners[0]
-    def _setInner(self,val): self.inners = [val]
-    inner = property(_getInner,_setInner)
-
-    #
-    # ways to modify a view
-    #
-
-    def opts(self,stored=None):
-        """Return the same view with options set appropriately. Possible
-        options include:
-
-        - stored=True - Explicitly store this view on disk whenever
-        it is used in another view's definition. This might be set
-        by the user for debugging purposes, or by the planner,
-        to prevent incorrect optimizations. Generally "inner"
-        views are not explicitly stored.
-
-        - stored='distributedCache' - Store this view in the working
-        directory and/or the Hadoop distributed cache.
-        """
-
-        self.storeMe = stored
-        return self
-
-    def showExtras(self):
-        """Printable representation of the options for a view."""
-        result = ''
-        flagPairs = []
-        if self.storeMe: flagPairs += ['stored=%s' % repr(self.storeMe)]
-        if flagPairs: result += '.opts(' + ",".join(flagPairs) + ')'
-        return result
-
-    #
-    # how the view is saved on disk
-    #
-
-    def storedFile(self):
-        """The file that will hold the materialized relation."""
-        return self.planner.opts['viewdir'] + '/' + self.tag + '.gp'
-
-    def distributableFile(self):
-        """The file that will hold the materialized relation in the working directory
-        in preparation to be uploaded to the distributed cache."""
-        return self.tag + '.gp'
-
-    @staticmethod
-    def viewNameFor(fileName):
-        """The view associated with the given file name"""
-        vname = os.path.basename(fileName)
-        if vname.endswith(".gp"): vname = vname[0:-len(".gp")]
-        return vname
-
-    #
-    # semantics of the view
-    #
-
-    def checkpoint(self):
-        """A checkpoint is an intermediate computation for the view, which is
-        saved on disk. The rowGenerator() for the view will assume
-        that the checkpoint is available.
-        """
-        assert False, 'abstract method called'
-
-    def checkpointPlan(self):
-        """A plan to produce checkpoint()."""
-        assert False, 'abstract method called'
-
-    def rowGenerator(self):
-        """A generator for the rows in this relation, which assumes existence
-        of the checkpoint."""
-        assert False, 'abstract method called'
-
-    def explanation(self):
-        """Return an explanation of how rows are generated."""
-        assert False, 'abstract method called'
-
-    def storagePlan(self):
-        """A plan to store the view."""
-        return self.planner.buildRecursiveStoragePlan(self)
-
-    def nonrecursiveStoragePlan(self):
-        """Materialize the relation, assuming that there are no descendent
-        inner views that need to be materialized first."""
-        plan = self.checkpointPlan()
-        result = plan.extend(Step(self,'doStoreRows',self.checkpoint(),self.storedFile(),why=self.explanation()))
-        return result
-
-    def applyDict(self,mapping,innerviewsOnly=False):
-        """Given a mapping from view tags to views, replace every inner view with
-        the appropriate value from the mapping, and return the result."""
-        if self.tag in mapping and not innerviewsOnly:
-            return mapping[self.tag]
-        elif not self.inners:
-            return self
-        else:
-            result = copy.copy(self)
-            result.inners = map(lambda v:v.applyDict(mapping), self.inners)
-            return result
-
-    def sideviewsNeeded(self):
-        """Sideviews needed by this view."""
-        result = []
-        for sv in self.sideviews:
-            result += [sv]
-        for v in self.inners:
-            result += list(v._sideviewsOfDescendants())
-        return result
-
-    def _sideviewsOfDescendants(self):
-        if not self.storeMe:
-            for sv in self.sideviews:
-                yield sv
-            for v in self.inners:
-                for sv in v._sideviewsOfDescendants():
-                    yield sv
-
-    def enforceStorageConstraints(self):
-        """Subclass this, if there are constraints on when one must explicitly
-        store inner views."""
-        pass
-
-    def doStoreRows(self):
-        """Called by planner at execution time to store the rows of the view."""
-        for row in self.rowGenerator():
-            print self.planner._serializer.toString(row)
-
-    #
-    # support the "pipe" syntax: view1 | view2
-    #
-
-    def __or__(self,otherView):
-        """Overload the pipe operator x | y to return y, with x as its inner view."""
-        otherView.acceptInnerView(self)
-        return otherView
-
-    def acceptInnerView(self,otherView):
-        """Replace an appropriate input view with otherView. This is subclassed to
-        implement the pipe operator."""
-        assert not self.inner,'An inner view is defined for '+self.tag+' so you cannot use it as RHS of a pipe'
-        self.inner = otherView #subclass if needed
-
-    #
-    # printing views
-    #
-
-    def pprint(self,depth=0,alreadyPrinted=None,sideview=False):
-        """Print a readable representation of the view."""
-        if alreadyPrinted==None: alreadyPrinted = set()
-        tabStr = '| ' * depth
-        tagStr = str(self.tag)
-        sideviewIndicator = '*' if sideview else ''
-        if self.tag in alreadyPrinted:
-            print tabStr + sideviewIndicator + tagStr + ' = ' + '...'
-        else:
-            sideviewInfo = "  sideviews: {"+",".join(map(lambda x:x.tag, self.sideviews))+"}" if self.sideviews else ""
-            print tabStr + sideviewIndicator + tagStr + ' = ' + str(self) + sideviewInfo
-            alreadyPrinted.add(self.tag)
-            for inner in self.inners:
-                inner.pprint(depth+1,alreadyPrinted)
-            for inner in self.sideviews:
-                inner.pprint(depth+1,alreadyPrinted,sideview=True)
-
-    @staticmethod
-    def asTag(view):
-        """Helper for printing views."""
-        if not view: return '(null view)'
-        elif view.tag: return view.tag
-        else: return str(view)
-
-#
-# abstract view types
-#
-
-class Reader(View):
-    """Read data stored on the file system and make it look like a View."""
-
-    def __init__(self,src):
-        View.__init__(self)
-        self.src = src
-        self.inners = []
-
-    def checkpoint(self):
-        return self.src
-
-    def checkpointPlan(self):
-        return Plan() #empty plan
-
-    def explanation(self):
-        return [ 'read %s with %s' % (str(self.src),self.tag) ]
-
-    def acceptInnerView(self,otherView):
-        assert False, "Reader views cannot be used as RHS of a pipe"
-
-class Transformation(View):
-    """Streaming transformation on a single inner view."""
-
-    def __init__(self,inner=None):
-        View.__init__(self)
-        self.inner = inner
-
-    # A transformation will stream on-the-fly through the inner
-    # relation, and produce a new version, so the checkpoint and plan
-    # to produce it are delegated to the inner View.
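Since checkpoint() and checkpointPlan() are simply delegated, a chain of Transformations compiles into a single streaming pass: rows flow through one pipeline and nothing is written to disk between the stages. A small illustrative script (the view names and input file are hypothetical):

from guineapig import *
import sys

class Cleanup(Planner):
    # lower and short are fused into one streaming command over
    # input.txt; only a view that is actually stored writes a file
    lower = ReplaceEach(ReadLines('input.txt'), by=lambda line:line.lower())
    short = Filter(lower, by=lambda line:len(line)<80)

if __name__ == "__main__":
    Cleanup().main(sys.argv)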
-
-    def checkpoint(self):
-        return self.inner.checkpoint()
-
-    def checkpointPlan(self):
-        return self.inner.checkpointPlan()
-
-    def explanation(self):
-        return self.inner.explanation() + [ 'transform to %s' % self.tag ]
-
-class MapReduce(View):
-    """A view that takes an inner relation and processes it in a
-    map-reduce-like way."""
-
-    def __init__(self,inners,retaining):
-        View.__init__(self)
-        self.inners = inners
-        self.retainedPart = retaining
-
-    def _isReduceInputFile(self,fileName):
-        return fileName.endswith('.gpri')
-
-    def checkpoint(self):
-        ## the checkpoint is the reducer input file
-        return self.planner.opts['viewdir'] + '/' + self.tag + '.gpri'
-
-    def checkpointPlan(self):
-        plan = Plan()
-        for inner in self.inners:
-            plan = plan.append(inner.checkpointPlan())
-        return plan.append(self.mapPlan())
-
-    def enforceStorageConstraints(self):
-        for inner in self.inners:
-            innerChkpt = inner.checkpoint()
-            #optimizations break if you chain two map-reduces together
-            if innerChkpt and innerChkpt.endswith(".gpri"):
-                if not inner.storeMe:
-                    logging.info('making %s stored, to make possible a downstream map-reduce view' % inner.tag)
-                    inner.storeMe = True
-
-    def mapPlan(self):
-        logging.error("abstract method not implemented")
-
-    def doStoreKeyedRows(self,subview,key,index):
-        """Utility method used by concrete map-reduce classes to compute keys
-        and store key-value pairs. Usually used as the main step in a
-        mapPlan. """
-        for row in subview.rowGenerator():
-            keyStr = self.planner._serializer.toString(key(row))
-            rrow = self.retainedPart(row) if self.retainedPart else row
-            valStr = self.planner._serializer.toString(rrow)
-            if index<0:
-                print "%s\t%s" % (keyStr,valStr)
-            else:
-                print "%s\t%d\t%s" % (keyStr,index,valStr)
-
-##############################################################################
-#
-# concrete View classes
-#
-##############################################################################
-
-class ReuseView(Reader):
-    """Returns the objects in a previously stored view."""
-
-    def __init__(self,view):
-        if isinstance(view,View):
-            Reader.__init__(self,view.storedFile())
-            self.tag = "reuse_"+view.tag
-            self.reusedViewTag = view.tag
-            self.planner = view.planner
-        else:
-            assert False,'user-defined ReuseView not supported (yet)'
-
-    def rowGenerator(self):
-        for line in sys.stdin:
-            yield self.planner._serializer.fromString(line.strip())
-
-    def __str__(self):
-        return 'ReuseView("%s")' % self.src + self.showExtras()
-
-
-class ReadLines(Reader):
-    """ Returns the lines in a file, as python strings."""
-
-    def __init__(self,src):
-        Reader.__init__(self,src)
-
-    def rowGenerator(self):
-        for line in sys.stdin:
-            yield line
-
-    def __str__(self):
-        return 'ReadLines("%s")' % self.src + self.showExtras()
-
-class ReadCSV(Reader):
-    """ Returns the lines in a CSV file, converted to Python tuples."""
-
-    def __init__(self,src,**kw):
-        Reader.__init__(self,src)
-        self.kw = kw
-
-    def rowGenerator(self):
-        for tup in csv.reader(sys.stdin,**self.kw):
-            yield tup
-
-    def __str__(self):
-        return 'ReadCSV("%s",%s)' % (self.src,str(self.kw)) + self.showExtras()
-
-
-class ReplaceEach(Transformation):
-    """ In 'by=f', f is a python function that takes a row and produces
-    its replacement."""
-
-    def __init__(self,inner=None,by=lambda x:x):
-        Transformation.__init__(self,inner)
-        self.replaceBy = by
-
-    def rowGenerator(self):
-        for row in self.inner.rowGenerator():
-            yield self.replaceBy(row)
-
-    def explanation(self):
-        return self.inner.explanation() + [ 'replaced to %s' % self.tag ]
-
-    def __str__(self):
-        return 'ReplaceEach(%s, by=%s)' % (View.asTag(self.inner),str(self.replaceBy)) + self.showExtras()
-
-class Augment(Transformation):
-
-    def __init__(self,inner=None,sideviews=None,sideview=None,loadedBy=lambda v:list(GPig.rowsOf(v))):
-        Transformation.__init__(self,inner)
-        assert not (sideviews and sideview), 'cannot specify both "sideview" and "sideviews"'
-        self.sideviews = list(sideviews) if sideviews else [sideview]
-        self.loader = loadedBy
-        assert self.loader,'must specify a "loadedBy" function for Augment'
-
-    def enforceStorageConstraints(self):
-        for sv in self.sideviews:
-            sv.storeMe = 'distributedCache'
-
-    def rowGenerator(self):
-        augend = self.loader(*self.sideviews)
-        for row in self.inner.rowGenerator():
-            yield (row,augend)
-
-    def checkpointPlan(self):
-        plan = Plan()
-        plan.append(self.inner.checkpointPlan())
-        #the sideviews should have been stored by the top-level
-        #planner already, but they will need to be moved to a
-        #distributable location
-        for sv in self.sideviews:
-            plan.extend(Step(sv, 'DISTRIBUTE'))
-        return plan
-
-    def explanation(self):
-        return self.inner.explanation() + [ 'augmented to %s' % self.tag ]
-
-    def __str__(self):
-        sideviewTags = loaderTag = '*UNSPECIFIED*'
-        if self.sideviews!=None: sideviewTags = ",".join(map(View.asTag,self.sideviews))
-        if self.loader!=None: loaderTag = str(self.loader)
-        return 'Augment(%s,sideviews=%s,loadedBy=%s)' % (View.asTag(self.inner),sideviewTags,loaderTag) + self.showExtras()
-
-
-class Format(ReplaceEach):
-    """ Like ReplaceEach, but output should be a string, and it will be
-    stored as strings, ie without using the serializer."""
-
-    def __init__(self,inner=None,by=lambda x:str(x)):
-        ReplaceEach.__init__(self,inner,by)
-
-    def __str__(self):
-        return 'Format(%s, by=%s)' % (View.asTag(self.inner),str(self.replaceBy)) + self.showExtras()
-
-    def doStoreRows(self):
-        for row in self.rowGenerator():
-            print row
-
-class Flatten(Transformation):
-    """ Like ReplaceEach, but output of 'by' is an iterable, and all
-    results will be returned.
-    """
-
-    def __init__(self,inner=None,by=None):
-        Transformation.__init__(self,inner)
-        self.flattenBy = by
-
-    def rowGenerator(self):
-        for row in self.inner.rowGenerator():
-            for flatrow in self.flattenBy(row):
-                yield flatrow
-
-    def explanation(self):
-        return self.inner.explanation() + [ 'flatten to %s' % self.tag ]
-
-    def __str__(self):
-        return 'Flatten(%s, by=%s)' % (View.asTag(self.inner),str(self.flattenBy)) + self.showExtras()
-
-class Filter(Transformation):
-    """Keep only the subset of rows that match some predicate."""
-
-    def __init__(self,inner=None,by=lambda x:x):
-        Transformation.__init__(self,inner)
-        self.filterBy = by
-
-    def rowGenerator(self):
-        for row in self.inner.rowGenerator():
-            if self.filterBy(row):
-                yield row
-
-    def explanation(self):
-        return self.inner.explanation() + [ 'filtered to %s' % self.tag ]
-
-    def __str__(self):
-        return 'Filter(%s, by=%s)' % (View.asTag(self.inner),str(self.filterBy)) + self.showExtras()
-
-class Distinct(MapReduce):
-    """Remove duplicate rows."""
-
-    def __init__(self,inner=None,retaining=None):
-        MapReduce.__init__(self,[inner],retaining)
-
-    def mapPlan(self):
-        step = Step(self, 'doDistinctMap', self.inner.checkpoint(), self.checkpoint(), prereduce=True, why=self.explanation())
-        return Plan().extend(step)
-
-    def rowGenerator(self):
-        """Extract distinct elements from a sorted list."""
-        lastval = None
-        for line in sys.stdin:
-            valStr = line.strip()
-            val = self.planner._serializer.fromString(valStr)
-            if val != lastval and lastval:
-                yield lastval
-            lastval = val
-        if lastval:
-            yield lastval
-
-    def explanation(self):
-        return self.inner.explanation() + [ 'make distinct to %s' % self.tag]
-
-    def __str__(self):
-        return 'Distinct(%s)' % (View.asTag(self.inner)) + self.showExtras()
-
-    def doDistinctMap(self):
-        self.inner.doStoreRows()
-
-
-class Group(MapReduce):
-    """Group by some property of a row, defined with the 'by' option.
-    Default outputs are tuples (x,[r1,...,rk]) where the ri's are rows
-    that have 'by' values of x."""
-
-    def __init__(self,inner=None,by=lambda x:x,reducingTo=ReduceToList(),retaining=None):
-        MapReduce.__init__(self,[inner],retaining)
-        self.groupBy = by
-        self.reducingTo = reducingTo
-
-    def mapPlan(self):
-        step = Step(self, 'doGroupMap',self.inner.checkpoint(),self.checkpoint(),prereduce=True,why=self.explanation())
-        return Plan().extend(step)
-
-    def rowGenerator(self):
-        """Group objects from stdin by key, yielding tuples (key,[g1,..,gn])."""
-        lastkey = key = None
-        accum = self.reducingTo.baseType()
-        for line in sys.stdin:
-            keyStr,valStr = line.strip().split("\t")
-            key = self.planner._serializer.fromString(keyStr)
-            val = self.planner._serializer.fromString(valStr)
-            if key != lastkey and lastkey!=None:
-                yield (lastkey,accum)
-                accum = self.reducingTo.baseType()
-            accum = self.reducingTo.reduceBy(accum, val)
-            lastkey = key
-        if key:
-            yield (key,accum)
-
-    def explanation(self):
-        return self.inner.explanation() + ['group to %s' % self.tag]
-
-    def __str__(self):
-        return 'Group(%s,by=%s,reducingTo=%s)' % (View.asTag(self.inner),str(self.groupBy),str(self.reducingTo)) + self.showExtras()
-
-    def doGroupMap(self):
-        self.doStoreKeyedRows(self.inner,self.groupBy,-1)
-
-class Join(MapReduce):
-    """Outputs tuples of the form (row1,row2,...rowk) where
-    rowi is from the i-th join input, and the rowi's have the same
-    value of the property being joined on."""
-
-    def __init__(self,*joinInputs):
-        #sets self.inners
-        MapReduce.__init__(self,map(lambda x:x.view, joinInputs),None)
-        self.joinInputs = joinInputs
-        #re-interpret the 'outer' join parameters - semantically
-        #if jin[i] is outer, then all other inputs must be marked as _padWithNulls
-        if any(map(lambda jin:jin.outer, self.joinInputs)):
-            assert len(self.joinInputs)==2,'outer joins are only supported on two-way joins '+str(self.joinInputs)
-            for i in range(len(self.joinInputs)):
-                if self.joinInputs[i].outer:
-                    j = 1-i #the other index
-                    self.joinInputs[j]._padWithNulls = True
-
-    def acceptInnerView(self,otherView):
-        assert False, 'join cannot be RHS of a pipe - use JoinTo instead'
-
-    def mapPlan(self):
-        innerCheckpoints = map(lambda v:v.checkpoint(), self.inners)
-        midfile = self.planner.opts['viewdir'] + '/' + self.tag+'.gpmo'
-        step = Step(self, 'doJoinMap', src=innerCheckpoints, dst=self.checkpoint(), prereduce=True, hasIndex=True, mid=midfile, why=self.explanation())
-        return Plan().extend(step)
-
-    def applyDict(self,mapping,innerviewsOnly=False):
-        result = MapReduce.applyDict(self,mapping,innerviewsOnly=innerviewsOnly)
-        #also need to map over the join inputs
-        if isinstance(result,Join):
-            for i in range(len(result.joinInputs)):
-                result.joinInputs[i].view = result.inners[i]
-        return result
-
-    def rowGenerator(self):
-        """Group objects from stdin by key, yielding tuples (row1,row2,...)."""
-        lastkey = None
-        lastIndex = len(self.joinInputs)-1
-        somethingProducedForLastKey = False
-        #accumulate a list of lists of all non-final inputs
-        accumList = [ [] for i in range(lastIndex) ]
-        for line in sys.stdin:
-            keyStr,indexStr,valStr = line.strip().split("\t")
-            key = self.planner._serializer.fromString(keyStr)
-            index = int(indexStr)
-            val = self.planner._serializer.fromString(valStr)
-            if key != lastkey and lastkey!=None:
-                #if the final join is marked as _padWithNulls, clear
-                #the accumulators, since we're doing an outer join
-                #with the last view
-                if self.joinInputs[lastIndex]._padWithNulls and not somethingProducedForLastKey:
-                    for tup in self._joinAccumulatedValuesTo(accumList,lastIndex,None):
-                        yield tup
-                #reset the accumulators, since they pertain to the previous key
-                accumList = [ [] for i in range(lastIndex) ]
-                somethingProducedForLastKey = False
-            if index!=lastIndex:
-                #accumulate values to use in the join
-                accumList[index] = accumList[index] + [val]
-            else:
-                #produce tuples that match the key for the last view
-                for tup in self._joinAccumulatedValuesTo(accumList,lastIndex,val):
-                    somethingProducedForLastKey = True
-                    yield tup
-            lastkey = key
-
-    def _joinAccumulatedValuesTo(self,accumList,lastIndex,finalVal):
-        # _padWithNulls as needed
-        for i in range(lastIndex):
-            if self.joinInputs[i]._padWithNulls and not accumList[i]:
-                accumList[i] = [None]
-        tupbuf = [ None for i in range(lastIndex+1) ] #holds output
-        tupbuf[lastIndex] = finalVal
-        for i in range(lastIndex):
-            for a in accumList[i]:
-                tupbuf[i] = a
-                if i==lastIndex-1 and any(tupbuf):
-                    yield tuple(tupbuf)
-
-    def explanation(self):
-        innerEx = []
-        for inner in self.inners:
-            if innerEx: innerEx += ['THEN']
-            innerEx += inner.explanation()
-        return innerEx + [ 'FINALLY join to %s' % self.tag ]
-
-    def __str__(self):
-        return "Join(%s)" % ",".join(map(str,self.joinInputs)) + self.showExtras()
-
-    def doJoinMap(self,i):
-        # called by joinMapPlan with argument index, and stdin pointing to innerCheckpoints[index]
-        self.doStoreKeyedRows(self.joinInputs[i].view,self.joinInputs[i].joinBy,i)
-
-class JoinTo(Join):
-    """Special case of Join which can be used as the RHS of a pipe operator."""
-
-    def __init__(self,joinInput,by=None):
-        Join.__init__(self,Jin(None,by),joinInput)
-
-    def acceptInnerView(self,otherView):
-        self.joinInputs[0].view = otherView
-        self.inners[0] = otherView
-
-##############################################################################
-#
-# the top-level planner, and its supporting classes
-#
-##############################################################################
-
-class Plan(object):
-    """A plan constructed by a GuineaPig."""
-
-    def __init__(self): self.steps = []
-
-    def extend(self,step):
-        self.steps += [step]
-        return self
-
-    def append(self,subPlan):
-        self.steps += subPlan.steps
-        return self
-
-    def execute(self,gp,echo=False):
-        script = self.compile(gp)
-        for shellcom in script:
-            if echo: print 'calling:',shellcom
-            subprocess.check_call(shellcom,shell=True)
-
-    def compile(self,gp):
-        """Return a list of strings that can be run as shell commands."""
-        script = []
-        i = 0
-        while (i<len(self.steps)):
-            s = self.steps[i]
-            i += 1
-            if s.whatToDo=='DISTRIBUTE':
-                script += s.distributeCommands(gp)
-            elif not s.prereduce:
-                script += s.mapOnlyCommands(gp)
-            else:
-                reduceStep = self.steps[i]
-                i += 1
-                if isinstance(s.src,list):
-                    script += s.multiMapReduceCommands(reduceStep,gp)
-                else:
-                    script += s.mapReduceCommands(reduceStep,gp)
-        return script
-
-class Step(object):
-    """A single step of the plans produced by the planner."""
-    # - view is the view stored by the step, and whatToDo is the name
-    # of the view method (doFoo) that the step invokes
-    # - src and dst are the step's input and output files
-    # - mid is a file for intermediate mapper output, used when a
-    # reduce has several map inputs
-    # - prereduce means dst is a reducer input; with hasIndex the
-    # mapper outputs tuples <key,index,value> such that
-    # items should be partitioned by key and sorted by index
-    # - why is documentation/explanation.
-    # - reused is list of tags of views that should be reused.
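For orientation before the details below: in 'shell' mode, coreCommand() renders a step as a re-invocation of the user's script, so a map-only step for a hypothetical view 'lower' defined in myscript.py compiles to something like

    python myscript.py --view=lower --do=doStoreRows < input.txt > gpig_views/lower.gp

while a prereduce step and its reduce step are glued together with 'sort -k1' (see mapReduceCommands); in 'hadoop' mode the same command strings become the -mapper and -reducer arguments of a streaming-jar invocation.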
- - def __init__(self,view,whatToDo,src=None,dst=None,prereduce=False,hasIndex=False,mid=None,why=[],reused=[]): - self.view = view - self.whatToDo = whatToDo - self.src = src - self.dst = dst - self.prereduce = prereduce - self.hasIndex = hasIndex - self.mid = mid - self.why = why - self.reused = reused - - def setReusedViews(self,views): - self.reused = list(views) - - def __str__(self): - return repr(self) - - def __repr__(self): - return "Step(%s,%s,src=%s,dst=%s,prereduce=%s,mid=%s,why=%s,reused=%s)" \ - % (repr(self.view.tag),repr(self.whatToDo),repr(self.src), - repr(self.dst),repr(self.prereduce),repr(self.mid),repr(self.explain()),repr(self.reused)) - - def explain(self): - """Convert an explanation - which is a list of strings - into a string""" - return "...".join(self.why) - - # - # subroutines of the general case for code generation - # - - class HadoopCommand(object): - def __init__(self,gp,*views): - logging.info('building hadoop command for '+str(map(lambda v:v.tag, views))) - self.invocation = [GPig.HADOOP_LOC,'jar',gp.opts['streamJar']] - self.defs = [] - self.args = [] - self.files = [] - for f in gp._shippedFiles: - self.files += ['-file',f] - for view in views: - viewsToShip = view.sideviewsNeeded() - if viewsToShip: - logging.info('shipping for '+view.tag+': '+str(map(lambda sv:sv.tag, viewsToShip))) - for sv in viewsToShip: - self.files += ['-file',sv.distributableFile()] - logging.info('files: '+str(self.files)) - def append(self,*toks): - self.args += list(toks) - def appendDef(self,*toks): - self.defs += list(toks) - def asEcho(self): - return " ".join(['echo','hadoop'] + self.args + ['...']) - def asString(self): - return " ".join(self.invocation+self.defs+self.files+self.args) - - def subplanHeader(self,reduceStep=None): - """Generate an explanatory header for a step.""" - if not reduceStep: return ['#', 'echo create '+self.view.tag + ' via map: '+self.explain()] - else: return ['#', 'echo create '+reduceStep.view.tag+' via map/reduce: '+reduceStep.explain()] - - def coreCommand(self,gp): - """Python command to call an individual plan step.""" - return 'python %s --view=%s --do=%s' % (gp._gpigSourceFile,self.view.tag,self.whatToDo) + self.coreCommandOptions(gp) - - def ithCoreCommand(self,gp,i): - """Like coreCommand but allows index parameter to 'do' option""" - return 'python %s --view=%s --do=%s.%d' % (gp._gpigSourceFile,self.view.tag,self.whatToDo,i) + self.coreCommandOptions(gp) - - def coreCommandOptions(self,gp): - paramOpts = '' if not gp.param else " --params " + ",".join(map(lambda(k,v):k+':'+v, gp.param.items())) - nonDefaults = [] - for (k,v) in gp.opts.items(): - #pass in non-default options, or options computed from the environment - if (gp.opts[k] != GPig.DEFAULT_OPTS[k]) or ((k in GPig.COMPUTED_OPTION_DEFAULTS) and (gp.opts[k] != GPig.COMPUTED_OPTION_DEFAULTS[k])): - nonDefaults += ["%s:%s" % (k,str(v))] - optsOpts = '' if not nonDefaults else " --opts " + ",".join(nonDefaults) - reuseOpts = '' if not self.reused else " --reuse "+ " ".join(self.reused) - return paramOpts + optsOpts + reuseOpts - - def hadoopClean(self,gp,fileName): - """A command to remove a hdfs directory if it exists.""" - #return '(%s fs -test -e %s && %s fs -rmr %s) || echo no need to remove %s' % (GPig.HADOOP_LOC,fileName, GPig.HADOOP_LOC,fileName, fileName) - return '(%s fs -test -e %s && %s fs -rmr %s) || echo no need to remove %s' % (GPig.HADOOP_LOC,fileName, GPig.HADOOP_LOC,fileName, fileName) - - # - # actual code generation for the steps - # - - # one special case - 
'distribute' a computed view, ie move to distributed cache - - def distributeCommands(self,gp): - """Special-purpose step: Make a view available for use as a side view.""" - localCopy = self.view.distributableFile() - maybeRemoteCopy = self.view.storedFile() - echoCom = 'echo DISTRIBUTE %s: making a local copy of %s in %s' % (self.view.tag,maybeRemoteCopy,localCopy) - if gp.opts['target']=='hadoop': - return [echoCom, 'rm -f %s' % localCopy, '%s fs -getmerge %s %s' % (GPig.HADOOP_LOC,maybeRemoteCopy, localCopy)] - else: - return [echoCom, 'cp -f %s %s || echo warning: the copy failed!' % (maybeRemoteCopy,localCopy)] - - - # one general case - a map-only step with only one input - - def mapOnlyCommands(self,gp): - """A subplan for a mapper-only step.""" - if gp.opts['target']=='shell': - command = None - if self.src: command = self.coreCommand(gp) + ' < %s > %s' % (self.src,self.dst) - else: command = self.coreCommand(gp) + (' > %s' % (self.dst)) - return self.subplanHeader() + [command] - elif gp.opts['target']=='hadoop': - assert self.src,'Wrap not supported for hadoop' - hcom = self.HadoopCommand(gp,self.view) - hcom.appendDef('-D','mapred.reduce.tasks=0') - hcom.append('-input',self.src,'-output',self.dst) - hcom.append("-mapper '%s'" % self.coreCommand(gp)) - return self.subplanHeader() + [ hcom.asEcho(), self.hadoopClean(gp,self.dst), hcom.asString() ] - else: - assert False - - # another general case - a map-reduce step - - def mapReduceCommands(self,reduceStep,gp): - """A subplan for a map-reduce step followed by a reduce, where the map has one input.""" - if gp.opts['target']=='shell': - command = self.coreCommand(gp) + (' < %s' % self.src) + ' | sort -k1 | '+reduceStep.coreCommand(gp) + (' > %s' % reduceStep.dst) - return self.subplanHeader(reduceStep) + [command] - elif gp.opts['target']=='hadoop': - hcom = self.HadoopCommand(gp,self.view,reduceStep.view) - hcom.appendDef('-D','mapred.reduce.tasks=%d' % gp.opts['parallel']) - hcom.append('-input',self.src,'-output',reduceStep.dst) - hcom.append("-mapper '%s'" % self.coreCommand(gp)) - hcom.append("-reducer '%s'" % reduceStep.coreCommand(gp)) - return self.subplanHeader(reduceStep) + [ hcom.asEcho(), self.hadoopClean(gp,reduceStep.dst), hcom.asString() ] - else: - assert False - - # another general case - a map-reduce step with multiple map inputs - - def multiMapReduceCommands(self,reduceStep,gp): - """A subplan for a map-reduce step followed by a reduce, where the map has many inputs.""" - if gp.opts['target']=='shell': - subplan = ['rm -f %s' % self.mid] - for i in range(len(self.src)): - subplan += [ self.ithCoreCommand(gp,i) + ' < %s >> %s' % (self.src[i],self.mid) ] - sortOpts = '-k1,2' if self.hasIndex else '-k1' - subplan += [ 'sort ' + sortOpts + ' < ' + self.mid + ' | ' + reduceStep.coreCommand(gp) + (' > %s' % reduceStep.dst)] - return self.subplanHeader(reduceStep) + subplan - elif gp.opts['target']=='hadoop': - def midi(i): return self.mid + '-' + str(i) - subplan = [] - for i in range(len(self.src)): - hcom = self.HadoopCommand(gp,self.view) - hcom.appendDef('-D','mapred.reduce.tasks=%d' % gp.opts['parallel']) - hcom.append('-input',self.src[i], '-output',midi(i)) - hcom.append("-mapper","'%s'" % self.ithCoreCommand(gp,i)) - subplan += [ self.hadoopClean(gp,midi(i)), hcom.asEcho(), hcom.asString() ] - hcombineCom = self.HadoopCommand(gp,reduceStep.view) - hcombineCom.appendDef('-D','mapred.reduce.tasks=%d' % gp.opts['parallel']) - if (self.hasIndex): - 
hcombineCom.appendDef('-jobconf','stream.num.map.output.key.fields=3') - hcombineCom.appendDef('-jobconf','num.key.fields.for.partition=1') - for i in range(len(self.src)): - hcombineCom.append('-input',midi(i)) - hcombineCom.append('-output',reduceStep.dst) - hcombineCom.append('-mapper','cat') - hcombineCom.append('-reducer',"'%s'" % reduceStep.coreCommand(gp)) - if (self.hasIndex): - hcombineCom.append('-partitioner','org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner') - subplan += [ self.hadoopClean(gp,reduceStep.dst), hcombineCom.asEcho(), hcombineCom.asString() ] - return self.subplanHeader(reduceStep) + subplan - else: - assert False - -class RowSerializer(object): - """Saves row objects to disk and retrieves them.""" - def __init__(self,target): - self._target = target - self._reprInverse = None - def toString(self,x): - return repr(x) - def fromString(self,s): - if self._reprInverse: return self._reprInverse(s) - else: return eval(s) - -# -# the planner -# - -class Planner(object): - """Can create storage plans for views that are defined as parts of it.""" - - def __init__(self,**kw): - - #parameters are used for programmatically give user-defined - #config information to a planner, or they can be specified in - #the command-line - self.param = kw - for (key,val) in GPig.getArgvParams().items(): - # don't override non-null values specified in the constructor - if self.param.get(key)==None: - self.param[key] = val - - #opts are used for giving options to the planner from the shell - self.opts = GPig.getArgvOpts() - for (key,val) in GPig.DEFAULT_OPTS.items(): - if (not key in self.opts): self.opts[key] = val - for (key,type) in GPig.DEFAULT_OPT_TYPES.items(): - self.opts[key] = type(self.opts[key]) - - #use serializer appropriate for the target - self._serializer = RowSerializer(self.opts['target']) - - #views that aren't associated with class variable, but are - #instead named automatically - ie, inner views with no - #user-provided names. - self._autoNamedViews = {} - - #by default, use info-level logging at planning time - if not Planner.partOfPlan(sys.argv): - logging.basicConfig(level=logging.INFO) - - #hadoop needs to know where to give the main script file, - #as well as the guineapig.py file it uses - self._gpigSourceFile = sys.argv[0] - self._shippedFiles = [GPig.MY_LOC,self._gpigSourceFile] - - def setup(self): - """Initialize planner, and views used by the planner. 
This has to be - done after the planner is fully configured by adding views.""" - - self.reusableViews = {} - - # make sure view directory is valid - if self.opts['target']=='shell' and not os.path.exists(self.opts['viewdir']): - logging.info('creating view directory ' + self.opts['viewdir']) - os.makedirs(self.opts['viewdir']) - elif self.opts['target']=='hadoop': - p = urlparse.urlparse(self.opts['viewdir']) - if not p.path.startswith("/"): - logging.warn('hadoop viewdir should be absolite path: will try prefixing /user/$LOGNAME') - username = os.environ.get('LOGNAME','me') - self.opts['viewdir'] = '/user/'+username+'/'+self.opts['viewdir'] - logging.warn('viewdir is set to '+self.opts['viewdir']) - - # Add 'tag' and planner fields to each view - for vname in self.listViewNames(): - v = self.getView(vname) - v.tag = vname - v.planner = self - def tagUnnamedViews(v,basename,index,depth): - assert v,'null inner view for '+basename - if not v.planner: - v.planner = self - autoname = '%s_%d_%s' % (basename,depth,index) - self._setView(autoname,v) - for i,inner in enumerate(v.inners + v.sideviews): - tagUnnamedViews(inner,vname,i,depth+1) - for vname in self.listViewNames(): - v = self.getView(vname) - for i,inner in enumerate(v.inners + v.sideviews): - tagUnnamedViews(inner,vname,i,1) - - # Add caching options as needed - for vname in self.listViewNames(): - v = self.getView(vname) - v.enforceStorageConstraints() - - # - # utils - # - - - def getView(self,str,mustExist=False): - """Find the defined relation named str, and if necessary bind its - planner and tag appropriately.""" - v = self.__class__.__dict__.get(str) or self.__dict__.get(str) or self._autoNamedViews.get(str) - if mustExist: assert v,'cannot find a view named '+str - return v - - def _setView(self,str,view): - """Internal use only: allow the view to be retreived by name later.""" - view.tag = str - self._autoNamedViews[str] = view - - def listViewNames(self): - def namedViews(d): return [vname for vname in d.keys() if isinstance(self.getView(vname),View)] - userNamedViews = namedViews(self.__class__.__dict__) + namedViews(self.__dict__) - return userNamedViews + self._autoNamedViews.keys() - - # - # planning - # - - def buildRecursiveStoragePlan(self,view): - """Called by view.storagePlan.""" - #figure out what to reuse - starting with what the user specified - storedViews = dict(self.reusableViews) - #also mark for eager storage anything that's used twice in the - #plan---i.e., anything that is consumed by two or more views - numParents = collections.defaultdict(int) - for dv in self._descendants(view): - for inner in dv.inners + dv.sideviews: - numParents[inner] += 1 - for (dv,n) in numParents.items(): - if n>1 and dv.storeMe==None: - logging.info('making %s stored because it is used %d times in creating %s' % (dv,n,view.tag)) - dv.storeMe = True - - #traverse view in pre-order and find a linear sequence of - #views to store, each of which requires only views earlier in - #the sequence - storageSeq = self._storageSeq(view,storedViews) + [view.tag] - logging.info('storage sequence is: ' + ",".join(storageSeq)) - - #splice together plans for each view in the sequence, - #after first modifying the view so that nothing is called - #directly, but only through the ReuseView proxies - plan = Plan() - for tag in storageSeq: - v = self.getView(tag,mustExist=True) - vm = v.applyDict(storedViews,innerviewsOnly=True) - subplan = vm.nonrecursiveStoragePlan() - #add the correct context of reused views to the subplan, - #so that that the 
actual definition of the view will be - #rewritten appropriately to include the new ReuseView - #proxy for it - viewsLocallyReused = self._reuseViewDescendants(vm) - for s in subplan.steps: - s.setReusedViews(viewsLocallyReused) - plan.append(subplan) - return plan - - def _reuseViewDescendants(self,view): - """Descendent views that are ReuseView's""" - result = set() - for dv in self._descendants(view): - if isinstance(dv,ReuseView): - result.add(dv.reusedViewTag) - return result - - def _descendants(self,view): - """Descendents of a view.""" - result = set() - result.add(view) - for inner in view.inners + view.sideviews: - result = result.union(self._descendants(inner)) - return result - - def _storageSeq(self,view,storedViews): - """Linear sequence of storage actions to take - as view tags.""" - seq = [] - for inner in view.inners + view.sideviews: - if not inner.tag in storedViews: - seq += self._storageSeq(inner,storedViews) - if inner.storeMe: - seq += [inner.tag] - storedViews[inner.tag] = ReuseView(inner) - return seq - - # - # dealing with the file storage system and related stuff - # - - def ship(self,*fileNames): - """Declare a set of inputs to be 'shipped' to the hadoop cluster.""" - self._shippedFiles += fileNames - - def setSerializer(self,serializer): - """Replace the default serializer another RowSerializer object.""" - self._serializer = serializer - return self - - def setReprInverseFun(self,reprInverse): - """Specify a function which will deserialize a string that was produced - by Python's 'repr' function.""" - self._serializer._reprInverse = reprInverse - return self - - # - # rest of the API for the planner - # - - @staticmethod - def partOfPlan(argv): - """True if the command line was generated as part of a storage plan.""" - return any(s.startswith("--do") for s in argv) - - def main(self,argv): - """Run a main that lets you --store a view, as well as doing a few other things.""" - self.setup() - self.runMain(argv) - - def runMain(self,argv): - - # parse the options and dispatch appropriately - argspec = ["store=", "cat=", "reuse", - "list", "pprint=", "steps=", "plan=", - "params=", "opts=", "do=", "view="] - optlist,args = getopt.getopt(argv[1:], 'x', argspec) - optdict = dict(optlist) - - # decide what views can be re-used, vs which need fresh plans - if '--reuse' in optdict: #reuse the views listed in the arguments - for a in args: - vname = View.viewNameFor(a) - v = self.getView(vname) - if v: - self.reusableViews[v.tag] = ReuseView(v) - logging.info("re-using data stored for view "+vname+": "+str(v)) - else: - logging.warn("cannot re-use view "+vname+" since it's not used in this script") - - #choose the main action to take - if '--store' in optdict: #store a view - rel = self.getView(optdict['--store'],mustExist=True) - plan = rel.storagePlan() - plan.execute(self, echo=self.opts['echo']) - return - elif '--pprint' in optdict: #print a view - rel = self.getView(optdict['--pprint'],mustExist=True) - rel.applyDict(self.reusableViews).pprint() - return - elif '--steps' in optdict: #print a view - rel = self.getView(optdict['--steps'],mustExist=True) - plan = rel.storagePlan() - for s in plan.steps: - print ' -',s - return - elif '--plan' in optdict: #print a storage plan - rel = self.getView(optdict['--plan'],mustExist=True) - plan = rel.storagePlan() - print "\n".join(plan.compile(self)) - return - elif '--cat' in optdict: #store and then print a view - assert self.opts['target']=='shell','cannot do --cat except in shell mode' - rel = 
self.getView(optdict['--cat'],mustExist=True) - plan = rel.storagePlan() - plan.execute(self, self.opts['echo']) - for line in open(rel.storedFile(),'r'): - print line, - return - elif '--list' in optdict: #list named views - for vname in self.listViewNames(): - print ' ',vname,'\t',self.getView(vname) - return - elif '--do' in optdict: #run an internally-generated action - #recover what should be stored when this action is performed - #work out what view to use and what routine to call - rel = self.getView(optdict['--view'],mustExist=True) - rel = rel.applyDict(self.reusableViews) - whatToDo = optdict['--do'] - #work out the method given by 'do' and call it - note it - #may have a single integer argument, eg doJoinMap.1 - k = whatToDo.find(".") - if k<0: - whatToDoMethod = getattr(rel,whatToDo) - whatToDoMethod() - else: - arg = int(whatToDo[k+1:]) - whatToDo = whatToDo[:k] - whatToDoMethod = getattr(rel,whatToDo) - whatToDoMethod(arg) - return - else: - print 'usage: --[store|pprint|steps|plan|cat] view [--opts key:val,...] [--params key:val,...] --reuse view1 view2 ...]' - print ' --[list]' - print 'current legal keys for "opts", with default values:' - for (key,val) in GPig.DEFAULT_OPTS.items(): - print ' %s:%s' % (key,str(val)) - print 'There\'s more help at http://curtis.ml.cmu.edu/w/courses/index.php/Guinea_Pig' - -if __name__ == "__main__": - print 'There\'s help at http://curtis.ml.cmu.edu/w/courses/index.php/Guinea_Pig' diff --git a/tutorial/guineapig1_3.py b/tutorial/guineapig1_3.py deleted file mode 100644 index 9b1fc5a..0000000 --- a/tutorial/guineapig1_3.py +++ /dev/null @@ -1,1384 +0,0 @@ -############################################################################## -# (C) Copyright 2014 William W. Cohen. All rights reserved. -############################################################################## - -import sys -import logging -import copy -import subprocess -import collections -import os -import os.path -import urlparse -import getopt -import csv - -############################################################################### -# helpers functions and data structures -############################################################################### - -class GPig(object): - """Collection of utilities for Guinea Pig.""" - - HADOOP_LOC = 'hadoop' #assume hadoop is on the path at planning time - MY_LOC = 'guineapig1_3.py' - - #global options for Guinea Pig can be passed in with the --opts - #command-line option, and these are the default values - defaultJar = '/usr/lib/hadoop/contrib/streaming/hadoop-streaming-1.2.0.1.3.0.0-107.jar' - envjar = os.environ.get('GP_STREAMJAR', defaultJar) - DEFAULT_OPTS = {'streamJar': envjar, - 'parallel':5, - 'target':'shell', - 'echo':0, - 'viewdir':'gpig_views', - } - #there are the types of each option that has a non-string value - DEFAULT_OPT_TYPES = {'parallel':int,'echo':int} - #we need to pass non-default options in to mappers and reducers, - #but since the remote worker's environment can be different, we - #also need to pass in options computed from the environment - COMPUTED_OPTION_DEFAULTS = {'streamJar':defaultJar} - - @staticmethod - def getCompiler(target): - if target=='shell': return ShellCompiler() - elif target=='hadoop': return HadoopCompiler() - else: assert 'illegal compilation target '+target - - @staticmethod - def getArgvParams(): - """Return a dictionary holding the argument of the --params option in - sys.argv.""" - return GPig.getArgvDict('--params') - - @staticmethod - def getArgvOpts(): - """Return a dictionary 
holding the argument of the --opts option in - sys.argv.""" - return GPig.getArgvDict('--opts') - - @staticmethod - def getArgvDict(optname): - """Return a dictionary of parameter values that were defined on the command line - view an option like '--params filename:foo.txt,basedir:/tmp/glob/'. - """ - assert optname.startswith('--') - for i,a in enumerate(sys.argv): - if a==optname: - paramString = sys.argv[i+1] - return dict(pair.split(":") for pair in paramString.split(",")) - return {} - - @staticmethod - def rowsOf(view): - """Iterate over the rows in a view.""" - for line in open(view.distributableFile()): - yield view.planner._serializer.fromString(line.strip()) - - @staticmethod - def onlyRowOf(view): - """Return the first row in a side view, and raise an error if it - is not the only row of the view.""" - result = None - logging.info('loading '+view.distributableFile()) - for line in open(view.distributableFile()): - assert not result,'multiple rows in stored file for '+view.tag - result = view.planner._serializer.fromString(line.strip()) - return result - - @staticmethod - class SafeEvaluator(object): - """Evaluates expressions that correzpond to serialized guinea pig rows.""" - def __init__(self,restrictedBindings={}): - self.restrictedBindings = restrictedBindings - def eval(self,s): - code = compile(s,'','eval') - return eval(code,self.restrictedBindings) - -class Jin(object): - """"Object to hold description of a single join input.""" - - def __init__(self,view,by=(lambda x:x),outer=False): - self.view = view - self.joinBy = by - self.outer = outer - self._padWithNulls = False - - def __str__(self): - viewStr = View.asTag(self.view) if self.view else '_' - outerStr = ',outer=True' if self.outer else '' - padStr = ',_padWithNulls=True' if self._padWithNulls else '' - return "Jin(%s,by=%s%s%s)" % (viewStr,self.joinBy,outerStr,padStr) - -class ReduceTo(object): - """An object x that can be the argument of a reducingTo=x - parameter in a Group view.""" - def __init__(self,baseType,by=lambda accum,val:accum+val): - self.baseType = baseType - self.reduceBy = by - -class ReduceToCount(ReduceTo): - """Produce the count of the number of objects that would be placed in a group.""" - def __init__(self): - ReduceTo.__init__(self,int,by=lambda accum,val:accum+1) - -class ReduceToSum(ReduceTo): - """Produce the sum of the objects - which must be numbers - that would - be placed in a group.""" - def __init__(self): - ReduceTo.__init__(self,int,by=lambda accum,val:accum+val) - -class ReduceToList(ReduceTo): - """Produce a list of the objects that would be placed in a group.""" - def __init__(self): - ReduceTo.__init__(self,list,by=lambda accum,val:accum+[val]) - -############################################################################### -# abstract views -############################################################################## - -class View(object): - """A definition of a relation for Guinea Pig. A View object can be - produce a storagePlan(), which can then be executed to produce the - contents of the relation. Intutitively, a relation is and - unordered bag of rows, and a row an almost-arbitrary python data - structure. (It must be something that can be stored and retrieved - by the RowSerializer.) - - Steps in the storagePlan are executed by delegation, thru the - planner, to methods of a View class named doFoo. 
- """ - - def __init__(self): - """The planner and tag must be set before this is used.""" - self.planner = None #pointer to planner object - self.tag = None #for naming storedFile and checkpoints - self.storeMe = None #try and store this view if true - self.retainedPart = None #used in map-reduce views only - self.sideviews = [] #non-empty for Augment views only - self.inners = [] #always used - - #self.inner is shortcut for inners[0] - def _getInner(self): return self.inners[0] - def _setInner(self,val): self.inners = [val] - inner = property(_getInner,_setInner) - - # - # ways to modify a view - # - - def opts(self,stored=None): - """Return the same view with options set appropriately. Possible - options include: - - - stored=True - Explicitly store this view on disk whenever - it is used in another view's definition. This might be set - by the user for debugging purposes, or by the planner, - to prevent incorrect optimizations. Generally "inner" - views are not explicitly stored. - - - stored='distributedCache' - Store this view in the working - directory and/or the Hadoop distributed cache. - """ - - self.storeMe = stored - return self - - def showExtras(self): - """Printable representation of the options for a view.""" - result = '' - flagPairs = [] - if self.storeMe: flagPairs += ['stored=%s' % repr(self.storeMe)] - if flagPairs: result += '.opts(' + ",".join(flagPairs) + ')' - return result - - # - # how the view is saved on disk - # - - def storedFile(self): - """The file that will hold the materialized relation.""" - return self.planner.opts['viewdir'] + '/' + self.tag + '.gp' - - def distributableFile(self): - """The file that will hold the materialized relation in the working directory - in preparation to be uploaded to the distributed cache.""" - return self.tag + '.gp' - - @staticmethod - def viewNameFor(fileName): - """The view associated with the given file name""" - vname = os.path.basename(fileName) - if vname.endswith(".gp"): vname = vname[0:-len(".gp")] - return vname - - # - # semantics of the view - # - - def checkpoint(self): - """A checkpoint is an intermediate computation for the view, which is - saved on disk. The rowGenerator() for the view will assume - that the checkpoint is available. 
- """ - assert False, 'abstract method called' - - def checkpointPlan(self): - """A plan to produce checkpoint().""" - assert False, 'abstract method called' - - def rowGenerator(self): - """A generator for the rows in this relation, which assumes existence - of the checkpoint.""" - assert False, 'abstract method called' - - def explanation(self): - """Return an explanation of how rows are generated.""" - assert False, 'abstract method called' - - def storagePlan(self): - """A plan to store the view.""" - return self.planner.buildRecursiveStoragePlan(self) - - def nonrecursiveStoragePlan(self): - """Materialize the relation, assuming that there are no descendent - inner views that need to be materialized first.""" - plan = Plan() - plan.includeStepsOf(self.checkpointPlan()) - plan.append(TransformStep(view=self,whatToDo='doStoreRows',srcs=[self.checkpoint()],dst=self.storedFile(),why=self.explanation())) - return plan - - def applyDict(self,mapping,innerviewsOnly=False): - """Given a mapping from view tags to views, replace every inner view with - the appropriate value from the mapping, and return the result.""" - if self.tag in mapping and not innerviewsOnly: - return mapping[self.tag] - elif not self.inners: - return self - else: - result = copy.copy(self) - result.inners = map(lambda v:v.applyDict(mapping), self.inners) - return result - - def sideviewsNeeded(self): - """Sideviews needed by this view.""" - result = [] - for sv in self.sideviews: - result += [sv] - for v in self.inners: - result += list(v._sideviewsOfDescendants()) - return result - - def _sideviewsOfDescendants(self): - if not self.storeMe: - for sv in self.sideviews: - yield sv - for v in self.inners: - for sv in v._sideviewsOfDescendants(): - yield sv - - def enforceStorageConstraints(self): - """Subclass this, if there are constraints on when one must explicitly - store inner views.""" - pass - - def doStoreRows(self): - """Called by planner at execution time to store the rows of the view.""" - for row in self.rowGenerator(): - print self.planner._serializer.toString(row) - - # - # support the "pipe" syntax: view1 | view2 - # - - def __or__(self,otherView): - """Overload the pipe operator x | y to return with y, with x as its inner view.""" - otherView.acceptInnerView(self) - return otherView - - def acceptInnerView(self,otherView): - """Replace an appropriate input view with otherView. This is subclassed to - implement the the pipe operator.""" - assert not self.inner,'An inner view is defined for '+self.tag+' so you cannot use it as RHS of a pipe' - self.inner = otherView #subclass if needed - - # - # printing views - # - - def pprint(self,depth=0,alreadyPrinted=None,sideview=False): - """Print a readable representation of the view.""" - if alreadyPrinted==None: alreadyPrinted = set() - tabStr = '| ' * depth - tagStr = str(self.tag) - sideviewIndicator = '*' if sideview else '' - if self.tag in alreadyPrinted: - print tabStr + sideviewIndicator + tagStr + ' = ' + '...' 
-#
-# abstract view types
-#
-
-class Reader(View):
-    """Read data stored on the file system and make it look like a View."""
-
-    def __init__(self,src):
-        View.__init__(self)
-        self.src = src
-        self.inners = []
-
-    def checkpoint(self):
-        return self.src
-
-    def checkpointPlan(self):
-        return Plan()  #empty plan
-
-    def explanation(self):
-        return [ 'read %s with %s' % (str(self.src),self.tag) ]
-
-    def acceptInnerView(self,otherView):
-        assert False, "Reader views cannot be used as RHS of a pipe"
-
-class Transformation(View):
-    """Streaming transformation on a single inner view."""
-
-    def __init__(self,inner=None):
-        View.__init__(self)
-        self.inner = inner
-
-    # A transformation will stream on-the-fly through the inner
-    # relation, and produce a new version, so the checkpoint and plan
-    # to produce it are delegated to the inner View.
-
-    def checkpoint(self):
-        return self.inner.checkpoint()
-
-    def checkpointPlan(self):
-        return self.inner.checkpointPlan()
-
-    def explanation(self):
-        return self.inner.explanation() + [ 'transform to %s' % self.tag ]
-
-class MapReduce(View):
-    """A view that takes an inner relation and processes it in a
-    map-reduce-like way."""
-
-    def __init__(self,inners,retaining):
-        View.__init__(self)
-        self.inners = inners
-        self.retainedPart = retaining
-
-    def _isReduceInputFile(self,fileName):
-        return fileName.endswith('.gpri')
-
-    def checkpoint(self):
-        ## the checkpoint is the reducer input file
-        return self.planner.opts['viewdir'] + '/' + self.tag + '.gpri'
-
-    def checkpointPlan(self):
-        plan = Plan()
-        for inner in self.inners:
-            plan.includeStepsOf(inner.checkpointPlan())
-        plan.includeStepsOf(self.mapPlan())
-        return plan
-
-    def enforceStorageConstraints(self):
-        for inner in self.inners:
-            innerChkpt = inner.checkpoint()
-            #optimizations break if you chain two map-reduces together
-            if innerChkpt and innerChkpt.endswith(".gpri"):
-                if not inner.storeMe:
-                    logging.info('making %s stored, to make possible a downstream map-reduce view' % inner.tag)
-                    inner.storeMe = True
-
-    def mapPlan(self):
-        logging.error("abstract method not implemented")
-
-    def doStoreKeyedRows(self,subview,key,index):
-        """Utility method used by concrete map-reduce classes to compute keys
-        and store key-value pairs. Usually used as the main step in a
-        mapPlan.
""" - for row in subview.rowGenerator(): - keyStr = self.planner._serializer.toString(key(row)) - rrow = self.retainedPart(row) if self.retainedPart else row - valStr = self.planner._serializer.toString(rrow) - if index<0: - print "%s\t%s" % (keyStr,valStr) - else: - print "%s\t%d\t%s" % (keyStr,index,valStr) - -############################################################################## -# -# concrete View classes -# -############################################################################## - -class ReuseView(Reader): - """Returns the objects in a previously stored view.""" - - def __init__(self,view): - if isinstance(view,View): - Reader.__init__(self,view.storedFile()) - self.tag = "reuse_"+view.tag - self.reusedViewTag = view.tag - self.planner = view.planner - else: - assert False,'user-defined ReuseView not supported (yet)' - - def rowGenerator(self): - for line in sys.stdin: - yield self.planner._serializer.fromString(line.strip()) - - def __str__(self): - return 'ReuseView("%s")' % self.src + self.showExtras() - - -class ReadLines(Reader): - """ Returns the lines in a file, as python strings.""" - - def __init__(self,src): - Reader.__init__(self,src) - - def rowGenerator(self): - for line in sys.stdin: - yield line - - def __str__(self): - return 'ReadLines("%s")' % self.src + self.showExtras() - -class ReadCSV(Reader): - """ Returns the lines in a CSV file, converted to Python tuples.""" - - def __init__(self,src,**kw): - Reader.__init__(self,src) - self.kw = kw - - def rowGenerator(self): - for tup in csv.reader(sys.stdin,**self.kw): - yield tup - - def __str__(self): - return 'ReadCVS("%s",%s)' % (self.src,str(self.kw)) + self.showExtras() - - -class ReplaceEach(Transformation): - """ In 'by=f'' f is a python function that takes a row and produces - its replacement.""" - - def __init__(self,inner=None,by=lambda x:x): - Transformation.__init__(self,inner) - self.replaceBy = by - - def rowGenerator(self): - for row in self.inner.rowGenerator(): - yield self.replaceBy(row) - - def explanation(self): - return self.inner.explanation() + [ 'replaced to %s' % self.tag ] - - def __str__(self): - return 'ReplaceEach(%s, by=%s)' % (View.asTag(self.inner),str(self.replaceBy)) + self.showExtras() - -class Augment(Transformation): - - def __init__(self,inner=None,sideviews=None,sideview=None,loadedBy=lambda v:list(GPig.rowsOf(v))): - Transformation.__init__(self,inner) - assert not (sideviews and sideview), 'cannot specify both "sideview" and "sideviews"' - self.sideviews = list(sideviews) if sideviews else [sideview] - self.loader = loadedBy - assert self.loader,'must specify a "loadedBy" function for Augment' - - def enforceStorageConstraints(self): - for sv in self.sideviews: - sv.storeMe = 'distributedCache' - - def rowGenerator(self): - augend = self.loader(*self.sideviews) - for row in self.inner.rowGenerator(): - yield (row,augend) - - def checkpointPlan(self): - plan = Plan() - plan.includeStepsOf(self.inner.checkpointPlan()) - #the sideviews should have been stored by the top-level - #planner already, but they will need to be moved to a - #distributable location - for sv in self.sideviews: - plan.append(DistributeStep(sv)) - return plan - - def explanation(self): - return self.inner.explanation() + [ 'augmented to %s' % self.tag ] - - def __str__(self): - sideviewTags = loaderTag = '*UNSPECIFIED*' - if self.sideviews!=None: sideviewTags = ",".join(map(View.asTag,self.sideviews)) - if self.loader!=None: loaderTag = str(self.loader) - return 
-
-class ReplaceEach(Transformation):
-    """ In 'by=f', f is a python function that takes a row and produces
-    its replacement."""
-
-    def __init__(self,inner=None,by=lambda x:x):
-        Transformation.__init__(self,inner)
-        self.replaceBy = by
-
-    def rowGenerator(self):
-        for row in self.inner.rowGenerator():
-            yield self.replaceBy(row)
-
-    def explanation(self):
-        return self.inner.explanation() + [ 'replaced to %s' % self.tag ]
-
-    def __str__(self):
-        return 'ReplaceEach(%s, by=%s)' % (View.asTag(self.inner),str(self.replaceBy)) + self.showExtras()
-
-class Augment(Transformation):
-
-    def __init__(self,inner=None,sideviews=None,sideview=None,loadedBy=lambda v:list(GPig.rowsOf(v))):
-        Transformation.__init__(self,inner)
-        assert not (sideviews and sideview), 'cannot specify both "sideview" and "sideviews"'
-        self.sideviews = list(sideviews) if sideviews else [sideview]
-        self.loader = loadedBy
-        assert self.loader,'must specify a "loadedBy" function for Augment'
-
-    def enforceStorageConstraints(self):
-        for sv in self.sideviews:
-            sv.storeMe = 'distributedCache'
-
-    def rowGenerator(self):
-        augend = self.loader(*self.sideviews)
-        for row in self.inner.rowGenerator():
-            yield (row,augend)
-
-    def checkpointPlan(self):
-        plan = Plan()
-        plan.includeStepsOf(self.inner.checkpointPlan())
-        #the sideviews should have been stored by the top-level
-        #planner already, but they will need to be moved to a
-        #distributable location
-        for sv in self.sideviews:
-            plan.append(DistributeStep(sv))
-        return plan
-
-    def explanation(self):
-        return self.inner.explanation() + [ 'augmented to %s' % self.tag ]
-
-    def __str__(self):
-        sideviewTags = loaderTag = '*UNSPECIFIED*'
-        if self.sideviews!=None: sideviewTags = ",".join(map(View.asTag,self.sideviews))
-        if self.loader!=None: loaderTag = str(self.loader)
-        return 'Augment(%s,sideviews=%s,loadedBy=%s)' % (View.asTag(self.inner),sideviewTags,loaderTag) + self.showExtras()
-
-
-class Format(ReplaceEach):
-    """ Like ReplaceEach, but the output should be a string, and it will be
-    stored as a string, i.e., without using the serializer."""
-
-    def __init__(self,inner=None,by=lambda x:str(x)):
-        ReplaceEach.__init__(self,inner,by)
-
-    def __str__(self):
-        return 'Format(%s, by=%s)' % (View.asTag(self.inner),str(self.replaceBy)) + self.showExtras()
-
-    def doStoreRows(self):
-        for row in self.rowGenerator():
-            print row
-
-class Flatten(Transformation):
-    """ Like ReplaceEach, but the output of 'by' is an iterable, and all
-    of its results will be returned."""
-
-    def __init__(self,inner=None,by=None):
-        Transformation.__init__(self,inner)
-        self.flattenBy = by
-
-    def rowGenerator(self):
-        for row in self.inner.rowGenerator():
-            for flatrow in self.flattenBy(row):
-                yield flatrow
-
-    def explanation(self):
-        return self.inner.explanation() + [ 'flatten to %s' % self.tag ]
-
-    def __str__(self):
-        return 'Flatten(%s, by=%s)' % (View.asTag(self.inner),str(self.flattenBy)) + self.showExtras()
-
-class Filter(Transformation):
-    """Keep only the rows that match some predicate."""
-
-    def __init__(self,inner=None,by=lambda x:x):
-        Transformation.__init__(self,inner)
-        self.filterBy = by
-
-    def rowGenerator(self):
-        for row in self.inner.rowGenerator():
-            if self.filterBy(row):
-                yield row
-
-    def explanation(self):
-        return self.inner.explanation() + [ 'filtered to %s' % self.tag ]
-
-    def __str__(self):
-        return 'Filter(%s, by=%s)' % (View.asTag(self.inner),str(self.filterBy)) + self.showExtras()
-
-class Distinct(MapReduce):
-    """Remove duplicate rows."""
-
-    def __init__(self,inner=None,retaining=None):
-        MapReduce.__init__(self,[inner],retaining)
-
-    def mapPlan(self):
-        plan = Plan()
-        plan.append(PrereduceStep(view=self,whatToDo='doDistinctMap',srcs=[self.inner.checkpoint()],dst=self.checkpoint(),why=self.explanation()))
-        return plan
-
-    def rowGenerator(self):
-        """Extract distinct elements from a sorted list."""
-        lastval = None
-        for line in sys.stdin:
-            valStr = line.strip()
-            val = self.planner._serializer.fromString(valStr)
-            if val != lastval and lastval!=None:
-                yield lastval
-            lastval = val
-        if lastval!=None:
-            yield lastval
-
-    def explanation(self):
-        return self.inner.explanation() + [ 'make distinct to %s' % self.tag]
-
-    def __str__(self):
-        return 'Distinct(%s)' % (View.asTag(self.inner)) + self.showExtras()
-
-    def doDistinctMap(self):
-        self.inner.doStoreRows()
-
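
A short pipeline sketch combining the transformations above; 'corpus.txt' is a hypothetical input:

    longWords = ReadLines('corpus.txt') \
        | Flatten(by=lambda line:line.strip().split()) \
        | Filter(by=lambda w:len(w)>3) \
        | Distinct()
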
-
-class Group(MapReduce):
-    """Group by some property of a row, defined with the 'by' option.
-    Default outputs are tuples (x,[r1,...,rk]) where the ri's are rows
-    that have 'by' values of x."""
-
-    def __init__(self,inner=None,by=lambda x:x,reducingTo=ReduceToList(),retaining=None):
-        MapReduce.__init__(self,[inner],retaining)
-        self.groupBy = by
-        self.reducingTo = reducingTo
-
-    def mapPlan(self):
-        plan = Plan()
-        plan.append(PrereduceStep(view=self,whatToDo='doGroupMap',srcs=[self.inner.checkpoint()],dst=self.checkpoint(),why=self.explanation()))
-        return plan
-
-    def rowGenerator(self):
-        """Group objects from stdin by key, yielding tuples (key,[g1,..,gn])."""
-        lastkey = key = None
-        accum = self.reducingTo.baseType()
-        for line in sys.stdin:
-            keyStr,valStr = line.strip().split("\t")
-            key = self.planner._serializer.fromString(keyStr)
-            val = self.planner._serializer.fromString(valStr)
-            if key != lastkey and lastkey!=None:
-                yield (lastkey,accum)
-                accum = self.reducingTo.baseType()
-            accum = self.reducingTo.reduceBy(accum, val)
-            lastkey = key
-        if lastkey!=None:
-            yield (lastkey,accum)
-
-    def explanation(self):
-        return self.inner.explanation() + ['group to %s' % self.tag]
-
-    def __str__(self):
-        return 'Group(%s,by=%s,reducingTo=%s)' % (View.asTag(self.inner),str(self.groupBy),str(self.reducingTo)) + self.showExtras()
-
-    def doGroupMap(self):
-        self.doStoreKeyedRows(self.inner,self.groupBy,-1)
-
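
Group is the workhorse for aggregation; a minimal word-count sketch as a complete script, assuming a hypothetical input file 'corpus.txt' and the ReduceToCount reducer:

    from guineapig import *
    import sys

    class WordCount(Planner):
        wc = ReadLines('corpus.txt') \
            | Flatten(by=lambda line:line.strip().split()) \
            | Group(by=lambda w:w, reducingTo=ReduceToCount())  # rows are (word,count) pairs

    if __name__ == "__main__":
        WordCount().main(sys.argv)
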
-
-class Join(MapReduce):
-    """Outputs tuples of the form (row1,row2,...rowk) where
-    rowi is from the i-th join input, and the rowi's have the same
-    value of the property being joined on."""
-
-    def __init__(self,*joinInputs):
-        #sets self.inners
-        MapReduce.__init__(self,map(lambda x:x.view, joinInputs),None)
-        self.joinInputs = joinInputs
-        #re-interpret the 'outer' join parameters - semantically,
-        #if jin[i] is outer, then all other inputs must be marked as _padWithNulls
-        if any(map(lambda jin:jin.outer, self.joinInputs)):
-            assert len(self.joinInputs)==2,'outer joins are only supported on two-way joins '+str(self.joinInputs)
-            for i in range(len(self.joinInputs)):
-                if self.joinInputs[i].outer:
-                    j = 1-i  #the other index
-                    self.joinInputs[j]._padWithNulls = True
-
-    def acceptInnerView(self,otherView):
-        assert False, 'join cannot be RHS of a pipe - use JoinTo instead'
-
-    def mapPlan(self):
-        plan = Plan()
-        innerCheckpoints = map(lambda v:v.checkpoint(), self.inners)
-        step = PrereduceStep(view=self, whatToDo='doJoinMap',srcs=innerCheckpoints,dst=self.checkpoint(),why=self.explanation())
-        plan.append(step)
-        return plan
-
-    def applyDict(self,mapping,innerviewsOnly=False):
-        result = MapReduce.applyDict(self,mapping,innerviewsOnly=innerviewsOnly)
-        #also need to map over the join inputs
-        if isinstance(result,Join):
-            for i in range(len(result.joinInputs)):
-                result.joinInputs[i].view = result.inners[i]
-        return result
-
-    def rowGenerator(self):
-        """Group objects from stdin by key, yielding tuples (row1,row2,...)."""
-        lastkey = None
-        lastIndex = len(self.joinInputs)-1
-        somethingProducedForLastKey = False
-        #accumulate a list of lists of all non-final inputs
-        accumList = [ [] for i in range(lastIndex) ]
-        for line in sys.stdin:
-            keyStr,indexStr,valStr = line.strip().split("\t")
-            key = self.planner._serializer.fromString(keyStr)
-            index = int(indexStr)
-            val = self.planner._serializer.fromString(valStr)
-            if key != lastkey and lastkey!=None:
-                #if the final join input is marked as _padWithNulls, clear
-                #the accumulators, since we're doing an outer join
-                #with the last view
-                if self.joinInputs[lastIndex]._padWithNulls and not somethingProducedForLastKey:
-                    for tup in self._joinAccumulatedValuesTo(accumList,lastIndex,None):
-                        yield tup
-                #reset the accumulators, since they pertain to the previous key
-                accumList = [ [] for i in range(lastIndex) ]
-                somethingProducedForLastKey = False
-            if index!=lastIndex:
-                #accumulate values to use in the join
-                accumList[index] = accumList[index] + [val]
-            else:
-                #produce tuples that match the key for the last view
-                for tup in self._joinAccumulatedValuesTo(accumList,lastIndex,val):
-                    somethingProducedForLastKey = True
-                    yield tup
-            lastkey = key
-
-    def _joinAccumulatedValuesTo(self,accumList,lastIndex,finalVal):
-        #_padWithNulls as needed
-        for i in range(lastIndex):
-            if self.joinInputs[i]._padWithNulls and not accumList[i]:
-                accumList[i] = [None]
-        tupbuf = [ None for i in range(lastIndex+1) ]  #holds output
-        tupbuf[lastIndex] = finalVal
-        for i in range(lastIndex):
-            for a in accumList[i]:
-                tupbuf[i] = a
-                if i==lastIndex-1 and any(tupbuf):
-                    yield tuple(tupbuf)
-
-    def explanation(self):
-        innerEx = []
-        for inner in self.inners:
-            if innerEx: innerEx += ['THEN']
-            innerEx += inner.explanation()
-        return innerEx + [ 'FINALLY join to %s' % self.tag ]
-
-    def __str__(self):
-        return "Join(%s)" % ",".join(map(str,self.joinInputs)) + self.showExtras()
-
-    def doJoinMap(self,i):
-        #called by joinMapPlan with argument index, and stdin pointing to innerCheckpoints[index]
-        self.doStoreKeyedRows(self.joinInputs[i].view,self.joinInputs[i].joinBy,i)
-
-class JoinTo(Join):
-    """Special case of Join which can be used as the RHS of a pipe operator."""
-
-    def __init__(self,joinInput,by=None):
-        Join.__init__(self,Jin(None,by),joinInput)
-
-    def acceptInnerView(self,otherView):
-        self.joinInputs[0].view = otherView
-        self.inners[0] = otherView
-
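
A sketch of Join/Jin usage on two hypothetical tab-separated inputs; each Jin names an input view and the key it is joined on:

    class JoinDemo(Planner):
        ages = ReadLines('ages.txt') | ReplaceEach(by=lambda line:tuple(line.strip().split('\t')))
        jobs = ReadLines('jobs.txt') | ReplaceEach(by=lambda line:tuple(line.strip().split('\t')))
        # rows of the join are ((name,age),(name,job)) pairs with matching names
        joined = Join( Jin(ages, by=lambda(name,age):name),
                       Jin(jobs, by=lambda(name,job):name) ) \
            | ReplaceEach(by=lambda((name1,age),(name2,job)):(name1,age,job))
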
-##############################################################################
-#
-# the top-level planner, and its supporting classes
-#
-##############################################################################
-
-class Plan(object):
-    """A plan constructed by Guinea Pig."""
-
-    def __init__(self):
-        self.steps = []
-        self.tasks = []
-
-    def append(self,step):
-        self.steps.append(step)
-
-    def includeStepsOf(self,subplan):
-        self.steps += subplan.steps
-
-    def execute(self,gp,echo=False):
-        script = self.compile(gp)
-        for shellcom in script:
-            if echo: print 'calling:',shellcom
-            subprocess.check_call(shellcom,shell=True)
-
-    def buildTasks(self):
-        """Group the steps into AbstractMapReduceTask's."""
-        self.tasks = [AbstractMapReduceTask()]
-        for step in self.steps:
-            if not self.tasks[-1].insert(step):
-                self.tasks.append(AbstractMapReduceTask())
-                status = self.tasks[-1].insert(step)
-                assert status, 'failure to insert '+str(step)+' in fresh AbstractMapReduceTask'
-
-    def compile(self,gp):
-        """Return a list of strings that can be run as shell commands."""
-        self.buildTasks()
-        logging.info("%d steps converted to %d abstract map-reduce tasks" % (len(self.steps),len(self.tasks)))
-        script = []
-        taskCompiler = GPig.getCompiler(gp.opts['target'])
-        for task in self.tasks:
-            script += taskCompiler.compile(task,gp)
-        return script
-
-#
-# a single step in a plan produced by the planner
-#
-
-class Step(object):
-    """A single step of the plans produced by the planner, along with the
-    methods to convert the plans into executable shell commands."""
-
-    def __init__(self,view):
-        self.view = view
-        self.reused = []  # list of views reused at this point
-        self.why = []
-
-    def setReusedViews(self,views):
-        self.reused = list(views)
-
-    def explain(self):
-        """Convert an explanation - which is a list of strings - into a string."""
-        return "...".join(self.why)
-
-#
-# specific kinds of steps
-#
-
-class DistributeStep(Step):
-    """Prepare a stored view for the distributed cache."""
-
-    def __init__(self,view):
-        Step.__init__(self,view)
-
-    def __str__(self):
-        return "DistributeStep(%s,reused=%s)" % (repr(self.view.tag),repr(self.reused))
-
-class TransformStep(Step):
-    """Transform input to output."""
-
-    def __init__(self,view,whatToDo,srcs,dst,why):
-        Step.__init__(self,view)
-        self.whatToDo = whatToDo
-        self.srcs = srcs
-        self.dst = dst
-        self.why = why
-
-    def __str__(self):
-        return "TransformStep("+",".join(map(repr, [self.view.tag,self.whatToDo,self.srcs,self.dst,self.reused]))+")"
-
-class PrereduceStep(Step):
-
-    def __init__(self,view,whatToDo,srcs,dst,why):
-        Step.__init__(self,view)
-        self.whatToDo = whatToDo
-        self.srcs = srcs
-        self.dst = dst
-        self.why = why
-
-    def __str__(self):
-        return "PrereduceStep("+",".join(map(repr, [self.view.tag,self.whatToDo,self.srcs,self.dst,self.reused]))+")"
-
-# combine steps into something executable via hadoop - or shell
-
-class AbstractMapReduceTask(object):
-    """A collection of steps that can be executed as a single map-reduce operation,
-    possibly with some file management steps to set up the task."""
-
-    def __init__(self):
-        self.distributeSteps = []
-        self.mapStep = None
-        self.reduceStep = None
-
-    def insert(self,step):
-        """Treating the AbstractMapReduceTask as a buffer, add this step to it if possible."""
-        if isinstance(step,DistributeStep):
-            #we can accept any number of distribute steps
-            self.distributeSteps.append(step)
-            return True
-        elif self.mapStep==None and (isinstance(step,TransformStep) or isinstance(step,PrereduceStep)):
-            #we can only have one map step, so fill up an empty slot if possible
-            self.mapStep = step
-            return True
-        elif self.mapStep and isinstance(self.mapStep,PrereduceStep) and isinstance(step,TransformStep) and not self.reduceStep:
-            #if the mapstep is a prereduce, then we can also allow a reduce step
-            self.reduceStep = step
-            return True
-        else:
-            return False
-
-    def __str__(self):
-        buf = "mapreduce task:"
-        for step in self.distributeSteps:
-            buf += "\n - d "+str(step)
-        buf += "\n - m " + str(self.mapStep)
-        if self.reduceStep:
-            buf += "\n - r " + str(self.reduceStep)
-        return buf
-
-class MRCompiler(object):
-    """Abstract compiler class to convert a task to a list of commands that can be executed by the shell."""
-
-    def compile(self,task,gp):
-        script = []
-        # an explanation/header
-        if not task.reduceStep:
-            script += ['echo create '+task.mapStep.view.tag + ' via map: ' + task.mapStep.explain()]
-        else:
-            script += ['echo create '+task.reduceStep.view.tag +' via map/reduce: '+task.reduceStep.explain()]
-        for step in task.distributeSteps:
-            localCopy = step.view.distributableFile()
-            maybeRemoteCopy = step.view.storedFile()
-            echoCom = 'echo distribute %s: making a local copy of %s in %s' % (step.view.tag,maybeRemoteCopy,localCopy)
-            script += [echoCom] + self.distributeCommands(task, gp, maybeRemoteCopy,localCopy)
-        if not task.reduceStep and len(task.mapStep.srcs)==1:
-            mapCom = self._coreCommand(task.mapStep,gp)
-            script += self.simpleMapCommands(task, gp, mapCom, task.mapStep.srcs[0], task.mapStep.dst)
-        elif task.reduceStep and len(task.mapStep.srcs)==1:
-            mapCom = self._coreCommand(task.mapStep,gp)
-            reduceCom = self._coreCommand(task.reduceStep,gp)
-            script += self.simpleMapReduceCommands(task, gp, mapCom, reduceCom, task.mapStep.srcs[0], task.reduceStep.dst)
-        elif task.reduceStep and len(task.mapStep.srcs)>1:
-            mapComs = [self._ithCoreCommand(task.mapStep,gp,i) for i in range(len(task.mapStep.srcs))]
-            reduceCom = self._coreCommand(task.reduceStep,gp)
-            midpoint = gp.opts['viewdir']+'/'+task.mapStep.view.tag+'.gpmo'
-            script += self.joinCommands(task, gp, mapComs, reduceCom, task.mapStep.srcs, midpoint, task.reduceStep.dst)
-        else:
-            assert False,'cannot compile task '+str(task)
-        return script
-
-    # abstract routines
-
-    def distributeCommands(self,task,gp,maybeRemoteCopy,localCopy):
-        """Distribute the remote copy to the local directory."""
-        assert False, 'abstract method called'
-
-    def simpleMapCommands(self,task,gp,mapCom,src,dst):
-        """A map-only task with zero or one input."""
-        assert False, 'abstract method called'
-
-    def simpleMapReduceCommands(self,task,gp,mapCom,reduceCom,src,dst):
-        """A map-reduce task with one input."""
-        assert False, 'abstract method called'
-
-    def joinCommands(self,task,gp,mapComs,reduceCom,srcs,midpoint,dst):
-        """A map-reduce task with several inputs."""
-        assert False, 'abstract method called'
-
-    # utilities
-
-    def _stepSideviewFiles(self,step):
-        files = []
-        for sv in step.view.sideviewsNeeded():
-            files += [sv.distributableFile()]
-        return files
-
-    def _coreCommand(self,step,gp):
-        """Python command to call an individual plan step."""
-        return 'python %s --view=%s --do=%s' % (gp._gpigSourceFile,step.view.tag,step.whatToDo) + self.__coreCommandOptions(step,gp)
-
-    def _ithCoreCommand(self,step,gp,i):
-        """Like _coreCommand but allows an index parameter for the 'do' option."""
-        return 'python %s --view=%s --do=%s.%d' % (gp._gpigSourceFile,step.view.tag,step.whatToDo,i) + self.__coreCommandOptions(step,gp)
-
-    def __coreCommandOptions(self,step,gp):
-        paramOpts = '' if not gp.param else " --params " + ",".join(map(lambda(k,v):k+':'+v, gp.param.items()))
-        nonDefaults = []
-        for (k,v) in gp.opts.items():
-            #pass in non-default options, or options computed from the environment
-            if (gp.opts[k] != GPig.DEFAULT_OPTS[k]) or ((k in GPig.COMPUTED_OPTION_DEFAULTS) and (gp.opts[k] != GPig.COMPUTED_OPTION_DEFAULTS[k])):
-                nonDefaults += ["%s:%s" % (k,str(v))]
-        optsOpts = '' if not nonDefaults else " --opts " + ",".join(nonDefaults)
-        reuseOpts = '' if not step.reused else " --reuse "+ " ".join(step.reused)
-        return paramOpts + optsOpts + reuseOpts
-
-
-class ShellCompiler(MRCompiler):
-    """Compile tasks to commands that are executable by most Unix shells."""
-
-    def distributeCommands(self,task,gp,maybeRemoteCopy,localCopy):
-        """Distribute the remote copy to the local directory."""
-        return ['cp -f %s %s || echo warning: the copy failed!' % (maybeRemoteCopy,localCopy)]
-
-    def simpleMapCommands(self,task,gp,mapCom,src,dst):
-        """A map-only job with zero or one input."""
-        if src: return [mapCom + ' < %s > %s' % (src,dst)]
-        else: return [mapCom + (' > %s' % (dst))]
-
-    def simpleMapReduceCommands(self,task,gp,mapCom,reduceCom,src,dst):
-        """A map-reduce job with one input."""
-        return [mapCom + ' < ' + src + ' | sort -k1 | '+reduceCom + ' > ' + dst]
-
-    def joinCommands(self,task,gp,mapComs,reduceCom,srcs,midpoint,dst):
-        """A map-reduce job with several inputs."""
-        subplan = ['rm -f %s' % midpoint]
-        for i,ithMapCom in enumerate(mapComs):
-            subplan += [ithMapCom + ' < ' + srcs[i] + ' >> ' + midpoint]
-        subplan += [ 'sort -k1,2 < ' + midpoint + ' | ' + reduceCom + ' > ' + dst]
-        return subplan
-
-class HadoopCompiler(MRCompiler):
-    """Compile tasks to commands that are executable by most Unix shells
-    after hadoop has been installed."""
-
-    def distributeCommands(self,task,gp,maybeRemoteCopy,localCopy):
-        return ['rm -f %s' % localCopy, '%s fs -getmerge %s %s' % (GPig.HADOOP_LOC,maybeRemoteCopy,localCopy)]
-
-    def simpleMapCommands(self,task,gp,mapCom,src,dst):
-        assert src,'Wrap not supported for hadoop'
-        hcom = self.HadoopCommandBuf(gp,task)
-        hcom.extendDef('-D','mapred.reduce.tasks=0')
-        hcom.extend('-input',src,'-output',dst)
-        hcom.extend("-mapper '%s'" % mapCom)
-        return [ self._hadoopCleanCommand(gp,dst), hcom.asEcho(), hcom.asString() ]
-
-    def simpleMapReduceCommands(self,task,gp,mapCom,reduceCom,src,dst):
-        hcom = self.HadoopCommandBuf(gp,task)
-        hcom.extendDef('-D','mapred.reduce.tasks=%d' % gp.opts['parallel'])
-        hcom.extend('-input',src,'-output',dst)
-        hcom.extend("-mapper '%s'" % mapCom)
-        hcom.extend("-reducer '%s'" % reduceCom)
-        return [ self._hadoopCleanCommand(gp,dst), hcom.asEcho(), hcom.asString() ]
-
-    def joinCommands(self,task,gp,mapComs,reduceCom,srcs,midpoint,dst):
-        def midi(i): return midpoint + '-' + str(i)
-        subplan = []
-        for i in range(len(srcs)):
-            hcom = self.HadoopCommandBuf(gp,task)
-            hcom.extendDef('-D','mapred.reduce.tasks=%d' % gp.opts['parallel'])
-            hcom.extend('-input',srcs[i], '-output',midi(i))
-            hcom.extend("-mapper","'%s'" % mapComs[i])
-            subplan += [ self._hadoopCleanCommand(gp,midi(i)), hcom.asEcho(), hcom.asString() ]
-        hcombineCom = self.HadoopCommandBuf(gp,task)
-        hcombineCom.extendDef('-D','mapred.reduce.tasks=%d' % gp.opts['parallel'])
-        hcombineCom.extendDef('-jobconf','stream.num.map.output.key.fields=3')
-        hcombineCom.extendDef('-jobconf','num.key.fields.for.partition=1')
-        for i in range(len(srcs)):
-            hcombineCom.extend('-input',midi(i))
-        hcombineCom.extend('-output',dst)
-        hcombineCom.extend('-mapper','cat')
-        hcombineCom.extend('-reducer',"'%s'" % reduceCom)
-        hcombineCom.extend('-partitioner','org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner')
-        subplan += [ self._hadoopCleanCommand(gp,dst), hcombineCom.asEcho(), hcombineCom.asString() ]
-        return subplan
-
-    class HadoopCommandBuf(object):
-        """Utility to hold the various pieces of a hadoop command."""
-        def __init__(self,gp,task):
-            logging.debug('building hadoop command for '+str(task.mapStep.view.tag))
-            self.invocation = [GPig.HADOOP_LOC,'jar',gp.opts['streamJar']]
-            self.defs = []
-            self.args = []
-            self.files = []
-            for f in gp._shippedFiles:
-                self.files += ['-file',f]
-            for sv in task.mapStep.view.sideviewsNeeded():
-                self.files += ['-file',sv.distributableFile()]
-            if task.reduceStep:
-                for sv in task.reduceStep.view.sideviewsNeeded():
-                    self.files += ['-file',sv.distributableFile()]
-            logging.debug('files: '+str(self.files))
-        def extend(self,*toks):
-            self.args += list(toks)
-        def extendDef(self,*toks):
-            self.defs += list(toks)
-        def asEcho(self):
-            return " ".join(['echo','hadoop'] + self.args + ['...'])
-        def asString(self):
-            return " ".join(self.invocation+self.defs+self.files+self.args)
-
-    def _hadoopCleanCommand(self,gp,fileName):
-        """A command to remove an HDFS directory if it exists."""
-        return '(%s fs -test -e %s && %s fs -rmr %s) || echo no need to remove %s' % (GPig.HADOOP_LOC,fileName, GPig.HADOOP_LOC,fileName, fileName)
-
-#
-# replaceable object to save objects to disk and retrieve them
-#
-
-class RowSerializer(object):
-    """Saves row objects to disk and retrieves them."""
-    def __init__(self):
-        self.evaluator = GPig.SafeEvaluator()
-    def toString(self,x):
-        return repr(x)
-    def fromString(self,s):
-        return self.evaluator.eval(s)
-
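
Rows round-trip through repr() and an eval-style parse; a minimal sketch of what the serializer does to a row:

    s = RowSerializer()
    row = ('doc1', 'hello', 3)
    assert s.fromString(s.toString(row)) == row   # toString is repr; fromString eval's it back
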
-#
-# the planner
-#
-
-class Planner(object):
-    """Can create storage plans for views that are defined as parts of it."""
-
-    def __init__(self,**kw):
-
-        #parameters are used to programmatically give user-defined
-        #config information to a planner, or they can be specified on
-        #the command line
-        self.param = kw
-        for (key,val) in GPig.getArgvParams().items():
-            # don't override non-null values specified in the constructor
-            if self.param.get(key)==None:
-                self.param[key] = val
-
-        #opts are used for giving options to the planner from the shell
-        self.opts = GPig.getArgvOpts()
-        for (key,val) in GPig.DEFAULT_OPTS.items():
-            if (not key in self.opts): self.opts[key] = val
-        for (key,type) in GPig.DEFAULT_OPT_TYPES.items():
-            self.opts[key] = type(self.opts[key])
-
-        #use the serializer appropriate for the target
-        self._serializer = RowSerializer()
-
-        #views that aren't associated with a class variable, but are
-        #instead named automatically - ie, inner views with no
-        #user-provided names.
-        self._autoNamedViews = {}
-
-        #by default, use info-level logging at planning time
-        if not Planner.partOfPlan(sys.argv):
-            logging.basicConfig(level=logging.INFO)
-
-        #hadoop needs to know where to find the main script file,
-        #as well as the guineapig.py file it uses
-        self._gpigSourceFile = sys.argv[0]
-        self._shippedFiles = [GPig.MY_LOC,self._gpigSourceFile]
-
-    def setup(self):
-        """Initialize planner, and views used by the planner. This has to be
-        done after the planner is fully configured by adding views."""
-
-        self.reusableViews = {}
-        # make sure view directory is valid
-        if self.opts['target']=='shell' and not os.path.exists(self.opts['viewdir']):
-            logging.info('creating view directory ' + self.opts['viewdir'])
-            os.makedirs(self.opts['viewdir'])
-        elif self.opts['target']=='hadoop':
-            p = urlparse.urlparse(self.opts['viewdir'])
-            if not p.path.startswith("/"):
-                logging.warn('hadoop viewdir should be an absolute path: will try prefixing /user/$LOGNAME')
-                username = os.environ.get('LOGNAME','me')
-                self.opts['viewdir'] = '/user/'+username+'/'+self.opts['viewdir']
-                logging.warn('viewdir is set to '+self.opts['viewdir'])
-
-        # Add 'tag' and planner fields to each view
-        for vname in self.listViewNames():
-            v = self.getView(vname)
-            v.tag = vname
-            v.planner = self
-        def tagUnnamedViews(v,basename,index,depth):
-            assert v,'null inner view for '+basename
-            if not v.planner:
-                v.planner = self
-                autoname = '%s_%d_%s' % (basename,depth,index)
-                self._setView(autoname,v)
-                for i,inner in enumerate(v.inners + v.sideviews):
-                    tagUnnamedViews(inner,vname,i,depth+1)
-        for vname in self.listViewNames():
-            v = self.getView(vname)
-            for i,inner in enumerate(v.inners + v.sideviews):
-                tagUnnamedViews(inner,vname,i,1)
-
-        # Add caching options as needed
-        for vname in self.listViewNames():
-            v = self.getView(vname)
-            v.enforceStorageConstraints()
-
-    #
-    # utils
-    #
-
-    def getView(self,str,mustExist=False):
-        """Find the defined relation named str, and if necessary bind its
-        planner and tag appropriately."""
-        v = self.__class__.__dict__.get(str) or self.__dict__.get(str) or self._autoNamedViews.get(str)
-        if mustExist: assert v,'cannot find a view named '+str
-        return v
-
-    def _setView(self,str,view):
-        """Internal use only: allow the view to be retrieved by name later."""
-        view.tag = str
-        self._autoNamedViews[str] = view
-
-    def listViewNames(self):
-        def namedViews(d): return [vname for vname in d.keys() if isinstance(self.getView(vname),View)]
-        userNamedViews = namedViews(self.__class__.__dict__) + namedViews(self.__dict__)
-        return userNamedViews + self._autoNamedViews.keys()
-
-    #
-    # planning
-    #
-
-    def buildRecursiveStoragePlan(self,view):
-        """Called by view.storagePlan."""
-        #figure out what to reuse - starting with what the user specified
-        storedViews = dict(self.reusableViews)
-        #also mark for eager storage anything that's used twice in the
-        #plan---i.e., anything that is consumed by two or more views
-        numParents = collections.defaultdict(int)
-        for dv in self._descendants(view):
-            for inner in dv.inners + dv.sideviews:
-                numParents[inner] += 1
-        for (dv,n) in numParents.items():
-            if n>1 and dv.storeMe==None:
-                logging.info('making %s stored because it is used %d times in creating %s' % (dv,n,view.tag))
-                dv.storeMe = True
-
-        #traverse view in pre-order and find a linear sequence of
-        #views to store, each of which requires only views earlier in
-        #the sequence
-        storageSeq = self._storageSeq(view,storedViews) + [view.tag]
-        logging.info('storage sequence is: ' + ",".join(storageSeq))
-
-        #splice together plans for each view in the sequence,
-        #after first modifying the view so that nothing is called
-        #directly, but only through the ReuseView proxies
-        plan = Plan()
-        for tag in storageSeq:
-            v = self.getView(tag,mustExist=True)
-            vm = v.applyDict(storedViews,innerviewsOnly=True)
-            subplan = vm.nonrecursiveStoragePlan()
-            #add the correct context of reused views to the subplan,
-            #so that the actual definition of the view will be
-            #rewritten appropriately to include the new ReuseView
-            #proxy for it
-            viewsLocallyReused = self._reuseViewDescendants(vm)
-            for s in subplan.steps:
-                s.setReusedViews(viewsLocallyReused)
-            plan.includeStepsOf(subplan)
-        return plan
-
-    def _reuseViewDescendants(self,view):
-        """Descendant views that are ReuseView's."""
-        result = set()
-        for dv in self._descendants(view):
-            if isinstance(dv,ReuseView):
-                result.add(dv.reusedViewTag)
-        return result
-
-    def _descendants(self,view):
-        """Descendants of a view."""
-        result = set()
-        result.add(view)
-        for inner in view.inners + view.sideviews:
-            result = result.union(self._descendants(inner))
-        return result
-
-    def _storageSeq(self,view,storedViews):
-        """Linear sequence of storage actions to take - as view tags."""
-        seq = []
-        for inner in view.inners + view.sideviews:
-            if not inner.tag in storedViews:
-                seq += self._storageSeq(inner,storedViews)
-                if inner.storeMe:
-                    seq += [inner.tag]
-                    storedViews[inner.tag] = ReuseView(inner)
-        return seq
-
-    #
-    # dealing with the file storage system and related stuff
-    #
-
-    def ship(self,*fileNames):
-        """Declare a set of inputs to be 'shipped' to the hadoop cluster."""
-        self._shippedFiles += fileNames
-
-    def setSerializer(self,serializer):
-        """Replace the default serializer with another RowSerializer object."""
-        self._serializer = serializer
-        return self
-
-    def setEvaluator(self,rowEvaluator):
-        """Specify a function which will deserialize a string that was produced
-        by Python's 'repr' function."""
-        self._serializer.evaluator = rowEvaluator
-        return self
-
-    #
-    # rest of the API for the planner
-    #
-
-    @staticmethod
-    def partOfPlan(argv):
-        """True if the command line was generated as part of a storage plan."""
-        return any(s.startswith("--do") for s in argv)
-
-    def main(self,argv):
-        """Run a main that lets you --store a view, as well as doing a few other things."""
-        self.setup()
-        self.runMain(argv)
-
-    def runMain(self,argv):
-
-        # parse the options and dispatch appropriately
-        argspec = ["store=", "cat=", "reuse",
-                   "list", "pprint=", "steps=", "tasks=", "plan=",
-                   "params=", "opts=", "do=", "view="]
-        optlist,args = getopt.getopt(argv[1:], 'x', argspec)
-        optdict = dict(optlist)
-
-        # decide what views can be re-used, vs which need fresh plans
-        if '--reuse' in optdict:  #reuse the views listed in the arguments
-            for a in args:
-                vname = View.viewNameFor(a)
-                v = self.getView(vname)
-                if v:
-                    self.reusableViews[v.tag] = ReuseView(v)
-                    logging.info("re-using data stored for view "+vname+": "+str(v))
-                else:
-                    logging.warn("cannot re-use view "+vname+" since it's not used in this script")
-
-        #choose the main action to take
-        if '--store' in optdict:  #store a view
-            rel = self.getView(optdict['--store'],mustExist=True)
-            plan = rel.storagePlan()
-            plan.execute(self, echo=self.opts['echo'])
-            return
-        elif '--pprint' in optdict:  #print a view
-            rel = self.getView(optdict['--pprint'],mustExist=True)
-            rel.applyDict(self.reusableViews).pprint()
-            return
-        elif '--steps' in optdict:  #print steps to produce a view
-            rel = self.getView(optdict['--steps'],mustExist=True)
-            plan = rel.storagePlan()
-            for s in plan.steps:
-                print ' -',s
-            return
-        elif '--tasks' in optdict:  #print AbstractMapReduceTasks to produce a view
-            rel = self.getView(optdict['--tasks'],mustExist=True)
-            plan = rel.storagePlan()
-            plan.buildTasks()
-            for t in plan.tasks:
-                print t
-            return
-        elif '--plan' in optdict:  #print a storage plan
-            rel = self.getView(optdict['--plan'],mustExist=True)
-            plan = rel.storagePlan()
-            script = plan.compile(self)
-            print "\n".join(script)
-            return
-        elif '--cat' in optdict:  #store and then print a view
-            assert self.opts['target']=='shell','cannot do --cat except in shell mode'
-            rel = self.getView(optdict['--cat'],mustExist=True)
-            plan = rel.storagePlan()
-            plan.execute(self, self.opts['echo'])
-            for line in open(rel.storedFile(),'r'):
-                print line,
-            return
-        elif '--list' in optdict:  #list named views
-            for vname in self.listViewNames():
-                print ' ',vname,'\t',self.getView(vname)
-            return
-        elif '--do' in optdict:  #run an internally-generated action
-            #recover what should be stored when this action is performed
-            #work out what view to use and what routine to call
-            rel = self.getView(optdict['--view'],mustExist=True)
-            rel = rel.applyDict(self.reusableViews)
-            whatToDo = optdict['--do']
-            #work out the method given by 'do' and call it - note it
-            #may have a single integer argument, eg doJoinMap.1
-            k = whatToDo.find(".")
-            if k<0:
-                whatToDoMethod = getattr(rel,whatToDo)
-                whatToDoMethod()
-            else:
-                arg = int(whatToDo[k+1:])
-                whatToDo = whatToDo[:k]
-                whatToDoMethod = getattr(rel,whatToDo)
-                whatToDoMethod(arg)
-            return
-        else:
-            print 'usage: --[store|pprint|steps|tasks|plan|cat] view [--opts key:val,...] [--params key:val,...] [--reuse view1 view2 ...]'
-            print '       --[list]'
-            print 'current legal keys for "opts", with default values:'
-            for (key,val) in GPig.DEFAULT_OPTS.items():
-                print ' %s:%s' % (key,str(val))
-            print 'There\'s more help at http://curtis.ml.cmu.edu/w/courses/index.php/Guinea_Pig'
-
-if __name__ == "__main__":
-    print 'There\'s help at http://curtis.ml.cmu.edu/w/courses/index.php/Guinea_Pig'
diff --git a/tutorial/instance-wordcount.py b/tutorial/instance-wordcount.py
index 00564f7..62b6d64 100644
--- a/tutorial/instance-wordcount.py
+++ b/tutorial/instance-wordcount.py
@@ -1,4 +1,4 @@
-from guineapig1_3 import *
+from guineapig import *
 import sys
 
 def wordCountScript():
diff --git a/tutorial/longer-wordcount.py b/tutorial/longer-wordcount.py
index 95b0cfe..3afdb6b 100644
--- a/tutorial/longer-wordcount.py
+++ b/tutorial/longer-wordcount.py
@@ -1,5 +1,5 @@
 # always start like this
-from guineapig1_3 import *
+from guineapig import *
 import sys
 
 # supporting routines can go here
diff --git a/tutorial/multi-wordcount-hadoop.py b/tutorial/multi-wordcount-hadoop.py
index 303c6a0..e16dc68 100644
--- a/tutorial/multi-wordcount-hadoop.py
+++ b/tutorial/multi-wordcount-hadoop.py
@@ -1,4 +1,4 @@
-from guineapig1_3 import *
+from guineapig import *
 import sys
 import os
 import subprocess
diff --git a/tutorial/multi-wordcount.py b/tutorial/multi-wordcount.py
index 86cab07..29d5775 100644
--- a/tutorial/multi-wordcount.py
+++ b/tutorial/multi-wordcount.py
@@ -1,4 +1,4 @@
-from guineapig1_3 import *
+from guineapig import *
 import sys
 import os
 
diff --git a/tutorial/ntup-wordcount.py b/tutorial/ntup-wordcount.py
index fc334a4..72d5efc 100644
--- a/tutorial/ntup-wordcount.py
+++ b/tutorial/ntup-wordcount.py
@@ -1,4 +1,4 @@
-from guineapig1_3 import *
+from guineapig import *
 import sys
 import math
 import collections
diff --git a/tutorial/param-wordcount.py b/tutorial/param-wordcount.py
index ac0445a..333ca79 100644
--- a/tutorial/param-wordcount.py
+++ b/tutorial/param-wordcount.py
@@ -1,4 +1,4 @@
-from guineapig1_3 import *
+from guineapig import *
 import sys
 import os
 
diff --git a/tutorial/phirl-naive1_3.py b/tutorial/phirl-naive1_3.py
index 7d106e8..539e920 100644
--- a/tutorial/phirl-naive1_3.py
+++ b/tutorial/phirl-naive1_3.py
@@ -24,37 +24,41 @@ class Phirl(Planner):
         | ReplaceEach(by=lambda((rel,term),df):(rel,term,df))
 
     #find total number of docs per relation
-    ndoc = ReplaceEach(data, by=lambda(rel,docid,term):(rel,docid)) | Distinct() | Group(by=lambda(rel,docid):rel, reducingTo=ReduceToCount())
+    ndoc = ReplaceEach(data, by=lambda(rel,docid,term):(rel,docid)) \
+        | Distinct() | Group(by=lambda(rel,docid):rel, reducingTo=ReduceToCount())
 
     #unweighted document vectors
-
-    udocvec = Join( Jin(data,by=lambda(rel,docid,term):(rel,term)), Jin(docFreq,by=lambda(rel,term,df):(rel,term)) ) \
+    udocvec = Join( Jin(data,by=lambda(rel,docid,term):(rel,term)),
+                    Jin(docFreq,by=lambda(rel,term,df):(rel,term)) ) \
         | ReplaceEach(by=lambda((rel,doc,term),(rel_,term_,df)):(rel,doc,term,df)) \
         | JoinTo( Jin(ndoc,by=lambda(rel,relCount):rel), by=lambda(rel,doc,term,df):rel ) \
         | ReplaceEach(by=lambda((rel,doc,term,df),(rel_,relCount)):(rel,doc,term,df,relCount)) \
        | ReplaceEach(by=lambda(rel,doc,term,df,relCount):(rel,doc,term,termWeight(relCount,df)))
+
    #normalizers
    sumSquareWeights = ReduceTo(float, lambda accum,(rel,doc,term,weight): accum+weight*weight)
-
-    norm = Group( udocvec, by=lambda(rel,doc,term,weight):(rel,doc), reducingTo=sumSquareWeights) \
-        | ReplaceEach( by=lambda((rel,doc),z):(rel,doc,z))
+    norm = Group( udocvec,
+                  by=lambda(rel,doc,term,weight):(rel,doc),
+                  retaining = lambda (rel,doc,term,weight): weight,
+                  reducingTo=ReduceToSum() ) \
+        | ReplaceEach( by=lambda((rel,doc),z):(rel,doc,z))
 
     #normalized document vector
-    docvec = Join( Jin(norm,by=lambda(rel,doc,z):(rel,doc)), Jin(udocvec,by=lambda(rel,doc,term,weight):(rel,doc)) ) \
+    docvec = Join( Jin(norm,by=lambda(rel,doc,z):(rel,doc)),
+                   Jin(udocvec,by=lambda(rel,doc,term,weight):(rel,doc)) ) \
         | ReplaceEach( by=lambda((rel,doc,z),(rel_,doc_,term,weight)): (rel,doc,term,weight/math.sqrt(z)) )
 
-    # grab only the p component and reduce it
-    sumOfP = ReduceTo(float,lambda accum,(doc1,doc2,p): accum+p)
-
     # naive algorithm: use all pairs for finding matches
     rel1Docs = Filter(docvec, by=lambda(rel,doc,term,weight):rel=='icepark')
     rel2Docs = Filter(docvec, by=lambda(rel,doc,term,weight):rel=='npspark')
-    softjoin = Join( Jin(rel1Docs,by=lambda(rel,doc,term,weight):term), Jin(rel2Docs,by=lambda(rel,doc,term,weight):term)) \
-        | ReplaceEach(by=lambda((rel1,doc1,term,weight1),(rel2,doc2,term_,weight2)): (doc1,doc2,weight1*weight2)) \
-        | Group(by=lambda(doc1,doc2,p):(doc1,doc2), reducingTo=sumOfP) \
+    softjoin = Join( Jin(rel1Docs,by=lambda(rel,doc,term,weight):term),
+                     Jin(rel2Docs,by=lambda(rel,doc,term,weight):term)) \
+        | ReplaceEach(by=lambda((rel1,doc1,term,weight1),(rel2,doc2,term2,weight2)): (doc1,doc2,weight1*weight2)) \
+        | Group(by=lambda(doc1,doc2,p):(doc1,doc2), \
+                retaining=lambda(doc1,doc2,p):p, \
+                reducingTo=ReduceToSum()) \
         | ReplaceEach(by=lambda((doc1,doc2),sim):(doc1,doc2,sim))
-
     # get the top few similar pairs
     simpairs = Filter(softjoin, by=lambda(doc1,doc,sim):sim>0.75)
 
     # diagnostic output
diff --git a/tutorial/prefix-count.py b/tutorial/prefix-count.py
index cec9d7c..0bc8568 100644
--- a/tutorial/prefix-count.py
+++ b/tutorial/prefix-count.py
@@ -1,4 +1,4 @@
-from guineapig1_3 import *
+from guineapig import *
 import sys
 import logging
 
diff --git a/tutorial/tfidf.py b/tutorial/tfidf.py
new file mode 100644
index 0000000..b235fd1
--- /dev/null
+++ b/tutorial/tfidf.py
@@ -0,0 +1,40 @@
+from guineapig import *
+
+# compute TFIDF in Guinea Pig
+
+import sys
+import math
+
+class TFIDF(Planner):
+
+    idDoc = ReadLines("idcorpus.txt") | Map(by=lambda line:line.strip().split("\t"))
+    idWords = Map(idDoc, by=lambda (docid,doc): (docid,doc.lower().split()))
+    data = FlatMap(idWords, by=lambda (docid,words): map(lambda w:(docid,w),words))
+
+    #compute document frequency
+    docFreq = Distinct(data) \
+        | Group(by=lambda (docid,term):term, retaining=lambda(docid,term):docid, reducingTo=ReduceToCount())
+
+    docIds = Map(data, by=lambda (docid,term):docid) | Distinct()
+    ndoc = Group(docIds, by=lambda row:'ndoc', reducingTo=ReduceToCount())
+
+    #unweighted document vectors
+
+    udocvec1 = Join( Jin(data,by=lambda(docid,term):term), Jin(docFreq,by=lambda(term,df):term) )
+    udocvec2 = Map(udocvec1, by=lambda((docid,term1),(term2,df)):(docid,term1,df))
+    udocvec3 = Join( Jin(udocvec2,by=lambda row:'const'), Jin(ndoc,by=lambda row:'const'))
+    udocvec = Map(udocvec3, by=lambda((docid,term,df),(dummy,ndoc)):(docid,term,math.log(ndoc/float(df))))
+
+    sumSquareWeights = ReduceTo(float, lambda accum,(docid,term,weight): accum+weight*weight)
+
+    norm = Group( udocvec, by=lambda(docid,term,weight):docid,
+                  retaining=lambda(docid,term,weight):weight*weight,
+                  reducingTo=ReduceToSum() )
+
+    docvec = Join( Jin(norm,by=lambda(docid,z):docid), Jin(udocvec,by=lambda(docid,term,weight):docid) ) \
+        | Map( by=lambda((docid1,z),(docid2,term,weight)): (docid1,term,weight/math.sqrt(z)) )
+
+# always end like this
+if __name__ == "__main__":
+    p = TFIDF()
+    p.main(sys.argv)
diff --git a/tutorial/wordcmp.py b/tutorial/wordcmp.py
index 985994c..8e8d0c1 100644
--- a/tutorial/wordcmp.py
+++ b/tutorial/wordcmp.py
@@ -1,4 +1,4 @@
-from guineapig1_3 import *
+from guineapig import *
 import sys
 import math
 
diff --git a/tutorial/wordcount.py b/tutorial/wordcount.py
index 06db5a4..5299a3a 100644
--- a/tutorial/wordcount.py
+++ b/tutorial/wordcount.py
@@ -1,4 +1,4 @@
-from guineapig1_3 import *
+from guineapig import *
 import sys
 
 # supporting routines can go here
diff --git a/tutorial/wordprob.py b/tutorial/wordprob.py
index 213fbcf..0f747ce 100644
--- a/tutorial/wordprob.py
+++ b/tutorial/wordprob.py
@@ -1,4 +1,4 @@
-from guineapig1_3 import *
+from guineapig import *
 import sys
 import math
 import logging
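
The new tutorial/tfidf.py follows the standard planner entry point, so its views can be materialized and inspected from the shell; a usage sketch, assuming idcorpus.txt is present and the default 'shell' target:

    # python tfidf.py --store docvec     materialize the final view under viewdir
    # python tfidf.py --pprint docvec    print the view tree for the final view
    # python tfidf.py --cat ndoc         store, then print, the document count view
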