From bdd10d6782e228204bc941d64ca4dd917eb80f8d Mon Sep 17 00:00:00 2001
From: William Cohen
Date: Wed, 15 Jul 2015 10:48:39 -0400
Subject: [PATCH] catching up with changes

---
 gpextras.py                        |   77 ++
 guineapig.py                       |  296 ++++--
 spyk.py                            |  140 +++
 testgp.py                          |   27 +-
 tutorial/Makefile                  |   22 +-
 tutorial/README.txt                |    9 +-
 tutorial/guineapig.py              | 1384 ----------------------------
 tutorial/guineapig1_1.py           | 1244 ------------------------
 tutorial/guineapig1_2.py           | 1284 --------------------------
 tutorial/guineapig1_3.py           | 1384 ----------------------------
 tutorial/instance-wordcount.py     |    2 +-
 tutorial/longer-wordcount.py       |    2 +-
 tutorial/multi-wordcount-hadoop.py |    2 +-
 tutorial/multi-wordcount.py        |    2 +-
 tutorial/ntup-wordcount.py         |    2 +-
 tutorial/param-wordcount.py        |    2 +-
 tutorial/phirl-naive1_3.py         |   32 +-
 tutorial/prefix-count.py           |    2 +-
 tutorial/tfidf.py                  |   40 +
 tutorial/wordcmp.py                |    2 +-
 tutorial/wordcount.py              |    2 +-
 tutorial/wordprob.py               |    2 +-
 22 files changed, 531 insertions(+), 5428 deletions(-)
 create mode 100644 gpextras.py
 create mode 100644 spyk.py
 delete mode 100644 tutorial/guineapig.py
 delete mode 100644 tutorial/guineapig1_1.py
 delete mode 100644 tutorial/guineapig1_2.py
 delete mode 100644 tutorial/guineapig1_3.py
 create mode 100644 tutorial/tfidf.py

diff --git a/gpextras.py b/gpextras.py
new file mode 100644
index 0000000..afdfd3a
--- /dev/null
+++ b/gpextras.py
@@ -0,0 +1,77 @@
+##############################################################################
+# (C) Copyright 2014, 2015 William W. Cohen. All rights reserved.
+##############################################################################
+
+from guineapig import *
+
+class ReadCSV(Reader):
+    """ Returns the lines in a CSV file, converted to Python tuples."""
+
+    def __init__(self,src,**kw):
+        Reader.__init__(self,src)
+        self.kw = kw
+
+    def rowGenerator(self):
+        for tup in csv.reader(sys.stdin,**self.kw):
+            yield tup
+
+    def __str__(self):
+        return 'ReadCSV("%s",%s)' % (self.src,str(self.kw)) + self.showExtras()
+
+class ReadBlocks(Reader):
+    """ Returns blocks of non-empty lines, separated by empty lines"""
+
+    def __init__(self,src,isEndBlock=lambda line:line=="\n"):
+        Reader.__init__(self,src)
+        self.isEndBlock = isEndBlock
+
+    def rowGenerator(self):
+        buf = []
+        for line in sys.stdin:
+            if self.isEndBlock(line):
+                yield buf
+                buf = []
+            else:
+                buf.append(line)
+        if buf:
+            yield buf
+
+    def __str__(self):
+        return 'ReadBlocks("%s")' % self.src + self.showExtras()
+
+class Log(ReplaceEach):
+    """Print logging messages to stderr as data is processed.
+    For every row, the logfun will be called with arguments
+    logfun(rowValue,rowIndex).
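+
+    A minimal usage sketch (hypothetical names: 'p' is a Planner and
+    'input.txt' an input file, following the testgp.py conventions):
+
+        def logit(rowValue,rowIndex): sys.stderr.write('row %d\n' % rowIndex)
+        p.echoed = ReadLines('input.txt') | Log(logfun=logit)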
+    """
+
+    def __init__(self, inner=None, logfun=lambda rowV,rowI:None):
+        self.rowNum = 0
+        self.logfun = logfun
+        def logfunCaller(rowValue):
+            self.rowNum += 1
+            self.logfun(rowValue,self.rowNum)
+            return rowValue
+        ReplaceEach.__init__(self,inner,by=logfunCaller)
+
+    def __str__(self):
+        return 'Log(%s)' % View.asTag(self.inner) + self.showExtras()
+
+class LogEchoFirst(Log):
+
+    """Echo the first N things."""
+
+    def __init__(self, inner=None, first=10):
+        def logfirst(rowValue,rowIndex):
+            if rowIndex<=first:
+                print >> sys.stderr, 'row %d: "%s"' % (rowIndex,rowValue)
+        Log.__init__(self, inner=inner, logfun=logfirst)
+
+class LogProgress(Log):
+
+    """Echo a status message every 'interval' rows."""
+
+    def __init__(self, inner=None, msg="Logging progress", interval=1000):
+        def logprogress(rowValue,rowIndex):
+            if (rowIndex % interval)==0:
+                print >> sys.stderr, "%s: %d rows done" % (msg,rowIndex)
+        Log.__init__(self, inner=inner, logfun=logprogress)
diff --git a/guineapig.py b/guineapig.py
index 4e0b117..05a0fde 100644
--- a/guineapig.py
+++ b/guineapig.py
@@ -1,5 +1,5 @@
 ##############################################################################
-# (C) Copyright 2014 William W. Cohen. All rights reserved.
+# (C) Copyright 2014, 2015 William W. Cohen. All rights reserved.
 ##############################################################################
 
 import sys
@@ -10,6 +10,7 @@
 import os
 import os.path
 import urlparse
+import urllib
 import getopt
 import csv
 
@@ -20,11 +21,16 @@ class GPig(object):
     """Collection of utilities for Guinea Pig."""
 
-    HADOOP_LOC = 'hadoop'    #assume hadoop is on the path at planning time
-    MY_LOC = 'guineapig.py'
+    SORT_COMMAND = 'LC_COLLATE=C sort'  # use standard ascii ordering, not locale-specific one
+    HADOOP_LOC = 'hadoop'               # assume hadoop is on the path at planning time
+    MY_LOC = 'guineapig.py'             # the name of this file
+    VERSION = '1.3.2'
+    COPYRIGHT = '(c) William Cohen 2014,2015'
 
-    #global options for Guinea Pig can be passed in with the --opts
+    #Global options for Guinea Pig can be passed in with the --opts
    #command-line option, and these are the default values
+    #The location of the streaming jar is a special case,
+    #in that it's also settable via an environment variable.
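+    #For example (hypothetical path, adjust to your Hadoop install):
+    #  export GP_STREAMJAR=/usr/lib/hadoop-mapreduce/hadoop-streaming.jar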
    defaultJar = '/usr/lib/hadoop/contrib/streaming/hadoop-streaming-1.2.0.1.3.0.0-107.jar'
     envjar = os.environ.get('GP_STREAMJAR', defaultJar)
     DEFAULT_OPTS = {'streamJar': envjar,
                     'parallel':5,
                     'target':'shell',
                     'echo':0,
                     'viewdir':'gpig_views',
                     }
-    #there are the types of each option that has a non-string value
+    #These are the types of each option that has a non-string value
     DEFAULT_OPT_TYPES = {'parallel':int,'echo':int}
-    #we need to pass non-default options in to mappers and reducers,
-    #but since the remote worker's environment can be different, we
-    #also need to pass in options computed from the environment
+    #We need to pass non-default options in to mappers and reducers,
+    #but since the remote worker's environment can be different from
+    #the environment of this script, we also need to pass in options
+    #computed from the environment
     COMPUTED_OPTION_DEFAULTS = {'streamJar':defaultJar}
 
     @staticmethod
     def getCompiler(target):
+        """Return the compiler object used to convert AbstractMapReduceTasks
+        to executable commands."""
         if target=='shell': return ShellCompiler()
         elif target=='hadoop': return HadoopCompiler()
         else: assert 'illegal compilation target '+target
@@ -67,12 +76,15 @@ def getArgvDict(optname):
         for i,a in enumerate(sys.argv):
             if a==optname:
                 paramString = sys.argv[i+1]
-                return dict(pair.split(":") for pair in paramString.split(","))
+                result = dict(pair.split(":") for pair in paramString.split(","))
+                for key in result:
+                    result[key] = urllib.unquote(result[key])
+                return result
         return {}
 
     @staticmethod
     def rowsOf(view):
-        """Iterate over the rows in a view."""
+        """Iterator over the rows in a view."""
         for line in open(view.distributableFile()):
             yield view.planner._serializer.fromString(line.strip())
 
@@ -103,6 +115,9 @@ def __init__(self,view,by=(lambda x:x),outer=False):
         self.view = view
         self.joinBy = by
         self.outer = outer
+        #To implement the semantics for outer joins, if one Jin input
+        #for a join is outer, then the other inputs will have
+        #_padWithNulls set to True
         self._padWithNulls = False
 
     def __str__(self):
@@ -112,8 +127,11 @@ def __str__(self):
         return "Jin(%s,by=%s%s%s)" % (viewStr,self.joinBy,outerStr,padStr)
 
 class ReduceTo(object):
-    """An object x that can be the argument of a reducingTo=x
-    parameter in a Group view."""
+    """An object x that can be the argument of a reducingTo=x parameter in
+    a Group view.  Here 'baseType' is a function f such that f() returns the
+    initial value of the accumulator, and 'by' is a function that
+    maps one accumulator and a single new value to the next accumulator.
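+    For example, ReduceTo(int,by=lambda accum,val:accum+val) starts each
+    accumulator at int(), i.e. 0, and sums up the values in a group - which
+    is exactly what the ReduceToSum subclass defined below does.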
+ """ def __init__(self,baseType,by=lambda accum,val:accum+val): self.baseType = baseType self.reduceBy = by @@ -124,8 +142,8 @@ def __init__(self): ReduceTo.__init__(self,int,by=lambda accum,val:accum+1) class ReduceToSum(ReduceTo): - """Produce the sum of the objects - which must be numbers - that would - be placed in a group.""" + """Produce the sum of the objects - which must be legal arguments of + the '+' function - that would be placed in a group.""" def __init__(self): ReduceTo.__init__(self,int,by=lambda accum,val:accum+val) @@ -408,7 +426,7 @@ def enforceStorageConstraints(self): inner.storeMe = True def mapPlan(self): - log.error("abstract method not implemented") + logging.error("abstract method not implemented") def doStoreKeyedRows(self,subview,key,index): """Utility method used by concrete map-reduce classes to compute keys @@ -445,6 +463,9 @@ def rowGenerator(self): for line in sys.stdin: yield self.planner._serializer.fromString(line.strip()) + def explanation(self): + return [ 'reuse view %s stored in %s' % (self.reusedViewTag,self.src)] + def __str__(self): return 'ReuseView("%s")' % self.src + self.showExtras() @@ -462,21 +483,6 @@ def rowGenerator(self): def __str__(self): return 'ReadLines("%s")' % self.src + self.showExtras() -class ReadCSV(Reader): - """ Returns the lines in a CSV file, converted to Python tuples.""" - - def __init__(self,src,**kw): - Reader.__init__(self,src) - self.kw = kw - - def rowGenerator(self): - for tup in csv.reader(sys.stdin,**self.kw): - yield tup - - def __str__(self): - return 'ReadCVS("%s",%s)' % (self.src,str(self.kw)) + self.showExtras() - - class ReplaceEach(Transformation): """ In 'by=f'' f is a python function that takes a row and produces its replacement.""" @@ -490,11 +496,14 @@ def rowGenerator(self): yield self.replaceBy(row) def explanation(self): - return self.inner.explanation() + [ 'replaced to %s' % self.tag ] + return self.inner.explanation() + [ 'replace to %s' % self.tag ] def __str__(self): return 'ReplaceEach(%s, by=%s)' % (View.asTag(self.inner),str(self.replaceBy)) + self.showExtras() +class Map(ReplaceEach): + """ Alternate name for ReplaceEach""" + class Augment(Transformation): def __init__(self,inner=None,sideviews=None,sideview=None,loadedBy=lambda v:list(GPig.rowsOf(v))): @@ -566,6 +575,9 @@ def explanation(self): def __str__(self): return 'Flatten(%s, by=%s)' % (View.asTag(self.inner),str(self.flattenBy)) + self.showExtras() +class FlatMap(Flatten): + """ Alternate name for Flatten""" + class Filter(Transformation): """Filter out a subset of rows that match some predicate.""" @@ -762,6 +774,56 @@ def acceptInnerView(self,otherView): self.joinInputs[0].view = otherView self.inners[0] = otherView +class Union(MapReduce): + """Combine two or more relations, also removing duplicates.""" + + def __init__(self,*inners): + #sets self.inners + MapReduce.__init__(self,list(inners),None) + + def acceptInnerView(self,otherView): + assert False, 'Union cannot be RHS of a pipe - use UnionTo instead' + + def mapPlan(self): + plan = Plan() + innerCheckpoints = map(lambda v:v.checkpoint(), self.inners) + step = PrereduceStep(view=self, whatToDo='doUnionMap',srcs=innerCheckpoints,dst=self.checkpoint(),why=self.explanation()) + plan.append(step) + return plan + + def explanation(self): + innerEx = [] + for inner in self.inners: + if innerEx: innerEx += ['CONCAT TO'] + innerEx += inner.explanation() + return innerEx + + def __str__(self): + return "Union(%s)" % ",".join(map(str,self.inners)) + self.showExtras() + + def 
rowGenerator(self):
+        lastLine = None
+        for line in sys.stdin:
+            if line!=lastLine:
+                yield self.planner._serializer.fromString(line.strip())
+            lastLine = line
+
+    def doUnionMap(self,i):
+        # called with argument index, and stdin pointing to innerCheckpoints[index]
+        for row in self.inners[i].rowGenerator():
+            print self.planner._serializer.toString(row)
+
+
+class UnionTo(Union):
+    """Special case of Union which can be used as RHS of a pipe operator."""
+
+    def __init__(self,*moreInners):
+        allInners = [None]+list(moreInners)
+        Union.__init__(self,*allInners)
+
+    def acceptInnerView(self,otherView):
+        self.inners[0] = otherView
+
 ##############################################################################
 #
 # the top-level planner, and its supporting classes
 #
@@ -803,16 +865,12 @@ def compile(self,gp):
         script = []
         taskCompiler = GPig.getCompiler(gp.opts['target'])
         for task in self.tasks:
+            #print 'compiling',task
             script += taskCompiler.compile(task,gp)
         return script
 
-#
-# a single step in a plan produced by the planner
-#
-
 class Step(object):
-    """A single step of the plans produced by the planner, along with the
-    methods to convert the plans into executable shell commands."""
+    """A single 'step' of the plans produced by the planner."""
 
     def __init__(self,view):
         self.view = view
@@ -852,6 +910,7 @@ def __str__(self):
         return "TransformStep("+",".join(map(repr, [self.view.tag,self.whatToDo,self.srcs,self.dst,self.reused]))+")"
 
 class PrereduceStep(Step):
+    """A step that can be followed by a reduce step."""
     def __init__(self,view,whatToDo,srcs,dst,why):
         Step.__init__(self,view)
         self.whatToDo = whatToDo
@@ -862,11 +921,22 @@ def __init__(self,view,whatToDo,srcs,dst,why):
     def __str__(self):
         return "PrereduceStep("+",".join(map(repr, [self.view.tag,self.whatToDo,self.srcs,self.dst,self.reused]))+")"
 
-# combine steps into something executable via hadoop - or shell
-
 class AbstractMapReduceTask(object):
-    """A collection of steps that can be executed as a single map-reduce operation,
-    possibly with some file managements steps to set up the task."""
+    """A collection of steps that can be executed as a single map-reduce
+    operation, possibly with some file management steps to set up the
+    task. More specifically, this consists of
+
+    1a) a maybe-empty sequence of DistributeStep's
+    2a) a single TransformStep or PrereduceStep
+    or else
+
+    1b) a maybe-empty sequence of DistributeStep's
+    2b) a PrereduceStep
+    3b) a TransformStep
+
+    Sequence 1a-2a is a map-only task, and sequence 1b-3b is a
+    map-reduce task.
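+
+    When compiled for the shell (see MRCompiler and ShellCompiler below),
+    a map-only task becomes roughly 'mapCom < src > dst', and a map-reduce
+    task roughly 'mapCom < src | LC_COLLATE=C sort -k1 | reduceCom > dst'.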
+    """
+
     def __init__(self):
         self.distributeSteps = []
@@ -880,16 +950,39 @@ def insert(self,step):
             self.distributeSteps.append(step)
             return True
         elif self.mapStep==None and (isinstance(step,TransformStep) or isinstance(step,PrereduceStep)):
-            #we can only have one map step, so fill up an empty slot if possible
+            #we can only have one map step, so fill up an empty mapstep slot if possible
             self.mapStep = step
             return True
         elif self.mapStep and isinstance(self.mapStep,PrereduceStep) and isinstance(step,TransformStep) and not self.reduceStep:
-            #if the mapstep is a prereduce, then we can also allow a reduce step
+            #if the mapstep is a prereduce, then we can also allow any TransformStep to be used as a reduceStep
             self.reduceStep = step
             return True
         else:
             return False
 
+    def explanation(self):
+        """Concatenate together the explanations for the different steps of
+        this task."""
+        buf = []
+        for step in self.distributeSteps:
+            buf += step.why
+        #reduce explanation copies the map explanation so we don't need both
+        if self.reduceStep:
+            buf += self.reduceStep.why
+        else:
+            buf += self.mapStep.why
+        return buf
+
+    def inputsAndOutputs(self):
+        """Return a string summarizing the source files used as inputs, and
+        the view ultimately created by this task."""
+        buf = ' + '.join(self.mapStep.srcs)
+        if self.reduceStep:
+            buf += ' => ' + self.reduceStep.view.tag
+        else:
+            buf += ' => ' + self.mapStep.view.tag
+        return buf
+
     def __str__(self):
         buf = "mapreduce task:"
         for step in self.distributeSteps:
@@ -900,6 +993,7 @@ def __str__(self):
         return buf
 
 class MRCompiler(object):
+    """Abstract compiler class to convert a task to a list of commands that can be executed by the shell."""
 
     def compile(self,task,gp):
@@ -909,25 +1003,25 @@ def compile(self,task,gp):
             script += ['echo create '+task.mapStep.view.tag + ' via map: ' + task.mapStep.explain()]
         else:
             script += ['echo create '+task.reduceStep.view.tag +' via map/reduce: '+task.reduceStep.explain()]
-        for step in task.distributeSteps:
-            localCopy = step.view.distributableFile()
-            maybeRemoteCopy = step.view.storedFile()
-            echoCom = 'echo distribute %s: making a local copy of %s in %s' % (step.view.tag,maybeRemoteCopy,localCopy)
-            script += [echoCom] + self.distributeCommands(task, gp, maybeRemoteCopy,localCopy)
-        if not task.reduceStep and len(task.mapStep.srcs)==1:
+        if not task.reduceStep and len(task.mapStep.srcs)==1:   #a map-only step
             mapCom = self._coreCommand(task.mapStep,gp)
             script += self.simpleMapCommands(task, gp, mapCom, task.mapStep.srcs[0], task.mapStep.dst)
-        elif task.reduceStep and len(task.mapStep.srcs)==1:
+        elif task.reduceStep and len(task.mapStep.srcs)==1:     #a map-reduce step
             mapCom = self._coreCommand(task.mapStep,gp)
             reduceCom = self._coreCommand(task.reduceStep,gp)
             script += self.simpleMapReduceCommands(task, gp, mapCom, reduceCom, task.mapStep.srcs[0], task.reduceStep.dst)
-        elif task.reduceStep and len(task.mapStep.srcs)>1:
+        elif task.reduceStep and len(task.mapStep.srcs)>1:      #multiple mappers and one reduce
             mapComs = [self._ithCoreCommand(task.mapStep,gp,i) for i in range(len(task.mapStep.srcs))]
             reduceCom = self._coreCommand(task.reduceStep,gp)
             midpoint = gp.opts['viewdir']+'/'+task.mapStep.view.tag+'.gpmo'
             script += self.joinCommands(task, gp, mapComs, reduceCom, task.mapStep.srcs, midpoint, task.reduceStep.dst)
         else:
             assert False,'cannot compile task '+str(task)
+        for step in task.distributeSteps:  #distribute the results, if necessary
+            localCopy = step.view.distributableFile()
+            maybeRemoteCopy = step.view.storedFile()
+            echoCom = 
'echo distribute %s: making a local copy of %s in %s' % (step.view.tag,maybeRemoteCopy,localCopy)
+            script += [echoCom] + self.distributeCommands(task, gp, maybeRemoteCopy,localCopy)
         return script
 
     # abstract routines
 
@@ -969,7 +1063,7 @@ def __coreCommandOptions(self,step,gp):
         for (k,v) in gp.opts.items():
             #pass in non-default options, or options computed from the environment
             if (gp.opts[k] != GPig.DEFAULT_OPTS[k]) or ((k in GPig.COMPUTED_OPTION_DEFAULTS) and (gp.opts[k] != GPig.COMPUTED_OPTION_DEFAULTS[k])):
-                nonDefaults += ["%s:%s" % (k,str(v))]
+                nonDefaults += ["%s:%s" % (k,urllib.quote(str(v)))]
         optsOpts = '' if not nonDefaults else " --opts " + ",".join(nonDefaults)
         reuseOpts = '' if not step.reused else " --reuse "+ " ".join(step.reused)
         return paramOpts + optsOpts + reuseOpts
@@ -989,14 +1083,14 @@ def simpleMapCommands(self,task,gp,mapCom,src,dst):
 
     def simpleMapReduceCommands(self,task,gp,mapCom,reduceCom,src,dst):
         """A map-reduce job with one input."""
-        return [mapCom + ' < ' + src + ' | sort -k1 | '+reduceCom + ' > ' + dst]
+        return [mapCom + ' < ' + src + (' | %s -k1 | ' % GPig.SORT_COMMAND) +reduceCom + ' > ' + dst]
 
     def joinCommands(self,task,gp,mapComs,reduceCom,srcs,midpoint,dst):
         """A map-reduce job with several inputs."""
         subplan = ['rm -f %s' % midpoint]
         for i,ithMapCom in enumerate(mapComs):
             subplan += [ithMapCom + ' < ' + srcs[i] + ' >> ' + midpoint]
-        subplan += [ 'sort -k1,2 < ' + midpoint + ' | ' + reduceCom + ' > ' + dst]
+        subplan += [ ('%s -k1,2 < '% GPig.SORT_COMMAND) + midpoint + ' | ' + reduceCom + ' > ' + dst]
         return subplan
 
 class HadoopCompiler(MRCompiler):
@@ -1078,7 +1172,10 @@ def _hadoopCleanCommand(self,gp,fileName):
 #
 
 class RowSerializer(object):
-    """Saves row objects to disk and retrieves them."""
+    """Saves row objects to disk and retrieves them. A RowSerializer is
+    used internally in a Planner, and by default the one used by a
+    Planner will be an instance of RowSerializer(). A user can
+    override this with planner.setSerializer(). """
     def __init__(self):
         self.evaluator = GPig.SafeEvaluator()
     def toString(self,x):
@@ -1095,38 +1192,47 @@ class Planner(object):
 
     def __init__(self,**kw):
 
-        #parameters are used for programmatically give user-defined
+        #Parameters are used for programmatically giving user-defined
         #config information to a planner, or they can be specified in
-        #the command-line
+        #the command-line. These are usually accessed in user-defined
+        #views.
+
         self.param = kw
         for (key,val) in GPig.getArgvParams().items():
             # don't override non-null values specified in the constructor
             if self.param.get(key)==None:
                 self.param[key] = val
 
-        #opts are used for giving options to the planner from the shell
+        #opts are used for giving options to the planner from the
+        #shell, and are used in code in this file.
+
         self.opts = GPig.getArgvOpts()
         for (key,val) in GPig.DEFAULT_OPTS.items():
             if (not key in self.opts): self.opts[key] = val
         for (key,type) in GPig.DEFAULT_OPT_TYPES.items():
             self.opts[key] = type(self.opts[key])
-        #use appropriate for the target
+
+        #Provide a default serializer
+
         self._serializer = RowSerializer()
 
-        #views that aren't associated with class variable, but are
-        #instead named automatically - ie, inner views with no
+        #These are views that aren't associated with a class variable,
+        #but are instead named automatically - ie, inner views with no
         #user-provided names.
+
         self._autoNamedViews = {}
 
-        #by default, use info-level logging at planning time
+        #By default, use info-level logging at planning time only, not
+        #at view execution time.
if not Planner.partOfPlan(sys.argv): logging.basicConfig(level=logging.INFO) + logging.info('GuineaPig v%s %s' % (GPig.VERSION,GPig.COPYRIGHT)) - #hadoop needs to know where to give the main script file, - #as well as the guineapig.py file it uses + #Hadoop needs to know where to give the main script file, as + #well as the guineapig.py file used here + self._shippedFiles = [] self._gpigSourceFile = sys.argv[0] - self._shippedFiles = [GPig.MY_LOC,self._gpigSourceFile] + self.ship(GPig.MY_LOC) + self.ship(self._gpigSourceFile) def setup(self): """Initialize planner, and views used by the planner. This has to be @@ -1264,9 +1370,16 @@ def _storageSeq(self,view,storedViews): # dealing with the file storage system and related stuff # - def ship(self,*fileNames): + def ship(self,fileName): """Declare a set of inputs to be 'shipped' to the hadoop cluster.""" - self._shippedFiles += fileNames + for d in sys.path: + location = os.path.join(d,fileName) + if os.path.isfile(location): + logging.info('located %s at %s' % (fileName,location)) + self._shippedFiles.append(location) + return + logging.error("didn't locate %s on sys.path: path is %r" % (fileName,sys.path)) + logging.warn("note that the working directory . should always be on your PYTHONPATH") def setSerializer(self,serializer): """Replace the default serializer another RowSerializer object.""" @@ -1294,12 +1407,17 @@ def main(self,argv): self.runMain(argv) def runMain(self,argv): + """Called by main().""" # parse the options and dispatch appropriately - argspec = ["store=", "cat=", "reuse", + argspec = ["store=", "cat=", "reuse", "help", "list", "pprint=", "steps=", "tasks=", "plan=", "params=", "opts=", "do=", "view="] - optlist,args = getopt.getopt(argv[1:], 'x', argspec) + try: + optlist,args = getopt.getopt(argv[1:], 'x', argspec) + except getopt.GetoptError: + logging.fatal('bad option: use "--help" to get help') + sys.exit(-1) optdict = dict(optlist) # decide what views can be re-used, vs which need fresh plans @@ -1329,12 +1447,21 @@ def runMain(self,argv): for s in plan.steps: print ' -',s return - elif '--tasks' in optdict: #print AbstractMapReduceTasks to produce a view + elif '--tasks' in optdict: #print AbstractMapReduceTasks rel = self.getView(optdict['--tasks'],mustExist=True) plan = rel.storagePlan() plan.buildTasks() - for t in plan.tasks: - print t + for k,task in enumerate(plan.tasks): + print '=' * 70 + taskType = 'map-reduce' if task.reduceStep else 'map-only' + print '%s task %d: %s' % (taskType,(k+1),task.inputsAndOutputs()) + print ' - +' + '-' * 20, 'explanation', '-' * 20 + for w in task.explanation(): + print ' - | ',w + print ' - +' + '-' * 20, 'commands', '-' * 20 + for c in GPig.getCompiler(self.opts['target']).compile(task,self): + if not c.startswith("echo"): + print ' - | ',c return elif '--plan' in optdict: #print a storage plan rel = self.getView(optdict['--plan'],mustExist=True) @@ -1373,12 +1500,31 @@ def runMain(self,argv): whatToDoMethod(arg) return else: - print 'usage: --[store|pprint|steps|plan|cat] view [--opts key:val,...] [--params key:val,...] 
--reuse view1 view2 ...]'
-            print ' --[list]'
-            print 'current legal keys for "opts", with default values:'
+            usageHint = {'pprint':'print the data structure associated with the VIEW',
+                         'tasks':'print the abstract map-reduce tasks needed to materialize the VIEW',
+                         'plan':'print the commands that invoke each abstract map-reduce task',
+                         'store':'materialize the named VIEW and store it in the view directory',
+                         'cat': 'store the VIEW and then print each line to stdout'}
+            print 'Guinea Pig',GPig.VERSION,GPig.COPYRIGHT
+            print 'usage: python %s --(store|pprint|tasks|plan|cat) VIEW [OPTIONS] [PARAMS] --reuse VIEW1 VIEW2 ...' % sys.argv[0]
+            print '       python %s --list' % sys.argv[0]
+            print ''
+            print 'Subcommands that take a VIEW as argument:'
+            for a in usageHint:
+                print ' --%s VIEW: %s'% (a,usageHint[a])
+            print 'The --list subcommand lists possible VIEWs defined by this program.'
+            print ''
+            print 'OPTIONS are specified as "--opts key:value,...", where legal keys for "opts", with default values, are:'
             for (key,val) in GPig.DEFAULT_OPTS.items():
                 print '  %s:%s' % (key,str(val))
+            print 'Values in the "opts" key/value pairs are assumed to be URL-escaped.'
+            print ''
+            print 'PARAMS are specified as "--params key:value,..." and the associated dictionary is accessible to'
+            print 'user programs via the function GPig.getArgvParams().'
+            print ''
             print 'There\'s more help at http://curtis.ml.cmu.edu/w/courses/index.php/Guinea_Pig'
 
 if __name__ == "__main__":
+    print 'Guinea Pig',GPig.VERSION,GPig.COPYRIGHT
     print 'There\'s help at http://curtis.ml.cmu.edu/w/courses/index.php/Guinea_Pig'
+
diff --git a/spyk.py b/spyk.py
new file mode 100644
index 0000000..6976dab
--- /dev/null
+++ b/spyk.py
@@ -0,0 +1,140 @@
+##############################################################################
+# (C) Copyright 2014, 2015 William W. Cohen. All rights reserved.
+##############################################################################
+
+import guineapig
+import sys
+import random
+
+class SpykContext(object):
+
+    def __init__(self,**kw):
+        self.planner = guineapig.Planner(**kw)
+        self.tagCodeIndex = 0
+
+    #TODO setSerializer, setEvaluator, ship
+
+    #returns a SpykRDD
+    def textFile(self,fileName):
+        rdd = SpykRDD('textFile', self, guineapig.ReadLines(fileName))
+        return rdd
+
+    def wholeTextFiles(self,dirName):
+        #TODO find this in royals, and make it a gpextra
+        pass
+
+    def finalize(self):
+        """Declare the context and all RDD definitions complete.  This must
+        be called in the __name__=="__main__" part of the code, because when
+        the script is re-invoked to run a substep of a plan, this call is
+        what executes that substep."""
+        self.planner.setup()
+        if guineapig.Planner.partOfPlan(sys.argv):
+            self.planner.main(sys.argv)
+
+    def usermain(self):
+        """Use this in an if statement before any Spyk actions."""
+        return not guineapig.Planner.partOfPlan(sys.argv)
+
+class SpykRDD(object):
+
+    def __init__(self,tag,context,view):
+        self.view = view
+        self.context = context
+        self.view.planner = context.planner
+        self.context.tagCodeIndex += 1
+        self.context.planner._setView("%s__%d" % (tag,self.context.tagCodeIndex), view)
+
+    #TODO this doesn't work, need to use a different mechanism,
+    #maybe with a wrapper around plan/execute
+    def cache(self):
+        self.view = self.view.opts(stored=True)
+        return self
+
+    #transformations, which return new SpykRDD's
+
+    #TODO
+    #union
+    #intersection - gpextra?
+    # ... 
and for keyed views only
+    #cogroup
+
+    def map(self,mapfun):
+        return SpykRDD('map',self.context, guineapig.ReplaceEach(self.view,by=mapfun))
+
+    def flatMap(self,mapfun):
+        return SpykRDD('flatMap',self.context, guineapig.Flatten(self.view,by=mapfun))
+
+    def groupByKey(self):
+        return SpykRDD('groupByKey',
+                       self.context,
+                       guineapig.Group(self.view,
+                                       by=lambda (key,val):key,
+                                       retaining=lambda (key,val):val))
+
+    def reduceByKey(self,initValue,reduceOp):
+        return SpykRDD('reduceByKey',
+                       self.context,
+                       guineapig.Group(self.view,
+                                       by=lambda (key,val):key,
+                                       retaining=lambda (key,val):val,
+                                       reducingTo=guineapig.ReduceTo(initValue,reduceOp)))
+    def filter(self,filterfun):
+        return SpykRDD('filter',self.context, guineapig.Filter(self.view,by=filterfun))
+
+    def sample(self,withReplacement,fraction):
+        assert not withReplacement, 'sampling with replacement is not implemented'
+        return SpykRDD('sample',self.context, guineapig.Filter(self.view,by=lambda x:1 if random.random()<fraction else 0))
diff --git a/testgp.py b/testgp.py
         if len(s)>30: s = s[0:30]+'...'
-        return 'Wrap(%s)' % s + self.showExtras()
+        return 'Wrap("%s")' % s + self.showExtras()
 
 someInts = list(range(10))
 somePairs = [(i/3, i) for i in range(15)]
@@ -78,6 +77,8 @@ def aPlanner():
     p.augA = Augment(p.yA, sideview=p.yD, loadedBy=lambda v:GPig.onlyRowOf(v))
     p.hiLo = p.augA | ReplaceEach(by=lambda(a,d):1 if a>d else -1)
 
+    p.uab = Union(p.yA,p.yD)
+
     p.setup()
     return p
@@ -129,8 +130,8 @@ def testPlanning(self):
         print 'TEST: Planner'
         v = self.p.getView('xMidA')
         #check inferred storage
-        self.assertTrue( self.p.getView('xA').storeMe )
         plan = v.storagePlan()
+        self.assertTrue( self.p.getView('xA').storeMe )
         print 'midA plan:\n',"\n".join(map(str,plan.steps))
         #self.assertTrue(len(plan.steps)==5)
         self.checkEquiv(self.p, 'xMidA', [3,4,6,7])
@@ -139,6 +140,10 @@ def testAugment(self):
         print 'TEST: Augment'
         self.checkExact(self.p, 'hiLo', [-1]*5 + [+1]*4)
 
+    def testUnion(self):
+        print 'TEST: Union'
+        self.checkExact(self.p, 'uab', list(range(10)))
+
     def checkEquiv(self,p,viewName,expected):
         v = p.getView(viewName)
         v.storagePlan().execute(p)
@@ -155,8 +160,8 @@ def checkExact(self,p,viewName,expected):
         v = p.getView(viewName)
         v.storagePlan().execute(p)
         actual = list(self.rowsOf(v))
-        print 'expected:',expected
-        print 'actual:  ',actual
+        print 'exact expected:',expected,'len',len(expected)
+        print 'exact actual:  ',actual,'len',len(actual)
         self.assertTrue(len(actual)==len(expected))
         for i in range(len(actual)):
             self.assertTrue(actual[i]==expected[i])
diff --git a/tutorial/Makefile b/tutorial/Makefile
index fb2b18c..27270bf 100644
--- a/tutorial/Makefile
+++ b/tutorial/Makefile
@@ -1,25 +1,20 @@
 update:
-	cp ../guineapig.py .
-	cp ../guineapig1_1.py .
-	cp ../guineapig1_2.py .
-	cp ../guineapig1_3.py .
-wc:
-	perl -ne 'print if /\S/ && !/^\#/' guineapig.py | wc
+	echo updates no longer needed
 
 clean:
 	rm -rf gpig_views
 	rm -f total.gp
 	rm *.pyc
 
-tar: update
+tar:
+	cp ../guineapig.py . 
echo created on `date` > marker.txt - tar -cvzf tutorial.tgz README.txt marker.txt guineapig.py *corpus.txt id-parks.txt *.py phirl-naive.pig + tar -cvzf tutorial.tgz marker.txt guineapig.py *corpus.txt id-parks.txt *.py phirl-naive.pig upload: tar scp tutorial.tgz raff.ml.cmu.edu:~/afs-home/www/10-605/gpigtut.tgz -all-runs: run-wordcount run-ntup-wordcount run-prefcount run-wordcmp run-wordprob run-multi run-phirl run-reuse check-phirl \ - run-phirl1_3 check-phirl1_3 +all-runs: run-wordcount run-ntup-wordcount run-prefcount run-wordcmp run-wordprob run-multi run-phirl run-reuse check-phirl echo all tests run, did you see any problems\? run-wordcount: @@ -69,10 +64,6 @@ run-phirl: sort -gr gpig_views/look.gp | head sort -g gpig_views/look.gp | head -run-phirl1_3: - python phirl-naive1_3.py --store look - sort -gr gpig_views/look.gp | head - sort -g gpig_views/look.gp | head run-reuse: rm -f gpig_views/*.gp* @@ -83,7 +74,4 @@ check-phirl: run-phirl sort -gr gpig_views/look.gp > phirl-actual-output.txt diff phirl-actual-output.txt phirl-expected-output.txt -check-phirl1_3: run-phirl1_3 - sort -gr gpig_views/look.gp > phirl-actual-output.txt - diff phirl-actual-output.txt phirl-expected-output.txt diff --git a/tutorial/README.txt b/tutorial/README.txt index 47fd1b2..95e94f3 100644 --- a/tutorial/README.txt +++ b/tutorial/README.txt @@ -1,10 +1,9 @@ -Materials for tutorial on Guinea Pig. For more information, see: +For more information, see: http://curtis.ml.cmu.edu/w/courses/index.php/Guinea_Pig -In addition to the tutorial materials, there's also one larger example -here, a soft-join program: phirl-naive.py. +There's also one larger example here, a soft-join program: phirl-naive.py. -The date this version was last modified is stored in the file marker.txt. +The date this version was last modified is stored in the file marker.txt Recent changes: @@ -19,4 +18,4 @@ Recent changes: 10/9: added SafeEvaluator to 1.3. - 10/15: moved source control to git. Extended the documents. + 11/11: added LC_COLLATE=C to sort command diff --git a/tutorial/guineapig.py b/tutorial/guineapig.py deleted file mode 100644 index 4e0b117..0000000 --- a/tutorial/guineapig.py +++ /dev/null @@ -1,1384 +0,0 @@ -############################################################################## -# (C) Copyright 2014 William W. Cohen. All rights reserved. 
-############################################################################## - -import sys -import logging -import copy -import subprocess -import collections -import os -import os.path -import urlparse -import getopt -import csv - -############################################################################### -# helpers functions and data structures -############################################################################### - -class GPig(object): - """Collection of utilities for Guinea Pig.""" - - HADOOP_LOC = 'hadoop' #assume hadoop is on the path at planning time - MY_LOC = 'guineapig.py' - - #global options for Guinea Pig can be passed in with the --opts - #command-line option, and these are the default values - defaultJar = '/usr/lib/hadoop/contrib/streaming/hadoop-streaming-1.2.0.1.3.0.0-107.jar' - envjar = os.environ.get('GP_STREAMJAR', defaultJar) - DEFAULT_OPTS = {'streamJar': envjar, - 'parallel':5, - 'target':'shell', - 'echo':0, - 'viewdir':'gpig_views', - } - #there are the types of each option that has a non-string value - DEFAULT_OPT_TYPES = {'parallel':int,'echo':int} - #we need to pass non-default options in to mappers and reducers, - #but since the remote worker's environment can be different, we - #also need to pass in options computed from the environment - COMPUTED_OPTION_DEFAULTS = {'streamJar':defaultJar} - - @staticmethod - def getCompiler(target): - if target=='shell': return ShellCompiler() - elif target=='hadoop': return HadoopCompiler() - else: assert 'illegal compilation target '+target - - @staticmethod - def getArgvParams(): - """Return a dictionary holding the argument of the --params option in - sys.argv.""" - return GPig.getArgvDict('--params') - - @staticmethod - def getArgvOpts(): - """Return a dictionary holding the argument of the --opts option in - sys.argv.""" - return GPig.getArgvDict('--opts') - - @staticmethod - def getArgvDict(optname): - """Return a dictionary of parameter values that were defined on the command line - view an option like '--params filename:foo.txt,basedir:/tmp/glob/'. 
- """ - assert optname.startswith('--') - for i,a in enumerate(sys.argv): - if a==optname: - paramString = sys.argv[i+1] - return dict(pair.split(":") for pair in paramString.split(",")) - return {} - - @staticmethod - def rowsOf(view): - """Iterate over the rows in a view.""" - for line in open(view.distributableFile()): - yield view.planner._serializer.fromString(line.strip()) - - @staticmethod - def onlyRowOf(view): - """Return the first row in a side view, and raise an error if it - is not the only row of the view.""" - result = None - logging.info('loading '+view.distributableFile()) - for line in open(view.distributableFile()): - assert not result,'multiple rows in stored file for '+view.tag - result = view.planner._serializer.fromString(line.strip()) - return result - - @staticmethod - class SafeEvaluator(object): - """Evaluates expressions that correzpond to serialized guinea pig rows.""" - def __init__(self,restrictedBindings={}): - self.restrictedBindings = restrictedBindings - def eval(self,s): - code = compile(s,'','eval') - return eval(code,self.restrictedBindings) - -class Jin(object): - """"Object to hold description of a single join input.""" - - def __init__(self,view,by=(lambda x:x),outer=False): - self.view = view - self.joinBy = by - self.outer = outer - self._padWithNulls = False - - def __str__(self): - viewStr = View.asTag(self.view) if self.view else '_' - outerStr = ',outer=True' if self.outer else '' - padStr = ',_padWithNulls=True' if self._padWithNulls else '' - return "Jin(%s,by=%s%s%s)" % (viewStr,self.joinBy,outerStr,padStr) - -class ReduceTo(object): - """An object x that can be the argument of a reducingTo=x - parameter in a Group view.""" - def __init__(self,baseType,by=lambda accum,val:accum+val): - self.baseType = baseType - self.reduceBy = by - -class ReduceToCount(ReduceTo): - """Produce the count of the number of objects that would be placed in a group.""" - def __init__(self): - ReduceTo.__init__(self,int,by=lambda accum,val:accum+1) - -class ReduceToSum(ReduceTo): - """Produce the sum of the objects - which must be numbers - that would - be placed in a group.""" - def __init__(self): - ReduceTo.__init__(self,int,by=lambda accum,val:accum+val) - -class ReduceToList(ReduceTo): - """Produce a list of the objects that would be placed in a group.""" - def __init__(self): - ReduceTo.__init__(self,list,by=lambda accum,val:accum+[val]) - -############################################################################### -# abstract views -############################################################################## - -class View(object): - """A definition of a relation for Guinea Pig. A View object can be - produce a storagePlan(), which can then be executed to produce the - contents of the relation. Intutitively, a relation is and - unordered bag of rows, and a row an almost-arbitrary python data - structure. (It must be something that can be stored and retrieved - by the RowSerializer.) - - Steps in the storagePlan are executed by delegation, thru the - planner, to methods of a View class named doFoo. 
- """ - - def __init__(self): - """The planner and tag must be set before this is used.""" - self.planner = None #pointer to planner object - self.tag = None #for naming storedFile and checkpoints - self.storeMe = None #try and store this view if true - self.retainedPart = None #used in map-reduce views only - self.sideviews = [] #non-empty for Augment views only - self.inners = [] #always used - - #self.inner is shortcut for inners[0] - def _getInner(self): return self.inners[0] - def _setInner(self,val): self.inners = [val] - inner = property(_getInner,_setInner) - - # - # ways to modify a view - # - - def opts(self,stored=None): - """Return the same view with options set appropriately. Possible - options include: - - - stored=True - Explicitly store this view on disk whenever - it is used in another view's definition. This might be set - by the user for debugging purposes, or by the planner, - to prevent incorrect optimizations. Generally "inner" - views are not explicitly stored. - - - stored='distributedCache' - Store this view in the working - directory and/or the Hadoop distributed cache. - """ - - self.storeMe = stored - return self - - def showExtras(self): - """Printable representation of the options for a view.""" - result = '' - flagPairs = [] - if self.storeMe: flagPairs += ['stored=%s' % repr(self.storeMe)] - if flagPairs: result += '.opts(' + ",".join(flagPairs) + ')' - return result - - # - # how the view is saved on disk - # - - def storedFile(self): - """The file that will hold the materialized relation.""" - return self.planner.opts['viewdir'] + '/' + self.tag + '.gp' - - def distributableFile(self): - """The file that will hold the materialized relation in the working directory - in preparation to be uploaded to the distributed cache.""" - return self.tag + '.gp' - - @staticmethod - def viewNameFor(fileName): - """The view associated with the given file name""" - vname = os.path.basename(fileName) - if vname.endswith(".gp"): vname = vname[0:-len(".gp")] - return vname - - # - # semantics of the view - # - - def checkpoint(self): - """A checkpoint is an intermediate computation for the view, which is - saved on disk. The rowGenerator() for the view will assume - that the checkpoint is available. 
- """ - assert False, 'abstract method called' - - def checkpointPlan(self): - """A plan to produce checkpoint().""" - assert False, 'abstract method called' - - def rowGenerator(self): - """A generator for the rows in this relation, which assumes existence - of the checkpoint.""" - assert False, 'abstract method called' - - def explanation(self): - """Return an explanation of how rows are generated.""" - assert False, 'abstract method called' - - def storagePlan(self): - """A plan to store the view.""" - return self.planner.buildRecursiveStoragePlan(self) - - def nonrecursiveStoragePlan(self): - """Materialize the relation, assuming that there are no descendent - inner views that need to be materialized first.""" - plan = Plan() - plan.includeStepsOf(self.checkpointPlan()) - plan.append(TransformStep(view=self,whatToDo='doStoreRows',srcs=[self.checkpoint()],dst=self.storedFile(),why=self.explanation())) - return plan - - def applyDict(self,mapping,innerviewsOnly=False): - """Given a mapping from view tags to views, replace every inner view with - the appropriate value from the mapping, and return the result.""" - if self.tag in mapping and not innerviewsOnly: - return mapping[self.tag] - elif not self.inners: - return self - else: - result = copy.copy(self) - result.inners = map(lambda v:v.applyDict(mapping), self.inners) - return result - - def sideviewsNeeded(self): - """Sideviews needed by this view.""" - result = [] - for sv in self.sideviews: - result += [sv] - for v in self.inners: - result += list(v._sideviewsOfDescendants()) - return result - - def _sideviewsOfDescendants(self): - if not self.storeMe: - for sv in self.sideviews: - yield sv - for v in self.inners: - for sv in v._sideviewsOfDescendants(): - yield sv - - def enforceStorageConstraints(self): - """Subclass this, if there are constraints on when one must explicitly - store inner views.""" - pass - - def doStoreRows(self): - """Called by planner at execution time to store the rows of the view.""" - for row in self.rowGenerator(): - print self.planner._serializer.toString(row) - - # - # support the "pipe" syntax: view1 | view2 - # - - def __or__(self,otherView): - """Overload the pipe operator x | y to return with y, with x as its inner view.""" - otherView.acceptInnerView(self) - return otherView - - def acceptInnerView(self,otherView): - """Replace an appropriate input view with otherView. This is subclassed to - implement the the pipe operator.""" - assert not self.inner,'An inner view is defined for '+self.tag+' so you cannot use it as RHS of a pipe' - self.inner = otherView #subclass if needed - - # - # printing views - # - - def pprint(self,depth=0,alreadyPrinted=None,sideview=False): - """Print a readable representation of the view.""" - if alreadyPrinted==None: alreadyPrinted = set() - tabStr = '| ' * depth - tagStr = str(self.tag) - sideviewIndicator = '*' if sideview else '' - if self.tag in alreadyPrinted: - print tabStr + sideviewIndicator + tagStr + ' = ' + '...' 
- else: - sideviewInfo = " sideviews: {"+",".join(map(lambda x:x.tag, self.sideviews))+"}" if self.sideviews else "" - print tabStr + sideviewIndicator + tagStr + ' = ' + str(self) + sideviewInfo - alreadyPrinted.add(self.tag) - for inner in self.inners: - inner.pprint(depth+1,alreadyPrinted) - for inner in self.sideviews: - inner.pprint(depth+1,alreadyPrinted,sideview=True) - - @staticmethod - def asTag(view): - """Helper for printing views.""" - if not view: return '(null view)' - elif view.tag: return view.tag - else: return str(view) - -# -# abstract view types -# - -class Reader(View): - """Read data stored on the file system and make it look like a View.""" - - def __init__(self,src): - View.__init__(self) - self.src = src - self.inners = [] - - def checkpoint(self): - return self.src - - def checkpointPlan(self): - return Plan() #empty plan - - def explanation(self): - return [ 'read %s with %s' % (str(self.src),self.tag) ] - - def acceptInnerView(self,otherView): - assert False, "Reader views cannot be used as RHS of a pipe" - -class Transformation(View): - """Streaming transformation on a single inner view.""" - - def __init__(self,inner=None): - View.__init__(self) - self.inner = inner - - # A transformation will stream on-the-fly through the inner - # relation, and produce a new version, so the checkpoint and plan - # to produce it are delegated to the inner View. - - def checkpoint(self): - return self.inner.checkpoint() - - def checkpointPlan(self): - return self.inner.checkpointPlan() - - def explanation(self): - return self.inner.explanation() + [ 'transform to %s' % self.tag ] - -class MapReduce(View): - """A view that takes an inner relation and processes in a - map-reduce-like way.""" - - def __init__(self,inners,retaining): - View.__init__(self) - self.inners = inners - self.retainedPart = retaining - - def _isReduceInputFile(self,fileName): - return fileName.endswith('.gpri') - - def checkpoint(self): - ## the checkpoint is the reducer input file - return self.planner.opts['viewdir'] + '/' + self.tag + '.gpri' - - def checkpointPlan(self): - plan = Plan() - for inner in self.inners: - plan.includeStepsOf(inner.checkpointPlan()) - plan.includeStepsOf(self.mapPlan()) - return plan - - def enforceStorageConstraints(self): - for inner in self.inners: - innerChkpt = inner.checkpoint() - #optimizations break if you chain two map-reduces together - if innerChkpt and innerChkpt.endswith(".gpri"): - if not inner.storeMe: - logging.info('making %s stored, to make possible a downstream map-reduce view' % inner.tag) - inner.storeMe = True - - def mapPlan(self): - log.error("abstract method not implemented") - - def doStoreKeyedRows(self,subview,key,index): - """Utility method used by concrete map-reduce classes to compute keys - and store key-value pairs. Usually used as the main step in a - mapPlan. 
""" - for row in subview.rowGenerator(): - keyStr = self.planner._serializer.toString(key(row)) - rrow = self.retainedPart(row) if self.retainedPart else row - valStr = self.planner._serializer.toString(rrow) - if index<0: - print "%s\t%s" % (keyStr,valStr) - else: - print "%s\t%d\t%s" % (keyStr,index,valStr) - -############################################################################## -# -# concrete View classes -# -############################################################################## - -class ReuseView(Reader): - """Returns the objects in a previously stored view.""" - - def __init__(self,view): - if isinstance(view,View): - Reader.__init__(self,view.storedFile()) - self.tag = "reuse_"+view.tag - self.reusedViewTag = view.tag - self.planner = view.planner - else: - assert False,'user-defined ReuseView not supported (yet)' - - def rowGenerator(self): - for line in sys.stdin: - yield self.planner._serializer.fromString(line.strip()) - - def __str__(self): - return 'ReuseView("%s")' % self.src + self.showExtras() - - -class ReadLines(Reader): - """ Returns the lines in a file, as python strings.""" - - def __init__(self,src): - Reader.__init__(self,src) - - def rowGenerator(self): - for line in sys.stdin: - yield line - - def __str__(self): - return 'ReadLines("%s")' % self.src + self.showExtras() - -class ReadCSV(Reader): - """ Returns the lines in a CSV file, converted to Python tuples.""" - - def __init__(self,src,**kw): - Reader.__init__(self,src) - self.kw = kw - - def rowGenerator(self): - for tup in csv.reader(sys.stdin,**self.kw): - yield tup - - def __str__(self): - return 'ReadCVS("%s",%s)' % (self.src,str(self.kw)) + self.showExtras() - - -class ReplaceEach(Transformation): - """ In 'by=f'' f is a python function that takes a row and produces - its replacement.""" - - def __init__(self,inner=None,by=lambda x:x): - Transformation.__init__(self,inner) - self.replaceBy = by - - def rowGenerator(self): - for row in self.inner.rowGenerator(): - yield self.replaceBy(row) - - def explanation(self): - return self.inner.explanation() + [ 'replaced to %s' % self.tag ] - - def __str__(self): - return 'ReplaceEach(%s, by=%s)' % (View.asTag(self.inner),str(self.replaceBy)) + self.showExtras() - -class Augment(Transformation): - - def __init__(self,inner=None,sideviews=None,sideview=None,loadedBy=lambda v:list(GPig.rowsOf(v))): - Transformation.__init__(self,inner) - assert not (sideviews and sideview), 'cannot specify both "sideview" and "sideviews"' - self.sideviews = list(sideviews) if sideviews else [sideview] - self.loader = loadedBy - assert self.loader,'must specify a "loadedBy" function for Augment' - - def enforceStorageConstraints(self): - for sv in self.sideviews: - sv.storeMe = 'distributedCache' - - def rowGenerator(self): - augend = self.loader(*self.sideviews) - for row in self.inner.rowGenerator(): - yield (row,augend) - - def checkpointPlan(self): - plan = Plan() - plan.includeStepsOf(self.inner.checkpointPlan()) - #the sideviews should have been stored by the top-level - #planner already, but they will need to be moved to a - #distributable location - for sv in self.sideviews: - plan.append(DistributeStep(sv)) - return plan - - def explanation(self): - return self.inner.explanation() + [ 'augmented to %s' % self.tag ] - - def __str__(self): - sideviewTags = loaderTag = '*UNSPECIFIED*' - if self.sideviews!=None: sideviewTags = ",".join(map(View.asTag,self.sideviews)) - if self.loader!=None: loaderTag = str(self.loader) - return 
'Augment(%s,sideviews=%s,loadedBy=s%s)' % (View.asTag(self.inner),sideviewTags,loaderTag) + self.showExtras() - - -class Format(ReplaceEach): - """ Like ReplaceEach, but output should be a string, and it will be be - stored as strings, ie without using the serializer.""" - - def __init__(self,inner=None,by=lambda x:str(x)): - ReplaceEach.__init__(self,inner,by) - - def __str__(self): - return 'Format(%s, by=%s)' % (View.asTag(self.inner),str(self.replaceBy)) + self.showExtras() - - def doStoreRows(self): - for row in self.rowGenerator(): - print row - -class Flatten(Transformation): - """ Like ReplaceEach, but output of 'by' is an iterable, and all - results will be returned. """ - - def __init__(self,inner=None,by=None): - Transformation.__init__(self,inner) - self.flattenBy = by - - def rowGenerator(self): - for row in self.inner.rowGenerator(): - for flatrow in self.flattenBy(row): - yield flatrow - - def explanation(self): - return self.inner.explanation() + [ 'flatten to %s' % self.tag ] - - def __str__(self): - return 'Flatten(%s, by=%s)' % (View.asTag(self.inner),str(self.flattenBy)) + self.showExtras() - -class Filter(Transformation): - """Filter out a subset of rows that match some predicate.""" - - def __init__(self,inner=None,by=lambda x:x): - Transformation.__init__(self,inner) - self.filterBy = by - - def rowGenerator(self): - for row in self.inner.rowGenerator(): - if self.filterBy(row): - yield row - - def explanation(self): - return self.inner.explanation() + [ 'filtered to %s' % self.tag ] - - def __str__(self): - return 'Filter(%s, by=%s)' % (View.asTag(self.inner),str(self.filterBy)) + self.showExtras() - -class Distinct(MapReduce): - """Remove duplicate rows.""" - - def __init__(self,inner=None,retaining=None): - MapReduce.__init__(self,[inner],retaining) - - def mapPlan(self): - plan = Plan() - plan.append(PrereduceStep(view=self,whatToDo='doDistinctMap',srcs=[self.inner.checkpoint()],dst=self.checkpoint(),why=self.explanation())) - return plan - - def rowGenerator(self): - """Extract distinct elements from a sorted list.""" - lastval = None - for line in sys.stdin: - valStr = line.strip() - val = self.planner._serializer.fromString(valStr) - if val != lastval and lastval: - yield lastval - lastval = val - if lastval: - yield lastval - - def explanation(self): - return self.inner.explanation() + [ 'make distinct to %s' % self.tag] - - def __str__(self): - return 'Distinct(%s)' % (View.asTag(self.inner)) + self.showExtras() - - def doDistinctMap(self): - self.inner.doStoreRows() - - -class Group(MapReduce): - """Group by some property of a row, defined with the 'by' option. 
- Default outputs are tuples (x,[r1,...,rk]) where the ri's are rows - that have 'by' values of x.""" - - def __init__(self,inner=None,by=lambda x:x,reducingTo=ReduceToList(),retaining=None): - MapReduce.__init__(self,[inner],retaining) - self.groupBy = by - self.reducingTo = reducingTo - - def mapPlan(self): - plan = Plan() - plan.append(PrereduceStep(view=self,whatToDo='doGroupMap',srcs=[self.inner.checkpoint()],dst=self.checkpoint(),why=self.explanation())) - return plan - - def rowGenerator(self): - """Group objects from stdin by key, yielding tuples (key,[g1,..,gn]).""" - lastkey = key = None - accum = self.reducingTo.baseType() - for line in sys.stdin: - keyStr,valStr = line.strip().split("\t") - key = self.planner._serializer.fromString(keyStr) - val = self.planner._serializer.fromString(valStr) - if key != lastkey and lastkey!=None: - yield (lastkey,accum) - accum = self.reducingTo.baseType() - accum = self.reducingTo.reduceBy(accum, val) - lastkey = key - if key: - yield (key,accum) - - def explanation(self): - return self.inner.explanation() + ['group to %s' % self.tag] - - def __str__(self): - return 'Group(%s,by=%s,reducingTo=%s)' % (View.asTag(self.inner),str(self.groupBy),str(self.reducingTo)) + self.showExtras() - - def doGroupMap(self): - self.doStoreKeyedRows(self.inner,self.groupBy,-1) - -class Join(MapReduce): - """Outputs tuples of the form (row1,row2,...rowk) where - rowi is from the i-th join input, and the rowi's have the same - value of the property being joined on.""" - - def __init__(self,*joinInputs): - #sets self.inners - MapReduce.__init__(self,map(lambda x:x.view, joinInputs),None) - self.joinInputs = joinInputs - #re-interpret the 'outer' join parameters - semantically - #if jin[i] is outer, then all other inputs must be marked as _padWithNulls - if any(map(lambda jin:jin.outer, self.joinInputs)): - assert len(self.joinInputs)==2,'outer joins are only supported on two-way joins '+str(self.joinInputs) - for i in range(len(self.joinInputs)): - if self.joinInputs[i].outer: - j = 1-i #the other index - self.joinInputs[j]._padWithNulls = True - - def acceptInnerView(self,otherView): - assert False, 'join cannot be RHS of a pipe - use JoinTo instead' - - def mapPlan(self): - plan = Plan() - innerCheckpoints = map(lambda v:v.checkpoint(), self.inners) - step = PrereduceStep(view=self, whatToDo='doJoinMap',srcs=innerCheckpoints,dst=self.checkpoint(),why=self.explanation()) - plan.append(step) - return plan - - def applyDict(self,mapping,innerviewsOnly=False): - result = MapReduce.applyDict(self,mapping,innerviewsOnly=innerviewsOnly) - #also need to map over the join inputs - if isinstance(result,Join): - for i in range(len(result.joinInputs)): - result.joinInputs[i].view = result.inners[i] - return result - - def rowGenerator(self): - """Group objects from stdin by key, yielding tuples (row1,row2,...).""" - lastkey = None - lastIndex = len(self.joinInputs)-1 - somethingProducedForLastKey = False - #accumulate a list of lists of all non-final inputs - accumList = [ [] for i in range(lastIndex) ] - for line in sys.stdin: - keyStr,indexStr,valStr = line.strip().split("\t") - key = self.planner._serializer.fromString(keyStr) - index = int(indexStr) - val = self.planner._serializer.fromString(valStr) - if key != lastkey and lastkey!=None: - #if the final join is marked as _padWithNulls, clear - #the accumulators, since we're doing an outer join - #with the last view - if self.joinInputs[lastIndex]._padWithNulls and not somethingProducedForLastKey: - for tup in 
self._joinAccumulatedValuesTo(accumList,lastIndex,None): - yield tup - #reset the accumulators, since they pertain to the - accumList = [ [] for i in range(lastIndex) ] - somethingProducedForLastKey = False - if index!=lastIndex: - #accumulate values to use in the join - accumList[index] = accumList[index] + [val] - else: - #produce tuples that match the key for the last view - for tup in self._joinAccumulatedValuesTo(accumList,lastIndex,val): - somethingProducedForLastKey = True - yield tup - lastkey = key - - def _joinAccumulatedValuesTo(self,accumList,lastIndex,finalVal): - # _padWithNulls as needed - for i in range(lastIndex): - if self.joinInputs[i]._padWithNulls and not accumList[i]: - accumList[i] = [None] - tupbuf = [ None for i in range(lastIndex+1) ] #holds output - tupbuf[lastIndex] = finalVal - for i in range(lastIndex): - for a in accumList[i]: - tupbuf[i] = a - if i==lastIndex-1 and any(tupbuf): - yield tuple(tupbuf) - - def explanation(self): - innerEx = [] - for inner in self.inners: - if innerEx: innerEx += ['THEN'] - innerEx += inner.explanation() - return innerEx + [ 'FINALLY join to %s' % self.tag ] - - def __str__(self): - return "Join(%s)" % ",".join(map(str,self.joinInputs)) + self.showExtras() - - def doJoinMap(self,i): - # called by joinMapPlan with argument index, and stdin pointing to innerCheckpoints[index] - self.doStoreKeyedRows(self.joinInputs[i].view,self.joinInputs[i].joinBy,i) - -class JoinTo(Join): - """Special case of Join which can be used as the RHS of a pipe operator.""" - - def __init__(self,joinInput,by=None): - Join.__init__(self,Jin(None,by),joinInput) - - def acceptInnerView(self,otherView): - self.joinInputs[0].view = otherView - self.inners[0] = otherView - -############################################################################## -# -# the top-level planner, and its supporting classes -# -############################################################################## - -class Plan(object): - """A plan constructed by a GuineaPig.""" - - def __init__(self): - self.steps = [] - self.tasks = [] - - def append(self,step): - self.steps.append(step) - - def includeStepsOf(self,subplan): - self.steps += subplan.steps - - def execute(self,gp,echo=False): - script = self.compile(gp) - for shellcom in script: - if echo: print 'calling:',shellcom - subprocess.check_call(shellcom,shell=True) - - def buildTasks(self): - """Group the steps into AbstractMapReduceTask's""" - self.tasks = [AbstractMapReduceTask()] - for step in self.steps: - if not self.tasks[-1].insert(step): - self.tasks.append(AbstractMapReduceTask()) - status = self.tasks[-1].insert(step) - assert status, 'failure to insert '+str(step)+' in fresh AbstractMapReduceTask' - - def compile(self,gp): - """Return a list of strings that can be run as shell commands.""" - self.buildTasks() - logging.info("%d steps converted to %d abstract map-reduce tasks" % (len(self.steps),len(self.tasks))) - script = [] - taskCompiler = GPig.getCompiler(gp.opts['target']) - for task in self.tasks: - script += taskCompiler.compile(task,gp) - return script - -# -# a single step in a plan produced by the planner -# - -class Step(object): - """A single step of the plans produced by the planner, along with the - methods to convert the plans into executable shell commands.""" - - def __init__(self,view): - self.view = view - self.reused = [] # list of views reused at this point - self.why = [] - - def setReusedViews(self,views): - self.reused = list(views) - - def explain(self): - """Convert an explanation - which 
-        return "...".join(self.why)
-
-#
-# specific kinds of steps in a plan
-#
-
-class DistributeStep(Step):
-    """Prepare a stored view for the distributed cache."""
-
-    def __init__(self,view):
-        Step.__init__(self,view)
-
-    def __str__(self):
-        return "DistributeStep(%s,reused=%s)" % (repr(self.view.tag),repr(self.reused))
-
-class TransformStep(Step):
-    """Transform input to output."""
-
-    def __init__(self,view,whatToDo,srcs,dst,why):
-        Step.__init__(self,view)
-        self.whatToDo = whatToDo
-        self.srcs = srcs
-        self.dst = dst
-        self.why = why
-
-    def __str__(self):
-        return "TransformStep("+",".join(map(repr, [self.view.tag,self.whatToDo,self.srcs,self.dst,self.reused]))+")"
-
-class PrereduceStep(Step):
-    """Produce the keyed input for a reduce; it can be followed in a task by a
-    TransformStep that acts as the reducer."""
-
-    def __init__(self,view,whatToDo,srcs,dst,why):
-        Step.__init__(self,view)
-        self.whatToDo = whatToDo
-        self.srcs = srcs
-        self.dst = dst
-        self.why = why
-
-    def __str__(self):
-        return "PrereduceStep("+",".join(map(repr, [self.view.tag,self.whatToDo,self.srcs,self.dst,self.reused]))+")"
-
-# combine steps into something executable via hadoop - or the shell
-
-class AbstractMapReduceTask(object):
-    """A collection of steps that can be executed as a single map-reduce operation,
-    possibly with some file management steps to set up the task."""
-
-    def __init__(self):
-        self.distributeSteps = []
-        self.mapStep = None
-        self.reduceStep = None
-
-    def insert(self,step):
-        """Treating the AbstractMapReduceTask as a buffer, add this step to it if possible."""
-        if isinstance(step,DistributeStep):
-            #we can accept any number of distribute steps
-            self.distributeSteps.append(step)
-            return True
-        elif self.mapStep==None and (isinstance(step,TransformStep) or isinstance(step,PrereduceStep)):
-            #we can only have one map step, so fill up an empty slot if possible
-            self.mapStep = step
-            return True
-        elif self.mapStep and isinstance(self.mapStep,PrereduceStep) and isinstance(step,TransformStep) and not self.reduceStep:
-            #if the map step is a prereduce, then we can also allow a reduce step
-            self.reduceStep = step
-            return True
-        else:
-            return False
-
-    def __str__(self):
-        buf = "mapreduce task:"
-        for step in self.distributeSteps:
-            buf += "\n - d "+str(step)
-        buf += "\n - m " + str(self.mapStep)
-        if self.reduceStep:
-            buf += "\n - r " + str(self.reduceStep)
-        return buf
-
-class MRCompiler(object):
-    """Abstract compiler class to convert a task to a list of commands that can be executed by the shell."""
-
-    def compile(self,task,gp):
-        script = []
-        # an explanation/header
-        if not task.reduceStep:
-            script += ['echo create '+task.mapStep.view.tag + ' via map: ' + task.mapStep.explain()]
-        else:
-            script += ['echo create '+task.reduceStep.view.tag +' via map/reduce: '+task.reduceStep.explain()]
-        for step in task.distributeSteps:
-            localCopy = step.view.distributableFile()
-            maybeRemoteCopy = step.view.storedFile()
-            echoCom = 'echo distribute %s: making a local copy of %s in %s' % (step.view.tag,maybeRemoteCopy,localCopy)
-            script += [echoCom] + self.distributeCommands(task, gp, maybeRemoteCopy,localCopy)
-        if not task.reduceStep and len(task.mapStep.srcs)==1:
-            mapCom = self._coreCommand(task.mapStep,gp)
-            script += self.simpleMapCommands(task, gp, mapCom, task.mapStep.srcs[0], task.mapStep.dst)
-        elif task.reduceStep and len(task.mapStep.srcs)==1:
-            mapCom = self._coreCommand(task.mapStep,gp)
-            reduceCom = self._coreCommand(task.reduceStep,gp)
-            script += self.simpleMapReduceCommands(task, gp, mapCom, reduceCom, task.mapStep.srcs[0], task.reduceStep.dst)
-        elif task.reduceStep and len(task.mapStep.srcs)>1:
-            mapComs = [self._ithCoreCommand(task.mapStep,gp,i) for i in range(len(task.mapStep.srcs))]
-            reduceCom = self._coreCommand(task.reduceStep,gp)
-            midpoint = gp.opts['viewdir']+'/'+task.mapStep.view.tag+'.gpmo'
-            script += self.joinCommands(task, gp, mapComs, reduceCom, task.mapStep.srcs, midpoint, task.reduceStep.dst)
-        else:
-            assert False,'cannot compile task '+str(task)
-        return script
-
-    # abstract routines
-
-    def distributeCommands(self,task,gp,maybeRemoteCopy,localCopy):
-        """Distribute the remote copy to the local directory."""
-        assert False, 'abstract method called'
-
-    def simpleMapCommands(self,task,gp,mapCom,src,dst):
-        """A map-only task with zero or one inputs."""
-        assert False, 'abstract method called'
-
-    def simpleMapReduceCommands(self,task,gp,mapCom,reduceCom,src,dst):
-        """A map-reduce task with one input."""
-        assert False, 'abstract method called'
-
-    def joinCommands(self,task,gp,mapComs,reduceCom,srcs,midpoint,dst):
-        """A map-reduce task with several inputs."""
-        assert False, 'abstract method called'
-
-    # utilities
-
-    def _stepSideviewFiles(self,step):
-        files = []
-        for sv in step.view.sideviewsNeeded():
-            files += [sv.distributableFile()]
-        return files
-
-    def _coreCommand(self,step,gp):
-        """Python command to call an individual plan step."""
-        return 'python %s --view=%s --do=%s' % (gp._gpigSourceFile,step.view.tag,step.whatToDo) + self.__coreCommandOptions(step,gp)
-
-    def _ithCoreCommand(self,step,gp,i):
-        """Like _coreCommand but allows an index parameter for the 'do' option."""
-        return 'python %s --view=%s --do=%s.%d' % (gp._gpigSourceFile,step.view.tag,step.whatToDo,i) + self.__coreCommandOptions(step,gp)
-
-    def __coreCommandOptions(self,step,gp):
-        paramOpts = '' if not gp.param else " --params " + ",".join(map(lambda(k,v):k+':'+v, gp.param.items()))
-        nonDefaults = []
-        for (k,v) in gp.opts.items():
-            #pass in non-default options, or options computed from the environment
-            if (gp.opts[k] != GPig.DEFAULT_OPTS[k]) or ((k in GPig.COMPUTED_OPTION_DEFAULTS) and (gp.opts[k] != GPig.COMPUTED_OPTION_DEFAULTS[k])):
-                nonDefaults += ["%s:%s" % (k,str(v))]
-        optsOpts = '' if not nonDefaults else " --opts " + ",".join(nonDefaults)
-        reuseOpts = '' if not step.reused else " --reuse "+ " ".join(step.reused)
-        return paramOpts + optsOpts + reuseOpts
-
-
-class ShellCompiler(MRCompiler):
-    """Compile tasks to commands that are executable in most Unix shells."""
-
-    def distributeCommands(self,task,gp,maybeRemoteCopy,localCopy):
-        """Distribute the remote copy to the local directory."""
-        return ['cp -f %s %s || echo warning: the copy failed!' % (maybeRemoteCopy,localCopy)]
-
-    def simpleMapCommands(self,task,gp,mapCom,src,dst):
-        """A map-only job with zero or one inputs."""
-        if src: return [mapCom + ' < %s > %s' % (src,dst)]
-        else: return [mapCom + (' > %s' % (dst))]
-
-    def simpleMapReduceCommands(self,task,gp,mapCom,reduceCom,src,dst):
-        """A map-reduce job with one input."""
-        return [mapCom + ' < ' + src + ' | sort -k1 | '+reduceCom + ' > ' + dst]
-
-    def joinCommands(self,task,gp,mapComs,reduceCom,srcs,midpoint,dst):
-        """A map-reduce job with several inputs."""
-        subplan = ['rm -f %s' % midpoint]
-        for i,ithMapCom in enumerate(mapComs):
-            subplan += [ithMapCom + ' < ' + srcs[i] + ' >> ' + midpoint]
-        subplan += [ 'sort -k1,2 < ' + midpoint + ' | ' + reduceCom + ' > ' + dst]
-        return subplan
-
-class HadoopCompiler(MRCompiler):
-    """Compile tasks to commands that are executable in most Unix shells
-    after hadoop has been installed."""
-
-    def distributeCommands(self,task,gp,maybeRemoteCopy,localCopy):
-        return ['rm -f %s' % localCopy, '%s fs -getmerge %s %s' % (GPig.HADOOP_LOC,maybeRemoteCopy,localCopy)]
-
-    def simpleMapCommands(self,task,gp,mapCom,src,dst):
-        assert src,'Wrap not supported for hadoop'
-        hcom = self.HadoopCommandBuf(gp,task)
-        hcom.extendDef('-D','mapred.reduce.tasks=0')
-        hcom.extend('-input',src,'-output',dst)
-        hcom.extend("-mapper '%s'" % mapCom)
-        return [ self._hadoopCleanCommand(gp,dst), hcom.asEcho(), hcom.asString() ]
-
-    def simpleMapReduceCommands(self,task,gp,mapCom,reduceCom,src,dst):
-        hcom = self.HadoopCommandBuf(gp,task)
-        hcom.extendDef('-D','mapred.reduce.tasks=%d' % gp.opts['parallel'])
-        hcom.extend('-input',src,'-output',dst)
-        hcom.extend("-mapper '%s'" % mapCom)
-        hcom.extend("-reducer '%s'" % reduceCom)
-        return [ self._hadoopCleanCommand(gp,dst), hcom.asEcho(), hcom.asString() ]
-
-    def joinCommands(self,task,gp,mapComs,reduceCom,srcs,midpoint,dst):
-        def midi(i): return midpoint + '-' + str(i)
-        subplan = []
-        for i in range(len(srcs)):
-            hcom = self.HadoopCommandBuf(gp,task)
-            hcom.extendDef('-D','mapred.reduce.tasks=%d' % gp.opts['parallel'])
-            hcom.extend('-input',srcs[i], '-output',midi(i))
-            hcom.extend("-mapper","'%s'" % mapComs[i])
-            subplan += [ self._hadoopCleanCommand(gp,midi(i)), hcom.asEcho(), hcom.asString() ]
-        hcombineCom = self.HadoopCommandBuf(gp,task)
-        hcombineCom.extendDef('-D','mapred.reduce.tasks=%d' % gp.opts['parallel'])
-        hcombineCom.extendDef('-jobconf','stream.num.map.output.key.fields=3')
-        hcombineCom.extendDef('-jobconf','num.key.fields.for.partition=1')
-        for i in range(len(srcs)):
-            hcombineCom.extend('-input',midi(i))
-        hcombineCom.extend('-output',dst)
-        hcombineCom.extend('-mapper','cat')
-        hcombineCom.extend('-reducer',"'%s'" % reduceCom)
-        hcombineCom.extend('-partitioner','org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner')
-        subplan += [ self._hadoopCleanCommand(gp,dst), hcombineCom.asEcho(), hcombineCom.asString() ]
-        return subplan
-
-    class HadoopCommandBuf(object):
-        """Utility to hold the various pieces of a hadoop command."""
-        def __init__(self,gp,task):
-            logging.debug('building hadoop command for '+str(task.mapStep.view.tag))
-            self.invocation = [GPig.HADOOP_LOC,'jar',gp.opts['streamJar']]
-            self.defs = []
-            self.args = []
-            self.files = []
-            for f in gp._shippedFiles:
-                self.files += ['-file',f]
-            for sv in task.mapStep.view.sideviewsNeeded():
-                self.files += ['-file',sv.distributableFile()]
-            if task.reduceStep:
-                for sv in task.reduceStep.view.sideviewsNeeded():
-                    self.files += ['-file',sv.distributableFile()]
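For concreteness, here is roughly the shape of the single streaming invocation that asString() (below) assembles from the invocation, defs, files, and args pieces, sketched as a Python comment; the jar path, view names, and file names are illustrative, not taken from this patch:

# hadoop jar /usr/lib/hadoop/contrib/streaming/hadoop-streaming.jar \
#   -D mapred.reduce.tasks=5 \
#   -file guineapig.py -file wordcount.py \
#   -input gpig_views/words.gp -output gpig_views/wc.gp \
#   -mapper 'python wordcount.py --view=wc --do=doGroupMap' \
#   -reducer 'python wordcount.py --view=wc --do=doStoreRows'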
-            logging.debug('files: '+str(self.files))
-        def extend(self,*toks):
-            self.args += list(toks)
-        def extendDef(self,*toks):
-            self.defs += list(toks)
-        def asEcho(self):
-            return " ".join(['echo','hadoop'] + self.args + ['...'])
-        def asString(self):
-            return " ".join(self.invocation+self.defs+self.files+self.args)
-
-    def _hadoopCleanCommand(self,gp,fileName):
-        """A command to remove an HDFS directory if it exists."""
-        return '(%s fs -test -e %s && %s fs -rmr %s) || echo no need to remove %s' % (GPig.HADOOP_LOC,fileName, GPig.HADOOP_LOC,fileName, fileName)
-
-#
-# replaceable object to save objects to disk and retrieve them
-#
-
-class RowSerializer(object):
-    """Saves row objects to disk and retrieves them."""
-    def __init__(self):
-        self.evaluator = GPig.SafeEvaluator()
-    def toString(self,x):
-        return repr(x)
-    def fromString(self,s):
-        return self.evaluator.eval(s)
-
-#
-# the planner
-#
-
-class Planner(object):
-    """Can create storage plans for views that are defined as parts of it."""
-
-    def __init__(self,**kw):
-
-        #parameters are used to programmatically give user-defined
-        #config information to a planner, or they can be specified
-        #on the command line
-        self.param = kw
-        for (key,val) in GPig.getArgvParams().items():
-            # don't override non-null values specified in the constructor
-            if self.param.get(key)==None:
-                self.param[key] = val
-
-        #opts are used for giving options to the planner from the shell
-        self.opts = GPig.getArgvOpts()
-        for (key,val) in GPig.DEFAULT_OPTS.items():
-            if (not key in self.opts): self.opts[key] = val
-        for (key,type) in GPig.DEFAULT_OPT_TYPES.items():
-            self.opts[key] = type(self.opts[key])
-
-        #use the appropriate serializer for the target
-        self._serializer = RowSerializer()
-
-        #views that aren't associated with a class variable, but are
-        #instead named automatically - i.e., inner views with no
-        #user-provided names
-        self._autoNamedViews = {}
-
-        #by default, use info-level logging at planning time
-        if not Planner.partOfPlan(sys.argv):
-            logging.basicConfig(level=logging.INFO)
-
-        #hadoop needs to be given the main script file,
-        #as well as the guineapig.py file it uses
-        self._gpigSourceFile = sys.argv[0]
-        self._shippedFiles = [GPig.MY_LOC,self._gpigSourceFile]
-
-    def setup(self):
-        """Initialize the planner and the views it uses. This has to be
-        done after the planner is fully configured by adding views."""
-
-        self.reusableViews = {}
-        # make sure the view directory is valid
-        if self.opts['target']=='shell' and not os.path.exists(self.opts['viewdir']):
-            logging.info('creating view directory ' + self.opts['viewdir'])
-            os.makedirs(self.opts['viewdir'])
-        elif self.opts['target']=='hadoop':
-            p = urlparse.urlparse(self.opts['viewdir'])
-            if not p.path.startswith("/"):
-                logging.warn('hadoop viewdir should be an absolute path: will try prefixing /user/$LOGNAME')
-                username = os.environ.get('LOGNAME','me')
-                self.opts['viewdir'] = '/user/'+username+'/'+self.opts['viewdir']
-                logging.warn('viewdir is set to '+self.opts['viewdir'])
-
-        # Add 'tag' and planner fields to each view
-        for vname in self.listViewNames():
-            v = self.getView(vname)
-            v.tag = vname
-            v.planner = self
-        def tagUnnamedViews(v,basename,index,depth):
-            assert v,'null inner view for '+basename
-            if not v.planner:
-                v.planner = self
-                autoname = '%s_%d_%s' % (basename,depth,index)
-                self._setView(autoname,v)
-                for i,inner in enumerate(v.inners + v.sideviews):
-                    tagUnnamedViews(inner,vname,i,depth+1)
-        for vname in self.listViewNames():
-            v = self.getView(vname)
-            for i,inner in enumerate(v.inners + v.sideviews):
-                tagUnnamedViews(inner,vname,i,1)
-
-        # Add caching options as needed
-        for vname in self.listViewNames():
-            v = self.getView(vname)
-            v.enforceStorageConstraints()
-
-    #
-    # utils
-    #
-
-    def getView(self,str,mustExist=False):
-        """Find the defined relation named str, and if necessary bind its
-        planner and tag appropriately."""
-        v = self.__class__.__dict__.get(str) or self.__dict__.get(str) or self._autoNamedViews.get(str)
-        if mustExist: assert v,'cannot find a view named '+str
-        return v
-
-    def _setView(self,str,view):
-        """Internal use only: allow the view to be retrieved by name later."""
-        view.tag = str
-        self._autoNamedViews[str] = view
-
-    def listViewNames(self):
-        def namedViews(d): return [vname for vname in d.keys() if isinstance(self.getView(vname),View)]
-        userNamedViews = namedViews(self.__class__.__dict__) + namedViews(self.__dict__)
-        return userNamedViews + self._autoNamedViews.keys()
-
-    #
-    # planning
-    #
-
-    def buildRecursiveStoragePlan(self,view):
-        """Called by view.storagePlan."""
-        #figure out what to reuse - starting with what the user specified
-        storedViews = dict(self.reusableViews)
-        #also mark for eager storage anything that's used twice in the
-        #plan---i.e., anything that is consumed by two or more views
-        numParents = collections.defaultdict(int)
-        for dv in self._descendants(view):
-            for inner in dv.inners + dv.sideviews:
-                numParents[inner] += 1
-        for (dv,n) in numParents.items():
-            if n>1 and dv.storeMe==None:
-                logging.info('making %s stored because it is used %d times in creating %s' % (dv,n,view.tag))
-                dv.storeMe = True
-
-        #traverse the view in pre-order and find a linear sequence of
-        #views to store, each of which requires only views earlier in
-        #the sequence
-        storageSeq = self._storageSeq(view,storedViews) + [view.tag]
-        logging.info('storage sequence is: ' + ",".join(storageSeq))
-
-        #splice together plans for each view in the sequence,
-        #after first modifying the view so that nothing is called
-        #directly, but only through the ReuseView proxies
-        plan = Plan()
-        for tag in storageSeq:
-            v = self.getView(tag,mustExist=True)
-            vm = v.applyDict(storedViews,innerviewsOnly=True)
-            subplan = vm.nonrecursiveStoragePlan()
-            #add the correct context of reused views to the subplan,
-            #so that the actual definition of the view will be
-            #rewritten appropriately to include the new ReuseView
-            #proxy for it
-            viewsLocallyReused = self._reuseViewDescendants(vm)
-            for s in subplan.steps:
-                s.setReusedViews(viewsLocallyReused)
-            plan.includeStepsOf(subplan)
-        return plan
-
-    def _reuseViewDescendants(self,view):
-        """Descendant views that are ReuseViews."""
-        result = set()
-        for dv in self._descendants(view):
-            if isinstance(dv,ReuseView):
-                result.add(dv.reusedViewTag)
-        return result
-
-    def _descendants(self,view):
-        """Descendants of a view."""
-        result = set()
-        result.add(view)
-        for inner in view.inners + view.sideviews:
-            result = result.union(self._descendants(inner))
-        return result
-
-    def _storageSeq(self,view,storedViews):
-        """Linear sequence of storage actions to take - as view tags."""
-        seq = []
-        for inner in view.inners + view.sideviews:
-            if not inner.tag in storedViews:
-                seq += self._storageSeq(inner,storedViews)
-                if inner.storeMe:
-                    seq += [inner.tag]
-                    storedViews[inner.tag] = ReuseView(inner)
-        return seq
-
-    #
-    # dealing with the file storage system and related stuff
-    #
-
-    def ship(self,*fileNames):
-        """Declare a set of inputs to be 'shipped' to the hadoop cluster."""
-        self._shippedFiles += fileNames
-
-    def setSerializer(self,serializer):
-        """Replace the default serializer with another RowSerializer object."""
-        self._serializer = serializer
-        return self
-
-    def setEvaluator(self,rowEvaluator):
-        """Specify a function which will deserialize a string that was produced
-        by Python's 'repr' function."""
-        self._serializer.evaluator = rowEvaluator
-        return self
-
-    #
-    # rest of the API for the planner
-    #
-
-    @staticmethod
-    def partOfPlan(argv):
-        """True if the command line was generated as part of a storage plan."""
-        return any(s.startswith("--do") for s in argv)
-
-    def main(self,argv):
-        """Run a main that lets you --store a view, as well as doing a few other things."""
-        self.setup()
-        self.runMain(argv)
-
-    def runMain(self,argv):
-
-        # parse the options and dispatch appropriately
-        argspec = ["store=", "cat=", "reuse",
-                   "list", "pprint=", "steps=", "tasks=", "plan=",
-                   "params=", "opts=", "do=", "view="]
-        optlist,args = getopt.getopt(argv[1:], 'x', argspec)
-        optdict = dict(optlist)
-
-        # decide which views can be re-used, vs which need fresh plans
-        if '--reuse' in optdict: #reuse the views listed in the arguments
-            for a in args:
-                vname = View.viewNameFor(a)
-                v = self.getView(vname)
-                if v:
-                    self.reusableViews[v.tag] = ReuseView(v)
-                    logging.info("re-using data stored for view "+vname+": "+str(v))
-                else:
-                    logging.warn("cannot re-use view "+vname+" since it's not used in this script")
-
-        #choose the main action to take
-        if '--store' in optdict: #store a view
-            rel = self.getView(optdict['--store'],mustExist=True)
-            plan = rel.storagePlan()
-            plan.execute(self, echo=self.opts['echo'])
-            return
-        elif '--pprint' in optdict: #print a view
-            rel = self.getView(optdict['--pprint'],mustExist=True)
-            rel.applyDict(self.reusableViews).pprint()
-            return
-        elif '--steps' in optdict: #print steps to produce a view
-            rel = self.getView(optdict['--steps'],mustExist=True)
-            plan = rel.storagePlan()
-            for s in plan.steps:
-                print ' -',s
-            return
-        elif '--tasks' in optdict: #print AbstractMapReduceTasks to produce a view
-            rel = self.getView(optdict['--tasks'],mustExist=True)
-            plan = rel.storagePlan()
-            plan.buildTasks()
-            for t in plan.tasks:
-                print t
-            return
-        elif '--plan' in optdict: #print a storage plan
-            rel = self.getView(optdict['--plan'],mustExist=True)
-            plan = rel.storagePlan()
-            script = plan.compile(self)
-            print "\n".join(script)
-            return
-        elif '--cat' in optdict: #store and then print a view
-            assert self.opts['target']=='shell','cannot do --cat except in shell mode'
-            rel = self.getView(optdict['--cat'],mustExist=True)
-            plan = rel.storagePlan()
-            plan.execute(self, self.opts['echo'])
-            for line in open(rel.storedFile(),'r'):
-                print line,
-            return
-        elif '--list' in optdict: #list named views
-            for vname in self.listViewNames():
-                print ' ',vname,'\t',self.getView(vname)
-            return
-        elif '--do' in optdict: #run an internally-generated action
-            #recover what should be stored when this action is performed,
-            #work out what view to use, and what routine to call
-            rel = self.getView(optdict['--view'],mustExist=True)
-            rel = rel.applyDict(self.reusableViews)
-            whatToDo = optdict['--do']
-            #work out the method given by 'do' and call it - note it
-            #may have a single integer argument, eg doJoinMap.1
-            k = whatToDo.find(".")
-            if k<0:
-                whatToDoMethod = getattr(rel,whatToDo)
-                whatToDoMethod()
-            else:
-                arg = int(whatToDo[k+1:])
-                whatToDo = whatToDo[:k]
-                whatToDoMethod = getattr(rel,whatToDo)
-                whatToDoMethod(arg)
-            return
-        else:
-            print 'usage: --[store|pprint|steps|tasks|plan|cat] view [--opts key:val,...] [--params key:val,...] [--reuse view1 view2 ...]'
-            print '       --[list]'
-            print 'current legal keys for "opts", with default values:'
-            for (key,val) in GPig.DEFAULT_OPTS.items():
-                print '  %s:%s' % (key,str(val))
-            print 'There\'s more help at http://curtis.ml.cmu.edu/w/courses/index.php/Guinea_Pig'
-
-if __name__ == "__main__":
-    print 'There\'s help at http://curtis.ml.cmu.edu/w/courses/index.php/Guinea_Pig'
diff --git a/tutorial/guineapig1_1.py b/tutorial/guineapig1_1.py
deleted file mode 100644
index 93c5fbf..0000000
--- a/tutorial/guineapig1_1.py
+++ /dev/null
@@ -1,1244 +0,0 @@
-##############################################################################
-# (C) Copyright 2014 William W. Cohen. All rights reserved.
-############################################################################## - -import sys -import logging -import getopt -import os -import os.path -import subprocess -import collections -import urlparse -import csv - -############################################################################### -# helpers and data structures -############################################################################### - -class GPig(object): - """Collection of utilities for Guinea Pig.""" - - HADOOP_LOC = 'hadoop' #assume hadoop is on the path at planning time - - #global options for Guinea Pig can be passed in with the --opts - #command-line option, and these are the default values - envjar = os.environ.get('GP_STREAMJAR') - defaultJar = '/usr/lib/hadoop/contrib/streaming/hadoop-streaming-1.2.0.1.3.0.0-107.jar' - DEFAULT_OPTS = {'streamJar': envjar or defaultJar, - 'parallel':5, - 'target':'shell', - 'echo':0, - 'viewdir':'gpig_views', - } - #there are the types of each option that has a non-string value - DEFAULT_OPT_TYPES = {'parallel':int,'echo':int} - #we need to pass non-default options in to mappers and reducers, - #but since the remote worker's environment can be different, we - #also need to pass in options computed from the environment - COMPUTED_OPTION_DEFAULTS = {'streamJar':defaultJar} - - @staticmethod - def getArgvParams(): return GPig.getArgvDict('--params') - - @staticmethod - def getArgvOpts(): return GPig.getArgvDict('--opts') - - @staticmethod - def getArgvDict(optname): - """Return a dictionary of parameter values that were defined on the command line - view an option like '--params filename:foo.txt,basedir:/tmp/glob/'. - """ - assert optname.startswith('--') - for i,a in enumerate(sys.argv): - if a==optname: - paramString = sys.argv[i+1] - return dict(pair.split(":") for pair in paramString.split(",")) - return {} - - @staticmethod - def rowsOf(view): - """Iterate over the rows in a view.""" - for line in open(view.distributableFile()): - yield view.planner._serializer.fromString(line.strip()) - - @staticmethod - def onlyRowOf(view): - """Return the first row in a side view, and raise an error if it - is not the only row of the view.""" - result = None - logging.info('loading '+view.distributableFile()) - for line in open(view.distributableFile()): - assert not result,'multiple rows in stored file for '+view.tag - result = view.planner._serializer.fromString(line.strip()) - return result - -class Jin(object): - """"Object to hold descripton of a single join input.""" - - def __init__(self,view,by=(lambda x:x),outer=False): - self.view = view - self.joinBy = by - self.outer = outer - self._padWithNulls = False - - def __str__(self): - if self.view: viewStr = View.asTag(self.view) - else: viewStr = '_' - if self.outer: outerStr = ',outer=True' - else: outerStr = '' - if self._padWithNulls: padStr = ',_padWithNulls=True' - else: padStr = '' - return "Jin(%s,by=%s%s%s)" % (viewStr,self.joinBy,outerStr,padStr) - -class ReduceTo(object): - """An object x that can be the argument of a reducingTo=x - parameter in a Group view.""" - def __init__(self,baseType,by=lambda accum,val:accum+val): - self.baseType = baseType - self.reduceBy = by - -class ReduceToCount(ReduceTo): - """Produce the count of the number of objects that would be placed in a group.""" - def __init__(self): - ReduceTo.__init__(self,int,by=lambda accum,val:accum+1) - -class ReduceToSum(ReduceTo): - """Produce the count of the number of objects that would be placed in a group.""" - def __init__(self): - 
ReduceTo.__init__(self,int,by=lambda accum,val:accum+val) - -class ReduceToList(ReduceTo): - """Produce a list of the objects that would be placed in a group.""" - def __init__(self): - ReduceTo.__init__(self,list,by=lambda accum,val:accum+[val]) - -############################################################################### -# abstract views -############################################################################## - -class View(object): - """A relation object for guineaPig. A view - usually abbreviated rel, - r, s, t, - can be "materialized" to produce and unordered bag of - rows - usually abbreviated ro. A row is just an arbitrary python - data structure, which must be something that can be stored and - retrieved by the RowSerializer. A row can be anything, but often - the top-level structure is a python tuple (a,b,c,...) or a dict - mapping small integers 0,1,... to different parts of the row. - - A guineapig "planner" knows how to construct "plans" that store - materialized relations on disk. These plans sometimes include - creating 'checkpoints', which are things places on disk, often - stored materialized relations, or sometimes intermediate outputs - or these.""" - - def __init__(self): - """The planner and tag must be set before this is used.""" - self.planner = None #pointer to planner object - self.tag = None #for naming storedFile and checkpoints - self.storeMe = None #try and store this view if true - self.retainedPart = None #used in map-reduce views only - - # - # ways to modify a view - # - - def opts(self,stored=None): - """Return the same view with options set appropriately.e""" - self.storeMe = stored - return self - - def showExtras(self): - """Printable representation of the options for a view.""" - result = '' - flagPairs = [] - if self.storeMe: flagPairs += ['stored=%s' % repr(self.storeMe)] - if flagPairs: result += '.opts(' + ",".join(flagPairs) + ')' - return result - - # - # how the view is saved on disk - # - - def storedFile(self): - """The file that will hold the materialized relation.""" - return self.planner.opts['viewdir'] + '/' + self.tag + '.gp' - - def distributableFile(self): - """The file that will hold the materialized relation in the working directory - in preparation to be uploaded to the distributed cache.""" - return self.tag + '.gp' - - @staticmethod - def viewNameFor(fileName): - """The view associated with the given file name""" - vname = os.path.basename(fileName) - if vname.endswith(".gp"): vname = vname[0:-len(".gp")] - elif vname.endswith(".done"): vname = vname[0:-len(".done")] - return vname - - # - # semantics of the view - # - - def checkpoint(self): - """A checkpoint is a file that is created in the course of - materializing a view. This function is the latest checkpoint - from which the the relation can be materialized.""" - if self.storeMe: return self.storedFile() - else: return self.unstoredCheckpoint() - - def unstoredCheckpoint(self): - """Checkpoint for this view, if the storeMe flag is not set.""" - assert False, 'abstract method called' - - def checkpointPlan(self): - """A plan to produce checkpoint(). 
Plans are constructed with the - help of the planner, and steps in the plan are executed by - delegation, thru the planner, to methods of this class named - doFoo.""" - if self.storeMe: return self.storagePlan() - else: return self.unstoredCheckpointPlan() - - def unstoredCheckpointPlan(self): - """Plan to produce the checkpoint for this view, if the - storeMe flag is not set.""" - assert False, 'abstract method called' - - def rowGenerator(self): - """A generator for the rows in this relation.""" - if self.storeMe and (self.tag in self.planner.alreadyStored): - for line in sys.stdin: - yield self.planner._serializer.fromString(line.strip()) - else: - for row in self.unstoredRowGenerator(): - yield row - - def explanation(self): - """Return an explanation of how rows are generated.""" - if self.storeMe and (self.tag in self.planner.viewsPlannedToExist): - return ['read %s with %s' % (self.storedFile(),self.tag)] - else: - return self.unstoredExplanation() - - def unstoredExplanation(self): - """Return an explanation of how rows were generated, ignoring caching issues.""" - assert False,'abstract method called' - - def storagePlan(self): - """A plan to materialize the relation. """ - if self.storeMe and self.tag in self.planner.viewsPlannedToExist: - return Plan() - else: - #WARNING: these computations have to be done in the right order, since planning has the side effect of updating - #the filePlannedToExist predicate. - # 1) build the pre-plan, to set up the view's checkpoints - plan = self.unstoredCheckpointPlan() - # 2a) compute the next step of the plan, along with the explanation - step = Step(self,'doStoreRows',self.unstoredCheckpoint(),self.storedFile(), - why=self.explanation(), - existingViews=set(self.planner.viewsPlannedToExist)) #shallow copy of current state - result = plan.extend( step ) - # 2b) if necessary, add a step to upload the - - # 3) Record that this file has been stored for lated calls to explanation() and storagePlan() - logging.debug('marking %s as planned-to-exist' % self.tag) - self.planner.viewsPlannedToExist.add(self.tag) - # 4) return the result - return result - - def doStoreRows(self): - """Called by planner at execution time to store the rows of the view.""" - for row in self.rowGenerator(): - print self.planner._serializer.toString(row) - - # - # traversing and defining views - # - - def innerViews(self): - """List of all views that are used as direct inputs.""" - return [] - - def nonInnerPrereqViews(self): - """List any non-inner views that need to be created before the view is executed.""" - return [] - - def __or__(self,otherView): - """Overload the pipe operator x | y to return with y, with x as its inner view.""" - otherView.acceptInnerView(self) - return otherView - - def acceptInnerView(self,otherView): - """Replace an appropriate input view with otherView. 
To be - used with the pipe operator.""" - assert False,'abstract method called' - - # - # meta plans - sequence of store commands - # - def metaplan(self,previouslyExistingViews): - plannedViews = set(previouslyExistingViews) #copy - return self._metaplanTraverse(plannedViews) + [self.tag] - - def _metaplanTraverse(self,plannedViews): - mplan = [] - try: - sideInnerviews = self.sideviews - except AttributeError: - sideInnerviews = [] - for inner in self.innerViews() + sideInnerviews: - if not inner.tag in plannedViews: - mplan += inner._metaplanTraverse(plannedViews) - if inner.storeMe: - mplan += [inner.tag] - plannedViews.add(inner.tag) - return mplan - - # - # printing views - # - - def pprint(self,depth=0,alreadyPrinted=None,sideview=False): - """Print a readable representation of the view.""" - if alreadyPrinted==None: alreadyPrinted = set() - tabStr = '| ' * depth - tagStr = str(self.tag) - sideviewIndicator = '*' if sideview else '' - if self in alreadyPrinted: - print tabStr + sideviewIndicator + tagStr + ' = ' + '...' - else: - sideViewInfo = " sideviews: {"+",".join(map(lambda x:x.tag, self.nonInnerPrereqViews()))+"}" if self.nonInnerPrereqViews() else "" - print tabStr + sideviewIndicator + tagStr + ' = ' + str(self) + sideViewInfo - alreadyPrinted.add(self) - for inner in self.innerViews(): - inner.pprint(depth+1,alreadyPrinted) - try: - for inner in self.sideviews: - inner.pprint(depth+1,alreadyPrinted,sideview=True) - except AttributeError: - pass - - @staticmethod - def asTag(view): - """Helper for printing views.""" - if not view: return '(null view)' - elif view.tag: return view.tag - else: return str(view) - -# -# abstract view types -# - -class Reader(View): - """Wrapper around a stored file relation.""" - - def __init__(self,src): - View.__init__(self) - self.src = src - - def unstoredCheckpoint(self): - return self.src - - def unstoredCheckpointPlan(self): - return Plan() - - def unstoredExplanation(self): - return [ 'read %s with %s' % (str(self.src),self.tag) ] - -class Transformation(View): - """A relation that takes an inner relation and processes in a - stream-like way, including operators like project, flatten, - select.""" - - def __init__(self,inner=None): - View.__init__(self) - self.inner = inner - - def innerViews(self): - return [self.inner] - - def nonInnerPrereqViews(self): - assert self.inner, 'no inner view defined for ' + str(self) - return self.inner.nonInnerPrereqViews() - - def acceptInnerView(self,otherView): - assert not self.inner,'An inner view is defined for '+self.tag+' so you cannot use it as RHS of a pipe' - self.inner = otherView - - # The transformation will stream through the inner relation, - # and produce a new version, so the latest checkpoint and - # plan to produce it are delegated to the inner View. 
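To make this delegation concrete, here is a minimal planner script in the tutorial's word-count style (the corpus file name is illustrative): the Flatten transformation streams rows straight out of ReadLines, so only the Group at the end introduces a checkpoint of its own.

import sys
from guineapig import *

class WordCount(Planner):
    # each line is split into words, which are then grouped and counted
    wc = ReadLines('corpus.txt') \
        | Flatten(by=lambda line: line.strip().split()) \
        | Group(by=lambda w: w, reducingTo=ReduceToCount())

if __name__ == "__main__":
    WordCount().main(sys.argv)

Running `python wordcount.py --store wc` would then materialize the counts in gpig_views/wc.gp.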
- - def unstoredCheckpoint(self): - return self.inner.checkpoint() - - def unstoredCheckpointPlan(self): - plan = Plan() - plan.append(self.inner.checkpointPlan()) - return plan - - def unstoredExplanation(self): - return self.inner.explanation() + [ 'transform to %s' % self.tag ] - -class MapReduce(View): - """A view that takes an inner relation and processes in a - map-reduce-like way.""" - - def __init__(self,inners,retaining): - View.__init__(self) - self.inners = inners - self.retainedPart = retaining - - def innerViews(self): - return self.inners - - def nonInnerPrereqViews(self): - result = [] - for inner in self.inners: - result += inner.nonInnerPrereqViews() - return result - - def reduceInputFile(self): - ## the checkpoint is the reducer input file - return self.planner.opts['viewdir'] + '/' + self.tag + '.gpri' - - @staticmethod - def isReduceInputFile(fileName): - return fileName.endswith('.gpri') - - def unstoredCheckpoint(self): - return self.reduceInputFile() - - def unstoredCheckpointPlan(self): - plan = Plan() - for inner in self.inners: - plan = plan.append(inner.checkpointPlan()) - return plan.append(self.mapPlan()) - - def innerCheckpoints(self): - result = [] - for inner in self.inners: - result += [inner.checkpoint()] - return result - - def mapPlan(self): - log.error("abstract method not implemented") - - def doStoreKeyedRows(self,subview,key,index): - """Utility method to compute keys and store key-value pairs. Usually - used as the main step in a mapPlan. """ - for row in subview.rowGenerator(): - keyStr = self.planner._serializer.toString(key(row)) - rrow = self.retainedPart(row) if self.retainedPart else row - valStr = self.planner._serializer.toString(rrow) - if index<0: - print "%s\t%s" % (keyStr,valStr) - else: - print "%s\t%d\t%s" % (keyStr,index,valStr) - -############################################################################## -# -# concrete View classes -# -############################################################################## - -class ReadLines(Reader): - """ Returns the lines in a file, as python strings.""" - - def __init__(self,src): - Reader.__init__(self,src) - - def unstoredRowGenerator(self): - for line in sys.stdin: - yield line - - def __str__(self): - return 'ReadLines("%s")' % self.src + self.showExtras() - -class ReadCSV(Reader): - """ Returns the lines in a CSV file, converted to Python tuples.""" - - def __init__(self,src,**kw): - Reader.__init__(self,src) - self.kw = kw - - def unstoredRowGenerator(self): - for tup in csv.reader(sys.stdin,**self.kw): - yield tup - - def __str__(self): - return 'ReadCVS("%s",%s)' % (self.src,str(self.kw)) + self.showExtras() - - -class ReplaceEach(Transformation): - """ In 'by=f'' f is a python function that takes a row and produces - its replacement.""" - - def __init__(self,inner=None,by=lambda x:x): - Transformation.__init__(self,inner) - self.replaceBy = by - - def unstoredRowGenerator(self): - for row in self.inner.rowGenerator(): - yield self.replaceBy(row) - - def unstoredExplanation(self): - return self.inner.explanation() + [ 'replaced to %s' % self.tag ] - - def __str__(self): - return 'ReplaceEach(%s, by=%s)' % (View.asTag(self.inner),str(self.replaceBy)) + self.showExtras() - -class Augment(Transformation): - - def __init__(self,inner=None,sideviews=None,sideview=None,loadedBy=lambda v:list(GPig.rowsOf(v))): - Transformation.__init__(self,inner) - assert not (sideviews and sideview), 'cannot specify both "sideview" and "sideviews"' - self.sideviews = list(sideviews) if sideviews 
else [sideview] - self.loader = loadedBy - assert self.loader,'must specify a "loadedBy" function for Augment' - - def nonInnerPrereqViews(self): - return self.inner.nonInnerPrereqViews() + self.sideviews - - def unstoredRowGenerator(self): - augend = self.loader(*self.sideviews) - for row in self.inner.rowGenerator(): - yield (row,augend) - - def unstoredCheckpointPlan(self): - plan = Plan() - for sv in self.sideviews: - plan = plan.append(sv.storagePlan()) - plan = plan.extend(Step(sv, 'DISTRIBUTE')) - plan.append(self.inner.checkpointPlan()) - return plan - - def unstoredExplanation(self): - return self.inner.explanation() + [ 'augmented to %s' % self.tag ] - - def __str__(self): - sideviewTags = loaderTag = '*UNSPECIFIED*' - if self.sideviews!=None: - sideviewTags = ",".join(map(View.asTag,self.sideviews)) - if self.loader!=None: - loaderTag = str(self.loader) - return 'Augment(%s,sideviews=%s,loadedBy=s%s)' % (View.asTag(self.inner),sideviewTags,loaderTag) + self.showExtras() - - -class Format(ReplaceEach): - """ Like ReplaceEach, but output should be a string, and it will be be - stored as strings, ie without using the serializer.""" - - def __init__(self,inner=None,by=lambda x:str(x)): - ReplaceEach.__init__(self,inner,by) - - def doStoreRows(self): - for row in self.rowGenerator(): - print row - - def __str__(self): - return 'Format(%s, by=%s)' % (View.asTag(self.inner),str(self.replaceBy)) + self.showExtras() - -class Flatten(Transformation): - """ Example: - def idwordGen(row): - for w in row['words']: yield (row['id'],w) - x = gp.Flatten(y, by=idwordGen(row)) - - In 'by=f', f is a python function that takes a row and yields - multiple new rows. """ - - def __init__(self,inner=None,by=None): - Transformation.__init__(self,inner) - self.flattenBy = by - - def unstoredRowGenerator(self): - for row in self.inner.rowGenerator(): - for flatrow in self.flattenBy(row): - yield flatrow - - def unstoredExplanation(self): - return self.inner.explanation() + [ 'flatten to %s' % self.tag ] - - def __str__(self): - return 'Flatten(%s, by=%s)' % (View.asTag(self.inner),str(self.flattenBy)) + self.showExtras() - -class Filter(Transformation): - """Filter out a subset of rows that match some predicate.""" - - def __init__(self,inner=None,by=lambda x:x): - Transformation.__init__(self,inner) - self.filterBy = by - - def unstoredRowGenerator(self): - for row in self.inner.rowGenerator(): - if self.filterBy(row): - yield row - - def unstoredExplanation(self): - return self.inner.explanation() + [ 'filtered to %s' % self.tag ] - - def __str__(self): - return 'Filter(%s, by=%s)' % (View.asTag(self.inner),str(self.filterBy)) + self.showExtras() - -class Distinct(MapReduce): - """Remove duplicate rows.""" - - def __init__(self,inner=None,retaining=None): - MapReduce.__init__(self,[inner],retaining) - self.inner = inner - - def acceptInnerView(self,otherView): - assert not self.inner,'An inner view is defined for '+self.tag+' so you cannot use it as RHS of a pipe' - self.inner = otherView - self.inners = [self.inner] - - def mapPlan(self): - step = Step(self, 'doDistinctMap', self.inner.checkpoint(), self.checkpoint(), prereduce=True, - why=self.explanation(), - existingViews=set(self.planner.viewsPlannedToExist)) #shallow of current state - return Plan().extend(step) - - def doDistinctMap(self): - # called by groupMapAndSortStep - self.inner.doStoreRows() - - def unstoredRowGenerator(self): - """Extract distinct elements from a sorted list.""" - lastval = None - for line in sys.stdin: - valStr = 
line.strip() - val = self.planner._serializer.fromString(valStr) - if val != lastval and lastval: - yield lastval - lastval = val - if lastval: - yield lastval - - def unstoredExplanation(self): - return self.inner.explanation() + [ 'make distinct to %s' % self.tag] - - def __str__(self): - return 'Distinct(%s)' % (View.asTag(self.inner)) + self.showExtras() - -class Group(MapReduce): - """Group by some property of a row, defined with the 'by' option. - Default outputs are tuples (x,[r1,...,rk]) where the ri's are rows - that have property values of x.""" - - def __init__(self,inner=None,by=lambda x:x,reducingTo=ReduceToList(),retaining=None): - MapReduce.__init__(self,[inner],retaining) - self.inner = inner - self.groupBy = by - self.reducingTo = reducingTo - - def acceptInnerView(self,otherView): - assert not self.inner,'An inner view is defined for '+self.tag+' so you cannot use it as RHS of a pipe' - self.inner = otherView - self.inners = [self.inner] - - def mapPlan(self): - step = Step(self, 'doGroupMap',self.inner.checkpoint(),self.checkpoint(),prereduce=True, - why=self.explanation(), - existingViews=set(self.planner.viewsPlannedToExist)) #shallow copy of current state - return Plan().extend(step) - - def doGroupMap(self): - # called by groupMapAndSortStep - self.doStoreKeyedRows(self.inner,self.groupBy,-1) - - def unstoredRowGenerator(self): - """Group objects from stdin by key, yielding tuples (key,[g1,..,gn]).""" - lastkey = key = None - accum = self.reducingTo.baseType() - for line in sys.stdin: - keyStr,valStr = line.strip().split("\t") - key = self.planner._serializer.fromString(keyStr) - val = self.planner._serializer.fromString(valStr) - if key != lastkey and lastkey!=None: - yield (lastkey,accum) - accum = self.reducingTo.baseType() - accum = self.reducingTo.reduceBy(accum, val) - lastkey = key - if key: - yield (key,accum) - - def unstoredExplanation(self): - return self.inner.explanation() + ['group to %s' % self.tag] - - def __str__(self): - return 'Group(%s,by=%s,reducingTo=%s)' % (View.asTag(self.inner),str(self.groupBy),str(self.reducingTo)) + self.showExtras() - -class Join(MapReduce): - """Outputs tuples of the form (row1,row2,...rowk) where - rowi is from the i-th join input, and the rowi's have the same - value of the property being joined on.""" - - def __init__(self,*joinInputs): - #sets self.inners - MapReduce.__init__(self,map(lambda x:x.view, joinInputs),None) - self.joinInputs = joinInputs - #re-interpret the 'outer' join parameters - semantically - #if jin[i] is outer, then all other inputs must be marked as _padWithNulls - if any(map(lambda jin:jin.outer, self.joinInputs)): - assert len(self.joinInputs)==2,'outer joins are only supported on two-way joins '+str(self.joinInputs) - for i in range(len(self.joinInputs)): - if self.joinInputs[i].outer: - j = 1-i #the other index - self.joinInputs[j]._padWithNulls = True - - def acceptInnerView(self,otherView): - assert self.unpairedJoinBy, 'join cannot be RHS of a pipe' - #assert self.unpairedJoinBy, 'join can only be RHS of a pipe if it contains a "by" argument not inside a "Jin" join-input' - #self.joinInputs = joinInputs + [Jin(otherView,by=unpairedJoinBy)] - #self.inners = map(lambda x:x.view, self.joinInputs) - - def mapPlan(self): - previousCheckpoints = self.innerCheckpoints() - midfile = self.planner.opts['viewdir'] + '/' + self.tag+'.gpmo' - step = Step(self, 'doJoinMap', src=previousCheckpoints, dst=self.checkpoint(), prereduce=True, hasIndex=True, mid=midfile, - 
existingViews=set(self.planner.viewsPlannedToExist), #shallow copy of current state - why=self.explanation()) - return Plan().extend(step) - - def doJoinMap(self,i): - # called by joinMapPlan with argument index, and stdin pointing to previousCheckpoints[index] - self.doStoreKeyedRows(self.joinInputs[i].view,self.joinInputs[i].joinBy,i) - - def unstoredRowGenerator(self): - """Group objects from stdin by key, yielding tuples (row1,row2,...).""" - lastkey = None - lastIndex = len(self.joinInputs)-1 - somethingProducedForLastKey = False - #accumulate a list of lists of all non-final inputs - accumList = [ [] for i in range(lastIndex) ] - for line in sys.stdin: - keyStr,indexStr,valStr = line.strip().split("\t") - key = self.planner._serializer.fromString(keyStr) - index = int(indexStr) - val = self.planner._serializer.fromString(valStr) - if key != lastkey and lastkey!=None: - #if the final join is marked as _padWithNulls, clear - #the accumulators, since we're doing an outer join - #with the last view - if self.joinInputs[lastIndex]._padWithNulls and not somethingProducedForLastKey: - for tup in self._joinAccumulatedValuesTo(accumList,lastIndex,None): - yield tup - #reset the accumulators, since they pertain to the - accumList = [ [] for i in range(lastIndex) ] - somethingProducedForLastKey = False - if index!=lastIndex: - #accumulate values to use in the join - accumList[index] = accumList[index] + [val] - else: - #produce tuples that match the key for the last view - for tup in self._joinAccumulatedValuesTo(accumList,lastIndex,val): - somethingProducedForLastKey = True - yield tup - lastkey = key - - def _joinAccumulatedValuesTo(self,accumList,lastIndex,finalVal): - # _padWithNulls as needed - for i in range(lastIndex): - if self.joinInputs[i]._padWithNulls and not accumList[i]: - accumList[i] = [None] - tupbuf = [ None for i in range(lastIndex+1) ] #holds output - tupbuf[lastIndex] = finalVal - for i in range(lastIndex): - for a in accumList[i]: - tupbuf[i] = a - if i==lastIndex-1 and any(tupbuf): - yield tuple(tupbuf) - - def unstoredExplanation(self): - innerEx = [] - for inner in self.inners: - if innerEx: innerEx += ['THEN'] - innerEx += inner.explanation() - return innerEx + [ 'FINALLY join to %s' % self.tag ] - - def __str__(self): - return "Join(%s)" % ",".join(map(str,self.joinInputs)) + self.showExtras() - -class JoinTo(Join): - """Special case of Join which can be used as the RHS of a pipe operator.""" - - def __init__(self,joinInput,by=None): - Join.__init__(self,Jin(None,by),joinInput) - - def acceptInnerView(self,otherView): - self.joinInputs[0].view = otherView - self.inners[0] = otherView - -############################################################################## -# -# the top-level planner, and its supporting classes -# -############################################################################## - -class Plan(object): - """A plan constructed by a GuineaPig.""" - - def __init__(self): self.steps = [] - - def extend(self,step): - self.steps += [step] - return self - - def append(self,subPlan): - self.steps += subPlan.steps - return self - - def execute(self,gp,echo=False): - script = self.compile(gp) - for shellcom in script: - if echo: print 'calling:',shellcom - subprocess.check_call(shellcom,shell=True) - - def compile(self,gp): - """Return a list of strings that can be run as shell commands.""" - script = [] - i = 0 - while (i such that - items should be partitioned by key and sorted by index - - why is documentation/explanation. 
""" - - self.view = view - self.whatToDo = whatToDo - self.existingViews = existingViews - self.src = src - self.dst = dst - self.prereduce = prereduce - self.hasIndex = hasIndex - self.mid = mid - self.why = why - - def __str__(self): - return repr(self) - - def __repr__(self): - return "Step('%s','%s',src=%s,dst='%s',prereduce=%s,mid=%s,why=%s,existingViews=%s)" \ - % (self.view.tag,self.whatToDo,repr(self.src), - self.dst,repr(self.prereduce),repr(self.mid),repr(self.explain()),repr(self.existingViews)) - - def explain(self): - """Convert an explanation - which is a list of strings - into a string""" - return "...".join(self.why) - - # actual code generation for the steps - - class HadoopCommand(object): - def __init__(self,gp,view): - self.invocation = [GPig.HADOOP_LOC,'jar',gp.opts['streamJar']] - self.defs = [] - self.args = [] - self.files = [] - for f in gp._shippedFiles: - self.files += ['-file',f] - for v in view.nonInnerPrereqViews(): - self.files += ['-file',v.distributableFile()] - def append(self,*toks): - self.args += list(toks) - def appendDef(self,*toks): - self.defs += list(toks) - def asEcho(self): - return " ".join(['echo','hadoop'] + self.args + ['...']) - def asString(self): - return " ".join(self.invocation+self.defs+self.files+self.args) - - def subplanHeader(self,reduceStep=None): - """Generate an explanatory header for a step.""" - if not reduceStep: return ['#', 'echo map '+self.view.tag + ': '+self.explain()] - else: return ['#', 'echo map/reduce '+self.view.tag+ '/'+ reduceStep.view.tag + ': '+reduceStep.explain()] - - - def coreCommand(self,gp): - """Python command to call an individual plan step.""" - return 'python %s --view=%s --do=%s' % (gp._gpigSourceFile,self.view.tag,self.whatToDo) + self.coreCommandOptions(gp) - - def ithCoreCommand(self,gp,i): - """Like coreCommand but allows index parameter to 'do' option""" - return 'python %s --view=%s --do=%s.%d' % (gp._gpigSourceFile,self.view.tag,self.whatToDo,i) + self.coreCommandOptions(gp) - - def coreCommandOptions(self,gp): - paramOpts = '' if not gp.param else " --params " + ",".join(map(lambda(k,v):k+':'+v, gp.param.items())) - alreadyStoredOpts = '' if not self.existingViews else " --alreadyStored "+",".join(self.existingViews) - nonDefaults = [] - for (k,v) in gp.opts.items(): - #pass in non-default options, or options computed from the environment - if (gp.opts[k] != GPig.DEFAULT_OPTS[k]) or ((k in GPig.COMPUTED_OPTION_DEFAULTS) and (gp.opts[k] != GPig.COMPUTED_OPTION_DEFAULTS[k])): - nonDefaults += ["%s:%s" % (k,str(v))] - optsOpts = '' if not nonDefaults else " --opts " + ",".join(nonDefaults) - return paramOpts + optsOpts + alreadyStoredOpts - - def hadoopClean(self,gp,fileName): - """A command to remove a hdfs directory if it exists.""" - return '(%s fs -test -e %s && %s fs -rmr %s) || echo no need to remove %s' % (GPig.HADOOP_LOC,fileName, GPig.HADOOP_LOC,fileName, fileName) - - def distribute(self,gp): - """Make a view availablefor use as a side view.""" - localCopy = self.view.distributableFile() - maybeRemoteCopy = self.view.storedFile() - echo = 'echo making a local copy of %s in %s' % (maybeRemoteCopy,localCopy) - if gp.opts['target']=='hadoop': - return [echo, 'rm -f %s' % localCopy, '%s fs -getmerge %s %s' % (GPig.HADOOP_LOC,maybeRemoteCopy, localCopy)] - else: - return [echo, 'cp -f %s %s || echo warning: the copy failed!' 
% (maybeRemoteCopy,localCopy)] - - def mapOnlySubscript(self,gp): - """A subplan for a mapper-only step.""" - if gp.opts['target']=='shell': - command = None - if self.src: command = self.coreCommand(gp) + ' < %s > %s' % (self.src,self.dst) - else: command = self.coreCommand(gp) + (' > %s' % (self.dst)) - return self.subplanHeader() + [command] - elif gp.opts['target']=='hadoop': - assert self.src,'Wrap not supported for hadoop' - hcom = self.HadoopCommand(gp,self.view) - hcom.appendDef('-D','mapred.reduce.tasks=0') - hcom.append('-input',self.src,'-output',self.dst) - hcom.append("-mapper '%s'" % self.coreCommand(gp)) - return self.subplanHeader() + [ hcom.asEcho(), self.hadoopClean(gp,self.dst), hcom.asString() ] - else: - assert False - - def mapReduceSubscript(self,reduceStep,gp): - """A subplan for a map-reduce step followed by a reduce, where the map has one input.""" - if gp.opts['target']=='shell': - command = self.coreCommand(gp) + (' < %s' % self.src) + ' | sort -k1 | '+reduceStep.coreCommand(gp) + (' > %s' % reduceStep.dst) - return self.subplanHeader(reduceStep) + [command] - elif gp.opts['target']=='hadoop': - hcom = self.HadoopCommand(gp,self.view) - hcom.appendDef('-D','mapred.reduce.tasks=%d' % gp.opts['parallel']) - hcom.append('-input',self.src, '-output',reduceStep.dst) - hcom.append("-mapper '%s'" % self.coreCommand(gp)) - hcom.append("-reducer '%s'" % reduceStep.coreCommand(gp)) - return self.subplanHeader(reduceStep) + [ hcom.asEcho(), self.hadoopClean(gp,reduceStep.dst), hcom.asString() ] - else: - assert False - - def multiMapReduceSubscript(self,reduceStep,gp): - """A subplan for a map-reduce step followed by a reduce, where the map has many inputs.""" - if gp.opts['target']=='shell': - subplan = ['rm -f %s' % self.mid] - for i in range(len(self.src)): - subplan += [ self.ithCoreCommand(gp,i) + ' < %s >> %s' % (self.src[i],self.mid) ] - sortOpts = '-k1,2' if self.hasIndex else '-k1' - subplan += [ 'sort ' + sortOpts + ' < ' + self.mid + ' | ' + reduceStep.coreCommand(gp) + (' > %s' % reduceStep.dst)] - return self.subplanHeader(reduceStep) + subplan - elif gp.opts['target']=='hadoop': - def midi(i): return self.mid + '-' + str(i) - subplan = [] - for i in range(len(self.src)): - hcom = self.HadoopCommand(gp,self.view) - hcom.appendDef('-D','mapred.reduce.tasks=%d' % gp.opts['parallel']) - hcom.append('-input',self.src[i], '-output',midi(i)) - hcom.append("-mapper","'%s'" % self.ithCoreCommand(gp,i)) - subplan += [ self.hadoopClean(gp,midi(i)), hcom.asEcho(), hcom.asString() ] - hcombineCom = self.HadoopCommand(gp,self.view) - hcombineCom.appendDef('-D','mapred.reduce.tasks=%d' % gp.opts['parallel']) - if (self.hasIndex): - hcombineCom.appendDef('-jobconf','stream.num.map.output.key.fields=3') - hcombineCom.appendDef('-jobconf','num.key.fields.for.partition=1') - for i in range(len(self.src)): - hcombineCom.append('-input',midi(i)) - hcombineCom.append('-output',reduceStep.dst) - hcombineCom.append('-mapper','cat') - hcombineCom.append('-reducer',"'%s'" % reduceStep.coreCommand(gp)) - if (self.hasIndex): - hcombineCom.append('-partitioner','org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner') - subplan += [ self.hadoopClean(gp,reduceStep.dst), hcombineCom.asEcho(), hcombineCom.asString() ] - return self.subplanHeader(reduceStep) + subplan - else: - assert False - -class RowSerializer(object): - """Saves row objects to disk and retrieves them.""" - def __init__(self,target): - self._target = target - self._reprInverse = None - def toString(self,x): - return 
repr(x) - def fromString(self,s): - if self._reprInverse: return self._reprInverse(s) - else: return eval(s) - -# -# the planner -# - -class Planner(object): - """Can create storage plans for views that are defined as parts of it.""" - - def __init__(self,**kw): - - #parameters are used for programmatically give user-defined - #config information to a planner, or they can be specified in - #the command-line - self.param = kw - for (key,val) in GPig.getArgvParams().items(): - # don't override non-null values specified in the constructor - if self.param.get(key)==None: - self.param[key] = val - - #opts are used for giving options to the planner from the shell - self.opts = GPig.getArgvOpts() - for (key,val) in GPig.DEFAULT_OPTS.items(): - if (not key in self.opts): self.opts[key] = val - for (key,type) in GPig.DEFAULT_OPT_TYPES.items(): - self.opts[key] = type(self.opts[key]) - #the viewsPlannedToExist is set using the "--reuse" option at - #planning time, and incrementally added to as the plan is with - #commands that actually store a view. - self.viewsPlannedToExist = set() - self._serializer = RowSerializer(self.opts['target']) - - #views that aren't associated with class variable, but are - #instead named automatically - ie, inner views with no - #user-provided names. - self._autoNamedViews = {} - - #by default, use info-level logging at planning time - if not Planner.partOfPlan(sys.argv): - logging.basicConfig(level=logging.INFO) - - #hadoop needs to know where to give the main script file, - #as well as the guineapig.py file it uses - self._gpigSourceFile = sys.argv[0] - self._shippedFiles = ['guineapig.py',self._gpigSourceFile] - - def setup(self): - """Initialize planner, and views used by the planner. This has to be - done after the planner is fully configured by adding views.""" - - # make sure view directory is valid - if self.opts['target']=='shell' and not os.path.exists(self.opts['viewdir']): - logging.info('creating view directory ' + self.opts['viewdir']) - os.makedirs(self.opts['viewdir']) - elif self.opts['target']=='hadoop': - p = urlparse.urlparse(self.opts['viewdir']) - if not p.path.startswith("/"): - logging.warn('hadoop viewdir should be absolite path: will try prefixing /user/$LOGNAME') - username = os.environ.get('LOGNAME','me') - self.opts['viewdir'] = '/user/'+username+'/'+self.opts['viewdir'] - logging.warn('viewdir is set to '+self.opts['viewdir']) - - # Add 'tag' and planner fields to each view - for vname in self.listViewNames(): - v = self.getView(vname) - v.tag = vname - v.planner = self - def tagUnnamedViews(v,basename,index,depth): - assert v,'null inner view for '+basename - if not v.planner: - v.planner = self - autoname = '%s_%d_%s' % (basename,depth,index) - self._setView(autoname,v) - for i,inner in enumerate(v.innerViews() + v.nonInnerPrereqViews()): - tagUnnamedViews(inner,vname,i,depth+1) - for vname in self.listViewNames(): - v = self.getView(vname) - for i,inner in enumerate(v.innerViews() + v.nonInnerPrereqViews()): - tagUnnamedViews(inner,vname,i,1) - - # Add caching options as needed - - # a mapreduce step can't use a reduce-input as a checkpoint - # so introduce caching as needed - for vname in self.listViewNames(): - v = self.getView(vname) - if isinstance(v,MapReduce): - for inner in v.inners: - innerCheckpoint = inner.checkpoint() - if innerCheckpoint and MapReduce.isReduceInputFile(innerCheckpoint): - if not inner.storeMe: - logging.info('making %s stored, to make possible a downstream map-reduce view' % inner.tag) - inner.storeMe = True 
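As a sketch of the situation this guards against (the view and file names here are hypothetical), chaining one map-reduce view directly into another would leave the inner view checkpointed only by its .gpri reduce input, so setup() marks it stored first:

import sys
from guineapig import *

class TwoPass(Planner):
    data = ReadLines('events.txt')
    # byUser's natural checkpoint is a .gpri reduce-input file, so the
    # loop above sets byUser.storeMe = True before sizes consumes it
    byUser = Group(data, by=lambda line: line.split()[0])
    sizes = Group(byUser, by=lambda (user, rows): len(rows),
                  reducingTo=ReduceToCount())

if __name__ == "__main__":
    TwoPass().main(sys.argv)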
-
-        # you can't combine computation of an Augment with its inner
-        # view, because then the inner view would also need access to
-        # the Augment's side views, which isn't guaranteed
-        for vname in self.listViewNames():
-            v = self.getView(vname)
-            if isinstance(v,Augment):
-                if not v.inner.storeMe:
-                    logging.info('making %s stored, to make possible a downstream augment view' % v.inner.tag)
-                    v.inner.storeMe = True
-
-        #cache anything used more than twice
-        numberOfTimesUsed = collections.defaultdict(int)
-        for vname in self.listViewNames():
-            v = self.getView(vname)
-            for inner in v.innerViews() + v.nonInnerPrereqViews():
-                numberOfTimesUsed[inner] += 1
-        for (v,n) in numberOfTimesUsed.items():
-            if n>1 and v.storeMe==None:
-                logging.info('making %s stored because it might be used %d times' % (v.tag,n))
-                v.storeMe = True
-
-        #mark non-inner prereq views as storeMe = 'distributedCache'
-        for vname in self.listViewNames():
-            v = self.getView(vname)
-            for inner in v.nonInnerPrereqViews():
-                inner.storeMe = 'distributedCache'
-
-    #
-    # rest of the API for the planner
-    #
-
-    @staticmethod
-    def partOfPlan(argv):
-        """True if the command line was generated as part of a storage plan."""
-        return any(s.startswith("--do") for s in argv)
-
-    def main(self,argv):
-        """Run a main that lets you --store a view, as well as doing a few other things."""
-        self.setup()
-        self.runMain(argv)
-
-    def runMain(self,argv):
-
-        # parse the options and dispatch appropriately
-        argspec = ["list", "pprint=", "plan=", "metaplan=",
-                   "store=", "cat=",
-                   "reuse", "alreadyStored=",
-                   "params=", "opts=", "do=", "view="]
-        optlist,args = getopt.getopt(argv[1:], 'x', argspec)
-        optdict = dict(optlist)
-
-        # decide what views can be re-used, vs which need fresh plans
-        if '--reuse' in optdict: #reuse the views listed in the arguments
-            for a in args:
-                vname = View.viewNameFor(a)
-                v = self.getView(vname)
-                if v:
-                    self.viewsPlannedToExist.add(v.tag)
-                    logging.info("re-using data stored for view "+vname+": "+str(v))
-                else:
-                    logging.warn("cannot re-use view "+vname+" since it's not used in this script")
-
-        #choose the main action to take
-        if '--plan' in optdict: #print a storage plan
-            rel = self.getView(optdict['--plan'],mustExist=True)
-            plan = rel.storagePlan()
-            print "\n".join(plan.compile(self))
-            return
-        elif '--pprint' in optdict: #print a view
-            rel = self.getView(optdict['--pprint'],mustExist=True)
-            rel.pprint()
-            return
-        elif '--metaplan' in optdict:
-            rel = self.getView(optdict['--metaplan'],mustExist=True)
-            print rel.metaplan(self.viewsPlannedToExist)
-            return
-        elif '--store' in optdict: #store a view
-            rel = self.getView(optdict['--store'],mustExist=True)
-            plan = rel.storagePlan()
-            plan.execute(self, echo=self.opts['echo'])
-            return
-        elif '--cat' in optdict: #store and then print a view
-            assert self.opts['target']=='shell','cannot do --cat except in shell mode'
-            rel = self.getView(optdict['--cat'],mustExist=True)
-            plan = rel.storagePlan()
-            plan.execute(self, self.opts['echo'])
-            for line in open(rel.storedFile(),'r'):
-                print line,
-            return
-        elif '--list' in optdict: #list named views
-            for vname in self.listViewNames():
-                print '  ',vname,'\t',self.getView(vname)
-            return
-        elif '--do' in optdict: #run an internally-generated action
-            #recover what should be stored when this action is performed
-            self.alreadyStored = set(optdict.get('--alreadyStored','').split(','))
-            #work out what view to use and what routine to call
-            rel = self.getView(optdict['--view'],mustExist=True)
-            whatToDo = optdict['--do']
-            #work out the method given by 'do' and call it - note it
-            #may have a single integer argument, eg doJoinMap.1
-            k = whatToDo.find(".")
-            if k<0:
-                whatToDoMethod = getattr(rel,whatToDo)
-                whatToDoMethod()
-            else:
-                arg = int(whatToDo[k+1:])
-                whatToDo = whatToDo[:k]
-                whatToDoMethod = getattr(rel,whatToDo)
-                whatToDoMethod(arg)
-            return
-        else:
-            print 'usage: --[store|pprint|plan|cat] view [--opts key:val,...] [--params key:val,...] [--reuse view1 view2 ...]'
-            print '       --[list]'
-            print 'current legal keys for "opts", with default values:'
-            for (key,val) in GPig.DEFAULT_OPTS.items():
-                print '  %s:%s' % (key,str(val))
-            print 'There\'s more help at http://curtis.ml.cmu.edu/w/courses/index.php/Guinea_Pig'
-
-    def getView(self,str,mustExist=False):
-        """Find the defined relation named str, and if necessary bind its
-        planner and tag appropriately."""
-        v = self.__class__.__dict__.get(str) or self.__dict__.get(str) or self._autoNamedViews.get(str)
-        if mustExist: assert v,'cannot find a view named '+str
-        return v
-
-    def _setView(self,str,view):
-        """Internal use only: allow the view to be retrieved by name later."""
-        view.tag = str
-        self._autoNamedViews[str] = view
-
-    def listViewNames(self):
-        def namedViews(d): return [vname for vname in d.keys() if isinstance(self.getView(vname),View)]
-        userNamedViews = namedViews(self.__class__.__dict__) + namedViews(self.__dict__)
-        return userNamedViews + self._autoNamedViews.keys()
-
-    #
-    # dealing with the file storage system and related stuff
-    #
-
-    def ship(self,*fileNames):
-        """Declare a set of inputs to be 'shipped' to the hadoop cluster."""
-        self._shippedFiles += fileNames
-
-    def setSerializer(self,serializer):
-        """Replace the default serializer with another RowSerializer object."""
-        self._serializer = serializer
-        return self
-
-    def setReprInverseFun(self,reprInverse):
-        """Specify a function which will deserialize a string that was produced
-        by Python's 'repr' function."""
-        self._serializer._reprInverse = reprInverse
-        return self
-
-if __name__ == "__main__":
-    print 'There\'s help at http://curtis.ml.cmu.edu/w/courses/index.php/Guinea_Pig'
diff --git a/tutorial/guineapig1_2.py b/tutorial/guineapig1_2.py
deleted file mode 100644
index e4fb776..0000000
--- a/tutorial/guineapig1_2.py
+++ /dev/null
@@ -1,1284 +0,0 @@
-##############################################################################
-# (C) Copyright 2014 William W. Cohen. All rights reserved.
-##############################################################################
-
-import sys
-import logging
-import copy
-import subprocess
-import collections
-import os
-import os.path
-import urlparse
-import getopt
-import csv
-
-###############################################################################
-# helper functions and data structures
-###############################################################################
-
-class GPig(object):
-    """Collection of utilities for Guinea Pig."""
-
-    HADOOP_LOC = 'hadoop' #assume hadoop is on the path at planning time
-    MY_LOC = 'guineapig1_2.py'
-
-    #global options for Guinea Pig can be passed in with the --opts
-    #command-line option, and these are the default values
-    defaultJar = '/usr/lib/hadoop/contrib/streaming/hadoop-streaming-1.2.0.1.3.0.0-107.jar'
-    envjar = os.environ.get('GP_STREAMJAR', defaultJar)
-    DEFAULT_OPTS = {'streamJar': envjar,
-                    'parallel':5,
-                    'target':'shell',
-                    'echo':0,
-                    'viewdir':'gpig_views',
-                    }
-    #these are the types of each option that has a non-string value
-    DEFAULT_OPT_TYPES = {'parallel':int,'echo':int}
-    #we need to pass non-default options in to mappers and reducers,
-    #but since the remote worker's environment can be different, we
-    #also need to pass in options computed from the environment
-    COMPUTED_OPTION_DEFAULTS = {'streamJar':defaultJar}
-
-    @staticmethod
-    def getArgvParams():
-        """Return a dictionary holding the argument of the --params option in
-        sys.argv."""
-        return GPig.getArgvDict('--params')
-
-    @staticmethod
-    def getArgvOpts():
-        """Return a dictionary holding the argument of the --opts option in
-        sys.argv."""
-        return GPig.getArgvDict('--opts')
-
-    @staticmethod
-    def getArgvDict(optname):
-        """Return a dictionary of parameter values that were defined on the command line
-        via an option like '--params filename:foo.txt,basedir:/tmp/glob/'.
-        """
-        assert optname.startswith('--')
-        for i,a in enumerate(sys.argv):
-            if a==optname:
-                paramString = sys.argv[i+1]
-                return dict(pair.split(":") for pair in paramString.split(","))
-        return {}
-
-    @staticmethod
-    def rowsOf(view):
-        """Iterate over the rows in a view."""
-        for line in open(view.distributableFile()):
-            yield view.planner._serializer.fromString(line.strip())
-
-    @staticmethod
-    def onlyRowOf(view):
-        """Return the first row in a side view, and raise an error if it
-        is not the only row of the view."""
-        result = None
-        logging.info('loading '+view.distributableFile())
-        for line in open(view.distributableFile()):
-            assert not result,'multiple rows in stored file for '+view.tag
-            result = view.planner._serializer.fromString(line.strip())
-        return result
-
-class Jin(object):
-    """Object to hold description of a single join input."""
-
-    def __init__(self,view,by=(lambda x:x),outer=False):
-        self.view = view
-        self.joinBy = by
-        self.outer = outer
-        self._padWithNulls = False
-
-    def __str__(self):
-        viewStr = View.asTag(self.view) if self.view else '_'
-        outerStr = ',outer=True' if self.outer else ''
-        padStr = ',_padWithNulls=True' if self._padWithNulls else ''
-        return "Jin(%s,by=%s%s%s)" % (viewStr,self.joinBy,outerStr,padStr)
-
-class ReduceTo(object):
-    """An object x that can be the argument of a reducingTo=x
-    parameter in a Group view."""
-    def __init__(self,baseType,by=lambda accum,val:accum+val):
-        self.baseType = baseType
-        self.reduceBy = by
-
-class ReduceToCount(ReduceTo):
-    """Produce the count of the number of objects that would be placed in a group."""
-    def __init__(self):
-        ReduceTo.__init__(self,int,by=lambda accum,val:accum+1)
-
-class ReduceToSum(ReduceTo):
-    """Produce the sum of the objects - which must be numbers - that would
-    be placed in a group."""
-    def __init__(self):
-        ReduceTo.__init__(self,int,by=lambda accum,val:accum+val)
-
-class ReduceToList(ReduceTo):
-    """Produce a list of the objects that would be placed in a group."""
-    def __init__(self):
-        ReduceTo.__init__(self,list,by=lambda accum,val:accum+[val])
-
-###############################################################################
-# abstract views
-##############################################################################
-
-class View(object):
-    """A definition of a relation for Guinea Pig. A View object can
-    produce a storagePlan(), which can then be executed to produce the
-    contents of the relation. Intuitively, a relation is an
-    unordered bag of rows, and a row is an almost-arbitrary python data
-    structure. (It must be something that can be stored and retrieved
-    by the RowSerializer.)
-
-    Steps in the storagePlan are executed by delegation, thru the
-    planner, to methods of a View class named doFoo.
-    """
-
-    def __init__(self):
-        """The planner and tag must be set before this is used."""
-        self.planner = None #pointer to planner object
-        self.tag = None #for naming storedFile and checkpoints
-        self.storeMe = None #try and store this view if true
-        self.retainedPart = None #used in map-reduce views only
-        self.sideviews = [] #non-empty for Augment views only
-        self.inners = [] #always used
-
-    #self.inner is shortcut for inners[0]
-    def _getInner(self): return self.inners[0]
-    def _setInner(self,val): self.inners = [val]
-    inner = property(_getInner,_setInner)
-
-    #
-    # ways to modify a view
-    #
-
-    def opts(self,stored=None):
-        """Return the same view with options set appropriately. Possible
-        options include:
-
-        - stored=True - Explicitly store this view on disk whenever
-        it is used in another view's definition. This might be set
-        by the user for debugging purposes, or by the planner,
-        to prevent incorrect optimizations. Generally "inner"
-        views are not explicitly stored.
-
-        - stored='distributedCache' - Store this view in the working
-        directory and/or the Hadoop distributed cache.
-        """
-
-        self.storeMe = stored
-        return self
-
-    def showExtras(self):
-        """Printable representation of the options for a view."""
-        result = ''
-        flagPairs = []
-        if self.storeMe: flagPairs += ['stored=%s' % repr(self.storeMe)]
-        if flagPairs: result += '.opts(' + ",".join(flagPairs) + ')'
-        return result
-
-    #
-    # how the view is saved on disk
-    #
-
-    def storedFile(self):
-        """The file that will hold the materialized relation."""
-        return self.planner.opts['viewdir'] + '/' + self.tag + '.gp'
-
-    def distributableFile(self):
-        """The file that will hold the materialized relation in the working directory
-        in preparation to be uploaded to the distributed cache."""
-        return self.tag + '.gp'
-
-    @staticmethod
-    def viewNameFor(fileName):
-        """The view associated with the given file name"""
-        vname = os.path.basename(fileName)
-        if vname.endswith(".gp"): vname = vname[0:-len(".gp")]
-        return vname
-
-    #
-    # semantics of the view
-    #
-
-    def checkpoint(self):
-        """A checkpoint is an intermediate computation for the view, which is
-        saved on disk. The rowGenerator() for the view will assume
-        that the checkpoint is available.
-        """
-        assert False, 'abstract method called'
-
-    def checkpointPlan(self):
-        """A plan to produce checkpoint()."""
-        assert False, 'abstract method called'
-
-    def rowGenerator(self):
-        """A generator for the rows in this relation, which assumes existence
-        of the checkpoint."""
-        assert False, 'abstract method called'
-
-    def explanation(self):
-        """Return an explanation of how rows are generated."""
-        assert False, 'abstract method called'
-
-    def storagePlan(self):
-        """A plan to store the view."""
-        return self.planner.buildRecursiveStoragePlan(self)
-
-    def nonrecursiveStoragePlan(self):
-        """Materialize the relation, assuming that there are no descendent
-        inner views that need to be materialized first."""
-        plan = self.checkpointPlan()
-        result = plan.extend(Step(self,'doStoreRows',self.checkpoint(),self.storedFile(),why=self.explanation()))
-        return result
-
-    def applyDict(self,mapping,innerviewsOnly=False):
-        """Given a mapping from view tags to views, replace every inner view with
-        the appropriate value from the mapping, and return the result."""
-        if self.tag in mapping and not innerviewsOnly:
-            return mapping[self.tag]
-        elif not self.inners:
-            return self
-        else:
-            result = copy.copy(self)
-            result.inners = map(lambda v:v.applyDict(mapping), self.inners)
-            return result
-
-    def sideviewsNeeded(self):
-        """Sideviews needed by this view."""
-        result = []
-        for sv in self.sideviews:
-            result += [sv]
-        for v in self.inners:
-            result += list(v._sideviewsOfDescendants())
-        return result
-
-    def _sideviewsOfDescendants(self):
-        if not self.storeMe:
-            for sv in self.sideviews:
-                yield sv
-            for v in self.inners:
-                for sv in v._sideviewsOfDescendants():
-                    yield sv
-
-    def enforceStorageConstraints(self):
-        """Subclass this, if there are constraints on when one must explicitly
-        store inner views."""
-        pass
-
-    def doStoreRows(self):
-        """Called by planner at execution time to store the rows of the view."""
-        for row in self.rowGenerator():
-            print self.planner._serializer.toString(row)
-
-    #
-    # support the "pipe" syntax: view1 | view2
-    #
-
-    def __or__(self,otherView):
-        """Overload the pipe operator x | y to return y, with x as its inner view."""
-        otherView.acceptInnerView(self)
-        return otherView
-
-    def acceptInnerView(self,otherView):
-        """Replace an appropriate input view with otherView. This is subclassed to
-        implement the pipe operator."""
-        assert not self.inner,'An inner view is defined for '+self.tag+' so you cannot use it as RHS of a pipe'
-        self.inner = otherView #subclass if needed
-
-    #
-    # printing views
-    #
-
-    def pprint(self,depth=0,alreadyPrinted=None,sideview=False):
-        """Print a readable representation of the view."""
-        if alreadyPrinted==None: alreadyPrinted = set()
-        tabStr = '| ' * depth
-        tagStr = str(self.tag)
-        sideviewIndicator = '*' if sideview else ''
-        if self.tag in alreadyPrinted:
-            print tabStr + sideviewIndicator + tagStr + ' = ' + '...'
-        else:
-            sideviewInfo = "  sideviews: {"+",".join(map(lambda x:x.tag, self.sideviews))+"}" if self.sideviews else ""
-            print tabStr + sideviewIndicator + tagStr + ' = ' + str(self) + sideviewInfo
-            alreadyPrinted.add(self.tag)
-            for inner in self.inners:
-                inner.pprint(depth+1,alreadyPrinted)
-            for inner in self.sideviews:
-                inner.pprint(depth+1,alreadyPrinted,sideview=True)
-
-    @staticmethod
-    def asTag(view):
-        """Helper for printing views."""
-        if not view: return '(null view)'
-        elif view.tag: return view.tag
-        else: return str(view)
-
-#
-# abstract view types
-#
-
-class Reader(View):
-    """Read data stored on the file system and make it look like a View."""
-
-    def __init__(self,src):
-        View.__init__(self)
-        self.src = src
-        self.inners = []
-
-    def checkpoint(self):
-        return self.src
-
-    def checkpointPlan(self):
-        return Plan() #empty plan
-
-    def explanation(self):
-        return [ 'read %s with %s' % (str(self.src),self.tag) ]
-
-    def acceptInnerView(self,otherView):
-        assert False, "Reader views cannot be used as RHS of a pipe"
-
-class Transformation(View):
-    """Streaming transformation on a single inner view."""
-
-    def __init__(self,inner=None):
-        View.__init__(self)
-        self.inner = inner
-
-    # A transformation will stream on-the-fly through the inner
-    # relation, and produce a new version, so the checkpoint and plan
-    # to produce it are delegated to the inner View.
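Since checkpoint() and checkpointPlan() are simply delegated, a chain of Transformations compiles into a single streaming pass: rows flow through one pipeline and nothing is written to disk between the stages. A small illustrative script (the view names and input file are hypothetical):

from guineapig import *
import sys

class Cleanup(Planner):
    # lower and short are fused into one streaming command over
    # input.txt; only a view that is actually stored writes a file
    lower = ReplaceEach(ReadLines('input.txt'), by=lambda line:line.lower())
    short = Filter(lower, by=lambda line:len(line)<80)

if __name__ == "__main__":
    Cleanup().main(sys.argv)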
-
-    def checkpoint(self):
-        return self.inner.checkpoint()
-
-    def checkpointPlan(self):
-        return self.inner.checkpointPlan()
-
-    def explanation(self):
-        return self.inner.explanation() + [ 'transform to %s' % self.tag ]
-
-class MapReduce(View):
-    """A view that takes an inner relation and processes it in a
-    map-reduce-like way."""
-
-    def __init__(self,inners,retaining):
-        View.__init__(self)
-        self.inners = inners
-        self.retainedPart = retaining
-
-    def _isReduceInputFile(self,fileName):
-        return fileName.endswith('.gpri')
-
-    def checkpoint(self):
-        ## the checkpoint is the reducer input file
-        return self.planner.opts['viewdir'] + '/' + self.tag + '.gpri'
-
-    def checkpointPlan(self):
-        plan = Plan()
-        for inner in self.inners:
-            plan = plan.append(inner.checkpointPlan())
-        return plan.append(self.mapPlan())
-
-    def enforceStorageConstraints(self):
-        for inner in self.inners:
-            innerChkpt = inner.checkpoint()
-            #optimizations break if you chain two map-reduces together
-            if innerChkpt and innerChkpt.endswith(".gpri"):
-                if not inner.storeMe:
-                    logging.info('making %s stored, to make possible a downstream map-reduce view' % inner.tag)
-                    inner.storeMe = True
-
-    def mapPlan(self):
-        logging.error("abstract method not implemented")
-
-    def doStoreKeyedRows(self,subview,key,index):
-        """Utility method used by concrete map-reduce classes to compute keys
-        and store key-value pairs. Usually used as the main step in a
-        mapPlan. """
-        for row in subview.rowGenerator():
-            keyStr = self.planner._serializer.toString(key(row))
-            rrow = self.retainedPart(row) if self.retainedPart else row
-            valStr = self.planner._serializer.toString(rrow)
-            if index<0:
-                print "%s\t%s" % (keyStr,valStr)
-            else:
-                print "%s\t%d\t%s" % (keyStr,index,valStr)
-
-##############################################################################
-#
-# concrete View classes
-#
-##############################################################################
-
-class ReuseView(Reader):
-    """Returns the objects in a previously stored view."""
-
-    def __init__(self,view):
-        if isinstance(view,View):
-            Reader.__init__(self,view.storedFile())
-            self.tag = "reuse_"+view.tag
-            self.reusedViewTag = view.tag
-            self.planner = view.planner
-        else:
-            assert False,'user-defined ReuseView not supported (yet)'
-
-    def rowGenerator(self):
-        for line in sys.stdin:
-            yield self.planner._serializer.fromString(line.strip())
-
-    def __str__(self):
-        return 'ReuseView("%s")' % self.src + self.showExtras()
-
-
-class ReadLines(Reader):
-    """ Returns the lines in a file, as python strings."""
-
-    def __init__(self,src):
-        Reader.__init__(self,src)
-
-    def rowGenerator(self):
-        for line in sys.stdin:
-            yield line
-
-    def __str__(self):
-        return 'ReadLines("%s")' % self.src + self.showExtras()
-
-class ReadCSV(Reader):
-    """ Returns the lines in a CSV file, converted to Python tuples."""
-
-    def __init__(self,src,**kw):
-        Reader.__init__(self,src)
-        self.kw = kw
-
-    def rowGenerator(self):
-        for tup in csv.reader(sys.stdin,**self.kw):
-            yield tup
-
-    def __str__(self):
-        return 'ReadCSV("%s",%s)' % (self.src,str(self.kw)) + self.showExtras()
-
-
-class ReplaceEach(Transformation):
-    """ In 'by=f', f is a python function that takes a row and produces
-    its replacement."""
-
-    def __init__(self,inner=None,by=lambda x:x):
-        Transformation.__init__(self,inner)
-        self.replaceBy = by
-
-    def rowGenerator(self):
-        for row in self.inner.rowGenerator():
-            yield self.replaceBy(row)
-
-    def explanation(self):
-        return self.inner.explanation() + [ 'replaced to %s' % self.tag ]
-
-    def __str__(self):
-        return 'ReplaceEach(%s, by=%s)' % (View.asTag(self.inner),str(self.replaceBy)) + self.showExtras()
-
-class Augment(Transformation):
-
-    def __init__(self,inner=None,sideviews=None,sideview=None,loadedBy=lambda v:list(GPig.rowsOf(v))):
-        Transformation.__init__(self,inner)
-        assert not (sideviews and sideview), 'cannot specify both "sideview" and "sideviews"'
-        self.sideviews = list(sideviews) if sideviews else [sideview]
-        self.loader = loadedBy
-        assert self.loader,'must specify a "loadedBy" function for Augment'
-
-    def enforceStorageConstraints(self):
-        for sv in self.sideviews:
-            sv.storeMe = 'distributedCache'
-
-    def rowGenerator(self):
-        augend = self.loader(*self.sideviews)
-        for row in self.inner.rowGenerator():
-            yield (row,augend)
-
-    def checkpointPlan(self):
-        plan = Plan()
-        plan.append(self.inner.checkpointPlan())
-        #the sideviews should have been stored by the top-level
-        #planner already, but they will need to be moved to a
-        #distributable location
-        for sv in self.sideviews:
-            plan.extend(Step(sv, 'DISTRIBUTE'))
-        return plan
-
-    def explanation(self):
-        return self.inner.explanation() + [ 'augmented to %s' % self.tag ]
-
-    def __str__(self):
-        sideviewTags = loaderTag = '*UNSPECIFIED*'
-        if self.sideviews!=None: sideviewTags = ",".join(map(View.asTag,self.sideviews))
-        if self.loader!=None: loaderTag = str(self.loader)
-        return 'Augment(%s,sideviews=%s,loadedBy=%s)' % (View.asTag(self.inner),sideviewTags,loaderTag) + self.showExtras()
-
-
-class Format(ReplaceEach):
-    """ Like ReplaceEach, but output should be a string, and it will be
-    stored as strings, ie without using the serializer."""
-
-    def __init__(self,inner=None,by=lambda x:str(x)):
-        ReplaceEach.__init__(self,inner,by)
-
-    def __str__(self):
-        return 'Format(%s, by=%s)' % (View.asTag(self.inner),str(self.replaceBy)) + self.showExtras()
-
-    def doStoreRows(self):
-        for row in self.rowGenerator():
-            print row
-
-class Flatten(Transformation):
-    """ Like ReplaceEach, but output of 'by' is an iterable, and all
-    results will be returned.
-    """
-
-    def __init__(self,inner=None,by=None):
-        Transformation.__init__(self,inner)
-        self.flattenBy = by
-
-    def rowGenerator(self):
-        for row in self.inner.rowGenerator():
-            for flatrow in self.flattenBy(row):
-                yield flatrow
-
-    def explanation(self):
-        return self.inner.explanation() + [ 'flatten to %s' % self.tag ]
-
-    def __str__(self):
-        return 'Flatten(%s, by=%s)' % (View.asTag(self.inner),str(self.flattenBy)) + self.showExtras()
-
-class Filter(Transformation):
-    """Keep only the subset of rows that match some predicate."""
-
-    def __init__(self,inner=None,by=lambda x:x):
-        Transformation.__init__(self,inner)
-        self.filterBy = by
-
-    def rowGenerator(self):
-        for row in self.inner.rowGenerator():
-            if self.filterBy(row):
-                yield row
-
-    def explanation(self):
-        return self.inner.explanation() + [ 'filtered to %s' % self.tag ]
-
-    def __str__(self):
-        return 'Filter(%s, by=%s)' % (View.asTag(self.inner),str(self.filterBy)) + self.showExtras()
-
-class Distinct(MapReduce):
-    """Remove duplicate rows."""
-
-    def __init__(self,inner=None,retaining=None):
-        MapReduce.__init__(self,[inner],retaining)
-
-    def mapPlan(self):
-        step = Step(self, 'doDistinctMap', self.inner.checkpoint(), self.checkpoint(), prereduce=True, why=self.explanation())
-        return Plan().extend(step)
-
-    def rowGenerator(self):
-        """Extract distinct elements from a sorted list."""
-        lastval = None
-        for line in sys.stdin:
-            valStr = line.strip()
-            val = self.planner._serializer.fromString(valStr)
-            if val != lastval and lastval:
-                yield lastval
-            lastval = val
-        if lastval:
-            yield lastval
-
-    def explanation(self):
-        return self.inner.explanation() + [ 'make distinct to %s' % self.tag]
-
-    def __str__(self):
-        return 'Distinct(%s)' % (View.asTag(self.inner)) + self.showExtras()
-
-    def doDistinctMap(self):
-        self.inner.doStoreRows()
-
-
-class Group(MapReduce):
-    """Group by some property of a row, defined with the 'by' option.
-    Default outputs are tuples (x,[r1,...,rk]) where the ri's are rows
-    that have 'by' values of x."""
-
-    def __init__(self,inner=None,by=lambda x:x,reducingTo=ReduceToList(),retaining=None):
-        MapReduce.__init__(self,[inner],retaining)
-        self.groupBy = by
-        self.reducingTo = reducingTo
-
-    def mapPlan(self):
-        step = Step(self, 'doGroupMap',self.inner.checkpoint(),self.checkpoint(),prereduce=True,why=self.explanation())
-        return Plan().extend(step)
-
-    def rowGenerator(self):
-        """Group objects from stdin by key, yielding tuples (key,[g1,..,gn])."""
-        lastkey = key = None
-        accum = self.reducingTo.baseType()
-        for line in sys.stdin:
-            keyStr,valStr = line.strip().split("\t")
-            key = self.planner._serializer.fromString(keyStr)
-            val = self.planner._serializer.fromString(valStr)
-            if key != lastkey and lastkey!=None:
-                yield (lastkey,accum)
-                accum = self.reducingTo.baseType()
-            accum = self.reducingTo.reduceBy(accum, val)
-            lastkey = key
-        if key:
-            yield (key,accum)
-
-    def explanation(self):
-        return self.inner.explanation() + ['group to %s' % self.tag]
-
-    def __str__(self):
-        return 'Group(%s,by=%s,reducingTo=%s)' % (View.asTag(self.inner),str(self.groupBy),str(self.reducingTo)) + self.showExtras()
-
-    def doGroupMap(self):
-        self.doStoreKeyedRows(self.inner,self.groupBy,-1)
-
-class Join(MapReduce):
-    """Outputs tuples of the form (row1,row2,...rowk) where
-    rowi is from the i-th join input, and the rowi's have the same
-    value of the property being joined on."""
-
-    def __init__(self,*joinInputs):
-        #sets self.inners
-        MapReduce.__init__(self,map(lambda x:x.view, joinInputs),None)
-        self.joinInputs = joinInputs
-        #re-interpret the 'outer' join parameters - semantically
-        #if jin[i] is outer, then all other inputs must be marked as _padWithNulls
-        if any(map(lambda jin:jin.outer, self.joinInputs)):
-            assert len(self.joinInputs)==2,'outer joins are only supported on two-way joins '+str(self.joinInputs)
-            for i in range(len(self.joinInputs)):
-                if self.joinInputs[i].outer:
-                    j = 1-i #the other index
-                    self.joinInputs[j]._padWithNulls = True
-
-    def acceptInnerView(self,otherView):
-        assert False, 'join cannot be RHS of a pipe - use JoinTo instead'
-
-    def mapPlan(self):
-        innerCheckpoints = map(lambda v:v.checkpoint(), self.inners)
-        midfile = self.planner.opts['viewdir'] + '/' + self.tag+'.gpmo'
-        step = Step(self, 'doJoinMap', src=innerCheckpoints, dst=self.checkpoint(), prereduce=True, hasIndex=True, mid=midfile, why=self.explanation())
-        return Plan().extend(step)
-
-    def applyDict(self,mapping,innerviewsOnly=False):
-        result = MapReduce.applyDict(self,mapping,innerviewsOnly=innerviewsOnly)
-        #also need to map over the join inputs
-        if isinstance(result,Join):
-            for i in range(len(result.joinInputs)):
-                result.joinInputs[i].view = result.inners[i]
-        return result
-
-    def rowGenerator(self):
-        """Group objects from stdin by key, yielding tuples (row1,row2,...)."""
-        lastkey = None
-        lastIndex = len(self.joinInputs)-1
-        somethingProducedForLastKey = False
-        #accumulate a list of lists of all non-final inputs
-        accumList = [ [] for i in range(lastIndex) ]
-        for line in sys.stdin:
-            keyStr,indexStr,valStr = line.strip().split("\t")
-            key = self.planner._serializer.fromString(keyStr)
-            index = int(indexStr)
-            val = self.planner._serializer.fromString(valStr)
-            if key != lastkey and lastkey!=None:
-                #if the final join is marked as _padWithNulls, clear
-                #the accumulators, since we're doing an outer join
-                #with the last view
-                if self.joinInputs[lastIndex]._padWithNulls and not somethingProducedForLastKey:
-                    for tup in self._joinAccumulatedValuesTo(accumList,lastIndex,None):
-                        yield tup
-                #reset the accumulators, since they pertain to the previous key
-                accumList = [ [] for i in range(lastIndex) ]
-                somethingProducedForLastKey = False
-            if index!=lastIndex:
-                #accumulate values to use in the join
-                accumList[index] = accumList[index] + [val]
-            else:
-                #produce tuples that match the key for the last view
-                for tup in self._joinAccumulatedValuesTo(accumList,lastIndex,val):
-                    somethingProducedForLastKey = True
-                    yield tup
-            lastkey = key
-
-    def _joinAccumulatedValuesTo(self,accumList,lastIndex,finalVal):
-        # _padWithNulls as needed
-        for i in range(lastIndex):
-            if self.joinInputs[i]._padWithNulls and not accumList[i]:
-                accumList[i] = [None]
-        tupbuf = [ None for i in range(lastIndex+1) ] #holds output
-        tupbuf[lastIndex] = finalVal
-        for i in range(lastIndex):
-            for a in accumList[i]:
-                tupbuf[i] = a
-                if i==lastIndex-1 and any(tupbuf):
-                    yield tuple(tupbuf)
-
-    def explanation(self):
-        innerEx = []
-        for inner in self.inners:
-            if innerEx: innerEx += ['THEN']
-            innerEx += inner.explanation()
-        return innerEx + [ 'FINALLY join to %s' % self.tag ]
-
-    def __str__(self):
-        return "Join(%s)" % ",".join(map(str,self.joinInputs)) + self.showExtras()
-
-    def doJoinMap(self,i):
-        # called by joinMapPlan with argument index, and stdin pointing to innerCheckpoints[index]
-        self.doStoreKeyedRows(self.joinInputs[i].view,self.joinInputs[i].joinBy,i)
-
-class JoinTo(Join):
-    """Special case of Join which can be used as the RHS of a pipe operator."""
-
-    def __init__(self,joinInput,by=None):
-        Join.__init__(self,Jin(None,by),joinInput)
-
-    def acceptInnerView(self,otherView):
-        self.joinInputs[0].view = otherView
-        self.inners[0] = otherView
-
-##############################################################################
-#
-# the top-level planner, and its supporting classes
-#
-##############################################################################
-
-class Plan(object):
-    """A plan constructed by a GuineaPig."""
-
-    def __init__(self): self.steps = []
-
-    def extend(self,step):
-        self.steps += [step]
-        return self
-
-    def append(self,subPlan):
-        self.steps += subPlan.steps
-        return self
-
-    def execute(self,gp,echo=False):
-        script = self.compile(gp)
-        for shellcom in script:
-            if echo: print 'calling:',shellcom
-            subprocess.check_call(shellcom,shell=True)
-
-    def compile(self,gp):
-        """Return a list of strings that can be run as shell commands."""
-        script = []
-        i = 0
-        while (i<len(self.steps)):
-            s = self.steps[i]
-            i += 1
-            if s.whatToDo=='DISTRIBUTE':
-                script += s.distributeCommands(gp)
-            elif not s.prereduce:
-                script += s.mapOnlyCommands(gp)
-            else:
-                reduceStep = self.steps[i]
-                i += 1
-                if isinstance(s.src,list):
-                    script += s.multiMapReduceCommands(reduceStep,gp)
-                else:
-                    script += s.mapReduceCommands(reduceStep,gp)
-        return script
-
-class Step(object):
-    """A single step of the plans produced by the planner."""
-    # - view is the view stored by the step, and whatToDo is the name
-    # of the view method (doFoo) that the step invokes
-    # - src and dst are the step's input and output files
-    # - mid is a file for intermediate mapper output, used when a
-    # reduce has several map inputs
-    # - prereduce means dst is a reducer input; with hasIndex the
-    # mapper outputs tuples <key,index,value> such that
-    # items should be partitioned by key and sorted by index
-    # - why is documentation/explanation.
-    # - reused is list of tags of views that should be reused.
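For orientation before the details below: in 'shell' mode, coreCommand() renders a step as a re-invocation of the user's script, so a map-only step for a hypothetical view 'lower' defined in myscript.py compiles to something like

    python myscript.py --view=lower --do=doStoreRows < input.txt > gpig_views/lower.gp

while a prereduce step and its reduce step are glued together with 'sort -k1' (see mapReduceCommands); in 'hadoop' mode the same command strings become the -mapper and -reducer arguments of a streaming-jar invocation.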
- - def __init__(self,view,whatToDo,src=None,dst=None,prereduce=False,hasIndex=False,mid=None,why=[],reused=[]): - self.view = view - self.whatToDo = whatToDo - self.src = src - self.dst = dst - self.prereduce = prereduce - self.hasIndex = hasIndex - self.mid = mid - self.why = why - self.reused = reused - - def setReusedViews(self,views): - self.reused = list(views) - - def __str__(self): - return repr(self) - - def __repr__(self): - return "Step(%s,%s,src=%s,dst=%s,prereduce=%s,mid=%s,why=%s,reused=%s)" \ - % (repr(self.view.tag),repr(self.whatToDo),repr(self.src), - repr(self.dst),repr(self.prereduce),repr(self.mid),repr(self.explain()),repr(self.reused)) - - def explain(self): - """Convert an explanation - which is a list of strings - into a string""" - return "...".join(self.why) - - # - # subroutines of the general case for code generation - # - - class HadoopCommand(object): - def __init__(self,gp,*views): - logging.info('building hadoop command for '+str(map(lambda v:v.tag, views))) - self.invocation = [GPig.HADOOP_LOC,'jar',gp.opts['streamJar']] - self.defs = [] - self.args = [] - self.files = [] - for f in gp._shippedFiles: - self.files += ['-file',f] - for view in views: - viewsToShip = view.sideviewsNeeded() - if viewsToShip: - logging.info('shipping for '+view.tag+': '+str(map(lambda sv:sv.tag, viewsToShip))) - for sv in viewsToShip: - self.files += ['-file',sv.distributableFile()] - logging.info('files: '+str(self.files)) - def append(self,*toks): - self.args += list(toks) - def appendDef(self,*toks): - self.defs += list(toks) - def asEcho(self): - return " ".join(['echo','hadoop'] + self.args + ['...']) - def asString(self): - return " ".join(self.invocation+self.defs+self.files+self.args) - - def subplanHeader(self,reduceStep=None): - """Generate an explanatory header for a step.""" - if not reduceStep: return ['#', 'echo create '+self.view.tag + ' via map: '+self.explain()] - else: return ['#', 'echo create '+reduceStep.view.tag+' via map/reduce: '+reduceStep.explain()] - - def coreCommand(self,gp): - """Python command to call an individual plan step.""" - return 'python %s --view=%s --do=%s' % (gp._gpigSourceFile,self.view.tag,self.whatToDo) + self.coreCommandOptions(gp) - - def ithCoreCommand(self,gp,i): - """Like coreCommand but allows index parameter to 'do' option""" - return 'python %s --view=%s --do=%s.%d' % (gp._gpigSourceFile,self.view.tag,self.whatToDo,i) + self.coreCommandOptions(gp) - - def coreCommandOptions(self,gp): - paramOpts = '' if not gp.param else " --params " + ",".join(map(lambda(k,v):k+':'+v, gp.param.items())) - nonDefaults = [] - for (k,v) in gp.opts.items(): - #pass in non-default options, or options computed from the environment - if (gp.opts[k] != GPig.DEFAULT_OPTS[k]) or ((k in GPig.COMPUTED_OPTION_DEFAULTS) and (gp.opts[k] != GPig.COMPUTED_OPTION_DEFAULTS[k])): - nonDefaults += ["%s:%s" % (k,str(v))] - optsOpts = '' if not nonDefaults else " --opts " + ",".join(nonDefaults) - reuseOpts = '' if not self.reused else " --reuse "+ " ".join(self.reused) - return paramOpts + optsOpts + reuseOpts - - def hadoopClean(self,gp,fileName): - """A command to remove a hdfs directory if it exists.""" - #return '(%s fs -test -e %s && %s fs -rmr %s) || echo no need to remove %s' % (GPig.HADOOP_LOC,fileName, GPig.HADOOP_LOC,fileName, fileName) - return '(%s fs -test -e %s && %s fs -rmr %s) || echo no need to remove %s' % (GPig.HADOOP_LOC,fileName, GPig.HADOOP_LOC,fileName, fileName) - - # - # actual code generation for the steps - # - - # one special case - 
'distribute' a computed view, ie move to distributed cache - - def distributeCommands(self,gp): - """Special-purpose step: Make a view available for use as a side view.""" - localCopy = self.view.distributableFile() - maybeRemoteCopy = self.view.storedFile() - echoCom = 'echo DISTRIBUTE %s: making a local copy of %s in %s' % (self.view.tag,maybeRemoteCopy,localCopy) - if gp.opts['target']=='hadoop': - return [echoCom, 'rm -f %s' % localCopy, '%s fs -getmerge %s %s' % (GPig.HADOOP_LOC,maybeRemoteCopy, localCopy)] - else: - return [echoCom, 'cp -f %s %s || echo warning: the copy failed!' % (maybeRemoteCopy,localCopy)] - - - # one general case - a map-only step with only one input - - def mapOnlyCommands(self,gp): - """A subplan for a mapper-only step.""" - if gp.opts['target']=='shell': - command = None - if self.src: command = self.coreCommand(gp) + ' < %s > %s' % (self.src,self.dst) - else: command = self.coreCommand(gp) + (' > %s' % (self.dst)) - return self.subplanHeader() + [command] - elif gp.opts['target']=='hadoop': - assert self.src,'Wrap not supported for hadoop' - hcom = self.HadoopCommand(gp,self.view) - hcom.appendDef('-D','mapred.reduce.tasks=0') - hcom.append('-input',self.src,'-output',self.dst) - hcom.append("-mapper '%s'" % self.coreCommand(gp)) - return self.subplanHeader() + [ hcom.asEcho(), self.hadoopClean(gp,self.dst), hcom.asString() ] - else: - assert False - - # another general case - a map-reduce step - - def mapReduceCommands(self,reduceStep,gp): - """A subplan for a map-reduce step followed by a reduce, where the map has one input.""" - if gp.opts['target']=='shell': - command = self.coreCommand(gp) + (' < %s' % self.src) + ' | sort -k1 | '+reduceStep.coreCommand(gp) + (' > %s' % reduceStep.dst) - return self.subplanHeader(reduceStep) + [command] - elif gp.opts['target']=='hadoop': - hcom = self.HadoopCommand(gp,self.view,reduceStep.view) - hcom.appendDef('-D','mapred.reduce.tasks=%d' % gp.opts['parallel']) - hcom.append('-input',self.src,'-output',reduceStep.dst) - hcom.append("-mapper '%s'" % self.coreCommand(gp)) - hcom.append("-reducer '%s'" % reduceStep.coreCommand(gp)) - return self.subplanHeader(reduceStep) + [ hcom.asEcho(), self.hadoopClean(gp,reduceStep.dst), hcom.asString() ] - else: - assert False - - # another general case - a map-reduce step with multiple map inputs - - def multiMapReduceCommands(self,reduceStep,gp): - """A subplan for a map-reduce step followed by a reduce, where the map has many inputs.""" - if gp.opts['target']=='shell': - subplan = ['rm -f %s' % self.mid] - for i in range(len(self.src)): - subplan += [ self.ithCoreCommand(gp,i) + ' < %s >> %s' % (self.src[i],self.mid) ] - sortOpts = '-k1,2' if self.hasIndex else '-k1' - subplan += [ 'sort ' + sortOpts + ' < ' + self.mid + ' | ' + reduceStep.coreCommand(gp) + (' > %s' % reduceStep.dst)] - return self.subplanHeader(reduceStep) + subplan - elif gp.opts['target']=='hadoop': - def midi(i): return self.mid + '-' + str(i) - subplan = [] - for i in range(len(self.src)): - hcom = self.HadoopCommand(gp,self.view) - hcom.appendDef('-D','mapred.reduce.tasks=%d' % gp.opts['parallel']) - hcom.append('-input',self.src[i], '-output',midi(i)) - hcom.append("-mapper","'%s'" % self.ithCoreCommand(gp,i)) - subplan += [ self.hadoopClean(gp,midi(i)), hcom.asEcho(), hcom.asString() ] - hcombineCom = self.HadoopCommand(gp,reduceStep.view) - hcombineCom.appendDef('-D','mapred.reduce.tasks=%d' % gp.opts['parallel']) - if (self.hasIndex): - 
hcombineCom.appendDef('-jobconf','stream.num.map.output.key.fields=3') - hcombineCom.appendDef('-jobconf','num.key.fields.for.partition=1') - for i in range(len(self.src)): - hcombineCom.append('-input',midi(i)) - hcombineCom.append('-output',reduceStep.dst) - hcombineCom.append('-mapper','cat') - hcombineCom.append('-reducer',"'%s'" % reduceStep.coreCommand(gp)) - if (self.hasIndex): - hcombineCom.append('-partitioner','org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner') - subplan += [ self.hadoopClean(gp,reduceStep.dst), hcombineCom.asEcho(), hcombineCom.asString() ] - return self.subplanHeader(reduceStep) + subplan - else: - assert False - -class RowSerializer(object): - """Saves row objects to disk and retrieves them.""" - def __init__(self,target): - self._target = target - self._reprInverse = None - def toString(self,x): - return repr(x) - def fromString(self,s): - if self._reprInverse: return self._reprInverse(s) - else: return eval(s) - -# -# the planner -# - -class Planner(object): - """Can create storage plans for views that are defined as parts of it.""" - - def __init__(self,**kw): - - #parameters are used for programmatically give user-defined - #config information to a planner, or they can be specified in - #the command-line - self.param = kw - for (key,val) in GPig.getArgvParams().items(): - # don't override non-null values specified in the constructor - if self.param.get(key)==None: - self.param[key] = val - - #opts are used for giving options to the planner from the shell - self.opts = GPig.getArgvOpts() - for (key,val) in GPig.DEFAULT_OPTS.items(): - if (not key in self.opts): self.opts[key] = val - for (key,type) in GPig.DEFAULT_OPT_TYPES.items(): - self.opts[key] = type(self.opts[key]) - - #use serializer appropriate for the target - self._serializer = RowSerializer(self.opts['target']) - - #views that aren't associated with class variable, but are - #instead named automatically - ie, inner views with no - #user-provided names. - self._autoNamedViews = {} - - #by default, use info-level logging at planning time - if not Planner.partOfPlan(sys.argv): - logging.basicConfig(level=logging.INFO) - - #hadoop needs to know where to give the main script file, - #as well as the guineapig.py file it uses - self._gpigSourceFile = sys.argv[0] - self._shippedFiles = [GPig.MY_LOC,self._gpigSourceFile] - - def setup(self): - """Initialize planner, and views used by the planner. 
This has to be - done after the planner is fully configured by adding views.""" - - self.reusableViews = {} - - # make sure view directory is valid - if self.opts['target']=='shell' and not os.path.exists(self.opts['viewdir']): - logging.info('creating view directory ' + self.opts['viewdir']) - os.makedirs(self.opts['viewdir']) - elif self.opts['target']=='hadoop': - p = urlparse.urlparse(self.opts['viewdir']) - if not p.path.startswith("/"): - logging.warn('hadoop viewdir should be absolite path: will try prefixing /user/$LOGNAME') - username = os.environ.get('LOGNAME','me') - self.opts['viewdir'] = '/user/'+username+'/'+self.opts['viewdir'] - logging.warn('viewdir is set to '+self.opts['viewdir']) - - # Add 'tag' and planner fields to each view - for vname in self.listViewNames(): - v = self.getView(vname) - v.tag = vname - v.planner = self - def tagUnnamedViews(v,basename,index,depth): - assert v,'null inner view for '+basename - if not v.planner: - v.planner = self - autoname = '%s_%d_%s' % (basename,depth,index) - self._setView(autoname,v) - for i,inner in enumerate(v.inners + v.sideviews): - tagUnnamedViews(inner,vname,i,depth+1) - for vname in self.listViewNames(): - v = self.getView(vname) - for i,inner in enumerate(v.inners + v.sideviews): - tagUnnamedViews(inner,vname,i,1) - - # Add caching options as needed - for vname in self.listViewNames(): - v = self.getView(vname) - v.enforceStorageConstraints() - - # - # utils - # - - - def getView(self,str,mustExist=False): - """Find the defined relation named str, and if necessary bind its - planner and tag appropriately.""" - v = self.__class__.__dict__.get(str) or self.__dict__.get(str) or self._autoNamedViews.get(str) - if mustExist: assert v,'cannot find a view named '+str - return v - - def _setView(self,str,view): - """Internal use only: allow the view to be retreived by name later.""" - view.tag = str - self._autoNamedViews[str] = view - - def listViewNames(self): - def namedViews(d): return [vname for vname in d.keys() if isinstance(self.getView(vname),View)] - userNamedViews = namedViews(self.__class__.__dict__) + namedViews(self.__dict__) - return userNamedViews + self._autoNamedViews.keys() - - # - # planning - # - - def buildRecursiveStoragePlan(self,view): - """Called by view.storagePlan.""" - #figure out what to reuse - starting with what the user specified - storedViews = dict(self.reusableViews) - #also mark for eager storage anything that's used twice in the - #plan---i.e., anything that is consumed by two or more views - numParents = collections.defaultdict(int) - for dv in self._descendants(view): - for inner in dv.inners + dv.sideviews: - numParents[inner] += 1 - for (dv,n) in numParents.items(): - if n>1 and dv.storeMe==None: - logging.info('making %s stored because it is used %d times in creating %s' % (dv,n,view.tag)) - dv.storeMe = True - - #traverse view in pre-order and find a linear sequence of - #views to store, each of which requires only views earlier in - #the sequence - storageSeq = self._storageSeq(view,storedViews) + [view.tag] - logging.info('storage sequence is: ' + ",".join(storageSeq)) - - #splice together plans for each view in the sequence, - #after first modifying the view so that nothing is called - #directly, but only through the ReuseView proxies - plan = Plan() - for tag in storageSeq: - v = self.getView(tag,mustExist=True) - vm = v.applyDict(storedViews,innerviewsOnly=True) - subplan = vm.nonrecursiveStoragePlan() - #add the correct context of reused views to the subplan, - #so that that the 
actual definition of the view will be - #rewritten appropriately to include the new ReuseView - #proxy for it - viewsLocallyReused = self._reuseViewDescendants(vm) - for s in subplan.steps: - s.setReusedViews(viewsLocallyReused) - plan.append(subplan) - return plan - - def _reuseViewDescendants(self,view): - """Descendent views that are ReuseView's""" - result = set() - for dv in self._descendants(view): - if isinstance(dv,ReuseView): - result.add(dv.reusedViewTag) - return result - - def _descendants(self,view): - """Descendents of a view.""" - result = set() - result.add(view) - for inner in view.inners + view.sideviews: - result = result.union(self._descendants(inner)) - return result - - def _storageSeq(self,view,storedViews): - """Linear sequence of storage actions to take - as view tags.""" - seq = [] - for inner in view.inners + view.sideviews: - if not inner.tag in storedViews: - seq += self._storageSeq(inner,storedViews) - if inner.storeMe: - seq += [inner.tag] - storedViews[inner.tag] = ReuseView(inner) - return seq - - # - # dealing with the file storage system and related stuff - # - - def ship(self,*fileNames): - """Declare a set of inputs to be 'shipped' to the hadoop cluster.""" - self._shippedFiles += fileNames - - def setSerializer(self,serializer): - """Replace the default serializer another RowSerializer object.""" - self._serializer = serializer - return self - - def setReprInverseFun(self,reprInverse): - """Specify a function which will deserialize a string that was produced - by Python's 'repr' function.""" - self._serializer._reprInverse = reprInverse - return self - - # - # rest of the API for the planner - # - - @staticmethod - def partOfPlan(argv): - """True if the command line was generated as part of a storage plan.""" - return any(s.startswith("--do") for s in argv) - - def main(self,argv): - """Run a main that lets you --store a view, as well as doing a few other things.""" - self.setup() - self.runMain(argv) - - def runMain(self,argv): - - # parse the options and dispatch appropriately - argspec = ["store=", "cat=", "reuse", - "list", "pprint=", "steps=", "plan=", - "params=", "opts=", "do=", "view="] - optlist,args = getopt.getopt(argv[1:], 'x', argspec) - optdict = dict(optlist) - - # decide what views can be re-used, vs which need fresh plans - if '--reuse' in optdict: #reuse the views listed in the arguments - for a in args: - vname = View.viewNameFor(a) - v = self.getView(vname) - if v: - self.reusableViews[v.tag] = ReuseView(v) - logging.info("re-using data stored for view "+vname+": "+str(v)) - else: - logging.warn("cannot re-use view "+vname+" since it's not used in this script") - - #choose the main action to take - if '--store' in optdict: #store a view - rel = self.getView(optdict['--store'],mustExist=True) - plan = rel.storagePlan() - plan.execute(self, echo=self.opts['echo']) - return - elif '--pprint' in optdict: #print a view - rel = self.getView(optdict['--pprint'],mustExist=True) - rel.applyDict(self.reusableViews).pprint() - return - elif '--steps' in optdict: #print a view - rel = self.getView(optdict['--steps'],mustExist=True) - plan = rel.storagePlan() - for s in plan.steps: - print ' -',s - return - elif '--plan' in optdict: #print a storage plan - rel = self.getView(optdict['--plan'],mustExist=True) - plan = rel.storagePlan() - print "\n".join(plan.compile(self)) - return - elif '--cat' in optdict: #store and then print a view - assert self.opts['target']=='shell','cannot do --cat except in shell mode' - rel = 
self.getView(optdict['--cat'],mustExist=True) - plan = rel.storagePlan() - plan.execute(self, self.opts['echo']) - for line in open(rel.storedFile(),'r'): - print line, - return - elif '--list' in optdict: #list named views - for vname in self.listViewNames(): - print ' ',vname,'\t',self.getView(vname) - return - elif '--do' in optdict: #run an internally-generated action - #recover what should be stored when this action is performed - #work out what view to use and what routine to call - rel = self.getView(optdict['--view'],mustExist=True) - rel = rel.applyDict(self.reusableViews) - whatToDo = optdict['--do'] - #work out the method given by 'do' and call it - note it - #may have a single integer argument, eg doJoinMap.1 - k = whatToDo.find(".") - if k<0: - whatToDoMethod = getattr(rel,whatToDo) - whatToDoMethod() - else: - arg = int(whatToDo[k+1:]) - whatToDo = whatToDo[:k] - whatToDoMethod = getattr(rel,whatToDo) - whatToDoMethod(arg) - return - else: - print 'usage: --[store|pprint|steps|plan|cat] view [--opts key:val,...] [--params key:val,...] --reuse view1 view2 ...]' - print ' --[list]' - print 'current legal keys for "opts", with default values:' - for (key,val) in GPig.DEFAULT_OPTS.items(): - print ' %s:%s' % (key,str(val)) - print 'There\'s more help at http://curtis.ml.cmu.edu/w/courses/index.php/Guinea_Pig' - -if __name__ == "__main__": - print 'There\'s help at http://curtis.ml.cmu.edu/w/courses/index.php/Guinea_Pig' diff --git a/tutorial/guineapig1_3.py b/tutorial/guineapig1_3.py deleted file mode 100644 index 9b1fc5a..0000000 --- a/tutorial/guineapig1_3.py +++ /dev/null @@ -1,1384 +0,0 @@ -############################################################################## -# (C) Copyright 2014 William W. Cohen. All rights reserved. -############################################################################## - -import sys -import logging -import copy -import subprocess -import collections -import os -import os.path -import urlparse -import getopt -import csv - -############################################################################### -# helpers functions and data structures -############################################################################### - -class GPig(object): - """Collection of utilities for Guinea Pig.""" - - HADOOP_LOC = 'hadoop' #assume hadoop is on the path at planning time - MY_LOC = 'guineapig1_3.py' - - #global options for Guinea Pig can be passed in with the --opts - #command-line option, and these are the default values - defaultJar = '/usr/lib/hadoop/contrib/streaming/hadoop-streaming-1.2.0.1.3.0.0-107.jar' - envjar = os.environ.get('GP_STREAMJAR', defaultJar) - DEFAULT_OPTS = {'streamJar': envjar, - 'parallel':5, - 'target':'shell', - 'echo':0, - 'viewdir':'gpig_views', - } - #there are the types of each option that has a non-string value - DEFAULT_OPT_TYPES = {'parallel':int,'echo':int} - #we need to pass non-default options in to mappers and reducers, - #but since the remote worker's environment can be different, we - #also need to pass in options computed from the environment - COMPUTED_OPTION_DEFAULTS = {'streamJar':defaultJar} - - @staticmethod - def getCompiler(target): - if target=='shell': return ShellCompiler() - elif target=='hadoop': return HadoopCompiler() - else: assert 'illegal compilation target '+target - - @staticmethod - def getArgvParams(): - """Return a dictionary holding the argument of the --params option in - sys.argv.""" - return GPig.getArgvDict('--params') - - @staticmethod - def getArgvOpts(): - """Return a dictionary 
holding the argument of the --opts option in - sys.argv.""" - return GPig.getArgvDict('--opts') - - @staticmethod - def getArgvDict(optname): - """Return a dictionary of parameter values that were defined on the command line - view an option like '--params filename:foo.txt,basedir:/tmp/glob/'. - """ - assert optname.startswith('--') - for i,a in enumerate(sys.argv): - if a==optname: - paramString = sys.argv[i+1] - return dict(pair.split(":") for pair in paramString.split(",")) - return {} - - @staticmethod - def rowsOf(view): - """Iterate over the rows in a view.""" - for line in open(view.distributableFile()): - yield view.planner._serializer.fromString(line.strip()) - - @staticmethod - def onlyRowOf(view): - """Return the first row in a side view, and raise an error if it - is not the only row of the view.""" - result = None - logging.info('loading '+view.distributableFile()) - for line in open(view.distributableFile()): - assert not result,'multiple rows in stored file for '+view.tag - result = view.planner._serializer.fromString(line.strip()) - return result - - @staticmethod - class SafeEvaluator(object): - """Evaluates expressions that correzpond to serialized guinea pig rows.""" - def __init__(self,restrictedBindings={}): - self.restrictedBindings = restrictedBindings - def eval(self,s): - code = compile(s,'','eval') - return eval(code,self.restrictedBindings) - -class Jin(object): - """"Object to hold description of a single join input.""" - - def __init__(self,view,by=(lambda x:x),outer=False): - self.view = view - self.joinBy = by - self.outer = outer - self._padWithNulls = False - - def __str__(self): - viewStr = View.asTag(self.view) if self.view else '_' - outerStr = ',outer=True' if self.outer else '' - padStr = ',_padWithNulls=True' if self._padWithNulls else '' - return "Jin(%s,by=%s%s%s)" % (viewStr,self.joinBy,outerStr,padStr) - -class ReduceTo(object): - """An object x that can be the argument of a reducingTo=x - parameter in a Group view.""" - def __init__(self,baseType,by=lambda accum,val:accum+val): - self.baseType = baseType - self.reduceBy = by - -class ReduceToCount(ReduceTo): - """Produce the count of the number of objects that would be placed in a group.""" - def __init__(self): - ReduceTo.__init__(self,int,by=lambda accum,val:accum+1) - -class ReduceToSum(ReduceTo): - """Produce the sum of the objects - which must be numbers - that would - be placed in a group.""" - def __init__(self): - ReduceTo.__init__(self,int,by=lambda accum,val:accum+val) - -class ReduceToList(ReduceTo): - """Produce a list of the objects that would be placed in a group.""" - def __init__(self): - ReduceTo.__init__(self,list,by=lambda accum,val:accum+[val]) - -############################################################################### -# abstract views -############################################################################## - -class View(object): - """A definition of a relation for Guinea Pig. A View object can be - produce a storagePlan(), which can then be executed to produce the - contents of the relation. Intutitively, a relation is and - unordered bag of rows, and a row an almost-arbitrary python data - structure. (It must be something that can be stored and retrieved - by the RowSerializer.) - - Steps in the storagePlan are executed by delegation, thru the - planner, to methods of a View class named doFoo. 
- """ - - def __init__(self): - """The planner and tag must be set before this is used.""" - self.planner = None #pointer to planner object - self.tag = None #for naming storedFile and checkpoints - self.storeMe = None #try and store this view if true - self.retainedPart = None #used in map-reduce views only - self.sideviews = [] #non-empty for Augment views only - self.inners = [] #always used - - #self.inner is shortcut for inners[0] - def _getInner(self): return self.inners[0] - def _setInner(self,val): self.inners = [val] - inner = property(_getInner,_setInner) - - # - # ways to modify a view - # - - def opts(self,stored=None): - """Return the same view with options set appropriately. Possible - options include: - - - stored=True - Explicitly store this view on disk whenever - it is used in another view's definition. This might be set - by the user for debugging purposes, or by the planner, - to prevent incorrect optimizations. Generally "inner" - views are not explicitly stored. - - - stored='distributedCache' - Store this view in the working - directory and/or the Hadoop distributed cache. - """ - - self.storeMe = stored - return self - - def showExtras(self): - """Printable representation of the options for a view.""" - result = '' - flagPairs = [] - if self.storeMe: flagPairs += ['stored=%s' % repr(self.storeMe)] - if flagPairs: result += '.opts(' + ",".join(flagPairs) + ')' - return result - - # - # how the view is saved on disk - # - - def storedFile(self): - """The file that will hold the materialized relation.""" - return self.planner.opts['viewdir'] + '/' + self.tag + '.gp' - - def distributableFile(self): - """The file that will hold the materialized relation in the working directory - in preparation to be uploaded to the distributed cache.""" - return self.tag + '.gp' - - @staticmethod - def viewNameFor(fileName): - """The view associated with the given file name""" - vname = os.path.basename(fileName) - if vname.endswith(".gp"): vname = vname[0:-len(".gp")] - return vname - - # - # semantics of the view - # - - def checkpoint(self): - """A checkpoint is an intermediate computation for the view, which is - saved on disk. The rowGenerator() for the view will assume - that the checkpoint is available. 
- """ - assert False, 'abstract method called' - - def checkpointPlan(self): - """A plan to produce checkpoint().""" - assert False, 'abstract method called' - - def rowGenerator(self): - """A generator for the rows in this relation, which assumes existence - of the checkpoint.""" - assert False, 'abstract method called' - - def explanation(self): - """Return an explanation of how rows are generated.""" - assert False, 'abstract method called' - - def storagePlan(self): - """A plan to store the view.""" - return self.planner.buildRecursiveStoragePlan(self) - - def nonrecursiveStoragePlan(self): - """Materialize the relation, assuming that there are no descendent - inner views that need to be materialized first.""" - plan = Plan() - plan.includeStepsOf(self.checkpointPlan()) - plan.append(TransformStep(view=self,whatToDo='doStoreRows',srcs=[self.checkpoint()],dst=self.storedFile(),why=self.explanation())) - return plan - - def applyDict(self,mapping,innerviewsOnly=False): - """Given a mapping from view tags to views, replace every inner view with - the appropriate value from the mapping, and return the result.""" - if self.tag in mapping and not innerviewsOnly: - return mapping[self.tag] - elif not self.inners: - return self - else: - result = copy.copy(self) - result.inners = map(lambda v:v.applyDict(mapping), self.inners) - return result - - def sideviewsNeeded(self): - """Sideviews needed by this view.""" - result = [] - for sv in self.sideviews: - result += [sv] - for v in self.inners: - result += list(v._sideviewsOfDescendants()) - return result - - def _sideviewsOfDescendants(self): - if not self.storeMe: - for sv in self.sideviews: - yield sv - for v in self.inners: - for sv in v._sideviewsOfDescendants(): - yield sv - - def enforceStorageConstraints(self): - """Subclass this, if there are constraints on when one must explicitly - store inner views.""" - pass - - def doStoreRows(self): - """Called by planner at execution time to store the rows of the view.""" - for row in self.rowGenerator(): - print self.planner._serializer.toString(row) - - # - # support the "pipe" syntax: view1 | view2 - # - - def __or__(self,otherView): - """Overload the pipe operator x | y to return with y, with x as its inner view.""" - otherView.acceptInnerView(self) - return otherView - - def acceptInnerView(self,otherView): - """Replace an appropriate input view with otherView. This is subclassed to - implement the the pipe operator.""" - assert not self.inner,'An inner view is defined for '+self.tag+' so you cannot use it as RHS of a pipe' - self.inner = otherView #subclass if needed - - # - # printing views - # - - def pprint(self,depth=0,alreadyPrinted=None,sideview=False): - """Print a readable representation of the view.""" - if alreadyPrinted==None: alreadyPrinted = set() - tabStr = '| ' * depth - tagStr = str(self.tag) - sideviewIndicator = '*' if sideview else '' - if self.tag in alreadyPrinted: - print tabStr + sideviewIndicator + tagStr + ' = ' + '...' 
-#
-# abstract view types
-#
-
-class Reader(View):
-    """Read data stored on the file system and make it look like a View."""
-
-    def __init__(self,src):
-        View.__init__(self)
-        self.src = src
-        self.inners = []
-
-    def checkpoint(self):
-        return self.src
-
-    def checkpointPlan(self):
-        return Plan()  #empty plan
-
-    def explanation(self):
-        return [ 'read %s with %s' % (str(self.src),self.tag) ]
-
-    def acceptInnerView(self,otherView):
-        assert False, "Reader views cannot be used as RHS of a pipe"
-
-class Transformation(View):
-    """Streaming transformation on a single inner view."""
-
-    def __init__(self,inner=None):
-        View.__init__(self)
-        self.inner = inner
-
-    # A transformation will stream on-the-fly through the inner
-    # relation, and produce a new version, so the checkpoint and plan
-    # to produce it are delegated to the inner View.
-
-    def checkpoint(self):
-        return self.inner.checkpoint()
-
-    def checkpointPlan(self):
-        return self.inner.checkpointPlan()
-
-    def explanation(self):
-        return self.inner.explanation() + [ 'transform to %s' % self.tag ]
-
-class MapReduce(View):
-    """A view that takes an inner relation and processes it in a
-    map-reduce-like way."""
-
-    def __init__(self,inners,retaining):
-        View.__init__(self)
-        self.inners = inners
-        self.retainedPart = retaining
-
-    def _isReduceInputFile(self,fileName):
-        return fileName.endswith('.gpri')
-
-    def checkpoint(self):
-        ## the checkpoint is the reducer input file
-        return self.planner.opts['viewdir'] + '/' + self.tag + '.gpri'
-
-    def checkpointPlan(self):
-        plan = Plan()
-        for inner in self.inners:
-            plan.includeStepsOf(inner.checkpointPlan())
-        plan.includeStepsOf(self.mapPlan())
-        return plan
-
-    def enforceStorageConstraints(self):
-        for inner in self.inners:
-            innerChkpt = inner.checkpoint()
-            #optimizations break if you chain two map-reduces together
-            if innerChkpt and innerChkpt.endswith(".gpri"):
-                if not inner.storeMe:
-                    logging.info('making %s stored, to make possible a downstream map-reduce view' % inner.tag)
-                    inner.storeMe = True
-
-    def mapPlan(self):
-        logging.error("abstract method not implemented")
-
-    def doStoreKeyedRows(self,subview,key,index):
-        """Utility method used by concrete map-reduce classes to compute keys
-        and store key-value pairs. Usually used as the main step in a
-        mapPlan.
""" - for row in subview.rowGenerator(): - keyStr = self.planner._serializer.toString(key(row)) - rrow = self.retainedPart(row) if self.retainedPart else row - valStr = self.planner._serializer.toString(rrow) - if index<0: - print "%s\t%s" % (keyStr,valStr) - else: - print "%s\t%d\t%s" % (keyStr,index,valStr) - -############################################################################## -# -# concrete View classes -# -############################################################################## - -class ReuseView(Reader): - """Returns the objects in a previously stored view.""" - - def __init__(self,view): - if isinstance(view,View): - Reader.__init__(self,view.storedFile()) - self.tag = "reuse_"+view.tag - self.reusedViewTag = view.tag - self.planner = view.planner - else: - assert False,'user-defined ReuseView not supported (yet)' - - def rowGenerator(self): - for line in sys.stdin: - yield self.planner._serializer.fromString(line.strip()) - - def __str__(self): - return 'ReuseView("%s")' % self.src + self.showExtras() - - -class ReadLines(Reader): - """ Returns the lines in a file, as python strings.""" - - def __init__(self,src): - Reader.__init__(self,src) - - def rowGenerator(self): - for line in sys.stdin: - yield line - - def __str__(self): - return 'ReadLines("%s")' % self.src + self.showExtras() - -class ReadCSV(Reader): - """ Returns the lines in a CSV file, converted to Python tuples.""" - - def __init__(self,src,**kw): - Reader.__init__(self,src) - self.kw = kw - - def rowGenerator(self): - for tup in csv.reader(sys.stdin,**self.kw): - yield tup - - def __str__(self): - return 'ReadCVS("%s",%s)' % (self.src,str(self.kw)) + self.showExtras() - - -class ReplaceEach(Transformation): - """ In 'by=f'' f is a python function that takes a row and produces - its replacement.""" - - def __init__(self,inner=None,by=lambda x:x): - Transformation.__init__(self,inner) - self.replaceBy = by - - def rowGenerator(self): - for row in self.inner.rowGenerator(): - yield self.replaceBy(row) - - def explanation(self): - return self.inner.explanation() + [ 'replaced to %s' % self.tag ] - - def __str__(self): - return 'ReplaceEach(%s, by=%s)' % (View.asTag(self.inner),str(self.replaceBy)) + self.showExtras() - -class Augment(Transformation): - - def __init__(self,inner=None,sideviews=None,sideview=None,loadedBy=lambda v:list(GPig.rowsOf(v))): - Transformation.__init__(self,inner) - assert not (sideviews and sideview), 'cannot specify both "sideview" and "sideviews"' - self.sideviews = list(sideviews) if sideviews else [sideview] - self.loader = loadedBy - assert self.loader,'must specify a "loadedBy" function for Augment' - - def enforceStorageConstraints(self): - for sv in self.sideviews: - sv.storeMe = 'distributedCache' - - def rowGenerator(self): - augend = self.loader(*self.sideviews) - for row in self.inner.rowGenerator(): - yield (row,augend) - - def checkpointPlan(self): - plan = Plan() - plan.includeStepsOf(self.inner.checkpointPlan()) - #the sideviews should have been stored by the top-level - #planner already, but they will need to be moved to a - #distributable location - for sv in self.sideviews: - plan.append(DistributeStep(sv)) - return plan - - def explanation(self): - return self.inner.explanation() + [ 'augmented to %s' % self.tag ] - - def __str__(self): - sideviewTags = loaderTag = '*UNSPECIFIED*' - if self.sideviews!=None: sideviewTags = ",".join(map(View.asTag,self.sideviews)) - if self.loader!=None: loaderTag = str(self.loader) - return 
-
-class ReplaceEach(Transformation):
-    """ In 'by=f', f is a python function that takes a row and produces
-    its replacement."""
-
-    def __init__(self,inner=None,by=lambda x:x):
-        Transformation.__init__(self,inner)
-        self.replaceBy = by
-
-    def rowGenerator(self):
-        for row in self.inner.rowGenerator():
-            yield self.replaceBy(row)
-
-    def explanation(self):
-        return self.inner.explanation() + [ 'replaced to %s' % self.tag ]
-
-    def __str__(self):
-        return 'ReplaceEach(%s, by=%s)' % (View.asTag(self.inner),str(self.replaceBy)) + self.showExtras()
-
-class Augment(Transformation):
-
-    def __init__(self,inner=None,sideviews=None,sideview=None,loadedBy=lambda v:list(GPig.rowsOf(v))):
-        Transformation.__init__(self,inner)
-        assert not (sideviews and sideview), 'cannot specify both "sideview" and "sideviews"'
-        self.sideviews = list(sideviews) if sideviews else [sideview]
-        self.loader = loadedBy
-        assert self.loader,'must specify a "loadedBy" function for Augment'
-
-    def enforceStorageConstraints(self):
-        for sv in self.sideviews:
-            sv.storeMe = 'distributedCache'
-
-    def rowGenerator(self):
-        augend = self.loader(*self.sideviews)
-        for row in self.inner.rowGenerator():
-            yield (row,augend)
-
-    def checkpointPlan(self):
-        plan = Plan()
-        plan.includeStepsOf(self.inner.checkpointPlan())
-        #the sideviews should have been stored by the top-level
-        #planner already, but they will need to be moved to a
-        #distributable location
-        for sv in self.sideviews:
-            plan.append(DistributeStep(sv))
-        return plan
-
-    def explanation(self):
-        return self.inner.explanation() + [ 'augmented to %s' % self.tag ]
-
-    def __str__(self):
-        sideviewTags = loaderTag = '*UNSPECIFIED*'
-        if self.sideviews!=None: sideviewTags = ",".join(map(View.asTag,self.sideviews))
-        if self.loader!=None: loaderTag = str(self.loader)
-        return 'Augment(%s,sideviews=%s,loadedBy=%s)' % (View.asTag(self.inner),sideviewTags,loaderTag) + self.showExtras()
-
-
-class Format(ReplaceEach):
-    """ Like ReplaceEach, but the output should be a string, and it will be
-    stored as a string, i.e., without using the serializer."""
-
-    def __init__(self,inner=None,by=lambda x:str(x)):
-        ReplaceEach.__init__(self,inner,by)
-
-    def __str__(self):
-        return 'Format(%s, by=%s)' % (View.asTag(self.inner),str(self.replaceBy)) + self.showExtras()
-
-    def doStoreRows(self):
-        for row in self.rowGenerator():
-            print row
-
-class Flatten(Transformation):
-    """ Like ReplaceEach, but the output of 'by' is an iterable, and all
-    of its results will be returned."""
-
-    def __init__(self,inner=None,by=None):
-        Transformation.__init__(self,inner)
-        self.flattenBy = by
-
-    def rowGenerator(self):
-        for row in self.inner.rowGenerator():
-            for flatrow in self.flattenBy(row):
-                yield flatrow
-
-    def explanation(self):
-        return self.inner.explanation() + [ 'flatten to %s' % self.tag ]
-
-    def __str__(self):
-        return 'Flatten(%s, by=%s)' % (View.asTag(self.inner),str(self.flattenBy)) + self.showExtras()
-
-class Filter(Transformation):
-    """Keep only the rows that match some predicate."""
-
-    def __init__(self,inner=None,by=lambda x:x):
-        Transformation.__init__(self,inner)
-        self.filterBy = by
-
-    def rowGenerator(self):
-        for row in self.inner.rowGenerator():
-            if self.filterBy(row):
-                yield row
-
-    def explanation(self):
-        return self.inner.explanation() + [ 'filtered to %s' % self.tag ]
-
-    def __str__(self):
-        return 'Filter(%s, by=%s)' % (View.asTag(self.inner),str(self.filterBy)) + self.showExtras()
-
-class Distinct(MapReduce):
-    """Remove duplicate rows."""
-
-    def __init__(self,inner=None,retaining=None):
-        MapReduce.__init__(self,[inner],retaining)
-
-    def mapPlan(self):
-        plan = Plan()
-        plan.append(PrereduceStep(view=self,whatToDo='doDistinctMap',srcs=[self.inner.checkpoint()],dst=self.checkpoint(),why=self.explanation()))
-        return plan
-
-    def rowGenerator(self):
-        """Extract distinct elements from a sorted list."""
-        lastval = None
-        for line in sys.stdin:
-            valStr = line.strip()
-            val = self.planner._serializer.fromString(valStr)
-            if val != lastval and lastval!=None:
-                yield lastval
-            lastval = val
-        if lastval!=None:
-            yield lastval
-
-    def explanation(self):
-        return self.inner.explanation() + [ 'make distinct to %s' % self.tag]
-
-    def __str__(self):
-        return 'Distinct(%s)' % (View.asTag(self.inner)) + self.showExtras()
-
-    def doDistinctMap(self):
-        self.inner.doStoreRows()
-
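
A short pipeline sketch combining the transformations above; 'corpus.txt' is a hypothetical input:

    longWords = ReadLines('corpus.txt') \
        | Flatten(by=lambda line:line.strip().split()) \
        | Filter(by=lambda w:len(w)>3) \
        | Distinct()
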
-
-class Group(MapReduce):
-    """Group by some property of a row, defined with the 'by' option.
-    Default outputs are tuples (x,[r1,...,rk]) where the ri's are rows
-    that have 'by' values of x."""
-
-    def __init__(self,inner=None,by=lambda x:x,reducingTo=ReduceToList(),retaining=None):
-        MapReduce.__init__(self,[inner],retaining)
-        self.groupBy = by
-        self.reducingTo = reducingTo
-
-    def mapPlan(self):
-        plan = Plan()
-        plan.append(PrereduceStep(view=self,whatToDo='doGroupMap',srcs=[self.inner.checkpoint()],dst=self.checkpoint(),why=self.explanation()))
-        return plan
-
-    def rowGenerator(self):
-        """Group objects from stdin by key, yielding tuples (key,[g1,..,gn])."""
-        lastkey = key = None
-        accum = self.reducingTo.baseType()
-        for line in sys.stdin:
-            keyStr,valStr = line.strip().split("\t")
-            key = self.planner._serializer.fromString(keyStr)
-            val = self.planner._serializer.fromString(valStr)
-            if key != lastkey and lastkey!=None:
-                yield (lastkey,accum)
-                accum = self.reducingTo.baseType()
-            accum = self.reducingTo.reduceBy(accum, val)
-            lastkey = key
-        if lastkey!=None:
-            yield (lastkey,accum)
-
-    def explanation(self):
-        return self.inner.explanation() + ['group to %s' % self.tag]
-
-    def __str__(self):
-        return 'Group(%s,by=%s,reducingTo=%s)' % (View.asTag(self.inner),str(self.groupBy),str(self.reducingTo)) + self.showExtras()
-
-    def doGroupMap(self):
-        self.doStoreKeyedRows(self.inner,self.groupBy,-1)
-
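
Group is the workhorse for aggregation; a minimal word-count sketch as a complete script, assuming a hypothetical input file 'corpus.txt' and the ReduceToCount reducer:

    from guineapig import *
    import sys

    class WordCount(Planner):
        wc = ReadLines('corpus.txt') \
            | Flatten(by=lambda line:line.strip().split()) \
            | Group(by=lambda w:w, reducingTo=ReduceToCount())  # rows are (word,count) pairs

    if __name__ == "__main__":
        WordCount().main(sys.argv)
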
-
-class Join(MapReduce):
-    """Outputs tuples of the form (row1,row2,...rowk) where
-    rowi is from the i-th join input, and the rowi's have the same
-    value of the property being joined on."""
-
-    def __init__(self,*joinInputs):
-        #sets self.inners
-        MapReduce.__init__(self,map(lambda x:x.view, joinInputs),None)
-        self.joinInputs = joinInputs
-        #re-interpret the 'outer' join parameters - semantically,
-        #if jin[i] is outer, then all other inputs must be marked as _padWithNulls
-        if any(map(lambda jin:jin.outer, self.joinInputs)):
-            assert len(self.joinInputs)==2,'outer joins are only supported on two-way joins '+str(self.joinInputs)
-            for i in range(len(self.joinInputs)):
-                if self.joinInputs[i].outer:
-                    j = 1-i  #the other index
-                    self.joinInputs[j]._padWithNulls = True
-
-    def acceptInnerView(self,otherView):
-        assert False, 'join cannot be RHS of a pipe - use JoinTo instead'
-
-    def mapPlan(self):
-        plan = Plan()
-        innerCheckpoints = map(lambda v:v.checkpoint(), self.inners)
-        step = PrereduceStep(view=self, whatToDo='doJoinMap',srcs=innerCheckpoints,dst=self.checkpoint(),why=self.explanation())
-        plan.append(step)
-        return plan
-
-    def applyDict(self,mapping,innerviewsOnly=False):
-        result = MapReduce.applyDict(self,mapping,innerviewsOnly=innerviewsOnly)
-        #also need to map over the join inputs
-        if isinstance(result,Join):
-            for i in range(len(result.joinInputs)):
-                result.joinInputs[i].view = result.inners[i]
-        return result
-
-    def rowGenerator(self):
-        """Group objects from stdin by key, yielding tuples (row1,row2,...)."""
-        lastkey = None
-        lastIndex = len(self.joinInputs)-1
-        somethingProducedForLastKey = False
-        #accumulate a list of lists of all non-final inputs
-        accumList = [ [] for i in range(lastIndex) ]
-        for line in sys.stdin:
-            keyStr,indexStr,valStr = line.strip().split("\t")
-            key = self.planner._serializer.fromString(keyStr)
-            index = int(indexStr)
-            val = self.planner._serializer.fromString(valStr)
-            if key != lastkey and lastkey!=None:
-                #if the final join input is marked as _padWithNulls, clear
-                #the accumulators, since we're doing an outer join
-                #with the last view
-                if self.joinInputs[lastIndex]._padWithNulls and not somethingProducedForLastKey:
-                    for tup in self._joinAccumulatedValuesTo(accumList,lastIndex,None):
-                        yield tup
-                #reset the accumulators, since they pertain to the previous key
-                accumList = [ [] for i in range(lastIndex) ]
-                somethingProducedForLastKey = False
-            if index!=lastIndex:
-                #accumulate values to use in the join
-                accumList[index] = accumList[index] + [val]
-            else:
-                #produce tuples that match the key for the last view
-                for tup in self._joinAccumulatedValuesTo(accumList,lastIndex,val):
-                    somethingProducedForLastKey = True
-                    yield tup
-            lastkey = key
-
-    def _joinAccumulatedValuesTo(self,accumList,lastIndex,finalVal):
-        #_padWithNulls as needed
-        for i in range(lastIndex):
-            if self.joinInputs[i]._padWithNulls and not accumList[i]:
-                accumList[i] = [None]
-        tupbuf = [ None for i in range(lastIndex+1) ]  #holds output
-        tupbuf[lastIndex] = finalVal
-        for i in range(lastIndex):
-            for a in accumList[i]:
-                tupbuf[i] = a
-                if i==lastIndex-1 and any(tupbuf):
-                    yield tuple(tupbuf)
-
-    def explanation(self):
-        innerEx = []
-        for inner in self.inners:
-            if innerEx: innerEx += ['THEN']
-            innerEx += inner.explanation()
-        return innerEx + [ 'FINALLY join to %s' % self.tag ]
-
-    def __str__(self):
-        return "Join(%s)" % ",".join(map(str,self.joinInputs)) + self.showExtras()
-
-    def doJoinMap(self,i):
-        #called by joinMapPlan with argument index, and stdin pointing to innerCheckpoints[index]
-        self.doStoreKeyedRows(self.joinInputs[i].view,self.joinInputs[i].joinBy,i)
-
-class JoinTo(Join):
-    """Special case of Join which can be used as the RHS of a pipe operator."""
-
-    def __init__(self,joinInput,by=None):
-        Join.__init__(self,Jin(None,by),joinInput)
-
-    def acceptInnerView(self,otherView):
-        self.joinInputs[0].view = otherView
-        self.inners[0] = otherView
-
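
A sketch of Join/Jin usage on two hypothetical tab-separated inputs; each Jin names an input view and the key it is joined on:

    class JoinDemo(Planner):
        ages = ReadLines('ages.txt') | ReplaceEach(by=lambda line:tuple(line.strip().split('\t')))
        jobs = ReadLines('jobs.txt') | ReplaceEach(by=lambda line:tuple(line.strip().split('\t')))
        # rows of the join are ((name,age),(name,job)) pairs with matching names
        joined = Join( Jin(ages, by=lambda(name,age):name),
                       Jin(jobs, by=lambda(name,job):name) ) \
            | ReplaceEach(by=lambda((name1,age),(name2,job)):(name1,age,job))
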
-##############################################################################
-#
-# the top-level planner, and its supporting classes
-#
-##############################################################################
-
-class Plan(object):
-    """A plan constructed by Guinea Pig."""
-
-    def __init__(self):
-        self.steps = []
-        self.tasks = []
-
-    def append(self,step):
-        self.steps.append(step)
-
-    def includeStepsOf(self,subplan):
-        self.steps += subplan.steps
-
-    def execute(self,gp,echo=False):
-        script = self.compile(gp)
-        for shellcom in script:
-            if echo: print 'calling:',shellcom
-            subprocess.check_call(shellcom,shell=True)
-
-    def buildTasks(self):
-        """Group the steps into AbstractMapReduceTask's."""
-        self.tasks = [AbstractMapReduceTask()]
-        for step in self.steps:
-            if not self.tasks[-1].insert(step):
-                self.tasks.append(AbstractMapReduceTask())
-                status = self.tasks[-1].insert(step)
-                assert status, 'failure to insert '+str(step)+' in fresh AbstractMapReduceTask'
-
-    def compile(self,gp):
-        """Return a list of strings that can be run as shell commands."""
-        self.buildTasks()
-        logging.info("%d steps converted to %d abstract map-reduce tasks" % (len(self.steps),len(self.tasks)))
-        script = []
-        taskCompiler = GPig.getCompiler(gp.opts['target'])
-        for task in self.tasks:
-            script += taskCompiler.compile(task,gp)
-        return script
-
-#
-# a single step in a plan produced by the planner
-#
-
-class Step(object):
-    """A single step of the plans produced by the planner, along with the
-    methods to convert the plans into executable shell commands."""
-
-    def __init__(self,view):
-        self.view = view
-        self.reused = []  # list of views reused at this point
-        self.why = []
-
-    def setReusedViews(self,views):
-        self.reused = list(views)
-
-    def explain(self):
-        """Convert an explanation - which is a list of strings - into a string."""
-        return "...".join(self.why)
-
-#
-# specific kinds of steps
-#
-
-class DistributeStep(Step):
-    """Prepare a stored view for the distributed cache."""
-
-    def __init__(self,view):
-        Step.__init__(self,view)
-
-    def __str__(self):
-        return "DistributeStep(%s,reused=%s)" % (repr(self.view.tag),repr(self.reused))
-
-class TransformStep(Step):
-    """Transform input to output."""
-
-    def __init__(self,view,whatToDo,srcs,dst,why):
-        Step.__init__(self,view)
-        self.whatToDo = whatToDo
-        self.srcs = srcs
-        self.dst = dst
-        self.why = why
-
-    def __str__(self):
-        return "TransformStep("+",".join(map(repr, [self.view.tag,self.whatToDo,self.srcs,self.dst,self.reused]))+")"
-
-class PrereduceStep(Step):
-
-    def __init__(self,view,whatToDo,srcs,dst,why):
-        Step.__init__(self,view)
-        self.whatToDo = whatToDo
-        self.srcs = srcs
-        self.dst = dst
-        self.why = why
-
-    def __str__(self):
-        return "PrereduceStep("+",".join(map(repr, [self.view.tag,self.whatToDo,self.srcs,self.dst,self.reused]))+")"
-
-# combine steps into something executable via hadoop - or shell
-
-class AbstractMapReduceTask(object):
-    """A collection of steps that can be executed as a single map-reduce operation,
-    possibly with some file management steps to set up the task."""
-
-    def __init__(self):
-        self.distributeSteps = []
-        self.mapStep = None
-        self.reduceStep = None
-
-    def insert(self,step):
-        """Treating the AbstractMapReduceTask as a buffer, add this step to it if possible."""
-        if isinstance(step,DistributeStep):
-            #we can accept any number of distribute steps
-            self.distributeSteps.append(step)
-            return True
-        elif self.mapStep==None and (isinstance(step,TransformStep) or isinstance(step,PrereduceStep)):
-            #we can only have one map step, so fill up an empty slot if possible
-            self.mapStep = step
-            return True
-        elif self.mapStep and isinstance(self.mapStep,PrereduceStep) and isinstance(step,TransformStep) and not self.reduceStep:
-            #if the mapstep is a prereduce, then we can also allow a reduce step
-            self.reduceStep = step
-            return True
-        else:
-            return False
-
-    def __str__(self):
-        buf = "mapreduce task:"
-        for step in self.distributeSteps:
-            buf += "\n - d "+str(step)
-        buf += "\n - m " + str(self.mapStep)
-        if self.reduceStep:
-            buf += "\n - r " + str(self.reduceStep)
-        return buf
-
-class MRCompiler(object):
-    """Abstract compiler class to convert a task to a list of commands that can be executed by the shell."""
-
-    def compile(self,task,gp):
-        script = []
-        # an explanation/header
-        if not task.reduceStep:
-            script += ['echo create '+task.mapStep.view.tag + ' via map: ' + task.mapStep.explain()]
-        else:
-            script += ['echo create '+task.reduceStep.view.tag +' via map/reduce: '+task.reduceStep.explain()]
-        for step in task.distributeSteps:
-            localCopy = step.view.distributableFile()
-            maybeRemoteCopy = step.view.storedFile()
-            echoCom = 'echo distribute %s: making a local copy of %s in %s' % (step.view.tag,maybeRemoteCopy,localCopy)
-            script += [echoCom] + self.distributeCommands(task, gp, maybeRemoteCopy,localCopy)
-        if not task.reduceStep and len(task.mapStep.srcs)==1:
-            mapCom = self._coreCommand(task.mapStep,gp)
-            script += self.simpleMapCommands(task, gp, mapCom, task.mapStep.srcs[0], task.mapStep.dst)
-        elif task.reduceStep and len(task.mapStep.srcs)==1:
-            mapCom = self._coreCommand(task.mapStep,gp)
-            reduceCom = self._coreCommand(task.reduceStep,gp)
-            script += self.simpleMapReduceCommands(task, gp, mapCom, reduceCom, task.mapStep.srcs[0], task.reduceStep.dst)
-        elif task.reduceStep and len(task.mapStep.srcs)>1:
-            mapComs = [self._ithCoreCommand(task.mapStep,gp,i) for i in range(len(task.mapStep.srcs))]
-            reduceCom = self._coreCommand(task.reduceStep,gp)
-            midpoint = gp.opts['viewdir']+'/'+task.mapStep.view.tag+'.gpmo'
-            script += self.joinCommands(task, gp, mapComs, reduceCom, task.mapStep.srcs, midpoint, task.reduceStep.dst)
-        else:
-            assert False,'cannot compile task '+str(task)
-        return script
-
-    # abstract routines
-
-    def distributeCommands(self,task,gp,maybeRemoteCopy,localCopy):
-        """Distribute the remote copy to the local directory."""
-        assert False, 'abstract method called'
-
-    def simpleMapCommands(self,task,gp,mapCom,src,dst):
-        """A map-only task with zero or one input."""
-        assert False, 'abstract method called'
-
-    def simpleMapReduceCommands(self,task,gp,mapCom,reduceCom,src,dst):
-        """A map-reduce task with one input."""
-        assert False, 'abstract method called'
-
-    def joinCommands(self,task,gp,mapComs,reduceCom,srcs,midpoint,dst):
-        """A map-reduce task with several inputs."""
-        assert False, 'abstract method called'
-
-    # utilities
-
-    def _stepSideviewFiles(self,step):
-        files = []
-        for sv in step.view.sideviewsNeeded():
-            files += [sv.distributableFile()]
-        return files
-
-    def _coreCommand(self,step,gp):
-        """Python command to call an individual plan step."""
-        return 'python %s --view=%s --do=%s' % (gp._gpigSourceFile,step.view.tag,step.whatToDo) + self.__coreCommandOptions(step,gp)
-
-    def _ithCoreCommand(self,step,gp,i):
-        """Like _coreCommand but allows an index parameter for the 'do' option."""
-        return 'python %s --view=%s --do=%s.%d' % (gp._gpigSourceFile,step.view.tag,step.whatToDo,i) + self.__coreCommandOptions(step,gp)
-
-    def __coreCommandOptions(self,step,gp):
-        paramOpts = '' if not gp.param else " --params " + ",".join(map(lambda(k,v):k+':'+v, gp.param.items()))
-        nonDefaults = []
-        for (k,v) in gp.opts.items():
-            #pass in non-default options, or options computed from the environment
-            if (gp.opts[k] != GPig.DEFAULT_OPTS[k]) or ((k in GPig.COMPUTED_OPTION_DEFAULTS) and (gp.opts[k] != GPig.COMPUTED_OPTION_DEFAULTS[k])):
-                nonDefaults += ["%s:%s" % (k,str(v))]
-        optsOpts = '' if not nonDefaults else " --opts " + ",".join(nonDefaults)
-        reuseOpts = '' if not step.reused else " --reuse "+ " ".join(step.reused)
-        return paramOpts + optsOpts + reuseOpts
-
-
-class ShellCompiler(MRCompiler):
-    """Compile tasks to commands that are executable by most Unix shells."""
-
-    def distributeCommands(self,task,gp,maybeRemoteCopy,localCopy):
-        """Distribute the remote copy to the local directory."""
-        return ['cp -f %s %s || echo warning: the copy failed!' % (maybeRemoteCopy,localCopy)]
-
-    def simpleMapCommands(self,task,gp,mapCom,src,dst):
-        """A map-only job with zero or one input."""
-        if src: return [mapCom + ' < %s > %s' % (src,dst)]
-        else: return [mapCom + (' > %s' % (dst))]
-
-    def simpleMapReduceCommands(self,task,gp,mapCom,reduceCom,src,dst):
-        """A map-reduce job with one input."""
-        return [mapCom + ' < ' + src + ' | sort -k1 | '+reduceCom + ' > ' + dst]
-
-    def joinCommands(self,task,gp,mapComs,reduceCom,srcs,midpoint,dst):
-        """A map-reduce job with several inputs."""
-        subplan = ['rm -f %s' % midpoint]
-        for i,ithMapCom in enumerate(mapComs):
-            subplan += [ithMapCom + ' < ' + srcs[i] + ' >> ' + midpoint]
-        subplan += [ 'sort -k1,2 < ' + midpoint + ' | ' + reduceCom + ' > ' + dst]
-        return subplan
-
-class HadoopCompiler(MRCompiler):
-    """Compile tasks to commands that are executable by most Unix shells
-    after hadoop has been installed."""
-
-    def distributeCommands(self,task,gp,maybeRemoteCopy,localCopy):
-        return ['rm -f %s' % localCopy, '%s fs -getmerge %s %s' % (GPig.HADOOP_LOC,maybeRemoteCopy,localCopy)]
-
-    def simpleMapCommands(self,task,gp,mapCom,src,dst):
-        assert src,'Wrap not supported for hadoop'
-        hcom = self.HadoopCommandBuf(gp,task)
-        hcom.extendDef('-D','mapred.reduce.tasks=0')
-        hcom.extend('-input',src,'-output',dst)
-        hcom.extend("-mapper '%s'" % mapCom)
-        return [ self._hadoopCleanCommand(gp,dst), hcom.asEcho(), hcom.asString() ]
-
-    def simpleMapReduceCommands(self,task,gp,mapCom,reduceCom,src,dst):
-        hcom = self.HadoopCommandBuf(gp,task)
-        hcom.extendDef('-D','mapred.reduce.tasks=%d' % gp.opts['parallel'])
-        hcom.extend('-input',src,'-output',dst)
-        hcom.extend("-mapper '%s'" % mapCom)
-        hcom.extend("-reducer '%s'" % reduceCom)
-        return [ self._hadoopCleanCommand(gp,dst), hcom.asEcho(), hcom.asString() ]
-
-    def joinCommands(self,task,gp,mapComs,reduceCom,srcs,midpoint,dst):
-        def midi(i): return midpoint + '-' + str(i)
-        subplan = []
-        for i in range(len(srcs)):
-            hcom = self.HadoopCommandBuf(gp,task)
-            hcom.extendDef('-D','mapred.reduce.tasks=%d' % gp.opts['parallel'])
-            hcom.extend('-input',srcs[i], '-output',midi(i))
-            hcom.extend("-mapper","'%s'" % mapComs[i])
-            subplan += [ self._hadoopCleanCommand(gp,midi(i)), hcom.asEcho(), hcom.asString() ]
-        hcombineCom = self.HadoopCommandBuf(gp,task)
-        hcombineCom.extendDef('-D','mapred.reduce.tasks=%d' % gp.opts['parallel'])
-        hcombineCom.extendDef('-jobconf','stream.num.map.output.key.fields=3')
-        hcombineCom.extendDef('-jobconf','num.key.fields.for.partition=1')
-        for i in range(len(srcs)):
-            hcombineCom.extend('-input',midi(i))
-        hcombineCom.extend('-output',dst)
-        hcombineCom.extend('-mapper','cat')
-        hcombineCom.extend('-reducer',"'%s'" % reduceCom)
-        hcombineCom.extend('-partitioner','org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner')
-        subplan += [ self._hadoopCleanCommand(gp,dst), hcombineCom.asEcho(), hcombineCom.asString() ]
-        return subplan
-
-    class HadoopCommandBuf(object):
-        """Utility to hold the various pieces of a hadoop command."""
-        def __init__(self,gp,task):
-            logging.debug('building hadoop command for '+str(task.mapStep.view.tag))
-            self.invocation = [GPig.HADOOP_LOC,'jar',gp.opts['streamJar']]
-            self.defs = []
-            self.args = []
-            self.files = []
-            for f in gp._shippedFiles:
-                self.files += ['-file',f]
-            for sv in task.mapStep.view.sideviewsNeeded():
-                self.files += ['-file',sv.distributableFile()]
-            if task.reduceStep:
-                for sv in task.reduceStep.view.sideviewsNeeded():
-                    self.files += ['-file',sv.distributableFile()]
-            logging.debug('files: '+str(self.files))
-        def extend(self,*toks):
-            self.args += list(toks)
-        def extendDef(self,*toks):
-            self.defs += list(toks)
-        def asEcho(self):
-            return " ".join(['echo','hadoop'] + self.args + ['...'])
-        def asString(self):
-            return " ".join(self.invocation+self.defs+self.files+self.args)
-
-    def _hadoopCleanCommand(self,gp,fileName):
-        """A command to remove an HDFS directory if it exists."""
-        return '(%s fs -test -e %s && %s fs -rmr %s) || echo no need to remove %s' % (GPig.HADOOP_LOC,fileName, GPig.HADOOP_LOC,fileName, fileName)
-
-#
-# replaceable object to save objects to disk and retrieve them
-#
-
-class RowSerializer(object):
-    """Saves row objects to disk and retrieves them."""
-    def __init__(self):
-        self.evaluator = GPig.SafeEvaluator()
-    def toString(self,x):
-        return repr(x)
-    def fromString(self,s):
-        return self.evaluator.eval(s)
-
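
Rows round-trip through repr() and an eval-style parse; a minimal sketch of what the serializer does to a row:

    s = RowSerializer()
    row = ('doc1', 'hello', 3)
    assert s.fromString(s.toString(row)) == row   # toString is repr; fromString eval's it back
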
-#
-# the planner
-#
-
-class Planner(object):
-    """Can create storage plans for views that are defined as parts of it."""
-
-    def __init__(self,**kw):
-
-        #parameters are used to programmatically give user-defined
-        #config information to a planner, or they can be specified on
-        #the command line
-        self.param = kw
-        for (key,val) in GPig.getArgvParams().items():
-            # don't override non-null values specified in the constructor
-            if self.param.get(key)==None:
-                self.param[key] = val
-
-        #opts are used for giving options to the planner from the shell
-        self.opts = GPig.getArgvOpts()
-        for (key,val) in GPig.DEFAULT_OPTS.items():
-            if (not key in self.opts): self.opts[key] = val
-        for (key,type) in GPig.DEFAULT_OPT_TYPES.items():
-            self.opts[key] = type(self.opts[key])
-
-        #use the serializer appropriate for the target
-        self._serializer = RowSerializer()
-
-        #views that aren't associated with a class variable, but are
-        #instead named automatically - ie, inner views with no
-        #user-provided names.
-        self._autoNamedViews = {}
-
-        #by default, use info-level logging at planning time
-        if not Planner.partOfPlan(sys.argv):
-            logging.basicConfig(level=logging.INFO)
-
-        #hadoop needs to know where to find the main script file,
-        #as well as the guineapig.py file it uses
-        self._gpigSourceFile = sys.argv[0]
-        self._shippedFiles = [GPig.MY_LOC,self._gpigSourceFile]
-
-    def setup(self):
-        """Initialize planner, and views used by the planner. This has to be
-        done after the planner is fully configured by adding views."""
-
-        self.reusableViews = {}
-        # make sure view directory is valid
-        if self.opts['target']=='shell' and not os.path.exists(self.opts['viewdir']):
-            logging.info('creating view directory ' + self.opts['viewdir'])
-            os.makedirs(self.opts['viewdir'])
-        elif self.opts['target']=='hadoop':
-            p = urlparse.urlparse(self.opts['viewdir'])
-            if not p.path.startswith("/"):
-                logging.warn('hadoop viewdir should be an absolute path: will try prefixing /user/$LOGNAME')
-                username = os.environ.get('LOGNAME','me')
-                self.opts['viewdir'] = '/user/'+username+'/'+self.opts['viewdir']
-                logging.warn('viewdir is set to '+self.opts['viewdir'])
-
-        # Add 'tag' and planner fields to each view
-        for vname in self.listViewNames():
-            v = self.getView(vname)
-            v.tag = vname
-            v.planner = self
-        def tagUnnamedViews(v,basename,index,depth):
-            assert v,'null inner view for '+basename
-            if not v.planner:
-                v.planner = self
-                autoname = '%s_%d_%s' % (basename,depth,index)
-                self._setView(autoname,v)
-                for i,inner in enumerate(v.inners + v.sideviews):
-                    tagUnnamedViews(inner,vname,i,depth+1)
-        for vname in self.listViewNames():
-            v = self.getView(vname)
-            for i,inner in enumerate(v.inners + v.sideviews):
-                tagUnnamedViews(inner,vname,i,1)
-
-        # Add caching options as needed
-        for vname in self.listViewNames():
-            v = self.getView(vname)
-            v.enforceStorageConstraints()
-
-    #
-    # utils
-    #
-
-    def getView(self,str,mustExist=False):
-        """Find the defined relation named str, and if necessary bind its
-        planner and tag appropriately."""
-        v = self.__class__.__dict__.get(str) or self.__dict__.get(str) or self._autoNamedViews.get(str)
-        if mustExist: assert v,'cannot find a view named '+str
-        return v
-
-    def _setView(self,str,view):
-        """Internal use only: allow the view to be retrieved by name later."""
-        view.tag = str
-        self._autoNamedViews[str] = view
-
-    def listViewNames(self):
-        def namedViews(d): return [vname for vname in d.keys() if isinstance(self.getView(vname),View)]
-        userNamedViews = namedViews(self.__class__.__dict__) + namedViews(self.__dict__)
-        return userNamedViews + self._autoNamedViews.keys()
-
-    #
-    # planning
-    #
-
-    def buildRecursiveStoragePlan(self,view):
-        """Called by view.storagePlan."""
-        #figure out what to reuse - starting with what the user specified
-        storedViews = dict(self.reusableViews)
-        #also mark for eager storage anything that's used twice in the
-        #plan---i.e., anything that is consumed by two or more views
-        numParents = collections.defaultdict(int)
-        for dv in self._descendants(view):
-            for inner in dv.inners + dv.sideviews:
-                numParents[inner] += 1
-        for (dv,n) in numParents.items():
-            if n>1 and dv.storeMe==None:
-                logging.info('making %s stored because it is used %d times in creating %s' % (dv,n,view.tag))
-                dv.storeMe = True
-
-        #traverse view in pre-order and find a linear sequence of
-        #views to store, each of which requires only views earlier in
-        #the sequence
-        storageSeq = self._storageSeq(view,storedViews) + [view.tag]
-        logging.info('storage sequence is: ' + ",".join(storageSeq))
-
-        #splice together plans for each view in the sequence,
-        #after first modifying the view so that nothing is called
-        #directly, but only through the ReuseView proxies
-        plan = Plan()
-        for tag in storageSeq:
-            v = self.getView(tag,mustExist=True)
-            vm = v.applyDict(storedViews,innerviewsOnly=True)
-            subplan = vm.nonrecursiveStoragePlan()
-            #add the correct context of reused views to the subplan,
-            #so that the actual definition of the view will be
-            #rewritten appropriately to include the new ReuseView
-            #proxy for it
-            viewsLocallyReused = self._reuseViewDescendants(vm)
-            for s in subplan.steps:
-                s.setReusedViews(viewsLocallyReused)
-            plan.includeStepsOf(subplan)
-        return plan
-
-    def _reuseViewDescendants(self,view):
-        """Descendant views that are ReuseView's."""
-        result = set()
-        for dv in self._descendants(view):
-            if isinstance(dv,ReuseView):
-                result.add(dv.reusedViewTag)
-        return result
-
-    def _descendants(self,view):
-        """Descendants of a view."""
-        result = set()
-        result.add(view)
-        for inner in view.inners + view.sideviews:
-            result = result.union(self._descendants(inner))
-        return result
-
-    def _storageSeq(self,view,storedViews):
-        """Linear sequence of storage actions to take - as view tags."""
-        seq = []
-        for inner in view.inners + view.sideviews:
-            if not inner.tag in storedViews:
-                seq += self._storageSeq(inner,storedViews)
-                if inner.storeMe:
-                    seq += [inner.tag]
-                    storedViews[inner.tag] = ReuseView(inner)
-        return seq
-
-    #
-    # dealing with the file storage system and related stuff
-    #
-
-    def ship(self,*fileNames):
-        """Declare a set of inputs to be 'shipped' to the hadoop cluster."""
-        self._shippedFiles += fileNames
-
-    def setSerializer(self,serializer):
-        """Replace the default serializer with another RowSerializer object."""
-        self._serializer = serializer
-        return self
-
-    def setEvaluator(self,rowEvaluator):
-        """Specify a function which will deserialize a string that was produced
-        by Python's 'repr' function."""
-        self._serializer.evaluator = rowEvaluator
-        return self
-
-    #
-    # rest of the API for the planner
-    #
-
-    @staticmethod
-    def partOfPlan(argv):
-        """True if the command line was generated as part of a storage plan."""
-        return any(s.startswith("--do") for s in argv)
-
-    def main(self,argv):
-        """Run a main that lets you --store a view, as well as doing a few other things."""
-        self.setup()
-        self.runMain(argv)
-
-    def runMain(self,argv):
-
-        # parse the options and dispatch appropriately
-        argspec = ["store=", "cat=", "reuse",
-                   "list", "pprint=", "steps=", "tasks=", "plan=",
-                   "params=", "opts=", "do=", "view="]
-        optlist,args = getopt.getopt(argv[1:], 'x', argspec)
-        optdict = dict(optlist)
-
-        # decide what views can be re-used, vs which need fresh plans
-        if '--reuse' in optdict:  #reuse the views listed in the arguments
-            for a in args:
-                vname = View.viewNameFor(a)
-                v = self.getView(vname)
-                if v:
-                    self.reusableViews[v.tag] = ReuseView(v)
-                    logging.info("re-using data stored for view "+vname+": "+str(v))
-                else:
-                    logging.warn("cannot re-use view "+vname+" since it's not used in this script")
-
-        #choose the main action to take
-        if '--store' in optdict:  #store a view
-            rel = self.getView(optdict['--store'],mustExist=True)
-            plan = rel.storagePlan()
-            plan.execute(self, echo=self.opts['echo'])
-            return
-        elif '--pprint' in optdict:  #print a view
-            rel = self.getView(optdict['--pprint'],mustExist=True)
-            rel.applyDict(self.reusableViews).pprint()
-            return
-        elif '--steps' in optdict:  #print steps to produce a view
-            rel = self.getView(optdict['--steps'],mustExist=True)
-            plan = rel.storagePlan()
-            for s in plan.steps:
-                print ' -',s
-            return
-        elif '--tasks' in optdict:  #print AbstractMapReduceTasks to produce a view
-            rel = self.getView(optdict['--tasks'],mustExist=True)
-            plan = rel.storagePlan()
-            plan.buildTasks()
-            for t in plan.tasks:
-                print t
-            return
-        elif '--plan' in optdict:  #print a storage plan
-            rel = self.getView(optdict['--plan'],mustExist=True)
-            plan = rel.storagePlan()
-            script = plan.compile(self)
-            print "\n".join(script)
-            return
-        elif '--cat' in optdict:  #store and then print a view
-            assert self.opts['target']=='shell','cannot do --cat except in shell mode'
-            rel = self.getView(optdict['--cat'],mustExist=True)
-            plan = rel.storagePlan()
-            plan.execute(self, self.opts['echo'])
-            for line in open(rel.storedFile(),'r'):
-                print line,
-            return
-        elif '--list' in optdict:  #list named views
-            for vname in self.listViewNames():
-                print ' ',vname,'\t',self.getView(vname)
-            return
-        elif '--do' in optdict:  #run an internally-generated action
-            #recover what should be stored when this action is performed
-            #work out what view to use and what routine to call
-            rel = self.getView(optdict['--view'],mustExist=True)
-            rel = rel.applyDict(self.reusableViews)
-            whatToDo = optdict['--do']
-            #work out the method given by 'do' and call it - note it
-            #may have a single integer argument, eg doJoinMap.1
-            k = whatToDo.find(".")
-            if k<0:
-                whatToDoMethod = getattr(rel,whatToDo)
-                whatToDoMethod()
-            else:
-                arg = int(whatToDo[k+1:])
-                whatToDo = whatToDo[:k]
-                whatToDoMethod = getattr(rel,whatToDo)
-                whatToDoMethod(arg)
-            return
-        else:
-            print 'usage: --[store|pprint|steps|tasks|plan|cat] view [--opts key:val,...] [--params key:val,...] [--reuse view1 view2 ...]'
-            print '       --[list]'
-            print 'current legal keys for "opts", with default values:'
-            for (key,val) in GPig.DEFAULT_OPTS.items():
-                print ' %s:%s' % (key,str(val))
-            print 'There\'s more help at http://curtis.ml.cmu.edu/w/courses/index.php/Guinea_Pig'
-
-if __name__ == "__main__":
-    print 'There\'s help at http://curtis.ml.cmu.edu/w/courses/index.php/Guinea_Pig'
diff --git a/tutorial/instance-wordcount.py b/tutorial/instance-wordcount.py
index 00564f7..62b6d64 100644
--- a/tutorial/instance-wordcount.py
+++ b/tutorial/instance-wordcount.py
@@ -1,4 +1,4 @@
-from guineapig1_3 import *
+from guineapig import *
 import sys
 
 def wordCountScript():
diff --git a/tutorial/longer-wordcount.py b/tutorial/longer-wordcount.py
index 95b0cfe..3afdb6b 100644
--- a/tutorial/longer-wordcount.py
+++ b/tutorial/longer-wordcount.py
@@ -1,5 +1,5 @@
 # always start like this
-from guineapig1_3 import *
+from guineapig import *
 import sys
 
 # supporting routines can go here
diff --git a/tutorial/multi-wordcount-hadoop.py b/tutorial/multi-wordcount-hadoop.py
index 303c6a0..e16dc68 100644
--- a/tutorial/multi-wordcount-hadoop.py
+++ b/tutorial/multi-wordcount-hadoop.py
@@ -1,4 +1,4 @@
-from guineapig1_3 import *
+from guineapig import *
 import sys
 import os
 import subprocess
diff --git a/tutorial/multi-wordcount.py b/tutorial/multi-wordcount.py
index 86cab07..29d5775 100644
--- a/tutorial/multi-wordcount.py
+++ b/tutorial/multi-wordcount.py
@@ -1,4 +1,4 @@
-from guineapig1_3 import *
+from guineapig import *
 import sys
 import os
 
diff --git a/tutorial/ntup-wordcount.py b/tutorial/ntup-wordcount.py
index fc334a4..72d5efc 100644
--- a/tutorial/ntup-wordcount.py
+++ b/tutorial/ntup-wordcount.py
@@ -1,4 +1,4 @@
-from guineapig1_3 import *
+from guineapig import *
 import sys
 import math
 import collections
diff --git a/tutorial/param-wordcount.py b/tutorial/param-wordcount.py
index ac0445a..333ca79 100644
--- a/tutorial/param-wordcount.py
+++ b/tutorial/param-wordcount.py
@@ -1,4 +1,4 @@
-from guineapig1_3 import *
+from guineapig import *
 import sys
 import os
 
diff --git a/tutorial/phirl-naive1_3.py b/tutorial/phirl-naive1_3.py
index 7d106e8..539e920 100644
--- a/tutorial/phirl-naive1_3.py
+++ b/tutorial/phirl-naive1_3.py
@@ -24,37 +24,41 @@ class Phirl(Planner):
         | ReplaceEach(by=lambda((rel,term),df):(rel,term,df))
 
     #find total number of docs per relation
-    ndoc = ReplaceEach(data, by=lambda(rel,docid,term):(rel,docid)) | Distinct() | Group(by=lambda(rel,docid):rel, reducingTo=ReduceToCount())
+    ndoc = ReplaceEach(data, by=lambda(rel,docid,term):(rel,docid)) \
+        | Distinct() | Group(by=lambda(rel,docid):rel, reducingTo=ReduceToCount())
 
     #unweighted document vectors
-
-    udocvec = Join( Jin(data,by=lambda(rel,docid,term):(rel,term)), Jin(docFreq,by=lambda(rel,term,df):(rel,term)) ) \
+    udocvec = Join( Jin(data,by=lambda(rel,docid,term):(rel,term)),
+                    Jin(docFreq,by=lambda(rel,term,df):(rel,term)) ) \
         | ReplaceEach(by=lambda((rel,doc,term),(rel_,term_,df)):(rel,doc,term,df)) \
         | JoinTo( Jin(ndoc,by=lambda(rel,relCount):rel), by=lambda(rel,doc,term,df):rel ) \
         | ReplaceEach(by=lambda((rel,doc,term,df),(rel_,relCount)):(rel,doc,term,df,relCount)) \
        | ReplaceEach(by=lambda(rel,doc,term,df,relCount):(rel,doc,term,termWeight(relCount,df)))
+
    #normalizers
    sumSquareWeights = ReduceTo(float, lambda accum,(rel,doc,term,weight): accum+weight*weight)
-
-    norm = Group( udocvec, by=lambda(rel,doc,term,weight):(rel,doc), reducingTo=sumSquareWeights) \
-        | ReplaceEach( by=lambda((rel,doc),z):(rel,doc,z))
+    norm = Group( udocvec,
+                  by=lambda(rel,doc,term,weight):(rel,doc),
+                  retaining = lambda (rel,doc,term,weight): weight,
+                  reducingTo=ReduceToSum() ) \
+        | ReplaceEach( by=lambda((rel,doc),z):(rel,doc,z))
 
     #normalized document vector
-    docvec = Join( Jin(norm,by=lambda(rel,doc,z):(rel,doc)), Jin(udocvec,by=lambda(rel,doc,term,weight):(rel,doc)) ) \
+    docvec = Join( Jin(norm,by=lambda(rel,doc,z):(rel,doc)),
+                   Jin(udocvec,by=lambda(rel,doc,term,weight):(rel,doc)) ) \
         | ReplaceEach( by=lambda((rel,doc,z),(rel_,doc_,term,weight)): (rel,doc,term,weight/math.sqrt(z)) )
 
-    # grab only the p component and reduce it
-    sumOfP = ReduceTo(float,lambda accum,(doc1,doc2,p): accum+p)
-
     # naive algorithm: use all pairs for finding matches
     rel1Docs = Filter(docvec, by=lambda(rel,doc,term,weight):rel=='icepark')
     rel2Docs = Filter(docvec, by=lambda(rel,doc,term,weight):rel=='npspark')
-    softjoin = Join( Jin(rel1Docs,by=lambda(rel,doc,term,weight):term), Jin(rel2Docs,by=lambda(rel,doc,term,weight):term)) \
-        | ReplaceEach(by=lambda((rel1,doc1,term,weight1),(rel2,doc2,term_,weight2)): (doc1,doc2,weight1*weight2)) \
-        | Group(by=lambda(doc1,doc2,p):(doc1,doc2), reducingTo=sumOfP) \
+    softjoin = Join( Jin(rel1Docs,by=lambda(rel,doc,term,weight):term),
+                     Jin(rel2Docs,by=lambda(rel,doc,term,weight):term)) \
+        | ReplaceEach(by=lambda((rel1,doc1,term,weight1),(rel2,doc2,term2,weight2)): (doc1,doc2,weight1*weight2)) \
+        | Group(by=lambda(doc1,doc2,p):(doc1,doc2), \
+                retaining=lambda(doc1,doc2,p):p, \
+                reducingTo=ReduceToSum()) \
         | ReplaceEach(by=lambda((doc1,doc2),sim):(doc1,doc2,sim))
-
     # get the top few similar pairs
     simpairs = Filter(softjoin, by=lambda(doc1,doc,sim):sim>0.75)
 
     # diagnostic output
diff --git a/tutorial/prefix-count.py b/tutorial/prefix-count.py
index cec9d7c..0bc8568 100644
--- a/tutorial/prefix-count.py
+++ b/tutorial/prefix-count.py
@@ -1,4 +1,4 @@
-from guineapig1_3 import *
+from guineapig import *
 import sys
 import logging
 
diff --git a/tutorial/tfidf.py b/tutorial/tfidf.py
new file mode 100644
index 0000000..b235fd1
--- /dev/null
+++ b/tutorial/tfidf.py
@@ -0,0 +1,40 @@
+from guineapig import *
+
+# compute TFIDF in Guinea Pig
+
+import sys
+import math
+
+class TFIDF(Planner):
+
+    idDoc = ReadLines("idcorpus.txt") | Map(by=lambda line:line.strip().split("\t"))
+    idWords = Map(idDoc, by=lambda (docid,doc): (docid,doc.lower().split()))
+    data = FlatMap(idWords, by=lambda (docid,words): map(lambda w:(docid,w),words))
+
+    #compute document frequency
+    docFreq = Distinct(data) \
+        | Group(by=lambda (docid,term):term, retaining=lambda(docid,term):docid, reducingTo=ReduceToCount())
+
+    docIds = Map(data, by=lambda (docid,term):docid) | Distinct()
+    ndoc = Group(docIds, by=lambda row:'ndoc', reducingTo=ReduceToCount())
+
+    #unweighted document vectors
+
+    udocvec1 = Join( Jin(data,by=lambda(docid,term):term), Jin(docFreq,by=lambda(term,df):term) )
+    udocvec2 = Map(udocvec1, by=lambda((docid,term1),(term2,df)):(docid,term1,df))
+    udocvec3 = Join( Jin(udocvec2,by=lambda row:'const'), Jin(ndoc,by=lambda row:'const'))
+    udocvec = Map(udocvec3, by=lambda((docid,term,df),(dummy,ndoc)):(docid,term,math.log(ndoc/float(df))))
+
+    sumSquareWeights = ReduceTo(float, lambda accum,(docid,term,weight): accum+weight*weight)
+
+    norm = Group( udocvec, by=lambda(docid,term,weight):docid,
+                  retaining=lambda(docid,term,weight):weight*weight,
+                  reducingTo=ReduceToSum() )
+
+    docvec = Join( Jin(norm,by=lambda(docid,z):docid), Jin(udocvec,by=lambda(docid,term,weight):docid) ) \
+        | Map( by=lambda((docid1,z),(docid2,term,weight)): (docid1,term,weight/math.sqrt(z)) )
+
+# always end like this
+if __name__ == "__main__":
+    p = TFIDF()
+    p.main(sys.argv)
diff --git a/tutorial/wordcmp.py b/tutorial/wordcmp.py
index 985994c..8e8d0c1 100644
--- a/tutorial/wordcmp.py
+++ b/tutorial/wordcmp.py
@@ -1,4 +1,4 @@
-from guineapig1_3 import *
+from guineapig import *
 import sys
 import math
 
diff --git a/tutorial/wordcount.py b/tutorial/wordcount.py
index 06db5a4..5299a3a 100644
--- a/tutorial/wordcount.py
+++ b/tutorial/wordcount.py
@@ -1,4 +1,4 @@
-from guineapig1_3 import *
+from guineapig import *
 import sys
 
 # supporting routines can go here
diff --git a/tutorial/wordprob.py b/tutorial/wordprob.py
index 213fbcf..0f747ce 100644
--- a/tutorial/wordprob.py
+++ b/tutorial/wordprob.py
@@ -1,4 +1,4 @@
-from guineapig1_3 import *
+from guineapig import *
 import sys
 import math
 import logging
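
The new tutorial/tfidf.py follows the standard planner entry point, so its views can be materialized and inspected from the shell; a usage sketch, assuming idcorpus.txt is present and the default 'shell' target:

    # python tfidf.py --store docvec     materialize the final view under viewdir
    # python tfidf.py --pprint docvec    print the view tree for the final view
    # python tfidf.py --cat ndoc         store, then print, the document count view
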