From aa42697fc2541f32c49cf8f76853ab96c7d06c7b Mon Sep 17 00:00:00 2001
From: William Cohen
Date: Mon, 20 Jul 2015 15:32:53 -0400
Subject: [PATCH] docs

---
 TODO.txt     | 153 ++++++---------------------------------------------
 guineapig.py |   2 +-
 2 files changed, 17 insertions(+), 138 deletions(-)

diff --git a/TODO.txt b/TODO.txt
index 286a11b..81a5765 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -1,137 +1,38 @@
 TODO - priorities

-for 1.3
- - python wordprob.py --store prob seems to fail on a fresh tutorial
- - clean up null src issue - do I need it?
- - test hadoop
- - safer eval - test and document, strip out reprInverse
-
 FUNCTIONALITY
- - add --dictSeps =, instead of default :,
- - safer eval
- - a GPig.registerImport('foo.py') - ok that's just ship? and GPig.registerCompiler('key',factoryClass)
+ - add --dictSeps =, instead of default :, for s3
+ - a GPig.registerImport('foo.py'), GPig.registerCompiler('key',factoryClass) - what is registerCompiler?
  - option(storedIn=FILE) - so you can retrieve and store work on s3
- - add Reuse(FILE) view
- - add Concat(view1,....,viewK)
  - add Stream(view1, through='shell command', shipping=[f1,..,fk])
  - add StreamingMapReduce(view1, mapper='shell command', reducer='shell command', combiner='shell command', shipping=[f1,..,fk])
+ - add user-defined Reuse(FILE) ? (why do I want this again?)

- - extras, for debugging:
- -- Log
- -- ReadBlocks
- -- Wrap
- -- PPrint
+ - gpextras, for debugging:
+ -- PPrint?
+ -- Wrap?
 -- Describe?
 -- Illustrate?
-
--- standardize view.by argument

 - efficiency
- -- combiners
- -- compression
- -- hadoop options (parallel, etc)
- -- compiler for marime.py map-reducer with ramdisks (note: diskutil erasevolume HFS+ 'RAMDisk' `hdiutil attach -nomount ram://10315776`,
-    size is in 2048-byte blocks)
+ -- combiners: add combiner as combiningTo=.. option of Group.
+ -- compression -jobconf mapred.output.compress=true -jobconf mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCode
+ -- hadoop options (parallel, hopts, ...)
+
+ - cleanup
 -- standardize view.by argument
 -- clean up .gpmo and other tmp files? could do this via analysis at the AbstractMapReduceTask lst level
+ -- log created views so you can continue with --reuse `cat foo.log|grep ^created|cut -f2`
+ -- maybe add --config logging:warn,...

 DOCS:
- - howto for EC2 EMR
  - some longer examples for the tutorial (phirl-naive?)
- - document planner.ship, planner.setReprInverseFun, planner.setSerializer
-
-NOTES - EC2
-
-1) Install AWS CLI - see http://docs.aws.amazon.com/cli/latest/userguide/installing.html
- % curl https://s3.amazonaws.com/aws-cli/awscli-bundle.zip > awscli-bundle.zip
- % unzip awscli-bundle.zip
- % ./awscli-bundle/install -i `pwd`/install
- % export PATH=$PATH:/Users/wcohen/Documents/code/aws-cli/install/bin/
-2) check install with
- % aws --version
-3) get an access key with https://console.aws.amazon.com/iam/home?#security_credential
- and save in environment vars
-
- % aws configure
-AWS Access Key ID [None]: ...
-AWS Secret Access Key [None]: ...
-Default region name [None]: us-east-1
-Default output format [None]: json
-
-4) create a keypair: http://docs.aws.amazon.com/cli/latest/userguide/cli-ec2-keypairs.html
-
- % aws ec2 create-key-pair --key-name MyKeyPair --query 'KeyMaterial' --output text > MyKeyPair.pem
-
-seems like the access key is for interactions with aws, the keypair will be for interactions with
-the clusters you are going to create.
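The CLI steps above (configure credentials, then create a key pair and save the .pem) can also be scripted. A minimal sketch using boto3, which these notes do not use, so the library choice, key name, region, and output path are all illustrative assumptions:

    # minimal sketch, assuming boto3 is installed and `aws configure` was already run;
    # key name, region, and .pem path are illustrative only
    import boto3

    def make_keypair(key_name='MyKeyPair', pem_path='MyKeyPair.pem', region='us-east-1'):
        """Rough equivalent of `aws ec2 create-key-pair --key-name MyKeyPair ... > MyKeyPair.pem`."""
        ec2 = boto3.client('ec2', region_name=region)
        resp = ec2.create_key_pair(KeyName=key_name)
        with open(pem_path, 'w') as f:
            f.write(resp['KeyMaterial'])

    if __name__ == '__main__':
        make_keypair()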
-
-4.5) create a security group:
-
-aws ec2 create-security-group --group-name MySecurityGroup --description "My security group"
-aws ec2 authorize-security-group-ingress --group-name MySecurityGroup --protocol tcp --port 22 --cidr 0.0.0.0/0
-
-5) now use emr subcommand [docs: http://docs.aws.amazon.com/cli/latest/reference/emr/index.html] to
-build and access the cluster
-
- % aws emr create-cluster --ami-version 3.8.0 --instance-groups InstanceGroupType=MASTER,InstanceCount=1,InstanceType=m3.xlarge InstanceGroupType=CORE,InstanceCount=2,InstanceType=m3.xlarge --ec2-attributes KeyName=MyKeyPair --log-uri s3n://wcohen-gpig-log --bootstrap-action Path="s3n://wcohen-gpig-input/emr-bootstrap.sh"
-
- {
-     "ClusterId": "j-1LF855E531Y16"
- }
-
-hint: https://s3.amazonaws.com/bucket-name/path-to-file accesses a file
-
- added tutorial/emr-bootstrap.sh as an action, might modify it to only run on the master, I think I can
- replace the "echo running on master node" with an s3n:// script...but it should have a #!/bin/sh header.
-   --bootstrap-action Path=s3://elasticmapreduce/bootstrap-actions/run-if,Args=["instance.isMaster=true","echo running on master node"]
-
- - should add a name: --name "foo"
-
- % aws emr put --cluster-id j-1LF855E531Y16 --key-pair-file MyKeyPair.pem --src path/to/tutorial.tgz
- % aws emr ssh --cluster-id j-1LF855E531Y16 --key-pair-file MyKeyPair.pem
-
- you're logged in so to run
-
- % unpack the tutorial... TODO: add the hadoop startup and mkdir to emr-bootstrap
- %
- % export GP_STREAMJAR=/home/hadoop/contrib/streaming/hadoop-streaming.jar
- % hadoop jar ~/hadoop-examples.jar pi 10 10000000  #somehow this was needed to set up hdfs:/user/hadoop
- % hadoop fs -mkdir /user/hadoop/gp_views
- % python param-wordcount.py --opts target:hadoop,viewdir:/user/hadoop/gp_views,echo:1 --params corpus:s3%3A//wcohen-gpig-input/corpus.txt --store wc
-
- -- this is where things fail now....
-
---------------------
-
-this is obsolete?
-
-1 follow: /Users/wcohen/Documents/code/elastic-mapreduce-cli
-  installed on eddy in /Users/wcohen/Documents/code/elastic-mapreduce-cli, keypair=wcohen
-  buckets: wcohen-gpig-input, wcohen-gpig-views
-  helpful: https://aws.amazon.com/articles/Elastic-MapReduce/3938
-
-  after installation:
-  $ ./elastic-mapreduce --create --alive --name "Testing streaming -- wcohen" --num-instances 5 --instance-type c1.medium
-  Created job flow j-1F8U85HWYBRBT
-  $ ./elastic-mapreduce --jobflow j-1F8U85HWYBRBT --put gpigtut.tgz
-  $ ./elastic-mapreduce --jobflow j-1F8U85HWYBRBT --ssh
-  # then I could copy data in from s3: by just reading it in a ReadLines view...
-
-  $ ./elastic-mapreduce --set-termination-protection false
-  $ ./elastic-mapreduce --terminate
-
-  # copying: see https://wiki.apache.org/hadoop/AmazonS3
-  % ${HADOOP_HOME}/bin/hadoop distcp hdfs://domU-12-31-33-00-02-DF:9001/user/nutch/0070206153839-1998 s3://123:456@nutch/
-
-  logs are by default on master in /mnt/var/log/
-  see http://docs.aws.amazon.com/ElasticMapReduce/latest/DeveloperGuide/emr-manage-view-web-log-files.html
-
+ - document planner.ship, planner.setEvaluator

 TODO - MAJOR

-- test on EC2
+- a GPig.registerCompiler('key',factoryClass), for adding new targets other than hadoop?
+ - compiler for marime.py map-reducer with ramdisks (note: diskutil erasevolume HFS+ 'RAMDisk' `hdiutil attach -nomount ram://10315776`,
+   size is in 2048-byte blocks)

 - multithreading ideas

@@ -156,32 +57,10 @@ TODO - MAJOR
   K subprocesses, Ri, to run '... | RED > out/shard.k'
  -- or could use threads (subprocesses more modular)
  K threads to print from the shardBuffer to Ri

-- benchmark hadoop stuff
- - working: time python phirl-naive.py --opts viewdir:/user/wcohen/gpig_views,target:hadoop --store flook | tee tmp.log (real 13m44.946s)
- - benchmark vs PIG: pig took 8min, launched 14 jobs; guineapig took 13:45, launched 27 jobs.
- - issues:
-  - problem: hdoop processes don't seem to know where the user home dir is, workaround is a rooted path,
-    but maybe that's ok (TODO: warn if target=hadoop and relative path)
-  - can't run a program in a subdirectory, eg python demo/phirl-naive.py ... TODO: look for guineapig.py on pythonpath.
-    or, maybe figure out how the -file option works better....
-
-- COMBINERS: add combiner as combiningTo=.. option of Group.
-
 - DESCRIBE(...) - could be just a pretty_print?
 - ILLUSTRATE(view,[outputs]) - using definition of view, select the inputs from the innerviews that produce those outputs. Then, do that recursively to get a test case.
-
-TODO - SMALL
-
-- add ReuseView(FILE) and option(location=FILE) so output views can be stored anywhere (eg s3 or s3n)
-- log created views so you can continue with --reuse `cat foo.log|grep ^created|cut -f2`
-- make safer version of 'eval'
-- add --hopts to pass in to hadoop?
-- maybe add --config logging:warn,...
-- clean .gpmo outputs?
-- find guineapig.py on sys.path
-- jobconf mapred.job.name="...."
-- compression jobconf mapred.output.compress=true -jobconf mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCode

diff --git a/guineapig.py b/guineapig.py
index d685d21..4c75460 100644
--- a/guineapig.py
+++ b/guineapig.py
@@ -103,7 +103,7 @@ def onlyRowOf(view):
     @staticmethod

 class SafeEvaluator(object):
-    """Evaluates expressions that correzpond to serialized guinea pig rows."""
+    """Evaluates expressions that correspond to serialized guinea pig rows."""
     def __init__(self,restrictedBindings={}):
         self.restrictedBindings = restrictedBindings
     def eval(self,s):
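The SafeEvaluator hunk above is the hook for the "safer eval" / "make safer version of 'eval'" items in the TODO. A minimal sketch of that idea, not the actual guineapig.py implementation, and only a partial defense rather than a true sandbox: evaluate the serialized row with an empty __builtins__ and only whitelisted bindings in scope.

    class RestrictedEvaluator(object):
        """Evaluate serialized-row expressions with no builtins and only
        whitelisted bindings visible (illustrative class, not guineapig's)."""
        def __init__(self, restrictedBindings=None):
            self.restrictedBindings = restrictedBindings or {}
        def eval(self, s):
            # literals and whitelisted names only; open(), __import__(), etc.
            # are not reachable by name
            return eval(s, {'__builtins__': {}}, self.restrictedBindings)

    # e.g. RestrictedEvaluator().eval("{'word': 'grape', 'n': 3}") returns a dict,
    # while RestrictedEvaluator().eval("open('/etc/passwd')") raises NameError.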