docs and emr-extras

TeamCohen · Jul 20, 2015 · 9e720f3 · 9e720f3
1 parent d1cfc77
commit 9e720f3
Show file tree

Hide file tree

Showing 7 changed files with 168 additions and 12 deletions.
diff --git a/TODO.txt b/TODO.txt
@@ -74,28 +74,28 @@ aws ec2 authorize-security-group-ingress --group-name MySecurityGroup --protocol
 5) now use emr subcommand [docs: http://docs.aws.amazon.com/cli/latest/reference/emr/index.html] to
 build and access the cluster
 
- % aws emr create-cluster --ami-version 3.8.0 --instance-groups InstanceGroupType=MASTER,InstanceCount=1,InstanceType=m3.xlarge InstanceGroupType=CORE,InstanceCount=2,InstanceType=m3.xlarge --ec2-attributes KeyName=MyKeyPair --log-uri s3n://wcohen-gpig-log
+ % aws emr create-cluster --ami-version 3.8.0 --instance-groups InstanceGroupType=MASTER,InstanceCount=1,InstanceType=m3.xlarge InstanceGroupType=CORE,InstanceCount=2,InstanceType=m3.xlarge --ec2-attributes KeyName=MyKeyPair --log-uri s3n://wcohen-gpig-log --bootstrap-action Path="s3n://wcohen-gpig-input/emr-bootstrap.sh"
 
  {
     "ClusterId": "j-1LF855E531Y16"
  }
 
+hint: https://s3.amazonaws.com/bucket-name/path-to-file accesses a file
+
+ added tutorial/emr-bootstrap.sh as an action, might modify it to only run on the master, I think I can
+ replace the "echo running on master node" with an s3n:// script...but it should have a #!/bin/sh header.
+ --bootstrap-action Path=s3://elasticmapreduce/bootstrap-actions/run-if,Args=["instance.isMaster=true","echo running on master node"]
+
+
  should add a name: --name "foo"
- should add a bootstrap action script: --bootstrap-action Path="s3://wcohen-.../foo.sh" to pull in gpig and notify me with
- mkdir gpig
- cd gpig
- wget http://www.cs.cmu.edu/~wcohen/10-605/gpigtut.tgz
- tar -xzf gpigtut.tgz
- echo the cluster is ready now | mail -s test [email protected]
 
- wait a bit then:
 
  % aws emr put --cluster-id j-1LF855E531Y16 --key-pair-file MyKeyPair.pem --src path/to/tutorial.tgz 
  % aws emr ssh --cluster-id j-1LF855E531Y16 --key-pair-file MyKeyPair.pem 
 
- you're logged in so to run
+ you're logged in so to run 
 
- % unpack the tutorial...
+ % unpack the tutorial... TODO: add the hadoop startup and mkdir to emr-bootstrap
  % 
  % export GP_STREAMJAR=/home/hadoop/contrib/streaming/hadoop-streaming.jar
  % hadoop jar ~/hadoop-examples.jar pi 10 10000000  #somehow this was needed to set up hdfs:/user/hadoop
@@ -126,6 +126,8 @@ this is obsolete?
   # copying: see https://wiki.apache.org/hadoop/AmazonS3
   % ${HADOOP_HOME}/bin/hadoop distcp hdfs://domU-12-31-33-00-02-DF:9001/user/nutch/0070206153839-1998 s3://123:456@nutch/
 
+  logs are by default on master in /mnt/var/log/ - see http://docs.aws.amazon.com/ElasticMapReduce/latest/DeveloperGuide/emr-manage-view-web-log-files.html
+
 
 TODO - MAJOR
 

diff --git a/emr-extras/emr-bootstrap-sample.sh b/emr-extras/emr-bootstrap-sample.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# EMR bootstrap action: fetch the Guinea Pig tutorial and prepare HDFS, on the master node only.
+MY_EMAIL=you@example.com
+# isMaster is a true/false setting (cf. run-if instance.isMaster=true), so test its value, not its presence
+if grep isMaster /mnt/var/lib/info/instance.json | grep -q true; then
+    #get the code and unpack it
+    wget http://www.cs.cmu.edu/~wcohen/10-605/gpigtut.tgz
+    tar -xzf gpigtut.tgz
+    #this is needed to initialize the HDFS
+    hadoop jar ~/hadoop-examples.jar pi 10 10000000 >& pi-example.log
+    #create the default HDFS directory for Guinea Pig on HDFS
+    hadoop fs -mkdir /user/hadoop/gp_views
+    #if you want, uncomment the next line to get an email notification -
+    #after replacing the placeholder MY_EMAIL above with your own address
+    #echo the cluster is ready now - ssh in and cd to tutorial | mail -s 'cluster is now up' $MY_EMAIL
+fi
diff --git a/emr-extras/emr-bootstrap-sample.sh~ b/emr-extras/emr-bootstrap-sample.sh~
@@ -0,0 +1,16 @@
+#!/bin/bash
+# EMR bootstrap action: fetch the Guinea Pig tutorial and prepare HDFS, on the master node only.
+MY_EMAIL=you@example.com
+# isMaster is a true/false setting (cf. run-if instance.isMaster=true), so test its value, not its presence
+if grep isMaster /mnt/var/lib/info/instance.json | grep -q true; then
+    #get the code and unpack it
+    wget http://www.cs.cmu.edu/~wcohen/10-605/gpigtut.tgz
+    tar -xzf gpigtut.tgz
+    #this is needed to initialize the HDFS
+    hadoop jar ~/hadoop-examples.jar pi 10 10000000 >& pi-example.log
+    #create the default HDFS directory for Guinea Pig on HDFS
+    hadoop fs -mkdir /user/hadoop/gp_views
+    #if you want, uncomment the next line to get an email notification -
+    #after replacing the placeholder MY_EMAIL above with your own address
+    #echo the cluster is ready now - ssh in and cd to tutorial | mail -s 'cluster is now up' $MY_EMAIL
+fi
diff --git a/emr-extras/emr-create-cluster-sample.sh b/emr-extras/emr-create-cluster-sample.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Sample launcher: create a small EMR cluster and save the CLI's JSON reply (it holds the ClusterId).
+VERSION=3.8.0
+MASTER=m3.xlarge
+WORKER=m3.xlarge
+NWORKERS=2
+BOOTSTRAP_SCRIPT=s3n://wcohen-gpig-input/emr-bootstrap.sh
+aws emr create-cluster \
+    --ami-version $VERSION \
+    --instance-groups InstanceGroupType=MASTER,InstanceCount=1,InstanceType=$MASTER \
+                      InstanceGroupType=CORE,InstanceCount=$NWORKERS,InstanceType=$WORKER \
+    --ec2-attributes KeyName=MyKeyPair \
+    --log-uri s3n://wcohen-gpig-log \
+    --bootstrap-action Path=$BOOTSTRAP_SCRIPT \
+ | tee emr-cluster-id.txt
+echo saved in emr-cluster-id.txt
diff --git a/emr-extras/emr-create-cluster-sample.sh~ b/emr-extras/emr-create-cluster-sample.sh~
@@ -0,0 +1,15 @@
+# Sample launcher: starts a small EMR cluster and records the CLI's JSON reply.
+AMI_VERSION=3.8.0
+MASTER_TYPE=m3.xlarge
+WORKER_TYPE=m3.xlarge
+WORKER_COUNT=2
+BOOTSTRAP_URI=s3n://wcohen-gpig-input/emr-bootstrap.sh
+MASTER_GROUP="InstanceGroupType=MASTER,InstanceCount=1,InstanceType=${MASTER_TYPE}"
+CORE_GROUP="InstanceGroupType=CORE,InstanceCount=${WORKER_COUNT},InstanceType=${WORKER_TYPE}"
+# tee shows the reply on the terminal and keeps a copy in emr-cluster-id.txt
+aws emr create-cluster --ami-version "${AMI_VERSION}" \
+    --instance-groups "${MASTER_GROUP}" "${CORE_GROUP}" \
+    --ec2-attributes KeyName=MyKeyPair \
+    --log-uri s3n://wcohen-gpig-log \
+    --bootstrap-action Path="${BOOTSTRAP_URI}" | tee emr-cluster-id.txt
+echo saved in emr-cluster-id.txt
diff --git a/emr-extras/emr-howto.txt b/emr-extras/emr-howto.txt
@@ -0,0 +1,89 @@
+EMR (Elastic MapReduce) is a popular cloud processing service from
+Amazon that includes Hadoop.  Running Guinea Pig on EMR is easy
+enough, but there are lots of steps.  This is a walkthrough.
+
+GENERIC INSTRUCTIONS FOR EMR:
+
+1) First you need to get an Amazon AWS account.  If you have an Amazon
+account, you can just use that password to log into AWS at
+https://console.aws.amazon.com.
+
+2) Install the tools: You need to establish the credentials you need
+to use EC2, the "Elastic Compute Cloud" service that includes EMR, and also
+use EC2 to launch new virtual clusters in EMR.  I use a command-line
+program (aka a "CLI") to do this.  So first, install that program, the
+AWS CLI.  The details are in
+
+http://docs.aws.amazon.com/cli/latest/userguide/installing.html
+
+but briefly, go to a convenient directory, say ~/code/aws-cli, and type
+
+ % curl https://s3.amazonaws.com/aws-cli/awscli-bundle.zip > awscli-bundle.zip
+ % unzip awscli-bundle.zip
+ % ./awscli-bundle/install -i `pwd`/install
+ % export PATH=$PATH:~/code/aws-cli/install/bin/
+
+To test, type 'aws --version' at the command prompt.
+
+3) Get your access codes.  An "access key" is a set of codes, one
+private, and one public, that are used to interact with the AWS CLI
+tool.  Follow the directions here, and save the result somewhere.
+
+https://console.aws.amazon.com/iam/home?#security_credential
+
+4) Tell the AWS CLI about your access codes.  The command for this is
+'aws configure': you'll be asked for your codes and some other info,
+and I used these:
+
+ % aws configure 
+ AWS Access Key ID [None]: ...
+ AWS Secret Access Key [None]:  ...
+ Default region name [None]: us-east-1
+ Default output format [None]: json
+
+This info is stored somewhere off your home directory.
+
+5) Create a key-pair. You'd think one set of codes would be enough,
+but you're not done yet; you need another set of public/private codes
+called a "keypair" to interact with the clusters you create.  The
+details are at http://docs.aws.amazon.com/cli/latest/userguide/cli-ec2-keypairs.html
+but the quick version is to use these commands (the second keeps the keys secret).
+
+ % aws ec2 create-key-pair --key-name MyKeyPair --query 'KeyMaterial' --output text > MyKeyPair.pem
+ % chmod 600 MyKeyPair.pem
+
+6) Create a security group.  This one will let any IP address try to ssh
+into your cluster (but I believe they need the keypair you use at
+creation time to be successful).  You can specify a range of IPs if
+you want.
+
+ % aws ec2 create-security-group --group-name MySecurityGroup --description "My security group"
+ % aws ec2 authorize-security-group-ingress --group-name MySecurityGroup --protocol tcp --port 22 --cidr 0.0.0.0/0
+
+7) Create a cluster.  You only need to do steps 1-6 once (for each
+machine you want to work from anyway) and after that, you can create a
+cluster with just one more command.  This command is very customizable but
+one that works would be
+
+ % aws emr create-cluster --ami-version 3.8.0  --ec2-attributes KeyName=MyKeyPair \
+   --instance-groups InstanceGroupType=MASTER,InstanceCount=1,InstanceType=m3.xlarge InstanceGroupType=CORE,InstanceCount=2,InstanceType=m3.xlarge
+
+The instance-groups stuff defines the cluster you want - this one is
+tiny, with three nodes.  The KeyName, which should have the name of
+the keypair you created in step 5, is how the new cluster will know
+whether or not to let you in.  This will output something like:
+
+  {
+      "ClusterId": "j-JEX5UT60ELD5"
+  }
+
+which is the name of the cluster.  It will take some time (10min?) to
+start up, and then you can log into the master using your keypair:
+
+ % aws emr ssh --cluster-id j-JEX5UT60ELD5 --key-pair-file MyKeyPair.pem 
+
+8) Use your cluster and then - when you are all done - TERMINATE IT.
+The meter keeps running until you do!
+
+INSTRUCTIONS FOR GUINEA PIG:
+
diff --git a/tutorial/Makefile b/tutorial/Makefile
@@ -1,14 +1,22 @@
+# These targets are commands, not files this Makefile produces.
+.PHONY: update clean tar upload
+
+# Refresh the local copies of the EMR sample scripts and guineapig.py.
 update:
+	cp ../emr-extras/*-sample.sh .
 	cp ../guineapig.py  .
 
 clean:
 	rm -rf gpig_views
 	rm -f total.gp
-	rm *.pyc
+	rm -f *.pyc
 
+# Archive from the parent directory so entries are stored under tutorial/;
+# '&&' keeps tar from running in the wrong directory if the cd fails.
 tar: update
 	echo created on `date` > marker.txt
-	tar -cvzf tutorial.tgz marker.txt guineapig.py *corpus.txt id-parks.txt *.py phirl-naive.pig
+	(cd .. && tar -cvzf tutorial.tgz tutorial/marker.txt tutorial/guineapig.py tutorial/*corpus.txt tutorial/id-parks.txt tutorial/*.py)
+	mv ../tutorial.tgz .
 
 upload: tar
 	scp tutorial.tgz raff.ml.cmu.edu:~/afs-home/www/10-605/gpigtut.tgz