diff --git a/TODO.txt b/TODO.txt
index b76c4b6..286a11b 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -74,28 +74,28 @@
 aws ec2 authorize-security-group-ingress --group-name MySecurityGroup --protocol
 
 5) now use emr subcommand [docs: http://docs.aws.amazon.com/cli/latest/reference/emr/index.html] to build and access the cluster
 
- % aws emr create-cluster --ami-version 3.8.0 --instance-groups InstanceGroupType=MASTER,InstanceCount=1,InstanceType=m3.xlarge InstanceGroupType=CORE,InstanceCount=2,InstanceType=m3.xlarge --ec2-attributes KeyName=MyKeyPair --log-uri s3n://wcohen-gpig-log
+ % aws emr create-cluster --ami-version 3.8.0 --instance-groups InstanceGroupType=MASTER,InstanceCount=1,InstanceType=m3.xlarge InstanceGroupType=CORE,InstanceCount=2,InstanceType=m3.xlarge --ec2-attributes KeyName=MyKeyPair --log-uri s3n://wcohen-gpig-log --bootstrap-action Path="s3n://wcohen-gpig-input/emr-bootstrap.sh"
 
   {
       "ClusterId": "j-1LF855E531Y16"
   }
 
+hint: https://s3.amazonaws.com/bucket-name/path-to-file accesses a file
+
+ added tutorial/emr-bootstrap.sh as a bootstrap action; might modify it to run only on the master.
+ I think I can replace the "echo running on master node" below with an s3n:// script...but it should have a #!/bin/sh header.
+ --bootstrap-action Path=s3://elasticmapreduce/bootstrap-actions/run-if,Args=["instance.isMaster=true","echo running on master node"]
+
+ should add a name: --name "foo"
 
- should add a bootstrap action script: --bootstrap-action Path="s3://wcohen-.../foo.sh" to pull in gpig and notify me with
-   mkdir gpig
-   cd gpig
-   wget http://www.cs.cmu.edu/~wcohen/10-605/gpigtut.tgz
-   tar -xzf gpigtut.tgz
-   echo the cluster is ready now | mail -s test wcohen@gmail.com
-
 wait a bit then:
 
 % aws emr put --cluster-id j-1LF855E531Y16 --key-pair-file MyKeyPair.pem --src path/to/tutorial.tgz
 % aws emr ssh --cluster-id j-1LF855E531Y16 --key-pair-file MyKeyPair.pem
 
- you're logged in so to run
+ you're logged in, so to run:
 
- % unpack the tutorial...
+ % unpack the tutorial...   TODO: add the hadoop startup and mkdir to emr-bootstrap
 %
 % export GP_STREAMJAR=/home/hadoop/contrib/streaming/hadoop-streaming.jar
 % hadoop jar ~/hadoop-examples.jar pi 10 10000000   #somehow this was needed to set up hdfs:/user/hadoop
@@ -126,6 +126,8 @@ this is obsolete?
 # copying: see https://wiki.apache.org/hadoop/AmazonS3
 % ${HADOOP_HOME}/bin/hadoop distcp hdfs://domU-12-31-33-00-02-DF:9001/user/nutch/0070206153839-1998 s3://123:456@nutch/
 
+ logs are by default on the master in /mnt/var/log/ - see http://docs.aws.amazon.com/ElasticMapReduce/latest/DeveloperGuide/emr-manage-view-web-log-files.html
+
 TODO - MAJOR
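note: for the --bootstrap-action Path above to work, the script first has to be
uploaded to S3.  Assuming the wcohen-gpig-input bucket used above already exists,
something like this (via the aws s3 subcommand) should do it - the key name is
just the one the TODO uses:

 % aws s3 cp emr-extras/emr-bootstrap-sample.sh s3://wcohen-gpig-input/emr-bootstrap.sh
 % aws s3 ls s3://wcohen-gpig-input/
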
diff --git a/emr-extras/emr-bootstrap-sample.sh b/emr-extras/emr-bootstrap-sample.sh
new file mode 100644
index 0000000..fe8e1de
--- /dev/null
+++ b/emr-extras/emr-bootstrap-sample.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+MY_EMAIL=somebody@an.email.address
+#only run the setup on the master node: instance.json has "isMaster": true there
+stat=`grep isMaster /mnt/var/lib/info/instance.json | cut -d: -f2 | grep true`
+if [ "$stat" != "" ]; then
+    #get the code and unpack it
+    wget http://www.cs.cmu.edu/~wcohen/10-605/gpigtut.tgz
+    tar -xzf gpigtut.tgz
+    #this is needed to initialize HDFS
+    hadoop jar ~/hadoop-examples.jar pi 10 10000000 >& pi-example.log
+    #create the default directory for Guinea Pig views on HDFS
+    hadoop fs -mkdir /user/hadoop/gp_views
+    ########################################
+    #if you want, uncomment this line to get an email notification -
+    #after defining your own email address above
+    #echo the cluster is ready now - ssh in and cd to tutorial | mail -s 'cluster is now up' $MY_EMAIL
+fi
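note: before uploading the bootstrap script it's cheap to syntax-check it
locally, e.g.

 % bash -n emr-extras/emr-bootstrap-sample.sh

Once the cluster is up, the script's output should land under /mnt/var/log/ on
the master (see the log-files doc linked in the TODO above); I believe there is
a bootstrap-actions subdirectory there, but check that doc if not.
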
diff --git a/emr-extras/emr-create-cluster-sample.sh b/emr-extras/emr-create-cluster-sample.sh
new file mode 100644
index 0000000..1222a5b
--- /dev/null
+++ b/emr-extras/emr-create-cluster-sample.sh
@@ -0,0 +1,15 @@
+VERSION=3.8.0
+MASTER=m3.xlarge
+WORKER=m3.xlarge
+NWORKERS=2
+BOOTSTRAP_SCRIPT=s3n://wcohen-gpig-input/emr-bootstrap.sh
+aws emr create-cluster \
+    --ami-version $VERSION \
+    --instance-groups InstanceGroupType=MASTER,InstanceCount=1,InstanceType=$MASTER \
+                      InstanceGroupType=CORE,InstanceCount=$NWORKERS,InstanceType=$WORKER \
+    --ec2-attributes KeyName=MyKeyPair \
+    --log-uri s3n://wcohen-gpig-log \
+    --bootstrap-action Path=$BOOTSTRAP_SCRIPT \
+    | tee emr-cluster-id.txt
+echo saved in emr-cluster-id.txt
+
diff --git a/emr-extras/emr-howto.txt b/emr-extras/emr-howto.txt
new file mode 100644
index 0000000..292ae1e
--- /dev/null
+++ b/emr-extras/emr-howto.txt
@@ -0,0 +1,89 @@
+EMR (Elastic MapReduce) is a popular cloud processing service from
+Amazon that includes Hadoop.  Running Guinea Pig on EMR is easy
+enough, but there are lots of steps.  This is a walkthrough.
+
+GENERIC INSTRUCTIONS FOR EMR:
+
+1) First you need to get an Amazon AWS account.  If you have an
+Amazon account, you can just use that password to log into AWS at
+https://console.aws.amazon.com.
+
+2) Install the tools.  You need to establish credentials to use EC2,
+the "Elastic Compute Cloud" service that includes EMR, and then use
+EC2 to launch new virtual clusters in EMR.  I use a command-line
+program (aka a "CLI") to do this.  So first, install that program,
+the AWS CLI.  The details are in
+
+http://docs.aws.amazon.com/cli/latest/userguide/installing.html
+
+but briefly, go to a convenient directory, say ~/code/aws-cli, and type
+
+ % curl https://s3.amazonaws.com/aws-cli/awscli-bundle.zip > awscli-bundle.zip
+ % unzip awscli-bundle.zip
+ % ./awscli-bundle/install -i `pwd`/install
+ % export PATH=$PATH:~/code/aws-cli/install/bin/
+
+To test, type 'aws --version' at the command prompt.
+
+3) Get your access codes.  An "access key" is a pair of codes, one
+private and one public, that is used to interact with the AWS CLI
+tool.  Follow the directions here and save the result somewhere safe:
+
+https://console.aws.amazon.com/iam/home?#security_credential
+
+4) Tell the AWS CLI about your access codes.  The command for this is
+'aws configure': you'll be asked for your codes and some other info.
+I used these settings:
+
+ % aws configure
+ AWS Access Key ID [None]: ...
+ AWS Secret Access Key [None]: ...
+ Default region name [None]: us-east-1
+ Default output format [None]: json
+
+This info is stored somewhere under your home directory.
+
+5) Create a key pair.  You'd think one set of codes would be enough,
+but you're not done yet: you need another set of public/private
+codes, called a "keypair", to interact with the clusters you create.
+The details are at
+http://docs.aws.amazon.com/cli/latest/userguide/cli-ec2-keypairs.html
+but the quick version is to use these commands (the second keeps the
+private key readable only by you):
+
+ % aws ec2 create-key-pair --key-name MyKeyPair --query 'KeyMaterial' --output text > MyKeyPair.pem
+ % chmod 600 MyKeyPair.pem
+
+6) Create a security group.  This one lets any IP address try to ssh
+into your cluster (but I believe they still need the keypair you
+supply at creation time to get in).  You can specify a range of IPs
+if you want.
+
+ % aws ec2 create-security-group --group-name MySecurityGroup --description "My security group"
+ % aws ec2 authorize-security-group-ingress --group-name MySecurityGroup --protocol tcp --port 22 --cidr 0.0.0.0/0
+
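+(Optional) To sanity-check steps 5 and 6, the matching describe
+subcommands will show what you just created without changing anything:
+
+ % aws ec2 describe-key-pairs --key-names MyKeyPair
+ % aws ec2 describe-security-groups --group-names MySecurityGroup
+
+Each should print a short JSON description of the keypair or group.
+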
+7) Create a cluster.  You only need to do steps 1-6 once (for each
+machine you want to work from, anyway); after that, you can create a
+cluster with just one more command.  The command is very customizable,
+but one that works is:
+
+ % aws emr create-cluster --ami-version 3.8.0 --ec2-attributes KeyName=MyKeyPair \
+     --instance-groups InstanceGroupType=MASTER,InstanceCount=1,InstanceType=m3.xlarge InstanceGroupType=CORE,InstanceCount=2,InstanceType=m3.xlarge
+
+The instance-groups arguments define the cluster you want - this one
+is tiny, with three nodes.  The KeyName, which should be the name of
+the keypair you created in step 5, is how the new cluster will know
+whether or not to let you in.  This will output something like:
+
+  {
+      "ClusterId": "j-JEX5UT60ELD5"
+  }
+
+which is the id of the cluster.  It will take some time (10 min?) to
+start up, and then you can log into the master using your keypair:
+
+ % aws emr ssh --cluster-id j-JEX5UT60ELD5 --key-pair-file MyKeyPair.pem
+
+8) Use your cluster and then - when you are all done - TERMINATE IT.
+The meter keeps running until you do!
+
+INSTRUCTIONS FOR GUINEA PIG:
+
diff --git a/tutorial/Makefile b/tutorial/Makefile
index 47df754..e888456 100644
--- a/tutorial/Makefile
+++ b/tutorial/Makefile
@@ -1,14 +1,16 @@
 update:
+	cp ../emr-extras/*-sample.sh .
 	cp ../guineapig.py .
 
 clean:
 	rm -rf gpig_views
 	rm -f total.gp
-	rm *.pyc
+	rm -f *.pyc
 
 tar: update
 	echo created on `date` > marker.txt
-	tar -cvzf tutorial.tgz marker.txt guineapig.py *corpus.txt id-parks.txt *.py phirl-naive.pig
+	(cd ..; tar -cvzf tutorial.tgz tutorial/marker.txt tutorial/guineapig.py tutorial/*corpus.txt tutorial/id-parks.txt tutorial/*.py)
+	mv ../tutorial.tgz .
 
 upload: tar
 	scp tutorial.tgz raff.ml.cmu.edu:~/afs-home/www/10-605/gpigtut.tgz
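note on step 8 of emr-howto.txt: terminating can be done from the CLI too.
Something like this should work, using the cluster id that create-cluster
printed:

 % aws emr terminate-clusters --cluster-ids j-JEX5UT60ELD5
 % aws emr list-clusters --active     #check that nothing is left running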