
Commit a6e444c

feature: image based on emr 6.1.0 (#34)
* add support for emr 6.1.0
* add more comments and avoid unnecessary calls to s3
1 parent e3dc967 commit a6e444c

19 files changed: +399 −31 lines changed


Makefile

Lines changed: 9 additions & 8 deletions
@@ -7,10 +7,10 @@ SHELL := /bin/sh
 
 # Set variables if testing locally
 ifeq ($(IS_RELEASE_BUILD),)
-SPARK_VERSION := 2.4
+SPARK_VERSION := 3.0
 PROCESSOR := cpu
 FRAMEWORK_VERSION := py37
-SM_VERSION := 0.1
+SM_VERSION := 1.0
 USE_CASE := processing
 BUILD_CONTEXT := ./spark/${USE_CASE}/${SPARK_VERSION}/py3
 AWS_PARTITION := aws
@@ -84,18 +84,19 @@ test-sagemaker: install-sdk build-tests
 	# History server tests can't run in parallel since they use the same container name.
 	pytest -s -vv test/integration/history \
 		--repo=$(DEST_REPO) --tag=$(VERSION) --durations=0 \
-		--spark-version=$(SPARK_VERSION)
-		--framework_version=$(FRAMEWORK_VERSION) \
+		--spark-version=$(SPARK_VERSION) \
+		--framework-version=$(FRAMEWORK_VERSION) \
 		--role $(ROLE) \
 		--image_uri $(IMAGE_URI) \
 		--region ${REGION} \
 		--domain ${AWS_DOMAIN}
 	# OBJC_DISABLE_INITIALIZE_FORK_SAFETY: https://github.com/ansible/ansible/issues/32499#issuecomment-341578864
 	OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES pytest --workers auto -s -vv test/integration/sagemaker \
 		--repo=$(DEST_REPO) --tag=$(VERSION) --durations=0 \
-		--spark-version=$(SPARK_VERSION)
-		--framework_version=$(FRAMEWORK_VERSION) \
+		--spark-version=$(SPARK_VERSION) \
+		--framework-version=$(FRAMEWORK_VERSION) \
 		--role $(ROLE) \
+		--account-id ${INTEG_TEST_ACCOUNT} \
 		--image_uri $(IMAGE_URI) \
 		--region ${REGION} \
 		--domain ${AWS_DOMAIN}
@@ -104,8 +105,8 @@ test-sagemaker: install-sdk build-tests
 test-prod:
 	pytest -s -vv test/integration/tag \
 		--repo=$(DEST_REPO) --tag=$(VERSION) --durations=0 \
-		--spark-version=$(SPARK_VERSION)
-		--framework_version=$(FRAMEWORK_VERSION) \
+		--spark-version=$(SPARK_VERSION) \
+		--framework-version=$(FRAMEWORK_VERSION) \
 		--role $(ROLE) \
 		--image_uri $(IMAGE_URI) \
 		--region ${REGION} \
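The Makefile fix here is easy to miss: each old `--spark-version=$(SPARK_VERSION)` line lacked a trailing backslash, so the shell ended the pytest command there and tried to run the remaining flags as a separate command; the change also renames `--framework_version` to the hyphenated `--framework-version` expected by the tests. A minimal sketch of that shell behavior (illustrative only):

    # With a trailing backslash the next line continues the command:
    printf '%s %s\n' one \
    two        # prints "one two"

    # Without it, the next line runs as its own command and fails:
    printf '%s %s\n' three
    four       # sh: four: command not found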

new_images.yml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 ---
 new_images:
-  - spark: "2.4.4"
+  - spark: "3.0.0"
     use-case: "processing"
     processors: ["cpu"]
     python: ["py37"]

setup.py

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@
     ],
     setup_requires=["setuptools", "wheel"],
     # Be frugal when adding dependencies. Prefer Python's standard library.
-    install_requires = install_reqs,
+    install_requires=install_reqs,
 
     extras_require={
         "test": test_install_reqs,
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+echo "Not implemented"
Lines changed: 100 additions & 0 deletions
@@ -0,0 +1,100 @@
+FROM amazonlinux:2
+ARG REGION
+ENV AWS_REGION ${REGION}
+RUN yum clean all
+RUN yum update -y
+RUN yum install -y awscli bigtop-utils curl gcc gzip unzip python3 python3-setuptools python3-pip python-devel python3-devel python-psutil gunzip tar wget liblapack* libblas* libopencv* libopenblas*
+
+# Install nginx. amazonlinux:2.0.20200304.0 does not ship nginx, so install epel-release first.
+RUN wget https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
+RUN yum install -y epel-release-latest-7.noarch.rpm
+RUN yum install -y nginx
+
+RUN rm -rf /var/cache/yum
+
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+# http://blog.stuart.axelbrooke.com/python-3-on-spark-return-of-the-pythonhashseed
+ENV PYTHONHASHSEED 0
+ENV PYTHONIOENCODING UTF-8
+ENV PIP_DISABLE_PIP_VERSION_CHECK 1
+
+# Install EMR Spark/Hadoop
+ENV HADOOP_HOME /usr/lib/hadoop
+ENV HADOOP_CONF_DIR /usr/lib/hadoop/etc/hadoop
+ENV SPARK_HOME /usr/lib/spark
+
+COPY yum/emr-apps.repo /etc/yum.repos.d/emr-apps.repo
+
+# Install Hadoop/Spark dependencies from EMR's yum repository for Spark optimizations.
+# Replace the placeholder with the region in the repository URL.
+RUN sed -i "s/REGION/${AWS_REGION}/g" /etc/yum.repos.d/emr-apps.repo
+
+# These packages are a subset of what EMR installs in a cluster with the
+# "hadoop", "spark", and "hive" applications.
+# They include EMR-optimized libraries and extras.
+RUN yum install -y aws-hm-client \
+    aws-java-sdk \
+    aws-sagemaker-spark-sdk \
+    emr-goodies \
+    emr-scripts \
+    emr-s3-select \
+    emrfs \
+    hadoop \
+    hadoop-client \
+    hadoop-hdfs \
+    hadoop-hdfs-datanode \
+    hadoop-hdfs-namenode \
+    hadoop-httpfs \
+    hadoop-kms \
+    hadoop-lzo \
+    hadoop-yarn \
+    hadoop-yarn-nodemanager \
+    hadoop-yarn-proxyserver \
+    hadoop-yarn-resourcemanager \
+    hadoop-yarn-timelineserver \
+    hive \
+    hive-hcatalog \
+    hive-hcatalog-server \
+    hive-jdbc \
+    hive-server2 \
+    python37-numpy \
+    python37-sagemaker_pyspark \
+    s3-dist-cp \
+    spark-core \
+    spark-datanucleus \
+    spark-external \
+    spark-history-server \
+    spark-python
+
+
+# Point Spark at the proper python binary
+ENV PYSPARK_PYTHON=/usr/bin/python3
+
+# Set up Spark/YARN/HDFS to run as root
+ENV PATH="/usr/bin:/opt/program:${PATH}"
+ENV YARN_RESOURCEMANAGER_USER="root"
+ENV YARN_NODEMANAGER_USER="root"
+ENV HDFS_NAMENODE_USER="root"
+ENV HDFS_DATANODE_USER="root"
+ENV HDFS_SECONDARYNAMENODE_USER="root"
+
+# Set up the bootstrapping program and Spark configuration
+COPY *.whl /opt/program/
+RUN /usr/bin/python3 -m pip install /opt/program/*.whl
+COPY hadoop-config /opt/hadoop-config
+COPY nginx-config /opt/nginx-config
+COPY aws-config /opt/aws-config
+
+# Set up the container bootstrapper
+COPY container-bootstrap-config /opt/container-bootstrap-config
+RUN chmod +x /opt/container-bootstrap-config/bootstrap.sh
+RUN /opt/container-bootstrap-config/bootstrap.sh
+
+# With this config, the Spark history server does not daemonize; otherwise no
+# foreground process would remain and the container would terminate immediately.
+ENV SPARK_NO_DAEMONIZE TRUE
+
+WORKDIR $SPARK_HOME
+
+ENTRYPOINT ["smspark-submit"]
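The REGION build argument feeds the `sed` call above, which pins the EMR package repository to a concrete region before `yum install` runs. A hedged local-build sketch; the tag and region below are assumptions, not values from this commit:

    docker build \
        --build-arg REGION=us-west-2 \
        -t sagemaker-spark:3.0-cpu-py37 \
        .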
Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!-- Put site-specific property overrides in this file. -->
+
+<configuration>
+    <property>
+        <name>fs.defaultFS</name>
+        <value>hdfs://nn_uri/</value>
+        <description>NameNode URI</description>
+    </property>
+    <property>
+        <name>fs.s3a.aws.credentials.provider</name>
+        <value>com.amazonaws.auth.DefaultAWSCredentialsProviderChain</value>
+        <description>AWS S3 credential provider</description>
+    </property>
+    <property>
+        <name>fs.s3.impl</name>
+        <value>org.apache.hadoop.fs.s3a.S3AFileSystem</value>
+        <description>s3a filesystem implementation</description>
+    </property>
+    <property>
+        <name>fs.AbstractFileSystem.s3a.impl</name>
+        <value>org.apache.hadoop.fs.s3a.S3A</value>
+        <description>s3a AbstractFileSystem implementation</description>
+    </property>
+</configuration>
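The net effect of these properties is that both `s3://` and `s3a://` URIs go through the S3A connector using the default AWS credential provider chain. A quick sanity check from inside the container (the bucket name is a placeholder):

    hadoop fs -ls s3://my-example-bucket/input/    # routed to S3AFileSystem via fs.s3.impl
    hadoop fs -ls s3a://my-example-bucket/input/   # native s3a scheme, same connector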
Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!-- Put site-specific property overrides in this file. -->
+
+<configuration>
+    <property>
+        <name>dfs.datanode.data.dir</name>
+        <value>file:///opt/amazon/hadoop/hdfs/datanode</value>
+        <description>Comma-separated list of paths on the local filesystem of a DataNode where it should store its blocks.</description>
+    </property>
+
+    <property>
+        <name>dfs.namenode.name.dir</name>
+        <value>file:///opt/amazon/hadoop/hdfs/namenode</value>
+        <description>Path on the local filesystem where the NameNode stores the namespace and transaction logs persistently.</description>
+    </property>
+</configuration>
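Before the HDFS daemons can use these paths, the directories must exist and the NameNode metadata directory must be formatted. A sketch of that one-time setup, assuming the container bootstrap does something equivalent (the exact steps are not shown in this diff):

    mkdir -p /opt/amazon/hadoop/hdfs/datanode /opt/amazon/hadoop/hdfs/namenode
    hdfs namenode -format -force    # initializes dfs.namenode.name.dir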
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+spark.driver.extraClassPath /usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar
+spark.driver.extraLibraryPath /usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native
+spark.executor.extraClassPath /usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar
+spark.executor.extraLibraryPath /usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native
+spark.driver.host=sd_host
+spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version=2
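`sd_host` is a placeholder, like `nn_uri` above and the `rm_hostname`/`nm_hostname` values in the YARN config below; presumably the container bootstrap rewrites these at startup, in the same spirit as the REGION `sed` in the Dockerfile. A hypothetical substitution (the conf path follows from `SPARK_HOME=/usr/lib/spark`; the actual mechanism is not part of this diff):

    sed -i "s/sd_host/$(hostname)/g" /usr/lib/spark/conf/spark-defaults.conf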
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+# EMPTY FILE TO AVOID OVERRIDING ENV VARS
+# Specifically, without copying the empty file, SPARK_HISTORY_OPTS will be overridden,
+# spark.history.ui.port defaults to 18082, and spark.eventLog.dir defaults to the local fs
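The empty file matters because a populated `spark-env.sh` would clobber `SPARK_HISTORY_OPTS` set in the container's environment. For reference, history-server settings are typically passed through that variable like this (the port and log directory below are illustrative, not from this commit):

    export SPARK_HISTORY_OPTS="-Dspark.history.ui.port=18080 -Dspark.history.fs.logDirectory=s3a://my-bucket/spark-events"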
Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
+<?xml version="1.0"?>
+<!-- Site-specific YARN configuration properties -->
+<configuration>
+    <property>
+        <name>yarn.resourcemanager.hostname</name>
+        <value>rm_hostname</value>
+        <description>The hostname of the RM.</description>
+    </property>
+    <property>
+        <name>yarn.nodemanager.hostname</name>
+        <value>nm_hostname</value>
+        <description>The hostname of the NM.</description>
+    </property>
+    <property>
+        <name>yarn.nodemanager.webapp.address</name>
+        <value>nm_webapp_address</value>
+    </property>
+    <property>
+        <name>yarn.nodemanager.vmem-pmem-ratio</name>
+        <value>5</value>
+        <description>Ratio of virtual memory to physical memory.</description>
+    </property>
+    <property>
+        <name>yarn.resourcemanager.am.max-attempts</name>
+        <value>1</value>
+        <description>The maximum number of application attempts.</description>
+    </property>
+    <property>
+        <name>yarn.nodemanager.env-whitelist</name>
+        <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,YARN_HOME,AWS_CONTAINER_CREDENTIALS_RELATIVE_URI</value>
+        <description>Environment variable whitelist</description>
+    </property>
+
+</configuration>
