
Commit a6e444c

feature: image based on emr 6.1.0 (#34)
* add support for emr 6.1.0
* add more comments and avoid unnecessary calls to s3
1 parent e3dc967 commit a6e444c

19 files changed: +399 −31 lines changed


Makefile

Lines changed: 9 additions & 8 deletions
@@ -7,10 +7,10 @@ SHELL := /bin/sh
 
 # Set variables if testing locally
 ifeq ($(IS_RELEASE_BUILD),)
-SPARK_VERSION := 2.4
+SPARK_VERSION := 3.0
 PROCESSOR := cpu
 FRAMEWORK_VERSION := py37
-SM_VERSION := 0.1
+SM_VERSION := 1.0
 USE_CASE := processing
 BUILD_CONTEXT := ./spark/${USE_CASE}/${SPARK_VERSION}/py3
 AWS_PARTITION := aws
@@ -84,18 +84,19 @@ test-sagemaker: install-sdk build-tests
 	# History server tests can't run in parallel since they use the same container name.
 	pytest -s -vv test/integration/history \
 		--repo=$(DEST_REPO) --tag=$(VERSION) --durations=0 \
-		--spark-version=$(SPARK_VERSION)
-		--framework_version=$(FRAMEWORK_VERSION) \
+		--spark-version=$(SPARK_VERSION) \
+		--framework-version=$(FRAMEWORK_VERSION) \
 		--role $(ROLE) \
 		--image_uri $(IMAGE_URI) \
 		--region ${REGION} \
 		--domain ${AWS_DOMAIN}
 	# OBJC_DISABLE_INITIALIZE_FORK_SAFETY: https://github.com/ansible/ansible/issues/32499#issuecomment-341578864
 	OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES pytest --workers auto -s -vv test/integration/sagemaker \
 		--repo=$(DEST_REPO) --tag=$(VERSION) --durations=0 \
-		--spark-version=$(SPARK_VERSION)
-		--framework_version=$(FRAMEWORK_VERSION) \
+		--spark-version=$(SPARK_VERSION) \
+		--framework-version=$(FRAMEWORK_VERSION) \
 		--role $(ROLE) \
+		--account-id ${INTEG_TEST_ACCOUNT} \
 		--image_uri $(IMAGE_URI) \
 		--region ${REGION} \
 		--domain ${AWS_DOMAIN}
@@ -104,8 +105,8 @@ test-sagemaker: install-sdk build-tests
 test-prod:
 	pytest -s -vv test/integration/tag \
 		--repo=$(DEST_REPO) --tag=$(VERSION) --durations=0 \
-		--spark-version=$(SPARK_VERSION)
-		--framework_version=$(FRAMEWORK_VERSION) \
+		--spark-version=$(SPARK_VERSION) \
+		--framework-version=$(FRAMEWORK_VERSION) \
 		--role $(ROLE) \
 		--image_uri $(IMAGE_URI) \
 		--region ${REGION} \
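The Makefile fix here is easy to miss: each old `--spark-version=$(SPARK_VERSION)` line lacked a trailing backslash, so the shell ended the pytest command there and tried to run the remaining flags as a separate command; the change also renames `--framework_version` to the hyphenated `--framework-version` expected by the tests. A minimal sketch of that shell behavior (illustrative only):

    # With a trailing backslash the next line continues the command:
    printf '%s %s\n' one \
    two        # prints "one two"

    # Without it, the next line runs as its own command and fails:
    printf '%s %s\n' three
    four       # sh: four: command not found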

new_images.yml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 ---
 new_images:
-  - spark: "2.4.4"
+  - spark: "3.0.0"
     use-case: "processing"
     processors: ["cpu"]
     python: ["py37"]

setup.py

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@
     ],
     setup_requires=["setuptools", "wheel"],
     # Be frugal when adding dependencies. Prefer Python's standard library.
-    install_requires = install_reqs,
+    install_requires=install_reqs,
 
     extras_require={
         "test": test_install_reqs,
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+echo "Not implemented"
Lines changed: 100 additions & 0 deletions
@@ -0,0 +1,100 @@
+FROM amazonlinux:2
+ARG REGION
+ENV AWS_REGION ${REGION}
+RUN yum clean all
+RUN yum update -y
+RUN yum install -y awscli bigtop-utils curl gcc gzip unzip python3 python3-setuptools python3-pip python-devel python3-devel python-psutil gunzip tar wget liblapack* libblas* libopencv* libopenblas*
+
+# Install nginx. amazonlinux:2.0.20200304.0 does not ship nginx, so install epel-release first.
+RUN wget https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
+RUN yum install -y epel-release-latest-7.noarch.rpm
+RUN yum install -y nginx
+
+RUN rm -rf /var/cache/yum
+
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+# http://blog.stuart.axelbrooke.com/python-3-on-spark-return-of-the-pythonhashseed
+ENV PYTHONHASHSEED 0
+ENV PYTHONIOENCODING UTF-8
+ENV PIP_DISABLE_PIP_VERSION_CHECK 1
+
+# Install EMR Spark/Hadoop
+ENV HADOOP_HOME /usr/lib/hadoop
+ENV HADOOP_CONF_DIR /usr/lib/hadoop/etc/hadoop
+ENV SPARK_HOME /usr/lib/spark
+
+COPY yum/emr-apps.repo /etc/yum.repos.d/emr-apps.repo
+
+# Install Hadoop/Spark dependencies from EMR's yum repository for Spark optimizations.
+# Replace the placeholder with the region in the repository URL.
+RUN sed -i "s/REGION/${AWS_REGION}/g" /etc/yum.repos.d/emr-apps.repo
+
+# These packages are a subset of what EMR installs in a cluster with the
+# "hadoop", "spark", and "hive" applications.
+# They include EMR-optimized libraries and extras.
+RUN yum install -y aws-hm-client \
+    aws-java-sdk \
+    aws-sagemaker-spark-sdk \
+    emr-goodies \
+    emr-scripts \
+    emr-s3-select \
+    emrfs \
+    hadoop \
+    hadoop-client \
+    hadoop-hdfs \
+    hadoop-hdfs-datanode \
+    hadoop-hdfs-namenode \
+    hadoop-httpfs \
+    hadoop-kms \
+    hadoop-lzo \
+    hadoop-yarn \
+    hadoop-yarn-nodemanager \
+    hadoop-yarn-proxyserver \
+    hadoop-yarn-resourcemanager \
+    hadoop-yarn-timelineserver \
+    hive \
+    hive-hcatalog \
+    hive-hcatalog-server \
+    hive-jdbc \
+    hive-server2 \
+    python37-numpy \
+    python37-sagemaker_pyspark \
+    s3-dist-cp \
+    spark-core \
+    spark-datanucleus \
+    spark-external \
+    spark-history-server \
+    spark-python
+
+
+# Point Spark at the proper python binary
+ENV PYSPARK_PYTHON=/usr/bin/python3
+
+# Set up Spark/YARN/HDFS to run as root
+ENV PATH="/usr/bin:/opt/program:${PATH}"
+ENV YARN_RESOURCEMANAGER_USER="root"
+ENV YARN_NODEMANAGER_USER="root"
+ENV HDFS_NAMENODE_USER="root"
+ENV HDFS_DATANODE_USER="root"
+ENV HDFS_SECONDARYNAMENODE_USER="root"
+
+# Set up the bootstrapping program and Spark configuration
+COPY *.whl /opt/program/
+RUN /usr/bin/python3 -m pip install /opt/program/*.whl
+COPY hadoop-config /opt/hadoop-config
+COPY nginx-config /opt/nginx-config
+COPY aws-config /opt/aws-config
+
+# Set up the container bootstrapper
+COPY container-bootstrap-config /opt/container-bootstrap-config
+RUN chmod +x /opt/container-bootstrap-config/bootstrap.sh
+RUN /opt/container-bootstrap-config/bootstrap.sh
+
+# With this config, the Spark history server does not daemonize; otherwise no
+# foreground process would remain and the container would terminate immediately.
+ENV SPARK_NO_DAEMONIZE TRUE
+
+WORKDIR $SPARK_HOME
+
+ENTRYPOINT ["smspark-submit"]
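The REGION build argument feeds the `sed` call above, which pins the EMR package repository to a concrete region before `yum install` runs. A hedged local-build sketch; the tag and region below are assumptions, not values from this commit:

    docker build \
        --build-arg REGION=us-west-2 \
        -t sagemaker-spark:3.0-cpu-py37 \
        .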
Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!-- Put site-specific property overrides in this file. -->
+
+<configuration>
+    <property>
+        <name>fs.defaultFS</name>
+        <value>hdfs://nn_uri/</value>
+        <description>NameNode URI</description>
+    </property>
+    <property>
+        <name>fs.s3a.aws.credentials.provider</name>
+        <value>com.amazonaws.auth.DefaultAWSCredentialsProviderChain</value>
+        <description>AWS S3 credential provider</description>
+    </property>
+    <property>
+        <name>fs.s3.impl</name>
+        <value>org.apache.hadoop.fs.s3a.S3AFileSystem</value>
+        <description>s3a filesystem implementation</description>
+    </property>
+    <property>
+        <name>fs.AbstractFileSystem.s3a.impl</name>
+        <value>org.apache.hadoop.fs.s3a.S3A</value>
+        <description>s3a AbstractFileSystem implementation</description>
+    </property>
+</configuration>
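The net effect of these properties is that both `s3://` and `s3a://` URIs go through the S3A connector using the default AWS credential provider chain. A quick sanity check from inside the container (the bucket name is a placeholder):

    hadoop fs -ls s3://my-example-bucket/input/    # routed to S3AFileSystem via fs.s3.impl
    hadoop fs -ls s3a://my-example-bucket/input/   # native s3a scheme, same connector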
Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!-- Put site-specific property overrides in this file. -->
+
+<configuration>
+    <property>
+        <name>dfs.datanode.data.dir</name>
+        <value>file:///opt/amazon/hadoop/hdfs/datanode</value>
+        <description>Comma-separated list of paths on the local filesystem of a DataNode where it should store its blocks.</description>
+    </property>
+
+    <property>
+        <name>dfs.namenode.name.dir</name>
+        <value>file:///opt/amazon/hadoop/hdfs/namenode</value>
+        <description>Path on the local filesystem where the NameNode stores the namespace and transaction logs persistently.</description>
+    </property>
+</configuration>
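Before the HDFS daemons can use these paths, the directories must exist and the NameNode metadata directory must be formatted. A sketch of that one-time setup, assuming the container bootstrap does something equivalent (the exact steps are not shown in this diff):

    mkdir -p /opt/amazon/hadoop/hdfs/datanode /opt/amazon/hadoop/hdfs/namenode
    hdfs namenode -format -force    # initializes dfs.namenode.name.dir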
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+spark.driver.extraClassPath /usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar
+spark.driver.extraLibraryPath /usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native
+spark.executor.extraClassPath /usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar
+spark.executor.extraLibraryPath /usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native
+spark.driver.host=sd_host
+spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version=2
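`sd_host` is a placeholder, like `nn_uri` above and the `rm_hostname`/`nm_hostname` values in the YARN config below; presumably the container bootstrap rewrites these at startup, in the same spirit as the REGION `sed` in the Dockerfile. A hypothetical substitution (the conf path follows from `SPARK_HOME=/usr/lib/spark`; the actual mechanism is not part of this diff):

    sed -i "s/sd_host/$(hostname)/g" /usr/lib/spark/conf/spark-defaults.conf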
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+# EMPTY FILE TO AVOID OVERRIDING ENV VARS
+# Specifically, without copying the empty file, SPARK_HISTORY_OPTS will be overridden,
+# spark.history.ui.port defaults to 18082, and spark.eventLog.dir defaults to the local fs
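The empty file matters because a populated `spark-env.sh` would clobber `SPARK_HISTORY_OPTS` set in the container's environment. For reference, history-server settings are typically passed through that variable like this (the port and log directory below are illustrative, not from this commit):

    export SPARK_HISTORY_OPTS="-Dspark.history.ui.port=18080 -Dspark.history.fs.logDirectory=s3a://my-bucket/spark-events"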
Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
+<?xml version="1.0"?>
+<!-- Site-specific YARN configuration properties -->
+<configuration>
+    <property>
+        <name>yarn.resourcemanager.hostname</name>
+        <value>rm_hostname</value>
+        <description>The hostname of the RM.</description>
+    </property>
+    <property>
+        <name>yarn.nodemanager.hostname</name>
+        <value>nm_hostname</value>
+        <description>The hostname of the NM.</description>
+    </property>
+    <property>
+        <name>yarn.nodemanager.webapp.address</name>
+        <value>nm_webapp_address</value>
+    </property>
+    <property>
+        <name>yarn.nodemanager.vmem-pmem-ratio</name>
+        <value>5</value>
+        <description>Ratio of virtual memory to physical memory.</description>
+    </property>
+    <property>
+        <name>yarn.resourcemanager.am.max-attempts</name>
+        <value>1</value>
+        <description>The maximum number of application attempts.</description>
+    </property>
+    <property>
+        <name>yarn.nodemanager.env-whitelist</name>
+        <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,YARN_HOME,AWS_CONTAINER_CREDENTIALS_RELATIVE_URI</value>
+        <description>Environment variable whitelist</description>
+    </property>
+
+</configuration>
