Skip to content

Commit df98f50

Browse files
authored
align with emr 6.2.1 and add entrypoint to install rpm (#91)
1 parent fffb514 commit df98f50

File tree

3 files changed

+144
-4
lines changed

3 files changed

+144
-4
lines changed

new_images.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
---
22
new_images:
3-
- spark: "3.1.1"
3+
- spark: "3.0.1"
44
use-case: "processing"
55
processors: ["cpu"]
66
python: ["py39"]
7-
sm_version: "1.2"
7+
sm_version: "1.0"
Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
# Base image: Amazon Linux 2, pulled from an ECR registry in us-west-2.
FROM 137112412989.dkr.ecr.us-west-2.amazonaws.com/amazonlinux:2

# REGION is supplied at build time (--build-arg REGION=<aws-region>) and is
# persisted as AWS_REGION so later build steps and the running container see it.
ARG REGION
ENV AWS_REGION=${REGION}
4+
5+
# Refresh yum metadata, apply pending OS updates, and install the base tooling
# plus the BLAS/LAPACK/OpenCV libraries. The globs are quoted so that yum, not
# the shell, expands them. The yum cache is removed in this same layer because
# deleting it in a later RUN would not reduce the image size.
RUN yum clean all \
 && yum update -y \
 && yum install -y \
        awscli \
        bigtop-utils \
        curl \
        gcc \
        gunzip \
        gzip \
        tar \
        unzip \
        wget \
        zip \
        'libblas*' \
        'liblapack*' \
        'libopenblas*' \
        'libopencv*' \
 && yum clean all \
 && rm -rf /var/cache/yum
8+
9+
# Build CPython from source and install it alongside the system python.
#   PYTHON_BASE_VERSION       major.minor series to build (default 3.9)
#   PYTHON_WITH_BASE_VERSION  versioned interpreter name produced by altinstall
#   PIP_WITH_BASE_VERSION     versioned pip name produced by altinstall
#   PYTHON_VERSION            full release that is downloaded from python.org
ARG PYTHON_BASE_VERSION=3.9
ARG PYTHON_WITH_BASE_VERSION=python${PYTHON_BASE_VERSION}
ARG PIP_WITH_BASE_VERSION=pip${PYTHON_BASE_VERSION}
ARG PYTHON_VERSION=${PYTHON_BASE_VERSION}.12

# - The build runs in a subshell so the working directory does not leak into the
#   following commands, and the source directory is named explicitly instead of
#   relying on a glob.
# - "make altinstall" installs only the versioned binaries; python3/pip3 are
#   added afterwards as symlinks and as shell aliases derived from the ARGs above.
# - "command -v" is a shell builtin, so no external "which" binary is required.
# - Source tree, tarball and yum cache are removed in the same layer.
RUN yum -y groupinstall 'Development Tools' \
 && yum -y install bzip2-devel libffi-devel openssl-devel sqlite-devel xz-devel \
 && wget "https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz" \
 && tar xzf "Python-${PYTHON_VERSION}.tgz" \
 && ( cd "Python-${PYTHON_VERSION}" \
      && ./configure --enable-optimizations \
      && make altinstall ) \
 && printf 'alias python3=%s\nalias pip3=%s\n' \
        "${PYTHON_WITH_BASE_VERSION}" "${PIP_WITH_BASE_VERSION}" >> ~/.bashrc \
 && ln -s "$(command -v "${PYTHON_WITH_BASE_VERSION}")" /usr/local/bin/python3 \
 && ln -s "$(command -v "${PIP_WITH_BASE_VERSION}")" /usr/local/bin/pip3 \
 && rm -rf "Python-${PYTHON_VERSION}.tgz" "Python-${PYTHON_VERSION}" \
 && yum clean all \
 && rm -rf /var/cache/yum
27+
28+
# Install nginx. The Amazon Linux 2 base image (noted against
# amazonlinux:2.0.20200304.0) does not provide nginx, so the EPEL 7 release
# package is installed first to enable a repository that does.
# The release rpm is installed straight from its URL so no downloaded file is
# left in the image, and the yum cache is removed in the same layer.
RUN yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm \
 && yum install -y nginx \
 && yum clean all \
 && rm -rf /var/cache/yum
34+
35+
# Python / pip runtime settings, all in key=value form:
#   PYTHONDONTWRITEBYTECODE        do not write .pyc files
#   PYTHONUNBUFFERED               unbuffered stdout/stderr
#   PYTHONHASHSEED                 fixed hash seed, see
#       http://blog.stuart.axelbrooke.com/python-3-on-spark-return-of-the-pythonhashseed
#   PYTHONIOENCODING               UTF-8 for the standard streams
#   PIP_DISABLE_PIP_VERSION_CHECK  suppress pip's self-update check
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PYTHONHASHSEED=0 \
    PYTHONIOENCODING=UTF-8 \
    PIP_DISABLE_PIP_VERSION_CHECK=1
41+
42+
# Install EMR Spark/Hadoop
43+
# Locations of the Hadoop and Spark installations and the Hadoop config directory.
ENV HADOOP_HOME=/usr/lib/hadoop \
    HADOOP_CONF_DIR=/usr/lib/hadoop/etc/hadoop \
    SPARK_HOME=/usr/lib/spark
46+
47+
# Register EMR's yum repository so the Hadoop/Spark packages with EMR's
# optimizations can be installed from it.
COPY yum/emr-apps.repo /etc/yum.repos.d/emr-apps.repo

# Replace the REGION placeholder in the repository URLs with the build region.
# Fail early with a clear message when no region was passed: an empty value
# would otherwise rewrite the URLs into an invalid host name.
RUN if [ -z "${AWS_REGION}" ]; then \
        echo "REGION build argument is required (e.g. --build-arg REGION=us-west-2)" >&2; \
        exit 1; \
    fi \
 && sed -i "s/REGION/${AWS_REGION}/g" /etc/yum.repos.d/emr-apps.repo

# Create the hadoop user; -N skips creating a same-named group.
RUN adduser -N hadoop
53+
54+
# These packages are a subset of what EMR installs in a cluster with the
55+
# "hadoop", "spark", and "hive" applications.
56+
# They include EMR-optimized libraries and extras.
57+
# Install the EMR-provided Hadoop, Hive and Spark packages (one per line,
# alphabetical). The yum cache is removed in the same layer so the downloaded
# package data does not stay in the image.
RUN yum install -y \
        aws-java-sdk \
        aws-sagemaker-spark-sdk \
        emr-goodies \
        emr-ruby \
        emr-scripts \
        emr-s3-select \
        emrfs \
        hadoop \
        hadoop-client \
        hadoop-hdfs \
        hadoop-hdfs-datanode \
        hadoop-hdfs-namenode \
        hadoop-httpfs \
        hadoop-kms \
        hadoop-lzo \
        hadoop-yarn \
        hadoop-yarn-nodemanager \
        hadoop-yarn-proxyserver \
        hadoop-yarn-resourcemanager \
        hadoop-yarn-timelineserver \
        hive \
        hive-hcatalog \
        hive-hcatalog-server \
        hive-jdbc \
        hive-server2 \
        s3-dist-cp \
        spark-core \
        spark-datanucleus \
        spark-external \
        spark-history-server \
        spark-python \
 && yum clean all \
 && rm -rf /var/cache/yum
88+
89+
# extra rpm patches
90+
# Copy any rpm files from the build context and install them on top of the
# packages already present. The yum cache is removed in the same layer.
COPY *.rpm /opt/rpm_override/
RUN yum install -y /opt/rpm_override/*.rpm \
 && yum clean all \
 && rm -rf /var/cache/yum
92+
93+
# Point Spark at proper python binary
94+
ENV PYSPARK_PYTHON=/usr/local/bin/python3.9

# Put /usr/bin and /opt/program ahead of the inherited PATH.
ENV PATH="/usr/bin:/opt/program:${PATH}"

# Run the YARN and HDFS daemons as root.
ENV YARN_RESOURCEMANAGER_USER="root" \
    YARN_NODEMANAGER_USER="root" \
    HDFS_NAMENODE_USER="root" \
    HDFS_DATANODE_USER="root" \
    HDFS_SECONDARYNAMENODE_USER="root"
103+
104+
# Delete the JndiLookup class from Hive's bundled log4j-core 2.10.0 jar
# (zip -d removes the entry in place, -q suppresses output).
# NOTE(review): this matches the published mitigation for CVE-2021-44228
# ("Log4Shell") — confirm that is the intent and revisit when the jar version changes.
RUN zip -q -d /lib/hive/lib/log4j-core-2.10.0.jar org/apache/logging/log4j/core/lookup/JndiLookup.class
105+
106+
# Set up bootstrapping program and Spark configuration
107+
COPY hadoop-config /opt/hadoop-config
COPY nginx-config /opt/nginx-config
COPY aws-config /opt/aws-config
COPY Pipfile Pipfile.lock setup.py *.whl /opt/program/
ENV PIPENV_PIPFILE=/opt/program/Pipfile

# Install the Python dependencies and the bundled wheel(s).
# - "pipenv install --system" installs into the system Python rather than a
#   virtualenv; a container does not need a virtualenv for isolation.
# - pipenv is pinned because releases newer than 2022.4.8 fail to build smspark.
# - pip's download cache is disabled and the remaining pip/pipenv caches are
#   removed in the same layer so they do not end up in the image.
RUN /usr/local/bin/python3.9 -m pip install --no-cache-dir pipenv==2022.4.8 \
 && pipenv install --system \
 && /usr/local/bin/python3.9 -m pip install --no-cache-dir /opt/program/*.whl \
 && rm -rf "${HOME}/.cache/pip" "${HOME}/.cache/pipenv"
118+
119+
# Setup container bootstrapper
120+
# Copy the bootstrap assets into the image, make the script executable, and
# run it once at build time.
COPY container-bootstrap-config /opt/container-bootstrap-config
RUN bootstrap=/opt/container-bootstrap-config/bootstrap.sh \
 && chmod +x "${bootstrap}" \
 && "${bootstrap}"
123+
124+
# With this config, spark history server will not run as daemon, otherwise there
125+
# will be no server running and container will terminate immediately
126+
# Keep Spark services in the foreground instead of daemonizing them.
ENV SPARK_NO_DAEMONIZE=TRUE

# Start in the Spark installation directory.
WORKDIR $SPARK_HOME

# Exec-form entrypoint: arguments passed to "docker run" are appended to
# smspark-submit. NOTE(review): presumably installed by the smspark wheel — confirm.
ENTRYPOINT ["smspark-submit"]
Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,17 @@
11
[emr-apps]
22
name = EMR Application Repository
3-
gpgkey = https://s3-REGION.amazonaws.com/repo.REGION.emr.amazonaws.com/apps-repository/emr-6.1.0/72a9ec2e-9bf6-4d7d-9244-86a0ab1e50d6/repoPublicKey.txt
3+
gpgkey = https://s3-REGION.amazonaws.com/repo.REGION.emr.amazonaws.com/apps-repository/emr-6.2.1/e183bea5-a1eb-42d8-97a6-7c7a559af64e/repoPublicKey.txt
44
enabled = 1
5-
baseurl = https://s3-REGION.amazonaws.com/repo.REGION.emr.amazonaws.com/apps-repository/emr-6.1.0/72a9ec2e-9bf6-4d7d-9244-86a0ab1e50d6
5+
baseurl = https://s3-REGION.amazonaws.com/repo.REGION.emr.amazonaws.com/apps-repository/emr-6.2.1/e183bea5-a1eb-42d8-97a6-7c7a559af64e
66
priority = 5
77
gpgcheck = 0
8+
9+
[emr-puppet]
10+
mirrorlist: http://amazonlinux.$awsregion.$awsdomain/$releasever/extras/emr-puppet/latest/$basearch/mirror.list
11+
enabled: 1
12+
gpgcheck: 1
13+
name: Amazon Extras repo for emr-puppet
14+
gpgkey: file:///etc/pki/rpm-gpg/RPM-GPG-KEY-amazon-linux-2
15+
priority: 10
16+
skip_if_unavailable: 1
17+
report_instanceid: yes

0 commit comments

Comments
 (0)