
Commit ab69baf

feat: separate Dockerfile for Hadoop (#1186)
* feat: separate Dockerfile for Hadoop
* chore: changelog / lint fix
* feat: remove unnecessary Hadoop components in distributed JARs
* chore: rename HADOOP_HADOOP to HADOOP_VERSION
* chore: rename HBASE_HBASE to HBASE_VERSION
1 parent f8cf694 commit ab69baf

File tree: 46 files changed (+273, -222 lines)


CHANGELOG.md

Lines changed: 2 additions & 0 deletions
@@ -92,6 +92,7 @@ All notable changes to this project will be documented in this file.
 - opa: Enable custom versions ([#1170]).
 - use custom product versions for Hadoop, HBase, Phoenix, hbase-operator-tools, Druid, Hive and Spark ([#1173]).
 - hbase: Bump dependencies to the latest patch level for HBase `2.6.1` and `2.6.2` ([#1185]).
+- hadoop: Separate Dockerfiles for Hadoop build and HDFS image ([#1186]).
 - ubi-rust-builder: Bump Rust toolchain to 1.87.0, cargo-auditable to 0.7.0 and protoc to 31.1 ([#1197]).
 - stackable-base, stackable-devel, ubi-rust-builder: Update `ubi-minimal` base image ([#1197]).
 - testing-tools: Update `python` 3.12-slim-bullseye base image ([#1197]).
@@ -216,6 +217,7 @@ All notable changes to this project will be documented in this file.
 [#1180]: https://github.com/stackabletech/docker-images/pull/1180
 [#1184]: https://github.com/stackabletech/docker-images/pull/1184
 [#1185]: https://github.com/stackabletech/docker-images/pull/1185
+[#1186]: https://github.com/stackabletech/docker-images/pull/1186
 [#1188]: https://github.com/stackabletech/docker-images/pull/1188
 [#1189]: https://github.com/stackabletech/docker-images/pull/1189
 [#1197]: https://github.com/stackabletech/docker-images/pull/1197

conf.py

Lines changed: 2 additions & 0 deletions
@@ -13,6 +13,7 @@
 airflow = importlib.import_module("airflow.versions")
 druid = importlib.import_module("druid.versions")
 hadoop = importlib.import_module("hadoop.versions")
+hadoop_jars = importlib.import_module("hadoop.hadoop.versions")
 hbase = importlib.import_module("hbase.versions")
 hbase_jars = importlib.import_module("hbase.hbase.versions")
 hbase_phoenix = importlib.import_module("hbase.phoenix.versions")
@@ -48,6 +49,7 @@
 {"name": "airflow", "versions": airflow.versions},
 {"name": "druid", "versions": druid.versions},
 {"name": "hadoop", "versions": hadoop.versions},
+{"name": "hadoop/hadoop", "versions": hadoop_jars.versions},
 {"name": "hbase", "versions": hbase.versions},
 {"name": "hbase/hbase", "versions": hbase_jars.versions},
 {"name": "hbase/phoenix", "versions": hbase_phoenix.versions},

druid/Dockerfile

Lines changed: 5 additions & 3 deletions
@@ -1,7 +1,7 @@
 # syntax=docker/dockerfile:1.16.0@sha256:e2dd261f92e4b763d789984f6eab84be66ab4f5f08052316d8eb8f173593acf7
 # check=error=true

-FROM stackable/image/hadoop AS hadoop-builder
+FROM stackable/image/hadoop/hadoop AS hadoop-builder

 FROM stackable/image/java-devel AS druid-builder

@@ -12,7 +12,9 @@ ARG STAX2_API
 ARG WOODSTOX_CORE
 ARG AUTHORIZER
 ARG STACKABLE_USER_UID
-ARG HADOOP
+ARG HADOOP_HADOOP
+# Reassign the arg to `HADOOP_VERSION` for better readability.
+ENV HADOOP_VERSION=${HADOOP_HADOOP}

 # Setting this to anything other than "true" will keep the cache folders around (e.g. for Maven, NPM etc.)
 # This can be used to speed up builds when disk space is of no concern.
@@ -75,7 +77,7 @@ mvn \
 --no-transfer-progress \
 clean install \
 -Pdist,stackable-bundle-contrib-exts \
--Dhadoop.compile.version=${HADOOP}-stackable${RELEASE} \
+-Dhadoop.compile.version=${HADOOP_VERSION}-stackable${RELEASE} \
 -DskipTests `# Skip test execution` \
 -Dcheckstyle.skip `# Skip checkstyle checks. We dont care if the code is properly formatted, it just wastes time` \
 -Dmaven.javadoc.skip=true `# Dont generate javadoc` \

druid/versions.py

Lines changed: 3 additions & 3 deletions
@@ -4,23 +4,23 @@
 # https://druid.apache.org/docs/30.0.1/operations/java/
 "java-base": "17",
 "java-devel": "17",
-"hadoop": "3.3.6",
+"hadoop/hadoop": "3.3.6",
 "authorizer": "0.7.0",
 },
 {
 "product": "31.0.1",
 # https://druid.apache.org/docs/31.0.1/operations/java/
 "java-base": "17",
 "java-devel": "17",
-"hadoop": "3.3.6",
+"hadoop/hadoop": "3.3.6",
 "authorizer": "0.7.0",
 },
 {
 "product": "33.0.0",
 # https://druid.apache.org/docs/33.0.0/operations/java/
 "java-base": "17",
 "java-devel": "17",
-"hadoop": "3.3.6",
+"hadoop/hadoop": "3.3.6",
 "authorizer": "0.7.0",
 },
 ]
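The dependency key changes from `hadoop` to `hadoop/hadoop` so that `bake` resolves the new nested builder image and passes its version to the Dockerfile. Judging from the renames visible in this commit (`hadoop` previously arrived as `ARG HADOOP`, `hadoop/hadoop` now arrives as `ARG HADOOP_HADOOP`), the build-arg name appears to be derived by upper-casing the key and turning separators into underscores. A small sketch of that assumed mapping, not the actual bake code:

```python
def build_arg_name(dependency_key: str) -> str:
    """Illustrative only -- not the actual bake implementation.
    Derive the Dockerfile ARG name from a versions.py dependency key."""
    return dependency_key.replace("/", "_").replace("-", "_").upper()

# Matches the renames visible in this commit:
assert build_arg_name("hadoop") == "HADOOP"
assert build_arg_name("hadoop/hadoop") == "HADOOP_HADOOP"
```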

hadoop/Dockerfile

Lines changed: 38 additions & 155 deletions
@@ -1,145 +1,19 @@
 # syntax=docker/dockerfile:1.16.0@sha256:e2dd261f92e4b763d789984f6eab84be66ab4f5f08052316d8eb8f173593acf7
 # check=error=true

-FROM stackable/image/java-devel AS hadoop-builder
-
-ARG PRODUCT
-ARG RELEASE
-ARG ASYNC_PROFILER
-ARG JMX_EXPORTER
-ARG PROTOBUF
-ARG TARGETARCH
-ARG TARGETOS
-ARG STACKABLE_USER_UID
-
-WORKDIR /stackable
-
-COPY --chown=${STACKABLE_USER_UID}:0 shared/protobuf/stackable/patches/patchable.toml /stackable/src/shared/protobuf/stackable/patches/patchable.toml
-COPY --chown=${STACKABLE_USER_UID}:0 shared/protobuf/stackable/patches/${PROTOBUF} /stackable/src/shared/protobuf/stackable/patches/${PROTOBUF}
-
-RUN <<EOF
-rpm --install --replacepkgs https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm
-microdnf update
-# boost is a build dependency starting in Hadoop 3.4.0 if compiling native code
-# automake and libtool are required to build protobuf
-microdnf install boost1.78-devel automake libtool
-microdnf clean all
-rm -rf /var/cache/yum
-mkdir /opt/protobuf
-chown ${STACKABLE_USER_UID}:0 /opt/protobuf
-EOF
-
-USER ${STACKABLE_USER_UID}
-# This Protobuf version is the exact version as used in the Hadoop Dockerfile
-# See https://github.com/apache/hadoop/blob/trunk/dev-support/docker/pkg-resolver/install-protobuf.sh
-# (this was hardcoded in the Dockerfile in earlier versions of Hadoop, make sure to look at the exact version in Github)
-RUN <<EOF
-cd "$(/stackable/patchable --images-repo-root=src checkout shared/protobuf ${PROTOBUF})"
-
-# Create snapshot of the source code including custom patches
-tar -czf /stackable/protobuf-${PROTOBUF}-src.tar.gz .
-
-./autogen.sh
-./configure --prefix=/opt/protobuf
-make "-j$(nproc)"
-make install
-(cd .. && rm -r ${PROTOBUF})
-EOF
-
-ENV PROTOBUF_HOME=/opt/protobuf
-ENV PATH="${PATH}:/opt/protobuf/bin"
-
-RUN <<EOF
-# async-profiler
-ARCH="${TARGETARCH/amd64/x64}"
-curl "https://repo.stackable.tech/repository/packages/async-profiler/async-profiler-${ASYNC_PROFILER}-${TARGETOS}-${ARCH}.tar.gz" | tar -xzC .
-ln -s "/stackable/async-profiler-${ASYNC_PROFILER}-${TARGETOS}-${ARCH}" /stackable/async-profiler
-
-# JMX Exporter
-mkdir /stackable/jmx
-curl "https://repo.stackable.tech/repository/packages/jmx-exporter/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" -o "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar"
-chmod -x "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar"
-ln -s "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" /stackable/jmx/jmx_prometheus_javaagent.jar
-EOF
-
-WORKDIR /build
-COPY --chown=${STACKABLE_USER_UID}:0 hadoop/stackable/patches/patchable.toml /build/src/hadoop/stackable/patches/patchable.toml
-COPY --chown=${STACKABLE_USER_UID}:0 hadoop/stackable/patches/${PRODUCT} /build/src/hadoop/stackable/patches/${PRODUCT}
-COPY --chown=${STACKABLE_USER_UID}:0 hadoop/stackable/fuse_dfs_wrapper /build
-COPY --chown=${STACKABLE_USER_UID}:0 hadoop/stackable/jmx /stackable/jmx
-USER ${STACKABLE_USER_UID}
-# Hadoop Pipes requires libtirpc to build, whose headers are not packaged in RedHat UBI, so skip building this module
-# Build from source to enable FUSE module, and to apply custom patches.
-# Also skip building the yarn, mapreduce and minicluster modules: this will result in the modules being excluded but not all
-# jar files will be stripped if they are needed elsewhere e.g. share/hadoop/yarn will not be part of the build, but yarn jars
-# will still exist in share/hadoop/tools as they would be needed by the resource estimator tool. Such jars are removed in a later step.
-RUN <<EOF
-cd "$(/stackable/patchable --images-repo-root=src checkout hadoop ${PRODUCT})"
-
-ORIGINAL_VERSION=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout)
-NEW_VERSION=${PRODUCT}-stackable${RELEASE}
-
-mvn versions:set -DnewVersion=${NEW_VERSION}
-
-# Since we skip building the hadoop-pipes module, we need to set the version to the original version so it can be pulled from Maven Central
-sed -e '/<artifactId>hadoop-pipes<\/artifactId>/,/<\/dependency>/ { s/<version>.*<\/version>/<version>'"$ORIGINAL_VERSION"'<\/version>/ }' -i hadoop-tools/hadoop-tools-dist/pom.xml
-
-# Create snapshot of the source code including custom patches
-tar -czf /stackable/hadoop-${NEW_VERSION}-src.tar.gz .
-
-mvn \
---batch-mode \
---no-transfer-progress \
-clean package install \
--Pdist,native \
--pl '!hadoop-tools/hadoop-pipes' \
--Dhadoop.version=${NEW_VERSION} \
--Drequire.fuse=true \
--DskipTests \
--Dmaven.javadoc.skip=true
-
-mkdir -p /stackable/patched-libs/maven/org/apache
-cp -r /stackable/.m2/repository/org/apache/hadoop /stackable/patched-libs/maven/org/apache
-
-cp -r hadoop-dist/target/hadoop-${NEW_VERSION} /stackable/hadoop-${NEW_VERSION}
-sed -i "s/${NEW_VERSION}/${ORIGINAL_VERSION}/g" hadoop-dist/target/bom.json
-mv hadoop-dist/target/bom.json /stackable/hadoop-${NEW_VERSION}/hadoop-${NEW_VERSION}.cdx.json
-
-# HDFS fuse-dfs is not part of the regular dist output, so we need to copy it in ourselves
-cp hadoop-hdfs-project/hadoop-hdfs-native-client/target/main/native/fuse-dfs/fuse_dfs /stackable/hadoop-${NEW_VERSION}/bin
-
-# Remove source code
-(cd .. && rm -r ${PRODUCT})
-
-ln -s /stackable/hadoop-${NEW_VERSION} /stackable/hadoop
-
-mv /build/fuse_dfs_wrapper /stackable/hadoop/bin
-
-# Remove unneeded binaries:
-# - code sources
-# - mapreduce/yarn binaries that were built as cross-project dependencies
-# - minicluster (only used for testing) and test .jars
-# - json-io: this is a transitive dependency pulled in by cedarsoft/java-utils/json-io and is excluded in 3.4.0. See CVE-2023-34610.
-rm -rf /stackable/hadoop/share/hadoop/common/sources/
-rm -rf /stackable/hadoop/share/hadoop/hdfs/sources/
-rm -rf /stackable/hadoop/share/hadoop/tools/sources/
-rm -rf /stackable/hadoop/share/hadoop/tools/lib/json-io-*.jar
-rm -rf /stackable/hadoop/share/hadoop/tools/lib/hadoop-mapreduce-client-*.jar
-rm -rf /stackable/hadoop/share/hadoop/tools/lib/hadoop-yarn-server*.jar
-find /stackable/hadoop -name 'hadoop-minicluster-*.jar' -type f -delete
-find /stackable/hadoop -name 'hadoop-client-minicluster-*.jar' -type f -delete
-find /stackable/hadoop -name 'hadoop-*tests.jar' -type f -delete
-rm -rf /stackable/.m2
-
-# Set correct groups; make sure only required artifacts for the final image are located in /stackable
-chmod -R g=u /stackable
-EOF
+FROM stackable/image/hadoop/hadoop AS hadoop-builder

 FROM stackable/image/java-devel AS hdfs-utils-builder

 ARG HDFS_UTILS
 ARG PRODUCT
+ARG RELEASE
 ARG STACKABLE_USER_UID
+ARG HADOOP_HADOOP
+# Reassign the arg to `HADOOP_VERSION` for better readability.
+# It is passed as `HADOOP_HADOOP`, because versions.py has to contain `hadoop/hadoop` to establish a dependency on the Hadoop builder.
+# The value of `hadoop/hadoop` is transformed by `bake` and automatically passed as `HADOOP_HADOOP` arg.
+ENV HADOOP_VERSION=${HADOOP_HADOOP}

 # Starting with hdfs-utils 0.4.0 we need to use Java 17 for compilation.
 # We can not simply use java-devel with Java 17, as it is also used to compile Hadoop in this
@@ -161,25 +35,31 @@ WORKDIR /stackable
 COPY --chown=${STACKABLE_USER_UID}:0 hadoop/hdfs-utils/stackable/patches/patchable.toml /stackable/src/hadoop/hdfs-utils/stackable/patches/patchable.toml
 COPY --chown=${STACKABLE_USER_UID}:0 hadoop/hdfs-utils/stackable/patches/${HDFS_UTILS} /stackable/src/hadoop/hdfs-utils/stackable/patches/${HDFS_UTILS}

+COPY --from=hadoop-builder --chown=${STACKABLE_USER_UID}:0 /stackable/patched-libs /stackable/patched-libs
+
 # The Stackable HDFS utils contain an OPA authorizer, group mapper & topology provider.
 # The topology provider provides rack awareness functionality for HDFS by allowing users to specify Kubernetes
 # labels to build a rackID from.
 # Starting with hdfs-utils version 0.3.0 the topology provider is not a standalone jar anymore and included in hdfs-utils.
 RUN <<EOF
 cd "$(/stackable/patchable --images-repo-root=src checkout hadoop/hdfs-utils ${HDFS_UTILS})"

+# Make Maven aware of custom Stackable libraries
+mkdir -p /stackable/.m2/repository
+cp -r /stackable/patched-libs/maven/* /stackable/.m2/repository
+
 # Create snapshot of the source code including custom patches
 tar -czf /stackable/hdfs-utils-${HDFS_UTILS}-src.tar.gz .

 mvn \
 --batch-mode \
 --no-transfer-progress\
 clean package \
--P hadoop-${PRODUCT} \
+-P hadoop-${HADOOP_VERSION} \
+-Dhadoop.version=${HADOOP_VERSION}-stackable${RELEASE} \
 -DskipTests \
 -Dmaven.javadoc.skip=true

-mkdir -p /stackable
 cp target/hdfs-utils-$HDFS_UTILS.jar /stackable/hdfs-utils-${HDFS_UTILS}.jar
 rm -rf hdfs-utils-main

@@ -191,11 +71,15 @@ FROM stackable/image/java-base AS final

 ARG PRODUCT
 ARG RELEASE
-ARG TARGETARCH
-ARG TARGETOS
+ARG HADOOP_HADOOP
+# Reassign the arg to `HADOOP_VERSION` for better readability.
+ENV HADOOP_VERSION=${HADOOP_HADOOP}
 ARG HDFS_UTILS
-ARG ASYNC_PROFILER
 ARG STACKABLE_USER_UID
+ARG ASYNC_PROFILER
+ARG JMX_EXPORTER
+ARG TARGETARCH
+ARG TARGETOS

 LABEL \
 name="Apache Hadoop" \
@@ -206,17 +90,13 @@ LABEL \
 summary="The Stackable image for Apache Hadoop." \
 description="This image is deployed by the Stackable Operator for Apache Hadoop / HDFS."

+COPY --chown=${STACKABLE_USER_UID}:0 --from=hadoop-builder /stackable/hadoop-${HADOOP_VERSION}-stackable${RELEASE} /stackable/hadoop-${HADOOP_VERSION}-stackable${RELEASE}
+COPY --chown=${STACKABLE_USER_UID}:0 --from=hadoop-builder /stackable/*-src.tar.gz /stackable

-COPY --chown=${STACKABLE_USER_UID}:0 --from=hadoop-builder /stackable/hadoop-${PRODUCT}-stackable${RELEASE} /stackable/hadoop-${PRODUCT}-stackable${RELEASE}
-COPY --chown=${STACKABLE_USER_UID}:0 --from=hadoop-builder /stackable/hadoop-${PRODUCT}-stackable${RELEASE}-src.tar.gz /stackable/
-COPY --chown=${STACKABLE_USER_UID}:0 --from=hadoop-builder /stackable/async-profiler-${ASYNC_PROFILER}-* /stackable/async-profiler-${ASYNC_PROFILER}
-COPY --chown=${STACKABLE_USER_UID}:0 --from=hadoop-builder /stackable/jmx /stackable/jmx
-COPY --chown=${STACKABLE_USER_UID}:0 --from=hadoop-builder /stackable/protobuf-*-src.tar.gz /stackable/
-COPY --chown=${STACKABLE_USER_UID}:0 --from=hadoop-builder /stackable/patched-libs /stackable/patched-libs
-
-COPY --chown=${STACKABLE_USER_UID}:0 --from=hdfs-utils-builder /stackable/hdfs-utils-${HDFS_UTILS}.jar /stackable/hadoop-${PRODUCT}-stackable${RELEASE}/share/hadoop/common/lib/hdfs-utils-${HDFS_UTILS}.jar
+COPY --chown=${STACKABLE_USER_UID}:0 --from=hdfs-utils-builder /stackable/hdfs-utils-${HDFS_UTILS}.jar /stackable/hadoop-${HADOOP_VERSION}-stackable${RELEASE}/share/hadoop/common/lib/hdfs-utils-${HDFS_UTILS}.jar
 COPY --chown=${STACKABLE_USER_UID}:0 --from=hdfs-utils-builder /stackable/hdfs-utils-${HDFS_UTILS}-src.tar.gz /stackable

+COPY --chown=${STACKABLE_USER_UID}:0 hadoop/stackable/jmx /stackable/jmx
 COPY --chown=${STACKABLE_USER_UID}:0 hadoop/licenses /licenses

 # fuse is required for fusermount (called by fuse_dfs)
@@ -241,21 +121,24 @@ rm -rf /var/cache/yum
 # It is so non-root users (as we are) can mount a FUSE device and let other users access it
 echo "user_allow_other" > /etc/fuse.conf

-ln -s "/stackable/hadoop-${PRODUCT}-stackable${RELEASE}" /stackable/hadoop
-chown --no-dereference "${STACKABLE_USER_UID}:0" /stackable/hadoop
-chmod g=u "/stackable/hadoop-${PRODUCT}-stackable${RELEASE}"
-chmod g=u /stackable/*-src.tar.gz
+ln -s "/stackable/hadoop-${HADOOP_VERSION}-stackable${RELEASE}" /stackable/hadoop

+# async-profiler
 ARCH="${TARGETARCH/amd64/x64}"
-mv /stackable/async-profiler-${ASYNC_PROFILER}* "/stackable/async-profiler-${ASYNC_PROFILER-}-${TARGETOS}-${ARCH}"
-chmod g=u "/stackable/async-profiler-${ASYNC_PROFILER-}-${TARGETOS}-${ARCH}"
+curl "https://repo.stackable.tech/repository/packages/async-profiler/async-profiler-${ASYNC_PROFILER}-${TARGETOS}-${ARCH}.tar.gz" | tar -xzC /stackable
 ln -s "/stackable/async-profiler-${ASYNC_PROFILER}-${TARGETOS}-${ARCH}" /stackable/async-profiler
-chown --no-dereference "${STACKABLE_USER_UID}:0" /stackable/async-profiler

-chmod g=u /stackable/jmx
-chmod g=u /stackable/patched-libs
+# JMX Exporter
+curl "https://repo.stackable.tech/repository/packages/jmx-exporter/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" -o "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar"
+chmod -x "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar"
+ln -s "/stackable/jmx/jmx_prometheus_javaagent-${JMX_EXPORTER}.jar" /stackable/jmx/jmx_prometheus_javaagent.jar

+# Set correct permissions and ownerships
+chown --recursive ${STACKABLE_USER_UID}:0 /stackable/hadoop /stackable/jmx /stackable/async-profiler "/stackable/async-profiler-${ASYNC_PROFILER}-${TARGETOS}-${ARCH}"
+chmod --recursive g=u /stackable/jmx /stackable/async-profiler "/stackable/hadoop-${HADOOP_VERSION}-stackable${RELEASE}"
+EOF

+RUN <<EOF
 # ----------------------------------------
 # Checks
 # This section is to run final checks to ensure the created final images
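Throughout the reworked Dockerfile the Stackable-patched Hadoop artifacts are addressed by a composed version string, `${HADOOP_VERSION}-stackable${RELEASE}`, which appears in the Maven `-Dhadoop.version` flag, the COPY paths and the `/stackable/hadoop` symlink target. A tiny sketch of that composition with placeholder values (the real ones are injected via the HADOOP_HADOOP and RELEASE build args):

```python
# Placeholder values for illustration only; not taken from this commit.
hadoop_version = "3.3.6"   # HADOOP_HADOOP, re-exported as HADOOP_VERSION in the Dockerfile
release = "1.0.0"          # RELEASE

stackable_version = f"{hadoop_version}-stackable{release}"
install_dir = f"/stackable/hadoop-{stackable_version}"

print(stackable_version)  # 3.3.6-stackable1.0.0
print(install_dir)        # /stackable/hadoop-3.3.6-stackable1.0.0 (symlinked to /stackable/hadoop)
```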
