From 0a72a4c8385ef1fd3f771b5838b6b9e20ddd9b88 Mon Sep 17 00:00:00 2001 From: jkottu Date: Wed, 16 Jul 2025 10:11:29 -0700 Subject: [PATCH 01/16] build tensorflow 2.18 training sm --- dlc_developer_config.toml | 6 +++--- tensorflow/training/buildspec-2-18-sm.yml | 2 +- .../training/docker/2.18/py3/cu125/Dockerfile.gpu | 14 ++++++++++++++ 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 1962bfd69e21..159f125b279a 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -37,12 +37,12 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. # available frameworks - ["base", "vllm", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"] -build_frameworks = [] +build_frameworks = ["tensorflow] # By default we build both training and inference containers. Set true/false values to determine which to build. build_training = true -build_inference = true +build_inference = false # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" @@ -120,7 +120,7 @@ use_scheduler = false # Standard Framework Training dlc-pr-pytorch-training = "" -dlc-pr-tensorflow-2-training = "" +dlc-pr-tensorflow-2-training = "tensorflow/training/buildspec-2-18-sm.yml" dlc-pr-autogluon-training = "" # ARM64 Training diff --git a/tensorflow/training/buildspec-2-18-sm.yml b/tensorflow/training/buildspec-2-18-sm.yml index 3b899394e87e..eab6fc553978 100644 --- a/tensorflow/training/buildspec-2-18-sm.yml +++ b/tensorflow/training/buildspec-2-18-sm.yml @@ -5,7 +5,7 @@ framework: &FRAMEWORK tensorflow version: &VERSION 2.18.0 short_version: &SHORT_VERSION "2.18" arch_type: x86 -autopatch_build: "True" +# autopatch_build: "True" repository_info: training_repository: &TRAINING_REPOSITORY diff --git a/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.gpu b/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.gpu index 4040968586c1..cbed4d98b743 100644 --- a/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.gpu +++ b/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.gpu @@ -112,6 +112,20 @@ RUN apt-get update && apt-get install -y --no-install-recommends --allow-unauthe && apt-get clean \ && mkdir -p /var/run/sshd +# patch nvjpeg +RUN mkdir -p /tmp/nvjpeg \ +&& cd /tmp/nvjpeg \ +&& wget https://developer.download.nvidia.com/compute/cuda/redist/libnvjpeg/linux-x86_64/libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \ +&& tar -xvf libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \ +&& rm -rf /usr/local/cuda/targets/x86_64-linux/lib/libnvjpeg* \ +&& rm -rf /usr/local/cuda/targets/x86_64-linux/include/nvjpeg.h \ +&& cp libnvjpeg-linux-x86_64-12.4.0.76-archive/lib/libnvjpeg* /usr/local/cuda/targets/x86_64-linux/lib/ \ +&& cp libnvjpeg-linux-x86_64-12.4.0.76-archive/include/* /usr/local/cuda/targets/x86_64-linux/include/ \ +&& rm -rf /tmp/nvjpeg \ +# patch cuobjdump and nvdisasm +&& rm -rf /usr/local/cuda/bin/cuobjdump* \ +&& rm -rf /usr/local/cuda/bin/nvdisasm* + # Install EFA without AWS OPEN_MPI RUN apt-get update \ From e8616b9c1ec62c503e8e0b4039e68ef75e687f5a Mon Sep 17 00:00:00 2001 From: jkottu Date: Wed, 16 Jul 2025 10:12:49 -0700 Subject: [PATCH 02/16] build tensorflow 2.18 training ec2 --- dlc_developer_config.toml | 2 +- tensorflow/training/buildspec-2-18-ec2.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 159f125b279a..4c5089e7ed31 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -120,7 +120,7 @@ use_scheduler = false # Standard Framework Training dlc-pr-pytorch-training = "" -dlc-pr-tensorflow-2-training = "tensorflow/training/buildspec-2-18-sm.yml" +dlc-pr-tensorflow-2-training = "tensorflow/training/buildspec-2-18-ec2.yml" dlc-pr-autogluon-training = "" # ARM64 Training diff --git a/tensorflow/training/buildspec-2-18-ec2.yml b/tensorflow/training/buildspec-2-18-ec2.yml index aac371965cd3..20148d5c29f5 100644 --- a/tensorflow/training/buildspec-2-18-ec2.yml +++ b/tensorflow/training/buildspec-2-18-ec2.yml @@ -5,7 +5,7 @@ framework: &FRAMEWORK tensorflow version: &VERSION 2.18.0 short_version: &SHORT_VERSION "2.18" arch_type: x86 -autopatch_build: "True" +# autopatch_build: "True" repository_info: training_repository: &TRAINING_REPOSITORY From be39c4d7c49ff4d14e1fdcc19877a23e3bf4012e Mon Sep 17 00:00:00 2001 From: jkottu Date: Wed, 16 Jul 2025 10:17:56 -0700 Subject: [PATCH 03/16] build tensorflow 2.19 training sm --- dlc_developer_config.toml | 2 +- .../training/docker/2.19/py3/cu125/Dockerfile.gpu | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 4c5089e7ed31..e975beb59aa4 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -120,7 +120,7 @@ use_scheduler = false # Standard Framework Training dlc-pr-pytorch-training = "" -dlc-pr-tensorflow-2-training = "tensorflow/training/buildspec-2-18-ec2.yml" +dlc-pr-tensorflow-2-training = "tensorflow/training/buildspec-2-19-sm.yml" dlc-pr-autogluon-training = "" # ARM64 Training diff --git a/tensorflow/training/docker/2.19/py3/cu125/Dockerfile.gpu b/tensorflow/training/docker/2.19/py3/cu125/Dockerfile.gpu index b73bb8df84a1..dd338ff20579 100644 --- a/tensorflow/training/docker/2.19/py3/cu125/Dockerfile.gpu +++ b/tensorflow/training/docker/2.19/py3/cu125/Dockerfile.gpu @@ -224,6 +224,20 @@ RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_confi # Add NGC vars ENV TF_AUTOTUNE_THRESHOLD=2 +# patch nvjpeg +RUN mkdir -p /tmp/nvjpeg \ +&& cd /tmp/nvjpeg \ +&& wget https://developer.download.nvidia.com/compute/cuda/redist/libnvjpeg/linux-x86_64/libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \ +&& tar -xvf libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \ +&& rm -rf /usr/local/cuda/targets/x86_64-linux/lib/libnvjpeg* \ +&& rm -rf /usr/local/cuda/targets/x86_64-linux/include/nvjpeg.h \ +&& cp libnvjpeg-linux-x86_64-12.4.0.76-archive/lib/libnvjpeg* /usr/local/cuda/targets/x86_64-linux/lib/ \ +&& cp libnvjpeg-linux-x86_64-12.4.0.76-archive/include/* /usr/local/cuda/targets/x86_64-linux/include/ \ +&& rm -rf /tmp/nvjpeg \ +# patch cuobjdump and nvdisasm +&& rm -rf /usr/local/cuda/bin/cuobjdump* \ +&& rm -rf /usr/local/cuda/bin/nvdisasm* + ADD https://raw.githubusercontent.com/aws/deep-learning-containers/master/src/deep_learning_container.py /usr/local/bin/deep_learning_container.py RUN chmod +x /usr/local/bin/deep_learning_container.py From ec2b7b94d07428050cfa3a25f8cf0a469e38d48b Mon Sep 17 00:00:00 2001 From: jkottu Date: Wed, 16 Jul 2025 10:22:05 -0700 Subject: [PATCH 04/16] build tensorflow 2.19 training sm --- dlc_developer_config.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index e975beb59aa4..e4f3090ea51d 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -37,7 +37,7 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. # available frameworks - ["base", "vllm", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"] -build_frameworks = ["tensorflow] +build_frameworks = ["tensorflow"] # By default we build both training and inference containers. Set true/false values to determine which to build. From 245eb1bbdf5396fd24bfdaf93d7d6d901da9fd58 Mon Sep 17 00:00:00 2001 From: jkottu Date: Wed, 16 Jul 2025 10:22:22 -0700 Subject: [PATCH 05/16] build tensorflow 2.18 training sm --- dlc_developer_config.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index e4f3090ea51d..02425ff4e648 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -120,7 +120,7 @@ use_scheduler = false # Standard Framework Training dlc-pr-pytorch-training = "" -dlc-pr-tensorflow-2-training = "tensorflow/training/buildspec-2-19-sm.yml" +dlc-pr-tensorflow-2-training = "tensorflow/training/buildspec-2-18-sm.yml" dlc-pr-autogluon-training = "" # ARM64 Training From 840d4969b7203f54237eeb48531e64a7349b094a Mon Sep 17 00:00:00 2001 From: jkottu Date: Wed, 16 Jul 2025 10:22:34 -0700 Subject: [PATCH 06/16] build tensorflow 2.18 training ec2 --- dlc_developer_config.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 02425ff4e648..a4da32b3709e 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -120,7 +120,7 @@ use_scheduler = false # Standard Framework Training dlc-pr-pytorch-training = "" -dlc-pr-tensorflow-2-training = "tensorflow/training/buildspec-2-18-sm.yml" +dlc-pr-tensorflow-2-training = "tensorflow/training/buildspec-2-18-ec2.yml" dlc-pr-autogluon-training = "" # ARM64 Training From f587106fde49ef9057b4473af94d42f463c58465 Mon Sep 17 00:00:00 2001 From: jkottu Date: Wed, 16 Jul 2025 12:59:59 -0700 Subject: [PATCH 07/16] build tensorflow 2.18 with opencv pinned version ec2 --- .../2.18/py3/cu125/Dockerfile.ec2.gpu.core_packages.json | 3 +++ tensorflow/training/docker/2.18/py3/cu125/Dockerfile.gpu | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.ec2.gpu.core_packages.json b/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.ec2.gpu.core_packages.json index 6757b6ec2466..b26fddac0bd1 100644 --- a/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.ec2.gpu.core_packages.json +++ b/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.ec2.gpu.core_packages.json @@ -10,6 +10,9 @@ }, "tensorboard": { "version_specifier": "==2.18.0" + }, + "opencv-python": { + "version_specifier": "==4.11.0.86" }, "tensorboard-data-server": { "version_specifier": ">=0.7.0,<0.8.0" diff --git a/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.gpu b/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.gpu index cbed4d98b743..5079311ac597 100644 --- a/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.gpu +++ b/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.gpu @@ -371,7 +371,7 @@ RUN $PYTHON -m pip install --no-cache-dir -U \ numba \ bokeh \ imageio \ - opencv-python \ + "opencv-python==4.11.0.86" \ plotly \ seaborn \ shap From 14b3411aab0c0959cb18e5bbbdb12fd5599b3fcd Mon Sep 17 00:00:00 2001 From: jkottu Date: Wed, 16 Jul 2025 13:00:17 -0700 Subject: [PATCH 08/16] build tensorflow 2.18 with opencv pinned version sm --- dlc_developer_config.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index a4da32b3709e..02425ff4e648 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -120,7 +120,7 @@ use_scheduler = false # Standard Framework Training dlc-pr-pytorch-training = "" -dlc-pr-tensorflow-2-training = "tensorflow/training/buildspec-2-18-ec2.yml" +dlc-pr-tensorflow-2-training = "tensorflow/training/buildspec-2-18-sm.yml" dlc-pr-autogluon-training = "" # ARM64 Training From 0e99dc45be6af59d0aa23f081c94d2133714eb4e Mon Sep 17 00:00:00 2001 From: jkottu Date: Wed, 16 Jul 2025 13:27:25 -0700 Subject: [PATCH 09/16] build tensorflow 2.18 sm --- .../2.18/py3/cu125/Dockerfile.ec2.gpu.core_packages.json | 3 --- tensorflow/training/docker/2.18/py3/cu125/Dockerfile.gpu | 2 +- tensorflow/training/docker/2.19/py3/cu125/Dockerfile.gpu | 2 +- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.ec2.gpu.core_packages.json b/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.ec2.gpu.core_packages.json index b26fddac0bd1..6757b6ec2466 100644 --- a/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.ec2.gpu.core_packages.json +++ b/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.ec2.gpu.core_packages.json @@ -10,9 +10,6 @@ }, "tensorboard": { "version_specifier": "==2.18.0" - }, - "opencv-python": { - "version_specifier": "==4.11.0.86" }, "tensorboard-data-server": { "version_specifier": ">=0.7.0,<0.8.0" diff --git a/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.gpu b/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.gpu index 5079311ac597..cbed4d98b743 100644 --- a/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.gpu +++ b/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.gpu @@ -371,7 +371,7 @@ RUN $PYTHON -m pip install --no-cache-dir -U \ numba \ bokeh \ imageio \ - "opencv-python==4.11.0.86" \ + opencv-python \ plotly \ seaborn \ shap diff --git a/tensorflow/training/docker/2.19/py3/cu125/Dockerfile.gpu b/tensorflow/training/docker/2.19/py3/cu125/Dockerfile.gpu index dd338ff20579..233b22bd834e 100644 --- a/tensorflow/training/docker/2.19/py3/cu125/Dockerfile.gpu +++ b/tensorflow/training/docker/2.19/py3/cu125/Dockerfile.gpu @@ -211,7 +211,7 @@ RUN ${PIP} install --no-cache-dir -U \ mpi4py \ h5py \ absl-py \ - opencv-python \ + "opencv-python==4.11.0.86" \ werkzeug \ urllib3 \ protobuf From f49b9e72f5af9a9d4d0c0d5e4ae8ba4f9a97511d Mon Sep 17 00:00:00 2001 From: jkottu Date: Wed, 16 Jul 2025 13:27:38 -0700 Subject: [PATCH 10/16] build tensorflow 2.18 ec2 --- dlc_developer_config.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 02425ff4e648..a4da32b3709e 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -120,7 +120,7 @@ use_scheduler = false # Standard Framework Training dlc-pr-pytorch-training = "" -dlc-pr-tensorflow-2-training = "tensorflow/training/buildspec-2-18-sm.yml" +dlc-pr-tensorflow-2-training = "tensorflow/training/buildspec-2-18-ec2.yml" dlc-pr-autogluon-training = "" # ARM64 Training From cab68290630c884d045878d386f30d99d4ded085 Mon Sep 17 00:00:00 2001 From: jkottu Date: Wed, 16 Jul 2025 13:27:52 -0700 Subject: [PATCH 11/16] build tensorflow 2.19 sm --- dlc_developer_config.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index a4da32b3709e..e4f3090ea51d 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -120,7 +120,7 @@ use_scheduler = false # Standard Framework Training dlc-pr-pytorch-training = "" -dlc-pr-tensorflow-2-training = "tensorflow/training/buildspec-2-18-ec2.yml" +dlc-pr-tensorflow-2-training = "tensorflow/training/buildspec-2-19-sm.yml" dlc-pr-autogluon-training = "" # ARM64 Training From 28412a77dca027fc659532ac294a466310eacebe Mon Sep 17 00:00:00 2001 From: jkottu Date: Wed, 16 Jul 2025 14:43:52 -0700 Subject: [PATCH 12/16] build tensorflow 2.18 sm --- dlc_developer_config.toml | 2 +- tensorflow/training/docker/2.18/py3/cu125/Dockerfile.gpu | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index e4f3090ea51d..02425ff4e648 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -120,7 +120,7 @@ use_scheduler = false # Standard Framework Training dlc-pr-pytorch-training = "" -dlc-pr-tensorflow-2-training = "tensorflow/training/buildspec-2-19-sm.yml" +dlc-pr-tensorflow-2-training = "tensorflow/training/buildspec-2-18-sm.yml" dlc-pr-autogluon-training = "" # ARM64 Training diff --git a/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.gpu b/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.gpu index cbed4d98b743..f62f78ca068b 100644 --- a/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.gpu +++ b/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.gpu @@ -112,6 +112,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends --allow-unauthe && apt-get clean \ && mkdir -p /var/run/sshd + # patch nvjpeg RUN mkdir -p /tmp/nvjpeg \ && cd /tmp/nvjpeg \ @@ -236,7 +237,7 @@ RUN ${PIP} install --no-cache-dir -U \ opencv-python \ werkzeug \ urllib3 \ - "protobuf<4" + "protobuf>=3.20.3,<4" # Install AWS OFI NCCL plug-in RUN apt-get update && apt-get install -y \ @@ -296,7 +297,7 @@ ARG TF_URL RUN ${PIP} install --no-cache-dir -U \ ${TF_URL} \ "tensorflow-io==0.37.*" \ - tensorflow-datasets + "tensorflow-datasets==4.9.7" RUN HOME_DIR=/root \ && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ @@ -365,7 +366,7 @@ RUN ${PIP} install --no-cache-dir -U \ && ${PIP} install --no-cache-dir -U \ ${TF_URL} \ "tensorflow-io==0.37.*" \ - tensorflow-datasets + "tensorflow-datasets==4.9.7" RUN $PYTHON -m pip install --no-cache-dir -U \ numba \ From 181d9a32630ea0f0864fe1f98bfa0fdc1ee40422 Mon Sep 17 00:00:00 2001 From: jkottu Date: Wed, 16 Jul 2025 14:44:04 -0700 Subject: [PATCH 13/16] build tensorflow 2.18 ec2 --- dlc_developer_config.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 02425ff4e648..a4da32b3709e 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -120,7 +120,7 @@ use_scheduler = false # Standard Framework Training dlc-pr-pytorch-training = "" -dlc-pr-tensorflow-2-training = "tensorflow/training/buildspec-2-18-sm.yml" +dlc-pr-tensorflow-2-training = "tensorflow/training/buildspec-2-18-ec2.yml" dlc-pr-autogluon-training = "" # ARM64 Training From 7786b20070b59d5dd30db49fd298dc115c88f7fd Mon Sep 17 00:00:00 2001 From: jkottu Date: Wed, 16 Jul 2025 14:49:09 -0700 Subject: [PATCH 14/16] build tensorflow 2.19 sm with open cv pinned --- dlc_developer_config.toml | 2 +- tensorflow/training/docker/2.19/py3/cu125/Dockerfile.gpu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index a4da32b3709e..e4f3090ea51d 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -120,7 +120,7 @@ use_scheduler = false # Standard Framework Training dlc-pr-pytorch-training = "" -dlc-pr-tensorflow-2-training = "tensorflow/training/buildspec-2-18-ec2.yml" +dlc-pr-tensorflow-2-training = "tensorflow/training/buildspec-2-19-sm.yml" dlc-pr-autogluon-training = "" # ARM64 Training diff --git a/tensorflow/training/docker/2.19/py3/cu125/Dockerfile.gpu b/tensorflow/training/docker/2.19/py3/cu125/Dockerfile.gpu index 233b22bd834e..b5aa9e95e112 100644 --- a/tensorflow/training/docker/2.19/py3/cu125/Dockerfile.gpu +++ b/tensorflow/training/docker/2.19/py3/cu125/Dockerfile.gpu @@ -352,7 +352,7 @@ RUN $PYTHON -m pip install --no-cache-dir -U \ numba \ bokeh \ imageio \ - opencv-python \ + "opencv-python==4.11.0.86" \ plotly \ seaborn \ shap From ab65536630bb80714356228dbe888776af4cd320 Mon Sep 17 00:00:00 2001 From: jkottu Date: Fri, 18 Jul 2025 09:16:19 -0700 Subject: [PATCH 15/16] build 2.19 tensorflow --- tensorflow/training/buildspec-2-18-sm.yml | 2 +- .../docker/2.18/py3/cu125/Dockerfile.gpu | 21 +++---------------- 2 files changed, 4 insertions(+), 19 deletions(-) diff --git a/tensorflow/training/buildspec-2-18-sm.yml b/tensorflow/training/buildspec-2-18-sm.yml index eab6fc553978..3b899394e87e 100644 --- a/tensorflow/training/buildspec-2-18-sm.yml +++ b/tensorflow/training/buildspec-2-18-sm.yml @@ -5,7 +5,7 @@ framework: &FRAMEWORK tensorflow version: &VERSION 2.18.0 short_version: &SHORT_VERSION "2.18" arch_type: x86 -# autopatch_build: "True" +autopatch_build: "True" repository_info: training_repository: &TRAINING_REPOSITORY diff --git a/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.gpu b/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.gpu index f62f78ca068b..4040968586c1 100644 --- a/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.gpu +++ b/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.gpu @@ -112,21 +112,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends --allow-unauthe && apt-get clean \ && mkdir -p /var/run/sshd - -# patch nvjpeg -RUN mkdir -p /tmp/nvjpeg \ -&& cd /tmp/nvjpeg \ -&& wget https://developer.download.nvidia.com/compute/cuda/redist/libnvjpeg/linux-x86_64/libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \ -&& tar -xvf libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \ -&& rm -rf /usr/local/cuda/targets/x86_64-linux/lib/libnvjpeg* \ -&& rm -rf /usr/local/cuda/targets/x86_64-linux/include/nvjpeg.h \ -&& cp libnvjpeg-linux-x86_64-12.4.0.76-archive/lib/libnvjpeg* /usr/local/cuda/targets/x86_64-linux/lib/ \ -&& cp libnvjpeg-linux-x86_64-12.4.0.76-archive/include/* /usr/local/cuda/targets/x86_64-linux/include/ \ -&& rm -rf /tmp/nvjpeg \ -# patch cuobjdump and nvdisasm -&& rm -rf /usr/local/cuda/bin/cuobjdump* \ -&& rm -rf /usr/local/cuda/bin/nvdisasm* - # Install EFA without AWS OPEN_MPI RUN apt-get update \ @@ -237,7 +222,7 @@ RUN ${PIP} install --no-cache-dir -U \ opencv-python \ werkzeug \ urllib3 \ - "protobuf>=3.20.3,<4" + "protobuf<4" # Install AWS OFI NCCL plug-in RUN apt-get update && apt-get install -y \ @@ -297,7 +282,7 @@ ARG TF_URL RUN ${PIP} install --no-cache-dir -U \ ${TF_URL} \ "tensorflow-io==0.37.*" \ - "tensorflow-datasets==4.9.7" + tensorflow-datasets RUN HOME_DIR=/root \ && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ @@ -366,7 +351,7 @@ RUN ${PIP} install --no-cache-dir -U \ && ${PIP} install --no-cache-dir -U \ ${TF_URL} \ "tensorflow-io==0.37.*" \ - "tensorflow-datasets==4.9.7" + tensorflow-datasets RUN $PYTHON -m pip install --no-cache-dir -U \ numba \ From 7b651436193621679e43ca8c237b9823cc222cc7 Mon Sep 17 00:00:00 2001 From: jkottu Date: Fri, 18 Jul 2025 10:39:47 -0700 Subject: [PATCH 16/16] revert toml --- dlc_developer_config.toml | 6 +++--- tensorflow/training/buildspec-2-18-ec2.yml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index e4f3090ea51d..1962bfd69e21 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -37,12 +37,12 @@ deep_canary_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. # available frameworks - ["base", "vllm", "autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "pytorch", "stabilityai_pytorch"] -build_frameworks = ["tensorflow"] +build_frameworks = [] # By default we build both training and inference containers. Set true/false values to determine which to build. build_training = true -build_inference = false +build_inference = true # Set do_build to "false" to skip builds and test the latest image built by this PR # Note: at least one build is required to set do_build to "false" @@ -120,7 +120,7 @@ use_scheduler = false # Standard Framework Training dlc-pr-pytorch-training = "" -dlc-pr-tensorflow-2-training = "tensorflow/training/buildspec-2-19-sm.yml" +dlc-pr-tensorflow-2-training = "" dlc-pr-autogluon-training = "" # ARM64 Training diff --git a/tensorflow/training/buildspec-2-18-ec2.yml b/tensorflow/training/buildspec-2-18-ec2.yml index 20148d5c29f5..aac371965cd3 100644 --- a/tensorflow/training/buildspec-2-18-ec2.yml +++ b/tensorflow/training/buildspec-2-18-ec2.yml @@ -5,7 +5,7 @@ framework: &FRAMEWORK tensorflow version: &VERSION 2.18.0 short_version: &SHORT_VERSION "2.18" arch_type: x86 -# autopatch_build: "True" +autopatch_build: "True" repository_info: training_repository: &TRAINING_REPOSITORY