diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 59eaa3a68a93..1962bfd69e21 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -72,15 +72,12 @@ ec2_benchmark_tests = false ### These tests are run in EC2 test jobs, so ec2_tests must be true if ec2_tests_on_heavy_instances is true. ### Off by default (set to false) ec2_tests_on_heavy_instances = false - ### SM specific tests ### On by default sagemaker_local_tests = true - ### Set enable_ipv6 = true to run tests with IPv6-enabled resources ### Off by default (set to false) enable_ipv6 = false - ### Set the VPC name to be used for IPv6 testing, this variable is empty by default ### To create an IPv6-enabled VPC and its related resources: ### 1. Follow this AWS doc: https://docs.aws.amazon.com/vpc/latest/userguide/create-vpc.html#create-vpc-and-other-resources diff --git a/tensorflow/training/buildspec-2-18-ec2.yml b/tensorflow/training/buildspec-2-18-ec2.yml index aac371965cd3..20148d5c29f5 100644 --- a/tensorflow/training/buildspec-2-18-ec2.yml +++ b/tensorflow/training/buildspec-2-18-ec2.yml @@ -5,7 +5,7 @@ framework: &FRAMEWORK tensorflow version: &VERSION 2.18.0 short_version: &SHORT_VERSION "2.18" arch_type: x86 -autopatch_build: "True" +# autopatch_build: "True" repository_info: training_repository: &TRAINING_REPOSITORY diff --git a/tensorflow/training/buildspec-2-18-sm.yml b/tensorflow/training/buildspec-2-18-sm.yml index 3b899394e87e..eab6fc553978 100644 --- a/tensorflow/training/buildspec-2-18-sm.yml +++ b/tensorflow/training/buildspec-2-18-sm.yml @@ -5,7 +5,7 @@ framework: &FRAMEWORK tensorflow version: &VERSION 2.18.0 short_version: &SHORT_VERSION "2.18" arch_type: x86 -autopatch_build: "True" +# autopatch_build: "True" repository_info: training_repository: &TRAINING_REPOSITORY diff --git a/tensorflow/training/docker/2.18/py3/Dockerfile.cpu b/tensorflow/training/docker/2.18/py3/Dockerfile.cpu index eee6cd96e3cf..984994f57bce 100644 --- a/tensorflow/training/docker/2.18/py3/Dockerfile.cpu +++ b/tensorflow/training/docker/2.18/py3/Dockerfile.cpu @@ -159,9 +159,8 @@ RUN ${PIP} install --no-cache-dir -U \ absl-py \ opencv-python \ werkzeug \ - psutil \ - "protobuf<4" - + psutil + ADD https://raw.githubusercontent.com/aws/deep-learning-containers/master/src/deep_learning_container.py /usr/local/bin/deep_learning_container.py RUN chmod +x /usr/local/bin/deep_learning_container.py @@ -189,7 +188,7 @@ ARG TF_URL RUN ${PIP} install --no-cache-dir -U \ ${TF_URL} \ "tensorflow-io==0.37.*" \ - tensorflow-datasets + "tensorflow-datasets==4.9.7" RUN HOME_DIR=/root \ && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ @@ -261,26 +260,41 @@ RUN ${PIP} install --no-cache-dir -U \ && ${PIP} install --no-cache-dir -U \ ${TF_URL} \ "tensorflow-io==0.37.*" \ - tensorflow-datasets + "tensorflow-datasets==4.9.7" RUN $PYTHON -m pip install --no-cache-dir -U \ - numba \ + numba==0.61.0 \ bokeh \ imageio \ opencv-python \ plotly \ seaborn \ - shap + shap + +RUN $PYTHON -m pip install --no-cache-dir -U \ + "sagemaker<3" RUN $PYTHON -m pip install --no-cache-dir -U \ - "sagemaker<3" \ - sagemaker-experiments==0.* \ - sagemaker-tensorflow-training \ - sagemaker-training \ - "sagemaker-studio-analytics-extension<1" \ - "sparkmagic<1" \ - "sagemaker-studio-sparkmagic-lib<1" \ - smclarify + sagemaker-experiments==0.1.45 + +RUN $PYTHON -m pip install --no-cache-dir -U \ + sagemaker-tensorflow-training + +RUN $PYTHON -m pip install --no-cache-dir -U \ + sagemaker-training + +RUN $PYTHON -m pip install --no-cache-dir -U \ + sagemaker-studio-analytics-extension==0.1.4 + +RUN $PYTHON -m pip install --no-cache-dir -U \ + sagemaker-studio-sparkmagic-lib==0.2.0 + +RUN $PYTHON -m pip install --no-cache-dir -U \ + sparkmagic==0.21.0 \ + smclarify + +#pin numpy version because of sagemaker-tensorflow-training dependency +RUN $PYTHON -m pip install --no-cache-dir numpy==1.26.4 # Remove python kernel installed by sparkmagic RUN /usr/local/bin/jupyter-kernelspec remove -f python3 diff --git a/tensorflow/training/docker/2.18/py3/Dockerfile.ec2.cpu.core_packages.json b/tensorflow/training/docker/2.18/py3/Dockerfile.ec2.cpu.core_packages.json index d71af1c4fd77..9488cc04e4cd 100644 --- a/tensorflow/training/docker/2.18/py3/Dockerfile.ec2.cpu.core_packages.json +++ b/tensorflow/training/docker/2.18/py3/Dockerfile.ec2.cpu.core_packages.json @@ -3,7 +3,7 @@ "version_specifier": ">=1.33.13,<2" }, "protobuf": { - "version_specifier": ">=3.20.3,<4" + "version_specifier": ">=4.21.12" }, "pyyaml": { "version_specifier": ">=6.0,<6.1" diff --git a/tensorflow/training/docker/2.18/py3/Dockerfile.ec2.cpu.py_scan_allowlist.json b/tensorflow/training/docker/2.18/py3/Dockerfile.ec2.cpu.py_scan_allowlist.json new file mode 100644 index 000000000000..8ba0ee7d69f4 --- /dev/null +++ b/tensorflow/training/docker/2.18/py3/Dockerfile.ec2.cpu.py_scan_allowlist.json @@ -0,0 +1,3 @@ +{ + "77740": "[Package: protobuf] Affected versions of this package are vulnerable to a potential Denial of Service (DoS) attack due to unbounded recursion when parsing untrusted Protocol Buffers data. The pure-Python implementation fails to enforce recursion depth limits when processing recursive groups, recursive messages, or a series of SGROUP tags, leading to stack overflow conditions that can crash the application by exceeding Python's recursion limit." +} \ No newline at end of file diff --git a/tensorflow/training/docker/2.18/py3/Dockerfile.sagemaker.cpu.core_packages.json b/tensorflow/training/docker/2.18/py3/Dockerfile.sagemaker.cpu.core_packages.json index 12cd446d16c2..dfe57edd5fc6 100644 --- a/tensorflow/training/docker/2.18/py3/Dockerfile.sagemaker.cpu.core_packages.json +++ b/tensorflow/training/docker/2.18/py3/Dockerfile.sagemaker.cpu.core_packages.json @@ -6,7 +6,7 @@ "version_specifier": "<3.0" }, "protobuf": { - "version_specifier": ">=3.20.3,<4" + "version_specifier": ">=5.29.5" }, "pyyaml": { "version_specifier": ">=6.0,<6.1" @@ -18,7 +18,7 @@ "version_specifier": ">=20.4.1,<21" }, "sagemaker-training": { - "version_specifier": ">=4.7.4,<5" + "version_specifier": ">=5" }, "sagemaker-studio-analytics-extension": { "version_specifier": "<1" diff --git a/tensorflow/training/docker/2.18/py3/Dockerfile.sagemaker.cpu.py_scan_allowlist.json b/tensorflow/training/docker/2.18/py3/Dockerfile.sagemaker.cpu.py_scan_allowlist.json new file mode 100644 index 000000000000..8ba0ee7d69f4 --- /dev/null +++ b/tensorflow/training/docker/2.18/py3/Dockerfile.sagemaker.cpu.py_scan_allowlist.json @@ -0,0 +1,3 @@ +{ + "77740": "[Package: protobuf] Affected versions of this package are vulnerable to a potential Denial of Service (DoS) attack due to unbounded recursion when parsing untrusted Protocol Buffers data. The pure-Python implementation fails to enforce recursion depth limits when processing recursive groups, recursive messages, or a series of SGROUP tags, leading to stack overflow conditions that can crash the application by exceeding Python's recursion limit." +} \ No newline at end of file diff --git a/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.ec2.gpu.core_packages.json b/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.ec2.gpu.core_packages.json index 6757b6ec2466..683f162f200b 100644 --- a/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.ec2.gpu.core_packages.json +++ b/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.ec2.gpu.core_packages.json @@ -3,7 +3,7 @@ "version_specifier": ">=1.33.13,<2" }, "protobuf": { - "version_specifier": ">=3.20.3,<4" + "version_specifier": ">=4.21.12" }, "pyyaml": { "version_specifier": ">=6.0,<6.1" diff --git a/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.ec2.gpu.py_scan_allowlist.json b/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.ec2.gpu.py_scan_allowlist.json new file mode 100644 index 000000000000..8ba0ee7d69f4 --- /dev/null +++ b/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.ec2.gpu.py_scan_allowlist.json @@ -0,0 +1,3 @@ +{ + "77740": "[Package: protobuf] Affected versions of this package are vulnerable to a potential Denial of Service (DoS) attack due to unbounded recursion when parsing untrusted Protocol Buffers data. The pure-Python implementation fails to enforce recursion depth limits when processing recursive groups, recursive messages, or a series of SGROUP tags, leading to stack overflow conditions that can crash the application by exceeding Python's recursion limit." +} \ No newline at end of file diff --git a/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.gpu b/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.gpu index 4040968586c1..5fd32e0d07e0 100644 --- a/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.gpu +++ b/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.gpu @@ -221,9 +221,8 @@ RUN ${PIP} install --no-cache-dir -U \ absl-py \ opencv-python \ werkzeug \ - urllib3 \ - "protobuf<4" - + urllib3 + # Install AWS OFI NCCL plug-in RUN apt-get update && apt-get install -y \ autoconf \ @@ -246,6 +245,20 @@ RUN mkdir /tmp/efa-ofi-nccl \ && make install \ && rm -rf /tmp/efa-ofi-nccl +# patch nvjpeg +RUN mkdir -p /tmp/nvjpeg \ +&& cd /tmp/nvjpeg \ +&& wget https://developer.download.nvidia.com/compute/cuda/redist/libnvjpeg/linux-x86_64/libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \ +&& tar -xvf libnvjpeg-linux-x86_64-12.4.0.76-archive.tar.xz \ +&& rm -rf /usr/local/cuda/targets/x86_64-linux/lib/libnvjpeg* \ +&& rm -rf /usr/local/cuda/targets/x86_64-linux/include/nvjpeg.h \ +&& cp libnvjpeg-linux-x86_64-12.4.0.76-archive/lib/libnvjpeg* /usr/local/cuda/targets/x86_64-linux/lib/ \ +&& cp libnvjpeg-linux-x86_64-12.4.0.76-archive/include/* /usr/local/cuda/targets/x86_64-linux/include/ \ +&& rm -rf /tmp/nvjpeg \ +# patch cuobjdump and nvdisasm +&& rm -rf /usr/local/cuda/bin/cuobjdump* \ +&& rm -rf /usr/local/cuda/bin/nvdisasm* + # Allow OpenSSH to talk to containers without asking for confirmation RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new \ && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \ @@ -282,7 +295,7 @@ ARG TF_URL RUN ${PIP} install --no-cache-dir -U \ ${TF_URL} \ "tensorflow-io==0.37.*" \ - tensorflow-datasets + "tensorflow-datasets==4.9.7" RUN HOME_DIR=/root \ && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \ @@ -351,26 +364,41 @@ RUN ${PIP} install --no-cache-dir -U \ && ${PIP} install --no-cache-dir -U \ ${TF_URL} \ "tensorflow-io==0.37.*" \ - tensorflow-datasets + "tensorflow-datasets==4.9.7" RUN $PYTHON -m pip install --no-cache-dir -U \ - numba \ + numba==0.61.0 \ bokeh \ imageio \ opencv-python \ plotly \ seaborn \ - shap + shap + +RUN $PYTHON -m pip install --no-cache-dir -U \ + "sagemaker<3" + +RUN $PYTHON -m pip install --no-cache-dir -U \ + sagemaker-experiments==0.1.45 + +RUN $PYTHON -m pip install --no-cache-dir -U \ + sagemaker-tensorflow-training RUN $PYTHON -m pip install --no-cache-dir -U \ - "sagemaker<3" \ - sagemaker-experiments==0.* \ - sagemaker-tensorflow-training \ - sagemaker-training \ - "sagemaker-studio-analytics-extension<1" \ - "sparkmagic<1" \ - "sagemaker-studio-sparkmagic-lib<1" \ - smclarify + sagemaker-training + +RUN $PYTHON -m pip install --no-cache-dir -U \ + sagemaker-studio-analytics-extension==0.1.4 + +RUN $PYTHON -m pip install --no-cache-dir -U \ + sagemaker-studio-sparkmagic-lib==0.2.0 + +RUN $PYTHON -m pip install --no-cache-dir -U \ + sparkmagic==0.21.0 \ + smclarify + +#pin numpy version because of sagemaker-tensorflow-training dependency +RUN $PYTHON -m pip install --no-cache-dir numpy==1.26.4 # install boost # tensorflow is compiled with --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=1" diff --git a/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.sagemaker.gpu.core_packages.json b/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.sagemaker.gpu.core_packages.json index 936725dcf254..f4bfbaa9c2e8 100644 --- a/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.sagemaker.gpu.core_packages.json +++ b/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.sagemaker.gpu.core_packages.json @@ -6,7 +6,7 @@ "version_specifier": "<3.0" }, "protobuf": { - "version_specifier": ">=3.20.3,<4" + "version_specifier": ">=5.29.5" }, "pyyaml": { "version_specifier": ">=6.0,<6.1" @@ -18,7 +18,7 @@ "version_specifier": ">=20.4.1,<21" }, "sagemaker-training": { - "version_specifier": ">=4.7.4,<5" + "version_specifier": ">=5" }, "sagemaker-studio-analytics-extension": { "version_specifier": "<1" diff --git a/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.sagemaker.gpu.py_scan_allowlist.json b/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.sagemaker.gpu.py_scan_allowlist.json new file mode 100644 index 000000000000..8ba0ee7d69f4 --- /dev/null +++ b/tensorflow/training/docker/2.18/py3/cu125/Dockerfile.sagemaker.gpu.py_scan_allowlist.json @@ -0,0 +1,3 @@ +{ + "77740": "[Package: protobuf] Affected versions of this package are vulnerable to a potential Denial of Service (DoS) attack due to unbounded recursion when parsing untrusted Protocol Buffers data. The pure-Python implementation fails to enforce recursion depth limits when processing recursive groups, recursive messages, or a series of SGROUP tags, leading to stack overflow conditions that can crash the application by exceeding Python's recursion limit." +} \ No newline at end of file