From 15b1c7e3cf19050540798ec77726a3e5ce0a890c Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Mon, 22 May 2023 12:21:42 -0700 Subject: [PATCH 1/3] Set NVIDIA_VISIBLE_DEVICES=void for toolkit-container Signed-off-by: Christopher Desiniotis --- assets/state-container-toolkit/0500_daemonset.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/assets/state-container-toolkit/0500_daemonset.yaml b/assets/state-container-toolkit/0500_daemonset.yaml index 8a3b197bf..85f68869d 100644 --- a/assets/state-container-toolkit/0500_daemonset.yaml +++ b/assets/state-container-toolkit/0500_daemonset.yaml @@ -65,6 +65,8 @@ spec: value: "" - name: NVIDIA_CONTAINER_RUNTIME_MODES_CDI_DEFAULT_KIND value: "management.nvidia.com/gpu" + - name: NVIDIA_VISIBLE_DEVICES + value: "void" imagePullPolicy: IfNotPresent name: nvidia-container-toolkit-ctr securityContext: From 80385153427da0f7248093fc0f8e36411d81fcd8 Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Mon, 22 May 2023 21:24:26 +0000 Subject: [PATCH 2/3] Bump version to v23.3.2 and cuda base image version to 12.1.1 --- .../gpu-operator-certified.clusterserviceversion.yaml | 8 ++++---- deployments/gpu-operator/values.yaml | 4 ++-- versions.mk | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml b/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml index 87e9f8650..43da77274 100644 --- a/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml +++ b/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml @@ -175,13 +175,13 @@ spec: - name: mig-manager-image image: nvcr.io/nvidia/cloud-native/k8s-mig-manager@sha256:8f03f74076b5aa6672a5d75a398fa7515fcde57fdb1cf8079e015b3366efd3b5 - name: init-container-image - image: nvcr.io/nvidia/cuda@sha256:00291fe0accce8b2cb0c69d65743afb924d5b205a4e80296210b9924687c2c7a + image: nvcr.io/nvidia/cuda@sha256:62d845b4bd6de4be65f0b6daaae6831c29d4529955f99db5d26f5f7179106285 - name: gpu-operator-validator-image image: registry.gitlab.com/nvidia/kubernetes/gpu-operator/staging/gpu-operator-validator:release-23.03-latest-ubi8 - name: k8s-driver-manager-image image: nvcr.io/nvidia/cloud-native/k8s-driver-manager@sha256:cab21c93987a5c884075efe0fb4a8abaa1997e1696cbc773ba69889f42f8329b - name: vfio-manager-image - image: nvcr.io/nvidia/cuda@sha256:00291fe0accce8b2cb0c69d65743afb924d5b205a4e80296210b9924687c2c7a + image: nvcr.io/nvidia/cuda@sha256:62d845b4bd6de4be65f0b6daaae6831c29d4529955f99db5d26f5f7179106285 - name: sandbox-device-plugin-image image: nvcr.io/nvidia/kubevirt-gpu-device-plugin@sha256:0d47dad29d2ef445b301c5c64717758eed43a606345b79f97bce2e64b40a91a8 - name: vgpu-device-manager-image @@ -821,9 +821,9 @@ spec: - name: "MIG_MANAGER_IMAGE" value: "nvcr.io/nvidia/cloud-native/k8s-mig-manager@sha256:8f03f74076b5aa6672a5d75a398fa7515fcde57fdb1cf8079e015b3366efd3b5" - name: "CUDA_BASE_IMAGE" - value: "nvcr.io/nvidia/cuda@sha256:00291fe0accce8b2cb0c69d65743afb924d5b205a4e80296210b9924687c2c7a" + value: "nvcr.io/nvidia/cuda@sha256:62d845b4bd6de4be65f0b6daaae6831c29d4529955f99db5d26f5f7179106285" - name: "VFIO_MANAGER_IMAGE" - value: "nvcr.io/nvidia/cuda@sha256:00291fe0accce8b2cb0c69d65743afb924d5b205a4e80296210b9924687c2c7a" + value: "nvcr.io/nvidia/cuda@sha256:62d845b4bd6de4be65f0b6daaae6831c29d4529955f99db5d26f5f7179106285" - name: "SANDBOX_DEVICE_PLUGIN_IMAGE" value: "nvcr.io/nvidia/kubevirt-gpu-device-plugin@sha256:0d47dad29d2ef445b301c5c64717758eed43a606345b79f97bce2e64b40a91a8" - name: "VGPU_DEVICE_MANAGER_IMAGE" diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml index 4e02f6369..3b9eb2873 100644 --- a/deployments/gpu-operator/values.yaml +++ b/deployments/gpu-operator/values.yaml @@ -70,7 +70,7 @@ operator: initContainer: image: cuda repository: nvcr.io/nvidia - version: 12.1.0-base-ubi8 + version: 12.1.1-base-ubi8 imagePullPolicy: IfNotPresent tolerations: - key: "node-role.kubernetes.io/master" @@ -383,7 +383,7 @@ vfioManager: enabled: true repository: nvcr.io/nvidia image: cuda - version: 12.1.0-base-ubi8 + version: 12.1.1-base-ubi8 imagePullPolicy: IfNotPresent imagePullSecrets: [] env: [] diff --git a/versions.mk b/versions.mk index d0288014a..9e808cabe 100644 --- a/versions.mk +++ b/versions.mk @@ -17,9 +17,9 @@ # To re-generate a bundle for another specific version without changing the standard setup, you can: # - use the VERSION as arg of the bundle target (e.g make bundle VERSION=0.0.2) # - use environment variables to overwrite this value (e.g export VERSION=0.0.2) -VERSION ?= v23.3.0 +VERSION ?= v23.3.2 -CUDA_VERSION ?= 12.1.0 +CUDA_VERSION ?= 12.1.1 GOLANG_VERSION ?= 1.20.1 GIT_COMMIT ?= $(shell git describe --match="" --dirty --long --always 2> /dev/null || echo "") From 6a0f59d497eafa53c86cbf6bb8555911e9d00bc3 Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Mon, 22 May 2023 14:44:11 -0700 Subject: [PATCH 3/3] Bump cuda base image version to 12.1.1 for validator container Signed-off-by: Christopher Desiniotis --- validator/versions.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validator/versions.mk b/validator/versions.mk index b16a2e373..979432d2b 100644 --- a/validator/versions.mk +++ b/validator/versions.mk @@ -19,6 +19,6 @@ # - use environment variables to overwrite this value (e.g export VERSION=0.0.2) VERSION ?= v0.1.0 -CUDA_VERSION ?= 12.1.0 +CUDA_VERSION ?= 12.1.1 CUDA_SAMPLES_VERSION ?= 11.7.1 GOLANG_VERSION ?= 1.20.1