Skip to content

Commit ce2ef9a

Browse files
Run toolkit validation in operand init containers
The toolkit-validation init containers in operand DaemonSets previously polled for a toolkit-ready sentinel file on the host. During driver reinstall cycles, the driver-manager restores node scheduling labels before the driver container has loaded the nvidia kernel module. As a result, all operand pods are created simultaneously and find a stale toolkit-ready file left over from a previous cycle, so they pass the init gate even though nvidia-smi would actually fail. Replace the shell-based sentinel file check with an nvidia-validator check. The validator runs nvidia-smi through the toolkit runtime wrapper and retries until it succeeds, validating both toolkit injection and driver module readiness without depending on host sentinel files.
1 parent 652724d commit ce2ef9a

File tree

8 files changed

+60
-14
lines changed

8 files changed

+60
-14
lines changed

assets/gpu-feature-discovery/0500_daemonset.yaml

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,13 +29,20 @@ spec:
2929
- name: toolkit-validation
3030
image: "FILLED BY THE OPERATOR"
3131
command: ['sh', '-c']
32-
args: ["until [ -f /run/nvidia/validations/toolkit-ready ]; do echo waiting for nvidia container stack to be setup; sleep 5; done"]
32+
args: ["nvidia-validator"]
33+
env:
34+
- name: NVIDIA_VISIBLE_DEVICES
35+
value: "all"
36+
- name: WITH_WAIT
37+
value: "true"
38+
- name: COMPONENT
39+
value: toolkit
3340
securityContext:
3441
privileged: true
3542
volumeMounts:
3643
- name: run-nvidia
3744
mountPath: /run/nvidia
38-
mountPropagation: HostToContainer
45+
mountPropagation: Bidirectional
3946
- name: config-manager-init
4047
image: "FILLED BY THE OPERATOR"
4148
command: ["config-manager"]

assets/state-container-toolkit/0500_daemonset.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,10 @@ spec:
7777
fieldPath: metadata.namespace
7878
imagePullPolicy: IfNotPresent
7979
name: nvidia-container-toolkit-ctr
80+
lifecycle:
81+
preStop:
82+
exec:
83+
command: ["/bin/sh", "-c", "rm -f /run/nvidia/validations/toolkit-ready"]
8084
securityContext:
8185
privileged: true
8286
seLinuxOptions:

assets/state-dcgm-exporter/0800_daemonset.yaml

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,20 @@ spec:
2828
- name: toolkit-validation
2929
image: "FILLED BY THE OPERATOR"
3030
command: ['sh', '-c']
31-
args: ["until [ -f /run/nvidia/validations/toolkit-ready ]; do echo waiting for nvidia container stack to be setup; sleep 5; done"]
31+
args: ["nvidia-validator"]
32+
env:
33+
- name: NVIDIA_VISIBLE_DEVICES
34+
value: "all"
35+
- name: WITH_WAIT
36+
value: "true"
37+
- name: COMPONENT
38+
value: toolkit
3239
securityContext:
3340
privileged: true
3441
volumeMounts:
3542
- name: run-nvidia
36-
mountPath: "/run/nvidia"
37-
mountPropagation: HostToContainer
43+
mountPath: /run/nvidia
44+
mountPropagation: Bidirectional
3845
containers:
3946
- image: "FILLED BY THE OPERATOR"
4047
name: nvidia-dcgm-exporter

assets/state-dcgm/0400_dcgm.yml

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,20 @@ spec:
2828
- name: toolkit-validation
2929
image: "FILLED BY THE OPERATOR"
3030
command: ['sh', '-c']
31-
args: ["until [ -f /run/nvidia/validations/toolkit-ready ]; do echo waiting for nvidia container stack to be setup; sleep 5; done"]
31+
args: ["nvidia-validator"]
32+
env:
33+
- name: NVIDIA_VISIBLE_DEVICES
34+
value: "all"
35+
- name: WITH_WAIT
36+
value: "true"
37+
- name: COMPONENT
38+
value: toolkit
3239
securityContext:
3340
privileged: true
3441
volumeMounts:
3542
- name: run-nvidia
3643
mountPath: /run/nvidia
37-
mountPropagation: HostToContainer
44+
mountPropagation: Bidirectional
3845
containers:
3946
- image: "FILLED BY THE OPERATOR"
4047
name: nvidia-dcgm-ctr

assets/state-device-plugin/0500_daemonset.yaml

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,20 @@ spec:
2828
- image: "FILLED BY THE OPERATOR"
2929
name: toolkit-validation
3030
command: ['sh', '-c']
31-
args: ["until [ -f /run/nvidia/validations/toolkit-ready ]; do echo waiting for nvidia container stack to be setup; sleep 5; done"]
31+
args: ["nvidia-validator"]
32+
env:
33+
- name: NVIDIA_VISIBLE_DEVICES
34+
value: "all"
35+
- name: WITH_WAIT
36+
value: "true"
37+
- name: COMPONENT
38+
value: toolkit
3239
securityContext:
3340
privileged: true
3441
volumeMounts:
3542
- name: run-nvidia-validations
3643
mountPath: /run/nvidia/validations
37-
mountPropagation: HostToContainer
44+
mountPropagation: Bidirectional
3845
- image: "FILLED BY THE OPERATOR"
3946
name: config-manager-init
4047
command: ["config-manager"]

assets/state-driver/0500_daemonset.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ spec:
150150
lifecycle:
151151
preStop:
152152
exec:
153-
command: ["/bin/sh", "-c", "rm -f /run/nvidia/validations/.driver-ctr-ready"]
153+
command: ["/bin/sh", "-c", "rm -f /run/nvidia/validations/.driver-ctr-ready /run/nvidia/validations/driver-ready"]
154154
- image: "FILLED BY THE OPERATOR"
155155
imagePullPolicy: IfNotPresent
156156
name: nvidia-peermem-ctr

assets/state-mig-manager/0600_daemonset.yaml

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,20 @@ spec:
2828
- name: toolkit-validation
2929
image: "FILLED BY THE OPERATOR"
3030
command: ['sh', '-c']
31-
args: ["until [ -f /run/nvidia/validations/toolkit-ready ]; do echo waiting for nvidia container toolkit to be setup; sleep 5; done"]
31+
args: ["nvidia-validator"]
32+
env:
33+
- name: NVIDIA_VISIBLE_DEVICES
34+
value: "all"
35+
- name: WITH_WAIT
36+
value: "true"
37+
- name: COMPONENT
38+
value: toolkit
3239
securityContext:
3340
privileged: true
3441
volumeMounts:
3542
- name: run-nvidia-validations
3643
mountPath: /run/nvidia/validations
37-
mountPropagation: HostToContainer
44+
mountPropagation: Bidirectional
3845
containers:
3946
- name: nvidia-mig-manager
4047
image: "FILLED BY THE OPERATOR"

assets/state-mps-control-daemon/0400_daemonset.yaml

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,13 +30,20 @@ spec:
3030
- image: "FILLED BY THE OPERATOR"
3131
name: toolkit-validation
3232
command: ['sh', '-c']
33-
args: ["until [ -f /run/nvidia/validations/toolkit-ready ]; do echo waiting for nvidia container stack to be setup; sleep 5; done"]
33+
args: ["nvidia-validator"]
34+
env:
35+
- name: NVIDIA_VISIBLE_DEVICES
36+
value: "all"
37+
- name: WITH_WAIT
38+
value: "true"
39+
- name: COMPONENT
40+
value: toolkit
3441
securityContext:
3542
privileged: true
3643
volumeMounts:
3744
- name: run-nvidia
3845
mountPath: /run/nvidia
39-
mountPropagation: HostToContainer
46+
mountPropagation: Bidirectional
4047
- image: "FILLED BY THE OPERATOR"
4148
name: mps-control-daemon-mounts
4249
command: [mps-control-daemon, mount-shm]

0 commit comments

Comments
 (0)