diff --git a/perfkitbenchmarker/data/container/azure/nvidia-device-plugin.yaml b/perfkitbenchmarker/data/container/azure/nvidia-device-plugin.yaml
new file mode 100644
index 000000000..8f871a44b
--- /dev/null
+++ b/perfkitbenchmarker/data/container/azure/nvidia-device-plugin.yaml
@@ -0,0 +1,41 @@
+# Per the official Microsoft AKS documentation, the NVIDIA device plugin must
+# be deployed as a DaemonSet to enable GPU support in the Kubernetes cluster.
+# Reference: https://learn.microsoft.com/en-us/azure/aks/use-nvidia-gpu?tabs=add-ubuntu-gpu-node-pool#nvidia-device-plugin-installation
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: nvidia-device-plugin-daemonset
+  namespace: kube-system
+spec:
+  selector:
+    matchLabels:
+      name: nvidia-device-plugin-ds
+  updateStrategy:
+    type: RollingUpdate
+  template:
+    metadata:
+      labels:
+        name: nvidia-device-plugin-ds
+    spec:
+      tolerations:
+        - key: 'nvidia.com/gpu'
+          operator: Exists
+          effect: NoSchedule
+      priorityClassName: 'system-node-critical'
+      containers:
+        - image: nvcr.io/nvidia/k8s-device-plugin:v0.18.0
+          name: nvidia-device-plugin-ctr
+          env:
+            - name: FAIL_ON_INIT_ERROR
+              value: 'false'
+          securityContext:
+            allowPrivilegeEscalation: false
+            capabilities:
+              drop: ['ALL']
+          volumeMounts:
+            - name: device-plugin
+              mountPath: /var/lib/kubelet/device-plugins
+      volumes:
+        - name: device-plugin
+          hostPath:
+            path: /var/lib/kubelet/device-plugins
diff --git a/perfkitbenchmarker/providers/azure/azure_kubernetes_service.py b/perfkitbenchmarker/providers/azure/azure_kubernetes_service.py
index 45b6945b6..ae436d727 100644
--- a/perfkitbenchmarker/providers/azure/azure_kubernetes_service.py
+++ b/perfkitbenchmarker/providers/azure/azure_kubernetes_service.py
@@ -314,6 +314,15 @@ def _PostCreate(self):
     ]
     vm_util.IssueCommand(set_tags_cmd)
     self._AttachContainerRegistry()
+    # Install the NVIDIA device plugin DaemonSet to enable GPU support, but
+    # only when virtual_machine.GPU_COUNT is set (not None) and positive.
+    if (
+        virtual_machine.GPU_COUNT.value is not None
+        and virtual_machine.GPU_COUNT.value > 0
+    ):
+      self.ApplyManifest(
+          'container/azure/nvidia-device-plugin.yaml',
+      )
 
   def _GetCredentials(self, use_admin: bool) -> None:
     """Helper method to get credentials and check service account readiness.