From 07ab2819c3217e6d101859d9a58d4e03737daba8 Mon Sep 17 00:00:00 2001 From: Joey Chen <142381267+JoeyC-Dev@users.noreply.github.com> Date: Tue, 14 Jan 2025 16:57:11 +0800 Subject: [PATCH 1/2] Enhancement of nvidia-device-plugin-daemonset --- articles/aks/gpu-cluster.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/articles/aks/gpu-cluster.md b/articles/aks/gpu-cluster.md index 7a1346dc5..abf193f9b 100644 --- a/articles/aks/gpu-cluster.md +++ b/articles/aks/gpu-cluster.md @@ -138,7 +138,7 @@ To use Azure Linux, you specify the OS SKU by setting `os-sku` to `AzureLinux` d kind: DaemonSet metadata: name: nvidia-device-plugin-daemonset - namespace: kube-system + namespace: gpu-operator spec: selector: matchLabels: @@ -155,13 +155,19 @@ To use Azure Linux, you specify the OS SKU by setting `os-sku` to `AzureLinux` d operator: "Equal" value: "gpu" effect: "NoSchedule" + - key: "kubernetes.azure.com/scalesetpriority" + operator: "Equal" + value: "spot" + effect: "NoSchedule" + nodeSelector: + kubernetes.azure.com/accelerator: nvidia # Mark this pod as a critical add-on; when enabled, the critical add-on # scheduler reserves resources for critical add-on pods so that they can # be rescheduled after a failure. # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ priorityClassName: "system-node-critical" containers: - - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0 + - image: nvcr.io/nvidia/k8s-device-plugin:v0.17.0 name: nvidia-device-plugin-ctr env: - name: FAIL_ON_INIT_ERROR From 8047ccb6c1c40e4ae5acbdb73ca7b0e3e942f5ba Mon Sep 17 00:00:00 2001 From: Joey Chen <142381267+JoeyC-Dev@users.noreply.github.com> Date: Sat, 22 Mar 2025 07:45:57 +0800 Subject: [PATCH 2/2] Apply suggestions from code review --- articles/aks/gpu-cluster.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/articles/aks/gpu-cluster.md b/articles/aks/gpu-cluster.md index abf193f9b..6128672c8 100644 --- a/articles/aks/gpu-cluster.md +++ b/articles/aks/gpu-cluster.md @@ -128,7 +128,7 @@ To use Azure Linux, you specify the OS SKU by setting `os-sku` to `AzureLinux` d 1. Create a namespace using the [`kubectl create namespace`][kubectl-create] command. ```bash - kubectl create namespace gpu-operator + kubectl create namespace gpu-resources ``` 2. Create a file named *nvidia-device-plugin-ds.yaml* and paste the following YAML manifest provided as part of the [NVIDIA device plugin for Kubernetes project][nvidia-github]: @@ -138,7 +138,7 @@ To use Azure Linux, you specify the OS SKU by setting `os-sku` to `AzureLinux` d kind: DaemonSet metadata: name: nvidia-device-plugin-daemonset - namespace: gpu-operator + namespace: gpu-resources spec: selector: matchLabels: @@ -167,7 +167,7 @@ To use Azure Linux, you specify the OS SKU by setting `os-sku` to `AzureLinux` d # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ priorityClassName: "system-node-critical" containers: - - image: nvcr.io/nvidia/k8s-device-plugin:v0.17.0 + - image: nvcr.io/nvidia/k8s-device-plugin:v0.17.1 name: nvidia-device-plugin-ctr env: - name: FAIL_ON_INIT_ERROR