From 07ab2819c3217e6d101859d9a58d4e03737daba8 Mon Sep 17 00:00:00 2001
From: Joey Chen <142381267+JoeyC-Dev@users.noreply.github.com>
Date: Tue, 14 Jan 2025 16:57:11 +0800
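Subject: [PATCH 1/2] Enhance nvidia-device-plugin-daemonset manifest

Move the DaemonSet from the kube-system namespace to gpu-operator, add a
toleration for the kubernetes.azure.com/scalesetpriority=spot taint,
add a nodeSelector for kubernetes.azure.com/accelerator=nvidia, and
bump the k8s-device-plugin image from v0.15.0 to v0.17.0.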

---
 articles/aks/gpu-cluster.md | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/articles/aks/gpu-cluster.md b/articles/aks/gpu-cluster.md
index 7a1346dc5..abf193f9b 100644
--- a/articles/aks/gpu-cluster.md
+++ b/articles/aks/gpu-cluster.md
@@ -138,7 +138,7 @@ To use Azure Linux, you specify the OS SKU by setting `os-sku` to `AzureLinux` d
     kind: DaemonSet
     metadata:
       name: nvidia-device-plugin-daemonset
-      namespace: kube-system
+      namespace: gpu-operator
     spec:
       selector:
         matchLabels:
@@ -155,13 +155,19 @@ To use Azure Linux, you specify the OS SKU by setting `os-sku` to `AzureLinux` d
             operator: "Equal"
             value: "gpu"
             effect: "NoSchedule"
+          - key: "kubernetes.azure.com/scalesetpriority"
+            operator: "Equal"
+            value: "spot"
+            effect: "NoSchedule"
+          nodeSelector:
+            kubernetes.azure.com/accelerator: nvidia
           # Mark this pod as a critical add-on; when enabled, the critical add-on
           # scheduler reserves resources for critical add-on pods so that they can
           # be rescheduled after a failure.
           # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
           priorityClassName: "system-node-critical"
           containers:
-          - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
+          - image: nvcr.io/nvidia/k8s-device-plugin:v0.17.0
             name: nvidia-device-plugin-ctr
             env:
               - name: FAIL_ON_INIT_ERROR

From 8047ccb6c1c40e4ae5acbdb73ca7b0e3e942f5ba Mon Sep 17 00:00:00 2001
From: Joey Chen <142381267+JoeyC-Dev@users.noreply.github.com>
Date: Sat, 22 Mar 2025 07:45:57 +0800
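Subject: [PATCH 2/2] Apply suggestions from code review

Rename the namespace from gpu-operator to gpu-resources in both the
`kubectl create namespace` step and the DaemonSet manifest, and bump
the k8s-device-plugin image from v0.17.0 to v0.17.1.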

---
 articles/aks/gpu-cluster.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/articles/aks/gpu-cluster.md b/articles/aks/gpu-cluster.md
index abf193f9b..6128672c8 100644
--- a/articles/aks/gpu-cluster.md
+++ b/articles/aks/gpu-cluster.md
@@ -128,7 +128,7 @@ To use Azure Linux, you specify the OS SKU by setting `os-sku` to `AzureLinux` d
 1. Create a namespace using the [`kubectl create namespace`][kubectl-create] command.
 
     ```bash
-    kubectl create namespace gpu-operator
+    kubectl create namespace gpu-resources
     ```
 
 2. Create a file named *nvidia-device-plugin-ds.yaml* and paste the following YAML manifest provided as part of the [NVIDIA device plugin for Kubernetes project][nvidia-github]:
@@ -138,7 +138,7 @@ To use Azure Linux, you specify the OS SKU by setting `os-sku` to `AzureLinux` d
     kind: DaemonSet
     metadata:
       name: nvidia-device-plugin-daemonset
-      namespace: gpu-operator
+      namespace: gpu-resources
     spec:
       selector:
         matchLabels:
@@ -167,7 +167,7 @@ To use Azure Linux, you specify the OS SKU by setting `os-sku` to `AzureLinux` d
           # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
           priorityClassName: "system-node-critical"
           containers:
-          - image: nvcr.io/nvidia/k8s-device-plugin:v0.17.0
+          - image: nvcr.io/nvidia/k8s-device-plugin:v0.17.1
             name: nvidia-device-plugin-ctr
             env:
               - name: FAIL_ON_INIT_ERROR