Skip to content

Commit 1bd4960

Browse files
committed
Change nvidia device plugin to tegra strategy
1 parent 8b78387 commit 1bd4960

File tree

4 files changed

+70 -5 lines changed

helm/nvidia-device-plugin/templates/daemonset.yaml

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -37,6 +37,9 @@ spec:
3737
value: "true"
3838
- name: FAIL_ON_INIT_ERROR
3939
value: "false"
40+
{{- if .Values.env }}
41+
{{- toYaml .Values.env | nindent 8 }}
42+
{{- else }}
4043
- name: DEVICE_LIST_STRATEGY
4144
value: envvar
4245
- name: DEVICE_ID_STRATEGY
@@ -45,6 +48,7 @@ spec:
4548
value: all
4649
- name: NVIDIA_DRIVER_CAPABILITIES
4750
value: "compute,utility"
51+
{{- end }}
4852
volumes:
4953
{{- toYaml .Values.volumes | nindent 8 }}
5054
hostNetwork: true

helm/nvidia-device-plugin/values.yaml

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,26 @@ resources:
2020
securityContext:
2121
privileged: true
2222

23+
# Tolerations for GPU nodes
24+
tolerations:
25+
- key: gpu
26+
operator: Equal
27+
value: "true"
28+
effect: NoExecute
29+
30+
# Environment variables for nvidia-device-plugin
31+
env:
32+
- name: DEVICE_DISCOVERY_STRATEGY
33+
value: "tegra" # Use tegra discovery strategy
34+
- name: DEVICE_LIST_STRATEGY
35+
value: "envvar" # Use envvar for device list strategy
36+
- name: DEVICE_ID_STRATEGY
37+
value: "uuid"
38+
- name: NVIDIA_VISIBLE_DEVICES
39+
value: "all"
40+
- name: NVIDIA_DRIVER_CAPABILITIES
41+
value: "compute,utility"
42+
2343
# Mount the NVIDIA libraries and device files
2444
volumeMounts:
2545
- name: device-plugin

helm/vllm/templates/deployment.yaml

Lines changed: 14 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,9 @@ spec:
1515
labels:
1616
{{- include "vllm.selectorLabels" . | nindent 8 }}
1717
spec:
18+
{{- if .Values.runtimeClassName }}
19+
runtimeClassName: {{ .Values.runtimeClassName }}
20+
{{- end }}
1821
{{- with .Values.nodeSelector }}
1922
nodeSelector:
2023
{{- toYaml . | nindent 8 }}
@@ -46,6 +49,9 @@ spec:
4649
- "{{ .Values.service.targetPort }}"
4750
- "--tensor-parallel-size"
4851
- "{{ .Values.model.tensorParallelSize }}"
52+
- "--device"
53+
- "cuda"
54+
- "--trust-remote-code"
4955
{{- if .Values.persistence.enabled }}
5056
- "--download-dir"
5157
- "{{ .Values.model.downloadDir }}"
@@ -54,11 +60,14 @@ spec:
5460
{{- toYaml .Values.env | nindent 8 }}
5561
resources:
5662
{{- toYaml .Values.resources | nindent 10 }}
57-
{{- if .Values.persistence.enabled }}
5863
volumeMounts:
64+
{{- if .Values.persistence.enabled }}
5965
- name: model-storage
6066
mountPath: {{ .Values.model.downloadDir }}
6167
{{- end }}
68+
{{- if .Values.volumeMounts }}
69+
{{- toYaml .Values.volumeMounts | nindent 8 }}
70+
{{- end }}
6271
readinessProbe:
6372
httpGet:
6473
path: /health
@@ -75,9 +84,12 @@ spec:
7584
periodSeconds: 30
7685
timeoutSeconds: 10
7786
failureThreshold: 3
78-
{{- if .Values.persistence.enabled }}
7987
volumes:
88+
{{- if .Values.persistence.enabled }}
8089
- name: model-storage
8190
persistentVolumeClaim:
8291
claimName: {{ include "vllm.fullname" . }}-models
92+
{{- end }}
93+
{{- if .Values.volumes }}
94+
{{- toYaml .Values.volumes | nindent 6 }}
8395
{{- end }}

helm/vllm/values.yaml

Lines changed: 32 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
image:
22
repository: vllm/vllm-openai
3-
tag: latest
3+
tag: v0.6.3
44
pullPolicy: Always
55

66
replicaCount: 1
@@ -57,11 +57,40 @@ persistence:
5757
# Environment variables
5858
env:
5959
- name: NVIDIA_VISIBLE_DEVICES
60-
value: "all"
60+
value: "0"
61+
- name: CUDA_VISIBLE_DEVICES
62+
value: "0"
6163
- name: NVIDIA_DRIVER_CAPABILITIES
6264
value: "compute,utility"
65+
- name: VLLM_LOGGING_LEVEL
66+
value: "DEBUG"
67+
- name: CUDA_HOME
68+
value: "/usr/local/cuda"
69+
- name: LD_LIBRARY_PATH
70+
value: "/usr/lib/x86_64-linux-gnu:/usr/local/cuda/lib64"
6371

6472
# Security context
6573
securityContext:
6674
runAsNonRoot: false
67-
runAsUser: 0
75+
runAsUser: 0
76+
77+
# Runtime class for NVIDIA GPU support (commented out for k3s compatibility)
78+
# runtimeClassName: nvidia
79+
80+
# Volume mounts for NVIDIA libraries
81+
volumeMounts:
82+
- name: nvidia-libs
83+
mountPath: /usr/lib/x86_64-linux-gnu
84+
readOnly: true
85+
- name: nvidia-driver
86+
mountPath: /usr/local/nvidia
87+
readOnly: true
88+
89+
# Volumes for NVIDIA libraries
90+
volumes:
91+
- name: nvidia-libs
92+
hostPath:
93+
path: /usr/lib/x86_64-linux-gnu
94+
- name: nvidia-driver
95+
hostPath:
96+
path: /usr/local/nvidia

0 commit comments

Comments
 (0)