Commit e93686e
feat(dcgm-exporter): add perPodGPUUtil ClusterPolicy field for time-slicing
Wires the dcgm-exporter per-pod GPU utilization feature (NVIDIA/dcgm-exporter#<PR>) into the ClusterPolicy CRD so GPU Operator users can enable it with a single field instead of manually patching DaemonSet args.

## What changes

ClusterPolicy gets a new `spec.dcgmExporter.perPodGPUUtil` stanza:

```yaml
spec:
  dcgmExporter:
    perPodGPUUtil:
      enabled: true
      podResourcesSocketPath: /var/lib/kubelet/pod-resources/kubelet.sock
```

When enabled, the operator automatically:

- Sets the `DCGM_EXPORTER_ENABLE_PER_POD_GPU_UTIL=true` env var
- Mounts `/var/lib/kubelet/pod-resources/` as a read-only hostPath volume
- Sets `hostPID: true` (required to resolve `/proc/<pid>/cgroup`)

## Why

Time-slicing is configured via ClusterPolicy (`spec.devicePlugin.config`), but the resulting loss of per-pod GPU observability had no equivalent ClusterPolicy lever to restore it. This closes that gap.

See: NVIDIA/dcgm-exporter#587

## Files changed

- `api/nvidia/v1/clusterpolicy_types.go`: `DCGMExporterPerPodGPUUtilConfig` struct, `PerPodGPUUtil` field on `DCGMExporterSpec`, helper methods, constant
- `api/nvidia/v1/zz_generated.deepcopy.go`: deep copy for new struct
- `controllers/object_controls.go`: wire `perPodGPUUtil` into the DaemonSet spec
- `docs/dcgm-exporter-per-pod-gpu-metrics.md`: usage + cost model
1 parent 09219be

4 files changed: +252 −0

api/nvidia/v1/clusterpolicy_types.go (+52 −0)
```diff
@@ -36,6 +36,8 @@ const (
 	ClusterPolicyCRDName = "ClusterPolicy"
 	// DefaultDCGMJobMappingDir is the default directory for DCGM Exporter HPC job mapping files
 	DefaultDCGMJobMappingDir = "/var/lib/dcgm-exporter/job-mapping"
+	// DefaultDCGMPodResourcesSocket is the default kubelet pod-resources socket path
+	DefaultDCGMPodResourcesSocket = "/var/lib/kubelet/pod-resources/kubelet.sock"
 )

 // ClusterPolicySpec defines the desired state of ClusterPolicy
@@ -969,6 +971,38 @@ type DCGMExporterSpec struct {
 	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="HPC Job Mapping Configuration"
 	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced"
 	HPCJobMapping *DCGMExporterHPCJobMappingConfig `json:"hpcJobMapping,omitempty"`
+
+	// Optional: Per-pod GPU utilization metrics for CUDA time-slicing workloads.
+	// When enabled, dcgm-exporter emits dcgm_fi_dev_sm_util_per_pod gauges that
+	// attribute SM utilization to individual pods sharing a GPU via time-slicing.
+	// Requires dcgm-exporter v3.4.0+ built with --enable-per-pod-gpu-util support.
+	// See: https://github.com/NVIDIA/dcgm-exporter/issues/587
+	// +kubebuilder:validation:Optional
+	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
+	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Per-Pod GPU Utilization Metrics"
+	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced"
+	PerPodGPUUtil *DCGMExporterPerPodGPUUtilConfig `json:"perPodGPUUtil,omitempty"`
+}
+
+// DCGMExporterPerPodGPUUtilConfig configures per-pod GPU SM utilization metrics.
+// This feature is useful when CUDA time-slicing is active and multiple pods share
+// one physical GPU; standard per-device metrics lose per-workload attribution.
+type DCGMExporterPerPodGPUUtilConfig struct {
+	// Enable per-pod GPU utilization collection via NVML process utilization API.
+	// Requires hostPID: true (automatically set when enabled).
+	// +kubebuilder:validation:Optional
+	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
+	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable Per-Pod GPU Utilization"
+	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
+	Enabled *bool `json:"enabled,omitempty"`
+
+	// PodResourcesSocketPath is the path to the kubelet pod-resources gRPC socket.
+	// Defaults to /var/lib/kubelet/pod-resources/kubelet.sock.
+	// +kubebuilder:validation:Optional
+	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
+	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Pod Resources Socket Path"
+	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:text"
+	PodResourcesSocketPath string `json:"podResourcesSocketPath,omitempty"`
 }

 // DCGMExporterHPCJobMappingConfig defines HPC job mapping configuration for NVIDIA DCGM Exporter
@@ -2101,6 +2135,24 @@ func (e *DCGMExporterSpec) GetHPCJobMappingDirectory() string {
 	return e.HPCJobMapping.Directory
 }

+// IsPerPodGPUUtilEnabled returns true if per-pod GPU utilization metrics are enabled.
+// This feature attributes SM utilization to individual pods when CUDA time-slicing is active.
+func (e *DCGMExporterSpec) IsPerPodGPUUtilEnabled() bool {
+	if e.PerPodGPUUtil == nil || e.PerPodGPUUtil.Enabled == nil {
+		return false
+	}
+	return *e.PerPodGPUUtil.Enabled
+}
+
+// GetPerPodGPUUtilSocketPath returns the kubelet pod-resources socket path for per-pod GPU util.
+// Falls back to DefaultDCGMPodResourcesSocket if not explicitly configured.
+func (e *DCGMExporterSpec) GetPerPodGPUUtilSocketPath() string {
+	if e.PerPodGPUUtil == nil || e.PerPodGPUUtil.PodResourcesSocketPath == "" {
+		return DefaultDCGMPodResourcesSocket
+	}
+	return e.PerPodGPUUtil.PodResourcesSocketPath
+}
+
 // IsEnabled returns true if gpu-feature-discovery is enabled(default) through gpu-operator
 func (g *GPUFeatureDiscoverySpec) IsEnabled() bool {
 	if g.Enabled == nil {
```
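The nil-safe defaulting of the two accessors can be exercised standalone. The sketch below copies their bodies into a minimal program; the struct definitions are trimmed to just the fields these helpers touch.

```go
package main

import "fmt"

// DefaultDCGMPodResourcesSocket mirrors the constant added in clusterpolicy_types.go.
const DefaultDCGMPodResourcesSocket = "/var/lib/kubelet/pod-resources/kubelet.sock"

// Trimmed copies of the CRD types, keeping only the fields the helpers use.
type DCGMExporterPerPodGPUUtilConfig struct {
	Enabled                *bool
	PodResourcesSocketPath string
}

type DCGMExporterSpec struct {
	PerPodGPUUtil *DCGMExporterPerPodGPUUtilConfig
}

// IsPerPodGPUUtilEnabled is nil-safe: an absent stanza or absent Enabled means off.
func (e *DCGMExporterSpec) IsPerPodGPUUtilEnabled() bool {
	if e.PerPodGPUUtil == nil || e.PerPodGPUUtil.Enabled == nil {
		return false
	}
	return *e.PerPodGPUUtil.Enabled
}

// GetPerPodGPUUtilSocketPath falls back to the default when unset.
func (e *DCGMExporterSpec) GetPerPodGPUUtilSocketPath() string {
	if e.PerPodGPUUtil == nil || e.PerPodGPUUtil.PodResourcesSocketPath == "" {
		return DefaultDCGMPodResourcesSocket
	}
	return e.PerPodGPUUtil.PodResourcesSocketPath
}

func main() {
	on := true
	empty := &DCGMExporterSpec{}
	custom := &DCGMExporterSpec{PerPodGPUUtil: &DCGMExporterPerPodGPUUtilConfig{
		Enabled:                &on,
		PodResourcesSocketPath: "/custom/pod-resources/kubelet.sock",
	}}

	fmt.Println(empty.IsPerPodGPUUtilEnabled())  // false: stanza absent
	fmt.Println(empty.GetPerPodGPUUtilSocketPath())
	fmt.Println(custom.IsPerPodGPUUtilEnabled()) // true
	fmt.Println(custom.GetPerPodGPUUtilSocketPath())
}
```

An empty spec yields `false` and the default socket path, so omitting the stanza keeps today's DaemonSet unchanged.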

api/nvidia/v1/zz_generated.deepcopy.go (+25 −0)

Generated deep-copy code for the new struct; diff not rendered.

controllers/object_controls.go (+33 −0)
```diff
@@ -1785,6 +1785,39 @@ func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe
 		obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, jobMappingVol)
 	}

+	// configure per-pod GPU utilization metrics when enabled (for CUDA time-slicing workloads)
+	// See: https://github.com/NVIDIA/dcgm-exporter/issues/587
+	if config.DCGMExporter.IsPerPodGPUUtilEnabled() {
+		// enable the feature flag in dcgm-exporter
+		setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), "DCGM_EXPORTER_ENABLE_PER_POD_GPU_UTIL", "true")
+
+		// resolve pod→GPU mapping via kubelet pod-resources gRPC API
+		socketPath := config.DCGMExporter.GetPerPodGPUUtilSocketPath()
+		socketDir := socketPath[:strings.LastIndex(socketPath, "/")]
+
+		podResourcesVolMount := corev1.VolumeMount{
+			Name:      "pod-resources",
+			ReadOnly:  true,
+			MountPath: socketDir,
+		}
+		obj.Spec.Template.Spec.Containers[0].VolumeMounts = append(
+			obj.Spec.Template.Spec.Containers[0].VolumeMounts, podResourcesVolMount)
+
+		podResourcesVol := corev1.Volume{
+			Name: "pod-resources",
+			VolumeSource: corev1.VolumeSource{
+				HostPath: &corev1.HostPathVolumeSource{
+					Path: socketDir,
+					Type: ptr.To(corev1.HostPathDirectory),
+				},
+			},
+		}
+		obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, podResourcesVol)
+
+		// per-pod attribution requires resolving PIDs via /proc/<pid>/cgroup
+		obj.Spec.Template.Spec.HostPID = true
+	}
+
 	// mount configmap for custom metrics if provided by user
 	if config.DCGMExporter.MetricsConfig != nil && config.DCGMExporter.MetricsConfig.Name != "" {
 		metricsConfigVolMount := corev1.VolumeMount{Name: "metrics-config", ReadOnly: true, MountPath: MetricsConfigMountPath, SubPath: MetricsConfigFileName}
```
docs/dcgm-exporter-per-pod-gpu-metrics.md (+142 −0)
# Per-Pod GPU Utilization with DCGM Exporter (Time-Slicing)

## Overview

When GPU time-slicing is enabled via `ClusterPolicy`, multiple pods share a
single physical GPU device. Standard DCGM metrics report aggregate utilization
for the whole device: `dcgm_fi_dev_gpu_util` cannot tell you how much of the
GPU the proxy, embeddings, and inference pods are each using.

GPU Operator v24.x+ integrates with dcgm-exporter's per-pod GPU utilization
feature to restore workload-level attribution without requiring MIG.

## Prerequisite: dcgm-exporter v3.4.0+

This feature requires dcgm-exporter v3.4.0 or later, which adds the
`--enable-per-pod-gpu-util` flag and the `dcgm_fi_dev_sm_util_per_pod` metric.

See: [NVIDIA/dcgm-exporter#587](https://github.com/NVIDIA/dcgm-exporter/issues/587)
## Enabling Time-Slicing + Per-Pod Metrics

A complete `ClusterPolicy` for a T4 cluster running three shared workloads:

```yaml
apiVersion: nvidia.com/v1
kind: ClusterPolicy
metadata:
  name: gpu-cluster-policy
spec:
  # 1. Configure time-slicing: 3 virtual slices per physical GPU
  devicePlugin:
    config:
      name: time-slicing-config
      default: any

  # 2. Enable per-pod GPU utilization metrics in dcgm-exporter
  dcgmExporter:
    perPodGPUUtil:
      enabled: true
      # Optional: custom path (default: /var/lib/kubelet/pod-resources/kubelet.sock)
      # podResourcesSocketPath: /var/lib/kubelet/pod-resources/kubelet.sock
```
The time-slicing ConfigMap referenced above must be deployed separately:

```yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: time-slicing-config
  namespace: gpu-operator
data:
  any: |-
    version: v1
    flags:
      migStrategy: none
    sharing:
      timeSlicing:
        renameByDefault: false
        resources:
          - name: nvidia.com/gpu
            replicas: 3
```
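With `replicas: 3`, the device plugin advertises each physical GPU as three schedulable `nvidia.com/gpu` resources. A quick sketch of the resulting node capacity (the function and node counts are illustrative, not operator code):

```go
package main

import "fmt"

// advertisedGPUs returns how many nvidia.com/gpu resources a node exposes
// under time-slicing: each physical GPU is advertised `replicas` times.
func advertisedGPUs(physicalGPUs, replicas int) int {
	return physicalGPUs * replicas
}

func main() {
	// A g4dn.xlarge has one physical T4; with replicas: 3 the node can
	// schedule three pods that each request one nvidia.com/gpu.
	fmt.Println(advertisedGPUs(1, 3)) // 3
}
```

The slices are time-multiplexed, not partitioned: each pod still sees the whole GPU, which is exactly why per-pod attribution needs the metric below.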
## What GPU Operator does automatically

When `dcgmExporter.perPodGPUUtil.enabled: true` is set, GPU Operator:

1. Sets `DCGM_EXPORTER_ENABLE_PER_POD_GPU_UTIL=true` in the dcgm-exporter
   DaemonSet environment.
2. Mounts `/var/lib/kubelet/pod-resources/` as a read-only `hostPath` volume
   so dcgm-exporter can reach the kubelet pod-resources gRPC socket.
3. Sets `hostPID: true` on the DaemonSet so dcgm-exporter can read
   `/proc/<pid>/cgroup` to resolve NVML PIDs to containers.
## Emitted metric

```
# HELP dcgm_fi_dev_sm_util_per_pod SM utilization attributed to a pod (time-slicing)
# TYPE dcgm_fi_dev_sm_util_per_pod gauge
dcgm_fi_dev_sm_util_per_pod{
  gpu="0",
  uuid="GPU-abc123",
  pod="synapse-proxy-7f9d4b-xkz2p",
  namespace="synapse-staging",
  container="proxy"
} 42
dcgm_fi_dev_sm_util_per_pod{...,pod="synapse-jina-...",container="jina"} 18
dcgm_fi_dev_sm_util_per_pod{...,pod="synapse-vllm-...",container="vllm"} 35
```

(The first sample's labels are wrapped for readability; the exposition format
emits each series on a single line.)
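To illustrate how these series carry attribution, here is a small standalone parser that sums per-pod SM utilization per GPU from sample exposition lines. The regex and sample values are illustrative, not part of dcgm-exporter; a real consumer would use a Prometheus client library.

```go
package main

import (
	"fmt"
	"regexp"
	"strconv"
)

// seriesRe extracts the gpu label, pod label, and value from a single-line
// per-pod utilization sample.
var seriesRe = regexp.MustCompile(`gpu="([^"]+)".*pod="([^"]+)"[^}]*\} (\d+)`)

// sumPerGPU totals the per-pod gauges for each GPU; under time-slicing the
// per-pod values should roughly add up to the device-level utilization.
func sumPerGPU(lines []string) map[string]int {
	totals := map[string]int{}
	for _, l := range lines {
		m := seriesRe.FindStringSubmatch(l)
		if m == nil {
			continue
		}
		v, _ := strconv.Atoi(m[3])
		totals[m[1]] += v
	}
	return totals
}

func main() {
	lines := []string{
		`dcgm_fi_dev_sm_util_per_pod{gpu="0",pod="proxy-xkz2p"} 42`,
		`dcgm_fi_dev_sm_util_per_pod{gpu="0",pod="jina-abc12"} 18`,
		`dcgm_fi_dev_sm_util_per_pod{gpu="0",pod="vllm-def34"} 35`,
	}
	fmt.Println(sumPerGPU(lines)["0"]) // 95
}
```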
## Example Prometheus alert

```yaml
groups:
- name: per-pod-gpu
  rules:
  - alert: PodGPUHighUtilization
    expr: dcgm_fi_dev_sm_util_per_pod > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "{{ $labels.namespace }}/{{ $labels.pod }} using >80% GPU SM"
```
## Cost model (example: g4dn.xlarge T4)

| Setup | Nodes | Cost/day |
|-------|-------|----------|
| 3 workloads, no time-slicing | 3 × g4dn.xlarge | ~$38/day |
| 3 workloads, time-slicing (3 replicas) | 1 × g4dn.xlarge | ~$13/day |
| **Savings** | | **~$25/day (~$9,000/year)** |

Time-slicing is appropriate for inference and embedding workloads that do not
fully saturate the GPU. For compute-bound training workloads, MIG or dedicated
GPUs remain the right choice.
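The table's figures are consistent with on-demand pricing of roughly $0.526/hour for g4dn.xlarge. That rate is an assumption for illustration; actual pricing varies by region and purchase option.

```go
package main

import "fmt"

func main() {
	// Assumed g4dn.xlarge on-demand rate in $/hour (illustrative; varies by
	// region and purchase option).
	const hourly = 0.526
	perNodeDay := hourly * 24 // ~$12.6/node/day

	before := 3 * perNodeDay // one node per workload
	after := 1 * perNodeDay  // three workloads time-sliced onto one node
	savingsDay := before - after

	fmt.Printf("before: ~$%.0f/day, after: ~$%.0f/day\n", before, after)
	fmt.Printf("savings: ~$%.0f/day (~$%.0f/year)\n", savingsDay, savingsDay*365)
}
```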
## Security considerations

Enabling `perPodGPUUtil` grants dcgm-exporter:

- Read access to `/var/lib/kubelet/pod-resources/` (lists all GPU-using pods)
- Host PID namespace access (to read `/proc/<pid>/cgroup`)

These are the same permissions used by other node-level monitoring agents
(e.g., node-exporter, cAdvisor). Review your security policy before enabling
this feature in sensitive environments.
## Compatibility

| GPU Operator | dcgm-exporter | Feature available |
|--------------|---------------|-------------------|
| < v24.x | any | No |
| ≥ v24.x | < v3.4.0 | Field accepted but no-op |
| ≥ v24.x | ≥ v3.4.0 | Yes |
## Related

- dcgm-exporter feature: [docs/per-pod-gpu-metrics.md](https://github.com/NVIDIA/dcgm-exporter/blob/main/docs/per-pod-gpu-metrics.md)
- Time-slicing setup: [GPU Sharing with Time-Slicing](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-sharing.html)
- Issue: [NVIDIA/dcgm-exporter#587](https://github.com/NVIDIA/dcgm-exporter/issues/587)
