feat: add support for gpu sharing metrics in k8s
We add support for capturing separate metrics when running dcgm-exporter
in K8s clusters that have GPU sharing enabled, including time-sharing
and MPS. This should now support GPU sharing on MIG devices as well.

We ensure this is supported for both the NVIDIA and GKE device plugins,
respectively at:
* https://github.com/NVIDIA/k8s-device-plugin
* https://github.com/GoogleCloudPlatform/container-engine-accelerators

The change is guarded by a new configuration parameter,
which can be passed in as a flag `--kubernetes-virtual-gpus` or as an
environment variable `KUBERNETES_VIRTUAL_GPUS`. If set, the Kubernetes
PodMapper transform processor uses a different mechanism to build the
device mapping, which creates a copy of the metric for every shared
(i.e. virtual) GPU exposed by the device plugin. To disambiguate the
generated timeseries, it adds a new label "vgpu" set to the detected
shared GPU replica.
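
For example, with the flag enabled, a physical GPU time-shared by two pods
is exported as two timeseries rather than one, distinguished by the new
label (illustrative counter and label values only, other labels omitted):

  DCGM_FI_DEV_GPU_UTIL{gpu="0", UUID="GPU-<uuid>", namespace="default", pod="pod-a", container="main", vgpu="0"} 42
  DCGM_FI_DEV_GPU_UTIL{gpu="0", UUID="GPU-<uuid>", namespace="default", pod="pod-b", container="main", vgpu="1"} 42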

This also fixes an issue where pod attributes are not guaranteed to be
consistently associated with the same metric. If the podresources API
does not consistently return the device-ids in the same order between
calls, the device-to-pod association in the map can change between
scrapes due to an overwrite that happens in the Process loop.
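
In sketch form (stand-in variable names, not the actual dcgm-exporter types),
the difference is overwrite versus append semantics when building the device
mapping:

    // Overwrite: a map[string]PodInfo keyed by device ID. If the podresources
    // API returns device IDs in a different order on the next call, the last
    // write wins and the pod attributes on a metric can flip between scrapes.
    deviceToPod[deviceID] = podInfo

    // Append (used by the new toDeviceToSharingPods): a map[string][]PodInfo
    // retains every pod observed on a device, and each sharing pod gets its
    // own copy of the metric.
    deviceToPods[deviceID] = append(deviceToPods[deviceID], podInfo)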

Ultimately, we may wish to make this the default behavior. However,
guarding it behind a flag:
1. Mitigates any risk of the change in case of bugs
2. Because the feature adds a new label, PromQL queries that aggregate or
   match across labels using, e.g., the `without` or `ignoring` clauses may
   break existing dashboards and alerts. Allowing users to opt in via a
   flag preserves backwards compatibility in these scenarios.

Finally, we update the unit tests to ensure thorough coverage for the
changes.
pintohutch committed Jan 13, 2025
1 parent 964ae23 commit 17f7828
Showing 7 changed files with 389 additions and 51 deletions.
2 changes: 0 additions & 2 deletions cmd/dcgm-exporter/main.go
@@ -20,8 +20,6 @@ import (
"log/slog"
"os"

_ "go.uber.org/automaxprocs"

"github.com/NVIDIA/dcgm-exporter/pkg/cmd"
)

1 change: 1 addition & 0 deletions internal/pkg/appconfig/types.go
@@ -56,4 +56,5 @@ type Config struct {
PodResourcesKubeletSocket string
HPCJobMappingDir string
NvidiaResourceNames []string
KubernetesVirtualGPUs bool
}
1 change: 1 addition & 0 deletions internal/pkg/transformation/const.go
@@ -21,6 +21,7 @@ const (
podAttribute = "pod"
namespaceAttribute = "namespace"
containerAttribute = "container"
vgpuAttribute = "vgpu"

hpcJobAttribute = "hpc_job"

127 changes: 127 additions & 0 deletions internal/pkg/transformation/kubernetes.go
@@ -36,6 +36,7 @@ import (
"github.com/NVIDIA/dcgm-exporter/internal/pkg/collector"
"github.com/NVIDIA/dcgm-exporter/internal/pkg/deviceinfo"
"github.com/NVIDIA/dcgm-exporter/internal/pkg/nvmlprovider"
"github.com/NVIDIA/dcgm-exporter/internal/pkg/utils"
)

var (
@@ -79,6 +80,52 @@ func (p *PodMapper) Process(metrics collector.MetricsByCounter, deviceInfo devic
}

slog.Debug(fmt.Sprintf("Podresources API response: %+v", pods))
if p.Config.KubernetesVirtualGPUs {
deviceToPods := p.toDeviceToSharingPods(pods, deviceInfo)

slog.Debug(fmt.Sprintf("Device to sharing pods mapping: %+v", deviceToPods))

// For each counter metric, init a slice to collect metrics to associate with shared virtual GPUs.
for counter := range metrics {
var newmetrics []collector.Metric
// For each instrumented device, build list of metrics and create
// new metrics for any shared GPUs.
for j, val := range metrics[counter] {
deviceID, err := val.GetIDOfType(p.Config.KubernetesGPUIdType)
if err != nil {
return err
}

podInfos := deviceToPods[deviceID]
// For all containers using the GPU, extract and annotate a metric
// with the container info and the shared GPU label, if it exists.
// Notably, this will increase the number of unique metrics (i.e. labelsets)
// by the number of containers sharing the GPU.
for _, pi := range podInfos {
metric, err := utils.DeepCopy(metrics[counter][j])
if err != nil {
return err
}
if !p.Config.UseOldNamespace {
metric.Attributes[podAttribute] = pi.Name
metric.Attributes[namespaceAttribute] = pi.Namespace
metric.Attributes[containerAttribute] = pi.Container
} else {
metric.Attributes[oldPodAttribute] = pi.Name
metric.Attributes[oldNamespaceAttribute] = pi.Namespace
metric.Attributes[oldContainerAttribute] = pi.Container
}
if pi.VGPU != "" {
metric.Attributes[vgpuAttribute] = pi.VGPU
}
newmetrics = append(newmetrics, metric)
}
}
// Upsert the annotated metrics into the final map.
metrics[counter] = newmetrics
}
return nil
}

deviceToPod := p.toDeviceToPod(pods, deviceInfo)

@@ -142,6 +189,86 @@ func (p *PodMapper) listPods(conn *grpc.ClientConn) (*podresourcesapi.ListPodRes
return resp, nil
}

func getSharedGPU(deviceID string) (string, bool) {
// Check if we're using the GKE device plugin or NVIDIA device plugin.
if strings.Contains(deviceID, gkeVirtualGPUDeviceIDSeparator) {
return strings.Split(deviceID, gkeVirtualGPUDeviceIDSeparator)[1], true
} else if strings.Contains(deviceID, "::") {
return strings.Split(deviceID, "::")[1], true
}
return "", false
}
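
// Examples (illustrative device IDs; the exact formats come from the
// respective device plugins):
//
//	getSharedGPU("GPU-5a8c3f2e::3") // -> ("3", true): NVIDIA device plugin time-slicing/MPS replica
//	getSharedGPU("nvidia0/vgpu2")   // -> ("2", true), assuming gkeVirtualGPUDeviceIDSeparator is "/vgpu"
//	getSharedGPU("GPU-5a8c3f2e")    // -> ("", false): not a shared device ID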

// toDeviceToSharingPods uses the same general logic as toDeviceToPod but
// allows for multiple containers to be associated with a metric when sharing
// strategies are used in Kubernetes.
// TODO(pintohutch): the logic is manually duplicated from toDeviceToPod for
// better isolation and easier review. Ultimately, this logic should be
// merged into a single function that can handle both shared and non-shared
// GPU states.
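//
// For example (illustrative device IDs), a GPU time-sliced into two replicas
// claimed by two pods yields entries keyed by both the per-replica device IDs
// and the underlying physical device:
//
//	"GPU-5a8c3f2e::0" -> [{Name: "pod-a", VGPU: "0", ...}]
//	"GPU-5a8c3f2e::1" -> [{Name: "pod-b", VGPU: "1", ...}]
//	"GPU-5a8c3f2e"    -> [{pod-a, VGPU: "0"}, {pod-b, VGPU: "1"}]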
func (p *PodMapper) toDeviceToSharingPods(devicePods *podresourcesapi.ListPodResourcesResponse, deviceInfo deviceinfo.Provider) map[string][]PodInfo {
deviceToPodsMap := make(map[string][]PodInfo)

for _, pod := range devicePods.GetPodResources() {
for _, container := range pod.GetContainers() {
for _, device := range container.GetDevices() {

resourceName := device.GetResourceName()
if resourceName != appconfig.NvidiaResourceName && !slices.Contains(p.Config.NvidiaResourceNames, resourceName) {
// Mig resources appear differently than GPU resources
if !strings.HasPrefix(resourceName, appconfig.NvidiaMigResourcePrefix) {
continue
}
}

podInfo := PodInfo{
Name: pod.GetName(),
Namespace: pod.GetNamespace(),
Container: container.GetName(),
}

for _, deviceID := range device.GetDeviceIds() {
if vgpu, ok := getSharedGPU(deviceID); ok {
podInfo.VGPU = vgpu
}
if strings.HasPrefix(deviceID, appconfig.MIG_UUID_PREFIX) {
migDevice, err := nvmlprovider.Client().GetMIGDeviceInfoByID(deviceID)
if err == nil {
giIdentifier := deviceinfo.GetGPUInstanceIdentifier(deviceInfo, migDevice.ParentUUID,
uint(migDevice.GPUInstanceID))
deviceToPodsMap[giIdentifier] = append(deviceToPodsMap[giIdentifier], podInfo)
}
gpuUUID := deviceID[len(appconfig.MIG_UUID_PREFIX):]
deviceToPodsMap[gpuUUID] = append(deviceToPodsMap[gpuUUID], podInfo)
} else if gkeMigDeviceIDMatches := gkeMigDeviceIDRegex.FindStringSubmatch(deviceID); gkeMigDeviceIDMatches != nil {
var gpuIndex string
var gpuInstanceID string
for groupIdx, group := range gkeMigDeviceIDMatches {
switch groupIdx {
case 1:
gpuIndex = group
case 2:
gpuInstanceID = group
}
}
giIdentifier := fmt.Sprintf("%s-%s", gpuIndex, gpuInstanceID)
deviceToPodsMap[giIdentifier] = append(deviceToPodsMap[giIdentifier], podInfo)
} else if strings.Contains(deviceID, gkeVirtualGPUDeviceIDSeparator) {
deviceToPodsMap[strings.Split(deviceID, gkeVirtualGPUDeviceIDSeparator)[0]] = append(deviceToPodsMap[strings.Split(deviceID, gkeVirtualGPUDeviceIDSeparator)[0]], podInfo)
} else if strings.Contains(deviceID, "::") {
gpuInstanceID := strings.Split(deviceID, "::")[0]
deviceToPodsMap[gpuInstanceID] = append(deviceToPodsMap[gpuInstanceID], podInfo)
}
// Default mapping between deviceID and pod information
deviceToPodsMap[deviceID] = append(deviceToPodsMap[deviceID], podInfo)
}
}
}
}

return deviceToPodsMap
}

func (p *PodMapper) toDeviceToPod(
devicePods *podresourcesapi.ListPodResourcesResponse, deviceInfo deviceinfo.Provider,
) map[string]PodInfo {