Skip to content

Commit

Permalink
exporter endpoint svc to check for gpu health
Browse files Browse the repository at this point in the history
  • Loading branch information
spraveenio authored and y2kenny-amd committed Feb 20, 2025
1 parent 2c785c0 commit 7ccad63
Show file tree
Hide file tree
Showing 11 changed files with 960 additions and 8 deletions.
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,10 @@ or
kubectl create -f https://raw.githubusercontent.com/ROCm/k8s-device-plugin/master/k8s-ds-amdgpu-labeller.yaml
```

# Health per GPU
* Provides more granular, per-GPU health detection by querying the exporter
health service over a gRPC Unix socket mounted at /var/lib/amd-metrics-exporter/

## Notes

* This plugin uses [`go modules`][gm] for dependencies management
Expand Down
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ require (
github.com/golang/glog v1.2.4
github.com/kubevirt/device-plugin-manager v1.19.5
golang.org/x/net v0.33.0
google.golang.org/grpc v1.65.0
google.golang.org/protobuf v1.34.2
k8s.io/api v0.31.0
k8s.io/apimachinery v0.31.0
k8s.io/kubelet v0.31.0
Expand Down Expand Up @@ -56,8 +58,6 @@ require (
golang.org/x/time v0.3.0 // indirect
gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20240701130421-f6361c86f094 // indirect
google.golang.org/grpc v1.65.0 // indirect
google.golang.org/protobuf v1.34.2 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
Expand Down
6 changes: 6 additions & 0 deletions helm/amd-gpu/templates/deviceplugin-daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ spec:
mountPath: /var/lib/kubelet/device-plugins
- name: sys
mountPath: /sys
- name: health
mountPath: /var/lib/amd-metrics-exporter/
resources:
{{- toYaml .Values.dp.resources | nindent 12 }}
volumes:
Expand All @@ -46,3 +48,7 @@ spec:
- name: sys
hostPath:
path: /sys
- name: health
hostPath:
path: /var/lib/amd-metrics-exporter/
type : DirectoryOrCreate
101 changes: 101 additions & 0 deletions internal/pkg/exporter/health.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
/**
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/

// Package exporter is a collection of utilities to access the health gRPC
// service hosted by the amd-metrics-exporter service.
package exporter

import (
	"context"
	"fmt"
	"os"
	"strings"
	"time"

	"github.com/ROCm/k8s-device-plugin/internal/pkg/exporter/metricssvc"
	"github.com/golang/glog"
	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
	"google.golang.org/protobuf/types/known/emptypb"
	pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
)

// healthSocket is the filesystem path of the Unix domain socket used to reach
// the exporter's gRPC health service.
const healthSocket = "/var/lib/amd-metrics-exporter/amdgpu_device_metrics_exporter_grpc.socket"

// getGPUHealth queries the metrics exporter health service over its Unix
// domain socket and returns a map from device ID to pluginapi.Healthy or
// pluginapi.Unhealthy.
//
// A non-nil error is returned when the socket does not exist, the client
// connection cannot be created, or the List RPC fails or exceeds its deadline.
// The returned map is nil whenever the error is non-nil.
func getGPUHealth() (map[string]string, error) {
	// Upper bound on the List RPC so a stalled exporter cannot block the
	// caller indefinitely.
	const listTimeout = 5 * time.Second

	// If the exporter socket is not present, do not proceed. This is not
	// logged: the exporter is optional and may simply not be deployed.
	if _, err := os.Stat(healthSocket); err != nil {
		return nil, fmt.Errorf("checking health socket %q: %w", healthSocket, err)
	}

	// The connection is short lived as the exporter can come and go
	// independently.
	conn, err := grpc.Dial("unix://"+healthSocket,
		grpc.WithTransportCredentials(insecure.NewCredentials()),
	)
	if err != nil {
		// Logged here as well as returned: the caller in this file only tests
		// the error for nil-ness and would otherwise drop the details.
		glog.Errorf("Error opening client metrics svc : %v", err)
		return nil, fmt.Errorf("opening metrics service client: %w", err)
	}
	defer conn.Close()

	ctx, cancel := context.WithTimeout(context.Background(), listTimeout)
	defer cancel()

	// Create the metrics service client through the compiled stub and fetch
	// the per-GPU state.
	resp, err := metricssvc.NewMetricsServiceClient(conn).List(ctx, &emptypb.Empty{})
	if err != nil {
		glog.Errorf("Error getting health info svc : %v", err)
		return nil, fmt.Errorf("listing GPU health: %w", err)
	}

	// The comparison below is against the lower-cased form of
	// pluginapi.Healthy; any other reported value is treated as unhealthy.
	healthyState := strings.ToLower(pluginapi.Healthy)

	hMap := make(map[string]string, len(resp.GPUState))
	for _, gpu := range resp.GPUState {
		if gpu.Health == healthyState {
			hMap[gpu.Device] = pluginapi.Healthy
		} else {
			hMap[gpu.Device] = pluginapi.Unhealthy
		}
	}
	return hMap, nil
}

// PopulatePerGPUDHealth sets the Health field of every device in devs.
//
// When the exporter health service can be queried, a device whose ID appears
// in the reported health map receives the reported state. In every other case
// (service unavailable, or no entry for the device ID) the device receives
// defaultHealth.
func PopulatePerGPUDHealth(devs []*pluginapi.Device, defaultHealth string) {
	healthByID, err := getGPUHealth()
	svcAvailable := err == nil

	for _, dev := range devs {
		health := defaultHealth
		if svcAvailable {
			// Only override the default when the service knows this device.
			if reported, found := healthByID[dev.ID]; found {
				health = reported
			}
		}
		dev.Health = health
	}
}
Loading

0 comments on commit 7ccad63

Please sign in to comment.