From c1df5eedc7d11edbd525fca0b362372633be6b34 Mon Sep 17 00:00:00 2001 From: googs1025 Date: Sat, 14 Dec 2024 10:37:12 +0800 Subject: [PATCH] fix: use subsystem prefix in gpu operator metrics Signed-off-by: googs1025 --- controllers/object_controls_test.go | 2 +- controllers/operator_metrics.go | 88 ++++++++++++++++++----------- controllers/state_manager.go | 2 +- 3 files changed, 56 insertions(+), 36 deletions(-) diff --git a/controllers/object_controls_test.go b/controllers/object_controls_test.go index ae17a9f3d..a2c66c98b 100644 --- a/controllers/object_controls_test.go +++ b/controllers/object_controls_test.go @@ -213,7 +213,7 @@ func setup() error { scheme: s, } - clusterPolicyController.operatorMetrics = initOperatorMetrics(&clusterPolicyController) + clusterPolicyController.operatorMetrics = initOperatorMetrics() hasNFDLabels, gpuNodeCount, err := clusterPolicyController.labelGPUNodes() if err != nil { diff --git a/controllers/operator_metrics.go b/controllers/operator_metrics.go index b1e1e741d..25a923746 100644 --- a/controllers/operator_metrics.go +++ b/controllers/operator_metrics.go @@ -61,25 +61,31 @@ const ( driverAutoUpgradeEnabled = 1 driverAutoUpgradeDisabled = 0 + + // gpuOperatorSystemName is the name of the subsystem used for the GPU Operator metrics. + gpuOperatorSystemName = "gpu_operator" ) -func initOperatorMetrics(n *ClusterPolicyController) *OperatorMetrics { +func initOperatorMetrics() *OperatorMetrics { m := &OperatorMetrics{ gpuNodesTotal: promcli.NewGauge( promcli.GaugeOpts{ - Name: "gpu_operator_gpu_nodes_total", - Help: "Number of nodes with GPUs", + Subsystem: gpuOperatorSystemName, + Name: "gpu_nodes_total", + Help: "Number of nodes with GPUs", }, ), reconciliationLastSuccess: promcli.NewGauge( promcli.GaugeOpts{ - Name: "gpu_operator_reconciliation_last_success_ts_seconds", - Help: "Timestamp (in seconds) of the last reconciliation loop success", + Subsystem: gpuOperatorSystemName, + Name: "reconciliation_last_success_ts_seconds", + Help: "Timestamp (in seconds) of the last reconciliation loop success", }, ), reconciliationStatus: promcli.NewGauge( promcli.GaugeOpts{ - Name: "gpu_operator_reconciliation_status", + Subsystem: gpuOperatorSystemName, + Name: "reconciliation_status", Help: fmt.Sprintf("%d if the reconciliation is currently successful, %d if the operands are not ready, %d if the cluster policy is unavailable, %d if an error occurred within the operator.", reconciliationStatusSuccess, reconciliationStatusNotReady, @@ -89,87 +95,101 @@ func initOperatorMetrics(n *ClusterPolicyController) *OperatorMetrics { ), reconciliationTotal: promcli.NewCounter( promcli.CounterOpts{ - Name: "gpu_operator_reconciliation_total", - Help: "Total number of reconciliation", + Subsystem: gpuOperatorSystemName, + Name: "reconciliation_total", + Help: "Total number of reconciliation", }, ), reconciliationFailed: promcli.NewCounter( promcli.CounterOpts{ - Name: "gpu_operator_reconciliation_failed_total", - Help: "Number of failed reconciliation", + Subsystem: gpuOperatorSystemName, + Name: "reconciliation_failed_total", + Help: "Number of failed reconciliation", }, ), reconciliationHasNFDLabels: promcli.NewGauge( promcli.GaugeOpts{ - Name: "gpu_operator_reconciliation_has_nfd_labels", - Help: "1 if NFD mandatory kernel labels have been found, 0 otherwise", + Subsystem: gpuOperatorSystemName, + Name: "reconciliation_has_nfd_labels", + Help: "1 if NFD mandatory kernel labels have been found, 0 otherwise", }, ), openshiftDriverToolkitEnabled: promcli.NewGauge( promcli.GaugeOpts{ - Name: "gpu_operator_openshift_driver_toolkit_enabled", - Help: "1 if OCP DriverToolkit is enabled, -1 if requested but could not be enabled, 0 if not requested", + Subsystem: gpuOperatorSystemName, + Name: "openshift_driver_toolkit_enabled", + Help: "1 if OCP DriverToolkit is enabled, -1 if requested but could not be enabled, 0 if not requested", }, ), openshiftDriverToolkitNfdTooOld: promcli.NewGauge( promcli.GaugeOpts{ - Name: "gpu_operator_openshift_driver_toolkit_nfd_too_old", - Help: "1 if OCP DriverToolkit is enabled but NFD doesn't expose OSTREE labels, 0 otherwise", + Subsystem: gpuOperatorSystemName, + Name: "openshift_driver_toolkit_nfd_too_old", + Help: "1 if OCP DriverToolkit is enabled but NFD doesn't expose OSTREE labels, 0 otherwise", }, ), openshiftDriverToolkitIsMissing: promcli.NewGauge( promcli.GaugeOpts{ - Name: "gpu_operator_openshift_driver_toolkit_imagestream_missing", - Help: "1 if OCP DriverToolkit is enabled but its imagestream is not available, 0 otherwise", + Subsystem: gpuOperatorSystemName, + Name: "openshift_driver_toolkit_imagestream_missing", + Help: "1 if OCP DriverToolkit is enabled but its imagestream is not available, 0 otherwise", }, ), openshiftDriverToolkitRhcosTagsMissing: promcli.NewGauge( promcli.GaugeOpts{ - Name: "gpu_operator_openshift_driver_toolkit_rhcos_tags_missing", - Help: "1 if OCP DriverToolkit is enabled but some of the RHCOS tags are missing, 0 otherwise", + Subsystem: gpuOperatorSystemName, + Name: "openshift_driver_toolkit_rhcos_tags_missing", + Help: "1 if OCP DriverToolkit is enabled but some of the RHCOS tags are missing, 0 otherwise", }, ), openshiftDriverToolkitIsBroken: promcli.NewGauge( promcli.GaugeOpts{ - Name: "gpu_operator_openshift_driver_toolkit_imagestream_broken", - Help: "1 if OCP DriverToolkit is enabled but its imagestream is broken (rhbz#2015024), 0 otherwise", + Subsystem: gpuOperatorSystemName, + Name: "openshift_driver_toolkit_imagestream_broken", + Help: "1 if OCP DriverToolkit is enabled but its imagestream is broken (rhbz#2015024), 0 otherwise", }, ), driverAutoUpgradeEnabled: promcli.NewGauge( promcli.GaugeOpts{ - Name: "gpu_operator_driver_auto_upgrade_enabled", - Help: "1 if driver auto upgrade is enabled 0 if not", + Subsystem: gpuOperatorSystemName, + Name: "driver_auto_upgrade_enabled", + Help: "1 if driver auto upgrade is enabled 0 if not", }, ), upgradesInProgress: promcli.NewGauge( promcli.GaugeOpts{ - Name: "gpu_operator_nodes_upgrades_in_progress", - Help: "Total number of nodes on which the gpu operator pods are being upgraded", + Subsystem: gpuOperatorSystemName, + Name: "nodes_upgrades_in_progress", + Help: "Total number of nodes on which the gpu operator pods are being upgraded", }, ), upgradesDone: promcli.NewGauge( promcli.GaugeOpts{ - Name: "gpu_operator_nodes_upgrades_done", - Help: "Total number of nodes on which the gpu operator pods are successfully upgraded", + Subsystem: gpuOperatorSystemName, + Name: "nodes_upgrades_done", + Help: "Total number of nodes on which the gpu operator pods are successfully upgraded", }, ), upgradesFailed: promcli.NewGauge( promcli.GaugeOpts{ - Name: "gpu_operator_nodes_upgrades_failed", - Help: "Total number of nodes on which the gpu operator pod upgrades have failed", + Subsystem: gpuOperatorSystemName, + Name: "nodes_upgrades_failed", + Help: "Total number of nodes on which the gpu operator pod upgrades have failed", }, ), upgradesAvailable: promcli.NewGauge( promcli.GaugeOpts{ - Name: "gpu_operator_nodes_upgrades_available", - Help: "Total number of nodes on which the gpu operator pod upgrades can be done", + Subsystem: gpuOperatorSystemName, + Name: "nodes_upgrades_available", + Help: "Total number of nodes on which the gpu operator pod upgrades can be done", }, ), upgradesPending: promcli.NewGauge( promcli.GaugeOpts{ - Name: "gpu_operator_nodes_upgrades_pending", - Help: "Total number of nodes on which the gpu operator pod upgrades are pending", + Subsystem: gpuOperatorSystemName, + Name: "nodes_upgrades_pending", + Help: "Total number of nodes on which the gpu operator pod upgrades are pending", }, ), } diff --git a/controllers/state_manager.go b/controllers/state_manager.go index 9c1028ebc..d8f3ef7e6 100644 --- a/controllers/state_manager.go +++ b/controllers/state_manager.go @@ -785,7 +785,7 @@ func (n *ClusterPolicyController) init(ctx context.Context, reconciler *ClusterP n.k8sVersion = k8sVersion n.logger.Info("Kubernetes version detected", "version", k8sVersion) - n.operatorMetrics = initOperatorMetrics(n) + n.operatorMetrics = initOperatorMetrics() n.logger.Info("Operator metrics initialized.") addState(n, "/opt/gpu-operator/pre-requisites")