Skip to content

Commit

Permalink
fix: use subsystem prefix in gpu operator metrics
Browse files (browse the repository at this point in the history)
Signed-off-by: googs1025 <[email protected]>
  • Loading branch information
googs1025 committed Dec 14, 2024
1 parent: 26c6282; commit: c1df5ee
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 36 deletions.
2 changes: 1 addition & 1 deletion controllers/object_controls_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ func setup() error {
scheme: s,
}

clusterPolicyController.operatorMetrics = initOperatorMetrics(&clusterPolicyController)
clusterPolicyController.operatorMetrics = initOperatorMetrics()

hasNFDLabels, gpuNodeCount, err := clusterPolicyController.labelGPUNodes()
if err != nil {
Expand Down
88 changes: 54 additions & 34 deletions controllers/operator_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,25 +61,31 @@ const (

driverAutoUpgradeEnabled = 1
driverAutoUpgradeDisabled = 0

// gpuOperatorSystemName is the name of the subsystem used for the GPU Operator metrics.
gpuOperatorSystemName = "gpu_operator"
)

func initOperatorMetrics(n *ClusterPolicyController) *OperatorMetrics {
func initOperatorMetrics() *OperatorMetrics {
m := &OperatorMetrics{
gpuNodesTotal: promcli.NewGauge(
promcli.GaugeOpts{
Name: "gpu_operator_gpu_nodes_total",
Help: "Number of nodes with GPUs",
Subsystem: gpuOperatorSystemName,
Name: "gpu_nodes_total",
Help: "Number of nodes with GPUs",
},
),
reconciliationLastSuccess: promcli.NewGauge(
promcli.GaugeOpts{
Name: "gpu_operator_reconciliation_last_success_ts_seconds",
Help: "Timestamp (in seconds) of the last reconciliation loop success",
Subsystem: gpuOperatorSystemName,
Name: "reconciliation_last_success_ts_seconds",
Help: "Timestamp (in seconds) of the last reconciliation loop success",
},
),
reconciliationStatus: promcli.NewGauge(
promcli.GaugeOpts{
Name: "gpu_operator_reconciliation_status",
Subsystem: gpuOperatorSystemName,
Name: "reconciliation_status",
Help: fmt.Sprintf("%d if the reconciliation is currently successful, %d if the operands are not ready, %d if the cluster policy is unavailable, %d if an error occurred within the operator.",
reconciliationStatusSuccess,
reconciliationStatusNotReady,
Expand All @@ -89,87 +95,101 @@ func initOperatorMetrics(n *ClusterPolicyController) *OperatorMetrics {
),
reconciliationTotal: promcli.NewCounter(
promcli.CounterOpts{
Name: "gpu_operator_reconciliation_total",
Help: "Total number of reconciliation",
Subsystem: gpuOperatorSystemName,
Name: "reconciliation_total",
Help: "Total number of reconciliation",
},
),
reconciliationFailed: promcli.NewCounter(
promcli.CounterOpts{
Name: "gpu_operator_reconciliation_failed_total",
Help: "Number of failed reconciliation",
Subsystem: gpuOperatorSystemName,
Name: "reconciliation_failed_total",
Help: "Number of failed reconciliation",
},
),
reconciliationHasNFDLabels: promcli.NewGauge(
promcli.GaugeOpts{
Name: "gpu_operator_reconciliation_has_nfd_labels",
Help: "1 if NFD mandatory kernel labels have been found, 0 otherwise",
Subsystem: gpuOperatorSystemName,
Name: "reconciliation_has_nfd_labels",
Help: "1 if NFD mandatory kernel labels have been found, 0 otherwise",
},
),

openshiftDriverToolkitEnabled: promcli.NewGauge(
promcli.GaugeOpts{
Name: "gpu_operator_openshift_driver_toolkit_enabled",
Help: "1 if OCP DriverToolkit is enabled, -1 if requested but could not be enabled, 0 if not requested",
Subsystem: gpuOperatorSystemName,
Name: "openshift_driver_toolkit_enabled",
Help: "1 if OCP DriverToolkit is enabled, -1 if requested but could not be enabled, 0 if not requested",
},
),
openshiftDriverToolkitNfdTooOld: promcli.NewGauge(
promcli.GaugeOpts{
Name: "gpu_operator_openshift_driver_toolkit_nfd_too_old",
Help: "1 if OCP DriverToolkit is enabled but NFD doesn't expose OSTREE labels, 0 otherwise",
Subsystem: gpuOperatorSystemName,
Name: "openshift_driver_toolkit_nfd_too_old",
Help: "1 if OCP DriverToolkit is enabled but NFD doesn't expose OSTREE labels, 0 otherwise",
},
),
openshiftDriverToolkitIsMissing: promcli.NewGauge(
promcli.GaugeOpts{
Name: "gpu_operator_openshift_driver_toolkit_imagestream_missing",
Help: "1 if OCP DriverToolkit is enabled but its imagestream is not available, 0 otherwise",
Subsystem: gpuOperatorSystemName,
Name: "openshift_driver_toolkit_imagestream_missing",
Help: "1 if OCP DriverToolkit is enabled but its imagestream is not available, 0 otherwise",
},
),
openshiftDriverToolkitRhcosTagsMissing: promcli.NewGauge(
promcli.GaugeOpts{
Name: "gpu_operator_openshift_driver_toolkit_rhcos_tags_missing",
Help: "1 if OCP DriverToolkit is enabled but some of the RHCOS tags are missing, 0 otherwise",
Subsystem: gpuOperatorSystemName,
Name: "openshift_driver_toolkit_rhcos_tags_missing",
Help: "1 if OCP DriverToolkit is enabled but some of the RHCOS tags are missing, 0 otherwise",
},
),
openshiftDriverToolkitIsBroken: promcli.NewGauge(
promcli.GaugeOpts{
Name: "gpu_operator_openshift_driver_toolkit_imagestream_broken",
Help: "1 if OCP DriverToolkit is enabled but its imagestream is broken (rhbz#2015024), 0 otherwise",
Subsystem: gpuOperatorSystemName,
Name: "openshift_driver_toolkit_imagestream_broken",
Help: "1 if OCP DriverToolkit is enabled but its imagestream is broken (rhbz#2015024), 0 otherwise",
},
),
driverAutoUpgradeEnabled: promcli.NewGauge(
promcli.GaugeOpts{
Name: "gpu_operator_driver_auto_upgrade_enabled",
Help: "1 if driver auto upgrade is enabled 0 if not",
Subsystem: gpuOperatorSystemName,
Name: "driver_auto_upgrade_enabled",
Help: "1 if driver auto upgrade is enabled 0 if not",
},
),
upgradesInProgress: promcli.NewGauge(
promcli.GaugeOpts{
Name: "gpu_operator_nodes_upgrades_in_progress",
Help: "Total number of nodes on which the gpu operator pods are being upgraded",
Subsystem: gpuOperatorSystemName,
Name: "nodes_upgrades_in_progress",
Help: "Total number of nodes on which the gpu operator pods are being upgraded",
},
),
upgradesDone: promcli.NewGauge(
promcli.GaugeOpts{
Name: "gpu_operator_nodes_upgrades_done",
Help: "Total number of nodes on which the gpu operator pods are successfully upgraded",
Subsystem: gpuOperatorSystemName,
Name: "nodes_upgrades_done",
Help: "Total number of nodes on which the gpu operator pods are successfully upgraded",
},
),
upgradesFailed: promcli.NewGauge(
promcli.GaugeOpts{
Name: "gpu_operator_nodes_upgrades_failed",
Help: "Total number of nodes on which the gpu operator pod upgrades have failed",
Subsystem: gpuOperatorSystemName,
Name: "nodes_upgrades_failed",
Help: "Total number of nodes on which the gpu operator pod upgrades have failed",
},
),
upgradesAvailable: promcli.NewGauge(
promcli.GaugeOpts{
Name: "gpu_operator_nodes_upgrades_available",
Help: "Total number of nodes on which the gpu operator pod upgrades can be done",
Subsystem: gpuOperatorSystemName,
Name: "nodes_upgrades_available",
Help: "Total number of nodes on which the gpu operator pod upgrades can be done",
},
),
upgradesPending: promcli.NewGauge(
promcli.GaugeOpts{
Name: "gpu_operator_nodes_upgrades_pending",
Help: "Total number of nodes on which the gpu operator pod upgrades are pending",
Subsystem: gpuOperatorSystemName,
Name: "nodes_upgrades_pending",
Help: "Total number of nodes on which the gpu operator pod upgrades are pending",
},
),
}
Expand Down
2 changes: 1 addition & 1 deletion controllers/state_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -785,7 +785,7 @@ func (n *ClusterPolicyController) init(ctx context.Context, reconciler *ClusterP
n.k8sVersion = k8sVersion
n.logger.Info("Kubernetes version detected", "version", k8sVersion)

n.operatorMetrics = initOperatorMetrics(n)
n.operatorMetrics = initOperatorMetrics()
n.logger.Info("Operator metrics initialized.")

addState(n, "/opt/gpu-operator/pre-requisites")
Expand Down

0 comments on commit c1df5ee

Please sign in to comment.