Skip to content

Commit

Permalink
The value of the DCGM_FI_DEV_XID_ERRORS field is always printed twice (
Browse files Browse the repository at this point in the history
…#304)

* The value of the DCGM_FI_DEV_XID_ERRORS field is always printed twice

Signed-off-by: Vadym Fedorov <[email protected]>

* Addressed code review comments

Signed-off-by: Vadym Fedorov <[email protected]>
  • Loading branch information
nvvfedorov authored Apr 3, 2024
1 parent 1c79aca commit 57fa1c6
Show file tree
Hide file tree
Showing 4 changed files with 266 additions and 10 deletions.
32 changes: 32 additions & 0 deletions internal/pkg/testutils/testutils.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,14 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package testutils

import (
"reflect"
"runtime"
"testing"
"unsafe"
)

// RequireLinux checks if
Expand All @@ -27,3 +30,32 @@ func RequireLinux(t *testing.T) {
t.Skipf("Test is not supported on %q", runtime.GOOS)
}
}

// GetStructPrivateFieldValue returns the value of the field fieldName of the
// struct v, including unexported fields that reflection alone would refuse to
// expose. It is intended for tests only.
//
// v may be a struct or a pointer to a struct. T must be exactly the declared
// type of the field. On any failure (nil input, non-struct input, unknown
// field, type mismatch) the problem is reported through t.Errorf and the zero
// value of T is returned.
func GetStructPrivateFieldValue[T any](t *testing.T, v any, fieldName string) T {
	t.Helper()
	var result T

	value := reflect.ValueOf(v)
	if !value.IsValid() {
		// reflect.ValueOf(nil) yields the zero Value; calling Type on it would panic.
		t.Errorf("The value is nil, field %s cannot be read", fieldName)
		return result
	}

	if value.Kind() == reflect.Ptr {
		if value.IsNil() {
			t.Errorf("The value of the %s type is a nil pointer", value.Type())
			return result
		}
		value = value.Elem()
	}

	if value.Kind() != reflect.Struct {
		t.Errorf("The type %s is not a struct", value.Type())
		return result
	}

	// A struct passed by value is not addressable, and UnsafeAddr panics on
	// non-addressable values. Work on an addressable copy instead.
	if !value.CanAddr() {
		addressable := reflect.New(value.Type()).Elem()
		addressable.Set(value)
		value = addressable
	}

	fieldVal := value.FieldByName(fieldName)
	if !fieldVal.IsValid() {
		t.Errorf("The field %s is invalid for the %s type", fieldName, value.Type())
		return result
	}

	// Reinterpreting the field memory as *T is only sound when the field
	// really has type T; reject anything else instead of reading garbage.
	wantType := reflect.TypeOf((*T)(nil)).Elem()
	if fieldVal.Type() != wantType {
		t.Errorf("The field %s of the %s type has type %s, not %s",
			fieldName, value.Type(), fieldVal.Type(), wantType)
		return result
	}

	// Go through the field address to bypass the read-only flag that
	// reflection sets on unexported fields.
	return *(*T)(unsafe.Pointer(fieldVal.UnsafeAddr()))
}
47 changes: 38 additions & 9 deletions pkg/cmd/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -397,17 +397,12 @@ func enableDCGMExpXIDErrorsCountCollector(cs *dcgmexporter.CounterSet, fieldEnti
}

func getFieldEntityGroupTypeSystemInfo(cs *dcgmexporter.CounterSet, config *dcgmexporter.Config) *dcgmexporter.FieldEntityGroupTypeSystemInfo {
allCounters := []dcgmexporter.Counter{}
var allCounters []dcgmexporter.Counter

allCounters = append(allCounters, cs.DCGMCounters...)
allCounters = append(allCounters,
dcgmexporter.Counter{
FieldID: dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS,
},
dcgmexporter.Counter{
FieldID: dcgm.DCGM_FI_DEV_XID_ERRORS,
},
)

allCounters = appendDCGMXIDErrorsCountDependency(allCounters, cs)
allCounters = appendDCGMClockEventsCountDependency(cs, allCounters)

fieldEntityGroupTypeSystemInfo := dcgmexporter.NewEntityGroupTypeSystemInfo(allCounters, config)

Expand All @@ -420,6 +415,40 @@ func getFieldEntityGroupTypeSystemInfo(cs *dcgmexporter.CounterSet, config *dcgm
return fieldEntityGroupTypeSystemInfo
}

// appendDCGMClockEventsCountDependency returns allCounters extended with the
// DCGM_FI_DEV_CLOCK_THROTTLE_REASONS counter when the DCGM_EXP_CLOCK_EVENTS_COUNT
// exporter counter is present in cs and the field is not already in allCounters.
// Otherwise allCounters is returned unchanged.
func appendDCGMClockEventsCountDependency(cs *dcgmexporter.CounterSet, allCounters []dcgmexporter.Counter) []dcgmexporter.Counter {
	if len(cs.ExporterCounters) == 0 {
		return allCounters
	}

	if !containsField(cs.ExporterCounters, dcgmexporter.DCGMClockEventsCount) {
		return allCounters
	}

	// The field is already requested explicitly; adding it again would duplicate it.
	if containsField(allCounters, dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS) {
		return allCounters
	}

	dependency := dcgmexporter.Counter{
		FieldID: dcgm.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS,
	}
	return append(allCounters, dependency)
}

// appendDCGMXIDErrorsCountDependency returns allCounters extended with the
// DCGM_FI_DEV_XID_ERRORS counter when the DCGM_EXP_XID_ERRORS_COUNT exporter
// counter is present in cs and the field is not already in allCounters.
// Otherwise allCounters is returned unchanged.
func appendDCGMXIDErrorsCountDependency(allCounters []dcgmexporter.Counter, cs *dcgmexporter.CounterSet) []dcgmexporter.Counter {
	if len(cs.ExporterCounters) == 0 {
		return allCounters
	}

	if !containsField(cs.ExporterCounters, dcgmexporter.DCGMXIDErrorsCount) {
		return allCounters
	}

	// The field is already requested explicitly; adding it again would duplicate it.
	if containsField(allCounters, dcgm.DCGM_FI_DEV_XID_ERRORS) {
		return allCounters
	}

	dependency := dcgmexporter.Counter{
		FieldID: dcgm.DCGM_FI_DEV_XID_ERRORS,
	}
	return append(allCounters, dependency)
}

// containsField reports whether slice holds at least one counter whose FieldID
// equals fieldID converted to dcgm.Short.
func containsField(slice []dcgmexporter.Counter, fieldID dcgmexporter.ExporterCounter) bool {
	wanted := dcgm.Short(fieldID)
	matches := func(candidate dcgmexporter.Counter) bool {
		return candidate.FieldID == wanted
	}
	return slices.IndexFunc(slice, matches) >= 0
}

func getCounters(config *dcgmexporter.Config) *dcgmexporter.CounterSet {
cs, err := dcgmexporter.GetCounterSet(config)
if err != nil {
Expand Down
195 changes: 195 additions & 0 deletions pkg/cmd/app_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package cmd

import (
"testing"

"github.com/NVIDIA/go-dcgm/pkg/dcgm"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

"github.com/NVIDIA/dcgm-exporter/internal/pkg/testutils"
"github.com/NVIDIA/dcgm-exporter/pkg/dcgmexporter"
)

// Test_getFieldEntityGroupTypeSystemInfo checks which counters end up in the
// unexported "counters" field of the value returned by
// getFieldEntityGroupTypeSystemInfo for different combinations of DCGM and
// exporter counters. Each dependent DCGM field is expected exactly once,
// whether it was requested directly, implied by an exporter counter, or both.
func Test_getFieldEntityGroupTypeSystemInfo(t *testing.T) {
	config := &dcgmexporter.Config{
		GPUDevices:    dcgmexporter.DeviceOptions{},
		SwitchDevices: dcgmexporter.DeviceOptions{},
		CPUDevices:    dcgmexporter.DeviceOptions{},
		UseFakeGPUs:   true,
	}

	// Counter fixtures shared by the cases below. They are plain values, so
	// every case builds its own slices from them.
	xidErrors := dcgmexporter.Counter{
		FieldID:   230,
		FieldName: "DCGM_FI_DEV_XID_ERRORS",
		PromType:  "gauge",
		Help:      "Value of the last XID error encountered.",
	}
	xidErrorsCount := dcgmexporter.Counter{
		FieldID:   9001,
		FieldName: "DCGM_EXP_XID_ERRORS_COUNT",
		PromType:  "gauge",
		Help:      "Count of XID Errors within user-specified time window (see xid-count-window-size param).",
	}
	clockThrottleReason := dcgmexporter.Counter{
		FieldID:   112,
		FieldName: "DCGM_FI_DEV_CLOCK_THROTTLE_REASON",
		PromType:  "gauge",
	}
	clockEventsCount := dcgmexporter.Counter{
		FieldID:   9002,
		FieldName: "DCGM_EXP_CLOCK_EVENTS_COUNT",
		PromType:  "gauge",
		Help:      "Count of clock events within the user-specified time window (see clock-events-count-window-size param).",
	}

	type assertionFunc func(*testing.T, *dcgmexporter.FieldEntityGroupTypeSystemInfo)

	// readCounters extracts the unexported "counters" field from got.
	readCounters := func(t *testing.T, got *dcgmexporter.FieldEntityGroupTypeSystemInfo) []dcgmexporter.Counter {
		t.Helper()
		require.NotNil(t, got)
		return testutils.GetStructPrivateFieldValue[[]dcgmexporter.Counter](t, got, "counters")
	}

	// expectSingleCounter builds an assertion that the result holds exactly
	// one counter and that its FieldID is want.
	expectSingleCounter := func(want dcgm.Short) assertionFunc {
		return func(t *testing.T, got *dcgmexporter.FieldEntityGroupTypeSystemInfo) {
			t.Helper()
			values := readCounters(t, got)
			require.Len(t, values, 1)
			assert.Equal(t, want, values[0].FieldID)
		}
	}

	// expectNoCounters asserts that the result holds no counters at all.
	expectNoCounters := func(t *testing.T, got *dcgmexporter.FieldEntityGroupTypeSystemInfo) {
		t.Helper()
		values := readCounters(t, got)
		require.Len(t, values, 0)
	}

	tests := []struct {
		name       string
		counterSet *dcgmexporter.CounterSet
		assertion  assertionFunc
	}{
		{
			name: "When DCGM_FI_DEV_XID_ERRORS and DCGM_EXP_XID_ERRORS_COUNT enabled",
			counterSet: &dcgmexporter.CounterSet{
				DCGMCounters:     []dcgmexporter.Counter{xidErrors},
				ExporterCounters: []dcgmexporter.Counter{xidErrorsCount},
			},
			assertion: expectSingleCounter(230),
		},
		{
			name: "When DCGM_FI_DEV_XID_ERRORS enabled",
			counterSet: &dcgmexporter.CounterSet{
				DCGMCounters: []dcgmexporter.Counter{xidErrors},
			},
			assertion: expectSingleCounter(230),
		},
		{
			name: "When DCGM_EXP_XID_ERRORS_COUNT enabled",
			counterSet: &dcgmexporter.CounterSet{
				ExporterCounters: []dcgmexporter.Counter{xidErrorsCount},
			},
			assertion: expectSingleCounter(230),
		},
		{
			name:       "When no counters",
			counterSet: &dcgmexporter.CounterSet{},
			assertion:  expectNoCounters,
		},
		{
			name: "When DCGM_FI_DEV_CLOCK_THROTTLE_REASON and DCGM_EXP_CLOCK_EVENTS_COUNT enabled",
			counterSet: &dcgmexporter.CounterSet{
				DCGMCounters:     []dcgmexporter.Counter{clockThrottleReason},
				ExporterCounters: []dcgmexporter.Counter{clockEventsCount},
			},
			assertion: expectSingleCounter(112),
		},
		{
			name: "When DCGM_FI_DEV_CLOCK_THROTTLE_REASON enabled",
			counterSet: &dcgmexporter.CounterSet{
				DCGMCounters: []dcgmexporter.Counter{clockThrottleReason},
			},
			assertion: expectSingleCounter(112),
		},
		{
			name: "When DCGM_EXP_CLOCK_EVENTS_COUNT enabled",
			counterSet: &dcgmexporter.CounterSet{
				ExporterCounters: []dcgmexporter.Counter{clockEventsCount},
			},
			assertion: expectSingleCounter(112),
		},
	}

	cleanupDCGM := initDCGM(config)
	defer cleanupDCGM()

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// Skip before doing any work when a case has nothing to verify.
			if tt.assertion == nil {
				t.Skip(tt.name)
			}
			got := getFieldEntityGroupTypeSystemInfo(tt.counterSet, config)
			tt.assertion(t, got)
		})
	}
}
2 changes: 1 addition & 1 deletion pkg/dcgmexporter/exporter_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ type ExporterCounter uint16
const (
DCGMFIUnknown ExporterCounter = 0
DCGMXIDErrorsCount ExporterCounter = iota + 9000
DCGMClockEventsCount ExporterCounter = iota
DCGMClockEventsCount ExporterCounter = iota + 9000
)

// String method to convert the enum value to a string
Expand Down

0 comments on commit 57fa1c6

Please sign in to comment.