diff --git a/components/infiniband/checker/pcie_helpers_test.go b/components/infiniband/checker/pcie_helpers_test.go new file mode 100644 index 00000000..c36f62ca --- /dev/null +++ b/components/infiniband/checker/pcie_helpers_test.go @@ -0,0 +1,64 @@ +/* +Copyright 2024 The Scitix Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package checker + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestPcieSpeedLessThan(t *testing.T) { + cases := []struct { + name string + a, b string + want bool + }{ + {"less_simple", "16", "32", true}, + {"greater", "32", "16", false}, + {"equal_no_decimals", "16", "16", false}, + {"equal_with_decimals", "32.0", "32", false}, + {"less_gt_suffix", "16.0 GT/s PCIe", "32.0 GT/s PCIe", true}, + {"unparseable_a_returns_false", "abc", "32", false}, + {"unparseable_b_returns_false", "16", "xyz", false}, + {"empty_returns_false", "", "32", false}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + assert.Equal(t, tc.want, pcieSpeedLessThan(tc.a, tc.b)) + }) + } +} + +func TestMinNumericSpeed(t *testing.T) { + cases := []struct { + name string + a, b string + want string + }{ + {"a_smaller", "16.0 GT/s PCIe", "32.0 GT/s PCIe", "16.0 GT/s PCIe"}, + {"b_smaller", "32", "16", "16"}, + {"equal_returns_a", "32", "32", "32"}, + {"a_unparseable_returns_empty", "abc", "16", ""}, + {"b_unparseable_returns_empty", "16", "xyz", ""}, + {"both_empty_returns_empty", "", "", ""}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + assert.Equal(t, tc.want, minNumericSpeed(tc.a, tc.b)) + }) + } +} diff --git a/components/infiniband/checker/pcie_tree_speed.go b/components/infiniband/checker/pcie_tree_speed.go index 8ed05c1e..69f4b2ec 100644 --- a/components/infiniband/checker/pcie_tree_speed.go +++ b/components/infiniband/checker/pcie_tree_speed.go @@ -18,7 +18,6 @@ package checker import ( "context" "fmt" - "math" "strconv" "strings" @@ -26,22 +25,8 @@ import ( "github.com/scitix/sichek/components/infiniband/collector" "github.com/scitix/sichek/components/infiniband/config" "github.com/scitix/sichek/consts" - "github.com/sirupsen/logrus" ) -// pcieSpeedEqual compares two PCIe speed strings ("32", "32.0", "32.00 GT/s") -// numerically so spec authors do not have to mirror sysfs's trailing-zero -// formatting verbatim. Falls back to string equality when either value is -// not parseable, preserving prior behaviour for free-form spec values. -func pcieSpeedEqual(a, b string) bool { - af, errA := strconv.ParseFloat(strings.TrimSpace(extractNumericSpeed(a)), 64) - bf, errB := strconv.ParseFloat(strings.TrimSpace(extractNumericSpeed(b)), 64) - if errA != nil || errB != nil { - return a == b - } - return math.Abs(af-bf) < 1e-9 -} - type IBPCIETreeSpeedChecker struct { id string name string @@ -89,61 +74,71 @@ func (c *IBPCIETreeSpeedChecker) Check(ctx context.Context, data any) (*common.C return &result, fmt.Errorf("fail to get the IB device") } - failedDevices := make([]string, 0) - spec := make([]string, 0, hwInfoLen) - curr := make([]string, 0, hwInfoLen) - var failedSpec []string - var failedCurr []string - infinibandInfo.RLock() hws := uniqueByDev(infinibandInfo.IBHardWareInfo) infinibandInfo.RUnlock() + + failedDevices := make([]string, 0) + failedCurr := make([]string, 0) + failedCap := make([]string, 0) + detailLines := make([]string, 0) + suggestionLines := make([]string, 0) + for _, hwInfo := range hws { - if _, ok := c.spec.HCAs[hwInfo.BoardID]; !ok { - logrus.WithField("component", "infiniband").Warnf("HCA %s not found in spec, skipping %s", hwInfo.BoardID, c.name) + if len(hwInfo.PCIETreeLinks) == 0 { + // Direct-to-CPU or sysfs unavailable; treat as normal. continue } - hcaSpec := c.spec.HCAs[hwInfo.BoardID] - // Tree-speed has its own spec field (yaml: pcie_tree_speed) because - // upstream switches and root complexes are often slower than the - // device-level link. CX8 e.g. links at PCIe Gen6 (64 GT/s) but the - // upstream Gen5 switch caps the path at 32 GT/s. Fall back to - // PCIESpeed for board specs that predate the dedicated field. - treeSpec := hcaSpec.Hardware.PCIETreeSpeedMin - if treeSpec == "" { - treeSpec = hcaSpec.Hardware.PCIESpeed - } - expectedSpeed := extractNumericSpeed(treeSpec) - spec = append(spec, treeSpec) - treeSpeedMin := hwInfo.PCIETreeSpeedMin - if treeSpeedMin == "" { - // No upstream tree info available (e.g., direct to CPU), skip - curr = append(curr, hwInfo.PCIESpeed) + // Compute the path-level capability: the minimum parseable cap across + // all links. A link whose max is unparseable is excluded from the + // path cap calculation (treated as ∞ for this purpose). If no link + // yields a parseable cap, skip the whole NIC. + pathCap := "" + for _, link := range hwInfo.PCIETreeLinks { + cap := minNumericSpeed(link.ParentMaxSpeed, link.ChildMaxSpeed) + if cap == "" { + continue + } + if pathCap == "" { + pathCap = cap + } else { + pathCap = minNumericSpeed(pathCap, cap) + } + } + if pathCap == "" { + // No parseable caps on any link; skip silently. continue } - curr = append(curr, treeSpeedMin) - // Compare numerically so "32" / "32.0" / "32.00" all match without - // requiring spec authors to keep the trailing zero in sync with sysfs. - if !pcieSpeedEqual(treeSpeedMin, expectedSpeed) { - result.Status = consts.StatusAbnormal - devInfo := fmt.Sprintf("%s(%s)", hwInfo.IBDev, hwInfo.PCIEBDF) - if hwInfo.PCIETreeSpeedMinBDF != "" { - devInfo = fmt.Sprintf("%s(%s, bottleneck@%s)", hwInfo.IBDev, hwInfo.PCIEBDF, hwInfo.PCIETreeSpeedMinBDF) + // Flag each link whose current speed falls below the path cap — these + // are the true bottlenecks. A link running at the path cap (because + // its own max matches the path cap) is expected and not flagged. + for _, link := range hwInfo.PCIETreeLinks { + if !pcieSpeedLessThan(link.CurSpeed, pathCap) { + continue } + result.Status = consts.StatusAbnormal + devInfo := fmt.Sprintf("%s(%s, bottleneck@%s->%s)", + hwInfo.IBDev, hwInfo.PCIEBDF, link.ParentBDF, link.ChildBDF) failedDevices = append(failedDevices, devInfo) - failedSpec = append(failedSpec, treeSpec) - failedCurr = append(failedCurr, treeSpeedMin) + failedCurr = append(failedCurr, link.CurSpeed) + failedCap = append(failedCap, pathCap) + detailLines = append(detailLines, fmt.Sprintf( + "%s upstream link %s->%s current %s < cap %s", + hwInfo.IBDev, link.ParentBDF, link.ChildBDF, link.CurSpeed, pathCap)) + suggestionLines = append(suggestionLines, fmt.Sprintf( + "Check upstream PCIe link %s->%s for %s, current %s is below link capability %s (min of both endpoints' max).", + link.ParentBDF, link.ChildBDF, hwInfo.IBDev, link.CurSpeed, pathCap)) } } - result.Curr = strings.Join(curr, ",") - result.Spec = strings.Join(spec, ",") + result.Curr = strings.Join(failedCurr, ",") + result.Spec = strings.Join(failedCap, ",") result.Device = strings.Join(failedDevices, ",") if len(failedDevices) != 0 { - result.Detail = fmt.Sprintf("PCIETreeSpeed check fail: %s upstream path min speed %s, expect %s", strings.Join(failedDevices, ","), strings.Join(failedCurr, ","), strings.Join(failedSpec, ",")) - result.Suggestion = fmt.Sprintf("Check upstream PCIe switch/bridge speed for %s, expected %s but found %s in path to root complex", strings.Join(failedDevices, ","), strings.Join(failedSpec, ","), strings.Join(failedCurr, ",")) + result.Detail = strings.Join(detailLines, "\n") + result.Suggestion = strings.Join(suggestionLines, "\n") } return &result, nil @@ -159,13 +154,31 @@ func extractNumericSpeed(speed string) string { return parts[0] } -// numericSpeedEqual compares two speed strings numerically to avoid -// false mismatches like "16" != "16.0". -func numericSpeedEqual(a, b string) bool { - va, errA := strconv.ParseFloat(a, 64) - vb, errB := strconv.ParseFloat(b, 64) +// pcieSpeedLessThan returns true iff a < b after extracting the leading numeric +// part of each (so "16.0 GT/s PCIe" parses as 16.0). Returns false when either +// value cannot be parsed — callers must treat "unknown" as "not less" so the +// checker stays normal on unreadable sysfs entries. +func pcieSpeedLessThan(a, b string) bool { + af, errA := strconv.ParseFloat(strings.TrimSpace(extractNumericSpeed(a)), 64) + bf, errB := strconv.ParseFloat(strings.TrimSpace(extractNumericSpeed(b)), 64) if errA != nil || errB != nil { - return a == b + return false + } + return af < bf-1e-9 +} + +// minNumericSpeed returns whichever of a or b parses to the smaller numeric +// value, preserving the raw string form. If either side is unparseable +// returns "" so the checker can skip the link rather than emit a noisy +// "unknown" comparison. +func minNumericSpeed(a, b string) string { + af, errA := strconv.ParseFloat(strings.TrimSpace(extractNumericSpeed(a)), 64) + bf, errB := strconv.ParseFloat(strings.TrimSpace(extractNumericSpeed(b)), 64) + if errA != nil || errB != nil { + return "" + } + if af <= bf { + return a } - return va == vb + return b } diff --git a/components/infiniband/checker/pcie_tree_speed_test.go b/components/infiniband/checker/pcie_tree_speed_test.go new file mode 100644 index 00000000..987e0f6f --- /dev/null +++ b/components/infiniband/checker/pcie_tree_speed_test.go @@ -0,0 +1,171 @@ +/* +Copyright 2024 The Scitix Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package checker + +import ( + "context" + "strings" + "testing" + + "github.com/scitix/sichek/components/infiniband/collector" + "github.com/scitix/sichek/components/infiniband/config" + "github.com/scitix/sichek/consts" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func buildHW(dev, bdf string, links []collector.PCIETreeLink) collector.IBHardWareInfo { + return collector.IBHardWareInfo{ + IBDev: dev, + PCIEBDF: bdf, + PCIETreeLinks: links, + } +} + +// runSpeedChecker constructs an InfinibandInfo from the given map of IBDev → +// IBHardWareInfo and runs the checker. The unexported sync.RWMutex inside +// InfinibandInfo is zero-value usable, so we don't touch it. +func runSpeedChecker(t *testing.T, hws map[string]collector.IBHardWareInfo) (string, string, string, string) { + t.Helper() + ck, err := NewIBPCIETreeSpeedChecker(&config.InfinibandSpec{}) + require.NoError(t, err) + info := &collector.InfinibandInfo{IBHardWareInfo: hws} + res, err := ck.Check(context.Background(), info) + require.NoError(t, err) + return res.Status, res.Device, res.Curr, res.Spec +} + +func TestIBPCIETreeSpeedChecker_NoLinksReportsNormal(t *testing.T) { + status, dev, curr, spec := runSpeedChecker(t, map[string]collector.IBHardWareInfo{ + "mlx5_0": buildHW("mlx5_0", "0000:82:00.0", nil), + }) + assert.Equal(t, consts.StatusNormal, status) + assert.Empty(t, dev) + assert.Empty(t, curr) + assert.Empty(t, spec) +} + +func TestIBPCIETreeSpeedChecker_FullSpeedNormal(t *testing.T) { + hw := buildHW("mlx5_0", "0000:82:00.0", []collector.PCIETreeLink{ + { + ParentBDF: "0000:80:01.0", ChildBDF: "0000:81:00.0", + CurSpeed: "32.0 GT/s PCIe", + ParentMaxSpeed: "32.0 GT/s PCIe", ChildMaxSpeed: "32.0 GT/s PCIe", + }, + { + ParentBDF: "0000:81:00.0", ChildBDF: "0000:82:00.0", + CurSpeed: "32.0 GT/s PCIe", + ParentMaxSpeed: "32.0 GT/s PCIe", ChildMaxSpeed: "32.0 GT/s PCIe", + }, + }) + status, dev, _, _ := runSpeedChecker(t, map[string]collector.IBHardWareInfo{"mlx5_0": hw}) + assert.Equal(t, consts.StatusNormal, status) + assert.Empty(t, dev) +} + +func TestIBPCIETreeSpeedChecker_RootGen4_NoAlarm(t *testing.T) { + hw := buildHW("mlx5_0", "0000:82:00.0", []collector.PCIETreeLink{ + { + ParentBDF: "0000:80:01.0", ChildBDF: "0000:81:00.0", + CurSpeed: "16.0 GT/s PCIe", + ParentMaxSpeed: "16.0 GT/s PCIe", ChildMaxSpeed: "32.0 GT/s PCIe", + }, + { + ParentBDF: "0000:81:00.0", ChildBDF: "0000:82:00.0", + CurSpeed: "16.0 GT/s PCIe", + ParentMaxSpeed: "32.0 GT/s PCIe", ChildMaxSpeed: "32.0 GT/s PCIe", + }, + }) + status, _, _, _ := runSpeedChecker(t, map[string]collector.IBHardWareInfo{"mlx5_0": hw}) + assert.Equal(t, consts.StatusNormal, status) +} + +func TestIBPCIETreeSpeedChecker_OneLinkDegraded(t *testing.T) { + hw := buildHW("mlx5_0", "0000:82:00.0", []collector.PCIETreeLink{ + { + ParentBDF: "0000:80:01.0", ChildBDF: "0000:81:00.0", + CurSpeed: "32.0 GT/s PCIe", + ParentMaxSpeed: "32.0 GT/s PCIe", ChildMaxSpeed: "32.0 GT/s PCIe", + }, + { + ParentBDF: "0000:81:00.0", ChildBDF: "0000:82:00.0", + CurSpeed: "16.0 GT/s PCIe", + ParentMaxSpeed: "32.0 GT/s PCIe", ChildMaxSpeed: "32.0 GT/s PCIe", + }, + }) + status, dev, curr, spec := runSpeedChecker(t, map[string]collector.IBHardWareInfo{"mlx5_0": hw}) + assert.Equal(t, consts.StatusAbnormal, status) + assert.Contains(t, dev, "mlx5_0(0000:82:00.0, bottleneck@0000:81:00.0->0000:82:00.0)") + assert.Equal(t, "16.0 GT/s PCIe", curr) + assert.Equal(t, "32.0 GT/s PCIe", spec) +} + +func TestIBPCIETreeSpeedChecker_MultipleLinksDegraded(t *testing.T) { + hw := buildHW("mlx5_3", "0000:c1:00.0", []collector.PCIETreeLink{ + { + ParentBDF: "0000:c0:01.0", ChildBDF: "0000:c1:00.0", + CurSpeed: "8.0 GT/s PCIe", + ParentMaxSpeed: "32.0 GT/s PCIe", ChildMaxSpeed: "32.0 GT/s PCIe", + }, + { + ParentBDF: "0000:bf:00.0", ChildBDF: "0000:c0:01.0", + CurSpeed: "16.0 GT/s PCIe", + ParentMaxSpeed: "32.0 GT/s PCIe", ChildMaxSpeed: "32.0 GT/s PCIe", + }, + }) + status, dev, curr, spec := runSpeedChecker(t, map[string]collector.IBHardWareInfo{"mlx5_3": hw}) + assert.Equal(t, consts.StatusAbnormal, status) + assert.Equal(t, 2, strings.Count(dev, "bottleneck")) + // Both currents and both caps joined by comma, order matches PCIETreeLinks order. + assert.Equal(t, "8.0 GT/s PCIe,16.0 GT/s PCIe", curr) + assert.Equal(t, "32.0 GT/s PCIe,32.0 GT/s PCIe", spec) +} + +func TestIBPCIETreeSpeedChecker_MultiNIC_OneDegraded(t *testing.T) { + good := buildHW("mlx5_0", "0000:82:00.0", []collector.PCIETreeLink{ + { + ParentBDF: "0000:80:01.0", ChildBDF: "0000:82:00.0", + CurSpeed: "32.0 GT/s PCIe", + ParentMaxSpeed: "32.0 GT/s PCIe", ChildMaxSpeed: "32.0 GT/s PCIe", + }, + }) + bad := buildHW("mlx5_3", "0000:c1:00.0", []collector.PCIETreeLink{ + { + ParentBDF: "0000:c0:01.0", ChildBDF: "0000:c1:00.0", + CurSpeed: "16.0 GT/s PCIe", + ParentMaxSpeed: "32.0 GT/s PCIe", ChildMaxSpeed: "32.0 GT/s PCIe", + }, + }) + status, dev, _, _ := runSpeedChecker(t, map[string]collector.IBHardWareInfo{ + "mlx5_0": good, + "mlx5_3": bad, + }) + assert.Equal(t, consts.StatusAbnormal, status) + assert.Contains(t, dev, "mlx5_3") + assert.NotContains(t, dev, "mlx5_0") +} + +func TestIBPCIETreeSpeedChecker_UnparseableMaxSkipped(t *testing.T) { + hw := buildHW("mlx5_0", "0000:82:00.0", []collector.PCIETreeLink{ + { + ParentBDF: "0000:80:01.0", ChildBDF: "0000:82:00.0", + CurSpeed: "16.0 GT/s PCIe", + ParentMaxSpeed: "", ChildMaxSpeed: "Unknown", + }, + }) + status, _, _, _ := runSpeedChecker(t, map[string]collector.IBHardWareInfo{"mlx5_0": hw}) + assert.Equal(t, consts.StatusNormal, status) +} diff --git a/components/infiniband/checker/pcie_tree_width.go b/components/infiniband/checker/pcie_tree_width.go index 45371382..bbd04e25 100644 --- a/components/infiniband/checker/pcie_tree_width.go +++ b/components/infiniband/checker/pcie_tree_width.go @@ -24,7 +24,6 @@ import ( "github.com/scitix/sichek/components/infiniband/collector" "github.com/scitix/sichek/components/infiniband/config" "github.com/scitix/sichek/consts" - "github.com/sirupsen/logrus" ) type IBPCIETreeWidthChecker struct { @@ -74,55 +73,68 @@ func (c *IBPCIETreeWidthChecker) Check(ctx context.Context, data any) (*common.C return &result, fmt.Errorf("fail to get the IB device") } - failedDevices := make([]string, 0) - spec := make([]string, 0, hwInfoLen) - curr := make([]string, 0, hwInfoLen) - var failedSpec []string - var failedCurr []string - infinibandInfo.RLock() hws := uniqueByDev(infinibandInfo.IBHardWareInfo) infinibandInfo.RUnlock() + + failedDevices := make([]string, 0) + failedCurr := make([]string, 0) + failedCap := make([]string, 0) + detailLines := make([]string, 0) + suggestionLines := make([]string, 0) + for _, hwInfo := range hws { - if _, ok := c.spec.HCAs[hwInfo.BoardID]; !ok { - logrus.WithField("component", "infiniband").Warnf("HCA %s not found in spec, skipping %s", hwInfo.BoardID, c.name) + if len(hwInfo.PCIETreeLinks) == 0 { + // Direct-to-CPU or sysfs unavailable; treat as normal. continue } - hcaSpec := c.spec.HCAs[hwInfo.BoardID] - // Prefer the dedicated tree spec (yaml: pcie_tree_width); fall back - // to the device-level PCIEWidth for older specs that omit it. - expectedWidth := hcaSpec.Hardware.PCIETreeWidthMin - if expectedWidth == "" { - expectedWidth = hcaSpec.Hardware.PCIEWidth - } - spec = append(spec, expectedWidth) - treeWidthMin := hwInfo.PCIETreeWidthMin - if treeWidthMin == "" { - // No upstream tree info available (e.g., direct to CPU), skip - curr = append(curr, hwInfo.PCIEWidth) + // Compute the path-level cap: minimum parseable cap across all links. + // Links whose max is unparseable are excluded from the path cap (not + // constraining the path). If no link yields a parseable cap, skip + // the whole NIC silently. + pathCap := "" + for _, link := range hwInfo.PCIETreeLinks { + cap := minNumericSpeed(link.ParentMaxWidth, link.ChildMaxWidth) + if cap == "" { + continue + } + if pathCap == "" { + pathCap = cap + } else { + pathCap = minNumericSpeed(pathCap, cap) + } + } + if pathCap == "" { continue } - curr = append(curr, treeWidthMin) - if !numericSpeedEqual(treeWidthMin, expectedWidth) { - result.Status = consts.StatusAbnormal - devInfo := fmt.Sprintf("%s(%s)", hwInfo.IBDev, hwInfo.PCIEBDF) - if hwInfo.PCIETreeWidthMinBDF != "" { - devInfo = fmt.Sprintf("%s(%s, bottleneck@%s)", hwInfo.IBDev, hwInfo.PCIEBDF, hwInfo.PCIETreeWidthMinBDF) + // Flag each link whose current width falls below the path cap. + for _, link := range hwInfo.PCIETreeLinks { + if !pcieSpeedLessThan(link.CurWidth, pathCap) { + continue } + result.Status = consts.StatusAbnormal + devInfo := fmt.Sprintf("%s(%s, bottleneck@%s->%s)", + hwInfo.IBDev, hwInfo.PCIEBDF, link.ParentBDF, link.ChildBDF) failedDevices = append(failedDevices, devInfo) - failedSpec = append(failedSpec, expectedWidth) - failedCurr = append(failedCurr, treeWidthMin) + failedCurr = append(failedCurr, link.CurWidth) + failedCap = append(failedCap, pathCap) + detailLines = append(detailLines, fmt.Sprintf( + "%s upstream link %s->%s current width x%s < cap x%s", + hwInfo.IBDev, link.ParentBDF, link.ChildBDF, link.CurWidth, pathCap)) + suggestionLines = append(suggestionLines, fmt.Sprintf( + "Check upstream PCIe link %s->%s for %s, current width x%s is below link capability x%s (min of both endpoints' max).", + link.ParentBDF, link.ChildBDF, hwInfo.IBDev, link.CurWidth, pathCap)) } } - result.Curr = strings.Join(curr, ",") - result.Spec = strings.Join(spec, ",") + result.Curr = strings.Join(failedCurr, ",") + result.Spec = strings.Join(failedCap, ",") result.Device = strings.Join(failedDevices, ",") if len(failedDevices) != 0 { - result.Detail = fmt.Sprintf("PCIETreeWidth check fail: %s upstream path min width x%s, expect x%s", strings.Join(failedDevices, ","), strings.Join(failedCurr, ","), strings.Join(failedSpec, ",")) - result.Suggestion = fmt.Sprintf("Check upstream PCIe switch/bridge width for %s, expected x%s but found x%s in path to root complex", strings.Join(failedDevices, ","), strings.Join(failedSpec, ","), strings.Join(failedCurr, ",")) + result.Detail = strings.Join(detailLines, "\n") + result.Suggestion = strings.Join(suggestionLines, "\n") } return &result, nil diff --git a/components/infiniband/checker/pcie_tree_width_test.go b/components/infiniband/checker/pcie_tree_width_test.go new file mode 100644 index 00000000..1e5e1763 --- /dev/null +++ b/components/infiniband/checker/pcie_tree_width_test.go @@ -0,0 +1,95 @@ +/* +Copyright 2024 The Scitix Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package checker + +import ( + "context" + "testing" + + "github.com/scitix/sichek/components/infiniband/collector" + "github.com/scitix/sichek/components/infiniband/config" + "github.com/scitix/sichek/consts" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func runWidthChecker(t *testing.T, hws map[string]collector.IBHardWareInfo) (string, string, string, string) { + t.Helper() + ck, err := NewIBPCIETreeWidthChecker(&config.InfinibandSpec{}) + require.NoError(t, err) + info := &collector.InfinibandInfo{IBHardWareInfo: hws} + res, err := ck.Check(context.Background(), info) + require.NoError(t, err) + return res.Status, res.Device, res.Curr, res.Spec +} + +func TestIBPCIETreeWidthChecker_NoLinksNormal(t *testing.T) { + status, dev, _, _ := runWidthChecker(t, map[string]collector.IBHardWareInfo{ + "mlx5_0": {IBDev: "mlx5_0", PCIEBDF: "0000:82:00.0", PCIETreeLinks: nil}, + }) + assert.Equal(t, consts.StatusNormal, status) + assert.Empty(t, dev) +} + +func TestIBPCIETreeWidthChecker_FullWidthNormal(t *testing.T) { + hw := collector.IBHardWareInfo{ + IBDev: "mlx5_0", PCIEBDF: "0000:82:00.0", + PCIETreeLinks: []collector.PCIETreeLink{ + { + ParentBDF: "0000:80:01.0", ChildBDF: "0000:82:00.0", + CurWidth: "16", + ParentMaxWidth: "16", ChildMaxWidth: "16", + }, + }, + } + status, dev, _, _ := runWidthChecker(t, map[string]collector.IBHardWareInfo{"mlx5_0": hw}) + assert.Equal(t, consts.StatusNormal, status) + assert.Empty(t, dev) +} + +func TestIBPCIETreeWidthChecker_OneLinkDegraded(t *testing.T) { + hw := collector.IBHardWareInfo{ + IBDev: "mlx5_0", PCIEBDF: "0000:82:00.0", + PCIETreeLinks: []collector.PCIETreeLink{ + { + ParentBDF: "0000:80:01.0", ChildBDF: "0000:82:00.0", + CurWidth: "8", + ParentMaxWidth: "16", ChildMaxWidth: "16", + }, + }, + } + status, dev, curr, spec := runWidthChecker(t, map[string]collector.IBHardWareInfo{"mlx5_0": hw}) + assert.Equal(t, consts.StatusAbnormal, status) + assert.Contains(t, dev, "mlx5_0(0000:82:00.0, bottleneck@0000:80:01.0->0000:82:00.0)") + assert.Equal(t, "8", curr) + assert.Equal(t, "16", spec) +} + +func TestIBPCIETreeWidthChecker_ParentSlotWiderThanChild_NoAlarm(t *testing.T) { + // E.g. slot is x16 but the NIC supports only x8; running at x8 is correct. + hw := collector.IBHardWareInfo{ + IBDev: "mlx5_0", PCIEBDF: "0000:82:00.0", + PCIETreeLinks: []collector.PCIETreeLink{ + { + ParentBDF: "0000:80:01.0", ChildBDF: "0000:82:00.0", + CurWidth: "8", + ParentMaxWidth: "16", ChildMaxWidth: "8", + }, + }, + } + status, _, _, _ := runWidthChecker(t, map[string]collector.IBHardWareInfo{"mlx5_0": hw}) + assert.Equal(t, consts.StatusNormal, status) +} diff --git a/components/infiniband/collector/ib_hardware_info.go b/components/infiniband/collector/ib_hardware_info.go index 40f67109..03389cd9 100644 --- a/components/infiniband/collector/ib_hardware_info.go +++ b/components/infiniband/collector/ib_hardware_info.go @@ -33,33 +33,34 @@ import ( ) type IBHardWareInfo struct { - IBDev string `json:"IBdev" yaml:"IBdev"` - Port int `json:"port,omitempty" yaml:"port,omitempty"` - NetDev string `json:"net_dev" yaml:"net_dev"` - HCAType string `json:"hca_type" yaml:"hca_type"` - SystemGUID string `json:"system_guid" yaml:"system_guid"` - NodeGUID string `json:"node_guid" yaml:"node_guid"` - PFGW string `json:"pf_gw" yaml:"pf_gw"` - VFSpec string `json:"vf_spec" yaml:"vf_spec"` - VFNum string `json:"vf_num" yaml:"vf_num"` - PhyState string `json:"phy_state" yaml:"phy_state"` - PortState string `json:"port_state" yaml:"port_state"` - LinkLayer string `json:"link_layer" yaml:"link_layer"` - NetOperstate string `json:"net_operstate" yaml:"net_operstate"` - PortSpeed string `json:"port_speed" yaml:"port_speed"` - PortSpeedState string `json:"port_speed_state" yaml:"port_speed_state"` - BoardID string `json:"board_id" yaml:"board_id"` - DeviceID string `json:"device_id" yaml:"device_id"` - PCIEBDF string `json:"pcie_bdf" yaml:"pcie_bdf"` - PCIESpeed string `json:"pcie_speed" yaml:"pcie_speed"` - PCIESpeedState string `json:"pcie_speed_state" yaml:"pcie_speed_state"` - PCIEWidth string `json:"pcie_width" yaml:"pcie_width"` - PCIEWidthState string `json:"pcie_width_state" yaml:"pcie_width_state"` - PCIETreeSpeedMin string `json:"pcie_tree_speed" yaml:"pcie_tree_speed"` - PCIETreeSpeedMinBDF string `json:"pcie_tree_speed_bdf" yaml:"pcie_tree_speed_bdf"` - PCIETreeWidthMin string `json:"pcie_tree_width" yaml:"pcie_tree_width"` - PCIETreeWidthMinBDF string `json:"pcie_tree_width_bdf" yaml:"pcie_tree_width_bdf"` - PCIEMRR string `json:"pcie_mrr" yaml:"pcie_mrr"` + IBDev string `json:"IBdev" yaml:"IBdev"` + Port int `json:"port,omitempty" yaml:"port,omitempty"` + NetDev string `json:"net_dev" yaml:"net_dev"` + HCAType string `json:"hca_type" yaml:"hca_type"` + SystemGUID string `json:"system_guid" yaml:"system_guid"` + NodeGUID string `json:"node_guid" yaml:"node_guid"` + PFGW string `json:"pf_gw" yaml:"pf_gw"` + VFSpec string `json:"vf_spec" yaml:"vf_spec"` + VFNum string `json:"vf_num" yaml:"vf_num"` + PhyState string `json:"phy_state" yaml:"phy_state"` + PortState string `json:"port_state" yaml:"port_state"` + LinkLayer string `json:"link_layer" yaml:"link_layer"` + NetOperstate string `json:"net_operstate" yaml:"net_operstate"` + PortSpeed string `json:"port_speed" yaml:"port_speed"` + PortSpeedState string `json:"port_speed_state" yaml:"port_speed_state"` + BoardID string `json:"board_id" yaml:"board_id"` + DeviceID string `json:"device_id" yaml:"device_id"` + PCIEBDF string `json:"pcie_bdf" yaml:"pcie_bdf"` + PCIESpeed string `json:"pcie_speed" yaml:"pcie_speed"` + PCIESpeedState string `json:"pcie_speed_state" yaml:"pcie_speed_state"` + PCIEWidth string `json:"pcie_width" yaml:"pcie_width"` + PCIEWidthState string `json:"pcie_width_state" yaml:"pcie_width_state"` + PCIETreeSpeedMin string `json:"pcie_tree_speed" yaml:"pcie_tree_speed"` + PCIETreeSpeedMinBDF string `json:"pcie_tree_speed_bdf" yaml:"pcie_tree_speed_bdf"` + PCIETreeWidthMin string `json:"pcie_tree_width" yaml:"pcie_tree_width"` + PCIETreeWidthMinBDF string `json:"pcie_tree_width_bdf" yaml:"pcie_tree_width_bdf"` + PCIETreeLinks []PCIETreeLink `json:"pcie_tree_links" yaml:"pcie_tree_links"` + PCIEMRR string `json:"pcie_mrr" yaml:"pcie_mrr"` // Slot string `json:"slot" yaml:"slot"` NumaNode string `json:"numa_node" yaml:"numa_node"` CPULists string `json:"cpu_lists" yaml:"cpu_lists"` @@ -119,8 +120,9 @@ func (hw *IBHardWareInfo) Collect(ctx context.Context, IBDev string, port int, i if len(GetPCIEMRR(ctx, IBDev)) >= 1 { hw.PCIEMRR = GetPCIEMRR(ctx, IBDev)[0] } - hw.PCIETreeSpeedMin, hw.PCIETreeSpeedMinBDF = GetPCIETreeMin(IBDev, "current_link_speed") - hw.PCIETreeWidthMin, hw.PCIETreeWidthMinBDF = GetPCIETreeMin(IBDev, "current_link_width") + hw.PCIETreeLinks = GetPCIETreeLinks(IBDev) + hw.PCIETreeSpeedMin, hw.PCIETreeSpeedMinBDF = minLinkCurSpeed(hw.PCIETreeLinks) + hw.PCIETreeWidthMin, hw.PCIETreeWidthMinBDF = minLinkCurWidth(hw.PCIETreeLinks) if len(GetNumaNode(IBDev)) >= 1 { hw.NumaNode = GetNumaNode(IBDev)[0] } @@ -460,3 +462,72 @@ func GetCPUList(IBDev string) []string { } return CPUList } + +// minLinkCurSpeed returns the smallest CurSpeed across links (using numeric +// comparison after parseNumericSpeed) and the ChildBDF of that link. +// Empty CurSpeed entries are skipped. Returns ("", "") if no link has a +// parseable speed. +func minLinkCurSpeed(links []PCIETreeLink) (string, string) { + var ( + bestVal string + bestBDF string + bestNum float64 + found bool + ) + for _, l := range links { + if l.CurSpeed == "" { + continue + } + num, ok := parseNumericSpeed(l.CurSpeed) + if !ok { + continue + } + if !found || num < bestNum { + bestNum = num + bestVal = l.CurSpeed + bestBDF = l.ChildBDF + found = true + } + } + return bestVal, bestBDF +} + +// minLinkCurWidth is the width counterpart of minLinkCurSpeed. +func minLinkCurWidth(links []PCIETreeLink) (string, string) { + var ( + bestVal string + bestBDF string + bestNum float64 + found bool + ) + for _, l := range links { + if l.CurWidth == "" { + continue + } + num, ok := parseNumericSpeed(l.CurWidth) + if !ok { + continue + } + if !found || num < bestNum { + bestNum = num + bestVal = l.CurWidth + bestBDF = l.ChildBDF + found = true + } + } + return bestVal, bestBDF +} + +// parseNumericSpeed reuses the same heuristic the checker uses: +// strip the unit suffix and parse the leading float. +func parseNumericSpeed(s string) (float64, bool) { + parts := strings.Fields(strings.TrimSpace(s)) + if len(parts) == 0 { + return 0, false + } + f, err := strconv.ParseFloat(parts[0], 64) + if err != nil { + return 0, false + } + return f, true +} diff --git a/components/infiniband/collector/ib_hardware_info_test.go b/components/infiniband/collector/ib_hardware_info_test.go new file mode 100644 index 00000000..8fb28ff9 --- /dev/null +++ b/components/infiniband/collector/ib_hardware_info_test.go @@ -0,0 +1,112 @@ +/* +Copyright 2024 The Scitix Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package collector + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestMinLinkCurSpeed(t *testing.T) { + cases := []struct { + name string + links []PCIETreeLink + wantVal string + wantBDF string + }{ + { + name: "empty", + links: nil, + wantVal: "", + wantBDF: "", + }, + { + name: "single_link", + links: []PCIETreeLink{ + {ParentBDF: "a", ChildBDF: "b", CurSpeed: "32.0 GT/s PCIe"}, + }, + wantVal: "32.0 GT/s PCIe", + wantBDF: "b", + }, + { + name: "lowest_wins_returns_child_bdf", + links: []PCIETreeLink{ + {ParentBDF: "a", ChildBDF: "b", CurSpeed: "32.0 GT/s PCIe"}, + {ParentBDF: "b", ChildBDF: "c", CurSpeed: "16.0 GT/s PCIe"}, + {ParentBDF: "c", ChildBDF: "d", CurSpeed: "8.0 GT/s PCIe"}, + }, + wantVal: "8.0 GT/s PCIe", + wantBDF: "d", + }, + { + name: "blank_speeds_skipped", + links: []PCIETreeLink{ + {ParentBDF: "a", ChildBDF: "b", CurSpeed: ""}, + {ParentBDF: "b", ChildBDF: "c", CurSpeed: "16.0 GT/s PCIe"}, + }, + wantVal: "16.0 GT/s PCIe", + wantBDF: "c", + }, + { + name: "all_blank", + links: []PCIETreeLink{ + {ParentBDF: "a", ChildBDF: "b", CurSpeed: ""}, + }, + wantVal: "", + wantBDF: "", + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + val, bdf := minLinkCurSpeed(tc.links) + assert.Equal(t, tc.wantVal, val) + assert.Equal(t, tc.wantBDF, bdf) + }) + } +} + +func TestMinLinkCurWidth(t *testing.T) { + cases := []struct { + name string + links []PCIETreeLink + wantVal string + wantBDF string + }{ + { + name: "empty", + links: nil, + wantVal: "", + wantBDF: "", + }, + { + name: "lowest_wins", + links: []PCIETreeLink{ + {ParentBDF: "a", ChildBDF: "b", CurWidth: "16"}, + {ParentBDF: "b", ChildBDF: "c", CurWidth: "8"}, + }, + wantVal: "8", + wantBDF: "c", + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + val, bdf := minLinkCurWidth(tc.links) + assert.Equal(t, tc.wantVal, val) + assert.Equal(t, tc.wantBDF, bdf) + }) + } +} diff --git a/components/infiniband/collector/pcie_info.go b/components/infiniband/collector/pcie_info.go index 03081441..bf10e279 100644 --- a/components/infiniband/collector/pcie_info.go +++ b/components/infiniband/collector/pcie_info.go @@ -19,7 +19,6 @@ import ( "bytes" "context" "fmt" - "math" "os" "os/exec" "path/filepath" @@ -32,9 +31,9 @@ import ( "github.com/sirupsen/logrus" ) -const ( - PCIPath = "/sys/bus/pci/devices" -) +// PCIPath is the root of the PCI sysfs tree. It is a var (not a const) so +// tests can redirect it to a t.TempDir() before exercising the collector. +var PCIPath = "/sys/bus/pci/devices" var ( targetVendorID = []string{ @@ -128,6 +127,22 @@ type PCIETreeWidthInfo struct { Width string `json:"width" yaml:"width"` } +// PCIETreeLink represents a single PCIe link on the upstream path from an +// IB device to the root complex. Each adjacent (parent, child) BDF pair in +// the readlink path is one link. Speed/width strings keep their sysfs raw +// form (e.g. "32.0 GT/s PCIe" or "16") so downstream comparisons can choose +// how to parse them. +type PCIETreeLink struct { + ParentBDF string `json:"parent_bdf" yaml:"parent_bdf"` + ChildBDF string `json:"child_bdf" yaml:"child_bdf"` + CurSpeed string `json:"cur_speed" yaml:"cur_speed"` + CurWidth string `json:"cur_width" yaml:"cur_width"` + ParentMaxSpeed string `json:"parent_max_speed" yaml:"parent_max_speed"` + ChildMaxSpeed string `json:"child_max_speed" yaml:"child_max_speed"` + ParentMaxWidth string `json:"parent_max_width" yaml:"parent_max_width"` + ChildMaxWidth string `json:"child_max_width" yaml:"child_max_width"` +} + // Collect collects PCIe tree information for a given IB device func (pcie *PCIETreeInfo) Collect(IBDev string) { pcie.PCIETreeSpeed = GetPCIETreeSpeed(IBDev) @@ -231,101 +246,75 @@ func GetPCIEMRR(ctx context.Context, IBDev string) []string { return mrr } -// GetPCIETreeMin gets minimum PCIe tree value for a given link type -// alongside the upstream BDF where that minimum was observed. The returned -// BDF identifies the actual bottleneck bridge, not the IB device's own BDF. -func GetPCIETreeMin(IBDev, linkType string) (string, string) { +// GetPCIETreeLinks enumerates every PCIe link on the upstream path of an IB +// device. Each adjacent (parent, child) BDF pair in the readlink path becomes +// one link, with current_link_{speed,width} and max_link_{speed,width} read +// from both endpoints. Direct-to-CPU devices (<2 BDFs in path) return nil. +// +// This is the spec-free replacement for GetPCIETreeMin: the checker compares +// link.CurSpeed against min(ParentMaxSpeed, ChildMaxSpeed) per link, so no +// HCA yaml expected value is needed. +func GetPCIETreeLinks(IBDev string) []PCIETreeLink { bdfList := GetIBDevBDF(IBDev) if len(bdfList) == 0 { logrus.WithField("component", "infiniband").Warnf("Could not get BDF for IB device %s", IBDev) - return "", "" + return nil } - // bdf is the BDF address of the terminal device itself - bdf := bdfList[0] + return getPCIETreeLinksByBDF(bdfList[0]) +} - devicePath := filepath.Join(PCIPath, bdf) +// getPCIETreeLinksByBDF is the testable core of GetPCIETreeLinks. It does +// the readlink + per-link sysfs walk for a single NIC BDF. Sysfs failures +// on individual files leave the corresponding fields blank rather than +// dropping the entire link, so the checker can still emit a useful message. +func getPCIETreeLinksByBDF(nicBDF string) []PCIETreeLink { + devicePath := filepath.Join(PCIPath, nicBDF) linkPath, err := os.Readlink(devicePath) if err != nil { logrus.WithField("component", "infiniband").Errorf("Failed to resolve symlink for %s: %v", devicePath, err) - return "", "" + return nil } bdfRegexPattern := `\b[0-9a-fA-F]{4}:[0-9a-fA-F]{2}:[0-9a-fA-F]{2}\.[0-7]\b` re := regexp.MustCompile(bdfRegexPattern) - - allBdfsInPath := re.FindAllString(string(linkPath), -1) - - // Filter out the device's own BDF - var upstreamBdfs []string - for _, foundBdf := range allBdfsInPath { - if foundBdf != bdf { - upstreamBdfs = append(upstreamBdfs, foundBdf) - } - } - - if len(upstreamBdfs) == 0 { - // If there are no upstream devices (e.g., device directly connected to CPU), this is normal, just return - logrus.WithField("component", "infiniband").Infof("No upstream PCIe devices found in path for %s, skipping check.", bdf) - return "", "" - } - - if len(upstreamBdfs) == 1 { - // If there's only one upstream device, it means it's directly connected to CPU, this is also normal, just return - logrus.WithField("component", "infiniband").Infof("Only one upstream PCIe device found in path for %s, likely direct to CPU, skipping check.", bdf) - return "", "" + allBdfs := re.FindAllString(linkPath, -1) + if len(allBdfs) < 2 { + logrus.WithField("component", "infiniband").Infof("No upstream PCIe link for %s (path has <2 BDFs), skipping.", nicBDF) + return nil } - logrus.WithField("component", "infiniband").Infof("Checking upstream devices for %s: %v", bdf, upstreamBdfs) - - var minNumericString string - var minBdf string - minNumericValue := math.MaxFloat64 - - // Now, we only iterate through the BDF list of upstream devices - for _, currentBdf := range upstreamBdfs { - logrus.WithField("component", "infiniband").Infof("Checking upstream device %s for property %s", currentBdf, linkType) - propertyFilePath := filepath.Join(PCIPath, currentBdf, linkType) - - propertyContents, err := GetFileCnt(propertyFilePath) - if err != nil || len(propertyContents) == 0 { - if err != nil { - logrus.WithField("component", "infiniband").Debugf("Property file '%s' is unreadable for BDF %s: %v, skipping.", linkType, currentBdf, err) - } else { - logrus.WithField("component", "infiniband").Debugf("Property file '%s' is empty for BDF %s, skipping.", linkType, currentBdf) - } - continue + links := make([]PCIETreeLink, 0, len(allBdfs)-1) + for i := 1; i < len(allBdfs); i++ { + parent := allBdfs[i-1] + child := allBdfs[i] + link := PCIETreeLink{ParentBDF: parent, ChildBDF: child} + link.CurSpeed = readSysfsString(filepath.Join(PCIPath, child, "current_link_speed")) + if link.CurSpeed == "" { + // Fall back to parent's report; the two endpoints share the link. + link.CurSpeed = readSysfsString(filepath.Join(PCIPath, parent, "current_link_speed")) } - - currentPropertyString := strings.TrimSpace(propertyContents[0]) - parts := strings.Fields(currentPropertyString) - if len(parts) == 0 { - logrus.WithField("component", "infiniband").Warnf("Malformed property string '%s' for BDF %s", currentPropertyString, currentBdf) - continue - } - numericStringPart := parts[0] - - currentNumericValue, err := strconv.ParseFloat(numericStringPart, 64) - if err != nil { - logrus.WithField("component", "infiniband").Warnf("Could not parse numeric value from '%s' in file %s", numericStringPart, propertyFilePath) - continue - } - - if currentNumericValue < minNumericValue { - minNumericValue = currentNumericValue - minNumericString = numericStringPart - minBdf = currentBdf - logrus.WithField("component", "infiniband").Debugf( - "Found new upstream minimum for %s (%s): %s (full value: '%s', at BDF: %s)", - IBDev, linkType, minNumericString, currentPropertyString, currentBdf, - ) + link.CurWidth = readSysfsString(filepath.Join(PCIPath, child, "current_link_width")) + if link.CurWidth == "" { + link.CurWidth = readSysfsString(filepath.Join(PCIPath, parent, "current_link_width")) } + link.ParentMaxSpeed = readSysfsString(filepath.Join(PCIPath, parent, "max_link_speed")) + link.ChildMaxSpeed = readSysfsString(filepath.Join(PCIPath, child, "max_link_speed")) + link.ParentMaxWidth = readSysfsString(filepath.Join(PCIPath, parent, "max_link_width")) + link.ChildMaxWidth = readSysfsString(filepath.Join(PCIPath, child, "max_link_width")) + links = append(links, link) } + return links +} - if minNumericString == "" { - logrus.WithField("component", "infiniband").Warnf("Could not determine a minimum value for property '%s' on upstream path of device %s", linkType, IBDev) +// readSysfsString reads a single-line sysfs file and trims trailing whitespace. +// Returns "" on any read error; callers downgrade gracefully. +func readSysfsString(path string) string { + data, err := os.ReadFile(path) + if err != nil { + logrus.WithField("component", "infiniband").Debugf("readSysfsString: %s: %v", path, err) + return "" } - - return minNumericString, minBdf + return strings.TrimRight(string(data), "\n\r\t ") } // GetPCIETreeSpeed gets PCIe tree speed information diff --git a/components/infiniband/collector/pcie_info_test.go b/components/infiniband/collector/pcie_info_test.go new file mode 100644 index 00000000..8b3a98cf --- /dev/null +++ b/components/infiniband/collector/pcie_info_test.go @@ -0,0 +1,245 @@ +/* +Copyright 2024 The Scitix Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package collector + +import ( + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// fakeBDF writes the four sysfs files for a single PCIe device under root. +// Empty values are skipped so a test can simulate "file missing" cases. +func fakeBDF(t *testing.T, root, bdf, curSpeed, maxSpeed, curWidth, maxWidth string) { + t.Helper() + dir := filepath.Join(root, bdf) + require.NoError(t, os.MkdirAll(dir, 0o755)) + write := func(name, val string) { + if val == "" { + return + } + require.NoError(t, os.WriteFile(filepath.Join(dir, name), []byte(val+"\n"), 0o644)) + } + write("current_link_speed", curSpeed) + write("max_link_speed", maxSpeed) + write("current_link_width", curWidth) + write("max_link_width", maxWidth) +} + +// nicEntryWithSymlink creates the symlink the collector will Readlink. The +// target is a path string that embeds pathBDFs; the collector parses the +// BDF list out of it via regex. The string need not point to a real file. +func nicEntryWithSymlink(t *testing.T, root, entryBDF string, pathBDFs []string) { + t.Helper() + parts := []string{"..", "..", "devices", "pci0000:80"} + parts = append(parts, pathBDFs...) + target := filepath.Join(parts...) + require.NoError(t, os.Symlink(target, filepath.Join(root, entryBDF))) +} + +func withPCIPath(t *testing.T, root string) { + t.Helper() + orig := PCIPath + PCIPath = root + t.Cleanup(func() { PCIPath = orig }) +} + +func TestGetPCIETreeLinksByBDF(t *testing.T) { + type setup struct { + // upstreamBDFs are written as real dirs with sysfs files. + upstream []struct { + bdf string + curSpeed string + maxSpeed string + curWidth string + maxWidth string + } + // pathBDFs is the BDF sequence (root → leaf) embedded in the symlink target. + pathBDFs []string + // nicBDF is the device passed to getPCIETreeLinksByBDF. The function reads + // / as a symlink (not a directory). + nicBDF string + } + + cases := []struct { + name string + s setup + want []PCIETreeLink + }{ + { + name: "full_speed_all_links", + s: setup{ + upstream: []struct { + bdf string + curSpeed string + maxSpeed string + curWidth string + maxWidth string + }{ + {"0000:80:01.0", "32.0 GT/s PCIe", "32.0 GT/s PCIe", "16", "16"}, + {"0000:81:00.0", "32.0 GT/s PCIe", "32.0 GT/s PCIe", "16", "16"}, + {"0000:82:00.0", "32.0 GT/s PCIe", "32.0 GT/s PCIe", "16", "16"}, + }, + pathBDFs: []string{"0000:80:01.0", "0000:81:00.0", "0000:82:00.0"}, + nicBDF: "nic_entry_full", + }, + want: []PCIETreeLink{ + { + ParentBDF: "0000:80:01.0", ChildBDF: "0000:81:00.0", + CurSpeed: "32.0 GT/s PCIe", CurWidth: "16", + ParentMaxSpeed: "32.0 GT/s PCIe", ChildMaxSpeed: "32.0 GT/s PCIe", + ParentMaxWidth: "16", ChildMaxWidth: "16", + }, + { + ParentBDF: "0000:81:00.0", ChildBDF: "0000:82:00.0", + CurSpeed: "32.0 GT/s PCIe", CurWidth: "16", + ParentMaxSpeed: "32.0 GT/s PCIe", ChildMaxSpeed: "32.0 GT/s PCIe", + ParentMaxWidth: "16", ChildMaxWidth: "16", + }, + }, + }, + { + name: "nic_link_downgraded", + s: setup{ + upstream: []struct { + bdf string + curSpeed string + maxSpeed string + curWidth string + maxWidth string + }{ + {"0000:80:01.0", "32.0 GT/s PCIe", "32.0 GT/s PCIe", "16", "16"}, + {"0000:81:00.0", "32.0 GT/s PCIe", "32.0 GT/s PCIe", "16", "16"}, + {"0000:82:00.0", "16.0 GT/s PCIe", "32.0 GT/s PCIe", "8", "16"}, + }, + pathBDFs: []string{"0000:80:01.0", "0000:81:00.0", "0000:82:00.0"}, + nicBDF: "nic_entry_down", + }, + want: []PCIETreeLink{ + { + ParentBDF: "0000:80:01.0", ChildBDF: "0000:81:00.0", + CurSpeed: "32.0 GT/s PCIe", CurWidth: "16", + ParentMaxSpeed: "32.0 GT/s PCIe", ChildMaxSpeed: "32.0 GT/s PCIe", + ParentMaxWidth: "16", ChildMaxWidth: "16", + }, + { + ParentBDF: "0000:81:00.0", ChildBDF: "0000:82:00.0", + CurSpeed: "16.0 GT/s PCIe", CurWidth: "8", + ParentMaxSpeed: "32.0 GT/s PCIe", ChildMaxSpeed: "32.0 GT/s PCIe", + ParentMaxWidth: "16", ChildMaxWidth: "16", + }, + }, + }, + { + name: "root_port_only_supports_gen4", + s: setup{ + upstream: []struct { + bdf string + curSpeed string + maxSpeed string + curWidth string + maxWidth string + }{ + {"0000:80:01.0", "16.0 GT/s PCIe", "16.0 GT/s PCIe", "16", "16"}, + {"0000:81:00.0", "16.0 GT/s PCIe", "32.0 GT/s PCIe", "16", "16"}, + {"0000:82:00.0", "16.0 GT/s PCIe", "32.0 GT/s PCIe", "16", "16"}, + }, + pathBDFs: []string{"0000:80:01.0", "0000:81:00.0", "0000:82:00.0"}, + nicBDF: "nic_entry_gen4root", + }, + want: []PCIETreeLink{ + { + ParentBDF: "0000:80:01.0", ChildBDF: "0000:81:00.0", + CurSpeed: "16.0 GT/s PCIe", CurWidth: "16", + ParentMaxSpeed: "16.0 GT/s PCIe", ChildMaxSpeed: "32.0 GT/s PCIe", + ParentMaxWidth: "16", ChildMaxWidth: "16", + }, + { + ParentBDF: "0000:81:00.0", ChildBDF: "0000:82:00.0", + CurSpeed: "16.0 GT/s PCIe", CurWidth: "16", + ParentMaxSpeed: "32.0 GT/s PCIe", ChildMaxSpeed: "32.0 GT/s PCIe", + ParentMaxWidth: "16", ChildMaxWidth: "16", + }, + }, + }, + { + name: "direct_to_cpu_returns_nil", + s: setup{ + upstream: []struct { + bdf string + curSpeed string + maxSpeed string + curWidth string + maxWidth string + }{ + {"0000:00:00.0", "32.0 GT/s PCIe", "32.0 GT/s PCIe", "16", "16"}, + }, + pathBDFs: []string{"0000:00:00.0"}, + nicBDF: "nic_entry_direct", + }, + want: nil, + }, + { + name: "missing_max_speed_on_one_node_keeps_link_with_blank_field", + s: setup{ + upstream: []struct { + bdf string + curSpeed string + maxSpeed string + curWidth string + maxWidth string + }{ + {"0000:80:01.0", "32.0 GT/s PCIe", "", "16", "16"}, + {"0000:81:00.0", "32.0 GT/s PCIe", "32.0 GT/s PCIe", "16", "16"}, + {"0000:82:00.0", "32.0 GT/s PCIe", "32.0 GT/s PCIe", "16", "16"}, + }, + pathBDFs: []string{"0000:80:01.0", "0000:81:00.0", "0000:82:00.0"}, + nicBDF: "nic_entry_missingmax", + }, + want: []PCIETreeLink{ + { + ParentBDF: "0000:80:01.0", ChildBDF: "0000:81:00.0", + CurSpeed: "32.0 GT/s PCIe", CurWidth: "16", + ParentMaxSpeed: "", ChildMaxSpeed: "32.0 GT/s PCIe", + ParentMaxWidth: "16", ChildMaxWidth: "16", + }, + { + ParentBDF: "0000:81:00.0", ChildBDF: "0000:82:00.0", + CurSpeed: "32.0 GT/s PCIe", CurWidth: "16", + ParentMaxSpeed: "32.0 GT/s PCIe", ChildMaxSpeed: "32.0 GT/s PCIe", + ParentMaxWidth: "16", ChildMaxWidth: "16", + }, + }, + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + root := t.TempDir() + withPCIPath(t, root) + for _, u := range tc.s.upstream { + fakeBDF(t, root, u.bdf, u.curSpeed, u.maxSpeed, u.curWidth, u.maxWidth) + } + nicEntryWithSymlink(t, root, tc.s.nicBDF, tc.s.pathBDFs) + + got := getPCIETreeLinksByBDF(tc.s.nicBDF) + assert.Equal(t, tc.want, got) + }) + } +} diff --git a/docs/infiniband.md b/docs/infiniband.md index ddfbb445..d3c7d0c4 100644 --- a/docs/infiniband.md +++ b/docs/infiniband.md @@ -48,15 +48,23 @@ The detection processes are categorized into three main areas: *hardware*, *soft - Criticality: critical - Suggestion: Check whether the HCA is installed in the correct PCIe slot. -#### HCA_PCIe_TREE_SPEED -- Description: Evaluates the integrity of the PCIe links between the HCAs and the motherboard to detect any issues that could impair communication. -- Criticality: critical -- Suggestion: check the PCIe switch is work properly. +#### PCIe Tree Speed/Width 检测 -#### HCA_PCIe_TREE_WIDTH -- Description: Confirms that the PCIe bandwidth settings are consistent across all HCAs, preventing disparities that could affect network performance. -- Criticality: critical -- Suggestion: check the hca card port speed properly. +`PCIETreeSpeedDownDegraded` / `PCIETreeWidthIncorrect` 不再依赖 HCA `board_id` spec +中的 `pcie_tree_speed` / `pcie_tree_width` 字段。Collector 直接从 sysfs +(`/sys/bus/pci/devices//{current,max}_link_{speed,width}`)枚举从 NIC 到 root +complex 的每一条 PCIe link,取整条路径上所有 link 的 `min(parent.max, child.max)` +的最小值作为路径理论上限;当某条 link 的 `current_link_*` 低于该上限时上报。 + +效果: + +- 多 NIC 混插场景无需补 spec;新 SKU 上线零维护。 +- 当 root port 物理上限低于 NIC(例如 root port 仅支持 Gen4 而 NIC 是 Gen5)时 + 不再误报:路径上所有 link 都按 Gen4 跑是正常的。 +- `Detail` / `Suggestion` 字段精确到出问题的具体 link,例如 `bottleneck@0000:80:01.0->0000:81:00.0`。 + +`pcie_tree_speed` / `pcie_tree_width` 字段在 HCA spec yaml 中保留为历史信息, +运行时不再读取。 #### HCA_PCIe_ACS - Description: Checks the Access Control Services (ACS) settings of the PCIe to ensure proper traffic routing and prevent potential security vulnerabilities within the pcie topology. diff --git a/docs/superpowers/plans/2026-05-18-pcie-tree-spec-free.md b/docs/superpowers/plans/2026-05-18-pcie-tree-spec-free.md new file mode 100644 index 00000000..f5efaaeb --- /dev/null +++ b/docs/superpowers/plans/2026-05-18-pcie-tree-spec-free.md @@ -0,0 +1,1507 @@ +# PCIe Tree Speed/Width spec-free 检查实施计划 + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** 把 `PCIETreeSpeedDownDegraded` 和 `PCIETreeWidthIncorrect` 两个 checker 从"按 NIC `board_id` 查 HCA spec 拿 expect 值"改成"每条上行 PCIe link 取 `min(parent.max, self.max)` 从 sysfs 自描述判定",多 NIC 混插场景下无需任何 spec 维护。 + +**Architecture:** Collector 一次扫描产出 `[]PCIETreeLink`(每条上行 link 一条记录,带 cur/max speed/width 及两端 BDF),Speed/Width 两个 checker 共用同一份数据做 per-link 比较;旧的 `PCIETreeSpeedMin*` / `PCIETreeWidthMin*` 字段保留并由 `PCIETreeLinks` 派生,保持下游 JSON/Prometheus 兼容。 + +**Tech Stack:** Go 1.23、`testify`、Linux sysfs(`/sys/bus/pci/devices//{current,max}_link_{speed,width}`)。 + +**Spec:** `docs/superpowers/specs/2026-05-18-pcie-tree-spec-free-design.md` +**Branch:** `feat/pcie-tree-spec-free` (`a9628d5`) + +--- + +## File Structure + +| 路径 | 操作 | 职责 | +| --- | --- | --- | +| `components/infiniband/collector/pcie_info.go` | Modify | `PCIPath` 转 var;新增 `PCIETreeLink` 结构、`GetPCIETreeLinks`、`getPCIETreeLinksByBDF`;删除旧 `GetPCIETreeMin` | +| `components/infiniband/collector/pcie_info_test.go` | Create | `GetPCIETreeLinks` 表驱动测试,用 `t.TempDir()` 伪造 sysfs | +| `components/infiniband/collector/ib_hardware_info.go` | Modify | `IBHardWareInfo` 增 `PCIETreeLinks`;`Collect` 调用新函数;新增 `minLinkCurSpeed`/`minLinkCurWidth` 派生旧 4 字段 | +| `components/infiniband/collector/ib_hardware_info_test.go` | Create | 验证 `minLinkCurSpeed`/`minLinkCurWidth` 派生正确 | +| `components/infiniband/checker/pcie_tree_speed.go` | Rewrite | 改用 `PCIETreeLinks` 做 per-link 比较;新增 `pcieSpeedLessThan` / `minNumericSpeed` 辅助 | +| `components/infiniband/checker/pcie_tree_speed_test.go` | Create | 表驱动 checker 测试 | +| `components/infiniband/checker/pcie_tree_width.go` | Rewrite | 与 speed 对称,复用辅助 | +| `components/infiniband/checker/pcie_tree_width_test.go` | Create | 表驱动 checker 测试 | +| `docs/infiniband.md` | Modify | 更新 PCIe Tree Speed/Width 检查的描述 | + +--- + +## Conventions + +- 测试用 `github.com/stretchr/testify/require`(错误中止)+ `github.com/stretchr/testify/assert`(断言)。 +- 表驱动子用例:`t.Run(tc.name, func(t *testing.T) { ... })`。 +- 新文件加上仓库标准的 Apache-2.0 Scitix 版权头(见 `CLAUDE.md`)。 +- 提交信息:与最近 commit 风格一致(`Feat/...` 或动词开头),**不要** 加 `Co-Authored-By: Claude...` 或 `Generated with Claude Code` 尾巴。 +- 每个 task 完成后 `go test ./components/infiniband/...` 至少跑通;最终 task 跑全量 `go test ./...`。 + +--- + +## Task 1: 让 `PCIPath` 可注入(var 改造 + 注入测试) + +**Files:** +- Modify: `components/infiniband/collector/pcie_info.go:35-37` + +- [ ] **Step 1: 把 `PCIPath` 从 const 改成 var** + +Edit `components/infiniband/collector/pcie_info.go`:把 + +```go +const ( + PCIPath = "/sys/bus/pci/devices" +) +``` + +改为 + +```go +// PCIPath is the root of the PCI sysfs tree. It is a var (not a const) so +// tests can redirect it to a t.TempDir() before exercising the collector. +var PCIPath = "/sys/bus/pci/devices" +``` + +- [ ] **Step 2: 构建确认编译通过** + +Run: `cd /root/devnet/sichek && go build ./components/infiniband/...` +Expected: `(no output)`,退出码 0。 + +- [ ] **Step 3: 提交** + +```bash +git add components/infiniband/collector/pcie_info.go +git commit -m "Refactor/infiniband: make PCIPath injectable for tests" +``` + +--- + +## Task 2: 定义 `PCIETreeLink` 结构 + +**Files:** +- Modify: `components/infiniband/collector/pcie_info.go`(在 `PCIETreeInfo` 类型附近添加) + +- [ ] **Step 1: 添加 `PCIETreeLink` 结构** + +在 `pcie_info.go` 中 `PCIETreeWidthInfo` 类型定义之后(约 line 130 之后)追加: + +```go +// PCIETreeLink represents a single PCIe link on the upstream path from an +// IB device to the root complex. Each adjacent (parent, child) BDF pair in +// the readlink path is one link. Speed/width strings keep their sysfs raw +// form (e.g. "32.0 GT/s PCIe" or "16") so downstream comparisons can choose +// how to parse them. +type PCIETreeLink struct { + ParentBDF string `json:"parent_bdf" yaml:"parent_bdf"` + ChildBDF string `json:"child_bdf" yaml:"child_bdf"` + CurSpeed string `json:"cur_speed" yaml:"cur_speed"` + CurWidth string `json:"cur_width" yaml:"cur_width"` + ParentMaxSpeed string `json:"parent_max_speed" yaml:"parent_max_speed"` + ChildMaxSpeed string `json:"child_max_speed" yaml:"child_max_speed"` + ParentMaxWidth string `json:"parent_max_width" yaml:"parent_max_width"` + ChildMaxWidth string `json:"child_max_width" yaml:"child_max_width"` +} +``` + +- [ ] **Step 2: 构建确认** + +Run: `cd /root/devnet/sichek && go build ./components/infiniband/...` +Expected: `(no output)`,退出码 0。 + +- [ ] **Step 3: 提交** + +```bash +git add components/infiniband/collector/pcie_info.go +git commit -m "Feat/infiniband: add PCIETreeLink struct" +``` + +--- + +## Task 3: 实现 `GetPCIETreeLinks` + 表驱动测试 + +**Files:** +- Create: `components/infiniband/collector/pcie_info_test.go` +- Modify: `components/infiniband/collector/pcie_info.go` + +### Step 1: 写失败测试 + +> 关键测试基础设施约定:`getPCIETreeLinksByBDF` 只对 `/` 调用 `os.Readlink` 取 symlink 目标字符串;之后用正则提取目标里所有 BDF,并在 `//` 下读 sysfs 文件。因此测试里: +> 1. 把每个上行 BDF(包括"看起来像 NIC 的"那个)写成 `//` 真实目录 + 四个 sysfs 文件; +> 2. 在 `/` 创建一条 symlink(target 字符串内嵌路径 BDF 列表),并把 `entryBDF` 作为 NIC BDF 传入函数。`entryBDF` 用区别于真实 BDF 的字符串(如 `nic_entry_full`),避免和真实目录冲突。 + +- [ ] 创建 `components/infiniband/collector/pcie_info_test.go`: + +```go +/* +Copyright 2024 The Scitix Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package collector + +import ( + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// fakeBDF writes the four sysfs files for a single PCIe device under root. +// Empty values are skipped so a test can simulate "file missing" cases. +func fakeBDF(t *testing.T, root, bdf, curSpeed, maxSpeed, curWidth, maxWidth string) { + t.Helper() + dir := filepath.Join(root, bdf) + require.NoError(t, os.MkdirAll(dir, 0o755)) + write := func(name, val string) { + if val == "" { + return + } + require.NoError(t, os.WriteFile(filepath.Join(dir, name), []byte(val+"\n"), 0o644)) + } + write("current_link_speed", curSpeed) + write("max_link_speed", maxSpeed) + write("current_link_width", curWidth) + write("max_link_width", maxWidth) +} + +// nicEntryWithSymlink creates the symlink the collector will Readlink. The +// target is a path string that embeds pathBDFs; the collector parses the +// BDF list out of it via regex. The string need not point to a real file. +func nicEntryWithSymlink(t *testing.T, root, entryBDF string, pathBDFs []string) { + t.Helper() + parts := []string{"..", "..", "devices", "pci0000:80"} + parts = append(parts, pathBDFs...) + target := filepath.Join(parts...) + require.NoError(t, os.Symlink(target, filepath.Join(root, entryBDF))) +} + +func withPCIPath(t *testing.T, root string) { + t.Helper() + orig := PCIPath + PCIPath = root + t.Cleanup(func() { PCIPath = orig }) +} + +func TestGetPCIETreeLinksByBDF(t *testing.T) { + type setup struct { + // upstreamBDFs are written as real dirs with sysfs files. + upstream []struct { + bdf string + curSpeed string + maxSpeed string + curWidth string + maxWidth string + } + // pathBDFs is the BDF sequence (root → leaf) embedded in the symlink target. + pathBDFs []string + // nicBDF is the device passed to getPCIETreeLinksByBDF. The function reads + // / as a symlink (not a directory). + nicBDF string + } + + cases := []struct { + name string + s setup + want []PCIETreeLink + }{ + { + name: "full_speed_all_links", + s: setup{ + upstream: []struct { + bdf string + curSpeed string + maxSpeed string + curWidth string + maxWidth string + }{ + {"0000:80:01.0", "32.0 GT/s PCIe", "32.0 GT/s PCIe", "16", "16"}, + {"0000:81:00.0", "32.0 GT/s PCIe", "32.0 GT/s PCIe", "16", "16"}, + {"0000:82:00.0", "32.0 GT/s PCIe", "32.0 GT/s PCIe", "16", "16"}, + }, + pathBDFs: []string{"0000:80:01.0", "0000:81:00.0", "0000:82:00.0"}, + nicBDF: "nic_entry_full", + }, + want: []PCIETreeLink{ + { + ParentBDF: "0000:80:01.0", ChildBDF: "0000:81:00.0", + CurSpeed: "32.0 GT/s PCIe", CurWidth: "16", + ParentMaxSpeed: "32.0 GT/s PCIe", ChildMaxSpeed: "32.0 GT/s PCIe", + ParentMaxWidth: "16", ChildMaxWidth: "16", + }, + { + ParentBDF: "0000:81:00.0", ChildBDF: "0000:82:00.0", + CurSpeed: "32.0 GT/s PCIe", CurWidth: "16", + ParentMaxSpeed: "32.0 GT/s PCIe", ChildMaxSpeed: "32.0 GT/s PCIe", + ParentMaxWidth: "16", ChildMaxWidth: "16", + }, + }, + }, + { + name: "nic_link_downgraded", + s: setup{ + upstream: []struct { + bdf string + curSpeed string + maxSpeed string + curWidth string + maxWidth string + }{ + {"0000:80:01.0", "32.0 GT/s PCIe", "32.0 GT/s PCIe", "16", "16"}, + {"0000:81:00.0", "32.0 GT/s PCIe", "32.0 GT/s PCIe", "16", "16"}, + {"0000:82:00.0", "16.0 GT/s PCIe", "32.0 GT/s PCIe", "8", "16"}, + }, + pathBDFs: []string{"0000:80:01.0", "0000:81:00.0", "0000:82:00.0"}, + nicBDF: "nic_entry_down", + }, + want: []PCIETreeLink{ + { + ParentBDF: "0000:80:01.0", ChildBDF: "0000:81:00.0", + CurSpeed: "32.0 GT/s PCIe", CurWidth: "16", + ParentMaxSpeed: "32.0 GT/s PCIe", ChildMaxSpeed: "32.0 GT/s PCIe", + ParentMaxWidth: "16", ChildMaxWidth: "16", + }, + { + ParentBDF: "0000:81:00.0", ChildBDF: "0000:82:00.0", + CurSpeed: "16.0 GT/s PCIe", CurWidth: "8", + ParentMaxSpeed: "32.0 GT/s PCIe", ChildMaxSpeed: "32.0 GT/s PCIe", + ParentMaxWidth: "16", ChildMaxWidth: "16", + }, + }, + }, + { + name: "root_port_only_supports_gen4", + s: setup{ + upstream: []struct { + bdf string + curSpeed string + maxSpeed string + curWidth string + maxWidth string + }{ + {"0000:80:01.0", "16.0 GT/s PCIe", "16.0 GT/s PCIe", "16", "16"}, + {"0000:81:00.0", "16.0 GT/s PCIe", "32.0 GT/s PCIe", "16", "16"}, + {"0000:82:00.0", "16.0 GT/s PCIe", "32.0 GT/s PCIe", "16", "16"}, + }, + pathBDFs: []string{"0000:80:01.0", "0000:81:00.0", "0000:82:00.0"}, + nicBDF: "nic_entry_gen4root", + }, + want: []PCIETreeLink{ + { + ParentBDF: "0000:80:01.0", ChildBDF: "0000:81:00.0", + CurSpeed: "16.0 GT/s PCIe", CurWidth: "16", + ParentMaxSpeed: "16.0 GT/s PCIe", ChildMaxSpeed: "32.0 GT/s PCIe", + ParentMaxWidth: "16", ChildMaxWidth: "16", + }, + { + ParentBDF: "0000:81:00.0", ChildBDF: "0000:82:00.0", + CurSpeed: "16.0 GT/s PCIe", CurWidth: "16", + ParentMaxSpeed: "32.0 GT/s PCIe", ChildMaxSpeed: "32.0 GT/s PCIe", + ParentMaxWidth: "16", ChildMaxWidth: "16", + }, + }, + }, + { + name: "direct_to_cpu_returns_nil", + s: setup{ + upstream: []struct { + bdf string + curSpeed string + maxSpeed string + curWidth string + maxWidth string + }{ + {"0000:00:00.0", "32.0 GT/s PCIe", "32.0 GT/s PCIe", "16", "16"}, + }, + pathBDFs: []string{"0000:00:00.0"}, + nicBDF: "nic_entry_direct", + }, + want: nil, + }, + { + name: "missing_max_speed_on_one_node_keeps_link_with_blank_field", + s: setup{ + upstream: []struct { + bdf string + curSpeed string + maxSpeed string + curWidth string + maxWidth string + }{ + {"0000:80:01.0", "32.0 GT/s PCIe", "", "16", "16"}, + {"0000:81:00.0", "32.0 GT/s PCIe", "32.0 GT/s PCIe", "16", "16"}, + {"0000:82:00.0", "32.0 GT/s PCIe", "32.0 GT/s PCIe", "16", "16"}, + }, + pathBDFs: []string{"0000:80:01.0", "0000:81:00.0", "0000:82:00.0"}, + nicBDF: "nic_entry_missingmax", + }, + want: []PCIETreeLink{ + { + ParentBDF: "0000:80:01.0", ChildBDF: "0000:81:00.0", + CurSpeed: "32.0 GT/s PCIe", CurWidth: "16", + ParentMaxSpeed: "", ChildMaxSpeed: "32.0 GT/s PCIe", + ParentMaxWidth: "16", ChildMaxWidth: "16", + }, + { + ParentBDF: "0000:81:00.0", ChildBDF: "0000:82:00.0", + CurSpeed: "32.0 GT/s PCIe", CurWidth: "16", + ParentMaxSpeed: "32.0 GT/s PCIe", ChildMaxSpeed: "32.0 GT/s PCIe", + ParentMaxWidth: "16", ChildMaxWidth: "16", + }, + }, + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + root := t.TempDir() + withPCIPath(t, root) + for _, u := range tc.s.upstream { + fakeBDF(t, root, u.bdf, u.curSpeed, u.maxSpeed, u.curWidth, u.maxWidth) + } + nicEntryWithSymlink(t, root, tc.s.nicBDF, tc.s.pathBDFs) + + got := getPCIETreeLinksByBDF(tc.s.nicBDF) + assert.Equal(t, tc.want, got) + }) + } +} +``` + +- [ ] **Step 2: 跑测试确认失败** + +Run: `cd /root/devnet/sichek && go test ./components/infiniband/collector/ -run TestGetPCIETreeLinksByBDF -v` +Expected: 编译错误 `undefined: getPCIETreeLinksByBDF`。 + +### Step 3: 实现 `getPCIETreeLinksByBDF` 和 `GetPCIETreeLinks` + +- [ ] 在 `components/infiniband/collector/pcie_info.go` 里追加(位置:放在旧的 `GetPCIETreeMin` 函数后面、`GetPCIETreeSpeed` 之前): + +```go +// GetPCIETreeLinks enumerates every PCIe link on the upstream path of an IB +// device. Each adjacent (parent, child) BDF pair in the readlink path becomes +// one link, with current_link_{speed,width} and max_link_{speed,width} read +// from both endpoints. Direct-to-CPU devices (<2 BDFs in path) return nil. +// +// This is the spec-free replacement for GetPCIETreeMin: the checker compares +// link.CurSpeed against min(ParentMaxSpeed, ChildMaxSpeed) per link, so no +// HCA yaml expected value is needed. +func GetPCIETreeLinks(IBDev string) []PCIETreeLink { + bdfList := GetIBDevBDF(IBDev) + if len(bdfList) == 0 { + logrus.WithField("component", "infiniband").Warnf("Could not get BDF for IB device %s", IBDev) + return nil + } + return getPCIETreeLinksByBDF(bdfList[0]) +} + +// getPCIETreeLinksByBDF is the testable core of GetPCIETreeLinks. It does +// the readlink + per-link sysfs walk for a single NIC BDF. Sysfs failures +// on individual files leave the corresponding fields blank rather than +// dropping the entire link, so the checker can still emit a useful message. +func getPCIETreeLinksByBDF(nicBDF string) []PCIETreeLink { + devicePath := filepath.Join(PCIPath, nicBDF) + linkPath, err := os.Readlink(devicePath) + if err != nil { + logrus.WithField("component", "infiniband").Errorf("Failed to resolve symlink for %s: %v", devicePath, err) + return nil + } + + bdfRegexPattern := `\b[0-9a-fA-F]{4}:[0-9a-fA-F]{2}:[0-9a-fA-F]{2}\.[0-7]\b` + re := regexp.MustCompile(bdfRegexPattern) + allBdfs := re.FindAllString(linkPath, -1) + if len(allBdfs) < 2 { + logrus.WithField("component", "infiniband").Infof("No upstream PCIe link for %s (path has <2 BDFs), skipping.", nicBDF) + return nil + } + + links := make([]PCIETreeLink, 0, len(allBdfs)-1) + for i := 1; i < len(allBdfs); i++ { + parent := allBdfs[i-1] + child := allBdfs[i] + link := PCIETreeLink{ParentBDF: parent, ChildBDF: child} + link.CurSpeed = readSysfsString(filepath.Join(PCIPath, child, "current_link_speed")) + if link.CurSpeed == "" { + // Fall back to parent's report; the two endpoints share the link. + link.CurSpeed = readSysfsString(filepath.Join(PCIPath, parent, "current_link_speed")) + } + link.CurWidth = readSysfsString(filepath.Join(PCIPath, child, "current_link_width")) + if link.CurWidth == "" { + link.CurWidth = readSysfsString(filepath.Join(PCIPath, parent, "current_link_width")) + } + link.ParentMaxSpeed = readSysfsString(filepath.Join(PCIPath, parent, "max_link_speed")) + link.ChildMaxSpeed = readSysfsString(filepath.Join(PCIPath, child, "max_link_speed")) + link.ParentMaxWidth = readSysfsString(filepath.Join(PCIPath, parent, "max_link_width")) + link.ChildMaxWidth = readSysfsString(filepath.Join(PCIPath, child, "max_link_width")) + links = append(links, link) + } + return links +} + +// readSysfsString reads a single-line sysfs file and trims trailing whitespace. +// Returns "" on any read error; callers downgrade gracefully. +func readSysfsString(path string) string { + data, err := os.ReadFile(path) + if err != nil { + logrus.WithField("component", "infiniband").Debugf("readSysfsString: %s: %v", path, err) + return "" + } + return strings.TrimRight(string(data), "\n\r\t ") +} +``` + +- [ ] **Step 4: 确认 import 列表里有 `os`、`regexp`、`strings`、`path/filepath`、`logrus`**(这些应该已在文件顶部,不需要新加)。 + +- [ ] **Step 5: 跑测试确认通过** + +Run: `cd /root/devnet/sichek && go test ./components/infiniband/collector/ -run TestGetPCIETreeLinksByBDF -v` +Expected: 所有 5 个子用例 PASS。 + +- [ ] **Step 6: 跑包内全部测试,确认没有回归** + +Run: `cd /root/devnet/sichek && go test ./components/infiniband/collector/` +Expected: `ok`。 + +### Step 7: 提交 + +- [ ] 提交 + +```bash +git add components/infiniband/collector/pcie_info.go components/infiniband/collector/pcie_info_test.go +git commit -m "Feat/infiniband: add GetPCIETreeLinks spec-free collector" +``` + +--- + +## Task 4: 把 `PCIETreeLinks` 接进 `IBHardWareInfo`;旧 4 字段派生 + +**Files:** +- Modify: `components/infiniband/collector/ib_hardware_info.go` +- Create: `components/infiniband/collector/ib_hardware_info_test.go` + +### Step 1: 加入字段 + +- [ ] Edit `ib_hardware_info.go`,在 `IBHardWareInfo` 结构中(line 58~61 之后)追加 `PCIETreeLinks` 字段: + +```go +PCIETreeSpeedMin string `json:"pcie_tree_speed" yaml:"pcie_tree_speed"` +PCIETreeSpeedMinBDF string `json:"pcie_tree_speed_bdf" yaml:"pcie_tree_speed_bdf"` +PCIETreeWidthMin string `json:"pcie_tree_width" yaml:"pcie_tree_width"` +PCIETreeWidthMinBDF string `json:"pcie_tree_width_bdf" yaml:"pcie_tree_width_bdf"` +PCIETreeLinks []PCIETreeLink `json:"pcie_tree_links" yaml:"pcie_tree_links"` +``` + +### Step 2: 改 `Collect` 走新函数 + 派生旧字段 + +- [ ] Edit `ib_hardware_info.go` 的 `Collect`,替换 line 122-123 的两行: + +```go +hw.PCIETreeSpeedMin, hw.PCIETreeSpeedMinBDF = GetPCIETreeMin(IBDev, "current_link_speed") +hw.PCIETreeWidthMin, hw.PCIETreeWidthMinBDF = GetPCIETreeMin(IBDev, "current_link_width") +``` + +为: + +```go +hw.PCIETreeLinks = GetPCIETreeLinks(IBDev) +hw.PCIETreeSpeedMin, hw.PCIETreeSpeedMinBDF = minLinkCurSpeed(hw.PCIETreeLinks) +hw.PCIETreeWidthMin, hw.PCIETreeWidthMinBDF = minLinkCurWidth(hw.PCIETreeLinks) +``` + +### Step 3: 写失败测试 + +- [ ] 创建 `components/infiniband/collector/ib_hardware_info_test.go`: + +```go +/* +Copyright 2024 The Scitix Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package collector + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestMinLinkCurSpeed(t *testing.T) { + cases := []struct { + name string + links []PCIETreeLink + wantVal string + wantBDF string + }{ + { + name: "empty", + links: nil, + wantVal: "", + wantBDF: "", + }, + { + name: "single_link", + links: []PCIETreeLink{ + {ParentBDF: "a", ChildBDF: "b", CurSpeed: "32.0 GT/s PCIe"}, + }, + wantVal: "32.0 GT/s PCIe", + wantBDF: "b", + }, + { + name: "lowest_wins_returns_child_bdf", + links: []PCIETreeLink{ + {ParentBDF: "a", ChildBDF: "b", CurSpeed: "32.0 GT/s PCIe"}, + {ParentBDF: "b", ChildBDF: "c", CurSpeed: "16.0 GT/s PCIe"}, + {ParentBDF: "c", ChildBDF: "d", CurSpeed: "8.0 GT/s PCIe"}, + }, + wantVal: "8.0 GT/s PCIe", + wantBDF: "d", + }, + { + name: "blank_speeds_skipped", + links: []PCIETreeLink{ + {ParentBDF: "a", ChildBDF: "b", CurSpeed: ""}, + {ParentBDF: "b", ChildBDF: "c", CurSpeed: "16.0 GT/s PCIe"}, + }, + wantVal: "16.0 GT/s PCIe", + wantBDF: "c", + }, + { + name: "all_blank", + links: []PCIETreeLink{ + {ParentBDF: "a", ChildBDF: "b", CurSpeed: ""}, + }, + wantVal: "", + wantBDF: "", + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + val, bdf := minLinkCurSpeed(tc.links) + assert.Equal(t, tc.wantVal, val) + assert.Equal(t, tc.wantBDF, bdf) + }) + } +} + +func TestMinLinkCurWidth(t *testing.T) { + cases := []struct { + name string + links []PCIETreeLink + wantVal string + wantBDF string + }{ + { + name: "empty", + links: nil, + wantVal: "", + wantBDF: "", + }, + { + name: "lowest_wins", + links: []PCIETreeLink{ + {ParentBDF: "a", ChildBDF: "b", CurWidth: "16"}, + {ParentBDF: "b", ChildBDF: "c", CurWidth: "8"}, + }, + wantVal: "8", + wantBDF: "c", + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + val, bdf := minLinkCurWidth(tc.links) + assert.Equal(t, tc.wantVal, val) + assert.Equal(t, tc.wantBDF, bdf) + }) + } +} +``` + +- [ ] **Step 4: 跑测试确认失败** + +Run: `cd /root/devnet/sichek && go test ./components/infiniband/collector/ -run "TestMinLinkCur" -v` +Expected: 编译错误 `undefined: minLinkCurSpeed`、`undefined: minLinkCurWidth`。 + +### Step 5: 实现派生函数 + +- [ ] 在 `ib_hardware_info.go` 文件末尾追加: + +```go +// minLinkCurSpeed returns the smallest CurSpeed across links (using numeric +// comparison after extractNumericSpeed) and the ChildBDF of that link. +// Empty CurSpeed entries are skipped. Returns ("", "") if no link has a +// parseable speed. +func minLinkCurSpeed(links []PCIETreeLink) (string, string) { + var ( + bestVal string + bestBDF string + bestNum float64 + found bool + ) + for _, l := range links { + if l.CurSpeed == "" { + continue + } + num, ok := parseNumericSpeed(l.CurSpeed) + if !ok { + continue + } + if !found || num < bestNum { + bestNum = num + bestVal = l.CurSpeed + bestBDF = l.ChildBDF + found = true + } + } + return bestVal, bestBDF +} + +// minLinkCurWidth is the width counterpart of minLinkCurSpeed. +func minLinkCurWidth(links []PCIETreeLink) (string, string) { + var ( + bestVal string + bestBDF string + bestNum float64 + found bool + ) + for _, l := range links { + if l.CurWidth == "" { + continue + } + num, ok := parseNumericSpeed(l.CurWidth) + if !ok { + continue + } + if !found || num < bestNum { + bestNum = num + bestVal = l.CurWidth + bestBDF = l.ChildBDF + found = true + } + } + return bestVal, bestBDF +} + +// parseNumericSpeed reuses the same heuristic the checker uses: +// strip the unit suffix and parse the leading float. +func parseNumericSpeed(s string) (float64, bool) { + parts := strings.Fields(strings.TrimSpace(s)) + if len(parts) == 0 { + return 0, false + } + f, err := strconv.ParseFloat(parts[0], 64) + if err != nil { + return 0, false + } + return f, true +} +``` + +- [ ] **Step 6: 确认 import** — `ib_hardware_info.go` 顶部需要 `strconv` 和 `strings`;`strings` 已存在,若 `strconv` 缺失则手动添加。检查: + +Run: `cd /root/devnet/sichek && head -30 components/infiniband/collector/ib_hardware_info.go` +Expected: 输出中 import 块包含 `strconv` 和 `strings`。如缺失,手动加 import。 + +### Step 7: 跑测试 + +- [ ] Run: `cd /root/devnet/sichek && go test ./components/infiniband/collector/ -run "TestMinLinkCur" -v` +Expected: 7 个子用例 PASS。 + +- [ ] Run: `cd /root/devnet/sichek && go build ./components/infiniband/...` +Expected: (no output),退出码 0。 + +- [ ] Run: `cd /root/devnet/sichek && go test ./components/infiniband/...` +Expected: 全部 PASS(包括既有 checker,因为 `PCIETreeSpeedMin` 字段语义仍是"链路最小当前速度",老 checker 的 spec 比较还能工作)。 + +### Step 8: 提交 + +- [ ] 提交 + +```bash +git add components/infiniband/collector/ib_hardware_info.go components/infiniband/collector/ib_hardware_info_test.go +git commit -m "Feat/infiniband: wire PCIETreeLinks into IBHardWareInfo, derive legacy fields" +``` + +--- + +## Task 5: Checker 共享比较辅助 `pcieSpeedLessThan` + `minNumericSpeed` + +**Files:** +- Modify: `components/infiniband/checker/pcie_tree_speed.go`(追加 helper,**暂不动主逻辑**) +- Create: `components/infiniband/checker/pcie_helpers_test.go` + +### Step 1: 写失败测试 + +- [ ] 创建 `components/infiniband/checker/pcie_helpers_test.go`: + +```go +/* +Copyright 2024 The Scitix Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package checker + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestPcieSpeedLessThan(t *testing.T) { + cases := []struct { + name string + a, b string + want bool + }{ + {"less_simple", "16", "32", true}, + {"greater", "32", "16", false}, + {"equal_no_decimals", "16", "16", false}, + {"equal_with_decimals", "32.0", "32", false}, + {"less_gt_suffix", "16.0 GT/s PCIe", "32.0 GT/s PCIe", true}, + {"unparseable_a_returns_false", "abc", "32", false}, + {"unparseable_b_returns_false", "16", "xyz", false}, + {"empty_returns_false", "", "32", false}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + assert.Equal(t, tc.want, pcieSpeedLessThan(tc.a, tc.b)) + }) + } +} + +func TestMinNumericSpeed(t *testing.T) { + cases := []struct { + name string + a, b string + want string + }{ + {"a_smaller", "16.0 GT/s PCIe", "32.0 GT/s PCIe", "16.0 GT/s PCIe"}, + {"b_smaller", "32", "16", "16"}, + {"equal_returns_a", "32", "32", "32"}, + {"a_unparseable_returns_empty", "abc", "16", ""}, + {"b_unparseable_returns_empty", "16", "xyz", ""}, + {"both_empty_returns_empty", "", "", ""}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + assert.Equal(t, tc.want, minNumericSpeed(tc.a, tc.b)) + }) + } +} +``` + +- [ ] **Step 2: 跑测试确认失败** + +Run: `cd /root/devnet/sichek && go test ./components/infiniband/checker/ -run "TestPcieSpeedLessThan|TestMinNumericSpeed" -v` +Expected: 编译错误 `undefined: pcieSpeedLessThan`、`undefined: minNumericSpeed`。 + +### Step 3: 实现 helper + +- [ ] 在 `components/infiniband/checker/pcie_tree_speed.go` 文件**末尾**追加(保留现有 `extractNumericSpeed`、`pcieSpeedEqual`、`numericSpeedEqual` 不动): + +```go +// pcieSpeedLessThan returns true iff a < b after extracting the leading numeric +// part of each (so "16.0 GT/s PCIe" parses as 16.0). Returns false when either +// value cannot be parsed — callers must treat "unknown" as "not less" so the +// checker stays normal on unreadable sysfs entries. +func pcieSpeedLessThan(a, b string) bool { + af, errA := strconv.ParseFloat(strings.TrimSpace(extractNumericSpeed(a)), 64) + bf, errB := strconv.ParseFloat(strings.TrimSpace(extractNumericSpeed(b)), 64) + if errA != nil || errB != nil { + return false + } + return af < bf-1e-9 +} + +// minNumericSpeed returns whichever of a or b parses to the smaller numeric +// value, preserving the raw string form. If either side is unparseable +// returns "" so the checker can skip the link rather than emit a noisy +// "unknown" comparison. +func minNumericSpeed(a, b string) string { + af, errA := strconv.ParseFloat(strings.TrimSpace(extractNumericSpeed(a)), 64) + bf, errB := strconv.ParseFloat(strings.TrimSpace(extractNumericSpeed(b)), 64) + if errA != nil || errB != nil { + return "" + } + if af <= bf { + return a + } + return b +} +``` + +### Step 4: 跑测试 + +- [ ] Run: `cd /root/devnet/sichek && go test ./components/infiniband/checker/ -run "TestPcieSpeedLessThan|TestMinNumericSpeed" -v` +Expected: 14 个子用例全部 PASS。 + +### Step 5: 提交 + +- [ ] 提交 + +```bash +git add components/infiniband/checker/pcie_tree_speed.go components/infiniband/checker/pcie_helpers_test.go +git commit -m "Feat/infiniband: add pcieSpeedLessThan and minNumericSpeed helpers" +``` + +--- + +## Task 6: 重写 `IBPCIETreeSpeedChecker.Check` + +**Files:** +- Rewrite: `components/infiniband/checker/pcie_tree_speed.go`(主逻辑部分) +- Create: `components/infiniband/checker/pcie_tree_speed_test.go` + +### Step 1: 写失败测试 + +- [ ] 创建 `components/infiniband/checker/pcie_tree_speed_test.go`: + +```go +/* +Copyright 2024 The Scitix Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package checker + +import ( + "context" + "strings" + "testing" + + "github.com/scitix/sichek/components/infiniband/collector" + "github.com/scitix/sichek/components/infiniband/config" + "github.com/scitix/sichek/consts" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func buildHW(dev, bdf string, links []collector.PCIETreeLink) collector.IBHardWareInfo { + return collector.IBHardWareInfo{ + IBDev: dev, + PCIEBDF: bdf, + PCIETreeLinks: links, + } +} + +// runSpeedChecker constructs an InfinibandInfo from the given map of IBDev → +// IBHardWareInfo and runs the checker. The unexported sync.RWMutex inside +// InfinibandInfo is zero-value usable, so we don't touch it. +func runSpeedChecker(t *testing.T, hws map[string]collector.IBHardWareInfo) (string, string, string, string) { + t.Helper() + ck, err := NewIBPCIETreeSpeedChecker(&config.InfinibandSpec{}) + require.NoError(t, err) + info := &collector.InfinibandInfo{IBHardWareInfo: hws} + res, err := ck.Check(context.Background(), info) + require.NoError(t, err) + return res.Status, res.Device, res.Curr, res.Spec +} + +func TestIBPCIETreeSpeedChecker_NoLinksReportsNormal(t *testing.T) { + status, dev, curr, spec := runSpeedChecker(t, map[string]collector.IBHardWareInfo{ + "mlx5_0": buildHW("mlx5_0", "0000:82:00.0", nil), + }) + assert.Equal(t, consts.StatusNormal, status) + assert.Empty(t, dev) + assert.Empty(t, curr) + assert.Empty(t, spec) +} + +func TestIBPCIETreeSpeedChecker_FullSpeedNormal(t *testing.T) { + hw := buildHW("mlx5_0", "0000:82:00.0", []collector.PCIETreeLink{ + { + ParentBDF: "0000:80:01.0", ChildBDF: "0000:81:00.0", + CurSpeed: "32.0 GT/s PCIe", + ParentMaxSpeed: "32.0 GT/s PCIe", ChildMaxSpeed: "32.0 GT/s PCIe", + }, + { + ParentBDF: "0000:81:00.0", ChildBDF: "0000:82:00.0", + CurSpeed: "32.0 GT/s PCIe", + ParentMaxSpeed: "32.0 GT/s PCIe", ChildMaxSpeed: "32.0 GT/s PCIe", + }, + }) + status, dev, _, _ := runSpeedChecker(t, map[string]collector.IBHardWareInfo{"mlx5_0": hw}) + assert.Equal(t, consts.StatusNormal, status) + assert.Empty(t, dev) +} + +func TestIBPCIETreeSpeedChecker_RootGen4_NoAlarm(t *testing.T) { + hw := buildHW("mlx5_0", "0000:82:00.0", []collector.PCIETreeLink{ + { + ParentBDF: "0000:80:01.0", ChildBDF: "0000:81:00.0", + CurSpeed: "16.0 GT/s PCIe", + ParentMaxSpeed: "16.0 GT/s PCIe", ChildMaxSpeed: "32.0 GT/s PCIe", + }, + { + ParentBDF: "0000:81:00.0", ChildBDF: "0000:82:00.0", + CurSpeed: "16.0 GT/s PCIe", + ParentMaxSpeed: "32.0 GT/s PCIe", ChildMaxSpeed: "32.0 GT/s PCIe", + }, + }) + status, _, _, _ := runSpeedChecker(t, map[string]collector.IBHardWareInfo{"mlx5_0": hw}) + assert.Equal(t, consts.StatusNormal, status) +} + +func TestIBPCIETreeSpeedChecker_OneLinkDegraded(t *testing.T) { + hw := buildHW("mlx5_0", "0000:82:00.0", []collector.PCIETreeLink{ + { + ParentBDF: "0000:80:01.0", ChildBDF: "0000:81:00.0", + CurSpeed: "32.0 GT/s PCIe", + ParentMaxSpeed: "32.0 GT/s PCIe", ChildMaxSpeed: "32.0 GT/s PCIe", + }, + { + ParentBDF: "0000:81:00.0", ChildBDF: "0000:82:00.0", + CurSpeed: "16.0 GT/s PCIe", + ParentMaxSpeed: "32.0 GT/s PCIe", ChildMaxSpeed: "32.0 GT/s PCIe", + }, + }) + status, dev, curr, spec := runSpeedChecker(t, map[string]collector.IBHardWareInfo{"mlx5_0": hw}) + assert.Equal(t, consts.StatusAbnormal, status) + assert.Contains(t, dev, "mlx5_0(0000:82:00.0, bottleneck@0000:81:00.0->0000:82:00.0)") + assert.Equal(t, "16.0 GT/s PCIe", curr) + assert.Equal(t, "32.0 GT/s PCIe", spec) +} + +func TestIBPCIETreeSpeedChecker_MultipleLinksDegraded(t *testing.T) { + hw := buildHW("mlx5_3", "0000:c1:00.0", []collector.PCIETreeLink{ + { + ParentBDF: "0000:c0:01.0", ChildBDF: "0000:c1:00.0", + CurSpeed: "8.0 GT/s PCIe", + ParentMaxSpeed: "32.0 GT/s PCIe", ChildMaxSpeed: "32.0 GT/s PCIe", + }, + { + ParentBDF: "0000:bf:00.0", ChildBDF: "0000:c0:01.0", + CurSpeed: "16.0 GT/s PCIe", + ParentMaxSpeed: "32.0 GT/s PCIe", ChildMaxSpeed: "32.0 GT/s PCIe", + }, + }) + status, dev, curr, spec := runSpeedChecker(t, map[string]collector.IBHardWareInfo{"mlx5_3": hw}) + assert.Equal(t, consts.StatusAbnormal, status) + assert.Equal(t, 2, strings.Count(dev, "bottleneck")) + // Both currents and both caps joined by comma, order matches PCIETreeLinks order. + assert.Equal(t, "8.0 GT/s PCIe,16.0 GT/s PCIe", curr) + assert.Equal(t, "32.0 GT/s PCIe,32.0 GT/s PCIe", spec) +} + +func TestIBPCIETreeSpeedChecker_MultiNIC_OneDegraded(t *testing.T) { + good := buildHW("mlx5_0", "0000:82:00.0", []collector.PCIETreeLink{ + { + ParentBDF: "0000:80:01.0", ChildBDF: "0000:82:00.0", + CurSpeed: "32.0 GT/s PCIe", + ParentMaxSpeed: "32.0 GT/s PCIe", ChildMaxSpeed: "32.0 GT/s PCIe", + }, + }) + bad := buildHW("mlx5_3", "0000:c1:00.0", []collector.PCIETreeLink{ + { + ParentBDF: "0000:c0:01.0", ChildBDF: "0000:c1:00.0", + CurSpeed: "16.0 GT/s PCIe", + ParentMaxSpeed: "32.0 GT/s PCIe", ChildMaxSpeed: "32.0 GT/s PCIe", + }, + }) + status, dev, _, _ := runSpeedChecker(t, map[string]collector.IBHardWareInfo{ + "mlx5_0": good, + "mlx5_3": bad, + }) + assert.Equal(t, consts.StatusAbnormal, status) + assert.Contains(t, dev, "mlx5_3") + assert.NotContains(t, dev, "mlx5_0") +} + +func TestIBPCIETreeSpeedChecker_UnparseableMaxSkipped(t *testing.T) { + hw := buildHW("mlx5_0", "0000:82:00.0", []collector.PCIETreeLink{ + { + ParentBDF: "0000:80:01.0", ChildBDF: "0000:82:00.0", + CurSpeed: "16.0 GT/s PCIe", + ParentMaxSpeed: "", ChildMaxSpeed: "Unknown", + }, + }) + status, _, _, _ := runSpeedChecker(t, map[string]collector.IBHardWareInfo{"mlx5_0": hw}) + assert.Equal(t, consts.StatusNormal, status) +} +``` + +- [ ] **Step 2: 跑测试确认失败** + +Run: `cd /root/devnet/sichek && go test ./components/infiniband/checker/ -run "TestIBPCIETreeSpeedChecker" -v` +Expected: 测试不通过 —— 当前 checker 仍走旧 spec 比较,结构上行为不对。 + +### Step 3: 重写 checker 主逻辑 + +- [ ] 替换 `components/infiniband/checker/pcie_tree_speed.go` 中 **`Check` 方法**整段(line 72-150)为: + +```go +func (c *IBPCIETreeSpeedChecker) Check(ctx context.Context, data any) (*common.CheckerResult, error) { + infinibandInfo, ok := data.(*collector.InfinibandInfo) + if !ok { + return nil, fmt.Errorf("invalid InfinibandInfo type") + } + + result := config.InfinibandCheckItems[c.name] + result.Status = consts.StatusNormal + + infinibandInfo.RLock() + hwInfoLen := len(infinibandInfo.IBHardWareInfo) + infinibandInfo.RUnlock() + + if hwInfoLen == 0 { + result.Status = consts.StatusAbnormal + result.Suggestion = "" + result.Detail = config.NOIBFOUND + return &result, fmt.Errorf("fail to get the IB device") + } + + infinibandInfo.RLock() + hws := uniqueByDev(infinibandInfo.IBHardWareInfo) + infinibandInfo.RUnlock() + + failedDevices := make([]string, 0) + failedCurr := make([]string, 0) + failedCap := make([]string, 0) + detailLines := make([]string, 0) + suggestionLines := make([]string, 0) + + for _, hwInfo := range hws { + if len(hwInfo.PCIETreeLinks) == 0 { + // Direct-to-CPU or sysfs unavailable; treat as normal. + continue + } + for _, link := range hwInfo.PCIETreeLinks { + cap := minNumericSpeed(link.ParentMaxSpeed, link.ChildMaxSpeed) + if cap == "" { + // At least one endpoint's max is unparseable; skip silently + // (collector already logged at debug). + continue + } + if !pcieSpeedLessThan(link.CurSpeed, cap) { + continue + } + result.Status = consts.StatusAbnormal + devInfo := fmt.Sprintf("%s(%s, bottleneck@%s->%s)", + hwInfo.IBDev, hwInfo.PCIEBDF, link.ParentBDF, link.ChildBDF) + failedDevices = append(failedDevices, devInfo) + failedCurr = append(failedCurr, link.CurSpeed) + failedCap = append(failedCap, cap) + detailLines = append(detailLines, fmt.Sprintf( + "%s upstream link %s->%s current %s < cap %s", + hwInfo.IBDev, link.ParentBDF, link.ChildBDF, link.CurSpeed, cap)) + suggestionLines = append(suggestionLines, fmt.Sprintf( + "Check upstream PCIe link %s->%s for %s, current %s is below link capability %s (min of both endpoints' max).", + link.ParentBDF, link.ChildBDF, hwInfo.IBDev, link.CurSpeed, cap)) + } + } + + result.Curr = strings.Join(failedCurr, ",") + result.Spec = strings.Join(failedCap, ",") + result.Device = strings.Join(failedDevices, ",") + if len(failedDevices) != 0 { + result.Detail = strings.Join(detailLines, "\n") + result.Suggestion = strings.Join(suggestionLines, "\n") + } + + return &result, nil +} +``` + +- [ ] 同时**删除**该文件里 line 32-43 之间的 `pcieSpeedEqual` 函数(不再被引用)。**仅当 `go build` 后报 unused 错时再删,否则保留以减少改动半径。** + +> 先不删,等所有 task 收尾时统一清理(Task 8)。 + +### Step 4: 跑测试 + +- [ ] Run: `cd /root/devnet/sichek && go test ./components/infiniband/checker/ -run "TestIBPCIETreeSpeedChecker" -v` +Expected: 7 个子用例全部 PASS。 + +- [ ] Run: `cd /root/devnet/sichek && go test ./components/infiniband/...` +Expected: 全部 PASS(width checker 仍是旧逻辑,但还能编译运行)。 + +### Step 5: 提交 + +- [ ] 提交 + +```bash +git add components/infiniband/checker/pcie_tree_speed.go components/infiniband/checker/pcie_tree_speed_test.go +git commit -m "Feat/infiniband: rewrite PCIETreeSpeedDownDegraded as per-link check" +``` + +--- + +## Task 7: 重写 `IBPCIETreeWidthChecker.Check` + +**Files:** +- Rewrite: `components/infiniband/checker/pcie_tree_width.go` +- Create: `components/infiniband/checker/pcie_tree_width_test.go` + +### Step 1: 写失败测试 + +- [ ] 创建 `components/infiniband/checker/pcie_tree_width_test.go`: + +```go +/* +Copyright 2024 The Scitix Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package checker + +import ( + "context" + "testing" + + "github.com/scitix/sichek/components/infiniband/collector" + "github.com/scitix/sichek/components/infiniband/config" + "github.com/scitix/sichek/consts" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func runWidthChecker(t *testing.T, hws map[string]collector.IBHardWareInfo) (string, string, string, string) { + t.Helper() + ck, err := NewIBPCIETreeWidthChecker(&config.InfinibandSpec{}) + require.NoError(t, err) + info := &collector.InfinibandInfo{IBHardWareInfo: hws} + res, err := ck.Check(context.Background(), info) + require.NoError(t, err) + return res.Status, res.Device, res.Curr, res.Spec +} + +func TestIBPCIETreeWidthChecker_NoLinksNormal(t *testing.T) { + status, dev, _, _ := runWidthChecker(t, map[string]collector.IBHardWareInfo{ + "mlx5_0": {IBDev: "mlx5_0", PCIEBDF: "0000:82:00.0", PCIETreeLinks: nil}, + }) + assert.Equal(t, consts.StatusNormal, status) + assert.Empty(t, dev) +} + +func TestIBPCIETreeWidthChecker_FullWidthNormal(t *testing.T) { + hw := collector.IBHardWareInfo{ + IBDev: "mlx5_0", PCIEBDF: "0000:82:00.0", + PCIETreeLinks: []collector.PCIETreeLink{ + { + ParentBDF: "0000:80:01.0", ChildBDF: "0000:82:00.0", + CurWidth: "16", + ParentMaxWidth: "16", ChildMaxWidth: "16", + }, + }, + } + status, dev, _, _ := runWidthChecker(t, map[string]collector.IBHardWareInfo{"mlx5_0": hw}) + assert.Equal(t, consts.StatusNormal, status) + assert.Empty(t, dev) +} + +func TestIBPCIETreeWidthChecker_OneLinkDegraded(t *testing.T) { + hw := collector.IBHardWareInfo{ + IBDev: "mlx5_0", PCIEBDF: "0000:82:00.0", + PCIETreeLinks: []collector.PCIETreeLink{ + { + ParentBDF: "0000:80:01.0", ChildBDF: "0000:82:00.0", + CurWidth: "8", + ParentMaxWidth: "16", ChildMaxWidth: "16", + }, + }, + } + status, dev, curr, spec := runWidthChecker(t, map[string]collector.IBHardWareInfo{"mlx5_0": hw}) + assert.Equal(t, consts.StatusAbnormal, status) + assert.Contains(t, dev, "mlx5_0(0000:82:00.0, bottleneck@0000:80:01.0->0000:82:00.0)") + assert.Equal(t, "8", curr) + assert.Equal(t, "16", spec) +} + +func TestIBPCIETreeWidthChecker_ParentSlotWiderThanChild_NoAlarm(t *testing.T) { + // E.g. slot is x16 but the NIC supports only x8; running at x8 is correct. + hw := collector.IBHardWareInfo{ + IBDev: "mlx5_0", PCIEBDF: "0000:82:00.0", + PCIETreeLinks: []collector.PCIETreeLink{ + { + ParentBDF: "0000:80:01.0", ChildBDF: "0000:82:00.0", + CurWidth: "8", + ParentMaxWidth: "16", ChildMaxWidth: "8", + }, + }, + } + status, _, _, _ := runWidthChecker(t, map[string]collector.IBHardWareInfo{"mlx5_0": hw}) + assert.Equal(t, consts.StatusNormal, status) +} +``` + +- [ ] **Step 2: 跑测试确认失败** + +Run: `cd /root/devnet/sichek && go test ./components/infiniband/checker/ -run "TestIBPCIETreeWidthChecker" -v` +Expected: 不通过(旧 checker 仍读 yaml)。 + +### Step 3: 重写 width checker + +- [ ] 替换 `components/infiniband/checker/pcie_tree_width.go` 中 `Check` 方法整段(line 57-129)为: + +```go +func (c *IBPCIETreeWidthChecker) Check(ctx context.Context, data any) (*common.CheckerResult, error) { + infinibandInfo, ok := data.(*collector.InfinibandInfo) + if !ok { + return nil, fmt.Errorf("invalid InfinibandInfo type") + } + + result := config.InfinibandCheckItems[c.name] + result.Status = consts.StatusNormal + + infinibandInfo.RLock() + hwInfoLen := len(infinibandInfo.IBHardWareInfo) + infinibandInfo.RUnlock() + + if hwInfoLen == 0 { + result.Status = consts.StatusAbnormal + result.Suggestion = "" + result.Detail = config.NOIBFOUND + return &result, fmt.Errorf("fail to get the IB device") + } + + infinibandInfo.RLock() + hws := uniqueByDev(infinibandInfo.IBHardWareInfo) + infinibandInfo.RUnlock() + + failedDevices := make([]string, 0) + failedCurr := make([]string, 0) + failedCap := make([]string, 0) + detailLines := make([]string, 0) + suggestionLines := make([]string, 0) + + for _, hwInfo := range hws { + if len(hwInfo.PCIETreeLinks) == 0 { + continue + } + for _, link := range hwInfo.PCIETreeLinks { + cap := minNumericSpeed(link.ParentMaxWidth, link.ChildMaxWidth) + if cap == "" { + continue + } + if !pcieSpeedLessThan(link.CurWidth, cap) { + continue + } + result.Status = consts.StatusAbnormal + devInfo := fmt.Sprintf("%s(%s, bottleneck@%s->%s)", + hwInfo.IBDev, hwInfo.PCIEBDF, link.ParentBDF, link.ChildBDF) + failedDevices = append(failedDevices, devInfo) + failedCurr = append(failedCurr, link.CurWidth) + failedCap = append(failedCap, cap) + detailLines = append(detailLines, fmt.Sprintf( + "%s upstream link %s->%s current width x%s < cap x%s", + hwInfo.IBDev, link.ParentBDF, link.ChildBDF, link.CurWidth, cap)) + suggestionLines = append(suggestionLines, fmt.Sprintf( + "Check upstream PCIe link %s->%s for %s, current width x%s is below link capability x%s (min of both endpoints' max).", + link.ParentBDF, link.ChildBDF, hwInfo.IBDev, link.CurWidth, cap)) + } + } + + result.Curr = strings.Join(failedCurr, ",") + result.Spec = strings.Join(failedCap, ",") + result.Device = strings.Join(failedDevices, ",") + if len(failedDevices) != 0 { + result.Detail = strings.Join(detailLines, "\n") + result.Suggestion = strings.Join(suggestionLines, "\n") + } + + return &result, nil +} +``` + +- [ ] 确认 `pcie_tree_width.go` 顶部 import 包含 `strings`、`fmt`、`common`、`collector`、`config`、`consts`、`logrus`(多数已存在;`logrus` 不再用可删——但若编译报 unused 再删)。 + +### Step 4: 跑测试 + +- [ ] Run: `cd /root/devnet/sichek && go test ./components/infiniband/checker/ -run "TestIBPCIETreeWidthChecker" -v` +Expected: 4 个子用例 PASS。 + +- [ ] Run: `cd /root/devnet/sichek && go test ./components/infiniband/...` +Expected: 全部 PASS。 + +### Step 5: 提交 + +- [ ] 提交 + +```bash +git add components/infiniband/checker/pcie_tree_width.go components/infiniband/checker/pcie_tree_width_test.go +git commit -m "Feat/infiniband: rewrite PCIETreeWidthIncorrect as per-link check" +``` + +--- + +## Task 8: 删除已废弃的 `GetPCIETreeMin`、清理未引用 helper + +**Files:** +- Modify: `components/infiniband/collector/pcie_info.go` +- Modify: `components/infiniband/checker/pcie_tree_speed.go`(清理未引用的 `pcieSpeedEqual` / `numericSpeedEqual`) +- Modify: `components/infiniband/checker/pcie_tree_width.go`(清理未引用的 `logrus` import 等) + +### Step 1: 确认无外部引用 + +- [ ] Run: `cd /root/devnet/sichek && grep -rn "GetPCIETreeMin\b" --include="*.go" | grep -v _test.go` +Expected: 仅 `components/infiniband/collector/pcie_info.go` 内部。 + +- [ ] Run: `cd /root/devnet/sichek && grep -rn "pcieSpeedEqual\b\|numericSpeedEqual\b" --include="*.go" | grep -v _test.go` +Expected: 仅在 `pcie_tree_speed.go` 内部定义;checker 主体已不再调用。 + +### Step 2: 删除函数与未引用 helper + +- [ ] 从 `components/infiniband/collector/pcie_info.go` 删除 `GetPCIETreeMin`(line 234-329 整段含 doc 注释)。 + +- [ ] 从 `components/infiniband/checker/pcie_tree_speed.go` 删除(line 31-43 的 `pcieSpeedEqual` 和 line 163-171 的 `numericSpeedEqual`),保留 `extractNumericSpeed`(仍被 helpers 使用)以及新加的 `pcieSpeedLessThan`、`minNumericSpeed`。 + +### Step 3: 修复任何编译错误(unused import 等) + +- [ ] Run: `cd /root/devnet/sichek && go build ./components/infiniband/...` +Expected: 退出码 0。如有 `imported and not used` 报错,删除对应 import 行。常见受影响 import: + - `pcie_info.go`:`math`(仅 `GetPCIETreeMin` 用过),删除 `math` import。 + - `pcie_tree_width.go`:旧版可能有 `logrus`,新版若未使用则删除该 import 行。 + +### Step 4: 全量测试 + +- [ ] Run: `cd /root/devnet/sichek && go test ./components/infiniband/...` +Expected: ok。 + +- [ ] Run: `cd /root/devnet/sichek && go vet ./components/infiniband/...` +Expected: (no output)。 + +### Step 5: 提交 + +- [ ] 提交 + +```bash +git add components/infiniband/collector/pcie_info.go components/infiniband/checker/pcie_tree_speed.go components/infiniband/checker/pcie_tree_width.go +git commit -m "Refactor/infiniband: drop legacy GetPCIETreeMin and unused checker helpers" +``` + +--- + +## Task 9: 全量构建 + 单测回归 + +**Files:** (none) + +- [ ] **Step 1: 全量构建** + +Run: `cd /root/devnet/sichek && make` +Expected: `build/bin/sichek` 生成;构建无 warning。 + +- [ ] **Step 2: 全量单测** + +Run: `cd /root/devnet/sichek && go test ./...` +Expected: 所有包 `ok`。 + +- [ ] **Step 3: vet** + +Run: `cd /root/devnet/sichek && go vet ./...` +Expected: (no output)。 + +- [ ] **Step 4: 如果 Makefile / go.mod 有未提交的非本任务改动** + +不在本计划范围;保留为 working tree 改动,不提交。 + +> 此 Task 不产生 commit。 + +--- + +## Task 10: 更新 `docs/infiniband.md` + +**Files:** +- Modify: `docs/infiniband.md` + +### Step 1: 找到对应章节 + +- [ ] Run: `grep -n -i "pcie.tree\|treespeed\|treewidth\|PCIETreeSpeed\|PCIETreeWidth" /root/devnet/sichek/docs/infiniband.md` +Expected: 若返回行号 N,则在 N 附近做修改;若返回空,则文末添加新小节"PCIe Tree Speed/Width"。 + +### Step 2: 在对应位置追加 / 替换一段说明(中文,与现有文档风格一致) + +- [ ] 添加(或替换)如下段落: + +```markdown +## PCIe Tree Speed/Width 检测 + +`PCIETreeSpeedDownDegraded` / `PCIETreeWidthIncorrect` 不再依赖 HCA `board_id` spec +中的 `pcie_tree_speed` / `pcie_tree_width` 字段。Collector 直接从 sysfs +(`/sys/bus/pci/devices//{current,max}_link_{speed,width}`)枚举从 NIC 到 root +complex 的每一条 PCIe link,对每条 link 取两端 `max_link_*` 的较小者作为该链路 +理论上限;当 `current_link_*` 低于该上限时上报。 + +效果: + +- 多 NIC 混插场景无需补 spec;新 SKU 上线零维护。 +- 当 root port 物理上限低于 NIC(例如 root port 仅支持 Gen4 而 NIC 是 Gen5)时 + 不再误报。 +- `Detail` / `Suggestion` 字段精确到出问题的具体 link,例如 `bottleneck@0000:80:01.0->0000:81:00.0`。 + +`pcie_tree_speed` / `pcie_tree_width` 字段在 HCA spec yaml 中保留为历史信息, +运行时不再读取。 +``` + +### Step 3: 提交 + +- [ ] 提交(用 `-f` 因为 `*.md` 在 .gitignore 中,但 `docs/` 下的现有文件已被跟踪——如果是修改而非新增不需要 `-f`): + +```bash +git add docs/infiniband.md +git commit -m "Docs/infiniband: explain spec-free PCIe Tree Speed/Width check" +``` + +--- + +## Summary + +10 个 task: +- Task 1-4:collector 侧(注入点 + 新结构 + 派生); +- Task 5:共享辅助函数; +- Task 6-7:两个 checker 重写; +- Task 8:清理废弃代码; +- Task 9:全量回归; +- Task 10:文档。 + +每个 task 自带表驱动测试 + 单独 commit;TDD 顺序明确(先红后绿再重构)。落地后 `docs/superpowers/specs/2026-05-18-pcie-tree-spec-free-design.md` 中所有"目标"都有对应任务覆盖。 diff --git a/docs/superpowers/specs/2026-05-18-pcie-tree-spec-free-design.md b/docs/superpowers/specs/2026-05-18-pcie-tree-spec-free-design.md new file mode 100644 index 00000000..4cf6e43b --- /dev/null +++ b/docs/superpowers/specs/2026-05-18-pcie-tree-spec-free-design.md @@ -0,0 +1,351 @@ +# PCIe Tree Speed/Width 检测 spec-free 化设计 + +- 状态:Draft(待评审) +- 范围:`components/infiniband` 中的 `PCIETreeSpeedDownDegraded` 和 `PCIETreeWidthIncorrect` 两个 checker +- 目标分支:`feat/pcie-tree-spec-free` +- 设计日期:2026-05-18 + +--- + +## 1. 背景与动机 + +目前 sichek 的 PCIe 链路类检查共有 4 项,全部依赖 `components/hca/config/default_spec.yaml` 中按 NIC `board_id` 维度填写的期望值: + +| Checker | yaml 字段(HCA spec) | ErrorName | +| --- | --- | --- | +| `CheckPCIETreeSpeed` | `pcie_tree_speed`(无则回退 `pcie_speed`) | `PCIETreeSpeedDownDegraded` | +| `CheckPCIETreeWidth` | `pcie_tree_width`(无则回退 `pcie_width`) | `PCIETreeWidthIncorrect` | +| `CheckPCIESpeed` | `pcie_speed` | `PCIELinkSpeedDownDegraded` | +| `CheckPCIEWidth` | `pcie_width` | `PCIELinkWidthIncorrect` | + +Tree Speed/Width 的判定流程(`components/infiniband/checker/pcie_tree_speed.go`、`pcie_tree_width.go`): + +1. Collector `GetPCIETreeMin` 用 `readlink /sys/bus/pci/devices/` 解出整条上行 BDF 路径,剔除 NIC 自身 BDF,对剩余每个 BDF 读 `current_link_speed` / `current_link_width`,取最小值作为 `PCIETreeSpeedMin` / `PCIETreeWidthMin`,并记录瓶颈 BDF。 +2. Checker 把该最小值与 HCA spec 里通过 `board_id` 查到的 `pcie_tree_speed` / `pcie_tree_width` 比较,不等就报 Critical。 +3. 若上行只有 0/1 个节点(直连 CPU),跳过检查。 + +存在的问题: + +- yaml 的"期望值"本质上是"该 NIC 在该机型这条 PCIe 路径上理论能跑到的最高速率",是**对硬件能力的复述**;硬件本身已经把同样信息写在 sysfs 的 `max_link_speed` / `max_link_width` 中。 +- 新机型 / 新 NIC SKU 上线要补 `board_id` 条目(OSS 下载或手工 PR),易过时、易遗漏。 +- **多 NIC 混插场景下(生产+管理 NIC、不同代次 NIC 同机)**,必须保证每张卡的 board_id 在 spec 都有项,维护成本随 SKU 数量线性增长。 +- 当 NIC = Gen5、root port = Gen4 时,yaml 通常按 NIC 能力填 Gen5,整条链路实际只能跑 Gen4 → **当前规则会假阳报警**。 + +本设计去掉 Tree Speed/Width 两个 checker 对 HCA spec 的依赖,改为完全从 sysfs 自描述地推导 expected。 + +## 2. 目标与非目标 + +### 目标 + +- 去除 `PCIETreeSpeedDownDegraded` 和 `PCIETreeWidthIncorrect` 这两个 checker 对 HCA spec 的运行时依赖。 +- 每条上行 PCIe link 独立判定降速;多 NIC 混插下无需任何 spec 维护即可生效。 +- 诊断输出从"NIC 整体最小值"细化到"哪条上行 link 降速、当前值、能力上限",提高现场可定位性。 +- 不破坏现有 Prometheus 指标 schema(label 集合不变;`spec` label 取值含义变化但维度保持)。 + +### 非目标 + +- **不动** `CheckPCIESpeed` / `CheckPCIEWidth`(NIC 自身 link 的检查),它们走 NIC 自身 `current_link_speed` vs spec 比较,下一阶段再讨论。 +- **不动** `components/hca/config/default_spec.yaml` 中现有的 `pcie_tree_speed` / `pcie_tree_width` 字段;checker 运行时直接忽略它们,yaml 留作"历史信息",不迁移不删除。 +- 不引入 spec override 逃生舱(用户已确认)。本期完全 sysfs only;若实际遇到 sysfs 不可信的平台,再单独评估是否加白名单。 +- 不改告警级别(仍是 `critical`),不改 metric 名称、不改 ErrorName。 + +## 3. 总体方案 + +``` +sysfs /sys/bus/pci/devices/ + readlink → [BDF_root, BDF_switch_up, BDF_switch_dn, BDF_nic] + │ + ▼ + for each adjacent pair (parent, child) in path: + read parent.{current,max}_link_speed/width + read child.{current,max}_link_speed/width + │ + ▼ + []PCIETreeLink { + ParentBDF, ChildBDF, + CurSpeed, CurWidth, + ParentMaxSpeed, ChildMaxSpeed, + ParentMaxWidth, ChildMaxWidth, + } + │ (附在 IBHardWareInfo 上,per-NIC) + ▼ + IBPCIETreeSpeedChecker / IBPCIETreeWidthChecker + for each link in PCIETreeLinks: + cap_speed = min(ParentMaxSpeed, ChildMaxSpeed) + if link.CurSpeed < cap_speed: 记录降速链路(多条都记) + 同理 Width +``` + +关键变化: + +- **expected = 每条 link 的 `min(parent.max, self.max)`**,不再来自 yaml。这能正确处理"root port 只支持 Gen4 而 NIC 是 Gen5"的合法场景:那条 link 的 cap 就是 Gen4,current=Gen4 时不报警。 +- collector 一次扫描产出 `[]PCIETreeLink`,**Speed/Width 两个 checker 共用同一份数据**,不会重复读 sysfs。 +- HCA spec yaml 不动;checker 不再读 `pcie_tree_speed` / `pcie_tree_width` 字段。 + +## 4. 模块改动清单 + +``` +components/infiniband/ +├── collector/ +│ ├── pcie_info.go (改:GetPCIETreeMin → GetPCIETreeLinks) +│ └── ib_hardware_info.go (改:IBHardWareInfo 新增 PCIETreeLinks) +├── checker/ +│ ├── pcie_tree_speed.go (改写:基于 PCIETreeLinks) +│ └── pcie_tree_width.go (改写:基于 PCIETreeLinks) +``` + +### 4.1 `IBHardWareInfo` 数据结构(`collector/ib_hardware_info.go`) + +新增字段: + +```go +type PCIETreeLink struct { + ParentBDF string `json:"parent_bdf" yaml:"parent_bdf"` + ChildBDF string `json:"child_bdf" yaml:"child_bdf"` + CurSpeed string `json:"cur_speed" yaml:"cur_speed"` + CurWidth string `json:"cur_width" yaml:"cur_width"` + ParentMaxSpeed string `json:"parent_max_speed" yaml:"parent_max_speed"` + ChildMaxSpeed string `json:"child_max_speed" yaml:"child_max_speed"` + ParentMaxWidth string `json:"parent_max_width" yaml:"parent_max_width"` + ChildMaxWidth string `json:"child_max_width" yaml:"child_max_width"` +} + +type IBHardWareInfo struct { + // ...existing fields... + PCIETreeSpeedMin string `json:"pcie_tree_speed" yaml:"pcie_tree_speed"` + PCIETreeSpeedMinBDF string `json:"pcie_tree_speed_bdf" yaml:"pcie_tree_speed_bdf"` + PCIETreeWidthMin string `json:"pcie_tree_width" yaml:"pcie_tree_width"` + PCIETreeWidthMinBDF string `json:"pcie_tree_width_bdf" yaml:"pcie_tree_width_bdf"` + PCIETreeLinks []PCIETreeLink `json:"pcie_tree_links" yaml:"pcie_tree_links"` // NEW +} +``` + +> **兼容策略**:保留旧 4 个字段(`PCIETreeSpeedMin / PCIETreeWidthMin / PCIETreeSpeedMinBDF / PCIETreeWidthMinBDF`),值由 `PCIETreeLinks` 派生填充(`PCIETreeSpeedMin` = `PCIETreeLinks` 中 `CurSpeed` 最小的那条,对应的 `ChildBDF` 即 `PCIETreeSpeedMinBDF`;Width 同理)。这保证:JSON snapshot 消费方、Prometheus 现有 label 维度不会破坏。 + +### 4.2 Collector:`GetPCIETreeLinks`(替换 `GetPCIETreeMin`) + +签名: + +```go +// GetPCIETreeLinks enumerates every PCIe link on the upstream path of an IB +// device. Each adjacent (parent, child) BDF pair in the readlink path becomes +// one PCIETreeLink, with current_link_{speed,width} and max_link_{speed,width} +// read from both endpoints. Direct-to-CPU devices (<2 BDFs in path) return nil. +func GetPCIETreeLinks(IBDev string) []PCIETreeLink +``` + +实现要点: + +1. `readlink /sys/bus/pci/devices/` 提取 BDF 序列 `[bdf_0, bdf_1, ..., bdf_n]`(`bdf_n` 即 NIC 自身)。 +2. 若 `len(bdfs) < 2`,返回 `nil`(保持今天"直连 CPU 跳过"的语义)。 +3. 对每对相邻 `(bdf_{i-1}, bdf_i)`: + - 读 `current_link_speed` / `current_link_width`:优先取 child(`bdf_i`)的;与 parent 不一致时打 debug log,仍以 child 为准(链路两端共享同一 link,理论上一致)。 + - 读 parent.`max_link_speed` / child.`max_link_speed`;同 Width。 +4. 任一文件读不到 → **该 link 跳过**,warn log;不阻塞其它 link。 +5. `PCIPath` 抽取为可注入参数 / 包级变量,方便 t.TempDir() 单测。 + +新增辅助 `readSysfsLine(filepath) (string, error)`(trim 换行、空文件返回错误),如果 repo 中已有类似工具复用之。 + +### 4.3 `IBHardWareInfo.Get` 派生填充旧字段 + +在 `ib_hardware_info.go` 调用 `GetPCIETreeLinks` 后,做一次 reduce: + +```go +hw.PCIETreeLinks = GetPCIETreeLinks(IBDev) +hw.PCIETreeSpeedMin, hw.PCIETreeSpeedMinBDF = minLinkCurSpeed(hw.PCIETreeLinks) +hw.PCIETreeWidthMin, hw.PCIETreeWidthMinBDF = minLinkCurWidth(hw.PCIETreeLinks) +``` + +`minLinkCurSpeed` / `minLinkCurWidth` 是新加的小工具函数:扫一遍 `PCIETreeLinks`,按数值最小的 `CurSpeed` / `CurWidth` 返回值 + 对应 `ChildBDF`;空列表返回 `("","")`。 + +### 4.4 Checker:`pcie_tree_speed.go` + +完全去掉对 `c.spec.HCAs[hwInfo.BoardID].Hardware.PCIETreeSpeedMin` / `PCIESpeed` 的读取。重写主循环: + +```go +for _, hwInfo := range uniqueByDev(infinibandInfo.IBHardWareInfo) { + if len(hwInfo.PCIETreeLinks) == 0 { + // direct-to-CPU or sysfs unavailable; treat as normal + continue + } + for _, link := range hwInfo.PCIETreeLinks { + cap := minNumericSpeed(link.ParentMaxSpeed, link.ChildMaxSpeed) + if cap == "" { + // both endpoints' max unreadable; skip silently (warn already in collector) + continue + } + if pcieSpeedLessThan(link.CurSpeed, cap) { + failedDevices = append(failedDevices, fmt.Sprintf( + "%s(%s, bottleneck@%s->%s)", + hwInfo.IBDev, hwInfo.PCIEBDF, link.ParentBDF, link.ChildBDF)) + failedCurr = append(failedCurr, link.CurSpeed) + failedCap = append(failedCap, cap) + } + } +} +``` + +- 新增 `minNumericSpeed(a, b string) string`:解析两侧数值,返回较小者的原始字符串;任意一侧解析失败返回 `""`(让 checker 跳过该 link)。 +- 新增 `pcieSpeedLessThan(a, b string) bool`:浮点解析后 `a < b`;解析失败时返回 `false`(保守,不报警)。复用 `extractNumericSpeed`。 + +`CheckerResult` 填充: + +```go +result.Status = "abnormal" // 仅当 failedDevices 非空 +result.Level = "critical" // 与现有一致 +result.Device = strings.Join(failedDevices, ",") +result.Curr = strings.Join(failedCurr, ",") +result.Spec = strings.Join(failedCap, ",") // ← 填实际 cap 值("16.0 GT/s" 等),不是 yaml +result.Detail = "" +result.Suggestion = "" +result.ErrorName = "PCIETreeSpeedDownDegraded" // 不变 +``` + +正常分支与今天保持一致:`result.Curr` 填该 NIC 整体最小 current speed(来自派生字段 `PCIETreeSpeedMin`),`result.Spec` 填 `min(所有 link cap)`(描述链路理论上限)。**告警 series 在没有降速时不会出现,这点由 metrics 层的 `ResetMetric` 保证(见 `metrics/prometheus.go:62-109`),跟今天行为一致。** + +### 4.5 Checker:`pcie_tree_width.go` + +与 §4.4 完全对称:字段名从 `Speed` 换成 `Width`(读 `link.CurWidth` / `ParentMaxWidth` / `ChildMaxWidth`),数值比较**复用同一个 `pcieSpeedLessThan` 辅助函数**——`extractNumericSpeed` + `strconv.ParseFloat` 对 width 字符串(`"16"` / `"8"`)同样成立。`ErrorName` 维持 `PCIETreeWidthIncorrect`。 + +## 5. 算法细节与边界情况 + +### 5.1 数值比较 + +复用现有 `extractNumericSpeed`:把 `"32.0 GT/s PCIe"` / `"32"` / `"32.00"` 等统一抽出首段数字字符串,再 `strconv.ParseFloat`。 + +```go +func pcieSpeedLessThan(a, b string) bool { + af, errA := strconv.ParseFloat(extractNumericSpeed(a), 64) + bf, errB := strconv.ParseFloat(extractNumericSpeed(b), 64) + if errA != nil || errB != nil { + return false // unknown → not less; checker stays normal + } + return af < bf-1e-9 // 用一个小 epsilon 避免浮点误判 +} +``` + +Width 解析路径直接走整数解析(`current_link_width` 是纯数字 "16" / "8")。 + +### 5.2 边界情况一览 + +| 场景 | 行为 | +| --- | --- | +| `readlink` 路径 < 2 个 BDF(NIC 直连 CPU) | `PCIETreeLinks=nil`,checker 跳过该 NIC,不报警 | +| 某条 link 的 `current_link_speed` 文件读不到 | 该 link 不进 `PCIETreeLinks`,collector warn log,不报警 | +| 某条 link 的 `max_link_speed` 任一侧读不到 | 该 link 在 collector 仍可收集 current,但 checker 阶段 `cap=""` 直接跳过;不报警 | +| 某 BDF 的 `max_link_speed` 解析失败("Unknown" / 非数字) | 同上 | +| NIC = Gen5、root port = Gen4 → 实际跑 Gen4 | root↔switch 这条 link 的 `cap = min(Gen5, Gen4) = Gen4`,current = Gen4 → 不报警(修复今天的假阳) | +| 多条 link 同时降速 | 全部进 `failedDevices`,`Device` 多条 `bottleneck@parent->child` 并列;`Curr` / `Spec` 同步多条拼接 | +| 同一节点多张 NIC(混插 CX-7 + CX-6) | 每张 NIC 独立扫树独立判定,无干扰 | +| Bond IB / 虚拟函数 / `mezz` 卡 | 已在 `GetIBPFBoardIDs` 阶段过滤,本设计无变化 | +| 读到 `max_link_speed = 0` / 负值 | collector 视为异常,warn log,该 link 不进结果 | +| parent 与 child 报告的 `current_link_speed` 不一致 | 取 child 的,打 debug log(不抛错);保留诊断空间 | + +### 5.3 输出示例 + +正常: +``` +status: normal +curr: 16.0 GT/s (派生自 PCIETreeSpeedMin) +spec: 16.0 GT/s (链路理论 cap) +``` + +降速(mlx5_3 root↔switch 这条 link 跑到 Gen3,而两端 max 都是 Gen5): +``` +status: abnormal +level: critical +error_name: PCIETreeSpeedDownDegraded +device: mlx5_3(0000:c1:00.0, bottleneck@0000:80:01.0->0000:81:00.0) +curr: 8.0 GT/s +spec: 32.0 GT/s +detail: mlx5_3 upstream link 0000:80:01.0->0000:81:00.0 current 8.0 GT/s < cap 32.0 GT/s +suggestion: Check upstream PCIe bridge/switch link 0000:80:01.0->0000:81:00.0 for mlx5_3, + current 8.0 GT/s is below link capability 32.0 GT/s (min of both endpoints' max). +``` + +多条降速: +``` +device: mlx5_3(...,bottleneck@A->B), mlx5_3(...,bottleneck@C->D) +curr: 8.0 GT/s,16.0 GT/s +spec: 32.0 GT/s,32.0 GT/s +detail: <两行> +``` + +## 6. 兼容性 / 风险 + +### 6.1 现有 Prometheus 标签的 churn + +`metrics/prometheus.go` 把 `CheckerResult` 的字段当 label。`spec` label 这次从 yaml 字面量(按 NIC 型号大致只有几种取值)变成 sysfs 派生的 cap 值("16.0 GT/s" / "32.0 GT/s" 等),维度数没变,但**取值集合可能更分散**,且老 series 会因 label 值变化"消失"、出现新 series。 + +**应对**: +- PR 描述里 highlight 这一行为变化,提示监控/告警面板侧 review 报警规则中是否用到 `spec` label 的具体值。 +- 若有过强依赖,可以考虑把 `spec` label 改为统一字符串 `"link-cap"` 等占位值;但失去诊断价值,**默认不采用**。 + +### 6.2 sysfs `max_link_speed` 不准的平台 + +少数老平台 / 虚拟化场景 root port 的 `max_link_speed` 可能为空 / "Unknown"。本设计选择 **静默跳过**(不报警,warn log),是为了避免假阳。代价是这些场景下"真降速"也无法识别 → 假阴。 + +**应对**:上线后跑一份现网 sample(`for d in /sys/bus/pci/devices/*/; do echo $d; cat $d/{current,max}_link_speed; done` 在各机型上采样),核对覆盖率;如发现某 vendor:device 普遍不报 `max_link_speed`,再考虑加例外路径。 + +### 6.3 sysfs 调用次数放大 + +每条 link 4 个 sysfs 文件 vs 今天每个上行 BDF 1 个文件。8 NIC * 平均 3 跳 * 4 文件 = 96 次读,比今天约 8 * 3 * 1 = 24 次多 4 倍。10 s 周期下 sysfs 小文件读完全可忽略。 + +### 6.4 JSON snapshot schema 兼容 + +`IBHardWareInfo` 新增 `PCIETreeLinks` 字段(数组,包含 8 个 string 字段的对象)。若 sichek-collector 后端有强 schema 校验,需先同步;否则解析方一般忽略未知字段。**老 4 个字段保留**意味着已有消费方不会断。 + +### 6.5 yaml 字段未来如何处理 + +本期不动 `pcie_tree_speed` / `pcie_tree_width`。后续若 Link Speed/Width 也走 spec-free,再统一从 spec 中清理;记 follow-up issue 即可。 + +## 7. 测试计划 + +### 7.1 collector 单测(`collector/pcie_info_test.go`,新增) + +要求把 `PCIPath` 从硬编码常量改为可注入(包级变量 + setter,或 `GetPCIETreeLinks` 接收 root 参数)。这是测试基础设施的小重构。 + +测试用例: + +1. **全链路全速**:3 个 BDF(root→switch→NIC),两条 link 两端 max 均 = 32,current=32 → `len(PCIETreeLinks)=2`,每条 cap=32、cur=32。 +2. **switch↔NIC 降速**:current=16,两端 max=32 → 该 link cur=16、cap=32。 +3. **root port 只支持 Gen4**:parent.max=16、child.max=32、current=16 → 该 link cur=16、cap=16(关键场景:不应被 checker 报警)。 +4. **switch 的 `max_link_speed` 文件缺失** → 该 link 在 collector 输出中 `ParentMaxSpeed` 为 ""(视具体实现也可能整个 link 跳过),checker 阶段安全降级。 +5. **NIC 直连 CPU**(路径只有 NIC + 1 个 root port,少于 2 个 upstream) → 返回 nil。 +6. **多条 link 同时降速** → 都进入 `PCIETreeLinks`。 +7. **parent 与 child 的 current 报不一致** → 取 child;不抛错。 + +### 7.2 checker 单测(`checker/pcie_tree_speed_test.go`、`pcie_tree_width_test.go`) + +直接构造 `IBHardWareInfo.PCIETreeLinks` 输入,**不涉及 sysfs**: + +1. `PCIETreeLinks` 为空 → `status=normal`。 +2. 1 条 link 降速 → `status=abnormal`,`Device`/`Curr`/`Spec`/`Detail` 全字段 assert。 +3. 多条 link 降速 → `Device` 多段、`Detail` 多行。 +4. 多 NIC:A 全速、B 一条 link 降速 → 只 B 在 `failedDevices`。 +5. `cap` 任一侧无法解析 → 该 link 静默跳过,不报警。 +6. `current` 与 `cap` 等值(边界)→ 不报警(用 `<` 不用 `<=`)。 + +### 7.3 真机回归 + +按 `sichek-field-regression` skill 流程:在至少 1 台多 NIC 节点上: + +1. 跑 `./build/bin/sichek infiniband`,对比 stdout 中 `PCIETreeSpeedDownDegraded` / `PCIETreeWidthIncorrect` 输出。 +2. `lspci -vv | grep -E "LnkSta:|LnkCap:"` 抓取每个 PCIe 设备的当前/能力速率,与 sichek 的 Detail 对照,验证 cap 值取自正确的 endpoint。 +3. (如可能)人为制造一条降速 link(更换 cable、PCIe slot 切到旁边 slot)并验证报警与 Detail 准确。 + +## 8. 落地步骤(不约束顺序,给 writing-plans 排) + +1. collector 重构:`GetPCIETreeMin` → `GetPCIETreeLinks`,`PCIPath` 可注入,写表驱动单测。 +2. `IBHardWareInfo` 新增 `PCIETreeLinks`;老 4 个字段从 `PCIETreeLinks` 派生填充。 +3. `pcie_tree_speed.go` / `pcie_tree_width.go` 改写并写单测。 +4. `result.Spec` 填实际 cap 值;`Detail` / `Suggestion` 文案落到上面 §5.3 示例。 +5. dev 机真机跑一遍 sichek,对照 `lspci -vv`。 +6. 更新 `docs/infiniband.md` 中 PCIe Tree 检查的描述。 +7. PR 描述中明确点出 Prometheus `spec` label 取值变化,提示监控侧 review。 + +## 9. 不在本设计范围、但记为 follow-up + +- `CheckPCIESpeed` / `CheckPCIEWidth`(NIC 自身 link 检查)下一阶段同样走 spec-free(直接用 NIC 自身的 `current_link_speed` vs `max_link_speed`)。 +- `default_spec.yaml` 中 `pcie_tree_speed` / `pcie_tree_width` 字段的最终清理(先观察一个版本,确认无消费方再删)。 +- 若现网采样发现确有"sysfs `max_link_speed` 不准"的平台,再单独评估白名单 / fallback 路径。