Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
2eedd91
fix(transceiver): concurrent collect and BDF dedup for RoCE environments
Apr 23, 2026
2702544
fix(transceiver): skip SR-IOV VF ethernet interfaces and cap worker pool
Apr 23, 2026
36e5674
feat(consts): differentiate alert colors by severity level
Apr 23, 2026
4e87b40
docs: add sichek-collector design spec
Apr 23, 2026
5c57d6c
docs: add implementation plans for sichek-collector and reporter module
Apr 23, 2026
5fc3304
feat(service): reporter config loader with defaults
Apr 24, 2026
eefd0e9
feat(service): Reporter.pushOnce with gzip + retry
Apr 24, 2026
604d4f8
feat(service): Reporter.Run periodic loop with panic recover
Apr 24, 2026
ac6bc83
feat(service): ResolveNodeName prefers NODE_NAME env
Apr 24, 2026
8eecbc1
feat(service): wire Reporter into DaemonService lifecycle
Apr 24, 2026
1bfc9a7
feat(config): add reporter block (disabled by default)
Apr 24, 2026
da2466e
fix(infiniband): show upstream BDF in PCIETreeWidth/Speed checks
Apr 30, 2026
0288568
fix(infiniband): raise PCIETreeSpeed/Width severity to Critical
Apr 30, 2026
48e476b
feat(infiniband): per-port collection for multi-plane HCAs (CX8 4P RoCE)
May 7, 2026
b553058
fix(infiniband): pcie tree checkers read tree spec, not link spec
May 7, 2026
3d2a026
fix(infiniband): no panic / readable error when init fails
May 7, 2026
f67e991
chore(infiniband): drop mezz_* from zy spec ib_devs
May 7, 2026
2dc1f33
Merge origin/feat/roce-multiplane into feat/roce-multiplane-bundle
May 7, 2026
621f965
Merge origin/fix/ib-pcie-tree-show-upstream-bdf into feat/roce-multiplane-bundle
May 7, 2026
4b64bab
Merge origin/feat/sichek-collector into feat/roce-multiplane-bundle
May 7, 2026
c3fc1f3
Merge origin/feat/alert-color-by-level into feat/roce-multiplane-bundle
May 7, 2026
4c931d0
Merge origin/fix/transceiver-roce-concurrent-collect into feat/roce-multiplane-bundle
May 7, 2026
d2a69fe
Merge origin/main (sichek config sync) into feat/roce-multiplane-bundle
May 7, 2026
128c3f0
Merge branch 'scitix:main' into main
lzi-a11y May 8, 2026
b39de21
Merge branch 'scitix:main' into main
lzi-a11y May 18, 2026
a9628d5
docs(superpowers): design pcie-tree spec-free check
May 18, 2026
e92c5a0
docs(superpowers): plan PCIe Tree spec-free implementation
May 18, 2026
6cbbaf3
Refactor/infiniband: make PCIPath injectable for tests
May 18, 2026
63f3b7f
Feat/infiniband: add PCIETreeLink struct
May 18, 2026
b765f67
Feat/infiniband: add GetPCIETreeLinks spec-free collector
May 18, 2026
becec81
Feat/infiniband: wire PCIETreeLinks into IBHardWareInfo, derive legac…
May 18, 2026
b8b8397
Style/infiniband: gofmt ib_hardware_info.go and its test
May 18, 2026
423d39c
Feat/infiniband: add pcieSpeedLessThan and minNumericSpeed helpers
May 18, 2026
fc3ef6c
Feat/infiniband: rewrite PCIETreeSpeedDownDegraded as per-link check
May 18, 2026
32d320d
Feat/infiniband: rewrite PCIETreeWidthIncorrect as per-link check
May 18, 2026
5e2a5bf
Refactor/infiniband: drop legacy GetPCIETreeMin and unused checker he…
May 18, 2026
0ddc98a
Docs/infiniband: explain spec-free PCIe Tree Speed/Width check
May 18, 2026
94de218
Docs/infiniband: fix PCIe Tree section heading level
May 18, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 64 additions & 0 deletions components/infiniband/checker/pcie_helpers_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/*
Copyright 2024 The Scitix Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package checker

import (
"testing"

"github.com/stretchr/testify/assert"
)

func TestPcieSpeedLessThan(t *testing.T) {
cases := []struct {
name string
a, b string
want bool
}{
{"less_simple", "16", "32", true},
{"greater", "32", "16", false},
{"equal_no_decimals", "16", "16", false},
{"equal_with_decimals", "32.0", "32", false},
{"less_gt_suffix", "16.0 GT/s PCIe", "32.0 GT/s PCIe", true},
{"unparseable_a_returns_false", "abc", "32", false},
{"unparseable_b_returns_false", "16", "xyz", false},
{"empty_returns_false", "", "32", false},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
assert.Equal(t, tc.want, pcieSpeedLessThan(tc.a, tc.b))
})
}
}

func TestMinNumericSpeed(t *testing.T) {
cases := []struct {
name string
a, b string
want string
}{
{"a_smaller", "16.0 GT/s PCIe", "32.0 GT/s PCIe", "16.0 GT/s PCIe"},
{"b_smaller", "32", "16", "16"},
{"equal_returns_a", "32", "32", "32"},
{"a_unparseable_returns_empty", "abc", "16", ""},
{"b_unparseable_returns_empty", "16", "xyz", ""},
{"both_empty_returns_empty", "", "", ""},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
assert.Equal(t, tc.want, minNumericSpeed(tc.a, tc.b))
})
}
}
133 changes: 73 additions & 60 deletions components/infiniband/checker/pcie_tree_speed.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,30 +18,15 @@ package checker
import (
"context"
"fmt"
"math"
"strconv"
"strings"

"github.com/scitix/sichek/components/common"
"github.com/scitix/sichek/components/infiniband/collector"
"github.com/scitix/sichek/components/infiniband/config"
"github.com/scitix/sichek/consts"
"github.com/sirupsen/logrus"
)

// pcieSpeedEqual reports whether two PCIe speed strings denote the same
// numeric speed, so that spellings such as "32", "32.0" and "32.00 GT/s"
// compare equal and spec authors need not copy sysfs's trailing zeros.
// Each argument is passed through extractNumericSpeed, trimmed, and parsed as
// a float; two parsed values are equal when they differ by less than 1e-9.
// If either argument fails to parse, the raw strings are compared verbatim,
// which keeps the earlier behaviour for free-form spec values.
func pcieSpeedEqual(a, b string) bool {
	toFloat := func(raw string) (float64, error) {
		return strconv.ParseFloat(strings.TrimSpace(extractNumericSpeed(raw)), 64)
	}
	left, leftErr := toFloat(a)
	right, rightErr := toFloat(b)
	if leftErr == nil && rightErr == nil {
		return math.Abs(left-right) < 1e-9
	}
	// At least one side is not numeric: fall back to exact string equality.
	return a == b
}

type IBPCIETreeSpeedChecker struct {
id string
name string
Expand Down Expand Up @@ -89,61 +74,71 @@ func (c *IBPCIETreeSpeedChecker) Check(ctx context.Context, data any) (*common.C
return &result, fmt.Errorf("fail to get the IB device")
}

failedDevices := make([]string, 0)
spec := make([]string, 0, hwInfoLen)
curr := make([]string, 0, hwInfoLen)
var failedSpec []string
var failedCurr []string

infinibandInfo.RLock()
hws := uniqueByDev(infinibandInfo.IBHardWareInfo)
infinibandInfo.RUnlock()

failedDevices := make([]string, 0)
failedCurr := make([]string, 0)
failedCap := make([]string, 0)
detailLines := make([]string, 0)
suggestionLines := make([]string, 0)

for _, hwInfo := range hws {
if _, ok := c.spec.HCAs[hwInfo.BoardID]; !ok {
logrus.WithField("component", "infiniband").Warnf("HCA %s not found in spec, skipping %s", hwInfo.BoardID, c.name)
if len(hwInfo.PCIETreeLinks) == 0 {
// Direct-to-CPU or sysfs unavailable; treat as normal.
continue
}
hcaSpec := c.spec.HCAs[hwInfo.BoardID]
// Tree-speed has its own spec field (yaml: pcie_tree_speed) because
// upstream switches and root complexes are often slower than the
// device-level link. CX8 e.g. links at PCIe Gen6 (64 GT/s) but the
// upstream Gen5 switch caps the path at 32 GT/s. Fall back to
// PCIESpeed for board specs that predate the dedicated field.
treeSpec := hcaSpec.Hardware.PCIETreeSpeedMin
if treeSpec == "" {
treeSpec = hcaSpec.Hardware.PCIESpeed
}
expectedSpeed := extractNumericSpeed(treeSpec)
spec = append(spec, treeSpec)

treeSpeedMin := hwInfo.PCIETreeSpeedMin
if treeSpeedMin == "" {
// No upstream tree info available (e.g., direct to CPU), skip
curr = append(curr, hwInfo.PCIESpeed)
// Compute the path-level capability: the minimum parseable cap across
// all links. A link whose max is unparseable is excluded from the
// path cap calculation (treated as ∞ for this purpose). If no link
// yields a parseable cap, skip the whole NIC.
pathCap := ""
for _, link := range hwInfo.PCIETreeLinks {
cap := minNumericSpeed(link.ParentMaxSpeed, link.ChildMaxSpeed)
if cap == "" {
continue
}
if pathCap == "" {
pathCap = cap
} else {
pathCap = minNumericSpeed(pathCap, cap)
}
}
if pathCap == "" {
// No parseable caps on any link; skip silently.
continue
}
curr = append(curr, treeSpeedMin)

// Compare numerically so "32" / "32.0" / "32.00" all match without
// requiring spec authors to keep the trailing zero in sync with sysfs.
if !pcieSpeedEqual(treeSpeedMin, expectedSpeed) {
result.Status = consts.StatusAbnormal
devInfo := fmt.Sprintf("%s(%s)", hwInfo.IBDev, hwInfo.PCIEBDF)
if hwInfo.PCIETreeSpeedMinBDF != "" {
devInfo = fmt.Sprintf("%s(%s, bottleneck@%s)", hwInfo.IBDev, hwInfo.PCIEBDF, hwInfo.PCIETreeSpeedMinBDF)
// Flag each link whose current speed falls below the path cap — these
// are the true bottlenecks. A link running at the path cap (because
// its own max matches the path cap) is expected and not flagged.
for _, link := range hwInfo.PCIETreeLinks {
if !pcieSpeedLessThan(link.CurSpeed, pathCap) {
continue
}
result.Status = consts.StatusAbnormal
devInfo := fmt.Sprintf("%s(%s, bottleneck@%s->%s)",
hwInfo.IBDev, hwInfo.PCIEBDF, link.ParentBDF, link.ChildBDF)
failedDevices = append(failedDevices, devInfo)
failedSpec = append(failedSpec, treeSpec)
failedCurr = append(failedCurr, treeSpeedMin)
failedCurr = append(failedCurr, link.CurSpeed)
failedCap = append(failedCap, pathCap)
detailLines = append(detailLines, fmt.Sprintf(
"%s upstream link %s->%s current %s < cap %s",
hwInfo.IBDev, link.ParentBDF, link.ChildBDF, link.CurSpeed, pathCap))
suggestionLines = append(suggestionLines, fmt.Sprintf(
"Check upstream PCIe link %s->%s for %s, current %s is below link capability %s (min of both endpoints' max).",
link.ParentBDF, link.ChildBDF, hwInfo.IBDev, link.CurSpeed, pathCap))
}
}

result.Curr = strings.Join(curr, ",")
result.Spec = strings.Join(spec, ",")
result.Curr = strings.Join(failedCurr, ",")
result.Spec = strings.Join(failedCap, ",")
result.Device = strings.Join(failedDevices, ",")
if len(failedDevices) != 0 {
result.Detail = fmt.Sprintf("PCIETreeSpeed check fail: %s upstream path min speed %s, expect %s", strings.Join(failedDevices, ","), strings.Join(failedCurr, ","), strings.Join(failedSpec, ","))
result.Suggestion = fmt.Sprintf("Check upstream PCIe switch/bridge speed for %s, expected %s but found %s in path to root complex", strings.Join(failedDevices, ","), strings.Join(failedSpec, ","), strings.Join(failedCurr, ","))
result.Detail = strings.Join(detailLines, "\n")
result.Suggestion = strings.Join(suggestionLines, "\n")
}

return &result, nil
Expand All @@ -159,13 +154,31 @@ func extractNumericSpeed(speed string) string {
return parts[0]
}

// numericSpeedEqual compares two speed strings numerically to avoid
// false mismatches like "16" != "16.0".
func numericSpeedEqual(a, b string) bool {
va, errA := strconv.ParseFloat(a, 64)
vb, errB := strconv.ParseFloat(b, 64)
// pcieSpeedLessThan returns true iff a < b after extracting the leading numeric
// part of each (so "16.0 GT/s PCIe" parses as 16.0). Returns false when either
// value cannot be parsed — callers must treat "unknown" as "not less" so the
// checker stays normal on unreadable sysfs entries.
func pcieSpeedLessThan(a, b string) bool {
af, errA := strconv.ParseFloat(strings.TrimSpace(extractNumericSpeed(a)), 64)
bf, errB := strconv.ParseFloat(strings.TrimSpace(extractNumericSpeed(b)), 64)
if errA != nil || errB != nil {
return a == b
return false
}
return af < bf-1e-9
}

// minNumericSpeed returns whichever of a or b parses to the smaller numeric
// value, preserving the raw string form. If either side is unparseable
// returns "" so the checker can skip the link rather than emit a noisy
// "unknown" comparison.
func minNumericSpeed(a, b string) string {
af, errA := strconv.ParseFloat(strings.TrimSpace(extractNumericSpeed(a)), 64)
bf, errB := strconv.ParseFloat(strings.TrimSpace(extractNumericSpeed(b)), 64)
if errA != nil || errB != nil {
return ""
}
if af <= bf {
return a
}
return va == vb
return b
}
Loading