Skip to content

Commit d9e0769

Browse files
authored
Merge branch 'main' into k8s-ds-design-doc
2 parents ca9c7f5 + 028d47e commit d9e0769

103 files changed

Lines changed: 6019 additions & 322 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/actions/publish-container/action.yml

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,16 @@ inputs:
3232
runs:
3333
using: 'composite'
3434
steps:
35+
- name: Workaround for freeing up more disk space
36+
shell: bash
37+
run: |
38+
sudo rm -rf /usr/local/lib/android
39+
sudo rm -rf /usr/share/dotnet
40+
sudo rm -rf /opt/ghc
41+
sudo rm -rf /opt/hostedtoolcache/CodeQL
42+
sudo docker image prune --all --force
43+
sudo docker system prune -f
44+
3545
- name: Set up Docker Buildx
3646
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # v3.11.1
3747
with:
@@ -86,6 +96,22 @@ runs:
8696
rm -rf /tmp/.buildx-cache
8797
mv /tmp/.buildx-cache-new /tmp/.buildx-cache
8898
fi
99+
100+
- name: Free disk space for SBOM generation
101+
shell: bash
102+
run: |
103+
echo "=== Disk usage BEFORE cleanup ==="
104+
df -h
105+
# Remove pre-installed tools (if they exist on this runner)
106+
sudo rm -rf /usr/local/lib/android || true
107+
sudo rm -rf /usr/share/dotnet || true
108+
sudo rm -rf /opt/ghc || true
109+
# Thorough Docker cleanup - build cache no longer needed after push
110+
docker buildx prune -f --all || true
111+
docker system prune -af --volumes || true
112+
rm -rf /tmp/.buildx-cache || true
113+
echo "=== Disk usage AFTER cleanup ==="
114+
df -h
89115
90116
- name: Generate SBOM and Attest
91117
uses: ./.github/actions/sbom-and-attest

.github/copy-pr-bot.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,3 +25,4 @@ additional_trustees:
2525
- deesharma24
2626
- natherz97
2727
- suket22
28+
- cbumb

.github/workflows/cleanup-untagged-images.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ jobs:
8989
- nvsentinel-janitor
9090
- nvsentinel-fake-dcgm
9191
- nvsentinel/preflight
92-
- nvsentinel/ping
92+
- nvsentinel/preflight-dcgm-diag
9393

9494
steps:
9595
- name: Delete untagged images for ${{ matrix.package }}

.github/workflows/container-build-test.yml

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,14 @@ jobs:
6565
make_command: 'make -C log-collector docker-build-log-collector'
6666
- component: file-server-cleanup
6767
make_command: 'make -C log-collector docker-build-file-server-cleanup'
68+
# Preflight Checks (Docker-based)
69+
- component: preflight-dcgm-diag
70+
make_command: 'make -C preflight-checks/dcgm-diag docker-build'
71+
- component: preflight-nccl-loopback
72+
make_command: 'make -C preflight-checks/nccl-loopback docker-build'
73+
# GPU Reset (Docker-based)
74+
- component: gpu-reset
75+
make_command: 'make -C gpu-reset docker-build'
6876
steps:
6977
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
7078

@@ -128,8 +136,6 @@ jobs:
128136
path: .
129137
- module: preflight
130138
path: .
131-
- module: preflight-checks/ping
132-
path: .
133139
steps:
134140
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
135141

.github/workflows/lint-test.yml

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,10 @@ jobs:
5858
make_command: 'make -C log-collector lint-log-collector'
5959
step_name: 'Run lint'
6060
replace_imports: 'false'
61+
- component: gpu-reset
62+
make_command: 'make -C gpu-reset lint'
63+
step_name: 'Run lint'
64+
replace_imports: 'false'
6165
- component: file-server-cleanup
6266
make_command: 'make -C log-collector lint-file-server-cleanup'
6367
step_name: 'Run lint'
@@ -139,7 +143,8 @@ jobs:
139143
strategy:
140144
matrix:
141145
component:
142-
- ping
146+
- dcgm-diag
147+
- nccl-loopback
143148
steps:
144149
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
145150

@@ -149,15 +154,6 @@ jobs:
149154
- name: Run lint and test
150155
run: make -C preflight-checks/${{ matrix.component }} lint-test
151156

152-
- name: Upload artifacts
153-
uses: ./.github/actions/upload-test-artifacts
154-
with:
155-
component-name: preflight-${{ matrix.component }}
156-
file-paths: |
157-
preflight-checks/${{ matrix.component }}/coverage.xml
158-
preflight-checks/${{ matrix.component }}/coverage.txt
159-
preflight-checks/${{ matrix.component }}/report.xml
160-
161157
modules-lint-test:
162158
if: github.repository == 'nvidia/nvsentinel'
163159
runs-on: linux-amd64-cpu16

.github/workflows/publish.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,15 @@ jobs:
117117
- component: file-server-cleanup
118118
make_command: 'make -C log-collector docker-publish-file-server-cleanup'
119119
container_name: 'nvsentinel/file-server-cleanup'
120+
- component: preflight-dcgm-diag
121+
make_command: 'make -C preflight-checks/dcgm-diag docker-publish'
122+
container_name: 'nvsentinel/preflight-dcgm-diag'
123+
- component: preflight-nccl-loopback
124+
make_command: 'make -C preflight-checks/nccl-loopback docker-publish'
125+
container_name: 'nvsentinel/preflight-nccl-loopback'
126+
- component: gpu-reset
127+
make_command: 'make -C gpu-reset docker-publish'
128+
container_name: 'nvsentinel/gpu-reset'
120129
steps:
121130
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
122131
with:

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -444,3 +444,5 @@ tests/scale-tests/FQM_LATENCY_TEST_PLAN.md
444444
tests/scale-tests/CONCURRENT_DRAIN_TEST_PLAN.md
445445
tests/scale-tests/results/*.csv
446446
tests/scale-tests/cmd/fqm-scale-test/results/
447+
preflight-checks/dcgm-diag/dcgm-diag
448+
preflight/preflight

Makefile

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,8 @@ PYTHON_MODULES := \
6262

6363
# Container-only modules
6464
CONTAINER_MODULES := \
65-
log-collector
65+
log-collector \
66+
gpu-reset
6667

6768
# Special modules requiring private repo access
6869
PRIVATE_MODULES := \
@@ -267,7 +268,7 @@ install-go-ci: ## Install Go $(GO_VERSION) for CI environments (Linux/macOS, amd
267268

268269
# Lint and test all modules (delegates to sub-Makefiles)
269270
.PHONY: lint-test-all
270-
lint-test-all: protos-lint license-headers-lint gomod-lint health-monitors-lint-test-all go-lint-test-all python-lint-test-all kubernetes-distro-lint log-collector-lint ## Lint and test all modules
271+
lint-test-all: protos-lint license-headers-lint gomod-lint health-monitors-lint-test-all go-lint-test-all python-lint-test-all kubernetes-distro-lint log-collector-lint gpu-reset-lint ## Lint and test all modules
271272

272273
# Health monitors lint-test (delegate to health-monitors/Makefile)
273274
.PHONY: health-monitors-lint-test-all
@@ -301,6 +302,8 @@ protos-generate: protos-clean ## Generate protobuf files from .proto sources
301302
$(MAKE) -C api protos-generate
302303
# Generate Python protobuf files for gpu-health-monitor
303304
$(MAKE) -C health-monitors/gpu-health-monitor protos-generate
305+
# Generate Python protobuf files for dcgm-diag preflight check
306+
$(MAKE) -C preflight-checks/dcgm-diag protos-generate
304307

305308
# Check protobuf files
306309
.PHONY: protos-lint
@@ -502,6 +505,12 @@ log-collector-lint: ## Lint shell scripts in log collector
502505
@echo "Linting log collector shell scripts..."
503506
$(MAKE) -C log-collector lint
504507

508+
# GPU reset lint (shell script)
509+
.PHONY: gpu-reset-lint
510+
gpu-reset-lint: ## Lint shell scripts in GPU reset
511+
@echo "Linting GPU reset shell scripts..."
512+
$(MAKE) -C gpu-reset lint
513+
505514
# Build targets (delegate to sub-Makefiles for better organization)
506515
.PHONY: build-all
507516
build-all: build-health-monitors build-main-modules ## Build all modules

commons/go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,8 +80,8 @@ github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
8080
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
8181
github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
8282
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
83-
github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw=
84-
github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o=
83+
github.com/lib/pq v1.11.1 h1:wuChtj2hfsGmmx3nf1m7xC2XpK6OtelS2shMY+bGMtI=
84+
github.com/lib/pq v1.11.1/go.mod h1:/p+8NSbOcwzAEI7wiMXFlgydTwcgTr3OSKMsD2BitpA=
8585
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
8686
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
8787
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=

demos/local-fault-injection-demo/scripts/01-show-cluster.sh

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -104,14 +104,28 @@ main() {
104104
node_name=$(echo "$node" | cut -d'/' -f2)
105105
echo "Node: $node_name"
106106

107-
# Get all conditions
108-
kubectl get "$node" -o jsonpath='{range .status.conditions[*]}{.type}{"\t"}{.status}{"\t"}{.message}{"\n"}{end}' | \
109-
grep -v "^Ready\|^MemoryPressure\|^DiskPressure\|^PIDPressure\|^NetworkUnavailable" || true
107+
# Get all conditions and capture for checking while also displaying
108+
node_conditions=$(kubectl get "$node" -o jsonpath='{range .status.conditions[*]}{.type}{"\t"}{.status}{"\t"}{.message}{"\n"}{end}') || {
109+
echo " ⚠️ Failed to fetch conditions for $node_name"
110+
continue
111+
}
112+
conditions_output=$(echo "$node_conditions" | \
113+
grep -v "^Ready\|^MemoryPressure\|^DiskPressure\|^PIDPressure\|^NetworkUnavailable" || true)
114+
115+
echo "$conditions_output"
116+
117+
# Track if we found any actual health failures (Status=True indicating a problem)
118+
if echo "$conditions_output" | grep -q $'\t'"True"$'\t'; then
119+
has_conditions=true
120+
fi
110121

111122
done
112123

113-
if [ "$has_conditions" = false ]; then
114-
echo " ℹ️ No health event conditions found (cluster is healthy)"
124+
echo ""
125+
if [ "$has_conditions" = true ]; then
126+
echo " ⚠️ Health issues detected (see conditions above)"
127+
else
128+
echo " ✅ No health event conditions found (cluster is healthy)"
115129
fi
116130

117131
section "Recent Events"

0 commit comments

Comments
 (0)