Skip to content

Commit afad630

Browse files
feat(ci): add weekly forward compatibility testing
Add forward-compatibility.yaml workflow that runs weekly to test the GPU operator against latest upstream component images (toolkit, device-plugin, mig-manager). Includes get-latest-images.sh with retry/backoff for image verification and generate-values-overrides.sh for Helm values generation. Signed-off-by: Carlos Eduardo Arango Gutierrez <eduardoa@nvidia.com>
1 parent e6b39bc commit afad630

File tree

3 files changed

+268
-0
lines changed

3 files changed

+268
-0
lines changed
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
#!/usr/bin/env bash
2+
3+
# Copyright NVIDIA CORPORATION
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
set -euo pipefail

# generate-values-overrides.sh
#
# Synopsis:
#   generate-values-overrides.sh OUTPUT_FILE TOOLKIT_IMAGE DEVICE_PLUGIN_IMAGE MIG_MANAGER_IMAGE
#
# Writes a Helm values override file that pins the GPU Operator component
# images. Pass the resulting file to Helm, e.g.
# `helm install -f values-overrides.yaml`, to replace the default component
# image versions.

# Exactly four positional arguments are required; anything else prints the
# usage text on stderr and exits with status 1.
if (( $# != 4 )); then
  printf '%s\n' \
    "Usage: $0 OUTPUT_FILE TOOLKIT_IMAGE DEVICE_PLUGIN_IMAGE MIG_MANAGER_IMAGE" \
    "" \
    "Example:" \
    " $0 values.yaml \\" \
    " ghcr.io/nvidia/container-toolkit:v1.18.0-ubuntu20.04 \\" \
    " ghcr.io/nvidia/k8s-device-plugin:v0.17.0-ubi8 \\" \
    " ghcr.io/nvidia/k8s-mig-manager:v0.10.0-ubuntu20.04" >&2
  exit 1
fi

# Bind the positional arguments to the names used by the rest of the script.
OUTPUT_FILE=$1
TOOLKIT_IMAGE=$2
DEVICE_PLUGIN_IMAGE=$3
MIG_MANAGER_IMAGE=$4
40+
41+
#######################################
# Write the Helm values override document for the three component images.
#
# Each component section blanks `repository` and `version` and puts the
# complete image reference into `image`. The three keys must be indented
# beneath their component key; at column 0 they would become top-level
# (and duplicated) keys instead of per-component settings.
#
# Arguments:
#   $1 - output file path (created or truncated)
#   $2 - container-toolkit image reference
#   $3 - device-plugin image reference
#   $4 - mig-manager image reference
# Outputs:
#   Nothing on stdout; the YAML document is written to $1.
# Returns:
#   Non-zero if the output file cannot be written.
#######################################
write_values_overrides() {
  local output_file=$1
  local toolkit_image=$2
  local device_plugin_image=$3
  local mig_manager_image=$4

  # Unquoted heredoc delimiter: the timestamp and image variables are expanded.
  cat > "${output_file}" <<EOF
# Generated by generate-values-overrides.sh
# Date: $(date -u +"%Y-%m-%d %H:%M:%S UTC")
#
# This file overrides default GPU Operator component images with
# specific versions for forward compatibility testing.

toolkit:
  repository: ""
  version: ""
  image: "${toolkit_image}"

devicePlugin:
  repository: ""
  version: ""
  image: "${device_plugin_image}"

migManager:
  repository: ""
  version: ""
  image: "${mig_manager_image}"
EOF
}

# Generate values override file
write_values_overrides \
  "${OUTPUT_FILE}" \
  "${TOOLKIT_IMAGE}" \
  "${DEVICE_PLUGIN_IMAGE}" \
  "${MIG_MANAGER_IMAGE}"
64+
65+
# Report what was generated: the output path, the three image references that
# were written, and finally the generated file itself.
printf '%s\n' \
  "Generated values override file: ${OUTPUT_FILE}" \
  "" \
  "=== Component Images ===" \
  "Container Toolkit: ${TOOLKIT_IMAGE}" \
  "Device Plugin: ${DEVICE_PLUGIN_IMAGE}" \
  "MIG Manager: ${MIG_MANAGER_IMAGE}" \
  "" \
  "=== File Contents ==="
cat "${OUTPUT_FILE}"
74+
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
#!/bin/bash
2+
# Copyright NVIDIA CORPORATION
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
set -euo pipefail

# get-latest-images.sh <toolkit|device-plugin|mig-manager>
#
# Resolves the image built from the newest commit on the `main` branch of the
# selected component and prints its full reference on stdout. All progress and
# error messages go to stderr so callers can capture stdout directly.
#
# Environment:
#   GITHUB_TOKEN  optional; when set it authenticates the GitHub API request.

COMPONENT=${1:-}

if [[ -z "${COMPONENT}" ]]; then
  echo "Usage: $0 <toolkit|device-plugin|mig-manager>" >&2
  exit 1
fi

# Verify every external tool the script relies on, not only regctl: curl and
# jq are used to query the GitHub API, and a missing binary would otherwise
# abort the pipeline under `set -e`/`pipefail` without a clear explanation.
for required_tool in regctl curl jq; do
  if ! command -v "${required_tool}" &> /dev/null; then
    echo "Error: ${required_tool} not found. Please install ${required_tool} first." >&2
    exit 1
  fi
done
30+
31+
# Translate the component name into its repository name. The GHCR image
# repository and the GitHub source repository share that name and differ only
# in their prefix, so both are derived from it below.
case "${COMPONENT}" in
  toolkit)
    repo_name="container-toolkit"
    ;;
  device-plugin)
    repo_name="k8s-device-plugin"
    ;;
  mig-manager)
    repo_name="k8s-mig-manager"
    ;;
  *)
    # Anything else is a caller error: report it and list the accepted names.
    printf '%s\n' \
      "Error: Unknown component '${COMPONENT}'" \
      "Valid components: toolkit, device-plugin, mig-manager" >&2
    exit 1
    ;;
esac

IMAGE_REPO="ghcr.io/nvidia/${repo_name}"
GITHUB_REPO="NVIDIA/${repo_name}"
51+
52+
echo "Fetching latest commit from ${GITHUB_REPO}..." >&2

# Get the latest commit SHA from the main branch using GitHub API
GITHUB_API_URL="https://api.github.com/repos/${GITHUB_REPO}/commits/main"

# Build the curl arguments once instead of duplicating the whole pipeline.
# --fail makes curl exit non-zero on HTTP errors (rate limiting, missing
# repository, ...) rather than handing an error document to jq.
curl_args=(
  --fail --silent --show-error --location
  -H "Accept: application/vnd.github.v3+json"
)

# Use GITHUB_TOKEN if available for authentication (higher rate limits)
if [[ -n "${GITHUB_TOKEN:-}" ]]; then
  curl_args+=(-H "Authorization: Bearer ${GITHUB_TOKEN}")
fi

# Run the pipeline inside the `if` condition: with `set -e` and `pipefail` a
# bare assignment would terminate the script on a curl/jq failure before the
# error message below could be printed. The tag is the first 8 characters of
# the commit SHA.
if ! LATEST_COMMIT=$(curl "${curl_args[@]}" "${GITHUB_API_URL}" | jq -r '.sha[0:8]'); then
  echo "Error: Failed to fetch latest commit from ${GITHUB_REPO}" >&2
  exit 1
fi

# jq prints the literal string "null" when the response carries no `sha`.
if [[ -z "${LATEST_COMMIT}" || "${LATEST_COMMIT}" == "null" ]]; then
  echo "Error: Failed to fetch latest commit from ${GITHUB_REPO}" >&2
  exit 1
fi

echo "Latest commit SHA: ${LATEST_COMMIT}" >&2
77+
78+
# Construct full image path with commit tag
FULL_IMAGE="${IMAGE_REPO}:${LATEST_COMMIT}"

echo "Verifying image exists: ${FULL_IMAGE}" >&2

# Verify the image exists using regctl with retry. The image for a fresh
# commit may still be building, so wait between attempts and double the delay
# each time.
MAX_RETRIES=5
RETRY_DELAY=30
last_regctl_error=""

for ((attempt = 1; attempt <= MAX_RETRIES; attempt++)); do
  # Capture regctl's stderr (stdout is discarded) so that a final failure can
  # show why the lookup failed instead of silently assuming "not built yet".
  if last_regctl_error=$(regctl manifest head "${FULL_IMAGE}" 2>&1 > /dev/null); then
    echo "Verified ${COMPONENT} image: ${FULL_IMAGE}" >&2
    # The image reference is the only thing written to stdout.
    echo "${FULL_IMAGE}"
    exit 0
  fi

  # No wait after the last attempt.
  if ((attempt < MAX_RETRIES)); then
    echo "Image not found (attempt ${attempt}/${MAX_RETRIES}), waiting ${RETRY_DELAY}s for CI to build..." >&2
    sleep "${RETRY_DELAY}"
    # Exponential backoff: 30s, 60s, 120s, 240s
    RETRY_DELAY=$((RETRY_DELAY * 2))
  fi
done

echo "Error: Image ${FULL_IMAGE} does not exist after ${MAX_RETRIES} attempts" >&2
echo "The image may not have been built yet for commit ${LATEST_COMMIT}" >&2
if [[ -n "${last_regctl_error}" ]]; then
  echo "Last regctl error: ${last_regctl_error}" >&2
fi
exit 1
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
# Copyright NVIDIA CORPORATION
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Forward-compatibility workflow: resolves the newest component images
# (container-toolkit, device-plugin, mig-manager), writes them into a Helm
# values override file, runs the e2e test workflow, and posts a Slack alert
# when either stage fails.
name: Forward Compatibility

on:
  schedule:
    - cron: '0 2 * * 1'  # Weekly on Monday at 2 AM UTC
  workflow_dispatch:  # Manual trigger

# One run per workflow/ref; a newer run cancels the one still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  fetch-latest-images:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6

      - name: Install regctl
        run: |
          REGCTL_VERSION=v0.9.2
          mkdir -p bin
          # -f: fail on HTTP errors instead of saving an error page as the binary
          curl -fsSLo bin/regctl "https://github.com/regclient/regclient/releases/download/${REGCTL_VERSION}/regctl-linux-amd64"
          chmod +x bin/regctl
          echo "$(pwd)/bin" >> "$GITHUB_PATH"

      - name: Get latest component images and generate values override file
        env:
          # Read by get-latest-images.sh to authenticate its GitHub API request
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          # Fetch latest images from component repositories
          echo "::notice::Fetching latest container-toolkit image..."
          TOOLKIT=$(.github/scripts/get-latest-images.sh toolkit)

          echo "::notice::Fetching latest device-plugin image..."
          DEVICE_PLUGIN=$(.github/scripts/get-latest-images.sh device-plugin)

          echo "::notice::Fetching latest mig-manager image..."
          MIG_MANAGER=$(.github/scripts/get-latest-images.sh mig-manager)

          # Generate values override file
          .github/scripts/generate-values-overrides.sh \
            values-overrides.yaml \
            "${TOOLKIT}" \
            "${DEVICE_PLUGIN}" \
            "${MIG_MANAGER}"

      - name: Upload values override file
        uses: actions/upload-artifact@v6
        with:
          name: values-overrides
          path: values-overrides.yaml
          retention-days: 30

  run-e2e-tests:
    needs: [fetch-latest-images]
    uses: ./.github/workflows/e2e-tests.yaml
    with:
      operator_image: ghcr.io/nvidia/gpu-operator
      operator_version: main-latest
      # NOTE(review): presumably makes the called workflow consume the
      # `values-overrides` artifact uploaded above - confirm in e2e-tests.yaml.
      use_values_override: true
    secrets: inherit

  notify-failure:
    runs-on: ubuntu-latest
    needs: [fetch-latest-images, run-e2e-tests]
    # always() keeps this job eligible even though a needed job failed.
    if: ${{ always() && (needs.fetch-latest-images.result == 'failure' || needs.run-e2e-tests.result == 'failure') }}
    steps:
      - name: Send Slack alert notification
        uses: slackapi/slack-github-action@v2.1.1
        with:
          method: chat.postMessage
          token: ${{ secrets.SLACK_BOT_TOKEN }}
          payload: |
            {
              "channel": "${{ secrets.SLACK_CHANNEL_ID }}",
              "text": ":x: *Forward Compatibility Test Failed for GPU Operator*\n\n*Workflow:* ${{ github.workflow }}\n*Repository:* ${{ github.repository }}\n*Trigger:* ${{ github.event_name }}\n\n*Tested Components:*\nDownload `values-overrides` artifact to see tested component versions\n\n*Details:* <https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}|View Failed Run>\n\n${{ secrets.SLACK_MENTION_LIST }}"
            }

0 commit comments

Comments
 (0)