Skip to content

Commit 4a756ed

Browse files
committed
Testing the GPU golden images
1 parent 2289474 commit 4a756ed

15 files changed

Lines changed: 104 additions & 581 deletions

File tree

Lines changed: 6 additions & 130 deletions
Original file line numberDiff line numberDiff line change
@@ -1,133 +1,9 @@
11
set -e
2-
source /etc/os-release
3-
MAJOR_VERSION_ID=${VERSION_ID%%.*}
42

5-
verify_driver() {
6-
# Verify NVIDIA driver:
7-
# Installation could finish successfully but the driver is still unusable
8-
# A common error when running this check:
9-
# "NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA
10-
# driver. Make sure that the latest NVIDIA driver is installed and running."
11-
nvidia-smi
12-
}
3+
# DCGM and CUDA toolkit share the same NVIDIA repo, which has already been configured in the image
4+
# Install DCGM
5+
sudo yum install -y datacenter-gpu-manager
6+
sudo systemctl --now enable nvidia-dcgm
137

14-
install_cuda_from_runfile() {
15-
# Ref: https://docs.nvidia.com/datacenter/tesla/tesla-installation-notes/index.html#runfile
16-
# This method requires the matching kernel-devel package to be installed, and
17-
# the package may be absent from the repo and cause this method to fail
18-
# Remove existing installation before using the runfile
19-
remove_cuda_package
20-
remove_driver_package
21-
# For Rocky Linux 9: when a new OS version becomes available, the default
22-
# repo setting (/etc/yum.repos.d/rocky.repo) will automatically point to the
23-
# new version's repo. This is problematic since the new OS is not available
24-
# right away on GCE. Set up the matched repo to install the correct
25-
# kernel-devel-$(uname -r)
26-
# Not needed for RL8 since 8.10 is already the last RL8 release.
27-
REPO_URL="https://dl.rockylinux.org/vault/rocky/$VERSION_ID/AppStream/x86_64/os/"
28-
REPO_METADATA="$REPO_URL/repodata/repomd.xml"
29-
STATUS_CODE=$(curl -s -o /dev/null -w "%{http_code}" "$REPO_METADATA")
30-
if [[ $ID == rocky && "$MAJOR_VERSION_ID" == 9 && "$STATUS_CODE" == "200" ]]; then
31-
cat <<EOF | sudo tee /etc/yum.repos.d/rocky-matched.repo
32-
[appstream-matched]
33-
name=Rocky Linux \$releasever - AppStream - Matched
34-
baseurl=$REPO_URL
35-
gpgcheck=1
36-
enabled=1
37-
countme=1
38-
metadata_expire=6h
39-
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
40-
EOF
41-
fi
42-
sudo yum install -y pciutils gcc make wget yum-utils
43-
local KERNEL_PACKAGE="kernel-devel-$(uname -r)"
44-
if [[ $ID == rocky && "$MAJOR_VERSION_ID" == 9 && "$STATUS_CODE" == "403" ]]; then
45-
wget https://dl.rockylinux.org/vault/rocky/$VERSION_ID/AppStream/x86_64/os/Packages/k/${KERNEL_PACKAGE}.rpm
46-
KERNEL_PACKAGE=${KERNEL_PACKAGE}.rpm
47-
fi
48-
49-
sudo yum install -y $KERNEL_PACKAGE
50-
# Installing latest version of NVIDIA CUDA and driver
51-
local CUDA_VERSION=12.9.0
52-
local CUDA_BUNDLED_DRIVER_VERSION=575.51.03
53-
echo "Installing CUDA Toolkit $CUDA_VERSION from CUDA installer with bundled driver $CUDA_BUNDLED_DRIVER_VERSION"
54-
curl -fSsl -O https://developer.download.nvidia.com/compute/cuda/$CUDA_VERSION/local_installers/cuda_${CUDA_VERSION}_${CUDA_BUNDLED_DRIVER_VERSION}_linux.run
55-
sudo sh cuda_${CUDA_VERSION}_${CUDA_BUNDLED_DRIVER_VERSION}_linux.run --silent
56-
verify_driver
57-
}
58-
59-
setup_repo() {
60-
# Enable EPEL (Extra Packages for Enterprise Linux) for packages such as DKMS
61-
# Ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/#prepare-rhel-9-rocky-9
62-
sudo yum install -y yum-utils epel-release
63-
sudo yum-config-manager \
64-
--add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel$MAJOR_VERSION_ID/x86_64/cuda-rhel$MAJOR_VERSION_ID.repo
65-
sudo yum clean all
66-
}
67-
68-
install_cuda_from_package_manager() {
69-
setup_repo
70-
install_driver_package
71-
# TODO(b/377558109): remove the temporary fix once the repo is updated
72-
sudo yum -y install cuda-toolkit-12-9 cuda-demo*
73-
verify_driver
74-
}
75-
76-
remove_cuda_package() {
77-
# Ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#removing-cuda-toolkit-and-driver
78-
sudo yum -y remove "cuda*" "*cublas*" "*cufft*" "*cufile*" "*curand*" \
79-
"*cusolver*" "*cusparse*" "*gds-tools*" "*npp*" "*nvjpeg*" "nsight*" \
80-
"*nvvm*"
81-
}
82-
83-
install_dcgm() {
84-
# Ref: https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/getting-started.html#rhel-centos-rocky-linux
85-
setup_repo
86-
sudo yum install -y datacenter-gpu-manager
87-
sudo systemctl --now enable nvidia-dcgm
88-
89-
# check DCGM service running and load profiling module
90-
dcgmi discovery --list
91-
}
92-
93-
try_install() {
94-
# Export all functions for the bash subprocess
95-
eval "$(declare -F | sed 's/ -f / -fx /')"
96-
export ID MAJOR_VERSION_ID VERSION_ID
97-
for install_method in "$@"; do
98-
echo "Installing NVIDIA driver and CUDA with $install_method..."
99-
# Can't use a subshell because of https://lists.gnu.org/archive/html/bug-bash/2012-12/msg00094.html
100-
bash -$- -c $install_method && {
101-
echo "NVIDIA driver and CUDA has been installed successfully with $install_method."
102-
return 0
103-
}
104-
done
105-
echo "NVIDIA driver and CUDA cannot be installed; all installation methods failed."
106-
return 1
107-
}
108-
109-
handle_rhel9() {
110-
install_driver_package() {
111-
# Ref: https://developer.nvidia.com/cuda-12-9-0-download-archive?target_os=Linux&target_arch=x86_64&Distribution=RHEL&target_version=8&target_type=rpm_network
112-
sudo yum -y module install nvidia-driver:575-dkms
113-
}
114-
}
115-
116-
handle_common() {
117-
install_driver_package() {
118-
# Ref: https://developer.nvidia.com/cuda-12-2-2-download-archive?target_os=Linux&target_arch=x86_64&Distribution=RHEL&target_version=8&target_type=rpm_network
119-
sudo yum -y module install nvidia-driver
120-
}
121-
}
122-
123-
remove_driver_package() {
124-
# Ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#removing-cuda-toolkit-and-driver
125-
sudo yum -y module remove --all nvidia-driver
126-
}
127-
128-
case "$MAJOR_VERSION_ID" in
129-
9) handle_rhel9;;
130-
*) handle_common;;
131-
esac
132-
try_install install_cuda_from_package_manager install_cuda_from_runfile
133-
install_dcgm
8+
# Check that the DCGM service is running and load the profiling module
9+
dcgmi discovery --list

integration_test/third_party_apps_test/applications/dcgm/debian_ubuntu/install

Lines changed: 14 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,50 +1,22 @@
11
set -e
22
source /etc/os-release
3-
4-
sudo apt update
5-
KERNEL_VERSION=`uname -r`
6-
sudo apt install -y linux-headers-${KERNEL_VERSION} software-properties-common pciutils gcc make dkms wget
7-
8-
# Install CUDA and driver the same way as the nvml app
9-
# Prefer to install from the package manager since it is normally faster and has
10-
# less errors on installation; fallback to the runfile method if the package
11-
# manager's package is not working or not compitible with the GPU model
123
DISTRIBUTION=$(echo $ID$VERSION_ID | sed -e 's/\.//g')
13-
# Need to add the keyring for installing CUDA and DCGM
14-
wget --no-verbose https://developer.download.nvidia.com/compute/cuda/repos/${DISTRIBUTION}/x86_64/cuda-keyring_1.1-1_all.deb
15-
sudo dpkg -i cuda-keyring_1.1-1_all.deb
16-
echo "Installing latest version of NVIDIA CUDA and driver"
17-
if [[ $ID == debian ]]; then
18-
sudo add-apt-repository contrib
19-
fi
20-
sudo apt update
214

22-
DEVICE_CODE=$(lspci -n | grep -Po '10de:[\w\d]{4}')
23-
case $DEVICE_CODE in
24-
# V100 | P4 | P100
25-
# Device PCIe ID lookup: https://envytools.readthedocs.io/en/latest/hw/pciid.html
26-
10de:1db1|10de:1bb3|10de:15f8)
27-
# For GPUs older than Turing (Volta: V100, Pascal: P4, P100):
28-
# 1. R580 is the last driver branch to support the Pascal (P4 and P100) and Volta architecture (V100).
29-
# https://docs.cloud.google.com/compute/docs/gpus/install-drivers-gpu#recommended-driver-branches
30-
# 2. They need proprietary kernel modules, not the open kernel modules (nvidia-open-*)
31-
sudo apt -y install nvidia-driver-575
32-
sudo apt -y install cuda-12-9
33-
;;
34-
*)
35-
# For newer GPUs, install the latest version
36-
if [[ $ID == debian && "${VERSION_ID}" == 11 ]]; then
37-
# cuda-12-6 is the latest version that supports Debian 11
38-
sudo apt -y install cuda-12-6
39-
else
40-
sudo apt -y install nvidia-driver-575
41-
sudo apt -y install cuda-12-9
42-
fi
43-
;;
44-
esac
5+
# Fix for broken Bullseye Backports (prevents apt update failure)
6+
if [[ "$DISTRIBUTION" == "debian11" ]]; then
7+
if grep -q "bullseye-backports" /etc/apt/sources.list /etc/apt/sources.list.d/* 2>/dev/null; then
8+
sudo sed -i '/bullseye-backports/s/^/#/' /etc/apt/sources.list
9+
sudo sed -i '/bullseye-backports/s/^/#/' /etc/apt/sources.list.d/*.list
10+
fi
11+
fi
4512

46-
# check NVIDIA driver installation succeeded
47-
nvidia-smi
13+
if ! dpkg -s cuda-keyring >/dev/null 2>&1; then
14+
filename="cuda-keyring_1.1-1_all.deb"
15+
url="https://developer.download.nvidia.com/compute/cuda/repos/${DISTRIBUTION}/x86_64/${filename}"
16+
17+
wget --no-verbose "$url"
18+
sudo dpkg -i "$filename"
19+
fi
4820

4921
# Install DCGM
5022
sudo apt-get update
Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,18 @@
11
set -e
22

3-
# Run the bandwidthTest demo with a large range to create a process that uses
3+
# Run gpu-burn to create a process that uses
44
# GPU for a period that is longer than default collection interval of 60s
5-
/usr/local/cuda/extras/demo_suite/bandwidthTest --memory=pinned --mode=range \
6-
--start=1024 --end=20480 --increment=1
5+
git clone https://github.com/wilicc/gpu-burn
6+
cd gpu-burn
7+
DEVICE_CODE=$(lspci -n | grep -Po '10de:[\w\d]{4}')
8+
case $DEVICE_CODE in
9+
# V100 | P4 | P100
10+
# Device PCIe ID lookup: https://envytools.readthedocs.io/en/latest/hw/pciid.html
11+
10de:1db1|10de:1bb3|10de:15f8)
12+
make COMPUTE=60
13+
;;
14+
*)
15+
make
16+
;;
17+
esac
18+
./gpu_burn -d 180

integration_test/third_party_apps_test/applications/dcgm/metadata.yaml

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,28 +23,32 @@ configure_integration: |-
2323
You must install DCGM and run the DCGM daemon service.
2424
supported_operating_systems: linux
2525
supported_app_version: ["3.1"]
26-
gpu_platforms: # p4, p100 don't emit DCGM profiling metrics
26+
gpu_platforms:
27+
# Platform selection: one most common distro (Ubuntu/Debian) for all GPU models + one easy-to-access GPU model (L4) for all distros
28+
# Debian 11 has the proprietary version of the driver that supports older GPUs (V100, P100, P4)
29+
# P4, P100 don't emit DCGM profiling metrics
2730
- model: a100
2831
platforms:
2932
- ubuntu-os-cloud:ubuntu-2204-lts
3033
- model: v100
3134
platforms:
32-
- ubuntu-os-cloud:ubuntu-2204-lts
35+
- debian-cloud:debian-11
3336
- model: t4
3437
platforms:
3538
- ubuntu-os-cloud:ubuntu-2204-lts
3639
- model: l4
3740
platforms:
3841
- debian-cloud:debian-11
39-
- ml-images:common-gpu-debian-11-py310
42+
- debian-cloud:debian-12
43+
# DCGM 3 not available on debian-cloud:debian-13
4044
- rocky-linux-cloud:rocky-linux-8
4145
- rocky-linux-cloud:rocky-linux-9
4246
- suse-cloud:sles-15
4347
- ubuntu-os-cloud:ubuntu-2204-lts
4448
- ubuntu-os-cloud:ubuntu-2404-lts-amd64
4549
- model: h100
4650
platforms:
47-
- ubuntu-os-cloud:ubuntu-minimal-2204-lts # due to H100 quota, choose an image from the exhaustive list to skip presubmits
51+
# Need quota
4852
expected_metrics:
4953
- type: workload.googleapis.com/gpu.dcgm.utilization
5054
value_type: DOUBLE

integration_test/third_party_apps_test/applications/dcgm/sles/install

Lines changed: 1 addition & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,6 @@
11
set -e
22

3-
sudo zypper --non-interactive install -y kernel-default-devel=$(uname -r | sed 's/\-default//') pciutils gcc make wget
4-
5-
# Install CUDA and driver the same way as the nvml app
6-
# Prefer to install from the package manager since it is normally faster and has
7-
# fewer errors on installation; fall back to the runfile method if the package
8-
# manager's package is not working or not compatible with the GPU model
9-
DISTRIBUTION=$(. /etc/os-release;echo $ID$VERSION_ID | sed -e 's/\.[0-9]//')
10-
# Need to add the repo for installing CUDA and DCGM
11-
sudo zypper --non-interactive ar http://developer.download.nvidia.com/compute/cuda/repos/${DISTRIBUTION}/x86_64/cuda-${DISTRIBUTION}.repo
12-
sudo zypper --gpg-auto-import-keys --non-interactive refresh
13-
echo "Installing latest version of NVIDIA CUDA and driver"
14-
sudo zypper --non-interactive install -y nvidia-compute-utils-G06
15-
sudo zypper --non-interactive install -y cuda-12-9
16-
17-
# check NVIDIA driver installation succeeded
18-
nvidia-smi
19-
3+
# DCGM and CUDA toolkit share the same NVIDIA repo, which has already been configured in the image
204
# Install DCGM
215
sudo zypper --non-interactive install datacenter-gpu-manager
226
sudo systemctl --now enable nvidia-dcgm

0 commit comments

Comments
 (0)