Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion integration_test/agents/agents.go
Original file line number Diff line number Diff line change
Expand Up @@ -516,7 +516,8 @@ func IsRPMBased(imageSpec string) bool {
strings.HasPrefix(imageSpec, "suse-cloud") ||
strings.HasPrefix(imageSpec, "suse-sap-cloud") ||
strings.HasPrefix(imageSpec, "opensuse-cloud") ||
strings.Contains(imageSpec, "sles-")
strings.Contains(imageSpec, "sles-") ||
strings.Contains(imageSpec, "rocky-linux-")
}

// StripTildeSuffix strips off everything after the first ~ character. We see
Expand Down
Original file line number Diff line number Diff line change
@@ -1,133 +1,9 @@
# Exit as soon as a command fails (outside of conditions and && / || lists).
set -e
# Load the distro identification variables; this script reads ID and VERSION_ID.
source /etc/os-release
# Major release number: VERSION_ID with everything from the first "." onward
# stripped (e.g. 9.4 -> 9).
MAJOR_VERSION_ID=${VERSION_ID%%.*}

verify_driver() {
  # Sanity-check the NVIDIA driver by running nvidia-smi.
  # A driver installation can finish successfully while the driver itself is
  # still unusable. The typical failure reported by this check is:
  #   "NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA
  #   driver. Make sure that the latest NVIDIA driver is installed and running."
  # Returns: the exit status of nvidia-smi.
  nvidia-smi
}
# DCGM and CUDA toolkit share the same NVIDIA repo, which has already been configured in the image
# Install DCGM
sudo yum install -y datacenter-gpu-manager
sudo systemctl --now enable nvidia-dcgm

install_cuda_from_runfile() {
  # Install the NVIDIA driver and CUDA toolkit from NVIDIA's runfile installer.
  # Ref: https://docs.nvidia.com/datacenter/tesla/tesla-installation-notes/index.html#runfile
  # This method requires the matching kernel-devel package to be installed, and
  # the package may be absent from the repo and cause this method to fail.
  # Globals read: ID, VERSION_ID, MAJOR_VERSION_ID
  # Returns: the status of verify_driver when every step succeeds.
  local repo_url repo_metadata kernel_package runfile
  local status_code=""
  local is_rocky9=false
  local -r cuda_version=12.9.0
  local -r cuda_bundled_driver_version=575.51.03

  # Remove existing installation before using the runfile
  remove_cuda_package
  remove_driver_package

  if [[ "$ID" == rocky && "$MAJOR_VERSION_ID" == 9 ]]; then
    is_rocky9=true
  fi

  # For Rocky Linux 9: when a new OS version becomes available, the default
  # repo setting (/etc/yum.repos.d/rocky.repo) will automatically point to the
  # new version's repo. This is problematic since the new OS is not available
  # right away on GCE. Set up the matched repo to install the correct
  # kernel-devel-$(uname -r)
  # Not needed for RL8 since 8.10 is already the last RL8 release.
  repo_url="https://dl.rockylinux.org/vault/rocky/${VERSION_ID}/AppStream/x86_64/os/"
  # repo_url already ends with "/", so do not add another separator.
  repo_metadata="${repo_url}repodata/repomd.xml"
  if [[ "$is_rocky9" == true ]]; then
    # Probe the vault only where the result is used. A failed probe must not
    # abort the whole method under `set -e`; treat it as "no usable answer" and
    # continue with the default repositories.
    status_code=$(curl -s -o /dev/null -w "%{http_code}" "$repo_metadata") \
      || status_code="000"
  fi

  if [[ "$is_rocky9" == true && "$status_code" == "200" ]]; then
    # tee echoes the generated repo file to stdout as well as writing it.
    sudo tee /etc/yum.repos.d/rocky-matched.repo <<EOF
[appstream-matched]
name=Rocky Linux \$releasever - AppStream - Matched
baseurl=$repo_url
gpgcheck=1
enabled=1
countme=1
metadata_expire=6h
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
EOF
  fi

  sudo yum install -y pciutils gcc make wget yum-utils

  kernel_package="kernel-devel-$(uname -r)"
  if [[ "$is_rocky9" == true && "$status_code" == "403" ]]; then
    # The metadata probe was refused: download the RPM itself from the vault
    # into the current directory and install that local file instead.
    # NOTE(review): presumably a 403 on repomd.xml still leaves individual
    # packages downloadable - confirm.
    wget "${repo_url}Packages/k/${kernel_package}.rpm"
    kernel_package="${kernel_package}.rpm"
  fi

  sudo yum install -y "$kernel_package"

  # Installing latest version of NVIDIA CUDA and driver
  runfile="cuda_${cuda_version}_${cuda_bundled_driver_version}_linux.run"
  echo "Installing CUDA Toolkit $cuda_version from CUDA installer with bundled driver $cuda_bundled_driver_version"
  # NOTE(review): "-l" is curl's --list-only; "-L" (follow redirects) may have
  # been intended - flags are kept as they were.
  curl -fSsl -O "https://developer.download.nvidia.com/compute/cuda/${cuda_version}/local_installers/${runfile}"
  sudo sh "$runfile" --silent
  verify_driver
}

setup_repo() {
  # Configure the package repositories needed for the NVIDIA packages.
  # Enable EPEL (Extra Packages for Enterprise Linux) for packages such as DKMS
  # Ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/#prepare-rhel-9-rocky-9
  # Globals read: MAJOR_VERSION_ID (selects the rhel<N> CUDA repo)
  local -r rhel_release="rhel${MAJOR_VERSION_ID}"
  local -r cuda_repo_url="https://developer.download.nvidia.com/compute/cuda/repos/${rhel_release}/x86_64/cuda-${rhel_release}.repo"

  sudo yum install -y yum-utils epel-release
  # The URL is passed as one quoted argument so it can never be word-split.
  sudo yum-config-manager --add-repo "$cuda_repo_url"
  # Drop cached metadata so the newly added repo is picked up.
  sudo yum clean all
}

install_cuda_from_package_manager() {
  # Install the NVIDIA driver and CUDA toolkit through yum.
  # Steps: configure the repos, install the driver via the distro-specific
  # install_driver_package, install the CUDA packages, then verify the driver.
  setup_repo
  install_driver_package
  # TODO(b/377558109): remove the temporary fix once the repo is updated
  # The pattern is quoted so that yum receives it as a package glob; unquoted,
  # the shell would expand it against files in the current directory first.
  sudo yum -y install cuda-toolkit-12-9 "cuda-demo*"
  verify_driver
}

remove_cuda_package() {
  # Uninstall the CUDA toolkit packages through yum.
  # Ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#removing-cuda-toolkit-and-driver
  # Each pattern is handed to yum verbatim (quoted, so the shell never expands it).
  local -a package_patterns=(
    "cuda*"
    "*cublas*"
    "*cufft*"
    "*cufile*"
    "*curand*"
    "*cusolver*"
    "*cusparse*"
    "*gds-tools*"
    "*npp*"
    "*nvjpeg*"
    "nsight*"
    "*nvvm*"
  )
  sudo yum -y remove "${package_patterns[@]}"
}

install_dcgm() {
  # Install NVIDIA DCGM and start its service.
  # Ref: https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/getting-started.html#rhel-centos-rocky-linux
  # The repositories are configured first via setup_repo.
  setup_repo
  sudo yum install -y datacenter-gpu-manager
  # Enable the nvidia-dcgm unit and start it immediately (--now).
  sudo systemctl --now enable nvidia-dcgm

  # check DCGM service running and load profiling module
  dcgmi discovery --list
}

try_install() {
  # Try each installation method in order until one succeeds.
  # Arguments: "$@" - commands (normally function names) to try, in order
  # Globals:   ID, MAJOR_VERSION_ID, VERSION_ID are exported for the children
  # Outputs:   progress messages on stdout
  # Returns:   0 as soon as one method succeeds, 1 if every method failed
  local fn install_method

  # Export all functions for the bash subprocess. `declare -F` prints
  # "declare -f[x] NAME" per function; the third field is the name.
  while read -r _ _ fn; do
    export -f "$fn"
  done < <(declare -F)
  export ID MAJOR_VERSION_ID VERSION_ID

  for install_method in "$@"; do
    echo "Installing NVIDIA driver and CUDA with $install_method..."
    # Can't use a subshell because of https://lists.gnu.org/archive/html/bug-bash/2012-12/msg00094.html
    # "-$-" forwards the current shell option flags (e.g. -e) to the child.
    # The method is quoted so it reaches `bash -c` as one command string.
    if bash "-$-" -c "$install_method"; then
      echo "NVIDIA driver and CUDA has been installed successfully with $install_method."
      return 0
    fi
  done
  echo "NVIDIA driver and CUDA cannot be installed; all installation methods failed."
  return 1
}

handle_rhel9() {
  # Define install_driver_package for major release 9: installs the pinned
  # 575 DKMS stream of the nvidia-driver module.
  install_driver_package() {
    # Ref: https://developer.nvidia.com/cuda-12-9-0-download-archive?target_os=Linux&target_arch=x86_64&Distribution=RHEL&target_version=8&target_type=rpm_network
    local -r driver_module="nvidia-driver:575-dkms"
    sudo yum -y module install "$driver_module"
  }
}

handle_common() {
  # Define install_driver_package for every other major release: installs the
  # nvidia-driver module without naming a stream.
  install_driver_package() {
    # Ref: https://developer.nvidia.com/cuda-12-2-2-download-archive?target_os=Linux&target_arch=x86_64&Distribution=RHEL&target_version=8&target_type=rpm_network
    local -r driver_module="nvidia-driver"
    sudo yum -y module install "$driver_module"
  }
}

remove_driver_package() {
  # Remove the nvidia-driver yum module (invoked with --all).
  # Ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#removing-cuda-toolkit-and-driver
  local -a yum_args=(-y module remove --all nvidia-driver)
  sudo yum "${yum_args[@]}"
}

# Pick the install_driver_package implementation for this major release:
# release 9 gets the pinned driver stream, everything else the default one.
case "$MAJOR_VERSION_ID" in
9) handle_rhel9;;
*) handle_common;;
esac
# Try the package-manager method first and use the runfile method only if the
# first one fails.
try_install install_cuda_from_package_manager install_cuda_from_runfile
# Install DCGM and start its service.
install_dcgm
# check DCGM service running and load profiling module
dcgmi discovery --list
Original file line number Diff line number Diff line change
@@ -1,50 +1,22 @@
set -e
source /etc/os-release

sudo apt update
KERNEL_VERSION=`uname -r`
sudo apt install -y linux-headers-${KERNEL_VERSION} software-properties-common pciutils gcc make dkms wget

# Install CUDA and driver the same way as the nvml app
# Prefer to install from the package manager since it is normally faster and has
# fewer errors on installation; fall back to the runfile method if the package
# manager's package is not working or not compatible with the GPU model
DISTRIBUTION=$(echo $ID$VERSION_ID | sed -e 's/\.//g')
# Need to add the keyring for installing CUDA and DCGM
wget --no-verbose https://developer.download.nvidia.com/compute/cuda/repos/${DISTRIBUTION}/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
echo "Installing latest version of NVIDIA CUDA and driver"
if [[ $ID == debian ]]; then
sudo add-apt-repository contrib
fi
sudo apt update

DEVICE_CODE=$(lspci -n | grep -Po '10de:[\w\d]{4}')
case $DEVICE_CODE in
# V100 | P4 | P100
# Device PCIe ID lookup: https://envytools.readthedocs.io/en/latest/hw/pciid.html
10de:1db1|10de:1bb3|10de:15f8)
# For GPUs older than Turing (Volta: V100, Pascal: P4, P100):
# 1. R580 is the last driver branch to support the Pascal (P4 and P100) and Volta architecture (V100).
# https://docs.cloud.google.com/compute/docs/gpus/install-drivers-gpu#recommended-driver-branches
# 2. They need proprietary kernel modules, not the open kernel modules (nvidia-open-*)
sudo apt -y install nvidia-driver-575
sudo apt -y install cuda-12-9
;;
*)
# For newer GPUs, install the latest version
if [[ $ID == debian && "${VERSION_ID}" == 11 ]]; then
# cuda-12-6 is the latest version that supports Debian 11
sudo apt -y install cuda-12-6
else
sudo apt -y install nvidia-driver-575
sudo apt -y install cuda-12-9
fi
;;
esac
# Fix for broken Bullseye Backports (prevents apt update failure)
if [[ "$DISTRIBUTION" == "debian11" ]]; then
if grep -q "bullseye-backports" /etc/apt/sources.list /etc/apt/sources.list.d/* 2>/dev/null; then
sudo sed -i '/bullseye-backports/s/^/#/' /etc/apt/sources.list
sudo sed -i '/bullseye-backports/s/^/#/' /etc/apt/sources.list.d/*.list
fi
fi

# check NVIDIA driver installation succeeded
nvidia-smi
if ! dpkg -s cuda-keyring >/dev/null 2>&1; then
filename="cuda-keyring_1.1-1_all.deb"
url="https://developer.download.nvidia.com/compute/cuda/repos/${DISTRIBUTION}/x86_64/${filename}"

wget --no-verbose "$url"
sudo dpkg -i "$filename"
fi

# Install DCGM
sudo apt-get update
Expand Down
18 changes: 15 additions & 3 deletions integration_test/third_party_apps_test/applications/dcgm/exercise
Original file line number Diff line number Diff line change
@@ -1,6 +1,18 @@
set -e

# Run the bandwidthTest demo with a large range to create a process that uses
# Run gpu-burn for an extended duration to create a process that uses the
# GPU for a period that is longer than default collection interval of 60s
/usr/local/cuda/extras/demo_suite/bandwidthTest --memory=pinned --mode=range \
--start=1024 --end=20480 --increment=1
git clone https://github.com/wilicc/gpu-burn
cd gpu-burn
DEVICE_CODE=$(lspci -n | grep -Po '10de:[\w\d]{4}')
case $DEVICE_CODE in
# V100 | P4 | P100
# Device PCIe ID lookup: https://envytools.readthedocs.io/en/latest/hw/pciid.html
10de:1db1|10de:1bb3|10de:15f8)
make COMPUTE=60
;;
*)
make
;;
esac
./gpu_burn -d 180
Original file line number Diff line number Diff line change
Expand Up @@ -23,28 +23,31 @@ configure_integration: |-
You must install DCGM and run the DCGM daemon service.
supported_operating_systems: linux
supported_app_version: ["3.1"]
gpu_platforms: # p4, p100 don't emit DCGM profiling metrics
gpu_platforms:
# Platform selection: one most common distro (Ubuntu/Debian) for all GPU models + one easy-to-access GPU model (L4) for all distros
# Debian 11 has the proprietary version of the driver that supports older GPUs (V100, P100, P4)
# P4, P100 don't emit DCGM profiling metrics
- model: a100
platforms:
- ubuntu-os-cloud:ubuntu-2204-lts
- model: v100
platforms:
- ubuntu-os-cloud:ubuntu-2204-lts
- debian-cloud:debian-11
- model: t4
platforms:
- ubuntu-os-cloud:ubuntu-2204-lts
- model: l4
platforms:
- debian-cloud:debian-11
- ml-images:common-gpu-debian-11-py310
- debian-cloud:debian-12
# DCGM 3 not available on debian-cloud:debian-13
- rocky-linux-cloud:rocky-linux-8
- rocky-linux-cloud:rocky-linux-9
- suse-cloud:sles-15
- ubuntu-os-cloud:ubuntu-2204-lts
- ubuntu-os-cloud:ubuntu-2404-lts-amd64
- model: h100
platforms:
- ubuntu-os-cloud:ubuntu-minimal-2204-lts # due to H100 quota, choose an image from the exhaustive list to skip presubmits
platforms: [] # Need quota
expected_metrics:
- type: workload.googleapis.com/gpu.dcgm.utilization
value_type: DOUBLE
Expand Down
Original file line number Diff line number Diff line change
@@ -1,22 +1,6 @@
set -e

sudo zypper --non-interactive install -y kernel-default-devel=$(uname -r | sed 's/\-default//') pciutils gcc make wget

# Install CUDA and driver the same way as the nvml app
# Prefer to install from the package manager since it is normally faster and has
# fewer errors on installation; fall back to the runfile method if the package
# manager's package is not working or not compatible with the GPU model
DISTRIBUTION=$(. /etc/os-release;echo $ID$VERSION_ID | sed -e 's/\.[0-9]//')
# Need to add the repo for installing CUDA and DCGM
sudo zypper --non-interactive ar http://developer.download.nvidia.com/compute/cuda/repos/${DISTRIBUTION}/x86_64/cuda-${DISTRIBUTION}.repo
sudo zypper --gpg-auto-import-keys --non-interactive refresh
echo "Installing latest version of NVIDIA CUDA and driver"
sudo zypper --non-interactive install -y nvidia-compute-utils-G06
sudo zypper --non-interactive install -y cuda-12-9

# check NVIDIA driver installation succeeded
nvidia-smi

# DCGM and CUDA toolkit share the same NVIDIA repo, which has already been configured in the image
# Install DCGM
sudo zypper --non-interactive install datacenter-gpu-manager
sudo systemctl --now enable nvidia-dcgm
Expand Down
Loading