GoogleCloudPlatform · LujieDuan · Jan 2, 2026 · Jan 2, 2026
diff --git a/integration_test/agents/agents.go b/integration_test/agents/agents.go
@@ -516,7 +516,8 @@ func IsRPMBased(imageSpec string) bool {
 		strings.HasPrefix(imageSpec, "suse-cloud") ||
 		strings.HasPrefix(imageSpec, "suse-sap-cloud") ||
 		strings.HasPrefix(imageSpec, "opensuse-cloud") ||
-		strings.Contains(imageSpec, "sles-")
+		strings.Contains(imageSpec, "sles-") ||
+		strings.Contains(imageSpec, "rocky-linux-")
 }
 
 // StripTildeSuffix strips off everything after the first ~ character. We see

diff --git a/integration_test/third_party_apps_test/applications/dcgm/centos_rhel/install b/integration_test/third_party_apps_test/applications/dcgm/centos_rhel/install
@@ -1,133 +1,9 @@
 set -e
-source /etc/os-release
-MAJOR_VERSION_ID=${VERSION_ID%%.*}
 
-verify_driver() {
-    # Verify NVIDIA driver:
-    # Installation could finish successfully but the driver is still unusable
-    # A common error when running this check:
-    # "NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA
-    # driver. Make sure that the latest NVIDIA driver is installed and running."
-    nvidia-smi
-}
+# DCGM and the CUDA toolkit share the same NVIDIA repo, which has already been configured in the image.
+# Install DCGM
+sudo yum install -y datacenter-gpu-manager
+sudo systemctl --now enable nvidia-dcgm
 
-install_cuda_from_runfile() {
-    # Ref: https://docs.nvidia.com/datacenter/tesla/tesla-installation-notes/index.html#runfile
-    # This method requires the matching kernel-devel package to be installed, and
-    # the package may be absent from the repo and cause this method to fail
-    # Remove existing installation before using the runfile
-    remove_cuda_package
-    remove_driver_package
-    # For Rocky Linux 9: when a new OS version becomes available, the default
-    # repo setting (/etc/yum.repos.d/rocky.repo) will automatically point to the
-    # new version's repo. This is problematic since the new OS is not available
-    # right away on GCE. Set up the matched repo to install the correct
-    # kernel-devel-$(uname -r)
-    # Not needed for RL8 since 8.10 is already the last RL8 release.
-    REPO_URL="https://dl.rockylinux.org/vault/rocky/$VERSION_ID/AppStream/x86_64/os/"
-    REPO_METADATA="$REPO_URL/repodata/repomd.xml"
-    STATUS_CODE=$(curl -s -o /dev/null -w "%{http_code}" "$REPO_METADATA")
-    if [[ $ID == rocky && "$MAJOR_VERSION_ID" == 9 && "$STATUS_CODE" == "200" ]]; then
-        cat <<EOF | sudo tee /etc/yum.repos.d/rocky-matched.repo
-[appstream-matched]
-name=Rocky Linux \$releasever - AppStream - Matched
-baseurl=$REPO_URL
-gpgcheck=1
-enabled=1
-countme=1
-metadata_expire=6h
-gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-Rocky-9
-EOF
-    fi
-    sudo yum install -y pciutils gcc make wget yum-utils
-    local KERNEL_PACKAGE="kernel-devel-$(uname -r)"
-    if [[ $ID == rocky && "$MAJOR_VERSION_ID" == 9 && "$STATUS_CODE" == "403" ]]; then
-      wget https://dl.rockylinux.org/vault/rocky/$VERSION_ID/AppStream/x86_64/os/Packages/k/${KERNEL_PACKAGE}.rpm
-      KERNEL_PACKAGE=${KERNEL_PACKAGE}.rpm
-    fi
-
-    sudo yum install -y $KERNEL_PACKAGE
-    # Installing latest version of NVIDIA CUDA and driver
-    local CUDA_VERSION=12.9.0
-    local CUDA_BUNDLED_DRIVER_VERSION=575.51.03
-    echo "Installing CUDA Toolkit $CUDA_VERSION from CUDA installer with bundled driver $CUDA_BUNDLED_DRIVER_VERSION"
-    curl -fSsl -O https://developer.download.nvidia.com/compute/cuda/$CUDA_VERSION/local_installers/cuda_${CUDA_VERSION}_${CUDA_BUNDLED_DRIVER_VERSION}_linux.run
-    sudo sh cuda_${CUDA_VERSION}_${CUDA_BUNDLED_DRIVER_VERSION}_linux.run --silent
-    verify_driver
-}
-
-setup_repo() {
-    # Enable EPEL (Extra Packages for Enterprise Linux) for packages such as DKMS
-    # Ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/#prepare-rhel-9-rocky-9
-    sudo yum install -y yum-utils epel-release
-    sudo yum-config-manager \
-        --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel$MAJOR_VERSION_ID/x86_64/cuda-rhel$MAJOR_VERSION_ID.repo
-    sudo yum clean all
-}
-
-install_cuda_from_package_manager() {
-    setup_repo
-    install_driver_package
-    # TODO(b/377558109): remove the temporary fix once the repo is updated
-    sudo yum -y install cuda-toolkit-12-9 cuda-demo*
-    verify_driver
-}
-
-remove_cuda_package() {
-    # Ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#removing-cuda-toolkit-and-driver
-    sudo yum -y remove "cuda*" "*cublas*" "*cufft*" "*cufile*" "*curand*" \
-        "*cusolver*" "*cusparse*" "*gds-tools*" "*npp*" "*nvjpeg*" "nsight*" \
-        "*nvvm*"
-}
-
-install_dcgm() {
-    # Ref: https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/getting-started.html#rhel-centos-rocky-linux
-    setup_repo
-    sudo yum install -y datacenter-gpu-manager
-    sudo systemctl --now enable nvidia-dcgm
-
-    # check DCGM service running and load profiling module
-    dcgmi discovery --list
-}
-
-try_install() {
-    # Export all functions for the bash subprocess
-    eval "$(declare -F | sed 's/ -f / -fx /')"
-    export ID MAJOR_VERSION_ID VERSION_ID
-    for install_method in "$@"; do
-        echo "Installing NVIDIA driver and CUDA with $install_method..."
-        # Can't use a subshell because of https://lists.gnu.org/archive/html/bug-bash/2012-12/msg00094.html
-        bash -$- -c $install_method && {
-            echo "NVIDIA driver and CUDA has been installed successfully with $install_method."
-            return 0
-        }
-    done
-    echo "NVIDIA driver and CUDA cannot be installed; all installation methods failed."
-    return 1
-}
-
-handle_rhel9() {
-    install_driver_package() {
-        # Ref: https://developer.nvidia.com/cuda-12-9-0-download-archive?target_os=Linux&target_arch=x86_64&Distribution=RHEL&target_version=8&target_type=rpm_network
-        sudo yum -y module install nvidia-driver:575-dkms
-    }
-}
-
-handle_common() {
-    install_driver_package() {
-        # Ref: https://developer.nvidia.com/cuda-12-2-2-download-archive?target_os=Linux&target_arch=x86_64&Distribution=RHEL&target_version=8&target_type=rpm_network
-        sudo yum -y module install nvidia-driver
-    }
-}
-
-remove_driver_package() {
-    # Ref: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#removing-cuda-toolkit-and-driver
-    sudo yum -y module remove --all nvidia-driver
-}
-
-case "$MAJOR_VERSION_ID" in
-    9) handle_rhel9;;
-    *) handle_common;;
-esac
-try_install install_cuda_from_package_manager install_cuda_from_runfile
-install_dcgm
+# Check that the DCGM service is running and load the profiling module
+dcgmi discovery --list
diff --git a/integration_test/third_party_apps_test/applications/dcgm/debian_ubuntu/install b/integration_test/third_party_apps_test/applications/dcgm/debian_ubuntu/install
@@ -1,50 +1,22 @@
 set -e
 source /etc/os-release
-
-sudo apt update
-KERNEL_VERSION=`uname -r`
-sudo apt install -y linux-headers-${KERNEL_VERSION} software-properties-common pciutils gcc make dkms wget
-
-# Install CUDA and driver the same way as the nvml app
-# Prefer to install from the package manager since it is normally faster and has
-# less errors on installation; fallback to the runfile method if the package
-# manager's package is not working or not compitible with the GPU model
 DISTRIBUTION=$(echo $ID$VERSION_ID | sed -e 's/\.//g')
-# Need to add the keyring for installing CUDA and DCGM
-wget --no-verbose https://developer.download.nvidia.com/compute/cuda/repos/${DISTRIBUTION}/x86_64/cuda-keyring_1.1-1_all.deb
-sudo dpkg -i cuda-keyring_1.1-1_all.deb
-echo "Installing latest version of NVIDIA CUDA and driver"
-if [[ $ID == debian ]]; then
-    sudo add-apt-repository contrib
-fi
-sudo apt update
 
-DEVICE_CODE=$(lspci -n | grep -Po '10de:[\w\d]{4}')
-case $DEVICE_CODE in
-    # V100 | P4 | P100
-    # Device PCIe ID lookup: https://envytools.readthedocs.io/en/latest/hw/pciid.html
-    10de:1db1|10de:1bb3|10de:15f8)
-        # For GPUs older than Turing (Volta: V100, Pascal: P4, P100):
-        # 1. R580 is the last driver branch to support the Pascal (P4 and P100) and Volta architecture (V100).
-        # https://docs.cloud.google.com/compute/docs/gpus/install-drivers-gpu#recommended-driver-branches
-        # 2. They need proprietary kernel modules, not the open kernel modules (nvidia-open-*)
-        sudo apt -y install nvidia-driver-575
-        sudo apt -y install cuda-12-9
-        ;;
-    *)
-        # For newer GPUs, install the latest version
-        if [[ $ID == debian && "${VERSION_ID}" == 11 ]]; then
-            # cuda-12-6 is the latest version that supports Debian 11
-            sudo apt -y install cuda-12-6
-        else
-            sudo apt -y install nvidia-driver-575
-            sudo apt -y install cuda-12-9
-        fi
-        ;;
-esac
+# Workaround for the broken bullseye-backports repo: comment it out so that apt update does not fail
+if [[ "$DISTRIBUTION" == "debian11" ]]; then 
+    if grep -q "bullseye-backports" /etc/apt/sources.list /etc/apt/sources.list.d/* 2>/dev/null; then
+        sudo sed -i '/bullseye-backports/s/^/#/' /etc/apt/sources.list
+        sudo sed -i '/bullseye-backports/s/^/#/' /etc/apt/sources.list.d/*.list
+    fi
+fi
 
-# check NVIDIA driver installation succeeded
-nvidia-smi
+if ! dpkg -s cuda-keyring >/dev/null 2>&1; then
+    filename="cuda-keyring_1.1-1_all.deb"
+    url="https://developer.download.nvidia.com/compute/cuda/repos/${DISTRIBUTION}/x86_64/${filename}"
+
+    wget --no-verbose "$url"
+    sudo dpkg -i "$filename"
+fi
 
 # Install DCGM
 sudo apt-get update

diff --git a/integration_test/third_party_apps_test/applications/dcgm/exercise b/integration_test/third_party_apps_test/applications/dcgm/exercise
@@ -1,6 +1,18 @@
 set -e
 
-# Run the bandwidthTest demo with a large range to create a process that uses
+# Run gpu-burn for 180 seconds to create a process that uses the
 # GPU for a period that is longer than default collection interval of 60s
-/usr/local/cuda/extras/demo_suite/bandwidthTest --memory=pinned --mode=range \
-  --start=1024 --end=20480 --increment=1
+git clone https://github.com/wilicc/gpu-burn
+cd gpu-burn
+DEVICE_CODE=$(lspci -n | grep -Po '10de:[\w\d]{4}')
+case $DEVICE_CODE in
+    # V100 | P4 | P100
+    # Device PCIe ID lookup: https://envytools.readthedocs.io/en/latest/hw/pciid.html
+    10de:1db1|10de:1bb3|10de:15f8)
+        make COMPUTE=60
+        ;;
+    *)
+        make
+        ;;
+esac
+./gpu_burn -d 180
diff --git a/integration_test/third_party_apps_test/applications/dcgm/metadata.yaml b/integration_test/third_party_apps_test/applications/dcgm/metadata.yaml
@@ -23,28 +23,31 @@ configure_integration: |-
   You must install DCGM and run the DCGM daemon service.
 supported_operating_systems: linux
 supported_app_version: ["3.1"]
-gpu_platforms: # p4, p100 don't emit DCGM profiling metrics
+gpu_platforms:
+  # Platform selection: the most common distros (Ubuntu/Debian) cover every GPU model, and one easy-to-access GPU model (L4) covers every distro.
+  # Debian 11 has the proprietary version of the driver that supports older GPUs (V100, P100, P4).
+  # P4 and P100 don't emit DCGM profiling metrics.
   - model: a100
     platforms:
       - ubuntu-os-cloud:ubuntu-2204-lts
   - model: v100
     platforms:
-      - ubuntu-os-cloud:ubuntu-2204-lts
+      - debian-cloud:debian-11
   - model: t4
     platforms:
       - ubuntu-os-cloud:ubuntu-2204-lts
   - model: l4
     platforms:
       - debian-cloud:debian-11
-      - ml-images:common-gpu-debian-11-py310
+      - debian-cloud:debian-12
+      # DCGM 3 is not available on debian-cloud:debian-13
       - rocky-linux-cloud:rocky-linux-8
       - rocky-linux-cloud:rocky-linux-9
       - suse-cloud:sles-15
       - ubuntu-os-cloud:ubuntu-2204-lts
       - ubuntu-os-cloud:ubuntu-2404-lts-amd64
   - model: h100
-    platforms:
-      - ubuntu-os-cloud:ubuntu-minimal-2204-lts # due to H100 quota, choose an image from the exhaustive list to skip presubmits
+    platforms: [] # No platforms listed: H100 quota is needed to run these tests
 expected_metrics:
   - type: workload.googleapis.com/gpu.dcgm.utilization
     value_type: DOUBLE

diff --git a/integration_test/third_party_apps_test/applications/dcgm/sles/install b/integration_test/third_party_apps_test/applications/dcgm/sles/install
@@ -1,22 +1,6 @@
 set -e
 
-sudo zypper --non-interactive install -y kernel-default-devel=$(uname -r | sed 's/\-default//') pciutils gcc make wget
-
-# Install CUDA and driver the same way as the nvml app
-# Prefer to install from the package manager since it is normally faster and has
-# less errors on installation; fallback to the runfile method if the package
-# manager's package is not working or not compitible with the GPU model
-DISTRIBUTION=$(. /etc/os-release;echo $ID$VERSION_ID | sed -e 's/\.[0-9]//')
-# Need to add the repo for installing CUDA and DCGM
-sudo zypper --non-interactive ar http://developer.download.nvidia.com/compute/cuda/repos/${DISTRIBUTION}/x86_64/cuda-${DISTRIBUTION}.repo
-sudo zypper --gpg-auto-import-keys --non-interactive refresh
-echo "Installing latest version of NVIDIA CUDA and driver"
-sudo zypper --non-interactive install -y nvidia-compute-utils-G06
-sudo zypper --non-interactive install -y cuda-12-9
-
-# check NVIDIA driver installation succeeded
-nvidia-smi
-
+# DCGM and the CUDA toolkit share the same NVIDIA repo, which has already been configured in the image.
 # Install DCGM
 sudo zypper --non-interactive install datacenter-gpu-manager
 sudo systemctl --now enable nvidia-dcgm