Skip to content
Draft
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions cloudbuild/gpu-image-builder/build_packer_builder.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/bin/bash
# build_packer_builder.sh
# Builds the custom Packer Cloud Build builder image if it doesn't already
# exist in the project's container registry.
# https://docs.cloud.google.com/build/docs/building/build-vm-images-with-packer
#
# Usage: build_packer_builder.sh <PROJECT_ID>

set -euo pipefail

# Fail fast with a clear message when the project id argument is missing.
PROJECT_ID="${1:?usage: build_packer_builder.sh <PROJECT_ID>}"
readonly PACKER_BUILDER_IMAGE="gcr.io/${PROJECT_ID}/packer"

if gcloud container images describe "${PACKER_BUILDER_IMAGE}" > /dev/null 2>&1; then
  echo "Packer builder image '${PACKER_BUILDER_IMAGE}' exists, skipping build."
else
  echo "Packer builder image not found. Building it now..."
  # Clone into a throwaway directory so repeated runs don't collide with a
  # stale checkout left in the working directory, and clean it up on any exit.
  clone_dir="$(mktemp -d)"
  trap 'rm -rf -- "${clone_dir}"' EXIT
  git clone https://github.com/GoogleCloudPlatform/cloud-builders-community.git --depth=1 "${clone_dir}"
  # Submit from inside the packer builder directory; a subshell keeps the
  # caller's working directory untouched (replaces the fragile `cd -`).
  (
    cd "${clone_dir}/packer"
    gcloud builds submit --project="${PROJECT_ID}" .
  )
  echo "Packer builder image built."
fi
41 changes: 41 additions & 0 deletions cloudbuild/gpu-image-builder/check_source_image.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/bin/bash
# check_source_image.sh
# Checks if the latest public image is newer than the source of our last build
# and decides whether a new build is needed. Writes "RUN" or "SKIP" to
# /workspace/build_status.txt for later Cloud Build steps, and the chosen
# source image name to /workspace/new_source_image.txt when running.
#
# Usage: check_source_image.sh <PROJECT_ID> <SOURCE_IMAGE_FAMILY> \
#          <SOURCE_IMAGE_PROJECT> <TARGET_IMAGE_FAMILY> <LOUHI_TRIGGER_TYPE>

set -euo pipefail

PROJECT_ID="${1:?missing PROJECT_ID}"
SOURCE_IMAGE_FAMILY="${2:?missing SOURCE_IMAGE_FAMILY}"
SOURCE_IMAGE_PROJECT="${3:?missing SOURCE_IMAGE_PROJECT}"
TARGET_IMAGE_FAMILY="${4:?missing TARGET_IMAGE_FAMILY}"
# Louhi sets the trigger type to either "cron-trigger" or "git-change-trigger".
LOUHI_TRIGGER_TYPE="${5:?missing LOUHI_TRIGGER_TYPE}"

echo "--- Checking for New Source Image ---"
LATEST_PUBLIC_IMAGE=$(gcloud compute images describe-from-family "${SOURCE_IMAGE_FAMILY}" --project="${SOURCE_IMAGE_PROJECT}" --format="value(name)")
echo "Latest available public image: ${LATEST_PUBLIC_IMAGE}"

# Look up the source image recorded as a label on our most recent curated
# image. A single describe call both tests for family existence and fetches
# the label (the original called gcloud twice for this).
if LAST_CURATED_SOURCE_IMAGE=$(gcloud compute images describe-from-family "${TARGET_IMAGE_FAMILY}" --project="${PROJECT_ID}" --format="value(labels.source-gce-image)" 2> /dev/null); then
  echo "Source image of our latest curated image: ${LAST_CURATED_SOURCE_IMAGE}"
else
  LAST_CURATED_SOURCE_IMAGE=""
  echo "Image family '${TARGET_IMAGE_FAMILY}' not found. Assuming this is the first build."
fi

# Only skip when running nightly, and there is no new base image
if [[ "${LATEST_PUBLIC_IMAGE}" == "${LAST_CURATED_SOURCE_IMAGE}" ]] && \
   [[ "${LOUHI_TRIGGER_TYPE}" == "cron-trigger" ]]; then
  echo "Source image '${LATEST_PUBLIC_IMAGE}' has not changed. Signaling to skip build."
  echo "SKIP" > /workspace/build_status.txt
# Else, we either have a new image, or this is triggered by git changes.
# Note that we set the Louhi Git trigger to only watch the
# cloudbuild/gpu-image-builder directory.
else
  if [[ "${LATEST_PUBLIC_IMAGE}" != "${LAST_CURATED_SOURCE_IMAGE}" ]]; then
    echo "New source image '${LATEST_PUBLIC_IMAGE}' detected or first run. Signaling to run build."
  else
    echo "New image building triggered by GitHub changes (Louhi trigger type = '${LOUHI_TRIGGER_TYPE}')"
  fi
  echo "${LATEST_PUBLIC_IMAGE}" > /workspace/new_source_image.txt
  echo "RUN" > /workspace/build_status.txt
fi
52 changes: 52 additions & 0 deletions cloudbuild/gpu-image-builder/cloudbuild.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# cloudbuild.yaml
# Cloud Build pipeline that builds curated GPU GCE images with Packer.
steps:
  # Check for new source image. Runs 'check_source_image.sh', which writes
  # RUN/SKIP to /workspace/build_status.txt and the chosen source image name
  # to /workspace/new_source_image.txt for the packer step below.
  - id: 'check-source-image'
    name: 'gcr.io/cloud-builders/gcloud'
    entrypoint: 'bash'
    args:
      - '-c'
      - |
        # chmod must target the same file we execute below.
        chmod +x /workspace/louhi_ws/ops-agent/cloudbuild/gpu-image-builder/check_source_image.sh
        /workspace/louhi_ws/ops-agent/cloudbuild/gpu-image-builder/check_source_image.sh "${PROJECT_ID}" \
          "${_LOUHI_PARAM_SOURCE_IMAGE_FAMILY}" \
          "${_LOUHI_PARAM_SOURCE_IMAGE_PROJECT}" \
          "${_LOUHI_PARAM_OUTPUT_IMAGE_FAMILY}" \
          "${_LOUHI_TRIGGER_TYPE}"
    waitFor: ['-']

  # Conditionally build the Packer builder image. Runs 'build_packer_builder.sh'.
  - id: 'build-packer-builder'
    name: 'gcr.io/cloud-builders/gcloud'
    entrypoint: 'bash'
    args:
      - '-c'
      - |
        chmod +x /workspace/louhi_ws/ops-agent/cloudbuild/gpu-image-builder/build_packer_builder.sh
        /workspace/louhi_ws/ops-agent/cloudbuild/gpu-image-builder/build_packer_builder.sh "${PROJECT_ID}"
    waitFor: ['-'] # Can run in parallel with check-source-image

  # Run Packer to build the GCE image, but only if 'check-source-image'
  # signaled to RUN.
  - id: 'packer-build-gpu-image'
    name: 'gcr.io/${PROJECT_ID}/packer'
    entrypoint: 'bash'
    args:
      - '-c'
      - |
        if [[ "$(cat /workspace/build_status.txt)" == "SKIP" ]]; then
          echo "Skipping Packer build as source image has not changed."
          exit 0
        fi

        /usr/bin/packer build \
          -var "project_id=${PROJECT_ID}" \
          -var "image_name=${_LOUHI_PARAM_OUTPUT_IMAGE_FAMILY}-$(date -u +%Y%m%d-%H%M%S)" \
          -var "image_family=${_LOUHI_PARAM_OUTPUT_IMAGE_FAMILY}" \
          -var "source_image=$(cat /workspace/new_source_image.txt)" \
          -var "source_image_project=${_LOUHI_PARAM_SOURCE_IMAGE_PROJECT}" \
          -var "zone=us-central1-a" \
          -var "build_id=${BUILD_ID}" \
          /workspace/louhi_ws/ops-agent/cloudbuild/gpu-image-builder/packer.pkr.hcl
    waitFor: ['check-source-image', 'build-packer-builder']

timeout: 14400s
77 changes: 77 additions & 0 deletions cloudbuild/gpu-image-builder/packer.pkr.hcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
// packer.pkr.hcl
// Input variables for the GPU image build. All are supplied via -var flags
// by the 'packer-build-gpu-image' Cloud Build step.

variable "project_id" {
  type        = string
  description = "GCP Project ID"
}

variable "image_name" {
  type        = string
  description = "Name of the created GCE image"
}

variable "image_family" {
  type        = string
  description = "Image family for the created GCE image"
}

variable "source_image" {
  type        = string
  description = "The specific source GCE image name (e.g., ubuntu-2204-jammy-v20240115)"
}

variable "source_image_project" {
  type        = string
  description = "The specific source GCE image project (e.g., ubuntu-os-cloud)"
}

variable "zone" {
  type        = string
  default     = "us-central1-a"
  description = "GCP zone for the temporary build instance"
}

variable "build_id" {
  type        = string
  description = "Cloud Build ID for traceability"
  // "manual" marks images built outside of Cloud Build (no BUILD_ID).
  default     = "manual"
}

// Temporary GCE VM that Packer boots, provisions, and snapshots into the
// curated image.
source "googlecompute" "gpu_image" {
  project_id              = var.project_id
  zone                    = var.zone
  source_image            = var.source_image
  source_image_project_id = [var.source_image_project]
  image_name              = var.image_name
  image_family            = var.image_family
  ssh_username            = "packer"
  disk_size               = 50
  disk_type               = "pd-standard"
  machine_type            = "n1-standard-4" // Use a standard VM for building, no GPU needed here
  tags                    = ["packer-build"]

  // *** IMPORTANT: Label the created image with its source image ***
  // check_source_image.sh reads the 'source-gce-image' label off the newest
  // family member to decide whether a rebuild is needed.
  image_labels = {
    source-gce-image = "${var.source_image}"
    built-by         = "louhi"
    cloud-build-id   = "${var.build_id}"
  }
}

build {
  sources = ["source.googlecompute.gpu_image"]

  // Script paths are resolved relative to the template's directory via
  // ${path.root}. Relative "./scripts/..." paths would resolve against the
  // packer process's working directory instead, which in Cloud Build is
  // /workspace, not this template's directory.

  // Provisioner 1: Most distros only need one step.
  provisioner "shell" {
    script            = "${path.root}/scripts/${var.image_family}/setup_vm.sh"
    expect_disconnect = true // Expect a disconnect/reboot after GPU driver install
    timeout           = "240m"
  }

  // Provisioner 2: Handles the post-reboot part, ONLY for Debian 12.
  // Other families run the no-op placeholder script.
  provisioner "shell" {
    script            = var.image_family == "debian-12" ? "${path.root}/scripts/${var.image_family}/post_reboot.sh" : "${path.root}/scripts/noop.sh"
    pause_before      = "60s" // Wait for the reboot to be complete
    expect_disconnect = false // No reboot expected in this second phase.
    timeout           = "240m"
  }
}
10 changes: 10 additions & 0 deletions cloudbuild/gpu-image-builder/scripts/debian-11/setup_vm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/bash
# setup_vm.sh - Provisioning script for Packer, executed via Shell Provisioner.
set -euo pipefail

# Source Image: ml-images:common-gpu-debian-11-py310
# Source Image description: Google, Deep Learning VM with CUDA 11.8, M126, Debian 11, Python 3.10. With CUDA 11.8 preinstalled.
# Output Image: stackdriver-test-143416:debian-11

# DLVM images ship a helper that installs the GPU driver and CUDA toolkit;
# delegate the whole setup to it.
readonly DLVM_DRIVER_INSTALLER="/opt/deeplearning/install-driver.sh"
"${DLVM_DRIVER_INSTALLER}"
12 changes: 12 additions & 0 deletions cloudbuild/gpu-image-builder/scripts/debian-12/post_reboot.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/bin/bash
# post_reboot.sh - Runs setup steps after the VM has rebooted on Debian 12. Provisioning script for Packer, executed via Shell Provisioner.
set -euo pipefail

readonly INSTALLER_DIR="/var/lib/cuda-installer"
readonly CUDA_INSTALLER_PATH="${INSTALLER_DIR}/cuda_installer.pyz"

# Run one cuda_installer.pyz subcommand with the standard flags; abort the
# provisioning run with a descriptive message on failure.
run_cuda_installer() {
  local subcommand="$1"
  if ! sudo python3 "${CUDA_INSTALLER_PATH}" "${subcommand}" --ignore-no-gpu --installation-mode=repo --installation-branch=nfb; then
    echo "ERROR: cuda_installer.pyz ${subcommand} failed!"
    exit 1
  fi
}

# Rerun `install_driver` to finish driver installation
run_cuda_installer install_driver

# Install CUDA toolkit
run_cuda_installer install_cuda
18 changes: 18 additions & 0 deletions cloudbuild/gpu-image-builder/scripts/debian-12/setup_vm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/bin/bash
# setup_vm.sh - Provisioning script for Packer, executed via Shell Provisioner.
# Downloads Google's cuda_installer.pyz onto a stock Debian 12 image and starts
# the driver install; the installer reboots the VM, and post_reboot.sh finishes
# the job afterwards.
set -euo pipefail

# Source Image: debian-cloud:debian-12
# Output Image: stackdriver-test-143416:debian-12

sudo apt update -y
sudo apt install -y --no-install-recommends python3 python3-pip wget curl gnupg git || { echo "ERROR: Failed to install prerequisites!"; exit 1; }

INSTALLER_DIR="/var/lib/cuda-installer"
CUDA_INSTALLER_PATH="${INSTALLER_DIR}/cuda_installer.pyz"
sudo mkdir -p "${INSTALLER_DIR}"
# -f (--fail) makes curl exit non-zero on an HTTP error instead of saving the
# error page, which would otherwise be chmod'ed and executed as the installer.
sudo curl -fL https://storage.googleapis.com/compute-gpu-installation-us/installer/latest/cuda_installer.pyz --output "${CUDA_INSTALLER_PATH}"
sudo chmod +x "${CUDA_INSTALLER_PATH}"

sudo python3 "${CUDA_INSTALLER_PATH}" install_driver --ignore-no-gpu --installation-mode=repo --installation-branch=nfb || { echo "ERROR: cuda_installer.pyz install_driver failed!"; exit 1; }
# The script will reboot
18 changes: 18 additions & 0 deletions cloudbuild/gpu-image-builder/scripts/debian-13/setup_vm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/bin/bash
# setup_vm.sh - Provisioning script for Packer, executed via Shell Provisioner.
set -euo pipefail

# Source Image: debian-cloud:debian-13
# Output Image: stackdriver-test-143416:debian-13

# Install driver and CUDA toolkit
sudo apt update -y
# $(...) instead of legacy backticks; quote the package spec so the kernel
# release string can never be word-split.
KERNEL_VERSION="$(uname -r)"
sudo apt install -y "linux-headers-${KERNEL_VERSION}" pciutils gcc make dkms wget git

# Register NVIDIA's Debian 13 CUDA repository via its signed keyring package.
wget https://developer.download.nvidia.com/compute/cuda/repos/debian13/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt update

sudo apt -y install cuda-13-1

2 changes: 2 additions & 0 deletions cloudbuild/gpu-image-builder/scripts/noop.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#!/bin/bash
# noop.sh - Intentionally empty placeholder, used where a Packer shell
# provisioner requires a script but there is nothing to do (e.g. the
# post-reboot step on non-Debian-12 families).
14 changes: 14 additions & 0 deletions cloudbuild/gpu-image-builder/scripts/rocky-linux-8/setup_vm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/bin/bash
# setup_vm.sh - Provisioning script for Packer, executed via Shell Provisioner.
set -euo pipefail

# Source Image: rocky-linux-accelerator-cloud:rocky-linux-8-optimized-gcp-nvidia-580
# Source Image Description: Rocky Linux, Rocky Linux, 8 with the Nvidia 580 driver, x86_64 optimized for GCP built on {date}
# Output Image: stackdriver-test-143416:rocky-linux-8

# The accelerator image already ships the R580 driver, so only the matching
# CUDA 13.0 toolkit (without driver) is installed here, following
# https://developer.nvidia.com/cuda-13-0-0-download-archive?target_os=Linux&target_arch=x86_64&Distribution=Rocky&target_version=8&target_type=rpm_network
readonly CUDA_REPO_URL="https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo"

sudo dnf config-manager --add-repo "${CUDA_REPO_URL}"
sudo dnf clean all
sudo dnf -y install cuda-toolkit-13-0 git make
14 changes: 14 additions & 0 deletions cloudbuild/gpu-image-builder/scripts/rocky-linux-9/setup_vm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/bin/bash
# setup_vm.sh - Provisioning script for Packer, executed via Shell Provisioner.
set -euo pipefail

# Source Image: rocky-linux-accelerator-cloud:rocky-linux-9-optimized-gcp-nvidia-580
# Source Image Description: Rocky Linux, Rocky Linux, 9 with the Nvidia 580 driver, x86_64 optimized for GCP with the Nvidia 580 driver built on {date}
# Output Image: stackdriver-test-143416:rocky-linux-9

# The accelerator image already ships the R580 driver, so only the matching
# CUDA 13.0 toolkit (without driver) is installed here, following
# https://developer.nvidia.com/cuda-13-0-0-download-archive?target_os=Linux&target_arch=x86_64&Distribution=Rocky&target_version=9&target_type=rpm_network
readonly CUDA_REPO_URL="https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo"

sudo dnf config-manager --add-repo "${CUDA_REPO_URL}"
sudo dnf clean all
sudo dnf -y install cuda-toolkit-13-0 git make
56 changes: 56 additions & 0 deletions cloudbuild/gpu-image-builder/scripts/sles-15/setup_vm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/bin/bash
# setup_vm.sh - Provisioning script for Packer, executed via Shell Provisioner.
set -euo pipefail

# Source Image: suse-cloud:sles-15
# Output Image: stackdriver-test-143416:sles-15

# Mimic our prepareSLES() logic in gce_testing.go
# https://github.com/GoogleCloudPlatform/opentelemetry-operations-collector/blob/ec757f2f48c865c7aa1afaed27891d8727a28f2e/integration_test/gce-testing-internal/gce/gce_testing.go#L1057
#######################################
# Run a command string up to a maximum number of attempts, sleeping between
# failures. Exits the whole script (status 1) if every attempt fails, which
# top-level callers rely on to abort provisioning.
# Arguments:
#   $1 - max_attempts: maximum number of attempts (>= 1)
#   $2 - sleep_time:   seconds to wait between failed attempts
#   $3 - cmd:          command string; run via `bash -c` so compound commands
#                      (&&, pipes) work
# Returns: 0 on first success; exits 1 after max_attempts failures.
#######################################
retry_command() {
  local max_attempts="$1"
  local sleep_time="$2"
  local cmd="$3"
  local attempt  # local loop counter; the original leaked a global `i`

  echo "Starting command: $cmd"
  echo "----------------------------------------"

  for ((attempt = 1; attempt <= max_attempts; attempt++)); do
    echo "[Attempt $attempt/$max_attempts] Running..."

    # Run the command using bash -c to handle complex commands (like those with &&)
    if bash -c "$cmd"; then
      echo "----------------------------------------"
      echo "Success!"
      return 0
    fi

    echo "Attempt failed."

    # Sleep only if we have attempts left; arithmetic context and quoting
    # replace the original's unquoted `[ $i -lt $max_attempts ]`.
    if ((attempt < max_attempts)); then
      echo "Waiting $sleep_time seconds before retrying..."
      sleep "$sleep_time"
    fi
  done

  echo "----------------------------------------"
  echo "Error: Command failed after $max_attempts attempts."
  exit 1
}

# Register the SLES guest and wait until zypper metadata is refreshable;
# registration can be flaky right after boot, hence the generous retries.
retry_command 5 5 "sudo /usr/sbin/registercloudguest --force"
retry_command 120 5 "sudo zypper --non-interactive --gpg-auto-import-keys refresh && sudo zypper --non-interactive install --force coreutils"

# Kernel headers matching the running kernel, plus build tools. The computed
# package spec is quoted so the kernel-release string cannot be word-split.
sudo zypper --non-interactive install -y "kernel-default-devel=$(uname -r | sed 's/\-default//')" pciutils gcc make wget git

# Install CUDA and driver together, since the `exercise` script needs to run a
# CUDA app to generating GPU process metrics
# Prefer to install from the package manager since it is normally faster and has
# less errors on installation. The cuda-12-9 mega-package installs driver and
# CUDA together
sudo zypper --non-interactive addrepo https://developer.download.nvidia.com/compute/cuda/repos/sles15/x86_64/cuda-sles15.repo
sudo zypper --gpg-auto-import-keys --non-interactive refresh
# CUDA 13 is not yet working with the SLES 15 image
sudo zypper --non-interactive install -y nvidia-compute-utils-G06
sudo zypper --non-interactive install -y cuda-12-9
15 changes: 15 additions & 0 deletions cloudbuild/gpu-image-builder/scripts/ubuntu-2204-lts/setup_vm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/bin/bash
# setup_vm.sh - Provisioning script for Packer, executed via Shell Provisioner.
set -euo pipefail

# Source Image: ubuntu-os-accelerator-images:ubuntu-accelerator-2204-amd64-with-nvidia-580
# Source Image Description: Canonical, Ubuntu, 22.04 LTS NVIDIA version: 580, amd64 jammy image built on {date}
# Output Image: stackdriver-test-143416:ubuntu-2204-lts

# The accelerator image already ships the R580 driver, so only the matching
# CUDA 13.0 toolkit (without driver) is installed here, following
# https://developer.nvidia.com/cuda-13-0-0-download-archive?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=22.04&target_type=deb_network
readonly KEYRING_DEB="cuda-keyring_1.1-1_all.deb"
readonly KEYRING_URL="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/${KEYRING_DEB}"

wget "${KEYRING_URL}"
sudo dpkg -i "${KEYRING_DEB}"
sudo apt-get update
sudo apt-get -y install build-essential cuda-toolkit-13-0
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/bin/bash
# setup_vm.sh - Provisioning script for Packer, executed via Shell Provisioner.
set -euo pipefail

# Source Image: ubuntu-os-accelerator-images:ubuntu-accelerator-2404-amd64-with-nvidia-580
# Source Image Description: Canonical, Ubuntu, 24.04 LTS NVIDIA version: 580, amd64 noble image built on {date}
# Output Image: stackdriver-test-143416:ubuntu-2404-lts

# The accelerator image already ships the R580 driver, so only the matching
# CUDA 13.0 toolkit (without driver) is installed here, following
# https://developer.nvidia.com/cuda-13-0-0-download-archive?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=24.04&target_type=deb_network
readonly KEYRING_DEB="cuda-keyring_1.1-1_all.deb"
readonly KEYRING_URL="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/${KEYRING_DEB}"

wget "${KEYRING_URL}"
sudo dpkg -i "${KEYRING_DEB}"
sudo apt-get update
sudo apt-get -y install build-essential cuda-toolkit-13-0
Loading