Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
integrate gpu-driver-util into the driver images
Browse files Browse the repository at this point in the history
Signed-off-by: Tariq Ibrahim <tibrahim@nvidia.com>
tariq1890 committed Jan 14, 2025
1 parent 0f4ecad commit c1569fa
Showing 12 changed files with 182 additions and 27 deletions.
8 changes: 5 additions & 3 deletions rhel8/Dockerfile
Original file line number Diff line number Diff line change
@@ -17,9 +17,10 @@ ENV PATH /usr/local/go/bin:$PATH
WORKDIR /work

RUN git clone https://github.com/NVIDIA/gpu-driver-container driver && \
cd driver/vgpu/src && \
go build -o vgpu-util && \
mv vgpu-util /work
go build -C driver/vgpu/src -o vgpu-util && \
mv driver/vgpu/src/vgpu-util /work && \
go build -C driver/gpu-driver-util -o gpu-driver-util && \
mv driver/gpu-driver-util/gpu-driver-util /work

FROM nvcr.io/nvidia/cuda:12.6.3-base-ubi8

@@ -84,6 +85,7 @@ COPY ocp_dtk_entrypoint /usr/local/bin
COPY common.sh /usr/local/bin

COPY --from=build /work/vgpu-util /usr/local/bin
COPY --from=build /work/gpu-driver-util /usr/local/bin

WORKDIR /drivers

23 changes: 20 additions & 3 deletions rhel8/nvidia-driver
Original file line number Diff line number Diff line change
@@ -17,9 +17,7 @@ USE_HOST_MOFED="${USE_HOST_MOFED:-false}"
DNF_RELEASEVER=${DNF_RELEASEVER:-""}
RHEL_VERSION=${RHEL_VERSION:-""}
RHEL_MAJOR_VERSION=8

OPEN_KERNEL_MODULES_ENABLED=${OPEN_KERNEL_MODULES_ENABLED:-false}
[[ "${OPEN_KERNEL_MODULES_ENABLED}" == "true" ]] && KERNEL_TYPE=kernel-open || KERNEL_TYPE=kernel
KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto}

DRIVER_ARCH=${TARGETARCH/amd64/x86_64} && DRIVER_ARCH=${DRIVER_ARCH/arm64/aarch64}
echo "DRIVER_ARCH is $DRIVER_ARCH"
@@ -577,6 +575,24 @@ _start_vgpu_topology_daemon() {
nvidia-topologyd
}

# Resolve KERNEL_TYPE (the kernel module flavour to use) from KERNEL_MODULE_TYPE.
#
# Reads:
#   KERNEL_MODULE_TYPE - "proprietary", "open" or "auto"
#   DRIVER_BRANCH      - passed to gpu-driver-util via -b when autodetecting
# Sets:
#   KERNEL_TYPE        - "kernel" for proprietary, "kernel-open" for open, or the
#                        stdout of `gpu-driver-util get-kernel-module-type` for auto
# Returns:
#   0 on success, 1 when autodetection fails or KERNEL_MODULE_TYPE is not recognised.
_resolve_kernel_type() {
    case "${KERNEL_MODULE_TYPE}" in
        proprietary)
            KERNEL_TYPE=kernel
            ;;
        open)
            KERNEL_TYPE=kernel-open
            ;;
        auto)
            # Test the command directly rather than inspecting $? afterwards: with
            # errexit active, a failing assignment would abort the shell before the
            # diagnostic below could be printed.
            if ! KERNEL_TYPE=$(gpu-driver-util get-kernel-module-type -b "${DRIVER_BRANCH}"); then
                echo "cannot autodetect the kernel module type, printing error logs from /var/log/gpu-driver-util.log..."
                # Best effort: a missing log file must not mask the real failure.
                tail -n 3 /var/log/gpu-driver-util.log || true
                return 1
            fi
            ;;
        *)
            echo "invalid value for the KERNEL_MODULE_TYPE variable: ${KERNEL_MODULE_TYPE}"
            return 1
            ;;
    esac
}

_prepare() {
if [ "${DRIVER_TYPE}" = "vgpu" ]; then
_find_vgpu_driver_version || exit 1
@@ -797,5 +813,6 @@ if [ $# -ne 0 ]; then
fi

_resolve_rhel_version || exit 1
_resolve_kernel_type || exit 1

$command
8 changes: 5 additions & 3 deletions rhel9/Dockerfile
Original file line number Diff line number Diff line change
@@ -17,9 +17,10 @@ ENV PATH /usr/local/go/bin:$PATH
WORKDIR /work

RUN git clone https://github.com/NVIDIA/gpu-driver-container driver && \
cd driver/vgpu/src && \
go build -o vgpu-util && \
mv vgpu-util /work
go build -C driver/vgpu/src -o vgpu-util && \
mv driver/vgpu/src/vgpu-util /work && \
go build -C driver/gpu-driver-util -o gpu-driver-util && \
mv driver/gpu-driver-util/gpu-driver-util /work

FROM nvcr.io/nvidia/cuda:12.6.3-base-ubi9

@@ -78,6 +79,7 @@ COPY ocp_dtk_entrypoint /usr/local/bin
COPY common.sh /usr/local/bin

COPY --from=build /work/vgpu-util /usr/local/bin
COPY --from=build /work/gpu-driver-util /usr/local/bin

WORKDIR /drivers

23 changes: 20 additions & 3 deletions rhel9/nvidia-driver
Original file line number Diff line number Diff line change
@@ -17,9 +17,7 @@ USE_HOST_MOFED="${USE_HOST_MOFED:-false}"
DNF_RELEASEVER=${DNF_RELEASEVER:-""}
RHEL_VERSION=${RHEL_VERSION:-""}
RHEL_MAJOR_VERSION=9

OPEN_KERNEL_MODULES_ENABLED=${OPEN_KERNEL_MODULES_ENABLED:-false}
[[ "${OPEN_KERNEL_MODULES_ENABLED}" == "true" ]] && KERNEL_TYPE=kernel-open || KERNEL_TYPE=kernel
KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto}

DRIVER_ARCH=${TARGETARCH/amd64/x86_64} && DRIVER_ARCH=${DRIVER_ARCH/arm64/aarch64}
echo "DRIVER_ARCH is $DRIVER_ARCH"
@@ -571,6 +569,24 @@ _find_vgpu_driver_version() {
return 0
}

# Resolve KERNEL_TYPE (the kernel module flavour to use) from KERNEL_MODULE_TYPE.
#
# Reads:
#   KERNEL_MODULE_TYPE - "proprietary", "open" or "auto"
#   DRIVER_BRANCH      - passed to gpu-driver-util via -b when autodetecting
# Sets:
#   KERNEL_TYPE        - "kernel" for proprietary, "kernel-open" for open, or the
#                        stdout of `gpu-driver-util get-kernel-module-type` for auto
# Returns:
#   0 on success, 1 when autodetection fails or KERNEL_MODULE_TYPE is not recognised.
_resolve_kernel_type() {
    case "${KERNEL_MODULE_TYPE}" in
        proprietary)
            KERNEL_TYPE=kernel
            ;;
        open)
            KERNEL_TYPE=kernel-open
            ;;
        auto)
            # Test the command directly rather than inspecting $? afterwards: with
            # errexit active, a failing assignment would abort the shell before the
            # diagnostic below could be printed.
            if ! KERNEL_TYPE=$(gpu-driver-util get-kernel-module-type -b "${DRIVER_BRANCH}"); then
                echo "cannot autodetect the kernel module type, printing error logs from /var/log/gpu-driver-util.log..."
                # Best effort: a missing log file must not mask the real failure.
                tail -n 3 /var/log/gpu-driver-util.log || true
                return 1
            fi
            ;;
        *)
            echo "invalid value for the KERNEL_MODULE_TYPE variable: ${KERNEL_MODULE_TYPE}"
            return 1
            ;;
    esac
}

_start_vgpu_topology_daemon() {
type nvidia-topologyd > /dev/null 2>&1 || return 0
echo "Starting nvidia-topologyd.."
@@ -797,5 +813,6 @@ if [ $# -ne 0 ]; then
fi

_resolve_rhel_version || exit 1
_resolve_kernel_type || exit 1

$command
8 changes: 5 additions & 3 deletions ubuntu20.04/Dockerfile
Original file line number Diff line number Diff line change
@@ -28,9 +28,10 @@ ENV PATH /usr/local/go/bin:$PATH
WORKDIR /work

RUN git clone https://github.com/NVIDIA/gpu-driver-container driver && \
cd driver/vgpu/src && \
go build -o vgpu-util && \
mv vgpu-util /work
go build -C driver/vgpu/src -o vgpu-util && \
mv driver/vgpu/src/vgpu-util /work && \
go build -C driver/gpu-driver-util -o gpu-driver-util && \
mv driver/gpu-driver-util/gpu-driver-util /work

FROM nvcr.io/nvidia/cuda:12.6.3-base-ubuntu20.04

@@ -72,6 +73,7 @@ RUN /tmp/install.sh reposetup && /tmp/install.sh depinstall && \
COPY nvidia-driver /usr/local/bin

COPY --from=build /work/vgpu-util /usr/local/bin
COPY --from=build /work/gpu-driver-util /usr/local/bin

ADD drivers drivers/

25 changes: 23 additions & 2 deletions ubuntu20.04/nvidia-driver
Original file line number Diff line number Diff line change
@@ -16,8 +16,7 @@ NVIDIA_MODESET_MODULE_PARAMS=()
NVIDIA_PEERMEM_MODULE_PARAMS=()
TARGETARCH=${TARGETARCH:?"Missing TARGETARCH env"}

OPEN_KERNEL_MODULES_ENABLED=${OPEN_KERNEL_MODULES_ENABLED:-false}
[[ "${OPEN_KERNEL_MODULES_ENABLED}" == "true" ]] && KERNEL_TYPE=kernel-open || KERNEL_TYPE=kernel
KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto}

export DEBIAN_FRONTEND=noninteractive

@@ -477,6 +476,24 @@ _shutdown() {
return 1
}

# Resolve KERNEL_TYPE (the kernel module flavour to use) from KERNEL_MODULE_TYPE.
#
# Reads:
#   KERNEL_MODULE_TYPE - "proprietary", "open" or "auto"
#   DRIVER_BRANCH      - passed to gpu-driver-util via -b when autodetecting
# Sets:
#   KERNEL_TYPE        - "kernel" for proprietary, "kernel-open" for open, or the
#                        stdout of `gpu-driver-util get-kernel-module-type` for auto
# Returns:
#   0 on success, 1 when autodetection fails or KERNEL_MODULE_TYPE is not recognised.
_resolve_kernel_type() {
    case "${KERNEL_MODULE_TYPE}" in
        proprietary)
            KERNEL_TYPE=kernel
            ;;
        open)
            KERNEL_TYPE=kernel-open
            ;;
        auto)
            # Test the command directly rather than inspecting $? afterwards: with
            # errexit active, a failing assignment would abort the shell before the
            # diagnostic below could be printed.
            if ! KERNEL_TYPE=$(gpu-driver-util get-kernel-module-type -b "${DRIVER_BRANCH}"); then
                echo "cannot autodetect the kernel module type, printing error logs from /var/log/gpu-driver-util.log..."
                # Best effort: a missing log file must not mask the real failure.
                tail -n 3 /var/log/gpu-driver-util.log || true
                return 1
            fi
            ;;
        *)
            echo "invalid value for the KERNEL_MODULE_TYPE variable: ${KERNEL_MODULE_TYPE}"
            return 1
            ;;
    esac
}

_find_vgpu_driver_version() {
local count=""
local version=""
@@ -520,6 +537,8 @@ init() {
_find_vgpu_driver_version || exit 1
fi

_resolve_kernel_type || exit 1

# Install the userspace components and copy the kernel module sources.
sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run -x && \
cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION && \
@@ -592,6 +611,8 @@ update() {
fi
exec 3>&-

_resolve_kernel_type || exit 1

# vgpu driver version is chosen dynamically during runtime, so pre-compile modules for
# only non-vgpu driver types
if [ "${DRIVER_TYPE}" != "vgpu" ]; then
8 changes: 5 additions & 3 deletions ubuntu22.04/Dockerfile
Original file line number Diff line number Diff line change
@@ -28,9 +28,10 @@ ENV PATH /usr/local/go/bin:$PATH
WORKDIR /work

RUN git clone https://github.com/NVIDIA/gpu-driver-container driver && \
cd driver/vgpu/src && \
go build -o vgpu-util && \
mv vgpu-util /work
go build -C driver/vgpu/src -o vgpu-util && \
mv driver/vgpu/src/vgpu-util /work && \
go build -C driver/gpu-driver-util -o gpu-driver-util && \
mv driver/gpu-driver-util/gpu-driver-util /work

FROM nvcr.io/nvidia/cuda:12.6.3-base-ubuntu22.04

@@ -72,6 +73,7 @@ RUN /tmp/install.sh reposetup && /tmp/install.sh depinstall && \
COPY nvidia-driver /usr/local/bin

COPY --from=build /work/vgpu-util /usr/local/bin
COPY --from=build /work/gpu-driver-util /usr/local/bin

ADD drivers drivers/

26 changes: 23 additions & 3 deletions ubuntu22.04/nvidia-driver
Original file line number Diff line number Diff line change
@@ -15,9 +15,7 @@ NVIDIA_UVM_MODULE_PARAMS=()
NVIDIA_MODESET_MODULE_PARAMS=()
NVIDIA_PEERMEM_MODULE_PARAMS=()
TARGETARCH=${TARGETARCH:?"Missing TARGETARCH env"}

OPEN_KERNEL_MODULES_ENABLED=${OPEN_KERNEL_MODULES_ENABLED:-false}
[[ "${OPEN_KERNEL_MODULES_ENABLED}" == "true" ]] && KERNEL_TYPE=kernel-open || KERNEL_TYPE=kernel
KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto}

export DEBIAN_FRONTEND=noninteractive

@@ -481,6 +479,24 @@ _shutdown() {
return 1
}

# Resolve KERNEL_TYPE (the kernel module flavour to use) from KERNEL_MODULE_TYPE.
#
# Reads:
#   KERNEL_MODULE_TYPE - "proprietary", "open" or "auto"
#   DRIVER_BRANCH      - passed to gpu-driver-util via -b when autodetecting
# Sets:
#   KERNEL_TYPE        - "kernel" for proprietary, "kernel-open" for open, or the
#                        stdout of `gpu-driver-util get-kernel-module-type` for auto
# Returns:
#   0 on success, 1 when autodetection fails or KERNEL_MODULE_TYPE is not recognised.
_resolve_kernel_type() {
    case "${KERNEL_MODULE_TYPE}" in
        proprietary)
            KERNEL_TYPE=kernel
            ;;
        open)
            KERNEL_TYPE=kernel-open
            ;;
        auto)
            # Test the command directly rather than inspecting $? afterwards: with
            # errexit active, a failing assignment would abort the shell before the
            # diagnostic below could be printed.
            if ! KERNEL_TYPE=$(gpu-driver-util get-kernel-module-type -b "${DRIVER_BRANCH}"); then
                echo "cannot autodetect the kernel module type, printing error logs from /var/log/gpu-driver-util.log..."
                # Best effort: a missing log file must not mask the real failure.
                tail -n 3 /var/log/gpu-driver-util.log || true
                return 1
            fi
            ;;
        *)
            echo "invalid value for the KERNEL_MODULE_TYPE variable: ${KERNEL_MODULE_TYPE}"
            return 1
            ;;
    esac
}

_find_vgpu_driver_version() {
local count=""
local version=""
@@ -524,6 +540,8 @@ init() {
_find_vgpu_driver_version || exit 1
fi

_resolve_kernel_type || exit 1

# Install the userspace components and copy the kernel module sources.
sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run -x && \
cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION && \
@@ -596,6 +614,8 @@ update() {
fi
exec 3>&-

_resolve_kernel_type || exit 1

# vgpu driver version is chosen dynamically during runtime, so pre-compile modules for
# only non-vgpu driver types
if [ "${DRIVER_TYPE}" != "vgpu" ]; then
17 changes: 17 additions & 0 deletions ubuntu22.04/precompiled/Dockerfile
Original file line number Diff line number Diff line change
@@ -2,6 +2,8 @@ FROM nvcr.io/nvidia/cuda:12.6.2-base-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive

ARG TARGETARCH
ARG GOLANG_VERSION
ARG DRIVER_BRANCH=535
ENV DRIVER_BRANCH=$DRIVER_BRANCH
ARG DRIVER_VERSION=535.216.03
@@ -12,6 +14,8 @@ ENV KERNEL_VERSION=$KERNEL_VERSION

ENV NVIDIA_VISIBLE_DEVICES=void

SHELL ["/bin/bash", "-c"]

RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections

# Fetch GPG keys for CUDA repo
@@ -26,6 +30,7 @@ RUN dpkg --add-architecture i386 && \
curl \
kmod \
file \
git \
libelf-dev \
libglvnd-dev \
pkg-config && \
@@ -41,6 +46,18 @@ RUN echo "deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ jammy main universe
RUN curl -fsSL -o /usr/local/bin/donkey https://github.com/3XX0/donkey/releases/download/v1.1.0/donkey && \
chmod +x /usr/local/bin/donkey

# Download the Go toolchain that matches the target architecture (multi-arch builds).
# TARGETARCH values x86_64/aarch64 are rewritten to the amd64/arm64 names used in the
# Go release tarball file names.
# pipefail makes a failed download fail this step instead of being hidden behind the
# exit status of tar; curl -f turns HTTP error responses into a non-zero exit so an
# error page is never piped into tar.
RUN set -o pipefail && \
    OS_ARCH=${TARGETARCH/x86_64/amd64} && OS_ARCH=${OS_ARCH/aarch64/arm64} && \
    curl -fsSL https://storage.googleapis.com/golang/go${GOLANG_VERSION}.linux-${OS_ARCH}.tar.gz \
    | tar -C /usr/local -xz

ENV PATH=/usr/local/go/bin:$PATH

# Build the gpu-driver-util helper from the gpu-driver-container sources and install
# it into /usr/local/bin. The clone is removed in the same layer so the source tree
# does not remain in the image.
# NOTE(review): the clone is not pinned to a tag or commit, so the binary tracks
# whatever the repository's default branch contains at build time — confirm this is intended.
RUN git clone https://github.com/NVIDIA/gpu-driver-container driver && \
go build -C driver/gpu-driver-util -o gpu-driver-util && \
mv driver/gpu-driver-util/gpu-driver-util /usr/local/bin && \
rm -rf driver

# Install / upgrade packages here that are required to resolve CVEs
ARG CVE_UPDATES
RUN if [ -n "${CVE_UPDATES}" ]; then \
23 changes: 21 additions & 2 deletions ubuntu22.04/precompiled/nvidia-driver
Original file line number Diff line number Diff line change
@@ -4,7 +4,7 @@
set -eu

KERNEL_VERSION=$(uname -r)
OPEN_KERNEL_MODULES_ENABLED="${OPEN_KERNEL_MODULES_ENABLED:-false}"
KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto}
RUN_DIR=/run/nvidia
PID_FILE=${RUN_DIR}/${0##*/}.pid
DRIVER_BRANCH=${DRIVER_BRANCH:?"Missing driver version"}
@@ -96,6 +96,24 @@ _get_module_params() {
fi
}

# Resolve KERNEL_TYPE (the kernel module flavour to install) from KERNEL_MODULE_TYPE.
#
# Reads:
#   KERNEL_MODULE_TYPE - "proprietary", "open" or "auto"
#   DRIVER_BRANCH      - passed to gpu-driver-util via -b when autodetecting
# Sets:
#   KERNEL_TYPE        - "kernel" for proprietary, "kernel-open" for open, or the
#                        stdout of `gpu-driver-util get-kernel-module-type` for auto
# Returns:
#   0 on success, 1 when autodetection fails or KERNEL_MODULE_TYPE is not recognised.
_resolve_kernel_type() {
    case "${KERNEL_MODULE_TYPE}" in
        proprietary)
            KERNEL_TYPE=kernel
            ;;
        open)
            KERNEL_TYPE=kernel-open
            ;;
        auto)
            # This script runs with `set -e`: a failing assignment followed by a
            # separate $? check would abort the shell before the diagnostic below is
            # printed unless the caller happens to suppress errexit. Testing the
            # command directly works in every calling context.
            if ! KERNEL_TYPE=$(gpu-driver-util get-kernel-module-type -b "${DRIVER_BRANCH}"); then
                echo "cannot autodetect the kernel module type, printing error logs from /var/log/gpu-driver-util.log..."
                # Best effort: a missing log file must not mask the real failure.
                tail -n 3 /var/log/gpu-driver-util.log || true
                return 1
            fi
            ;;
        *)
            echo "invalid value for the KERNEL_MODULE_TYPE variable: ${KERNEL_MODULE_TYPE}"
            return 1
            ;;
    esac
}

# Load the kernel modules and start persistenced.
_load_driver() {
echo "Parsing kernel module parameters..."
@@ -245,7 +263,7 @@ _install_driver() {
xserver-xorg-video-nvidia-${DRIVER_BRANCH}-server

# Now install the precompiled kernel module packages signed by Canonical
if [ "$OPEN_KERNEL_MODULES_ENABLED" = true ]; then
if [ "${KERNEL_TYPE}" == "kernel-open" ]; then
echo "Installing Open NVIDIA driver kernel modules..."
apt-get install --no-install-recommends -y \
linux-signatures-nvidia-${KERNEL_VERSION} \
@@ -293,6 +311,7 @@ init() {
_unload_driver || exit 1
_unmount_rootfs

_resolve_kernel_type || exit 1
_install_driver
_load_driver || exit 1
_mount_rootfs
Loading

0 comments on commit c1569fa

Please sign in to comment.