Skip to content

Commit

Permalink
integrate gpu-driver-util into the driver images
Browse files Browse the repository at this point in the history
Signed-off-by: Tariq Ibrahim <[email protected]>
  • Loading branch information
tariq1890 committed Jan 14, 2025
1 parent 0f4ecad commit 29fc943
Show file tree
Hide file tree
Showing 14 changed files with 192 additions and 27 deletions.
10 changes: 7 additions & 3 deletions rhel8/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,10 @@ ENV PATH /usr/local/go/bin:$PATH
WORKDIR /work

RUN git clone https://github.com/NVIDIA/gpu-driver-container driver && \
cd driver/vgpu/src && \
go build -o vgpu-util && \
mv vgpu-util /work
go build -C driver/vgpu/src -o vgpu-util && \
mv driver/vgpu/src/vgpu-util /work && \
go build -C driver/gpu-driver-util -o gpu-driver-util && \
mv driver/gpu-driver-util/gpu-driver-util /work

FROM nvcr.io/nvidia/cuda:12.6.3-base-ubi8

Expand All @@ -36,6 +37,8 @@ ENV DRIVER_VERSION=$DRIVER_VERSION
# Arg to indicate if driver type is either of passthrough/baremetal or vgpu
ARG DRIVER_TYPE=passthrough
ENV DRIVER_TYPE=$DRIVER_TYPE
ARG DRIVER_BRANCH=550
ENV DRIVER_BRANCH=$DRIVER_BRANCH
ARG VGPU_LICENSE_SERVER_TYPE=NLS
ENV VGPU_LICENSE_SERVER_TYPE=$VGPU_LICENSE_SERVER_TYPE
# Enable vGPU version compatibility check by default
Expand Down Expand Up @@ -84,6 +87,7 @@ COPY ocp_dtk_entrypoint /usr/local/bin
COPY common.sh /usr/local/bin

COPY --from=build /work/vgpu-util /usr/local/bin
COPY --from=build /work/gpu-driver-util /usr/local/bin

WORKDIR /drivers

Expand Down
24 changes: 21 additions & 3 deletions rhel8/nvidia-driver
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ set -eu
RUN_DIR=/run/nvidia
PID_FILE=${RUN_DIR}/${0##*/}.pid
DRIVER_VERSION=${DRIVER_VERSION:?"Missing DRIVER_VERSION env"}
DRIVER_BRANCH=${DRIVER_BRANCH:?"Missing DRIVER_BRANCH env"}
KERNEL_UPDATE_HOOK=/run/kernel/postinst.d/update-nvidia-driver
NUM_VGPU_DEVICES=0
NVIDIA_MODULE_PARAMS=()
Expand All @@ -17,9 +18,7 @@ USE_HOST_MOFED="${USE_HOST_MOFED:-false}"
DNF_RELEASEVER=${DNF_RELEASEVER:-""}
RHEL_VERSION=${RHEL_VERSION:-""}
RHEL_MAJOR_VERSION=8

OPEN_KERNEL_MODULES_ENABLED=${OPEN_KERNEL_MODULES_ENABLED:-false}
[[ "${OPEN_KERNEL_MODULES_ENABLED}" == "true" ]] && KERNEL_TYPE=kernel-open || KERNEL_TYPE=kernel
KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto}

DRIVER_ARCH=${TARGETARCH/amd64/x86_64} && DRIVER_ARCH=${DRIVER_ARCH/arm64/aarch64}
echo "DRIVER_ARCH is $DRIVER_ARCH"
Expand Down Expand Up @@ -577,6 +576,24 @@ _start_vgpu_topology_daemon() {
nvidia-topologyd
}

# Set the global KERNEL_TYPE from KERNEL_MODULE_TYPE:
#   proprietary -> kernel
#   open        -> kernel-open
#   auto        -> whatever `gpu-driver-util get-kernel-module-type` prints
#                  for the driver branch in DRIVER_BRANCH
# Returns 1 after printing a diagnostic when autodetection fails or when
# KERNEL_MODULE_TYPE holds any other value.
_resolve_kernel_type() {
    case "${KERNEL_MODULE_TYPE}" in
        proprietary)
            KERNEL_TYPE=kernel
            ;;
        open)
            KERNEL_TYPE=kernel-open
            ;;
        auto)
            # Test the command's status directly instead of inspecting $? afterwards:
            # with errexit active, a failing assignment outside an `if`/`||` context
            # would abort the script before the diagnostics below could be printed.
            if ! KERNEL_TYPE=$(gpu-driver-util get-kernel-module-type -b "${DRIVER_BRANCH}"); then
                echo "cannot autodetect the kernel module type, printing error logs from /var/log/gpu-driver-util.log..."
                # Best effort: a missing log file must not hide the failure path.
                tail -n 3 /var/log/gpu-driver-util.log || true
                return 1
            fi
            ;;
        *)
            echo "invalid value for the KERNEL_MODULE_TYPE variable: ${KERNEL_MODULE_TYPE}"
            return 1
            ;;
    esac
}

_prepare() {
if [ "${DRIVER_TYPE}" = "vgpu" ]; then
_find_vgpu_driver_version || exit 1
Expand Down Expand Up @@ -797,5 +814,6 @@ if [ $# -ne 0 ]; then
fi

_resolve_rhel_version || exit 1
_resolve_kernel_type || exit 1

$command
2 changes: 2 additions & 0 deletions rhel8/ocp_dtk_entrypoint
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ nv-ctr-run-with-dtk() {
/usr/local/bin/nvidia-driver \
/usr/local/bin/common.sh \
/usr/local/bin/extract-vmlinux \
/usr/local/bin/gpu-driver-util \
/usr/local/bin/vgpu-util \
/drivers \
/licenses \
Expand Down Expand Up @@ -136,6 +137,7 @@ dtk-build-driver() {
"$DRIVER_TOOLKIT_SHARED_DIR/nvidia-driver" \
"$DRIVER_TOOLKIT_SHARED_DIR/common.sh" \
"$DRIVER_TOOLKIT_SHARED_DIR/extract-vmlinux" \
"$DRIVER_TOOLKIT_SHARED_DIR/gpu-driver-util" \
"$DRIVER_TOOLKIT_SHARED_DIR/vgpu-util" \
"${DRIVER_TOOLKIT_SHARED_DIR}/bin"

Expand Down
10 changes: 7 additions & 3 deletions rhel9/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,10 @@ ENV PATH /usr/local/go/bin:$PATH
WORKDIR /work

RUN git clone https://github.com/NVIDIA/gpu-driver-container driver && \
cd driver/vgpu/src && \
go build -o vgpu-util && \
mv vgpu-util /work
go build -C driver/vgpu/src -o vgpu-util && \
mv driver/vgpu/src/vgpu-util /work && \
go build -C driver/gpu-driver-util -o gpu-driver-util && \
mv driver/gpu-driver-util/gpu-driver-util /work

FROM nvcr.io/nvidia/cuda:12.6.3-base-ubi9

Expand All @@ -36,6 +37,8 @@ ENV DRIVER_VERSION=$DRIVER_VERSION
# Arg to indicate if driver type is either of passthrough/baremetal or vgpu
ARG DRIVER_TYPE=passthrough
ENV DRIVER_TYPE=$DRIVER_TYPE
ARG DRIVER_BRANCH=550
ENV DRIVER_BRANCH=$DRIVER_BRANCH
ARG VGPU_LICENSE_SERVER_TYPE=NLS
ENV VGPU_LICENSE_SERVER_TYPE=$VGPU_LICENSE_SERVER_TYPE
# Enable vGPU version compatibility check by default
Expand Down Expand Up @@ -78,6 +81,7 @@ COPY ocp_dtk_entrypoint /usr/local/bin
COPY common.sh /usr/local/bin

COPY --from=build /work/vgpu-util /usr/local/bin
COPY --from=build /work/gpu-driver-util /usr/local/bin

WORKDIR /drivers

Expand Down
24 changes: 21 additions & 3 deletions rhel9/nvidia-driver
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ set -eu
RUN_DIR=/run/nvidia
PID_FILE=${RUN_DIR}/${0##*/}.pid
DRIVER_VERSION=${DRIVER_VERSION:?"Missing DRIVER_VERSION env"}
DRIVER_BRANCH=${DRIVER_BRANCH:?"Missing DRIVER_BRANCH env"}
KERNEL_UPDATE_HOOK=/run/kernel/postinst.d/update-nvidia-driver
NUM_VGPU_DEVICES=0
NVIDIA_MODULE_PARAMS=()
Expand All @@ -17,9 +18,7 @@ USE_HOST_MOFED="${USE_HOST_MOFED:-false}"
DNF_RELEASEVER=${DNF_RELEASEVER:-""}
RHEL_VERSION=${RHEL_VERSION:-""}
RHEL_MAJOR_VERSION=9

OPEN_KERNEL_MODULES_ENABLED=${OPEN_KERNEL_MODULES_ENABLED:-false}
[[ "${OPEN_KERNEL_MODULES_ENABLED}" == "true" ]] && KERNEL_TYPE=kernel-open || KERNEL_TYPE=kernel
KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto}

DRIVER_ARCH=${TARGETARCH/amd64/x86_64} && DRIVER_ARCH=${DRIVER_ARCH/arm64/aarch64}
echo "DRIVER_ARCH is $DRIVER_ARCH"
Expand Down Expand Up @@ -571,6 +570,24 @@ _find_vgpu_driver_version() {
return 0
}

# Set the global KERNEL_TYPE from KERNEL_MODULE_TYPE:
#   proprietary -> kernel
#   open        -> kernel-open
#   auto        -> whatever `gpu-driver-util get-kernel-module-type` prints
#                  for the driver branch in DRIVER_BRANCH
# Returns 1 after printing a diagnostic when autodetection fails or when
# KERNEL_MODULE_TYPE holds any other value.
_resolve_kernel_type() {
    case "${KERNEL_MODULE_TYPE}" in
        proprietary)
            KERNEL_TYPE=kernel
            ;;
        open)
            KERNEL_TYPE=kernel-open
            ;;
        auto)
            # Test the command's status directly instead of inspecting $? afterwards:
            # with errexit active, a failing assignment outside an `if`/`||` context
            # would abort the script before the diagnostics below could be printed.
            if ! KERNEL_TYPE=$(gpu-driver-util get-kernel-module-type -b "${DRIVER_BRANCH}"); then
                echo "cannot autodetect the kernel module type, printing error logs from /var/log/gpu-driver-util.log..."
                # Best effort: a missing log file must not hide the failure path.
                tail -n 3 /var/log/gpu-driver-util.log || true
                return 1
            fi
            ;;
        *)
            echo "invalid value for the KERNEL_MODULE_TYPE variable: ${KERNEL_MODULE_TYPE}"
            return 1
            ;;
    esac
}

_start_vgpu_topology_daemon() {
type nvidia-topologyd > /dev/null 2>&1 || return 0
echo "Starting nvidia-topologyd.."
Expand Down Expand Up @@ -797,5 +814,6 @@ if [ $# -ne 0 ]; then
fi

_resolve_rhel_version || exit 1
_resolve_kernel_type || exit 1

$command
2 changes: 2 additions & 0 deletions rhel9/ocp_dtk_entrypoint
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ nv-ctr-run-with-dtk() {
/usr/local/bin/nvidia-driver \
/usr/local/bin/common.sh \
/usr/local/bin/extract-vmlinux \
/usr/local/bin/gpu-driver-util \
/usr/local/bin/vgpu-util \
/drivers \
/licenses \
Expand Down Expand Up @@ -136,6 +137,7 @@ dtk-build-driver() {
"$DRIVER_TOOLKIT_SHARED_DIR/nvidia-driver" \
"$DRIVER_TOOLKIT_SHARED_DIR/common.sh" \
"$DRIVER_TOOLKIT_SHARED_DIR/extract-vmlinux" \
"$DRIVER_TOOLKIT_SHARED_DIR/gpu-driver-util" \
"$DRIVER_TOOLKIT_SHARED_DIR/vgpu-util" \
"${DRIVER_TOOLKIT_SHARED_DIR}/bin"

Expand Down
8 changes: 5 additions & 3 deletions ubuntu20.04/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,10 @@ ENV PATH /usr/local/go/bin:$PATH
WORKDIR /work

RUN git clone https://github.com/NVIDIA/gpu-driver-container driver && \
cd driver/vgpu/src && \
go build -o vgpu-util && \
mv vgpu-util /work
go build -C driver/vgpu/src -o vgpu-util && \
mv driver/vgpu/src/vgpu-util /work && \
go build -C driver/gpu-driver-util -o gpu-driver-util && \
mv driver/gpu-driver-util/gpu-driver-util /work

FROM nvcr.io/nvidia/cuda:12.6.3-base-ubuntu20.04

Expand Down Expand Up @@ -72,6 +73,7 @@ RUN /tmp/install.sh reposetup && /tmp/install.sh depinstall && \
COPY nvidia-driver /usr/local/bin

COPY --from=build /work/vgpu-util /usr/local/bin
COPY --from=build /work/gpu-driver-util /usr/local/bin

ADD drivers drivers/

Expand Down
25 changes: 23 additions & 2 deletions ubuntu20.04/nvidia-driver
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@ NVIDIA_MODESET_MODULE_PARAMS=()
NVIDIA_PEERMEM_MODULE_PARAMS=()
TARGETARCH=${TARGETARCH:?"Missing TARGETARCH env"}

OPEN_KERNEL_MODULES_ENABLED=${OPEN_KERNEL_MODULES_ENABLED:-false}
[[ "${OPEN_KERNEL_MODULES_ENABLED}" == "true" ]] && KERNEL_TYPE=kernel-open || KERNEL_TYPE=kernel
KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto}

export DEBIAN_FRONTEND=noninteractive

Expand Down Expand Up @@ -477,6 +476,24 @@ _shutdown() {
return 1
}

# Set the global KERNEL_TYPE from KERNEL_MODULE_TYPE:
#   proprietary -> kernel
#   open        -> kernel-open
#   auto        -> whatever `gpu-driver-util get-kernel-module-type` prints
#                  for the driver branch in DRIVER_BRANCH
# Returns 1 after printing a diagnostic when autodetection fails or when
# KERNEL_MODULE_TYPE holds any other value.
_resolve_kernel_type() {
    case "${KERNEL_MODULE_TYPE}" in
        proprietary)
            KERNEL_TYPE=kernel
            ;;
        open)
            KERNEL_TYPE=kernel-open
            ;;
        auto)
            # Test the command's status directly instead of inspecting $? afterwards:
            # if errexit is active, a failing assignment outside an `if`/`||` context
            # would abort the script before the diagnostics below could be printed.
            if ! KERNEL_TYPE=$(gpu-driver-util get-kernel-module-type -b "${DRIVER_BRANCH}"); then
                echo "cannot autodetect the kernel module type, printing error logs from /var/log/gpu-driver-util.log..."
                # Best effort: a missing log file must not hide the failure path.
                tail -n 3 /var/log/gpu-driver-util.log || true
                return 1
            fi
            ;;
        *)
            echo "invalid value for the KERNEL_MODULE_TYPE variable: ${KERNEL_MODULE_TYPE}"
            return 1
            ;;
    esac
}

_find_vgpu_driver_version() {
local count=""
local version=""
Expand Down Expand Up @@ -520,6 +537,8 @@ init() {
_find_vgpu_driver_version || exit 1
fi

_resolve_kernel_type || exit 1

# Install the userspace components and copy the kernel module sources.
sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run -x && \
cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION && \
Expand Down Expand Up @@ -592,6 +611,8 @@ update() {
fi
exec 3>&-

_resolve_kernel_type || exit 1

# vgpu driver version is chosen dynamically during runtime, so pre-compile modules for
# only non-vgpu driver types
if [ "${DRIVER_TYPE}" != "vgpu" ]; then
Expand Down
8 changes: 5 additions & 3 deletions ubuntu22.04/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,10 @@ ENV PATH /usr/local/go/bin:$PATH
WORKDIR /work

RUN git clone https://github.com/NVIDIA/gpu-driver-container driver && \
cd driver/vgpu/src && \
go build -o vgpu-util && \
mv vgpu-util /work
go build -C driver/vgpu/src -o vgpu-util && \
mv driver/vgpu/src/vgpu-util /work && \
go build -C driver/gpu-driver-util -o gpu-driver-util && \
mv driver/gpu-driver-util/gpu-driver-util /work

FROM nvcr.io/nvidia/cuda:12.6.3-base-ubuntu22.04

Expand Down Expand Up @@ -72,6 +73,7 @@ RUN /tmp/install.sh reposetup && /tmp/install.sh depinstall && \
COPY nvidia-driver /usr/local/bin

COPY --from=build /work/vgpu-util /usr/local/bin
COPY --from=build /work/gpu-driver-util /usr/local/bin

ADD drivers drivers/

Expand Down
26 changes: 23 additions & 3 deletions ubuntu22.04/nvidia-driver
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,7 @@ NVIDIA_UVM_MODULE_PARAMS=()
NVIDIA_MODESET_MODULE_PARAMS=()
NVIDIA_PEERMEM_MODULE_PARAMS=()
TARGETARCH=${TARGETARCH:?"Missing TARGETARCH env"}

OPEN_KERNEL_MODULES_ENABLED=${OPEN_KERNEL_MODULES_ENABLED:-false}
[[ "${OPEN_KERNEL_MODULES_ENABLED}" == "true" ]] && KERNEL_TYPE=kernel-open || KERNEL_TYPE=kernel
KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto}

export DEBIAN_FRONTEND=noninteractive

Expand Down Expand Up @@ -481,6 +479,24 @@ _shutdown() {
return 1
}

# Set the global KERNEL_TYPE from KERNEL_MODULE_TYPE:
#   proprietary -> kernel
#   open        -> kernel-open
#   auto        -> whatever `gpu-driver-util get-kernel-module-type` prints
#                  for the driver branch in DRIVER_BRANCH
# Returns 1 after printing a diagnostic when autodetection fails or when
# KERNEL_MODULE_TYPE holds any other value.
_resolve_kernel_type() {
    case "${KERNEL_MODULE_TYPE}" in
        proprietary)
            KERNEL_TYPE=kernel
            ;;
        open)
            KERNEL_TYPE=kernel-open
            ;;
        auto)
            # Test the command's status directly instead of inspecting $? afterwards:
            # if errexit is active, a failing assignment outside an `if`/`||` context
            # would abort the script before the diagnostics below could be printed.
            if ! KERNEL_TYPE=$(gpu-driver-util get-kernel-module-type -b "${DRIVER_BRANCH}"); then
                echo "cannot autodetect the kernel module type, printing error logs from /var/log/gpu-driver-util.log..."
                # Best effort: a missing log file must not hide the failure path.
                tail -n 3 /var/log/gpu-driver-util.log || true
                return 1
            fi
            ;;
        *)
            echo "invalid value for the KERNEL_MODULE_TYPE variable: ${KERNEL_MODULE_TYPE}"
            return 1
            ;;
    esac
}

_find_vgpu_driver_version() {
local count=""
local version=""
Expand Down Expand Up @@ -524,6 +540,8 @@ init() {
_find_vgpu_driver_version || exit 1
fi

_resolve_kernel_type || exit 1

# Install the userspace components and copy the kernel module sources.
sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run -x && \
cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION && \
Expand Down Expand Up @@ -596,6 +614,8 @@ update() {
fi
exec 3>&-

_resolve_kernel_type || exit 1

# vgpu driver version is chosen dynamically during runtime, so pre-compile modules for
# only non-vgpu driver types
if [ "${DRIVER_TYPE}" != "vgpu" ]; then
Expand Down
17 changes: 17 additions & 0 deletions ubuntu22.04/precompiled/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ FROM nvcr.io/nvidia/cuda:12.6.2-base-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive

ARG TARGETARCH
ARG GOLANG_VERSION
ARG DRIVER_BRANCH=535
ENV DRIVER_BRANCH=$DRIVER_BRANCH
ARG DRIVER_VERSION=535.216.03
Expand All @@ -12,6 +14,8 @@ ENV KERNEL_VERSION=$KERNEL_VERSION

ENV NVIDIA_VISIBLE_DEVICES=void

SHELL ["/bin/bash", "-c"]

RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections

# Fetch GPG keys for CUDA repo
Expand All @@ -26,6 +30,7 @@ RUN dpkg --add-architecture i386 && \
curl \
kmod \
file \
git \
libelf-dev \
libglvnd-dev \
pkg-config && \
Expand All @@ -41,6 +46,18 @@ RUN echo "deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ jammy main universe
RUN curl -fsSL -o /usr/local/bin/donkey https://github.com/3XX0/donkey/releases/download/v1.1.0/donkey && \
chmod +x /usr/local/bin/donkey

# Download the Go toolchain tarball matching the target architecture (multi-arch builds)
# and unpack it under /usr/local.
# - `set -o pipefail` makes the layer fail when curl fails, instead of the pipeline's
#   status being decided by tar alone (the bash SHELL set for this stage does not enable it).
# - `curl -f` turns HTTP errors into a non-zero exit, `-sS` keeps the log quiet but still
#   shows errors, and `-L` follows redirects.
RUN set -o pipefail && \
    OS_ARCH=${TARGETARCH/x86_64/amd64} && OS_ARCH=${OS_ARCH/aarch64/arm64} && \
    curl -fsSL https://storage.googleapis.com/golang/go${GOLANG_VERSION}.linux-${OS_ARCH}.tar.gz \
    | tar -C /usr/local -xz

ENV PATH=/usr/local/go/bin:$PATH

# Build gpu-driver-util from the gpu-driver-container sources and install it into
# /usr/local/bin. Only the current tree is needed for the build, so a shallow clone
# avoids fetching the full history; the checkout is removed in the same layer so it
# does not persist in the image.
RUN git clone --depth 1 https://github.com/NVIDIA/gpu-driver-container driver && \
    go build -C driver/gpu-driver-util -o gpu-driver-util && \
    mv driver/gpu-driver-util/gpu-driver-util /usr/local/bin && \
    rm -rf driver

# Install / upgrade packages here that are required to resolve CVEs
ARG CVE_UPDATES
RUN if [ -n "${CVE_UPDATES}" ]; then \
Expand Down
Loading

0 comments on commit 29fc943

Please sign in to comment.