Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/workflows/ubuntu-20.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:
base-tag: 11.8.0-cudnn8-devel-ubuntu20.04
cuda-version-minor: "11.8.0"
cuda-version-major: "11.8"
nccl-version: 2.16.5-1
nccl-version: 2.21.5-1
cuda-samples-version: "11.6"
hpcx-distribution: "hpcx-v2.14-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda11-gdrcopy2-nccl2.16-x86_64"

Expand All @@ -29,7 +29,7 @@ jobs:
base-tag: 12.0.1-cudnn8-devel-ubuntu20.04
cuda-version-minor: "12.0.1"
cuda-version-major: "12.0"
nccl-version: 2.19.3-1
nccl-version: 2.21.5-1
cuda-samples-version: "12.0"
hpcx-distribution: "hpcx-v2.18-gcc-mlnx_ofed-ubuntu20.04-cuda12-x86_64"

Expand All @@ -42,7 +42,7 @@ jobs:
base-tag: 12.1.1-cudnn8-devel-ubuntu20.04
cuda-version-minor: "12.1.1"
cuda-version-major: "12.1"
nccl-version: 2.18.3-1
nccl-version: 2.21.5-1
cuda-samples-version: "12.1"
hpcx-distribution: "hpcx-v2.18-gcc-mlnx_ofed-ubuntu20.04-cuda12-x86_64"

Expand All @@ -68,7 +68,7 @@ jobs:
base-tag: 12.3.2-cudnn9-devel-ubuntu20.04
cuda-version-minor: "12.3.2"
cuda-version-major: "12.3"
nccl-version: 2.20.3-1
nccl-version: 2.21.5-1
cuda-samples-version: "12.3"
hpcx-distribution: "hpcx-v2.18-gcc-mlnx_ofed-ubuntu20.04-cuda12-x86_64"

Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/ubuntu-22.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:
base-tag: 12.0.1-cudnn8-devel-ubuntu22.04
cuda-version-minor: "12.0.1"
cuda-version-major: "12.0"
nccl-version: 2.18.5-1
nccl-version: 2.21.5-1
cuda-samples-version: "12.0"
hpcx-distribution: "hpcx-v2.18-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64"

Expand All @@ -29,7 +29,7 @@ jobs:
base-tag: 12.1.1-cudnn8-devel-ubuntu22.04
cuda-version-minor: "12.1.1"
cuda-version-major: "12.1"
nccl-version: 2.18.3-1
nccl-version: 2.21.5-1
cuda-samples-version: "12.1"
hpcx-distribution: "hpcx-v2.18-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64"

Expand All @@ -42,7 +42,7 @@ jobs:
base-tag: 12.2.2-cudnn8-devel-ubuntu22.04
cuda-version-minor: "12.2.2"
cuda-version-major: "12.2"
nccl-version: 2.19.3-1
nccl-version: 2.21.5-1
cuda-samples-version: "12.2"
hpcx-distribution: "hpcx-v2.18-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64"

Expand All @@ -55,7 +55,7 @@ jobs:
base-tag: 12.3.2-cudnn9-devel-ubuntu22.04
cuda-version-minor: "12.3.2"
cuda-version-major: "12.3"
nccl-version: 2.20.3-1
nccl-version: 2.21.5-1
cuda-samples-version: "12.3"
hpcx-distribution: "hpcx-v2.18-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64"

Expand Down
18 changes: 16 additions & 2 deletions Dockerfile.ubuntu20
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,13 @@ RUN apt-get -qq update && \
--no-install-recommends \
--allow-downgrades \
build-essential libtool autoconf automake autotools-dev unzip \
devscripts debhelper fakeroot \
ca-certificates \
wget curl openssh-server vim environment-modules \
iputils-ping net-tools \
libnuma1 libsubunit0 libpci-dev \
libpmix-dev \
datacenter-gpu-manager \
libnccl2=$TARGET_NCCL_VERSION+cuda${CUDA_VERSION_MAJOR} \
libnccl-dev=${TARGET_NCCL_VERSION}+cuda${CUDA_VERSION_MAJOR} \
git

# Mellanox OFED (latest)
Expand All @@ -32,6 +31,21 @@ RUN apt-get -qq update \
&& rm -rf /var/lib/apt/lists/*
# mlnx-ofed-hpc-user-only

# Build NCCL from the upstream GitHub source tarball and install the resulting
# Debian packages.
#
# NCCL_ARCH_LIST is a space-separated list of CUDA compute capabilities. Each
# entry gets a `code=sm_XX` gencode flag, and the last entry additionally gets
# a `code=compute_XX` flag.
# NOTE(review): TARGET_NCCL_VERSION is presumably declared earlier in this
# Dockerfile (outside this hunk) -- confirm it is in scope at this point.
ARG NCCL_ARCH_LIST='70 75 80 86 89 90'
RUN mkdir /tmp/build && \
    cd /tmp/build && \
    # Keep only digits, dots and dashes so the value matches the upstream tag name.
    NCCL_TAG="$(echo $TARGET_NCCL_VERSION | tr -dc '[:digit:].-')" && \
    # Download to a file rather than piping into tar: the default /bin/sh has no
    # pipefail, so a failed download inside a pipeline is not reported directly.
    wget -qO nccl.tar.gz "https://github.com/NVIDIA/nccl/archive/refs/tags/v${NCCL_TAG}.tar.gz" && \
    tar --strip-components=1 -xzf nccl.tar.gz && \
    rm nccl.tar.gz && \
    REAL_ARCH() { printf -- '-gencode=arch=compute_%s,code=sm_%s ' "$1" "$1"; } && \
    VIRTUAL_ARCH() { printf -- '-gencode=arch=compute_%s,code=compute_%s' "$1" "$1"; } && \
    # Last whitespace-separated entry of the list.
    LAST_ARCH="${NCCL_ARCH_LIST##* }" && \
    # Size the parallel build to the builder's CPU count instead of a fixed -j20.
    make -j"$(nproc)" pkg.debian.build \
        "NVCC_GENCODE=$(for I in $NCCL_ARCH_LIST; do REAL_ARCH "$I"; done && VIRTUAL_ARCH "$LAST_ARCH")" && \
    dpkg -i ./build/pkg/deb/libnccl*.deb && \
    # Remove the build tree in this same layer so it does not persist in the image.
    cd /tmp && \
    rm -r /tmp/build

# IB perftest with GDR
ENV PERFTEST_VERSION_HASH=5b47ede

Expand Down
18 changes: 16 additions & 2 deletions Dockerfile.ubuntu22
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,13 @@ RUN apt-get -qq update && \
--no-install-recommends \
--allow-downgrades \
build-essential libtool autoconf automake autotools-dev unzip \
devscripts debhelper fakeroot \
ca-certificates \
wget curl openssh-server vim environment-modules \
iputils-ping net-tools \
libnuma1 libsubunit0 libpci-dev \
libpmix-dev \
datacenter-gpu-manager \
libnccl2=$TARGET_NCCL_VERSION+cuda${CUDA_VERSION_MAJOR} \
libnccl-dev=${TARGET_NCCL_VERSION}+cuda${CUDA_VERSION_MAJOR} \
git && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
Expand All @@ -34,6 +33,21 @@ RUN apt-get -qq update \
&& rm -rf /var/lib/apt/lists/*
# mlnx-ofed-hpc-user-only

# Build NCCL from the upstream GitHub source tarball and install the resulting
# Debian packages.
#
# NCCL_ARCH_LIST is a space-separated list of CUDA compute capabilities. Each
# entry gets a `code=sm_XX` gencode flag, and the last entry additionally gets
# a `code=compute_XX` flag.
# NOTE(review): TARGET_NCCL_VERSION is presumably declared earlier in this
# Dockerfile (outside this hunk) -- confirm it is in scope at this point.
ARG NCCL_ARCH_LIST='70 75 80 86 89 90'
RUN mkdir /tmp/build && \
    cd /tmp/build && \
    # Keep only digits, dots and dashes so the value matches the upstream tag name.
    NCCL_TAG="$(echo $TARGET_NCCL_VERSION | tr -dc '[:digit:].-')" && \
    # Download to a file rather than piping into tar: the default /bin/sh has no
    # pipefail, so a failed download inside a pipeline is not reported directly.
    wget -qO nccl.tar.gz "https://github.com/NVIDIA/nccl/archive/refs/tags/v${NCCL_TAG}.tar.gz" && \
    tar --strip-components=1 -xzf nccl.tar.gz && \
    rm nccl.tar.gz && \
    REAL_ARCH() { printf -- '-gencode=arch=compute_%s,code=sm_%s ' "$1" "$1"; } && \
    VIRTUAL_ARCH() { printf -- '-gencode=arch=compute_%s,code=compute_%s' "$1" "$1"; } && \
    # Last whitespace-separated entry of the list.
    LAST_ARCH="${NCCL_ARCH_LIST##* }" && \
    # Size the parallel build to the builder's CPU count instead of a fixed -j20.
    make -j"$(nproc)" pkg.debian.build \
        "NVCC_GENCODE=$(for I in $NCCL_ARCH_LIST; do REAL_ARCH "$I"; done && VIRTUAL_ARCH "$LAST_ARCH")" && \
    dpkg -i ./build/pkg/deb/libnccl*.deb && \
    # Remove the build tree in this same layer so it does not persist in the image.
    cd /tmp && \
    rm -r /tmp/build

# IB perftest with GDR
ENV PERFTEST_VERSION_HASH=5b47ede

Expand Down