Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 60 additions & 0 deletions examples/v2beta1/nccl-tests/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Base image selection.
# CUDA_VERSION_MINOR is the full CUDA release used in the nvidia/cuda image tag;
# BASE_IMAGE may be overridden entirely with --build-arg to build on another image.
ARG CUDA_VERSION_MINOR=12.4.1
ARG BASE_IMAGE=nvidia/cuda:${CUDA_VERSION_MINOR}-devel-ubuntu22.04
# Keyword casing is kept consistent (FROM ... AS ...) to satisfy BuildKit's build checks.
FROM ${BASE_IMAGE} AS base

# Components of the apt version pin applied to libnccl2 / libnccl-dev in the
# package install step below: <TARGET_NCCL_VERSION>+cuda<CUDA_VERSION_MAJOR>.
# NOTE(review): CUDA_VERSION_MAJOR holds a major.minor value (12.4) and must be
# kept in step with the CUDA release of the base image when either is overridden.
ARG CUDA_VERSION_MAJOR=12.4
ARG TARGET_NCCL_VERSION=2.21.5-1

# Build toolchain, OpenMPI, SSH server, diagnostics and a pinned NCCL.
# DEBIAN_FRONTEND is an ARG so the setting applies to the build only and is not
# baked into the runtime environment.
# --allow-downgrades / --allow-change-held-packages let apt replace whatever NCCL
# version is already present in the base image with the pinned one.
ARG DEBIAN_FRONTEND=noninteractive
RUN apt-get -qq update && \
    apt-get -qq install -y \
        --allow-change-held-packages \
        --allow-downgrades \
        --no-install-recommends \
        autoconf \
        automake \
        autotools-dev \
        build-essential \
        ca-certificates \
        curl \
        datacenter-gpu-manager \
        environment-modules \
        g++ \
        git \
        iputils-ping \
        libnccl-dev=${TARGET_NCCL_VERSION}+cuda${CUDA_VERSION_MAJOR} \
        libnccl2=${TARGET_NCCL_VERSION}+cuda${CUDA_VERSION_MAJOR} \
        libnuma1 \
        libopenmpi-dev \
        libpci-dev \
        libpmix-dev \
        libsubunit0 \
        libtool \
        net-tools \
        openmpi-bin \
        openssh-server \
        unzip \
        vim \
        wget && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# RDMA / InfiniBand userspace libraries and diagnostic utilities.
# Each package is listed exactly once (the original list named ibverbs-utils twice).
RUN apt-get -qq update \
    && apt-get -qq install -y --no-install-recommends \
        ibverbs-utils \
        infiniband-diags \
        libibumad-dev \
        libibumad3 \
        libibverbs-dev \
        librdmacm-dev \
        rdmacm-utils \
    && rm -rf /var/lib/apt/lists/*

# Google Cloud SDK from Google's apt repository.
# The repository key is stored in a dedicated keyring and referenced with
# signed-by, so it is trusted for this repository only; the deprecated
# `apt-key add` would trust it for every configured repository.
# The key is downloaded to a file (curl -f fails on HTTP errors) instead of being
# piped, because the default /bin/sh does not propagate failures through a pipe.
RUN apt-get update && \
    apt-get install -y \
        apt-transport-https \
        ca-certificates \
        curl \
        gnupg && \
    curl -fsSL -o /tmp/google-cloud-apt-key.gpg \
        https://packages.cloud.google.com/apt/doc/apt-key.gpg && \
    gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg /tmp/google-cloud-apt-key.gpg && \
    rm -f /tmp/google-cloud-apt-key.gpg && \
    echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" \
        > /etc/apt/sources.list.d/google-cloud-sdk.list && \
    apt-get update && \
    apt-get install -y google-cloud-sdk && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*



# NCCL Tests
# Builds NVIDIA/nccl-tests at a pinned commit with MPI support.
# CUDA12_GENCODE / CUDA12_PTX are passed to the build as NVCC_GENCODE and target
# compute capability 9.0 (SASS for sm_90 plus PTX for compute_90).
ENV NCCL_TESTS_COMMITISH=c6afef0
ENV CUDA12_GENCODE='-gencode=arch=compute_90,code=sm_90'
ENV CUDA12_PTX='-gencode=arch=compute_90,code=compute_90'
WORKDIR /opt/nccl-tests
# The archive is downloaded to a file before extraction so a failed download
# stops the build at wget instead of being hidden behind a pipe.
# Build parallelism follows the builder's CPU count rather than a fixed -j20.
# NOTE(review): MPI_HOME points at the OpenMPI include directory rather than an
# install prefix; the build relies on CXX=mpic++ supplying MPI flags — confirm.
# /opt/nccl_tests is a compatibility symlink to the /opt/nccl-tests build tree.
RUN wget -q -O /tmp/nccl-tests.tar.gz \
        "https://github.com/NVIDIA/nccl-tests/archive/${NCCL_TESTS_COMMITISH}.tar.gz" && \
    tar --strip-components=1 -xzf /tmp/nccl-tests.tar.gz && \
    rm -f /tmp/nccl-tests.tar.gz && \
    mpicc -show && \
    export CXX=mpic++ && \
    make -j"$(nproc)" MPI=1 MPI_HOME=/usr/include/openmpi NVCC_GENCODE="$CUDA12_GENCODE $CUDA12_PTX" && \
    ln -s /opt/nccl-tests /opt/nccl_tests

# Refresh the dynamic linker cache so shared libraries installed by the steps
# above are resolvable at runtime.
RUN ldconfig

# SSH dependencies for MPI
# Client (/etc/ssh/ssh_config): set StrictHostKeyChecking to "no", set Port to
# 2222, and append a UserKnownHostsFile of /dev/null.
# Server (/etc/ssh/sshd_config): uncomment StrictModes and set it to "no",
# uncomment Port and set it to 2222.
# /var/run/sshd is created for the SSH daemon.
RUN mkdir -p /var/run/sshd && \
    sed -i \
        -e 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' \
        -e 's/[ #]\(.*Port \).*/ \12222/g' \
        /etc/ssh/ssh_config && \
    echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
    sed -i \
        -e 's/#\(StrictModes \).*/\1no/g' \
        -e 's/#\(Port \).*/\12222/g' \
        /etc/ssh/sshd_config
68 changes: 68 additions & 0 deletions examples/v2beta1/nccl-tests/nccl-tests.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# MPIJob that runs the NCCL all_reduce_perf benchmark.
# One launcher pod drives mpirun; two worker pods each request 8 GPUs
# (slotsPerWorker: 8), for 16 MPI ranks in total.
apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
  name: nccl-tests
spec:
  slotsPerWorker: 8
  runPolicy:
    cleanPodPolicy: Running
    activeDeadlineSeconds: 666
  mpiReplicaSpecs:
    Launcher:
      replicas: 1
      template:
        spec:
          restartPolicy: OnFailure
          containers:
            - image: mpioperator/nccl-tests:latest
              name: nccl
              securityContext:
                privileged: true
              env:
                - name: OMPI_ALLOW_RUN_AS_ROOT
                  value: "1"
                - name: OMPI_ALLOW_RUN_AS_ROOT_CONFIRM
                  value: "1"
                - name: OMPI_MCA_orte_base_help_aggregate
                  value: "0"
              command: ["/bin/bash", "-c"]
              args:
                - |
                  set -xe
                  export NCCL_DEBUG=INFO
                  # Total MPI ranks = Worker replicas (2) x slotsPerWorker (8).
                  # Defined once here so both mpirun calls agree; keep it in
                  # sync when changing the worker count or slotsPerWorker.
                  NP=16
                  # Retry until mpirun can run nvidia-smi on every rank.
                  until mpirun -np "${NP}" -x LD_LIBRARY_PATH -bind-to none /usr/local/nvidia/bin/nvidia-smi; do sleep 5; done
                  mpirun -np "${NP}" -bind-to none \
                    -x NCCL_DEBUG \
                    /opt/nccl_tests/build/all_reduce_perf -c 0 -b 8 -e 16G \
                    -f 4 -g 1 -n 10
              resources:
                requests:
                  cpu: 50m
                  memory: 128Mi
          enableServiceLinks: false
          automountServiceAccountToken: false
    Worker:
      replicas: 2
      template:
        metadata:
          annotations:
        spec:
          volumes:
            # Memory-backed emptyDir mounted at /dev/shm in the worker container.
            - name: shared-memory
              emptyDir:
                medium: "Memory"

          containers:
            - image: mpioperator/nccl-tests:latest
              name: nccl
              securityContext:
                privileged: true
              resources:
                limits:
                  nvidia.com/gpu: 8
              volumeMounts:
                - name: shared-memory
                  mountPath: /dev/shm

          enableServiceLinks: false
          automountServiceAccountToken: false