Commit
Merge branch 'main' into ko3n1g/ci/selective-triggering
ko3n1g authored May 15, 2024
2 parents a1b346c + fe7e193 commit f71c124
Showing 17 changed files with 386 additions and 22 deletions.
11 changes: 9 additions & 2 deletions .github/container/Dockerfile.base
@@ -1,5 +1,5 @@
# syntax=docker/dockerfile:1-labs
ARG BASE_IMAGE=nvidia/cuda:12.3.0-devel-ubuntu22.04
ARG BASE_IMAGE=nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
ARG GIT_USER_NAME="JAX Toolbox"
ARG [email protected]
ARG CLANG_VERSION=17
@@ -146,7 +146,6 @@ RUN install-nsight.sh
###############################################################################

ADD install-cudnn.sh /usr/local/bin
RUN install-cudnn.sh

###############################################################################
## Install NCCL
@@ -173,6 +172,14 @@ ADD --chmod=777 \
ENV LD_LIBRARY_PATH=/opt/amazon/efa/lib:${LD_LIBRARY_PATH}
ENV PATH=/opt/amazon/efa/bin:${PATH}

##############################################################################
## NCCL sanity check utility
##############################################################################

ADD install-nccl-sanity-check.sh /usr/local/bin
ADD nccl-sanity-check.cu /opt
RUN install-nccl-sanity-check.sh

###############################################################################
## Add the systemcheck to the entrypoint.
###############################################################################
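For orientation, a hedged usage sketch of the new sanity-check plumbing (the image tag is hypothetical, and the wrapper assumes a distributed launcher has already populated JAX's coordinator environment):

    # Build the base image; install-nccl-sanity-check.sh compiles
    # /opt/nccl-sanity-check.cu with nvcc during the build.
    docker build -f .github/container/Dockerfile.base -t jax-base:local .
    # At run time each rank invokes the wrapper, which asks JAX for the
    # world size, rank, local device id, and coordinator address.
    docker run --rm --gpus all jax-base:local nccl-sanity-check.sh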
10 changes: 6 additions & 4 deletions .github/container/install-cudnn.sh
@@ -5,6 +5,8 @@ set -ex
export DEBIAN_FRONTEND=noninteractive
export TZ=America/Los_Angeles

CUDNN_MAJOR_VERSION=9

apt-get update

# Extract major CUDA version from `nvcc --version` output line
@@ -18,17 +20,17 @@ cuda_major_version=$(nvcc --version | sed -n 's/^.*release \([0-9]*\.[0-9]*\).*$
# version of CUDA and cuDNN are compatible.
# For example, CUDA 12.3 + cuDNN 8.9.6 (libcudnn8 version: 8.9.6.50-1+cuda12.2) is
# considered to be compatible.
libcudnn_version=$(apt-cache show libcudnn8 | sed -n "s/^Version: \(.*+cuda${cuda_major_version}\.[0-9]*\)$/\1/p" | head -n 1)
libcudnn_dev_version=$(apt-cache show libcudnn8-dev | sed -n "s/^Version: \(.*+cuda${cuda_major_version}\.[0-9]\)$/\1/p" | head -n 1)
libcudnn_version=$(apt-cache show libcudnn${CUDNN_MAJOR_VERSION} | sed -n "s/^Version: \(.*+cuda${cuda_major_version}\.[0-9]*\)$/\1/p" | head -n 1)
libcudnn_dev_version=$(apt-cache show libcudnn${CUDNN_MAJOR_VERSION}-dev | sed -n "s/^Version: \(.*+cuda${cuda_major_version}\.[0-9]\)$/\1/p" | head -n 1)
if [[ -z "${libcudnn_version}" || -z "${libcudnn_dev_version}" ]]; then
echo "Could not find compatible cuDNN version for CUDA ${cuda_version}"
exit 1
fi

apt-get update
apt-get install -y \
libcudnn8=${libcudnn_version} \
libcudnn8-dev=${libcudnn_dev_version}
libcudnn${CUDNN_MAJOR_VERSION}=${libcudnn_version} \
libcudnn${CUDNN_MAJOR_VERSION}-dev=${libcudnn_dev_version}

apt-get clean
rm -rf /var/lib/apt/lists/*
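To illustrate the parameterization above, a minimal sketch of the version lookup, assuming CUDA 12 and NVIDIA's usual `+cuda<major>.<minor>` version-suffix convention (the printed version string is a hypothetical example):

    CUDNN_MAJOR_VERSION=9
    cuda_major_version=12
    # Pick the newest libcudnn9 build whose version targets this CUDA major
    apt-cache show libcudnn${CUDNN_MAJOR_VERSION} \
      | sed -n "s/^Version: \(.*+cuda${cuda_major_version}\.[0-9]*\)$/\1/p" \
      | head -n 1
    # e.g. prints 9.1.0.70-1+cuda12.4, which the script then pins with
    # apt-get install -y libcudnn9=9.1.0.70-1+cuda12.4 ...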
20 changes: 20 additions & 0 deletions .github/container/install-nccl-sanity-check.sh
@@ -0,0 +1,20 @@
#!/bin/bash

set -ex

BIN_DIR=/usr/local/bin
NAME=nccl-sanity-check

# Build binary from source
nvcc -o "$BIN_DIR/$NAME" "/opt/$NAME.cu" -lcudart -lnccl

# Create the wrapper script that queries jax for the configuration
cat <<"EOF" > "$BIN_DIR/$NAME.sh"
#!/bin/bash
set -e
export NCCL_SANITY_CHECK_LATENCY_US=1000
NCCL_SANITY_CHECK_ARGS=$(python3 -c 'import jax; import jax.distributed; jax.distributed.initialize(); lds = jax.local_devices(); assert(len(lds) == 1); from jax._src.distributed import global_state as gs; print(gs.num_processes, gs.process_id, lds[0].local_hardware_id, gs.coordinator_address)')
set -x
nccl-sanity-check $NCCL_SANITY_CHECK_ARGS
EOF
chmod +x "$BIN_DIR/$NAME.sh"
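As a hedged illustration of the wrapper at run time (all values hypothetical): on a two-process job with one GPU per process, rank 0's embedded Python one-liner prints the world size, process id, local device id, and coordinator address, so the final command expands to roughly:

    # the python3 one-liner prints: 2 0 0 10.1.2.3:1234
    NCCL_SANITY_CHECK_LATENCY_US=1000 nccl-sanity-check 2 0 0 10.1.2.3:1234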
163 changes: 163 additions & 0 deletions .github/container/nccl-sanity-check.cu
@@ -0,0 +1,163 @@
#include <stdio.h>
#include "cuda_runtime.h"
#include "nccl.h"
#include <unistd.h>
#include <stdint.h>
#include <stdlib.h>
#include <inttypes.h>
#include <chrono>
#include <limits>
#include <tuple>


#define CUDACHECK(cmd) do { \
cudaError_t e = cmd; \
if( e != cudaSuccess ) { \
printf("Failed: CUDA error %s:%d '%s'\n", \
__FILE__,__LINE__,cudaGetErrorString(e)); \
exit(EXIT_FAILURE); \
} \
} while(0)


#define NCCLCHECK(cmd) do { \
ncclResult_t r = cmd; \
if (r!= ncclSuccess) { \
printf("Failed: NCCL error %s:%d '%s'\n", \
__FILE__,__LINE__,ncclGetErrorString(r)); \
exit(EXIT_FAILURE); \
} \
} while(0)


static int getEnvAsInt(const char* name, int default_val) {
char* str_val = getenv(name);
if (!str_val) {
return default_val;
}
int int_val;
if (sscanf(str_val, "%d", &int_val) != 1) {
printf("Failed: Could not parse env var %s as int: '%s'\n", name, str_val);
exit(EXIT_FAILURE);
}
return int_val;
}


void printUsageAndAbort(char* progName) {
printf("Usage: %s world-size world-rank local-rank coordinator-address\n", progName);
exit(EXIT_FAILURE);
}


void parseArgs(int argc, char* argv[], int* nRanks, int* myRank, int* localRank,
char* coordinatorAddress) {
if (argc != 5) {
printUsageAndAbort(argv[0]);
}
if (sscanf(argv[1], "%d", nRanks) != 1 || *nRanks <= 0) {
printf("Expected world-size to be a positive integer\n");
printUsageAndAbort(argv[0]);
}
if (sscanf(argv[2], "%d", myRank) != 1 || *myRank < 0 || *myRank >= *nRanks) {
printf("Expected world-rank to be an integer in [0;world-size)\n");
printUsageAndAbort(argv[0]);
}
if (sscanf(argv[3], "%d", localRank) != 1 || *localRank < 0) {
printf("Expected local-rank to be a non-negative integer\n");
printUsageAndAbort(argv[0]);
}
if (sscanf(argv[4], "%127s", coordinatorAddress) != 1 ||
strlen(coordinatorAddress) >= 127) {
printf("Expected coordinator-address to be a string (ip:port)\n");
printUsageAndAbort(argv[0]);
}
}


std::tuple<uint64_t, uint64_t> sampleAllReduces(int rank, int nRanks, ncclUniqueId id,
int size, int rounds) {
float *sendbuff, *recvbuff;
CUDACHECK(cudaMalloc((void**) &sendbuff, size * sizeof(float)));
CUDACHECK(cudaMalloc((void**) &recvbuff, size * sizeof(float)));

cudaStream_t s;
CUDACHECK(cudaStreamCreate(&s));

// Initialize NCCL
ncclComm_t comm;
NCCLCHECK(ncclCommInitRank(&comm, nRanks, id, rank));

// Sample a few rounds of minimal all-reduces
uint64_t minDuration = std::numeric_limits<uint64_t>::max();
uint64_t maxDuration = 0;
for (int i=0; i<rounds; i++) {
auto t_start = std::chrono::high_resolution_clock::now();

NCCLCHECK(ncclAllReduce((const void*)sendbuff, (void*)recvbuff, size, ncclFloat,
ncclSum, comm, s));
CUDACHECK(cudaStreamSynchronize(s));

uint64_t duration = std::chrono::duration_cast<std::chrono::microseconds>(
std::chrono::high_resolution_clock::now() - t_start).count();
if (duration < minDuration) {
minDuration = duration;
}
if (duration > maxDuration) {
maxDuration = duration;
}
}

// Clean up
CUDACHECK(cudaFree(sendbuff));
CUDACHECK(cudaFree(recvbuff));
ncclCommDestroy(comm);

return {minDuration, maxDuration};
}


int main(int argc, char* argv[])
{
// Number of floats communicated in all-reduce
int size = 1;
// Number of all-reduces to sample; only the best (fastest) round is considered.
int rounds = 10;
// Maximum best-round latency (in microseconds) allowed for the check to pass.
int threshold = getEnvAsInt("NCCL_SANITY_CHECK_LATENCY_US", 1000);

int nRanks, myRank, localRank;
char coordinatorAddress[128];
parseArgs(argc, argv, &nRanks, &myRank, &localRank, coordinatorAddress);

CUDACHECK(cudaSetDevice(localRank));

// Compute same NCCL unique id in all ranks
ncclUniqueId id;
if (setenv("NCCL_COMM_ID", coordinatorAddress, 1) != 0) {
printf("Failed: Could not set NCCL_COMM_ID\n");
exit(EXIT_FAILURE);
}

// ncclUniqueId is just a ncclBootstrapHandle (with some padding), see:
// https://github.com/NVIDIA/nccl/blob/v2.19/src/include/bootstrap.h#L14
// In the following call, the addr is initialized using NCCL_COMM_ID, but the
// magic is drawn from urandom bits. Usually this would only be done on rank 0
// and the resulting id would then be broadcast to all the other processes
// out-of-band (e.g. using standard MPI). Instead, here we've already settled
// on an appropriate ip and port for rank 0 (given in NCCL_COMM_ID), and all
// that remains is fixing the magic.
ncclGetUniqueId(&id);
*((uint64_t*) &id) = 0xDEADBEEFDEADBEEF;

// Estimate latency by running several all-reduces
auto[minDuration, maxDuration] = sampleAllReduces(myRank, nRanks, id, size, rounds);

// Report result of the sanity check
bool success = threshold >= minDuration;
printf(
"nccl-sanity-check success=%d rank=%d nRanks=%d rounds=%d threshold=%dus "
"minDuration=%" PRIu64 "us maxDuration=%" PRIu64 "us\n",
success, myRank, nRanks, rounds, threshold, minDuration, maxDuration);
return !success;
}
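A hedged sketch of exercising the binary by hand on a single rank (address and timings hypothetical); the check passes, and the process exits 0, when the best-observed all-reduce latency is at or below the threshold:

    export NCCL_SANITY_CHECK_LATENCY_US=1000
    # args: world-size world-rank local-rank coordinator-address
    nccl-sanity-check 1 0 0 127.0.0.1:12345 \
      && echo "latency within budget" || echo "latency above budget"
    # Example report line (values hypothetical):
    # nccl-sanity-check success=1 rank=0 nRanks=1 rounds=10 threshold=1000us minDuration=38us maxDuration=412us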
5 changes: 2 additions & 3 deletions .github/container/test-maxtext.sh
@@ -1,4 +1,5 @@
#!/bin/bash
set -exou pipefail

# # Parse command-line arguments

@@ -151,7 +152,7 @@ else
ici_DP=$DP
fi

if [ $ATTN_TYPE -eq 'cudnn_flash_te' ]; then
if [[ $ATTN_TYPE == 'cudnn_flash_te' ]]; then
ENABLE_FUSED_ATTN=1
REMAT_POLICY="minimal_flash"
fi
@@ -179,7 +180,6 @@ MAXTEXT_DIR="/opt/maxtext"
pushd ${MAXTEXT_DIR}

## Launch
set -ex

export NVTE_FUSED_ATTN=${ENABLE_FUSED_ATTN}
export XLA_PYTHON_CLIENT_MEM_FRACTION=${MEM_FRACTION}
@@ -217,5 +217,4 @@ RUN_SETTINGS="MaxText/train.py MaxText/configs/base.yml run_name=${RUN_NAME} log
echo "Command: python3 $RUN_SETTINGS"
python3 $RUN_SETTINGS

set +x
echo "Output at ${OUTPUT}"
4 changes: 3 additions & 1 deletion .github/container/test-pax.sh
@@ -388,6 +388,9 @@ else
fi

if [[ ${EVALUATE} -ne 0 ]]; then

trap "rm -rf ${OUTPUT}/checkpoints" ERR INT HUP TERM EXIT

## train for 0 steps to generate an initial checkpoint
python -m paxml.main \
--fdl_config=${CONFIG} \
@@ -408,7 +411,6 @@ if [[ ${EVALUATE} -ne 0 ]]; then
$ADDITIONAL_ARGS \
$([[ $MULTIPROCESS != 0 ]] && echo --multiprocess_gpu)

rm -rf ${OUTPUT}/checkpoints
else
python -m paxml.main \
--fdl_config=${CONFIG} \
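The move from a trailing `rm -rf` to a `trap` ensures checkpoint cleanup even when evaluation fails or the job is interrupted, not just on a clean run to completion. A minimal sketch of the pattern (the output path is hypothetical):

    OUTPUT=/tmp/pax-run   # hypothetical
    trap "rm -rf ${OUTPUT}/checkpoints" ERR INT HUP TERM EXIT
    # ... checkpoint-producing and evaluation steps ...
    # cleanup now fires on error, Ctrl-C, SIGHUP/SIGTERM, and normal exit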
2 changes: 1 addition & 1 deletion .github/workflows/_ci.yaml
@@ -494,7 +494,7 @@ jobs:
docker run -i --gpus all --shm-size=1g \
${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \
bash <<"EOF" |& tee test-levanter.log
pip install pytest
pip install flake8 pytest soundfile librosa
PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests
EOF
STATISTICS_SCRIPT: |
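A note on the `bash <<"EOF" |& tee` idiom in that step: `|&` pipes stderr along with stdout, so the log captures everything the tests emit. Equivalent sketch:

    bash <<"EOF" |& tee demo.log    # same as: bash <<"EOF" 2>&1 | tee demo.log
    echo "to stdout"
    echo "to stderr" >&2
    EOF
    # demo.log now contains both lines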
51 changes: 49 additions & 2 deletions .github/workflows/_finalize.yaml
@@ -168,12 +168,59 @@ jobs:
done
echo "]" >> fsitrep.json
mv fsitrep.json sitrep.json
- name: Combine all json into final summary
shell: bash -x -e {0}
run: |
# Combine all json files into a final summary json
output="combined.json"
combinedJson="{}"
# Loop over subdirectories in the current directory
while IFS= read -r dir; do
echo $dir
dirName=$(basename $dir) && [ -d "$dir" ] || continue
# Initialize default empty JSON objects
sitrep="{}"
metricSummary="{}"
status="{}"
# Check if the sitrep.json file exists and read it, else use default
if [ -f "$dir/sitrep.json" ]; then
sitrep=$(<"$dir/sitrep.json")
fi
# Check if the metrics_summary.json file exists and read it, else use default
if [ -f "$dir/metrics_summary.json" ]; then
metricSummary=$(<"$dir/metrics_summary.json")
fi
# Use Bash globbing to find the *-status.json file and read it, else use default
statusFiles=("$dir"/*-status.json)
statusFile=${statusFiles[0]}
if [ -f $statusFile ]; then
status=$(<"${statusFile}")
fi
# Use jq to merge the JSON data
combinedJson=$(jq --arg dirName "$dirName" \
--argjson sitrep "$sitrep" \
--argjson status "$status" \
--argjson metricSummary "$metricSummary" \
'.[$dirName] = {"sitrep": $sitrep, "status": $status, "metrics_summary": $metricSummary}' <<<"$combinedJson")
done < <(find . -maxdepth 1 -type d)
# Output the combined JSON to the file, nicely formatted
echo "$combinedJson" | jq '.' > "$output"
- name: Upload training logs as artifacts
uses: actions/upload-artifact@v4
with:
name: ${{ inputs.ARTIFACT_NAME }}
path: ./sitrep.json
path: |
./sitrep.json
./combined.json
publish-badge:
needs: [upload-badge]
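To make the jq merge in the new "Combine all json" step concrete, a hedged example for a single subdirectory with hypothetical inputs:

    combinedJson="{}"
    combinedJson=$(jq --arg dirName "test-levanter" \
        --argjson sitrep '{"summary": "all tests passed"}' \
        --argjson status '{"state": "success"}' \
        --argjson metricSummary '{}' \
        '.[$dirName] = {"sitrep": $sitrep, "status": $status, "metrics_summary": $metricSummary}' \
        <<<"$combinedJson")
    echo "$combinedJson" | jq '.'
    # => {"test-levanter": {"sitrep": {"summary": "all tests passed"},
    #      "status": {"state": "success"}, "metrics_summary": {}}}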
