Commit
Merge branch 'main' into ko3n1g/ci/selective-triggering
ko3n1g authored May 15, 2024
2 parents a1b346c + fe7e193 commit f71c124
Showing 17 changed files with 386 additions and 22 deletions.
11 changes: 9 additions & 2 deletions .github/container/Dockerfile.base
@@ -1,5 +1,5 @@
# syntax=docker/dockerfile:1-labs
ARG BASE_IMAGE=nvidia/cuda:12.3.0-devel-ubuntu22.04
ARG BASE_IMAGE=nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
ARG GIT_USER_NAME="JAX Toolbox"
ARG [email protected]
ARG CLANG_VERSION=17
@@ -146,7 +146,6 @@ RUN install-nsight.sh
###############################################################################

ADD install-cudnn.sh /usr/local/bin
RUN install-cudnn.sh

###############################################################################
## Install NCCL
@@ -173,6 +172,14 @@ ADD --chmod=777 \
ENV LD_LIBRARY_PATH=/opt/amazon/efa/lib:${LD_LIBRARY_PATH}
ENV PATH=/opt/amazon/efa/bin:${PATH}

##############################################################################
## NCCL sanity check utility
##############################################################################

ADD install-nccl-sanity-check.sh /usr/local/bin
ADD nccl-sanity-check.cu /opt
RUN install-nccl-sanity-check.sh

###############################################################################
## Add the systemcheck to the entrypoint.
###############################################################################
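For orientation, a hedged usage sketch of the new sanity-check plumbing (the image tag is hypothetical, and the wrapper assumes a distributed launcher has already populated JAX's coordinator environment):

    # Build the base image; install-nccl-sanity-check.sh compiles
    # /opt/nccl-sanity-check.cu with nvcc during the build.
    docker build -f .github/container/Dockerfile.base -t jax-base:local .
    # At run time each rank invokes the wrapper, which asks JAX for the
    # world size, rank, local device id, and coordinator address.
    docker run --rm --gpus all jax-base:local nccl-sanity-check.sh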
10 changes: 6 additions & 4 deletions .github/container/install-cudnn.sh
@@ -5,6 +5,8 @@ set -ex
export DEBIAN_FRONTEND=noninteractive
export TZ=America/Los_Angeles

CUDNN_MAJOR_VERSION=9

apt-get update

# Extract major CUDA version from `nvcc --version` output line
@@ -18,17 +20,17 @@ cuda_major_version=$(nvcc --version | sed -n 's/^.*release \([0-9]*\.[0-9]*\).*$
# version of CUDA and cuDNN are compatible.
# For example, CUDA 12.3 + cuDNN 8.9.6 (libcudnn8 version: 8.9.6.50-1+cuda12.2) is
# considered to be compatible.
libcudnn_version=$(apt-cache show libcudnn8 | sed -n "s/^Version: \(.*+cuda${cuda_major_version}\.[0-9]*\)$/\1/p" | head -n 1)
libcudnn_dev_version=$(apt-cache show libcudnn8-dev | sed -n "s/^Version: \(.*+cuda${cuda_major_version}\.[0-9]\)$/\1/p" | head -n 1)
libcudnn_version=$(apt-cache show libcudnn${CUDNN_MAJOR_VERSION} | sed -n "s/^Version: \(.*+cuda${cuda_major_version}\.[0-9]*\)$/\1/p" | head -n 1)
libcudnn_dev_version=$(apt-cache show libcudnn${CUDNN_MAJOR_VERSION}-dev | sed -n "s/^Version: \(.*+cuda${cuda_major_version}\.[0-9]\)$/\1/p" | head -n 1)
if [[ -z "${libcudnn_version}" || -z "${libcudnn_dev_version}" ]]; then
echo "Could not find compatible cuDNN version for CUDA ${cuda_version}"
exit 1
fi

apt-get update
apt-get install -y \
libcudnn8=${libcudnn_version} \
libcudnn8-dev=${libcudnn_dev_version}
libcudnn${CUDNN_MAJOR_VERSION}=${libcudnn_version} \
libcudnn${CUDNN_MAJOR_VERSION}-dev=${libcudnn_dev_version}

apt-get clean
rm -rf /var/lib/apt/lists/*
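To illustrate the parameterization above, a minimal sketch of the version lookup, assuming CUDA 12 and NVIDIA's usual `+cuda<major>.<minor>` version-suffix convention (the printed version string is a hypothetical example):

    CUDNN_MAJOR_VERSION=9
    cuda_major_version=12
    # Pick the newest libcudnn9 build whose version targets this CUDA major
    apt-cache show libcudnn${CUDNN_MAJOR_VERSION} \
      | sed -n "s/^Version: \(.*+cuda${cuda_major_version}\.[0-9]*\)$/\1/p" \
      | head -n 1
    # e.g. prints 9.1.0.70-1+cuda12.4, which the script then pins with
    # apt-get install -y libcudnn9=9.1.0.70-1+cuda12.4 ...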
20 changes: 20 additions & 0 deletions .github/container/install-nccl-sanity-check.sh
@@ -0,0 +1,20 @@
#!/bin/bash

set -ex

BIN_DIR=/usr/local/bin
NAME=nccl-sanity-check

# Build binary from source
nvcc -o "$BIN_DIR/$NAME" "/opt/$NAME.cu" -lcudart -lnccl

# Create the wrapper script that queries jax for the configuration
cat <<"EOF" > "$BIN_DIR/$NAME.sh"
#!/bin/bash
set -e
export NCCL_SANITY_CHECK_LATENCY_US=1000
NCCL_SANITY_CHECK_ARGS=$(python3 -c 'import jax; import jax.distributed; jax.distributed.initialize(); lds = jax.local_devices(); assert(len(lds) == 1); from jax._src.distributed import global_state as gs; print(gs.num_processes, gs.process_id, lds[0].local_hardware_id, gs.coordinator_address)')
set -x
nccl-sanity-check $NCCL_SANITY_CHECK_ARGS
EOF
chmod +x "$BIN_DIR/$NAME.sh"
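As a hedged illustration of the wrapper at run time (all values hypothetical): on a two-process job with one GPU per process, rank 0's embedded Python one-liner prints the world size, process id, local device id, and coordinator address, so the final command expands to roughly:

    # the python3 one-liner prints: 2 0 0 10.1.2.3:1234
    NCCL_SANITY_CHECK_LATENCY_US=1000 nccl-sanity-check 2 0 0 10.1.2.3:1234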
163 changes: 163 additions & 0 deletions .github/container/nccl-sanity-check.cu
@@ -0,0 +1,163 @@
#include <stdio.h>
#include "cuda_runtime.h"
#include "nccl.h"
#include <unistd.h>
#include <stdint.h>
#include <stdlib.h>
#include <inttypes.h>
#include <chrono>
#include <limits>
#include <tuple>


#define CUDACHECK(cmd) do { \
cudaError_t e = cmd; \
if( e != cudaSuccess ) { \
printf("Failed: CUDA error %s:%d '%s'\n", \
__FILE__,__LINE__,cudaGetErrorString(e)); \
exit(EXIT_FAILURE); \
} \
} while(0)


#define NCCLCHECK(cmd) do { \
ncclResult_t r = cmd; \
if (r!= ncclSuccess) { \
printf("Failed: NCCL error %s:%d '%s'\n", \
__FILE__,__LINE__,ncclGetErrorString(r)); \
exit(EXIT_FAILURE); \
} \
} while(0)


static int getEnvAsInt(const char* name, int default_val) {
char* str_val = getenv(name);
if (!str_val) {
return default_val;
}
int int_val;
if (sscanf(str_val, "%d", &int_val) != 1) {
printf("Failed: Could not parse env var %s as int: '%s'\n", name, str_val);
exit(EXIT_FAILURE);
}
return int_val;
}


void printUsageAndAbort(char* progName) {
printf("Usage: %s world-size world-rank local-rank coordinator-address\n", progName);
exit(EXIT_FAILURE);
}


void parseArgs(int argc, char* argv[], int* nRanks, int* myRank, int* localRank,
char* coordinatorAddress) {
if (argc != 5) {
printUsageAndAbort(argv[0]);
}
if (sscanf(argv[1], "%d", nRanks) != 1 || *nRanks <= 0) {
printf("Expected world-size to be a positive integer\n");
printUsageAndAbort(argv[0]);
}
if (sscanf(argv[2], "%d", myRank) != 1 || *myRank < 0 || *myRank >= *nRanks) {
printf("Expected world-rank to be an integer in [0;world-size)\n");
printUsageAndAbort(argv[0]);
}
if (sscanf(argv[3], "%d", localRank) != 1 || *localRank < 0) {
printf("Expected local-rank to be a non-negative integer\n");
printUsageAndAbort(argv[0]);
}
if (sscanf(argv[4], "%127s", coordinatorAddress) != 1 ||
strlen(coordinatorAddress) >= 127) {
printf("Expected coordinator-address to be a string (ip:port)\n");
printUsageAndAbort(argv[0]);
}
}


std::tuple<uint64_t, uint64_t> sampleAllReduces(int rank, int nRanks, ncclUniqueId id,
int size, int rounds) {
float *sendbuff, *recvbuff;
CUDACHECK(cudaMalloc((void**) &sendbuff, size * sizeof(float)));
CUDACHECK(cudaMalloc((void**) &recvbuff, size * sizeof(float)));

cudaStream_t s;
CUDACHECK(cudaStreamCreate(&s));

// Initialize NCCL
ncclComm_t comm;
NCCLCHECK(ncclCommInitRank(&comm, nRanks, id, rank));

// Sample a few rounds of minimal all-reduces
uint64_t minDuration = std::numeric_limits<uint64_t>::max();
uint64_t maxDuration = 0;
for (int i=0; i<rounds; i++) {
auto t_start = std::chrono::high_resolution_clock::now();

NCCLCHECK(ncclAllReduce((const void*)sendbuff, (void*)recvbuff, size, ncclFloat,
ncclSum, comm, s));
CUDACHECK(cudaStreamSynchronize(s));

uint64_t duration = std::chrono::duration_cast<std::chrono::microseconds>(
std::chrono::high_resolution_clock::now() - t_start).count();
if (duration < minDuration) {
minDuration = duration;
}
if (duration > maxDuration) {
maxDuration = duration;
}
}

// Clean up
CUDACHECK(cudaFree(sendbuff));
CUDACHECK(cudaFree(recvbuff));
ncclCommDestroy(comm);

return {minDuration, maxDuration};
}


int main(int argc, char* argv[])
{
// Number of floats communicated in all-reduce
int size = 1;
// Number of all-reduces to sample; only the best (fastest) round is considered.
int rounds = 10;
// Maximum best-round latency (in microseconds) allowed for the check to pass.
int threshold = getEnvAsInt("NCCL_SANITY_CHECK_LATENCY_US", 1000);

int nRanks, myRank, localRank;
char coordinatorAddress[128];
parseArgs(argc, argv, &nRanks, &myRank, &localRank, coordinatorAddress);

CUDACHECK(cudaSetDevice(localRank));

// Compute same NCCL unique id in all ranks
ncclUniqueId id;
if (setenv("NCCL_COMM_ID", coordinatorAddress, 1) != 0) {
printf("Failed: Could not set NCCL_COMM_ID\n");
exit(EXIT_FAILURE);
}

// ncclUniqueId is just a ncclBootstrapHandle (with some padding), see:
// https://github.com/NVIDIA/nccl/blob/v2.19/src/include/bootstrap.h#L14
// In the following call, the addr is initialized using NCCL_COMM_ID, but the
// magic is drawn from urandom bits. Usually this would only be done on rank 0
// and the resulting id would then be broadcast to all the other processes
// out-of-band (e.g. using standard MPI). Instead, here we've already settled
// on an appropriate ip and port for rank 0 (given in NCCL_COMM_ID), and all
// that remains is fixing the magic.
ncclGetUniqueId(&id);
*((uint64_t*) &id) = 0xDEADBEEFDEADBEEF;

// Estimate latency by running several all-reduces
auto[minDuration, maxDuration] = sampleAllReduces(myRank, nRanks, id, size, rounds);

// Report result of the sanity check
bool success = threshold >= minDuration;
printf(
"nccl-sanity-check success=%d rank=%d nRanks=%d rounds=%d threshold=%dus "
"minDuration=%" PRIu64 "us maxDuration=%" PRIu64 "us\n",
success, myRank, nRanks, rounds, threshold, minDuration, maxDuration);
return !success;
}
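A hedged sketch of exercising the binary by hand on a single rank (address and timings hypothetical); the check passes, and the process exits 0, when the best-observed all-reduce latency is at or below the threshold:

    export NCCL_SANITY_CHECK_LATENCY_US=1000
    # args: world-size world-rank local-rank coordinator-address
    nccl-sanity-check 1 0 0 127.0.0.1:12345 \
      && echo "latency within budget" || echo "latency above budget"
    # Example report line (values hypothetical):
    # nccl-sanity-check success=1 rank=0 nRanks=1 rounds=10 threshold=1000us minDuration=38us maxDuration=412us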
5 changes: 2 additions & 3 deletions .github/container/test-maxtext.sh
@@ -1,4 +1,5 @@
#!/bin/bash
set -exou pipefail

# # Parse command-line arguments

@@ -151,7 +152,7 @@ else
ici_DP=$DP
fi

if [ $ATTN_TYPE -eq 'cudnn_flash_te' ]; then
if [[ $ATTN_TYPE == 'cudnn_flash_te' ]]; then
ENABLE_FUSED_ATTN=1
REMAT_POLICY="minimal_flash"
fi
@@ -179,7 +180,6 @@ MAXTEXT_DIR="/opt/maxtext"
pushd ${MAXTEXT_DIR}

## Launch
set -ex

export NVTE_FUSED_ATTN=${ENABLE_FUSED_ATTN}
export XLA_PYTHON_CLIENT_MEM_FRACTION=${MEM_FRACTION}
@@ -217,5 +217,4 @@ RUN_SETTINGS="MaxText/train.py MaxText/configs/base.yml run_name=${RUN_NAME} log
echo "Command: python3 $RUN_SETTINGS"
python3 $RUN_SETTINGS

set +x
echo "Output at ${OUTPUT}"
4 changes: 3 additions & 1 deletion .github/container/test-pax.sh
@@ -388,6 +388,9 @@ else
fi

if [[ ${EVALUATE} -ne 0 ]]; then

trap "rm -rf ${OUTPUT}/checkpoints" ERR INT HUP TERM EXIT

## train for 0 steps to generate an initial checkpoint
python -m paxml.main \
--fdl_config=${CONFIG} \
@@ -408,7 +411,6 @@ if [[ ${EVALUATE} -ne 0 ]]; then
$ADDITIONAL_ARGS \
$([[ $MULTIPROCESS != 0 ]] && echo --multiprocess_gpu)

rm -rf ${OUTPUT}/checkpoints
else
python -m paxml.main \
--fdl_config=${CONFIG} \
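The move from a trailing `rm -rf` to a `trap` ensures checkpoint cleanup even when evaluation fails or the job is interrupted, not just on a clean run to completion. A minimal sketch of the pattern (the output path is hypothetical):

    OUTPUT=/tmp/pax-run   # hypothetical
    trap "rm -rf ${OUTPUT}/checkpoints" ERR INT HUP TERM EXIT
    # ... checkpoint-producing and evaluation steps ...
    # cleanup now fires on error, Ctrl-C, SIGHUP/SIGTERM, and normal exit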
2 changes: 1 addition & 1 deletion .github/workflows/_ci.yaml
@@ -494,7 +494,7 @@ jobs:
docker run -i --gpus all --shm-size=1g \
${{ needs.build-levanter.outputs.DOCKER_TAG_FINAL }} \
bash <<"EOF" |& tee test-levanter.log
pip install pytest
pip install flake8 pytest soundfile librosa
PYTHONPATH=/opt/levanter/tests:$PYTHONPATH pytest /opt/levanter/tests
EOF
STATISTICS_SCRIPT: |
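A note on the `bash <<"EOF" |& tee` idiom in that step: `|&` pipes stderr along with stdout, so the log captures everything the tests emit. Equivalent sketch:

    bash <<"EOF" |& tee demo.log    # same as: bash <<"EOF" 2>&1 | tee demo.log
    echo "to stdout"
    echo "to stderr" >&2
    EOF
    # demo.log now contains both lines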
51 changes: 49 additions & 2 deletions .github/workflows/_finalize.yaml
@@ -168,12 +168,59 @@ jobs:
done
echo "]" >> fsitrep.json
mv fsitrep.json sitrep.json
- name: Combine all json into final summary
shell: bash -x -e {0}
run: |
# Combine all json files into a final summary json
output="combined.json"
combinedJson="{}"
# Loop over subdirectories in the current directory
while IFS= read -r dir; do
echo $dir
dirName=$(basename $dir) && [ -d "$dir" ] || continue
# Initialize default empty JSON objects
sitrep="{}"
metricSummary="{}"
status="{}"
# Check if the sitrep.json file exists and read it, else use default
if [ -f "$dir/sitrep.json" ]; then
sitrep=$(<"$dir/sitrep.json")
fi
# Check if the metrics_summary.json file exists and read it, else use default
if [ -f "$dir/metrics_summary.json" ]; then
metricSummary=$(<"$dir/metrics_summary.json")
fi
# Use Bash globbing to find the *-status.json file and read it, else use default
statusFiles=("$dir"/*-status.json)
statusFile=${statusFiles[0]}
if [ -f $statusFile ]; then
status=$(<"${statusFile}")
fi
# Use jq to merge the JSON data
combinedJson=$(jq --arg dirName "$dirName" \
--argjson sitrep "$sitrep" \
--argjson status "$status" \
--argjson metricSummary "$metricSummary" \
'.[$dirName] = {"sitrep": $sitrep, "status": $status, "metrics_summary": $metricSummary}' <<<"$combinedJson")
done < <(find . -maxdepth 1 -type d)
# Output the combined JSON to the file, nicely formatted
echo "$combinedJson" | jq '.' > "$output"
- name: Upload training logs as artifacts
uses: actions/upload-artifact@v4
with:
name: ${{ inputs.ARTIFACT_NAME }}
path: ./sitrep.json
path: |
./sitrep.json
./combined.json
publish-badge:
needs: [upload-badge]
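To make the jq merge in the new "Combine all json" step concrete, a hedged example for a single subdirectory with hypothetical inputs:

    combinedJson="{}"
    combinedJson=$(jq --arg dirName "test-levanter" \
        --argjson sitrep '{"summary": "all tests passed"}' \
        --argjson status '{"state": "success"}' \
        --argjson metricSummary '{}' \
        '.[$dirName] = {"sitrep": $sitrep, "status": $status, "metrics_summary": $metricSummary}' \
        <<<"$combinedJson")
    echo "$combinedJson" | jq '.'
    # => {"test-levanter": {"sitrep": {"summary": "all tests passed"},
    #      "status": {"state": "success"}, "metrics_summary": {}}}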
