From b0ec72a3e72600379453eb320afec9a45538e9c3 Mon Sep 17 00:00:00 2001
From: Olli Lupton
Date: Thu, 16 Jan 2025 09:41:13 +0100
Subject: [PATCH] CI: run MaxText tests on AWS with NGC release candidate images (#1237)

---
 .github/eks-workflow-files/maxtext-job.yaml | 120 ++++++++++++++++++++
 .github/workflows/_test_maxtext_k8s.yaml    | 107 +++++++++++++++++
 .github/workflows/ngc-release-testing.yaml  |  13 ++-
 3 files changed, 237 insertions(+), 3 deletions(-)
 create mode 100644 .github/eks-workflow-files/maxtext-job.yaml
 create mode 100644 .github/workflows/_test_maxtext_k8s.yaml

diff --git a/.github/eks-workflow-files/maxtext-job.yaml b/.github/eks-workflow-files/maxtext-job.yaml
new file mode 100644
index 000000000..7d9728f87
--- /dev/null
+++ b/.github/eks-workflow-files/maxtext-job.yaml
@@ -0,0 +1,120 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: PLACEHOLDER
+spec:
+  clusterIP: None # clusterIP must be None to create a headless service
+  selector:
+    job-name: PLACEHOLDER # must match Job name
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: PLACEHOLDER
+  labels:
+    kueue.x-k8s.io/queue-name: p5-queue
+spec:
+  completions: 2 # number of nodes
+  parallelism: 2 # number of nodes
+  completionMode: Indexed
+  backoffLimitPerIndex: 0 # max failures per index
+  maxFailedIndexes: 0 # all indices must succeed
+  template:
+    spec:
+      subdomain: PLACEHOLDER # has to match Service name
+      restartPolicy: Never
+      imagePullSecrets:
+        - name: PLACEHOLDER
+      containers:
+        - name: maxtext
+          image: PLACEHOLDER
+          ports:
+            - containerPort: 3389
+          command:
+            - bash
+            - -c
+            # The logging logic: stream stdout/stderr from the 0th process inside this pod,
+            # record all of the processes' stdout/stderr + the INFO-level NCCL logs to file
+            - |
+              export SERVICE_NAME=$0
+              export JOB_NAME=$1
+              cat >each-process.sh <<'EOL'
+              export JAX_COORDINATOR_IP=${JOB_NAME}-0.${SERVICE_NAME}
+              export JAX_COORDINATOR_PORT=3389
+              export NNODES=16 # actually #processes == #GPUs
+              export NODE_RANK=$((JOB_COMPLETION_INDEX*8 + LOCAL_RANK))
+              export JAX_LOCAL_DEVICE_IDS=$LOCAL_RANK
+              export NCCL_DEBUG=INFO
+              export NCCL_DEBUG_FILE=/opt/output/nccl.$NODE_RANK.log
+              [[ $LOCAL_RANK == 0 ]] && console="/dev/stdout" || console="/dev/null"
+              nsys-jax \
+                --capture-range=cudaProfilerApi \
+                --capture-range-end=stop \
+                -o /opt/output/profile.$NODE_RANK.zip \
+                -- \
+                test-maxtext.sh \
+                  -n 2 \
+                  -b 2 \
+                  --model-name=llama2-7b \
+                  --attn-type=cudnn_flash_te \
+                  --remat-policy=minimal_flash \
+                  --steps=20 \
+                  --fsdp=16 \
+                  -a "scan_layers=false \
+                      max_target_length=4096 \
+                      use_iota_embed=true \
+                      logits_dot_in_fp32=false \
+                      profiler=nsys \
+                      skip_first_n_steps_for_profiler=3 \
+                      profiler_steps=8" \
+                |& tee /opt/output/output.$NODE_RANK.log >"${console}"
+              code=${PIPESTATUS[0]} # status of nsys-jax; a plain $? would be tee's
+              # Should run even on failure
+              cat /opt/output/nccl.$NODE_RANK.log >"${console}"
+              exit $code
+              EOL
+              # TODO: upgrade parallel-launch to return a failure code as soon as any
+              #       of its children do (it already does this eventually, but it could
+              #       be slow)
+              parallel-launch LOCAL_RANK 8 bash each-process.sh
+              code=$?
+              # Should run even on failure; .done tells the upload container to start
+              touch /opt/output/.done
+              exit $code
+            - PLACEHOLDER
+            - PLACEHOLDER
+          resources:
+            limits:
+              nvidia.com/gpu: 8
+              vpc.amazonaws.com/efa: 32
+          volumeMounts:
+            - mountPath: /dev/shm
+              name: shmem
+            - mountPath: /opt/output
+              name: output
+        - name: upload
+          image: amazon/aws-cli
+          command:
+            - bash
+            - -c
+            - |
+              JOB_NAME="$0"
+              while [[ ! -f /opt/output/.done ]]; do
+                sleep 1
+              done
+              rm /opt/output/.done
+              aws s3 cp \
+                --recursive \
+                /opt/output \
+                "s3://jax-toolbox-eks-output/${JOB_NAME}/"
+            - PLACEHOLDER
+          volumeMounts:
+            - mountPath: /opt/output
+              name: output
+      volumes:
+        - name: output
+          emptyDir: {}
+        - name: shmem
+          emptyDir:
+            medium: Memory
+            sizeLimit: 16Gi
diff --git a/.github/workflows/_test_maxtext_k8s.yaml b/.github/workflows/_test_maxtext_k8s.yaml
new file mode 100644
index 000000000..7f82d3f42
--- /dev/null
+++ b/.github/workflows/_test_maxtext_k8s.yaml
@@ -0,0 +1,107 @@
+name: ~test MaxText functionality on Kubernetes
+
+on:
+  workflow_call:
+    inputs:
+      MAXTEXT_IMAGE:
+        type: string
+        description: MaxText container to test
+        required: true
+
+permissions:
+  contents: read # to fetch code
+
+jobs:
+  maxtext:
+    runs-on: eks
+    env:
+      CONTAINER_IMAGE: "${{ inputs.MAXTEXT_IMAGE }}"
+      JOB_NAME: "maxtext-${{ github.run_id }}-${{ github.run_attempt }}"
+    steps:
+      - name: Check out the repository
+        uses: actions/checkout@v4
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Login to NVIDIA Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: nvcr.io
+          username: $oauthtoken
+          password: ${{ secrets.NVCR_TOKEN }}
+      - name: Store GitHub Container Registry token as Kubernetes secret
+        run: |
+          # Make this available to later steps
+          TOKEN_NAME="${JOB_NAME}-token"
+          echo "TOKEN_NAME=${TOKEN_NAME}" >> "$GITHUB_ENV"
+          kubectl create secret generic \
+            ${TOKEN_NAME} \
+            --from-file=.dockerconfigjson=$HOME/.docker/config.json \
+            --type=kubernetes.io/dockerconfigjson
+      - name: Configure Kubernetes job
+        run: |
+          export SERVICE_NAME="${JOB_NAME}-svc"
+          yq -i ea 'select(di == 0).metadata.name = strenv(SERVICE_NAME)
+            | select(di == 0).spec.selector.job-name = strenv(JOB_NAME)
+            | select(di == 1).metadata.name = strenv(JOB_NAME)
+            | select(di == 1).spec.template.spec.subdomain = strenv(SERVICE_NAME)
+            | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
+            | select(di == 1).spec.template.spec.containers[0].image = strenv(CONTAINER_IMAGE)
+            | select(di == 1).spec.template.spec.containers[0].command[3] = strenv(SERVICE_NAME)
+            | select(di == 1).spec.template.spec.containers[0].command[4] = strenv(JOB_NAME)
+            | select(di == 1).spec.template.spec.containers[1].command[3] = strenv(JOB_NAME)' \
+          .github/eks-workflow-files/maxtext-job.yaml
+          git diff .github/eks-workflow-files/maxtext-job.yaml
+      - name: Submit Kubernetes job
+        run: kubectl apply -f .github/eks-workflow-files/maxtext-job.yaml
+      - name: Wait for Kubernetes job to start
+        run: |
+          # Launcher job is created eagerly, but suspended. Kueue un-suspends it when
+          # resources are available, but that is where there can be a long wait if the
+          # cluster is busy executing other jobs.
+          kubectl wait --for=create job/${JOB_NAME}
+          kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${JOB_NAME} --timeout=3600s
+      - name: Stream Kubernetes job output
+        run: |
+          # Streaming logs will fail if the container/pod is still pending
+          while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${JOB_NAME} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
+            sleep 1
+          done
+          kubectl logs --all-containers=true --all-pods=true --follow job/${JOB_NAME}
+      - name: Retrieve Kubernetes job status
+        shell: bash -exo pipefail {0}
+        run: |
+          while readarray -d : -t status < <(kubectl get job/${JOB_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do
+            failure=${status[0]:-0}
+            success=${status[1]:-0}
+            total=$((failure+success))
+            if [[ ${total} -lt 2 ]]; then
+              sleep 1
+            elif [[ ${total} -eq 2 ]]; then
+              break
+            else
+              # FIXME: failed+succeeded > 2, but the Job has only 2 completions
+              exit 255
+            fi
+          done
+          exit ${failure}
+      # Provide more debug output in case of failure; note that some kinds of launch
+      # failure do not produce any log output.
+      - name: Debug failed Kubernetes job
+        if: failure()
+        run: |
+          # Provide better debug in case of launch failures that will not produce log output
+          pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${JOB_NAME} -o name)
+          if [[ -n "${pods}" ]]; then
+            kubectl describe ${pods}
+          fi
+      # Clean up in case of errors as well as success
+      - name: Delete Kubernetes job
+        if: always()
+        run: kubectl delete -f .github/eks-workflow-files/maxtext-job.yaml
+      - name: Delete GitHub Container Registry token
+        if: always()
+        run: kubectl delete secret ${TOKEN_NAME}
diff --git a/.github/workflows/ngc-release-testing.yaml b/.github/workflows/ngc-release-testing.yaml
index 15e0ed1f7..ac2bea923 100644
--- a/.github/workflows/ngc-release-testing.yaml
+++ b/.github/workflows/ngc-release-testing.yaml
@@ -45,7 +45,7 @@ jobs:
           docker run -i --shm-size=1g --gpus all \
             ${{ inputs.JAX_IMAGE }} \
             bash <<"EOF" |& tee test-backend-independent.log
-            test-jax.sh -b backend-independent 
+            test-jax.sh -b backend-independent
           EOF
           docker run -i --shm-size=1g --gpus all \
             ${{ inputs.JAX_IMAGE }} \
@@ -80,8 +80,15 @@ jobs:
       MAXTEXT_IMAGE: ${{ inputs.MAXTEXT_IMAGE }}
     secrets: inherit
 
+  test-maxtext-eks:
+    if: inputs.MAXTEXT_IMAGE != ''
+    uses: ./.github/workflows/_test_maxtext_k8s.yaml
+    with:
+      MAXTEXT_IMAGE: ${{ inputs.MAXTEXT_IMAGE }}
+    secrets: inherit
+
   finalize:
-    needs: [ test-nccl, test-jax, test-rosetta-pax, test-maxtext ]
+    needs: [ test-nccl, test-jax, test-rosetta-pax, test-maxtext, test-maxtext-eks ]
     if: "!cancelled()"
     uses: ./.github/workflows/_finalize.yaml
-    secrets: inherit
\ No newline at end of file
+    secrets: inherit