From b0ec72a3e72600379453eb320afec9a45538e9c3 Mon Sep 17 00:00:00 2001
From: Olli Lupton <olupton@nvidia.com>
Date: Thu, 16 Jan 2025 09:41:13 +0100
Subject: [PATCH]  CI: run MaxText tests on AWS with NGC release candidate
 images (#1237)

---
 .github/eks-workflow-files/maxtext-job.yaml | 120 ++++++++++++++++++++
 .github/workflows/_test_maxtext_k8s.yaml    | 107 +++++++++++++++++
 .github/workflows/ngc-release-testing.yaml  |  13 ++-
 3 files changed, 237 insertions(+), 3 deletions(-)
 create mode 100644 .github/eks-workflow-files/maxtext-job.yaml
 create mode 100644 .github/workflows/_test_maxtext_k8s.yaml

diff --git a/.github/eks-workflow-files/maxtext-job.yaml b/.github/eks-workflow-files/maxtext-job.yaml
new file mode 100644
index 000000000..7d9728f87
--- /dev/null
+++ b/.github/eks-workflow-files/maxtext-job.yaml
@@ -0,0 +1,120 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: PLACEHOLDER # set by _test_maxtext_k8s.yaml to "${JOB_NAME}-svc"
+spec:
+  clusterIP: None # clusterIP must be None to create a headless service
+  selector:
+    job-name: PLACEHOLDER # must match Job name; filled in by the same yq step
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: PLACEHOLDER # set by _test_maxtext_k8s.yaml to its JOB_NAME
+  labels:
+    kueue.x-k8s.io/queue-name: p5-queue # Kueue queue; Kueue un-suspends the Job when resources are free
+spec:
+  completions: 2 # number of nodes
+  parallelism: 2 # number of nodes
+  completionMode: Indexed # the container script reads JOB_COMPLETION_INDEX to derive ranks
+  backoffLimitPerIndex: 0 # max failures per index
+  maxFailedIndexes:     0 # all indices must succeed
+  template:
+    spec:
+      subdomain: PLACEHOLDER # has to match Service name
+      restartPolicy: Never
+      imagePullSecrets:
+        - name: PLACEHOLDER # name of the registry-credentials secret created by the workflow
+      containers:
+        - name: maxtext
+          image: PLACEHOLDER # set by the workflow to the MaxText image under test
+          ports:
+            - containerPort: 3389 # same port as JAX_COORDINATOR_PORT in the script below
+          command:
+            - bash
+            - -c
+            # Logging: only the LOCAL_RANK 0 process of each pod streams to the pod log; every
+            # process writes its own output, nsys-jax profile and INFO-level NCCL log to /opt/output
+            - |
+              export SERVICE_NAME=$0
+              export JOB_NAME=$1
+              cat >each-process.sh <<'EOL'
+              export JAX_COORDINATOR_IP=${JOB_NAME}-0.${SERVICE_NAME}
+              export JAX_COORDINATOR_PORT=3389
+              export NNODES=16 # actually #processes == #GPUs
+              export NODE_RANK=$((JOB_COMPLETION_INDEX*8 + LOCAL_RANK))
+              export JAX_LOCAL_DEVICE_IDS=$LOCAL_RANK
+              export NCCL_DEBUG=INFO
+              export NCCL_DEBUG_FILE=/opt/output/nccl.$NODE_RANK.log
+              [[ $LOCAL_RANK == 0 ]] && console="/dev/stdout" || console="/dev/null"
+              nsys-jax \
+                --capture-range=cudaProfilerApi \
+                --capture-range-end=stop \
+                -o /opt/output/profile.$NODE_RANK.zip \
+                -- \
+                test-maxtext.sh \
+                -n 2 \
+                -b 2 \
+                --model-name=llama2-7b \
+                --attn-type=cudnn_flash_te \
+                --remat-policy=minimal_flash \
+                --steps=20 \
+                --fsdp=16 \
+                -a "scan_layers=false \
+                    max_target_length=4096 \
+                    use_iota_embed=true \
+                    logits_dot_in_fp32=false \
+                    profiler=nsys \
+                    skip_first_n_steps_for_profiler=3 \
+                    profiler_steps=8" \
+                |& tee /opt/output/output.$NODE_RANK.log >"${console}"
+              code=${PIPESTATUS[0]} # status of nsys-jax; a plain $? would be tee's and hide test failures
+              # Should run even on failure: surface the NCCL log before propagating the status
+              cat /opt/output/nccl.$NODE_RANK.log >"${console}"
+              exit $code
+              EOL
+              # TODO: upgrade parallel-launch to return a failure code as soon as any
+              #       of its children do (it already does this eventually, but it could
+              #       be slow)
+              parallel-launch LOCAL_RANK 8 bash each-process.sh
+              code=$?
+              # Should run even on failure: the upload container waits for this marker
+              touch /opt/output/.done
+              exit $code
+            - PLACEHOLDER # becomes $0 of the script: the Service name
+            - PLACEHOLDER # becomes $1 of the script: the Job name
+          resources:
+            limits:
+              nvidia.com/gpu: 8
+              vpc.amazonaws.com/efa: 32
+          volumeMounts:
+            - mountPath: /dev/shm
+              name: shmem
+            - mountPath: /opt/output
+              name: output
+        - name: upload
+          image: amazon/aws-cli
+          command:
+            - bash
+            - -c
+            - |
+              JOB_NAME="$0"
+              sentinel=/opt/output/.done
+              # Block until the maxtext container has created its completion marker
+              until [[ -f "${sentinel}" ]]; do sleep 1; done
+              rm "${sentinel}"
+              aws s3 cp \
+                --recursive \
+                /opt/output \
+                "s3://jax-toolbox-eks-output/${JOB_NAME}/"
+            - PLACEHOLDER
+          volumeMounts:
+            - mountPath: /opt/output
+              name: output
+      volumes:
+        - name: output # shared by both containers: logs, profiles and the .done marker
+          emptyDir: {}
+        - name: shmem # mounted at /dev/shm in the maxtext container
+          emptyDir:
+            medium: Memory
+            sizeLimit: 16Gi
diff --git a/.github/workflows/_test_maxtext_k8s.yaml b/.github/workflows/_test_maxtext_k8s.yaml
new file mode 100644
index 000000000..7f82d3f42
--- /dev/null
+++ b/.github/workflows/_test_maxtext_k8s.yaml
@@ -0,0 +1,107 @@
+name: ~test MaxText functionality on Kubernetes
+
+on:
+  workflow_call:  # the only trigger: this workflow runs when called from another workflow
+    inputs:
+      MAXTEXT_IMAGE:
+        type: string
+        description: MaxText container to test
+        required: true
+
+permissions:
+  contents: read  # to fetch code
+
+jobs:
+  maxtext:
+    runs-on: eks  # runner label; the steps below drive the cluster with kubectl
+    env:
+      CONTAINER_IMAGE: "${{ inputs.MAXTEXT_IMAGE }}"
+      JOB_NAME: "maxtext-${{ github.run_id }}-${{ github.run_attempt }}"  # distinct for every run and re-run attempt
+    steps:
+      - name: Check out the repository
+        uses: actions/checkout@v4
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Login to NVIDIA Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: nvcr.io
+          username: $oauthtoken
+          password: ${{ secrets.NVCR_TOKEN }}
+      - name: Store GitHub Container Registry token as Kubernetes secret  # secret is named "${JOB_NAME}-token"
+        run: |
+          # Make this available to later steps
+          TOKEN_NAME="${JOB_NAME}-token"
+          echo "TOKEN_NAME=${TOKEN_NAME}" >> "$GITHUB_ENV"
+          kubectl create secret generic \
+            ${TOKEN_NAME} \
+            --from-file=.dockerconfigjson=$HOME/.docker/config.json \
+            --type=kubernetes.io/dockerconfigjson
+      - name: Configure Kubernetes job  # fills the PLACEHOLDER values in-place: document 0 is the Service, document 1 the Job
+        run: |
+          export SERVICE_NAME="${JOB_NAME}-svc"
+          yq -i ea 'select(di == 0).metadata.name = strenv(SERVICE_NAME)
+            | select(di == 0).spec.selector.job-name = strenv(JOB_NAME)
+            | select(di == 1).metadata.name = strenv(JOB_NAME)
+            | select(di == 1).spec.template.spec.subdomain = strenv(SERVICE_NAME)
+            | select(di == 1).spec.template.spec.imagePullSecrets[].name = strenv(TOKEN_NAME)
+            | select(di == 1).spec.template.spec.containers[0].image = strenv(CONTAINER_IMAGE)
+            | select(di == 1).spec.template.spec.containers[0].command[3] = strenv(SERVICE_NAME)
+            | select(di == 1).spec.template.spec.containers[0].command[4] = strenv(JOB_NAME)
+            | select(di == 1).spec.template.spec.containers[1].command[3] = strenv(JOB_NAME)' \
+            .github/eks-workflow-files/maxtext-job.yaml
+          git diff .github/eks-workflow-files/maxtext-job.yaml
+      - name: Submit Kubernetes job  # applies both documents of the manifest: the Service and the Job
+        run: kubectl apply -f .github/eks-workflow-files/maxtext-job.yaml
+      - name: Wait for Kubernetes job to start  # allows up to 3600s for the Job to be un-suspended
+        run: |
+          # Launcher job is created eagerly, but suspended. Kueue un-suspends it when
+          # resources are available, but that is where there can be a long wait if the
+          # cluster is busy executing other jobs.
+          kubectl wait --for=create job/${JOB_NAME}
+          kubectl wait --for=jsonpath='{.spec.suspend}=false' job/${JOB_NAME} --timeout=3600s
+      - name: Stream Kubernetes job output  # follows the logs of all containers in all pods of the Job
+        run: |
+          # Streaming logs will fail if the container/pod is still pending
+          while [[ -n $(kubectl get pods --selector=batch.kubernetes.io/job-name=${JOB_NAME} --output=jsonpath='{.items[?(@.status.phase == "Pending")].metadata.name}') ]]; do
+            sleep 1
+          done
+          kubectl logs --all-containers=true --all-pods=true --follow job/${JOB_NAME}
+      - name: Retrieve Kubernetes job status  # polls until both indexed pods have finished
+        shell: bash -exo pipefail {0}
+        run: |
+          while readarray -d : -t status < <(kubectl get job/${JOB_NAME} -o 'jsonpath={.status.failed}:{.status.succeeded}'); do
+            failure=${status[0]:-0}  # an empty/missing field counts as 0
+            success=${status[1]:-0}
+            total=$((failure+success))
+            if [[ ${total} -lt 2 ]]; then  # numeric compare; 2 == completions in maxtext-job.yaml
+              sleep 1
+            elif [[ ${total} -eq 2 ]]; then
+              break
+            else
+              # FIXME: more pods reported finished than the 2 completions requested
+              exit 255
+            fi
+          done
+          exit ${failure}  # non-zero (step fails) when at least one pod failed
+      # Runs only when an earlier step failed: describe the Job's pods, because some
+      # kinds of launch failure do not produce any log output.
+      - name: Debug failed Kubernetes job
+        if: failure()
+        run: |
+          # Provide better debug in case of launch failures that will not produce log output
+          pods=$(kubectl get pods --selector=batch.kubernetes.io/job-name=${JOB_NAME} -o name)
+          if [[ -n "${pods}" ]]; then
+            kubectl describe ${pods}
+          fi
+      # Clean up in case of errors as well as success; removes both the Service and the Job
+      - name: Delete Kubernetes job
+        if: always()
+        run: kubectl delete -f .github/eks-workflow-files/maxtext-job.yaml
+      - name: Delete GitHub Container Registry token  # TOKEN_NAME was exported via GITHUB_ENV by the secret-creation step
+        if: always()
+        run: kubectl delete secret ${TOKEN_NAME}
diff --git a/.github/workflows/ngc-release-testing.yaml b/.github/workflows/ngc-release-testing.yaml
index 15e0ed1f7..ac2bea923 100644
--- a/.github/workflows/ngc-release-testing.yaml
+++ b/.github/workflows/ngc-release-testing.yaml
@@ -45,7 +45,7 @@ jobs:
         docker run -i --shm-size=1g --gpus all \
         ${{ inputs.JAX_IMAGE }} \
         bash <<"EOF" |& tee test-backend-independent.log
-          test-jax.sh -b backend-independent 
+          test-jax.sh -b backend-independent
         EOF
         docker run -i --shm-size=1g --gpus all \
         ${{ inputs.JAX_IMAGE }} \
@@ -80,8 +80,15 @@ jobs:
       MAXTEXT_IMAGE: ${{ inputs.MAXTEXT_IMAGE }}
     secrets: inherit
 
+  test-maxtext-eks:  # runs the MaxText Kubernetes test via the reusable workflow below
+    if: inputs.MAXTEXT_IMAGE != ''  # skipped when no MaxText image was supplied
+    uses: ./.github/workflows/_test_maxtext_k8s.yaml
+    with:
+      MAXTEXT_IMAGE: ${{ inputs.MAXTEXT_IMAGE }}
+    secrets: inherit
+
   finalize:
-    needs: [ test-nccl, test-jax, test-rosetta-pax, test-maxtext ]
+    needs: [ test-nccl, test-jax, test-rosetta-pax, test-maxtext, test-maxtext-eks ]
     if: "!cancelled()"
     uses: ./.github/workflows/_finalize.yaml
-    secrets: inherit
\ No newline at end of file
+    secrets: inherit