meta-pytorch · dulinriley · Apr 1, 2026
diff --git a/.github/workflows/build-cuda.yml b/.github/workflows/build-cuda.yml
@@ -2,41 +2,53 @@ name: Build CUDA
 
 on:
   workflow_call:
+    inputs:
+      cuda-version:
+        description: 'CUDA version (e.g. 12.8, 13.1)'
+        required: true
+        type: string
+      python-version:
+        description: 'Python version (e.g. 3.10, 3.12)'
+        required: true
+        type: string
+      torch-spec:
+        description: 'PyTorch install spec (e.g. --pre torch --extra-index-url ...)'
+        required: true
+        type: string
+      artifact-name:
+        description: 'Name for the uploaded wheel artifact'
+        required: true
+        type: string
+      docker-image:
+        description: 'Docker image to use for the build'
+        required: true
+        type: string
 
 concurrency:
-  group: build-cuda-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
+  group: build-cuda-${{ inputs.cuda-version }}-py${{ inputs.python-version }}-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}  # keyed per CUDA+Python so sibling builds from one caller do not cancel each other
   cancel-in-progress: true
 
 jobs:
   build-cuda:
-    name: Build CUDA (cuda12.8)
+    name: Build CUDA (cuda${{ inputs.cuda-version }}-py${{ inputs.python-version }})
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
-    strategy:
-      fail-fast: true
-      matrix:
-        python-version: ['3.10', '3.12']
-        include:
-          - name: 4xlargegpu
-            runs-on: linux.g5.4xlarge.nvidia.gpu
-            torch-spec: '--pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cu128'
-            gpu-arch-type: "cuda"
-            gpu-arch-version: "12.8"
     with:
       timeout: 60
-      runner: ${{ matrix.runs-on }}
-      gpu-arch-type: ${{ matrix.gpu-arch-type }}
-      gpu-arch-version: ${{ matrix.gpu-arch-version }}
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: ${{ inputs.cuda-version }}
+      docker-image: ${{ inputs.docker-image }}
       submodules: recursive
-      upload-artifact: monarch-cuda-${{ github.sha }}-py${{ matrix.python-version }}
+      upload-artifact: ${{ inputs.artifact-name }}
       script: |
         # Source common setup functions
         source scripts/common-setup.sh
 
         # Setup build environment (conda + system deps + rust + build deps)
-        setup_build_environment ${{ matrix.python-version }}
+        setup_build_environment ${{ inputs.python-version }}
 
         # Install torch nightly
-        pip install ${{ matrix.torch-spec }}
+        pip install ${{ inputs.torch-spec }}
         pip install -r build-requirements.txt
 
         # Setup Tensor Engine

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -15,9 +15,35 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  build-cuda:
-    name: Build CUDA
+  build-cuda-12-8-py3-10:
+    name: Build CUDA 12.8 / py3.10
     uses: ./.github/workflows/build-cuda.yml
+    with:
+      cuda-version: '12.8'
+      python-version: '3.10'
+      torch-spec: '--pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cu128'
+      artifact-name: monarch-cuda12.8-${{ github.sha }}-py3.10
+      docker-image: 'pytorch/almalinux-builder:cuda12.8'
+
+  build-cuda-12-8-py3-12:
+    name: Build CUDA 12.8 / py3.12
+    uses: ./.github/workflows/build-cuda.yml
+    with:
+      cuda-version: '12.8'
+      python-version: '3.12'
+      torch-spec: '--pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cu128'
+      artifact-name: monarch-cuda12.8-${{ github.sha }}-py3.12
+      docker-image: 'pytorch/almalinux-builder:cuda12.8'
+
+  build-cuda-13-0-py3-10:
+    name: Build CUDA 13.0 / py3.10
+    uses: ./.github/workflows/build-cuda.yml
+    with:
+      cuda-version: '13.0'
+      python-version: '3.10'
+      torch-spec: '--pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cu130'
+      artifact-name: monarch-cuda13.0-${{ github.sha }}-py3.10
+      docker-image: 'pytorch/almalinux-builder:cuda13.0'
 
   build-rocm:
     name: Build ROCm
@@ -35,11 +61,28 @@ jobs:
       artifact-name: monarch-cpu-${{ github.sha }}-py3.10
 
   test-gpu-python:
-    name: Test GPU Python
-    needs: build-cuda
+    name: Test GPU Python (cuda${{ matrix.cuda-version }}-py${{ matrix.python-version }})
+    needs: [build-cuda-12-8-py3-10, build-cuda-13-0-py3-10]
+    strategy:
+      fail-fast: true
+      matrix:
+        python-version: ['3.10']
+        cuda-version: ['12.8', '13.0']
+        include:
+          - cuda-version: '12.8'
+            torch-spec: '--pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cu128'
+            docker-image: 'pytorch/almalinux-builder:cuda12.8'
+          - cuda-version: '13.0'
+            torch-spec: '--pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cu130'
+            docker-image: 'pytorch/almalinux-builder:cuda13.0'
     uses: ./.github/workflows/test-gpu-python.yml
     with:
-      artifact-name: monarch-cuda-${{ github.sha }}-py3.10
+      artifact-name: monarch-cuda${{ matrix.cuda-version }}-${{ github.sha }}-py${{ matrix.python-version }}
+      torch-spec: ${{ matrix.torch-spec }}
+      gpu-arch-type: cuda
+      gpu-arch-version: ${{ matrix.cuda-version }}
+      python-version: ${{ matrix.python-version }}
+      docker-image: ${{ matrix.docker-image }}
 
   test-cpu-rust:
     name: Test CPU Rust
@@ -53,19 +96,33 @@ jobs:
       docker-image: 'pytorch/manylinuxaarch64-builder:cuda12.8'
 
   test-gpu-rust:
-    name: Test GPU Rust
-    needs: build-cuda
+    name: Test GPU Rust (cuda${{ matrix.cuda-version }}-py${{ matrix.python-version }})
+    strategy:
+      fail-fast: true
+      matrix:
+        python-version: ['3.10']
+        cuda-version: ['12.8', '13.0']
+        include:
+          - cuda-version: '12.8'
+            torch-spec: '--pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cu128'
+            docker-image: 'pytorch/almalinux-builder:cuda12.8'
+          - cuda-version: '13.0'
+            torch-spec: '--pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cu130'
+            docker-image: 'pytorch/almalinux-builder:cuda13.0'
     uses: ./.github/workflows/test-gpu-rust.yml
     with:
-      artifact-name: monarch-cuda-${{ github.sha }}-py3.10
+      torch-spec: ${{ matrix.torch-spec }}
+      gpu-arch-type: cuda
+      gpu-arch-version: ${{ matrix.cuda-version }}
+      python-version: ${{ matrix.python-version }}
+      docker-image: ${{ matrix.docker-image }}
 
   build-docker:
     name: Build Docker image
-    needs: build-cuda
+    needs: build-cuda-12-8-py3-12
     uses: ./.github/workflows/build-docker.yml
     with:
-      # Docker image requires python 3.12
-      artifact-name: monarch-cuda-${{ github.sha }}-py3.12
+      artifact-name: monarch-cuda12.8-${{ github.sha }}-py3.12
 
   status-check:
     name: Status Check

diff --git a/.github/workflows/test-gpu-python.yml b/.github/workflows/test-gpu-python.yml
@@ -7,29 +7,41 @@ on:
         description: 'Wheel artifact name from build workflow'
         required: true
         type: string
+      torch-spec:
+        description: 'PyTorch install spec (e.g. --pre torch --extra-index-url ...)'
+        required: true
+        type: string
+      gpu-arch-type:
+        description: 'GPU architecture type (e.g. cuda)'
+        required: true
+        type: string
+      gpu-arch-version:
+        description: 'GPU architecture version (e.g. 12.8, 13.1)'
+        required: true
+        type: string
+      python-version:
+        description: 'Python version (e.g. 3.10, 3.12)'
+        required: true
+        type: string
+      docker-image:
+        description: 'Docker image to use for the test'
+        required: true
+        type: string
 
 concurrency:
-  group: test-gpu-python-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
+  group: test-gpu-python-cuda${{ inputs.gpu-arch-version }}-py${{ inputs.python-version }}-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}  # keyed per CUDA+Python so matrix invocations from one caller do not cancel each other
   cancel-in-progress: true
 
 jobs:
   test-gpu-python:
-    name: Test GPU Python (cuda12.8-py3.10)
+    name: Test GPU Python (cuda${{ inputs.gpu-arch-version }}-py${{ inputs.python-version }})
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
-    strategy:
-      fail-fast: true
-      matrix:
-        include:
-          - name: 4xlargegpu
-            runs-on: linux.g5.4xlarge.nvidia.gpu
-            torch-spec: '--pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cu128'
-            gpu-arch-type: "cuda"
-            gpu-arch-version: "12.8"
     with:
       timeout: 120
-      runner: ${{ matrix.runs-on }}
-      gpu-arch-type: ${{ matrix.gpu-arch-type }}
-      gpu-arch-version: ${{ matrix.gpu-arch-version }}
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: ${{ inputs.gpu-arch-type }}
+      gpu-arch-version: ${{ inputs.gpu-arch-version }}
+      docker-image: ${{ inputs.docker-image }}
       submodules: recursive
       download-artifact: ${{ inputs.artifact-name }}
       script: |
@@ -51,7 +63,7 @@ jobs:
 
         # Install torch nightly before installing the wheel,
         # so that we can test the wheel against the latest nightly
-        pip install ${{ matrix.torch-spec }}
+        pip install ${{ inputs.torch-spec }}
 
         # Install the built wheel from artifact
         install_wheel_from_artifact

diff --git a/.github/workflows/test-gpu-rust.yml b/.github/workflows/test-gpu-rust.yml
@@ -3,8 +3,24 @@ name: Test GPU Rust
 on:
   workflow_call:
     inputs:
-      artifact-name:
-        description: 'Wheel artifact name from build workflow'
+      torch-spec:
+        description: 'PyTorch install spec (e.g. --pre torch --extra-index-url ...)'
+        required: true
+        type: string
+      gpu-arch-type:
+        description: 'GPU architecture type (e.g. cuda)'
+        required: true
+        type: string
+      gpu-arch-version:
+        description: 'GPU architecture version (e.g. 12.8, 13.1)'
+        required: true
+        type: string
+      python-version:
+        description: 'Python version (e.g. 3.10, 3.12)'
+        required: true
+        type: string
+      docker-image:
+        description: 'Docker image to use for the test'
         required: true
         type: string
 
@@ -14,24 +30,15 @@ concurrency:
 
 jobs:
   test-gpu-rust:
-    name: Test GPU Rust (cuda12.8)
+    name: Test GPU Rust (cuda${{ inputs.gpu-arch-version }}-py${{ inputs.python-version }})
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
-    strategy:
-      fail-fast: true
-      matrix:
-        include:
-          - name: 4xlargegpu
-            runs-on: linux.g5.4xlarge.nvidia.gpu
-            torch-spec: '--pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cu128'
-            gpu-arch-type: "cuda"
-            gpu-arch-version: "12.8"
     with:
       timeout: 120
-      runner: ${{ matrix.runs-on }}
-      gpu-arch-type: ${{ matrix.gpu-arch-type }}
-      gpu-arch-version: ${{ matrix.gpu-arch-version }}
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: ${{ inputs.gpu-arch-type }}
+      gpu-arch-version: ${{ inputs.gpu-arch-version }}
+      docker-image: ${{ inputs.docker-image }}
       submodules: recursive
-      download-artifact: ${{ inputs.artifact-name }}
       script: |
         # Source common setup functions
         source scripts/common-setup.sh
@@ -56,7 +63,7 @@ jobs:
         setup_cuda_environment
 
         # Setup PyTorch with C++ headers using common-setup utility
-        setup_pytorch_with_headers "${{ matrix.gpu-arch-version }}" "${{ matrix.torch-spec }}"
+        setup_pytorch_with_headers "${{ inputs.gpu-arch-version }}" "${{ inputs.torch-spec }}"
 
         # Run GPU Rust tests
         echo "Running OSS Rust tests..."

diff --git a/rdmaxcel-sys/src/driver_api.cpp b/rdmaxcel-sys/src/driver_api.cpp
@@ -59,6 +59,12 @@
 #define SYM_DEVICE_GET cuDeviceGet
 #define SYM_DEVICE_GET_COUNT cuDeviceGetCount
 #define SYM_DEVICE_GET_ATTRIBUTE cuDeviceGetAttribute
+// CUDA 13.x removed cuCtxCreate_v2 from headers, but libcuda.so still
+// exports it for backward compatibility. Provide our own declaration so
+// decltype and STRINGIFY resolve correctly.
+#if CUDA_VERSION >= 13000
+CUresult CUDAAPI cuCtxCreate_v2(CUcontext *pctx, unsigned int flags, CUdevice dev);
+#endif
 #define SYM_CTX_CREATE cuCtxCreate_v2
 #define SYM_DEVICE_PRIMARY_CTX_RETAIN cuDevicePrimaryCtxRetain
 #define SYM_CTX_SET_CURRENT cuCtxSetCurrent

diff --git a/scripts/common-setup.sh b/scripts/common-setup.sh
@@ -139,19 +139,24 @@ setup_pytorch_with_headers() {
     local cuda_version_short=$(echo "${gpu_arch_version}" | tr -d '.')
     local libtorch_url="https://download.pytorch.org/libtorch/nightly/cu${cuda_version_short}/libtorch-cxx11-abi-shared-with-deps-latest.zip"
 
+    # Install PyTorch Python package first (needed as fallback for headers)
+    echo "Installing PyTorch Python package with: ${torch_spec}"
+    pip install ${torch_spec}
+
     echo "Downloading libtorch from: ${libtorch_url}"
-    wget -q "${libtorch_url}"
-    unzip -q "libtorch-cxx11-abi-shared-with-deps-latest.zip"
+    if wget -q "${libtorch_url}" && unzip -q "libtorch-cxx11-abi-shared-with-deps-latest.zip"; then
+        export LIBTORCH_ROOT="$PWD/libtorch"
+    else
+        # Libtorch zip not available (e.g. newer CUDA versions); fall back to
+        # the pip-installed torch package which includes C++ headers and libs.
+        echo "Libtorch download unavailable, using pip-installed torch for C++ headers"
+        export LIBTORCH_ROOT=$(python -c "import torch; print(torch.utils.cmake_prefix_path)")/../../
+    fi
 
     # Set environment variables for libtorch
-    export LIBTORCH_ROOT="$PWD/libtorch"
     export LD_LIBRARY_PATH="$LIBTORCH_ROOT/lib:${LD_LIBRARY_PATH:-}"
     export CMAKE_PREFIX_PATH="$LIBTORCH_ROOT:${CMAKE_PREFIX_PATH:-}"
 
-    # Install PyTorch Python package using provided torch-spec
-    echo "Installing PyTorch Python package with: ${torch_spec}"
-    pip install ${torch_spec}
-
     # Verify installation
     echo "LibTorch C++ headers available at: $LIBTORCH_ROOT/include"
     if [[ -d "$LIBTORCH_ROOT/include/torch/csrc/api/include/torch" ]]; then