diff --git a/.github/workflows/build-cuda.yml b/.github/workflows/build-cuda.yml index 89586ac735..bf27474e03 100644 --- a/.github/workflows/build-cuda.yml +++ b/.github/workflows/build-cuda.yml @@ -2,41 +2,53 @@ name: Build CUDA on: workflow_call: + inputs: + cuda-version: + description: 'CUDA version (e.g. 12.8, 13.1)' + required: true + type: string + python-version: + description: 'Python version (e.g. 3.10, 3.12)' + required: true + type: string + torch-spec: + description: 'PyTorch install spec (e.g. --pre torch --extra-index-url ...)' + required: true + type: string + artifact-name: + description: 'Name for the uploaded wheel artifact' + required: true + type: string + docker-image: + description: 'Docker image to use for the build' + required: true + type: string concurrency: - group: build-cuda-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} + group: build-cuda-${{ inputs.cuda-version }}-py${{ inputs.python-version }}-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} cancel-in-progress: true jobs: build-cuda: - name: Build CUDA (cuda12.8) + name: Build CUDA (cuda${{ inputs.cuda-version }}-py${{ inputs.python-version }}) uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main - strategy: - fail-fast: true - matrix: - python-version: ['3.10', '3.12'] - include: - - name: 4xlargegpu - runs-on: linux.g5.4xlarge.nvidia.gpu - torch-spec: '--pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cu128' - gpu-arch-type: "cuda" - gpu-arch-version: "12.8" with: timeout: 60 - runner: ${{ matrix.runs-on }} - gpu-arch-type: ${{ matrix.gpu-arch-type }} - gpu-arch-version: ${{ matrix.gpu-arch-version }} + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: ${{ inputs.cuda-version }} + docker-image: ${{ inputs.docker-image }} submodules: recursive - upload-artifact: monarch-cuda-${{ github.sha }}-py${{ matrix.python-version }} + upload-artifact: ${{ 
inputs.artifact-name }} script: | # Source common setup functions source scripts/common-setup.sh # Setup build environment (conda + system deps + rust + build deps) - setup_build_environment ${{ matrix.python-version }} + setup_build_environment ${{ inputs.python-version }} # Install torch nightly - pip install ${{ matrix.torch-spec }} + pip install ${{ inputs.torch-spec }} pip install -r build-requirements.txt # Setup Tensor Engine diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 627c5dbeb4..6c4f89bf89 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -15,9 +15,35 @@ concurrency: cancel-in-progress: true jobs: - build-cuda: - name: Build CUDA + build-cuda-12-8-py3-10: + name: Build CUDA 12.8 / py3.10 uses: ./.github/workflows/build-cuda.yml + with: + cuda-version: '12.8' + python-version: '3.10' + torch-spec: '--pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cu128' + artifact-name: monarch-cuda12.8-${{ github.sha }}-py3.10 + docker-image: 'pytorch/almalinux-builder:cuda12.8' + + build-cuda-12-8-py3-12: + name: Build CUDA 12.8 / py3.12 + uses: ./.github/workflows/build-cuda.yml + with: + cuda-version: '12.8' + python-version: '3.12' + torch-spec: '--pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cu128' + artifact-name: monarch-cuda12.8-${{ github.sha }}-py3.12 + docker-image: 'pytorch/almalinux-builder:cuda12.8' + + build-cuda-13-0-py3-10: + name: Build CUDA 13.0 / py3.10 + uses: ./.github/workflows/build-cuda.yml + with: + cuda-version: '13.0' + python-version: '3.10' + torch-spec: '--pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cu130' + artifact-name: monarch-cuda13.0-${{ github.sha }}-py3.10 + docker-image: 'pytorch/almalinux-builder:cuda13.0' build-rocm: name: Build ROCm @@ -35,11 +61,28 @@ jobs: artifact-name: monarch-cpu-${{ github.sha }}-py3.10 test-gpu-python: - name: Test GPU Python - needs: build-cuda + name: Test GPU Python (cuda${{ 
matrix.cuda-version }}-py${{ matrix.python-version }}) + needs: [build-cuda-12-8-py3-10, build-cuda-13-0-py3-10] + strategy: + fail-fast: true + matrix: + python-version: ['3.10'] + cuda-version: ['12.8', '13.0'] + include: + - cuda-version: '12.8' + torch-spec: '--pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cu128' + docker-image: 'pytorch/almalinux-builder:cuda12.8' + - cuda-version: '13.0' + torch-spec: '--pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cu130' + docker-image: 'pytorch/almalinux-builder:cuda13.0' uses: ./.github/workflows/test-gpu-python.yml with: - artifact-name: monarch-cuda-${{ github.sha }}-py3.10 + artifact-name: monarch-cuda${{ matrix.cuda-version }}-${{ github.sha }}-py${{ matrix.python-version }} + torch-spec: ${{ matrix.torch-spec }} + gpu-arch-type: cuda + gpu-arch-version: ${{ matrix.cuda-version }} + python-version: ${{ matrix.python-version }} + docker-image: ${{ matrix.docker-image }} test-cpu-rust: name: Test CPU Rust @@ -53,19 +96,33 @@ jobs: docker-image: 'pytorch/manylinuxaarch64-builder:cuda12.8' test-gpu-rust: - name: Test GPU Rust - needs: build-cuda + name: Test GPU Rust (cuda${{ matrix.cuda-version }}-py${{ matrix.python-version }}) + strategy: + fail-fast: true + matrix: + python-version: ['3.10'] + cuda-version: ['12.8', '13.0'] + include: + - cuda-version: '12.8' + torch-spec: '--pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cu128' + docker-image: 'pytorch/almalinux-builder:cuda12.8' + - cuda-version: '13.0' + torch-spec: '--pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cu130' + docker-image: 'pytorch/almalinux-builder:cuda13.0' uses: ./.github/workflows/test-gpu-rust.yml with: - artifact-name: monarch-cuda-${{ github.sha }}-py3.10 + torch-spec: ${{ matrix.torch-spec }} + gpu-arch-type: cuda + gpu-arch-version: ${{ matrix.cuda-version }} + python-version: ${{ matrix.python-version }} + docker-image: ${{ matrix.docker-image }} 
build-docker: name: Build Docker image - needs: build-cuda + needs: build-cuda-12-8-py3-12 uses: ./.github/workflows/build-docker.yml with: - # Docker image requires python 3.12 - artifact-name: monarch-cuda-${{ github.sha }}-py3.12 + artifact-name: monarch-cuda12.8-${{ github.sha }}-py3.12 status-check: name: Status Check diff --git a/.github/workflows/test-gpu-python.yml b/.github/workflows/test-gpu-python.yml index afc961e20c..03e0c8102f 100644 --- a/.github/workflows/test-gpu-python.yml +++ b/.github/workflows/test-gpu-python.yml @@ -7,6 +7,26 @@ on: description: 'Wheel artifact name from build workflow' required: true type: string + torch-spec: + description: 'PyTorch install spec (e.g. --pre torch --extra-index-url ...)' + required: true + type: string + gpu-arch-type: + description: 'GPU architecture type (e.g. cuda)' + required: true + type: string + gpu-arch-version: + description: 'GPU architecture version (e.g. 12.8, 13.1)' + required: true + type: string + python-version: + description: 'Python version (e.g. 
3.10, 3.12)' + required: true + type: string + docker-image: + description: 'Docker image to use for the test' + required: true + type: string concurrency: - group: test-gpu-python-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} + group: test-gpu-python-${{ inputs.gpu-arch-version }}-py${{ inputs.python-version }}-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} @@ -14,22 +34,14 @@ jobs: test-gpu-python: - name: Test GPU Python (cuda12.8-py3.10) + name: Test GPU Python (cuda${{ inputs.gpu-arch-version }}-py${{ inputs.python-version }}) uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main - strategy: - fail-fast: true - matrix: - include: - - name: 4xlargegpu - runs-on: linux.g5.4xlarge.nvidia.gpu - torch-spec: '--pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cu128' - gpu-arch-type: "cuda" - gpu-arch-version: "12.8" with: timeout: 120 - runner: ${{ matrix.runs-on }} - gpu-arch-type: ${{ matrix.gpu-arch-type }} - gpu-arch-version: ${{ matrix.gpu-arch-version }} + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: ${{ inputs.gpu-arch-type }} + gpu-arch-version: ${{ inputs.gpu-arch-version }} + docker-image: ${{ inputs.docker-image }} submodules: recursive download-artifact: ${{ inputs.artifact-name }} script: | @@ -51,7 +63,7 @@ # Install torch nightly before installing the wheel, # so that we can test the wheel against the latest nightly - pip install ${{ matrix.torch-spec }} + pip install ${{ inputs.torch-spec }} # Install the built wheel from artifact install_wheel_from_artifact diff --git a/.github/workflows/test-gpu-rust.yml b/.github/workflows/test-gpu-rust.yml index 6f788c4c35..f6230db8e2 100644 --- a/.github/workflows/test-gpu-rust.yml +++ b/.github/workflows/test-gpu-rust.yml @@ -3,8 +3,24 @@ name: Test GPU Rust on: workflow_call: inputs: - artifact-name: - description: 'Wheel artifact name from build workflow' + torch-spec: + description: 'PyTorch install spec (e.g. 
--pre torch --extra-index-url ...)' + required: true + type: string + gpu-arch-type: + description: 'GPU architecture type (e.g. cuda)' + required: true + type: string + gpu-arch-version: + description: 'GPU architecture version (e.g. 12.8, 13.1)' + required: true + type: string + python-version: + description: 'Python version (e.g. 3.10, 3.12)' + required: true + type: string + docker-image: + description: 'Docker image to use for the test' required: true type: string @@ -14,24 +30,15 @@ concurrency: jobs: test-gpu-rust: - name: Test GPU Rust (cuda12.8) + name: Test GPU Rust (cuda${{ inputs.gpu-arch-version }}-py${{ inputs.python-version }}) uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main - strategy: - fail-fast: true - matrix: - include: - - name: 4xlargegpu - runs-on: linux.g5.4xlarge.nvidia.gpu - torch-spec: '--pre torch --extra-index-url https://download.pytorch.org/whl/nightly/cu128' - gpu-arch-type: "cuda" - gpu-arch-version: "12.8" with: timeout: 120 - runner: ${{ matrix.runs-on }} - gpu-arch-type: ${{ matrix.gpu-arch-type }} - gpu-arch-version: ${{ matrix.gpu-arch-version }} + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: ${{ inputs.gpu-arch-type }} + gpu-arch-version: ${{ inputs.gpu-arch-version }} + docker-image: ${{ inputs.docker-image }} submodules: recursive - download-artifact: ${{ inputs.artifact-name }} script: | # Source common setup functions source scripts/common-setup.sh @@ -56,7 +63,7 @@ jobs: setup_cuda_environment # Setup PyTorch with C++ headers using common-setup utility - setup_pytorch_with_headers "${{ matrix.gpu-arch-version }}" "${{ matrix.torch-spec }}" + setup_pytorch_with_headers "${{ inputs.gpu-arch-version }}" "${{ inputs.torch-spec }}" # Run GPU Rust tests echo "Running OSS Rust tests..." 
diff --git a/rdmaxcel-sys/src/driver_api.cpp b/rdmaxcel-sys/src/driver_api.cpp index 18c219b2ba..ea6e03ed69 100644 --- a/rdmaxcel-sys/src/driver_api.cpp +++ b/rdmaxcel-sys/src/driver_api.cpp @@ -59,6 +59,12 @@ #define SYM_DEVICE_GET cuDeviceGet #define SYM_DEVICE_GET_COUNT cuDeviceGetCount #define SYM_DEVICE_GET_ATTRIBUTE cuDeviceGetAttribute +// CUDA 13.x removed cuCtxCreate_v2 from headers, but libcuda.so still +// exports it for backward compatibility. Provide our own declaration so +// decltype and STRINGIFY resolve correctly. +#if CUDA_VERSION >= 13000 +extern "C" CUresult CUDAAPI cuCtxCreate_v2(CUcontext *pctx, unsigned int flags, CUdevice dev); +#endif #define SYM_CTX_CREATE cuCtxCreate_v2 #define SYM_DEVICE_PRIMARY_CTX_RETAIN cuDevicePrimaryCtxRetain #define SYM_CTX_SET_CURRENT cuCtxSetCurrent diff --git a/scripts/common-setup.sh b/scripts/common-setup.sh index 667cf3e658..e6c7875960 100755 --- a/scripts/common-setup.sh +++ b/scripts/common-setup.sh @@ -139,19 +139,24 @@ setup_pytorch_with_headers() { local cuda_version_short=$(echo "${gpu_arch_version}" | tr -d '.') local libtorch_url="https://download.pytorch.org/libtorch/nightly/cu${cuda_version_short}/libtorch-cxx11-abi-shared-with-deps-latest.zip" + # Install PyTorch Python package first (needed as fallback for headers) + echo "Installing PyTorch Python package with: ${torch_spec}" + pip install ${torch_spec} + echo "Downloading libtorch from: ${libtorch_url}" - wget -q "${libtorch_url}" - unzip -q "libtorch-cxx11-abi-shared-with-deps-latest.zip" + if wget -q "${libtorch_url}" && unzip -q "libtorch-cxx11-abi-shared-with-deps-latest.zip"; then + export LIBTORCH_ROOT="$PWD/libtorch" + else + # Libtorch zip not available (e.g. newer CUDA versions); fall back to + # the pip-installed torch package which includes C++ headers and libs. 
+ echo "Libtorch download unavailable, using pip-installed torch for C++ headers" + export LIBTORCH_ROOT="$(python -c "import torch; print(torch.utils.cmake_prefix_path)")/../.." + fi # Set environment variables for libtorch - export LIBTORCH_ROOT="$PWD/libtorch" export LD_LIBRARY_PATH="$LIBTORCH_ROOT/lib:${LD_LIBRARY_PATH:-}" export CMAKE_PREFIX_PATH="$LIBTORCH_ROOT:${CMAKE_PREFIX_PATH:-}" - # Install PyTorch Python package using provided torch-spec - echo "Installing PyTorch Python package with: ${torch_spec}" - pip install ${torch_spec} - # Verify installation echo "LibTorch C++ headers available at: $LIBTORCH_ROOT/include" if [[ -d "$LIBTORCH_ROOT/include/torch/csrc/api/include/torch" ]]; then