Closed
Commits
467 commits
efd7fd5
Consistently use c10_ovrsource in arvr mode everywhere (#164128)
ezyang Sep 29, 2025
c332d58
[testing] upload test stats: Add info to the invoking file summary an…
clee2000 Sep 29, 2025
50d418f
Replace setup.py bdist_wheel with python -m build --wheel (#156712)
zklaus Sep 29, 2025
84dc54a
Revert "Helper to augment graph with additional deps (#163959)"
pytorchmergebot Sep 29, 2025
b28e4f1
Revert "refactor bucketing (#163754)"
pytorchmergebot Sep 29, 2025
0f619c1
Revert "[inductor] do comm compute overlap at aten fx level (#163215)"
pytorchmergebot Sep 29, 2025
170e030
Bump protobuf from 5.29.4 to 5.29.5 in /.ci/docker (#156157)
dependabot[bot] Sep 29, 2025
d58f7c3
[Easy] Add pointwise tag to fma (#164149)
eellison Sep 29, 2025
704cd77
[PP] Customize pipeline's submod name (#164037)
kwen2501 Sep 28, 2025
cee4e36
[BE] remove manylinuxcxx11-abi-builder:cpu-cxx11-abi docker image (#1…
atalman Sep 30, 2025
da003d7
[3/N] Import Callable from collections.abc in torch/distributed (#164…
cyyever Sep 30, 2025
089f913
Install `fmtlib` headers. (#164139)
ysiraichi Sep 30, 2025
474d075
[dynamic shapes] unbacked-safe slicing (#161414)
pianpwk Sep 30, 2025
4cf2900
CUDACachingHostAllocatorImpl skip event query during capture (#164001)
jeffdaily Sep 30, 2025
3b4ad4a
[AARCH64][CD][CUDA13][Triton][PTXAS] Turn on BUILD_BUNDLE_PTXAS=1 (…
nWEIdia Sep 30, 2025
b7419b9
[ROCm][CI] Upgrade ROCm to 7.0 (#163140)
jeffdaily Sep 30, 2025
55840fb
[CMake] Fix `USE_FBGEMM_GENAI` option (#164165)
malfet Sep 29, 2025
0b0ed6f
[doc] Add AOTInductor intermediate debug printer OSS user manual (#16…
YUNQIUGUO Sep 30, 2025
ca19815
Revert "Enable outer reductions in fbcode (#163884)"
pytorchmergebot Sep 30, 2025
85012fe
Remove unnecessary list comprehensions (#164103)
cyyever Sep 30, 2025
9f27b0c
[CI] Push `viable/strict/${time}` tags (#164183)
malfet Sep 29, 2025
a293206
Fix invalid f-strings (#164112)
cyyever Sep 30, 2025
c39357b
[torchfuzz] Make scalar and tensor distribution configurable (#164034)
bobrenjc93 Sep 29, 2025
0d7994c
[inductor] do comm compute overlap at aten fx level (#163215)
eellison Sep 29, 2025
0b2fdc3
refactor bucketing (#163754)
eellison Sep 29, 2025
92108f4
Helper to augment graph with additional deps (#163959)
eellison Sep 29, 2025
7d59e37
Add Comm-Compute Preserving Bucketer (#163960)
eellison Sep 29, 2025
ace8935
better error handling for rrelu when lower or upper range is infinite…
vishalgoyal316 Sep 30, 2025
bbf6816
[dynamo] Special path for cloning of torch dispatch tensors (#164081)
anijain2305 Sep 29, 2025
7afcb03
Back out "Revert D81959389" (#163905)
yyetim Sep 30, 2025
5274753
[dynamo][device_mesh] Support mesh_dim_names (#164200)
anijain2305 Sep 30, 2025
6e5b424
[DTensor][Export] Supporting exporting a model with DTensor params/in…
SherlockNoMad Sep 30, 2025
7f4c3e7
distributed/serialization: support zero sized tensors (#164198)
d4l3k Sep 30, 2025
1310d6a
Add functions to setup PrivateUse1 as a python backend device. (#157859)
qihqi Sep 30, 2025
ace6c76
[inductor] Small refactor of CachingAutotuner (#162406)
kundaMwiza Sep 30, 2025
7f29c47
Fix cdist export compute mode validation (#161724)
ahkush Sep 30, 2025
77354e2
[OpenReg] Add AMP Integration guide for accelerators (#162050)
zeshengzong Sep 30, 2025
410ed30
Revert "Add functions to setup PrivateUse1 as a python backend device…
pytorchmergebot Sep 30, 2025
46ec066
Remove unused PyIntXXX, THPUtils_newReal_BOOL, THPQXXX macros (#164056)
cyyever Sep 30, 2025
71b4fad
Revert "Add less warps config to inner reductions (#162447)"
pytorchmergebot Sep 30, 2025
79fcfd4
Revert "[CI] Push `viable/strict/${time}` tags (#164183)"
pytorchmergebot Sep 30, 2025
0fb89b8
Revert "Consistently use c10_ovrsource in arvr mode everywhere (#1641…
pytorchmergebot Sep 30, 2025
edd9e07
[BE] Remove not existing mnist mirror (#164238)
atalman Sep 30, 2025
5c020be
Update LPPool docs to clarify ceil_mode padding semantics when ceil_m…
Jonahcb Sep 30, 2025
e88cca0
Update Sphinx theme (#164147)
svekars Sep 30, 2025
66abba8
[CUDA][Expandable Segments] Follow-up cleanups for even more expandab…
eqy Sep 30, 2025
96330f4
[testing] Add upload for test status during test stat uploads (#164189)
clee2000 Sep 30, 2025
1412a4a
[precompile] Add option to disable guard check on aot-compiled functi…
zhxchen17 Sep 30, 2025
3564cd2
Fix TestExportOpInfo (#164184)
SherlockNoMad Sep 29, 2025
7edd18f
[Inductor-FX] Generalize FloorDiv conversion to handle more complex l…
blaine-rister Sep 30, 2025
906fe7b
[ROCm][CI] no longer build almalinux image for ROCm 6.3 (#164201)
jeffdaily Sep 30, 2025
7d7ae4d
[submodule] upgrade cutlass version to 4.2.1 and completely resolved …
henrylhtsang Sep 30, 2025
9378696
Exporting aten.sdpa with cuda under fake mode on a cuda-less machine …
yiming0416 Sep 30, 2025
84e1cd7
[inductor] fx comm overlap: align runtime estimations across dist ran…
IvanKobzarev Sep 30, 2025
4b8fe79
[dynamo] format cpython_defs.c (#161838)
williamwen42 Sep 25, 2025
763ab2a
[dynamo, 3.14] compile actual code in C dynamo (#161555)
williamwen42 Sep 25, 2025
09c7741
[dynamo, 3.14] Python dynamo changes to get basic programs working (#…
williamwen42 Sep 25, 2025
7cbc011
[dynamo, 3.14] support some bytecodes, fix CALL_FUNCTION_EX (#163009)
williamwen42 Sep 26, 2025
1c9987f
[dynamo, 3.14] fix context managers (#163109)
williamwen42 Sep 26, 2025
44677ad
[dynamo, 3.14] support LOAD_CONST on slice, codegen LOAD_CONST slice …
williamwen42 Sep 26, 2025
008b0a9
[dynamo, 3.14] fix inactive ctx handling in resume functions (#163191)
williamwen42 Sep 26, 2025
9278b18
[dynamo, 3.14] fix WITH_EXCEPT_START (#163292)
williamwen42 Sep 26, 2025
d4b785a
[dynamo, 3.14] fix stack ref copy error (#163796)
williamwen42 Sep 26, 2025
4ead8eb
[dynamo, 3.14] fix BUILD_TUPLE with 0 args (#163818)
williamwen42 Sep 26, 2025
0657de9
[dynamo, 3.14] support LOAD_COMMON_CONSTANT (#163919)
williamwen42 Sep 26, 2025
9ce31e4
[3.14] make unbacked_sym[int/float]_counter integers (#163920)
williamwen42 Sep 26, 2025
2600f8b
[dynamo, 3.14] fix tracing typing.Union (#164004)
williamwen42 Sep 26, 2025
5ed4672
[dynamo, 3.14] fix _detect_and_normalize_assert_statement for 3.14 (#…
williamwen42 Sep 26, 2025
1cf1b91
[inductor][templates] Template hooks should be finalised inside a ker…
kundaMwiza Sep 30, 2025
719b64e
Fix TMA transpose logic to handle 1D shapes + string differences (#16…
njriasan Sep 30, 2025
d615f6b
[inductor] use hint_override in kernel benchmark args (#164207)
pianpwk Sep 30, 2025
a707042
fix: inductor non_blocking test - warmup events to make test pass whe…
v0i0 Sep 29, 2025
cc5d74c
Revert "[BE] Remove HermeticPyObjectTLS and Simplify PythonOpRegistra…
pytorchmergebot Sep 30, 2025
d2c5f23
Fix the shape check inside gnll loss (#147522)
KohakuBlueleaf Sep 30, 2025
60f0a35
Update persons of interest for XLA. The previous one is out of date. …
qihqi Sep 30, 2025
ffc645c
half support for fused_moving_avg_obs_fake_quant() op (#164175)
jeffdaily Sep 30, 2025
e30f01b
[1/N] Simplify "in" operation for containers of a single item (#164224)
cyyever Sep 30, 2025
5a93f00
[CI] Delete binary smoke workflows (#164260)
malfet Sep 30, 2025
1cce6ef
Fix silent incorrectness for bmm/baddmm out_dtype overload (#164095)
PaulZhang12 Sep 30, 2025
9e63139
Missing lambda in torch._check (#164225)
xadupre Sep 30, 2025
1ce9563
[FSDP][Replicate] tests replicate gradient accumulation and 1f1b micr…
anshul-si Sep 25, 2025
d3bdf8c
[FSDP][Replicate] tests replicate with custom forward method (#162851)
anshul-si Sep 25, 2025
01dd2c2
[FSDP][Replicate] tests replicate is composable with tp (#162853)
anshul-si Sep 25, 2025
99e28ff
[FSDP][Replicate] tests replicate core functionality with mixed preci…
anshul-si Sep 25, 2025
adc11a7
[export] avoid checks during tracing of export verification (#164219)
anijain2305 Sep 30, 2025
ae4fd4e
[FSDP2] support AC(FSDP) for torchtitan's MOE (#164009)
weifengpy Sep 29, 2025
2810977
[FSDP][Replicate] tests replicate type casting behavior and edge case…
anshul-si Sep 25, 2025
1f1de20
[c10d][BE][ez] Update tensor ptr inside nccl.cpp (#164276)
fduwjj Sep 30, 2025
bec6541
[CUDA][CUDAGraph] Reduce capture overhead in CUDA Graph memory reuse …
eee4017 Sep 30, 2025
60a4961
[DTensor] Allow redistribute to Partial if src matches (#164253)
SherlockNoMad Sep 30, 2025
ff71536
[vllm hash update] update the pinned vllm hash (#164190)
pytorchupdatebot Sep 30, 2025
7f3dc45
Migrate DeviceType to torch/headeronly (#163999)
janeyx99 Sep 30, 2025
ad7e3c9
[ROCm][CD] librocroller.so missing from ROCm 7 wheel (#164244)
jeffdaily Oct 1, 2025
c4bbc64
[PyTorch CCA] Add an API to get expandable segment sizes (#163771)
banitag1 Oct 1, 2025
28c1d2f
[aoti] AOTI mingw cross compilation (#163188)
yushangdi Oct 1, 2025
7a91199
Split scaled-mm tests into separate file (#164266)
slayton58 Sep 30, 2025
8df3f2f
Revert new-test part of #163829 (#164259)
slayton58 Sep 30, 2025
5b1c39f
Add smoke tests to verify that stable ABI FA3 wheel runs w/ newer tor…
janeyx99 Sep 30, 2025
abfcce5
[torchfuzz] remove erroneous can_produce check (#164209)
bobrenjc93 Sep 30, 2025
1f3995c
[torchfuzz] raise if Operator abstract method is not implemented (#16…
bobrenjc93 Sep 30, 2025
10a005e
[torchfuzz] add layout operators (#164210)
bobrenjc93 Sep 30, 2025
e0f1185
skip non memory deps in memory estimator (#164294)
eellison Sep 30, 2025
c66d18d
[dynamo][sac] Support functools partial context_fn for sac (#164308)
anijain2305 Sep 30, 2025
3787a5a
[export] Explicitly passing requires_grad to nn.Parameter() in deseri…
yiming0416 Oct 1, 2025
2a5ce2f
Add algorithm in header (#164295)
ahkush Oct 1, 2025
531f3bf
Adding check for square matrix for input tensor in matrix_exp backwar…
mansiag05 Oct 1, 2025
5919974
[BE][Easy]: Add prims common TypeGuard (#164263)
Skylion007 Oct 1, 2025
fa90090
Use dataclass features in two classes (#164221)
cyyever Oct 1, 2025
8bb71c0
Skip symmetric memory tests calling `_scaled_mm` on CCC < 8.9 (#164251)
Flamefire Oct 1, 2025
bd0907d
[BE][CI] Unify requirments (#163396)
malfet Oct 1, 2025
11ccb95
[PyTorch Pinned Allocator] Pinned memory stats and perf fixes around …
banitag1 Oct 1, 2025
6d4dfa0
[CI] Push `viable/strict/${time}` tags (#164183)
malfet Oct 1, 2025
9ddfc59
[BE] Delete stale non-ephemeral runners workarounds (#164285)
malfet Sep 30, 2025
96c3b9e
[dynamo] Use strings instead of modules for fqn info tracking (#164272)
anijain2305 Oct 1, 2025
cc8b14d
[2/N] Simplify "in" operation for containers of a single item (#164323)
cyyever Oct 1, 2025
590224f
Improve repeat op to a single copy (#163842)
haifeng-jin Oct 1, 2025
12d4cb0
Suppress `FutureWarning`s in `torch.distributed.algorithms.ddp_comm_h…
xuantengh Oct 1, 2025
eca6ac2
[BE][Easy] update CUDA and ROCm sources in nightly tool (#162324)
XuehaiPan Sep 30, 2025
17ab994
[Easy] Add notes for setting up dev venv with specific Python version…
XuehaiPan Sep 30, 2025
9fd53a2
Register MTIA kernel for all_all_out (#164293)
trirpi Oct 1, 2025
4dab208
Adds Issue#153109 as a test for CUDAPluggableAllocator (#163575)
syed-ahmed Oct 1, 2025
ed90040
Releases multicast object before releasing mapped buffers in CUDASymm…
syed-ahmed Oct 1, 2025
ac1bc51
[dynamo] do not pop from framelocals dict in Python 3.10 (#164316)
williamwen42 Sep 30, 2025
d9c80ef
Build and Install Arm Compute Library in manylinux docker image (#159…
robert-hardwick Sep 30, 2025
69fa26d
Triton 3.5.x pin update (#164268)
atalman Oct 1, 2025
70d1043
Fix non-TMA loads in grouped MM Triton kernel (#163895)
alexsamardzic Sep 30, 2025
e901866
Add a RECORD_FUNCTION for Python fallback so it shows in profile (#16…
ezyang Sep 29, 2025
31681bc
[PyTorch] Pull ARM's box-cox (#164152)
Nicoshev Oct 1, 2025
07d896f
Revert "CUDACachingHostAllocatorImpl skip event query during capture …
pytorchmergebot Oct 1, 2025
b103378
Use TMA loads always for Triton grouped MM kernel (#164256)
alexsamardzic Sep 30, 2025
2610746
Revert nccl upgrade back to 2.27.5 (#164352)
albanD Oct 1, 2025
36a37b8
Revert "[PP] Customize pipeline's submod name (#164037)"
pytorchmergebot Oct 1, 2025
59a86cb
Revert "[fx] Allow customization of submod name in split graph (#1640…
pytorchmergebot Oct 1, 2025
20edc5b
Revert "Add num_store to inductor_meta and use it to scale persistent…
pytorchmergebot Oct 1, 2025
5f868ca
[fx] Allow customization of submod name in split graph (#164035)
kwen2501 Sep 28, 2025
e419dc6
[PP] Customize pipeline's submod name (#164037)
kwen2501 Sep 28, 2025
f7ab8a2
[1/N] Fix ruff warnings (#164333)
cyyever Oct 1, 2025
80ed522
[export] support unbacked stack (#163867)
ColinPeppler Oct 1, 2025
1288c6d
Enable keep-going for trunk tags (#164307)
izaitsevfb Oct 1, 2025
3dab36b
[FSDP][Replicate] created ReplicateModule and changed replicate to us…
anshul-si Sep 30, 2025
69c5c08
Revert "[dynamo, 3.14] fix _detect_and_normalize_assert_statement for…
pytorchmergebot Oct 1, 2025
76ddbc2
Add option to FakeProcessGroup to raise error if comms are invoked. (…
ezyang Sep 29, 2025
ebd0707
[SymmMem] Add get_nbi the nonblocking version (#163540)
kwen2501 Sep 30, 2025
3ffaab3
[Replicate][Pipeline Parallelism] integration of new replicate functi…
anshul-si Sep 30, 2025
8dfc8ef
[export] Preserve nn_module_stack for aliased nn modules (#164311)
anijain2305 Oct 1, 2025
f63d16c
Make viable/strict updatable again (#164374)
malfet Oct 1, 2025
9357c31
[inductor] Fix constant shape for float constants (#164241)
isuruf Oct 1, 2025
8c590ca
[inductor] add a runtime assert for triton shapes (#164242)
isuruf Oct 1, 2025
315ffdc
[4/N] Apply ruff UP035 rule to python code (#164206)
cyyever Oct 1, 2025
7304b9e
[ROCm] fix carveout feature (#164303)
jeffdaily Oct 1, 2025
e5c0e6b
[testing] Better short job name during upload additional stats (#164287)
clee2000 Oct 1, 2025
7320f44
Skip windows unittest in fbcode (#164363)
yushangdi Oct 1, 2025
773c676
[CD][CUDA13][NCCL] Fix nccl version typo for cu13 (#164383)
nWEIdia Oct 1, 2025
b5c4f46
Add functions to setup PrivateUse1 as a python backend device. (#157859)
qihqi Oct 1, 2025
6eb8d96
Enable torch.nn.functional.batch_norm in test_export_opinfo (#164261)
yiming0416 Oct 1, 2025
9065364
Add xfailing test case for inplace mutation of local DTensor (#164355)
ezyang Oct 1, 2025
566ea4e
Work Around exposing statically linked libstdc++ CXX11 ABI strong sym…
atalman Oct 1, 2025
1a5d023
Add B200 to Operator Microbenchmark CI (#164288)
jainapurva Oct 1, 2025
ffda8e5
[inductor] log kernel autotuning result to a csv (#164191)
shunting314 Sep 30, 2025
a10207e
Revert "[DCP] Decrease checkpoint background process Gloo pg init tim…
pytorchmergebot Oct 2, 2025
723ba21
Speed up FP precision lookup (#164044)
lakshayg Oct 2, 2025
53860ef
Better error handling in torch/csrc/jit/codegen/* (#163948)
licy666 Oct 2, 2025
8b29c59
[CI][CUDA] Fix distributed tests for b200 (#164345)
Aidyn-A Oct 2, 2025
349e9e9
[cutass backend] remove cutlass presets (#164380)
henrylhtsang Oct 1, 2025
3e03dea
C++-accessible Placements via pybind11 (#163030)
swolchok Oct 1, 2025
5dbae1e
Fix unbacked replacement where LHS is purely backed expr and RHS is u…
ColinPeppler Oct 1, 2025
1443786
[torchfuzz] make fuzzer deterministic (#164397)
bobrenjc93 Oct 1, 2025
0fbe3f1
[torchfuzz] add matmuls (#164284)
bobrenjc93 Oct 1, 2025
39b31a6
[torchfuzz] keep track of operator stats (#164334)
bobrenjc93 Oct 1, 2025
702f6e7
[MTIA] Enable deserialization for FP8 checkpoint loading (#163559)
PatriceVignola Oct 2, 2025
14791ea
[inductor] teach bisector to look at pre_grad passes (#164250)
avikchaudhuri Oct 2, 2025
93e833d
[inductor] separate preamble from main work in compile_fx (#164169)
avikchaudhuri Oct 2, 2025
3924f78
unbacked reshape_copy (#164336)
laithsakka Oct 1, 2025
bcafea5
[vision hash update] update the pinned vision hash (#154694)
pytorchupdatebot Oct 2, 2025
a43c4c3
[5/N] Apply ruff UP035 rule (#164423)
cyyever Oct 2, 2025
27eb36d
DebugMode add ignore_compile_internals (#164205)
SherlockNoMad Oct 2, 2025
9697a7c
Better path handling for nightly setup tool (#164215)
XuehaiPan Sep 30, 2025
6bb586e
[PyTorch / Sigrid GPU] Fixes in pinned stats collection and add new O…
banitag1 Oct 2, 2025
00f0365
[torchfuzz] add test suite of fuzzer repros that we xfail (#164430)
bobrenjc93 Oct 2, 2025
2c2e126
[inductor] Handle patterns where input/output nodes are the same (#16…
angelayi Oct 2, 2025
0e5773b
[dynamo][export] Do not graph break on torch.autograd._profiler_enabl…
anijain2305 Oct 2, 2025
cfd46d1
Fix SAC + Flex issue (#164421)
drisspg Oct 2, 2025
39c340e
Add failing bitwise equivalence UT for aot_eager on rms_norm (#164280)
ezyang Oct 1, 2025
bac0f28
Add methods to access data and unpack_hook on SavedVariable (#164358)
soulitzer Oct 1, 2025
7cfecd7
Revert "Improve repeat op to a single copy (#163842)"
pytorchmergebot Oct 2, 2025
b098514
[DeviceMesh] Simplifying internal bookkeeping with CuTe layout (#163213)
fduwjj Oct 1, 2025
c632952
Revert "Add magic TORCH_MAKE_PYBIND_ENUM_FASTER macro (#163527)"
pytorchmergebot Oct 2, 2025
ac7b4e7
Stop parsing command line arguments every time common_utils is import…
AnthonyBarbier Oct 2, 2025
235b995
Make sure Windows CUDA 12.8 build follow same arches as Linux builds …
atalman Oct 2, 2025
3918959
Revert "Stop parsing command line arguments every time common_utils i…
pytorchmergebot Oct 2, 2025
f4cf756
Add CUDA release architecture matrix (#164471)
atalman Oct 2, 2025
0319556
Revert "[vision hash update] update the pinned vision hash (#154694)"
pytorchmergebot Oct 2, 2025
b9e73e6
Add provenance to inductor IR nodes created after graph.run (#164255)
yushangdi Oct 2, 2025
6bb021c
Revert "Use TMA loads always for Triton grouped MM kernel (#164256)"
pytorchmergebot Oct 2, 2025
e6d4b26
Update torch.rst (#164408)
parthava-adabala Oct 2, 2025
f6f7676
Revert "C++-accessible Placements via pybind11 (#163030)"
pytorchmergebot Oct 2, 2025
bf717ce
[AOTI win] Add ABI stable method for updating constant buffer (#163819)
yushangdi Oct 2, 2025
c6a6c80
Add Aidyn-A to CUDA codeowners (#164436)
Aidyn-A Oct 2, 2025
6a31f42
Fix NestedTensor max/min operations for integer dtypes. (#162273)
adabeyta Oct 2, 2025
4661200
[RELAND v2] Close some sources of fake tensors (#164372)
tugsbayasgalan Oct 2, 2025
22b1710
Use posix_fallocate() to reserve disk space for shared memory (#161910)
wenjianhn Oct 2, 2025
33b17bc
Remove old CUDA version checks (#164199)
cyyever Oct 2, 2025
c45d56d
typo corrected in ivalue.cpp's comment (#164485)
RajeshvShiyal Oct 2, 2025
8c54101
add tensor subclass printing support in fx/graph.py (#164403)
bobrenjc93 Oct 2, 2025
5f775bd
Fix THP_PyObject_VirtualFree return type (#163763)
guangyey Oct 2, 2025
115af42
Fix readibility checks in TIDY and apply them (#164475)
cyyever Oct 2, 2025
6b79701
[ROCm][CI] fix test_cudnn_convolution_relu_cuda (#164466)
jeffdaily Oct 2, 2025
5f18f24
Add initial suppressions for pyrefly (#164177)
maggiemoss Oct 2, 2025
2a7c486
Revert "Speed up FP precision lookup (#164044)"
pytorchmergebot Oct 2, 2025
cc71ab8
[DTensor] raise error if the local_tensor argument passed to DTensor.…
XilunWu Oct 2, 2025
6389658
Fix type hints in PrepareModuleInput and PrepareModuleInputOutput (#1…
RohitRathore1 Oct 2, 2025
a8edccf
[inductor] fix TestTemplateRender in select_algorithm (#164158)
isuruf Sep 30, 2025
f465ea6
[inductor] require shape in TritonCSEVariable (#162275)
isuruf Sep 30, 2025
a34797e
Revert "Add provenance to inductor IR nodes created after graph.run (…
pytorchmergebot Oct 2, 2025
ece5e0f
Fake process group Direct construction error (#163665)
ahkush Oct 2, 2025
bdc0a42
Stop parsing command line arguments every time common_utils is import…
AnthonyBarbier Oct 2, 2025
22e219d
Revert "[DeviceMesh] Simplifying internal bookkeeping with CuTe layou…
pytorchmergebot Oct 2, 2025
15c8bdc
Fix FloorDiv should not generate non integer rationals (due to sympy …
laithsakka Oct 2, 2025
43848b7
Improved support for autotuning in wrapper_fxir (#164132)
nandesuka Oct 2, 2025
dca7398
Support setting grad_dtype on leaf tensors (#162815)
soulitzer Oct 2, 2025
c7e30ae
MX: Remove redundant PLATFORM_SUPPORTS_MX_GEMM constant (#164320)
jagadish-amd Oct 2, 2025
95a0532
Fix vllm build issue (#164361)
yangw-dev Oct 2, 2025
f7082e9
[cuBLAS] update cuBLAS determinism docs, remove workspace requirement…
eqy Oct 3, 2025
18e1848
[6/N] Apply ruff UP035 rule (#164438)
cyyever Oct 3, 2025
86474ce
Update mask dtype (#164472)
eellison Oct 2, 2025
ef50c6e
[MPS] Add backward pass for `embedding_bag` (#163931)
kurtamohler Oct 2, 2025
4691fe6
remove unnecessary registration (#164481)
eellison Oct 2, 2025
91c4db7
fix flex attention eager: dont round down scores to low-precision (cl…
v0i0 Oct 2, 2025
d1cbb74
multimem reduce (#164517)
kwen2501 Oct 2, 2025
1051c1d
Add pyrefly suppressions 2/n (#164513)
maggiemoss Oct 3, 2025
6c209bf
[cutlass-4][take 2] upgrade to cutlass 4.2.1 (#164159)
henrylhtsang Oct 3, 2025
2a760dc
[DeviceMesh] Simplifying internal bookkeeping with CuTe layout (#163213)
fduwjj Oct 3, 2025
7617b11
[torchfuzz] Support EagerVsFullGraphDynamicCompileWithNumericsCheck (…
bobrenjc93 Oct 2, 2025
ddf8de2
Add Rocm to Operator Microbenchmark CI (#164173)
jainapurva Oct 3, 2025
eccf561
Move call to output generated code in inductor (#161615)
CWOA Oct 3, 2025
6c3c941
config for dcache + unit tests (#164512)
nmacchioni Oct 3, 2025
aed6624
[vllm hash update] update the pinned vllm hash (#164319)
pytorchupdatebot Oct 3, 2025
5743d73
Use torch.testing.test_close instead of torch.testing.test_allclose (…
cyyever Oct 3, 2025
5bb8f04
[torchfuzz] add nn functional ops (#164434)
bobrenjc93 Oct 2, 2025
3db2164
[torchfuzz] add norm operators (#164514)
bobrenjc93 Oct 3, 2025
e40fe63
Pin conda version for Docker builds (#164575)
atalman Oct 3, 2025
5656d45
forward fix #164481 (#164578)
jeffdaily Oct 3, 2025
fa5306b
Support partial _DynamoCacheEntries when not all backends available (…
jamesjwu Oct 2, 2025
3288fbf
Change default device to current acclerator (#164399)
drisspg Oct 2, 2025
2a11ce2
Support calling torch.compile inside non-strict export (#164171)
tugsbayasgalan Oct 2, 2025
5b0b4cd
[dtensor] avoid shape recompilations on DTensorSpec (#163820)
pianpwk Oct 3, 2025
3d9d41c
Remove old workaround in launch_logcumsumexp_cuda_kernel (#164567)
cyyever Oct 3, 2025
f39789c
[PyTorch Pinned Allocator] Add support of reserved pinned memory segm…
banitag1 Oct 3, 2025
319298e
Merge remote-tracking branch 'upstream/main' into rocm7.1_internal_te…
github-actions[bot] Oct 3, 2025
6782327
Fix merge conflicts
pragupta Oct 3, 2025
2 changes: 2 additions & 0 deletions .ci/aarch64_linux/aarch64_ci_build.sh
@@ -15,6 +15,8 @@ fi
# Compress the fatbin with -compress-mode=size for CUDA 13
if [[ "$DESIRED_CUDA" == *"13"* ]]; then
export TORCH_NVCC_FLAGS="-compress-mode=size"
# Bundle ptxas into the cu13 wheel, see https://github.com/pytorch/pytorch/issues/163801
export BUILD_BUNDLE_PTXAS=1
fi

SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
57 changes: 4 additions & 53 deletions .ci/aarch64_linux/aarch64_wheel_ci_build.py
@@ -13,49 +13,6 @@ def list_dir(path: str) -> list[str]:
return check_output(["ls", "-1", path]).decode().split("\n")


def build_ArmComputeLibrary() -> None:
"""
Using ArmComputeLibrary for aarch64 PyTorch
"""
print("Building Arm Compute Library")
acl_build_flags = [
"debug=0",
"neon=1",
"opencl=0",
"os=linux",
"openmp=1",
"cppthreads=0",
"arch=armv8a",
"multi_isa=1",
"fixed_format_kernels=1",
"build=native",
]
acl_install_dir = "/acl"
acl_checkout_dir = os.getenv("ACL_SOURCE_DIR", "ComputeLibrary")
if os.path.isdir(acl_install_dir):
shutil.rmtree(acl_install_dir)
if not os.path.isdir(acl_checkout_dir) or not len(os.listdir(acl_checkout_dir)):
check_call(
[
"git",
"clone",
"https://github.com/ARM-software/ComputeLibrary.git",
"-b",
"v25.02",
"--depth",
"1",
"--shallow-submodules",
]
)

check_call(
["scons", "Werror=1", f"-j{os.cpu_count()}"] + acl_build_flags,
cwd=acl_checkout_dir,
)
for d in ["arm_compute", "include", "utils", "support", "src", "build"]:
shutil.copytree(f"{acl_checkout_dir}/{d}", f"{acl_install_dir}/{d}")


def replace_tag(filename) -> None:
with open(filename) as f:
lines = f.readlines()
@@ -356,23 +313,17 @@ def parse_arguments():
build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "

if enable_mkldnn:
build_ArmComputeLibrary()
print("build pytorch with mkldnn+acl backend")
build_vars += (
"USE_MKLDNN=ON USE_MKLDNN_ACL=ON "
"ACL_ROOT_DIR=/acl "
"LD_LIBRARY_PATH=/pytorch/build/lib:/acl/build:$LD_LIBRARY_PATH "
"ACL_INCLUDE_DIR=/acl/build "
"ACL_LIBRARY=/acl/build "
)
build_vars += "USE_MKLDNN=ON USE_MKLDNN_ACL=ON "
build_vars += "ACL_ROOT_DIR=/acl "
if enable_cuda:
build_vars += "BLAS=NVPL "
else:
build_vars += "BLAS=OpenBLAS OpenBLAS_HOME=/OpenBLAS "
build_vars += "BLAS=OpenBLAS OpenBLAS_HOME=/opt/OpenBLAS "
else:
print("build pytorch without mkldnn backend")

os.system(f"cd /pytorch; {build_vars} python3 setup.py bdist_wheel")
os.system(f"cd /pytorch; {build_vars} python3 -m build --wheel --no-isolation")
if enable_cuda:
print("Updating Cuda Dependency")
filename = os.listdir("/pytorch/dist/")
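The recurring replacement in these build scripts — `python3 setup.py bdist_wheel` becoming `python3 -m build --wheel --no-isolation` — can be sketched as below. `wheel_cmd` is an illustrative helper, not a function in the CI scripts.

```shell
# Sketch of the wheel-build invocation change in this PR.
wheel_cmd() {
    # Old (deprecated direct setup.py entry point):
    #   python3 setup.py bdist_wheel
    # New (PEP 517 front-end; --no-isolation reuses the build dependencies
    # already installed in the CI environment instead of a fresh venv):
    echo "python3 -m build --wheel --no-isolation"
}
wheel_cmd
```

The `--no-isolation` flag matters in CI images, where the toolchain (cmake, ninja, numpy, etc.) is pre-provisioned and an isolated build environment would have to re-resolve it.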
60 changes: 15 additions & 45 deletions .ci/aarch64_linux/build_aarch64_wheel.py
@@ -299,40 +299,6 @@ def install_condaforge_python(host: RemoteHost, python_version="3.8") -> None:
)


def build_OpenBLAS(host: RemoteHost, git_clone_flags: str = "") -> None:
print("Building OpenBLAS")
host.run_cmd(
f"git clone https://github.com/xianyi/OpenBLAS -b v0.3.28 {git_clone_flags}"
)
make_flags = "NUM_THREADS=64 USE_OPENMP=1 NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=ARMV8"
host.run_cmd(
f"pushd OpenBLAS && make {make_flags} -j8 && sudo make {make_flags} install && popd && rm -rf OpenBLAS"
)


def build_ArmComputeLibrary(host: RemoteHost, git_clone_flags: str = "") -> None:
print("Building Arm Compute Library")
acl_build_flags = " ".join(
[
"debug=0",
"neon=1",
"opencl=0",
"os=linux",
"openmp=1",
"cppthreads=0",
"arch=armv8a",
"multi_isa=1",
"fixed_format_kernels=1",
"build=native",
]
)
host.run_cmd(
f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v25.02 {git_clone_flags}"
)

host.run_cmd(f"cd ComputeLibrary && scons Werror=1 -j8 {acl_build_flags}")


def embed_libgomp(host: RemoteHost, use_conda, wheel_name) -> None:
host.run_cmd("pip3 install auditwheel")
host.run_cmd(
@@ -442,7 +408,7 @@ def build_torchvision(
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"

host.run_cmd(f"cd vision && {build_vars} python3 setup.py bdist_wheel")
host.run_cmd(f"cd vision && {build_vars} python3 -m build --wheel --no-isolation")
vision_wheel_name = host.list_dir("vision/dist")[0]
embed_libgomp(host, use_conda, os.path.join("vision", "dist", vision_wheel_name))

@@ -497,7 +463,7 @@ def build_torchdata(
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"

host.run_cmd(f"cd data && {build_vars} python3 setup.py bdist_wheel")
host.run_cmd(f"cd data && {build_vars} python3 -m build --wheel --no-isolation")
wheel_name = host.list_dir("data/dist")[0]
embed_libgomp(host, use_conda, os.path.join("data", "dist", wheel_name))

@@ -553,7 +519,7 @@ def build_torchtext(
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"

host.run_cmd(f"cd text && {build_vars} python3 setup.py bdist_wheel")
host.run_cmd(f"cd text && {build_vars} python3 -m build --wheel --no-isolation")
wheel_name = host.list_dir("text/dist")[0]
embed_libgomp(host, use_conda, os.path.join("text", "dist", wheel_name))

@@ -614,7 +580,7 @@ def build_torchaudio(
host.run_cmd(
f"cd audio && export FFMPEG_ROOT=$(pwd)/third_party/ffmpeg && export USE_FFMPEG=1 \
&& ./packaging/ffmpeg/build.sh \
&& {build_vars} python3 setup.py bdist_wheel"
&& {build_vars} python3 -m build --wheel --no-isolation"
)

wheel_name = host.list_dir("audio/dist")[0]
@@ -700,7 +666,6 @@ def start_build(
configure_system(
host, compiler=compiler, use_conda=use_conda, python_version=python_version
)
build_OpenBLAS(host, git_clone_flags)

if host.using_docker():
print("Move libgfortant.a into a standard location")
@@ -723,10 +688,12 @@
f"git clone --recurse-submodules -b {branch} https://github.com/pytorch/pytorch {git_clone_flags}"
)

host.run_cmd("pytorch/.ci/docker/common/install_openblas.sh")

print("Building PyTorch wheel")
build_opts = ""
if pytorch_build_number is not None:
build_opts += f" --build-number {pytorch_build_number}"
build_opts += f" -C--build-option=--build-number={pytorch_build_number}"
# Breakpad build fails on aarch64
build_vars = "USE_BREAKPAD=0 "
if branch == "nightly":
@@ -743,15 +710,18 @@
if host.using_docker():
build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
if enable_mkldnn:
build_ArmComputeLibrary(host, git_clone_flags)
host.run_cmd("pytorch/.ci/docker/common/install_acl.sh")
print("build pytorch with mkldnn+acl backend")
build_vars += " USE_MKLDNN=ON USE_MKLDNN_ACL=ON"
build_vars += " BLAS=OpenBLAS"
build_vars += " OpenBLAS_HOME=/opt/OpenBLAS"
build_vars += " ACL_ROOT_DIR=/acl"
host.run_cmd(
f"cd $HOME/pytorch && export ACL_ROOT_DIR=$HOME/ComputeLibrary && {build_vars} python3 setup.py bdist_wheel{build_opts}"
f"cd $HOME/pytorch && {build_vars} python3 -m build --wheel --no-isolation{build_opts}"
)
print("Repair the wheel")
pytorch_wheel_name = host.list_dir("pytorch/dist")[0]
ld_library_path = "$HOME/acl/build:$HOME/pytorch/build/lib"
ld_library_path = "/acl/build:$HOME/pytorch/build/lib"
host.run_cmd(
f"export LD_LIBRARY_PATH={ld_library_path} && auditwheel repair $HOME/pytorch/dist/{pytorch_wheel_name}"
)
@@ -763,7 +733,7 @@ def start_build(
else:
print("build pytorch without mkldnn backend")
host.run_cmd(
f"cd pytorch && {build_vars} python3 setup.py bdist_wheel{build_opts}"
f"cd pytorch && {build_vars} python3 -m build --wheel --no-isolation{build_opts}"
)

print("Deleting build folder")
@@ -907,7 +877,7 @@ def terminate_instances(instance_type: str) -> None:
def parse_arguments():
from argparse import ArgumentParser

parser = ArgumentParser("Builid and test AARCH64 wheels using EC2")
parser = ArgumentParser("Build and test AARCH64 wheels using EC2")
parser.add_argument("--key-name", type=str)
parser.add_argument("--debug", action="store_true")
parser.add_argument("--build-only", action="store_true")
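The `build_opts` change above reflects how `python -m build` forwards setup.py arguments: options like `--build-number` must go through the `-C`/`--config-setting` escape hatch rather than being appended directly. A minimal runnable sketch (the build number is an example value; the real script takes it from its CLI):

```shell
# How the PEP 517 front-end receives a setuptools --build-number.
build_opts=""
pytorch_build_number=1    # illustrative; parsed from arguments in the script
if [ -n "$pytorch_build_number" ]; then
    # -C--build-option=... passes the flag through to setup.py bdist_wheel
    build_opts="$build_opts -C--build-option=--build-number=$pytorch_build_number"
fi
echo "python3 -m build --wheel --no-isolation$build_opts"
```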
3 changes: 2 additions & 1 deletion .ci/docker/almalinux/Dockerfile
@@ -69,7 +69,8 @@ RUN bash ./install_cuda.sh 13.0
ENV DESIRED_CUDA=13.0

FROM ${ROCM_IMAGE} as rocm
ENV PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
ARG PYTORCH_ROCM_ARCH
ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}
ADD ./common/install_mkl.sh install_mkl.sh
RUN bash ./install_mkl.sh && rm install_mkl.sh
ENV MKLROOT /opt/intel
6 changes: 6 additions & 0 deletions .ci/docker/almalinux/build.sh
@@ -36,6 +36,12 @@ case ${DOCKER_TAG_PREFIX} in
;;
rocm*)
BASE_TARGET=rocm
+PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
+# add gfx950 conditionally starting in ROCm 7.0
+if [[ "$ROCM_VERSION" == *"7.0"* ]]; then
+    PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
+fi
+EXTRA_BUILD_ARGS="${EXTRA_BUILD_ARGS} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
;;
*)
echo "ERROR: Unknown docker tag ${DOCKER_TAG_PREFIX}"
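The gfx950 gating added in this hunk can be exercised standalone. The sketch below reproduces the same glob match and parameter expansion; the `ROCM_VERSION` value is just an example (the real script derives it from the docker tag):

```shell
#!/bin/bash
# Sketch of the arch-list gating in .ci/docker/almalinux/build.sh:
# gfx950 is appended only when the ROCm version matches 7.0.
ROCM_VERSION="7.0"  # example value for illustration

PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
if [[ "$ROCM_VERSION" == *"7.0"* ]]; then
    PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
fi

# The list is then forwarded to `docker build` as a build-arg, which the
# Dockerfile's new `ARG PYTORCH_ROCM_ARCH` / `ENV` pair picks up.
echo "--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
```

The matching `ARG`/`ENV` change in the almalinux Dockerfile above is what lets each build pass its own arch list instead of hardcoding one in the image.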
28 changes: 4 additions & 24 deletions .ci/docker/build.sh
@@ -84,8 +84,8 @@ fi
_UCX_COMMIT=7836b165abdbe468a2f607e7254011c07d788152
_UCC_COMMIT=430e241bf5d38cbc73fc7a6b89155397232e3f96
if [[ "$image" == *rocm* ]]; then
-_UCX_COMMIT=cc312eaa4655c0cc5c2bcd796db938f90563bcf6
-_UCC_COMMIT=0c0fc21559835044ab107199e334f7157d6a0d3d
+_UCX_COMMIT=29831d319e6be55cb8c768ca61de335c934ca39e
+_UCC_COMMIT=9f4b242cbbd8b1462cbc732eb29316cdfa124b77
fi

tag=$(echo $image | awk -F':' '{print $2}')
@@ -175,28 +175,17 @@ case "$tag" in
fi
GCC_VERSION=11
VISION=yes
-ROCM_VERSION=6.4
+ROCM_VERSION=7.0
NINJA_VERSION=1.9.0
TRITON=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
+PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950"
if [[ $tag =~ "benchmarks" ]]; then
INDUCTOR_BENCHMARKS=yes
fi
;;
-pytorch-linux-noble-rocm-alpha-py3)
-ANACONDA_PYTHON_VERSION=3.12
-GCC_VERSION=11
-VISION=yes
-ROCM_VERSION=7.0
-NINJA_VERSION=1.9.0
-TRITON=yes
-KATEX=yes
-UCX_COMMIT=${_UCX_COMMIT}
-UCC_COMMIT=${_UCC_COMMIT}
-PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950"
-;;
pytorch-linux-jammy-xpu-n-1-py3)
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
@@ -456,12 +445,3 @@ elif [ "$HAS_TRITON" = "yes" ]; then
echo "expecting triton to not be installed, but it is"
exit 1
fi

-# Sanity check cmake version. Executorch reinstalls cmake and I'm not sure if
-# they support 4.0.0 yet, so exclude them from this check.
-CMAKE_VERSION=$(drun cmake --version)
-if [[ "$EXECUTORCH" != *yes* && "$CMAKE_VERSION" != *4.* ]]; then
-  echo "CMake version is not 4.0.0:"
-  drun cmake --version
-  exit 1
-fi
2 changes: 1 addition & 1 deletion .ci/docker/ci_commit_pins/nccl-cu12.txt
@@ -1 +1 @@
-v2.27.5-1
+v2.27.5-1
27 changes: 19 additions & 8 deletions .ci/docker/common/install_acl.sh
100644 → 100755
@@ -1,16 +1,27 @@
-set -euo pipefail
+#!/bin/bash
+# Script used only in CD pipeline

-readonly version=v25.02
-readonly src_host=https://github.com/ARM-software
-readonly src_repo=ComputeLibrary
+set -eux

-# Clone ACL
-[[ ! -d ${src_repo} ]] && git clone ${src_host}/${src_repo}.git
-cd ${src_repo}
+ACL_VERSION=${ACL_VERSION:-"v25.02"}
+ACL_INSTALL_DIR="/acl"

-git checkout $version
+# Clone ACL
+git clone https://github.com/ARM-software/ComputeLibrary.git -b "${ACL_VERSION}" --depth 1 --shallow-submodules
+
+ACL_CHECKOUT_DIR="ComputeLibrary"
 # Build with scons
+pushd $ACL_CHECKOUT_DIR
 scons -j8 Werror=0 debug=0 neon=1 opencl=0 embed_kernels=0 \
 os=linux arch=armv8a build=native multi_isa=1 \
 fixed_format_kernels=1 openmp=1 cppthreads=0
+popd
+
+# Install ACL
+sudo mkdir -p ${ACL_INSTALL_DIR}
+for d in arm_compute include utils support src build
+do
+sudo cp -r ${ACL_CHECKOUT_DIR}/${d} ${ACL_INSTALL_DIR}/${d}
+done
+
+rm -rf $ACL_CHECKOUT_DIR
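The `${ACL_VERSION:-"v25.02"}` default in the rewritten script lets callers pin a different ACL tag through the environment. A minimal sketch of that expansion (the `v99.99` override is a made-up tag for illustration):

```shell
#!/bin/bash
# ${VAR:-default} substitutes the default only when VAR is unset or empty,
# so an invocation like `ACL_VERSION=v99.99 ./install_acl.sh` overrides the pin.
unset ACL_VERSION
ACL_VERSION=${ACL_VERSION:-"v25.02"}
echo "$ACL_VERSION"   # the pinned default, v25.02

ACL_VERSION="v99.99"  # simulate a caller override (hypothetical tag)
ACL_VERSION=${ACL_VERSION:-"v25.02"}
echo "$ACL_VERSION"   # the override survives, v99.99
```

The same pattern appears in install_openblas.sh below with `OPENBLAS_VERSION`.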
12 changes: 8 additions & 4 deletions .ci/docker/common/install_openblas.sh
100644 → 100755
@@ -3,8 +3,10 @@

set -ex

-cd /
-git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION:-v0.3.30}" --depth 1 --shallow-submodules
+OPENBLAS_VERSION=${OPENBLAS_VERSION:-"v0.3.30"}
+
+# Clone OpenBLAS
+git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION}" --depth 1 --shallow-submodules

OPENBLAS_CHECKOUT_DIR="OpenBLAS"
OPENBLAS_BUILD_FLAGS="
@@ -17,5 +19,7 @@ CFLAGS=-O3
BUILD_BFLOAT16=1
"

-make -j8 ${OPENBLAS_BUILD_FLAGS} -C ${OPENBLAS_CHECKOUT_DIR}
-make -j8 ${OPENBLAS_BUILD_FLAGS} install -C ${OPENBLAS_CHECKOUT_DIR}
+make -j8 ${OPENBLAS_BUILD_FLAGS} -C $OPENBLAS_CHECKOUT_DIR
+sudo make install -C $OPENBLAS_CHECKOUT_DIR
+
+rm -rf $OPENBLAS_CHECKOUT_DIR
6 changes: 0 additions & 6 deletions .ci/docker/common/install_rocm.sh
@@ -42,12 +42,6 @@ EOF
rocm_baseurl="http://repo.radeon.com/rocm/apt/${ROCM_VERSION}"
amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu"

-# Special case for ROCM_VERSION == 7.0
-if [[ $(ver "$ROCM_VERSION") -eq $(ver 7.0) ]]; then
-    rocm_baseurl="https://repo.radeon.com/rocm/apt/7.0_alpha2"
-    amdgpu_baseurl="https://repo.radeon.com/amdgpu/30.10_alpha2/ubuntu"
-fi

# Add amdgpu repository
UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
echo "deb [arch=amd64] ${amdgpu_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list
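The deleted block compared versions numerically with a `ver` helper. A common implementation of that helper (an assumption here; the actual definition lives earlier in install_rocm.sh, outside this hunk) pads each dotted component into a fixed-width integer so plain `-gt`/`-eq` comparisons work:

```shell
#!/bin/bash
# Pad each dotted component to a fixed width so versions compare correctly
# as integers, e.g. 6.4.2 -> "  6004002000" and 7.0 -> "  7000000000";
# missing components are padded as zeros by printf.
ver() {
    printf "%3d%03d%03d%03d" $(echo "$1" | tr '.' ' ')
}

if [ "$(ver 7.0)" -gt "$(ver 6.4.2)" ]; then
    echo "7.0 is newer than 6.4.2"
fi
```

With the alpha repositories gone, `ROCM_VERSION=7.0` now resolves to the regular `repo.radeon.com/rocm/apt/7.0` URL built a few lines earlier.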
4 changes: 2 additions & 2 deletions .ci/docker/common/install_rocm_magma.sh
@@ -12,8 +12,8 @@ function do_install() {

rocm_version_nodot=${rocm_version//./}

-# Version 2.7.2 + ROCm related updates
-MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6
+# https://github.com/icl-utk-edu/magma/pull/65
+MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"

rocm_dir="/opt/rocm"
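The magma archive name in this hunk is derived with bash pattern substitution; a sketch using the commit pinned above:

```shell
#!/bin/bash
# ${rocm_version//./} deletes every '.' from the value (global pattern
# substitution), so "7.0" becomes "70" for the archive name.
rocm_version="7.0"
rocm_version_nodot=${rocm_version//./}

MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"
echo "$magma_archive"
# magma-rocm70-d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec-1.tar.bz2
```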