diff --git a/.github/scripts/build-rocm.sh b/.github/scripts/build-rocm.sh index fc7515aa7..b508fac69 100644 --- a/.github/scripts/build-rocm.sh +++ b/.github/scripts/build-rocm.sh @@ -1,19 +1,21 @@ #!/bin/bash declare build_arch declare build_os +declare rocm_version set -xeuo pipefail +bnb_rocm_arch="gfx90a;gfx942;gfx1100" if [ "${build_os:0:6}" == ubuntu ]; then - image=rocm/dev-ubuntu-22.04:6.1-complete + image=rocm/dev-ubuntu-22.04:${rocm_version}-complete echo "Using image $image" docker run --rm --platform "linux/$build_arch" -i \ -w /src -v "$PWD:/src" "$image" sh -c \ "apt-get update \ && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \ - && cmake -DCOMPUTE_BACKEND=hip . \ + && cmake -DCOMPUTE_BACKEND=hip -DBNB_ROCM_ARCH=\"${bnb_rocm_arch}\" . \ && cmake --build ." fi -#output_dir="output/${build_os}/${build_arch}" -#mkdir -p "${output_dir}" -#(shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} "${output_dir}") +output_dir="output/${build_os}/${build_arch}" +mkdir -p "${output_dir}" +(shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} "${output_dir}") diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 671dfee1c..91e6d82a6 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -106,6 +106,8 @@ jobs: matrix: os: [ubuntu-latest] arch: [x86_64] + rocm_version: + ["6.1.2"] runs-on: ${{ matrix.os }} # One day, we could run them on native agents. 
Azure supports this now but it's planned only for Q3 2023 for hosted agents steps: - uses: actions/checkout@v4 @@ -123,10 +125,18 @@ jobs: env: build_os: ${{ matrix.os }} build_arch: ${{ matrix.arch }} + rocm_version: ${{ matrix.rocm_version }} + - name: Upload build artifact + uses: actions/upload-artifact@v4 + with: + name: shared_library_rocm_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.rocm_version }} + path: output/* + retention-days: 7 build-wheels: needs: - build-shared-libs - build-shared-libs-cuda + - build-shared-libs-rocm strategy: matrix: os: [ubuntu-latest, macos-latest, windows-latest] diff --git a/CMakeLists.txt b/CMakeLists.txt index ec48b9d97..eac72fe52 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -185,7 +185,7 @@ elseif(BUILD_HIP) set(CMAKE_HIP_ARCHITECTURES ${BNB_ROCM_ARCH}) else() if (NOT AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES) - set(CMAKE_HIP_ARCHITECTURES "gfx908;gfx90a;gfx940;gfx941;gfx942") + set(CMAKE_HIP_ARCHITECTURES "gfx90a;gfx942;gfx1100") elseif (AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES) set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS}) endif() @@ -194,12 +194,14 @@ elseif(BUILD_HIP) list(APPEND SRC_FILES ${HIP_FILES}) - string(APPEND BNB_OUTPUT_NAME "_hip") + string(APPEND BNB_OUTPUT_NAME "_rocm") # get hip version execute_process(COMMAND hipconfig --version OUTPUT_VARIABLE HIP_CONFIG_VERSION) string(REGEX MATCH "[0-9]+\\.[0-9]+" HIP_VERSION "${HIP_CONFIG_VERSION}") + string(REPLACE "." 
"" HIP_VERSION_SHORT "${HIP_VERSION}") + string(APPEND BNB_OUTPUT_NAME "${HIP_VERSION_SHORT}") if(NO_CUBLASLT OR HIP_VERSION VERSION_LESS "6.1") string(APPEND BNB_OUTPUT_NAME "_nohipblaslt") endif() diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py index 03d2cbd61..cfeaf4f44 100644 --- a/bitsandbytes/cextension.py +++ b/bitsandbytes/cextension.py @@ -38,9 +38,9 @@ def get_cuda_bnb_library_path(cuda_specs: CUDASpecs) -> Path: """ if torch.version.hip: if BNB_HIP_VERSION < 601: - return PACKAGE_DIR / f"libbitsandbytes_hip_nohipblaslt{DYNAMIC_LIBRARY_SUFFIX}" + return PACKAGE_DIR / f"libbitsandbytes_rocm{BNB_HIP_VERSION_SHORT}_nohipblaslt{DYNAMIC_LIBRARY_SUFFIX}" else: - return PACKAGE_DIR / f"libbitsandbytes_hip{DYNAMIC_LIBRARY_SUFFIX}" + return PACKAGE_DIR / f"libbitsandbytes_rocm{BNB_HIP_VERSION_SHORT}{DYNAMIC_LIBRARY_SUFFIX}" library_name = f"libbitsandbytes_cuda{cuda_specs.cuda_version_string}" if not cuda_specs.has_cublaslt: # if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt @@ -119,8 +119,10 @@ def get_native_library() -> BNBNativeLibrary: if torch.version.hip: hip_major, hip_minor = map(int, torch.version.hip.split(".")[0:2]) HIP_ENVIRONMENT, BNB_HIP_VERSION = True, hip_major * 100 + hip_minor + BNB_HIP_VERSION_SHORT = f"{hip_major}{hip_minor}" else: HIP_ENVIRONMENT, BNB_HIP_VERSION = False, 0 + BNB_HIP_VERSION_SHORT = "" lib = get_native_library() except Exception as e: lib = None diff --git a/csrc/kernels.hip b/csrc/kernels.hip index ca77dceda..d8d7cdba5 100644 --- a/csrc/kernels.hip +++ b/csrc/kernels.hip @@ -10,8 +10,6 @@ #include #include -#include -#include //#include diff --git a/csrc/ops_hip.cuh b/csrc/ops_hip.cuh index 1b9c13063..e57cbb3b5 100644 --- a/csrc/ops_hip.cuh +++ b/csrc/ops_hip.cuh @@ -21,12 +21,6 @@ #include #include -/* -#include -#include -*/ - - #define CUDA_CHECK_RETURN(value) { \ hipError_t _m_cudaStat = value; \ if (_m_cudaStat != hipSuccess) { \ diff --git a/docs/source/installation.mdx 
b/docs/source/installation.mdx index f917f2623..0e8da0cda 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -146,15 +146,25 @@ Please follow these steps to install bitsandbytes with device-specific backend s bitsandbytes is fully supported from ROCm 6.1 onwards (currently in alpha release). > [!TIP] -> If you already installed ROCm and PyTorch, skip Docker steps below and please check that the torch version matches your ROCm install. To install torch for a specific ROCm version, please refer to step 3 of wheels install in [Installing PyTorch for ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/3rd-party/pytorch-install.html#using-wheels-package) guide. +> If you would like to install ROCm and PyTorch on bare metal, skip the Docker steps below and refer to the official guides: [ROCm installation overview](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/install-overview.html#rocm-install-overview) and [Installing PyTorch for ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/3rd-party/pytorch-install.html#using-wheels-package) (see Step 3 of the wheels-package instructions for a quick installation). Please make sure to get the PyTorch wheel that matches the installed ROCm version. ```bash -# Create a docker container with latest pytorch.
It comes with ROCm and pytorch preinstalled -docker pull rocm/pytorch:latest -docker run -it --device=/dev/kfd --device=/dev/dri --group-add video rocm/pytorch:latest +# Create a docker container with latest ROCm image, which includes ROCm libraries +docker pull rocm/dev-ubuntu-22.04:6.1.2-complete +docker run -it --device=/dev/kfd --device=/dev/dri --group-add video rocm/dev-ubuntu-22.04:6.1.2-complete +apt-get update && apt-get install -y git && cd home +# Install pytorch compatible with above ROCm version +pip install torch --index-url https://download.pytorch.org/whl/rocm6.1/ + +# Install bitsandbytes from PyPI +# (This is supported on Ubuntu 22.04, Python 3.10, ROCm 6.1.0/6.1.1/6.1.2 and gpu arch - gfx90a, gfx942, gfx1100 +# Please install from source if your configuration doesn't match with these) +pip install bitsandbytes + +# Install bitsandbytes from source # Clone bitsandbytes repo, ROCm backend is currently enabled on multi-backend-refactor branch -git clone --depth 1 -b multi-backend-refactor https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/ +git clone --depth 1 -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/ # Install dependencies pip install -r requirements-dev.txt diff --git a/tests/test_functional.py b/tests/test_functional.py index 4e82c530a..a9d926b89 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -584,6 +584,9 @@ def test_nvidia_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, trans @pytest.mark.parametrize("ldb", (0,), ids=id_formatter("ldb")) @pytest.mark.parametrize("device", ("cuda", "cpu"), ids=id_formatter("device")) def test_igemmlt_int(dim1, dim2, dim3, dim4, dims, ldb, device): + if HIP_ENVIRONMENT and device == "cpu": + pytest.skip("this test is not supported on ROCm yet") + for i in range(k): if dims == 2: A = torch.randint(-128, 127, size=(dim1, dim3), device=device).to(torch.int8)