From 4f1989881bc7aca7348c6d8ae447c89aea07fb67 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Mon, 30 Sep 2024 22:34:36 +0000
Subject: [PATCH 1/9] tweak continuous release of `main`
---
.github/workflows/python-package.yml | 38 ++++++++++------------------
1 file changed, 14 insertions(+), 24 deletions(-)
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 945266bbd..0d6aae184 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -163,7 +163,7 @@ jobs:
needs:
- build-wheels
steps:
- - name: Download artifacts to tmp directory
+ - name: Download and rename artifacts
uses: actions/download-artifact@v4
with:
path: tmp/
@@ -171,37 +171,27 @@ jobs:
merge-multiple: true
- name: Inspect tmp directory after downloading artifacts
run: ls -alFR tmp/
- - name: Move and rename wheel files
+ - name: Move and rename wheel files with pattern replacement
run: |
mkdir -p wheels/
find tmp/ -type f -name '*.whl' -print0 | while IFS= read -r -d '' wheel; do
wheel_filename=$(basename "$wheel")
- if [[ $wheel_filename == *linux*x86_64* ]]; then
- mv "$wheel" wheels/bnb-linux-x86_64.whl
- elif [[ $wheel_filename == *linux*aarch64* ]]; then
- mv "$wheel" wheels/bnb-linux-aarch64.whl
- elif [[ $wheel_filename == *macosx*x86_64* ]]; then
- mv "$wheel" wheels/bnb-macos-x86_64.whl
- elif [[ $wheel_filename == *macosx*arm64* ]]; then
- mv "$wheel" wheels/bnb-macos-arm64.whl
- elif [[ $wheel_filename == *win*amd64* ]]; then
- mv "$wheel" wheels/bnb-windows-x86_64.whl
- else
- echo "Unknown wheel format: $wheel_filename"
- exit 1
- fi
+ # Remove the git hash, e.g. `+1234567`, for a stable download link on the multi-backend pre-release
+ cleaned_filename=$(echo "$wheel_filename" | sed -E 's/\+[0-9a-f]{7}-/-/g')
+ mv "$wheel" "wheels/$cleaned_filename"
done
- name: Inspect wheels directory after renaming files
run: ls -alFR wheels/
- name: Create release and upload artifacts
- env:
- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- GITHUB_CONTINUOUS_RELEASE_TYPE: prerelease
- GITHUB_CONTINUOUS_RELEASE_TAG: continuous-release_main
- run: |
- wget -q https://github.com/TheAssassin/pyuploadtool/releases/download/continuous/pyuploadtool-x86_64.AppImage
- chmod +x pyuploadtool-x86_64.AppImage
- ./pyuploadtool-x86_64.AppImage --appimage-extract-and-run wheels/*.whl
+ uses: softprops/action-gh-release@v2.0.8
+ with:
+ files: wheels/*.whl
+ prerelease: true
+ name: Multi-Backend Preview
+ tag_name: continuous-release_main
+ make_latest: false
+ draft: false
+ target_commitish: ${{ github.sha }}
audit-wheels:
needs: build-wheels
From 2a1ff2c0541d789655603883f74fb3bd4597e255 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Mon, 30 Sep 2024 22:37:17 +0000
Subject: [PATCH 2/9] more descriptive continuous release name
---
.github/workflows/python-package.yml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 0d6aae184..ceab10fe7 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -187,7 +187,7 @@ jobs:
with:
files: wheels/*.whl
prerelease: true
- name: Multi-Backend Preview
+ name: Latest `main` wheel
tag_name: continuous-release_main
make_latest: false
draft: false
From d873fb346d36d9001aa73502996381d4c46beb57 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Mon, 30 Sep 2024 23:29:32 +0000
Subject: [PATCH 3/9] omit macos wheels for now
---
.github/workflows/python-package.yml | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index ceab10fe7..560741edb 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -174,7 +174,8 @@ jobs:
- name: Move and rename wheel files with pattern replacement
run: |
mkdir -p wheels/
- find tmp/ -type f -name '*.whl' -print0 | while IFS= read -r -d '' wheel; do
+ # exclude macos wheels for now
+ find tmp/ -type f -name '*.whl' ! -name '*macos*' -print0 | while IFS= read -r -d '' wheel; do
wheel_filename=$(basename "$wheel")
# Remove the git hash, e.g. `+1234567`, for a stable download link on the multi-backend pre-release
cleaned_filename=$(echo "$wheel_filename" | sed -E 's/\+[0-9a-f]{7}-/-/g')
From 485427f183eff1f2b7ee029a3a3bca8d26ddcacb Mon Sep 17 00:00:00 2001
From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Mon, 30 Sep 2024 17:49:11 -0600
Subject: [PATCH 4/9] refine docs for multi-backend alpha release (#1380)
* refine docs for multi-backend alpha release
* docs: further tweaks to multi-backend alpha docs
* docs: further tweaks to multi-backend alpha docs
* docs: further tweaks to multi-backend alpha docs
* docs: add multi-backend feedback links
* docs: add request for contributions
* docs: small fixes
* docs: small fixes
* docs: add info about `main` continuous build
* docs: further tweaks to multi-backend alpha docs
* docs: further tweaks to multi-backend alpha docs
---
docs/source/installation.mdx | 224 ++++++++++++++++++++++++------
docs/source/non_cuda_backends.mdx | 3 +
2 files changed, 184 insertions(+), 43 deletions(-)
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index 2f82c199b..2ac56e03f 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -1,29 +1,45 @@
-# Installation
+# Installation Guide
-## CUDA
+Welcome to the installation guide for the `bitsandbytes` library! This document provides step-by-step instructions to install `bitsandbytes` across various platforms and hardware configurations. The library primarily supports CUDA-based GPUs, but the team is actively working on enabling support for additional backends like AMD ROCm, Intel, and Apple Silicon.
-bitsandbytes is only supported on CUDA GPUs for CUDA versions **11.0 - 12.5**. However, there's a multi-backend effort under way which is currently in alpha release, check [the respective section below in case you're interested to help us with early feedback](#multi-backend).
+> [!TIP]
+> For a high-level overview of backend support and compatibility, see the [Multi-backend Support](#multi-backend) section.
-The latest version of bitsandbytes builds on:
+## Table of Contents
-| OS | CUDA | Compiler |
-|---|---|---|
-| Linux | 11.7 - 12.3 | GCC 11.4 |
-| | 12.4+ | GCC 13.2 |
-| Windows | 11.7 - 12.4 | MSVC 19.38+ (VS2022 17.8.0+) |
+- [CUDA](#cuda)
+ - [Installation via PyPI](#cuda-pip)
+ - [Compile from Source](#cuda-compile)
+- [Multi-backend Support (Alpha Release)](#multi-backend)
+ - [Supported Backends](#multi-backend-supported-backends)
+ - [Pre-requisites](#multi-backend-pre-requisites)
+ - [Installation](#multi-backend-pip)
+ - [Compile from Source](#multi-backend-compile)
+- [PyTorch CUDA Versions](#pytorch-cuda-versions)
-> [!TIP]
-> MacOS support is still a work in progress! Subscribe to this [issue](https://github.com/TimDettmers/bitsandbytes/issues/1020) to get notified about discussions and to track the integration progress.
+## CUDA[[cuda]]
-For Linux systems, make sure your hardware meets the following requirements to use bitsandbytes features.
+`bitsandbytes` is currently only supported on CUDA GPUs for CUDA versions **11.0 - 12.5**. However, there's an ongoing multi-backend effort under development, which is currently in alpha. If you're interested in providing feedback or testing, check out [the multi-backend section below](#multi-backend).
-| **Feature** | **Hardware requirement** |
-|---|---|
-| LLM.int8() | NVIDIA Turing (RTX 20 series, T4) or Ampere (RTX 30 series, A4-A100) GPUs |
-| 8-bit optimizers/quantization | NVIDIA Kepler (GTX 780 or newer) |
+### Supported CUDA Configurations[[cuda-pip]]
+
+The latest version of `bitsandbytes` builds on the following configurations:
+
+| **OS** | **CUDA Version** | **Compiler** |
+|-------------|------------------|----------------------|
+| **Linux** | 11.7 - 12.3 | GCC 11.4 |
+| | 12.4+ | GCC 13.2 |
+| **Windows** | 11.7 - 12.4 | MSVC 19.38+ (VS2022) |
+
+For Linux systems, ensure your hardware meets the following requirements:
+
+| **Feature** | **Hardware Requirement** |
+|---------------------------------|--------------------------------------------------------------------|
+| LLM.int8() | NVIDIA Turing (RTX 20 series, T4) or Ampere (RTX 30 series, A4-A100) GPUs |
+| 8-bit optimizers/quantization | NVIDIA Kepler (GTX 780 or newer) |
> [!WARNING]
-> bitsandbytes >= 0.39.1 no longer includes Kepler binaries in pip installations. This requires manual compilation, and you should follow the general steps and use `cuda11x_nomatmul_kepler` for Kepler-targeted compilation.
+> `bitsandbytes >= 0.39.1` no longer includes Kepler binaries in pip installations. This requires [manual compilation using](#cuda-compile) the `cuda11x_nomatmul_kepler` configuration.
To install from PyPI.
@@ -31,14 +47,41 @@ To install from PyPI.
pip install bitsandbytes
```
-### Compile from source[[compile]]
+### `pip install` pre-built wheel from latest `main` commit
+
+If you would like to use new features even before they are officially released and help us test them, feel free to install the wheel directly from our CI (*the wheel links will remain stable!*):
+
+
+
+
+```
+# Note, if you don't want to reinstall BNBs dependencies, append the `--no-deps` flag!
+pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-0.44.2.dev0-py3-none-manylinux_2_24_x86_64.whl'
+```
+
+
+
+
+```
+# Note, if you don't want to reinstall BNBs dependencies, append the `--no-deps` flag!
+pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_multi-backend-refactor/bitsandbytes-0.44.1.dev0-py3-none-macosx_13_1_arm64.whl'
+```
+
+
+
+### Compile from source[[cuda-compile]]
+
+> [!TIP]
+> Don't hesitate to compile from source! The process is pretty straightforward and resilient. This might be needed for older CUDA versions or other less common configurations, which we don't support out of the box due to package size.
-For Linux and Windows systems, you can compile bitsandbytes from source. Installing from source allows for more build options with different CMake configurations.
+For Linux and Windows systems, compiling from source allows you to customize the build configurations. See below for detailed platform-specific instructions (see the `CMakeLists.txt` if you want to check the specifics and explore some additional options):
-To compile from source, you need CMake >= **3.22.1** and Python >= **3.8** installed. Make sure you have a compiler installed to compile C++ (gcc, make, headers, etc.). For example, to install a compiler and CMake on Ubuntu:
+To compile from source, you need CMake >= **3.22.1** and Python >= **3.8** installed. Make sure you have a compiler installed to compile C++ (`gcc`, `make`, headers, etc.).
+
+For example, to install a compiler and CMake on Ubuntu:
```bash
apt-get install -y build-essential cmake
@@ -48,16 +91,16 @@ You should also install CUDA Toolkit by following the [NVIDIA CUDA Installation
Refer to the following table if you're using another CUDA Toolkit version.
-| CUDA Toolkit | GCC |
-|---|---|
-| >= 11.4.1 | >= 11 |
-| >= 12.0 | >= 12 |
-| >= 12.4 | >= 13 |
+| CUDA Toolkit | GCC |
+|--------------|-------|
+| >= 11.4.1 | >= 11 |
+| >= 12.0 | >= 12 |
+| >= 12.4 | >= 13 |
Now to install the bitsandbytes package from source, run the following commands:
```bash
-git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
+git clone https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
pip install -r requirements-dev.txt
cmake -DCOMPUTE_BACKEND=cuda -S .
make
@@ -81,7 +124,7 @@ Refer to the following table if you're using another CUDA Toolkit version.
| >= 11.6 | 19.30+ (VS2022) |
```bash
-git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
+git clone https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
pip install -r requirements-dev.txt
cmake -DCOMPUTE_BACKEND=cuda -S .
cmake --build . --config Release
@@ -93,7 +136,7 @@ Big thanks to [wkpark](https://github.com/wkpark), [Jamezo97](https://github.com
-### PyTorch CUDA versions
+### PyTorch CUDA versions[[pytorch-cuda-versions]]
Some bitsandbytes features may need a newer CUDA version than the one currently supported by PyTorch binaries from Conda and pip. In this case, you should follow these instructions to load a precompiled bitsandbytes binary.
@@ -105,7 +148,7 @@ Some bitsandbytes features may need a newer CUDA version than the one currently
Then locally install the CUDA version you need with this script from bitsandbytes:
```bash
-wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh
+wget https://raw.githubusercontent.com/bitsandbytes-foundation/bitsandbytes/main/install_cuda.sh
# Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH
# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122, 123, 124, 125}
# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True
@@ -134,28 +177,62 @@ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/YOUR_USERNAME/local/cuda-11.7
3. Now when you launch bitsandbytes with these environment variables, the PyTorch CUDA version is overridden by the new CUDA version (in this example, version 11.7) and a different bitsandbytes library is loaded.
-## Multi-backend[[multi-backend]]
+## Multi-backend Support (Alpha Release)[[multi-backend]]
> [!TIP]
-> This functionality is currently in preview and therefore not yet production-ready! Please reference [this guide](./non_cuda_backends) for more in-depth information about the different backends and their current status.
+> This functionality is currently in preview and not yet production-ready. We very much welcome community feedback, contributions and leadership on topics like Apple Silicon as well as other less common accelerators! For more information, see [this guide on multi-backend support](./non_cuda_backends).
+
+**Link to give us feedback** (bugs, install issues, perf results, requests, etc.)**:**
+
+
+
+
+[**Multi-backend refactor: Alpha release (AMD ROCm ONLY)**](https://github.com/bitsandbytes-foundation/bitsandbytes/discussions/1339)
+
+
+
+
+[**Multi-backend refactor: Alpha release (INTEL ONLY)**](https://github.com/bitsandbytes-foundation/bitsandbytes/discussions/1338)
+
+
+
-Please follow these steps to install bitsandbytes with device-specific backend support other than CUDA:
+[**Github Discussion space on coordinating the kickoff of MPS backend development**](https://github.com/bitsandbytes-foundation/bitsandbytes/discussions/1340)
-### Pip install the pre-built wheel (recommended for most)
+
+
-WIP (will be added in the coming days)
+### Supported Backends[[multi-backend-supported-backends]]
-### Compilation
+| **Backend** | **Supported Versions** | **Python versions** | **Architecture Support** | **Status** |
+|-------------|------------------------|---------------------------|-------------------------|------------|
+| **AMD ROCm** | 6.1+ | 3.10+ | minimum CDNA - `gfx90a`, RDNA - `gfx1100` | Alpha |
+| **Apple Silicon (MPS)** | WIP | 3.10+ | M1/M2 chips | Planned |
+| **Intel CPU** | v2.4.0+ (`ipex`) | 3.10+ | Intel CPU | Alpha |
+| **Intel GPU** | v2.4.0+ (`ipex`) | 3.10+ | Intel GPU | Experimental |
+
+For each supported backend, follow the respective instructions below:
+
+### Pre-requisites[[multi-backend-pre-requisites]]
+
+To use bitsandbytes non-CUDA backends, be sure to install:
+
+```
+pip install "transformers>=4.45.1"
+```
-#### AMD GPU
-
-bitsandbytes is fully supported from ROCm 6.1 onwards (currently in alpha release).
+> [!WARNING]
+> Pre-compiled binaries are only built for ROCm versions `6.1.0`/`6.1.1`/`6.1.2`/`6.2.0` and `gfx90a`, `gfx942`, `gfx1100` GPU architectures. [Find the pip install instructions here](#multi-backend-pip).
+>
+> Other supported versions that don't come with pre-compiled binaries [can be compiled for with these instructions](#multi-backend-compile).
+>
+> **Windows is not supported for the ROCm backend**; also not WSL2 to our knowledge.
> [!TIP]
-> If you would like to install ROCm and PyTorch on bare metal, skip Docker steps and refer to our official guides at [ROCm installation overview](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/install-overview.html#rocm-install-overview) and [Installing PyTorch for ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/3rd-party/pytorch-install.html#using-wheels-package) (Step 3 of wheels build for quick installation). Please make sure to get PyTorch wheel for the installed ROCm version.
+> If you would like to install ROCm and PyTorch on bare metal, skip the Docker steps and refer to ROCm's official guides at [ROCm installation overview](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/install-overview.html#rocm-install-overview) and [Installing PyTorch for ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/3rd-party/pytorch-install.html#using-wheels-package) (Step 3 of wheels build for quick installation). Special note: please make sure to get the respective ROCm-specific PyTorch wheel for the installed ROCm version, e.g. `https://download.pytorch.org/whl/nightly/rocm6.2/`!
```bash
# Create a docker container with latest ROCm image, which includes ROCm libraries
@@ -165,9 +242,70 @@ apt-get update && apt-get install -y git && cd home
# Install pytorch compatible with above ROCm version
pip install torch --index-url https://download.pytorch.org/whl/rocm6.1/
+```
-# Install bitsandbytes from PyPI
-# (This is supported on Ubuntu 22.04, Python 3.10, ROCm 6.1.0/6.1.1/6.1.2/6.2.0 and gpu arch - gfx90a, gfx942, gfx1100
+
+
+
+Compatible hardware and functioning `import intel_extension_for_pytorch as ipex` capable environment with Python `3.10` as the minimum requirement.
+
+Please refer to [the official Intel installations instructions](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=cpu&version=v2.4.0%2bcpu&os=linux%2fwsl2) for guidance on how to pip install the necessary `intel_extension_for_pytorch` dependency.
+
+
+
+
+> [!TIP]
+> Apple Silicon support is still a WIP. Please visit and write us in [this Github Discussion space on coordinating the kickoff of MPS backend development](https://github.com/bitsandbytes-foundation/bitsandbytes/discussions/1340) and coordinate a community-led effort to implement this backend.
+
+
+
+
+### Installation
+
+You can install the pre-built wheels for each backend, or compile from source for custom configurations.
+
+#### Pre-built Wheel Installation (recommended)[[multi-backend-pip]]
+
+
+
+
+```
+# Note, if you don't want to reinstall BNBs dependencies, append the `--no-deps` flag!
+pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_multi-backend-refactor/bitsandbytes-0.44.1.dev0-py3-none-manylinux_2_24_x86_64.whl'
+```
+
+
+
+
+```
+# Note, if you don't want to reinstall BNBs dependencies, append the `--no-deps` flag!
+pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_multi-backend-refactor/bitsandbytes-0.44.1.dev0-py3-none-win_amd64.whl'
+```
+
+
+
+
+> [!WARNING]
+> bitsandbytes does not yet support Apple Silicon / Metal with a dedicated backend. However, the build infrastructure is in place and the below pip install will eventually provide Apple Silicon support as it becomes available on the `multi-backend-refactor` branch based on community contributions.
+
+```
+# Note, if you don't want to reinstall BNBs dependencies, append the `--no-deps` flag!
+pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_multi-backend-refactor/bitsandbytes-0.44.1.dev0-py3-none-macosx_13_1_arm64.whl'
+```
+
+
+
+
+#### Compile from Source[[multi-backend-compile]]
+
+
+
+
+#### AMD GPU
+
+bitsandbytes is fully supported from ROCm 6.1 onwards (currently in alpha release).
+
+```bash
# Please install from source if your configuration doesn't match with these)
pip install bitsandbytes
@@ -195,10 +333,10 @@ pip install -e . # `-e` for "editable" install, when developing BNB (otherwise
Similar to the CUDA case, you can compile bitsandbytes from source for Linux and Windows systems.
-The below commands are for Linux. For installing on Windows, please adapt the below commands according to the same pattern as described [the section above on compiling from source under the Windows tab](#compile).
+The below commands are for Linux. For installing on Windows, please adapt the below commands according to the same pattern as described [the section above on compiling from source under the Windows tab](#cuda-compile).
```
-git clone --depth 1 -b multi-backend-refactor https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
+git clone --depth 1 -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
pip install intel_extension_for_pytorch
pip install -r requirements-dev.txt
cmake -DCOMPUTE_BACKEND=cpu -S .
diff --git a/docs/source/non_cuda_backends.mdx b/docs/source/non_cuda_backends.mdx
index fc7c6ac27..728606b7b 100644
--- a/docs/source/non_cuda_backends.mdx
+++ b/docs/source/non_cuda_backends.mdx
@@ -1,5 +1,8 @@
# Multi-backend support (non-CUDA backends)
+> [!Tip]
+> If you feel these docs need some additional info, please consider submitting a PR or respectfully request the missing info in one of the below mentioned Github discussion spaces.
+
As part of a recent refactoring effort, we will soon offer official multi-backend support. Currently, this feature is available in a preview alpha release, allowing us to gather early feedback from users to improve the functionality and identify any bugs.
At present, the Intel CPU and AMD ROCm backends are considered fully functional. The Intel XPU backend has limited functionality and is less mature.
From 9b3c2b34f9a0c0cc82c17bf1c14ed200dfc76c0b Mon Sep 17 00:00:00 2001
From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Mon, 30 Sep 2024 19:40:57 -0600
Subject: [PATCH 5/9] README.md: announce multi-backend alpha release
---
README.md | 14 +++++++++-----
1 file changed, 9 insertions(+), 5 deletions(-)
diff --git a/README.md b/README.md
index 7823168ac..ab4b523e2 100644
--- a/README.md
+++ b/README.md
@@ -12,15 +12,19 @@ There are ongoing efforts to support further hardware backends, i.e. Intel CPU +
**[https://huggingface.co/docs/bitsandbytes/main](https://huggingface.co/docs/bitsandbytes/main)**
-## ALPHA TESTERS WANTED: `multi-backend-refactor` AMD GPU + Intel CPU/GPU specific BNB backend implementations
+## `𝗯𝗶𝘁𝘀𝗮𝗻𝗱𝗯𝘆𝘁𝗲𝘀` 𝗺𝘂𝗹𝘁𝗶-𝗯𝗮𝗰𝗸𝗲𝗻𝗱 𝙖𝙡𝙥𝙝𝙖 𝗿𝗲𝗹𝗲𝗮𝘀𝗲 is out!
-We're in the process of a complex refactor in order to allow the support of additional hardware backends, other than CUDA, in BNB. The efforts around this are already quite far along and there's plenty of functionality already in place that is in need for users to take a hands-on approach! Mac support will likely soon also see progress. However, I recommend waiting 2 weeks until the device abstraction has further consolidated (**breaking changes upcoming**).
+🚀 Big news! After months of hard work and incredible community contributions, we're thrilled to announce the 𝗯𝗶𝘁𝘀𝗮𝗻𝗱𝗯𝘆𝘁𝗲𝘀 𝗺𝘂𝗹𝘁𝗶-𝗯𝗮𝗰𝗸𝗲𝗻𝗱 𝙖𝙡𝙥𝙝𝙖 𝗿𝗲𝗹𝗲𝗮𝘀𝗲! 💥
-Currently, you still need to compile from source, after checking out the `multi-backend-refactor` branch (instructions WIP, but [the current docs on the compilation from source](https://huggingface.co/docs/bitsandbytes/main/en/installation#compile-from-source) are a good starting point; [feel free to share tips / input in this Github discussion](https://github.com/TimDettmers/bitsandbytes/discussions/1219). We'll soon enable nightly releases to make this much easier for you!
+Now supporting:
+- 🔥 𝗔𝗠𝗗 𝗚𝗣𝗨𝘀 (ROCm)
+- ⚡ 𝗜𝗻𝘁𝗲𝗹 𝗖𝗣𝗨𝘀 & 𝗚𝗣𝗨𝘀
-Please give feedback to us in [this dedicated Github Discussion space](https://github.com/TimDettmers/bitsandbytes/discussions/categories/catch-all-alpha-testing-the-multi-backend-refactor)!
+We’d love your early feedback! 🙏
-We're super excited about these recent developments and grateful for any constructive input or support that you can give to help us make this a reality. BNB is a community project and we're excited for your collaboration 🤗
+👉 [Instructions for your `𝚙𝚒𝚙 𝚒𝚗𝚜𝚝𝚊𝚕𝚕` here](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend)
+
+We're super excited about these recent developments and grateful for any constructive input or support that you can give to help us make this a reality (e.g. helping us with the upcoming Apple Silicon backend or reporting bugs). BNB is a community project and we're excited for your collaboration 🤗
## License
From 723e2162fd514deee66268106775f4bfbee25794 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Tue, 1 Oct 2024 14:01:09 +0000
Subject: [PATCH 6/9] docs: remove 2 obsolete lines
---
docs/source/installation.mdx | 3 ---
1 file changed, 3 deletions(-)
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index 2ac56e03f..609865436 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -306,9 +306,6 @@ pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsan
bitsandbytes is fully supported from ROCm 6.1 onwards (currently in alpha release).
```bash
-# Please install from source if your configuration doesn't match with these)
-pip install bitsandbytes
-
# Install bitsandbytes from source
# Clone bitsandbytes repo, ROCm backend is currently enabled on multi-backend-refactor branch
git clone --depth 1 -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
From 2da2f1945ce5567e27d45df43acd4f65108d5c25 Mon Sep 17 00:00:00 2001
From: Aarni Koskela
Date: Mon, 14 Oct 2024 18:53:07 +0300
Subject: [PATCH 7/9] README: Replace special Unicode text symbols with regular
characters (#1385)
---
README.md | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/README.md b/README.md
index ab4b523e2..c1d7a1ae7 100644
--- a/README.md
+++ b/README.md
@@ -12,17 +12,17 @@ There are ongoing efforts to support further hardware backends, i.e. Intel CPU +
**[https://huggingface.co/docs/bitsandbytes/main](https://huggingface.co/docs/bitsandbytes/main)**
-## `𝗯𝗶𝘁𝘀𝗮𝗻𝗱𝗯𝘆𝘁𝗲𝘀` 𝗺𝘂𝗹𝘁𝗶-𝗯𝗮𝗰𝗸𝗲𝗻𝗱 𝙖𝙡𝙥𝙝𝙖 𝗿𝗲𝗹𝗲𝗮𝘀𝗲 is out!
+## `bitsandbytes` multi-backend _alpha_ release is out!
-🚀 Big news! After months of hard work and incredible community contributions, we're thrilled to announce the 𝗯𝗶𝘁𝘀𝗮𝗻𝗱𝗯𝘆𝘁𝗲𝘀 𝗺𝘂𝗹𝘁𝗶-𝗯𝗮𝗰𝗸𝗲𝗻𝗱 𝙖𝙡𝙥𝙝𝙖 𝗿𝗲𝗹𝗲𝗮𝘀𝗲! 💥
+🚀 Big news! After months of hard work and incredible community contributions, we're thrilled to announce the **bitsandbytes multi-backend _alpha_ release**! 💥
Now supporting:
-- 🔥 𝗔𝗠𝗗 𝗚𝗣𝗨𝘀 (ROCm)
-- ⚡ 𝗜𝗻𝘁𝗲𝗹 𝗖𝗣𝗨𝘀 & 𝗚𝗣𝗨𝘀
+- 🔥 **AMD GPUs** (ROCm)
+- ⚡ **Intel CPUs** & **GPUs**
We’d love your early feedback! 🙏
-👉 [Instructions for your `𝚙𝚒𝚙 𝚒𝚗𝚜𝚝𝚊𝚕𝚕` here](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend)
+👉 [Instructions for your `pip install` here](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend)
We're super excited about these recent developments and grateful for any constructive input or support that you can give to help us make this a reality (e.g. helping us with the upcoming Apple Silicon backend or reporting bugs). BNB is a community project and we're excited for your collaboration 🤗
From c8f2769b42616ac30f29646085e6e75366fb1ede Mon Sep 17 00:00:00 2001
From: pnunna93 <104791500+pnunna93@users.noreply.github.com>
Date: Wed, 16 Oct 2024 15:51:32 -0500
Subject: [PATCH 8/9] Remove depth option in installation steps (#1395)
* Add build job for rocm
* Add rocm build script
* Copy shared obj file into output_dir
* upload build artifacts and enable wheels build
* Remove cuda build temporarily
* Add ROCm version to .so filename
* Add rocm_version to whls build
* Revert "Remove cuda build temporarily"
This reverts commit 1413c5f3a2aed51140b86daa8ee9283c67cce738.
* Add rocm_version env var
* Remove thrush header files
* Print node info
* print cuda node info
* Revert "print cuda node info"
This reverts commit cdb209a2eb896d9c4166f53e9b2aa580c10e42c0.
* Revert "Print node info"
This reverts commit 7e9a65c33f66fffcb14ee2438170718777c06022.
* Add rocm arch to compile command
* Rename .so files to rocm
* Update default gpu arch
* Skip cpu based igemmlt int tests on ROCm
* Update Documentation
* Update upstream repo name
* Update docs
* Update string format
Co-authored-by: Aarni Koskela
* Remove pre-release option for torch install
* Update pytorch install path
Co-authored-by: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
* Add messages for Heuristics error
* Remove toolcache for disk space
* print disk usage
* Clean disk space for linux
* Fix for ubuntu
* Add sudo for apt clean
* Update clean up disk list
* remove disk usage print
* Add BNB_BACKEND variable
* Update diagnostic functions for ROCm
* Fix tuple error
* Fix library detection bug for recursive and symlink cases
* fix pre-commit errors
* Remove recursive path lib search
* Create function for runtime lib patterns
* Update logger format
Co-authored-by: Aarni Koskela
* Update error reporting
Co-authored-by: Aarni Koskela
* Remove commented code
Co-authored-by: Aarni Koskela
* Update error reporting
Co-authored-by: Aarni Koskela
* Update error reporting
* Create hip diagnostics functions
* Fix Typo
* Fix pre-commit checks
* Enable 6.2 build
* Skip gemv 4 bit cpu test
* Update documentation for 6.2.0 pip install
* Update README for default branch change
* Fix typo
* Sync README with upstream
* Remove depth
---------
Co-authored-by: Aarni Koskela
Co-authored-by: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
Co-authored-by: Aswin John Mathews <81309834+amathews-amd@users.noreply.github.com>
Co-authored-by: root
---
docs/source/installation.mdx | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index 609865436..d1acb2cd6 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -308,7 +308,7 @@ bitsandbytes is fully supported from ROCm 6.1 onwards (currently in alpha releas
```bash
# Install bitsandbytes from source
# Clone bitsandbytes repo, ROCm backend is currently enabled on multi-backend-refactor branch
-git clone --depth 1 -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
+git clone -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
# Install dependencies
pip install -r requirements-dev.txt
From 9568735b21b9325e4789d6a5004517f2287f47c8 Mon Sep 17 00:00:00 2001
From: Aarni Koskela
Date: Wed, 23 Oct 2024 20:57:07 +0300
Subject: [PATCH 9/9] Update CI tools & fix typos (#1386)
* Update pre-commit tools
* Fix typos
---
.pre-commit-config.yaml | 6 +++---
_typos.toml | 6 ++++--
bitsandbytes/functional.py | 2 +-
csrc/kernels.cu | 6 +++---
pyproject.toml | 1 -
5 files changed, 11 insertions(+), 10 deletions(-)
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index a859d05af..8ac37502e 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,13 +1,13 @@
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
- rev: v0.3.2
+ rev: v0.6.9
hooks:
- id: ruff
args:
- --fix
- id: ruff-format
- repo: https://github.com/pre-commit/pre-commit-hooks
- rev: v4.5.0
+ rev: v5.0.0
hooks:
- id: check-merge-conflict
- id: check-yaml
@@ -18,6 +18,6 @@ repos:
args:
- --fix=lf
- repo: https://github.com/crate-ci/typos
- rev: v1.18.2
+ rev: v1.26.0
hooks:
- id: typos
diff --git a/_typos.toml b/_typos.toml
index e4e7287fb..955c6cb79 100644
--- a/_typos.toml
+++ b/_typos.toml
@@ -4,8 +4,10 @@
extend-ignore-re = [
"@Ther-nul", # valid Github user
]
-
-[default.extend-identifiers]
+extend-ignore-identifiers-re = [
+ ".*arange.*",
+ ".*ARANGE.*",
+]
[type.py.extend-words]
"BA" = "BA" # used as a commented-out variable in tests
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 34b3c0293..7503ad73c 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -1864,7 +1864,7 @@ def percentile_clipping(grad: Tensor, gnorm_vec: Tensor, step: int, percentile:
gnorm_vec: torch.Tensor
Vector of gradient norms. 100 elements expected.
step: int
- The current optimiation steps (number of past gradient norms).
+ The current optimization steps (number of past gradient norms).
"""
prev_device = pre_call(grad.device)
diff --git a/csrc/kernels.cu b/csrc/kernels.cu
index be7779de1..867390f2c 100644
--- a/csrc/kernels.cu
+++ b/csrc/kernels.cu
@@ -2661,7 +2661,7 @@ template