diff --git a/.github/scripts/build-rocm.sh b/.github/scripts/build-rocm.sh index fc7515aa7..b508fac69 100644 --- a/.github/scripts/build-rocm.sh +++ b/.github/scripts/build-rocm.sh @@ -1,19 +1,21 @@ #!/bin/bash declare build_arch declare build_os +declare rocm_version set -xeuo pipefail +bnb_rocm_arch="gfx90a;gfx942;gfx1100" if [ "${build_os:0:6}" == ubuntu ]; then - image=rocm/dev-ubuntu-22.04:6.1-complete + image=rocm/dev-ubuntu-22.04:${rocm_version}-complete echo "Using image $image" docker run --rm --platform "linux/$build_arch" -i \ -w /src -v "$PWD:/src" "$image" sh -c \ "apt-get update \ && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \ - && cmake -DCOMPUTE_BACKEND=hip . \ + && cmake -DCOMPUTE_BACKEND=hip -DBNB_ROCM_ARCH=\"${bnb_rocm_arch}\" . \ && cmake --build ." fi -#output_dir="output/${build_os}/${build_arch}" -#mkdir -p "${output_dir}" -#(shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} "${output_dir}") +output_dir="output/${build_os}/${build_arch}" +mkdir -p "${output_dir}" +(shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} "${output_dir}") diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml index a19e7511d..ce4a55aaa 100644 --- a/.github/workflows/build_documentation.yml +++ b/.github/workflows/build_documentation.yml @@ -13,7 +13,9 @@ jobs: with: commit_sha: ${{ github.sha }} package: bitsandbytes - repo_owner: TimDettmers + repo_owner: bitsandbytes-foundation + # avoid /src suffix leading to wrong links, like bitsandbytes/blob/main/src/bitsandbytes/nn/ + version_tag_suffix: '' # defaults to '/src' custom_container: huggingface/transformers-doc-builder secrets: hf_token: ${{ secrets.HUGGINGFACE_PUSH }} diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index cc833df5d..4679761c6 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -9,11 +9,13 @@ concurrency: jobs: build: - if: github.repository == 'TimDettmers/bitsandbytes' + if: github.repository == 'bitsandbytes-foundation/bitsandbytes' uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main with: commit_sha: ${{ github.event.pull_request.head.sha }} pr_number: ${{ github.event.number }} package: bitsandbytes - repo_owner: TimDettmers + repo_owner: bitsandbytes-foundation + # avoid /src suffix leading to wrong links, like bitsandbytes/blob/main/src/bitsandbytes/nn/ + version_tag_suffix: '' # defaults to '/src' custom_container: huggingface/transformers-doc-builder diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 78bc747c3..91e6d82a6 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -63,12 +63,10 @@ jobs: os: [ubuntu-latest, windows-latest] arch: [x86_64, aarch64] cuda_version: - ["11.7.1", "11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.0"] + ["11.7.1", "11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.0"] exclude: - os: windows-latest # This probably requires arm64 Windows agents arch: aarch64 - - os: windows-latest # The Jimver/cuda-toolkit is action used for Windows builds is not updated for 12.4 yet. - cuda_version: "12.4.0" - os: ubuntu-latest # Temporary. Takes too long, not ready yet. arch: aarch64 runs-on: ${{ matrix.os }} # One day, we could run them on native agents. 
Azure supports this now but it's planned only for Q3 2023 for hosted agents @@ -79,7 +77,7 @@ jobs: if: startsWith(matrix.os, 'ubuntu') uses: docker/setup-qemu-action@v2 # Windows: We install Cuda on the agent (slow) - - uses: Jimver/cuda-toolkit@v0.2.14 + - uses: Jimver/cuda-toolkit@v0.2.16 if: startsWith(matrix.os, 'windows') id: cuda-toolkit with: @@ -108,6 +106,8 @@ jobs: matrix: os: [ubuntu-latest] arch: [x86_64] + rocm_version: + ["6.1.2"] runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents steps: - uses: actions/checkout@v4 @@ -125,10 +125,18 @@ env: build_os: ${{ matrix.os }} build_arch: ${{ matrix.arch }} + rocm_version: ${{ matrix.rocm_version }} + - name: Upload build artifact + uses: actions/upload-artifact@v4 + with: + name: shared_library_rocm_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.rocm_version }} + path: output/* + retention-days: 7 build-wheels: needs: - build-shared-libs - build-shared-libs-cuda + - build-shared-libs-rocm strategy: matrix: os: [ubuntu-latest, macos-latest, windows-latest] diff --git a/CHANGELOG.md b/CHANGELOG.md index c456fa9e5..ed324f09e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,29 @@ +### 0.43.2 + +This release is quite significant as the QLoRA bug fix has big implications for higher `seqlen` and batch sizes. + +For each sequence (i.e. batch size increase of one) we expect memory savings of: +- 405B: 39GB for `seqlen=1024`, and 4888GB for `seqlen=128,000` +- 70B: 10.1GB for `seqlen=1024` and 1258GB for `seqlen=128,000` + +This was due to activations being unnecessary for frozen parameters, yet the memory for them was still erroneously allocated due to the now fixed bug. + +#### Improvements: + +- docs: FSDP+QLoRA and CPU install guide (#1211 #1227, thanks @stevhliu) +- Add CUDA 12.5 and update 12.4 builds (#1284) + +#### Bug Fixes: + +- 4bit getstate and 8bit deepcopy (#1230 #1231, thanks @BenjaminBossan) +- missing optimizers in `str2optimizer32bit` (#1222, thanks @EtienneDosSantos) +- CUDA 12.5 build issue (#1273, thanks @HennerM) +- fix for min_8bit_size functionality in Optimizer base classes (#1286, thanks @Edenzzzz) +- QLoRA mem bug (#1270, thanks @Ther-nullptr) +- tests for cpu only platforms (#1259, thanks @galqiwi) +- restoration of quant_storage for CPU offloading (#1279) +- optim update error with non-contiguous grads/params (deepspeed) (#1187) + ### 0.43.1 #### Improvements: diff --git a/CMakeLists.txt b/CMakeLists.txt index 3bedefd51..eac72fe52 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -77,6 +77,13 @@ endif() if(BUILD_CUDA) + # NVCC normally will only work with MSVC up to 1939. VS2022 17.10+ starts using versions 1940+. + # Workaround: use --allow-unsupported-compiler + # This needs to be added *before* we try to enable the CUDA language so CMake's compiler check passes.
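The same workaround can also be applied manually at configure time, e.g. when building an older checkout with an affected MSVC toolchain. A minimal sketch, assuming only that setting the `CMAKE_CUDA_FLAGS` cache variable on the command line has the same effect as the appended flag above (commands mirror the Windows build steps in the install docs):

```bash
# Configure with the NVCC workaround passed explicitly, then build as usual
cmake -DCOMPUTE_BACKEND=cuda -DCMAKE_CUDA_FLAGS="--allow-unsupported-compiler" -S .
cmake --build . --config Release
```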
+ if(MSVC AND MSVC_VERSION VERSION_GREATER_EQUAL 1940) + string(APPEND CMAKE_CUDA_FLAGS " --allow-unsupported-compiler") + endif() + enable_language(CUDA) # This will fail if CUDA is not found find_package(CUDAToolkit REQUIRED) @@ -178,7 +185,7 @@ elseif(BUILD_HIP) set(CMAKE_HIP_ARCHITECTURES ${BNB_ROCM_ARCH}) else() if (NOT AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES) - set(CMAKE_HIP_ARCHITECTURES "gfx908;gfx90a;gfx940;gfx941;gfx942") + set(CMAKE_HIP_ARCHITECTURES "gfx90a;gfx942;gfx1100") elseif (AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES) set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS}) endif() @@ -187,12 +194,14 @@ elseif(BUILD_HIP) list(APPEND SRC_FILES ${HIP_FILES}) - string(APPEND BNB_OUTPUT_NAME "_hip") + string(APPEND BNB_OUTPUT_NAME "_rocm") # get hip version execute_process(COMMAND hipconfig --version OUTPUT_VARIABLE HIP_CONFIG_VERSION) string(REGEX MATCH "[0-9]+\\.[0-9]+" HIP_VERSION "${HIP_CONFIG_VERSION}") + string(REPLACE "." "" HIP_VERSION_SHORT "${HIP_VERSION}") + string(APPEND BNB_OUTPUT_NAME "${HIP_VERSION_SHORT}") if(NO_CUBLASLT OR HIP_VERSION VERSION_LESS "6.1") string(APPEND BNB_OUTPUT_NAME "_nohipblaslt") endif() @@ -229,7 +238,6 @@ if(WIN32) set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) endif() -# Weird MSVC hacks if(MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2 /fp:fast") endif() diff --git a/_typos.toml b/_typos.toml index a04206b8d..e4e7287fb 100644 --- a/_typos.toml +++ b/_typos.toml @@ -1,5 +1,10 @@ [files] +[default] +extend-ignore-re = [ + "@Ther-nul", # valid Github user +] + [default.extend-identifiers] [type.py.extend-words] diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py index eff7fc686..129ac1536 100644 --- a/bitsandbytes/__init__.py +++ b/bitsandbytes/__init__.py @@ -20,6 +20,13 @@ from .cextension import lib from .nn import modules +# NOTE: this is a temporary flag to allow outside libraries to employ conditional logic while the refactor is still in +# alpha/beta: sth like `if getattr(bitsandbytes, "is_multi_backend_refactor_preview", False): do sth` +# the getattr() call above would default to False and any string evaluates to True. This way we have temporary thing +# that we can remove in Transformers with the next release after the official BNB multi-platform release; then +# eventually making it the new default (e.g. just remove if statement and dedent in Transformers) +is_multi_backend_refactor_preview = "TO BE REMOVED ONCE MERGED TO `main`" # bool evals to True for str + # Always register the CPU backend. 
register_backend("cpu", CPUBackend()) @@ -67,4 +74,4 @@ "optim.optimizer.MockArgs": False, } -__version__ = "0.43.2.dev" +__version__ = "0.43.3.dev" diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py index 8e296a8ee..59e26ad09 100644 --- a/bitsandbytes/autograd/_functions.py +++ b/bitsandbytes/autograd/_functions.py @@ -524,7 +524,7 @@ def forward(ctx, A, B, out=None, bias=None, quant_state: Optional[F.QuantState] ctx.dtype_A, ctx.dtype_B, ctx.dtype_bias = A.dtype, B.dtype, None if bias is None else bias.dtype if any(ctx.needs_input_grad[:2]): - ctx.tensors = (A, B) + ctx.tensors = (None, B) else: ctx.tensors = (None, None) @@ -537,7 +537,7 @@ def backward(ctx, grad_output): return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, bias_grad, None req_gradA, _, _, req_gradBias, _ = ctx.needs_input_grad - A, B = ctx.tensors + _, B = ctx.tensors grad_A, grad_B, grad_bias = None, None, None diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py index 396234853..04755ed2d 100644 --- a/bitsandbytes/backends/cpu_xpu_common.py +++ b/bitsandbytes/backends/cpu_xpu_common.py @@ -1,3 +1,4 @@ +import subprocess from typing import Optional import warnings @@ -19,6 +20,14 @@ ipex_xpu = None +gxx_available = False +try: + subprocess.run(["g++", "--version"]) + gxx_available = True +except BaseException: + warnings.warn("g++ not found, torch.compile disabled for CPU/XPU.") + + Tensor = torch.Tensor @@ -45,8 +54,8 @@ def _ipex_xpu_version_prereq(major, minor): def _maybe_torch_compile(func): - # torch.compile requires pytorch >= 2.0 - if _torch_version_prereq(2, 0): + # torch.compile requires g++ and pytorch >= 2.0 + if gxx_available and _torch_version_prereq(2, 0): options = {} # fx_graph_cache requires pytorch >= 2.2 if _torch_version_prereq(2, 2): @@ -515,7 +524,7 @@ def gemm_4bit_impl( output = torch.ops.torch_ipex.ipex_woq_linear(A, state.op_context.get_data_handle()) else: dqB = dequantize_4bit_impl(B, state, blocksize=state.blocksize) - output = torch.matmul(A, dqB) + output = torch.matmul(A, dqB.to(A.dtype)) if out is not None: out.copy_(output) else: diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py index 03d2cbd61..a096d0d51 100644 --- a/bitsandbytes/cextension.py +++ b/bitsandbytes/cextension.py @@ -38,9 +38,9 @@ def get_cuda_bnb_library_path(cuda_specs: CUDASpecs) -> Path: """ if torch.version.hip: if BNB_HIP_VERSION < 601: - return PACKAGE_DIR / f"libbitsandbytes_hip_nohipblaslt{DYNAMIC_LIBRARY_SUFFIX}" + return PACKAGE_DIR / f"libbitsandbytes_rocm{BNB_HIP_VERSION_SHORT}_nohipblaslt{DYNAMIC_LIBRARY_SUFFIX}" else: - return PACKAGE_DIR / f"libbitsandbytes_hip{DYNAMIC_LIBRARY_SUFFIX}" + return PACKAGE_DIR / f"libbitsandbytes_rocm{BNB_HIP_VERSION_SHORT}{DYNAMIC_LIBRARY_SUFFIX}" library_name = f"libbitsandbytes_cuda{cuda_specs.cuda_version_string}" if not cuda_specs.has_cublaslt: # if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt @@ -119,8 +119,10 @@ def get_native_library() -> BNBNativeLibrary: if torch.version.hip: hip_major, hip_minor = map(int, torch.version.hip.split(".")[0:2]) HIP_ENVIRONMENT, BNB_HIP_VERSION = True, hip_major * 100 + hip_minor + BNB_HIP_VERSION_SHORT = str(hip_major) + str(hip_minor) else: HIP_ENVIRONMENT, BNB_HIP_VERSION = False, 0 + BNB_HIP_VERSION_SHORT = "" lib = get_native_library() except Exception as e: lib = None diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index 2041589b3..6cf64df28 100644 --- a/bitsandbytes/functional.py +++ 
b/bitsandbytes/functional.py @@ -27,6 +27,35 @@ def prod(iterable): if lib and lib.compiled_with_cuda: """C FUNCTIONS FOR OPTIMIZERS""" + str2optimizer32bit = { + "adam": ( + lib.cadam32bit_grad_fp32, + lib.cadam32bit_grad_fp16, + lib.cadam32bit_grad_bf16, + ), + "momentum": ( + lib.cmomentum32bit_grad_32, + lib.cmomentum32bit_grad_16, + ), + "rmsprop": ( + lib.crmsprop32bit_grad_32, + lib.crmsprop32bit_grad_16, + ), + "lion": ( + lib.clion32bit_grad_fp32, + lib.clion32bit_grad_fp16, + lib.clion32bit_grad_bf16, + ), + "adagrad": ( + lib.cadagrad32bit_grad_32, + lib.cadagrad32bit_grad_16, + ), + "lamb": ( + lib.cadam32bit_grad_fp32, + lib.cadam32bit_grad_fp16, + ), + } + str2optimizer8bit = { "adam": ( lib.cadam_static_8bit_grad_32, diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index 7ab070785..c92b25e2c 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -240,7 +240,7 @@ def __new__( return self def __getstate__(self): - state = self.__dict__ + state = self.__dict__.copy() state["data"] = self.data state["requires_grad"] = self.requires_grad return state @@ -286,6 +286,9 @@ def from_prequantized( self.compress_statistics = self.quant_state.nested self.quant_type = self.quant_state.quant_type self.bnb_quantized = True + + self.quant_storage = data.dtype + return self def _quantize(self, device): @@ -340,6 +343,7 @@ def to(self, *args, **kwargs): blocksize=self.blocksize, compress_statistics=self.compress_statistics, quant_type=self.quant_type, + quant_storage=self.quant_storage, ) return new_param @@ -457,7 +461,7 @@ def forward(self, x: torch.Tensor): # since we registered the module, we can recover the state here assert self.weight.shape[1] == 1 if not isinstance(self.weight, Params4bit): - self.weight = Params4bit(self.weight, quant_storage=self.quant_storage) + self.weight = Params4bit(self.weight, quant_storage=self.quant_storage, bnb_quantized=True) self.weight.quant_state = self.quant_state else: print( @@ -567,13 +571,12 @@ def __new__( CB=None, SCB=None, ): - cls.has_fp16_weights = has_fp16_weights - cls.CB = None - cls.SCB = None if data is None: data = torch.empty(0) obj = torch.Tensor._make_subclass(cls, data, requires_grad) - obj.CB, obj.SCB = cls.CB, cls.SCB + obj.CB = CB + obj.SCB = SCB + obj.has_fp16_weights = has_fp16_weights return obj def cuda(self, device): @@ -592,6 +595,18 @@ def cuda(self, device): return self + def __deepcopy__(self, memo): + # adjust this if new arguments are added to the constructor + new_instance = type(self).__new__( + type(self), + data=copy.deepcopy(self.data, memo), + requires_grad=self.requires_grad, + has_fp16_weights=self.has_fp16_weights, + CB=copy.deepcopy(self.CB, memo), + SCB=copy.deepcopy(self.SCB, memo), + ) + return new_instance + def cpu(self): # we store the 8-bit rows-major weight B = self.data.contiguous().bfloat16().cpu() diff --git a/bitsandbytes/optim/optimizer.py b/bitsandbytes/optim/optimizer.py index f1e60e5e7..e9c857d49 100644 --- a/bitsandbytes/optim/optimizer.py +++ b/bitsandbytes/optim/optimizer.py @@ -437,7 +437,7 @@ def init_state(self, group, p, gindex, pindex): state = self.state[p] state["step"] = 0 - if dtype == torch.float32 or (dtype == torch.uint8 and p.numel() < 4096): + if dtype == torch.float32: state["state1"] = self.get_state_buffer(p, dtype=torch.float32) state["state2"] = self.get_state_buffer(p, dtype=torch.float32) elif dtype == torch.uint8: @@ -474,6 +474,10 @@ def init_state(self, group, p, gindex, pindex): @torch.no_grad() def update_step(self, group, 
p, gindex, pindex): + # avoid update error from non-contiguous memory layout + p.data = p.data.contiguous() + p.grad = p.grad.contiguous() + state = self.state[p] grad = p.grad @@ -656,7 +660,7 @@ def init_state(self, group, p, gindex, pindex): state = self.state[p] state["step"] = 0 - if dtype == torch.float32 or (dtype == torch.uint8 and p.numel() < 4096): + if dtype == torch.float32: state["state1"] = self.get_state_buffer(p, dtype=torch.float32) elif dtype == torch.uint8: if state["step"] == 0: @@ -685,6 +689,10 @@ def init_state(self, group, p, gindex, pindex): @torch.no_grad() def update_step(self, group, p, gindex, pindex): + # avoid update error from non-contiguous memory layout + p.data = p.data.contiguous() + p.grad = p.grad.contiguous() + state = self.state[p] grad = p.grad diff --git a/csrc/kernels.cu b/csrc/kernels.cu index f4673359b..e4d459961 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -12,8 +12,6 @@ #include #include #include -#include -#include #include diff --git a/csrc/kernels.hip b/csrc/kernels.hip index ca77dceda..d8d7cdba5 100644 --- a/csrc/kernels.hip +++ b/csrc/kernels.hip @@ -10,8 +10,6 @@ #include #include -#include -#include //#include diff --git a/csrc/ops.cuh b/csrc/ops.cuh index da9df6af0..8b9a4f449 100644 --- a/csrc/ops.cuh +++ b/csrc/ops.cuh @@ -19,10 +19,6 @@ #include #include -#include -#include - - #define CUDA_CHECK_RETURN(value) { \ cudaError_t _m_cudaStat = value; \ diff --git a/csrc/ops_hip.cuh b/csrc/ops_hip.cuh index 1b9c13063..e57cbb3b5 100644 --- a/csrc/ops_hip.cuh +++ b/csrc/ops_hip.cuh @@ -21,12 +21,6 @@ #include #include -/* -#include -#include -*/ - - #define CUDA_CHECK_RETURN(value) { \ hipError_t _m_cudaStat = value; \ if (_m_cudaStat != hipSuccess) { \ diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index caf22488f..3ed694ac1 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -1,8 +1,10 @@ # Installation -bitsandbytes is only supported on CUDA GPUs for CUDA versions **11.0 - 12.3**. +## CUDA -The latest version of bitsandbytes (v0.43.0) builds on: +bitsandbytes is only supported on CUDA GPUs for CUDA versions **11.0 - 12.5**. However, there's a multi-backend effort under way which is currently in alpha release, check [the respective section below in case you're interested to help us with early feedback](#multi-backend). + +The latest version of bitsandbytes builds on: | OS | CUDA | Compiler | |---|---|---| @@ -29,7 +31,7 @@ To install from PyPI. pip install bitsandbytes ``` -## Compile from source +### Compile from source[[compile]] For Linux and Windows systems, you can compile bitsandbytes from source. Installing from source allows for more build options with different CMake configurations. @@ -59,7 +61,7 @@ git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/ pip install -r requirements-dev.txt cmake -DCOMPUTE_BACKEND=cuda -S . make -pip install . +pip install -e . # `-e` for "editable" install, when developing BNB (otherwise leave that out) ``` > [!TIP] @@ -83,7 +85,7 @@ git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/ pip install -r requirements-dev.txt cmake -DCOMPUTE_BACKEND=cuda -S . cmake --build . --config Release -python -m build --wheel +pip install -e . 
# `-e` for "editable" install, when developing BNB (otherwise leave that out) ``` Big thanks to [wkpark](https://github.com/wkpark), [Jamezo97](https://github.com/Jamezo97), [rickardp](https://github.com/rickardp), [akx](https://github.com/akx) for their amazing contributions to make bitsandbytes compatible with Windows. @@ -91,7 +93,7 @@ Big thanks to [wkpark](https://github.com/wkpark), [Jamezo97](https://github.com -## PyTorch CUDA versions +### PyTorch CUDA versions Some bitsandbytes features may need a newer CUDA version than the one currently supported by PyTorch binaries from Conda and pip. In this case, you should follow these instructions to load a precompiled bitsandbytes binary. @@ -105,7 +107,7 @@ Then locally install the CUDA version you need with this script from bitsandbyte ```bash wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh # Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH -# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122, 123, 124} +# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122, 123, 124, 125} # EXPORT_TO_BASH in {0, 1} with 0=False and 1=True # For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc @@ -127,7 +129,78 @@ For example, to use a local install path: ```bash export BNB_CUDA_VERSION=117 -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/tim/local/cuda-11.7 +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/YOUR_USERNAME/local/cuda-11.7 ``` 3. Now when you launch bitsandbytes with these environment variables, the PyTorch CUDA version is overridden by the new CUDA version (in this example, version 11.7) and a different bitsandbytes library is loaded. + +## Multi-backend preview release compilation[[multi-backend]] + +Please follow these steps to install bitsandbytes with device-specific backend support other than CUDA: + + + + +### AMD GPU + +bitsandbytes is fully supported from ROCm 6.1 onwards (currently in alpha release). + +> [!TIP] +> If you would like to install ROCm and PyTorch on bare metal, skip Docker steps and refer to our official guides at [ROCm installation overview](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/install-overview.html#rocm-install-overview) and [Installing PyTorch for ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/3rd-party/pytorch-install.html#using-wheels-package) (Step 3 of wheels build for quick installation). Please make sure to get PyTorch wheel for the installed ROCm version. 
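Before building, it can help to confirm that the ROCm toolchain and a ROCm-enabled PyTorch are actually visible in your environment, whether you use the Docker image below or a bare-metal install. A minimal sanity check, assuming `hipconfig` and PyTorch are already installed:

```bash
# Show the installed ROCm/HIP version (the same tool the CMake build queries)
hipconfig --version

# Confirm PyTorch was built for ROCm and can see at least one GPU
python -c "import torch; print(torch.version.hip, torch.cuda.is_available())"
```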
+ +```bash +# Create a docker container with latest ROCm image, which includes ROCm libraries +docker pull rocm/dev-ubuntu-22.04:6.1.2-complete +docker run -it --device=/dev/kfd --device=/dev/dri --group-add video rocm/dev-ubuntu-22.04:6.1.2-complete +apt-get update && apt-get install -y git && cd home + +# Install PyTorch compatible with the above ROCm version +pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.1/ + +# Install bitsandbytes from PyPI +# (This is supported on Ubuntu 22.04, Python 3.10, ROCm 6.1.0/6.1.1/6.1.2 and gpu arch - gfx90a, gfx942, gfx1100 +# Please install from source if your configuration doesn't match with these) +pip install bitsandbytes + +# Install bitsandbytes from source +# Clone bitsandbytes repo, ROCm backend is currently enabled on multi-backend-refactor branch +git clone --depth 1 -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/ + +# Install dependencies +pip install -r requirements-dev.txt + +# Compile & install +apt-get install -y build-essential cmake # install build tools dependencies, unless present +cmake -DCOMPUTE_BACKEND=hip -S . # Use -DBNB_ROCM_ARCH="gfx90a;gfx942" to target specific gpu arch +make +pip install -e . # `-e` for "editable" install, when developing BNB (otherwise leave that out) +``` + + + + +### Intel CPU + +> [!TIP] +> Intel CPU backend only supports building from source; for now, please follow the instructions below. + +Similar to the CUDA case, you can compile bitsandbytes from source for Linux and Windows systems. + +The below commands are for Linux. For installing on Windows, please adapt the below commands according to the same pattern as described in [the section above on compiling from source under the Windows tab](#compile). + +``` +git clone --depth 1 -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/ +pip install intel_extension_for_pytorch +pip install -r requirements-dev.txt +cmake -DCOMPUTE_BACKEND=cpu -S . +make +pip install -e . # `-e` for "editable" install, when developing BNB (otherwise leave that out) +``` + + + + +WIP + + + diff --git a/docs/source/rocm_installation.mdx b/docs/source/rocm_installation.mdx deleted file mode 100644 index 5d4381e7d..000000000 --- a/docs/source/rocm_installation.mdx +++ /dev/null @@ -1,46 +0,0 @@ -# ROCm Installation - -Please follow these steps to install bitsandbytes on ROCm. - - - - -For latest installation: - -```bash -git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/ -pip install -r requirements-dev.txt -cmake -DCOMPUTE_BACKEND=hip -S . #Use -DBNB_ROCM_ARCH="gfx90a;gfx942" to target specific gpu arch -make -pip install . 
-``` - - - - -For ROCm specific versions: - -Install Dependencies: - -```bash -# hipblaslt installation needed only for rocm<6.0 -apt install hipblaslt -pip install --upgrade pip -pip install einops lion_pytorch accelerate -pip install git+https://github.com/ROCm/transformers.git -``` - -Install bitsandbytes from [ROCm](https://github.com/ROCm/bitsandbytes) repo: - -```bash -git clone --recurse https://github.com/ROCm/bitsandbytes -cd bitsandbytes -# Checkout branch as needed -# for rocm 5.7 - rocm5.7_internal_testing -# for rocm 6.x - rocm6.2_internal_testing -git checkout -make hip -python setup.py install -``` - - diff --git a/install_cuda.py b/install_cuda.py index cf7c8ee71..8267c5e2b 100644 --- a/install_cuda.py +++ b/install_cuda.py @@ -17,7 +17,8 @@ "121": "https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda_12.1.1_530.30.02_linux.run", "122": "https://developer.download.nvidia.com/compute/cuda/12.2.2/local_installers/cuda_12.2.2_535.104.05_linux.run", "123": "https://developer.download.nvidia.com/compute/cuda/12.3.2/local_installers/cuda_12.3.2_545.23.08_linux.run", - "124": "https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_550.54.14_linux.run", + "124": "https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux.run", + "125": "https://developer.download.nvidia.com/compute/cuda/12.5.0/local_installers/cuda_12.5.0_555.42.02_linux.run", } diff --git a/install_cuda.sh b/install_cuda.sh index 2e7fe8ed2..0aa9531fc 100644 --- a/install_cuda.sh +++ b/install_cuda.sh @@ -11,7 +11,8 @@ URL120=https://developer.download.nvidia.com/compute/cuda/12.0.1/local_installer URL121=https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda_12.1.1_530.30.02_linux.run URL122=https://developer.download.nvidia.com/compute/cuda/12.2.2/local_installers/cuda_12.2.2_535.104.05_linux.run URL123=https://developer.download.nvidia.com/compute/cuda/12.3.2/local_installers/cuda_12.3.2_545.23.08_linux.run -URL124=https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_550.54.14_linux.run +URL124=https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux.run +URL125=https://developer.download.nvidia.com/compute/cuda/12.5.0/local_installers/cuda_12.5.0_555.42.02_linux.run CUDA_VERSION=$1 BASE_PATH=$2 @@ -60,11 +61,14 @@ if [[ -n "$CUDA_VERSION" ]]; then elif [[ "$CUDA_VERSION" -eq "124" ]]; then URL=$URL124 FOLDER=cuda-12.4 + elif [[ "$CUDA_VERSION" -eq "125" ]]; then + URL=$URL125 + FOLDER=cuda-12.5 else - echo "argument error: No cuda version passed as input. Choose among versions 110 to 124" + echo "argument error: No cuda version passed as input. Choose among versions 110 to 125" fi else - echo "argument error: No cuda version passed as input. Choose among versions 92 to 123" + echo "argument error: No cuda version passed as input. 
Choose among versions 110 to 125" fi FILE=$(basename $URL) diff --git a/requirements-ci.txt b/requirements-ci.txt index 0e9dd2407..182e1023e 100644 --- a/requirements-ci.txt +++ b/requirements-ci.txt @@ -1,6 +1,6 @@ # Requirements used for GitHub actions -pytest==8.2.1 +pytest==8.3.1 einops==0.8.0 -lion-pytorch==0.1.4 +lion-pytorch==0.2.2 scipy==1.10.1; python_version < "3.9" -scipy==1.13.1; python_version >= "3.9" +scipy==1.14.0; python_version >= "3.9" diff --git a/requirements-dev.txt b/requirements-dev.txt index de7adce94..41211880c 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,9 +1,9 @@ # Requirements used for local development setuptools>=63 -pytest~=8.2.1 +pytest~=8.3.1 einops~=0.8.0 wheel~=0.43.0 -lion-pytorch~=0.1.4 -scipy~=1.13.1 +lion-pytorch~=0.2.2 +scipy~=1.14.0 pandas~=2.2.2 -matplotlib~=3.9.0 +matplotlib~=3.9.1 diff --git a/setup.py b/setup.py index f8d6a92a1..18de0fe5b 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ def has_ext_modules(self): setup( name="bitsandbytes", - version="0.43.2.dev", + version="0.43.3.dev", author="Tim Dettmers", author_email="dettmers@cs.washington.edu", description="k-bit optimizers and matrix multiplication routines.", diff --git a/tests/test_functional.py b/tests/test_functional.py index 4e82c530a..a9d926b89 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -584,6 +584,9 @@ def test_nvidia_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, trans @pytest.mark.parametrize("ldb", (0,), ids=id_formatter("ldb")) @pytest.mark.parametrize("device", ("cuda", "cpu"), ids=id_formatter("device")) def test_igemmlt_int(dim1, dim2, dim3, dim4, dims, ldb, device): + if HIP_ENVIRONMENT and device == "cpu": + pytest.skip("this test is not supported on ROCm yet") + for i in range(k): if dims == 2: A = torch.randint(-128, 127, size=(dim1, dim3), device=device).to(torch.int8) diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py index bbbd05335..2f094be27 100644 --- a/tests/test_linear4bit.py +++ b/tests/test_linear4bit.py @@ -186,19 +186,30 @@ def test_copy_param(): def test_deepcopy_param(): tensor = torch.tensor([1.0, 2.0, 3.0, 4.0]) param = bnb.nn.Params4bit(data=tensor, requires_grad=False).cuda(0) + dict_keys_before = set(param.__dict__.keys()) copy_param = copy.deepcopy(param) + dict_keys_after = set(param.__dict__.keys()) + dict_keys_copy = set(copy_param.__dict__.keys()) + assert param.quant_state is not copy_param.quant_state assert param.data.data_ptr() != copy_param.data.data_ptr() + # there was a bug where deepcopy would modify the original object + assert dict_keys_before == dict_keys_after + assert dict_keys_before == dict_keys_copy + def test_params4bit_real_serialization(): original_tensor = torch.tensor([1.0, 2.0, 3.0, 4.0], dtype=torch.float32) original_param = bnb.nn.Params4bit(data=original_tensor, quant_type="fp4") + dict_keys_before = set(original_param.__dict__.keys()) original_param.cuda(0) # move to CUDA to trigger quantization serialized_param = pickle.dumps(original_param) deserialized_param = pickle.loads(serialized_param) + dict_keys_after = set(original_param.__dict__.keys()) + dict_keys_deserialized = set(deserialized_param.__dict__.keys()) assert torch.equal(original_param.data, deserialized_param.data) assert original_param.requires_grad == deserialized_param.requires_grad == False @@ -206,3 +217,7 @@ def test_params4bit_real_serialization(): assert original_param.blocksize == deserialized_param.blocksize assert original_param.compress_statistics == 
deserialized_param.compress_statistics assert original_param.quant_state == deserialized_param.quant_state + + # there was a bug where deepcopy would modify the original object + assert dict_keys_before == dict_keys_after + assert dict_keys_before == dict_keys_deserialized diff --git a/tests/test_linear8bitlt.py b/tests/test_linear8bitlt.py index 2a4bd02e2..c4409cc2e 100644 --- a/tests/test_linear8bitlt.py +++ b/tests/test_linear8bitlt.py @@ -1,5 +1,7 @@ from contextlib import nullcontext +import copy import os +import pickle from tempfile import TemporaryDirectory import pytest @@ -181,3 +183,59 @@ def test_linear_serialization( assert torch.allclose(x_first.grad, x_second.grad, atol=1e-5) assert torch.allclose(fx_first, fx_third, atol=1e-5) assert torch.allclose(x_first.grad, x_third.grad, atol=1e-5) + + +@pytest.fixture +def linear8bit(requires_cuda): + linear = torch.nn.Linear(32, 96) + linear_custom = Linear8bitLt( + linear.in_features, + linear.out_features, + linear.bias is not None, + has_fp16_weights=False, + threshold=6.0, + ) + linear_custom.weight = bnb.nn.Int8Params( + linear.weight.data.clone(), + requires_grad=False, + has_fp16_weights=False, + ) + linear_custom.bias = linear.bias + linear_custom = linear_custom.cuda() + return linear_custom + + +def test_linear8bit_copy_param(linear8bit): + shallow_copy = copy.copy(linear8bit) + assert linear8bit.weight is shallow_copy.weight + assert linear8bit.bias is shallow_copy.bias + assert linear8bit.weight.data.data_ptr() == shallow_copy.weight.data.data_ptr() + + +def test_linear8bit_deepcopy_param(linear8bit): + deep_copy = copy.deepcopy(linear8bit) + assert linear8bit.weight is not deep_copy.weight + assert linear8bit.bias is not deep_copy.bias + assert linear8bit.weight.data.data_ptr() != deep_copy.weight.data.data_ptr() + assert torch.allclose(linear8bit.weight.data, deep_copy.weight.data) + assert linear8bit.state == deep_copy.state + + # check for a bug where SCB and CB were not copied + assert deep_copy.weight.SCB is not None + assert (linear8bit.weight.SCB == deep_copy.weight.SCB).all() + assert deep_copy.weight.CB is not None + assert (linear8bit.weight.CB == deep_copy.weight.CB).all() + + +def test_linear8bit_serialization(linear8bit): + serialized = pickle.dumps(linear8bit) + deserialized = pickle.loads(serialized) + assert linear8bit.weight.data.data_ptr() != deserialized.weight.data.data_ptr() + assert torch.allclose(linear8bit.weight.data, deserialized.weight.data) + assert linear8bit.bias.data.data_ptr() != deserialized.bias.data.data_ptr() + assert torch.allclose(linear8bit.bias.data, deserialized.bias.data) + assert linear8bit.state == deserialized.state + + # check for a bug where SCB and CB were not copied + assert (linear8bit.weight.SCB == deserialized.weight.SCB).all() + assert (linear8bit.weight.CB == deserialized.weight.CB).all() diff --git a/tests/test_modules.py b/tests/test_modules.py index 8235b600c..1947ba52d 100644 --- a/tests/test_modules.py +++ b/tests/test_modules.py @@ -620,7 +620,7 @@ def test_fp8linear(): assert bgraderr < 0.00002 -def test_4bit_warnings(): +def test_4bit_warnings(requires_cuda): dim1 = 64 with pytest.warns(UserWarning, match=r"inference or training"):