From a9a1c44abaf671f06794f6664fe3ca47bcade5e5 Mon Sep 17 00:00:00 2001
From: EtienneDosSantos <130935112+EtienneDosSantos@users.noreply.github.com>
Date: Sun, 26 May 2024 15:43:24 +0200
Subject: [PATCH 01/54] Add `"lamb"` to `str2optimizer32bit`

---
 bitsandbytes/functional.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index f915223ca..dc1490482 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -49,6 +49,10 @@ def prod(iterable):
             lib.cadagrad32bit_grad_32,
             lib.cadagrad32bit_grad_16,
         ),
+        "lamb": (
+            lib.cadam32bit_grad_fp32,
+            lib.cadam32bit_grad_fp16,
+            ),
     }
 
     str2optimizer8bit = {

From 2e46eefcb214cffc0fb9d6ace71f53924f9c7873 Mon Sep 17 00:00:00 2001
From: EtienneDosSantos <130935112+EtienneDosSantos@users.noreply.github.com>
Date: Tue, 28 May 2024 18:35:31 +0200
Subject: [PATCH 02/54] Sorted alphabetically for better overview

---
 bitsandbytes/functional.py | 64 +++++++++++++++++++-------------------
 1 file changed, 32 insertions(+), 32 deletions(-)

diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index dc1490482..0b1e7d5c4 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -27,11 +27,24 @@ def prod(iterable):
 if lib and lib.compiled_with_cuda:
     """C FUNCTIONS FOR OPTIMIZERS"""
     str2optimizer32bit = {
+        "adagrad": (
+            lib.cadagrad32bit_grad_32,
+            lib.cadagrad32bit_grad_16,
+        ),
         "adam": (
             lib.cadam32bit_grad_fp32,
             lib.cadam32bit_grad_fp16,
             lib.cadam32bit_grad_bf16,
         ),
+        "lamb": (
+            lib.cadam32bit_grad_fp32,
+            lib.cadam32bit_grad_fp16,
+        ),
+        "lion": (
+            lib.clion32bit_grad_fp32,
+            lib.clion32bit_grad_fp16,
+            lib.clion32bit_grad_bf16,
+        ),
         "momentum": (
             lib.cmomentum32bit_grad_32,
             lib.cmomentum32bit_grad_16,
@@ -40,19 +53,6 @@ def prod(iterable):
             lib.crmsprop32bit_grad_32,
             lib.crmsprop32bit_grad_16,
         ),
-        "lion": (
-            lib.clion32bit_grad_fp32,
-            lib.clion32bit_grad_fp16,
-            lib.clion32bit_grad_bf16,
-        ),
-        "adagrad": (
-            lib.cadagrad32bit_grad_32,
-            lib.cadagrad32bit_grad_16,
-        ),
-        "lamb": (
-            lib.cadam32bit_grad_fp32,
-            lib.cadam32bit_grad_fp16,
-            ),
     }
 
     str2optimizer8bit = {
@@ -60,34 +60,43 @@ def prod(iterable):
             lib.cadam_static_8bit_grad_32,
             lib.cadam_static_8bit_grad_16,
         ),
-        "momentum": (
+        "lamb": (
+            lib.cadam_static_8bit_grad_32,
+            lib.cadam_static_8bit_grad_16,
+        ),
+        "lars": (
             lib.cmomentum_static_8bit_grad_32,
             lib.cmomentum_static_8bit_grad_16,
         ),
-        "rmsprop": (
-            lib.crmsprop_static_8bit_grad_32,
-            lib.crmsprop_static_8bit_grad_16,
-        ),
         "lion": (
             lib.clion_static_8bit_grad_32,
             lib.clion_static_8bit_grad_16,
         ),
-        "lamb": (
-            lib.cadam_static_8bit_grad_32,
-            lib.cadam_static_8bit_grad_16,
-        ),
-        "lars": (
+        "momentum": (
             lib.cmomentum_static_8bit_grad_32,
             lib.cmomentum_static_8bit_grad_16,
         ),
+        "rmsprop": (
+            lib.crmsprop_static_8bit_grad_32,
+            lib.crmsprop_static_8bit_grad_16,
+        ),
     }
 
     str2optimizer8bit_blockwise = {
+        "adagrad": (
+            lib.cadagrad_8bit_blockwise_grad_fp32,
+            lib.cadagrad_8bit_blockwise_grad_fp16,
+        ),
         "adam": (
             lib.cadam_8bit_blockwise_grad_fp32,
             lib.cadam_8bit_blockwise_grad_fp16,
             lib.cadam_8bit_blockwise_grad_bf16,
         ),
+        "lion": (
+            lib.clion_8bit_blockwise_grad_fp32,
+            lib.clion_8bit_blockwise_grad_fp16,
+            lib.clion_8bit_blockwise_grad_bf16,
+        ),
         "momentum": (
             lib.cmomentum_8bit_blockwise_grad_fp32,
             lib.cmomentum_8bit_blockwise_grad_fp16,
@@ -96,15 +105,6 @@ def prod(iterable):
             lib.crmsprop_8bit_blockwise_grad_fp32,
             lib.crmsprop_8bit_blockwise_grad_fp16,
         ),
-        "lion": (
-            lib.clion_8bit_blockwise_grad_fp32,
-            lib.clion_8bit_blockwise_grad_fp16,
-            lib.clion_8bit_blockwise_grad_bf16,
-        ),
-        "adagrad": (
-            lib.cadagrad_8bit_blockwise_grad_fp32,
-            lib.cadagrad_8bit_blockwise_grad_fp16,
-        ),
     }
 
 
From 7a338db2eccbd60b7da3b7bed9c927117c6b3806 Mon Sep 17 00:00:00 2001
From: EtienneDosSantos <130935112+EtienneDosSantos@users.noreply.github.com>
Date: Tue, 28 May 2024 19:53:57 +0200
Subject: [PATCH 03/54] Update functional.py

---
 bitsandbytes/functional.py | 88 ++++++++++++++++++++++++++++++--------
 1 file changed, 70 insertions(+), 18 deletions(-)

diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 0b1e7d5c4..bbfbf0007 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -28,57 +28,94 @@ def prod(iterable):
     """C FUNCTIONS FOR OPTIMIZERS"""
     str2optimizer32bit = {
         "adagrad": (
-            lib.cadagrad32bit_grad_32,
-            lib.cadagrad32bit_grad_16,
+            lib.cadagrad32bit_grad_fp32,
+            lib.cadagrad32bit_grad_fp16,
         ),
         "adam": (
             lib.cadam32bit_grad_fp32,
             lib.cadam32bit_grad_fp16,
             lib.cadam32bit_grad_bf16,
         ),
+        "pagedadam": (
+            lib.cpagedadam32bit_grad_fp32,
+            lib.cpagedadam32bit_grad_fp16,
+            lib.cpagedadam32bit_grad_bf16,
+        ),
+        "adamw": (
+            lib.cadam32bit_grad_fp32,
+            lib.cadam32bit_grad_fp16,
+            lib.cadam32bit_grad_bf16,
+        ),
+        "pagedadamw": (
+            lib.cpagedadam32bit_grad_fp32,
+            lib.cpagedadam32bit_grad_fp16,
+            lib.cpagedadam32bit_grad_bf16,
+        ),
         "lamb": (
             lib.cadam32bit_grad_fp32,
             lib.cadam32bit_grad_fp16,
         ),
+        "lars": (
+            lib.clars32bit_grad_fp32,
+            lib.clars32bit_grad_fp16,
+        ),
         "lion": (
             lib.clion32bit_grad_fp32,
             lib.clion32bit_grad_fp16,
             lib.clion32bit_grad_bf16,
         ),
         "momentum": (
-            lib.cmomentum32bit_grad_32,
-            lib.cmomentum32bit_grad_16,
+            lib.cmomentum32bit_grad_fp32,
+            lib.cmomentum32bit_grad_fp16,
         ),
         "rmsprop": (
-            lib.crmsprop32bit_grad_32,
-            lib.crmsprop32bit_grad_16,
+            lib.crmsprop32bit_grad_fp32,
+            lib.crmsprop32bit_grad_fp16,
         ),
     }
 
     str2optimizer8bit = {
+        "adagrad": (
+            lib.cadagrad8bit_grad_fp32,
+            lib.cadagrad8bit_grad_fp16,
+        ),
         "adam": (
-            lib.cadam_static_8bit_grad_32,
-            lib.cadam_static_8bit_grad_16,
+            lib.cadam_static_8bit_grad_fp32,
+            lib.cadam_static_8bit_grad_fp16,
+        ),
+        "pagedadam": (
+            lib.cpagedadam8bit_grad_fp32,
+            lib.cpagedadam8bit_grad_fp16,
+            lib.cpagedadam8bit_grad_bf16,
+        ),
+        "adamw": (
+            lib.cadam_static_8bit_grad_fp32,
+            lib.cadam_static_8bit_grad_fp16,
+        ),
+        "pagedadamw": (
+            lib.cpagedadam8bit_grad_fp32,
+            lib.cpagedadam8bit_grad_fp16,
+            lib.cpagedadam8bit_grad_bf16,
         ),
         "lamb": (
-            lib.cadam_static_8bit_grad_32,
-            lib.cadam_static_8bit_grad_16,
+            lib.cadam_static_8bit_grad_fp32,
+            lib.cadam_static_8bit_grad_fp16,
         ),
         "lars": (
-            lib.cmomentum_static_8bit_grad_32,
-            lib.cmomentum_static_8bit_grad_16,
+            lib.clars8bit_grad_fp32,
+            lib.clars8bit_grad_fp16,
         ),
         "lion": (
-            lib.clion_static_8bit_grad_32,
-            lib.clion_static_8bit_grad_16,
+            lib.clion_static_8bit_grad_fp32,
+            lib.clion_static_8bit_grad_fp16,
         ),
         "momentum": (
-            lib.cmomentum_static_8bit_grad_32,
-            lib.cmomentum_static_8bit_grad_16,
+            lib.cmomentum_static_8bit_grad_fp32,
+            lib.cmomentum_static_8bit_grad_fp16,
         ),
         "rmsprop": (
-            lib.crmsprop_static_8bit_grad_32,
-            lib.crmsprop_static_8bit_grad_16,
+            lib.crmsprop_static_8bit_grad_fp32,
+            lib.crmsprop_static_8bit_grad_fp16,
         ),
     }
 
@@ -92,6 +129,21 @@ def prod(iterable):
             lib.cadam_8bit_blockwise_grad_fp16,
             lib.cadam_8bit_blockwise_grad_bf16,
         ),
+        "pagedadam": (
+            lib.cpagedadam8bit_blockwise_fp32,
+            lib.cpagedadam8bit_blockwise_fp16,
+            lib.cpagedadam8bit_blockwise_bf16,
+        ),
+        "adamw": (
+            lib.cadam_8bit_blockwise_grad_fp32,
+            lib.cadam_8bit_blockwise_grad_fp16,
+            lib.cadam_8bit_blockwise_grad_bf16,
+        ),
+        "pagedadamw": (
+            lib.cpagedadam8bit_blockwise_fp32,
+            lib.cpagedadam8bit_blockwise_fp16,
+            lib.cpagedadam8bit_blockwise_bf16,
+        ),
         "lion": (
             lib.clion_8bit_blockwise_grad_fp32,
             lib.clion_8bit_blockwise_grad_fp16,

From 2fb212bdf983891451db73a4c4bef6c91ec0786d Mon Sep 17 00:00:00 2001
From: Benjamin Bossan <benjamin.bossan@gmail.com>
Date: Wed, 29 May 2024 15:36:44 +0200
Subject: [PATCH 04/54] FIX Prevent __getstate__ from mutating Params4bit

As discussed internally, use state = self.__dict__.copy(), which is also
what the Python docs recommend.
---
 bitsandbytes/nn/modules.py |  2 +-
 tests/test_linear4bit.py   | 15 +++++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 24a155ab1..df347ebba 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -236,7 +236,7 @@ def __new__(
         return self
 
     def __getstate__(self):
-        state = self.__dict__
+        state = self.__dict__.copy()
         state["data"] = self.data
         state["requires_grad"] = self.requires_grad
         return state
diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py
index bbbd05335..2f094be27 100644
--- a/tests/test_linear4bit.py
+++ b/tests/test_linear4bit.py
@@ -186,19 +186,30 @@ def test_copy_param():
 def test_deepcopy_param():
     tensor = torch.tensor([1.0, 2.0, 3.0, 4.0])
     param = bnb.nn.Params4bit(data=tensor, requires_grad=False).cuda(0)
+    dict_keys_before = set(param.__dict__.keys())
     copy_param = copy.deepcopy(param)
+    dict_keys_after = set(param.__dict__.keys())
+    dict_keys_copy = set(copy_param.__dict__.keys())
+
     assert param.quant_state is not copy_param.quant_state
     assert param.data.data_ptr() != copy_param.data.data_ptr()
 
+    # there was a bug where deepcopy would modify the original object
+    assert dict_keys_before == dict_keys_after
+    assert dict_keys_before == dict_keys_copy
+
 
 def test_params4bit_real_serialization():
     original_tensor = torch.tensor([1.0, 2.0, 3.0, 4.0], dtype=torch.float32)
     original_param = bnb.nn.Params4bit(data=original_tensor, quant_type="fp4")
+    dict_keys_before = set(original_param.__dict__.keys())
 
     original_param.cuda(0)  # move to CUDA to trigger quantization
 
     serialized_param = pickle.dumps(original_param)
     deserialized_param = pickle.loads(serialized_param)
+    dict_keys_after = set(original_param.__dict__.keys())
+    dict_keys_deserialized = set(deserialized_param.__dict__.keys())
 
     assert torch.equal(original_param.data, deserialized_param.data)
     assert original_param.requires_grad == deserialized_param.requires_grad == False
@@ -206,3 +217,7 @@ def test_params4bit_real_serialization():
     assert original_param.blocksize == deserialized_param.blocksize
     assert original_param.compress_statistics == deserialized_param.compress_statistics
     assert original_param.quant_state == deserialized_param.quant_state
+
+    # there was a bug where pickling (via __getstate__) would modify the original object
+    assert dict_keys_before == dict_keys_after
+    assert dict_keys_before == dict_keys_deserialized

From ed99b3c118d73e829e7581c04a414b80ee2b7030 Mon Sep 17 00:00:00 2001
From: Benjamin Bossan <benjamin.bossan@gmail.com>
Date: Thu, 30 May 2024 17:03:48 +0200
Subject: [PATCH 05/54] FIX Make Int8Params deepcopy-able

This requires implementing the __deepcopy__ method in Int8Params.
Moreover, there was an issue in the Int8Params constructor (`__new__`) that
would assign instance attributes to the class, which is now fixed.

Please review carefully that this does not impact existing code.

Tests that I ran:

- pytest tests/test_linear8bitlt.py
- in PEFT: python -m pytest -m "single_gpu_tests and bitsandbytes" tests/test_gpu_examples.py
- in PEFT: python -m pytest -m "single_gpu_tests and bitsandbytes" tests/test_common_gpu.py
- in transformers: RUN_SLOW=1 python -m pytest tests/quantization/bnb -x
---
 bitsandbytes/nn/modules.py | 19 ++++++++++---
 tests/test_linear8bitlt.py | 58 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+), 4 deletions(-)

diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 24a155ab1..e44c77ac6 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -560,13 +560,12 @@ def __new__(
         CB=None,
         SCB=None,
     ):
-        cls.has_fp16_weights = has_fp16_weights
-        cls.CB = None
-        cls.SCB = None
         if data is None:
             data = torch.empty(0)
         obj = torch.Tensor._make_subclass(cls, data, requires_grad)
-        obj.CB, obj.SCB = cls.CB, cls.SCB
+        obj.CB = CB
+        obj.SCB = SCB
+        obj.has_fp16_weights = has_fp16_weights
         return obj
 
     def cuda(self, device):
@@ -585,6 +584,18 @@ def cuda(self, device):
 
         return self
 
+    def __deepcopy__(self, memo):
+        # adjust this if new arguments are added to the constructor
+        new_instance = type(self).__new__(
+            type(self),
+            data=copy.deepcopy(self.data, memo),
+            requires_grad=self.requires_grad,
+            has_fp16_weights=self.has_fp16_weights,
+            CB=copy.deepcopy(self.CB, memo),
+            SCB=copy.deepcopy(self.SCB, memo),
+        )
+        return new_instance
+
     @overload
     def to(
         self: T,
diff --git a/tests/test_linear8bitlt.py b/tests/test_linear8bitlt.py
index 4b62abd6d..e55abe110 100644
--- a/tests/test_linear8bitlt.py
+++ b/tests/test_linear8bitlt.py
@@ -1,5 +1,7 @@
 from contextlib import nullcontext
+import copy
 import os
+import pickle
 from tempfile import TemporaryDirectory
 
 import pytest
@@ -177,3 +179,59 @@ def test_linear_serialization(
         assert torch.allclose(x_first.grad, x_second.grad, atol=1e-5)
     assert torch.allclose(fx_first, fx_third, atol=1e-5)
     assert torch.allclose(x_first.grad, x_third.grad, atol=1e-5)
+
+
+@pytest.fixture
+def linear8bit():
+    linear = torch.nn.Linear(32, 96)
+    linear_custom = Linear8bitLt(
+        linear.in_features,
+        linear.out_features,
+        linear.bias is not None,
+        has_fp16_weights=False,
+        threshold=6.0,
+    )
+    linear_custom.weight = bnb.nn.Int8Params(
+        linear.weight.data.clone(),
+        requires_grad=False,
+        has_fp16_weights=False,
+    )
+    linear_custom.bias = linear.bias
+    linear_custom = linear_custom.cuda()
+    return linear_custom
+
+
+def test_linear8bit_copy_param(linear8bit):
+    shallow_copy = copy.copy(linear8bit)
+    assert linear8bit.weight is shallow_copy.weight
+    assert linear8bit.bias is shallow_copy.bias
+    assert linear8bit.weight.data.data_ptr() == shallow_copy.weight.data.data_ptr()
+
+
+def test_linear8bit_deepcopy_param(linear8bit):
+    deep_copy = copy.deepcopy(linear8bit)
+    assert linear8bit.weight is not deep_copy.weight
+    assert linear8bit.bias is not deep_copy.bias
+    assert linear8bit.weight.data.data_ptr() != deep_copy.weight.data.data_ptr()
+    assert torch.allclose(linear8bit.weight.data, deep_copy.weight.data)
+    assert linear8bit.state == deep_copy.state
+
+    # check for a bug where SCB and CB were not copied
+    assert deep_copy.weight.SCB is not None
+    assert (linear8bit.weight.SCB == deep_copy.weight.SCB).all()
+    assert deep_copy.weight.CB is not None
+    assert (linear8bit.weight.CB == deep_copy.weight.CB).all()
+
+
+def test_linear8bit_serialization(linear8bit):
+    serialized = pickle.dumps(linear8bit)
+    deserialized = pickle.loads(serialized)
+    assert linear8bit.weight.data.data_ptr() != deserialized.weight.data.data_ptr()
+    assert torch.allclose(linear8bit.weight.data, deserialized.weight.data)
+    assert linear8bit.bias.data.data_ptr() != deserialized.bias.data.data_ptr()
+    assert torch.allclose(linear8bit.bias.data, deserialized.bias.data)
+    assert linear8bit.state == deserialized.state
+
+    # check for a bug where SCB and CB were not copied
+    assert (linear8bit.weight.SCB == deserialized.weight.SCB).all()
+    assert (linear8bit.weight.CB == deserialized.weight.CB).all()

From b22ae26d4c9f0bec6b85987423737d447a78c387 Mon Sep 17 00:00:00 2001
From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
Date: Wed, 5 Jun 2024 14:36:37 +0200
Subject: [PATCH 06/54] fix for faulty #1222 ("Add `"lamb"` to
 `str2optimizer32bit`") (#1240)

* Revert "Add `"lamb"` to `str2optimizer32bit`"

* Update bitsandbytes/functional.py
---
 bitsandbytes/functional.py | 124 +++++++++++--------------------------
 1 file changed, 36 insertions(+), 88 deletions(-)

diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index bbfbf0007..cea3179a1 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -27,128 +27,67 @@ def prod(iterable):
 if lib and lib.compiled_with_cuda:
     """C FUNCTIONS FOR OPTIMIZERS"""
     str2optimizer32bit = {
-        "adagrad": (
-            lib.cadagrad32bit_grad_fp32,
-            lib.cadagrad32bit_grad_fp16,
-        ),
         "adam": (
             lib.cadam32bit_grad_fp32,
             lib.cadam32bit_grad_fp16,
             lib.cadam32bit_grad_bf16,
         ),
-        "pagedadam": (
-            lib.cpagedadam32bit_grad_fp32,
-            lib.cpagedadam32bit_grad_fp16,
-            lib.cpagedadam32bit_grad_bf16,
-        ),
-        "adamw": (
-            lib.cadam32bit_grad_fp32,
-            lib.cadam32bit_grad_fp16,
-            lib.cadam32bit_grad_bf16,
-        ),
-        "pagedadamw": (
-            lib.cpagedadam32bit_grad_fp32,
-            lib.cpagedadam32bit_grad_fp16,
-            lib.cpagedadam32bit_grad_bf16,
-        ),
-        "lamb": (
-            lib.cadam32bit_grad_fp32,
-            lib.cadam32bit_grad_fp16,
+        "momentum": (
+            lib.cmomentum32bit_grad_32,
+            lib.cmomentum32bit_grad_16,
         ),
-        "lars": (
-            lib.clars32bit_grad_fp32,
-            lib.clars32bit_grad_fp16,
+        "rmsprop": (
+            lib.crmsprop32bit_grad_32,
+            lib.crmsprop32bit_grad_16,
         ),
         "lion": (
             lib.clion32bit_grad_fp32,
             lib.clion32bit_grad_fp16,
             lib.clion32bit_grad_bf16,
         ),
-        "momentum": (
-            lib.cmomentum32bit_grad_fp32,
-            lib.cmomentum32bit_grad_fp16,
+        "adagrad": (
+            lib.cadagrad32bit_grad_32,
+            lib.cadagrad32bit_grad_16,
         ),
-        "rmsprop": (
-            lib.crmsprop32bit_grad_fp32,
-            lib.crmsprop32bit_grad_fp16,
+        "lamb": (
+            lib.cadam32bit_grad_fp32,
+            lib.cadam32bit_grad_fp16,
         ),
     }
 
     str2optimizer8bit = {
-        "adagrad": (
-            lib.cadagrad8bit_grad_fp32,
-            lib.cadagrad8bit_grad_fp16,
-        ),
         "adam": (
-            lib.cadam_static_8bit_grad_fp32,
-            lib.cadam_static_8bit_grad_fp16,
+            lib.cadam_static_8bit_grad_32,
+            lib.cadam_static_8bit_grad_16,
         ),
-        "pagedadam": (
-            lib.cpagedadam8bit_grad_fp32,
-            lib.cpagedadam8bit_grad_fp16,
-            lib.cpagedadam8bit_grad_bf16,
+        "momentum": (
+            lib.cmomentum_static_8bit_grad_32,
+            lib.cmomentum_static_8bit_grad_16,
         ),
-        "adamw": (
-            lib.cadam_static_8bit_grad_fp32,
-            lib.cadam_static_8bit_grad_fp16,
+        "rmsprop": (
+            lib.crmsprop_static_8bit_grad_32,
+            lib.crmsprop_static_8bit_grad_16,
         ),
-        "pagedadamw": (
-            lib.cpagedadam8bit_grad_fp32,
-            lib.cpagedadam8bit_grad_fp16,
-            lib.cpagedadam8bit_grad_bf16,
+        "lion": (
+            lib.clion_static_8bit_grad_32,
+            lib.clion_static_8bit_grad_16,
         ),
         "lamb": (
-            lib.cadam_static_8bit_grad_fp32,
-            lib.cadam_static_8bit_grad_fp16,
+            lib.cadam_static_8bit_grad_32,
+            lib.cadam_static_8bit_grad_16,
         ),
         "lars": (
-            lib.clars8bit_grad_fp32,
-            lib.clars8bit_grad_fp16,
-        ),
-        "lion": (
-            lib.clion_static_8bit_grad_fp32,
-            lib.clion_static_8bit_grad_fp16,
-        ),
-        "momentum": (
-            lib.cmomentum_static_8bit_grad_fp32,
-            lib.cmomentum_static_8bit_grad_fp16,
-        ),
-        "rmsprop": (
-            lib.crmsprop_static_8bit_grad_fp32,
-            lib.crmsprop_static_8bit_grad_fp16,
+            lib.cmomentum_static_8bit_grad_32,
+            lib.cmomentum_static_8bit_grad_16,
         ),
     }
 
     str2optimizer8bit_blockwise = {
-        "adagrad": (
-            lib.cadagrad_8bit_blockwise_grad_fp32,
-            lib.cadagrad_8bit_blockwise_grad_fp16,
-        ),
         "adam": (
             lib.cadam_8bit_blockwise_grad_fp32,
             lib.cadam_8bit_blockwise_grad_fp16,
             lib.cadam_8bit_blockwise_grad_bf16,
         ),
-        "pagedadam": (
-            lib.cpagedadam8bit_blockwise_fp32,
-            lib.cpagedadam8bit_blockwise_fp16,
-            lib.cpagedadam8bit_blockwise_bf16,
-        ),
-        "adamw": (
-            lib.cadam_8bit_blockwise_grad_fp32,
-            lib.cadam_8bit_blockwise_grad_fp16,
-            lib.cadam_8bit_blockwise_grad_bf16,
-        ),
-        "pagedadamw": (
-            lib.cpagedadam8bit_blockwise_fp32,
-            lib.cpagedadam8bit_blockwise_fp16,
-            lib.cpagedadam8bit_blockwise_bf16,
-        ),
-        "lion": (
-            lib.clion_8bit_blockwise_grad_fp32,
-            lib.clion_8bit_blockwise_grad_fp16,
-            lib.clion_8bit_blockwise_grad_bf16,
-        ),
         "momentum": (
             lib.cmomentum_8bit_blockwise_grad_fp32,
             lib.cmomentum_8bit_blockwise_grad_fp16,
@@ -157,6 +96,15 @@ def prod(iterable):
             lib.crmsprop_8bit_blockwise_grad_fp32,
             lib.crmsprop_8bit_blockwise_grad_fp16,
         ),
+        "lion": (
+            lib.clion_8bit_blockwise_grad_fp32,
+            lib.clion_8bit_blockwise_grad_fp16,
+            lib.clion_8bit_blockwise_grad_bf16,
+        ),
+        "adagrad": (
+            lib.cadagrad_8bit_blockwise_grad_fp32,
+            lib.cadagrad_8bit_blockwise_grad_fp16,
+        ),
     }
 
 
From ec9000f5444726589935ba8107249eddade9689d Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Thu, 20 Jun 2024 21:02:16 +0000
Subject: [PATCH 07/54] Copy shared obj file into output_dir

---
 .github/scripts/build-rocm.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/scripts/build-rocm.sh b/.github/scripts/build-rocm.sh
index fc7515aa7..616e8c250 100644
--- a/.github/scripts/build-rocm.sh
+++ b/.github/scripts/build-rocm.sh
@@ -14,6 +14,6 @@ if [ "${build_os:0:6}" == ubuntu ]; then
       && cmake --build ."
 fi
 
-#output_dir="output/${build_os}/${build_arch}"
-#mkdir -p "${output_dir}"
-#(shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} "${output_dir}")
+output_dir="output/${build_os}/${build_arch}"
+mkdir -p "${output_dir}"
+(shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} "${output_dir}")

From 9b8c1da639c76c0fd41df00b835ab02b3508a64b Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Thu, 20 Jun 2024 21:02:50 +0000
Subject: [PATCH 08/54] upload build artifacts and enable wheels build

---
 .github/workflows/python-package.yml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 78bc747c3..3b243993b 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -125,10 +125,17 @@ jobs:
         env:
           build_os: ${{ matrix.os }}
           build_arch: ${{ matrix.arch }}
+      - name: Upload build artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: shared_library_rocm_${{ matrix.os }}_${{ matrix.arch }}
+          path: output/*
+          retention-days: 7
   build-wheels:
     needs:
       - build-shared-libs
       - build-shared-libs-cuda
+      - build-shared-libs-rocm
     strategy:
       matrix:
         os: [ubuntu-latest, macos-latest, windows-latest]

From 1413c5f3a2aed51140b86daa8ee9283c67cce738 Mon Sep 17 00:00:00 2001
From: pnunna93 <104791500+pnunna93@users.noreply.github.com>
Date: Thu, 20 Jun 2024 16:10:54 -0500
Subject: [PATCH 09/54] Remove cuda build temporarily

---
 .github/workflows/python-package.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 3b243993b..0b0b35416 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -134,7 +134,6 @@ jobs:
   build-wheels:
     needs:
       - build-shared-libs
-      - build-shared-libs-cuda
       - build-shared-libs-rocm
     strategy:
       matrix:

From 195ae616d63906673c5025d67a785a2455787896 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 21 Jun 2024 18:36:12 +0200
Subject: [PATCH 10/54] Bump the minor-patch group across 1 directory with 2
 updates (#1253)

Bumps the minor-patch group with 2 updates in the / directory: [pytest](https://github.com/pytest-dev/pytest) and [lion-pytorch](https://github.com/lucidrains/lion-pytorch).


Updates `pytest` from 8.2.1 to 8.2.2
- [Release notes](https://github.com/pytest-dev/pytest/releases)
- [Changelog](https://github.com/pytest-dev/pytest/blob/main/CHANGELOG.rst)
- [Commits](https://github.com/pytest-dev/pytest/compare/8.2.1...8.2.2)

Updates `lion-pytorch` from 0.1.4 to 0.2.2
- [Release notes](https://github.com/lucidrains/lion-pytorch/releases)
- [Commits](https://github.com/lucidrains/lion-pytorch/compare/0.1.4...0.2.2)

---
updated-dependencies:
- dependency-name: pytest
  dependency-type: direct:production
  update-type: version-update:semver-patch
  dependency-group: minor-patch
- dependency-name: lion-pytorch
  dependency-type: direct:production
  update-type: version-update:semver-minor
  dependency-group: minor-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 requirements-ci.txt  | 4 ++--
 requirements-dev.txt | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/requirements-ci.txt b/requirements-ci.txt
index 0e9dd2407..285b5e7d8 100644
--- a/requirements-ci.txt
+++ b/requirements-ci.txt
@@ -1,6 +1,6 @@
 # Requirements used for GitHub actions
-pytest==8.2.1
+pytest==8.2.2
 einops==0.8.0
-lion-pytorch==0.1.4
+lion-pytorch==0.2.2
 scipy==1.10.1; python_version < "3.9"
 scipy==1.13.1; python_version >= "3.9"
diff --git a/requirements-dev.txt b/requirements-dev.txt
index de7adce94..80927a4cb 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,9 +1,9 @@
 # Requirements used for local development
 setuptools>=63
-pytest~=8.2.1
+pytest~=8.2.2
 einops~=0.8.0
 wheel~=0.43.0
-lion-pytorch~=0.1.4
+lion-pytorch~=0.2.2
 scipy~=1.13.1
 pandas~=2.2.2
 matplotlib~=3.9.0

From 193120d1677ff0c4c502fc81835251e4b29d0c48 Mon Sep 17 00:00:00 2001
From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Fri, 21 Jun 2024 18:48:44 +0200
Subject: [PATCH 11/54] cleanup docs-build breaking install instructs (#1244)

* cleanup docs-build breaking install instructs

* Update install instructions for ROCm

* Update installation.mdx

---------

Co-authored-by: Prasanth Nunna <Prasanth.Nunna@amd.com>
Co-authored-by: pnunna93 <104791500+pnunna93@users.noreply.github.com>
---
 docs/source/installation.mdx      | 43 +++++++++++++++++++++++++++++
 docs/source/rocm_installation.mdx | 46 -------------------------------
 2 files changed, 43 insertions(+), 46 deletions(-)
 delete mode 100644 docs/source/rocm_installation.mdx

diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index caf22488f..c07ef29f6 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -91,6 +91,49 @@ Big thanks to [wkpark](https://github.com/wkpark), [Jamezo97](https://github.com
 </hfoption>
 </hfoptions>
 
+## Multi-backend preview release (+ compilation)
+
+Please follow these steps to install bitsandbytes with device-specific backend support other than CUDA:
+
+<hfoptions id="backend">
+<hfoption id="AMD ROCm">
+
+For a ROCm specific install:
+
+bitsandbytes is fully supported from ROCm 6.1 onwards.
+
+**Note:** If you have already installed ROCm and PyTorch, skip the Docker steps below and check that your torch version matches your ROCm install. To install torch for a specific ROCm version, refer to step 3 of the wheels install in the [Installing PyTorch for ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/3rd-party/pytorch-install.html#using-wheels-package) guide.
+
+```bash
+# Create a Docker container from the latest PyTorch image; it comes with ROCm and PyTorch preinstalled
+docker pull rocm/pytorch:latest
+docker run -it --device=/dev/kfd --device=/dev/dri --group-add video rocm/pytorch:latest
+
+# Clone the bitsandbytes repo; the ROCm backend is currently enabled on the multi-backend-refactor branch
+git clone --depth 1 -b multi-backend-refactor https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
+
+# Install dependencies
+pip install -r requirements-dev.txt
+
+# Compile & install
+cmake -DCOMPUTE_BACKEND=hip -S .  # Use -DBNB_ROCM_ARCH="gfx90a;gfx942" to target specific gpu arch
+make
+pip install .
+```
+
+</hfoption>
+<hfoption id="Intel CPU + GPU">
+
+WIP
+
+</hfoption>
+<hfoption id="Apple Silicon (MPS)">
+
+WIP
+
+</hfoption>
+</hfoptions>
+
 ## PyTorch CUDA versions
 
 Some bitsandbytes features may need a newer CUDA version than the one currently supported by PyTorch binaries from Conda and pip. In this case, you should follow these instructions to load a precompiled bitsandbytes binary.
diff --git a/docs/source/rocm_installation.mdx b/docs/source/rocm_installation.mdx
deleted file mode 100644
index 5d4381e7d..000000000
--- a/docs/source/rocm_installation.mdx
+++ /dev/null
@@ -1,46 +0,0 @@
-# ROCm Installation
-
-Please follow these steps to install bitsandbytes on ROCm.
-
-<hfoptions id="rocm_install">
-<hfoption id="latest">
-
-For latest installation:
-
-```bash
-git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
-pip install -r requirements-dev.txt
-cmake -DCOMPUTE_BACKEND=hip -S . #Use -DBNB_ROCM_ARCH="gfx90a;gfx942" to target specific gpu arch
-make
-pip install .
-```
-
-<hfoption>
-<hfoption id="version specific">
-
-For ROCm specific versions:
-
-Install Dependencies:
-
-```bash
-# hipblaslt installation needed only for rocm<6.0
-apt install hipblaslt
-pip install --upgrade pip
-pip install einops lion_pytorch accelerate
-pip install git+https://github.com/ROCm/transformers.git
-```
-
-Install bitsandbytes from [ROCm](https://github.com/ROCm/bitsandbytes) repo:
-
-```bash
-git clone --recurse https://github.com/ROCm/bitsandbytes
-cd bitsandbytes
-# Checkout branch as needed
-# for rocm 5.7 - rocm5.7_internal_testing
-# for rocm 6.x - rocm6.2_internal_testing
-git checkout <branch>
-make hip
-python setup.py install
-```
-<hfoption>
-</hfoptions>

From dada530149212d64d4b69534716202659ef37ec8 Mon Sep 17 00:00:00 2001
From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com>
Date: Sat, 22 Jun 2024 00:53:26 +0800
Subject: [PATCH 12/54] cpu install guide (#1227)

* cpu install guide

* update readme

* fix format

* fix format

* fix typo

* add windows guide

* fix readme to pip install . instead of building wheel

* Update docs/source/installation.mdx

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Update docs/source/installation.mdx

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Update docs/source/installation.mdx

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

---------

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
---
 docs/source/installation.mdx | 54 ++++++++++++++++++++++++++++++++++--
 1 file changed, 52 insertions(+), 2 deletions(-)

diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index caf22488f..c84d0c2ef 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -1,5 +1,7 @@
 # Installation
 
+## CUDA
+
 bitsandbytes is only supported on CUDA GPUs for CUDA versions **11.0 - 12.3**.
 
 The latest version of bitsandbytes (v0.43.0) builds on:
@@ -29,7 +31,7 @@ To install from PyPI.
 pip install bitsandbytes
 ```
 
-## Compile from source
+### Compile from source
 
 For Linux and Windows systems, you can compile bitsandbytes from source. Installing from source allows for more build options with different CMake configurations.
 
@@ -91,7 +93,7 @@ Big thanks to [wkpark](https://github.com/wkpark), [Jamezo97](https://github.com
 </hfoption>
 </hfoptions>
 
-## PyTorch CUDA versions
+### PyTorch CUDA versions
 
 Some bitsandbytes features may need a newer CUDA version than the one currently supported by PyTorch binaries from Conda and pip. In this case, you should follow these instructions to load a precompiled bitsandbytes binary.
 
@@ -131,3 +133,51 @@ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/tim/local/cuda-11.7
 ```
 
 3. Now when you launch bitsandbytes with these environment variables, the PyTorch CUDA version is overridden by the new CUDA version (in this example, version 11.7) and a different bitsandbytes library is loaded.
+
+
+## Intel CPU
+
+> [!TIP]
+> Intel CPU backend only supports building from source; for now, please follow the instructions below.
+
+Like CUDA, you can compile bitsandbytes from source for Linux and Windows systems. Installing from source allows for more build options with different CMake configurations.
+
+<hfoptions id="source">
+<hfoption id="Linux">
+
+To compile from source, you need CMake >= **3.22.1** and Python >= **3.8** installed. Make sure you have a compiler installed to compile C++ (gcc, make, headers, etc.). For example, to install a compiler and CMake on Ubuntu:
+
+```bash
+apt-get install -y build-essential cmake
+```
+
+We recommend installing **GCC >= 11**; at minimum, **GCC >= 6** is required.
+
+Now to install the bitsandbytes package from source, run the following commands:
+
+```bash
+git clone --branch multi-backend-refactor https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
+pip install -r requirements-dev.txt
+pip install intel_extension_for_pytorch
+cmake -DCOMPUTE_BACKEND=cpu -S .
+make
+pip install .
+```
+
+</hfoption>
+<hfoption id="Windows">
+
+Windows systems require Visual Studio with C++ support.
+
+To compile from source, you need CMake >= **3.22.1** and Python >= **3.8** installed.
+
+```bash
+git clone --branch multi-backend-refactor https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
+pip install -r requirements-dev.txt
+cmake -DCOMPUTE_BACKEND=cpu -S .
+cmake --build . --config Release
+pip install .
+```
+
+</hfoption>
+</hfoptions>

From c79b1e926b05e856775e4962253e2dbf67bed103 Mon Sep 17 00:00:00 2001
From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Fri, 21 Jun 2024 19:02:47 +0200
Subject: [PATCH 13/54] provide temp flag for outside libs to detect
 multi-backend preview (#1243)

* provide temp flag for outside libs to detect multi-backend preview

* fix typo in comment

Co-authored-by: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>

---------

Co-authored-by: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>
---
 bitsandbytes/__init__.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index eff7fc686..c3a2f2402 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -20,6 +20,13 @@
 from .cextension import lib
 from .nn import modules
 
+# NOTE: this is a temporary flag to allow outside libraries to employ conditional logic while the refactor is still in
+# alpha/beta, e.g. `if getattr(bitsandbytes, "is_multi_backend_refactor_preview", False): ...`
+# The getattr() call above defaults to False, while any non-empty string evaluates to True. This gives us a temporary
+# marker that we can remove in Transformers with the next release after the official BNB multi-platform release,
+# eventually making it the new default (e.g. just remove the if statement and dedent in Transformers).
+is_multi_backend_refactor_preview = "TO BE REMOVED ONCE MERGED TO `main`"  # bool evals to True for str
+
 # Always register the CPU backend.
 register_backend("cpu", CPUBackend())
 

From 1bfecc81e9f3b9a67a3b9bb9e1ab57468b1b9497 Mon Sep 17 00:00:00 2001
From: Xia Weiwen <xia.weiwen@hotmail.com>
Date: Wed, 10 Jul 2024 15:26:35 +0800
Subject: [PATCH 14/54] CPU/XPU: disable torch.compile if g++ is not available
 (#1251)

* CPU/XPU: disable torch.compile if g++ is not available

* Fix lint issue
---
 bitsandbytes/backends/cpu_xpu_common.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index 396234853..c936dce14 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -1,3 +1,4 @@
+import subprocess
 from typing import Optional
 import warnings
 
@@ -19,6 +20,14 @@
     ipex_xpu = None
 
 
+gxx_available = False
+try:
+    subprocess.run(["g++", "--version"])
+    gxx_available = True
+except BaseException:
+    warnings.warn("g++ not found, torch.compile disabled for CPU/XPU.")
+
+
 Tensor = torch.Tensor
 
 
@@ -45,8 +54,8 @@ def _ipex_xpu_version_prereq(major, minor):
 
 
 def _maybe_torch_compile(func):
-    # torch.compile requires pytorch >= 2.0
-    if _torch_version_prereq(2, 0):
+    # torch.compile requires g++ and pytorch >= 2.0
+    if gxx_available and _torch_version_prereq(2, 0):
         options = {}
         # fx_graph_cache requires pytorch >= 2.2
         if _torch_version_prereq(2, 2):

From 08597844023a5c59e9b5d5dbeafbac4174fae5cc Mon Sep 17 00:00:00 2001
From: pnunna93 <104791500+pnunna93@users.noreply.github.com>
Date: Fri, 12 Jul 2024 09:15:10 -0500
Subject: [PATCH 15/54] Create build job for ROCm (#1255)

* Add build job for rocm

* Add rocm build script
---
 .github/scripts/build-rocm.sh        | 19 +++++++++++++++++++
 .github/workflows/python-package.yml | 22 ++++++++++++++++++++++
 2 files changed, 41 insertions(+)
 create mode 100644 .github/scripts/build-rocm.sh

diff --git a/.github/scripts/build-rocm.sh b/.github/scripts/build-rocm.sh
new file mode 100644
index 000000000..fc7515aa7
--- /dev/null
+++ b/.github/scripts/build-rocm.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+declare build_arch
+declare build_os
+
+set -xeuo pipefail
+if [ "${build_os:0:6}" == ubuntu ]; then
+	image=rocm/dev-ubuntu-22.04:6.1-complete
+	echo "Using image $image"
+	docker run --rm --platform "linux/$build_arch" -i \
+		-w /src -v "$PWD:/src" "$image" sh -c \
+		"apt-get update \
+      && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
+      && cmake -DCOMPUTE_BACKEND=hip . \
+      && cmake --build ."
+fi
+
+#output_dir="output/${build_os}/${build_arch}"
+#mkdir -p "${output_dir}"
+#(shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} "${output_dir}")
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 72e1b099a..78bc747c3 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -103,6 +103,28 @@ jobs:
           name: shared_library_cuda_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.cuda_version }}
           path: output/*
           retention-days: 7
+  build-shared-libs-rocm:
+    strategy:
+      matrix:
+        os: [ubuntu-latest]
+        arch: [x86_64]
+    runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Docker multiarch
+        if: startsWith(matrix.os, 'ubuntu')
+        uses: docker/setup-qemu-action@v2
+      - name: Clean up disk space
+        run: |
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /opt/ghc
+          sudo rm -rf "/usr/local/share/boost"
+          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+      - name: Build C++
+        run: bash .github/scripts/build-rocm.sh
+        env:
+          build_os: ${{ matrix.os }}
+          build_arch: ${{ matrix.arch }}
   build-wheels:
     needs:
       - build-shared-libs

From 1935a459f7c72d1ec8349e4ad1e84cbd6ff94e68 Mon Sep 17 00:00:00 2001
From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Fri, 12 Jul 2024 16:40:17 +0200
Subject: [PATCH 16/54] fix broken <source> links in autodoc API reference
 (#1275)

* Update build_documentation.yml

* Update build_pr_documentation.yml

* Update build_pr_documentation.yml
---
 .github/workflows/build_documentation.yml    | 2 ++
 .github/workflows/build_pr_documentation.yml | 4 +++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml
index a19e7511d..e027f7556 100644
--- a/.github/workflows/build_documentation.yml
+++ b/.github/workflows/build_documentation.yml
@@ -14,6 +14,8 @@ jobs:
       commit_sha: ${{ github.sha }}
       package: bitsandbytes
       repo_owner: TimDettmers
+      # avoid /src suffix leading to wrong links, like bitsandbytes/blob/main/src/bitsandbytes/nn/
+      version_tag_suffix: ''  # defaults to '/src'
       custom_container: huggingface/transformers-doc-builder
     secrets:
       hf_token: ${{ secrets.HUGGINGFACE_PUSH }}
diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml
index cc833df5d..b83794a5f 100644
--- a/.github/workflows/build_pr_documentation.yml
+++ b/.github/workflows/build_pr_documentation.yml
@@ -9,11 +9,13 @@ concurrency:
 
 jobs:
   build:
-    if: github.repository == 'TimDettmers/bitsandbytes'
+    if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
     uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
     with:
       commit_sha: ${{ github.event.pull_request.head.sha }}
       pr_number: ${{ github.event.number }}
       package: bitsandbytes
       repo_owner: TimDettmers
+      # avoid /src suffix leading to wrong links, like bitsandbytes/blob/main/src/bitsandbytes/nn/
+      version_tag_suffix: ''  # defaults to '/src'
       custom_container: huggingface/transformers-doc-builder

From 85e01276874b7563bd23caf56ac2c3bdbf7c90fc Mon Sep 17 00:00:00 2001
From: Markus Hennerbichler <markushennerbichler@gmail.com>
Date: Fri, 12 Jul 2024 15:40:37 +0100
Subject: [PATCH 17/54] Fix CUDA 12.5 build issue (#1273)

pythonInterface.cpp depends on ops.cuh
which in turn depends on some thrust headers.
It is defined as a C++ compilation unit
which is problematic because thrust doesn't guarantee
compatibility with a host compiler.

This is starting to cause issues with CUDA 12.5.
There is no dependency on the thrust headers,
which means they can be removed without other consequences.
---
 csrc/kernels.cu | 2 --
 csrc/ops.cuh    | 4 ----
 2 files changed, 6 deletions(-)

diff --git a/csrc/kernels.cu b/csrc/kernels.cu
index f4673359b..e4d459961 100644
--- a/csrc/kernels.cu
+++ b/csrc/kernels.cu
@@ -12,8 +12,6 @@
 #include <cub/block/block_reduce.cuh>
 #include <cub/cub.cuh>
 #include <math_constants.h>
-#include <thrust/host_vector.h>
-#include <thrust/device_vector.h>
 #include <mma.h>
 
 
diff --git a/csrc/ops.cuh b/csrc/ops.cuh
index da9df6af0..8b9a4f449 100644
--- a/csrc/ops.cuh
+++ b/csrc/ops.cuh
@@ -19,10 +19,6 @@
 #include <vector>
 #include <functional>
 
-#include <thrust/host_vector.h>
-#include <thrust/device_vector.h>
-
-
 
 #define CUDA_CHECK_RETURN(value) {                      \
   cudaError_t _m_cudaStat = value;                    \

From 6866a4ad464239a3a06c9d8911237c0da294e4d7 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 12 Jul 2024 16:42:28 +0200
Subject: [PATCH 18/54] Bump scipy from 1.13.1 to 1.14.0 in the minor-patch
 group (#1266)

Bumps the minor-patch group with 1 update: [scipy](https://github.com/scipy/scipy).


Updates `scipy` from 1.13.1 to 1.14.0
- [Release notes](https://github.com/scipy/scipy/releases)
- [Commits](https://github.com/scipy/scipy/compare/v1.13.1...v1.14.0)

---
updated-dependencies:
- dependency-name: scipy
  dependency-type: direct:production
  update-type: version-update:semver-minor
  dependency-group: minor-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 requirements-ci.txt  | 2 +-
 requirements-dev.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements-ci.txt b/requirements-ci.txt
index 285b5e7d8..b36fd6586 100644
--- a/requirements-ci.txt
+++ b/requirements-ci.txt
@@ -3,4 +3,4 @@ pytest==8.2.2
 einops==0.8.0
 lion-pytorch==0.2.2
 scipy==1.10.1; python_version < "3.9"
-scipy==1.13.1; python_version >= "3.9"
+scipy==1.14.0; python_version >= "3.9"
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 80927a4cb..dc75f9685 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -4,6 +4,6 @@ pytest~=8.2.2
 einops~=0.8.0
 wheel~=0.43.0
 lion-pytorch~=0.2.2
-scipy~=1.13.1
+scipy~=1.14.0
 pandas~=2.2.2
 matplotlib~=3.9.0

From 8c6ab698d76baad7265d1a91965ade0982596704 Mon Sep 17 00:00:00 2001
From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Fri, 12 Jul 2024 16:48:26 +0200
Subject: [PATCH 19/54] update repo owner

---
 .github/workflows/build_pr_documentation.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml
index b83794a5f..4679761c6 100644
--- a/.github/workflows/build_pr_documentation.yml
+++ b/.github/workflows/build_pr_documentation.yml
@@ -15,7 +15,7 @@ jobs:
       commit_sha: ${{ github.event.pull_request.head.sha }}
       pr_number: ${{ github.event.number }}
       package: bitsandbytes
-      repo_owner: TimDettmers
+      repo_owner: bitsandbytes-foundation
       # avoid /src suffix leading to wrong links, like bitsandbytes/blob/main/src/bitsandbytes/nn/
       version_tag_suffix: ''  # defaults to '/src'
       custom_container: huggingface/transformers-doc-builder

From 7be11439954d38a8b784ea86286ac0045769db53 Mon Sep 17 00:00:00 2001
From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Fri, 12 Jul 2024 16:48:51 +0200
Subject: [PATCH 20/54] update repo owner

---
 .github/workflows/build_documentation.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml
index e027f7556..ce4a55aaa 100644
--- a/.github/workflows/build_documentation.yml
+++ b/.github/workflows/build_documentation.yml
@@ -13,7 +13,7 @@ jobs:
     with:
       commit_sha: ${{ github.sha }}
       package: bitsandbytes
-      repo_owner: TimDettmers
+      repo_owner: bitsandbytes-foundation
       # avoid /src suffix leading to wrong links, like bitsandbytes/blob/main/src/bitsandbytes/nn/
       version_tag_suffix: ''  # defaults to '/src'
       custom_container: huggingface/transformers-doc-builder

From 6948f0b8fe3295b6c6fe1263bc0d1ce874468cd2 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 15 Jul 2024 04:58:02 -0400
Subject: [PATCH 21/54] Fix Windows CUDA build compatibility with newest MSVC
 (#1276)

* Add support for building with latest MSVC

* Update MSVC 1940+ support for CUDA builds.
---
 CMakeLists.txt | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index be0d3555f..6f3914456 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -64,6 +64,13 @@ endif()
 
 
 if(BUILD_CUDA)
+    # NVCC normally will only work with MSVC up to 1939. VS2022 17.10+ starts using versions 1940+.
+    # Workaround: use --allow-unsupported-compiler
+    # This needs to be added *before* we try to enable the CUDA language so CMake's compiler check passes.
+    if(MSVC AND MSVC_VERSION VERSION_GREATER_EQUAL 1940)
+        string(APPEND CMAKE_CUDA_FLAGS " --allow-unsupported-compiler")
+    endif()
+
     enable_language(CUDA) # This will fail if CUDA is not found
     find_package(CUDAToolkit REQUIRED)
 
@@ -188,7 +195,6 @@ if(WIN32)
     set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
 endif()
 
-# Weird MSVC hacks
 if(MSVC)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2 /fp:fast")
 endif()

From f2b2310eb4b7034c14e87dca2a61604ea2a0163f Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 15 Jul 2024 16:42:04 +0200
Subject: [PATCH 22/54] Update matplotlib requirement from ~=3.9.0 to ~=3.9.1
 in the major group (#1278)

Updates the requirements on [matplotlib](https://github.com/matplotlib/matplotlib) to permit the latest version.

Updates `matplotlib` to 3.9.1
- [Release notes](https://github.com/matplotlib/matplotlib/releases)
- [Commits](https://github.com/matplotlib/matplotlib/compare/v3.9.0...v3.9.1)

---
updated-dependencies:
- dependency-name: matplotlib
  dependency-type: direct:development
  dependency-group: major
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 requirements-dev.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements-dev.txt b/requirements-dev.txt
index dc75f9685..94098e012 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -6,4 +6,4 @@ wheel~=0.43.0
 lion-pytorch~=0.2.2
 scipy~=1.14.0
 pandas~=2.2.2
-matplotlib~=3.9.0
+matplotlib~=3.9.1

From 39b42e749f623193d69b917a01dcb4ca5b4bbdc0 Mon Sep 17 00:00:00 2001
From: Vladimir Malinovskii <galqiwi@galqiwi.ru>
Date: Mon, 15 Jul 2024 17:51:21 +0300
Subject: [PATCH 23/54] Fixed tests for cpu only platforms (#1259)

* fixed test_4bit_warnings on cpu-only platforms

* fixed linear8bit-based tests for cpu only platforms
---
 tests/test_linear8bitlt.py | 2 +-
 tests/test_modules.py      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_linear8bitlt.py b/tests/test_linear8bitlt.py
index e55abe110..9b7923312 100644
--- a/tests/test_linear8bitlt.py
+++ b/tests/test_linear8bitlt.py
@@ -182,7 +182,7 @@ def test_linear_serialization(
 
 
 @pytest.fixture
-def linear8bit():
+def linear8bit(requires_cuda):
     linear = torch.nn.Linear(32, 96)
     linear_custom = Linear8bitLt(
         linear.in_features,
diff --git a/tests/test_modules.py b/tests/test_modules.py
index db4d72410..9d507c6b4 100644
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -616,7 +616,7 @@ def test_fp8linear():
     assert bgraderr < 0.00002
 
 
-def test_4bit_warnings():
+def test_4bit_warnings(requires_cuda):
     dim1 = 64
 
     with pytest.warns(UserWarning, match=r"inference or training"):

From 9e75374104cec965d9a4a630a68ce4e2b77b066f Mon Sep 17 00:00:00 2001
From: Ther <1329438302@qq.com>
Date: Tue, 16 Jul 2024 16:56:41 +0800
Subject: [PATCH 24/54] fix QLoRA mem bug: delete useless buffered activation
 (#1270)

* chore: delete useless buffered activation

* fix: fix bugs
---
 bitsandbytes/autograd/_functions.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
index e9821cd36..d33dd1bc5 100644
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -513,7 +513,7 @@ def forward(ctx, A, B, out=None, bias=None, quant_state: Optional[F.QuantState]
         ctx.dtype_A, ctx.dtype_B, ctx.dtype_bias = A.dtype, B.dtype, None if bias is None else bias.dtype
 
         if any(ctx.needs_input_grad[:2]):
-            ctx.tensors = (A, B)
+            ctx.tensors = (None, B)
         else:
             ctx.tensors = (None, None)
 
@@ -526,7 +526,7 @@ def backward(ctx, grad_output):
             return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, bias_grad, None
 
         req_gradA, _, _, req_gradBias, _ = ctx.needs_input_grad
-        A, B = ctx.tensors
+        _, B = ctx.tensors
 
         grad_A, grad_B, grad_bias = None, None, None
 

From 0bdd57ccb20e9690b95c2fa02315d9507afa69dd Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Sun, 21 Jul 2024 08:31:06 -0400
Subject: [PATCH 25/54] Add CUDA 12.5 and update 12.4 builds (#1284)

* Add CUDA 12.5 builds and enable CUDA 12.4 on Windows

* Update install doc
---
 .github/workflows/python-package.yml |  6 ++----
 docs/source/installation.mdx         |  4 ++--
 install_cuda.py                      |  3 ++-
 install_cuda.sh                      | 10 +++++++---
 4 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 72e1b099a..698c21481 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -63,12 +63,10 @@ jobs:
         os: [ubuntu-latest, windows-latest]
         arch: [x86_64, aarch64]
         cuda_version:
-          ["11.7.1", "11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.0"]
+          ["11.7.1", "11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.0"]
         exclude:
           - os: windows-latest # This probably requires arm64 Windows agents
             arch: aarch64
-          - os: windows-latest  # The Jimver/cuda-toolkit is action used for Windows builds is not updated for 12.4 yet.
-            cuda_version: "12.4.0"
           - os: ubuntu-latest # Temporary. Takes too long, not ready yet.
             arch: aarch64
     runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents
@@ -79,7 +77,7 @@ jobs:
         if: startsWith(matrix.os, 'ubuntu')
         uses: docker/setup-qemu-action@v2
         # Windows: We install Cuda on the agent (slow)
-      - uses: Jimver/cuda-toolkit@v0.2.14
+      - uses: Jimver/cuda-toolkit@v0.2.16
         if: startsWith(matrix.os, 'windows')
         id: cuda-toolkit
         with:
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index c84d0c2ef..877c97456 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -2,7 +2,7 @@
 
 ## CUDA
 
-bitsandbytes is only supported on CUDA GPUs for CUDA versions **11.0 - 12.3**.
+bitsandbytes is only supported on CUDA GPUs for CUDA versions **11.0 - 12.5**.
 
 The latest version of bitsandbytes (v0.43.0) builds on:
 
@@ -107,7 +107,7 @@ Then locally install the CUDA version you need with this script from bitsandbyte
 ```bash
 wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh
 # Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH
-#   CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122, 123, 124}
+#   CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122, 123, 124, 125}
 #   EXPORT_TO_BASH in {0, 1} with 0=False and 1=True
 
 # For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc
diff --git a/install_cuda.py b/install_cuda.py
index cf7c8ee71..8267c5e2b 100644
--- a/install_cuda.py
+++ b/install_cuda.py
@@ -17,7 +17,8 @@
     "121": "https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda_12.1.1_530.30.02_linux.run",
     "122": "https://developer.download.nvidia.com/compute/cuda/12.2.2/local_installers/cuda_12.2.2_535.104.05_linux.run",
     "123": "https://developer.download.nvidia.com/compute/cuda/12.3.2/local_installers/cuda_12.3.2_545.23.08_linux.run",
-    "124": "https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_550.54.14_linux.run",
+    "124": "https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux.run",
+    "125": "https://developer.download.nvidia.com/compute/cuda/12.5.0/local_installers/cuda_12.5.0_555.42.02_linux.run",
 }
 
 
diff --git a/install_cuda.sh b/install_cuda.sh
index 2e7fe8ed2..0aa9531fc 100644
--- a/install_cuda.sh
+++ b/install_cuda.sh
@@ -11,7 +11,8 @@ URL120=https://developer.download.nvidia.com/compute/cuda/12.0.1/local_installer
 URL121=https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda_12.1.1_530.30.02_linux.run
 URL122=https://developer.download.nvidia.com/compute/cuda/12.2.2/local_installers/cuda_12.2.2_535.104.05_linux.run
 URL123=https://developer.download.nvidia.com/compute/cuda/12.3.2/local_installers/cuda_12.3.2_545.23.08_linux.run
-URL124=https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_550.54.14_linux.run
+URL124=https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux.run
+URL125=https://developer.download.nvidia.com/compute/cuda/12.5.0/local_installers/cuda_12.5.0_555.42.02_linux.run
 
 CUDA_VERSION=$1
 BASE_PATH=$2
@@ -60,11 +61,14 @@ if [[ -n "$CUDA_VERSION" ]]; then
   elif [[ "$CUDA_VERSION" -eq "124" ]]; then
     URL=$URL124
     FOLDER=cuda-12.4
+  elif [[ "$CUDA_VERSION" -eq "125" ]]; then
+    URL=$URL125
+    FOLDER=cuda-12.5
   else
-    echo "argument error: No cuda version passed as input. Choose among versions 110 to 124"
+    echo "argument error: No cuda version passed as input. Choose among versions 110 to 125"
   fi
 else
-    echo "argument error: No cuda version passed as input. Choose among versions 92 to 123"
+    echo "argument error: No cuda version passed as input. Choose among versions 110 to 125"
 fi
 
 FILE=$(basename $URL)

From 5212a0f2a585abba1cc2a65f82f7c4fc939c453f Mon Sep 17 00:00:00 2001
From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Mon, 22 Jul 2024 16:36:31 +0200
Subject: [PATCH 26/54] Edenzzzz's fix for min_8bit_size functionality in
 Optimizer base classes (#1286)

* fix min_8bit_size invalid bug

* Apply same fix to other optimizer base class

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>
---
 bitsandbytes/optim/optimizer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bitsandbytes/optim/optimizer.py b/bitsandbytes/optim/optimizer.py
index f1e60e5e7..39fa0e7ff 100644
--- a/bitsandbytes/optim/optimizer.py
+++ b/bitsandbytes/optim/optimizer.py
@@ -437,7 +437,7 @@ def init_state(self, group, p, gindex, pindex):
         state = self.state[p]
         state["step"] = 0
 
-        if dtype == torch.float32 or (dtype == torch.uint8 and p.numel() < 4096):
+        if dtype == torch.float32:
             state["state1"] = self.get_state_buffer(p, dtype=torch.float32)
             state["state2"] = self.get_state_buffer(p, dtype=torch.float32)
         elif dtype == torch.uint8:
@@ -656,7 +656,7 @@ def init_state(self, group, p, gindex, pindex):
         state = self.state[p]
         state["step"] = 0
 
-        if dtype == torch.float32 or (dtype == torch.uint8 and p.numel() < 4096):
+        if dtype == torch.float32:
             state["state1"] = self.get_state_buffer(p, dtype=torch.float32)
         elif dtype == torch.uint8:
             if state["step"] == 0:

From a3f55cea3ab29218067809770bc8bf2380ec46cd Mon Sep 17 00:00:00 2001
From: Edenzzzz <wenxuan.tan@wisc.edu>
Date: Mon, 22 Jul 2024 23:14:46 +0800
Subject: [PATCH 27/54] Fixed optim update error with non-contiguous
 grads/params (#1187)

* Fixed optim update error with non-contiguous grads
* fix formatting

Thanks @Edenzzzz for this contribution!

---------

Co-authored-by: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
---
 bitsandbytes/optim/optimizer.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/bitsandbytes/optim/optimizer.py b/bitsandbytes/optim/optimizer.py
index 39fa0e7ff..e9c857d49 100644
--- a/bitsandbytes/optim/optimizer.py
+++ b/bitsandbytes/optim/optimizer.py
@@ -474,6 +474,10 @@ def init_state(self, group, p, gindex, pindex):
 
     @torch.no_grad()
     def update_step(self, group, p, gindex, pindex):
+        # avoid update error from non-contiguous memory layout
+        p.data = p.data.contiguous()
+        p.grad = p.grad.contiguous()
+
         state = self.state[p]
         grad = p.grad
 
@@ -685,6 +689,10 @@ def init_state(self, group, p, gindex, pindex):
 
     @torch.no_grad()
     def update_step(self, group, p, gindex, pindex):
+        # avoid update error from non-contiguous memory layout
+        p.data = p.data.contiguous()
+        p.grad = p.grad.contiguous()
+
         state = self.state[p]
         grad = p.grad
 

From e3ae243be2f8bfb36715610e837363a515840b39 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 22 Jul 2024 17:15:51 +0200
Subject: [PATCH 28/54] Bump pytest from 8.2.2 to 8.3.1 in the minor-patch
 group (#1287)

Bumps the minor-patch group with 1 update: [pytest](https://github.com/pytest-dev/pytest).


Updates `pytest` from 8.2.2 to 8.3.1
- [Release notes](https://github.com/pytest-dev/pytest/releases)
- [Changelog](https://github.com/pytest-dev/pytest/blob/main/CHANGELOG.rst)
- [Commits](https://github.com/pytest-dev/pytest/compare/8.2.2...8.3.1)

---
updated-dependencies:
- dependency-name: pytest
  dependency-type: direct:production
  update-type: version-update:semver-minor
  dependency-group: minor-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 requirements-ci.txt  | 2 +-
 requirements-dev.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements-ci.txt b/requirements-ci.txt
index b36fd6586..182e1023e 100644
--- a/requirements-ci.txt
+++ b/requirements-ci.txt
@@ -1,5 +1,5 @@
 # Requirements used for GitHub actions
-pytest==8.2.2
+pytest==8.3.1
 einops==0.8.0
 lion-pytorch==0.2.2
 scipy==1.10.1; python_version < "3.9"
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 94098e012..41211880c 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,6 +1,6 @@
 # Requirements used for local development
 setuptools>=63
-pytest~=8.2.2
+pytest~=8.3.1
 einops~=0.8.0
 wheel~=0.43.0
 lion-pytorch~=0.2.2

From 7fed393aa8380f2d7f7c760bbd6a2f68b5caa9ea Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Tue, 23 Jul 2024 11:32:50 -0400
Subject: [PATCH 29/54] Fix restoration of quant_storage for CPU offloading
 (#1279)

* Fix restoration of quant_storage for CPU offloading

* Clarify comment on default quant_storage in Params4bit.from_prequantized()

* fix to make quant_storage dynamic based on serialized dtype

* delete obsolete comment

---------

Co-authored-by: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
---
 bitsandbytes/nn/modules.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 05f7c04db..40766ad41 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -282,10 +282,13 @@ def from_prequantized(
         self.compress_statistics = self.quant_state.nested
         self.quant_type = self.quant_state.quant_type
         self.bnb_quantized = True
+
+        self.quant_storage = data.dtype
+
         return self
 
     def _quantize(self, device):
-        w = self.data.contiguous().cuda(device)
+        w = self.data.contiguous().to(device)
         w_4bit, quant_state = bnb.functional.quantize_4bit(
             w,
             blocksize=self.blocksize,
@@ -333,6 +336,7 @@ def to(self, *args, **kwargs):
                 blocksize=self.blocksize,
                 compress_statistics=self.compress_statistics,
                 quant_type=self.quant_type,
+                quant_storage=self.quant_storage,
             )
 
             return new_param
@@ -450,7 +454,7 @@ def forward(self, x: torch.Tensor):
                 # since we registered the module, we can recover the state here
                 assert self.weight.shape[1] == 1
                 if not isinstance(self.weight, Params4bit):
-                    self.weight = Params4bit(self.weight, quant_storage=self.quant_storage)
+                    self.weight = Params4bit(self.weight, quant_storage=self.quant_storage, bnb_quantized=True)
                 self.weight.quant_state = self.quant_state
             else:
                 print(

From 1571110648dc5b0e603316c9ce2b0f16ac85cdbb Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Tue, 23 Jul 2024 16:40:31 +0000
Subject: [PATCH 30/54] remove unnecessary version mention

---
 docs/source/installation.mdx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index 877c97456..8187fbf81 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -4,7 +4,7 @@
 
 bitsandbytes is only supported on CUDA GPUs for CUDA versions **11.0 - 12.5**.
 
-The latest version of bitsandbytes (v0.43.0) builds on:
+The latest version of bitsandbytes builds on:
 
 | OS | CUDA | Compiler |
 |---|---|---|

From ce53caf3c358ec3f81db6a9edc0b6fc2f17d9503 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Tue, 23 Jul 2024 17:45:26 +0000
Subject: [PATCH 31/54] release 0.43.2

---
 CHANGELOG.md             | 18 ++++++++++++++++++
 _typos.toml              |  5 +++++
 bitsandbytes/__init__.py |  2 +-
 setup.py                 |  2 +-
 4 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c456fa9e5..8ad648df1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,21 @@
+### 0.43.2
+
+#### Improvements:
+
+- docs: FSDP+QLoRA and CPU install guide (#1211 #1227, thanks @stevhliu)
+- Add CUDA 12.5 and update 12.4 builds (#1284)
+
+#### Bug Fixes
+
+- 4bit getstate and 8bit deepcopy (#1230 #1231, thanks @BenjaminBossan)
+- missing optimizers in `str2optimizer32bit` (#1222, thanks @EtienneDosSantos)
+- CUDA 12.5 build issue (#1273, thanks @HennerM)
+- fix for min_8bit_size functionality in Optimizer base classes (#1286, thanks @Edenzzzz)
+- QLoRA mem bug (#1270, thanks @Ther-nullptr)
+- tests for cpu only platforms (#1259, thanks @galqiwi)
+- restoration of quant_storage for CPU offloading (#1279)
+- optim update error with non-contiguous grads/params (deepspeed) (#1187)
+
 ### 0.43.1
 
 #### Improvements:
diff --git a/_typos.toml b/_typos.toml
index a04206b8d..e4e7287fb 100644
--- a/_typos.toml
+++ b/_typos.toml
@@ -1,5 +1,10 @@
 [files]
 
+[default]
+extend-ignore-re = [
+    "@Ther-nul",  # valid Github user
+]
+
 [default.extend-identifiers]
 
 [type.py.extend-words]
diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index 51cbde208..ad5f01539 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -21,4 +21,4 @@
     "optim.optimizer.MockArgs": False,
 }
 
-__version__ = "0.43.2.dev"
+__version__ = "0.43.2"
diff --git a/setup.py b/setup.py
index f8d6a92a1..d2b78f6b5 100644
--- a/setup.py
+++ b/setup.py
@@ -25,7 +25,7 @@ def has_ext_modules(self):
 
 setup(
     name="bitsandbytes",
-    version="0.43.2.dev",
+    version="0.43.2",
     author="Tim Dettmers",
     author_email="dettmers@cs.washington.edu",
     description="k-bit optimizers and matrix multiplication routines.",

From a7c08afd70af46646ccdebcd6bb459b66f0e9e54 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Tue, 23 Jul 2024 18:26:24 +0000
Subject: [PATCH 32/54] bump version tag to next dev

---
 bitsandbytes/__init__.py | 2 +-
 setup.py                 | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index ad5f01539..a8acfbfc5 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -21,4 +21,4 @@
     "optim.optimizer.MockArgs": False,
 }
 
-__version__ = "0.43.2"
+__version__ = "0.43.3.dev"
diff --git a/setup.py b/setup.py
index d2b78f6b5..18de0fe5b 100644
--- a/setup.py
+++ b/setup.py
@@ -25,7 +25,7 @@ def has_ext_modules(self):
 
 setup(
     name="bitsandbytes",
-    version="0.43.2",
+    version="0.43.3.dev",
     author="Tim Dettmers",
     author_email="dettmers@cs.washington.edu",
     description="k-bit optimizers and matrix multiplication routines.",

From 9b726798542e01c45a7a4a841e144311980b90d6 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Tue, 23 Jul 2024 19:13:24 +0000
Subject: [PATCH 33/54] Changelog: add explanation regarding QLoRA mem savings

---
 CHANGELOG.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8ad648df1..e446155b0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,13 @@
 ### 0.43.2
 
+This release is quite significant as the QLoRA bug fix big implications for higher `seqlen` and batch sizes.
+
+For each sequence (i.e. batch size increase of one) we expect memory savings of:
+- 405B: 39GB for seqlen 1024, and 4888GB for 128k
+- 70B: 20.1GB for 1024 and 2516GB for 128k
+
+This was due to activations being unnecessary for frozen parameters, yet the memory for them was still erroneously allocated due to the now fixed bug.
+
 #### Improvements:
 
 - docs: FSDP+QLoRA and CPU install guide (#1211 #1227, thanks @stevhliu)

From 81375f8e67e9433c778fce3011930159357271c8 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Sat, 27 Jul 2024 13:11:00 +0000
Subject: [PATCH 34/54] docs: add more details to Intel install

---
 docs/source/installation.mdx | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index 5b2cfe1d3..2f8fe4db7 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -31,7 +31,7 @@ To install from PyPI.
 pip install bitsandbytes
 ```
 
-### Compile from source
+### Compile from source[[compile]]
 
 For Linux and Windows systems, you can compile bitsandbytes from source. Installing from source allows for more build options with different CMake configurations.
 
@@ -174,7 +174,18 @@ pip install -e .   # `-e` for "editable" install, when developing BNB (otherwise
 > [!TIP]
 > Intel CPU backend only supports building from source; for now, please follow the instructions below.
 
-Like CUDA, you can compile bitsandbytes from source for Linux and Windows systems. Installing from source allows for more build options with different CMake configurations.
+Similar to the CUDA case, you can compile bitsandbytes from source for Linux and Windows systems.
+
+The below commands are for Linux. For installing on Windows, please adapt the below commands according to the same pattern as described [the section above on compiling from source under the Windows tab](#compile).
+
+```
+git clone --branch multi-backend-refactor https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
+pip install intel_extension_for_pytorch
+pip install -r requirements-dev.txt
+cmake -DCOMPUTE_BACKEND=cpu -S .
+make
+pip install -e .   # `-e` for "editable" install, when developing BNB (otherwise leave that out)
+```
 
 </hfoption>
 <hfoption id="Apple Silicon (MPS)">

From 7800734637d9982b3aab9f4606f6f06456073a83 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Tue, 23 Jul 2024 19:13:24 +0000
Subject: [PATCH 35/54] Changelog: add explanation regarding QLoRA mem savings

---
 CHANGELOG.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8ad648df1..ed324f09e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,13 @@
 ### 0.43.2
 
+This release is quite significant as the QLoRA bug fix big implications for higher `seqlen` and batch sizes.
+
+For each sequence (i.e. batch size increase of one) we expect memory savings of:
+- 405B: 39GB for `seqlen=1024`, and 4888GB for `seqlen=128,00`
+- 70B: 10.1GB for `seqlen=1024` and  1258GB for `seqlen=128,00`
+
+This was due to activations being unnecessary for frozen parameters, yet the memory for them was still erroneously allocated due to the now fixed bug.
+
 #### Improvements:
 
 - docs: FSDP+QLoRA and CPU install guide (#1211 #1227, thanks @stevhliu)

From 24f7b652cec822849fba69c583b8e73d84446627 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Sat, 27 Jul 2024 14:08:30 +0000
Subject: [PATCH 36/54] docs: cleanup of compilation instructions

---
 docs/source/installation.mdx | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index 2f8fe4db7..f917f2623 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -2,7 +2,7 @@
 
 ## CUDA
 
-bitsandbytes is only supported on CUDA GPUs for CUDA versions **11.0 - 12.5**. There's a multi-backend effort under way which is currently in alpha release, see further down in this document.
+bitsandbytes is only supported on CUDA GPUs for CUDA versions **11.0 - 12.5**. However, there's a multi-backend effort under way which is currently in alpha release, check [the respective section below in case you're interested to help us with early feedback](#multi-backend).
 
 The latest version of bitsandbytes builds on:
 
@@ -134,7 +134,7 @@ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/YOUR_USERNAME/local/cuda-11.7
 
 3. Now when you launch bitsandbytes with these environment variables, the PyTorch CUDA version is overridden by the new CUDA version (in this example, version 11.7) and a different bitsandbytes library is loaded.
 
-## Multi-backend preview release (+ compilation)
+## Multi-backend preview release compilation[[multi-backend]]
 
 Please follow these steps to install bitsandbytes with device-specific backend support other than CUDA:
 
@@ -143,11 +143,10 @@ Please follow these steps to install bitsandbytes with device-specific backend s
 
 ### AMD GPU
 
-For a ROCm specific install:
+bitsandbytes is fully supported from ROCm 6.1 onwards (currently in alpha release).
 
-bitsandbytes is fully supported from ROCm 6.1.
-
-**Note:** If you already installed ROCm and PyTorch, skip docker steps below and please check that the torch version matches your ROCm install. To install torch for a specific ROCm version, please refer to step 3 of wheels install in [Installing PyTorch for ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/3rd-party/pytorch-install.html#using-wheels-package) guide.
+> [!TIP]
+> If you already installed ROCm and PyTorch, skip Docker steps below and please check that the torch version matches your ROCm install. To install torch for a specific ROCm version, please refer to step 3 of wheels install in [Installing PyTorch for ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/3rd-party/pytorch-install.html#using-wheels-package) guide.
 
 ```bash
 # Create a docker container with latest pytorch. It comes with ROCm and pytorch preinstalled
@@ -161,6 +160,7 @@ git clone --depth 1 -b multi-backend-refactor https://github.com/TimDettmers/bit
 pip install -r requirements-dev.txt
 
 # Compile & install
+apt-get install -y build-essential cmake  # install build tools dependencies, unless present
 cmake -DCOMPUTE_BACKEND=hip -S .  # Use -DBNB_ROCM_ARCH="gfx90a;gfx942" to target specific gpu arch
 make
 pip install -e .   # `-e` for "editable" install, when developing BNB (otherwise leave that out)
@@ -179,7 +179,7 @@ Similar to the CUDA case, you can compile bitsandbytes from source for Linux and
 The below commands are for Linux. For installing on Windows, please adapt the below commands according to the same pattern as described [the section above on compiling from source under the Windows tab](#compile).
 
 ```
-git clone --branch multi-backend-refactor https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
+git clone --depth 1 -b multi-backend-refactor https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
 pip install intel_extension_for_pytorch
 pip install -r requirements-dev.txt
 cmake -DCOMPUTE_BACKEND=cpu -S .

From e3b27805346b7d55a5ca4ba91fb374415c11dc05 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Sat, 27 Jul 2024 14:16:49 +0000
Subject: [PATCH 37/54] docs: CHANGELOG.md fix

---
 CHANGELOG.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e446155b0..ed324f09e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,8 +3,8 @@
 This release is quite significant as the QLoRA bug fix big implications for higher `seqlen` and batch sizes.
 
 For each sequence (i.e. batch size increase of one) we expect memory savings of:
-- 405B: 39GB for seqlen 1024, and 4888GB for 128k
-- 70B: 20.1GB for 1024 and 2516GB for 128k
+- 405B: 39GB for `seqlen=1024`, and 4888GB for `seqlen=128,00`
+- 70B: 10.1GB for `seqlen=1024` and  1258GB for `seqlen=128,00`
 
 This was due to activations being unnecessary for frozen parameters, yet the memory for them was still erroneously allocated due to the now fixed bug.
 

From c8b4b33ef40d240b9650268dfe6ae15ac5472664 Mon Sep 17 00:00:00 2001
From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com>
Date: Sat, 27 Jul 2024 23:28:30 +0800
Subject: [PATCH 38/54] fix dtype mismatch (#1285)

---
 bitsandbytes/backends/cpu_xpu_common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index c936dce14..04755ed2d 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -524,7 +524,7 @@ def gemm_4bit_impl(
         output = torch.ops.torch_ipex.ipex_woq_linear(A, state.op_context.get_data_handle())
     else:
         dqB = dequantize_4bit_impl(B, state, blocksize=state.blocksize)
-        output = torch.matmul(A, dqB)
+        output = torch.matmul(A, dqB.to(A.dtype))
     if out is not None:
         out.copy_(output)
     else:

From fd655b02663d1f692734b1a6376421dfbe1064b9 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Mon, 29 Jul 2024 21:00:16 +0000
Subject: [PATCH 39/54] Add ROCm version to .so filename

---
 CMakeLists.txt             | 2 ++
 bitsandbytes/cextension.py | 6 ++++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3bedefd51..c526678c7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -192,7 +192,9 @@ elseif(BUILD_HIP)
     # get hip version
     execute_process(COMMAND hipconfig --version OUTPUT_VARIABLE HIP_CONFIG_VERSION)
     string(REGEX MATCH "[0-9]+\\.[0-9]+" HIP_VERSION "${HIP_CONFIG_VERSION}")
+    string(REPLACE "." "" HIP_VERSION_SHORT "${HIP_VERSION}")
 
+    string(APPEND BNB_OUTPUT_NAME "${HIP_VERSION_SHORT}")
     if(NO_CUBLASLT OR HIP_VERSION VERSION_LESS "6.1")
         string(APPEND BNB_OUTPUT_NAME "_nohipblaslt")
     endif()
diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py
index 03d2cbd61..6e391a752 100644
--- a/bitsandbytes/cextension.py
+++ b/bitsandbytes/cextension.py
@@ -38,9 +38,9 @@ def get_cuda_bnb_library_path(cuda_specs: CUDASpecs) -> Path:
     """
     if torch.version.hip:
         if BNB_HIP_VERSION < 601:
-            return PACKAGE_DIR / f"libbitsandbytes_hip_nohipblaslt{DYNAMIC_LIBRARY_SUFFIX}"
+            return PACKAGE_DIR / f"libbitsandbytes_hip{BNB_HIP_VERSION_SHORT}_nohipblaslt{DYNAMIC_LIBRARY_SUFFIX}"
         else:
-            return PACKAGE_DIR / f"libbitsandbytes_hip{DYNAMIC_LIBRARY_SUFFIX}"
+            return PACKAGE_DIR / f"libbitsandbytes_hip{BNB_HIP_VERSION_SHORT}{DYNAMIC_LIBRARY_SUFFIX}"
     library_name = f"libbitsandbytes_cuda{cuda_specs.cuda_version_string}"
     if not cuda_specs.has_cublaslt:
         # if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt
@@ -119,8 +119,10 @@ def get_native_library() -> BNBNativeLibrary:
     if torch.version.hip:
         hip_major, hip_minor = map(int, torch.version.hip.split(".")[0:2])
         HIP_ENVIRONMENT, BNB_HIP_VERSION = True, hip_major * 100 + hip_minor
+        BNB_HIP_VERSION_SHORT = str(hip_major) + str(hip_minor)
     else:
         HIP_ENVIRONMENT, BNB_HIP_VERSION = False, 0
+        BNB_HIP_VERSION_SHORT = ""
     lib = get_native_library()
 except Exception as e:
     lib = None

From 6b77f4c3f7aa042518d566489e13b774c96f68e3 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Mon, 29 Jul 2024 21:16:57 +0000
Subject: [PATCH 40/54] Add rocm_version to whls build

---
 .github/scripts/build-rocm.sh        | 3 ++-
 .github/workflows/python-package.yml | 4 +++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/.github/scripts/build-rocm.sh b/.github/scripts/build-rocm.sh
index 616e8c250..cc15210fd 100644
--- a/.github/scripts/build-rocm.sh
+++ b/.github/scripts/build-rocm.sh
@@ -1,10 +1,11 @@
 #!/bin/bash
 declare build_arch
 declare build_os
+declare rocm_version
 
 set -xeuo pipefail
 if [ "${build_os:0:6}" == ubuntu ]; then
-	image=rocm/dev-ubuntu-22.04:6.1-complete
+	image=rocm/dev-ubuntu-22.04:${rocm_version}-complete
 	echo "Using image $image"
 	docker run --rm --platform "linux/$build_arch" -i \
 		-w /src -v "$PWD:/src" "$image" sh -c \
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 0b0b35416..cab735562 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -108,6 +108,8 @@ jobs:
       matrix:
         os: [ubuntu-latest]
         arch: [x86_64]
+        rocm_version:
+          ["6.1.2"]
     runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents
     steps:
       - uses: actions/checkout@v4
@@ -128,7 +130,7 @@ jobs:
       - name: Upload build artifact
         uses: actions/upload-artifact@v4
         with:
-          name: shared_library_rocm_${{ matrix.os }}_${{ matrix.arch }}
+          name: shared_library_rocm_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.rocm_version }}
           path: output/*
           retention-days: 7
   build-wheels:

From 78324b32075b7ae6076c304e1dfd5f71db01704b Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Mon, 29 Jul 2024 21:23:24 +0000
Subject: [PATCH 41/54] Revert "Remove cuda build temporarily"

This reverts commit 1413c5f3a2aed51140b86daa8ee9283c67cce738.
---
 .github/workflows/python-package.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index cab735562..6a4a6205b 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -136,6 +136,7 @@ jobs:
   build-wheels:
     needs:
       - build-shared-libs
+      - build-shared-libs-cuda
       - build-shared-libs-rocm
     strategy:
       matrix:

From c146b8b8f2fe9d6fec5f1f1b8da25b1ec60d6ac6 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Mon, 29 Jul 2024 21:36:41 +0000
Subject: [PATCH 42/54] Add rocm_version env var

---
 .github/workflows/python-package.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 6a4a6205b..f4cc5486b 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -127,6 +127,7 @@ jobs:
         env:
           build_os: ${{ matrix.os }}
           build_arch: ${{ matrix.arch }}
+          rocm_version: ${{ matrix.rocm_version }}
       - name: Upload build artifact
         uses: actions/upload-artifact@v4
         with:

From d6c3df47a46d55c093e9bd8bf61ee3489bac605e Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Tue, 30 Jul 2024 05:01:18 +0000
Subject: [PATCH 43/54] Remove thrust header files

---
 csrc/kernels.hip | 2 --
 csrc/ops_hip.cuh | 6 ------
 2 files changed, 8 deletions(-)

diff --git a/csrc/kernels.hip b/csrc/kernels.hip
index ca77dceda..d8d7cdba5 100644
--- a/csrc/kernels.hip
+++ b/csrc/kernels.hip
@@ -10,8 +10,6 @@
 #include <hipcub/hipcub.hpp>
 #include <hip/hip_math_constants.h>
 
-#include <thrust/host_vector.h>
-#include <thrust/device_vector.h>
 //#include <mma.h>
 
 
diff --git a/csrc/ops_hip.cuh b/csrc/ops_hip.cuh
index 1b9c13063..e57cbb3b5 100644
--- a/csrc/ops_hip.cuh
+++ b/csrc/ops_hip.cuh
@@ -21,12 +21,6 @@
 #include <vector>
 #include <functional>
 
-/*
-#include <thrust/host_vector.h>
-#include <thrust/device_vector.h>
-*/
-
-
 #define CUDA_CHECK_RETURN(value) {                      \
   hipError_t _m_cudaStat = value;                    \
   if (_m_cudaStat != hipSuccess) {                   \

From 7e9a65c33f66fffcb14ee2438170718777c06022 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Tue, 30 Jul 2024 06:44:49 +0000
Subject: [PATCH 44/54] Print node info

---
 .github/scripts/build-rocm.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/scripts/build-rocm.sh b/.github/scripts/build-rocm.sh
index cc15210fd..8aac2c04b 100644
--- a/.github/scripts/build-rocm.sh
+++ b/.github/scripts/build-rocm.sh
@@ -11,7 +11,7 @@ if [ "${build_os:0:6}" == ubuntu ]; then
 		-w /src -v "$PWD:/src" "$image" sh -c \
 		"apt-get update \
       && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
-      && cmake -DCOMPUTE_BACKEND=hip . \
+      && rocminfo && cmake -DCOMPUTE_BACKEND=hip . \
       && cmake --build ."
 fi
 

From cdb209a2eb896d9c4166f53e9b2aa580c10e42c0 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Tue, 30 Jul 2024 06:51:34 +0000
Subject: [PATCH 45/54] print cuda node info

---
 .github/scripts/build-cuda.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/scripts/build-cuda.sh b/.github/scripts/build-cuda.sh
index 0f9b8d726..fc79a92f6 100644
--- a/.github/scripts/build-cuda.sh
+++ b/.github/scripts/build-cuda.sh
@@ -15,7 +15,7 @@ for NO_CUBLASLT in ON OFF; do
 		docker run --platform "linux/$build_arch" -i -w /src -v "$PWD:/src" "$image" sh -c \
 			"apt-get update \
       && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
-      && cmake -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY=\"${build_capability}\" -DNO_CUBLASLT=${NO_CUBLASLT} . \
+      && nvidia-smi && cmake -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY=\"${build_capability}\" -DNO_CUBLASLT=${NO_CUBLASLT} . \
       && cmake --build ."
 	else
 		pip install cmake==3.28.3

From 77e149917dd4bb5be87099289edf53421fef6fe8 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Tue, 30 Jul 2024 06:56:11 +0000
Subject: [PATCH 46/54] Revert "print cuda node info"

This reverts commit cdb209a2eb896d9c4166f53e9b2aa580c10e42c0.
---
 .github/scripts/build-cuda.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/scripts/build-cuda.sh b/.github/scripts/build-cuda.sh
index fc79a92f6..0f9b8d726 100644
--- a/.github/scripts/build-cuda.sh
+++ b/.github/scripts/build-cuda.sh
@@ -15,7 +15,7 @@ for NO_CUBLASLT in ON OFF; do
 		docker run --platform "linux/$build_arch" -i -w /src -v "$PWD:/src" "$image" sh -c \
 			"apt-get update \
       && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
-      && nvidia-smi && cmake -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY=\"${build_capability}\" -DNO_CUBLASLT=${NO_CUBLASLT} . \
+      && cmake -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY=\"${build_capability}\" -DNO_CUBLASLT=${NO_CUBLASLT} . \
       && cmake --build ."
 	else
 		pip install cmake==3.28.3

From 7c9190990478d3980eece86909b5faf4d36b3e16 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Tue, 30 Jul 2024 06:56:54 +0000
Subject: [PATCH 47/54] Revert "Print node info"

This reverts commit 7e9a65c33f66fffcb14ee2438170718777c06022.
---
 .github/scripts/build-rocm.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/scripts/build-rocm.sh b/.github/scripts/build-rocm.sh
index 8aac2c04b..cc15210fd 100644
--- a/.github/scripts/build-rocm.sh
+++ b/.github/scripts/build-rocm.sh
@@ -11,7 +11,7 @@ if [ "${build_os:0:6}" == ubuntu ]; then
 		-w /src -v "$PWD:/src" "$image" sh -c \
 		"apt-get update \
       && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
-      && rocminfo && cmake -DCOMPUTE_BACKEND=hip . \
+      && cmake -DCOMPUTE_BACKEND=hip . \
       && cmake --build ."
 fi
 

From b78b3400b59e2f40dad5ba4f676e7ffd46dff978 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Tue, 30 Jul 2024 07:14:18 +0000
Subject: [PATCH 48/54] Add rocm arch to compile command

---
 .github/scripts/build-rocm.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/scripts/build-rocm.sh b/.github/scripts/build-rocm.sh
index cc15210fd..a5933da3f 100644
--- a/.github/scripts/build-rocm.sh
+++ b/.github/scripts/build-rocm.sh
@@ -4,6 +4,7 @@ declare build_os
 declare rocm_version
 
 set -xeuo pipefail
+bnb_rocm_arch="gfx906;gfx908;gfx90a;gfx942;gfx1100;gfx1030"
 if [ "${build_os:0:6}" == ubuntu ]; then
 	image=rocm/dev-ubuntu-22.04:${rocm_version}-complete
 	echo "Using image $image"
@@ -11,7 +12,7 @@ if [ "${build_os:0:6}" == ubuntu ]; then
 		-w /src -v "$PWD:/src" "$image" sh -c \
 		"apt-get update \
       && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
-      && cmake -DCOMPUTE_BACKEND=hip . \
+      && cmake -DCOMPUTE_BACKEND=hip -DBNB_ROCM_ARCH=\"${bnb_rocm_arch}\" . \
       && cmake --build ."
 fi
 

From a62b9d454b25f5f7933e7793fd553cc81699a925 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Tue, 30 Jul 2024 19:21:47 +0000
Subject: [PATCH 49/54] Rename .so files to rocm

---
 CMakeLists.txt             | 2 +-
 bitsandbytes/cextension.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7a7671239..0891f75b0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -194,7 +194,7 @@ elseif(BUILD_HIP)
 
     list(APPEND SRC_FILES ${HIP_FILES})
 
-    string(APPEND BNB_OUTPUT_NAME "_hip")
+    string(APPEND BNB_OUTPUT_NAME "_rocm")
 
     # get hip version
     execute_process(COMMAND hipconfig --version OUTPUT_VARIABLE HIP_CONFIG_VERSION)
diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py
index 6e391a752..a096d0d51 100644
--- a/bitsandbytes/cextension.py
+++ b/bitsandbytes/cextension.py
@@ -38,9 +38,9 @@ def get_cuda_bnb_library_path(cuda_specs: CUDASpecs) -> Path:
     """
     if torch.version.hip:
         if BNB_HIP_VERSION < 601:
-            return PACKAGE_DIR / f"libbitsandbytes_hip{BNB_HIP_VERSION_SHORT}_nohipblaslt{DYNAMIC_LIBRARY_SUFFIX}"
+            return PACKAGE_DIR / f"libbitsandbytes_rocm{BNB_HIP_VERSION_SHORT}_nohipblaslt{DYNAMIC_LIBRARY_SUFFIX}"
         else:
-            return PACKAGE_DIR / f"libbitsandbytes_hip{BNB_HIP_VERSION_SHORT}{DYNAMIC_LIBRARY_SUFFIX}"
+            return PACKAGE_DIR / f"libbitsandbytes_rocm{BNB_HIP_VERSION_SHORT}{DYNAMIC_LIBRARY_SUFFIX}"
     library_name = f"libbitsandbytes_cuda{cuda_specs.cuda_version_string}"
     if not cuda_specs.has_cublaslt:
         # if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt

From 9059bff540eac1b871eca220120c37a32186d481 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Tue, 30 Jul 2024 19:27:12 +0000
Subject: [PATCH 50/54] Update default gpu arch

---
 .github/scripts/build-rocm.sh | 2 +-
 CMakeLists.txt                | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/scripts/build-rocm.sh b/.github/scripts/build-rocm.sh
index a5933da3f..b508fac69 100644
--- a/.github/scripts/build-rocm.sh
+++ b/.github/scripts/build-rocm.sh
@@ -4,7 +4,7 @@ declare build_os
 declare rocm_version
 
 set -xeuo pipefail
-bnb_rocm_arch="gfx906;gfx908;gfx90a;gfx942;gfx1100;gfx1030"
+bnb_rocm_arch="gfx90a;gfx942;gfx1100"
 if [ "${build_os:0:6}" == ubuntu ]; then
 	image=rocm/dev-ubuntu-22.04:${rocm_version}-complete
 	echo "Using image $image"
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0891f75b0..eac72fe52 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -185,7 +185,7 @@ elseif(BUILD_HIP)
       set(CMAKE_HIP_ARCHITECTURES ${BNB_ROCM_ARCH})
     else()
       if (NOT AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
-        set(CMAKE_HIP_ARCHITECTURES "gfx908;gfx90a;gfx940;gfx941;gfx942")
+        set(CMAKE_HIP_ARCHITECTURES "gfx90a;gfx942;gfx1100")
       elseif (AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
         set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
       endif()

From c5a406ad9aed7738a236c9c184a4b1bf2ccd422c Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Tue, 30 Jul 2024 19:48:32 +0000
Subject: [PATCH 51/54] Skip cpu based igemmlt int tests on ROCm

---
 tests/test_functional.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/test_functional.py b/tests/test_functional.py
index 4e82c530a..a9d926b89 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -584,6 +584,9 @@ def test_nvidia_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, trans
 @pytest.mark.parametrize("ldb", (0,), ids=id_formatter("ldb"))
 @pytest.mark.parametrize("device", ("cuda", "cpu"), ids=id_formatter("device"))
 def test_igemmlt_int(dim1, dim2, dim3, dim4, dims, ldb, device):
+    if HIP_ENVIRONMENT and device == "cpu":
+        pytest.skip("this test is not supported on ROCm yet")
+
     for i in range(k):
         if dims == 2:
             A = torch.randint(-128, 127, size=(dim1, dim3), device=device).to(torch.int8)

From 9cbb5e12a8987c57188917e5353e46492ef8d1eb Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Tue, 30 Jul 2024 21:19:53 +0000
Subject: [PATCH 52/54] Update Documentation

---
 docs/source/installation.mdx | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index f917f2623..0daa5e279 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -146,13 +146,23 @@ Please follow these steps to install bitsandbytes with device-specific backend s
 bitsandbytes is fully supported from ROCm 6.1 onwards (currently in alpha release).
 
 > [!TIP]
-> If you already installed ROCm and PyTorch, skip Docker steps below and please check that the torch version matches your ROCm install. To install torch for a specific ROCm version, please refer to step 3 of wheels install in [Installing PyTorch for ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/3rd-party/pytorch-install.html#using-wheels-package) guide.
+> If you would like to install ROCm and PyTorch on bare metal, skip the Docker steps and refer to our official guides at [ROCm installation overview](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/install-overview.html#rocm-install-overview) and [Installing PyTorch for ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/3rd-party/pytorch-install.html#using-wheels-package) (Step 3 of wheels build for quick installation). Please make sure to get the PyTorch wheel that matches the installed ROCm version.
 
 ```bash
-# Create a docker container with latest pytorch. It comes with ROCm and pytorch preinstalled
-docker pull rocm/pytorch:latest
-docker run -it --device=/dev/kfd --device=/dev/dri --group-add video rocm/pytorch:latest
+# Create a docker container with latest ROCm image, which includes ROCm libraries
+docker pull rocm/dev-ubuntu-22.04:6.1.2-complete
+docker run -it --device=/dev/kfd --device=/dev/dri --group-add video rocm/dev-ubuntu-22.04:6.1.2-complete
+apt-get update && apt-get install -y git && cd home
 
+# Install pytorch compatible with above ROCm version
+pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.1/
+
+# Install bitsandbytes from PyPI
+# (This is supported on Ubuntu 22.04, Python 3.10, ROCm 6.1.0/6.1.1/6.1.2 and gpu arch - gfx90a, gfx942, gfx1100
+# Please install from source as given below if your configuration doesn't match with these)
+pip install bitsandbytes
+
+# Install bitsandbytes from source
 # Clone bitsandbytes repo, ROCm backend is currently enabled on multi-backend-refactor branch
 git clone --depth 1 -b multi-backend-refactor https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
 

From 358062473b3ec475ecf14dd7effc2b02754ff947 Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Tue, 30 Jul 2024 21:23:07 +0000
Subject: [PATCH 53/54] Update upstream repo name

---
 docs/source/installation.mdx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index 0daa5e279..a71fe3261 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -164,7 +164,7 @@ pip install bitsandbytes
 
 # Install bitsandbytes from source
 # Clone bitsandbytes repo, ROCm backend is currently enabled on multi-backend-refactor branch
-git clone --depth 1 -b multi-backend-refactor https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
+git clone --depth 1 -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/
 
 # Install dependencies
 pip install -r requirements-dev.txt

From 3bde1b7bc3b52163d8f35bf654e933879213992e Mon Sep 17 00:00:00 2001
From: Prasanth Nunna <Prasanth.Nunna@amd.com>
Date: Tue, 30 Jul 2024 21:28:39 +0000
Subject: [PATCH 54/54] Update docs

---
 docs/source/installation.mdx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index a71fe3261..3ed694ac1 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -159,7 +159,7 @@ pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/roc
 
 # Install bitsandbytes from PyPI
 # (This is supported on Ubuntu 22.04, Python 3.10, ROCm 6.1.0/6.1.1/6.1.2 and gpu arch - gfx90a, gfx942, gfx1100
-# Please install from source as given below if your configuration doesn't match with these)
+# Please install from source if your configuration doesn't match these)
 pip install bitsandbytes
 
 # Install bitsandbytes from source