From a9a1c44abaf671f06794f6664fe3ca47bcade5e5 Mon Sep 17 00:00:00 2001
From: EtienneDosSantos <130935112+EtienneDosSantos@users.noreply.github.com>
Date: Sun, 26 May 2024 15:43:24 +0200
Subject: [PATCH 01/26] Add `"lamb"` to `str2optimizer32bit`

---
 bitsandbytes/functional.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index f915223ca..dc1490482 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -49,6 +49,10 @@ def prod(iterable):
             lib.cadagrad32bit_grad_32,
             lib.cadagrad32bit_grad_16,
         ),
+        "lamb": (
+            lib.cadam32bit_grad_fp32,
+            lib.cadam32bit_grad_fp16,
+            ),
     }
 
     str2optimizer8bit = {

From 2e46eefcb214cffc0fb9d6ace71f53924f9c7873 Mon Sep 17 00:00:00 2001
From: EtienneDosSantos <130935112+EtienneDosSantos@users.noreply.github.com>
Date: Tue, 28 May 2024 18:35:31 +0200
Subject: [PATCH 02/26] Sorted alphabetically for better overview

---
 bitsandbytes/functional.py | 64 +++++++++++++++++++-------------------
 1 file changed, 32 insertions(+), 32 deletions(-)

diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index dc1490482..0b1e7d5c4 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -27,11 +27,24 @@ def prod(iterable):
 if lib and lib.compiled_with_cuda:
     """C FUNCTIONS FOR OPTIMIZERS"""
     str2optimizer32bit = {
+        "adagrad": (
+            lib.cadagrad32bit_grad_32,
+            lib.cadagrad32bit_grad_16,
+        ),
         "adam": (
             lib.cadam32bit_grad_fp32,
             lib.cadam32bit_grad_fp16,
             lib.cadam32bit_grad_bf16,
         ),
+        "lamb": (
+            lib.cadam32bit_grad_fp32,
+            lib.cadam32bit_grad_fp16,
+        ),
+        "lion": (
+            lib.clion32bit_grad_fp32,
+            lib.clion32bit_grad_fp16,
+            lib.clion32bit_grad_bf16,
+        ),
         "momentum": (
             lib.cmomentum32bit_grad_32,
             lib.cmomentum32bit_grad_16,
@@ -40,19 +53,6 @@ def prod(iterable):
             lib.crmsprop32bit_grad_32,
             lib.crmsprop32bit_grad_16,
         ),
-        "lion": (
-            lib.clion32bit_grad_fp32,
-            lib.clion32bit_grad_fp16,
-            lib.clion32bit_grad_bf16,
-        ),
-        "adagrad": (
-            lib.cadagrad32bit_grad_32,
-            lib.cadagrad32bit_grad_16,
-        ),
-        "lamb": (
-            lib.cadam32bit_grad_fp32,
-            lib.cadam32bit_grad_fp16,
-            ),
     }
 
     str2optimizer8bit = {
@@ -60,34 +60,43 @@ def prod(iterable):
             lib.cadam_static_8bit_grad_32,
             lib.cadam_static_8bit_grad_16,
         ),
-        "momentum": (
+        "lamb": (
+            lib.cadam_static_8bit_grad_32,
+            lib.cadam_static_8bit_grad_16,
+        ),
+        "lars": (
             lib.cmomentum_static_8bit_grad_32,
             lib.cmomentum_static_8bit_grad_16,
         ),
-        "rmsprop": (
-            lib.crmsprop_static_8bit_grad_32,
-            lib.crmsprop_static_8bit_grad_16,
-        ),
         "lion": (
             lib.clion_static_8bit_grad_32,
             lib.clion_static_8bit_grad_16,
         ),
-        "lamb": (
-            lib.cadam_static_8bit_grad_32,
-            lib.cadam_static_8bit_grad_16,
-        ),
-        "lars": (
+        "momentum": (
             lib.cmomentum_static_8bit_grad_32,
             lib.cmomentum_static_8bit_grad_16,
         ),
+        "rmsprop": (
+            lib.crmsprop_static_8bit_grad_32,
+            lib.crmsprop_static_8bit_grad_16,
+        ),
     }
 
     str2optimizer8bit_blockwise = {
+        "adagrad": (
+            lib.cadagrad_8bit_blockwise_grad_fp32,
+            lib.cadagrad_8bit_blockwise_grad_fp16,
+        ),
         "adam": (
             lib.cadam_8bit_blockwise_grad_fp32,
             lib.cadam_8bit_blockwise_grad_fp16,
             lib.cadam_8bit_blockwise_grad_bf16,
         ),
+        "lion": (
+            lib.clion_8bit_blockwise_grad_fp32,
+            lib.clion_8bit_blockwise_grad_fp16,
+            lib.clion_8bit_blockwise_grad_bf16,
+        ),
         "momentum": (
             lib.cmomentum_8bit_blockwise_grad_fp32,
             lib.cmomentum_8bit_blockwise_grad_fp16,
@@ -96,15 +105,6 @@ def prod(iterable):
             lib.crmsprop_8bit_blockwise_grad_fp32,
             lib.crmsprop_8bit_blockwise_grad_fp16,
         ),
-        "lion": (
-            lib.clion_8bit_blockwise_grad_fp32,
-            lib.clion_8bit_blockwise_grad_fp16,
-            lib.clion_8bit_blockwise_grad_bf16,
-        ),
-        "adagrad": (
-            lib.cadagrad_8bit_blockwise_grad_fp32,
-            lib.cadagrad_8bit_blockwise_grad_fp16,
-        ),
     }
 
 
From 7a338db2eccbd60b7da3b7bed9c927117c6b3806 Mon Sep 17 00:00:00 2001
From: EtienneDosSantos <130935112+EtienneDosSantos@users.noreply.github.com>
Date: Tue, 28 May 2024 19:53:57 +0200
Subject: [PATCH 03/26] Update functional.py

---
 bitsandbytes/functional.py | 88 ++++++++++++++++++++++++++++++--------
 1 file changed, 70 insertions(+), 18 deletions(-)

diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 0b1e7d5c4..bbfbf0007 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -28,57 +28,94 @@ def prod(iterable):
     """C FUNCTIONS FOR OPTIMIZERS"""
     str2optimizer32bit = {
         "adagrad": (
-            lib.cadagrad32bit_grad_32,
-            lib.cadagrad32bit_grad_16,
+            lib.cadagrad32bit_grad_fp32,
+            lib.cadagrad32bit_grad_fp16,
         ),
         "adam": (
             lib.cadam32bit_grad_fp32,
             lib.cadam32bit_grad_fp16,
             lib.cadam32bit_grad_bf16,
         ),
+        "pagedadam": (
+            lib.cpagedadam32bit_grad_fp32,
+            lib.cpagedadam32bit_grad_fp16,
+            lib.cpagedadam32bit_grad_bf16,
+        ),
+        "adamw": (
+            lib.cadam32bit_grad_fp32,
+            lib.cadam32bit_grad_fp16,
+            lib.cadam32bit_grad_bf16,
+        ),
+        "pagedadamw": (
+            lib.cpagedadam32bit_grad_fp32,
+            lib.cpagedadam32bit_grad_fp16,
+            lib.cpagedadam32bit_grad_bf16,
+        ),
         "lamb": (
             lib.cadam32bit_grad_fp32,
             lib.cadam32bit_grad_fp16,
         ),
+        "lars": (
+            lib.clars32bit_grad_fp32,
+            lib.clars32bit_grad_fp16,
+        ),
         "lion": (
             lib.clion32bit_grad_fp32,
             lib.clion32bit_grad_fp16,
             lib.clion32bit_grad_bf16,
         ),
         "momentum": (
-            lib.cmomentum32bit_grad_32,
-            lib.cmomentum32bit_grad_16,
+            lib.cmomentum32bit_grad_fp32,
+            lib.cmomentum32bit_grad_fp16,
         ),
         "rmsprop": (
-            lib.crmsprop32bit_grad_32,
-            lib.crmsprop32bit_grad_16,
+            lib.crmsprop32bit_grad_fp32,
+            lib.crmsprop32bit_grad_fp16,
         ),
     }
 
     str2optimizer8bit = {
+        "adagrad": (
+            lib.cadagrad8bit_grad_fp32,
+            lib.cadagrad8bit_grad_fp16,
+        ),
         "adam": (
-            lib.cadam_static_8bit_grad_32,
-            lib.cadam_static_8bit_grad_16,
+            lib.cadam_static_8bit_grad_fp32,
+            lib.cadam_static_8bit_grad_fp16,
+        ),
+        "pagedadam": (
+            lib.cpagedadam8bit_grad_fp32,
+            lib.cpagedadam8bit_grad_fp16,
+            lib.cpagedadam8bit_grad_bf16,
+        ),
+        "adamw": (
+            lib.cadam_static_8bit_grad_fp32,
+            lib.cadam_static_8bit_grad_fp16,
+        ),
+        "pagedadamw": (
+            lib.cpagedadam8bit_grad_fp32,
+            lib.cpagedadam8bit_grad_fp16,
+            lib.cpagedadam8bit_grad_bf16,
         ),
         "lamb": (
-            lib.cadam_static_8bit_grad_32,
-            lib.cadam_static_8bit_grad_16,
+            lib.cadam_static_8bit_grad_fp32,
+            lib.cadam_static_8bit_grad_fp16,
         ),
         "lars": (
-            lib.cmomentum_static_8bit_grad_32,
-            lib.cmomentum_static_8bit_grad_16,
+            lib.clars8bit_grad_fp32,
+            lib.clars8bit_grad_fp16,
         ),
         "lion": (
-            lib.clion_static_8bit_grad_32,
-            lib.clion_static_8bit_grad_16,
+            lib.clion_static_8bit_grad_fp32,
+            lib.clion_static_8bit_grad_fp16,
         ),
         "momentum": (
-            lib.cmomentum_static_8bit_grad_32,
-            lib.cmomentum_static_8bit_grad_16,
+            lib.cmomentum_static_8bit_grad_fp32,
+            lib.cmomentum_static_8bit_grad_fp16,
         ),
         "rmsprop": (
-            lib.crmsprop_static_8bit_grad_32,
-            lib.crmsprop_static_8bit_grad_16,
+            lib.crmsprop_static_8bit_grad_fp32,
+            lib.crmsprop_static_8bit_grad_fp16,
         ),
     }
 
@@ -92,6 +129,21 @@ def prod(iterable):
             lib.cadam_8bit_blockwise_grad_fp16,
             lib.cadam_8bit_blockwise_grad_bf16,
         ),
+        "pagedadam": (
+            lib.cpagedadam8bit_blockwise_fp32,
+            lib.cpagedadam8bit_blockwise_fp16,
+            lib.cpagedadam8bit_blockwise_bf16,
+        ),
+        "adamw": (
+            lib.cadam_8bit_blockwise_grad_fp32,
+            lib.cadam_8bit_blockwise_grad_fp16,
+            lib.cadam_8bit_blockwise_grad_bf16,
+        ),
+        "pagedadamw": (
+            lib.cpagedadam8bit_blockwise_fp32,
+            lib.cpagedadam8bit_blockwise_fp16,
+            lib.cpagedadam8bit_blockwise_bf16,
+        ),
         "lion": (
             lib.clion_8bit_blockwise_grad_fp32,
             lib.clion_8bit_blockwise_grad_fp16,

From 2fb212bdf983891451db73a4c4bef6c91ec0786d Mon Sep 17 00:00:00 2001
From: Benjamin Bossan <benjamin.bossan@gmail.com>
Date: Wed, 29 May 2024 15:36:44 +0200
Subject: [PATCH 04/26] FIX Prevent __getstate__ from mutating Params4bit

As discussed internally, use state = self.__dict__.copy(), which is also
what the Python docs recommend.
---
 bitsandbytes/nn/modules.py |  2 +-
 tests/test_linear4bit.py   | 15 +++++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 24a155ab1..df347ebba 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -236,7 +236,7 @@ def __new__(
         return self
 
     def __getstate__(self):
-        state = self.__dict__
+        state = self.__dict__.copy()
         state["data"] = self.data
         state["requires_grad"] = self.requires_grad
         return state
diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py
index bbbd05335..2f094be27 100644
--- a/tests/test_linear4bit.py
+++ b/tests/test_linear4bit.py
@@ -186,19 +186,30 @@ def test_copy_param():
 def test_deepcopy_param():
     tensor = torch.tensor([1.0, 2.0, 3.0, 4.0])
     param = bnb.nn.Params4bit(data=tensor, requires_grad=False).cuda(0)
+    dict_keys_before = set(param.__dict__.keys())
     copy_param = copy.deepcopy(param)
+    dict_keys_after = set(param.__dict__.keys())
+    dict_keys_copy = set(copy_param.__dict__.keys())
+
     assert param.quant_state is not copy_param.quant_state
     assert param.data.data_ptr() != copy_param.data.data_ptr()
 
+    # there was a bug where deepcopy would modify the original object
+    assert dict_keys_before == dict_keys_after
+    assert dict_keys_before == dict_keys_copy
+
 
 def test_params4bit_real_serialization():
     original_tensor = torch.tensor([1.0, 2.0, 3.0, 4.0], dtype=torch.float32)
     original_param = bnb.nn.Params4bit(data=original_tensor, quant_type="fp4")
+    dict_keys_before = set(original_param.__dict__.keys())
 
     original_param.cuda(0)  # move to CUDA to trigger quantization
 
     serialized_param = pickle.dumps(original_param)
     deserialized_param = pickle.loads(serialized_param)
+    dict_keys_after = set(original_param.__dict__.keys())
+    dict_keys_deserialized = set(deserialized_param.__dict__.keys())
 
     assert torch.equal(original_param.data, deserialized_param.data)
     assert original_param.requires_grad == deserialized_param.requires_grad == False
@@ -206,3 +217,7 @@ def test_params4bit_real_serialization():
     assert original_param.blocksize == deserialized_param.blocksize
     assert original_param.compress_statistics == deserialized_param.compress_statistics
     assert original_param.quant_state == deserialized_param.quant_state
+
+    # there was a bug where pickling would modify the original object
+    assert dict_keys_before == dict_keys_after
+    assert dict_keys_before == dict_keys_deserialized

From ed99b3c118d73e829e7581c04a414b80ee2b7030 Mon Sep 17 00:00:00 2001
From: Benjamin Bossan <benjamin.bossan@gmail.com>
Date: Thu, 30 May 2024 17:03:48 +0200
Subject: [PATCH 05/26] FIX Make Int8Params deepcopy-able

This requires implementing the __deepcopy__ method in Int8Params.
Moreover, there was an issue in the Linear8BitLT constructor that would
assign instance attributes to the class, which is now fixed.

Please review carefully that this does not impact existing code.

Tests that I ran:

- pytest tests/test_linear8bitlt.py
- in PEFT: python -m pytest -m "single_gpu_tests and bitsandbytes" tests/test_gpu_examples.py
- in PEFT: python -m pytest -m "single_gpu_tests and bitsandbytes" tests/test_common_gpu.py
- in transformers: RUN_SLOW=1 python -m pytest tests/quantization/bnb -x
---
 bitsandbytes/nn/modules.py | 19 ++++++++++---
 tests/test_linear8bitlt.py | 58 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+), 4 deletions(-)

diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 24a155ab1..e44c77ac6 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -560,13 +560,12 @@ def __new__(
         CB=None,
         SCB=None,
     ):
-        cls.has_fp16_weights = has_fp16_weights
-        cls.CB = None
-        cls.SCB = None
         if data is None:
             data = torch.empty(0)
         obj = torch.Tensor._make_subclass(cls, data, requires_grad)
-        obj.CB, obj.SCB = cls.CB, cls.SCB
+        obj.CB = CB
+        obj.SCB = SCB
+        obj.has_fp16_weights = has_fp16_weights
         return obj
 
     def cuda(self, device):
@@ -585,6 +584,18 @@ def cuda(self, device):
 
         return self
 
+    def __deepcopy__(self, memo):
+        # adjust this if new arguments are added to the constructor
+        new_instance = type(self).__new__(
+            type(self),
+            data=copy.deepcopy(self.data, memo),
+            requires_grad=self.requires_grad,
+            has_fp16_weights=self.has_fp16_weights,
+            CB=copy.deepcopy(self.CB, memo),
+            SCB=copy.deepcopy(self.SCB, memo),
+        )
+        return new_instance
+
     @overload
     def to(
         self: T,
diff --git a/tests/test_linear8bitlt.py b/tests/test_linear8bitlt.py
index 4b62abd6d..e55abe110 100644
--- a/tests/test_linear8bitlt.py
+++ b/tests/test_linear8bitlt.py
@@ -1,5 +1,7 @@
 from contextlib import nullcontext
+import copy
 import os
+import pickle
 from tempfile import TemporaryDirectory
 
 import pytest
@@ -177,3 +179,59 @@ def test_linear_serialization(
         assert torch.allclose(x_first.grad, x_second.grad, atol=1e-5)
     assert torch.allclose(fx_first, fx_third, atol=1e-5)
     assert torch.allclose(x_first.grad, x_third.grad, atol=1e-5)
+
+
+@pytest.fixture
+def linear8bit():
+    linear = torch.nn.Linear(32, 96)
+    linear_custom = Linear8bitLt(
+        linear.in_features,
+        linear.out_features,
+        linear.bias is not None,
+        has_fp16_weights=False,
+        threshold=6.0,
+    )
+    linear_custom.weight = bnb.nn.Int8Params(
+        linear.weight.data.clone(),
+        requires_grad=False,
+        has_fp16_weights=False,
+    )
+    linear_custom.bias = linear.bias
+    linear_custom = linear_custom.cuda()
+    return linear_custom
+
+
+def test_linear8bit_copy_param(linear8bit):
+    shallow_copy = copy.copy(linear8bit)
+    assert linear8bit.weight is shallow_copy.weight
+    assert linear8bit.bias is shallow_copy.bias
+    assert linear8bit.weight.data.data_ptr() == shallow_copy.weight.data.data_ptr()
+
+
+def test_linear8bit_deepcopy_param(linear8bit):
+    deep_copy = copy.deepcopy(linear8bit)
+    assert linear8bit.weight is not deep_copy.weight
+    assert linear8bit.bias is not deep_copy.bias
+    assert linear8bit.weight.data.data_ptr() != deep_copy.weight.data.data_ptr()
+    assert torch.allclose(linear8bit.weight.data, deep_copy.weight.data)
+    assert linear8bit.state == deep_copy.state
+
+    # check for a bug where SCB and CB were not copied
+    assert deep_copy.weight.SCB is not None
+    assert (linear8bit.weight.SCB == deep_copy.weight.SCB).all()
+    assert deep_copy.weight.CB is not None
+    assert (linear8bit.weight.CB == deep_copy.weight.CB).all()
+
+
+def test_linear8bit_serialization(linear8bit):
+    serialized = pickle.dumps(linear8bit)
+    deserialized = pickle.loads(serialized)
+    assert linear8bit.weight.data.data_ptr() != deserialized.weight.data.data_ptr()
+    assert torch.allclose(linear8bit.weight.data, deserialized.weight.data)
+    assert linear8bit.bias.data.data_ptr() != deserialized.bias.data.data_ptr()
+    assert torch.allclose(linear8bit.bias.data, deserialized.bias.data)
+    assert linear8bit.state == deserialized.state
+
+    # check for a bug where SCB and CB were not copied
+    assert (linear8bit.weight.SCB == deserialized.weight.SCB).all()
+    assert (linear8bit.weight.CB == deserialized.weight.CB).all()

From b22ae26d4c9f0bec6b85987423737d447a78c387 Mon Sep 17 00:00:00 2001
From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
Date: Wed, 5 Jun 2024 14:36:37 +0200
Subject: [PATCH 06/26] fix for faulty #1222 ("Add `"lamb"` to
 `str2optimizer32bit`") (#1240)

* Revert "Add `"lamb"` to `str2optimizer32bit`"

* Update bitsandbytes/functional.py
---
 bitsandbytes/functional.py | 124 +++++++++++--------------------------
 1 file changed, 36 insertions(+), 88 deletions(-)

diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index bbfbf0007..cea3179a1 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -27,128 +27,67 @@ def prod(iterable):
 if lib and lib.compiled_with_cuda:
     """C FUNCTIONS FOR OPTIMIZERS"""
     str2optimizer32bit = {
-        "adagrad": (
-            lib.cadagrad32bit_grad_fp32,
-            lib.cadagrad32bit_grad_fp16,
-        ),
         "adam": (
             lib.cadam32bit_grad_fp32,
             lib.cadam32bit_grad_fp16,
             lib.cadam32bit_grad_bf16,
         ),
-        "pagedadam": (
-            lib.cpagedadam32bit_grad_fp32,
-            lib.cpagedadam32bit_grad_fp16,
-            lib.cpagedadam32bit_grad_bf16,
-        ),
-        "adamw": (
-            lib.cadam32bit_grad_fp32,
-            lib.cadam32bit_grad_fp16,
-            lib.cadam32bit_grad_bf16,
-        ),
-        "pagedadamw": (
-            lib.cpagedadam32bit_grad_fp32,
-            lib.cpagedadam32bit_grad_fp16,
-            lib.cpagedadam32bit_grad_bf16,
-        ),
-        "lamb": (
-            lib.cadam32bit_grad_fp32,
-            lib.cadam32bit_grad_fp16,
+        "momentum": (
+            lib.cmomentum32bit_grad_32,
+            lib.cmomentum32bit_grad_16,
         ),
-        "lars": (
-            lib.clars32bit_grad_fp32,
-            lib.clars32bit_grad_fp16,
+        "rmsprop": (
+            lib.crmsprop32bit_grad_32,
+            lib.crmsprop32bit_grad_16,
         ),
         "lion": (
             lib.clion32bit_grad_fp32,
             lib.clion32bit_grad_fp16,
             lib.clion32bit_grad_bf16,
         ),
-        "momentum": (
-            lib.cmomentum32bit_grad_fp32,
-            lib.cmomentum32bit_grad_fp16,
+        "adagrad": (
+            lib.cadagrad32bit_grad_32,
+            lib.cadagrad32bit_grad_16,
         ),
-        "rmsprop": (
-            lib.crmsprop32bit_grad_fp32,
-            lib.crmsprop32bit_grad_fp16,
+        "lamb": (
+            lib.cadam32bit_grad_fp32,
+            lib.cadam32bit_grad_fp16,
         ),
     }
 
     str2optimizer8bit = {
-        "adagrad": (
-            lib.cadagrad8bit_grad_fp32,
-            lib.cadagrad8bit_grad_fp16,
-        ),
         "adam": (
-            lib.cadam_static_8bit_grad_fp32,
-            lib.cadam_static_8bit_grad_fp16,
+            lib.cadam_static_8bit_grad_32,
+            lib.cadam_static_8bit_grad_16,
         ),
-        "pagedadam": (
-            lib.cpagedadam8bit_grad_fp32,
-            lib.cpagedadam8bit_grad_fp16,
-            lib.cpagedadam8bit_grad_bf16,
+        "momentum": (
+            lib.cmomentum_static_8bit_grad_32,
+            lib.cmomentum_static_8bit_grad_16,
         ),
-        "adamw": (
-            lib.cadam_static_8bit_grad_fp32,
-            lib.cadam_static_8bit_grad_fp16,
+        "rmsprop": (
+            lib.crmsprop_static_8bit_grad_32,
+            lib.crmsprop_static_8bit_grad_16,
         ),
-        "pagedadamw": (
-            lib.cpagedadam8bit_grad_fp32,
-            lib.cpagedadam8bit_grad_fp16,
-            lib.cpagedadam8bit_grad_bf16,
+        "lion": (
+            lib.clion_static_8bit_grad_32,
+            lib.clion_static_8bit_grad_16,
         ),
         "lamb": (
-            lib.cadam_static_8bit_grad_fp32,
-            lib.cadam_static_8bit_grad_fp16,
+            lib.cadam_static_8bit_grad_32,
+            lib.cadam_static_8bit_grad_16,
         ),
         "lars": (
-            lib.clars8bit_grad_fp32,
-            lib.clars8bit_grad_fp16,
-        ),
-        "lion": (
-            lib.clion_static_8bit_grad_fp32,
-            lib.clion_static_8bit_grad_fp16,
-        ),
-        "momentum": (
-            lib.cmomentum_static_8bit_grad_fp32,
-            lib.cmomentum_static_8bit_grad_fp16,
-        ),
-        "rmsprop": (
-            lib.crmsprop_static_8bit_grad_fp32,
-            lib.crmsprop_static_8bit_grad_fp16,
+            lib.cmomentum_static_8bit_grad_32,
+            lib.cmomentum_static_8bit_grad_16,
         ),
     }
 
     str2optimizer8bit_blockwise = {
-        "adagrad": (
-            lib.cadagrad_8bit_blockwise_grad_fp32,
-            lib.cadagrad_8bit_blockwise_grad_fp16,
-        ),
         "adam": (
             lib.cadam_8bit_blockwise_grad_fp32,
             lib.cadam_8bit_blockwise_grad_fp16,
             lib.cadam_8bit_blockwise_grad_bf16,
         ),
-        "pagedadam": (
-            lib.cpagedadam8bit_blockwise_fp32,
-            lib.cpagedadam8bit_blockwise_fp16,
-            lib.cpagedadam8bit_blockwise_bf16,
-        ),
-        "adamw": (
-            lib.cadam_8bit_blockwise_grad_fp32,
-            lib.cadam_8bit_blockwise_grad_fp16,
-            lib.cadam_8bit_blockwise_grad_bf16,
-        ),
-        "pagedadamw": (
-            lib.cpagedadam8bit_blockwise_fp32,
-            lib.cpagedadam8bit_blockwise_fp16,
-            lib.cpagedadam8bit_blockwise_bf16,
-        ),
-        "lion": (
-            lib.clion_8bit_blockwise_grad_fp32,
-            lib.clion_8bit_blockwise_grad_fp16,
-            lib.clion_8bit_blockwise_grad_bf16,
-        ),
         "momentum": (
             lib.cmomentum_8bit_blockwise_grad_fp32,
             lib.cmomentum_8bit_blockwise_grad_fp16,
@@ -157,6 +96,15 @@ def prod(iterable):
             lib.crmsprop_8bit_blockwise_grad_fp32,
             lib.crmsprop_8bit_blockwise_grad_fp16,
         ),
+        "lion": (
+            lib.clion_8bit_blockwise_grad_fp32,
+            lib.clion_8bit_blockwise_grad_fp16,
+            lib.clion_8bit_blockwise_grad_bf16,
+        ),
+        "adagrad": (
+            lib.cadagrad_8bit_blockwise_grad_fp32,
+            lib.cadagrad_8bit_blockwise_grad_fp16,
+        ),
     }
 
 
From 195ae616d63906673c5025d67a785a2455787896 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 21 Jun 2024 18:36:12 +0200
Subject: [PATCH 07/26] Bump the minor-patch group across 1 directory with 2
 updates (#1253)

Bumps the minor-patch group with 2 updates in the / directory: [pytest](https://github.com/pytest-dev/pytest) and [lion-pytorch](https://github.com/lucidrains/lion-pytorch).


Updates `pytest` from 8.2.1 to 8.2.2
- [Release notes](https://github.com/pytest-dev/pytest/releases)
- [Changelog](https://github.com/pytest-dev/pytest/blob/main/CHANGELOG.rst)
- [Commits](https://github.com/pytest-dev/pytest/compare/8.2.1...8.2.2)

Updates `lion-pytorch` from 0.1.4 to 0.2.2
- [Release notes](https://github.com/lucidrains/lion-pytorch/releases)
- [Commits](https://github.com/lucidrains/lion-pytorch/compare/0.1.4...0.2.2)

---
updated-dependencies:
- dependency-name: pytest
  dependency-type: direct:production
  update-type: version-update:semver-patch
  dependency-group: minor-patch
- dependency-name: lion-pytorch
  dependency-type: direct:production
  update-type: version-update:semver-minor
  dependency-group: minor-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 requirements-ci.txt  | 4 ++--
 requirements-dev.txt | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/requirements-ci.txt b/requirements-ci.txt
index 0e9dd2407..285b5e7d8 100644
--- a/requirements-ci.txt
+++ b/requirements-ci.txt
@@ -1,6 +1,6 @@
 # Requirements used for GitHub actions
-pytest==8.2.1
+pytest==8.2.2
 einops==0.8.0
-lion-pytorch==0.1.4
+lion-pytorch==0.2.2
 scipy==1.10.1; python_version < "3.9"
 scipy==1.13.1; python_version >= "3.9"
diff --git a/requirements-dev.txt b/requirements-dev.txt
index de7adce94..80927a4cb 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,9 +1,9 @@
 # Requirements used for local development
 setuptools>=63
-pytest~=8.2.1
+pytest~=8.2.2
 einops~=0.8.0
 wheel~=0.43.0
-lion-pytorch~=0.1.4
+lion-pytorch~=0.2.2
 scipy~=1.13.1
 pandas~=2.2.2
 matplotlib~=3.9.0

From dada530149212d64d4b69534716202659ef37ec8 Mon Sep 17 00:00:00 2001
From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com>
Date: Sat, 22 Jun 2024 00:53:26 +0800
Subject: [PATCH 08/26] cpu install guide (#1227)

* cpu install guide

* update readme

* fix format

* fix format

* fix typo

* add windows guide

* fix readme to pip install . instead of building wheel

* Update docs/source/installation.mdx

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Update docs/source/installation.mdx

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Update docs/source/installation.mdx

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

---------

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
---
 docs/source/installation.mdx | 54 ++++++++++++++++++++++++++++++++++--
 1 file changed, 52 insertions(+), 2 deletions(-)

diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index caf22488f..c84d0c2ef 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -1,5 +1,7 @@
 # Installation
 
+## CUDA
+
 bitsandbytes is only supported on CUDA GPUs for CUDA versions **11.0 - 12.3**.
 
 The latest version of bitsandbytes (v0.43.0) builds on:
@@ -29,7 +31,7 @@ To install from PyPI.
 pip install bitsandbytes
 ```
 
-## Compile from source
+### Compile from source
 
 For Linux and Windows systems, you can compile bitsandbytes from source. Installing from source allows for more build options with different CMake configurations.
 
@@ -91,7 +93,7 @@ Big thanks to [wkpark](https://github.com/wkpark), [Jamezo97](https://github.com
 </hfoption>
 </hfoptions>
 
-## PyTorch CUDA versions
+### PyTorch CUDA versions
 
 Some bitsandbytes features may need a newer CUDA version than the one currently supported by PyTorch binaries from Conda and pip. In this case, you should follow these instructions to load a precompiled bitsandbytes binary.
 
@@ -131,3 +133,51 @@ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/tim/local/cuda-11.7
 ```
 
 3. Now when you launch bitsandbytes with these environment variables, the PyTorch CUDA version is overridden by the new CUDA version (in this example, version 11.7) and a different bitsandbytes library is loaded.
+
+
+## Intel CPU
+
+> [!TIP]
+> The Intel CPU backend currently only supports building from source; please follow the instructions below.
+
+As with CUDA, you can compile bitsandbytes from source for Linux and Windows systems. Installing from source allows for more build options with different CMake configurations.
+
+<hfoptions id="source">
+<hfoption id="Linux">
+
+To compile from source, you need CMake >= **3.22.1** and Python >= **3.8** installed. Make sure you have a compiler installed to compile C++ (gcc, make, headers, etc.). For example, to install a compiler and CMake on Ubuntu:
+
+```bash
+apt-get install -y build-essential cmake
+```
+
+We recommend installing **GCC >= 11**; at a minimum, you need **GCC >= 6**.
+
+Now to install the bitsandbytes package from source, run the following commands:
+
+```bash
+git clone --branch multi-backend-refactor https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
+pip install -r requirements-dev.txt
+pip install intel_extension_for_pytorch
+cmake -DCOMPUTE_BACKEND=cpu -S .
+make
+pip install .
+```
+
+</hfoption>
+<hfoption id="Windows">
+
+Windows systems require Visual Studio with C++ support.
+
+To compile from source, you need CMake >= **3.22.1** and Python >= **3.8** installed.
+
+```bash
+git clone --branch multi-backend-refactor https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
+pip install -r requirements-dev.txt
+cmake -DCOMPUTE_BACKEND=cpu -S .
+cmake --build . --config Release
+pip install .
+```
+
+</hfoption>
+</hfoptions>

From 1935a459f7c72d1ec8349e4ad1e84cbd6ff94e68 Mon Sep 17 00:00:00 2001
From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Fri, 12 Jul 2024 16:40:17 +0200
Subject: [PATCH 09/26] fix broken <source> links in autodoc API reference
 (#1275)

* Update build_documentation.yml

* Update build_pr_documentation.yml

* Update build_pr_documentation.yml
---
 .github/workflows/build_documentation.yml    | 2 ++
 .github/workflows/build_pr_documentation.yml | 4 +++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml
index a19e7511d..e027f7556 100644
--- a/.github/workflows/build_documentation.yml
+++ b/.github/workflows/build_documentation.yml
@@ -14,6 +14,8 @@ jobs:
       commit_sha: ${{ github.sha }}
       package: bitsandbytes
       repo_owner: TimDettmers
+      # avoid /src suffix leading to wrong links, like bitsandbytes/blob/main/src/bitsandbytes/nn/
+      version_tag_suffix: ''  # defaults to '/src'
       custom_container: huggingface/transformers-doc-builder
     secrets:
       hf_token: ${{ secrets.HUGGINGFACE_PUSH }}
diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml
index cc833df5d..b83794a5f 100644
--- a/.github/workflows/build_pr_documentation.yml
+++ b/.github/workflows/build_pr_documentation.yml
@@ -9,11 +9,13 @@ concurrency:
 
 jobs:
   build:
-    if: github.repository == 'TimDettmers/bitsandbytes'
+    if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
     uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
     with:
       commit_sha: ${{ github.event.pull_request.head.sha }}
       pr_number: ${{ github.event.number }}
       package: bitsandbytes
       repo_owner: TimDettmers
+      # avoid /src suffix leading to wrong links, like bitsandbytes/blob/main/src/bitsandbytes/nn/
+      version_tag_suffix: ''  # defaults to '/src'
       custom_container: huggingface/transformers-doc-builder

From 85e01276874b7563bd23caf56ac2c3bdbf7c90fc Mon Sep 17 00:00:00 2001
From: Markus Hennerbichler <markushennerbichler@gmail.com>
Date: Fri, 12 Jul 2024 15:40:37 +0100
Subject: [PATCH 10/26] Fix CUDA 12.5 build issue (#1273)

pythonInterface.cpp depends on ops.cuh
which in turn depends on some thrust headers.
It is defined as a C++ compilation unit
which is problematic because thrust doesn't guarantee
compatibility with a host compiler.

This is starting to cause issues with CUDA 12.5.
There is no dependency on the thrust headers,
which means they can be removed without other consequences.
---
 csrc/kernels.cu | 2 --
 csrc/ops.cuh    | 4 ----
 2 files changed, 6 deletions(-)

diff --git a/csrc/kernels.cu b/csrc/kernels.cu
index f4673359b..e4d459961 100644
--- a/csrc/kernels.cu
+++ b/csrc/kernels.cu
@@ -12,8 +12,6 @@
 #include <cub/block/block_reduce.cuh>
 #include <cub/cub.cuh>
 #include <math_constants.h>
-#include <thrust/host_vector.h>
-#include <thrust/device_vector.h>
 #include <mma.h>
 
 
diff --git a/csrc/ops.cuh b/csrc/ops.cuh
index da9df6af0..8b9a4f449 100644
--- a/csrc/ops.cuh
+++ b/csrc/ops.cuh
@@ -19,10 +19,6 @@
 #include <vector>
 #include <functional>
 
-#include <thrust/host_vector.h>
-#include <thrust/device_vector.h>
-
-
 
 #define CUDA_CHECK_RETURN(value) {                      \
   cudaError_t _m_cudaStat = value;                    \

From 6866a4ad464239a3a06c9d8911237c0da294e4d7 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 12 Jul 2024 16:42:28 +0200
Subject: [PATCH 11/26] Bump scipy from 1.13.1 to 1.14.0 in the minor-patch
 group (#1266)

Bumps the minor-patch group with 1 update: [scipy](https://github.com/scipy/scipy).


Updates `scipy` from 1.13.1 to 1.14.0
- [Release notes](https://github.com/scipy/scipy/releases)
- [Commits](https://github.com/scipy/scipy/compare/v1.13.1...v1.14.0)

---
updated-dependencies:
- dependency-name: scipy
  dependency-type: direct:production
  update-type: version-update:semver-minor
  dependency-group: minor-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 requirements-ci.txt  | 2 +-
 requirements-dev.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements-ci.txt b/requirements-ci.txt
index 285b5e7d8..b36fd6586 100644
--- a/requirements-ci.txt
+++ b/requirements-ci.txt
@@ -3,4 +3,4 @@ pytest==8.2.2
 einops==0.8.0
 lion-pytorch==0.2.2
 scipy==1.10.1; python_version < "3.9"
-scipy==1.13.1; python_version >= "3.9"
+scipy==1.14.0; python_version >= "3.9"
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 80927a4cb..dc75f9685 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -4,6 +4,6 @@ pytest~=8.2.2
 einops~=0.8.0
 wheel~=0.43.0
 lion-pytorch~=0.2.2
-scipy~=1.13.1
+scipy~=1.14.0
 pandas~=2.2.2
 matplotlib~=3.9.0

From 8c6ab698d76baad7265d1a91965ade0982596704 Mon Sep 17 00:00:00 2001
From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Fri, 12 Jul 2024 16:48:26 +0200
Subject: [PATCH 12/26] update repo owner

---
 .github/workflows/build_pr_documentation.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml
index b83794a5f..4679761c6 100644
--- a/.github/workflows/build_pr_documentation.yml
+++ b/.github/workflows/build_pr_documentation.yml
@@ -15,7 +15,7 @@ jobs:
       commit_sha: ${{ github.event.pull_request.head.sha }}
       pr_number: ${{ github.event.number }}
       package: bitsandbytes
-      repo_owner: TimDettmers
+      repo_owner: bitsandbytes-foundation
       # avoid /src suffix leading to wrong links, like bitsandbytes/blob/main/src/bitsandbytes/nn/
       version_tag_suffix: ''  # defaults to '/src'
       custom_container: huggingface/transformers-doc-builder

From 7be11439954d38a8b784ea86286ac0045769db53 Mon Sep 17 00:00:00 2001
From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Fri, 12 Jul 2024 16:48:51 +0200
Subject: [PATCH 13/26] update repo owner

---
 .github/workflows/build_documentation.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml
index e027f7556..ce4a55aaa 100644
--- a/.github/workflows/build_documentation.yml
+++ b/.github/workflows/build_documentation.yml
@@ -13,7 +13,7 @@ jobs:
     with:
       commit_sha: ${{ github.sha }}
       package: bitsandbytes
-      repo_owner: TimDettmers
+      repo_owner: bitsandbytes-foundation
       # avoid /src suffix leading to wrong links, like bitsandbytes/blob/main/src/bitsandbytes/nn/
       version_tag_suffix: ''  # defaults to '/src'
       custom_container: huggingface/transformers-doc-builder

From 6948f0b8fe3295b6c6fe1263bc0d1ce874468cd2 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Mon, 15 Jul 2024 04:58:02 -0400
Subject: [PATCH 14/26] Fix Windows CUDA build compatibility with newest MSVC
 (#1276)

* Add support for building with latest MSVC

* Update MSVC 1940+ support for CUDA builds.
---
 CMakeLists.txt | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index be0d3555f..6f3914456 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -64,6 +64,13 @@ endif()
 
 
 if(BUILD_CUDA)
+    # NVCC normally will only work with MSVC up to 1939. VS2022 17.10+ starts using versions 1940+.
+    # Workaround: use --allow-unsupported-compiler
+    # This needs to be added *before* we try to enable the CUDA language so CMake's compiler check passes.
+    if(MSVC AND MSVC_VERSION VERSION_GREATER_EQUAL 1940)
+        string(APPEND CMAKE_CUDA_FLAGS " --allow-unsupported-compiler")
+    endif()
+
     enable_language(CUDA) # This will fail if CUDA is not found
     find_package(CUDAToolkit REQUIRED)
 
@@ -188,7 +195,6 @@ if(WIN32)
     set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
 endif()
 
-# Weird MSVC hacks
 if(MSVC)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2 /fp:fast")
 endif()

From f2b2310eb4b7034c14e87dca2a61604ea2a0163f Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 15 Jul 2024 16:42:04 +0200
Subject: [PATCH 15/26] Update matplotlib requirement from ~=3.9.0 to ~=3.9.1
 in the major group (#1278)

Updates the requirements on [matplotlib](https://github.com/matplotlib/matplotlib) to permit the latest version.

Updates `matplotlib` to 3.9.1
- [Release notes](https://github.com/matplotlib/matplotlib/releases)
- [Commits](https://github.com/matplotlib/matplotlib/compare/v3.9.0...v3.9.1)

---
updated-dependencies:
- dependency-name: matplotlib
  dependency-type: direct:development
  dependency-group: major
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 requirements-dev.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements-dev.txt b/requirements-dev.txt
index dc75f9685..94098e012 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -6,4 +6,4 @@ wheel~=0.43.0
 lion-pytorch~=0.2.2
 scipy~=1.14.0
 pandas~=2.2.2
-matplotlib~=3.9.0
+matplotlib~=3.9.1

From 39b42e749f623193d69b917a01dcb4ca5b4bbdc0 Mon Sep 17 00:00:00 2001
From: Vladimir Malinovskii <galqiwi@galqiwi.ru>
Date: Mon, 15 Jul 2024 17:51:21 +0300
Subject: [PATCH 16/26] Fixed tests for cpu only platforms (#1259)

* fixed test_4bit_warnings on cpu-only platforms

* fixed linear8bit-based tests for cpu only platforms
---
 tests/test_linear8bitlt.py | 2 +-
 tests/test_modules.py      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_linear8bitlt.py b/tests/test_linear8bitlt.py
index e55abe110..9b7923312 100644
--- a/tests/test_linear8bitlt.py
+++ b/tests/test_linear8bitlt.py
@@ -182,7 +182,7 @@ def test_linear_serialization(
 
 
 @pytest.fixture
-def linear8bit():
+def linear8bit(requires_cuda):
     linear = torch.nn.Linear(32, 96)
     linear_custom = Linear8bitLt(
         linear.in_features,
diff --git a/tests/test_modules.py b/tests/test_modules.py
index db4d72410..9d507c6b4 100644
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -616,7 +616,7 @@ def test_fp8linear():
     assert bgraderr < 0.00002
 
 
-def test_4bit_warnings():
+def test_4bit_warnings(requires_cuda):
     dim1 = 64
 
     with pytest.warns(UserWarning, match=r"inference or training"):

From 9e75374104cec965d9a4a630a68ce4e2b77b066f Mon Sep 17 00:00:00 2001
From: Ther <1329438302@qq.com>
Date: Tue, 16 Jul 2024 16:56:41 +0800
Subject: [PATCH 17/26] fix QLoRA mem bug: delete useless buffered activation
 (#1270)

* chore: delete useless buffered activation

* fix: fix bugs
---
 bitsandbytes/autograd/_functions.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py
index e9821cd36..d33dd1bc5 100644
--- a/bitsandbytes/autograd/_functions.py
+++ b/bitsandbytes/autograd/_functions.py
@@ -513,7 +513,7 @@ def forward(ctx, A, B, out=None, bias=None, quant_state: Optional[F.QuantState]
         ctx.dtype_A, ctx.dtype_B, ctx.dtype_bias = A.dtype, B.dtype, None if bias is None else bias.dtype
 
         if any(ctx.needs_input_grad[:2]):
-            ctx.tensors = (A, B)
+            ctx.tensors = (None, B)
         else:
             ctx.tensors = (None, None)
 
@@ -526,7 +526,7 @@ def backward(ctx, grad_output):
             return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, bias_grad, None
 
         req_gradA, _, _, req_gradBias, _ = ctx.needs_input_grad
-        A, B = ctx.tensors
+        _, B = ctx.tensors
 
         grad_A, grad_B, grad_bias = None, None, None
 

From 0bdd57ccb20e9690b95c2fa02315d9507afa69dd Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Sun, 21 Jul 2024 08:31:06 -0400
Subject: [PATCH 18/26] Add CUDA 12.5 and update 12.4 builds (#1284)

* Add CUDA 12.5 builds and enable CUDA 12.4 on Windows

* Update install doc
---
 .github/workflows/python-package.yml |  6 ++----
 docs/source/installation.mdx         |  4 ++--
 install_cuda.py                      |  3 ++-
 install_cuda.sh                      | 10 +++++++---
 4 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 72e1b099a..698c21481 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -63,12 +63,10 @@ jobs:
         os: [ubuntu-latest, windows-latest]
         arch: [x86_64, aarch64]
         cuda_version:
-          ["11.7.1", "11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.0"]
+          ["11.7.1", "11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.0"]
         exclude:
           - os: windows-latest # This probably requires arm64 Windows agents
             arch: aarch64
-          - os: windows-latest  # The Jimver/cuda-toolkit is action used for Windows builds is not updated for 12.4 yet.
-            cuda_version: "12.4.0"
           - os: ubuntu-latest # Temporary. Takes too long, not ready yet.
             arch: aarch64
     runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents
@@ -79,7 +77,7 @@ jobs:
         if: startsWith(matrix.os, 'ubuntu')
         uses: docker/setup-qemu-action@v2
         # Windows: We install Cuda on the agent (slow)
-      - uses: Jimver/cuda-toolkit@v0.2.14
+      - uses: Jimver/cuda-toolkit@v0.2.16
         if: startsWith(matrix.os, 'windows')
         id: cuda-toolkit
         with:
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index c84d0c2ef..877c97456 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -2,7 +2,7 @@
 
 ## CUDA
 
-bitsandbytes is only supported on CUDA GPUs for CUDA versions **11.0 - 12.3**.
+bitsandbytes is only supported on CUDA GPUs for CUDA versions **11.0 - 12.5**.
 
 The latest version of bitsandbytes (v0.43.0) builds on:
 
@@ -107,7 +107,7 @@ Then locally install the CUDA version you need with this script from bitsandbyte
 ```bash
 wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh
 # Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH
-#   CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122, 123, 124}
+#   CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122, 123, 124, 125}
 #   EXPORT_TO_BASH in {0, 1} with 0=False and 1=True
 
 # For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc
diff --git a/install_cuda.py b/install_cuda.py
index cf7c8ee71..8267c5e2b 100644
--- a/install_cuda.py
+++ b/install_cuda.py
@@ -17,7 +17,8 @@
     "121": "https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda_12.1.1_530.30.02_linux.run",
     "122": "https://developer.download.nvidia.com/compute/cuda/12.2.2/local_installers/cuda_12.2.2_535.104.05_linux.run",
     "123": "https://developer.download.nvidia.com/compute/cuda/12.3.2/local_installers/cuda_12.3.2_545.23.08_linux.run",
-    "124": "https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_550.54.14_linux.run",
+    "124": "https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux.run",
+    "125": "https://developer.download.nvidia.com/compute/cuda/12.5.0/local_installers/cuda_12.5.0_555.42.02_linux.run",
 }
 
 
diff --git a/install_cuda.sh b/install_cuda.sh
index 2e7fe8ed2..0aa9531fc 100644
--- a/install_cuda.sh
+++ b/install_cuda.sh
@@ -11,7 +11,8 @@ URL120=https://developer.download.nvidia.com/compute/cuda/12.0.1/local_installer
 URL121=https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda_12.1.1_530.30.02_linux.run
 URL122=https://developer.download.nvidia.com/compute/cuda/12.2.2/local_installers/cuda_12.2.2_535.104.05_linux.run
 URL123=https://developer.download.nvidia.com/compute/cuda/12.3.2/local_installers/cuda_12.3.2_545.23.08_linux.run
-URL124=https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_550.54.14_linux.run
+URL124=https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux.run
+URL125=https://developer.download.nvidia.com/compute/cuda/12.5.0/local_installers/cuda_12.5.0_555.42.02_linux.run
 
 CUDA_VERSION=$1
 BASE_PATH=$2
@@ -60,11 +61,14 @@ if [[ -n "$CUDA_VERSION" ]]; then
   elif [[ "$CUDA_VERSION" -eq "124" ]]; then
     URL=$URL124
     FOLDER=cuda-12.4
+  elif [[ "$CUDA_VERSION" -eq "125" ]]; then
+    URL=$URL125
+    FOLDER=cuda-12.5
   else
-    echo "argument error: No cuda version passed as input. Choose among versions 110 to 124"
+    echo "argument error: No cuda version passed as input. Choose among versions 110 to 125"
   fi
 else
-    echo "argument error: No cuda version passed as input. Choose among versions 92 to 123"
+    echo "argument error: No cuda version passed as input. Choose among versions 110 to 125"
 fi
 
 FILE=$(basename $URL)

From 5212a0f2a585abba1cc2a65f82f7c4fc939c453f Mon Sep 17 00:00:00 2001
From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Mon, 22 Jul 2024 16:36:31 +0200
Subject: [PATCH 19/26] Edenzzzz's fix for min_8bit_size functionality in
 Optimizer base classes (#1286)

* fix min_8bit_size invalid bug

* Apply same fix to other optimizer base class

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>
---
 bitsandbytes/optim/optimizer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bitsandbytes/optim/optimizer.py b/bitsandbytes/optim/optimizer.py
index f1e60e5e7..39fa0e7ff 100644
--- a/bitsandbytes/optim/optimizer.py
+++ b/bitsandbytes/optim/optimizer.py
@@ -437,7 +437,7 @@ def init_state(self, group, p, gindex, pindex):
         state = self.state[p]
         state["step"] = 0
 
-        if dtype == torch.float32 or (dtype == torch.uint8 and p.numel() < 4096):
+        if dtype == torch.float32:
             state["state1"] = self.get_state_buffer(p, dtype=torch.float32)
             state["state2"] = self.get_state_buffer(p, dtype=torch.float32)
         elif dtype == torch.uint8:
@@ -656,7 +656,7 @@ def init_state(self, group, p, gindex, pindex):
         state = self.state[p]
         state["step"] = 0
 
-        if dtype == torch.float32 or (dtype == torch.uint8 and p.numel() < 4096):
+        if dtype == torch.float32:
             state["state1"] = self.get_state_buffer(p, dtype=torch.float32)
         elif dtype == torch.uint8:
             if state["step"] == 0:

From a3f55cea3ab29218067809770bc8bf2380ec46cd Mon Sep 17 00:00:00 2001
From: Edenzzzz <wenxuan.tan@wisc.edu>
Date: Mon, 22 Jul 2024 23:14:46 +0800
Subject: [PATCH 20/26] Fixed optim update error with non-contiguous
 grads/params (#1187)

* Fixed optim update error with non-contiguous grads
* fix formatting

Thanks @Edenzzzz for this contribution!

---------

Co-authored-by: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
---
 bitsandbytes/optim/optimizer.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/bitsandbytes/optim/optimizer.py b/bitsandbytes/optim/optimizer.py
index 39fa0e7ff..e9c857d49 100644
--- a/bitsandbytes/optim/optimizer.py
+++ b/bitsandbytes/optim/optimizer.py
@@ -474,6 +474,10 @@ def init_state(self, group, p, gindex, pindex):
 
     @torch.no_grad()
     def update_step(self, group, p, gindex, pindex):
+        # avoid update error from non-contiguous memory layout
+        p.data = p.data.contiguous()
+        p.grad = p.grad.contiguous()
+
         state = self.state[p]
         grad = p.grad
 
@@ -685,6 +689,10 @@ def init_state(self, group, p, gindex, pindex):
 
     @torch.no_grad()
     def update_step(self, group, p, gindex, pindex):
+        # avoid update error from non-contiguous memory layout
+        p.data = p.data.contiguous()
+        p.grad = p.grad.contiguous()
+
         state = self.state[p]
         grad = p.grad
 

From e3ae243be2f8bfb36715610e837363a515840b39 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 22 Jul 2024 17:15:51 +0200
Subject: [PATCH 21/26] Bump pytest from 8.2.2 to 8.3.1 in the minor-patch
 group (#1287)

Bumps the minor-patch group with 1 update: [pytest](https://github.com/pytest-dev/pytest).


Updates `pytest` from 8.2.2 to 8.3.1
- [Release notes](https://github.com/pytest-dev/pytest/releases)
- [Changelog](https://github.com/pytest-dev/pytest/blob/main/CHANGELOG.rst)
- [Commits](https://github.com/pytest-dev/pytest/compare/8.2.2...8.3.1)

---
updated-dependencies:
- dependency-name: pytest
  dependency-type: direct:production
  update-type: version-update:semver-minor
  dependency-group: minor-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 requirements-ci.txt  | 2 +-
 requirements-dev.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements-ci.txt b/requirements-ci.txt
index b36fd6586..182e1023e 100644
--- a/requirements-ci.txt
+++ b/requirements-ci.txt
@@ -1,5 +1,5 @@
 # Requirements used for GitHub actions
-pytest==8.2.2
+pytest==8.3.1
 einops==0.8.0
 lion-pytorch==0.2.2
 scipy==1.10.1; python_version < "3.9"
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 94098e012..41211880c 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,6 +1,6 @@
 # Requirements used for local development
 setuptools>=63
-pytest~=8.2.2
+pytest~=8.3.1
 einops~=0.8.0
 wheel~=0.43.0
 lion-pytorch~=0.2.2

From 7fed393aa8380f2d7f7c760bbd6a2f68b5caa9ea Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Tue, 23 Jul 2024 11:32:50 -0400
Subject: [PATCH 22/26] Fix restoration of quant_storage for CPU offloading
 (#1279)

* Fix restoration of quant_storage for CPU offloading

* Clarify comment on default quant_storage in Params4bit.from_prequantized()

* fix to make quant_storage dynamic based on serialized dtype

* delete obsolete comment

---------

Co-authored-by: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
---
 bitsandbytes/nn/modules.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 05f7c04db..40766ad41 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -282,10 +282,13 @@ def from_prequantized(
         self.compress_statistics = self.quant_state.nested
         self.quant_type = self.quant_state.quant_type
         self.bnb_quantized = True
+
+        self.quant_storage = data.dtype
+
         return self
 
     def _quantize(self, device):
-        w = self.data.contiguous().cuda(device)
+        w = self.data.contiguous().to(device)
         w_4bit, quant_state = bnb.functional.quantize_4bit(
             w,
             blocksize=self.blocksize,
@@ -333,6 +336,7 @@ def to(self, *args, **kwargs):
                 blocksize=self.blocksize,
                 compress_statistics=self.compress_statistics,
                 quant_type=self.quant_type,
+                quant_storage=self.quant_storage,
             )
 
             return new_param
@@ -450,7 +454,7 @@ def forward(self, x: torch.Tensor):
                 # since we registered the module, we can recover the state here
                 assert self.weight.shape[1] == 1
                 if not isinstance(self.weight, Params4bit):
-                    self.weight = Params4bit(self.weight, quant_storage=self.quant_storage)
+                    self.weight = Params4bit(self.weight, quant_storage=self.quant_storage, bnb_quantized=True)
                 self.weight.quant_state = self.quant_state
             else:
                 print(

From 1571110648dc5b0e603316c9ce2b0f16ac85cdbb Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Tue, 23 Jul 2024 16:40:31 +0000
Subject: [PATCH 23/26] remove unnecessary version mention

---
 docs/source/installation.mdx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index 877c97456..8187fbf81 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -4,7 +4,7 @@
 
 bitsandbytes is only supported on CUDA GPUs for CUDA versions **11.0 - 12.5**.
 
-The latest version of bitsandbytes (v0.43.0) builds on:
+The latest version of bitsandbytes builds on:
 
 | OS | CUDA | Compiler |
 |---|---|---|

From ce53caf3c358ec3f81db6a9edc0b6fc2f17d9503 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Tue, 23 Jul 2024 17:45:26 +0000
Subject: [PATCH 24/26] release 0.43.2

---
 CHANGELOG.md             | 18 ++++++++++++++++++
 _typos.toml              |  5 +++++
 bitsandbytes/__init__.py |  2 +-
 setup.py                 |  2 +-
 4 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c456fa9e5..8ad648df1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,21 @@
+### 0.43.2
+
+#### Improvements:
+
+- docs: FSDP+QLoRA and CPU install guide (#1211 #1227, thanks @stevhliu)
+- Add CUDA 12.5 and update 12.4 builds (#1284)
+
+#### Bug Fixes
+
+- 4bit getstate and 8bit deepcopy (#1230 #1231, thanks @BenjaminBossan)
+- missing optimizers in `str2optimizer32bit` (#1222, thanks @EtienneDosSantos)
+- CUDA 12.5 build issue (#1273, thanks @HennerM)
+- fix for min_8bit_size functionality in Optimizer base classes (#1286, thanks @Edenzzzz)
+- QLoRA mem bug (#1270, thanks @Ther-nullptr)
+- tests for cpu only platforms (#1259, thanks @galqiwi)
+- restoration of quant_storage for CPU offloading (#1279)
+- optim update error with non-contiguous grads/params (deepspeed) (#1187)
+
 ### 0.43.1
 
 #### Improvements:
diff --git a/_typos.toml b/_typos.toml
index a04206b8d..e4e7287fb 100644
--- a/_typos.toml
+++ b/_typos.toml
@@ -1,5 +1,10 @@
 [files]
 
+[default]
+extend-ignore-re = [
+    "@Ther-nul",  # valid Github user
+]
+
 [default.extend-identifiers]
 
 [type.py.extend-words]
diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index 51cbde208..ad5f01539 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -21,4 +21,4 @@
     "optim.optimizer.MockArgs": False,
 }
 
-__version__ = "0.43.2.dev"
+__version__ = "0.43.2"
diff --git a/setup.py b/setup.py
index f8d6a92a1..d2b78f6b5 100644
--- a/setup.py
+++ b/setup.py
@@ -25,7 +25,7 @@ def has_ext_modules(self):
 
 setup(
     name="bitsandbytes",
-    version="0.43.2.dev",
+    version="0.43.2",
     author="Tim Dettmers",
     author_email="dettmers@cs.washington.edu",
     description="k-bit optimizers and matrix multiplication routines.",

From a7c08afd70af46646ccdebcd6bb459b66f0e9e54 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Tue, 23 Jul 2024 18:26:24 +0000
Subject: [PATCH 25/26] bump version tag to next dev

---
 bitsandbytes/__init__.py | 2 +-
 setup.py                 | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index ad5f01539..a8acfbfc5 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -21,4 +21,4 @@
     "optim.optimizer.MockArgs": False,
 }
 
-__version__ = "0.43.2"
+__version__ = "0.43.3.dev"
diff --git a/setup.py b/setup.py
index d2b78f6b5..18de0fe5b 100644
--- a/setup.py
+++ b/setup.py
@@ -25,7 +25,7 @@ def has_ext_modules(self):
 
 setup(
     name="bitsandbytes",
-    version="0.43.2",
+    version="0.43.3.dev",
     author="Tim Dettmers",
     author_email="dettmers@cs.washington.edu",
     description="k-bit optimizers and matrix multiplication routines.",

From 9b726798542e01c45a7a4a841e144311980b90d6 Mon Sep 17 00:00:00 2001
From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Tue, 23 Jul 2024 19:13:24 +0000
Subject: [PATCH 26/26] Changelog: add explanation r. QLoRA mem savings

---
 CHANGELOG.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8ad648df1..e446155b0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,13 @@
 ### 0.43.2
 
+This release is quite significant as the QLoRA bug fix has big implications for higher `seqlen` and batch sizes.
+
+For each sequence (i.e. batch size increase of one) we expect memory savings of:
+- 405B: 39GB for seqlen 1024, and 4888GB for 128k
+- 70B: 20.1GB for 1024 and 2516GB for 128k
+
+This was due to activations being unnecessary for frozen parameters, yet the memory for them was still erroneously allocated due to the now fixed bug.
+
 #### Improvements:
 
 - docs: FSDP+QLoRA and CPU install guide (#1211 #1227, thanks @stevhliu)