From 94c7f2c5056fa9fc482759ec9aec6f8611b2c5b3 Mon Sep 17 00:00:00 2001 From: Miles Cranmer Date: Fri, 26 Jan 2024 00:33:38 +0900 Subject: [PATCH 001/112] Fix `max_memory` example on README (#944) * Fix `max_memory` example on README - The new `max_memory` syntax expects a dictionary - This change also accounts for multiple devices * Fix model name in `from_pretrained` on README --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 5cf92dcc5..a4586d6ca 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,11 @@ model = AutoModelForCausalLM.from_pretrained( 'decapoda-research/llama-7b-hf', device_map='auto', load_in_8bit=True, - max_memory=f'{int(torch.cuda.mem_get_info()[0]/1024**3)-2}GB') + max_memory={ + i: f'{int(torch.cuda.mem_get_info(i)[0]/1024**3)-2}GB' + for i in range(torch.cuda.device_count()) + } +) ``` A more detailed example, can be found in [examples/int8_inference_huggingface.py](examples/int8_inference_huggingface.py). From e651e8ed61b8fe8dcc3322c34ea3d53c6b323021 Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Sat, 27 Jan 2024 07:50:52 +0900 Subject: [PATCH 002/112] fix library detection, enable Windows (#873) * fix library loading Signed-off-by: Won-Kyu Park * fixed library loading * use os.pathsep * use glob(), search CUDA_PATH * call find_file_recursive() without ext --------- Signed-off-by: Won-Kyu Park Co-authored-by: James Wyatt --- bitsandbytes/__main__.py | 49 +++++++++++------------------ bitsandbytes/cuda_setup/env_vars.py | 2 +- bitsandbytes/cuda_setup/main.py | 33 ++++++++++++------- setup.py | 1 + 4 files changed, 43 insertions(+), 42 deletions(-) diff --git a/bitsandbytes/__main__.py b/bitsandbytes/__main__.py index ebbf2653e..8f58e1665 100644 --- a/bitsandbytes/__main__.py +++ b/bitsandbytes/__main__.py @@ -11,34 +11,18 @@ HEADER_WIDTH = 60 -def execute_and_return(command_string: str) -> Tuple[str, str]: - def _decode(subprocess_err_out_tuple): - return tuple( - to_decode.decode("UTF-8").strip() - for to_decode in subprocess_err_out_tuple - ) - - def execute_and_return_decoded_std_streams(command_string): - return _decode( - subprocess.Popen( - shlex.split(command_string), - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ).communicate() - ) - - std_out, std_err = execute_and_return_decoded_std_streams(command_string) - return std_out, std_err def find_file_recursive(folder, filename): - folder = shlex.quote(folder) - filename = shlex.quote(filename) - cmd = f'find {folder} -name {filename}' - out, err = execute_and_return(cmd) - if len(err) > 0: - raise RuntimeError('Something when wrong when trying to find file. Maybe you do not have a linux system?') + import glob + outs = [] + try: + for ext in ["so", "dll", "dylib"]: + out = glob.glob(os.path.join(folder, "**", filename + ext)) + outs.extend(out) + except Exception as e: + raise RuntimeError('Error: Something when wrong when trying to find file. 
{e}') - return out + return outs def generate_bug_report_information(): @@ -48,18 +32,23 @@ def generate_bug_report_information(): print('') if 'CONDA_PREFIX' in os.environ: - paths = find_file_recursive(os.environ['CONDA_PREFIX'], '*cuda*so') + paths = find_file_recursive(os.environ['CONDA_PREFIX'], '*cuda*') print_header("ANACONDA CUDA PATHS") print(paths) print('') if isdir('/usr/local/'): - paths = find_file_recursive('/usr/local', '*cuda*so') + paths = find_file_recursive('/usr/local', '*cuda*') print_header("/usr/local CUDA PATHS") print(paths) print('') + if 'CUDA_PATH' in os.environ and isdir(os.environ['CUDA_PATH']): + paths = find_file_recursive(os.environ['CUDA_PATH'], '*cuda*') + print_header("CUDA PATHS") + print(paths) + print('') if isdir(os.getcwd()): - paths = find_file_recursive(os.getcwd(), '*cuda*so') + paths = find_file_recursive(os.getcwd(), '*cuda*') print_header("WORKING DIRECTORY CUDA PATHS") print(paths) print('') @@ -67,11 +56,11 @@ def generate_bug_report_information(): print_header("LD_LIBRARY CUDA PATHS") if 'LD_LIBRARY_PATH' in os.environ: lib_path = os.environ['LD_LIBRARY_PATH'].strip() - for path in set(lib_path.split(':')): + for path in set(lib_path.split(os.pathsep)): try: if isdir(path): print_header(f"{path} CUDA PATHS") - paths = find_file_recursive(path, '*cuda*so') + paths = find_file_recursive(path, '*cuda*') print(paths) except: print(f'Could not read LD_LIBRARY_PATH: {path}') diff --git a/bitsandbytes/cuda_setup/env_vars.py b/bitsandbytes/cuda_setup/env_vars.py index e8268fcaa..4b2549653 100644 --- a/bitsandbytes/cuda_setup/env_vars.py +++ b/bitsandbytes/cuda_setup/env_vars.py @@ -26,7 +26,7 @@ def to_be_ignored(env_var: str, value: str) -> bool: def might_contain_a_path(candidate: str) -> bool: - return "/" in candidate + return os.sep in candidate def is_active_conda_env(env_var: str) -> bool: diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py index 6fa671e63..af32819df 100644 --- a/bitsandbytes/cuda_setup/main.py +++ b/bitsandbytes/cuda_setup/main.py @@ -19,6 +19,7 @@ import ctypes as ct import os import errno +import platform import torch from warnings import warn from itertools import product @@ -31,7 +32,11 @@ # libcudart.so is missing by default for a conda install with PyTorch 2.0 and instead # we have libcudart.so.11.0 which causes a lot of errors before # not sure if libcudart.so.12.0 exists in pytorch installs, but it does not hurt -CUDA_RUNTIME_LIBS: list = ["libcudart.so", 'libcudart.so.11.0', 'libcudart.so.12.0', 'libcudart.so.12.1', 'libcudart.so.12.2'] +system = platform.system() +if system == 'Windows': + CUDA_RUNTIME_LIBS: list = ["nvcuda.dll"] +else: # Linux or other + CUDA_RUNTIME_LIBS: list = ["libcudart.so", 'libcudart.so.11.0', 'libcudart.so.12.0', 'libcudart.so.12.1', 'libcudart.so.12.2'] # this is a order list of backup paths to search CUDA in, if it cannot be found in the main environmental paths backup_paths = [] @@ -114,7 +119,9 @@ def manual_override(self): 'For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH: Set[Path]: - return {Path(ld_path) for ld_path in paths_list_candidate.split(":") if ld_path} + return {Path(ld_path) for ld_path in paths_list_candidate.split(os.pathsep) if ld_path} def remove_non_existent_dirs(candidate_paths: Set[Path]) -> Set[Path]: @@ -336,13 +344,14 @@ def get_compute_capabilities(): def evaluate_cuda_setup(): cuda_setup = CUDASetup.get_instance() + suffix = ".so" if os.name != "nt" else ".dll" if 
'BITSANDBYTES_NOWELCOME' not in os.environ or str(os.environ['BITSANDBYTES_NOWELCOME']) == '0': cuda_setup.add_log_entry('') cuda_setup.add_log_entry('='*35 + 'BUG REPORT' + '='*35) cuda_setup.add_log_entry(('Welcome to bitsandbytes. For bug reports, please run\n\npython -m bitsandbytes\n\n'), ('and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues')) cuda_setup.add_log_entry('='*80) - if not torch.cuda.is_available(): return 'libbitsandbytes_cpu.so', None, None, None + if not torch.cuda.is_available(): return f'libbitsandbytes_cpu{suffix}', None, None, None cudart_path = determine_cuda_runtime_lib_path() ccs = get_compute_capabilities() @@ -366,9 +375,11 @@ def evaluate_cuda_setup(): # since most installations will have the libcudart.so installed, but not the compiler if has_cublaslt: - binary_name = f"libbitsandbytes_cuda{cuda_version_string}.so" + binary_name = f"libbitsandbytes_cuda{cuda_version_string}" else: - "if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt.so" - binary_name = f"libbitsandbytes_cuda{cuda_version_string}_nocublaslt.so" + "if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt" + binary_name = f"libbitsandbytes_cuda{cuda_version_string}_nocublaslt" + + binary_name = f"{binary_name}{suffix}" return binary_name, cudart_path, cc, cuda_version_string diff --git a/setup.py b/setup.py index c07451d20..b28f6fa4f 100644 --- a/setup.py +++ b/setup.py @@ -9,6 +9,7 @@ libs = list(glob.glob("./bitsandbytes/libbitsandbytes*.so")) +libs += list(glob.glob("./bitsandbytes/libbitsandbytes*.dll")) libs = [os.path.basename(p) for p in libs] print("libs:", libs) From 8ddfda13bcb0658d333eb8a613488145fe7f2f8e Mon Sep 17 00:00:00 2001 From: SUN Haibo Date: Sun, 28 Jan 2024 12:19:39 -0500 Subject: [PATCH 003/112] Correct type hint in functional.py (#992) --- bitsandbytes/functional.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index 0b18d9b06..a461d1749 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -890,7 +890,7 @@ def quantize_fp4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize def quantize_nf4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False, quant_storage=torch.uint8): return quantize_4bit(A, absmax, out, blocksize, compress_statistics, 'nf4', quant_storage) -def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False, quant_type='fp4', quant_storage=torch.uint8) -> Tensor: +def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False, quant_type='fp4', quant_storage=torch.uint8) -> (Tensor, QuantState): """ Quantize tensor A in blocks of 4-bit values. 
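Aside on the annotation corrected above (not part of any patch): quantize_4bit returns the packed 4-bit data together with the QuantState needed to undo the quantization, which is why the return type is a pair rather than a bare Tensor. A minimal usage sketch, assuming bitsandbytes was built with CUDA and a CUDA device is available; the weight shape, blocksize and quant_type below are illustrative choices, not values taken from the patches:

    import torch
    import bitsandbytes.functional as F

    # any 2D fp16 tensor on a CUDA device; the shape here is arbitrary
    weight = torch.randn(1024, 1024, dtype=torch.float16, device="cuda")

    # quantize_4bit returns a pair: the packed 4-bit data and the QuantState
    # (absmax, code, blocksize, quant_type, ...) required to dequantize it later
    packed, quant_state = F.quantize_4bit(weight, blocksize=64, quant_type="nf4")

    # round-trip back to fp16 using the stored state
    restored = F.dequantize_4bit(packed, quant_state)
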
From 277ac27bcbf73b0ee45464fe3095e2a5802e3228 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Mon, 29 Jan 2024 14:46:46 +0200 Subject: [PATCH 004/112] Correct type annotation for quantize_4bit (#994) --- bitsandbytes/functional.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index a461d1749..3e90364bb 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -890,7 +890,16 @@ def quantize_fp4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize def quantize_nf4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False, quant_storage=torch.uint8): return quantize_4bit(A, absmax, out, blocksize, compress_statistics, 'nf4', quant_storage) -def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False, quant_type='fp4', quant_storage=torch.uint8) -> (Tensor, QuantState): + +def quantize_4bit( + A: Tensor, + absmax: Tensor = None, + out: Tensor = None, + blocksize=64, + compress_statistics=False, + quant_type='fp4', + quant_storage=torch.uint8, +) -> Tuple[Tensor, QuantState]: """ Quantize tensor A in blocks of 4-bit values. From 619e9b3b4e68e041f14767112ac661c7caaadc9a Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Mon, 29 Jan 2024 14:48:28 +0200 Subject: [PATCH 005/112] Add .git-blame-ignore-revs file (#987) --- .git-blame-ignore-revs | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 .git-blame-ignore-revs diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 000000000..f7dd01bdf --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,8 @@ +# ran black and isort for coherent code formatting +bfa0e33294f2b1dc25e65a33be2397f989824298 + +# reran black with linelength 80 for greater readability +ea7c14f8ef64924f2d0ff80df3cdabf2c7299848 + +# Remove f-prefix from strings that don't use formatting +7727fa4c8c6c1ef2b109120aff4196a0a6bf3ed6 From 32be289735cdf059f7f89997d70be5ea6bd4ba30 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Mon, 29 Jan 2024 14:51:05 +0200 Subject: [PATCH 006/112] Don't require scipy for regular use (#948) --- bitsandbytes/functional.py | 9 ++++++++- setup.py | 7 +++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index 3e90364bb..739d922a4 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -233,8 +233,15 @@ def create_linear_map(signed=True, total_bits=8, add_zero=True): l = values.numel()//2 return torch.Tensor(values[:l].tolist() + [0]*gap + values[l:].tolist()) + def create_normal_map(offset=0.9677083, use_extra_value=True): - from scipy.stats import norm + try: + from scipy.stats import norm + except ImportError as ie: + raise ImportError( + "Scipy is required for `create_normal_map`. " + "Install `bitsandbytes` with the `[test]` extra." 
+ ) from ie if use_extra_value: # one more positive value, this is an asymmetric type diff --git a/setup.py b/setup.py index b28f6fa4f..7a82b7717 100644 --- a/setup.py +++ b/setup.py @@ -29,8 +29,11 @@ def read(fname): url="https://github.com/TimDettmers/bitsandbytes", packages=find_packages(), package_data={"": libs}, - install_requires=['torch', 'numpy', 'scipy'], - extras_require={'benchmark': ['pandas', 'matplotlib']}, + install_requires=['torch', 'numpy'], + extras_require={ + 'benchmark': ['pandas', 'matplotlib'], + 'test': ['scipy'], + }, long_description=read("README.md"), long_description_content_type="text/markdown", classifiers=[ From a8c9dfa6aefd63c1ccdb62b5f37a99dd90906e8c Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Mon, 29 Jan 2024 17:01:18 +0200 Subject: [PATCH 007/112] Fix some issues found by Mypy (#995) * Fix erroneous type aliasing * Fix `Optional` typings (see PEP 484) * Add Mypy ignores * Fix Mypy complaints for method tables * Fix type for get_ptr * Fix various Mypy errors * Fix missed call to is_triton_available --- bitsandbytes/autograd/_functions.py | 22 +- bitsandbytes/cuda_setup/main.py | 4 +- bitsandbytes/functional.py | 212 +++++++++++-------- bitsandbytes/nn/modules.py | 2 +- bitsandbytes/nn/triton_based_modules.py | 2 +- bitsandbytes/research/autograd/_functions.py | 33 ++- pyproject.toml | 10 +- 7 files changed, 168 insertions(+), 117 deletions(-) diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py index 19f224391..9917e326e 100644 --- a/bitsandbytes/autograd/_functions.py +++ b/bitsandbytes/autograd/_functions.py @@ -2,7 +2,7 @@ import warnings from dataclasses import dataclass from functools import reduce # Required in Python 3 -from typing import Tuple, Optional, List +from typing import Tuple, Optional, Callable from warnings import warn import torch @@ -14,9 +14,6 @@ def prod(iterable): return reduce(operator.mul, iterable, 1) -tensor = torch.Tensor - - # The inverse transformation for the colTuring and colAmpere format were contributed by Alex Borzunov: # https://github.com/bigscience-workshop/petals/blob/main/src/petals/utils/linear8bitlt_patch.py @@ -56,7 +53,10 @@ def get_current_outlier_idx(self): return torch.Tensor(list(self.outliers)).to(torch.int64) -def get_inverse_transform_indices(transform_tile: callable, tile_size: Tuple[int, int]): +def get_inverse_transform_indices( + transform_tile: Callable[[torch.Tensor], torch.Tensor], + tile_size: Tuple[int, int], +): """ Compute a permutation of indices that invert the specified (tiled) matrix transformation @@ -496,7 +496,7 @@ class MatMul4Bit(torch.autograd.Function): # backward is mostly the same, but adds one extra clause (see "elif state.CxB is not None") @staticmethod - def forward(ctx, A, B, out=None, bias=None, quant_state: F.QuantState = None): + def forward(ctx, A, B, out=None, bias=None, quant_state: Optional[F.QuantState] = None): # default of pytorch behavior if inputs are empty ctx.is_empty = False if prod(A.shape) == 0: @@ -549,10 +549,10 @@ def backward(ctx, grad_output): def matmul( - A: tensor, - B: tensor, - out: tensor = None, - state: MatmulLtState = None, + A: torch.Tensor, + B: torch.Tensor, + out: Optional[torch.Tensor] = None, + state: Optional[MatmulLtState] = None, threshold=0.0, bias=None ): @@ -562,7 +562,7 @@ def matmul( return MatMul8bitLt.apply(A, B, out, bias, state) -def matmul_4bit(A: tensor, B: tensor, quant_state: F.QuantState, out: tensor = None, bias=None): +def matmul_4bit(A: torch.Tensor, B: torch.Tensor, 
quant_state: F.QuantState, out: Optional[torch.Tensor] = None, bias=None): assert quant_state is not None if A.numel() == A.shape[-1] and A.requires_grad == False: if A.shape[-1] % quant_state.blocksize != 0: diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py index af32819df..a5931ef5e 100644 --- a/bitsandbytes/cuda_setup/main.py +++ b/bitsandbytes/cuda_setup/main.py @@ -34,9 +34,9 @@ # not sure if libcudart.so.12.0 exists in pytorch installs, but it does not hurt system = platform.system() if system == 'Windows': - CUDA_RUNTIME_LIBS: list = ["nvcuda.dll"] + CUDA_RUNTIME_LIBS = ["nvcuda.dll"] else: # Linux or other - CUDA_RUNTIME_LIBS: list = ["libcudart.so", 'libcudart.so.11.0', 'libcudart.so.12.0', 'libcudart.so.12.1', 'libcudart.so.12.2'] + CUDA_RUNTIME_LIBS = ["libcudart.so", 'libcudart.so.11.0', 'libcudart.so.12.0', 'libcudart.so.12.1', 'libcudart.so.12.2'] # this is a order list of backup paths to search CUDA in, if it cannot be found in the main environmental paths backup_paths = [] diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index 739d922a4..25aa4e531 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -12,7 +12,7 @@ import numpy as np from functools import reduce # Required in Python 3 -from typing import Tuple, Any, Dict +from typing import Tuple, Any, Dict, Optional from torch import Tensor from bitsandbytes.utils import pack_dict_to_tensor, unpack_tensor_to_dict @@ -27,71 +27,83 @@ def prod(iterable): if COMPILED_WITH_CUDA: """C FUNCTIONS FOR OPTIMIZERS""" - str2optimizer32bit = {} - str2optimizer32bit["adam"] = (lib.cadam32bit_grad_fp32, lib.cadam32bit_grad_fp16, lib.cadam32bit_grad_bf16) - str2optimizer32bit["momentum"] = ( - lib.cmomentum32bit_grad_32, - lib.cmomentum32bit_grad_16, - ) - str2optimizer32bit["rmsprop"] = ( - lib.crmsprop32bit_grad_32, - lib.crmsprop32bit_grad_16, - ) - str2optimizer32bit["lion"] = (lib.clion32bit_grad_fp32, lib.clion32bit_grad_fp16, lib.clion32bit_grad_bf16) - str2optimizer32bit["adagrad"] = ( - lib.cadagrad32bit_grad_32, - lib.cadagrad32bit_grad_16, - ) + str2optimizer32bit = { + "adam": ( + lib.cadam32bit_grad_fp32, + lib.cadam32bit_grad_fp16, + lib.cadam32bit_grad_bf16, + ), + "momentum": ( + lib.cmomentum32bit_grad_32, + lib.cmomentum32bit_grad_16, + ), + "rmsprop": ( + lib.crmsprop32bit_grad_32, + lib.crmsprop32bit_grad_16, + ), + "lion": ( + lib.clion32bit_grad_fp32, + lib.clion32bit_grad_fp16, + lib.clion32bit_grad_bf16, + ), + "adagrad": ( + lib.cadagrad32bit_grad_32, + lib.cadagrad32bit_grad_16, + ), + } + + str2optimizer8bit = { + "adam": ( + lib.cadam_static_8bit_grad_32, + lib.cadam_static_8bit_grad_16, + ), + "momentum": ( + lib.cmomentum_static_8bit_grad_32, + lib.cmomentum_static_8bit_grad_16, + ), + "rmsprop": ( + lib.crmsprop_static_8bit_grad_32, + lib.crmsprop_static_8bit_grad_16, + ), + "lion": ( + lib.clion_static_8bit_grad_32, + lib.clion_static_8bit_grad_16, + ), + "lamb": ( + lib.cadam_static_8bit_grad_32, + lib.cadam_static_8bit_grad_16, + ), + "lars": ( + lib.cmomentum_static_8bit_grad_32, + lib.cmomentum_static_8bit_grad_16, + ), + } + + str2optimizer8bit_blockwise = { + "adam": ( + lib.cadam_8bit_blockwise_grad_fp32, + lib.cadam_8bit_blockwise_grad_fp16, + lib.cadam_8bit_blockwise_grad_bf16, + ), + "momentum": ( + lib.cmomentum_8bit_blockwise_grad_fp32, + lib.cmomentum_8bit_blockwise_grad_fp16, + ), + "rmsprop": ( + lib.crmsprop_8bit_blockwise_grad_fp32, + lib.crmsprop_8bit_blockwise_grad_fp16, + ), + "lion": ( + 
lib.clion_8bit_blockwise_grad_fp32, + lib.clion_8bit_blockwise_grad_fp16, + lib.clion_8bit_blockwise_grad_bf16, + ), + "adagrad": ( + lib.cadagrad_8bit_blockwise_grad_fp32, + lib.cadagrad_8bit_blockwise_grad_fp16, + ), + } - str2optimizer8bit = {} - str2optimizer8bit["adam"] = ( - lib.cadam_static_8bit_grad_32, - lib.cadam_static_8bit_grad_16, - ) - str2optimizer8bit["momentum"] = ( - lib.cmomentum_static_8bit_grad_32, - lib.cmomentum_static_8bit_grad_16, - ) - str2optimizer8bit["rmsprop"] = ( - lib.crmsprop_static_8bit_grad_32, - lib.crmsprop_static_8bit_grad_16, - ) - str2optimizer8bit["lion"] = ( - lib.clion_static_8bit_grad_32, - lib.clion_static_8bit_grad_16, - ) - str2optimizer8bit["lamb"] = ( - lib.cadam_static_8bit_grad_32, - lib.cadam_static_8bit_grad_16, - ) - str2optimizer8bit["lars"] = ( - lib.cmomentum_static_8bit_grad_32, - lib.cmomentum_static_8bit_grad_16, - ) - - str2optimizer8bit_blockwise = {} - str2optimizer8bit_blockwise["adam"] = ( - lib.cadam_8bit_blockwise_grad_fp32, - lib.cadam_8bit_blockwise_grad_fp16, - lib.cadam_8bit_blockwise_grad_bf16, - ) - str2optimizer8bit_blockwise["momentum"] = ( - lib.cmomentum_8bit_blockwise_grad_fp32, - lib.cmomentum_8bit_blockwise_grad_fp16, - ) - str2optimizer8bit_blockwise["rmsprop"] = ( - lib.crmsprop_8bit_blockwise_grad_fp32, - lib.crmsprop_8bit_blockwise_grad_fp16, - ) - str2optimizer8bit_blockwise["lion"] = ( - lib.clion_8bit_blockwise_grad_fp32, - lib.clion_8bit_blockwise_grad_fp16, - lib.clion_8bit_blockwise_grad_bf16, - ) - str2optimizer8bit_blockwise["adagrad"] = ( - lib.cadagrad_8bit_blockwise_grad_fp32, - lib.cadagrad_8bit_blockwise_grad_fp16, - ) class GlobalPageManager: _instance = None @@ -400,7 +412,8 @@ def is_on_gpu(tensors): raise TypeError(f'Input tensors need to be on the same GPU, but found the following tensor and device combinations:\n {[(t.shape, t.device) for t in tensors]}') return on_gpu -def get_ptr(A: Tensor) -> ct.c_void_p: + +def get_ptr(A: Optional[Tensor]) -> Optional[ct.c_void_p]: """ Get the ctypes pointer from a PyTorch Tensor. @@ -521,7 +534,7 @@ def nvidia_transform( return out, new_state -def estimate_quantiles(A: Tensor, out: Tensor = None, offset: float = 1 / 512, num_quantiles=256) -> Tensor: +def estimate_quantiles(A: Tensor, out: Optional[torch.Tensor] = None, offset: float = 1 / 512, num_quantiles=256) -> Tensor: ''' Estimates 256 equidistant quantiles on the input tensor eCDF. @@ -626,8 +639,8 @@ def from_dict(cls, qs_dict: Dict[str, Any], device: torch.device) -> 'QuantState # unpacking minor and non-tensor quant state items if necessary if len(qs_key) == 1: - qs_key = qs_key[0] - qs_dict.update(unpack_tensor_to_dict(qs_dict.pop(qs_key))) + first_qs_key = qs_key[0] + qs_dict.update(unpack_tensor_to_dict(qs_dict.pop(first_qs_key))) qs_dict = {k.split('.')[-1]: v for k, v in qs_dict.items()} # strip prefixes assert set(qs_dict.keys()).issubset(cls.valid_qs_keys) @@ -694,7 +707,14 @@ def to(self, device): self.state2.code = self.state2.code.to(device) -def quantize_blockwise(A: Tensor, code: Tensor = None, absmax: Tensor = None, out: Tensor = None, blocksize=4096, nested=False) -> Tensor: +def quantize_blockwise( + A: Tensor, + code: Optional[torch.Tensor] = None, + absmax: Optional[torch.Tensor] = None, + out: Optional[torch.Tensor] = None, + blocksize=4096, + nested=False, +) -> Tuple[Tensor, QuantState]: """ Quantize tensor A in blocks of size 4096 values. 
@@ -769,10 +789,10 @@ def quantize_blockwise(A: Tensor, code: Tensor = None, absmax: Tensor = None, ou def dequantize_blockwise( A: Tensor, - quant_state: QuantState = None, - absmax: Tensor = None, - code: Tensor = None, - out: Tensor = None, + quant_state: Optional[QuantState] = None, + absmax: Optional[torch.Tensor] = None, + code: Optional[torch.Tensor] = None, + out: Optional[torch.Tensor] = None, blocksize: int = 4096, nested=False ) -> Tensor: @@ -891,17 +911,17 @@ def get_4bit_type(typename, device=None, blocksize=64): return data.to(device) -def quantize_fp4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False, quant_storage=torch.uint8): +def quantize_fp4(A: Tensor, absmax: Optional[torch.Tensor] = None, out: Optional[torch.Tensor] = None, blocksize=64, compress_statistics=False, quant_storage=torch.uint8): return quantize_4bit(A, absmax, out, blocksize, compress_statistics, 'fp4', quant_storage) -def quantize_nf4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False, quant_storage=torch.uint8): +def quantize_nf4(A: Tensor, absmax: Optional[torch.Tensor] = None, out: Optional[torch.Tensor] = None, blocksize=64, compress_statistics=False, quant_storage=torch.uint8): return quantize_4bit(A, absmax, out, blocksize, compress_statistics, 'nf4', quant_storage) def quantize_4bit( A: Tensor, - absmax: Tensor = None, - out: Tensor = None, + absmax: Optional[torch.Tensor] = None, + out: Optional[torch.Tensor] = None, blocksize=64, compress_statistics=False, quant_type='fp4', @@ -987,13 +1007,13 @@ def quantize_4bit( return out, state -def dequantize_fp4(A: Tensor, quant_state: QuantState = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64) -> Tensor: +def dequantize_fp4(A: Tensor, quant_state: Optional[QuantState] = None, absmax: Optional[torch.Tensor] = None, out: Optional[torch.Tensor] = None, blocksize: int = 64) -> Tensor: return dequantize_4bit(A, quant_state, absmax, out, blocksize, 'fp4') -def dequantize_nf4(A: Tensor, quant_state: QuantState = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64) -> Tensor: +def dequantize_nf4(A: Tensor, quant_state: Optional[QuantState] = None, absmax: Optional[torch.Tensor] = None, out: Optional[torch.Tensor] = None, blocksize: int = 64) -> Tensor: return dequantize_4bit(A, quant_state, absmax, out, blocksize, 'nf4') -def dequantize_4bit(A: Tensor, quant_state: QuantState = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64, quant_type='fp4') -> Tensor: +def dequantize_4bit(A: Tensor, quant_state: Optional[QuantState] = None, absmax: Optional[torch.Tensor] = None, out: Optional[torch.Tensor] = None, blocksize: int = 64, quant_type='fp4') -> Tensor: """ Dequantizes FP4 blockwise quantized values. 
@@ -1070,7 +1090,11 @@ def dequantize_4bit(A: Tensor, quant_state: QuantState = None, absmax: Tensor = else: return out -def quantize(A: Tensor, code: Tensor = None, out: Tensor = None) -> Tensor: +def quantize( + A: Tensor, + code: Optional[torch.Tensor] = None, + out: Optional[torch.Tensor] = None, +) -> Tuple[Tensor, Tuple[Tensor, Tensor]]: if code is None: if "dynamic" not in name2qmap: name2qmap["dynamic"] = create_dynamic_map().to(A.device) @@ -1086,10 +1110,10 @@ def quantize(A: Tensor, code: Tensor = None, out: Tensor = None) -> Tensor: def dequantize( A: Tensor, - state: Tuple[Tensor, Tensor] = None, - absmax: Tensor = None, - code: Tensor = None, - out: Tensor = None, + state: Optional[Tuple[Tensor, Tensor]] = None, + absmax: Optional[torch.Tensor] = None, + code: Optional[torch.Tensor] = None, + out: Optional[torch.Tensor] = None, ) -> Tensor: assert state is not None or absmax is not None if code is None and state is None: @@ -1104,7 +1128,7 @@ def dequantize( return out * state[0] -def quantize_no_absmax(A: Tensor, code: Tensor, out: Tensor = None) -> Tensor: +def quantize_no_absmax(A: Tensor, code: Tensor, out: Optional[torch.Tensor] = None) -> Tensor: ''' Quantizes input tensor to 8-bit. @@ -1133,7 +1157,7 @@ def quantize_no_absmax(A: Tensor, code: Tensor, out: Tensor = None) -> Tensor: return out -def dequantize_no_absmax(A: Tensor, code: Tensor, out: Tensor = None) -> Tensor: +def dequantize_no_absmax(A: Tensor, code: Tensor, out: Optional[torch.Tensor] = None) -> Tensor: ''' Dequantizes the 8-bit tensor to 32-bit. @@ -1171,11 +1195,11 @@ def optimizer_update_32bit( eps: float, step: int, lr: float, - state2: Tensor = None, + state2: Optional[torch.Tensor] = None, beta2: float = 0.0, weight_decay: float = 0.0, gnorm_scale: float = 1.0, - unorm_vec: Tensor = None, + unorm_vec: Optional[torch.Tensor] = None, max_unorm: float = 0.0, skip_zeros=False, ) -> None: @@ -1274,7 +1298,7 @@ def optimizer_update_8bit( new_max2: Tensor, weight_decay: float = 0.0, gnorm_scale: float = 1.0, - unorm_vec: Tensor = None, + unorm_vec: Optional[torch.Tensor] = None, max_unorm: float = 0.0, ) -> None: """ @@ -1603,7 +1627,7 @@ def check_matmul(A, B, out, transposed_A, transposed_B, expected_type=torch.int8 def gemv_4bit( A: Tensor, B: Tensor, - out: Tensor = None, + out: Optional[torch.Tensor] = None, transposed_A=False, transposed_B=False, state=None @@ -1663,7 +1687,7 @@ def gemv_4bit( def igemm( A: Tensor, B: Tensor, - out: Tensor = None, + out: Optional[torch.Tensor] = None, transposed_A=False, transposed_B=False, ): @@ -1752,7 +1776,7 @@ def igemm( def batched_igemm( A: Tensor, B: Tensor, - out: Tensor = None, + out: Optional[torch.Tensor] = None, transposed_A=False, transposed_B=False, ): diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index 0b1dc5c6f..b1f6deb21 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -145,7 +145,7 @@ def __new__( cls, data: Optional[torch.Tensor] = None, requires_grad=True, - quant_state: QuantState = None, + quant_state: Optional[QuantState] = None, blocksize: int = 64, compress_statistics: bool = True, quant_type: str = 'fp4', diff --git a/bitsandbytes/nn/triton_based_modules.py b/bitsandbytes/nn/triton_based_modules.py index de07ac647..67b45f4a5 100644 --- a/bitsandbytes/nn/triton_based_modules.py +++ b/bitsandbytes/nn/triton_based_modules.py @@ -162,7 +162,7 @@ def __init__( ): super().__init__(in_features, out_features, bias, device, dtype) - if not is_triton_available: + if not is_triton_available(): raise 
ImportError('''Could not import triton. Please install triton to use SwitchBackLinear. Alternatively, you can use bnb.nn.SwitchBackLinearBnb, but it will be slower''') diff --git a/bitsandbytes/research/autograd/_functions.py b/bitsandbytes/research/autograd/_functions.py index 0dff351e0..06b0748ff 100644 --- a/bitsandbytes/research/autograd/_functions.py +++ b/bitsandbytes/research/autograd/_functions.py @@ -2,6 +2,7 @@ import warnings from dataclasses import dataclass from functools import reduce # Required in Python 3 +from typing import Optional import torch @@ -14,7 +15,6 @@ def prod(iterable): return reduce(operator.mul, iterable, 1) -tensor = torch.Tensor class MatMulFP8Mixed(torch.autograd.Function): # forward is the same, but we added the fallback for pre-turing GPUs @@ -389,19 +389,38 @@ def get_block_sizes(input_matrix, weight_matrix): return bsz, bsz2 -def matmul_fp8_global(A: tensor, B: tensor, fw_code: tensor, bw_code: tensor, out: tensor = None, bsz : int = -1, bsz2 : int = -1): + +def matmul_fp8_global( + A: torch.Tensor, + B: torch.Tensor, + fw_code: torch.Tensor, + bw_code: torch.Tensor, + out: Optional[torch.Tensor] = None, + bsz: int = -1, + bsz2: int = -1, +): if bsz == -1 or bsz2 == -1: bsz, bsz2 = get_block_sizes(A, B) return MatMulFP8Global.apply(A, B, out, fw_code, bw_code, bsz, bsz2) -def matmul_fp8_mixed(A: tensor, B: tensor, fw_code: tensor, bw_code: tensor, out: tensor = None, bsz : int = -1, bsz2 : int = -1): + +def matmul_fp8_mixed( + A: torch.Tensor, + B: torch.Tensor, + fw_code: torch.Tensor, + bw_code: torch.Tensor, + out: Optional[torch.Tensor] = None, + bsz: int = -1, + bsz2: int = -1, +): if bsz == -1 or bsz2 == -1: bsz, bsz2 = get_block_sizes(A, B) return MatMulFP8Mixed.apply(A, B, out, fw_code, bw_code, bsz, bsz2) + def switchback_bnb( - A: tensor, - B: tensor, - out: tensor = None, - state: MatmulLtState = None, + A: torch.Tensor, + B: torch.Tensor, + out: Optional[torch.Tensor] = None, + state: Optional[MatmulLtState] = None, threshold=0.0, bias=None ): diff --git a/pyproject.toml b/pyproject.toml index 74d17dd90..c73f579e0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,4 +34,12 @@ ignore-init-module-imports = true # allow to expose in __init__.py via imports combine-as-imports = true detect-same-package = true force-sort-within-sections = true -known-first-party = ["bitsandbytes"] \ No newline at end of file +known-first-party = ["bitsandbytes"] + +[[tool.mypy.overrides]] +module = "triton.*" +ignore_missing_imports = true + +[[tool.mypy.overrides]] +module = "scipy.stats" +ignore_missing_imports = true From 706ec24d2f5717cf484191d2f09011432640a8e6 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Tue, 30 Jan 2024 02:05:29 +0200 Subject: [PATCH 008/112] Ruff fixes (#984) * Adjust Ruff configuration * do not autofix always * be less strict around tests and benchmarks * adjust ignores for now * Ruff: autofix I and F401 * Apply ruff autofixes * Fix RUF013 complaint * Fix mutable default in replace_linear * Don't use bare except * Wrap bitsandbytes.__main__ entrypoint in function; fix "sensible" typo * Fix ruff B008 (function call in arguments) * Add ruff noqas as suitable * Fix RUF005 (splat instead of concatenating) * Fix B018 (useless expression) * Add pre-commit configuration + GitHub Actions lint workflow * Fix unused `e` in bitsandbytes/__main__.py * fix merge conflict resolution error * run pre-commit hook --------- Co-authored-by: Titus <9048635+Titus-von-Koeller@users.noreply.github.com> --- .github/workflows/lint.yml | 19 ++++ 
.pre-commit-config.yaml | 8 ++ .../switchback/make_plot_with_jsonl.py | 6 +- benchmarking/switchback/speed_benchmark.py | 20 +++- bitsandbytes/__init__.py | 4 +- bitsandbytes/__main__.py | 103 ++++++++---------- bitsandbytes/autograd/__init__.py | 2 +- bitsandbytes/autograd/_functions.py | 6 +- bitsandbytes/cextension.py | 9 +- bitsandbytes/cuda_setup/main.py | 42 ++++--- bitsandbytes/functional.py | 28 ++--- bitsandbytes/nn/__init__.py | 20 +++- bitsandbytes/nn/modules.py | 16 +-- bitsandbytes/nn/triton_based_modules.py | 24 ++-- bitsandbytes/optim/__init__.py | 11 +- bitsandbytes/optim/adamw.py | 1 - bitsandbytes/optim/lion.py | 1 + bitsandbytes/optim/optimizer.py | 3 +- bitsandbytes/research/__init__.py | 2 +- bitsandbytes/research/autograd/_functions.py | 12 +- bitsandbytes/research/nn/__init__.py | 2 +- bitsandbytes/research/nn/modules.py | 7 +- bitsandbytes/triton/dequantize_rowwise.py | 4 +- .../triton/int8_matmul_mixed_dequantize.py | 4 +- .../triton/int8_matmul_rowwise_dequantize.py | 3 +- .../quantize_columnwise_and_transpose.py | 4 +- bitsandbytes/triton/quantize_global.py | 5 +- bitsandbytes/triton/quantize_rowwise.py | 3 +- bitsandbytes/triton/triton_utils.py | 1 + bitsandbytes/utils.py | 14 ++- check_bnb_install.py | 3 +- install_cuda.py | 2 +- pyproject.toml | 23 +++- scripts/stale.py | 4 +- setup.py | 3 +- tests/test_autograd.py | 2 +- tests/test_cuda_setup_evaluator.py | 5 +- tests/test_functional.py | 6 +- tests/test_generation.py | 13 +-- tests/test_linear4bit.py | 3 +- tests/test_linear8bitlt.py | 3 +- tests/test_modules.py | 3 +- tests/test_optim.py | 10 +- tests/test_triton.py | 5 +- 44 files changed, 267 insertions(+), 202 deletions(-) create mode 100644 .github/workflows/lint.yml create mode 100644 .pre-commit-config.yaml diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 000000000..01084d44f --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,19 @@ +name: Lint + +on: + push: + branches: + - main + pull_request: + +jobs: + Lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v4 + with: + python-version: "3.12" + - uses: pre-commit/action@v3.0.0 + env: + RUFF_OUTPUT_FORMAT: github diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 000000000..d568a849f --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,8 @@ +repos: + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.1.15 + hooks: + - id: ruff + args: + - --fix + # - id: ruff-format # TODO: enable when the time is right diff --git a/benchmarking/switchback/make_plot_with_jsonl.py b/benchmarking/switchback/make_plot_with_jsonl.py index 8897564e7..3ef87d6b2 100644 --- a/benchmarking/switchback/make_plot_with_jsonl.py +++ b/benchmarking/switchback/make_plot_with_jsonl.py @@ -1,9 +1,7 @@ -import matplotlib.pyplot as plt -import pandas as pd -import numpy as np -import os import matplotlib.gridspec as gridspec +import matplotlib.pyplot as plt +import pandas as pd cmap=plt.get_cmap('cool') diff --git a/benchmarking/switchback/speed_benchmark.py b/benchmarking/switchback/speed_benchmark.py index b0983d0b8..d70df0386 100644 --- a/benchmarking/switchback/speed_benchmark.py +++ b/benchmarking/switchback/speed_benchmark.py @@ -1,14 +1,22 @@ import json - import time + import torch -import torch.nn as nn +from bitsandbytes.triton.int8_matmul_mixed_dequantize import ( + int8_matmul_mixed_dequantize, +) +from bitsandbytes.triton.int8_matmul_rowwise_dequantize import ( + 
int8_matmul_rowwise_dequantize, +) +from bitsandbytes.triton.quantize_columnwise_and_transpose import ( + quantize_columnwise_and_transpose, +) +from bitsandbytes.triton.quantize_global import ( + quantize_global, + quantize_global_transpose, +) from bitsandbytes.triton.quantize_rowwise import quantize_rowwise -from bitsandbytes.triton.quantize_columnwise_and_transpose import quantize_columnwise_and_transpose -from bitsandbytes.triton.int8_matmul_rowwise_dequantize import int8_matmul_rowwise_dequantize -from bitsandbytes.triton.quantize_global import quantize_global, quantize_global_transpose -from bitsandbytes.triton.int8_matmul_mixed_dequantize import int8_matmul_mixed_dequantize # KNOW ISSUE: need to optimize "w_quantize_colwise_transpose" when embeddim is too large. diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py index 01d5527f5..87307a9d2 100644 --- a/bitsandbytes/__init__.py +++ b/bitsandbytes/__init__.py @@ -3,14 +3,14 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from . import cuda_setup, utils, research +from . import cuda_setup, research, utils from .autograd._functions import ( MatmulLtState, bmm_cublas, matmul, + matmul_4bit, matmul_cublas, mm_cublas, - matmul_4bit ) from .cextension import COMPILED_WITH_CUDA from .nn import modules diff --git a/bitsandbytes/__main__.py b/bitsandbytes/__main__.py index 8f58e1665..af5c1c523 100644 --- a/bitsandbytes/__main__.py +++ b/bitsandbytes/__main__.py @@ -1,11 +1,7 @@ import os +from os.path import isdir import sys -import shlex -import subprocess - from warnings import warn -from typing import Tuple -from os.path import isdir import torch @@ -20,7 +16,7 @@ def find_file_recursive(folder, filename): out = glob.glob(os.path.join(folder, "**", filename + ext)) outs.extend(out) except Exception as e: - raise RuntimeError('Error: Something when wrong when trying to find file. {e}') + raise RuntimeError('Error: Something when wrong when trying to find file.') from e return outs @@ -62,14 +58,11 @@ def generate_bug_report_information(): print_header(f"{path} CUDA PATHS") paths = find_file_recursive(path, '*cuda*') print(paths) - except: - print(f'Could not read LD_LIBRARY_PATH: {path}') + except Exception as e: + print(f'Could not read LD_LIBRARY_PATH: {path} ({e})') print('') - - - def print_header( txt: str, width: int = HEADER_WIDTH, filler: str = "+" ) -> None: @@ -78,67 +71,61 @@ def print_header( def print_debug_info() -> None: + from . import PACKAGE_GITHUB_URL print( "\nAbove we output some debug information. Please provide this info when " f"creating an issue via {PACKAGE_GITHUB_URL}/issues/new/choose ...\n" ) -generate_bug_report_information() +def main(): + generate_bug_report_information() + from . import COMPILED_WITH_CUDA + from .cuda_setup.main import get_compute_capabilities -from . 
import COMPILED_WITH_CUDA, PACKAGE_GITHUB_URL -from .cuda_setup.env_vars import to_be_ignored -from .cuda_setup.main import get_compute_capabilities - + print_header("OTHER") + print(f"COMPILED_WITH_CUDA = {COMPILED_WITH_CUDA}") + print(f"COMPUTE_CAPABILITIES_PER_GPU = {get_compute_capabilities()}") + print_header("") + print_header("DEBUG INFO END") + print_header("") + print("Checking that the library is importable and CUDA is callable...") + print("\nWARNING: Please be sure to sanitize sensitive info from any such env vars!\n") -print_header("OTHER") -print(f"COMPILED_WITH_CUDA = {COMPILED_WITH_CUDA}") -print(f"COMPUTE_CAPABILITIES_PER_GPU = {get_compute_capabilities()}") -print_header("") -print_header("DEBUG INFO END") -print_header("") -print( - """ -Running a quick check that: - + library is importable - + CUDA function is callable -""" -) -print("\nWARNING: Please be sure to sanitize sensible info from any such env vars!\n") + try: + from bitsandbytes.optim import Adam -try: - from bitsandbytes.optim import Adam + p = torch.nn.Parameter(torch.rand(10, 10).cuda()) + a = torch.rand(10, 10).cuda() - p = torch.nn.Parameter(torch.rand(10, 10).cuda()) - a = torch.rand(10, 10).cuda() + p1 = p.data.sum().item() - p1 = p.data.sum().item() + adam = Adam([p]) - adam = Adam([p]) + out = a * p + loss = out.sum() + loss.backward() + adam.step() - out = a * p - loss = out.sum() - loss.backward() - adam.step() + p2 = p.data.sum().item() - p2 = p.data.sum().item() + assert p1 != p2 + print("SUCCESS!") + print("Installation was successful!") + except ImportError: + print() + warn( + f"WARNING: {__package__} is currently running as CPU-only!\n" + "Therefore, 8-bit optimizers and GPU quantization are unavailable.\n\n" + f"If you think that this is so erroneously,\nplease report an issue!" + ) + print_debug_info() + except Exception as e: + print(e) + print_debug_info() + sys.exit(1) - assert p1 != p2 - print("SUCCESS!") - print("Installation was successful!") - sys.exit(0) -except ImportError: - print() - warn( - f"WARNING: {__package__} is currently running as CPU-only!\n" - "Therefore, 8-bit optimizers and GPU quantization are unavailable.\n\n" - f"If you think that this is so erroneously,\nplease report an issue!" 
- ) - print_debug_info() - sys.exit(0) -except Exception as e: - print(e) - print_debug_info() - sys.exit(1) +if __name__ == "__main__": + main() diff --git a/bitsandbytes/autograd/__init__.py b/bitsandbytes/autograd/__init__.py index 6b9a7e4d1..f262d89ed 100644 --- a/bitsandbytes/autograd/__init__.py +++ b/bitsandbytes/autograd/__init__.py @@ -1 +1 @@ -from ._functions import undo_layout, get_inverse_transform_indices +from ._functions import get_inverse_transform_indices, undo_layout diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py index 9917e326e..6cbb6efd9 100644 --- a/bitsandbytes/autograd/_functions.py +++ b/bitsandbytes/autograd/_functions.py @@ -1,8 +1,8 @@ -import operator -import warnings from dataclasses import dataclass from functools import reduce # Required in Python 3 -from typing import Tuple, Optional, Callable +import operator +from typing import Callable, Optional, Tuple +import warnings from warnings import warn import torch diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py index d52a6d607..858365f02 100644 --- a/bitsandbytes/cextension.py +++ b/bitsandbytes/cextension.py @@ -1,12 +1,9 @@ import ctypes as ct -import os -import torch - -from pathlib import Path from warnings import warn -from bitsandbytes.cuda_setup.main import CUDASetup +import torch +from bitsandbytes.cuda_setup.main import CUDASetup setup = CUDASetup.get_instance() if setup.initialized != True: @@ -25,7 +22,7 @@ Inspect the output of the command and see if you can locate CUDA libraries. You might need to add them to your LD_LIBRARY_PATH. If you suspect a bug, please take the information from python -m bitsandbytes and open an issue at: https://github.com/TimDettmers/bitsandbytes/issues''') - lib.cadam32bit_grad_fp32 # runs on an error if the library could not be found -> COMPILED_WITH_CUDA=False + _ = lib.cadam32bit_grad_fp32 # runs on an error if the library could not be found -> COMPILED_WITH_CUDA=False lib.get_context.restype = ct.c_void_p lib.get_cusparse.restype = ct.c_void_p lib.cget_managed_ptr.restype = ct.c_void_p diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py index a5931ef5e..a34385b1f 100644 --- a/bitsandbytes/cuda_setup/main.py +++ b/bitsandbytes/cuda_setup/main.py @@ -17,15 +17,15 @@ """ import ctypes as ct -import os import errno +import os +from pathlib import Path import platform -import torch +from typing import Set, Union from warnings import warn -from itertools import product -from pathlib import Path -from typing import Set, Union +import torch + from .env_vars import get_potentially_lib_path_containing_env_vars # these are the most common libs names @@ -111,14 +111,16 @@ def manual_override(self): if torch.cuda.is_available(): if 'BNB_CUDA_VERSION' in os.environ: if len(os.environ['BNB_CUDA_VERSION']) > 0: - warn((f'\n\n{"="*80}\n' - 'WARNING: Manual override via BNB_CUDA_VERSION env variable detected!\n' - 'BNB_CUDA_VERSION=XXX can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.\n' - 'If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=\n' - 'If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH\n' - 'For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH: Set[Path]: try: if path.exists(): existent_directories.add(path) - except PermissionError as pex: + except PermissionError: # Handle the PermissionError first as it is a subtype 
of OSError # https://docs.python.org/3/library/exceptions.html#exception-hierarchy pass @@ -217,8 +219,10 @@ def remove_non_existent_dirs(candidate_paths: Set[Path]) -> Set[Path]: non_existent_directories: Set[Path] = candidate_paths - existent_directories if non_existent_directories: - CUDASetup.get_instance().add_log_entry("The following directories listed in your path were found to " - f"be non-existent: {non_existent_directories}", is_warning=False) + CUDASetup.get_instance().add_log_entry( + f"The following directories listed in your path were found to be non-existent: {non_existent_directories}", + is_warning=False, + ) return existent_directories @@ -360,8 +364,10 @@ def evaluate_cuda_setup(): cuda_version_string = get_cuda_version() cuda_setup.add_log_entry(f"CUDA SETUP: PyTorch settings found: CUDA_VERSION={cuda_version_string}, Highest Compute Capability: {cc}.") - cuda_setup.add_log_entry(f"CUDA SETUP: To manually override the PyTorch CUDA version please see:" - "https://github.com/TimDettmers/bitsandbytes/blob/main/how_to_use_nonpytorch_cuda.md") + cuda_setup.add_log_entry( + "CUDA SETUP: To manually override the PyTorch CUDA version please see:" + "https://github.com/TimDettmers/bitsandbytes/blob/main/how_to_use_nonpytorch_cuda.md" + ) # 7.5 is the minimum CC vor cublaslt diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index 25aa4e531..1f624a7a8 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -3,17 +3,15 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import ctypes as ct +from functools import reduce # Required in Python 3 import itertools import operator -import random -import torch -import itertools -import math -import numpy as np +from typing import Any, Dict, Optional, Tuple -from functools import reduce # Required in Python 3 -from typing import Tuple, Any, Dict, Optional +import numpy as np +import torch from torch import Tensor + from bitsandbytes.utils import pack_dict_to_tensor, unpack_tensor_to_dict from .cextension import COMPILED_WITH_CUDA, lib @@ -178,7 +176,9 @@ def get_instance(cls): dtype2bytes[torch.uint8] = 1 dtype2bytes[torch.int8] = 1 -def get_paged(*shape, dtype=torch.float32, device=torch.device('cuda', index=0)): +FIRST_CUDA_DEVICE = torch.device('cuda', index=0) + +def get_paged(*shape, dtype=torch.float32, device=FIRST_CUDA_DEVICE): num_bytes = dtype2bytes[dtype]*prod(shape) cuda_ptr = lib.cget_managed_ptr(ct.c_size_t(num_bytes)) c_ptr = ct.cast(cuda_ptr, ct.POINTER(ct.c_int)) @@ -242,7 +242,7 @@ def create_linear_map(signed=True, total_bits=8, add_zero=True): if gap == 0: return values else: - l = values.numel()//2 + l = values.numel()//2 # noqa: E741 return torch.Tensor(values[:l].tolist() + [0]*gap + values[l:].tolist()) @@ -283,7 +283,7 @@ def create_fp8_map(signed=True, exponent_bits=5, precision_bits=2, total_bits=8) # the exponent is biased to 2^(e-1) -1 == 0 evalues = [] pvalues = [] - for i, val in enumerate(range(-((2**(exponent_bits-has_sign))), 2**(exponent_bits-has_sign), 1)): + for i, val in enumerate(range(-(2**(exponent_bits-has_sign)), 2**(exponent_bits-has_sign), 1)): evalues.append(2**val) @@ -345,7 +345,7 @@ def create_dynamic_map(signed=True, max_exponent_bits=7, total_bits=8): non_sign_bits = total_bits - (1 if signed else 1) additional_items = 2 ** (non_sign_bits - max_exponent_bits) - 1 for i in range(max_exponent_bits): - fraction_items = int((2 ** (i + non_sign_bits - max_exponent_bits) + 1 if signed 
else 2 ** (i + non_sign_bits - max_exponent_bits + 1) + 1)) + fraction_items = int(2 ** (i + non_sign_bits - max_exponent_bits) + 1 if signed else 2 ** (i + non_sign_bits - max_exponent_bits + 1) + 1) boundaries = torch.linspace(0.1, 1, fraction_items) means = (boundaries[:-1] + boundaries[1:]) / 2.0 data += ((10 ** (-(max_exponent_bits - 1) + i)) * means).tolist() @@ -899,7 +899,7 @@ def get_4bit_type(typename, device=None, blocksize=64): -0.04934812, 0., 0.04273164, 0.12934483, 0.21961274, 0.31675666, 0.42563882, 0.55496234, 0.72424863, 1.][::-1] else: - raise NotImplementedError(f'4-bit AbnormalFloats currently only support blocksize 64.') + raise NotImplementedError('4-bit AbnormalFloats currently only support blocksize 64.') if data is None: raise NotImplementedError(f'Typename {typename} not supported') @@ -1635,10 +1635,10 @@ def gemv_4bit( prev_device = pre_call(A.device) #sout = check_matmul(A, B, out, transposed_A, transposed_B, expected_type=A.dtype) if state is None: - raise ValueError(f'state cannot None. gem_4bit( ) requires the state from quantize_4bit( )') + raise ValueError('state cannot None. gem_4bit( ) requires the state from quantize_4bit( )') if A.numel() != A.shape[-1]: - raise ValueError(f'Dimensions of A are invalid. Must be a vector with the leading dimensions of "1", e.g. [1, 1, 2048]') + raise ValueError('Dimensions of A are invalid. Must be a vector with the leading dimensions of "1", e.g. [1, 1, 2048]') Bshape = state.shape bout = Bshape[0] diff --git a/bitsandbytes/nn/__init__.py b/bitsandbytes/nn/__init__.py index 6fa6d1183..96f4359bf 100644 --- a/bitsandbytes/nn/__init__.py +++ b/bitsandbytes/nn/__init__.py @@ -2,5 +2,21 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from .modules import Int8Params, Linear8bitLt, StableEmbedding, Linear4bit, LinearNF4, LinearFP4, Params4bit, OutlierAwareLinear, SwitchBackLinearBnb, Embedding -from .triton_based_modules import SwitchBackLinear, SwitchBackLinearGlobal, SwitchBackLinearVectorwise, StandardLinear +from .modules import ( + Embedding, + Int8Params, + Linear4bit, + Linear8bitLt, + LinearFP4, + LinearNF4, + OutlierAwareLinear, + Params4bit, + StableEmbedding, + SwitchBackLinearBnb, +) +from .triton_based_modules import ( + StandardLinear, + SwitchBackLinear, + SwitchBackLinearGlobal, + SwitchBackLinearVectorwise, +) diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index b1f6deb21..922feae15 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -3,17 +3,17 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
from typing import Any, Dict, Optional, TypeVar, Union, overload - import warnings + import torch -import torch.nn.functional as F from torch import Tensor, device, dtype, nn +import torch.nn.functional as F import bitsandbytes as bnb +from bitsandbytes.autograd._functions import get_tile_inds, undo_layout from bitsandbytes.functional import QuantState -from bitsandbytes.autograd._functions import undo_layout, get_tile_inds from bitsandbytes.optim import GlobalOptimManager -from bitsandbytes.utils import OutlierTracer, find_outlier_dims +from bitsandbytes.utils import OutlierTracer T = TypeVar("T", bound="torch.nn.Module") @@ -242,10 +242,10 @@ def set_compute_type(self, x): if self.compute_dtype == torch.float32 and (x.numel() == x.shape[-1]): # single batch inference with input torch.float16 and compute_dtype float32 -> slow inference when it could be fast # warn the user about this - warnings.warn(f'Input type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). This will lead to slow inference.') + warnings.warn('Input type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). This will lead to slow inference.') warnings.filterwarnings('ignore', message='.*inference.') if self.compute_dtype == torch.float32 and (x.numel() != x.shape[-1]): - warnings.warn(f'Input type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). This will lead to slow inference or training speed.') + warnings.warn('Input type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). This will lead to slow inference or training speed.') warnings.filterwarnings('ignore', message='.*inference or training') def _save_to_state_dict(self, destination, prefix, keep_vars): @@ -337,8 +337,8 @@ def cuda(self, device): del CBt del SCBt self.data = CB - setattr(self, "CB", CB) - setattr(self, "SCB", SCB) + self.CB = CB + self.SCB = SCB return self diff --git a/bitsandbytes/nn/triton_based_modules.py b/bitsandbytes/nn/triton_based_modules.py index 67b45f4a5..9c7738c59 100644 --- a/bitsandbytes/nn/triton_based_modules.py +++ b/bitsandbytes/nn/triton_based_modules.py @@ -1,16 +1,24 @@ -import torch -import torch.nn as nn -import time from functools import partial -from bitsandbytes.triton.triton_utils import is_triton_available +import torch +import torch.nn as nn from bitsandbytes.triton.dequantize_rowwise import dequantize_rowwise +from bitsandbytes.triton.int8_matmul_mixed_dequantize import ( + int8_matmul_mixed_dequantize, +) +from bitsandbytes.triton.int8_matmul_rowwise_dequantize import ( + int8_matmul_rowwise_dequantize, +) +from bitsandbytes.triton.quantize_columnwise_and_transpose import ( + quantize_columnwise_and_transpose, +) +from bitsandbytes.triton.quantize_global import ( + quantize_global, + quantize_global_transpose, +) from bitsandbytes.triton.quantize_rowwise import quantize_rowwise -from bitsandbytes.triton.quantize_columnwise_and_transpose import quantize_columnwise_and_transpose -from bitsandbytes.triton.int8_matmul_rowwise_dequantize import int8_matmul_rowwise_dequantize -from bitsandbytes.triton.quantize_global import quantize_global, quantize_global_transpose -from bitsandbytes.triton.int8_matmul_mixed_dequantize import int8_matmul_mixed_dequantize +from bitsandbytes.triton.triton_utils import is_triton_available class _switchback_global(torch.autograd.Function): diff --git a/bitsandbytes/optim/__init__.py b/bitsandbytes/optim/__init__.py index 83a57bd9f..6796b8e0e 100644 --- 
a/bitsandbytes/optim/__init__.py +++ b/bitsandbytes/optim/__init__.py @@ -7,10 +7,17 @@ from .adagrad import Adagrad, Adagrad8bit, Adagrad32bit from .adam import Adam, Adam8bit, Adam32bit, PagedAdam, PagedAdam8bit, PagedAdam32bit -from .adamw import AdamW, AdamW8bit, AdamW32bit, PagedAdamW, PagedAdamW8bit, PagedAdamW32bit +from .adamw import ( + AdamW, + AdamW8bit, + AdamW32bit, + PagedAdamW, + PagedAdamW8bit, + PagedAdamW32bit, +) from .lamb import LAMB, LAMB8bit, LAMB32bit from .lars import LARS, LARS8bit, LARS32bit, PytorchLARS +from .lion import Lion, Lion8bit, Lion32bit, PagedLion, PagedLion8bit, PagedLion32bit from .optimizer import GlobalOptimManager from .rmsprop import RMSprop, RMSprop8bit, RMSprop32bit -from .lion import Lion, Lion8bit, Lion32bit, PagedLion, PagedLion8bit, PagedLion32bit from .sgd import SGD, SGD8bit, SGD32bit diff --git a/bitsandbytes/optim/adamw.py b/bitsandbytes/optim/adamw.py index 21077f1a0..9ea5812ea 100644 --- a/bitsandbytes/optim/adamw.py +++ b/bitsandbytes/optim/adamw.py @@ -5,7 +5,6 @@ from bitsandbytes.optim.optimizer import Optimizer2State - class AdamW(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): diff --git a/bitsandbytes/optim/lion.py b/bitsandbytes/optim/lion.py index 2bde1a447..b6ba4a9f1 100644 --- a/bitsandbytes/optim/lion.py +++ b/bitsandbytes/optim/lion.py @@ -4,6 +4,7 @@ # LICENSE file in the root directory of this source tree. from bitsandbytes.optim.optimizer import Optimizer1State + class Lion(Optimizer1State): def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), weight_decay=0, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): super().__init__("lion", params, lr, betas, 0., weight_decay, optim_bits, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged) diff --git a/bitsandbytes/optim/optimizer.py b/bitsandbytes/optim/optimizer.py index fb83eddf0..8254d16b4 100644 --- a/bitsandbytes/optim/optimizer.py +++ b/bitsandbytes/optim/optimizer.py @@ -2,8 +2,7 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from collections import abc as container_abcs -from collections import defaultdict +from collections import abc as container_abcs, defaultdict from copy import deepcopy from itertools import chain diff --git a/bitsandbytes/research/__init__.py b/bitsandbytes/research/__init__.py index 47b720d78..31db4f282 100644 --- a/bitsandbytes/research/__init__.py +++ b/bitsandbytes/research/__init__.py @@ -1,6 +1,6 @@ from . 
import nn from .autograd._functions import ( - switchback_bnb, matmul_fp8_global, matmul_fp8_mixed, + switchback_bnb, ) diff --git a/bitsandbytes/research/autograd/_functions.py b/bitsandbytes/research/autograd/_functions.py index 06b0748ff..e515bfeff 100644 --- a/bitsandbytes/research/autograd/_functions.py +++ b/bitsandbytes/research/autograd/_functions.py @@ -1,15 +1,13 @@ -import operator -import warnings -from dataclasses import dataclass from functools import reduce # Required in Python 3 +import operator from typing import Optional +import warnings import torch +from bitsandbytes.autograd._functions import GlobalOutlierPooler, MatmulLtState import bitsandbytes.functional as F -from bitsandbytes.autograd._functions import MatmulLtState, GlobalOutlierPooler - # math.prod not compatible with python < 3.8 def prod(iterable): @@ -186,7 +184,9 @@ def backward(ctx, grad_output): class SwitchBackBnb(torch.autograd.Function): @staticmethod - def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState()): + # TODO: the B008 on the line below is a likely bug; the current implementation will + # have each SwitchBackBnb instance share a single MatmulLtState instance!!! + def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState()): # noqa: B008 # default to pytorch behavior if inputs are empty ctx.is_empty = False if prod(A.shape) == 0: diff --git a/bitsandbytes/research/nn/__init__.py b/bitsandbytes/research/nn/__init__.py index 8faec10bb..417011218 100644 --- a/bitsandbytes/research/nn/__init__.py +++ b/bitsandbytes/research/nn/__init__.py @@ -1 +1 @@ -from .modules import LinearFP8Mixed, LinearFP8Global +from .modules import LinearFP8Global, LinearFP8Mixed diff --git a/bitsandbytes/research/nn/modules.py b/bitsandbytes/research/nn/modules.py index 2a46b40c3..7fca34d23 100644 --- a/bitsandbytes/research/nn/modules.py +++ b/bitsandbytes/research/nn/modules.py @@ -1,12 +1,9 @@ -from typing import Optional, TypeVar, Union, overload +from typing import TypeVar import torch -import torch.nn.functional as F -from torch import Tensor, device, dtype, nn +from torch import nn import bitsandbytes as bnb -from bitsandbytes.optim import GlobalOptimManager -from bitsandbytes.utils import OutlierTracer, find_outlier_dims T = TypeVar("T", bound="torch.nn.Module") diff --git a/bitsandbytes/triton/dequantize_rowwise.py b/bitsandbytes/triton/dequantize_rowwise.py index e092680b8..daa59da9c 100644 --- a/bitsandbytes/triton/dequantize_rowwise.py +++ b/bitsandbytes/triton/dequantize_rowwise.py @@ -1,6 +1,7 @@ import math + import torch -import time + from bitsandbytes.triton.triton_utils import is_triton_available if not is_triton_available(): @@ -9,7 +10,6 @@ def dequantize_rowwise(x: torch.Tensor, state_x: torch.Tensor): return None import triton import triton.language as tl - from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time # rowwise quantize diff --git a/bitsandbytes/triton/int8_matmul_mixed_dequantize.py b/bitsandbytes/triton/int8_matmul_mixed_dequantize.py index b0961f558..1b80ab1a0 100644 --- a/bitsandbytes/triton/int8_matmul_mixed_dequantize.py +++ b/bitsandbytes/triton/int8_matmul_mixed_dequantize.py @@ -1,4 +1,5 @@ import torch + from bitsandbytes.triton.triton_utils import is_triton_available if not is_triton_available(): @@ -57,7 +58,8 @@ def get_configs_io_bound(): triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, 
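As a minimal, standalone sketch of the pitfall the B008 TODO above refers to (the names here are invented; nothing is assumed about bitsandbytes itself): a Python default argument is evaluated once, when the function is defined, so every call that omits the argument shares the same object.

class State:
    def __init__(self):
        self.calls = 0

def forward(x, state=State()):  # default evaluated once, at definition time
    state.calls += 1            # mutates the single shared State instance
    return state.calls

print(forward("a"))                 # 1
print(forward("b"))                 # 2 -- same State object as the first call
print(forward("c", state=State()))  # 1 -- a fresh object only when passed explicitly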
num_warps=4), triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=5, num_warps=2), - ] + get_configs_io_bound(), + *get_configs_io_bound(), + ], key=['M', 'N', 'K'], prune_configs_by={ 'early_config_prune': early_config_prune, diff --git a/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py b/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py index 33f4d13f2..1f28b0d10 100644 --- a/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py +++ b/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py @@ -57,7 +57,8 @@ def get_configs_io_bound(): triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=5, num_warps=2), - ] + get_configs_io_bound(), + *get_configs_io_bound(), + ], key=['M', 'N', 'K'], prune_configs_by={ 'early_config_prune': early_config_prune, diff --git a/bitsandbytes/triton/quantize_columnwise_and_transpose.py b/bitsandbytes/triton/quantize_columnwise_and_transpose.py index 54220d95a..fcadaba3e 100644 --- a/bitsandbytes/triton/quantize_columnwise_and_transpose.py +++ b/bitsandbytes/triton/quantize_columnwise_and_transpose.py @@ -1,6 +1,7 @@ import math + import torch -import time + from bitsandbytes.triton.triton_utils import is_triton_available if not is_triton_available(): @@ -9,7 +10,6 @@ def quantize_columnwise_and_transpose(x: torch.Tensor): return None import triton import triton.language as tl - from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time # This kernel does fused columnwise quantization and transpose. diff --git a/bitsandbytes/triton/quantize_global.py b/bitsandbytes/triton/quantize_global.py index 845db6ecd..a73a5bbaa 100644 --- a/bitsandbytes/triton/quantize_global.py +++ b/bitsandbytes/triton/quantize_global.py @@ -1,6 +1,6 @@ -import math + import torch -import time + from bitsandbytes.triton.triton_utils import is_triton_available if not is_triton_available(): @@ -10,7 +10,6 @@ def quantize_global(x: torch.Tensor): return None import triton import triton.language as tl - from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time # global quantize @triton.autotune( diff --git a/bitsandbytes/triton/quantize_rowwise.py b/bitsandbytes/triton/quantize_rowwise.py index 26d218321..fce464b19 100644 --- a/bitsandbytes/triton/quantize_rowwise.py +++ b/bitsandbytes/triton/quantize_rowwise.py @@ -1,6 +1,6 @@ import math + import torch -import time from bitsandbytes.triton.triton_utils import is_triton_available @@ -10,7 +10,6 @@ def quantize_rowwise(x: torch.Tensor): return None import triton import triton.language as tl - from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time # rowwise quantize diff --git a/bitsandbytes/triton/triton_utils.py b/bitsandbytes/triton/triton_utils.py index c74c23962..6bbdbf1c1 100644 --- a/bitsandbytes/triton/triton_utils.py +++ b/bitsandbytes/triton/triton_utils.py @@ -1,4 +1,5 @@ import importlib + def is_triton_available(): return importlib.util.find_spec("triton") is not None diff --git a/bitsandbytes/utils.py b/bitsandbytes/utils.py index 48373a1fe..0582f7fc0 100644 --- a/bitsandbytes/utils.py +++ b/bitsandbytes/utils.py @@ -1,9 +1,11 @@ import json import shlex import subprocess -import torch from typing import Tuple +import torch + + def outlier_hook(module, input): assert 
isinstance(module, torch.nn.Linear) tracer = OutlierTracer.get_instance() @@ -37,7 +39,7 @@ def outlier_hook(module, input): hook.remove() -class OutlierTracer(object): +class OutlierTracer: _instance = None def __init__(self): @@ -122,7 +124,13 @@ def execute_and_return_decoded_std_streams(command_string): -def replace_linear(model, linear_replacement, skip_modules=["lm_head"], copy_weights=False, post_processing_function=None): +def replace_linear( + model, + linear_replacement, + skip_modules=("lm_head",), + copy_weights=False, + post_processing_function=None, +): """ Replace linear modules with a new Linear module. Parameters: diff --git a/check_bnb_install.py b/check_bnb_install.py index 77cd03ec4..5a7f74f89 100644 --- a/check_bnb_install.py +++ b/check_bnb_install.py @@ -1,6 +1,7 @@ -import bitsandbytes as bnb import torch +import bitsandbytes as bnb + p = torch.nn.Parameter(torch.rand(10,10).cuda()) a = torch.rand(10,10).cuda() diff --git a/install_cuda.py b/install_cuda.py index e90f6b6fb..77e258609 100644 --- a/install_cuda.py +++ b/install_cuda.py @@ -1,6 +1,6 @@ import os -import sys import subprocess +import sys from urllib.request import urlretrieve cuda_versions = { diff --git a/pyproject.toml b/pyproject.toml index c73f579e0..53942bc41 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,9 +11,7 @@ src = [ "tests", "benchmarking" ] -fix = true select = [ - "A", # prevent using keywords that clobber python builtins "B", # bugbear: security warnings "E", # pycodestyle "F", # pyflakes @@ -24,12 +22,29 @@ select = [ ] target-version = "py38" ignore = [ - "E712", # Allow using if x == False, as it's not always equivalent to if x. + "B007", # Loop control variable not used within the loop body (TODO: enable) + "B028", # Warning without stacklevel (TODO: enable) "E501", # Supress line-too-long warnings: trust yapf's judgement on this one. - "F401", + "E701", # Multiple statements on one line (TODO: enable) + "E712", # Allow using if x == False, as it's not always equivalent to if x. + "E731", # Do not use lambda + "F841", # Local assigned but not used (TODO: enable, these are likely bugs) + "RUF012", # Mutable class attribute annotations ] ignore-init-module-imports = true # allow to expose in __init__.py via imports +[tool.ruff.extend-per-file-ignores] +"**/__init__.py" = ["F401"] # allow unused imports in __init__.py +"{benchmarking,tests}/**/*.py" = [ + "B007", + "B011", + "B023", + "E701", + "E731", + "F841", + "UP030", +] + [tool.ruff.isort] combine-as-imports = true detect-same-package = true diff --git a/scripts/stale.py b/scripts/stale.py index b7f34c1fb..c299643ae 100644 --- a/scripts/stale.py +++ b/scripts/stale.py @@ -15,13 +15,11 @@ Script to close stale issue. Taken in part from the AllenNLP repository. https://github.com/allenai/allennlp. 
""" +from datetime import datetime as dt, timezone import os -from datetime import datetime as dt -from datetime import timezone from github import Github - # All labels that we don't want to touch LABELS_TO_EXEMPT = [ "feature-request", diff --git a/setup.py b/setup.py index 7a82b7717..407116fbe 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,6 @@ from setuptools import find_packages, setup - libs = list(glob.glob("./bitsandbytes/libbitsandbytes*.so")) libs += list(glob.glob("./bitsandbytes/libbitsandbytes*.dll")) libs = [os.path.basename(p) for p in libs] @@ -19,7 +18,7 @@ def read(fname): setup( - name=f"bitsandbytes", + name="bitsandbytes", version="0.42.0", author="Tim Dettmers", author_email="dettmers@cs.washington.edu", diff --git a/tests/test_autograd.py b/tests/test_autograd.py index 27b010105..ed482b356 100644 --- a/tests/test_autograd.py +++ b/tests/test_autograd.py @@ -1,4 +1,4 @@ -from itertools import permutations, product +from itertools import product import pytest import torch diff --git a/tests/test_cuda_setup_evaluator.py b/tests/test_cuda_setup_evaluator.py index 596d0a030..5e1a548e5 100644 --- a/tests/test_cuda_setup_evaluator.py +++ b/tests/test_cuda_setup_evaluator.py @@ -1,8 +1,9 @@ import os -import pytest -import torch from pathlib import Path +import torch + + # hardcoded test. Not good, but a sanity check for now # TODO: improve this def test_manual_override(requires_cuda): diff --git a/tests/test_functional.py b/tests/test_functional.py index 970b4dbdb..5b7f83bc3 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -1,16 +1,16 @@ +from itertools import product import math import random import time -from itertools import product import einops +import numpy as np import pytest +from scipy.stats import norm import torch -import numpy as np import bitsandbytes as bnb from bitsandbytes import functional as F -from scipy.stats import norm torch.set_printoptions( precision=5, sci_mode=False, linewidth=120, edgeitems=20, threshold=10000 diff --git a/tests/test_generation.py b/tests/test_generation.py index ecafdddf8..753623b27 100644 --- a/tests/test_generation.py +++ b/tests/test_generation.py @@ -1,22 +1,15 @@ -import pytest -import torch -import math - from itertools import product +import math +import pytest +import torch import transformers from transformers import ( - AutoConfig, AutoModelForCausalLM, - AutoTokenizer, BitsAndBytesConfig, - GenerationConfig, - set_seed, - ) - def get_4bit_config(): return BitsAndBytesConfig( load_in_4bit=True, diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py index 478255eee..d396a910b 100644 --- a/tests/test_linear4bit.py +++ b/tests/test_linear4bit.py @@ -1,6 +1,5 @@ -import os -from contextlib import nullcontext from itertools import product +import os from tempfile import TemporaryDirectory import pytest diff --git a/tests/test_linear8bitlt.py b/tests/test_linear8bitlt.py index 8904aaf1b..d4967969c 100644 --- a/tests/test_linear8bitlt.py +++ b/tests/test_linear8bitlt.py @@ -1,6 +1,6 @@ -import os from contextlib import nullcontext from itertools import product +import os from tempfile import TemporaryDirectory import pytest @@ -11,7 +11,6 @@ from bitsandbytes.autograd import get_inverse_transform_indices, undo_layout from bitsandbytes.nn.modules import Linear8bitLt - # contributed by Alex Borzunov, see: # https://github.com/bigscience-workshop/petals/blob/main/tests/test_linear8bitlt.py diff --git a/tests/test_modules.py b/tests/test_modules.py index cabd7cf54..c98f7a6d4 100644 --- 
a/tests/test_modules.py +++ b/tests/test_modules.py @@ -1,5 +1,6 @@ -from itertools import product +import math +import einops import pytest import torch from torch import nn diff --git a/tests/test_optim.py b/tests/test_optim.py index 49d4f442a..993ac8b60 100644 --- a/tests/test_optim.py +++ b/tests/test_optim.py @@ -1,14 +1,12 @@ -import ctypes +from itertools import product import os +from os.path import join import shutil import time import uuid -from itertools import product -from os.path import join -import pytest from lion_pytorch import Lion - +import pytest import torch import bitsandbytes as bnb @@ -27,7 +25,7 @@ def assert_most_approx_close(a, b, rtol=1e-3, atol=1e-3, max_error_count=0): def get_temp_dir(): - path = f"/tmp/autoswap/{str(uuid.uuid4())}" + path = f"/tmp/autoswap/{uuid.uuid4()}" os.makedirs(path, exist_ok=True) return path diff --git a/tests/test_triton.py b/tests/test_triton.py index e18c7a930..d0397ee4a 100644 --- a/tests/test_triton.py +++ b/tests/test_triton.py @@ -1,9 +1,10 @@ import pytest import torch -from bitsandbytes.triton.triton_utils import is_triton_available -from bitsandbytes.nn.triton_based_modules import SwitchBackLinear from bitsandbytes.nn import Linear8bitLt +from bitsandbytes.nn.triton_based_modules import SwitchBackLinear +from bitsandbytes.triton.triton_utils import is_triton_available + @pytest.mark.skipif(not is_triton_available() or not torch.cuda.is_available() or not torch.cuda.get_device_capability()[0] >= 8, reason="This test requires triton and a GPU with compute capability 8.0 or higher.") From 29a637bce75341dae1b6a171eabb2fe80d75ef57 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Tue, 30 Jan 2024 23:05:59 +0200 Subject: [PATCH 009/112] Don't crash Python interpreter via assert(false) (#998) --- bitsandbytes/functional.py | 5 ++++- csrc/ops.cu | 13 ++++--------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index 1f624a7a8..11db74859 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -1944,7 +1944,10 @@ def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32): ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc ) - if has_error == 1: + if has_error == 100: # `ERR_NOT_IMPLEMENTED` is defined as 100 in `ops.cu` + raise NotImplementedError("igemmlt not available (probably built with NO_CUBLASLT)") + + if has_error: print(f'A: {shapeA}, B: {shapeB}, C: {Sout[0]}; (lda, ldb, ldc): {(lda, ldb, ldc)}; (m, n, k): {(m, n, k)}') raise Exception('cublasLt ran into an error!') diff --git a/csrc/ops.cu b/csrc/ops.cu index 97761216c..796211fed 100644 --- a/csrc/ops.cu +++ b/csrc/ops.cu @@ -11,6 +11,8 @@ #include #include +#define ERR_NOT_IMPLEMENTED 100 + using namespace BinSearch; using std::cout; @@ -421,14 +423,7 @@ template void transform(cublasLtHandle_t ltHandl template int igemmlt(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc) { #ifdef NO_CUBLASLT - cout << "" << endl; - cout << "=============================================" << endl; - cout << "ERROR: Your GPU does not support Int8 Matmul!" 
<< endl; - cout << "=============================================" << endl; - cout << "" << endl; - assert(false); - - return 0; + return ERR_NOT_IMPLEMENTED; #else int has_error = 0; cublasLtMatmulDesc_t matmulDesc = NULL; @@ -484,7 +479,7 @@ template int igemmlt(cublasLtHandle printf("error detected"); return has_error; -#endif +#endif // NO_CUBLASLT } int fill_up_to_nearest_multiple(int value, int multiple) From b90db7eaee2d0561cdc8b11dd71b5359ed7121f3 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Wed, 31 Jan 2024 12:15:59 +0100 Subject: [PATCH 010/112] Update build_documentation.yml (#999) --- .github/workflows/build_documentation.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml index 2921d70df..f5dc1153d 100644 --- a/.github/workflows/build_documentation.yml +++ b/.github/workflows/build_documentation.yml @@ -15,4 +15,4 @@ jobs: package: bitsandbytes repo_owner: TimDettmers secrets: - token: ${{ secrets.HUGGINGFACE_PUSH }} \ No newline at end of file + hf_token: ${{ secrets.HUGGINGFACE_PUSH }} From fd319d517a2d5b7008ef304b9672150811ffbf99 Mon Sep 17 00:00:00 2001 From: James Wyatt Date: Mon, 25 Sep 2023 01:24:41 +1000 Subject: [PATCH 011/112] minimal fix to support Windows based on @Jamezo97 and @acpopescu work manually cherry-picked from PR #788 and PR #229 and cleanup by wkpark Signed-off-by: Won-Kyu Park --- csrc/cpu_ops.cpp | 19 ++++++++++++++++++- csrc/kernels.cu | 12 ++++++------ csrc/ops.cuh | 1 - 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp index e28e7b2c2..4741aa6aa 100644 --- a/csrc/cpu_ops.cpp +++ b/csrc/cpu_ops.cpp @@ -1,5 +1,9 @@ #include +#ifdef _WIN32 +#include +#else #include +#endif #include using namespace BinSearch; @@ -31,7 +35,11 @@ void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, long for(long long offset = 0; offset < num_blocks; offset+=thread_wave_size) { long long valid_chunks = num_blocks - offset >= thread_wave_size ? 
thread_wave_size : num_blocks - offset; +#ifdef _WIN32 + std::thread *threads = (std::thread *) malloc(sizeof(std::thread) * valid_chunks); +#else pthread_t *threads = (pthread_t *) malloc(sizeof(pthread_t) * valid_chunks); +#endif struct quantize_block_args **args = (quantize_block_args **) malloc(valid_chunks * sizeof(quantize_block_args *)); @@ -55,14 +63,23 @@ void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, long arg->threadidx = block_idx / blocksize; arg->blocksize = blocksize; +#ifdef _WIN32 + new (&threads[chunks_processed]) std::thread(quantize_block, arg); +#else pthread_create(&threads[chunks_processed], NULL, &quantize_block, (void *) arg); +#endif chunks_processed += 1; if(chunks_processed == valid_chunks){ break; } } for (int i = 0; i < valid_chunks; i++) + { +#ifdef _WIN32 + threads[i].join(); +#else int err = pthread_join(threads[i], NULL); - +#endif + } free(threads); for (int i = 0; i < valid_chunks; i++) free(args[i]); diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 9ebe0a69e..0fff83665 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -3821,12 +3821,12 @@ template __global__ void kgemm_4bit_inference_naive(int M, int N template __global__ void kExtractOutliers(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA); template __global__ void kExtractOutliers(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA); -template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB); -template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB); -template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB); -template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB); -template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB); -template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB); +template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB); +template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB); +template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float * __restrict__ 
const dequant_stats, int nnz, int rowsA, int rowsB, int colsB); +template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB); +template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB); +template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB); template __global__ void kTransformRowToFormat<256, 8, 32, 32*8, 0, COL32>(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols); template __global__ void kTransformRowToFormat<256, 8, 32, 32*8, 1, COL32>(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols); diff --git a/csrc/ops.cuh b/csrc/ops.cuh index f37b3b3af..da9df6af0 100644 --- a/csrc/ops.cuh +++ b/csrc/ops.cuh @@ -9,7 +9,6 @@ #include #include -#include #include #include From 1a0dc5c3bbdd78ae7c6a67283596ccadb7951c70 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Thu, 1 Feb 2024 04:06:47 +0200 Subject: [PATCH 012/112] test_nvidia_transform: fix variable reference (#1000) `out_order` is the global parametrization list, not the test fixture argument --- tests/test_functional.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_functional.py b/tests/test_functional.py index 5b7f83bc3..340278912 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -613,9 +613,9 @@ def test_vector_quant(dim1, dim2, dim3): @pytest.mark.parametrize("dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose",values,ids=names) def test_nvidia_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose): - if dims == 3 and out_order != "col32": + if dims == 3 and orderOut != "col32": return - if dtype == torch.int32 and out_order != "col32": + if dtype == torch.int32 and orderOut != "col32": return try: func = F.get_transform_func(dtype, orderA, orderOut, transpose) From 2336a45cedde1a7b9909c586aa8793b4eb8d00c4 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Thu, 1 Feb 2024 04:23:28 +0200 Subject: [PATCH 013/112] Test improvements (#1001) * test_nvidia_transform: fix variable reference `out_order` is the global parametrization list, not the test fixture argument * Make `parametrize` use more idiomatic * Use a more deterministic helper for `dim*` determination * Convert NO_CUBLASLT errors into skips too * Mark slow and benchmark tests as such (allows `-k "not benchmark"`) --- pytest.ini | 5 +- tests/conftest.py | 4 + tests/helpers.py | 51 ++++ tests/test_autograd.py | 212 ++++----------- tests/test_functional.py | 521 +++++++++++-------------------------- tests/test_generation.py | 20 +- tests/test_linear4bit.py | 10 +- tests/test_linear8bitlt.py | 8 +- tests/test_modules.py | 35 ++- tests/test_optim.py | 99 +++---- tests/test_triton.py | 3 +- 11 files changed, 344 insertions(+), 624 deletions(-) create mode 100644 tests/helpers.py diff --git a/pytest.ini b/pytest.ini index 9902b98fa..ac6d72e63 100644 --- a/pytest.ini +++ b/pytest.ini @@ -7,4 +7,7 @@ addopts = -rP log_cli = True 
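To make the variable-reference fix in test_nvidia_transform above concrete, here is a minimal sketch of the shadowing pattern it corrects (abridged from the surrounding diff; the test body is elided): the module-level list used to build the parametrization is still visible inside the test function, so referring to it by mistake silently compares the whole list against a string instead of using the per-test argument.

out_order = ["col", "row", "col32"]          # module-level list fed to the parametrization

def test_transform(dims, orderOut):          # each generated test receives one entry as `orderOut`
    if dims == 3 and out_order != "col32":   # bug: a list is never equal to a string,
        return                               # so this guard fires for every dims == 3 case
    ...                                      # fix: compare `orderOut` instead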
log_cli_level = INFO -log_file = logs/pytest.log \ No newline at end of file +log_file = logs/pytest.log +markers = + benchmark: mark test as benchmark + slow: mark test as slow diff --git a/tests/conftest.py b/tests/conftest.py index 0b4b91225..7aee8c922 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,6 +5,10 @@ def pytest_runtest_call(item): try: item.runtest() + except NotImplementedError as nie: + if "NO_CUBLASLT" in str(nie): + pytest.skip("CUBLASLT not available") + raise except AssertionError as ae: if str(ae) == "Torch not compiled with CUDA enabled": pytest.skip("Torch not compiled with CUDA enabled") diff --git a/tests/helpers.py b/tests/helpers.py new file mode 100644 index 000000000..46c6ef93d --- /dev/null +++ b/tests/helpers.py @@ -0,0 +1,51 @@ +from itertools import product +import random +from typing import Any + +import torch + +test_dims_rng = random.Random(42) + + +def get_test_dims(min: int, max: int, *, n: int) -> list[int]: + return [test_dims_rng.randint(min, max) for _ in range(n)] + + +def format_with_label(label: str, value: Any) -> str: + if isinstance(value, bool): + formatted = "T" if value else "F" + elif isinstance(value, (list, tuple)) and all(isinstance(v, bool) for v in value): + formatted = "".join("T" if b else "F" for b in value) + else: + formatted = str(value) + return f"{label}={formatted}" + + +def id_formatter(label: str): + """ + Return a function that formats the value given to it with the given label. + """ + return lambda value: format_with_label(label, value) + + +DTYPE_NAMES = { + torch.bfloat16: "bf16", + torch.bool: "bool", + torch.float16: "fp16", + torch.float32: "fp32", + torch.float64: "fp64", + torch.int32: "int32", + torch.int64: "int64", + torch.int8: "int8", +} + + +def describe_dtype(dtype: torch.dtype) -> str: + return DTYPE_NAMES.get(dtype) or str(dtype).rpartition(".")[2] + + +TRUE_FALSE = (True, False) +BOOLEAN_TRIPLES = list( + product(TRUE_FALSE, repeat=3) +) # all combinations of (bool, bool, bool) +BOOLEAN_TUPLES = list(product(TRUE_FALSE, repeat=2)) # all combinations of (bool, bool) diff --git a/tests/test_autograd.py b/tests/test_autograd.py index ed482b356..7e70a30ca 100644 --- a/tests/test_autograd.py +++ b/tests/test_autograd.py @@ -1,50 +1,35 @@ -from itertools import product +from typing import Tuple import pytest import torch import bitsandbytes as bnb - -n = 1 -k = 25 -dim1 = torch.randint(16, 64, size=(n,)).tolist() -dim2 = torch.randint(32, 96, size=(n,)).tolist() -dim3 = torch.randint(32, 96, size=(n,)).tolist() -dim4 = torch.randint(32, 96, size=(n,)).tolist() -funcs = [(torch.bmm, bnb.bmm_cublas), (torch.matmul, bnb.matmul_cublas)] -str_funcs = ["bmm", "matmul"] -req_grad = [(False, False), (True, False), (True, True), (False, True)] -req_grad_str = ["FF", "TF", "TT", "FT"] -transpose = [(False, False), (False, True), (True, True), (True, False)] -str_transpose = ["FF", "FT", "TT", "TF"] -dtype = [torch.float32, torch.float16] -values = list( - product(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose) -) -str_values = list( - product( - dim1, dim2, dim3, dim4, str_funcs, dtype, req_grad_str, str_transpose - ) -) -names = [ - "dim1_{}_dim2_{}_dim3_{}_dim4_{}_func_{}_dtype_{}_requires_grad_{}_transpose_{}".format( - *vals - ) - for vals in str_values -] - - -@pytest.mark.parametrize( - "dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose", - values, - ids=names, +from tests.helpers import ( + BOOLEAN_TRIPLES, + BOOLEAN_TUPLES, + TRUE_FALSE, + describe_dtype, + get_test_dims, + 
id_formatter, ) -def test_matmul(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose): + +TRANSPOSE_VALS = [(False, True), (False, False)] + + +@pytest.mark.parametrize("dim1", get_test_dims(16, 64, n=1), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", get_test_dims(32, 96, n=1), ids=id_formatter("dim2")) +@pytest.mark.parametrize("dim3", get_test_dims(32, 96, n=1), ids=id_formatter("dim3")) +@pytest.mark.parametrize("dim4", get_test_dims(32, 96, n=1), ids=id_formatter("dim4")) +@pytest.mark.parametrize("funcs", [(torch.bmm, bnb.bmm_cublas), (torch.matmul, bnb.matmul_cublas)], ids=["func=bmm", "func=matmul"]) +@pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=describe_dtype) +@pytest.mark.parametrize("req_grad", BOOLEAN_TUPLES, ids=id_formatter("req_grad")) +@pytest.mark.parametrize("transpose", BOOLEAN_TUPLES, ids=id_formatter("transpose")) +def test_matmul(dim1, dim2, dim3, dim4, funcs, dtype, req_grad: Tuple[bool, bool], transpose: Tuple[bool, bool]): if dim2 > 0: dim2 = dim2 - (dim2 % 16) dim3 = dim3 - (dim3 % 16) dim4 = dim4 - (dim4 % 16) - for i in range(k): + for i in range(25): # normal multiply if funcs[0] in [torch.mm, torch.matmul]: @@ -228,71 +213,17 @@ def test_matmul(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose): assert (idx == 0).sum().item() < n * 0.02 -n = 1 -k = 3 -dim1 = torch.randint(16, 64, size=(n,)).tolist() -dim2 = torch.randint(32, 96, size=(n,)).tolist() -dim3 = torch.randint(32, 96, size=(n,)).tolist() -dim4 = torch.randint(32, 96, size=(n,)).tolist() - -dim2.append(0) - -decomp = [0.0, 6.0] -funcs = [(torch.matmul, bnb.matmul), (torch.matmul, bnb.research.switchback_bnb)] -str_funcs = ["matmullt", 'switchback_bnb'] -req_grad = [(False, False), (True, False), (True, True), (False, True)] -req_grad = list(product([True, False], repeat=3)) -req_grad_str = [] -for c in req_grad: - strval = '' - for v in c: - if v == True: strval += 'T' - else: strval += 'F' - req_grad_str.append(strval) - -transpose = [(False, True), (False, False)] -str_transpose = ["NT", "NN"] -dtype = [torch.float16, torch.bfloat16, torch.float32] -has_fp16_weights = [True, False] -has_bias = [True, False] -values = list( - product( - dim1, - dim2, - dim3, - dim4, - funcs, - dtype, - req_grad, - transpose, - decomp, - has_fp16_weights, - has_bias - ) -) -str_values = list( - product( - dim1, - dim2, - dim3, - dim4, - str_funcs, - dtype, - req_grad_str, - str_transpose, - decomp, - has_fp16_weights, - has_bias - ) -) -names = ["dim1_{}_dim2_{}_dim3_{}_dim4_{}_func_{}_dtype_{}_requires_grad_{}_transpose_{}_decomp_{}_has_fp16_weights_{}_has_bias_{}".format(*vals) for vals in str_values] - - -@pytest.mark.parametrize( - "dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, decomp, has_fp16_weights, has_bias", - values, - ids=names, -) +@pytest.mark.parametrize("dim1", get_test_dims(16, 64, n=1), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", [*get_test_dims(32, 96, n=1), 0], ids=id_formatter("dim2")) +@pytest.mark.parametrize("dim3", get_test_dims(32, 96, n=1), ids=id_formatter("dim3")) +@pytest.mark.parametrize("dim4", get_test_dims(32, 96, n=1), ids=id_formatter("dim4")) +@pytest.mark.parametrize("decomp", [0.0, 6.0], ids=id_formatter("decomp")) +@pytest.mark.parametrize("funcs", [(torch.matmul, bnb.matmul), (torch.matmul, bnb.research.switchback_bnb)], ids=["func=matmul", "func=switchback_bnb"]) +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) 
+@pytest.mark.parametrize("req_grad", BOOLEAN_TRIPLES, ids=id_formatter("req_grad")) +@pytest.mark.parametrize("transpose", TRANSPOSE_VALS, ids=id_formatter("transpose")) +@pytest.mark.parametrize("has_fp16_weights", TRUE_FALSE, ids=id_formatter("has_fp16_weights")) +@pytest.mark.parametrize("has_bias", TRUE_FALSE, ids=id_formatter("has_bias")) def test_matmullt( dim1, dim2, @@ -313,7 +244,7 @@ def test_matmullt( req_grad = list(req_grad) req_grad[2] = False - for i in range(k): + for i in range(3): # normal multiply if funcs[0] in [torch.mm, torch.matmul]: @@ -429,45 +360,25 @@ def test_matmullt( torch.testing.assert_close(gradBias1, gradBias2) -n = 1 -k = 3 -dim1 = torch.randint(16, 64, size=(n,)).tolist() -dim2 = torch.randint(32, 96, size=(n,)).tolist() -dim3 = torch.randint(32, 96, size=(n,)).tolist() -dim4 = torch.randint(32, 96, size=(n,)).tolist() - -dim2.append(0) - -funcs = [(torch.matmul, bnb.matmul_4bit)] -str_funcs = ["matmul"] -req_grad = list(product([True, False], repeat=3)) -req_grad_str = [] -for c in req_grad: - strval = '' - for v in c: - if v == True: strval += 'T' - else: strval += 'F' - req_grad_str.append(strval) - -transpose = [(False, True), (False, False)] -str_transpose = ["NT", "NN"] -dtype = [torch.float16, torch.float32] -compress_statistics = [False, True] -has_fp16_weights = [True, False] -has_bias = [True, False] -quant_type = ['fp4', 'nf4'] -values = list(product(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, has_bias, compress_statistics, quant_type)) -str_values = list(product(dim1, dim2, dim3, dim4, str_funcs, dtype, req_grad_str, str_transpose, has_bias, compress_statistics, quant_type)) -names = ["dim1_{}_dim2_{}_dim3_{}_dim4_{}_func_{}_dtype_{}_requires_grad_{}_transpose_{}_has_bias_{}_compress_statistics_{}_quant_type_{}".format(*vals) for vals in str_values] -@pytest.mark.parametrize( "dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, has_bias, compress_statistics, quant_type", values, ids=names) -def test_matmul_4bit( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, has_bias, compress_statistics, quant_type): +@pytest.mark.parametrize("dim1", get_test_dims(16, 64, n=1), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", [*get_test_dims(32, 96, n=1), 0], ids=id_formatter("dim2")) +@pytest.mark.parametrize("dim3", get_test_dims(32, 96, n=1), ids=id_formatter("dim3")) +@pytest.mark.parametrize("dim4", get_test_dims(32, 96, n=1), ids=id_formatter("dim4")) +@pytest.mark.parametrize("funcs", [(torch.matmul, bnb.matmul_4bit)], ids=["func=matmul"]) +@pytest.mark.parametrize("req_grad", BOOLEAN_TRIPLES, ids=id_formatter("req_grad")) +@pytest.mark.parametrize("transpose", TRANSPOSE_VALS, ids=id_formatter("transpose")) +@pytest.mark.parametrize("has_bias", TRUE_FALSE, ids=id_formatter("has_bias")) +@pytest.mark.parametrize("dtype", [torch.float16, torch.float32], ids=describe_dtype) +@pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics")) +@pytest.mark.parametrize("quant_type", ['fp4', 'nf4'], ids=id_formatter("quant_type")) +def test_matmul_4bit(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, has_bias, compress_statistics, quant_type): dimA = (dim2, dim3) if not transpose[0] else (dim3, dim2) dimB = (dim3, dim4) if not transpose[1] else (dim4, dim3) if has_bias == False: req_grad = list(req_grad) req_grad[2] = False - for i in range(k): + for i in range(3): # normal multiply if funcs[0] in [torch.mm, torch.matmul]: A = torch.randn(size=dimA, device="cuda", 
requires_grad=req_grad[0], dtype=dtype) @@ -530,32 +441,21 @@ def test_matmul_4bit( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, torch.testing.assert_close(gradBias1, gradBias2) -funcs = [(torch.matmul, bnb.research.matmul_fp8_mixed), (torch.matmul, bnb.research.matmul_fp8_global)] -str_funcs = ["matmul_fp8_mixed", 'matmul_fp8_global'] -req_grad = list(product([True, False], repeat=3)) -req_grad_str = [] -for c in req_grad: - strval = '' - for v in c: - if v == True: strval += 'T' - else: strval += 'F' - req_grad_str.append(strval) - -transpose = [(False, True), (False, False)] -str_transpose = ["NT", "NN"] -dtype = [torch.float16, torch.float32] -has_fp16_weights = [True, False] -values = list(product(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose)) -str_values = list(product(dim1, dim2, dim3, dim4, str_funcs, dtype, req_grad_str, str_transpose)) -names = ["dim1_{}_dim2_{}_dim3_{}_dim4_{}_func_{}_dtype_{}_requires_grad_{}_transpose_{}".format(*vals) for vals in str_values] -@pytest.mark.parametrize( "dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose", values, ids=names) +@pytest.mark.parametrize("dim1", get_test_dims(16, 64, n=1), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", [*get_test_dims(32, 96, n=1), 0], ids=id_formatter("dim2")) +@pytest.mark.parametrize("dim3", get_test_dims(32, 96, n=1), ids=id_formatter("dim3")) +@pytest.mark.parametrize("dim4", get_test_dims(32, 96, n=1), ids=id_formatter("dim4")) +@pytest.mark.parametrize("req_grad", BOOLEAN_TRIPLES, ids=id_formatter("req_grad")) +@pytest.mark.parametrize("transpose", TRANSPOSE_VALS, ids=id_formatter("transpose")) +@pytest.mark.parametrize("dtype", [torch.float16, torch.float32], ids=describe_dtype) +@pytest.mark.parametrize("funcs", [(torch.matmul, bnb.research.matmul_fp8_mixed), (torch.matmul, bnb.research.matmul_fp8_global)], ids=["matmul_fp8_mixed", 'matmul_fp8_global']) def test_matmul_fp8( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose): dimA = (dim2, dim3) if not transpose[0] else (dim3, dim2) dimB = (dim3, dim4) if not transpose[1] else (dim4, dim3) req_grad = list(req_grad) req_grad[2] = False - for i in range(k): + for i in range(3): # normal multiply if funcs[0] in [torch.mm, torch.matmul]: A = torch.randn(size=dimA, device="cuda", requires_grad=req_grad[0], dtype=dtype) diff --git a/tests/test_functional.py b/tests/test_functional.py index 340278912..f4b8fca51 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -11,6 +11,13 @@ import bitsandbytes as bnb from bitsandbytes import functional as F +from tests.helpers import ( + BOOLEAN_TUPLES, + TRUE_FALSE, + describe_dtype, + get_test_dims, + id_formatter, +) torch.set_printoptions( precision=5, sci_mode=False, linewidth=120, edgeitems=20, threshold=10000 @@ -155,10 +162,10 @@ def test_dynamic_quantization(): -@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=["fp32", "fp16", "bf16"]) -@pytest.mark.parametrize("nested", [False, True], ids=["False", "True"]) +@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=describe_dtype) +@pytest.mark.parametrize("nested", TRUE_FALSE, ids=id_formatter("nested")) @pytest.mark.parametrize("blocksize", [4096, 2048, 1024, 512, 256, 128, 64]) -@pytest.mark.parametrize("signed", [True, False], ids=['signed_True', 'signed_False']) +@pytest.mark.parametrize("signed", TRUE_FALSE, ids=id_formatter("signed")) def test_dynamic_blockwise_quantization(dtype, nested, blocksize, signed): #print('') 
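For readers following the "make parametrize more idiomatic" part of this patch, a small standalone sketch of the pattern the refactor converges on (only the helpers defined in tests/helpers.py above are assumed; `test_example` and its arguments are invented): stacked @pytest.mark.parametrize decorators expand to the Cartesian product of their values, and the id helpers keep the generated test IDs readable.

import pytest
import torch

from tests.helpers import TRUE_FALSE, describe_dtype, get_test_dims, id_formatter

@pytest.mark.parametrize("dim", get_test_dims(16, 64, n=2), ids=id_formatter("dim"))
@pytest.mark.parametrize("dtype", [torch.float16, torch.float32], ids=describe_dtype)
@pytest.mark.parametrize("signed", TRUE_FALSE, ids=id_formatter("signed"))
def test_example(dim, dtype, signed):
    # 2 dims x 2 dtypes x 2 flags = 8 generated tests, with IDs built from labels
    # such as "dtype=fp16" and "signed=T" rather than the hand-assembled name
    # strings the old tests constructed with itertools.product.
    assert isinstance(dim, int) and isinstance(signed, bool)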
diffs = [] @@ -281,34 +288,22 @@ def mean(xx): return sum(xx) / float(len(xx)) -# dim1 = torch.randint(1,1024*4, size=(4,)).tolist() -# dim2 = torch.randint(1,1024*4, size=(4,)).tolist() -dim1 = [1024 * 2] -dim2 = [1024 * 16] -methods = [ - ( +methods = { + "linear": ( lambda x, dim: quant(x), lambda x, dim: quant(x), dequant, dequant, mm_dequant, - ) -] -methods.append((quant_multi, quant_multi, dequant, dequant, mm_dequant)) -# methods.append((lambda x: quant_multi_chunk(x, dim=-1), lambda x: quant_multi_chunk(x, dim=0), dequant, dequant, mm_dequant)) -method_names = ["linear", "vectorwise"] -batched = [False, True] -values = list(product(dim1, dim2, methods, batched)) -values_names = list(product(dim1, dim2, method_names, batched)) -names = [ - "dim1_{}_dim2_{}_quant_{}_batched_{}".format(*vals) - for vals in values_names -] + ), + "vectorwise": (quant_multi, quant_multi, dequant, dequant, mm_dequant), +} -@pytest.mark.parametrize( - "dim1, dim2, quant_methods, batched", values, ids=names -) +@pytest.mark.parametrize("dim1", [1024 * 2], ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", [1024 * 16], ids=id_formatter("dim2")) +@pytest.mark.parametrize("quant_methods", methods.values(), ids=methods.keys()) +@pytest.mark.parametrize("batched", TRUE_FALSE, ids=id_formatter("batched")) def test_approx_igemm(dim1, dim2, quant_methods, batched): dim1 = dim1 - (dim1 % 32) dim2 = dim2 - (dim2 % 32) @@ -352,21 +347,10 @@ def test_stable_embedding(): layer.reset_parameters() -n = 2 -hidden_dim = torch.randint(32, 256, size=(n,)).tolist() -batch_dim = torch.randint(16, 256, size=(n,)).tolist() -seq_dim = torch.randint(16, 256, size=(n,)).tolist() -transpose = [(False, False), (False, True), (True, False), (True, True)] -values = list(product(hidden_dim, batch_dim, transpose, seq_dim)) -names = [ - "hidden_dim_{}_batch_dim_{},transpose_{}_seq_dim_{}".format(*vals) - for vals in values -] - - -@pytest.mark.parametrize( - "hidden_dim, batch_dim, transpose, seq_dim", values, ids=names -) +@pytest.mark.parametrize("hidden_dim", get_test_dims(32, 256, n=2), ids=id_formatter("hidden_dim")) +@pytest.mark.parametrize("batch_dim", get_test_dims(16, 256, n=2), ids=id_formatter("batch_dim")) +@pytest.mark.parametrize("seq_dim", get_test_dims(16, 256, n=2), ids=id_formatter("seq_dim")) +@pytest.mark.parametrize("transpose", BOOLEAN_TUPLES, ids=id_formatter("transpose")) def test_igemm(hidden_dim, batch_dim, transpose, seq_dim): hidden_dim = hidden_dim - (hidden_dim % 32) batch_dim = batch_dim - (batch_dim % 16) @@ -418,17 +402,9 @@ def test_igemm(hidden_dim, batch_dim, transpose, seq_dim): torch.testing.assert_close(out.float(), out2) -n = 3 -seq_dim = torch.randint(32, 512, size=(n,)).tolist() -hidden_dim = torch.randint(32, 1024 * 4, size=(n,)).tolist() -batch_dim = torch.randint(2, 16, size=(n,)).tolist() -values = list(product(seq_dim, hidden_dim, batch_dim)) -names = [ - "seq_dim{}_hidden_dim{}_batch_dim{}".format(*vals) for vals in values -] - - -@pytest.mark.parametrize("seq_dim, hidden_dim, batch_dim", values, ids=names) +@pytest.mark.parametrize("seq_dim", get_test_dims(32, 512, n=3), ids=id_formatter("seq_dim")) +@pytest.mark.parametrize("hidden_dim", get_test_dims(32, 1024 * 4, n=3), ids=id_formatter("hidden_dim")) +@pytest.mark.parametrize("batch_dim", get_test_dims(2, 16, n=3), ids=id_formatter("batch_dim")) def test_dim3_igemm(seq_dim, hidden_dim, batch_dim): seq_dim = seq_dim - (seq_dim % 32) hidden_dim = hidden_dim - (hidden_dim % 32) @@ -449,21 +425,10 @@ def test_dim3_igemm(seq_dim, 
hidden_dim, batch_dim): torch.testing.assert_close(out.float(), out2) -n = 2 -seq_dim = torch.randint(32, 512, size=(n,)).tolist() -hidden_dim = torch.randint(32, 1024 * 4, size=(n,)).tolist() -batch_dim = torch.randint(2, 16, size=(n,)).tolist() -transpose = [False, True] -values = list(product(seq_dim, hidden_dim, batch_dim, transpose)) -names = [ - "seq_dim={}_hidden_dim={}_batch_dim={}_transpose{}".format(*vals) - for vals in values -] - - -@pytest.mark.parametrize( - "seq_dim, hidden_dim, batch_dim, transpose", values, ids=names -) +@pytest.mark.parametrize("seq_dim", get_test_dims(32, 512, n=2), ids=id_formatter("seq_dim")) +@pytest.mark.parametrize("hidden_dim", get_test_dims(32, 1024 * 4, n=2), ids=id_formatter("hidden_dim")) +@pytest.mark.parametrize("batch_dim", get_test_dims(2, 16, n=2), ids=id_formatter("batch_dim")) +@pytest.mark.parametrize("transpose", TRUE_FALSE, ids=id_formatter("transpose")) def test_minmax_igemm(seq_dim, hidden_dim, batch_dim, transpose): def min_max(x): maxA = torch.amax(x, dim=2, keepdim=True) @@ -533,20 +498,11 @@ def min_max(x): assert mean(relerrs) < 0.3 -n = 2 -dim1 = torch.randint(1, 64, size=(n,)).tolist() -dim2 = torch.randint(32, 128, size=(n,)).tolist() -dim3 = torch.randint(32, 256, size=(n,)).tolist() -dim4 = torch.randint(32, 256, size=(n,)).tolist() -transpose = [(False, False), (True, False), (False, True), (True, True)] -values = list(product(dim1, dim2, dim3, dim4, transpose)) -names = [ - "dim1_{}_dim2_{}_dim3_{}_dim4_{}_transpose_{}".format(*vals) - for vals in values -] - - -@pytest.mark.parametrize("dim1, dim2, dim3, dim4, transpose", values, ids=names) +@pytest.mark.parametrize("dim1", get_test_dims(1, 64, n=2), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", get_test_dims(32, 128, n=2), ids=id_formatter("dim2")) +@pytest.mark.parametrize("dim3", get_test_dims(32, 256, n=2), ids=id_formatter("dim3")) +@pytest.mark.parametrize("dim4", get_test_dims(32, 256, n=2), ids=id_formatter("dim4")) +@pytest.mark.parametrize("transpose", BOOLEAN_TUPLES, ids=id_formatter("transpose")) def test_ibmm(dim1, dim2, dim3, dim4, transpose): dim2 = dim2 - (dim2 % 16) dim3 = dim3 - (dim3 % 16) @@ -574,15 +530,9 @@ def test_ibmm(dim1, dim2, dim3, dim4, transpose): torch.testing.assert_close(out.float(), out2.float()) -n = 1 -dim1 = torch.randint(1, 64, size=(n,)).tolist() -dim2 = torch.randint(32, 128, size=(n,)).tolist() -dim3 = torch.randint(32, 256, size=(n,)).tolist() -values = list(product(dim1, dim2, dim3)) -names = ["dim1_{}_dim2_{}_dim3_{}".format(*vals) for vals in values] - - -@pytest.mark.parametrize("dim1, dim2, dim3", values, ids=names) +@pytest.mark.parametrize("dim1", get_test_dims(1, 64, n=1), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", get_test_dims(32, 128, n=1), ids=id_formatter("dim2")) +@pytest.mark.parametrize("dim3", get_test_dims(32, 256, n=1), ids=id_formatter("dim3")) def test_vector_quant(dim1, dim2, dim3): dim2 = dim2 - (dim2 % 16) dim3 = dim3 - (dim3 % 16) @@ -594,24 +544,14 @@ def test_vector_quant(dim1, dim2, dim3): assert_all_approx_close(A1, A, atol=0.01, rtol=0.1, count=int(n*0.002)) - - -n = 2 -dim1 = torch.randint(2, 256, size=(n,)).tolist() -dim2 = torch.randint(2, 256, size=(n,)).tolist() -dim3 = torch.randint(2, 256, size=(n,)).tolist() -# dim1, dim2 = (256,), (256,) -dtype = [torch.int8, torch.int32] -a_order = ["row"] -out_order = ["col", "row", "col32"] -transpose = [False] -dims = [2, 3] -values = list(product(dim1, dim2, dim3, dims, dtype, a_order, out_order, transpose)) - -names = 
["dim1_{}_dim2_{}_dim3_{}_dims_{}_dtype_{}_orderA_{}_orderOut_{}_transpose_{}".format(*vals)for vals in values] - - -@pytest.mark.parametrize("dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose",values,ids=names) +@pytest.mark.parametrize("dim1", get_test_dims(2, 256, n=2), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", get_test_dims(2, 256, n=2), ids=id_formatter("dim2")) +@pytest.mark.parametrize("dim3", get_test_dims(2, 256, n=2), ids=id_formatter("dim3")) +@pytest.mark.parametrize("dtype", [torch.int8, torch.int32], ids=describe_dtype) +@pytest.mark.parametrize("orderA", ["row"], ids=id_formatter("orderA")) +@pytest.mark.parametrize("orderOut", ["col", "row", "col32"], ids=id_formatter("orderOut")) +@pytest.mark.parametrize("transpose", [False], ids=id_formatter("transpose")) +@pytest.mark.parametrize("dims", [2, 3], ids=id_formatter("dims")) def test_nvidia_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose): if dims == 3 and orderOut != "col32": return @@ -677,28 +617,12 @@ def test_nvidia_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, trans torch.testing.assert_close(A, out2) -n = 1 -dim1 = torch.randint(1, 256, size=(n,)).tolist() -dim2 = torch.randint(32, 512, size=(n,)).tolist() -dim3 = torch.randint(32, 1024, size=(n,)).tolist() -dim4 = torch.randint(32, 1024, size=(n,)).tolist() - -# dim1 = [2] -# dim2 = [2] -# dim3 = [2] -# dim4 = [2] - -dims = (2, 3) -ldb = [0] -# ldb = list(range(256, 1*1024, 256)) -values = list(product(dim1, dim2, dim3, dim4, dims, ldb)) -names = [ - "dim1_{}_dim2_{}_dim3_{}_dim4_{}_dims_{}_ldb_{}".format(*vals) - for vals in values -] - - -@pytest.mark.parametrize("dim1, dim2, dim3, dim4, dims, ldb", values, ids=names) +@pytest.mark.parametrize("dim1", get_test_dims(1, 256, n=1), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", get_test_dims(32, 512, n=1), ids=id_formatter("dim2")) +@pytest.mark.parametrize("dim3", get_test_dims(32, 1024, n=1), ids=id_formatter("dim3")) +@pytest.mark.parametrize("dim4", get_test_dims(32, 1024, n=1), ids=id_formatter("dim4")) +@pytest.mark.parametrize("dims", (2, 3), ids=id_formatter("dims")) +@pytest.mark.parametrize("ldb", (0,), ids=id_formatter("ldb")) def test_igemmlt_int(dim1, dim2, dim3, dim4, dims, ldb): for i in range(k): if dims == 2: @@ -732,21 +656,11 @@ def test_igemmlt_int(dim1, dim2, dim3, dim4, dims, ldb): torch.testing.assert_close(C1, C3.float()) -dim1 = [32] -dim2 = [32] -dim3 = [32] -dim4 = [32] - -dims = (2,) -# ldb = list(range(256, 1*1024, 256)) -values = list(product(dim1, dim2, dim3, dim4, dims)) -names = [ - "dim1_{}_dim2_{}_dim3_{}_dim4_{}_dims_{}".format(*vals) - for vals in values -] - - -@pytest.mark.parametrize("dim1, dim2, dim3, dim4, dims", values, ids=names) +@pytest.mark.parametrize("dim1", [32], ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", [32], ids=id_formatter("dim2")) +@pytest.mark.parametrize("dim3", [32], ids=id_formatter("dim3")) +@pytest.mark.parametrize("dim4", [32], ids=id_formatter("dim4")) +@pytest.mark.parametrize("dims", (2,), ids=id_formatter("dims")) def test_igemmlt_half(dim1, dim2, dim3, dim4, dims): formatB = F.get_special_format_str() for i in range(k): @@ -786,24 +700,15 @@ def test_igemmlt_half(dim1, dim2, dim3, dim4, dims): # C3, S = F.transform(C2, 'row', state=SC) # torch.testing.assert_close(C1, C3.float()) - -batch_size = 2 -seqdim = 512 -# values = [(batch_size, seqdim, 4*1024, 16*1024),(batch_size, seqdim, 5120, 4*5120),(batch_size, seqdim, 12*1024, 4*12*1024)] -values = [ - 
(batch_size, seqdim, 4 * 1024, 3 * 4 * 1024), - (batch_size, seqdim, 5120, 3 * 5120), - (batch_size, seqdim, 12 * 1024, 4 * 12 * 1024), -] - - -# values = list(product(batch, seq, model, hidden)) -names = [ - "batch_{}_seq_{}_model_{}_hidden_{}".format(*vals) for vals in values -] - - -@pytest.mark.parametrize("batch, seq, model, hidden", values, ids=names) +@pytest.mark.parametrize( + ("batch", "seq", "model", "hidden"), + [ + pytest.param(2, 512, 4 * 1024, 3 * 4 * 1024, id="batch=2, seq=512, model=4k, hidden=12k"), + pytest.param(2, 512, 5120, 3 * 5120, id="batch=2, seq=512, model=5k, hidden=15k"), + pytest.param(2, 512, 12 * 1024, 4 * 12 * 1024, id="batch=2, seq=512, model=12k, hidden=48k"), + ], +) +@pytest.mark.benchmark def test_bench_8bit_training(batch, seq, model, hidden): formatB = F.get_special_format_str() A = torch.randn(batch, seq, model, device="cuda").half() @@ -953,24 +858,11 @@ def test_bench_8bit_training(batch, seq, model, hidden): # print(t8) -n = 2 -dim1 = torch.randint(64, 256, size=(n,)).tolist() -dim4 = torch.randint(64, 1024, size=(n,)).tolist() - -#dim1 = [2*1024] -#dim4 = [2*1024] - -#dim1 = [4] -#dim4 = [4] - -dims = (2,) -formatB = ["col_turing", "col_ampere"] -has_bias = [True, False] -values = list(product(dim1, dim4, dims, formatB, has_bias)) -names = ["dim1_{}_dim4_{}_dims_{}_formatB_{}_has_bias_{}".format(*vals) for vals in values] - - -@pytest.mark.parametrize("dim1, dim4, dims, formatB, has_bias", values, ids=names) +@pytest.mark.parametrize("dim1", get_test_dims(64, 256, n=2), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim4", get_test_dims(64, 1024, n=2), ids=id_formatter("dim4")) +@pytest.mark.parametrize("dims", (2,), ids=id_formatter("dims")) +@pytest.mark.parametrize("formatB", ["col_turing", "col_ampere"], ids=id_formatter("formatB")) +@pytest.mark.parametrize("has_bias", TRUE_FALSE, ids=id_formatter("has_bias")) def test_dequant_mm(dim1, dim4, dims, formatB, has_bias): inner = torch.randint(1, 128, size=(1,)).item() bias = None @@ -994,33 +886,23 @@ def test_dequant_mm(dim1, dim4, dims, formatB, has_bias): if has_bias: C4 += bias # TODO: is something wrong here? 
If so, the problem goes deeper - #n = C1.numel() - #p = 0.06 + # n = C1.numel() + # p = 0.06 std = C1.std(0).view(1, -1) C1 /= std C4 /= std - #assert_all_approx_close(C1, C4, atol=0.02, rtol=0.1, count=int(n*0.06)) - #assert (count / n < p), f"error in more than {p} of elements: {count}/{n}={count/n}" + # assert_all_approx_close(C1, C4, atol=0.02, rtol=0.1, count=int(n*0.06)) + # assert (count / n < p), f"error in more than {p} of elements: {count}/{n}={count/n}" C5 = F.mm_dequant(C2, SC, maxA.flatten(), maxB.flatten(), bias=bias) - #torch.testing.assert_close(C5, C4, atol=0.015, rtol=0.1) + # torch.testing.assert_close(C5, C4, atol=0.015, rtol=0.1) n = C5.numel() - assert_all_approx_close(C1, C4, atol=0.015, rtol=0.1, count=int(0.01*n)) - - -n = 2 -dim1 = [1 * 1024] -dim2 = [1 * 1024] -# dim1 = torch.randint(1,4*1024, size=(n,)).tolist() -# dim2 = torch.randint(1,4*1024, size=(n,)).tolist() + assert_all_approx_close(C1, C4, atol=0.015, rtol=0.1, count=int(0.01 * n)) -dims = (2,) -# ldb = list(range(256, 1*1024, 256)) -values = list(product(dim1, dim2, dims)) -names = ["dim1_{}_dim2_{}_dims_{}".format(*vals) for vals in values] - -@pytest.mark.parametrize("dim1, dim2, dims", values, ids=names) +@pytest.mark.parametrize("dim1", [1 * 1024], ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", [1 * 1024], ids=id_formatter("dim2")) +@pytest.mark.parametrize("dims", (2,), ids=id_formatter("dims")) def test_colrow_absmax(dim1, dim2, dims): for i in range(k): threshold = 3.0 @@ -1066,17 +948,8 @@ def test_colrow_absmax(dim1, dim2, dims): assert nnz_block_ptr2 is None -n = 2 -# dim1 = [8*1024] -# dim2 = [4*1024] -dim1 = torch.randint(1, 4 * 1024, size=(n,)).tolist() -dim2 = torch.randint(1, 4 * 1024, size=(n,)).tolist() - -values = list(product(dim1, dim2)) -names = ["dim1_{}_dim2_{}".format(*vals) for vals in values] - - -@pytest.mark.parametrize("dim1, dim2", values, ids=names) +@pytest.mark.parametrize("dim1", get_test_dims(1, 4 * 1024, n=2), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", get_test_dims(1, 4 * 1024, n=2), ids=id_formatter("dim2")) def test_double_quant(dim1, dim2): for i in range(k): A = torch.randn(dim1, dim2, device="cuda").half() @@ -1114,16 +987,18 @@ def test_double_quant(dim1, dim2): torch.testing.assert_close(Scol.flatten().float(), statsAt) -n = 4 -dim1 = torch.randint(1, 4 * 1024, size=(n,)).tolist() -dim4 = torch.randint(1, 4 * 1024, size=(n,)).tolist() -inner = torch.randint(1, 4 * 1024, size=(n,)).tolist() - -values = list(zip(dim1, dim4, inner)) -names = ["dim1_{}_dim4_{}_inner_{}".format(*vals) for vals in values] - - -@pytest.mark.parametrize("dim1, dim4, inner", values, ids=names) +@pytest.mark.parametrize( + ("dim1", "dim4", "inner"), + ( + pytest.param(dim1, dim4, inner, id=f"{dim1=},{dim4=},{inner=}") + for (dim1, dim4, inner) + in zip( + get_test_dims(1, 4 * 1024, n=4), + get_test_dims(1, 4 * 1024, n=4), + get_test_dims(1, 4 * 1024, n=4), + ) + ) +) def test_integrated_igemmlt(dim1, dim4, inner): for i in range(k): A = torch.randn(dim1, inner, device="cuda").half() @@ -1158,16 +1033,18 @@ def test_integrated_igemmlt(dim1, dim4, inner): assert err2 <= err1 * 1.025 -n = 6 -dim1 = torch.randint(1, 4 * 1024, size=(n,)).tolist() -dim4 = torch.randint(1, 4 * 1024, size=(n,)).tolist() -inner = torch.randint(1, 4 * 1024, size=(n,)).tolist() - -values = list(zip(dim1, dim4, inner)) -names = ["dim1_{}_dim4_{}_inner_{}".format(*vals) for vals in values] - - -@pytest.mark.parametrize("dim1, dim4, inner", values, ids=names) +@pytest.mark.parametrize( 
+ ("dim1", "dim4", "inner"), + ( + pytest.param(dim1, dim4, inner, id=f"{dim1=},{dim4=},{inner=}") + for (dim1, dim4, inner) + in zip( + get_test_dims(1, 4 * 1024, n=6), + get_test_dims(1, 4 * 1024, n=6), + get_test_dims(1, 4 * 1024, n=6), + ) + ) +) @pytest.mark.skip("Row scale has some bugs for ampere") def test_igemmlt_row_scale(dim1, dim4, inner): formatB = F.get_special_format_str() @@ -1234,17 +1111,17 @@ def test_igemmlt_row_scale(dim1, dim4, inner): print(sum(err3) / len(err3)) -dim1 = [1024, 2048] -inner = [12288 * 4, 4096 * 4] -dim4 = [12288, 4096] - -values = list(zip(dim1, dim4, inner)) -names = ["dim1_{}_dim4_{}_inner_{}".format(*vals) for vals in values] - - -@pytest.mark.parametrize("dim1, dim4, inner", values, ids=names) +@pytest.mark.parametrize( + ("dim1", "dim4", "inner"), + [ + pytest.param(1024, 12288 * 4, 12288, id="1024, 12288*4, 12288"), + pytest.param(2048, 4096 * 4, 4096, id="2048, 4096*4, 4096"), + ], +) @pytest.mark.skip("Row scale has some bugs for ampere") +@pytest.mark.benchmark def test_row_scale_bench(dim1, dim4, inner): + formatB = F.get_special_format_str() err1, err2, err3 = [], [], [] relerr1, relerr2 = [], [] scale = 1 @@ -1289,34 +1166,14 @@ def test_row_scale_bench(dim1, dim4, inner): print("vector-wise", time.time() - t0) -n = 2 -dim1 = torch.randint(2, 1024, size=(n,)).tolist() -dim2 = torch.randint(2, 1024, size=(n,)).tolist() -# dim1 = [8*1024] -# dim2 = [4*1024] - -dim3 = [0] -dtype = [torch.int8] -a_order = ["row"] -out_order = ["col32", "col_turing", "col_ampere"] -transpose = [False, True] -dims = [2] -values = list( - product(dim1, dim2, dim3, dims, dtype, a_order, out_order, transpose) -) -names = [ - "dim1_{}_dim2_{}_dim3_{}_dims_{}_dtype_{}_orderA_{}_orderOut_{}_{}".format( - *vals - ) - for vals in values -] - - -@pytest.mark.parametrize( - "dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose", - values, - ids=names, -) +@pytest.mark.parametrize("dim1", get_test_dims(2, 1024, n=2), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", get_test_dims(2, 1024, n=2), ids=id_formatter("dim2")) +@pytest.mark.parametrize("dim3", [0], ids=id_formatter("dim3")) +@pytest.mark.parametrize("dims", [2], ids=id_formatter("dims")) +@pytest.mark.parametrize("dtype", [torch.int8], ids=describe_dtype) +@pytest.mark.parametrize("orderA", ["row"], ids=id_formatter("orderA")) +@pytest.mark.parametrize("orderOut", ["col32", "col_turing", "col_ampere"], ids=id_formatter("orderOut")) +@pytest.mark.parametrize("transpose", TRUE_FALSE, ids=id_formatter("transpose")) def test_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose): for i in range(k): if dims == 2: @@ -1344,23 +1201,6 @@ def test_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose): torch.testing.assert_close(out1, out2) -n = 2 -# dim1 = torch.randint(2,1024, size=(n,)).tolist() -# dim2 = torch.randint(2,1024, size=(n,)).tolist() -dim1 = [1] -dim2 = [33] - -dtype = [torch.int8] -# a_order = ['col_turing', 'col_ampere'] -a_order = ["col_turing"] -out_order = ["row"] -values = list(product(dim1, dim2, dtype, a_order, out_order)) -names = [ - "dim1_{}_dim2_{}_dtype_{}_orderA_{}_orderOut_{}".format(*vals) - for vals in values -] - - def test_overflow(): formatB = F.get_special_format_str() print(formatB) @@ -1375,17 +1215,8 @@ def test_overflow(): c2 = torch.matmul(a.float(), b.float().t()) -n = 2 -dim1 = torch.randint(1, 4 * 1024, size=(n,)).tolist() -dim2 = torch.randint(1, 4 * 1024, size=(n,)).tolist() -# dim1 = [4] -# dim2 = [5] - -values = 
list(product(dim1, dim2)) -names = ["dim1_{}_dim2_{}".format(*vals) for vals in values] - - -@pytest.mark.parametrize("dim1, dim2", values, ids=names) +@pytest.mark.parametrize("dim1", get_test_dims(1, 4 * 1024, n=2), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", get_test_dims(1, 4 * 1024, n=2), ids=id_formatter("dim2")) def test_coo_double_quant(dim1, dim2): threshold = 3.00 for i in range(k): @@ -1412,17 +1243,9 @@ def test_coo_double_quant(dim1, dim2): ) -n = 2 -dim1 = torch.randint(1, 1 * 1024, size=(n,)).tolist() -dim2 = torch.randint(1, 1 * 1024, size=(n,)).tolist() -# dim1 = [7] -# dim2 = [11] -transposed_B = [False, True] -values = list(product(dim1, dim2, transposed_B)) -names = ["dim1_{}_dim2_{}_transposed_B_{}".format(*vals) for vals in values] - - -@pytest.mark.parametrize("dim1, dim2, transposed_B", values, ids=names) +@pytest.mark.parametrize("dim1", get_test_dims(1, 1 * 1024, n=2), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", get_test_dims(1, 1 * 1024, n=2), ids=id_formatter("dim2")) +@pytest.mark.parametrize("transposed_B", TRUE_FALSE, ids=id_formatter("transposed_B")) def test_spmm_coo(dim1, dim2, transposed_B): threshold = 1.5 dim3 = torch.randint(32, 128, size=(1,)).item() @@ -1453,6 +1276,7 @@ def test_spmm_coo(dim1, dim2, transposed_B): assert_all_approx_close(out1, out2, rtol=0.01, atol=3.0e-2, count=30) +@pytest.mark.benchmark def test_spmm_bench(): batch = 2 model = 1024 * 1 @@ -1496,14 +1320,8 @@ def test_spmm_bench(): print(tsp / t8) -n = 2 -dim1 = torch.randint(256, 1 * 1024, size=(n,)).tolist() -dim2 = torch.randint(256, 1 * 1024, size=(n,)).tolist() -values = list(product(dim1, dim2)) -names = ["dim1_{}_dim2_{}".format(*vals) for vals in values] - - -@pytest.mark.parametrize("dim1, dim2", values, ids=names) +@pytest.mark.parametrize("dim1", get_test_dims(256, 1024, n=2), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", get_test_dims(256, 1024, n=2), ids=id_formatter("dim2")) def test_integrated_sparse_decomp(dim1, dim2): threshold = 3.0 formatB = "col_turing" @@ -1553,23 +1371,10 @@ def test_matmuls(): print(err1, err2) -n = 2 -# dim1 = torch.randint(1,1*1024, size=(n,)).tolist() -# dim2 = torch.randint(1,4*1024, size=(n,)).tolist() -dim1 = [1 * 2048] -dim2 = [12288] -# dim1 = [32] -# dim2 = [32] -# dtype = [torch.float16, torch.int8] -dtype = [torch.float16] -out_function = ["zeros", "ones"] -values = list(product(dim1, dim2, dtype, out_function)) -names = [ - "dim1_{}_dim2_{}_dtype_{}_out_func_{}".format(*vals) for vals in values -] - - -@pytest.mark.parametrize("dim1, dim2, dtype, out_func", values, ids=names) +@pytest.mark.parametrize("dim1", [1 * 2048], ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", [12288], ids=id_formatter("dim2")) +@pytest.mark.parametrize("dtype", [torch.float16], ids=describe_dtype) +@pytest.mark.parametrize("out_func", ["zeros", "ones"], ids=id_formatter("out_func")) def test_spmm_coo_very_sparse(dim1, dim2, dtype, out_func): out_func = getattr(torch, out_func) @@ -1672,20 +1477,9 @@ def test_coo2csc(): torch.testing.assert_close(A2.t()[idx], cscA.values) -n = 2 -# dim1 = torch.randint(1,1*1024, size=(n,)).tolist() -# dim2 = torch.randint(1,4*1024, size=(n,)).tolist() -dim1 = [1 * 2048] -# dim2 = [12288] -dim2 = [2048] -# dim1 = [2] -# dim2 = [2] -dtype = [torch.int8] -values = list(product(dim1, dim2, dtype)) -names = ["dim1_{}_dim2_{}_dtype_{}".format(*vals) for vals in values] - - -@pytest.mark.parametrize("dim1, dim2, dtype", values, ids=names) +@pytest.mark.parametrize("dim1", 
[1 * 2048]) +@pytest.mark.parametrize("dim2", [2048]) +@pytest.mark.parametrize("dtype", [torch.int8]) def test_spmm_coo_dequant(dim1, dim2, dtype): threshold = 6.0 # threshold = 2.8 @@ -1786,22 +1580,11 @@ def test_spmm_coo_dequant(dim1, dim2, dtype): print("partial matmul", time.time() - t0) -batch_size = 1 -seqdim = 1 -values = [] -#values.append((batch_size, seqdim, 768, 4 * 768)) -#values.append((batch_size, seqdim, 1024, 4*1024)) -#values.append((batch_size, seqdim, 1536, 4*1536)) -#values.append((batch_size, seqdim, 2048, 4*2048)) -#values.append((batch_size, seqdim, 2560, 4*2560)) -#values.append((batch_size, seqdim, 4096, 4*4096)) -#values.append((batch_size, seqdim, 5120, 4*5120)) -values.append((batch_size, seqdim, 6656, 4*6656)) -#values.append((batch_size, seqdim, 8192, 4*8192)) -#values.append((batch_size, seqdim, 5140, 4*5140)) -#values.append((batch_size, seqdim, 12288, 4*12288)) -names = ["batch_{}_seq_{}_model_{}_hidden_{}".format(*vals) for vals in values] -@pytest.mark.parametrize("batch, seq, model, hidden", values, ids=names) +@pytest.mark.parametrize( + ("batch", "seq", "model", "hidden"), + [pytest.param(1, 1, 6656, 4*6656, id="batch=1, seq=1, model=6656, hidden=26k")], +) +@pytest.mark.benchmark def test_bench_matmul(batch, seq, model, hidden): iters = 1000 formatB = F.get_special_format_str() @@ -2226,6 +2009,7 @@ def test_kbit_quantile_estimation(): assert err < 0.035 +@pytest.mark.benchmark def test_bench_dequantization(): a = torch.rand(1024, 1024, device='cuda').half() code =F.create_fp8_map(True, 3, 0, 4).cuda() @@ -2244,7 +2028,7 @@ def test_bench_dequantization(): -@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=["fp32", "fp16", "bf16"]) +@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=describe_dtype) def test_fp4_quant(dtype): vals = list(product([0, 1], repeat=4)) @@ -2321,6 +2105,7 @@ def test_4bit_compressed_stats(quant_type): #@pytest.mark.parametrize("quant_type", ['fp4', 'nf4']) @pytest.mark.parametrize("quant_type", ['nf4']) +@pytest.mark.benchmark def test_bench_4bit_dequant(quant_type): blocksize = 256 a = torch.rand(1024*12*4, 1024*12, device='cuda').half() @@ -2367,11 +2152,11 @@ def test_normal_map_tree(): #print(pivots) -@pytest.mark.parametrize("double_quant", [True, False], ids=['DQ_True', 'DQ_False']) -@pytest.mark.parametrize("storage_type", ['nf4', 'fp4'], ids=['nf4', 'fp4']) -@pytest.mark.parametrize("kind", ['fc1', 'fc2', 'attn', 'attn_packed'], ids=['fc1', 'fc2', 'attn', 'attn_packed']) -@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=['fp16', 'bf16', 'fp32']) -@pytest.mark.parametrize("quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=['uint8', 'fp16', 'bf16', 'fp32']) +@pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") +@pytest.mark.parametrize("storage_type", ['nf4', 'fp4']) +@pytest.mark.parametrize("kind", ['fc1', 'fc2', 'attn', 'attn_packed']) +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) +@pytest.mark.parametrize("quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) def test_gemv_4bit(dtype, storage_type, quant_storage, double_quant, kind): for dim in [128, 256, 512, 1024]: #for dim in [4*1024]: @@ -2537,12 +2322,12 @@ def test_managed(): @pytest.mark.parametrize("storage_type", ['nf4', 'fp4'], ids=['nf4', 'fp4']) 
-@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=['fp16', 'bf16', 'fp32']) +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize("double_quant", [False], ids=['DQ_True']) def test_gemv_eye_4bit(storage_type, dtype, double_quant): dims = 10 torch.random.manual_seed(np.random.randint(0, 412424242)) - dims = torch.randint(0, 8192, size=(dims,)).tolist() + dims = get_test_dims(0, 8192, n=dims) dims = [dim + (64-(dim % 64)) for dim in dims] #for dim in [576, 5120, 3520, 5184, 1280, 4992, 5312, 2048]: for dim in dims: diff --git a/tests/test_generation.py b/tests/test_generation.py index 753623b27..9ed30cd2a 100644 --- a/tests/test_generation.py +++ b/tests/test_generation.py @@ -9,6 +9,8 @@ BitsAndBytesConfig, ) +from tests.helpers import TRUE_FALSE, describe_dtype, id_formatter + def get_4bit_config(): return BitsAndBytesConfig( @@ -59,23 +61,19 @@ def generate(model, tokenizer, text, generation_config, prompt_func=get_prompt_f models = ['huggyllama/llama-7b', 'bigscience/bloom-1b7'] dtypes = ['nf4', 'fp4'] -load_in_4bit = [True, False] -values = list(product(models, dtypes)) -strfunc = lambda lst: [str(x) for x in lst] -ids = ['_'.join(strfunc(x)) for x in values] -@pytest.fixture(scope='session', params=values, ids=ids) + +@pytest.fixture(scope='session', params=product(models, dtypes)) def model_and_tokenizer(request): model, tokenizer = get_model_and_tokenizer(request.param) yield request.param, model, tokenizer del model -@pytest.mark.parametrize("DQ", [True, False], ids=['DQ_True', 'DQ_False']) -@pytest.mark.parametrize("inference_kernel", [True, False], ids=['inference_kernel_True', 'inference_kernel_False']) -#@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=['fp16', 'bf16', 'fp32']) -def test_pi(requires_cuda, model_and_tokenizer, inference_kernel, DQ): - print('') - dtype = torch.float16 +@pytest.mark.parametrize("DQ", TRUE_FALSE, ids=id_formatter("dq")) +@pytest.mark.parametrize("inference_kernel", TRUE_FALSE, ids=id_formatter("inference_kernel")) +@pytest.mark.parametrize("dtype", [torch.float16], ids=describe_dtype) +@pytest.mark.slow +def test_pi(requires_cuda, model_and_tokenizer, inference_kernel, DQ, dtype): fixture_config, model, tokenizer = model_and_tokenizer generation_config = transformers.GenerationConfig( diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py index d396a910b..13db28ed4 100644 --- a/tests/test_linear4bit.py +++ b/tests/test_linear4bit.py @@ -1,4 +1,3 @@ -from itertools import product import os from tempfile import TemporaryDirectory @@ -6,6 +5,7 @@ import torch import bitsandbytes as bnb +from tests.helpers import TRUE_FALSE storage = { 'uint8': torch.uint8, @@ -14,10 +14,10 @@ 'float32': torch.float32 } -@pytest.mark.parametrize( - "quant_type, compress_statistics, bias, quant_storage", - list(product(["nf4", "fp4"], [False, True], [False, True], ['uint8', 'float16', 'bfloat16', 'float32'])), -) +@pytest.mark.parametrize("quant_storage", ['uint8', 'float16', 'bfloat16', 'float32']) +@pytest.mark.parametrize("bias", TRUE_FALSE) +@pytest.mark.parametrize("compress_statistics", TRUE_FALSE) +@pytest.mark.parametrize("quant_type", ["nf4", "fp4"]) def test_linear_serialization(quant_type, compress_statistics, bias, quant_storage): original_dtype = torch.float16 compute_dtype = None diff --git a/tests/test_linear8bitlt.py b/tests/test_linear8bitlt.py index d4967969c..6fa7efb8d 100644 --- 
a/tests/test_linear8bitlt.py +++ b/tests/test_linear8bitlt.py @@ -1,5 +1,4 @@ from contextlib import nullcontext -from itertools import product import os from tempfile import TemporaryDirectory @@ -10,6 +9,7 @@ from bitsandbytes import functional as F from bitsandbytes.autograd import get_inverse_transform_indices, undo_layout from bitsandbytes.nn.modules import Linear8bitLt +from tests.helpers import TRUE_FALSE, id_formatter # contributed by Alex Borzunov, see: # https://github.com/bigscience-workshop/petals/blob/main/tests/test_linear8bitlt.py @@ -66,8 +66,10 @@ def test_linear_no_igemmlt(): assert linear_custom.state.CxB is None -@pytest.mark.parametrize("has_fp16_weights, serialize_before_forward, deserialize_before_cuda, force_no_igemmlt", - list(product([False, True], [False, True], [False, True], [False, True]))) +@pytest.mark.parametrize("has_fp16_weights", TRUE_FALSE, ids=id_formatter("has_fp16_weights")) +@pytest.mark.parametrize("serialize_before_forward", TRUE_FALSE, ids=id_formatter("serialize_before_forward")) +@pytest.mark.parametrize("deserialize_before_cuda", TRUE_FALSE, ids=id_formatter("deserialize_before_cuda")) +@pytest.mark.parametrize("force_no_igemmlt", TRUE_FALSE, ids=id_formatter("force_no_igemmlt")) def test_linear_serialization(has_fp16_weights, serialize_before_forward, deserialize_before_cuda, force_no_igemmlt): linear = torch.nn.Linear(32, 96) x = torch.randn(3, 32, dtype=torch.half) diff --git a/tests/test_modules.py b/tests/test_modules.py index c98f7a6d4..1cb04044f 100644 --- a/tests/test_modules.py +++ b/tests/test_modules.py @@ -6,6 +6,7 @@ from torch import nn import bitsandbytes as bnb +from tests.helpers import id_formatter class MockArgs: @@ -311,12 +312,7 @@ def forward(self, x): return LinearFunction.apply(x, self.weight, self.bias, self.args) -threshold = [0.0, 3.0] -values = threshold -names = [f"threshold_{vals}" for vals in values] - - -@pytest.mark.parametrize("threshold", values, ids=names) +@pytest.mark.parametrize("threshold", [0.0, 3.0], ids=id_formatter("threshold")) def test_linear8bitlt_inference(threshold): l1 = bnb.nn.Linear8bitLt(32, 64, threshold=threshold).cuda().half() assert l1.weight.device.type == "cuda" @@ -510,18 +506,21 @@ def test_linear_kbit_fp32_bias(module): o1 = l1(b1) assert l1.bias is None -modules = [] -modules.append(bnb.nn.Linear8bitLt) -modules.append(bnb.nn.Linear4bit) -modules.append(bnb.nn.LinearFP4) -modules.append(bnb.nn.LinearNF4) -modules.append(lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compress_statistics=True)) -modules.append(lambda d1, d2: bnb.nn.LinearNF4(d1, d2, compress_statistics=True)) -modules.append(lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compute_dtype=torch.float32)) -modules.append(lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compute_dtype=torch.float16)) -modules.append(lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compute_dtype=torch.bfloat16)) -names = ['Int8Lt', '4bit', 'FP4', 'NF4', 'FP4+C', 'NF4+C', 'NF4+fp32', 'NF4+fp16', 'NF4+bf16'] -@pytest.mark.parametrize("module", modules, ids=names) + +module_dict = { + "Int8Lt": bnb.nn.Linear8bitLt, + "4bit": bnb.nn.Linear4bit, + "FP4": bnb.nn.LinearFP4, + "NF4": bnb.nn.LinearNF4, + "FP4+C": lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compress_statistics=True), + "NF4+C": lambda d1, d2: bnb.nn.LinearNF4(d1, d2, compress_statistics=True), + "NF4+fp32": lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compute_dtype=torch.float32), + "NF4+fp16": lambda d1, d2: bnb.nn.LinearFP4(d1, d2, compute_dtype=torch.float16), + "NF4+bf16": lambda d1, d2: bnb.nn.LinearFP4(d1, d2, 
compute_dtype=torch.bfloat16), +} + + +@pytest.mark.parametrize("module", module_dict.values(), ids=module_dict.keys()) def test_kbit_backprop(module): b = 17 dim1 = 37 diff --git a/tests/test_optim.py b/tests/test_optim.py index 993ac8b60..e379c424a 100644 --- a/tests/test_optim.py +++ b/tests/test_optim.py @@ -1,4 +1,3 @@ -from itertools import product import os from os.path import join import shutil @@ -11,6 +10,7 @@ import bitsandbytes as bnb import bitsandbytes.functional as F +from tests.helpers import describe_dtype, id_formatter # import apex @@ -101,15 +101,16 @@ def rm_path(path): str2statenames["lion8bit_blockwise"] = [("exp_avg", "state1", "qmap1", "absmax1")] str2statenames["paged_lion8bit_blockwise"] = [("exp_avg", "state1", "qmap1", "absmax1")] -dim1 = [1024] -dim2 = [32, 1024, 4097, 1] -gtype = [torch.float32, torch.float16, torch.bfloat16] -optimizer_names = ["adam", "momentum", "rmsprop", 'paged_adamw', 'paged_adam', 'lion', 'paged_lion'] -values = list(product(dim1, dim2, gtype, optimizer_names)) -names = ["dim1_{}_dim2_{}_gtype_{}_optim_{}".format(*vals) for vals in values] -@pytest.mark.parametrize("dim1, dim2, gtype, optim_name", values, ids=names) +optimizer_names_32bit = ["adam", "momentum", "rmsprop", 'paged_adamw', 'paged_adam', 'lion', 'paged_lion'] + + +@pytest.mark.parametrize("optim_name", optimizer_names_32bit, ids=id_formatter("opt")) +@pytest.mark.parametrize("gtype", [torch.float32, torch.float16, torch.bfloat16], ids=describe_dtype) +@pytest.mark.parametrize("dim1", [1024], ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", [32, 1024, 4097, 1], ids=id_formatter("dim2")) def test_optimizer32bit(dim1, dim2, gtype, optim_name): - if gtype == torch.bfloat16 and optim_name in ['momentum', 'rmsprop']: pytest.skip() + if gtype == torch.bfloat16 and optim_name in ['momentum', 'rmsprop']: + pytest.skip() if dim1 == 1 and dim2 == 1: return p1 = torch.randn(dim1, dim2, device="cuda", dtype=gtype) * 0.1 @@ -134,7 +135,6 @@ def test_optimizer32bit(dim1, dim2, gtype, optim_name): bnb_optimizer.step() torch_optimizer.step() - for name1, name2 in str2statenames[optim_name]: torch.testing.assert_close( torch_optimizer.state[p1][name1], @@ -177,14 +177,9 @@ def test_optimizer32bit(dim1, dim2, gtype, optim_name): assert bnb_optimizer.state[p2]["unorm_vec"] > 0.0 -dim1 = [1024] -dim2 = [32, 1024, 4097] -gtype = [torch.float32, torch.float16] -values = list(product(dim1, dim2, gtype)) -names = ["dim1_{}_dim2_{}_gtype_{}".format(*vals) for vals in values] - - -@pytest.mark.parametrize("dim1, dim2, gtype", values, ids=names) +@pytest.mark.parametrize("dim1", [1024], ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", [32, 1024, 4097], ids=id_formatter("dim2")) +@pytest.mark.parametrize("gtype", [torch.float32, torch.float16], ids=describe_dtype) def test_global_config(dim1, dim2, gtype): if dim1 == 1 and dim2 == 1: return @@ -230,10 +225,7 @@ def test_global_config(dim1, dim2, gtype): assert adam2.state[p3]["state2"].dtype == torch.uint8 -dim1 = [1024] -dim2 = [32, 1024, 4097] -gtype = [torch.float32, torch.float16, torch.bfloat16] -optimizer_names = [ +optimizer_names_8bit = [ "adam8bit", "lion8bit", "momentum8bit", @@ -243,13 +235,12 @@ def test_global_config(dim1, dim2, gtype): "momentum8bit_blockwise", "rmsprop8bit_blockwise", ] -values = list(product(dim1, dim2, gtype, optimizer_names)) -names = [ - "dim1_{}_dim2_{}_gtype_{}_optim_{}".format(*vals) for vals in values -] -@pytest.mark.parametrize("dim1, dim2, gtype, optim_name", values, ids=names) 
+@pytest.mark.parametrize("optim_name", optimizer_names_8bit, ids=id_formatter("opt")) +@pytest.mark.parametrize("gtype", [torch.float32, torch.float16, torch.bfloat16], ids=describe_dtype) +@pytest.mark.parametrize("dim2", [32, 1024, 4097], ids=id_formatter("dim2")) +@pytest.mark.parametrize("dim1", [1024], ids=id_formatter("dim1")) def test_optimizer8bit(dim1, dim2, gtype, optim_name): if gtype == torch.bfloat16 and optim_name not in ['adam8bit_blockwise', 'lion8bit_blockwise']: pytest.skip() if dim1 == 1 and dim2 == 1: @@ -375,18 +366,10 @@ def test_optimizer8bit(dim1, dim2, gtype, optim_name): # print(sum(relerrors)/len(relerrors)) -dim1 = [1024] -dim2 = [32, 1024, 4097] -gtype = [torch.float32] -optim_bits = [32, 8] -values = list(product(dim1, dim2, gtype, optim_bits)) -names = [ - "dim1_{}_dim2_{}_gtype_{}_optim_bits_{}".format(*vals) - for vals in values -] - - -@pytest.mark.parametrize("dim1, dim2, gtype, optim_bits", values, ids=names) +@pytest.mark.parametrize("optim_bits", [32, 8], ids=id_formatter("optim_bits")) +@pytest.mark.parametrize("gtype", [torch.float32], ids=describe_dtype) +@pytest.mark.parametrize("dim2", [32, 1024, 4097], ids=id_formatter("dim2")) +@pytest.mark.parametrize("dim1", [1024], ids=id_formatter("dim1")) def test_adam_percentile_clipping(dim1, dim2, gtype, optim_bits): if dim1 == 1 and dim2 == 1: return @@ -474,22 +457,19 @@ def test_adam_percentile_clipping(dim1, dim2, gtype, optim_bits): adam2.load_state_dict(torch.load(join(path, "opt.pt"))) -dim1 = [4096] -dim2 = [4096] -gtype = [torch.float32, torch.float16] -# optimizer_names = ['adam8bit_blockwise', 'adam8bit', 'lamb8bit'] -# optimizer_names = ['adam8bit_blockwise', 'adam_apex', 'adam8bit', 'adam', 'adam_pytorch'] -# optimizer_names = ['momentum_apex', 'momentum8bit', 'momentum_pytorch'] -# optimizer_names = ['lamb_apex', 'lamb8bit'] -# optimizer_names = ['lars_apex', 'lars8bit'] -optimizer_names = ["adam8bit_blockwise", 'paged_adam8bit_blockwise', 'paged_adamw8bit_blockwise', 'paged_lion8bit_blockwise'] -values = list(product(dim1, dim2, gtype, optimizer_names)) -names = [ - "dim1_{}_dim2_{}_gtype_{}_optim_{}".format(*vals) for vals in values +optimizer_names_benchmark = [ + "adam8bit_blockwise", + "paged_adam8bit_blockwise", + "paged_adamw8bit_blockwise", + "paged_lion8bit_blockwise", ] -@pytest.mark.parametrize("dim1, dim2, gtype, optim_name", values, ids=names) +@pytest.mark.parametrize("dim1", [4096], ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", [4096], ids=id_formatter("dim2")) +@pytest.mark.parametrize("gtype", [torch.float32, torch.float16], ids=describe_dtype) +@pytest.mark.parametrize("optim_name", optimizer_names_benchmark, ids=id_formatter("opt")) +@pytest.mark.benchmark def test_benchmark_blockwise(dim1, dim2, gtype, optim_name): if dim1 == 1 and dim2 == 1: return @@ -514,15 +494,12 @@ def test_benchmark_blockwise(dim1, dim2, gtype, optim_name): print(optim_name, gtype, s / params) # assert s < 3.9 -dim1 = [2*1024] -gtype = [torch.float16] -#mode = ['torch', 'bnb'] -mode = ['bnb'] -optimizer_names = ['paged_adamw'] -#optimizer_names = ['paged_adamw8bit_blockwise'] -values = list(product(dim1,gtype, optimizer_names, mode)) -names = ['dim1_{0}_gtype_{1}_optim_{2}_mode_{3}'.format(*vals) for vals in values] -@pytest.mark.parametrize("dim1, gtype, optim_name, mode", values, ids=names) + +@pytest.mark.parametrize("dim1", [2 * 1024], ids=id_formatter("dim1")) +@pytest.mark.parametrize("gtype", [torch.float16], ids=describe_dtype) +@pytest.mark.parametrize("optim_name", 
['paged_adamw'], ids=id_formatter("optim_name")) +@pytest.mark.parametrize("mode", ['bnb'], ids=id_formatter("mode")) +@pytest.mark.benchmark def test_stream_optimizer_bench(dim1, gtype, optim_name, mode): layers1 = torch.nn.Sequential(*torch.nn.ModuleList([torch.nn.Linear(dim1, dim1) for i in range(10)])) layers1 = layers1.to(gtype) diff --git a/tests/test_triton.py b/tests/test_triton.py index d0397ee4a..943db067a 100644 --- a/tests/test_triton.py +++ b/tests/test_triton.py @@ -4,11 +4,12 @@ from bitsandbytes.nn import Linear8bitLt from bitsandbytes.nn.triton_based_modules import SwitchBackLinear from bitsandbytes.triton.triton_utils import is_triton_available +from tests.helpers import TRUE_FALSE @pytest.mark.skipif(not is_triton_available() or not torch.cuda.is_available() or not torch.cuda.get_device_capability()[0] >= 8, reason="This test requires triton and a GPU with compute capability 8.0 or higher.") -@pytest.mark.parametrize("vector_wise_quantization", [False, True]) +@pytest.mark.parametrize("vector_wise_quantization", TRUE_FALSE) def test_switchback(vector_wise_quantization): for dim in [83]: for batch in [13]: From 25fe140ebb3845260d4e674910795a53287dd2f7 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Thu, 1 Feb 2024 03:53:10 +0100 Subject: [PATCH 014/112] Create upload_pr_documentation.yml --- .github/workflows/upload_pr_documentation.yml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 .github/workflows/upload_pr_documentation.yml diff --git a/.github/workflows/upload_pr_documentation.yml b/.github/workflows/upload_pr_documentation.yml new file mode 100644 index 000000000..3414c255b --- /dev/null +++ b/.github/workflows/upload_pr_documentation.yml @@ -0,0 +1,16 @@ +name: Upload PR Documentation + +on: + workflow_run: + workflows: ["Build PR Documentation"] + types: + - completed + +jobs: + build: + uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main + with: + package_name: peft + secrets: + hf_token: ${{ secrets.HUGGINGFACE_PUSH }} + comment_bot_token: ${{ secrets.GITHUB_TOKEN }} From 6aa85a595dc5665c75b282f003c0449d0ae76c1c Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Thu, 1 Feb 2024 03:57:48 +0100 Subject: [PATCH 015/112] Update .github/workflows/upload_pr_documentation.yml --- .github/workflows/upload_pr_documentation.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/upload_pr_documentation.yml b/.github/workflows/upload_pr_documentation.yml index 3414c255b..6497caf2d 100644 --- a/.github/workflows/upload_pr_documentation.yml +++ b/.github/workflows/upload_pr_documentation.yml @@ -10,7 +10,7 @@ jobs: build: uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main with: - package_name: peft + package_name: bitsandbytes secrets: hf_token: ${{ secrets.HUGGINGFACE_PUSH }} comment_bot_token: ${{ secrets.GITHUB_TOKEN }} From d97700af6870745ffac37d7f80f4926586403da3 Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Thu, 1 Feb 2024 13:16:26 +0900 Subject: [PATCH 016/112] add missing definitions accidentally removed by mistake in PR #876 --- include/SIMD.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/SIMD.h b/include/SIMD.h index a2ac1a9ae..d559e9f55 100644 --- a/include/SIMD.h +++ b/include/SIMD.h @@ -64,6 +64,16 @@ template <> struct InstrFloatTraits typedef __m128d vec_t; }; +template <> struct InstrFloatTraits +{ + typedef 
float vec_t; +}; + +template <> struct InstrFloatTraits +{ + typedef double vec_t; +}; + template struct FTOITraits { From 5f76fe9d79baaff9b26f3dcbfb66c61f7536ca07 Mon Sep 17 00:00:00 2001 From: James Wyatt Date: Mon, 25 Sep 2023 01:25:24 +1000 Subject: [PATCH 017/112] Add CMakeLists.txt * fix project name and add lib prefix for win32 (2024/01/31) * set LIBRARY_OUTPUT_DIRECTORY property Co-authored-by: Won-Kyu Park --- CMakeLists.txt | 132 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) create mode 100644 CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 000000000..140753af4 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,132 @@ +# This CMake config hopefully makes it easier to compile. +# Ensure the CUDA Toolkit is available on your path. Then run: +# For GCC: `cmake -B build . && cmake --build build` +# For MSVC: `cmake -B build . && cmake --build build --config Release` +# You can also use the following options +# - BUILD_CUDA: Default ON, will build with CUDA +# - NO_CUBLASLT: Default OFF, will skip building/linking CUBLASLT support +# - CUDA_VERSION: The expected CUDA version, for sanity checking. The actual version +# is whatever CMake finds on your path. +# - COMPUTE_CAPABILITY: Which GPU Arch/Compute codes to provide to NVCC. +# Separate by semicolons, i.e. `-DCOMPUTE_CAPABILITY=89;90` +# Check your compute capability here: https://developer.nvidia.com/cuda-gpus +# - PTXAS_VERBOSE: Pass the `-v` option to the PTX Assembler +cmake_minimum_required(VERSION 3.18) + +project(bitsandbytes LANGUAGES C CXX) + +option(BUILD_CUDA "Build bitsandbytes with CUDA support" ON) +option(NO_CUBLASLT "Disable CUBLAS" OFF) +option(PTXAS_VERBOSE "Pass through -v flag to PTX Assembler" OFF) + +set(CPP_FILES csrc/common.cpp csrc/cpu_ops.cpp csrc/pythonInterface.c) +list(APPEND CUDA_FILES csrc/ops.cu csrc/kernels.cu) +list(APPEND SRC_FILES ${CPP_FILES}) + +message(STATUS "BUILD_CUDA := ${BUILD_CUDA}") +message(STATUS "NO_CUBLASLT := ${NO_CUBLASLT}") + +set(BNB_OUTPUT_NAME "bitsandbytes") + +if(BUILD_CUDA) + enable_language(CUDA) # This will fail if CUDA is not found + + # Convert the CUDA version from X.Y.z to XY. There's probably a shorter way of doing this + string(REGEX MATCH "^[0-9]+.[0-9]+" _CUDA_VERSION_FIRST_TWO "${CMAKE_CUDA_COMPILER_VERSION}") + string(REPLACE "." "" CUDA_VERSION_SHORT "${_CUDA_VERSION_FIRST_TWO}") + + # Expose a cache variable that the user can set to ensure the correct version of CUDA is found + set(CUDA_VERSION "${CUDA_VERSION_SHORT}" CACHE STRING "Expected CUDA Version Shortcode") + + message(STATUS "CUDA Version: ${CUDA_VERSION_SHORT} (${CMAKE_CUDA_COMPILER_VERSION})") + message(STATUS "CUDA Compiler: ${CMAKE_CUDA_COMPILER}") + + # It should match the discovered version + if(NOT CUDA_VERSION STREQUAL "${CUDA_VERSION_SHORT}") + message(FATAL_ERROR "You've specified CUDA version ${CUDA_VERSION} however the CUDA compiler found is ${CUDA_VERSION_SHORT}." + " Ensure the desired CUDA compiler is the first one available on your PATH." + ) + endif() + + if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "11.0") + message(FATAL_ERROR "CUDA Version < 11 is not supported") + elseif(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0") + message(FATAL_ERROR "CUDA Version > 12 is not supported") + endif() + + string(APPEND CMAKE_CUDA_FLAGS " --use_fast_math") + if(PTXAS_VERBOSE) + # Verbose? Outputs register usage information, and other things... 
+ string(APPEND CMAKE_CUDA_FLAGS " -Xptxas=-v") + endif() + + foreach(capability ${CMAKE_CUDA_ARCHITECTURES_ALL}) + # Most of the items here are like: `xx-real`, so we just extract the `xx` portion + string(REGEX MATCH "[0-9]+" capability_id "${capability}") + if(capability_id GREATER 0) + list(APPEND POSSIBLE_CAPABILITIES ${capability_id}) + endif() + endforeach() + + # This can be changed via -D argument to CMake + # By default all possible capabilities are compiled + set(COMPUTE_CAPABILITY "${POSSIBLE_CAPABILITIES}" CACHE STRING "Compute Capabilities Targeted") + + message(STATUS "CUDA Capabilities Available: ${POSSIBLE_CAPABILITIES}") + message(STATUS "CUDA Capabilities Selected: ${COMPUTE_CAPABILITY}") + + foreach(capability ${COMPUTE_CAPABILITY}) + string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_${capability},code=sm_${capability}") + endforeach() + + message(STATUS "CUDA NVCC Flags: ${CMAKE_CUDA_FLAGS}") + + list(APPEND SRC_FILES ${CUDA_FILES}) + + string(APPEND BNB_OUTPUT_NAME "_cuda${CUDA_VERSION_SHORT}") + if(NO_CUBLASLT) + string(APPEND BNB_OUTPUT_NAME "_nocublaslt") + endif() +else() + message(STATUS "Building CPU Only") + string(APPEND BNB_OUTPUT_NAME "_cpu") + if(NO_CUBLASLT) + message(WARNING "We're building in CPU only mode but NO_CUBLASLT is enabled. It will have no effect.") + endif() +endif() + +set_source_files_properties(${CPP_FILES} PROPERTIES LANGUAGE CXX) +add_library(bitsandbytes SHARED ${SRC_FILES}) +include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) +target_include_directories(bitsandbytes PUBLIC csrc include) +target_compile_features(bitsandbytes PUBLIC cxx_std_14) + + +if(BUILD_CUDA) + target_compile_definitions(bitsandbytes PUBLIC BUILD_CUDA) + target_link_libraries(bitsandbytes PUBLIC cudart cublas cusparse) + if(NO_CUBLASLT) + target_compile_definitions(bitsandbytes PUBLIC NO_CUBLASLT) + else() + target_link_libraries(bitsandbytes PUBLIC cublasLt) + endif() + + set_target_properties(bitsandbytes + PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + ) +endif() + +if(WIN32) + set_target_properties(bitsandbytes PROPERTIES PREFIX "lib") +endif() + +set_target_properties(bitsandbytes + PROPERTIES + OUTPUT_NAME ${BNB_OUTPUT_NAME} + # We have to use a generator expression to prevent MSVC Debug/Release subdirs being made + RUNTIME_OUTPUT_DIRECTORY "$<1:${CMAKE_SOURCE_DIR}/bitsandbytes>" + LIBRARY_OUTPUT_DIRECTORY "$<1:${CMAKE_SOURCE_DIR}/bitsandbytes>" + POSITION_INDEPENDENT_CODE ON # The `-fPIC` commands for non-windows compilers + WINDOWS_EXPORT_ALL_SYMBOLS ON # On Windows, export all c methods as DLL exports +) From ca4a46eff11adc0881351db3e45378d23b521b92 Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Wed, 15 Nov 2023 06:53:54 +0900 Subject: [PATCH 018/112] add cmake workflows * build matrix for ubuntu + python 3.10, 3.11 + cuda 11.8 + 12.1 (windows is disabled for now) * add environment-bnb.yml for building * more fixes suggested by @akx (2024/01/30) * use python -m build --wheel suggested by @akx Co-authored-by: Aarni Koskela --- .github/workflows/cmake.yml | 165 ++++++++++++++++++++++++++++++++++++ environment-bnb.yml | 21 +++++ 2 files changed, 186 insertions(+) create mode 100644 .github/workflows/cmake.yml create mode 100644 environment-bnb.yml diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml new file mode 100644 index 000000000..be3092daa --- /dev/null +++ b/.github/workflows/cmake.yml @@ -0,0 +1,165 @@ +name: CMake on multiple platforms + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + 
+jobs: + build: + runs-on: ${{ matrix.os }} + + strategy: + # Set fail-fast to false to ensure that feedback is delivered for all matrix combinations. Consider changing this to true when your workflow is stable. + fail-fast: false + + matrix: + os: [ubuntu-latest] + python-version: ['3.10', '3.11'] + cuda-version: ['11.8', '12.1'] + build_type: [Release] + c_compiler: [gcc] + include: + - os: ubuntu-latest + c_compiler: gcc + cpp_compiler: g++ + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Set up MSVC + if: matrix.os == 'windows-latest' + uses: ilammy/msvc-dev-cmd@v1.12.1 + with: + arch: amd64 + + - name: Setup Mambaforge + uses: conda-incubator/setup-miniconda@v3.0.1 + with: + miniforge-variant: Mambaforge + miniforge-version: latest + activate-environment: bnb-env + use-mamba: true + + - uses: conda-incubator/setup-miniconda@v3.0.1 + with: + auto-update-conda: true + activate-environment: bnb-env + environment-file: environment-bnb.yml + use-only-tar-bz2: false + auto-activate-base: true + python-version: ${{ matrix.python-version }} + mamba-version: "*" + + - name: Set reusable strings + # Turn repeated input strings (such as the build output directory) into step outputs. These step outputs can be used throughout the workflow file. + id: strings + shell: bash + run: | + echo "build-output-dir=${{ github.workspace }}/build" >> "$GITHUB_OUTPUT" + + - name: CUDA Toolkit + shell: bash -el {0} + run: | + if [ "${{ matrix.os }}" = "ubuntu-latest" ]; then + # to prepare space + sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + sudo rm -rf /usr/local/share/boost + fi + addon="" + cuda_version=${{ matrix.cuda-version }} + [ "$cuda_version" = "12.1" ] && [ "${{ matrix.os }}" = "ubuntu-latest" ] && addon="cuda-cudart-static cuda-nvrtc" + [ "$cuda_version" = "12.1" ] && [ "${{ matrix.os }}" = "windows-latest" ] && addon="cuda-nvrtc" + [ "$cuda_version" = "11.8" ] && cuda_version="11.8.0" + [ "$cuda_version" = "12.1" ] && cuda_version="12.1.1" + + conda install pytorch-cuda=${{ matrix.cuda-version }} -c pytorch # it's dependency not correctly resolved sometime + conda install cuda-python=${{ matrix.cuda-version }} cuda-libraries-dev cuda-nvcc cuda-nvtx cuda-cupti cuda-cudart cuda-cudart-dev cuda-runtime cuda-libraries $addon -c "nvidia/label/cuda-$cuda_version" + + [ "${{ matrix.os }}" = "windows-latest" ] && conda install "clang>=17.0.6" "clangxx>=17.0.6" -c conda-forge + + CUDA_HOME="${{ env.CONDA }}/envs/bnb-env" + echo CUDA_HOME=$CUDA_HOME >> "$GITHUB_ENV" + echo CUDA_PATH=$CUDA_HOME >> "$GITHUB_ENV" + + if [ "${{ matrix.os }}" = "windows-latest" ]; then + # without -DCMAKE_CUDA_COMPILER=nvcc, cmake config always fail for cuda-11.8 + echo DCMAKE_CUDA_COMPILER=-DCMAKE_CUDA_COMPILER=nvcc >> "$GITHUB_ENV" + fi + + nvcc --version + + - name: Update environment + run: mamba env update -n bnb-env -f environment-bnb.yml + + - name: Prep build + run: python -m pip install cmake==3.27.9 ninja setuptools wheel + + - name: Configure CMake + run: > + cmake -B ${{ steps.strings.outputs.build-output-dir }} + -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }} + -DCMAKE_CXX_COMPILER=${{ matrix.cpp_compiler }} + -DCMAKE_C_COMPILER=${{ matrix.c_compiler }} + -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} + -DCOMPUTE_CAPABILITY="50;52;60;61;62;70;72;75;80;86;87;89;90" + -S ${{ github.workspace }} + + - name: Build + # Build your program with the given configuration. 
Note that --config is needed because the default Windows generator is a multi-config generator (Visual Studio generator). + run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }} + + - name: Configure NOBLASLT + run: > + cmake -B ${{ steps.strings.outputs.build-output-dir }} + -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }} + -DCMAKE_CXX_COMPILER=${{ matrix.cpp_compiler }} + -DCMAKE_C_COMPILER=${{ matrix.c_compiler }} + -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} + -DCOMPUTE_CAPABILITY="50;52;60;61;62;70;72;75;80;86;87;89;90" + -DNO_CUBLASLT=ON + -S ${{ github.workspace }} + + - name: Build NOBLASLT + run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }} + + - name: Configure CPU + run: > + cmake -B ${{ steps.strings.outputs.build-output-dir }} + -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }} + -DCMAKE_CXX_COMPILER=${{ matrix.cpp_compiler }} + -DCMAKE_C_COMPILER=${{ matrix.c_compiler }} + -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} + -DNO_CUBLASLT=ON + -DBUILD_CUDA=OFF + -S ${{ github.workspace }} + + - name: Build CPU + run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }} + + - name: Test + working-directory: ${{ steps.strings.outputs.build-output-dir }} + # Execute tests defined by the CMake configuration. Note that --build-config is needed because the default Windows generator is a multi-config generator (Visual Studio generator). + # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail + run: ctest --build-config ${{ matrix.build_type }} + + - name: Build dist + shell: bash -el {0} + run: | + python -m pip install build + python -m build --wheel + mkdir dist/cu${{ matrix.cuda-version }} + mv dist/bitsandbytes*.* dist/cu${{ matrix.cuda-version }}/ + + - name: Upload Build Artifacts + uses: actions/upload-artifact@v4.3.0 + with: + name: bitsandbytes-${{ matrix.os }}-${{ matrix.python-version }}-${{ matrix.cuda-version }} + path: | + ${{ github.workspace }}/dist/ diff --git a/environment-bnb.yml b/environment-bnb.yml new file mode 100644 index 000000000..92c7761bb --- /dev/null +++ b/environment-bnb.yml @@ -0,0 +1,21 @@ +# for cmake build +name: bnb +channels: + - pytorch + - nvidia + - conda-forge + +dependencies: + - python + - accelerate + - einops + - scipy + - transformers + - pytest + - pytest-cases + - ipython + - debugpy + - yapf + - monkeytype + - rich + - pytest-sugar From c12c4c1e5aa5d9ae51f03cbe041229546289a3e0 Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Mon, 11 Dec 2023 19:47:28 +0900 Subject: [PATCH 019/112] add ext_modules * add a comment suggested by @akx (2024/01/30) Co-authored-by: Aarni Koskela --- setup.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 407116fbe..b109d9454 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ import glob import os -from setuptools import find_packages, setup +from setuptools import Extension, find_packages, setup libs = list(glob.glob("./bitsandbytes/libbitsandbytes*.so")) libs += list(glob.glob("./bitsandbytes/libbitsandbytes*.dll")) @@ -35,6 +35,9 @@ def read(fname): }, long_description=read("README.md"), long_description_content_type="text/markdown", + # HACK: pretend we have a native extension module so the wheel is tagged + # correctly with a platform tag (e.g. `-linux_x86_64.whl`). 
+ ext_modules=[Extension("bitsandbytes", sources=[], language="c")], classifiers=[ "Development Status :: 4 - Beta", "Topic :: Scientific/Engineering :: Artificial Intelligence", From 926913998dba23034d7236752442c4788aba6766 Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Thu, 1 Feb 2024 12:26:57 +0900 Subject: [PATCH 020/112] add windows build matrix --- .github/workflows/cmake.yml | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index be3092daa..35d1b0240 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -15,15 +15,23 @@ jobs: fail-fast: false matrix: - os: [ubuntu-latest] + os: [ubuntu-latest, windows-latest] python-version: ['3.10', '3.11'] cuda-version: ['11.8', '12.1'] build_type: [Release] - c_compiler: [gcc] + c_compiler: [gcc, cl] include: + - os: windows-latest + c_compiler: cl + cpp_compiler: cl - os: ubuntu-latest c_compiler: gcc cpp_compiler: g++ + exclude: + - os: ubuntu-latest + c_compiler: cl + - os: windows-latest + c_compiler: gcc steps: - uses: actions/checkout@v4 From 5831205a31aec437d62d8d212e001be272c3cc5e Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Thu, 1 Feb 2024 08:11:32 +0100 Subject: [PATCH 021/112] Update installation.mdx --- docs/source/installation.mdx | 42 +++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index 035e3e70d..588295949 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -1,3 +1,43 @@ # Installation -... work in progress ... \ No newline at end of file +Note currently `bitsandbytes` is only supported on CUDA GPU hardwares, support for AMD GPUs and M1 chips (MacOS) is coming soon. + + + + +## Linux + +### From Pypi + +```bash +pip install bitsandbytes +``` + +### From source + +```bash +git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/ +CUDA_VERSION=XXX make cuda12x +python setup.py install +``` + +with `XXX` being your CUDA version, for <12.0 call `make cuda 11x` + + + + +## Windows + +Currently for Windows users, you need to build bitsandbytes from source + +```bash +git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/ +cmake -B build -DBUILD_CUDA=ON -S . +cmake --build build --config Release +python -m build --wheel +``` + +Big thanks to [wkpark](https://github.com/wkpark), [Jamezo97](https://github.com/Jamezo97), [rickardp](https://github.com/rickardp), [akx](https://github.com/akx) for their amazing contribution to make bitsandbytes compatible with Windows. + + + From 0f3d02933159291f86a8560e0eb4be2d64a62890 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Thu, 1 Feb 2024 08:17:53 +0100 Subject: [PATCH 022/112] Update docs/source/installation.mdx --- docs/source/installation.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index 588295949..50031acf7 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -37,7 +37,7 @@ cmake --build build --config Release python -m build --wheel ``` -Big thanks to [wkpark](https://github.com/wkpark), [Jamezo97](https://github.com/Jamezo97), [rickardp](https://github.com/rickardp), [akx](https://github.com/akx) for their amazing contribution to make bitsandbytes compatible with Windows. 
+Big thanks to [wkpark](https://github.com/wkpark), [Jamezo97](https://github.com/Jamezo97), [rickardp](https://github.com/rickardp), [akx](https://github.com/akx) for their amazing contributions to make bitsandbytes compatible with Windows. From 6974920b4f5269f3f0591278c7ec8340e671b0b0 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Thu, 1 Feb 2024 14:56:51 +0200 Subject: [PATCH 023/112] Enable line-ending and other hygiene lints (#1006) --- .github/ISSUE_TEMPLATE/bug-report.yml | 8 +-- .github/ISSUE_TEMPLATE/feature-request.yml | 4 +- .github/workflows/build_pr_documentation.yml | 2 +- .github/workflows/stale.yml.disabled | 2 +- .pre-commit-config.yaml | 11 +++ .style.yapf | 2 +- README.md | 4 +- benchmarking/switchback/README.md | 2 +- .../switchback/make_plot_with_jsonl.py | 9 ++- benchmarking/switchback/speed_benchmark.py | 4 +- bitsandbytes/cuda_setup/main.py | 2 +- bitsandbytes/optim/adamw.py | 1 - bitsandbytes/research/autograd/_functions.py | 4 +- bitsandbytes/triton/dequantize_rowwise.py | 2 +- .../triton/int8_matmul_mixed_dequantize.py | 2 +- .../triton/int8_matmul_rowwise_dequantize.py | 2 +- .../quantize_columnwise_and_transpose.py | 3 +- bitsandbytes/triton/quantize_global.py | 17 +++-- bitsandbytes/triton/quantize_rowwise.py | 3 +- compile_from_source.md | 5 +- csrc/kernels.cu | 70 +++++++++---------- csrc/pythonInterface.c | 2 +- docs/source/_toctree.yml | 4 +- docs/source/index.mdx | 6 +- docs/source/quickstart.mdx | 4 +- environment.yml | 2 +- examples/int8_inference_huggingface.py | 3 - how_to_use_nonpytorch_cuda.md | 2 +- install_cuda.py | 8 +-- scripts/stale.py | 2 +- tests/test_autograd.py | 1 - tests/test_cuda_setup_evaluator.py | 8 --- tests/test_functional.py | 2 - tests/test_generation.py | 3 - tests/test_modules.py | 3 - tests/test_triton.py | 1 - 36 files changed, 97 insertions(+), 113 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml index ac8e9de00..6ae3c7c0a 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.yml +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -18,15 +18,15 @@ body: label: Reproduction description: | Please provide a code sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet. - Please provide the simplest reproducer as possible so that we can quickly fix the issue. + Please provide the simplest reproducer as possible so that we can quickly fix the issue. placeholder: | - Reproducer: - + Reproducer: + - type: textarea id: expected-behavior validations: required: true attributes: label: Expected behavior - description: "A clear and concise description of what you would expect to happen." \ No newline at end of file + description: "A clear and concise description of what you would expect to happen." diff --git a/.github/ISSUE_TEMPLATE/feature-request.yml b/.github/ISSUE_TEMPLATE/feature-request.yml index 4e75c2a64..c39f346b9 100644 --- a/.github/ISSUE_TEMPLATE/feature-request.yml +++ b/.github/ISSUE_TEMPLATE/feature-request.yml @@ -18,7 +18,7 @@ body: attributes: label: Motivation description: | - Please outline the motivation for the proposal. Is your feature request related to a problem? + Please outline the motivation for the proposal. Is your feature request related to a problem? - type: textarea id: contribution @@ -27,4 +27,4 @@ body: attributes: label: Your contribution description: | - Is there any way that you could help, e.g. by submitting a PR? \ No newline at end of file + Is there any way that you could help, e.g. by submitting a PR? 
diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index dace206b1..40ea8b5bc 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -14,4 +14,4 @@ jobs: commit_sha: ${{ github.event.pull_request.head.sha }} pr_number: ${{ github.event.number }} package: bitsandbytes - repo_owner: TimDettmers \ No newline at end of file + repo_owner: TimDettmers diff --git a/.github/workflows/stale.yml.disabled b/.github/workflows/stale.yml.disabled index ec011c7fb..0b4f789ea 100644 --- a/.github/workflows/stale.yml.disabled +++ b/.github/workflows/stale.yml.disabled @@ -24,4 +24,4 @@ jobs: pip install PyGithub - name: Close stale issues run: | - python scripts/stale.py \ No newline at end of file + python scripts/stale.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d568a849f..039139b95 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,3 +6,14 @@ repos: args: - --fix # - id: ruff-format # TODO: enable when the time is right + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + - id: check-merge-conflict + - id: check-yaml + - id: end-of-file-fixer + - id: fix-byte-order-marker + - id: trailing-whitespace + - id: mixed-line-ending + args: + - --fix=lf diff --git a/.style.yapf b/.style.yapf index a185235cf..e60ac16e5 100644 --- a/.style.yapf +++ b/.style.yapf @@ -10,4 +10,4 @@ SPLIT_BEFORE_BITWISE_OPERATOR = True SPLIT_BEFORE_FIRST_ARGUMENT = True SPLIT_BEFORE_LOGICAL_OPERATOR = True SPLIT_BEFORE_NAMED_ASSIGNS = True -SPLIT_COMPLEX_COMPREHENSION = True \ No newline at end of file +SPLIT_COMPLEX_COMPREHENSION = True diff --git a/README.md b/README.md index a4586d6ca..61dede8c1 100644 --- a/README.md +++ b/README.md @@ -153,10 +153,10 @@ To compile from source, you need an installation of CUDA. If `nvcc` is not insta wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh # Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH # CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122} -# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True +# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True # For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc -bash install_cuda.sh 117 ~/local 1 +bash install_cuda.sh 117 ~/local 1 ``` To use a specific CUDA version just for a single compile run, you can set the variable `CUDA_HOME`, for example the following command compiles `libbitsandbytes_cuda117.so` using compiler flags for cuda11x with the cuda version at `~/local/cuda-11.7`: diff --git a/benchmarking/switchback/README.md b/benchmarking/switchback/README.md index bb33b5bbd..b73569030 100644 --- a/benchmarking/switchback/README.md +++ b/benchmarking/switchback/README.md @@ -1,4 +1,4 @@ Steps: 1. Run `python speed_benchmark/speed_benchmark.py` which times operations and writes their time to `speed_benchmark/info_a100_py2.jsonl` (change the name of the jsonl to a different name for your profiling). -2. Run `python speed_benchmark/make_plot_with_jsonl.py`, which produces the `speed_benchmark/plot_with_info.pdf`. Again make sure you change the jsonl which is being processed. \ No newline at end of file +2. Run `python speed_benchmark/make_plot_with_jsonl.py`, which produces the `speed_benchmark/plot_with_info.pdf`. Again make sure you change the jsonl which is being processed. 
diff --git a/benchmarking/switchback/make_plot_with_jsonl.py b/benchmarking/switchback/make_plot_with_jsonl.py index 3ef87d6b2..177270346 100644 --- a/benchmarking/switchback/make_plot_with_jsonl.py +++ b/benchmarking/switchback/make_plot_with_jsonl.py @@ -33,7 +33,7 @@ ('global_fwd', '^', '--', 'C4', 'Int8 Matmul XW (switchback)'), ('global_bwd', '^', '-.', 'C4', 'Int8 Matmul GW (switchback)'), - + ('x_quantize_rowwise', 'P', '--', 'C4', 'Quantize rowwise X (switchback)'), ('g_quantize_rowwise', 'P', '-.', 'C4', 'Quantize rowwise G (switchback)'), ('w_quantize_global', '.', '--', 'C4', 'Quatnize global W (switchback)'), @@ -55,7 +55,7 @@ y_ += df_[k_].values[0] ys.append(y_ * 0.5) - + ax.plot(xs, ys, color=color, label=name, marker=marker, markersize=5 if marker=='s' else 5, linestyle=ls, linewidth=2 if '+' in k else 1.) @@ -67,7 +67,7 @@ ax.set_xscale('log') if logscale_plot1: ax.set_yscale('log') - + ax.tick_params(axis='x', labelsize=11) ax.tick_params(axis='y', labelsize=11) @@ -91,7 +91,7 @@ ('standard_gx+standard_gw+standard_fwd', 's', '-', 'C2', 'Standard fp16 (total time)'), ('x_quantize_rowwise+g_quantize_rowwise+w_quantize_global+w_quantize_global_transpose+standard_gw+global_fwd+global_bwd', 'o', '-', 'C4', 'SwitchBack int8 (total time)'), ]: - + xs, ys = [], [] df = rdf[rdf.batch_size == batch_size] for embed_dim in dims_to_consider: @@ -133,4 +133,3 @@ plt.savefig('speed_benchmark/plot_with_info.pdf', bbox_inches='tight') - diff --git a/benchmarking/switchback/speed_benchmark.py b/benchmarking/switchback/speed_benchmark.py index d70df0386..c4f3cd4c6 100644 --- a/benchmarking/switchback/speed_benchmark.py +++ b/benchmarking/switchback/speed_benchmark.py @@ -42,7 +42,7 @@ def get_time(k, fn, info_dict): for dim in [1024, 1280, 1408, 1664, 2048, 4096]: # note "batch_size" is actually "batch_size * embed_dim", which is why it's large for batch_size in [256*32, 256*64, 256*128, 256*256, 256*512]: - + # switch switches dim_in and dim_out for switch in [False, True]: @@ -62,7 +62,7 @@ def get_time(k, fn, info_dict): x = torch.randn(batch_size, dim_in, dtype=torch.float16).cuda() g = torch.randn(batch_size, dim_out, dtype=torch.float16).cuda() w = torch.randn(dim_out, dim_in, dtype=torch.float16).cuda() - + x_int8 = x.clone().to(torch.int8) g_int8 = g.clone().to(torch.int8) w_int8 = w.clone().to(torch.int8) diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py index a34385b1f..0db9df343 100644 --- a/bitsandbytes/cuda_setup/main.py +++ b/bitsandbytes/cuda_setup/main.py @@ -210,7 +210,7 @@ def remove_non_existent_dirs(candidate_paths: Set[Path]) -> Set[Path]: if path.exists(): existent_directories.add(path) except PermissionError: - # Handle the PermissionError first as it is a subtype of OSError + # Handle the PermissionError first as it is a subtype of OSError # https://docs.python.org/3/library/exceptions.html#exception-hierarchy pass except OSError as exc: diff --git a/bitsandbytes/optim/adamw.py b/bitsandbytes/optim/adamw.py index 9ea5812ea..17383eed5 100644 --- a/bitsandbytes/optim/adamw.py +++ b/bitsandbytes/optim/adamw.py @@ -35,4 +35,3 @@ class PagedAdamW32bit(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True): super().__init__( "adam", params, lr, betas, eps, weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) - diff --git 
a/bitsandbytes/research/autograd/_functions.py b/bitsandbytes/research/autograd/_functions.py index e515bfeff..7d869e39a 100644 --- a/bitsandbytes/research/autograd/_functions.py +++ b/bitsandbytes/research/autograd/_functions.py @@ -83,7 +83,7 @@ def backward(ctx, grad_output): # fp8out_transpose = fp8out_transpose.view(grad_output.shape[0], grad_output.shape[1], grad_output.shape[2]) # not supported by PyTorch. TODO: create work-around - if req_gradA: + if req_gradA: grad_A = torch.matmul(fp8out, B.t().to(fp8out.dtype)).to(A.dtype) if req_gradB: @@ -167,7 +167,7 @@ def backward(ctx, grad_output): # fp8out_transpose = fp8out_transpose.view(grad_output.shape[0], grad_output.shape[1], grad_output.shape[2]) # not supported by PyTorch. TODO: create work-around - if req_gradA: + if req_gradA: grad_A = torch.matmul(fp8out, B.t().to(fp8out.dtype)).to(A.dtype) if req_gradB: diff --git a/bitsandbytes/triton/dequantize_rowwise.py b/bitsandbytes/triton/dequantize_rowwise.py index daa59da9c..3d7529852 100644 --- a/bitsandbytes/triton/dequantize_rowwise.py +++ b/bitsandbytes/triton/dequantize_rowwise.py @@ -50,7 +50,7 @@ def _dequantize_rowwise( max_val = tl.load(state_x + pid) output = max_val * x * inv_127 tl.store(output_ptr + offsets, output, mask=row_mask) - + def dequantize_rowwise(x: torch.Tensor, state_x: torch.Tensor): output = torch.empty(*x.shape, device=x.device, dtype=torch.float16) diff --git a/bitsandbytes/triton/int8_matmul_mixed_dequantize.py b/bitsandbytes/triton/int8_matmul_mixed_dequantize.py index 1b80ab1a0..dc3047d7e 100644 --- a/bitsandbytes/triton/int8_matmul_mixed_dequantize.py +++ b/bitsandbytes/triton/int8_matmul_mixed_dequantize.py @@ -120,7 +120,7 @@ def _int8_matmul_mixed_dequantize(A, B, C, bias, state_x_ptr, state_w_ptr, M, N, acc += tl.dot(a, b) A += BLOCK_K * SPLIT_K * stride_ak B += BLOCK_K * SPLIT_K * stride_bk - + acc = (w_factor * (x_factor * (acc * divfactor))) acc = acc.to(C.dtype.element_ty) diff --git a/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py b/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py index 1f28b0d10..4881e1468 100644 --- a/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py +++ b/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py @@ -119,7 +119,7 @@ def _int8_matmul_rowwise_dequantize(A, B, C, bias, state_x_ptr, state_w_ptr, M, acc += tl.dot(a, b) A += BLOCK_K * SPLIT_K * stride_ak B += BLOCK_K * SPLIT_K * stride_bk - + acc = (w_factor * (x_factor * (acc * divfactor))) acc = acc.to(C.dtype.element_ty) diff --git a/bitsandbytes/triton/quantize_columnwise_and_transpose.py b/bitsandbytes/triton/quantize_columnwise_and_transpose.py index fcadaba3e..e7961cf53 100644 --- a/bitsandbytes/triton/quantize_columnwise_and_transpose.py +++ b/bitsandbytes/triton/quantize_columnwise_and_transpose.py @@ -54,7 +54,7 @@ def _quantize_columnwise_and_transpose( max_val = tl.max(tl.where(p2_arange_mask, abs_x, 0), axis=0) output = tl.libdevice.llrint(127. 
* (x / max_val)) - new_start = pid * M + new_start = pid * M new_offsets = new_start + p2_arange tl.store(output_ptr + new_offsets, output, mask=p2_arange_mask) tl.store(output_maxs + pid, max_val) @@ -71,4 +71,3 @@ def quantize_columnwise_and_transpose(x: torch.Tensor): grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) _quantize_columnwise_and_transpose[grid](x, output, output_maxs, n_elements, M, N, BLOCK_SIZE=M, P2=P2) return output, output_maxs - diff --git a/bitsandbytes/triton/quantize_global.py b/bitsandbytes/triton/quantize_global.py index a73a5bbaa..5cf194744 100644 --- a/bitsandbytes/triton/quantize_global.py +++ b/bitsandbytes/triton/quantize_global.py @@ -59,27 +59,27 @@ def quantize_global(x: torch.Tensor): key=['M', 'N'] ) @triton.jit - def _quantize_global_transpose(A, absmax_inv_ptr, B, stride_am, stride_an, stride_bn, stride_bm, M, N, - BLOCK_M : tl.constexpr, - BLOCK_N : tl.constexpr, + def _quantize_global_transpose(A, absmax_inv_ptr, B, stride_am, stride_an, stride_bn, stride_bm, M, N, + BLOCK_M : tl.constexpr, + BLOCK_N : tl.constexpr, GROUP_M : tl.constexpr): pid = tl.program_id(0) grid_m = (M + BLOCK_M - 1) // BLOCK_M grid_n = (N + BLOCK_N - 1) // BLOCK_N - + width = GROUP_M * grid_n group_id = pid // width group_size = min(grid_m - group_id * GROUP_M, GROUP_M) pid_m = group_id * GROUP_M + (pid % group_size) pid_n = (pid % width) // group_size - + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) A = A + (rm[:, None] * stride_am + rn[None, :] * stride_an) mask = (rm < M)[:, None] & (rn < N)[None, :] a = tl.load(A, mask=mask) absmax_inv = tl.load(absmax_inv_ptr) - + # rematerialize to save registers rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N) @@ -95,12 +95,11 @@ def quantize_global_transpose(input): absmax_inv = 1./ absmax M, N = input.shape out = torch.empty(N, M, device='cuda', dtype=torch.int8) - + assert out.size(0) == N and out.size(1) == M assert input.stride(0) == 1 or input.stride(1) == 1 assert out.stride(0) == 1 or out.stride(1) == 1 - + grid = lambda META: (triton.cdiv(M, META['BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']),) _quantize_global_transpose[grid](input, absmax_inv, out, input.stride(0), input.stride(1), out.stride(0), out.stride(1), M, N) return out, absmax - diff --git a/bitsandbytes/triton/quantize_rowwise.py b/bitsandbytes/triton/quantize_rowwise.py index fce464b19..078f4aa2d 100644 --- a/bitsandbytes/triton/quantize_rowwise.py +++ b/bitsandbytes/triton/quantize_rowwise.py @@ -46,7 +46,7 @@ def _quantize_rowwise( offsets = block_start + arange row_mask = arange < BLOCK_SIZE x = tl.load(x_ptr + offsets, mask=row_mask) - + abs_x = tl.abs(x) max_val = tl.max(tl.where(row_mask, abs_x, 0), axis=0) output = tl.libdevice.llrint(127. 
* (x / max_val)) @@ -64,4 +64,3 @@ def quantize_rowwise(x: torch.Tensor): grid = lambda meta: (x.shape[0],) _quantize_rowwise[grid](x, output, output_maxs, n_elements, BLOCK_SIZE=x.shape[1], P2=P2) return output, output_maxs - diff --git a/compile_from_source.md b/compile_from_source.md index 23afe1591..6310fd6c6 100644 --- a/compile_from_source.md +++ b/compile_from_source.md @@ -12,10 +12,10 @@ You can install CUDA locally without sudo by following the following steps: wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh # Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH # CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122} -# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True +# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True # For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc -bash install_cuda.sh 117 ~/local 1 +bash install_cuda.sh 117 ~/local 1 ``` By default, the Makefile will look at your `CUDA_HOME` environmental variable to find your CUDA version for compiling the library. If this path is not set it is inferred from the path of your `nvcc` compiler. @@ -37,4 +37,3 @@ If you have problems compiling the library with these instructions from source, ## Compilation with Kepler Since 0.39.1 bitsandbytes installed via pip no longer provides Kepler binaries and these need to be compiled from source. Follow the steps above and instead of `cuda11x_nomatmul` etc use `cuda11x_nomatmul_kepler` - diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 0fff83665..f117547ed 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -110,7 +110,7 @@ __device__ float dDequantizeFP4Tree(unsigned char val, float absmax) return 1.00000000f*absmax*sign; // 1011 else return 0.66666667f*absmax*sign; // 1010 - else + else if((val & 0b0001) == 1) // 100 return 5.208333333e-03f*absmax*sign; // 1001 else @@ -174,36 +174,36 @@ __device__ half dhDequantizeNF4(unsigned char val) if((val & 0b0100) == 4) // 1 if((val & 0b0010) == 2) // 11 if((val & 0b0001) == 1) // 111 - return 1.0f; + return 1.0f; else return 0.7229568362236023f; else if((val & 0b0001) == 1) // 110 - return 0.5626170039176941f; + return 0.5626170039176941f; else - return 0.44070982933044434f; + return 0.44070982933044434f; else if((val & 0b0010) == 2) //10 if((val & 0b0001) == 1) // 101 - return 0.33791524171829224f; + return 0.33791524171829224f; else - return 0.24611230194568634f; - else + return 0.24611230194568634f; + else if((val & 0b0001) == 1) // 100 - return 0.16093020141124725f; + return 0.16093020141124725f; else - return 0.07958029955625534f; + return 0.07958029955625534f; else if((val & 0b0100) == 4) // 0 if((val & 0b0010) == 2) //01 if((val & 0b0001) == 1) // 011 - return 0.0f; + return 0.0f; else - return -0.09105003625154495f; + return -0.09105003625154495f; else if((val & 0b0001) == 1) // 010 - return -0.18477343022823334f; + return -0.18477343022823334f; else return -0.28444138169288635f; else @@ -211,12 +211,12 @@ __device__ half dhDequantizeNF4(unsigned char val) if((val & 0b0001) == 1) // 001 return -0.39491748809814453f; else - return -0.5250730514526367f; - else + return -0.5250730514526367f; + else if((val & 0b0001) == 1) // 000 - return -0.6961928009986877f; + return -0.6961928009986877f; else - return -1.0f; + return -1.0f; } @@ -229,36 +229,36 @@ __device__ float dDequantizeNF4(unsigned char val) if((val & 0b0100) == 4) // 1 if((val & 0b0010) == 2) // 11 if((val & 0b0001) == 1) // 111 - return 1.0f; + 
return 1.0f; else return 0.7229568362236023f; else if((val & 0b0001) == 1) // 110 - return 0.5626170039176941f; + return 0.5626170039176941f; else - return 0.44070982933044434f; + return 0.44070982933044434f; else if((val & 0b0010) == 2) //10 if((val & 0b0001) == 1) // 101 - return 0.33791524171829224f; + return 0.33791524171829224f; else - return 0.24611230194568634f; - else + return 0.24611230194568634f; + else if((val & 0b0001) == 1) // 100 - return 0.16093020141124725f; + return 0.16093020141124725f; else - return 0.07958029955625534f; + return 0.07958029955625534f; else if((val & 0b0100) == 4) // 0 if((val & 0b0010) == 2) //01 if((val & 0b0001) == 1) // 011 - return 0.0f; + return 0.0f; else - return -0.09105003625154495f; + return -0.09105003625154495f; else if((val & 0b0001) == 1) // 010 - return -0.18477343022823334f; + return -0.18477343022823334f; else return -0.28444138169288635f; else @@ -266,12 +266,12 @@ __device__ float dDequantizeNF4(unsigned char val) if((val & 0b0001) == 1) // 001 return -0.39491748809814453f; else - return -0.5250730514526367f; - else + return -0.5250730514526367f; + else if((val & 0b0001) == 1) // 000 - return -0.6961928009986877f; + return -0.6961928009986877f; else - return -1.0f; + return -1.0f; } @@ -1863,7 +1863,7 @@ kOptimizerStatic8bit2StateBlockwise(T* p, T* __restrict__ const g, unsigned char //float ratio = (g_val*g_val)/fmaxf(s2_vals[j], eps*eps); //g_val = ratio > 2.0f ? 2.0f*g_val/ratio : g_val; g_val *= gnorm_scale; - + s2_vals[j] = (s2_vals[j]*beta2) + (((1.0f-beta2)*g_val*g_val)); s1_vals[j] = smem_quantiles1[lane_id][c1s[j]]*absmax1[i/BLOCK_SIZE]; @@ -3069,7 +3069,7 @@ template __global__ void kExtractOutliers(char *A, int *idx, char * //// use k warps per thread block //// 1. threadblock use read-only cache to read in register tile for A into shared memory //// 2. each warp loops over shared memory tiles of A of size 8x16 and loads them into fragments -//// 3. each warp reads a segment of values 16x32 from B +//// 3. each warp reads a segment of values 16x32 from B //// 4. do dequantization from register of B into second pair of registers //// 5. store (4) into fragment //// 6. 
matmul aggregate into fragment C @@ -3531,7 +3531,7 @@ template __global__ void kgemm_4bit_inference(int M, i template __global__ void kgemm_4bit_inference_naive(int M, int N, int K, T * __restrict__ const A, unsigned char *B, float *absmax, const float *datatype, T * out, int lda, int ldb, int ldc, int blocksize) { - // per threadblock: + // per threadblock: // load step-by-step in chunks of [32,warps]: 1x32 * [32,warps] -> [1,warps] // 4 warps -> 4 loads per iter // 1x32 * 32x4 -> 1x4 outputs per thread block @@ -3764,7 +3764,7 @@ template __global__ void kfunc(T *A, T *B, T value, long { switch(FUNC) { - case FILL: + case FILL: A[i] = (T)value; break; case ARANGE: diff --git a/csrc/pythonInterface.c b/csrc/pythonInterface.c index 865e4b6d5..087ae3921 100644 --- a/csrc/pythonInterface.c +++ b/csrc/pythonInterface.c @@ -389,7 +389,7 @@ extern "C" int hasPrefetch = 0; CUDA_CHECK_RETURN(cudaDeviceGetAttribute(&hasPrefetch, cudaDevAttrConcurrentManagedAccess, device)); // 40ns overhead if (hasPrefetch == 0) return; - + CUDA_CHECK_RETURN(cudaMemPrefetchAsync(ptr, bytes, device, 0)); CUDA_CHECK_RETURN(cudaPeekAtLastError()); } diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 28da69eb0..043597177 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -1,8 +1,8 @@ -- sections: +- sections: - local: index title: Bits & Bytes - local: quickstart title: Quickstart - local: installation title: Installation - title: Get started \ No newline at end of file + title: Get started diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 68ad433e6..67c928309 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -149,10 +149,10 @@ To compile from source, you need an installation of CUDA. If `nvcc` is not insta wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh # Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH # CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122} -# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True +# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True # For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc -bash install_cuda.sh 117 ~/local 1 +bash install_cuda.sh 117 ~/local 1 ``` To use a specific CUDA version just for a single compile run, you can set the variable `CUDA_HOME`, for example the following command compiles `libbitsandbytes_cuda117.so` using compiler flags for cuda11x with the cuda version at `~/local/cuda-11.7`: @@ -188,4 +188,4 @@ For 8-bit optimizers or quantization routines, please consider citing the follow journal={9th International Conference on Learning Representations, ICLR}, year={2022} } -``` \ No newline at end of file +``` diff --git a/docs/source/quickstart.mdx b/docs/source/quickstart.mdx index 4dff2ba46..d1028c655 100644 --- a/docs/source/quickstart.mdx +++ b/docs/source/quickstart.mdx @@ -6,7 +6,7 @@ ## Minimal example -The following code illustrates the steps above. +The following code illustrates the steps above. 
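The minimal example block that follows is still empty in `quickstart.mdx`. A sketch of what it could contain, based on the 8-bit optimizer usage documented in the README within this same series (the exact snippet is an assumption, not the shipped quickstart), is:

```python
import torch
import bitsandbytes as bnb

# Illustrative toy NLP-style model; shapes are arbitrary for the sketch.
# The README recommends bnb.nn.StableEmbedding in place of torch.nn.Embedding
# when training with 8-bit optimizers.
model = torch.nn.Sequential(
    bnb.nn.StableEmbedding(1024, 64),
    torch.nn.Linear(64, 64),
).cuda()

# Drop-in replacement for torch.optim.Adam with 8-bit optimizer states.
adam = bnb.optim.Adam8bit(model.parameters(), lr=0.001, betas=(0.9, 0.995))

# One training step on random token ids.
tokens = torch.randint(0, 1024, (8, 16), device="cuda")
loss = model(tokens).float().pow(2).mean()
loss.backward()
adam.step()
adam.zero_grad()
```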
```python -``` \ No newline at end of file +``` diff --git a/environment.yml b/environment.yml index c0e07f153..9ab48dedc 100644 --- a/environment.yml +++ b/environment.yml @@ -42,4 +42,4 @@ dependencies: ## ENV UPDATE: # # add new packages to environment.yml, then: -# mamba env update -n bnb -f environment.yml \ No newline at end of file +# mamba env update -n bnb -f environment.yml diff --git a/examples/int8_inference_huggingface.py b/examples/int8_inference_huggingface.py index dc80a44db..2cee48e8e 100644 --- a/examples/int8_inference_huggingface.py +++ b/examples/int8_inference_huggingface.py @@ -22,6 +22,3 @@ ) generated_ids = model.generate(input_ids, max_length=MAX_NEW_TOKENS) print(tokenizer.decode(generated_ids[0], skip_special_tokens=True)) - - - diff --git a/how_to_use_nonpytorch_cuda.md b/how_to_use_nonpytorch_cuda.md index b5f01fbe5..566b0170e 100644 --- a/how_to_use_nonpytorch_cuda.md +++ b/how_to_use_nonpytorch_cuda.md @@ -18,7 +18,7 @@ You can also install CUDA version that you need locally with a script provided b wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh # Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH # CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122} -# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True +# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True # For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc diff --git a/install_cuda.py b/install_cuda.py index 77e258609..4b041b8d0 100644 --- a/install_cuda.py +++ b/install_cuda.py @@ -49,13 +49,13 @@ def install_cuda(version, base_path, download_path): # Install CUDA print(f"Installing CUDA version {version}...") install_command = [ - "bash", filepath, - "--no-drm", "--no-man-page", "--override", + "bash", filepath, + "--no-drm", "--no-man-page", "--override", "--toolkitpath=" + install_path, "--toolkit", "--silent" ] print(f"Running command: {' '.join(install_command)}") - + try: subprocess.run(install_command, check=True) except subprocess.CalledProcessError as e: @@ -99,4 +99,4 @@ def main(): sys.exit(1) if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/scripts/stale.py b/scripts/stale.py index c299643ae..613f5b7cb 100644 --- a/scripts/stale.py +++ b/scripts/stale.py @@ -55,4 +55,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/tests/test_autograd.py b/tests/test_autograd.py index 7e70a30ca..d01e5e9db 100644 --- a/tests/test_autograd.py +++ b/tests/test_autograd.py @@ -519,4 +519,3 @@ def test_matmul_fp8( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose): torch.testing.assert_close( gradB1, gradB2, atol=0.18, rtol=0.3 ) - diff --git a/tests/test_cuda_setup_evaluator.py b/tests/test_cuda_setup_evaluator.py index 5e1a548e5..189aa75b5 100644 --- a/tests/test_cuda_setup_evaluator.py +++ b/tests/test_cuda_setup_evaluator.py @@ -19,11 +19,3 @@ def test_manual_override(requires_cuda): import bitsandbytes as bnb loaded_lib = bnb.cuda_setup.main.CUDASetup.get_instance().binary_name #assert loaded_lib == 'libbitsandbytes_cuda122.so' - - - - - - - - diff --git a/tests/test_functional.py b/tests/test_functional.py index f4b8fca51..2d4e959ad 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -2345,5 +2345,3 @@ def test_gemv_eye_4bit(storage_type, dtype, double_quant): torch.testing.assert_close(A, C2) #torch.testing.assert_close(A, C1, rtol=1e-5, atol=0.00001) 
#torch.testing.assert_close(A, C2, rtol=1e-5, atol=0.080) - - diff --git a/tests/test_generation.py b/tests/test_generation.py index 9ed30cd2a..b05749bf8 100644 --- a/tests/test_generation.py +++ b/tests/test_generation.py @@ -120,6 +120,3 @@ def test_pi(requires_cuda, model_and_tokenizer, inference_kernel, DQ, dtype): for out in outputs: print(out) raise ValueError(f'Failure count: {failure_count}/{n_cases}') - - - diff --git a/tests/test_modules.py b/tests/test_modules.py index 1cb04044f..32d90938d 100644 --- a/tests/test_modules.py +++ b/tests/test_modules.py @@ -637,6 +637,3 @@ def test_4bit_warnings(): net(inp) assert len(record) == 2 - - - diff --git a/tests/test_triton.py b/tests/test_triton.py index 943db067a..218a533d5 100644 --- a/tests/test_triton.py +++ b/tests/test_triton.py @@ -58,4 +58,3 @@ def test_switchback(vector_wise_quantization): print('GX1', err_sb, err_baseline) assert err_sb < 2 * err_baseline - From c4079bde92bbeb1a4b0cce8bd1be336d420ef607 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Thu, 1 Feb 2024 13:40:43 +0200 Subject: [PATCH 024/112] CI: don't build docs in private PRs --- .github/workflows/build_pr_documentation.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index 40ea8b5bc..d6455fd11 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -9,6 +9,7 @@ concurrency: jobs: build: + if: github.repository == 'TimDettmers/bitsandbytes' uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main with: commit_sha: ${{ github.event.pull_request.head.sha }} From f718770592c7541d703b8a82b696d365fb1d7412 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Thu, 1 Feb 2024 13:42:59 +0200 Subject: [PATCH 025/112] CI: set concurrency limits to avoid extra builds --- .github/workflows/cmake.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 35d1b0240..ae4037b3b 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -6,6 +6,10 @@ on: pull_request: branches: [ "main" ] +concurrency: + group: cmake-${{ github.ref }} + cancel-in-progress: true + jobs: build: runs-on: ${{ matrix.os }} From c62d8316ab309559688eb19c72f69046e0435dfc Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Thu, 1 Feb 2024 13:46:17 +0200 Subject: [PATCH 026/112] CI: don't include compiler type in matrix --- .github/workflows/cmake.yml | 37 +++++++++++++++---------------------- 1 file changed, 15 insertions(+), 22 deletions(-) diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index ae4037b3b..35585e3d7 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -23,19 +23,6 @@ jobs: python-version: ['3.10', '3.11'] cuda-version: ['11.8', '12.1'] build_type: [Release] - c_compiler: [gcc, cl] - include: - - os: windows-latest - c_compiler: cl - cpp_compiler: cl - - os: ubuntu-latest - c_compiler: gcc - cpp_compiler: g++ - exclude: - - os: ubuntu-latest - c_compiler: cl - - os: windows-latest - c_compiler: gcc steps: - uses: actions/checkout@v4 @@ -101,8 +88,13 @@ jobs: echo CUDA_PATH=$CUDA_HOME >> "$GITHUB_ENV" if [ "${{ matrix.os }}" = "windows-latest" ]; then + echo CXX_COMPILER=cl >> "$GITHUB_ENV" + echo C_COMPILER=cl >> "$GITHUB_ENV" # without -DCMAKE_CUDA_COMPILER=nvcc, cmake config always fail for cuda-11.8 echo DCMAKE_CUDA_COMPILER=-DCMAKE_CUDA_COMPILER=nvcc >> "$GITHUB_ENV" + else + echo 
CXX_COMPILER=g++ >> "$GITHUB_ENV" + echo C_COMPILER=gcc >> "$GITHUB_ENV" fi nvcc --version @@ -113,26 +105,27 @@ jobs: - name: Prep build run: python -m pip install cmake==3.27.9 ninja setuptools wheel - - name: Configure CMake + # TODO: the following steps (CUDA, NOBLASLT, CPU) could be moved to the matrix, so they're built in parallel + + - name: Configure CUDA run: > cmake -B ${{ steps.strings.outputs.build-output-dir }} -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }} - -DCMAKE_CXX_COMPILER=${{ matrix.cpp_compiler }} - -DCMAKE_C_COMPILER=${{ matrix.c_compiler }} + -DCMAKE_CXX_COMPILER=${{ env.CXX_COMPILER }} + -DCMAKE_C_COMPILER=${{ env.C_COMPILER }} -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DCOMPUTE_CAPABILITY="50;52;60;61;62;70;72;75;80;86;87;89;90" -S ${{ github.workspace }} - - name: Build - # Build your program with the given configuration. Note that --config is needed because the default Windows generator is a multi-config generator (Visual Studio generator). + - name: Build CUDA run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }} - name: Configure NOBLASLT run: > cmake -B ${{ steps.strings.outputs.build-output-dir }} -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }} - -DCMAKE_CXX_COMPILER=${{ matrix.cpp_compiler }} - -DCMAKE_C_COMPILER=${{ matrix.c_compiler }} + -DCMAKE_CXX_COMPILER=${{ env.CXX_COMPILER }} + -DCMAKE_C_COMPILER=${{ env.C_COMPILER }} -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DCOMPUTE_CAPABILITY="50;52;60;61;62;70;72;75;80;86;87;89;90" -DNO_CUBLASLT=ON @@ -145,8 +138,8 @@ jobs: run: > cmake -B ${{ steps.strings.outputs.build-output-dir }} -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }} - -DCMAKE_CXX_COMPILER=${{ matrix.cpp_compiler }} - -DCMAKE_C_COMPILER=${{ matrix.c_compiler }} + -DCMAKE_CXX_COMPILER=${{ env.CXX_COMPILER }} + -DCMAKE_C_COMPILER=${{ env.C_COMPILER }} -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DNO_CUBLASLT=ON -DBUILD_CUDA=OFF From 1cbe9fd82c51095c565e4e0cc0ba8ddc3ea7fd3b Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Thu, 1 Feb 2024 13:46:51 +0200 Subject: [PATCH 027/112] CI: remove ctest phase (there are no CMake tests) --- .github/workflows/cmake.yml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 35585e3d7..737824ccf 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -148,12 +148,6 @@ jobs: - name: Build CPU run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }} - - name: Test - working-directory: ${{ steps.strings.outputs.build-output-dir }} - # Execute tests defined by the CMake configuration. Note that --build-config is needed because the default Windows generator is a multi-config generator (Visual Studio generator). 
- # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail - run: ctest --build-config ${{ matrix.build_type }} - - name: Build dist shell: bash -el {0} run: | From 6ec252060e9d47176ca2e1357cf43598e8db0477 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Thu, 1 Feb 2024 15:34:12 +0200 Subject: [PATCH 028/112] CI: build only on Python 3.10 (Python 3.11 can use the same wheel) --- .github/workflows/cmake.yml | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 737824ccf..728dd09fb 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -20,16 +20,11 @@ jobs: matrix: os: [ubuntu-latest, windows-latest] - python-version: ['3.10', '3.11'] cuda-version: ['11.8', '12.1'] build_type: [Release] steps: - uses: actions/checkout@v4 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - name: Set up MSVC if: matrix.os == 'windows-latest' @@ -52,7 +47,7 @@ jobs: environment-file: environment-bnb.yml use-only-tar-bz2: false auto-activate-base: true - python-version: ${{ matrix.python-version }} + python-version: "3.10" mamba-version: "*" - name: Set reusable strings @@ -159,6 +154,6 @@ jobs: - name: Upload Build Artifacts uses: actions/upload-artifact@v4.3.0 with: - name: bitsandbytes-${{ matrix.os }}-${{ matrix.python-version }}-${{ matrix.cuda-version }} + name: bitsandbytes-${{ matrix.os }}-${{ matrix.cuda-version }} path: | ${{ github.workspace }}/dist/ From 1a55fb70f9d704faa70469db7b1009f9aa7dd3a9 Mon Sep 17 00:00:00 2001 From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Thu, 1 Feb 2024 12:02:13 -0300 Subject: [PATCH 029/112] bump version to dev for docs --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b109d9454..c493b8b62 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ def read(fname): setup( name="bitsandbytes", - version="0.42.0", + version="0.43.0.dev0", author="Tim Dettmers", author_email="dettmers@cs.washington.edu", description="k-bit optimizers and matrix multiplication routines.", From 3ba076d3fb3f1590a049f5715c446514cff3a8c0 Mon Sep 17 00:00:00 2001 From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Thu, 1 Feb 2024 12:03:36 -0300 Subject: [PATCH 030/112] bump version to dev for docs --- bitsandbytes/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py index 87307a9d2..e54e933d9 100644 --- a/bitsandbytes/__init__.py +++ b/bitsandbytes/__init__.py @@ -24,6 +24,6 @@ "optim.optimizer.MockArgs": False, } -__version__ = "0.42.0" +__version__ = "0.43.0.dev" PACKAGE_GITHUB_URL = "https://github.com/TimDettmers/bitsandbytes" From f0e3b72fbfc04d3f5628b2f427549a2870d08e6a Mon Sep 17 00:00:00 2001 From: Rickard Date: Sat, 3 Feb 2024 14:52:11 +0100 Subject: [PATCH 031/112] Add build artifacts to .gitignore --- .gitignore | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 2f929968b..202dcb13d 100644 --- a/.gitignore +++ b/.gitignore @@ -2,9 +2,26 @@ __pycache__/ *.py[cod] *$py.class - -# C extensions *.so +*.dll +*.dylib +*.o +*.obj +*.air +*.metallib + +# CMake generated files +CMakeCache.txt +CMakeScripts/ +cmake_install.cmake +Makefile +CMakeFiles/ +*.sln +*.vcxproj* +*.xcodeproj/ +bitsandbytes.dir/ +Debug/ +Release/ # Distribution / 
packaging .Python @@ -133,4 +150,5 @@ dmypy.json dependencies cuda_build +output/ .vscode/* From c1a7952ceeafe978070c1783794ea0ba15a2477b Mon Sep 17 00:00:00 2001 From: Rickard Date: Sat, 3 Feb 2024 18:46:57 +0100 Subject: [PATCH 032/112] Update .gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 202dcb13d..46316e4b3 100644 --- a/.gitignore +++ b/.gitignore @@ -23,6 +23,9 @@ bitsandbytes.dir/ Debug/ Release/ +# IDE local files +.vs/ + # Distribution / packaging .Python build/ From ecf51cb6786a75355a51fb98a643f47d25e48bd0 Mon Sep 17 00:00:00 2001 From: Rickard Date: Sun, 4 Feb 2024 18:58:02 +0100 Subject: [PATCH 033/112] Fixed VS Code format on save (#1028) * Fixed VS Code format on save * Re-saved the json files using the new settings --- .editorconfig | 3 +++ .gitignore | 1 - .vscode/extensions.json | 7 +++++++ .vscode/settings.json | 7 +++++++ 4 files changed, 17 insertions(+), 1 deletion(-) create mode 100644 .editorconfig create mode 100644 .vscode/extensions.json create mode 100644 .vscode/settings.json diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 000000000..03490db50 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,3 @@ +[*] +trim_trailing_whitespace = true +insert_final_newline = true diff --git a/.gitignore b/.gitignore index 46316e4b3..22f5a6cd6 100644 --- a/.gitignore +++ b/.gitignore @@ -154,4 +154,3 @@ dmypy.json dependencies cuda_build output/ -.vscode/* diff --git a/.vscode/extensions.json b/.vscode/extensions.json new file mode 100644 index 000000000..939843f43 --- /dev/null +++ b/.vscode/extensions.json @@ -0,0 +1,7 @@ +{ + "recommendations": [ + "ms-python.python", + "charliermarsh.ruff", + "twxs.cmake" + ] +} diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 000000000..906f28588 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,7 @@ +{ + "ruff.fixAll": true, + "ruff.lint.run": "onType", + "editor.codeActionsOnSave": { + "source.fixAll": "always" + } +} From acc7fb3730aaad28a3f2ccd1903a00379a189e4b Mon Sep 17 00:00:00 2001 From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Sun, 4 Feb 2024 16:12:21 -0300 Subject: [PATCH 034/112] documentation release v1 (#1012) * add optional dependency for preview to environment.yml * Add additional sections, first optimizers, MacOS WIP * drafting + refactoring new docs * some changes * run pre-commit hooks * add mention of pre-commit to contributing * fix * test autodoc * new additions * add subtilte * add some content * add more methods * fix * further docs updates * Update _toctree.yml * fix link * run pre-commit hooks * refactor + further docs * Update README.md with new docs link Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> * list of blog posts Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> * list of blog posts Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> * accept change suggestion Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> * accept suggestion Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> * accept suggestion Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> * Update docs/source/integrations.mdx Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> * index instead of intro * fixup README, add docs link * add instructions for creating docstrings * final polish (except 
integrations) * fill out integrations section --------- Co-authored-by: younesbelkada Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- README.md | 192 +----------------- bitsandbytes/nn/modules.py | 129 ++++++++++++ compile_from_source.md | 39 ---- docs/source/_toctree.yml | 32 ++- docs/source/algorithms.mdx | 12 ++ docs/source/compiling.mdx | 41 ++++ docs/source/contributing.mdx | 20 ++ .../source/errors.mdx | 15 +- docs/source/faqs.mdx | 7 + docs/source/index.mdx | 192 +----------------- docs/source/installation.mdx | 22 +- docs/source/integrations.mdx | 42 ++++ .../source/nonpytorchcuda.mdx | 8 +- docs/source/optimizers.mdx | 190 +++++++++++++++++ docs/source/quantization.mdx | 13 ++ docs/source/quickstart.mdx | 7 +- docs/source/resources.mdx | 92 +++++++++ environment.yml | 1 + howto_config_override.md | 40 ---- 20 files changed, 631 insertions(+), 465 deletions(-) delete mode 100644 compile_from_source.md create mode 100644 docs/source/algorithms.mdx create mode 100644 docs/source/compiling.mdx create mode 100644 docs/source/contributing.mdx rename errors_and_solutions.md => docs/source/errors.mdx (68%) create mode 100644 docs/source/faqs.mdx create mode 100644 docs/source/integrations.mdx rename how_to_use_nonpytorch_cuda.md => docs/source/nonpytorchcuda.mdx (76%) create mode 100644 docs/source/optimizers.mdx create mode 100644 docs/source/quantization.mdx create mode 100644 docs/source/resources.mdx delete mode 100644 howto_config_override.md diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 039139b95..feb6c766e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.1.15 + rev: v0.2.0 hooks: - id: ruff args: diff --git a/README.md b/README.md index 61dede8c1..43eadf5a3 100644 --- a/README.md +++ b/README.md @@ -1,195 +1,17 @@ -# bitsandbytes +# `bitsandbytes` -The bitsandbytes is a lightweight wrapper around CUDA custom functions, in particular 8-bit optimizers, matrix multiplication (LLM.int8()), and quantization functions. +The `bitsandbytes` library is a lightweight Python wrapper around CUDA custom functions, in particular 8-bit optimizers, matrix multiplication (LLM.int8()), and 8 & 4-bit quantization functions. +The library includes quantization primitives for 8-bit & 4-bit operations, through `bitsandbytes.nn.Linear8bitLt` and `bitsandbytes.nn.Linear4bit` and 8-bit optimizers through `bitsandbytes.optim` module. +There are ongoing efforts to support further hardware backends, i.e. Intel CPU + GPU, AMD GPU, Apple Silicon. Windows support is quite far along and is on its way as well. -Resources: -- [8-bit Optimizer Paper](https://arxiv.org/abs/2110.02861) -- [Video](https://www.youtube.com/watch?v=IxrlHAJtqKE) -- [Docs](https://bitsandbytes.readthedocs.io/en/latest/) +**Please head to the official documentation page:** -- [LLM.int8() Paper](https://arxiv.org/abs/2208.07339) -- [LLM.int8() Software Blog Post](https://huggingface.co/blog/hf-bitsandbytes-integration) -- [LLM.int8() Emergent Features Blog Post](https://timdettmers.com/2022/08/17/llm-int8-and-emergent-features/) - -## TL;DR -**Requirements** -Python >=3.8. Linux distribution (Ubuntu, MacOS, etc.) + CUDA > 10.0. - -(Deprecated: CUDA 10.0 is deprecated and only CUDA >= 11.0) will be supported with release 0.39.0) - -**Installation**: - -``pip install bitsandbytes`` - -In some cases it can happen that you need to compile from source. 
If this happens please consider submitting a bug report with `python -m bitsandbytes` information. What now follows is some short instructions which might work out of the box if `nvcc` is installed. If these do not work see further below. - -Compilation quickstart: -```bash -git clone https://github.com/timdettmers/bitsandbytes.git -cd bitsandbytes - -# CUDA_VERSIONS in {110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122} -# make argument in {cuda110, cuda11x, cuda12x} -# if you do not know what CUDA you have, try looking at the output of: python -m bitsandbytes -CUDA_VERSION=117 make cuda11x -python setup.py install -``` - -**Using Int8 inference with HuggingFace Transformers** - -```python -from transformers import AutoModelForCausalLM -model = AutoModelForCausalLM.from_pretrained( - 'decapoda-research/llama-7b-hf', - device_map='auto', - load_in_8bit=True, - max_memory={ - i: f'{int(torch.cuda.mem_get_info(i)[0]/1024**3)-2}GB' - for i in range(torch.cuda.device_count()) - } -) -``` - -A more detailed example, can be found in [examples/int8_inference_huggingface.py](examples/int8_inference_huggingface.py). - -**Using 8-bit optimizer**: -1. Comment out optimizer: ``#torch.optim.Adam(....)`` -2. Add 8-bit optimizer of your choice ``bnb.optim.Adam8bit(....)`` (arguments stay the same) -3. Replace embedding layer if necessary: ``torch.nn.Embedding(..) -> bnb.nn.Embedding(..)`` - - -**Using 8-bit Inference**: -1. Comment out torch.nn.Linear: ``#linear = torch.nn.Linear(...)`` -2. Add bnb 8-bit linear light module: ``linear = bnb.nn.Linear8bitLt(...)`` (base arguments stay the same) -3. There are two modes: - - Mixed 8-bit training with 16-bit main weights. Pass the argument ``has_fp16_weights=True`` (default) - - Int8 inference. Pass the argument ``has_fp16_weights=False`` -4. To use the full LLM.int8() method, use the ``threshold=k`` argument. We recommend ``k=6.0``. -```python -# LLM.int8() -linear = bnb.nn.Linear8bitLt(dim1, dim2, bias=True, has_fp16_weights=False, threshold=6.0) -# inputs need to be fp16 -out = linear(x.to(torch.float16)) -``` - - -## Features -- 8-bit Matrix multiplication with mixed precision decomposition -- LLM.int8() inference -- 8-bit Optimizers: Adam, AdamW, RMSProp, LARS, LAMB, Lion (saves 75% memory) -- Stable Embedding Layer: Improved stability through better initialization, and normalization -- 8-bit quantization: Quantile, Linear, and Dynamic quantization -- Fast quantile estimation: Up to 100x faster than other algorithms - -## Requirements & Installation - -Requirements: anaconda, cudatoolkit, pytorch - -Hardware requirements: - - LLM.int8(): NVIDIA Turing (RTX 20xx; T4) or Ampere GPU (RTX 30xx; A4-A100); (a GPU from 2018 or newer). - - 8-bit optimizers and quantization: NVIDIA Kepler GPU or newer (>=GTX 78X). - -Supported CUDA versions: 10.2 - 12.2 - -The bitsandbytes library is currently only supported on Linux distributions. Windows is not supported at the moment. - -The requirements can best be fulfilled by installing pytorch via anaconda. You can install PyTorch by following the ["Get Started"](https://pytorch.org/get-started/locally/) instructions on the official website. - -To install run: - -``pip install bitsandbytes`` - -## Using bitsandbytes - -### Using Int8 Matrix Multiplication - -For straight Int8 matrix multiplication with mixed precision decomposition you can use ``bnb.matmul(...)``. 
To enable mixed precision decomposition, use the threshold parameter: -```python -bnb.matmul(..., threshold=6.0) -``` - -For instructions how to use LLM.int8() inference layers in your own code, see the TL;DR above or for extended instruction see [this blog post](https://huggingface.co/blog/hf-bitsandbytes-integration). - -### Using the 8-bit Optimizers - -With bitsandbytes 8-bit optimizers can be used by changing a single line of code in your codebase. For NLP models we recommend also to use the StableEmbedding layers (see below) which improves results and helps with stable 8-bit optimization. To get started with 8-bit optimizers, it is sufficient to replace your old optimizer with the 8-bit optimizer in the following way: -```python -import bitsandbytes as bnb - -# adam = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.995)) # comment out old optimizer -adam = bnb.optim.Adam8bit(model.parameters(), lr=0.001, betas=(0.9, 0.995)) # add bnb optimizer -adam = bnb.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.995), optim_bits=8) # equivalent - - -torch.nn.Embedding(...) -> bnb.nn.StableEmbedding(...) # recommended for NLP models -``` - -Note that by default all parameter tensors with less than 4096 elements are kept at 32-bit even if you initialize those parameters with 8-bit optimizers. This is done since such small tensors do not save much memory and often contain highly variable parameters (biases) or parameters that require high precision (batch norm, layer norm). You can change this behavior like so: -```python -# parameter tensors with less than 16384 values are optimized in 32-bit -# it is recommended to use multiplies of 4096 -adam = bnb.optim.Adam8bit(model.parameters(), min_8bit_size=16384) -``` - -### Change Bits and other Hyperparameters for Individual Parameters - -If you want to optimize some unstable parameters with 32-bit Adam and others with 8-bit Adam, you can use the `GlobalOptimManager`. With this, we can also configure specific hyperparameters for particular layers, such as embedding layers. To do that, we need two things: (1) register the parameter while they are still on the CPU, (2) override the config with the new desired hyperparameters (anytime, anywhere). See our [guide](howto_config_override.md) for more details - -### Fairseq Users - -To use the Stable Embedding Layer, override the respective `build_embedding(...)` function of your model. Make sure to also use the `--no-scale-embedding` flag to disable scaling of the word embedding layer (nor replaced with layer norm). You can use the optimizers by replacing the optimizer in the respective file (`adam.py` etc.). - -## Release and Feature History - -For upcoming features and changes and full history see [Patch Notes](CHANGELOG.md). - -## Errors - -1. RuntimeError: CUDA error: no kernel image is available for execution on the device. [Solution](errors_and_solutions.md#No-kernel-image-available) -2. __fatbinwrap_.. [Solution](errors_and_solutions.md#fatbinwrap_) - -## Compile from source -To compile from source, you need an installation of CUDA. If `nvcc` is not installed, you can install the CUDA Toolkit with nvcc through the following commands. 
- -```bash -wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh -# Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH -# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122} -# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True - -# For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc -bash install_cuda.sh 117 ~/local 1 -``` - -To use a specific CUDA version just for a single compile run, you can set the variable `CUDA_HOME`, for example the following command compiles `libbitsandbytes_cuda117.so` using compiler flags for cuda11x with the cuda version at `~/local/cuda-11.7`: - -``CUDA_HOME=~/local/cuda-11.7 CUDA_VERSION=117 make cuda11x`` - -For more detailed instruction, please follow the [compile_from_source.md](compile_from_source.md) instructions. +**[https://huggingface.co/docs/bitsandbytes/main](https://huggingface.co/docs/bitsandbytes/main)** ## License -The majority of bitsandbytes is licensed under MIT, however portions of the project are available under separate license terms: Pytorch is licensed under the BSD license. +The majority of bitsandbytes is licensed under MIT, however small portions of the project are available under separate license terms, as the parts adapted from Pytorch are licensed under the BSD license. We thank Fabio Cannizzo for his work on [FastBinarySearch](https://github.com/fabiocannizzo/FastBinarySearch) which we use for CPU quantization. - -## How to cite us -If you found this library and found LLM.int8() useful, please consider citing our work: - -```bibtex -@article{dettmers2022llmint8, - title={LLM.int8(): 8-bit Matrix Multiplication for Transformers at Scale}, - author={Dettmers, Tim and Lewis, Mike and Belkada, Younes and Zettlemoyer, Luke}, - journal={arXiv preprint arXiv:2208.07339}, - year={2022} -} -``` - -For 8-bit optimizers or quantization routines, please consider citing the following work: - -```bibtex -@article{dettmers2022optimizers, - title={8-bit Optimizers via Block-wise Quantization}, - author={Dettmers, Tim and Lewis, Mike and Shleifer, Sam and Zettlemoyer, Luke}, - journal={9th International Conference on Learning Representations, ICLR}, - year={2022} -} -``` diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index 922feae15..6eeecc273 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -19,6 +19,42 @@ class StableEmbedding(torch.nn.Embedding): + """ + Custom embedding layer designed for stable training in NLP tasks. The stable + embedding layer improves stability during optimization for models with word + embeddings, addressing issues related to the non-uniform distribution of input + tokens. + + This stable embedding layer is initialized with Xavier uniform initialization, + followed by layer normalization. It is designed to support aggressive quantization, + addressing extreme gradient variations in non-uniform input distributions. The + stability of training is enhanced by using 32-bit optimizer states specifically + for this layer. 
+ + Example: + + ``` + # Initialize StableEmbedding layer with vocabulary size 1000, embedding dimension 300 + embedding_layer = StableEmbedding(num_embeddings=1000, embedding_dim=300) + + # Reset embedding parameters + embedding_layer.reset_parameters() + + # Perform a forward pass with input tensor + input_tensor = torch.tensor([1, 2, 3]) + output_embedding = embedding_layer(input_tensor) + ``` + + Attributes: + norm (torch.nn.LayerNorm): Layer normalization applied after the embedding. + + Methods: + reset_parameters(): Reset embedding parameters using Xavier uniform initialization. + forward(input: Tensor) -> Tensor: Forward pass through the stable embedding layer. + + Reference: + - [8-bit optimizer paper](https://arxiv.org/pdf/2110.02861.pdf) + """ def __init__( self, num_embeddings: int, @@ -32,6 +68,17 @@ def __init__( device=None, dtype=None, ) -> None: + """ + Args: + num_embeddings (`int`): The number of unique embeddings (vocabulary size). + embedding_dim (`int`): The dimensionality of the embedding. + padding_idx (`Optional[int]`): If specified, pads the output with zeros at the given index. + max_norm (`Optional[float]`): If given, renormalizes embeddings to have a maximum L2 norm. + norm_type (`float`, defaults to `2.0`): The p-norm to compute for the max_norm option. + scale_grad_by_freq (`bool`): Scale gradient by frequency during backpropagation. + sparse (`bool`): If True, computes sparse gradients; False, computes dense gradients. + _weight (`Optional[Tensor]`): Pre-trained embeddings. + """ super().__init__( num_embeddings, embedding_dim, @@ -222,8 +269,49 @@ def to(self, *args, **kwargs): class Linear4bit(nn.Linear): + """ + This class is the base module for the 4-bit quantization algorithm presented in [QLoRA](https://arxiv.org/abs/2305.14314). + QLoRA 4-bit linear layers uses blockwise k-bit quantization under the hood, with the possibility of selecting various + compute datatypes such as FP4 and NF4. + + In order to quantize a linear layer one should first load the original fp16 / bf16 weights into + the Linear8bitLt module, then call `quantized_module.to("cuda")` to quantize the fp16 / bf16 weights. + + Example: + + ```python + import torch + import torch.nn as nn + + import bitsandbytes as bnb + from bnb.nn import Linear4bit + fp16_model = nn.Sequential( + nn.Linear(64, 64), + nn.Linear(64, 64) + ) + + quantized_model = nn.Sequential( + Linear4bit(64, 64), + Linear4bit(64, 64) + ) + + quantized_model.load_state_dict(fp16_model.state_dict()) + quantized_model = quantized_model.to(0) # Quantization happens here + ``` + """ def __init__(self, input_features, output_features, bias=True, compute_dtype=None, compress_statistics=True, quant_type='fp4', quant_storage=torch.uint8, device=None): + """ + Initialize Linear4bit class. + + Args: + input_features (`str`): + Number of input features of the linear layer. + output_features (`str`): + Number of output features of the linear layer. + bias (`bool`, defaults to `True`): + Whether the linear class uses the bias term as well. 
+ """ super().__init__(input_features, output_features, bias, device) self.weight = Params4bit(self.weight.data, requires_grad=False, compress_statistics=compress_statistics, quant_type=quant_type, quant_storage=quant_storage, module=self) # self.persistent_buffers = [] # TODO consider as way to save quant state @@ -397,8 +485,49 @@ def maybe_rearrange_weight(state_dict, prefix, local_metadata, strict, missing_k class Linear8bitLt(nn.Linear): + """ + This class is the base module for the [LLM.int8()](https://arxiv.org/abs/2208.07339) algorithm. + To read more about it, have a look at the paper. + + In order to quantize a linear layer one should first load the original fp16 / bf16 weights into + the Linear8bitLt module, then call `int8_module.to("cuda")` to quantize the fp16 weights. + + Example: + + ```python + import torch + import torch.nn as nn + + import bitsandbytes as bnb + from bnb.nn import Linear8bitLt + + fp16_model = nn.Sequential( + nn.Linear(64, 64), + nn.Linear(64, 64) + ) + + int8_model = nn.Sequential( + Linear8bitLt(64, 64, has_fp16_weights=False), + Linear8bitLt(64, 64, has_fp16_weights=False) + ) + + int8_model.load_state_dict(fp16_model.state_dict()) + int8_model = int8_model.to(0) # Quantization happens here + ``` + """ def __init__(self, input_features, output_features, bias=True, has_fp16_weights=True, memory_efficient_backward=False, threshold=0.0, index=None, device=None): + """ + Initialize Linear8bitLt class. + + Args: + input_features (`str`): + Number of input features of the linear layer. + output_features (`str`): + Number of output features of the linear layer. + bias (`bool`, defaults to `True`): + Whether the linear class uses the bias term as well. + """ super().__init__(input_features, output_features, bias, device) assert not memory_efficient_backward, "memory_efficient_backward is no longer required and the argument is deprecated in 0.37.0 and will be removed in 0.39.0" self.state = bnb.MatmulLtState() diff --git a/compile_from_source.md b/compile_from_source.md deleted file mode 100644 index 6310fd6c6..000000000 --- a/compile_from_source.md +++ /dev/null @@ -1,39 +0,0 @@ -# Compiling from source - -Basic steps. -1. `CUDA_VERSION=XXX make [target]` where `[target]` is among `cuda92, cuda10x, cuda110, cuda11x, cuda12x, cpuonly` -2. `python setup.py install` - -To run these steps you will need to have the nvcc compiler installed that comes with a CUDA installation. If you use anaconda (recommended) then you can figure out which version of CUDA you are using with PyTorch via the command `conda list | grep cudatoolkit`. Then you can install the nvcc compiler by downloading and installing the same CUDA version from the [CUDA toolkit archive](https://developer.nvidia.com/cuda-toolkit-archive). - -You can install CUDA locally without sudo by following the following steps: - -```bash -wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh -# Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH -# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122} -# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True - -# For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc -bash install_cuda.sh 117 ~/local 1 -``` - -By default, the Makefile will look at your `CUDA_HOME` environmental variable to find your CUDA version for compiling the library. If this path is not set it is inferred from the path of your `nvcc` compiler. 
- -Either `nvcc` needs to be in path for the `CUDA_HOME` variable needs to be set to the CUDA directory root (e.g. `/usr/local/cuda`) in order for compilation to succeed - -If you type `nvcc` and it cannot be found, you might need to add to your path or set the CUDA_HOME variable. You can run `python -m bitsandbytes` to find the path to CUDA. For example if `python -m bitsandbytes` shows you the following: -``` -++++++++++++++++++ /usr/local CUDA PATHS +++++++++++++++++++ -/usr/local/cuda-11.7/targets/x86_64-linux/lib/libcudart.so -``` -You can set `CUDA_HOME` to `/usr/local/cuda-11.7`. For example, you might be able to compile like this. - -``CUDA_HOME=~/local/cuda-11.7 CUDA_VERSION=117 make cuda11x`` - - -If you have problems compiling the library with these instructions from source, please open an issue. - -## Compilation with Kepler - -Since 0.39.1 bitsandbytes installed via pip no longer provides Kepler binaries and these need to be compiled from source. Follow the steps above and instead of `cuda11x_nomatmul` etc use `cuda11x_nomatmul_kepler` diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 043597177..ede41bb6c 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -1,8 +1,34 @@ -- sections: +- title: Get started + sections: - local: index - title: Bits & Bytes + title: Index - local: quickstart title: Quickstart - local: installation title: Installation - title: Get started +- title: Features & Integrations + sections: + - local: quantization + title: Quantization + - local: optimizers + title: Optimizers + - local: integrations + title: Integrations + - local: algorithms + title: Algorithms +- title: Support & Learning + sections: + - local: resources + title: Papers, resources & how to cite + - local: errors + title: Errors & Solutions + - local: nonpytorchcuda + title: Non-PyTorch CUDA + - local: compiling + title: Compilation from Source (extended) + - local: faqs + title: FAQs (Frequently Asked Questions) +- title: Contributors Guidelines + sections: + - local: contributing + title: Contributing diff --git a/docs/source/algorithms.mdx b/docs/source/algorithms.mdx new file mode 100644 index 000000000..d9db5cb04 --- /dev/null +++ b/docs/source/algorithms.mdx @@ -0,0 +1,12 @@ +# Other algorithms +_WIP: Still incomplete... Community contributions would be greatly welcome!_ + +This is an overview of the `bnb.functional` API in `bitsandbytes` that we think would also be useful as standalone entities. + +## Using Int8 Matrix Multiplication + +For straight Int8 matrix multiplication with mixed precision decomposition you can use ``bnb.matmul(...)``. To enable mixed precision decomposition, use the threshold parameter: + +```py +bnb.matmul(..., threshold=6.0) +``` diff --git a/docs/source/compiling.mdx b/docs/source/compiling.mdx new file mode 100644 index 000000000..fc8c58769 --- /dev/null +++ b/docs/source/compiling.mdx @@ -0,0 +1,41 @@ +# Compiling from Source[[compiling]] + +To compile from source, the CUDA Toolkit is required. 
Ensure `nvcc` is installed; if not, follow these steps to install it along with the CUDA Toolkit: + +```bash +wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh +# Use the following syntax: cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH +# CUDA_VERSION options include 110 to 122 +# EXPORT_TO_BASH: 0 for False, 1 for True + +# Example for installing CUDA 11.7 at ~/local/cuda-11.7 and exporting the path to .bashrc: +bash install_cuda.sh 117 ~/local 1 +``` + +For a single compile run with a specific CUDA version, set `CUDA_HOME` to point to your CUDA installation directory. For instance, to compile using CUDA 11.7 located at `~/local/cuda-11.7`, use: + +``` +CUDA_HOME=~/local/cuda-11.7 CUDA_VERSION=117 make cuda11x +``` + +## General Compilation Steps + +1. Use `CUDA_VERSION=XXX make [target]` to compile, where `[target]` includes options like `cuda92`, `cuda10x`, `cuda11x`, and others. +2. Install with `python setup.py install`. + +Ensure `nvcc` is available in your system. If using Anaconda, determine your CUDA version with PyTorch using `conda list | grep cudatoolkit` and match it by downloading the corresponding version from the [CUDA Toolkit Archive](https://developer.nvidia.com/cuda-toolkit-archive). + +To install CUDA locally without administrative rights: + +```bash +wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh +# Follow the same syntax and example as mentioned earlier +``` + +The compilation process relies on the `CUDA_HOME` environment variable to locate CUDA. If `CUDA_HOME` is unset, it will attempt to infer the location from `nvcc`. If `nvcc` is not in your path, you may need to add it or set `CUDA_HOME` manually. For example, if `python -m bitsandbytes` indicates your CUDA path as `/usr/local/cuda-11.7`, you can set `CUDA_HOME` to this path. + +If compilation issues arise, please report them. + +## Compilation for Kepler Architecture + +From version 0.39.1, bitsandbytes no longer includes Kepler binaries in pip installations, requiring manual compilation. Follow the general steps and use `cuda11x_nomatmul_kepler` for Kepler-targeted compilation. diff --git a/docs/source/contributing.mdx b/docs/source/contributing.mdx new file mode 100644 index 000000000..b28e91936 --- /dev/null +++ b/docs/source/contributing.mdx @@ -0,0 +1,20 @@ +# Contributors guidelines +... stil under construction ... (feel free to propose materials, `bitsandbytes` is a community project) + +## Setup pre-commit hooks +- Install pre-commit hooks with `pip install pre-commit`. +- Run `pre-commit autoupdate` once to configure the hooks. +- Re-run `pre-commit autoupdate` every time a new hook got added. + +Now all the pre-commit hooks will be automatically run when you try to commit and if they introduce some changes, you need to re-add the changed files before being able to commit and push. + +## Doc-string syntax + +We're following NumPy doc-string conventions with the only notable difference being that we use Markdown instead of Rich text format (RTF) for markup within the doc-strings. + +Please see the existing documentation to see how to generate autodocs. 
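As a concrete illustration, a hypothetical helper documented in the same style as the docstrings added to `bitsandbytes/nn/modules.py` in this series (Markdown backticks for types, an `Args:` block with indented descriptions) could look like this; the function itself is made up for the example:

```python
import torch

def scale_rows(x: torch.Tensor, factor: float = 1.0) -> torch.Tensor:
    """
    Scale every row of a tensor by a constant factor. Hypothetical helper, shown
    only to illustrate the doc-string format.

    Args:
        x (`torch.Tensor`):
            The input tensor of shape `(rows, cols)`.
        factor (`float`, defaults to `1.0`):
            The multiplicative factor applied to each row.

    Returns:
        `torch.Tensor`: The scaled tensor, with the same shape as `x`.
    """
    return x * factor
```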
+ +## Documentation +- [guideline for documentation syntax](https://github.com/huggingface/doc-builder#readme) +- images shall be uploaded via PR in the `bitsandbytes/` directory [here](https://huggingface.co/datasets/huggingface/documentation-images) +- find the documentation builds for each PR in a link posted to the PR, such as https://moon-ci-docs.huggingface.co/docs/bitsandbytes/pr_1012/en/introduction diff --git a/errors_and_solutions.md b/docs/source/errors.mdx similarity index 68% rename from errors_and_solutions.md rename to docs/source/errors.mdx index 5b8cbcdd5..293017173 100644 --- a/errors_and_solutions.md +++ b/docs/source/errors.mdx @@ -1,21 +1,22 @@ -# No kernel image available +# Errors & Solutions -This problem arises with the cuda version loaded by bitsandbytes is not supported by your GPU, or if you pytorch CUDA version mismatches. To solve this problem you need to debug ``$LD_LIBRARY_PATH``, ``$CUDA_HOME``, ``$PATH``. You can print these via ``echo $PATH``. You should look for multiple paths to different CUDA versions. This can include versions in your anaconda path, for example ``$HOME/anaconda3/lib``. You can check those versions via ``ls -l $HOME/anaconda3/lib/*cuda*`` or equivalent paths. Look at the CUDA versions of files in these paths. Does it match with ``nvidia-smi``? +## No kernel image available -If you are feeling lucky, you can also try to compile the library from source. This can be still problematic if your PATH variables have multiple cuda versions. As such, it is recommended to figure out path conflicts before you proceed with compilation. +This problem arises with the cuda version loaded by bitsandbytes is not supported by your GPU, or if you pytorch CUDA version mismatches. +To solve this problem you need to debug ``$LD_LIBRARY_PATH``, ``$CUDA_HOME`` as well as ``$PATH``. You can print these via ``echo $PATH``. You should look for multiple paths to different CUDA versions. This can include versions in your anaconda path, for example ``$HOME/anaconda3/lib``. You can check those versions via ``ls -l $HOME/anaconda3/lib/*cuda*`` or equivalent paths. Look at the CUDA versions of files in these paths. Does it match with ``nvidia-smi``? -__If you encounter any other error not listed here please create an issue. This will help resolve your problem and will help out others in the future. +If you are feeling lucky, you can also try to compile the library from source. This can be still problematic if your PATH variables have multiple cuda versions. As such, it is recommended to figure out path conflicts before you proceed with compilation. +## `fatbinwrap` -# fatbinwrap +This error occurs if there is a mismatch between CUDA versions in the C++ library and the CUDA part. Make sure you have right CUDA in your `$PATH` and `$LD_LIBRARY_PATH` variable. In the conda base environment you can find the library under: -This error occurs if there is a mismatch between CUDA versions in the C++ library and the CUDA part. Make sure you have right CUDA in your $PATH and $LD_LIBRARY_PATH variable. In the conda base environment you can find the library under: ```bash ls $CONDA_PREFIX/lib/*cudart* ``` Make sure this path is appended to the `LD_LIBRARY_PATH` so bnb can find the CUDA runtime environment library (cudart). -If this does not fix the issue, please try [compilation from source](compile_from_source.md) next. +If this does not fix the issue, please try compilation from source next. 
If this does not work, please open an issue and paste the printed environment if you call `make` and the associated error when running bnb. diff --git a/docs/source/faqs.mdx b/docs/source/faqs.mdx new file mode 100644 index 000000000..b9549e9d8 --- /dev/null +++ b/docs/source/faqs.mdx @@ -0,0 +1,7 @@ +# FAQs + +Please submit your questions in [this Github Discussion thread](https://github.com/TimDettmers/bitsandbytes/discussions/1013) if you feel that they will likely affect a lot of other users and that they haven't been sufficiently covered in the documentation. + +We'll pick the most generally applicable ones and post the QAs here or integrate them into the general documentation (also feel free to submit doc PRs, please). + +# ... under construction ... diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 67c928309..0b033c3a9 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -1,191 +1,19 @@ -# bitsandbytes +# `bitsandbytes` -The bitsandbytes is a lightweight wrapper around CUDA custom functions, in particular 8-bit optimizers, matrix multiplication (LLM.int8()), and quantization functions. +The `bitsandbytes` library is a lightweight Python wrapper around CUDA custom functions, in particular 8-bit optimizers, matrix multiplication (LLM.int8()), and 8 + 4-bit quantization functions. +The library includes quantization primitives for 8-bit & 4-bit operations, through `bitsandbytes.nn.Linear8bitLt` and `bitsandbytes.nn.Linear4bit` and 8bit optimizers through `bitsandbytes.optim` module. +There are ongoing efforts to support further hardware backends, i.e. Intel CPU + GPU, AMD GPU, Apple Silicon. Windows support is on its way as well. -Resources: -- [8-bit Optimizer Paper](https://arxiv.org/abs/2110.02861) -- [Video](https://www.youtube.com/watch?v=IxrlHAJtqKE) -- [Docs](https://bitsandbytes.readthedocs.io/en/latest/) +## API documentation -- [LLM.int8() Paper](https://arxiv.org/abs/2208.07339) -- [LLM.int8() Software Blog Post](https://huggingface.co/blog/hf-bitsandbytes-integration) -- [LLM.int8() Emergent Features Blog Post](https://timdettmers.com/2022/08/17/llm-int8-and-emergent-features/) +- [Linear4bit](quantizaton#linear4bit) +- [Linear8bit](quantizaton#linear8bit) +- [StableEmbedding](optimizers#stableembedding) -## TL;DR -**Requirements** -Python >=3.8. Linux distribution (Ubuntu, MacOS, etc.) + CUDA > 10.0. +# License -(Deprecated: CUDA 10.0 is deprecated and only CUDA >= 11.0) will be supported with release 0.39.0) - -**Installation**: - -``pip install bitsandbytes`` - -In some cases it can happen that you need to compile from source. If this happens please consider submitting a bug report with `python -m bitsandbytes` information. What now follows is some short instructions which might work out of the box if `nvcc` is installed. If these do not work see further below. 
- -Compilation quickstart: -```bash -git clone https://github.com/timdettmers/bitsandbytes.git -cd bitsandbytes - -# CUDA_VERSIONS in {110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 120} -# make argument in {cuda110, cuda11x, cuda12x} -# if you do not know what CUDA you have, try looking at the output of: python -m bitsandbytes -CUDA_VERSION=117 make cuda11x -python setup.py install -``` - -**Using Int8 inference with HuggingFace Transformers** - -```python -from transformers import AutoModelForCausalLM -model = AutoModelForCausalLM.from_pretrained( - 'decapoda-research/llama-7b-hf', - device_map='auto', - load_in_8bit=True, - max_memory=f'{int(torch.cuda.mem_get_info()[0]/1024**3)-2}GB') -``` - -A more detailed example, can be found in [examples/int8_inference_huggingface.py](examples/int8_inference_huggingface.py). - -**Using 8-bit optimizer**: -1. Comment out optimizer: ``#torch.optim.Adam(....)`` -2. Add 8-bit optimizer of your choice ``bnb.optim.Adam8bit(....)`` (arguments stay the same) -3. Replace embedding layer if necessary: ``torch.nn.Embedding(..) -> bnb.nn.Embedding(..)`` - - -**Using 8-bit Inference**: -1. Comment out torch.nn.Linear: ``#linear = torch.nn.Linear(...)`` -2. Add bnb 8-bit linear light module: ``linear = bnb.nn.Linear8bitLt(...)`` (base arguments stay the same) -3. There are two modes: - - Mixed 8-bit training with 16-bit main weights. Pass the argument ``has_fp16_weights=True`` (default) - - Int8 inference. Pass the argument ``has_fp16_weights=False`` -4. To use the full LLM.int8() method, use the ``threshold=k`` argument. We recommend ``k=6.0``. -```python -# LLM.int8() -linear = bnb.nn.Linear8bitLt(dim1, dim2, bias=True, has_fp16_weights=False, threshold=6.0) -# inputs need to be fp16 -out = linear(x.to(torch.float16)) -``` - - -## Features -- 8-bit Matrix multiplication with mixed precision decomposition -- LLM.int8() inference -- 8-bit Optimizers: Adam, AdamW, RMSProp, LARS, LAMB, Lion (saves 75% memory) -- Stable Embedding Layer: Improved stability through better initialization, and normalization -- 8-bit quantization: Quantile, Linear, and Dynamic quantization -- Fast quantile estimation: Up to 100x faster than other algorithms - -## Requirements & Installation - -Requirements: anaconda, cudatoolkit, pytorch - -Hardware requirements: - - LLM.int8(): NVIDIA Turing (RTX 20xx; T4) or Ampere GPU (RTX 30xx; A4-A100); (a GPU from 2018 or newer). - - 8-bit optimizers and quantization: NVIDIA Kepler GPU or newer (>=GTX 78X). - -Supported CUDA versions: 10.2 - 12.0 - -The bitsandbytes library is currently only supported on Linux distributions. Windows is not supported at the moment. - -The requirements can best be fulfilled by installing pytorch via anaconda. You can install PyTorch by following the ["Get Started"](https://pytorch.org/get-started/locally/) instructions on the official website. - -To install run: - -``pip install bitsandbytes`` - -## Using bitsandbytes - -### Using Int8 Matrix Multiplication - -For straight Int8 matrix multiplication with mixed precision decomposition you can use ``bnb.matmul(...)``. To enable mixed precision decomposition, use the threshold parameter: -```python -bnb.matmul(..., threshold=6.0) -``` - -For instructions how to use LLM.int8() inference layers in your own code, see the TL;DR above or for extended instruction see [this blog post](https://huggingface.co/blog/hf-bitsandbytes-integration). 
- -### Using the 8-bit Optimizers - -With bitsandbytes 8-bit optimizers can be used by changing a single line of code in your codebase. For NLP models we recommend also to use the StableEmbedding layers (see below) which improves results and helps with stable 8-bit optimization. To get started with 8-bit optimizers, it is sufficient to replace your old optimizer with the 8-bit optimizer in the following way: -```python -import bitsandbytes as bnb - -# adam = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.995)) # comment out old optimizer -adam = bnb.optim.Adam8bit(model.parameters(), lr=0.001, betas=(0.9, 0.995)) # add bnb optimizer -adam = bnb.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.995), optim_bits=8) # equivalent - - -torch.nn.Embedding(...) -> bnb.nn.StableEmbedding(...) # recommended for NLP models -``` - -Note that by default all parameter tensors with less than 4096 elements are kept at 32-bit even if you initialize those parameters with 8-bit optimizers. This is done since such small tensors do not save much memory and often contain highly variable parameters (biases) or parameters that require high precision (batch norm, layer norm). You can change this behavior like so: -```python -# parameter tensors with less than 16384 values are optimized in 32-bit -# it is recommended to use multiplies of 4096 -adam = bnb.optim.Adam8bit(model.parameters(), min_8bit_size=16384) -``` - -### Change Bits and other Hyperparameters for Individual Parameters - -If you want to optimize some unstable parameters with 32-bit Adam and others with 8-bit Adam, you can use the `GlobalOptimManager`. With this, we can also configure specific hyperparameters for particular layers, such as embedding layers. To do that, we need two things: (1) register the parameter while they are still on the CPU, (2) override the config with the new desired hyperparameters (anytime, anywhere). See our [guide](howto_config_override.md) for more details - -### Fairseq Users - -To use the Stable Embedding Layer, override the respective `build_embedding(...)` function of your model. Make sure to also use the `--no-scale-embedding` flag to disable scaling of the word embedding layer (nor replaced with layer norm). You can use the optimizers by replacing the optimizer in the respective file (`adam.py` etc.). - -## Release and Feature History - -For upcoming features and changes and full history see [Patch Notes](CHANGELOG.md). - -## Errors - -1. RuntimeError: CUDA error: no kernel image is available for execution on the device. [Solution](errors_and_solutions.md#No-kernel-image-available) -2. __fatbinwrap_.. [Solution](errors_and_solutions.md#fatbinwrap_) - -## Compile from source -To compile from source, you need an installation of CUDA. If `nvcc` is not installed, you can install the CUDA Toolkit with nvcc through the following commands. 
- -```bash -wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh -# Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH -# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122} -# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True - -# For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc -bash install_cuda.sh 117 ~/local 1 -``` - -To use a specific CUDA version just for a single compile run, you can set the variable `CUDA_HOME`, for example the following command compiles `libbitsandbytes_cuda117.so` using compiler flags for cuda11x with the cuda version at `~/local/cuda-11.7`: - -``CUDA_HOME=~/local/cuda-11.7 CUDA_VERSION=117 make cuda11x`` - -For more detailed instruction, please follow the [compile_from_source.md](compile_from_source.md) instructions. - -## License - -The majority of bitsandbytes is licensed under MIT, however portions of the project are available under separate license terms: Pytorch is licensed under the BSD license. +The majority of bitsandbytes is licensed under MIT, however portions of the project are available under separate license terms, as the parts adapted from Pytorch are licensed under the BSD license. We thank Fabio Cannizzo for his work on [FastBinarySearch](https://github.com/fabiocannizzo/FastBinarySearch) which we use for CPU quantization. - -## How to cite us -If you found this library and found LLM.int8() useful, please consider citing our work: - -```bibtex -@article{dettmers2022llmint8, - title={LLM.int8(): 8-bit Matrix Multiplication for Transformers at Scale}, - author={Dettmers, Tim and Lewis, Mike and Belkada, Younes and Zettlemoyer, Luke}, - journal={arXiv preprint arXiv:2208.07339}, - year={2022} -} -``` - -For 8-bit optimizers or quantization routines, please consider citing the following work: - -```bibtex -@article{dettmers2022optimizers, - title={8-bit Optimizers via Block-wise Quantization}, - author={Dettmers, Tim and Lewis, Mike and Shleifer, Sam and Zettlemoyer, Luke}, - journal={9th International Conference on Learning Representations, ICLR}, - year={2022} -} -``` diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index 50031acf7..ecdcdeb28 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -5,6 +5,12 @@ Note currently `bitsandbytes` is only supported on CUDA GPU hardwares, support f +## Hardware requirements: + - LLM.int8(): NVIDIA Turing (RTX 20xx; T4) or Ampere GPU (RTX 30xx; A4-A100); (a GPU from 2018 or newer). + - 8-bit optimizers and quantization: NVIDIA Kepler GPU or newer (>=GTX 78X). + +Supported CUDA versions: 10.2 - 12.0 #TODO: check currently supported versions + ## Linux ### From Pypi @@ -21,14 +27,16 @@ CUDA_VERSION=XXX make cuda12x python setup.py install ``` -with `XXX` being your CUDA version, for <12.0 call `make cuda 11x` +with `XXX` being your CUDA version, for <12.0 call `make cuda 11x`. Note support for non-CUDA GPUs (e.g. AMD, Intel), is also coming soon. 
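+
+Once the build has finished, a quick smoke test confirms that the freshly compiled library can be loaded and used. This is a minimal sketch, assuming a CUDA-capable GPU is visible; the tensor size and hyperparameters are placeholders:
+
+```py
+import torch
+import bitsandbytes as bnb
+
+# a tiny parameter plus one 8-bit optimizer step forces the native library to load
+p = torch.nn.Parameter(torch.rand(64, 64, device="cuda"))
+opt = bnb.optim.Adam8bit([p], lr=1e-3)
+
+p.sum().backward()
+opt.step()  # raises if the compiled library was not found or was built for the wrong CUDA version
+print("bitsandbytes OK")
+```
+
+Running `python -m bitsandbytes` prints the full diagnostic output if anything goes wrong.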
+ +For a more detailed compilation guide, head to the [dedicated page on the topic](./compiling) ## Windows -Currently for Windows users, you need to build bitsandbytes from source +Currently for Windows users, you need to build bitsandbytes from source: ```bash git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/ @@ -39,5 +47,15 @@ python -m build --wheel Big thanks to [wkpark](https://github.com/wkpark), [Jamezo97](https://github.com/Jamezo97), [rickardp](https://github.com/rickardp), [akx](https://github.com/akx) for their amazing contributions to make bitsandbytes compatible with Windows. +For a more detailed compilation guide, head to the [dedicated page on the topic](./compiling) + + + +## MacOS + +Mac support is still a work in progress. Please make sure to check out the [Apple Silicon implementation coordination issue](https://github.com/TimDettmers/bitsandbytes/issues/1020) to get notified about the discussions and progress with respect to MacOS integration. + + + diff --git a/docs/source/integrations.mdx b/docs/source/integrations.mdx new file mode 100644 index 000000000..7857abf4c --- /dev/null +++ b/docs/source/integrations.mdx @@ -0,0 +1,42 @@ +# Transformers + +With Transformers it's very easy to load any model in 4 or 8-bit, quantizing them on the fly with bitsandbytes primitives. + +Please review the [bitsandbytes section in the Accelerate docs](https://huggingface.co/docs/transformers/v4.37.2/en/quantization#bitsandbytes). + +Details about the BitsAndBytesConfig can be found here](https://huggingface.co/docs/transformers/v4.37.2/en/main_classes/quantization#transformers.BitsAndBytesConfig). + +## Beware: bf16 is optional compute data type +If your hardware supports it, `bf16` is the optimal compute dtype. The default is `float32` for backward compatibility and numerical stability. `float16` often leads to numerical instabilities, but `bfloat16` provides the benefits of both worlds: numerical stability and significant computation speedup. Therefore, be sure to check if your hardware supports `bf16` and configure it using the `bnb_4bit_compute_dtype` parameter in BitsAndBytesConfig: + +```py +import torch +from transformers import BitsAndBytesConfig + +quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16) +``` + +# PEFT +With `PEFT`, you can use QLoRA out of the box with `LoraConfig` and a 4-bit base model. + +Please review the [bitsandbytes section in the Accelerate docs](https://huggingface.co/docs/peft/developer_guides/quantization#quantize-a-model). + +# Accelerate + +Bitsandbytes is also easily usable from within Accelerate. + +Please review the [bitsandbytes section in the Accelerate docs](https://huggingface.co/docs/accelerate/en/usage_guides/quantization). + +# Trainer for the optimizers + +You can use any of the 8-bit and/or paged optimizers by simple passing them to the `transformers.Trainer` class on intialization.All bnb optimizers are supported by passing the correct string in `TrainingArguments`'s `optim` attribute - e.g. (`paged_adamw_32bit`). + +See the [official API docs for reference](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Trainer). + +Here we point out to relevant doc sections in transformers / peft / Trainer + very briefly explain how these are integrated: +e.g. 
for transformers state that you can load any model in 8-bit / 4-bit precision, for PEFT, you can use QLoRA out of the box with `LoraConfig` + 4-bit base model, for Trainer: all bnb optimizers are supported by passing the correct string in `TrainingArguments`'s `optim` attribute - e.g. (`paged_adamw_32bit`): + +# Blog posts + +- [Making LLMs even more accessible with bitsandbytes, 4-bit quantization and QLoRA](https://huggingface.co/blog/4bit-transformers-bitsandbytes) +- [A Gentle Introduction to 8-bit Matrix Multiplication for transformers at scale using Hugging Face Transformers, Accelerate and bitsandbytes](https://huggingface.co/blog/hf-bitsandbytes-integration) diff --git a/how_to_use_nonpytorch_cuda.md b/docs/source/nonpytorchcuda.mdx similarity index 76% rename from how_to_use_nonpytorch_cuda.md rename to docs/source/nonpytorchcuda.mdx index 566b0170e..099a6961b 100644 --- a/how_to_use_nonpytorch_cuda.md +++ b/docs/source/nonpytorchcuda.mdx @@ -1,6 +1,6 @@ -## How to use a CUDA version that is different from PyTorch +# How to use a CUDA version that is different from PyTorch -Some features of bitsandbytes may need a newer CUDA version than regularly supported by PyTorch binaries from conda / pip. In that case you can use the following instructions to load a precompiled bitsandbytes binary that works for you. +Some features of `bitsandbytes` may need a newer CUDA version than regularly supported by PyTorch binaries from conda / pip. In that case you can use the following instructions to load a precompiled `bitsandbytes` binary that works for you. ## Installing or determining the CUDA installation @@ -12,7 +12,7 @@ Determine the path of the CUDA version that you want to use. Common paths paths where XX.X is the CUDA version number. -You can also install CUDA version that you need locally with a script provided by bitsandbytes as follows: +You can also install CUDA version that you need locally with a script provided by `bitsandbytes` as follows: ```bash wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh @@ -25,7 +25,7 @@ wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cud bash cuda_install.sh 117 ~/local 1 ``` -## Setting the environmental variables BNB_CUDA_VERSION, and LD_LIBRARY_PATH +## Setting the environmental variables `BNB_CUDA_VERSION`, and `LD_LIBRARY_PATH` To manually override the PyTorch installed CUDA version you need to set to variable, like so: diff --git a/docs/source/optimizers.mdx b/docs/source/optimizers.mdx new file mode 100644 index 000000000..18d20de1d --- /dev/null +++ b/docs/source/optimizers.mdx @@ -0,0 +1,190 @@ +# Introduction: 8-bit optimizers + +With 8-bit optimizers, larger models can be finetuned with the same GPU memory compared to standard 32-bit optimizer training. 8-bit optimizers are a drop-in replacement for regular optimizers, with the following properties: + +- Faster (e.g. 4x faster than regular Adam) +- 75% less memory, same performance +- No hyperparameter tuning needed + +8-bit optimizers are mostly useful to finetune large models that did not fit into memory before. They also make it easier to pretrain larger models and have great synergy with sharded data parallelism. 8-bit Adam, for example, is already used across multiple teams in Facebook. This optimizer saves a ton of memory at no accuracy hit. + +Generally, our 8-bit optimizers have three components: +1. **block-wise quantization** isolates outliers and distributes the error more equally over all bits, +2. 
**dynamic quantization** quantizes both small and large values with high precision, +3. a **stable embedding layer** improves stability during optimization for models with word embeddings. + +With these components, performing an optimizer update with 8-bit states is straightforward and for GPUs, this makes 8-bit optimizers way faster than regular 32-bit optimizers. [Further details below](#research-background) + +We feature 8-bit `Adagrad`, `Adam`, `AdamW`, `LAMB`, `LARS`, `Lion`, `RMSprop` and `SGD` (momentum). + +## Caveats + +8-bit optimizers reduce the memory footprint and accelerate optimization on a wide range of tasks. However, since 8-bit optimizers reduce only the memory footprint proportional to the number of parameters, **models that use large amounts of activation memory, such as convolutional networks, have few benefits from using 8-bit optimizers**. Thus, 8-bit optimizers are most beneficial for training or finetuning models with many parameters on highly memory-constrained GPUs. + +## Usage + +It only requires a two-line code change to get started. +```diff +import bitsandbytes as bnb + +- adam = torch.optim.Adam(...) ++ adam = bnb.optim.Adam8bit(...) + +# recommended for NLP models +- before: torch.nn.Embedding(...) ++ bnb.nn.StableEmbedding(...) +``` + +The arguments passed are the same as standard Adam. For NLP models we recommend to also use the StableEmbedding layers which improves results and helps with stable 8-bit optimization. + +Note that by default all parameter tensors with less than 4096 elements are kept at 32-bit even if you initialize those parameters with 8-bit optimizers. This is done since such small tensors do not save much memory and often contain highly variable parameters (biases) or parameters that require high precision (batch norm, layer norm). You can change this behavior like so: + +```py +# For parameter tensors with less than 16384 values are optimized in 32-bit +# it is recommended to use multiplies of 4096: +adam = bnb.optim.Adam8bit(model.parameters(), min_8bit_size=16384) +``` + +Some more examples of how you can replace your old optimizer with the 8-bit optimizer: + +```diff +import bitsandbytes as bnb + +- adam = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.995)) # comment out old optimizer ++ adam = bnb.optim.Adam8bit(model.parameters(), lr=0.001, betas=(0.9, 0.995)) # add bnb optimizer + +# use 32-bit Adam with 5th percentile clipping ++ adam = bnb.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.995), optim_bits=32, percentile_clipping=5) +- adam = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.995)) # comment out old optimizer +``` + +## Overview of supported 8-bit optimizers + +Currently, `bitsandbytes` supports the following optimizers: + +- `Adagrad`, `Adagrad8bit`, `Adagrad32bit` +- `Adam`, `Adam8bit`, `Adam32bit`, `PagedAdam`, `PagedAdam8bit`, `PagedAdam32bit` +- `AdamW`, `AdamW8bit`, `AdamW32bit`, `PagedAdamW`, `PagedAdamW8bit`, `PagedAdamW32bit` +- `LAMB`, `LAMB8bit`, `LAMB32bit` +- `LARS`, `LARS8bit`, `LARS32bit`, `PytorchLARS` +- `Lion`, `Lion8bit`, `Lion32bit`, `PagedLion`, `PagedLion8bit`, `PagedLion32bit` +- `RMSprop`, `RMSprop8bit`, `RMSprop32bit` +- `SGD`, `SGD8bit`, `SGD32bit` + +Additionally, for cases in which you want to optimize some unstable parameters with 32-bit Adam and others with 8-bit Adam, you can use the `GlobalOptimManager`, [as explained in greater detail below](#optim_manager). + +Find the API docs [here](#optim_api_docs) (still under construction). 
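+
+The paged variants listed above are drop-in replacements as well; only the class name changes. A minimal sketch with a placeholder model, assuming a CUDA device is available:
+
+```py
+import torch
+import bitsandbytes as bnb
+
+model = torch.nn.Linear(1024, 1024).cuda()
+
+# optimizer state lives in unified memory and is only paged to the CPU
+# under GPU memory pressure (see the paged optimizers section below)
+adam = bnb.optim.PagedAdamW8bit(model.parameters(), lr=1e-4, betas=(0.9, 0.999))
+```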
+
+## Overview of expected gains
+
+<!-- figure omitted: overview of expected memory savings and speed gains from 8-bit optimizers -->
+
+See here an overview of the biggest models that can be trained based on optimizer usage:
+
+<!-- figure omitted: overview of the biggest models that can be trained per optimizer choice -->
+ +### Research Background + +Stateful optimizers maintain gradient statistics over time, e.g. the exponentially smoothed sum (SGD with momentum) or squared sum (Adam) of past gradient values. This state can be used to accelerate optimization compared to plain stochastic gradient descent but uses memory that might otherwise be allocated to model parameters, thereby limiting the maximum size of models trained in practice. `bitsandbytes` optimizers use 8-bit statistics, while maintaining the performance levels of using 32-bit optimizer states. + +To overcome the resulting computational, quantization and stability challenges, 8-bit optimizers have three components: + +1. **Block-wise quantization** divides input tensors into smaller blocks that are independently quantized, therein isolating outliers and distributing the error more equally over all bits. Each block is processed in parallel across cores, yielding faster optimization and high precision quantization. +2. **Dynamic quantization**, which quantizes both small and large values with high precision and +3. a **stable embedding layer** improves stability during optimization for models with word embeddings. + +With these components, performing an optimizer update with 8-bit states is straightforward. We dequantize the 8-bit optimizer states to 32-bit, perform the update and then quantize the states back to 8-bit for storage. + +We do this 8-bit to 32-bit conversion element-by-element in registers, which means no slow copies to GPU memory or additional temporary memory are needed to perform quantization and dequantization. For GPUs, this makes 8-bit optimizers much faster than regular 32-bit optimizers. + +For more details, please refer to the paper [8-bit Optimizers via Block-wise Quantization](https://arxiv.org/abs/2110.02861). + +## Stable Embedding Layer + +The Stable Embedding Layer enhances the standard word embedding layer for improved training stability in NLP tasks. It addresses the challenge of non-uniform input distributions and mitigates extreme gradient variations, ensuring smoother training processes. + +#### Features: + +- **Initialization**: Utilizes Xavier uniform initialization to maintain consistent variance, reducing the likelihood of large gradients. +- **Normalization**: Incorporates layer normalization before adding positional embeddings, aiding in output stability. +- **Optimizer States**: Employs 32-bit optimizer states exclusively for this layer to enhance stability, while the rest of the model may use standard 16-bit precision. + +#### Benefits: + +- Designed to support more aggressive quantization strategies without compromising training stability. +- Helps in achieving stable training outcomes, particularly important for models dealing with diverse and complex language data. + +## Paged optimizers + +Paged optimizers are build on top of the [unified memory](https://developer.nvidia.com/blog/unified-memory-cuda-beginners/) feature of CUDA. This feature is not supported by PyTorch and we added it to `bitsandbytes`. + +It works like regular CPU paging, which means that it only becomes active _if one runs out of GPU memory_. Only then will the memory be transferred, page-by-page, from GPU to CPU. The memory is mapped, meaning that pages are preallocated on the CPU, but they are not updated automatically. They are only updated if the memory is accessed, or a swapping operation is launched. + +The unified memory feature is less efficient than regular asynchronous memory transfers. 
This means, one usually will not be able to get full PCIe memory bandwidth utilization. If one does a manual prefetch, transfer speeds can be high but still about half or worse than the full PCIe memory bandwidth (tested on 16x lanes PCIe 3.0). + +This all means performance depends highly on the particular use-case. If one evicts, say, 1 GB of memory per forward-backward-optimizer loop: One can expect about 50% of the PCIe bandwidth as time in the best case. So 1 GB for PCIe 3.0 with 16x lanes, which runs at 16 GB/s, is `1/(16*0.5) = 1/8 = 125ms` overhead per optimizer step. Other overhead can be estimated for the particular use-case given a PCIe interface, lanes, and the memory that is evicted in each iteration. + +Compared to CPU offloading, this has the advantage that there is zero overhead if all the memory fits into the device and only some overhead if some of memory needs to be evicted. For offloading, one would usually offload fixed parts of the model and need to off and onload all this memory with each iteration through the model (sometimes twice for both forward and backward pass). + +[Find more details in this discussion](https://github.com/TimDettmers/bitsandbytes/issues/962). + + +## `GlobalOptimManager`: How to override config hyperparameters for particular weights/parameters[[optim_manager]] + +If you want to optimize some unstable parameters with 32-bit Adam and others with 8-bit Adam, you can use the `GlobalOptimManager`. With this, we can also configure specific hyperparameters for particular layers, such as embedding layers. To do that, we need two things: + +1. Register the parameter while they are still on the CPU. +2. Override the config with the new desired hyperparameters (anytime, anywhere). + +For global overrides in many different places in your code you can do: + +```py +import torch +import bitsandbytes as bnb + +mng = bnb.optim.GlobalOptimManager.get_instance() + +model = MyModel() +mng.register_parameters(model.parameters()) # 1. register parameters while still on CPU + +model = model.cuda() +# use 8-bit optimizer states for all parameters +adam = bnb.optim.Adam(model.parameters(), lr=0.001, optim_bits=8) + +# 2a. override: the parameter model.fc1.weight now uses 32-bit Adam +mng.override_config(model.fc1.weight, 'optim_bits', 32) + +# 2b. override: the two special layers use +# sparse optimization + different learning rate + different Adam betas +mng.override_config([model.special.weight, model.also_special.weight], + key_value_dict ={'is_sparse': True, 'lr': 1e-5, 'betas'=(0.9, 0.98)}) +``` +Possible options for the config override are: `betas, eps, weight_decay, lr, optim_bits, min_8bit_size, percentile_clipping, block_wise, max_unorm`. + +For overrides for particular layers, we recommend overriding locally in each module. You can do this by passing the module, the parameter, and its attribute name to the GlobalOptimManager: +```py +class MyModule(torch.nn.Module): + def __init__(din, dout): + super(MyModule, self).__init__() + self.linear = torch.nn.Linear(din, dout) + # optimization will happen in 32-bit and + # learning rate will be set to 0.0001 independent of the main learning rate + config = {'optim_bits': 32, 'lr' : 0.0001} + GlobalOptimManager.get_instance().register_module_override(self, 'weight', config) + +``` + +## API Docs[[optim_api_docs]] + +... under construction ... + +Here we'll provide further auto-generated API docs soon. Please feel free to contribute doc-strings for the respective optimizers, as `bitsandbytes` is a community effort. 
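+
+As a small companion to the overhead estimate in the paged optimizers section above, the same arithmetic can be written as a helper. This is a sketch only; the bandwidth, PCIe efficiency and eviction size are assumptions to replace with your own measurements:
+
+```py
+def paged_eviction_overhead_s(evicted_gb: float, pcie_gb_per_s: float = 16.0, efficiency: float = 0.5) -> float:
+    """Rough per-step overhead when `evicted_gb` of optimizer state is paged out and back in."""
+    return evicted_gb / (pcie_gb_per_s * efficiency)
+
+# 1 GB evicted per step over PCIe 3.0 x16 at ~50% utilization -> ~0.125 s per optimizer step
+print(paged_eviction_overhead_s(1.0))
+```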
+ +## StableEmbedding + +[[autodoc]] bitsandbytes.nn.StableEmbedding + - __init__ diff --git a/docs/source/quantization.mdx b/docs/source/quantization.mdx new file mode 100644 index 000000000..3880cc089 --- /dev/null +++ b/docs/source/quantization.mdx @@ -0,0 +1,13 @@ +# Quantization primitives + +Below you will find the docstring of the quantization primitives exposed in bitsandbytes. + +## Linear4bit (QLoRA)[[linear4bit]] + +[[autodoc]] bitsandbytes.nn.Linear4bit + - __init__ + +## Linear8bitLt[[linear8bit]] + +[[autodoc]] bitsandbytes.nn.Linear8bitLt + - __init__ diff --git a/docs/source/quickstart.mdx b/docs/source/quickstart.mdx index d1028c655..ed92c896b 100644 --- a/docs/source/quickstart.mdx +++ b/docs/source/quickstart.mdx @@ -4,9 +4,12 @@ ... work in progress ... -## Minimal example +(Community contributions would we very welcome!) + +## Minimal examples The following code illustrates the steps above. -```python +```py +code examples will soon follow ``` diff --git a/docs/source/resources.mdx b/docs/source/resources.mdx new file mode 100644 index 000000000..56330175a --- /dev/null +++ b/docs/source/resources.mdx @@ -0,0 +1,92 @@ +# Papers, related resources & how to cite + +The below academic work is ordered in reverse chronological order. + +## [SpQR: A Sparse-Quantized Representation for Near-Lossless LLM Weight Compression (Jun 2023)](https://arxiv.org/abs/2306.03078) + +Authors: Tim Dettmers, Ruslan Svirschevski, Vage Egiazarian, Denis Kuznedelev, Elias Frantar, Saleh Ashkboos, Alexander Borzunov, Torsten Hoefler, Dan Alistarh + +- [Twitter summary thread](https://twitter.com/Tim_Dettmers/status/1666076553665744896) + +``` +@article{dettmers2023spqr, + title={SpQR: A Sparse-Quantized Representation for Near-Lossless LLM Weight Compression}, + author={Dettmers, Tim and Svirschevski, Ruslan and Egiazarian, Vage and Kuznedelev, Denis and Frantar, Elias and Ashkboos, Saleh and Borzunov, Alexander and Hoefler, Torsten and Alistarh, Dan}, + journal={arXiv preprint arXiv:2306.03078}, + year={2023} +} +``` + +## [QLoRA: Efficient Finetuning of Quantized LLMs (May 2023)](https://arxiv.org/abs/2305.14314) +Authors: Tim Dettmers, Artidoro Pagnoni, Ari Holtzman, Luke Zettlemoyer + +- [Video](https://www.youtube.com/watch?v=y9PHWGOa8HA&ab_channel=LondonMachineLearningMeetup) +- [Twitter summary thread](https://twitter.com/Tim_Dettmers/status/1661379354507476994) + +``` +@article{dettmers2023qlora, + title={Qlora: Efficient finetuning of quantized llms}, + author={Dettmers, Tim and Pagnoni, Artidoro and Holtzman, Ari and Zettlemoyer, Luke}, + journal={arXiv preprint arXiv:2305.14314}, + year={2023} +} +``` + +## [The case for 4-bit precision: k-bit Inference Scaling Laws (Dec 2022)](https://arxiv.org/abs/2212.09720) +Authors: Tim Dettmers, Luke Zettlemoyer + +- [Video](https://www.youtube.com/watch?v=odlQa6AE1gY&ab_channel=TheInsideView) +- [Twitter summary thread](https://twitter.com/Tim_Dettmers/status/1605209171758284805) + +``` +@inproceedings{dettmers2023case, + title={The case for 4-bit precision: k-bit inference scaling laws}, + author={Dettmers, Tim and Zettlemoyer, Luke}, + booktitle={International Conference on Machine Learning}, + pages={7750--7774}, + year={2023}, + organization={PMLR} +} +``` + +## [LLM.int8(): 8-bit Matrix Multiplication for Transformers at Scale (Nov 2022)](https://arxiv.org/abs/2208.07339) +Authors: Tim Dettmers, Mike Lewis, Younes Belkada, Luke Zettlemoyer + +- [LLM.int8() Blog Post](https://huggingface.co/blog/hf-bitsandbytes-integration) +- [LLM.int8() 
Emergent Features Blog Post](https://timdettmers.com/2022/08/17/llm-int8-and-emergent-features/) +- [Introduction to Weight Quantization](https://towardsdatascience.com/introduction-to-weight-quantization-2494701b9c0c) +- [Poster](https://twitter.com/Tim_Dettmers/status/1598351301942951937) + +``` +@article{dettmers2022llm, + title={Llm. int8 (): 8-bit matrix multiplication for transformers at scale}, + author={Dettmers, Tim and Lewis, Mike and Belkada, Younes and Zettlemoyer, Luke}, + journal={arXiv preprint arXiv:2208.07339}, + year={2022} +} +``` + +## [8-bit Optimizers via Block-wise Quantization (Oct 2021)](https://arxiv.org/abs/2110.02861) +Authors: Tim Dettmers, Mike Lewis, Sam Shleifer, Luke Zettlemoyer + +- [Video](https://www.youtube.com/watch?v=IxrlHAJtqKE) +- [Twitter summary thread](https://twitter.com/Tim_Dettmers/status/1446472128979562499) + +``` +@article{DBLP:journals/corr/abs-2110-02861, + author = {Tim Dettmers and + Mike Lewis and + Sam Shleifer and + Luke Zettlemoyer}, + title = {8-bit Optimizers via Block-wise Quantization}, + journal = {CoRR}, + volume = {abs/2110.02861}, + year = {2021}, + url = {https://arxiv.org/abs/2110.02861}, + eprinttype = {arXiv}, + eprint = {2110.02861}, + timestamp = {Thu, 21 Oct 2021 16:20:08 +0200}, + biburl = {https://dblp.org/rec/journals/corr/abs-2110-02861.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} +``` diff --git a/environment.yml b/environment.yml index 9ab48dedc..af421b3c6 100644 --- a/environment.yml +++ b/environment.yml @@ -27,6 +27,7 @@ dependencies: - conda-forge::monkeytype # infer type annotations - conda-forge::rich # better, colored tracebacks, etc - conda-forge::pytest-sugar # better pytest output + # - conda-forge::nodejs # for `doc-builder preview` (optional) ## ENV CREATION - steps to reproduce: # mamba env remove -n bnb diff --git a/howto_config_override.md b/howto_config_override.md deleted file mode 100644 index 55b24e3ab..000000000 --- a/howto_config_override.md +++ /dev/null @@ -1,40 +0,0 @@ -# How to override config hyperparameters for particular weights/parameters - -If you want to optimize some unstable parameters with 32-bit Adam and others with 8-bit Adam, you can use the `GlobalOptimManager`. With this, we can also configure specific hyperparameters for particular layers, such as embedding layers. To do that, we need two things: (1) register the parameter while they are still on the CPU, (2) override the config with the new desired hyperparameters (anytime, anywhere). See our [guide](howto_config_override.md) for more details - -For global overrides in many different places in your code you can do: -```python -import torch -import bitsandbytes as bnb - -mng = bnb.optim.GlobalOptimManager.get_instance() - -model = MyModel() -mng.register_parameters(model.parameters()) # 1. register parameters while still on CPU - -model = model.cuda() -# use 8-bit optimizer states for all parameters -adam = bnb.optim.Adam(model.parameters(), lr=0.001, optim_bits=8) - -# 2a. override: the parameter model.fc1.weight now uses 32-bit Adam -mng.override_config(model.fc1.weight, 'optim_bits', 32) - -# 2b. 
override: the two special layers use -# sparse optimization + different learning rate + different Adam betas -mng.override_config([model.special.weight, model.also_special.weight], - key_value_dict ={'is_sparse': True, 'lr': 1e-5, 'betas'=(0.9, 0.98)}) -``` -Possible options for the config override are: `betas, eps, weight_decay, lr, optim_bits, min_8bit_size, percentile_clipping, block_wise, max_unorm` - -For overrides for particular layers we recommend overriding locally in each module. You can do this by passing the module, the parameter, and its attribute name to the GlobalOptimManager: -```python -class MyModule(torch.nn.Module): - def __init__(din, dout): - super(MyModule, self).__init__() - self.linear = torch.nn.Linear(din, dout) - # optimization will happen in 32-bit and - # learning rate will be set to 0.0001 independent of the main learning rate - config = {'optim_bits': 32, 'lr' : 0.0001} - GlobalOptimManager.get_instance().register_module_override(self, 'weight', config) - -``` From 259ad44110940259b6a6b545b8a8b2a69289bbef Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Sun, 4 Feb 2024 21:18:05 +0100 Subject: [PATCH 035/112] CUDA setup cleanup (#996) * Diagnostics: streamline debug printing code * CUDA setup: Remove unused `backup_paths` * CUDA setup: DRY OS detection * CUDA setup: Streamline `manual_override()` * CUDA setup: Use comment instead of string literal, simplify * CUDA setup: remove duplicate sort The "sort compute capabilities" fix from #703 (#527) would actually do nothing due to this. * CUDA setup: make version number replacement logic more obvious --- bitsandbytes/__main__.py | 69 +++++++++------------------ bitsandbytes/cuda_setup/main.py | 83 +++++++++++++++++---------------- 2 files changed, 66 insertions(+), 86 deletions(-) diff --git a/bitsandbytes/__main__.py b/bitsandbytes/__main__.py index af5c1c523..61b42e78f 100644 --- a/bitsandbytes/__main__.py +++ b/bitsandbytes/__main__.py @@ -1,5 +1,5 @@ +import glob import os -from os.path import isdir import sys from warnings import warn @@ -8,17 +8,9 @@ HEADER_WIDTH = 60 -def find_file_recursive(folder, filename): - import glob - outs = [] - try: - for ext in ["so", "dll", "dylib"]: - out = glob.glob(os.path.join(folder, "**", filename + ext)) - outs.extend(out) - except Exception as e: - raise RuntimeError('Error: Something when wrong when trying to find file.') from e - - return outs +def find_dynamic_library(folder, filename): + for ext in ("so", "dll", "dylib"): + yield from glob.glob(os.path.join(folder, "**", filename + ext)) def generate_bug_report_information(): @@ -27,40 +19,25 @@ def generate_bug_report_information(): print_header("") print('') - if 'CONDA_PREFIX' in os.environ: - paths = find_file_recursive(os.environ['CONDA_PREFIX'], '*cuda*') - print_header("ANACONDA CUDA PATHS") - print(paths) - print('') - if isdir('/usr/local/'): - paths = find_file_recursive('/usr/local', '*cuda*') - print_header("/usr/local CUDA PATHS") - print(paths) - print('') - if 'CUDA_PATH' in os.environ and isdir(os.environ['CUDA_PATH']): - paths = find_file_recursive(os.environ['CUDA_PATH'], '*cuda*') - print_header("CUDA PATHS") - print(paths) - print('') - - if isdir(os.getcwd()): - paths = find_file_recursive(os.getcwd(), '*cuda*') - print_header("WORKING DIRECTORY CUDA PATHS") - print(paths) - print('') - - print_header("LD_LIBRARY CUDA PATHS") - if 'LD_LIBRARY_PATH' in os.environ: - lib_path = os.environ['LD_LIBRARY_PATH'].strip() - for path in set(lib_path.split(os.pathsep)): - try: - if isdir(path): - 
print_header(f"{path} CUDA PATHS") - paths = find_file_recursive(path, '*cuda*') - print(paths) - except Exception as e: - print(f'Could not read LD_LIBRARY_PATH: {path} ({e})') - print('') + path_sources = [ + ("ANACONDA CUDA PATHS", os.environ.get("CONDA_PREFIX")), + ("/usr/local CUDA PATHS", "/usr/local"), + ("CUDA PATHS", os.environ.get("CUDA_PATH")), + ("WORKING DIRECTORY CUDA PATHS", os.getcwd()), + ] + try: + ld_library_path = os.environ.get("LD_LIBRARY_PATH") + if ld_library_path: + for path in set(ld_library_path.strip().split(os.pathsep)): + path_sources.append((f"LD_LIBRARY_PATH {path} CUDA PATHS", path)) + except Exception as e: + print(f"Could not parse LD_LIBRARY_PATH: {e}") + + for name, path in path_sources: + if path and os.path.isdir(path): + print_header(name) + print(list(find_dynamic_library(path, '*cuda*'))) + print("") def print_header( diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py index 0db9df343..a8792c1ad 100644 --- a/bitsandbytes/cuda_setup/main.py +++ b/bitsandbytes/cuda_setup/main.py @@ -28,19 +28,17 @@ from .env_vars import get_potentially_lib_path_containing_env_vars -# these are the most common libs names -# libcudart.so is missing by default for a conda install with PyTorch 2.0 and instead -# we have libcudart.so.11.0 which causes a lot of errors before -# not sure if libcudart.so.12.0 exists in pytorch installs, but it does not hurt -system = platform.system() -if system == 'Windows': +if platform.system() == 'Windows': # Windows CUDA_RUNTIME_LIBS = ["nvcuda.dll"] -else: # Linux or other - CUDA_RUNTIME_LIBS = ["libcudart.so", 'libcudart.so.11.0', 'libcudart.so.12.0', 'libcudart.so.12.1', 'libcudart.so.12.2'] + DYNAMIC_LIBRARY_SUFFIX = ".dll" +else: # Linux or other + # these are the most common libs names + # libcudart.so is missing by default for a conda install with PyTorch 2.0 and instead + # we have libcudart.so.11.0 which causes a lot of errors before + # not sure if libcudart.so.12.0 exists in pytorch installs, but it does not hurt + CUDA_RUNTIME_LIBS = ["libcudart.so", "libcudart.so.11.0", "libcudart.so.12.0", "libcudart.so.12.1", "libcudart.so.12.2"] + DYNAMIC_LIBRARY_SUFFIX = ".so" -# this is a order list of backup paths to search CUDA in, if it cannot be found in the main environmental paths -backup_paths = [] -backup_paths.append('$CONDA_PREFIX/lib/libcudart.so.11.0') class CUDASetup: _instance = None @@ -108,22 +106,30 @@ def initialize(self): self.error = False def manual_override(self): - if torch.cuda.is_available(): - if 'BNB_CUDA_VERSION' in os.environ: - if len(os.environ['BNB_CUDA_VERSION']) > 0: - warn( - f'\n\n{"=" * 80}\n' - 'WARNING: Manual override via BNB_CUDA_VERSION env variable detected!\n' - 'BNB_CUDA_VERSION=XXX can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.\n' - 'If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=\n' - 'If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH\n' - 'For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH: Date: Mon, 5 Feb 2024 20:05:50 +0900 Subject: [PATCH 036/112] CI: split workflows (#1019) * CI: separate shared-libs, cuda and wheels jobs * CI: remove cuda dependent packages to reduce build time --- .github/workflows/cmake.yml | 156 +++++++++++++++++++++++++++++++----- environment-bnb.yml | 6 +- 2 files changed, 137 insertions(+), 25 deletions(-) diff --git a/.github/workflows/cmake.yml 
b/.github/workflows/cmake.yml index 728dd09fb..06f08eb9d 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -11,7 +11,80 @@ concurrency: cancel-in-progress: true jobs: - build: + build-shared-libs: + runs-on: ${{ matrix.os }} + + strategy: + # Set fail-fast to false to ensure that feedback is delivered for all matrix combinations. Consider changing this to true when your workflow is stable. + fail-fast: false + + matrix: + os: [ubuntu-latest, windows-latest] + arch: [x86_64, aarch64] + build_type: [Release] + exclude: + - os: windows-latest + arch: aarch64 + + steps: + - uses: actions/checkout@v4 + + - name: Set up MSVC + if: matrix.os == 'windows-latest' + uses: ilammy/msvc-dev-cmd@v1.13.0 + with: + arch: amd64 + + - name: Set reusable strings + # Turn repeated input strings (such as the build output directory) into step outputs. These step outputs can be used throughout the workflow file. + id: strings + shell: bash + run: | + echo "build-output-dir=${{ github.workspace }}/build" >> "$GITHUB_OUTPUT" + + - name: Prep build + run: python3 -m pip install cmake==3.27.9 ninja setuptools wheel + + - name: Prep Compilers + shell: bash -el {0} + run: | + if [ "${{ matrix.os }}" = "windows-latest" ]; then + echo CXX_COMPILER=cl >> "$GITHUB_ENV" + echo C_COMPILER=cl >> "$GITHUB_ENV" + else + echo CXX_COMPILER=g++ >> "$GITHUB_ENV" + echo C_COMPILER=gcc >> "$GITHUB_ENV" + fi + + + - name: Configure CPU + run: > + cmake -B ${{ steps.strings.outputs.build-output-dir }} + -G Ninja + -DCMAKE_CXX_COMPILER=${{ env.CXX_COMPILER }} + -DCMAKE_C_COMPILER=${{ env.C_COMPILER }} + -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} + -DBUILD_CUDA=OFF + -S ${{ github.workspace }} + + - name: Build CPU + run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }} + + - name: Copy libraries + shell: bash + run: | + mkdir -p output/${{ matrix.os }}/${{ matrix.arch }} + ( shopt -s nullglob && cp -a bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }} ) + + + - name: Upload Build Artifacts + uses: actions/upload-artifact@v4 + with: + name: shared_library-${{ matrix.os }}-${{ matrix.arch }} + path: output/* + + + build-shared-libs-cuda: runs-on: ${{ matrix.os }} strategy: @@ -21,14 +94,22 @@ jobs: matrix: os: [ubuntu-latest, windows-latest] cuda-version: ['11.8', '12.1'] + arch: [x86_64, aarch64] build_type: [Release] + exclude: + - os: windows-latest + arch: aarch64 steps: - uses: actions/checkout@v4 + - name: Set up Python 3.10 + uses: actions/setup-python@v5 + with: + python-version: "3.10" - name: Set up MSVC if: matrix.os == 'windows-latest' - uses: ilammy/msvc-dev-cmd@v1.12.1 + uses: ilammy/msvc-dev-cmd@v1.13.0 with: arch: amd64 @@ -129,31 +210,62 @@ jobs: - name: Build NOBLASLT run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }} - - name: Configure CPU - run: > - cmake -B ${{ steps.strings.outputs.build-output-dir }} - -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }} - -DCMAKE_CXX_COMPILER=${{ env.CXX_COMPILER }} - -DCMAKE_C_COMPILER=${{ env.C_COMPILER }} - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} - -DNO_CUBLASLT=ON - -DBUILD_CUDA=OFF - -S ${{ github.workspace }} + - name: Copy libraries + shell: bash + run: | + mkdir -p output/${{ matrix.os }}/${{ matrix.arch }} + ( shopt -s nullglob && cp -a bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }} ) - - name: Build CPU - run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }} - - 
name: Build dist - shell: bash -el {0} + - name: Upload Build Artifacts + uses: actions/upload-artifact@v4 + with: + name: shared_library_cuda-${{ matrix.os }}-${{ matrix.cuda-version }}-${{ matrix.arch }} + path: output/* + + + build-wheels: + needs: + - build-shared-libs + - build-shared-libs-cuda + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest, windows-latest] + arch: [x86_64, aarch64] + exclude: + - os: windows-latest + arch: aarch64 + + steps: + # Check out code + - uses: actions/checkout@v4 + # Download shared libraries + - name: Download build artifact + uses: actions/download-artifact@v4 + with: + merge-multiple: true + path: output/ + - name: Copy correct platform shared libraries + shell: bash run: | - python -m pip install build - python -m build --wheel - mkdir dist/cu${{ matrix.cuda-version }} - mv dist/bitsandbytes*.* dist/cu${{ matrix.cuda-version }}/ + cp output/${{ matrix.os }}/${{ matrix.arch }}/* bitsandbytes/ + # Set up the Python version needed + - name: Set up Python 3.10 + uses: actions/setup-python@v5 + with: + python-version: "3.10" + cache: pip + - name: Install build package + shell: bash + run: pip install build + - name: Build wheel + shell: bash + run: python -m build . --wheel - name: Upload Build Artifacts - uses: actions/upload-artifact@v4.3.0 + uses: actions/upload-artifact@v4 with: - name: bitsandbytes-${{ matrix.os }}-${{ matrix.cuda-version }} + name: bdist_wheel-${{ matrix.os }}-${{ matrix.arch }} path: | ${{ github.workspace }}/dist/ diff --git a/environment-bnb.yml b/environment-bnb.yml index 92c7761bb..1214f7930 100644 --- a/environment-bnb.yml +++ b/environment-bnb.yml @@ -7,10 +7,10 @@ channels: dependencies: - python - - accelerate - - einops + #- accelerate + #- einops - scipy - - transformers + #- transformers - pytest - pytest-cases - ipython From 8c507d92c0950305d376b19137b5d8cccccea457 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Mon, 5 Feb 2024 12:34:13 +0100 Subject: [PATCH 037/112] Enable crate-ci/typos lint; fix typos (#1005) Co-authored-by: Titus von Koeller fix erroneous correction --- .pre-commit-config.yaml | 4 ++ _typos.toml | 11 +++++ .../switchback/make_plot_with_jsonl.py | 4 +- bitsandbytes/cuda_setup/main.py | 4 +- bitsandbytes/functional.py | 6 +-- csrc/kernels.cu | 18 ++++---- deploy.sh | 44 +++++++++---------- docs/source/contributing.mdx | 2 +- docs/source/integrations.mdx | 2 +- docs/source/optimizers.mdx | 4 +- include/Algo-Direct2.h | 2 +- include/Portable.h | 2 +- include/SIMD.h | 2 +- tests/test_modules.py | 9 +++- 14 files changed, 68 insertions(+), 46 deletions(-) create mode 100644 _typos.toml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index feb6c766e..edcbc9b6b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -17,3 +17,7 @@ repos: - id: mixed-line-ending args: - --fix=lf + - repo: https://github.com/crate-ci/typos + rev: v1.17.2 + hooks: + - id: typos diff --git a/_typos.toml b/_typos.toml new file mode 100644 index 000000000..a04206b8d --- /dev/null +++ b/_typos.toml @@ -0,0 +1,11 @@ +[files] + +[default.extend-identifiers] + +[type.py.extend-words] +"BA" = "BA" # used as a commented-out variable in tests + +[type.cuda.extend-words] +"subtile" = "subtile" +"subtiles" = "subtiles" +"transation" = "transation" # TODO: is this transition, transaction, translation..? 
diff --git a/benchmarking/switchback/make_plot_with_jsonl.py b/benchmarking/switchback/make_plot_with_jsonl.py index 177270346..b23f63562 100644 --- a/benchmarking/switchback/make_plot_with_jsonl.py +++ b/benchmarking/switchback/make_plot_with_jsonl.py @@ -36,8 +36,8 @@ ('x_quantize_rowwise', 'P', '--', 'C4', 'Quantize rowwise X (switchback)'), ('g_quantize_rowwise', 'P', '-.', 'C4', 'Quantize rowwise G (switchback)'), - ('w_quantize_global', '.', '--', 'C4', 'Quatnize global W (switchback)'), - ('w_quantize_global_transpose', '.', '-.', 'C4', 'Quantize gloabl and\ntranspose W (switchback)'), + ('w_quantize_global', '.', '--', 'C4', 'Quantize global W (switchback)'), + ('w_quantize_global_transpose', '.', '-.', 'C4', 'Quantize global and\ntranspose W (switchback)'), ]: xs = [] ys = [] diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py index a8792c1ad..4245a2842 100644 --- a/bitsandbytes/cuda_setup/main.py +++ b/bitsandbytes/cuda_setup/main.py @@ -4,7 +4,7 @@ [ ] TODO: Q - What if we have multiple GPUs of different makes? - CUDA version - Software: - - CPU-only: only CPU quantization functions (no optimizer, no matrix multipl) + - CPU-only: only CPU quantization functions (no optimizer, no matrix multiply) - CuBLAS-LT: full-build 8-bit optimizer - no CuBLAS-LT: no 8-bit matrix multiplication (`nomatmul`) @@ -263,7 +263,7 @@ def warn_in_case_of_duplicates(results_paths: Set[Path]) -> None: warning_msg = ( f"Found duplicate {CUDA_RUNTIME_LIBS} files: {results_paths}.. " "We select the PyTorch default libcudart.so, which is {torch.version.cuda}," - "but this might missmatch with the CUDA version that is needed for bitsandbytes." + "but this might mismatch with the CUDA version that is needed for bitsandbytes." "To override this behavior set the BNB_CUDA_VERSION= environmental variable" "For example, if you want to use the CUDA version 122" "BNB_CUDA_VERSION=122 python ..." diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index 11db74859..9fc5e08f0 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -120,7 +120,7 @@ def get_instance(cls): return cls._instance def prefetch_all(self, to_cpu=False): - # assume the first added, will be hte + # assume the first added, will be the # ones that are used first, so swap them in last # in the case they are evicted again for t in self.paged_tensors[::-1]: @@ -219,7 +219,7 @@ def elementwise_func(func_name, A, B, value, prefetch=True): # paged function are fully asynchronous # if we return from this function, we want to the tensor # to be in the correct state, that is the final state after the - # operation occured. So we synchronize. + # operation occurred. So we synchronize. 
torch.cuda.synchronize() def fill(A, value, device=None, prefetch=True): elementwise_func('fill', A, None, value) @@ -589,7 +589,7 @@ def estimate_quantiles(A: Tensor, out: Optional[torch.Tensor] = None, offset: fl class QuantState: - """container for quantization state components to work with Params4bit and similar clases""" + """container for quantization state components to work with Params4bit and similar classes""" valid_quant_types = ('fp4', 'nf4') valid_qs_type_keys = [f"bitsandbytes__{x}" for x in valid_quant_types] valid_qs_keys = ['absmax', 'quant_map', 'nested_absmax', 'nested_quant_map', 'quant_state', 'quant_type', diff --git a/csrc/kernels.cu b/csrc/kernels.cu index f117547ed..df8488389 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -134,10 +134,10 @@ __device__ unsigned char dQuantizeFP4(float x) // we do a binary search // the pivots are divided by 12 (the FP4 absmax) - // since we assum input data is in [-1.0, 1.0] + // since we assume input data is in [-1.0, 1.0] // !be careful here, its easy to make a mistake - // that is difficult to noice if you add an extra + // that is difficult to notice if you add an extra // zero somewhere! int sign = x < 0 ? 0b1000 : 0b0000; @@ -2259,8 +2259,8 @@ template__global__ void kd // data is in 32 column-tile major with tile width 32 columns and numRows rows // L1. Load sub-tile row/col statistics. Each thread only holds 1 col, load rows into shared memory. - // L2. Load data in warp-striped arangement (t0 holds colidx [0, 0, 0, 0], rowidx [0, 1, 2, 3]) + // L2. Load data in warp-striped arrangement (t0 holds colidx [0, 0, 0, 0], rowidx [0, 1, 2, 3]) // C1. Compute val(row_stat*col_stat)/(127*127) (load 1/(127*127 into register)) // C2. Compute normalization values and store col values in register // S1. Store C1 into 16-bit output @@ -2383,7 +2383,7 @@ template __global__ void kd if(valid_items <= 0) // the sub-tile might have more elements than the tile itself break; - // L2. Load data in warp-striped arangement (t0 holds colidx [0, 0, 0, 0], rowidx [0, 1, 2, 3]) + // L2. Load data in warp-striped arrangement (t0 holds colidx [0, 0, 0, 0], rowidx [0, 1, 2, 3]) LoadInt32(loadint32).Load(&(A[subtile_idx]), local_values, valid_items, 0); ExchangeInt32(exchangeint32).BlockedToWarpStriped(local_values, local_values); @@ -2650,7 +2650,7 @@ template __global__ void kExtractOutliers(char *A, int *idx, char * //// 4. do dequantization from register of B into second pair of registers //// 5. store (4) into fragment //// 6. matmul aggregate into fragment C -//// 7. aggreecate files of C into shared memroy block C +//// 7. aggreecate files of C into shared memory block C //// 8. sum (7) //// 9. write outputs to matmul output matrix //} diff --git a/deploy.sh b/deploy.sh index c261ee9a9..e60373627 100644 --- a/deploy.sh +++ b/deploy.sh @@ -5,7 +5,7 @@ echo "MAKE SURE LD_LIBRARY_PATH IS EMPTY!" echo $LD_LIBRARY_PATH if [[ ! -z "${LD_LIBRARY_PATH}" ]]; then - echo "Compilation unsuccessul!" 1>&2 + echo "Compilation unsuccessful!" 1>&2 exit 64 fi @@ -24,7 +24,7 @@ make cpuonly CUDA_VERSION="CPU" if [ ! -f "./bitsandbytes/libbitsandbytes_cpu.so" ]; then # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessul!" 1>&2 + echo "Compilation unsuccessful!" 1>&2 exit 64 fi @@ -34,7 +34,7 @@ make cuda110 CUDA_VERSION=110 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda110.so" ]; then # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessul!" 1>&2 + echo "Compilation unsuccessful!" 
1>&2 exit 64 fi @@ -44,7 +44,7 @@ make cuda11x CUDA_VERSION=111 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda111.so" ]; then # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessul!" 1>&2 + echo "Compilation unsuccessful!" 1>&2 exit 64 fi @@ -54,7 +54,7 @@ make cuda11x CUDA_VERSION=114 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda114.so" ]; then # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessul!" 1>&2 + echo "Compilation unsuccessful!" 1>&2 exit 64 fi @@ -64,7 +64,7 @@ make cuda11x CUDA_VERSION=115 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda115.so" ]; then # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessul!" 1>&2 + echo "Compilation unsuccessful!" 1>&2 exit 64 fi @@ -74,7 +74,7 @@ make cuda11x CUDA_VERSION=117 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda117.so" ]; then # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessul!" 1>&2 + echo "Compilation unsuccessful!" 1>&2 exit 64 fi @@ -84,7 +84,7 @@ make cuda118 CUDA_VERSION=118 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda118.so" ]; then # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessul!" 1>&2 + echo "Compilation unsuccessful!" 1>&2 exit 64 fi @@ -94,7 +94,7 @@ make cuda12x CUDA_VERSION=120 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda120.so" ]; then # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessul!" 1>&2 + echo "Compilation unsuccessful!" 1>&2 exit 64 fi @@ -104,7 +104,7 @@ make cuda12x CUDA_VERSION=121 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda121.so" ]; then # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessul!" 1>&2 + echo "Compilation unsuccessful!" 1>&2 exit 64 fi @@ -114,7 +114,7 @@ make cuda12x CUDA_VERSION=122 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda122.so" ]; then # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessul!" 1>&2 + echo "Compilation unsuccessful!" 1>&2 exit 64 fi @@ -124,7 +124,7 @@ make cuda12x CUDA_VERSION=123 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda123.so" ]; then # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessul!" 1>&2 + echo "Compilation unsuccessful!" 1>&2 exit 64 fi @@ -138,7 +138,7 @@ make cuda110_nomatmul CUDA_VERSION=110 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda110_nocublaslt.so" ]; then # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessul!" 1>&2 + echo "Compilation unsuccessful!" 1>&2 exit 64 fi @@ -149,7 +149,7 @@ make cuda11x_nomatmul CUDA_VERSION=111 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda111_nocublaslt.so" ]; then # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessul!" 1>&2 + echo "Compilation unsuccessful!" 1>&2 exit 64 fi @@ -159,7 +159,7 @@ make cuda11x_nomatmul CUDA_VERSION=114 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda114_nocublaslt.so" ]; then # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessul!" 1>&2 + echo "Compilation unsuccessful!" 1>&2 exit 64 fi @@ -169,7 +169,7 @@ make cuda11x_nomatmul CUDA_VERSION=115 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda115_nocublaslt.so" ]; then # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessul!" 1>&2 + echo "Compilation unsuccessful!" 1>&2 exit 64 fi @@ -179,7 +179,7 @@ make cuda11x_nomatmul CUDA_VERSION=117 if [ ! 
-f "./bitsandbytes/libbitsandbytes_cuda117_nocublaslt.so" ]; then # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessul!" 1>&2 + echo "Compilation unsuccessful!" 1>&2 exit 64 fi @@ -189,7 +189,7 @@ make cuda118_nomatmul CUDA_VERSION=118 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda118_nocublaslt.so" ]; then # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessul!" 1>&2 + echo "Compilation unsuccessful!" 1>&2 exit 64 fi @@ -199,7 +199,7 @@ make cuda12x_nomatmul CUDA_VERSION=120 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda120_nocublaslt.so" ]; then # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessul!" 1>&2 + echo "Compilation unsuccessful!" 1>&2 exit 64 fi @@ -209,7 +209,7 @@ make cuda12x_nomatmul CUDA_VERSION=121 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda121_nocublaslt.so" ]; then # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessul!" 1>&2 + echo "Compilation unsuccessful!" 1>&2 exit 64 fi @@ -219,7 +219,7 @@ make cuda12x_nomatmul CUDA_VERSION=122 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda122_nocublaslt.so" ]; then # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessul!" 1>&2 + echo "Compilation unsuccessful!" 1>&2 exit 64 fi @@ -229,7 +229,7 @@ make cuda12x_nomatmul CUDA_VERSION=123 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda123_nocublaslt.so" ]; then # Control will enter here if $DIRECTORY doesn't exist. - echo "Compilation unsuccessul!" 1>&2 + echo "Compilation unsuccessful!" 1>&2 exit 64 fi diff --git a/docs/source/contributing.mdx b/docs/source/contributing.mdx index b28e91936..b482364de 100644 --- a/docs/source/contributing.mdx +++ b/docs/source/contributing.mdx @@ -1,5 +1,5 @@ # Contributors guidelines -... stil under construction ... (feel free to propose materials, `bitsandbytes` is a community project) +... still under construction ... (feel free to propose materials, `bitsandbytes` is a community project) ## Setup pre-commit hooks - Install pre-commit hooks with `pip install pre-commit`. diff --git a/docs/source/integrations.mdx b/docs/source/integrations.mdx index 7857abf4c..7d47ede62 100644 --- a/docs/source/integrations.mdx +++ b/docs/source/integrations.mdx @@ -29,7 +29,7 @@ Please review the [bitsandbytes section in the Accelerate docs](https://huggingf # Trainer for the optimizers -You can use any of the 8-bit and/or paged optimizers by simple passing them to the `transformers.Trainer` class on intialization.All bnb optimizers are supported by passing the correct string in `TrainingArguments`'s `optim` attribute - e.g. (`paged_adamw_32bit`). +You can use any of the 8-bit and/or paged optimizers by simple passing them to the `transformers.Trainer` class on initialization.All bnb optimizers are supported by passing the correct string in `TrainingArguments`'s `optim` attribute - e.g. (`paged_adamw_32bit`). See the [official API docs for reference](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Trainer). diff --git a/docs/source/optimizers.mdx b/docs/source/optimizers.mdx index 18d20de1d..f74c89ae6 100644 --- a/docs/source/optimizers.mdx +++ b/docs/source/optimizers.mdx @@ -168,9 +168,9 @@ Possible options for the config override are: `betas, eps, weight_decay, lr, opt For overrides for particular layers, we recommend overriding locally in each module. 
You can do this by passing the module, the parameter, and its attribute name to the GlobalOptimManager: ```py class MyModule(torch.nn.Module): - def __init__(din, dout): + def __init__(d_in, d_out): super(MyModule, self).__init__() - self.linear = torch.nn.Linear(din, dout) + self.linear = torch.nn.Linear(d_in, d_out) # optimization will happen in 32-bit and # learning rate will be set to 0.0001 independent of the main learning rate config = {'optim_bits': 32, 'lr' : 0.0001} diff --git a/include/Algo-Direct2.h b/include/Algo-Direct2.h index d5fa58d12..4211c77bd 100644 --- a/include/Algo-Direct2.h +++ b/include/Algo-Direct2.h @@ -157,7 +157,7 @@ struct AlgoVecBase::val FVec vxp = _mm256_i32gather_ps(xi, idxp, sizeof(float)); IVec ip = idxm; -#else // do not use gather instrucions +#else // do not use gather instructions union U { __m256i vec; diff --git a/include/Portable.h b/include/Portable.h index 1710b0502..2cec1e7de 100644 --- a/include/Portable.h +++ b/include/Portable.h @@ -147,5 +147,5 @@ inline T prev(T x) return x; } -} // namepsace Details +} // namespace Details } // namespace BinSearch diff --git a/include/SIMD.h b/include/SIMD.h index d559e9f55..a2639d3ac 100644 --- a/include/SIMD.h +++ b/include/SIMD.h @@ -568,5 +568,5 @@ FORCE_INLINE FVec mulSub(const FVec& a, const FVec int8 automatically l1 = module(32, 64).cuda() From 332530ba0b16ea18b13f13f345e2d13ec4be04dd Mon Sep 17 00:00:00 2001 From: Rickard Date: Mon, 5 Feb 2024 12:42:26 +0100 Subject: [PATCH 038/112] quantize_block C->C++, use std::thread everywhere (#1024) --- csrc/common.cpp | 24 ++++++++------------ csrc/common.h | 2 +- csrc/cpu_ops.cpp | 59 +++++++++++++----------------------------------- 3 files changed, 27 insertions(+), 58 deletions(-) diff --git a/csrc/common.cpp b/csrc/common.cpp index 52f029917..0a9601689 100644 --- a/csrc/common.cpp +++ b/csrc/common.cpp @@ -1,39 +1,35 @@ #include #include -void *quantize_block(void *arguments) { +void quantize_block(const quantize_block_args& args) { // 1. find absmax in block // 2. divide input value by absmax to normalize into [-1.0, 1.0] // 3. do binary search to find the closest value // 4. check minimal distance // 5. store index - struct quantize_block_args *args = (quantize_block_args *) arguments; - // 1. find absmax in block float absmax_block = -FLT_MAX; - for (long long i = args->block_idx; i < args->block_end; i++) - absmax_block = fmax(absmax_block, fabs(args->A[i])); + for (long long i = args.block_idx; i < args.block_end; i++) + absmax_block = fmax(absmax_block, fabs(args.A[i])); - args->absmax[args->block_idx / args->blocksize] = absmax_block; + args.absmax[args.block_idx / args.blocksize] = absmax_block; - for (long long i = args->block_idx; i < args->block_end; i++) { + for (long long i = args.block_idx; i < args.block_end; i++) { // 2. divide input value by absmax to normalize into [-1.0, 1.0] // 3. do binary search to find the closest value - float normed_value = args->A[i] / absmax_block; - long long idx = args->bin_searcher->scalar(normed_value); + float normed_value = args.A[i] / absmax_block; + long long idx = args.bin_searcher->scalar(normed_value); // 4. 
check minimal distance // The binary search returns always the value to the left, which might not be the closest value if (idx < 255) { - float dist_left = fabs(normed_value - (args->code[idx])); - float dist_right = fabs(normed_value - (args->code[idx + 1])); + float dist_left = fabs(normed_value - (args.code[idx])); + float dist_right = fabs(normed_value - (args.code[idx + 1])); if (dist_right < dist_left) { idx += 1; } } // 5. store index - args->out[i] = (unsigned char) idx; + args.out[i] = (unsigned char) idx; } - - return NULL; } diff --git a/csrc/common.h b/csrc/common.h index c99034e78..e513f2875 100644 --- a/csrc/common.h +++ b/csrc/common.h @@ -20,6 +20,6 @@ struct quantize_block_args { }; -void *quantize_block(void *arguments); +void quantize_block(const quantize_block_args& args); #endif diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp index 4741aa6aa..e67135360 100644 --- a/csrc/cpu_ops.cpp +++ b/csrc/cpu_ops.cpp @@ -1,10 +1,6 @@ #include -#ifdef _WIN32 -#include -#else -#include -#endif #include +#include using namespace BinSearch; @@ -30,21 +26,13 @@ void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, long BinAlgo bin_searcher(code, elements_code); int thread_wave_size = 256; - // we chunk the thresds into waves of 256 since the max limit is + // we chunk the threads into waves of 256 since the max limit is // between 16k and 64k on Linux (we reach this when running BLOOM-176B with a large batch size) for(long long offset = 0; offset < num_blocks; offset+=thread_wave_size) { long long valid_chunks = num_blocks - offset >= thread_wave_size ? thread_wave_size : num_blocks - offset; -#ifdef _WIN32 - std::thread *threads = (std::thread *) malloc(sizeof(std::thread) * valid_chunks); -#else - pthread_t *threads = (pthread_t *) malloc(sizeof(pthread_t) * valid_chunks); -#endif - - struct quantize_block_args **args = (quantize_block_args **) malloc(valid_chunks * sizeof(quantize_block_args *)); - - for(long long i = 0; i < valid_chunks; i++) - args[i] = (quantize_block_args *) malloc(sizeof(quantize_block_args)); + std::vector threads(valid_chunks); + std::vector args(valid_chunks); int chunks_processed = 0; for(long long block_idx = offset*blocksize; block_idx < n; block_idx += blocksize) @@ -52,39 +40,24 @@ void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, long long long valid_items = n - block_idx >= blocksize ? 
blocksize : n - block_idx; long long block_end = block_idx + valid_items; - struct quantize_block_args *arg = args[chunks_processed]; - arg->bin_searcher = &bin_searcher; - arg->code = code; - arg->A = A; - arg->absmax = absmax; - arg->out = out; - arg->block_end = block_end; - arg->block_idx = block_idx; - arg->threadidx = block_idx / blocksize; - arg->blocksize = blocksize; - -#ifdef _WIN32 - new (&threads[chunks_processed]) std::thread(quantize_block, arg); -#else - pthread_create(&threads[chunks_processed], NULL, &quantize_block, (void *) arg); -#endif + struct quantize_block_args& arg = args[chunks_processed]; + arg.bin_searcher = &bin_searcher; + arg.code = code; + arg.A = A; + arg.absmax = absmax; + arg.out = out; + arg.block_end = block_end; + arg.block_idx = block_idx; + arg.threadidx = block_idx / blocksize; + arg.blocksize = blocksize; + + threads[chunks_processed] = std::thread([arg] { quantize_block(arg); }); chunks_processed += 1; if(chunks_processed == valid_chunks){ break; } } for (int i = 0; i < valid_chunks; i++) - { -#ifdef _WIN32 threads[i].join(); -#else - int err = pthread_join(threads[i], NULL); -#endif - } - free(threads); - for (int i = 0; i < valid_chunks; i++) - free(args[i]); - free(args); - } } From 73d3e7b61307a7a8c05a8bab1be7a54d4ebd0156 Mon Sep 17 00:00:00 2001 From: Rickard Date: Mon, 5 Feb 2024 19:25:09 +0100 Subject: [PATCH 039/112] Make native code portable and add GitHub workflow for building (#949) * Make native code portable and add GitHub workflow for building * Removed deprecated Python versions * Update python-package.yml Co-authored-by: Aarni Koskela * Update python-package.yml Co-authored-by: Aarni Koskela * Update python-package.yml Co-authored-by: Aarni Koskela * Update python-package.yml Co-authored-by: Aarni Koskela * Update python-package.yml Co-authored-by: Aarni Koskela * Update python-package.yml Co-authored-by: Aarni Koskela * Update python-package.yml Co-authored-by: Aarni Koskela * Update python-package.yml * Do not test on Python 3.13 until released * Update python-package.yml * Update python-package.yml * Update python-package.yml * Update python-package.yml * Refactor build stage * Fixed breaking actions change * Slim down Windows cuda * Create dependabot.yml * Bespoke local dev requirements.txt * Enable VS integration * Group Dependabot updates * Cleanup * Update python-package.yml * Reinstate file that was wrongly merged * Fixed regression caused by new version of download-artifact * Update python-package.yml * Update python-package.yml * Fix matrix * Update python-package.yml * Merge * Pipeline * Fixed conflict * Fixed conflict * Update CMakeLists.txt * Fixed merge error * cleanup * cleanup * Find CUDA * Fix * Fixing merge error from latest merge from main * Fix setup.py * Fixed typo in artifact name * Remove linker flags * Build nocublaslt versions * Fixed formatting * Fixed VS Code format on save * Ran format on save from VScode * Re-saved the json files using the new settings * Re-saved CMakeLists.txt to get formatting right * Add path filter * Formatting --------- Co-authored-by: Aarni Koskela --- .github/dependabot.yml | 11 + .github/workflows/cmake.yml | 271 ------------------ .github/workflows/python-package.yml | 227 +++++++++++++++ CMakeLists.txt | 121 ++++++-- Makefile | 141 --------- csrc/mps_kernels.metal | 117 ++++++++ csrc/mps_ops.h | 0 csrc/mps_ops.mm | 67 +++++ ...{pythonInterface.c => pythonInterface.cpp} | 4 + include/Algo-Direct-Common.h | 2 +- include/Algo-Direct2.h | 2 + include/Portable.h | 33 ++- 
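The `quantize_cpu` loop in the previous patch launches one worker per block but caps each wave at 256 threads and joins the whole wave before starting the next, which keeps the process well below the OS thread limits mentioned in its comment. A minimal Python sketch of that wave pattern follows; `process_block` is an assumed placeholder standing in for the real per-block quantization.

```py
# Sketch of the wave pattern: launch at most wave_size threads,
# join the whole wave, then move on to the next wave of blocks.
import threading

def process_block(block_idx):
    # placeholder for the real per-block work (quantizing one block)
    pass

def run_in_waves(num_blocks, wave_size=256):
    for offset in range(0, num_blocks, wave_size):
        wave = [
            threading.Thread(target=process_block, args=(block_idx,))
            for block_idx in range(offset, min(offset + wave_size, num_blocks))
        ]
        for t in wave:
            t.start()
        for t in wave:
            t.join()

run_in_waves(num_blocks=1000)
```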
include/SIMD.h | 77 +++-- include/Type.h | 2 +- pyproject.toml | 5 +- requirements-ci.txt | 7 + requirements-dev.txt | 9 + requirements.txt | 3 - setup.py | 22 +- 19 files changed, 629 insertions(+), 492 deletions(-) create mode 100644 .github/dependabot.yml delete mode 100644 .github/workflows/cmake.yml create mode 100644 .github/workflows/python-package.yml delete mode 100644 Makefile create mode 100644 csrc/mps_kernels.metal create mode 100644 csrc/mps_ops.h create mode 100644 csrc/mps_ops.mm rename csrc/{pythonInterface.c => pythonInterface.cpp} (99%) create mode 100644 requirements-ci.txt create mode 100644 requirements-dev.txt delete mode 100644 requirements.txt diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 000000000..8a36c3689 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,11 @@ +version: 2 +updates: + - package-ecosystem: pip + directory: "/" + schedule: + interval: "weekly" + groups: + major: + update-types: [major] + minor-patch: + update-types: [minor, patch] diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml deleted file mode 100644 index 06f08eb9d..000000000 --- a/.github/workflows/cmake.yml +++ /dev/null @@ -1,271 +0,0 @@ -name: CMake on multiple platforms - -on: - push: - branches: [ "main" ] - pull_request: - branches: [ "main" ] - -concurrency: - group: cmake-${{ github.ref }} - cancel-in-progress: true - -jobs: - build-shared-libs: - runs-on: ${{ matrix.os }} - - strategy: - # Set fail-fast to false to ensure that feedback is delivered for all matrix combinations. Consider changing this to true when your workflow is stable. - fail-fast: false - - matrix: - os: [ubuntu-latest, windows-latest] - arch: [x86_64, aarch64] - build_type: [Release] - exclude: - - os: windows-latest - arch: aarch64 - - steps: - - uses: actions/checkout@v4 - - - name: Set up MSVC - if: matrix.os == 'windows-latest' - uses: ilammy/msvc-dev-cmd@v1.13.0 - with: - arch: amd64 - - - name: Set reusable strings - # Turn repeated input strings (such as the build output directory) into step outputs. These step outputs can be used throughout the workflow file. 
- id: strings - shell: bash - run: | - echo "build-output-dir=${{ github.workspace }}/build" >> "$GITHUB_OUTPUT" - - - name: Prep build - run: python3 -m pip install cmake==3.27.9 ninja setuptools wheel - - - name: Prep Compilers - shell: bash -el {0} - run: | - if [ "${{ matrix.os }}" = "windows-latest" ]; then - echo CXX_COMPILER=cl >> "$GITHUB_ENV" - echo C_COMPILER=cl >> "$GITHUB_ENV" - else - echo CXX_COMPILER=g++ >> "$GITHUB_ENV" - echo C_COMPILER=gcc >> "$GITHUB_ENV" - fi - - - - name: Configure CPU - run: > - cmake -B ${{ steps.strings.outputs.build-output-dir }} - -G Ninja - -DCMAKE_CXX_COMPILER=${{ env.CXX_COMPILER }} - -DCMAKE_C_COMPILER=${{ env.C_COMPILER }} - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} - -DBUILD_CUDA=OFF - -S ${{ github.workspace }} - - - name: Build CPU - run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }} - - - name: Copy libraries - shell: bash - run: | - mkdir -p output/${{ matrix.os }}/${{ matrix.arch }} - ( shopt -s nullglob && cp -a bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }} ) - - - - name: Upload Build Artifacts - uses: actions/upload-artifact@v4 - with: - name: shared_library-${{ matrix.os }}-${{ matrix.arch }} - path: output/* - - - build-shared-libs-cuda: - runs-on: ${{ matrix.os }} - - strategy: - # Set fail-fast to false to ensure that feedback is delivered for all matrix combinations. Consider changing this to true when your workflow is stable. - fail-fast: false - - matrix: - os: [ubuntu-latest, windows-latest] - cuda-version: ['11.8', '12.1'] - arch: [x86_64, aarch64] - build_type: [Release] - exclude: - - os: windows-latest - arch: aarch64 - - steps: - - uses: actions/checkout@v4 - - name: Set up Python 3.10 - uses: actions/setup-python@v5 - with: - python-version: "3.10" - - - name: Set up MSVC - if: matrix.os == 'windows-latest' - uses: ilammy/msvc-dev-cmd@v1.13.0 - with: - arch: amd64 - - - name: Setup Mambaforge - uses: conda-incubator/setup-miniconda@v3.0.1 - with: - miniforge-variant: Mambaforge - miniforge-version: latest - activate-environment: bnb-env - use-mamba: true - - - uses: conda-incubator/setup-miniconda@v3.0.1 - with: - auto-update-conda: true - activate-environment: bnb-env - environment-file: environment-bnb.yml - use-only-tar-bz2: false - auto-activate-base: true - python-version: "3.10" - mamba-version: "*" - - - name: Set reusable strings - # Turn repeated input strings (such as the build output directory) into step outputs. These step outputs can be used throughout the workflow file. 
- id: strings - shell: bash - run: | - echo "build-output-dir=${{ github.workspace }}/build" >> "$GITHUB_OUTPUT" - - - name: CUDA Toolkit - shell: bash -el {0} - run: | - if [ "${{ matrix.os }}" = "ubuntu-latest" ]; then - # to prepare space - sudo rm -rf /usr/share/dotnet - sudo rm -rf /opt/ghc - sudo rm -rf /usr/local/share/boost - fi - addon="" - cuda_version=${{ matrix.cuda-version }} - [ "$cuda_version" = "12.1" ] && [ "${{ matrix.os }}" = "ubuntu-latest" ] && addon="cuda-cudart-static cuda-nvrtc" - [ "$cuda_version" = "12.1" ] && [ "${{ matrix.os }}" = "windows-latest" ] && addon="cuda-nvrtc" - [ "$cuda_version" = "11.8" ] && cuda_version="11.8.0" - [ "$cuda_version" = "12.1" ] && cuda_version="12.1.1" - - conda install pytorch-cuda=${{ matrix.cuda-version }} -c pytorch # it's dependency not correctly resolved sometime - conda install cuda-python=${{ matrix.cuda-version }} cuda-libraries-dev cuda-nvcc cuda-nvtx cuda-cupti cuda-cudart cuda-cudart-dev cuda-runtime cuda-libraries $addon -c "nvidia/label/cuda-$cuda_version" - - [ "${{ matrix.os }}" = "windows-latest" ] && conda install "clang>=17.0.6" "clangxx>=17.0.6" -c conda-forge - - CUDA_HOME="${{ env.CONDA }}/envs/bnb-env" - echo CUDA_HOME=$CUDA_HOME >> "$GITHUB_ENV" - echo CUDA_PATH=$CUDA_HOME >> "$GITHUB_ENV" - - if [ "${{ matrix.os }}" = "windows-latest" ]; then - echo CXX_COMPILER=cl >> "$GITHUB_ENV" - echo C_COMPILER=cl >> "$GITHUB_ENV" - # without -DCMAKE_CUDA_COMPILER=nvcc, cmake config always fail for cuda-11.8 - echo DCMAKE_CUDA_COMPILER=-DCMAKE_CUDA_COMPILER=nvcc >> "$GITHUB_ENV" - else - echo CXX_COMPILER=g++ >> "$GITHUB_ENV" - echo C_COMPILER=gcc >> "$GITHUB_ENV" - fi - - nvcc --version - - - name: Update environment - run: mamba env update -n bnb-env -f environment-bnb.yml - - - name: Prep build - run: python -m pip install cmake==3.27.9 ninja setuptools wheel - - # TODO: the following steps (CUDA, NOBLASLT, CPU) could be moved to the matrix, so they're built in parallel - - - name: Configure CUDA - run: > - cmake -B ${{ steps.strings.outputs.build-output-dir }} - -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }} - -DCMAKE_CXX_COMPILER=${{ env.CXX_COMPILER }} - -DCMAKE_C_COMPILER=${{ env.C_COMPILER }} - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} - -DCOMPUTE_CAPABILITY="50;52;60;61;62;70;72;75;80;86;87;89;90" - -S ${{ github.workspace }} - - - name: Build CUDA - run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }} - - - name: Configure NOBLASLT - run: > - cmake -B ${{ steps.strings.outputs.build-output-dir }} - -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }} - -DCMAKE_CXX_COMPILER=${{ env.CXX_COMPILER }} - -DCMAKE_C_COMPILER=${{ env.C_COMPILER }} - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} - -DCOMPUTE_CAPABILITY="50;52;60;61;62;70;72;75;80;86;87;89;90" - -DNO_CUBLASLT=ON - -S ${{ github.workspace }} - - - name: Build NOBLASLT - run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }} - - - name: Copy libraries - shell: bash - run: | - mkdir -p output/${{ matrix.os }}/${{ matrix.arch }} - ( shopt -s nullglob && cp -a bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }} ) - - - - name: Upload Build Artifacts - uses: actions/upload-artifact@v4 - with: - name: shared_library_cuda-${{ matrix.os }}-${{ matrix.cuda-version }}-${{ matrix.arch }} - path: output/* - - - build-wheels: - needs: - - build-shared-libs - - build-shared-libs-cuda - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [ubuntu-latest, windows-latest] - arch: 
[x86_64, aarch64] - exclude: - - os: windows-latest - arch: aarch64 - - steps: - # Check out code - - uses: actions/checkout@v4 - # Download shared libraries - - name: Download build artifact - uses: actions/download-artifact@v4 - with: - merge-multiple: true - path: output/ - - name: Copy correct platform shared libraries - shell: bash - run: | - cp output/${{ matrix.os }}/${{ matrix.arch }}/* bitsandbytes/ - # Set up the Python version needed - - name: Set up Python 3.10 - uses: actions/setup-python@v5 - with: - python-version: "3.10" - cache: pip - - - name: Install build package - shell: bash - run: pip install build - - name: Build wheel - shell: bash - run: python -m build . --wheel - - name: Upload Build Artifacts - uses: actions/upload-artifact@v4 - with: - name: bdist_wheel-${{ matrix.os }}-${{ matrix.arch }} - path: | - ${{ github.workspace }}/dist/ diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml new file mode 100644 index 000000000..265128637 --- /dev/null +++ b/.github/workflows/python-package.yml @@ -0,0 +1,227 @@ +name: Python package + +on: + push: {} + pull_request: + branches: [ main ] + paths: + - '.github/workflows/python-package.yml' + - 'bitsandbytes/**' + - 'csrc/**' + - 'include/**' + - 'tests/**' + - 'CMakeLists.txt' + - 'requirements*.txt' + - 'setup.py' + - 'pyproject.toml' + - 'pytest.ini' + - '**/*.md' + release: + types: [ published ] + +jobs: + + ## + # This job matrix builds the non-CUDA versions of the libraries for all supported platforms. + ## + build-shared-libs: + strategy: + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + arch: [x86_64, aarch64] + exclude: + - os: windows-latest # This probably requires arm64 Windows agents + arch: aarch64 + runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents + steps: + # Check out code + - uses: actions/checkout@v4 + # On Linux we use CMake within Docker + - name: Setup cmake + uses: jwlawson/actions-setup-cmake@v1.14 + with: + cmake-version: '3.26.x' + - name: Add msbuild to PATH + uses: microsoft/setup-msbuild@v1.1 + if: ${{ startsWith(matrix.os, 'windows') }} + # Check out dependencies code + - uses: actions/checkout@v4 + name: Check out NVidia cub + with: + repository: nvidia/cub + ref: 1.11.0 + path: dependencies/cub + # Compile C++ code + - name: Build C++ + shell: bash + run: | + set -ex + build_os=${{ matrix.os }} + build_arch=${{ matrix.arch }} + if [ ${build_os:0:6} == ubuntu -a ${build_arch} == aarch64 ]; then + # Allow cross-compile om aarch64 + sudo apt-get install -y gcc-aarch64-linux-gnu binutils-aarch64-linux-gnu + fi + if [ ${build_os:0:5} == macos -a ${build_arch} == aarch64 ]; then + cmake -DCMAKE_OSX_ARCHITECTURES=arm64 -DCOMPUTE_BACKEND=cpu . + else + cmake -DCOMPUTE_BACKEND=cpu . 
+ fi + if [ ${build_os:0:7} == windows ]; then + pwsh -Command "msbuild bitsandbytes.vcxproj /property:Configuration=Release" + else + make + fi + mkdir -p output/${{ matrix.os }}/${{ matrix.arch }} + ( shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }}/ ) + - name: Upload build artifact + uses: actions/upload-artifact@v4 + with: + name: shared_library_${{ matrix.os }}_${{ matrix.arch }} + path: output/* + retention-days: 7 + ## + # This job matrix builds the CUDA versions of the libraries for platforms that support CUDA (Linux x64/aarch64 + Windows x64) + ## + build-shared-libs-cuda: + strategy: + matrix: + os: [ubuntu-latest, windows-latest] + arch: [x86_64, aarch64] + cuda_version: ['12.1.0'] + exclude: + - os: windows-latest # This probably requires arm64 Windows agents + arch: aarch64 + runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents + steps: + # Check out code + - uses: actions/checkout@v4 + # Linux: We use Docker to build cross platform Cuda (aarch64 is built in emulation) + - name: Set up Docker multiarch + if: startsWith(matrix.os, 'ubuntu') + uses: docker/setup-qemu-action@v2 + # On Linux we use CMake within Docker + - name: Setup cmake + if: ${{ !startsWith(matrix.os, 'linux') }} + uses: jwlawson/actions-setup-cmake@v1.14 + with: + cmake-version: '3.26.x' + # Windows: We install Cuda on the agent (slow) + - uses: Jimver/cuda-toolkit@v0.2.14 + if: startsWith(matrix.os, 'windows') + id: cuda-toolkit + with: + cuda: ${{ matrix.cuda_version }} + method: 'local' + # sub-packages: '["nvcc","cudart","nvrtc_dev","cublas_dev","cusparse_dev","visual_studio_integration"]' + - name: Add msbuild to PATH + uses: microsoft/setup-msbuild@v1.1 + if: ${{ startsWith(matrix.os, 'windows') }} + # Check out dependencies code + - uses: actions/checkout@v4 + name: Check out NVidia cub + with: + repository: nvidia/cub + ref: 1.11.0 + path: dependencies/cub + # Compile C++ code + - name: Build C++ + shell: bash + run: | + set -ex + build_os=${{ matrix.os }} + build_arch=${{ matrix.arch }} + for NO_CUBLASLT in ON OFF; do + if [ ${build_os:0:6} == ubuntu ]; then + image=nvidia/cuda:${{ matrix.cuda_version }}-devel-ubuntu22.04 + echo "Using image $image" + docker run --platform linux/$build_arch -i -w /src -v $PWD:/src $image sh -c \ + "apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \ + && cmake -DCOMPUTE_BACKEND=cuda -DNO_CUBLASLT=${NO_CUBLASLT} . \ + && make" + else + cmake -DCOMPUTE_BACKEND=cuda -DNO_CUBLASLT=${NO_CUBLASLT} . 
+ pwsh -Command "msbuild bitsandbytes.vcxproj /property:Configuration=Release" + fi + done + mkdir -p output/${{ matrix.os }}/${{ matrix.arch }} + ( shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }}/ ) + - name: Upload build artifact + uses: actions/upload-artifact@v4 + with: + name: shared_library_cuda_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.cuda_version }} + path: output/* + retention-days: 7 + build-wheels: + needs: + - build-shared-libs + - build-shared-libs-cuda + strategy: + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + python-version: ["3.9", "3.10", "3.11", "3.12"] + arch: [x86_64, aarch64] + exclude: + - os: windows-latest # This probably requires arm64 Windows agents + arch: aarch64 + runs-on: ${{ matrix.os }} + steps: + # Check out code + - uses: actions/checkout@v4 + # Download shared libraries + - name: Download build artifact + uses: actions/download-artifact@v4 + with: + merge-multiple: true + pattern: "shared_library*_${{ matrix.os }}_${{ matrix.arch }}*" + path: output/ + - name: Copy correct platform shared library + shell: bash + run: | + ls -lR output/ + cp output/${{ matrix.os }}/${{ matrix.arch }}/* bitsandbytes/ + # Set up the Python version needed + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: pip + - name: Install build package + shell: bash + run: pip install build + - name: Install Python test dependencies + shell: bash + run: pip install -r requirements-ci.txt + # TODO: How to run CUDA tests on GitHub actions? + #- name: Run unit tests + # if: ${{ matrix.arch == 'x86_64' }} # Tests are too slow to run in emulation. Wait for real aarch64 agents + # run: | + # PYTHONPATH=. pytest --log-cli-level=DEBUG tests + - name: Build wheel + shell: bash + run: python -m build . + - name: Upload build artifact + uses: actions/upload-artifact@v4 + with: + name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.python-version }} + path: dist/bitsandbytes-*.whl + retention-days: 7 + publish: + needs: build-wheels + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Download build artifact + uses: actions/download-artifact@v4 + with: + path: dist/ + merge-multiple: true + pattern: "bdist_wheel_*" + - run: | + ls -lR dist/ + - name: Publish to PyPi + if: startsWith(github.ref, 'refs/tags') + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.pypi }} diff --git a/CMakeLists.txt b/CMakeLists.txt index 140753af4..4a4090bb7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,8 +2,8 @@ # Ensure the CUDA Toolkit is available on your path. Then run: # For GCC: `cmake -B build . && cmake --build build` # For MSVC: `cmake -B build . && cmake --build build --config Release` -# You can also use the following options -# - BUILD_CUDA: Default ON, will build with CUDA +# You can also use the following options and variables +# - COMPUTE_BACKEND: Set to `cpu`, `cuda`, or `mps` to select the backend # - NO_CUBLASLT: Default OFF, will skip building/linking CUBLASLT support # - CUDA_VERSION: The expected CUDA version, for sanity checking. The actual version # is whatever CMake finds on your path. @@ -11,25 +11,53 @@ # Separate by semicolons, i.e. 
`-DCOMPUTE_CAPABILITY=89;90` # Check your compute capability here: https://developer.nvidia.com/cuda-gpus # - PTXAS_VERBOSE: Pass the `-v` option to the PTX Assembler -cmake_minimum_required(VERSION 3.18) +cmake_minimum_required(VERSION 3.22.1) -project(bitsandbytes LANGUAGES C CXX) +project(bitsandbytes LANGUAGES CXX) -option(BUILD_CUDA "Build bitsandbytes with CUDA support" ON) -option(NO_CUBLASLT "Disable CUBLAS" OFF) -option(PTXAS_VERBOSE "Pass through -v flag to PTX Assembler" OFF) - -set(CPP_FILES csrc/common.cpp csrc/cpu_ops.cpp csrc/pythonInterface.c) -list(APPEND CUDA_FILES csrc/ops.cu csrc/kernels.cu) +# Define included source files +set(CPP_FILES csrc/common.cpp csrc/cpu_ops.cpp csrc/pythonInterface.cpp) +set(CUDA_FILES csrc/ops.cu csrc/kernels.cu) +set(MPS_FILES csrc/mps_ops.mm) +set(METAL_FILES csrc/mps_kernels.metal) +# C++ sources are always included list(APPEND SRC_FILES ${CPP_FILES}) -message(STATUS "BUILD_CUDA := ${BUILD_CUDA}") -message(STATUS "NO_CUBLASLT := ${NO_CUBLASLT}") +set(COMPUTE_BACKEND "cpu" CACHE STRING "The compute backend to use (cpu, cuda, mps)") +set_property(CACHE COMPUTE_BACKEND PROPERTY STRINGS cpu cuda mps) +option(PTXAS_VERBOSE "Pass through -v flag to PTX Assembler" OFF) + +if(APPLE) + set(CMAKE_OSX_DEPLOYMENT_TARGET 13.1) +endif() set(BNB_OUTPUT_NAME "bitsandbytes") +message(STATUS "Building with backend ${COMPUTE_BACKEND}") + +if(${COMPUTE_BACKEND} STREQUAL "cuda") + if(APPLE) + message(FATAL_ERROR "CUDA is not supported on macOS" ) + endif() + option(NO_CUBLASLT "Disable CUBLAS" OFF) + set(BUILD_CUDA ON) + set(BUILD_MPS OFF) + message(STATUS "NO_CUBLASLT := ${NO_CUBLASLT}") +elseif(${COMPUTE_BACKEND} STREQUAL "mps") + if(NOT APPLE) + message(FATAL_ERROR "MPS is only supported on macOS" ) + endif() + set(BUILD_CUDA OFF) + set(BUILD_MPS ON) +else() + set(BUILD_CUDA OFF) + set(BUILD_MPS OFF) +endif() + + if(BUILD_CUDA) enable_language(CUDA) # This will fail if CUDA is not found + find_package(CUDAToolkit REQUIRED) # Convert the CUDA version from X.Y.z to XY. There's probably a shorter way of doing this string(REGEX MATCH "^[0-9]+.[0-9]+" _CUDA_VERSION_FIRST_TWO "${CMAKE_CUDA_COMPILER_VERSION}") @@ -87,28 +115,56 @@ if(BUILD_CUDA) if(NO_CUBLASLT) string(APPEND BNB_OUTPUT_NAME "_nocublaslt") endif() -else() - message(STATUS "Building CPU Only") - string(APPEND BNB_OUTPUT_NAME "_cpu") - if(NO_CUBLASLT) - message(WARNING "We're building in CPU only mode but NO_CUBLASLT is enabled. 
It will have no effect.") + add_compile_definitions(BUILD_CUDA) +elseif(BUILD_MPS) + if(NOT APPLE) + message(FATAL_ERROR "MPS is only supported on macOS" ) endif() + + enable_language(OBJCXX) + + list(APPEND SRC_FILES ${MPS_FILES}) + + string(APPEND BNB_OUTPUT_NAME "_mps") + add_compile_definitions(BUILD_MPS) + file(MAKE_DIRECTORY "build") + add_custom_command(OUTPUT "bitsandbytes/bitsandbytes.metallib" + COMMAND xcrun metal -c -o "build/bitsandbytes.air" ${METAL_FILES} + COMMAND xcrun metallib "build/bitsandbytes.air" -o "bitsandbytes/bitsandbytes.metallib" + DEPENDS "${METAL_FILES}" + COMMENT "Compiling Metal kernels" + VERBATIM) + add_custom_target(metallib DEPENDS "bitsandbytes/bitsandbytes.metallib") +else() + set(LIBSUFFIX "cpu") + set(GPU_SOURCES) +endif() + + +if(WIN32) + # Export all symbols + set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) +endif() + +# Weird MSVC hacks +if(MSVC) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2 /fp:fast") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2 /fp:fast") endif() set_source_files_properties(${CPP_FILES} PROPERTIES LANGUAGE CXX) add_library(bitsandbytes SHARED ${SRC_FILES}) -include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) -target_include_directories(bitsandbytes PUBLIC csrc include) target_compile_features(bitsandbytes PUBLIC cxx_std_14) +target_include_directories(bitsandbytes PUBLIC csrc include) if(BUILD_CUDA) - target_compile_definitions(bitsandbytes PUBLIC BUILD_CUDA) - target_link_libraries(bitsandbytes PUBLIC cudart cublas cusparse) + target_include_directories(bitsandbytes PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + target_link_libraries(bitsandbytes PUBLIC CUDA::cudart CUDA::cublas CUDA::cusparse) if(NO_CUBLASLT) target_compile_definitions(bitsandbytes PUBLIC NO_CUBLASLT) else() - target_link_libraries(bitsandbytes PUBLIC cublasLt) + target_link_libraries(bitsandbytes PUBLIC CUDA::cublasLt) endif() set_target_properties(bitsandbytes @@ -116,17 +172,20 @@ if(BUILD_CUDA) CUDA_SEPARABLE_COMPILATION ON ) endif() +if(BUILD_MPS) + add_dependencies(bitsandbytes metallib) + target_link_libraries(bitsandbytes objc "-framework Foundation" "-framework Metal" "-framework MetalPerformanceShaders" "-framework MetalPerformanceShadersGraph") +endif() if(WIN32) set_target_properties(bitsandbytes PROPERTIES PREFIX "lib") endif() +set_target_properties(bitsandbytes PROPERTIES OUTPUT_NAME ${BNB_OUTPUT_NAME}) +if(MSVC) + set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_RELEASE bitsandbytes) + set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_DEBUG bitsandbytes) + set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE bitsandbytes) + set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG bitsandbytes) +endif() -set_target_properties(bitsandbytes - PROPERTIES - OUTPUT_NAME ${BNB_OUTPUT_NAME} - # We have to use a generator expression to prevent MSVC Debug/Release subdirs being made - RUNTIME_OUTPUT_DIRECTORY "$<1:${CMAKE_SOURCE_DIR}/bitsandbytes>" - LIBRARY_OUTPUT_DIRECTORY "$<1:${CMAKE_SOURCE_DIR}/bitsandbytes>" - POSITION_INDEPENDENT_CODE ON # The `-fPIC` commands for non-windows compilers - WINDOWS_EXPORT_ALL_SYMBOLS ON # On Windows, export all c methods as DLL exports -) +set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY bitsandbytes) diff --git a/Makefile b/Makefile deleted file mode 100644 index e16d24624..000000000 --- a/Makefile +++ /dev/null @@ -1,141 +0,0 @@ -MKFILE_PATH := $(abspath $(lastword 
$(MAKEFILE_LIST))) -ROOT_DIR := $(patsubst %/,%,$(dir $(MKFILE_PATH))) - -GPP:= /usr/bin/g++ -#GPP:= /sw/gcc/11.2.0/bin/g++ -ifeq ($(CUDA_HOME),) - CUDA_HOME:= $(shell which nvcc | rev | cut -d'/' -f3- | rev) -endif - -ifndef CUDA_VERSION -ifneq ($(MAKECMDGOALS),clean) -$(warning WARNING: CUDA_VERSION not set. Call make with CUDA string, for example: make cuda11x CUDA_VERSION=115 or make cpuonly CUDA_VERSION=CPU) -CUDA_VERSION:= -endif -endif - - - -NVCC := $(CUDA_HOME)/bin/nvcc - -########################################### - -CSRC := $(ROOT_DIR)/csrc -BUILD_DIR:= $(ROOT_DIR)/build - -FILES_CUDA := $(CSRC)/ops.cu $(CSRC)/kernels.cu -FILES_CPP := $(CSRC)/common.cpp $(CSRC)/cpu_ops.cpp $(CSRC)/pythonInterface.c - -INCLUDE := -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(CONDA_PREFIX)/include -I $(ROOT_DIR)/include -LIB := -L $(CUDA_HOME)/lib64 -lcudart -lcublas -lcublasLt -lcusparse -L $(CONDA_PREFIX)/lib - -# NVIDIA NVCC compilation flags -COMPUTE_CAPABILITY += -gencode arch=compute_50,code=sm_50 # Maxwell -COMPUTE_CAPABILITY += -gencode arch=compute_52,code=sm_52 # Maxwell -COMPUTE_CAPABILITY += -gencode arch=compute_60,code=sm_60 # Pascal -COMPUTE_CAPABILITY += -gencode arch=compute_61,code=sm_61 # Pascal -COMPUTE_CAPABILITY += -gencode arch=compute_70,code=sm_70 # Volta - -CC_KEPLER := -gencode arch=compute_35,code=sm_35 # Kepler -CC_KEPLER += -gencode arch=compute_37,code=sm_37 # Kepler - -# Later versions of CUDA support the new architectures -CC_CUDA11x := -gencode arch=compute_75,code=sm_75 -CC_CUDA11x += -gencode arch=compute_80,code=sm_80 -CC_CUDA11x += -gencode arch=compute_86,code=sm_86 - - -CC_cublasLt110 := -gencode arch=compute_75,code=sm_75 -CC_cublasLt110 += -gencode arch=compute_80,code=sm_80 - -CC_cublasLt111 := -gencode arch=compute_75,code=sm_75 -CC_cublasLt111 += -gencode arch=compute_80,code=sm_80 -CC_cublasLt111 += -gencode arch=compute_86,code=sm_86 - -CC_ADA_HOPPER := -gencode arch=compute_89,code=sm_89 -CC_ADA_HOPPER += -gencode arch=compute_90,code=sm_90 - - -all: $(BUILD_DIR) env - $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) - $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) - -cuda110_nomatmul_kepler: $(BUILD_DIR) env - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) $(CC_KEPLER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) $(CC_KEPLER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) - -cuda11x_nomatmul_kepler: $(BUILD_DIR) env - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_KEPLER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_KEPLER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o 
$(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) - - -cuda110_nomatmul: $(BUILD_DIR) env - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) - -cuda11x_nomatmul: $(BUILD_DIR) env - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) - -cuda118_nomatmul: $(BUILD_DIR) env - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) - -cuda12x_nomatmul: $(BUILD_DIR) env - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) - -cuda110: $(BUILD_DIR) env - $(NVCC) $(CC_cublasLt110) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) - $(NVCC) $(CC_cublasLt110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) - -cuda11x: $(BUILD_DIR) env - $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) - $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) - -cuda118: $(BUILD_DIR) env - $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) 
$(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) - $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) - -cuda12x: $(BUILD_DIR) env - $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) - $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++20 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) - -cpuonly: $(BUILD_DIR) env - $(GPP) -std=c++14 -shared -fPIC -I $(ROOT_DIR)/csrc -I $(ROOT_DIR)/include $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cpu.so - -env: - @echo "ENVIRONMENT" - @echo "============================" - @echo "CUDA_VERSION: $(CUDA_VERSION)" - @echo "============================" - @echo "NVCC path: $(NVCC)" - @echo "GPP path: $(GPP) VERSION: `$(GPP) --version | head -n 1`" - @echo "CUDA_HOME: $(CUDA_HOME)" - @echo "CONDA_PREFIX: $(CONDA_PREFIX)" - @echo "PATH: $(PATH)" - @echo "LD_LIBRARY_PATH: $(LD_LIBRARY_PATH)" - @echo "============================" - -$(BUILD_DIR): - mkdir -p build - mkdir -p dependencies - -$(ROOT_DIR)/dependencies/cub: - git clone https://github.com/NVlabs/cub $(ROOT_DIR)/dependencies/cub - cd dependencies/cub; git checkout 1.11.0 - -clean: - rm -rf build/* *.egg* - rm -f bitsandbytes/libbitsandbytes*.so diff --git a/csrc/mps_kernels.metal b/csrc/mps_kernels.metal new file mode 100644 index 000000000..63b3bf78c --- /dev/null +++ b/csrc/mps_kernels.metal @@ -0,0 +1,117 @@ +#include +using namespace metal; + +#define HLF_MAX 65504 +#define TH 1024 +#define NUM 4 +#define NUM_BLOCK 4096 + +template +static unsigned char quantize_scalar( + float rand, + device float* code, + float x) +{ + int pivot = 127; + int upper_pivot = 255; + int lower_pivot = 0; + + float lower = -1.0f; + float upper = 1.0f; + + float val = code[pivot]; + // i>>=1 = {32, 16, 8, 4, 2, 1} + for(int i = 64; i > 0; i>>=1) + { + if(x > val) + { + lower_pivot = pivot; + lower = val; + pivot+=i; + } + else + { + upper_pivot = pivot; + upper = val; + pivot-=i; + } + val = code[pivot]; + } + + if(upper_pivot == 255) + upper = code[upper_pivot]; + if(lower_pivot == 0) + lower = code[lower_pivot]; + + if(!STOCHASTIC) + { + if(x > val) + { + float midpoint = (upper+val)*0.5f; + if(x > midpoint) + { + return upper_pivot; + } + else + return pivot; + } + else + { + float midpoint = (lower+val)*0.5f; + if(x < midpoint) + return lower_pivot; + else + return pivot; + } + } + else + { + if(x > val) + { + float dist_to_upper = fabs(upper-x); + float dist_full = upper-val; + if(rand >= dist_to_upper/dist_full) return upper_pivot; + else return pivot; + } + else + { + float dist_to_lower = fabs(lower-x); + float dist_full = val-lower; + if(rand >= dist_to_lower/dist_full) return lower_pivot; + else return pivot; + } + } +} + +kernel void quantize(device float* code [[buffer(0)]], + device float* A [[buffer(1)]], + device uchar* out [[buffer(2)]], + constant uint& n [[buffer(3)]], + uint id [[thread_position_in_grid]]) { + const uint n_full = (NUM_BLOCK * (n / NUM_BLOCK)) + (n % NUM_BLOCK == 0 ? 
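The `quantize_scalar` template above supports both round-to-nearest and stochastic rounding between the two code values that bracket the input. The sketch below restates just the stochastic branch in plain Python; the two bracketing levels are passed in directly here as an assumption, whereas the kernel finds them by binary search over its code table.

```py
# Stochastic rounding between two neighboring quantization levels.
# The chance of rounding up grows as x approaches the upper level,
# so the rounding is unbiased in expectation.
import random

def stochastic_round(x, lower, upper, rand=None):
    if rand is None:
        rand = random.random()                 # uniform draw in [0, 1)
    dist_to_upper = upper - x
    dist_full = upper - lower
    return upper if rand >= dist_to_upper / dist_full else lower

# x = 0.9 sits 80% of the way from 0.5 to 1.0, so roughly 80% of the
# draws land on 1.0 and the sample mean stays close to x itself.
samples = [stochastic_round(0.9, 0.5, 1.0) for _ in range(10_000)]
print(sum(samples) / len(samples))
```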
0 : NUM_BLOCK); + uint valid_items = (id / NUM_BLOCK + 1 == (n + NUM_BLOCK - 1) / NUM_BLOCK) ? n - (id / NUM_BLOCK * NUM_BLOCK) : NUM_BLOCK; + const uint base_idx = (id / NUM_BLOCK * NUM_BLOCK); + + float vals[NUM]; + uchar qvals[NUM]; + + for (uint i = base_idx; i < n_full; i += ((n + NUM_BLOCK - 1) / NUM_BLOCK) * NUM_BLOCK) { + valid_items = n - i > NUM_BLOCK ? NUM_BLOCK : n - i; + + threadgroup_barrier(mem_flags::mem_threadgroup); + + for (uint j = 0; j < valid_items; j++) { + vals[j] = A[i + j]; + } + + for (uint j = 0; j < valid_items; j++) { + qvals[j] = quantize_scalar(0.0f, code, vals[j]); + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + for (uint j = 0; j < valid_items; j++) { + out[i + j] = qvals[j]; + } + } +} diff --git a/csrc/mps_ops.h b/csrc/mps_ops.h new file mode 100644 index 000000000..e69de29bb diff --git a/csrc/mps_ops.mm b/csrc/mps_ops.mm new file mode 100644 index 000000000..d198b3552 --- /dev/null +++ b/csrc/mps_ops.mm @@ -0,0 +1,67 @@ +#import + +#define HLF_MAX 65504 +#define TH 1024 +#define NUM 4 +#define NUM_BLOCK 4096 + +static inline MPSGraph* get_graph() +{ + static MPSGraph* cur = nil; + if(!cur) { + cur = [[MPSGraph alloc] init]; + } + return cur; +} + +static inline id get_device() +{ + NSError *error = nil; + static id device = nil; + if(!device) { + device = MTLCreateSystemDefaultDevice(); + } + if(!device) { + NSLog(@"Failed to get MPS device"); + abort(); + } + return device; +} + +static inline id get_library() +{ + NSError *error = nil; + static id library = nil; + if(!library) { + library = [get_device() newLibraryWithURL:[NSURL fileURLWithPath:@"bitsandbytes.metallib"] error:&error]; + } + if(!library) { + NSLog(@"Failed to load bitsandbytes.metallib"); + abort(); + } + return library; +} + +/*MPSGraphTensor* dequantize_mps(MPSGraphTensor* code, MPSGraphTensor* A, int n) +{ + id out = [get_graph() dequantizeTensor:(MPSGraphTensor*)A scaleTensor:(MPSGraphTensor*)code zeroPoint:0.0 dataType:MPSDataTypeInt8 axis:0 name:@"out"]; + return out; +}*/ + + +// MPSGraph function for quantize +extern "C" MPSGraphTensor* quantize_mps(MPSGraph* graph, MPSGraphTensor* code, MPSGraphTensor* A, int n) +{ + id device = get_device(); + id library = get_library(); + static id kernel = nil; + if(!kernel) { + kernel = [library newFunctionWithName:@"quantize"]; + if(!kernel) { + NSLog(@"Failed to load bitsandbytes.metallib"); + abort(); + } + } + NSLog(@"Not implemented"); + return nil; +} diff --git a/csrc/pythonInterface.c b/csrc/pythonInterface.cpp similarity index 99% rename from csrc/pythonInterface.c rename to csrc/pythonInterface.cpp index 087ae3921..ea2283504 100644 --- a/csrc/pythonInterface.c +++ b/csrc/pythonInterface.cpp @@ -6,6 +6,9 @@ #if BUILD_CUDA #include #endif +#if BUILD_MPS +// #include +#endif #include // We cannot call templated code from C, so we wrap the template in a C compatible call here if necessary. 
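The comment above about wrapping templated code in C-compatible calls is what makes the shared library consumable from Python: the flat entry points, such as the `cquantize_blockwise_cpu_fp32` wrapper declared in this file, can be loaded with `ctypes`. A hedged sketch of that call path follows; the library filename is a placeholder that depends on the platform and build, and the 256-entry code table is an assumption for demonstration only.

```py
# Sketch only: "./bitsandbytes/libbitsandbytes_cpu.so" is a placeholder path,
# and the 256-entry code table is assumed for demonstration (the library
# normally builds its own quantization maps).
import ctypes as ct
import numpy as np

lib = ct.cdll.LoadLibrary("./bitsandbytes/libbitsandbytes_cpu.so")

n, blocksize = 4096, 256
code = np.linspace(-1.0, 1.0, 256, dtype=np.float32)   # assumed sorted code table
A = np.random.randn(n).astype(np.float32)
absmax = np.zeros(n // blocksize, dtype=np.float32)    # one absmax per block
out = np.zeros(n, dtype=np.uint8)

# Matches the C signature: (float*, float*, float*, unsigned char*, long long, long long)
lib.cquantize_blockwise_cpu_fp32(
    code.ctypes.data_as(ct.POINTER(ct.c_float)),
    A.ctypes.data_as(ct.POINTER(ct.c_float)),
    absmax.ctypes.data_as(ct.POINTER(ct.c_float)),
    out.ctypes.data_as(ct.POINTER(ct.c_ubyte)),
    ct.c_longlong(blocksize),
    ct.c_longlong(n),
)
print(out[:8], absmax[0])
```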
@@ -412,6 +415,7 @@ extern "C" { gemm_4bit_inference_naive_fp32(m, n, k, A, B, absmax, datatype, out, lda, ldb, ldc, blocksize); } #endif + void cquantize_blockwise_cpu_fp32(float *code, float *A, float *absmax, unsigned char *out, long long blocksize, long long n){ quantize_cpu(code, A, absmax, out, blocksize, n); } void cdequantize_blockwise_cpu_fp32(float *code, unsigned char *A, float *absmax, float *out, long long blocksize, long long n){ dequantize_cpu(code, A, absmax, out, blocksize, n); } } diff --git a/include/Algo-Direct-Common.h b/include/Algo-Direct-Common.h index c97084904..7b40edea9 100644 --- a/include/Algo-Direct-Common.h +++ b/include/Algo-Direct-Common.h @@ -190,7 +190,7 @@ struct DirectInfo xi = xws; } else { - myassert(Gap==1, "if Gap>1 then X workspace must be provided"); + myassert((Gap==1), "if Gap>1 then X workspace must be provided"); xi = x; } diff --git a/include/Algo-Direct2.h b/include/Algo-Direct2.h index 4211c77bd..547ca9955 100644 --- a/include/Algo-Direct2.h +++ b/include/Algo-Direct2.h @@ -52,6 +52,7 @@ struct AlgoVecBase::val private: typedef AlgoScalarBase base_t; +#ifdef USE_SSE2 FORCE_INLINE //NO_INLINE void resolve(const FVec& vz, const IVec& bidx, uint32 *pr) const @@ -135,6 +136,7 @@ struct AlgoVecBase::val pr[0] = u.ui32[0]; pr[1] = u.ui32[2]; } +#endif // USE_SSE2 #ifdef USE_AVX diff --git a/include/Portable.h b/include/Portable.h index 2cec1e7de..090a25065 100644 --- a/include/Portable.h +++ b/include/Portable.h @@ -4,10 +4,40 @@ #include #include +#if defined(__aarch64__) +#ifdef __CUDACC__ +#undef USE_NEON // Doesn't work with nvcc, undefined symbols +#else +#include +#undef USE_NEON // Not yet implemented +#endif +#undef USE_AVX // x86_64 only +#undef USE_AVX2 // x86_64 only +#undef USE_SSE2 // x86_64 only +#undef USE_SSE41 // x86_64 only +#undef USE_SSE42 // x86_64 only +#undef USE_FMA // x86_64 only +#ifdef USE_NEON +typedef float32x4_t __m128; +typedef int32x4_t __m128i; +typedef float64x2_t __m128d; +#else +typedef struct {float a; float b; float c; float d;} __m128; +typedef struct {int a; int b; int c; int d;} __m128i; +typedef struct {double a; double b;} __m128d; +#endif +#else +#undef USE_NEON // ARM64 only #ifdef __FMA__ #define USE_FMA #endif +#if !defined(__SSE2__) && !defined(_MSC_VER) +#error Compiler must support SSE2 +#endif +#define USE_SSE2 +#if defined(__aarch64__) +#else #ifdef __AVX2__ #define USE_AVX2 #endif @@ -24,7 +54,8 @@ #ifdef __SSE4_2__ #define USE_SSE42 #endif - +#endif +#endif #ifndef _MSC_VER #include diff --git a/include/SIMD.h b/include/SIMD.h index a2639d3ac..9d1410c73 100644 --- a/include/SIMD.h +++ b/include/SIMD.h @@ -2,6 +2,46 @@ #include "Portable.h" +#ifdef USE_SSE2 +#include +#if defined(USE_AVX) || defined(USE_AVX2) +#include +#else +#ifdef USE_SSE41 +#include +#endif +#endif +#endif + +namespace BinSearch { +namespace Details { + +template +struct FTOITraits{}; + +template +struct FVec; + +template +struct IVec; + +template +struct FVec1; + +template <> struct InstrFloatTraits +{ + typedef __m128 vec_t; +}; + +template <> struct InstrFloatTraits +{ + typedef __m128d vec_t; +}; + +} +} + +#if !defined(__aarch64__) #ifdef USE_SSE42 #ifndef _MSC_VER #include @@ -26,29 +66,11 @@ FORCE_INLINE int popcnt32(int x32) } // namespace #endif -#if defined(USE_AVX) || defined(USE_AVX2) -#include -#else -#include -#ifdef USE_SSE41 -#include -#endif -#endif - #include "Type.h" namespace BinSearch { namespace Details { -template -struct FVec; - -template -struct IVec; - -template -struct FVec1; - template <> struct 
InstrIntTraits { typedef __m128i vec_t; @@ -64,18 +86,8 @@ template <> struct InstrFloatTraits typedef __m128d vec_t; }; -template <> struct InstrFloatTraits -{ - typedef float vec_t; -}; - -template <> struct InstrFloatTraits -{ - typedef double vec_t; -}; - -template -struct FTOITraits +template <> +struct FTOITraits { typedef IVec vec_t; }; @@ -295,9 +307,11 @@ FORCE_INLINE FVec operator- (const FVec& a, const FVec< FORCE_INLINE FVec operator* (const FVec& a, const FVec& b) { return _mm_mul_ps( a, b ); } FORCE_INLINE FVec operator/ (const FVec& a, const FVec& b) { return _mm_div_ps( a, b ); } FORCE_INLINE IVec ftoi (const FVec& a) { return _mm_cvttps_epi32(a); } +#ifndef __clang__ // Conflicts with builtin operator FORCE_INLINE IVec operator<= (const FVec& a, const FVec& b) { return _mm_castps_si128( _mm_cmple_ps( a, b ) ); } FORCE_INLINE IVec operator>= (const FVec& a, const FVec& b) { return _mm_castps_si128( _mm_cmpge_ps( a, b ) ); } FORCE_INLINE IVec operator< (const FVec& a, const FVec& b) { return _mm_castps_si128(_mm_cmplt_ps(a, b)); } +#endif #ifdef USE_FMA FORCE_INLINE FVec mulSub(const FVec& a, const FVec& b, const FVec& c) { return _mm_fmsub_ps(a, b, c); } #endif @@ -349,9 +363,11 @@ FORCE_INLINE FVec operator- (const FVec& a, const FVec FORCE_INLINE FVec operator* (const FVec& a, const FVec& b) { return _mm_mul_pd( a, b ); } FORCE_INLINE FVec operator/ (const FVec& a, const FVec& b) { return _mm_div_pd( a, b ); } FORCE_INLINE IVec ftoi (const FVec& a) { return _mm_cvttpd_epi32(a); } +#ifndef __clang__ // Conflicts with builtin operator FORCE_INLINE IVec operator<= (const FVec& a, const FVec& b) { return _mm_castpd_si128( _mm_cmple_pd( a, b ) ); } FORCE_INLINE IVec operator< (const FVec& a, const FVec& b) { return _mm_castpd_si128(_mm_cmplt_pd(a, b)); } FORCE_INLINE IVec operator>= (const FVec& a, const FVec& b) { return _mm_castpd_si128( _mm_cmpge_pd( a, b ) ); } +#endif #ifdef USE_FMA FORCE_INLINE FVec mulSub(const FVec& a, const FVec& b, const FVec& c ) { return _mm_fmsub_pd(a, b, c); } #endif @@ -570,3 +586,4 @@ FORCE_INLINE FVec mulSub(const FVec& a, const FVec=42", - "wheel" -] +requires = [ "setuptools", "wheel" ] build-backend = "setuptools.build_meta" [tool.ruff] diff --git a/requirements-ci.txt b/requirements-ci.txt new file mode 100644 index 000000000..46bd5b9cd --- /dev/null +++ b/requirements-ci.txt @@ -0,0 +1,7 @@ +# Requirements used for GitHub actions +pytest==7.2.2 +einops==0.6.0 +wheel==0.40.0 +lion-pytorch==0.0.6 +scipy==1.11.4 +pandas==2.2.0 diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 000000000..7ede5b061 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,9 @@ +# Requirements used for local development +setuptools>=63 +pytest~=7.2.2 +einops~=0.6.0 +wheel~=0.40.0 +lion-pytorch~=0.0.6 +scipy~=1.11.4 +pandas~=2.2.0 +matplotlib~=3.8.2 diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 3bde2dc6a..000000000 --- a/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -lion-pytorch -pytest -scipy diff --git a/setup.py b/setup.py index c493b8b62..13af2a39b 100644 --- a/setup.py +++ b/setup.py @@ -5,10 +5,10 @@ import glob import os -from setuptools import Extension, find_packages, setup +from setuptools import find_packages, setup +from setuptools.dist import Distribution -libs = list(glob.glob("./bitsandbytes/libbitsandbytes*.so")) -libs += list(glob.glob("./bitsandbytes/libbitsandbytes*.dll")) +libs = list(glob.glob("./bitsandbytes/libbitsandbytes*.*")) libs = [os.path.basename(p) for p in libs] 
print("libs:", libs) @@ -17,6 +17,12 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() +# Tested with wheel v0.29.0 +class BinaryDistribution(Distribution): + def has_ext_modules(self): + return True + + setup( name="bitsandbytes", version="0.43.0.dev0", @@ -28,18 +34,16 @@ def read(fname): url="https://github.com/TimDettmers/bitsandbytes", packages=find_packages(), package_data={"": libs}, - install_requires=['torch', 'numpy'], + install_requires=["torch", "numpy"], extras_require={ - 'benchmark': ['pandas', 'matplotlib'], - 'test': ['scipy'], + "benchmark": ["pandas", "matplotlib"], + "test": ["scipy"], }, long_description=read("README.md"), long_description_content_type="text/markdown", - # HACK: pretend we have a native extension module so the wheel is tagged - # correctly with a platform tag (e.g. `-linux_x86_64.whl`). - ext_modules=[Extension("bitsandbytes", sources=[], language="c")], classifiers=[ "Development Status :: 4 - Beta", "Topic :: Scientific/Engineering :: Artificial Intelligence", ], + distclass=BinaryDistribution, ) From d17b293c119ab751e8e526e3a62045a570bea520 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Mon, 5 Feb 2024 23:32:10 +0100 Subject: [PATCH 040/112] Create Makefile --- Makefile | 141 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 Makefile diff --git a/Makefile b/Makefile new file mode 100644 index 000000000..e16d24624 --- /dev/null +++ b/Makefile @@ -0,0 +1,141 @@ +MKFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) +ROOT_DIR := $(patsubst %/,%,$(dir $(MKFILE_PATH))) + +GPP:= /usr/bin/g++ +#GPP:= /sw/gcc/11.2.0/bin/g++ +ifeq ($(CUDA_HOME),) + CUDA_HOME:= $(shell which nvcc | rev | cut -d'/' -f3- | rev) +endif + +ifndef CUDA_VERSION +ifneq ($(MAKECMDGOALS),clean) +$(warning WARNING: CUDA_VERSION not set. 
Call make with CUDA string, for example: make cuda11x CUDA_VERSION=115 or make cpuonly CUDA_VERSION=CPU) +CUDA_VERSION:= +endif +endif + + + +NVCC := $(CUDA_HOME)/bin/nvcc + +########################################### + +CSRC := $(ROOT_DIR)/csrc +BUILD_DIR:= $(ROOT_DIR)/build + +FILES_CUDA := $(CSRC)/ops.cu $(CSRC)/kernels.cu +FILES_CPP := $(CSRC)/common.cpp $(CSRC)/cpu_ops.cpp $(CSRC)/pythonInterface.c + +INCLUDE := -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(CONDA_PREFIX)/include -I $(ROOT_DIR)/include +LIB := -L $(CUDA_HOME)/lib64 -lcudart -lcublas -lcublasLt -lcusparse -L $(CONDA_PREFIX)/lib + +# NVIDIA NVCC compilation flags +COMPUTE_CAPABILITY += -gencode arch=compute_50,code=sm_50 # Maxwell +COMPUTE_CAPABILITY += -gencode arch=compute_52,code=sm_52 # Maxwell +COMPUTE_CAPABILITY += -gencode arch=compute_60,code=sm_60 # Pascal +COMPUTE_CAPABILITY += -gencode arch=compute_61,code=sm_61 # Pascal +COMPUTE_CAPABILITY += -gencode arch=compute_70,code=sm_70 # Volta + +CC_KEPLER := -gencode arch=compute_35,code=sm_35 # Kepler +CC_KEPLER += -gencode arch=compute_37,code=sm_37 # Kepler + +# Later versions of CUDA support the new architectures +CC_CUDA11x := -gencode arch=compute_75,code=sm_75 +CC_CUDA11x += -gencode arch=compute_80,code=sm_80 +CC_CUDA11x += -gencode arch=compute_86,code=sm_86 + + +CC_cublasLt110 := -gencode arch=compute_75,code=sm_75 +CC_cublasLt110 += -gencode arch=compute_80,code=sm_80 + +CC_cublasLt111 := -gencode arch=compute_75,code=sm_75 +CC_cublasLt111 += -gencode arch=compute_80,code=sm_80 +CC_cublasLt111 += -gencode arch=compute_86,code=sm_86 + +CC_ADA_HOPPER := -gencode arch=compute_89,code=sm_89 +CC_ADA_HOPPER += -gencode arch=compute_90,code=sm_90 + + +all: $(BUILD_DIR) env + $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) + $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o + $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) + +cuda110_nomatmul_kepler: $(BUILD_DIR) env + $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) $(CC_KEPLER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT + $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) $(CC_KEPLER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o + $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) + +cuda11x_nomatmul_kepler: $(BUILD_DIR) env + $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_KEPLER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT + $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_KEPLER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o + $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) + + +cuda110_nomatmul: $(BUILD_DIR) env + $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) 
--output-directory $(BUILD_DIR) -D NO_CUBLASLT + $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o + $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) + +cuda11x_nomatmul: $(BUILD_DIR) env + $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT + $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o + $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) + +cuda118_nomatmul: $(BUILD_DIR) env + $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT + $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o + $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) + +cuda12x_nomatmul: $(BUILD_DIR) env + $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT + $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o + $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) + +cuda110: $(BUILD_DIR) env + $(NVCC) $(CC_cublasLt110) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) + $(NVCC) $(CC_cublasLt110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o + $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) + +cuda11x: $(BUILD_DIR) env + $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) + $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o + $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) + +cuda118: $(BUILD_DIR) env + $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) + $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o + $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o 
$(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) + +cuda12x: $(BUILD_DIR) env + $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) + $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o + $(GPP) -std=c++20 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) + +cpuonly: $(BUILD_DIR) env + $(GPP) -std=c++14 -shared -fPIC -I $(ROOT_DIR)/csrc -I $(ROOT_DIR)/include $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cpu.so + +env: + @echo "ENVIRONMENT" + @echo "============================" + @echo "CUDA_VERSION: $(CUDA_VERSION)" + @echo "============================" + @echo "NVCC path: $(NVCC)" + @echo "GPP path: $(GPP) VERSION: `$(GPP) --version | head -n 1`" + @echo "CUDA_HOME: $(CUDA_HOME)" + @echo "CONDA_PREFIX: $(CONDA_PREFIX)" + @echo "PATH: $(PATH)" + @echo "LD_LIBRARY_PATH: $(LD_LIBRARY_PATH)" + @echo "============================" + +$(BUILD_DIR): + mkdir -p build + mkdir -p dependencies + +$(ROOT_DIR)/dependencies/cub: + git clone https://github.com/NVlabs/cub $(ROOT_DIR)/dependencies/cub + cd dependencies/cub; git checkout 1.11.0 + +clean: + rm -rf build/* *.egg* + rm -f bitsandbytes/libbitsandbytes*.so From bb5f6b972ca43260d0a486b7b20cf11d967a3f2f Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Tue, 6 Feb 2024 00:56:08 +0100 Subject: [PATCH 041/112] Re-Delete previous Makefile (#1039) * Delete Makefile * Update installation.mdx * Update docs/source/installation.mdx * Update docs/source/installation.mdx * Update docs/source/installation.mdx * Update docs/source/installation.mdx --- Makefile | 141 ----------------------------------- docs/source/installation.mdx | 8 +- 2 files changed, 3 insertions(+), 146 deletions(-) delete mode 100644 Makefile diff --git a/Makefile b/Makefile deleted file mode 100644 index e16d24624..000000000 --- a/Makefile +++ /dev/null @@ -1,141 +0,0 @@ -MKFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) -ROOT_DIR := $(patsubst %/,%,$(dir $(MKFILE_PATH))) - -GPP:= /usr/bin/g++ -#GPP:= /sw/gcc/11.2.0/bin/g++ -ifeq ($(CUDA_HOME),) - CUDA_HOME:= $(shell which nvcc | rev | cut -d'/' -f3- | rev) -endif - -ifndef CUDA_VERSION -ifneq ($(MAKECMDGOALS),clean) -$(warning WARNING: CUDA_VERSION not set. 
Call make with CUDA string, for example: make cuda11x CUDA_VERSION=115 or make cpuonly CUDA_VERSION=CPU) -CUDA_VERSION:= -endif -endif - - - -NVCC := $(CUDA_HOME)/bin/nvcc - -########################################### - -CSRC := $(ROOT_DIR)/csrc -BUILD_DIR:= $(ROOT_DIR)/build - -FILES_CUDA := $(CSRC)/ops.cu $(CSRC)/kernels.cu -FILES_CPP := $(CSRC)/common.cpp $(CSRC)/cpu_ops.cpp $(CSRC)/pythonInterface.c - -INCLUDE := -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(CONDA_PREFIX)/include -I $(ROOT_DIR)/include -LIB := -L $(CUDA_HOME)/lib64 -lcudart -lcublas -lcublasLt -lcusparse -L $(CONDA_PREFIX)/lib - -# NVIDIA NVCC compilation flags -COMPUTE_CAPABILITY += -gencode arch=compute_50,code=sm_50 # Maxwell -COMPUTE_CAPABILITY += -gencode arch=compute_52,code=sm_52 # Maxwell -COMPUTE_CAPABILITY += -gencode arch=compute_60,code=sm_60 # Pascal -COMPUTE_CAPABILITY += -gencode arch=compute_61,code=sm_61 # Pascal -COMPUTE_CAPABILITY += -gencode arch=compute_70,code=sm_70 # Volta - -CC_KEPLER := -gencode arch=compute_35,code=sm_35 # Kepler -CC_KEPLER += -gencode arch=compute_37,code=sm_37 # Kepler - -# Later versions of CUDA support the new architectures -CC_CUDA11x := -gencode arch=compute_75,code=sm_75 -CC_CUDA11x += -gencode arch=compute_80,code=sm_80 -CC_CUDA11x += -gencode arch=compute_86,code=sm_86 - - -CC_cublasLt110 := -gencode arch=compute_75,code=sm_75 -CC_cublasLt110 += -gencode arch=compute_80,code=sm_80 - -CC_cublasLt111 := -gencode arch=compute_75,code=sm_75 -CC_cublasLt111 += -gencode arch=compute_80,code=sm_80 -CC_cublasLt111 += -gencode arch=compute_86,code=sm_86 - -CC_ADA_HOPPER := -gencode arch=compute_89,code=sm_89 -CC_ADA_HOPPER += -gencode arch=compute_90,code=sm_90 - - -all: $(BUILD_DIR) env - $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) - $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) - -cuda110_nomatmul_kepler: $(BUILD_DIR) env - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) $(CC_KEPLER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) $(CC_KEPLER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) - -cuda11x_nomatmul_kepler: $(BUILD_DIR) env - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_KEPLER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_KEPLER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) - - -cuda110_nomatmul: $(BUILD_DIR) env - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) 
--output-directory $(BUILD_DIR) -D NO_CUBLASLT - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) - -cuda11x_nomatmul: $(BUILD_DIR) env - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) - -cuda118_nomatmul: $(BUILD_DIR) env - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) - -cuda12x_nomatmul: $(BUILD_DIR) env - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) - -cuda110: $(BUILD_DIR) env - $(NVCC) $(CC_cublasLt110) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) - $(NVCC) $(CC_cublasLt110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) - -cuda11x: $(BUILD_DIR) env - $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) - $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) - -cuda118: $(BUILD_DIR) env - $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) - $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o 
$(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) - -cuda12x: $(BUILD_DIR) env - $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) - $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++20 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) - -cpuonly: $(BUILD_DIR) env - $(GPP) -std=c++14 -shared -fPIC -I $(ROOT_DIR)/csrc -I $(ROOT_DIR)/include $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cpu.so - -env: - @echo "ENVIRONMENT" - @echo "============================" - @echo "CUDA_VERSION: $(CUDA_VERSION)" - @echo "============================" - @echo "NVCC path: $(NVCC)" - @echo "GPP path: $(GPP) VERSION: `$(GPP) --version | head -n 1`" - @echo "CUDA_HOME: $(CUDA_HOME)" - @echo "CONDA_PREFIX: $(CONDA_PREFIX)" - @echo "PATH: $(PATH)" - @echo "LD_LIBRARY_PATH: $(LD_LIBRARY_PATH)" - @echo "============================" - -$(BUILD_DIR): - mkdir -p build - mkdir -p dependencies - -$(ROOT_DIR)/dependencies/cub: - git clone https://github.com/NVlabs/cub $(ROOT_DIR)/dependencies/cub - cd dependencies/cub; git checkout 1.11.0 - -clean: - rm -rf build/* *.egg* - rm -f bitsandbytes/libbitsandbytes*.so diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index ecdcdeb28..26c63d374 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -23,12 +23,10 @@ pip install bitsandbytes ```bash git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/ -CUDA_VERSION=XXX make cuda12x -python setup.py install +cmake -B build -DBUILD_CUDA=ON -S . +pip install . ``` - -with `XXX` being your CUDA version, for <12.0 call `make cuda 11x`. Note support for non-CUDA GPUs (e.g. AMD, Intel), is also coming soon. - +Note support for non-CUDA GPUs (e.g. AMD, Intel), is also coming soon. For a more detailed compilation guide, head to the [dedicated page on the topic](./compiling) From a0fc3db7df0849f05dfc1260da966bb7b6e24b52 Mon Sep 17 00:00:00 2001 From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Tue, 6 Feb 2024 15:17:38 -0300 Subject: [PATCH 042/112] docs: fix broken links (#1045) * add anchor link * fix typo in link * links to subheadings don't work --- docs/source/index.mdx | 6 +++--- docs/source/optimizers.mdx | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 0b033c3a9..71b3d67bd 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -8,9 +8,9 @@ There are ongoing efforts to support further hardware backends, i.e. Intel CPU + ## API documentation -- [Linear4bit](quantizaton#linear4bit) -- [Linear8bit](quantizaton#linear8bit) -- [StableEmbedding](optimizers#stableembedding) +- [Quantization](quantization) +- [Integrations](integrations) +- [Optimizers](optimizers) # License diff --git a/docs/source/optimizers.mdx b/docs/source/optimizers.mdx index f74c89ae6..734cb2211 100644 --- a/docs/source/optimizers.mdx +++ b/docs/source/optimizers.mdx @@ -184,7 +184,7 @@ class MyModule(torch.nn.Module): Here we'll provide further auto-generated API docs soon. Please feel free to contribute doc-strings for the respective optimizers, as `bitsandbytes` is a community effort. 
-## StableEmbedding +### StableEmbedding[[stable-emb-api]] [[autodoc]] bitsandbytes.nn.StableEmbedding - __init__ From 4941dd23f368c17cd202155fc7f443b54b95f3b9 Mon Sep 17 00:00:00 2001 From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Tue, 6 Feb 2024 15:51:09 -0300 Subject: [PATCH 043/112] fix missing bracket in link (#1046) --- docs/source/integrations.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/integrations.mdx b/docs/source/integrations.mdx index 7d47ede62..0df7efb72 100644 --- a/docs/source/integrations.mdx +++ b/docs/source/integrations.mdx @@ -4,7 +4,7 @@ With Transformers it's very easy to load any model in 4 or 8-bit, quantizing the Please review the [bitsandbytes section in the Accelerate docs](https://huggingface.co/docs/transformers/v4.37.2/en/quantization#bitsandbytes). -Details about the BitsAndBytesConfig can be found here](https://huggingface.co/docs/transformers/v4.37.2/en/main_classes/quantization#transformers.BitsAndBytesConfig). +Details about the BitsAndBytesConfig can be found [here](https://huggingface.co/docs/transformers/v4.37.2/en/main_classes/quantization#transformers.BitsAndBytesConfig). ## Beware: bf16 is optional compute data type If your hardware supports it, `bf16` is the optimal compute dtype. The default is `float32` for backward compatibility and numerical stability. `float16` often leads to numerical instabilities, but `bfloat16` provides the benefits of both worlds: numerical stability and significant computation speedup. Therefore, be sure to check if your hardware supports `bf16` and configure it using the `bnb_4bit_compute_dtype` parameter in BitsAndBytesConfig: From ee13b623cdca1c7ef76c357b216061d72f7beb4a Mon Sep 17 00:00:00 2001 From: Rickard Date: Tue, 6 Feb 2024 20:32:21 +0100 Subject: [PATCH 044/112] Updated documentation on compiling from source (#1048) * Updated compiler instructuins * Thank you, linter --- docs/source/compiling.mdx | 55 +++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/docs/source/compiling.mdx b/docs/source/compiling.mdx index fc8c58769..39e277e71 100644 --- a/docs/source/compiling.mdx +++ b/docs/source/compiling.mdx @@ -1,40 +1,49 @@ # Compiling from Source[[compiling]] -To compile from source, the CUDA Toolkit is required. Ensure `nvcc` is installed; if not, follow these steps to install it along with the CUDA Toolkit: +## Linux + +To compile from source, you need the following: + +* The ability to compile C++ (gcc, make, headers, etc) +* CMake (version 3.22.1 or newer) +* Python 3.10 or newer +* [The CUDA toolkit](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) (nvcc) + +On Ubuntu, install the first two with `apt-get install -y build-essential cmake`. + +To install the CUDA toolkit, follow the [instructions from your distro](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html). -```bash -wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh -# Use the following syntax: cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH -# CUDA_VERSION options include 110 to 122 -# EXPORT_TO_BASH: 0 for False, 1 for True -# Example for installing CUDA 11.7 at ~/local/cuda-11.7 and exporting the path to .bashrc: -bash install_cuda.sh 117 ~/local 1 -``` -For a single compile run with a specific CUDA version, set `CUDA_HOME` to point to your CUDA installation directory. 
For instance, to compile using CUDA 11.7 located at `~/local/cuda-11.7`, use: +To install the package from source, then run ``` -CUDA_HOME=~/local/cuda-11.7 CUDA_VERSION=117 make cuda11x +pip install -r requirements-dev.txt +cmake -DCOMPUTE_BACKEND=cuda -S . +make +pip install . ``` -## General Compilation Steps +If you have multiple versions of CUDA installed, or have installed it in a non-standard location, please refer to [cmake CUDA documentation](https://cliutils.gitlab.io/modern-cmake/chapters/packages/CUDA.html) for how to configure the CUDA compiler used. -1. Use `CUDA_VERSION=XXX make [target]` to compile, where `[target]` includes options like `cuda92`, `cuda10x`, `cuda11x`, and others. -2. Install with `python setup.py install`. +## Windows -Ensure `nvcc` is available in your system. If using Anaconda, determine your CUDA version with PyTorch using `conda list | grep cudatoolkit` and match it by downloading the corresponding version from the [CUDA Toolkit Archive](https://developer.nvidia.com/cuda-toolkit-archive). +The following is required to install from source on Windows -To install CUDA locally without administrative rights: +* [Microsoft Visual Studio](https://visualstudio.microsoft.com/downloads/) with C++ support +* CMake (version 3.22.1 or newer) +* Python 3.10 or newer +* [The CUDA toolkit](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html) (nvcc) -```bash -wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh -# Follow the same syntax and example as mentioned earlier -``` - -The compilation process relies on the `CUDA_HOME` environment variable to locate CUDA. If `CUDA_HOME` is unset, it will attempt to infer the location from `nvcc`. If `nvcc` is not in your path, you may need to add it or set `CUDA_HOME` manually. For example, if `python -m bitsandbytes` indicates your CUDA path as `/usr/local/cuda-11.7`, you can set `CUDA_HOME` to this path. +To install the CUDA toolkit, follow the [instructions for Windows](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html). -If compilation issues arise, please report them. +To install the package from source, then run +``` +pip install -r requirements-dev.txt +cmake -DCOMPUTE_BACKEND=cuda -S . +cmake --build . --config Release +pip install . 
+``` ## Compilation for Kepler Architecture From 6e0f84d44d19651446e1ecb197f6e583426a47fc Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Wed, 7 Feb 2024 04:59:33 +0900 Subject: [PATCH 045/112] HOTFIX: Fix regression (cpu fix) (#1038) * add "_cpu" tag correctly (regression) * add lib suffix ".dylib" for Darwin Co-authored-by: Aarni Koskela --------- Co-authored-by: Aarni Koskela --- CMakeLists.txt | 2 +- bitsandbytes/cuda_setup/main.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4a4090bb7..1b9f1854b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -136,7 +136,7 @@ elseif(BUILD_MPS) VERBATIM) add_custom_target(metallib DEPENDS "bitsandbytes/bitsandbytes.metallib") else() - set(LIBSUFFIX "cpu") + string(APPEND BNB_OUTPUT_NAME "_cpu") set(GPU_SOURCES) endif() diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py index 4245a2842..14c7abbd8 100644 --- a/bitsandbytes/cuda_setup/main.py +++ b/bitsandbytes/cuda_setup/main.py @@ -28,16 +28,15 @@ from .env_vars import get_potentially_lib_path_containing_env_vars -if platform.system() == 'Windows': # Windows +DYNAMIC_LIBRARY_SUFFIX = { "Darwin": ".dylib", "Windows": ".dll", "Linux": ".so"}.get(platform.system(), ".so") +if platform.system() == "Windows": # Windows CUDA_RUNTIME_LIBS = ["nvcuda.dll"] - DYNAMIC_LIBRARY_SUFFIX = ".dll" else: # Linux or other # these are the most common libs names # libcudart.so is missing by default for a conda install with PyTorch 2.0 and instead # we have libcudart.so.11.0 which causes a lot of errors before # not sure if libcudart.so.12.0 exists in pytorch installs, but it does not hurt CUDA_RUNTIME_LIBS = ["libcudart.so", "libcudart.so.11.0", "libcudart.so.12.0", "libcudart.so.12.1", "libcudart.so.12.2"] - DYNAMIC_LIBRARY_SUFFIX = ".so" class CUDASetup: From 88ab630315d9a79973302182d79653b1dfa0918a Mon Sep 17 00:00:00 2001 From: Rickard Date: Tue, 6 Feb 2024 22:13:09 +0100 Subject: [PATCH 046/112] Update installation instructions (#1047) * Update installation instructions * Update docs/source/installation.mdx Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> * Update installation.mdx * Update docs/source/installation.mdx Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> * Update installation.mdx * Update installation.mdx --------- Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> --- docs/source/installation.mdx | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index 26c63d374..af65a3c7f 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -21,12 +21,16 @@ pip install bitsandbytes ### From source +You need CMake and Python installed. For Linux, make sure to install a compiler (`apt install build-essential`, for example). + ```bash git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/ -cmake -B build -DBUILD_CUDA=ON -S . +pip install -r requirements-dev.txt +cmake -DCOMPUTE_BACKEND=cuda -S . +make pip install . ``` -Note support for non-CUDA GPUs (e.g. AMD, Intel), is also coming soon. +Note support for non-CUDA GPUs (e.g. AMD, Intel, Apple Silicon), is also coming soon. 
For a more detailed compilation guide, head to the [dedicated page on the topic](./compiling) @@ -34,12 +38,15 @@ For a more detailed compilation guide, head to the [dedicated page on the topic] ## Windows +Windows builds require Visual Studio with C++ support, as well as the Cuda SDK installed. + Currently for Windows users, you need to build bitsandbytes from source: ```bash git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/ -cmake -B build -DBUILD_CUDA=ON -S . -cmake --build build --config Release +pip install -r requirements-dev.txt +cmake -DCOMPUTE_BACKEND=cuda -S . +cmake --build . --config Release python -m build --wheel ``` From 136721a8c1437042f0491972ddc5f35695e5e9b2 Mon Sep 17 00:00:00 2001 From: Rickard Date: Thu, 8 Feb 2024 00:31:11 +0100 Subject: [PATCH 047/112] Skip checkout nvidia cub (#1053) --- .github/workflows/python-package.yml | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 265128637..81c5ae360 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -44,13 +44,6 @@ jobs: - name: Add msbuild to PATH uses: microsoft/setup-msbuild@v1.1 if: ${{ startsWith(matrix.os, 'windows') }} - # Check out dependencies code - - uses: actions/checkout@v4 - name: Check out NVidia cub - with: - repository: nvidia/cub - ref: 1.11.0 - path: dependencies/cub # Compile C++ code - name: Build C++ shell: bash @@ -117,13 +110,6 @@ jobs: - name: Add msbuild to PATH uses: microsoft/setup-msbuild@v1.1 if: ${{ startsWith(matrix.os, 'windows') }} - # Check out dependencies code - - uses: actions/checkout@v4 - name: Check out NVidia cub - with: - repository: nvidia/cub - ref: 1.11.0 - path: dependencies/cub # Compile C++ code - name: Build C++ shell: bash From 344e851673262073c9bc753d808c0acc8dfa8060 Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Thu, 15 Feb 2024 03:59:22 +0900 Subject: [PATCH 048/112] CI: Fix cuda toolkit speed issue. (#1055) * CI: fix cuda-toolkit speed issue * CI: use MSVC instead msbuild to remove 'visual_stuido_integration' dependency * use Ninja to compile without MS toolset * use 'network', install 'ninja' only Co-authored-by: Rickard --------- Co-authored-by: Rickard --- .github/workflows/python-package.yml | 33 ++++++++++++++-------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 81c5ae360..07c3b5217 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -41,9 +41,10 @@ jobs: uses: jwlawson/actions-setup-cmake@v1.14 with: cmake-version: '3.26.x' - - name: Add msbuild to PATH - uses: microsoft/setup-msbuild@v1.1 - if: ${{ startsWith(matrix.os, 'windows') }} + - name: Setup MSVC + if: startsWith(matrix.os, 'windows') + #uses: microsoft/setup-msbuild@v1.1 # to use msbuild + uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl # Compile C++ code - name: Build C++ shell: bash @@ -60,11 +61,7 @@ jobs: else cmake -DCOMPUTE_BACKEND=cpu . fi - if [ ${build_os:0:7} == windows ]; then - pwsh -Command "msbuild bitsandbytes.vcxproj /property:Configuration=Release" - else - make - fi + cmake --build . 
--config Release mkdir -p output/${{ matrix.os }}/${{ matrix.arch }} ( shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }}/ ) - name: Upload build artifact @@ -105,11 +102,14 @@ jobs: id: cuda-toolkit with: cuda: ${{ matrix.cuda_version }} - method: 'local' - # sub-packages: '["nvcc","cudart","nvrtc_dev","cublas_dev","cusparse_dev","visual_studio_integration"]' - - name: Add msbuild to PATH - uses: microsoft/setup-msbuild@v1.1 - if: ${{ startsWith(matrix.os, 'windows') }} + method: 'network' + sub-packages: '["nvcc","cudart","cusparse","cublas","thrust","nvrtc_dev","cublas_dev","cusparse_dev"]' + linux-local-args: '["--toolkit"]' + use-github-cache: false + - name: Setup MSVC + if: startsWith(matrix.os, 'windows') + #uses: microsoft/setup-msbuild@v1.1 # to use msbuild + uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl # Compile C++ code - name: Build C++ shell: bash @@ -117,6 +117,7 @@ jobs: set -ex build_os=${{ matrix.os }} build_arch=${{ matrix.arch }} + [[ "${{ matrix.os }}" = windows-* ]] && python3 -m pip install ninja for NO_CUBLASLT in ON OFF; do if [ ${build_os:0:6} == ubuntu ]; then image=nvidia/cuda:${{ matrix.cuda_version }}-devel-ubuntu22.04 @@ -125,10 +126,10 @@ jobs: "apt-get update \ && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \ && cmake -DCOMPUTE_BACKEND=cuda -DNO_CUBLASLT=${NO_CUBLASLT} . \ - && make" + && cmake --build ." else - cmake -DCOMPUTE_BACKEND=cuda -DNO_CUBLASLT=${NO_CUBLASLT} . - pwsh -Command "msbuild bitsandbytes.vcxproj /property:Configuration=Release" + cmake -G Ninja -DCOMPUTE_BACKEND=cuda -DNO_CUBLASLT=${NO_CUBLASLT} -DCMAKE_BUILD_TYPE=Release -S . + cmake --build . --config Release fi done mkdir -p output/${{ matrix.os }}/${{ matrix.arch }} From 5b28fd3f7c7877fff0f48f717930e266d0f6e254 Mon Sep 17 00:00:00 2001 From: pnunna93 <104791500+pnunna93@users.noreply.github.com> Date: Wed, 14 Feb 2024 13:44:30 -0600 Subject: [PATCH 049/112] Fix race condition in kEstimateQuantiles (#1061) --- csrc/kernels.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/csrc/kernels.cu b/csrc/kernels.cu index df8488389..6d15dbe64 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -654,6 +654,8 @@ __global__ void kEstimateQuantiles(T *__restrict__ const A, float *code, const f for(int j = threadIdx.x; j < BLOCK_ESTIMATE; j+=blockDim.x) temp_storage.smem_qidx[j] = -1; + __syncthreads(); + if(threadIdx.x < 256) { float q_interval = (1.0f-(2.0f*offset))/255.0f; From ceae15042c110c5b0db2c16c4da053ee85b75226 Mon Sep 17 00:00:00 2001 From: Brian Vaughan Date: Wed, 14 Feb 2024 16:46:06 -0500 Subject: [PATCH 050/112] fix a type in code comment (#1063) was pointing to wrong class --- bitsandbytes/nn/modules.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index 6eeecc273..2b7e1f067 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -275,7 +275,7 @@ class Linear4bit(nn.Linear): compute datatypes such as FP4 and NF4. In order to quantize a linear layer one should first load the original fp16 / bf16 weights into - the Linear8bitLt module, then call `quantized_module.to("cuda")` to quantize the fp16 / bf16 weights. + the Linear4bit module, then call `quantized_module.to("cuda")` to quantize the fp16 / bf16 weights. 
Example: From c5e43637945c335a46a6c4dda4d2d874c07465c0 Mon Sep 17 00:00:00 2001 From: Rickard Date: Mon, 19 Feb 2024 18:31:45 +0100 Subject: [PATCH 051/112] Fix cross compilation on linux (#1050) --- .github/workflows/python-package.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 07c3b5217..e48c25cc5 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -54,9 +54,9 @@ jobs: build_arch=${{ matrix.arch }} if [ ${build_os:0:6} == ubuntu -a ${build_arch} == aarch64 ]; then # Allow cross-compile om aarch64 - sudo apt-get install -y gcc-aarch64-linux-gnu binutils-aarch64-linux-gnu - fi - if [ ${build_os:0:5} == macos -a ${build_arch} == aarch64 ]; then + sudo apt-get install -y gcc-aarch64-linux-gnu binutils-aarch64-linux-gnu g++-aarch64-linux-gnu + cmake -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++ -DCOMPUTE_BACKEND=cpu . + elif [ ${build_os:0:5} == macos -a ${build_arch} == aarch64 ]; then cmake -DCMAKE_OSX_ARCHITECTURES=arm64 -DCOMPUTE_BACKEND=cpu . else cmake -DCOMPUTE_BACKEND=cpu . From a84b660d18c11266535b642846b5ae120bce46fe Mon Sep 17 00:00:00 2001 From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Wed, 21 Feb 2024 12:12:17 +0100 Subject: [PATCH 052/112] contributing.mdx: how to ignore formatting revs --- docs/source/contributing.mdx | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/source/contributing.mdx b/docs/source/contributing.mdx index b482364de..4fe6b7541 100644 --- a/docs/source/contributing.mdx +++ b/docs/source/contributing.mdx @@ -1,13 +1,18 @@ # Contributors guidelines ... still under construction ... (feel free to propose materials, `bitsandbytes` is a community project) -## Setup pre-commit hooks +## Setup + +### Setup pre-commit hooks - Install pre-commit hooks with `pip install pre-commit`. - Run `pre-commit autoupdate` once to configure the hooks. - Re-run `pre-commit autoupdate` every time a new hook got added. Now all the pre-commit hooks will be automatically run when you try to commit and if they introduce some changes, you need to re-add the changed files before being able to commit and push. +### Ignore formatting revs +- Run `git config blame.ignoreRevsFile .git-blame-ignore-revs`. This will make it so that `git blame` is aware of commits that were logged to be solely formatting-related. + ## Doc-string syntax We're following NumPy doc-string conventions with the only notable difference being that we use Markdown instead of Rich text format (RTF) for markup within the doc-strings. 
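As an illustration of the doc-string convention just described, NumPy-style sections with Markdown rather than RTF for inline markup, here is a minimal sketch. The function and its parameters are hypothetical and exist only to show the layout; they are not part of any patch in this series.

```py
def dequantize_blockwise_sketch(packed, absmax, blocksize=64):
    """Reverse a block-wise quantization (illustrative layout stub only).

    Parameters
    ----------
    packed : `torch.Tensor`
        Packed low-bit payload, e.g. as produced by a `quantize_*` helper.
    absmax : `torch.Tensor`
        Per-block absolute maxima used to rescale each block.
    blocksize : `int`, optional
        Number of elements per quantization block, defaults to `64`.

    Returns
    -------
    `torch.Tensor`
        The reconstructed higher-precision tensor.
    """
    raise NotImplementedError("doc-string layout example only")
```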
From 0bf71989566c63f4b301e5bdbf2cd73b5683a8e9 Mon Sep 17 00:00:00 2001 From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Wed, 21 Feb 2024 15:51:40 +0100 Subject: [PATCH 053/112] tests/helpers.py: fix py38 vers incompatibility from other PR --- tests/helpers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/helpers.py b/tests/helpers.py index 46c6ef93d..f82a8631f 100644 --- a/tests/helpers.py +++ b/tests/helpers.py @@ -1,13 +1,13 @@ from itertools import product import random -from typing import Any +from typing import Any, List import torch test_dims_rng = random.Random(42) -def get_test_dims(min: int, max: int, *, n: int) -> list[int]: +def get_test_dims(min: int, max: int, *, n: int) -> List[int]: return [test_dims_rng.randint(min, max) for _ in range(n)] From d11b5068dd74de6694cea0cce350bc86eb2ba5b2 Mon Sep 17 00:00:00 2001 From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Wed, 21 Feb 2024 16:27:46 +0100 Subject: [PATCH 054/112] tests: fix all_close to respect max 2 positional args (#1074) --- tests/test_functional.py | 4 ++-- tests/test_modules.py | 4 ++-- tests/test_optim.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_functional.py b/tests/test_functional.py index 2d4e959ad..d4f65755f 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -26,12 +26,12 @@ def assert_all_approx_close(a, b, rtol=1e-3, atol=1e-3, count=0, throw=True): - idx = torch.isclose(a, b, rtol, atol) + idx = torch.isclose(a, b, rtol=rtol, atol=atol) sumval = (idx == 0).sum().item() if sumval > count: if throw: print(f"Too many values not close: assert {sumval} < {count}") - torch.testing.assert_close(a, b, rtol, atol) + torch.testing.assert_close(a, b, rtol=rtol, atol=atol) return sumval diff --git a/tests/test_modules.py b/tests/test_modules.py index f809aa791..674620e29 100644 --- a/tests/test_modules.py +++ b/tests/test_modules.py @@ -42,11 +42,11 @@ def get_args(): def assert_all_approx_close(a, b, atol=1e-8, rtol=1e-5, count=10): - idx = torch.isclose(a, b, rtol, atol) + idx = torch.isclose(a, b, rtol=rtol, atol=atol) sumval = (idx == 0).sum().item() if sumval > count: print(f"Too many values not close: assert {sumval} < {count}") - torch.testing.assert_close(a, b, rtol, atol) + torch.testing.assert_close(a, b, rtol=rtol, atol=atol) class LinearFunction(torch.autograd.Function): diff --git a/tests/test_optim.py b/tests/test_optim.py index e379c424a..9395b8820 100644 --- a/tests/test_optim.py +++ b/tests/test_optim.py @@ -145,7 +145,7 @@ def test_optimizer32bit(dim1, dim2, gtype, optim_name): # since Lion can have pretty noisy updates where things lie at the boundary # allow up to 10 errors for Lion - assert_most_approx_close(p1, p2.float(), atol, rtol, max_error_count=10) + assert_most_approx_close(p1, p2.float(), atol=atol, rtol=rtol, max_error_count=10) if i % (k // 5) == 0 and i > 0: path = get_temp_dir() @@ -157,7 +157,7 @@ def test_optimizer32bit(dim1, dim2, gtype, optim_name): rm_path(path) # since Lion can have pretty noisy updates where things lie at the boundary # allow up to 10 errors for Lion - assert_most_approx_close(p1, p2.float(), atol, rtol, max_error_count=10) + assert_most_approx_close(p1, p2.float(), atol=atol, rtol=rtol, max_error_count=10) for name1, name2 in str2statenames[optim_name]: # since Lion can have pretty noisy updates where things lie at the boundary # allow up to 10 errors for Lion From b0730f4db0b08baebbe343d6f2ffb4ca302fc1ed Mon Sep 17 00:00:00 2001 
From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Wed, 21 Feb 2024 11:32:32 -0800 Subject: [PATCH 055/112] structure, install (#1072) --- docs/source/_toctree.yml | 34 ++++++-------- docs/source/compiling.mdx | 50 -------------------- docs/source/errors.mdx | 2 +- docs/source/installation.mdx | 86 +++++++++++++++++++++++++--------- docs/source/integrations.mdx | 6 ++- docs/source/nonpytorchcuda.mdx | 46 ------------------ 6 files changed, 85 insertions(+), 139 deletions(-) delete mode 100644 docs/source/compiling.mdx delete mode 100644 docs/source/nonpytorchcuda.mdx diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index ede41bb6c..7584207d0 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -1,34 +1,30 @@ - title: Get started sections: - local: index - title: Index + title: bitsandbytes - local: quickstart title: Quickstart - local: installation title: Installation -- title: Features & Integrations +- title: Guides sections: - - local: quantization - title: Quantization - local: optimizers - title: Optimizers - - local: integrations - title: Integrations + title: 8-bit optimizers - local: algorithms title: Algorithms -- title: Support & Learning + - local: integrations + title: Integrations + - local: errors + title: Troubleshoot + - local: contributing + title: Contribute + - local: faqs + title: FAQs +- title: Explanation sections: - local: resources title: Papers, resources & how to cite - - local: errors - title: Errors & Solutions - - local: nonpytorchcuda - title: Non-PyTorch CUDA - - local: compiling - title: Compilation from Source (extended) - - local: faqs - title: FAQs (Frequently Asked Questions) -- title: Contributors Guidelines +- title: API reference sections: - - local: contributing - title: Contributing + - local: quantization + title: Quantization diff --git a/docs/source/compiling.mdx b/docs/source/compiling.mdx deleted file mode 100644 index 39e277e71..000000000 --- a/docs/source/compiling.mdx +++ /dev/null @@ -1,50 +0,0 @@ -# Compiling from Source[[compiling]] - -## Linux - -To compile from source, you need the following: - -* The ability to compile C++ (gcc, make, headers, etc) -* CMake (version 3.22.1 or newer) -* Python 3.10 or newer -* [The CUDA toolkit](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) (nvcc) - -On Ubuntu, install the first two with `apt-get install -y build-essential cmake`. - -To install the CUDA toolkit, follow the [instructions from your distro](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html). - - - -To install the package from source, then run - -``` -pip install -r requirements-dev.txt -cmake -DCOMPUTE_BACKEND=cuda -S . -make -pip install . -``` - -If you have multiple versions of CUDA installed, or have installed it in a non-standard location, please refer to [cmake CUDA documentation](https://cliutils.gitlab.io/modern-cmake/chapters/packages/CUDA.html) for how to configure the CUDA compiler used. - -## Windows - -The following is required to install from source on Windows - -* [Microsoft Visual Studio](https://visualstudio.microsoft.com/downloads/) with C++ support -* CMake (version 3.22.1 or newer) -* Python 3.10 or newer -* [The CUDA toolkit](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html) (nvcc) - -To install the CUDA toolkit, follow the [instructions for Windows](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html). 
- -To install the package from source, then run -``` -pip install -r requirements-dev.txt -cmake -DCOMPUTE_BACKEND=cuda -S . -cmake --build . --config Release -pip install . -``` - -## Compilation for Kepler Architecture - -From version 0.39.1, bitsandbytes no longer includes Kepler binaries in pip installations, requiring manual compilation. Follow the general steps and use `cuda11x_nomatmul_kepler` for Kepler-targeted compilation. diff --git a/docs/source/errors.mdx b/docs/source/errors.mdx index 293017173..95594ea11 100644 --- a/docs/source/errors.mdx +++ b/docs/source/errors.mdx @@ -1,4 +1,4 @@ -# Errors & Solutions +# Troubleshoot ## No kernel image available diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index af65a3c7f..f055e44f0 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -1,27 +1,35 @@ # Installation -Note currently `bitsandbytes` is only supported on CUDA GPU hardwares, support for AMD GPUs and M1 chips (MacOS) is coming soon. +bitsandbytes is only supported on CUDA GPUs for CUDA versions **10.2 - 12.0**. Select your operating system below to see the installation instructions. -## Hardware requirements: - - LLM.int8(): NVIDIA Turing (RTX 20xx; T4) or Ampere GPU (RTX 30xx; A4-A100); (a GPU from 2018 or newer). - - 8-bit optimizers and quantization: NVIDIA Kepler GPU or newer (>=GTX 78X). +For Linux systems, make sure your hardware meets the following requirements to use bitsandbytes features. -Supported CUDA versions: 10.2 - 12.0 #TODO: check currently supported versions +| **Feature** | **Hardware requirement** | +|---|---| +| LLM.int8() | NVIDIA Turing (RTX 20 series, T4) or Ampere (RTX 30 series, A4-A100) GPUs | +| 8-bit optimizers/quantization | NVIDIA Kepler (GTX 780 or newer) | -## Linux +> [!WARNING] +> bitsandbytes >= 0.39.1 no longer includes Kepler binaries in pip installations. This requires manual compilation, and you should follow the general steps and use `cuda11x_nomatmul_kepler` for Kepler-targeted compilation. -### From Pypi +To install from PyPI. ```bash pip install bitsandbytes ``` -### From source +To compile from source, you need CMake >= **3.22.1** and Python >= **3.10** installed. Make sure you have a compiler installed to compile C++ (gcc, make, headers, etc.). For example, to install a compiler and CMake on Ubuntu: -You need CMake and Python installed. For Linux, make sure to install a compiler (`apt install build-essential`, for example). +```bash +apt-get install -y build-essential cmake +``` + +You should also install CUDA Toolkit by following the [NVIDIA CUDA Installation Guide for Linux](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) guide from NVIDIA. + +Now to install the bitsandbytes package from source, run the following commands: ```bash git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/ @@ -30,17 +38,16 @@ cmake -DCOMPUTE_BACKEND=cuda -S . make pip install . ``` -Note support for non-CUDA GPUs (e.g. AMD, Intel, Apple Silicon), is also coming soon. -For a more detailed compilation guide, head to the [dedicated page on the topic](./compiling) + +> [!TIP] +> If you have multiple versions of CUDA installed or installed it in a non-standard location, please refer to CMake CUDA documentation for how to configure the CUDA compiler. -## Windows - -Windows builds require Visual Studio with C++ support, as well as the Cuda SDK installed. +Windows systems require Visual Studio with C++ support as well as an installation of the CUDA SDK. 
-Currently for Windows users, you need to build bitsandbytes from source: +You'll need to build bitsandbytes from source. To compile from source, you need CMake >= **3.22.1** and Python >= **3.10** installed. You should also install CUDA Toolkit by following the [CUDA Installation Guide for Windows](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html) guide from NVIDIA. ```bash git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/ @@ -52,15 +59,52 @@ python -m build --wheel Big thanks to [wkpark](https://github.com/wkpark), [Jamezo97](https://github.com/Jamezo97), [rickardp](https://github.com/rickardp), [akx](https://github.com/akx) for their amazing contributions to make bitsandbytes compatible with Windows. -For a more detailed compilation guide, head to the [dedicated page on the topic](./compiling) - -## MacOS - -Mac support is still a work in progress. Please make sure to check out the [Apple Silicon implementation coordination issue](https://github.com/TimDettmers/bitsandbytes/issues/1020) to get notified about the discussions and progress with respect to MacOS integration. +> [!TIP] +> MacOS support is still a work in progress! Subscribe to this [issue](https://github.com/TimDettmers/bitsandbytes/issues/1020) to get notified about discussions and to track the integration progress. - + +## PyTorch CUDA versions + +Some bitsandbytes features may need a newer CUDA version than the one currently supported by PyTorch binaries from Conda and pip. In this case, you should follow these instructions to load a precompiled bitsandbytes binary. + +1. Determine the path of the CUDA version you want to use. Common paths include: + +* `/usr/local/cuda` +* `/usr/local/cuda-XX.X` where `XX.X` is the CUDA version number + +Then locally install the CUDA version you need with this script from bitsandbytes: + +```bash +wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh +# Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH +# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122} +# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True + +# For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc + +bash cuda_install.sh 117 ~/local 1 +``` + +2. Set the environment variables `BNB_CUDA_VERSION` and `LD_LIBRARY_PATH` by manually overriding the CUDA version installed by PyTorch. + +> [!TIP] +> It is recommended to add the following lines to the `.bashrc` file to make them permanent. + +```bash +export BNB_CUDA_VERSION= +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH: +``` + +For example, to use a local install path: + +```bash +export BNB_CUDA_VERSION=117 +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/tim/local/cuda-11.7 +``` + +3. Now when you launch bitsandbytes with these environment variables, the PyTorch CUDA version is overridden by the new CUDA version (in this example, version 11.7) and a different bitsandbytes library is loaded. diff --git a/docs/source/integrations.mdx b/docs/source/integrations.mdx index 0df7efb72..0e37765c5 100644 --- a/docs/source/integrations.mdx +++ b/docs/source/integrations.mdx @@ -6,8 +6,10 @@ Please review the [bitsandbytes section in the Accelerate docs](https://huggingf Details about the BitsAndBytesConfig can be found [here](https://huggingface.co/docs/transformers/v4.37.2/en/main_classes/quantization#transformers.BitsAndBytesConfig). 
-## Beware: bf16 is optional compute data type -If your hardware supports it, `bf16` is the optimal compute dtype. The default is `float32` for backward compatibility and numerical stability. `float16` often leads to numerical instabilities, but `bfloat16` provides the benefits of both worlds: numerical stability and significant computation speedup. Therefore, be sure to check if your hardware supports `bf16` and configure it using the `bnb_4bit_compute_dtype` parameter in BitsAndBytesConfig: +> [!WARNING] +> **Beware: bf16 is the optimal compute data type!** +> +> If your hardware supports it, `bf16` is the optimal compute dtype. The default is `float32` for backward compatibility and numerical stability. `float16` often leads to numerical instabilities, but `bfloat16` provides the benefits of both worlds: numerical stability equivalent to float32, but combined with the memory footprint and significant computation speedup of a 16-bit data type. Therefore, be sure to check if your hardware supports `bf16` and configure it using the `bnb_4bit_compute_dtype` parameter in BitsAndBytesConfig: ```py import torch diff --git a/docs/source/nonpytorchcuda.mdx b/docs/source/nonpytorchcuda.mdx deleted file mode 100644 index 099a6961b..000000000 --- a/docs/source/nonpytorchcuda.mdx +++ /dev/null @@ -1,46 +0,0 @@ -# How to use a CUDA version that is different from PyTorch - -Some features of `bitsandbytes` may need a newer CUDA version than regularly supported by PyTorch binaries from conda / pip. In that case you can use the following instructions to load a precompiled `bitsandbytes` binary that works for you. - -## Installing or determining the CUDA installation - -Determine the path of the CUDA version that you want to use. Common paths paths are: -```bash -/usr/local/cuda -/usr/local/cuda-XX.X -``` - -where XX.X is the CUDA version number. - -You can also install CUDA version that you need locally with a script provided by `bitsandbytes` as follows: - -```bash -wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh -# Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH -# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122} -# EXPORT_TO_BASH in {0, 1} with 0=False and 1=True - -# For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc - -bash cuda_install.sh 117 ~/local 1 -``` - -## Setting the environmental variables `BNB_CUDA_VERSION`, and `LD_LIBRARY_PATH` - -To manually override the PyTorch installed CUDA version you need to set to variable, like so: - -```bash -export BNB_CUDA_VERSION= -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH: -``` - -For example, to use the local install path from above: - -```bash -export BNB_CUDA_VERSION=117 -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/tim/local/cuda-11.7 -``` - -It is best to add these lines to the `.bashrc` file to make them permanent. - -If you now launch bitsandbytes with these environmental variables the PyTorch CUDA version will be overridden by the new CUDA version and a different bitsandbytes library is loaded (in this case version 117). 
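The `bnb_4bit_compute_dtype` guidance in the `integrations.mdx` hunk above stops right where its code sample begins, so the following sketch spells out the pattern it describes. The model id is a placeholder, and the snippet assumes the `transformers` and `accelerate` packages plus a bf16-capable CUDA GPU; treat it as an illustration rather than part of any patch.

```py
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# 4-bit quantization with bfloat16 as the compute dtype, as recommended above.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-350m",  # placeholder model id
    quantization_config=quantization_config,
    device_map="auto",
)
```

On hardware without bf16 support, `torch.float16` can be substituted, with the numerical-stability caveats the documentation already notes.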
From cfd6ac75eff48e8c06b03cd8e721302a713c77a8 Mon Sep 17 00:00:00 2001 From: Marc Sun <57196510+SunMarc@users.noreply.github.com> Date: Wed, 21 Feb 2024 15:46:02 -0500 Subject: [PATCH 056/112] add deepcopy and copy for Param4bit (#1060) * fix deepcopy and copy * add tests * remove line * ruff fix * ruff * Update tests/test_linear4bit.py Co-authored-by: Aarni Koskela * add missing state * ruff format * ignore formatting commit for git blame * Params4bit should be initialized as frozen by default * add test for serialization round-tripping * add comparison capability for QuantSate * add back accidentally remove line --------- Co-authored-by: Aarni Koskela Co-authored-by: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> --- .git-blame-ignore-revs | 3 ++ bitsandbytes/functional.py | 15 ++++++++ bitsandbytes/nn/modules.py | 43 +++++++++++++++++++-- tests/test_linear4bit.py | 77 +++++++++++++++++++++++++++++++------- 4 files changed, 121 insertions(+), 17 deletions(-) diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index f7dd01bdf..c0386dc9f 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -6,3 +6,6 @@ ea7c14f8ef64924f2d0ff80df3cdabf2c7299848 # Remove f-prefix from strings that don't use formatting 7727fa4c8c6c1ef2b109120aff4196a0a6bf3ed6 + +# format tests/linear_4bit.py +34735ba89de8235ea9da6ef409f814dcea9e2038 \ No newline at end of file diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index 9fc5e08f0..f0de962e1 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -706,6 +706,21 @@ def to(self, device): self.state2.absmax = self.state2.absmax.to(device) self.state2.code = self.state2.code.to(device) + def __eq__(self, other): + if not isinstance(other, QuantState): + return False + + return ( + torch.allclose(self.absmax, other.absmax, atol=1e-6) and + self.shape == other.shape and + torch.allclose(self.code, other.code, atol=1e-6) and + self.dtype == other.dtype and + self.blocksize == other.blocksize and + self.quant_type == other.quant_type and + (self.offset == other.offset if self.offset is not None and other.offset is not None else self.offset is other.offset) and + (self.state2 == other.state2 if self.state2 is not None and other.state2 is not None else self.state2 is other.state2) + ) + def quantize_blockwise( A: Tensor, diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index 2b7e1f067..bd2bd5832 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -2,6 +2,7 @@ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
+import copy from typing import Any, Dict, Optional, TypeVar, Union, overload import warnings @@ -191,7 +192,7 @@ class Params4bit(torch.nn.Parameter): def __new__( cls, data: Optional[torch.Tensor] = None, - requires_grad=True, + requires_grad=False, # quantized weights should be frozen by default quant_state: Optional[QuantState] = None, blocksize: int = 64, compress_statistics: bool = True, @@ -214,6 +215,37 @@ def __new__( self.module = module return self + def __getstate__(self): + state = self.__dict__ + state["data"] = self.data + state["requires_grad"] = self.requires_grad + return state + + def __setstate__(self, state): + self.requires_grad = state["requires_grad"] + self.blocksize = state["blocksize"] + self.compress_statistics = state["compress_statistics"] + self.quant_type = state["quant_type"] + self.quant_state = state["quant_state"] + self.data = state["data"] + self.quant_storage = state["quant_storage"] + self.bnb_quantized = state["bnb_quantized"] + self.module = state["module"] + + def __deepcopy__(self,memo): + new_instance = type(self).__new__(type(self)) + state = self.__getstate__() + new_instance.__setstate__(state) + new_instance.quant_state = copy.deepcopy(state["quant_state"]) + new_instance.data = copy.deepcopy(state["data"]) + return new_instance + + def __copy__(self): + new_instance = type(self).__new__(type(self)) + state = self.__getstate__() + new_instance.__setstate__(state) + return new_instance + @classmethod def from_prequantized(cls, data: torch.Tensor, quantized_stats: Dict[str, Any], requires_grad: bool = False, device='cuda', **kwargs) -> "Params4bit": self = torch.Tensor._make_subclass(cls, data.to(device)) @@ -227,8 +259,13 @@ def from_prequantized(cls, data: torch.Tensor, quantized_stats: Dict[str, Any], def _quantize(self, device): w = self.data.contiguous().cuda(device) - w_4bit, quant_state = bnb.functional.quantize_4bit(w, blocksize=self.blocksize, compress_statistics=self.compress_statistics, - quant_type=self.quant_type, quant_storage=self.quant_storage) + w_4bit, quant_state = bnb.functional.quantize_4bit( + w, + blocksize=self.blocksize, + compress_statistics=self.compress_statistics, + quant_type=self.quant_type, + quant_storage=self.quant_storage, + ) self.data = w_4bit self.quant_state = quant_state if self.module is not None: diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py index 13db28ed4..3e62bdf3b 100644 --- a/tests/test_linear4bit.py +++ b/tests/test_linear4bit.py @@ -1,4 +1,6 @@ +import copy import os +import pickle from tempfile import TemporaryDirectory import pytest @@ -8,13 +10,14 @@ from tests.helpers import TRUE_FALSE storage = { - 'uint8': torch.uint8, - 'float16': torch.float16, - 'bfloat16': torch.bfloat16, - 'float32': torch.float32 + "uint8": torch.uint8, + "float16": torch.float16, + "bfloat16": torch.bfloat16, + "float32": torch.float32, } -@pytest.mark.parametrize("quant_storage", ['uint8', 'float16', 'bfloat16', 'float32']) + +@pytest.mark.parametrize("quant_storage", ["uint8", "float16", "bfloat16", "float32"]) @pytest.mark.parametrize("bias", TRUE_FALSE) @pytest.mark.parametrize("compress_statistics", TRUE_FALSE) @pytest.mark.parametrize("quant_type", ["nf4", "fp4"]) @@ -24,7 +27,9 @@ def test_linear_serialization(quant_type, compress_statistics, bias, quant_stora device = "cuda" layer_shape = (300, 400) - linear = torch.nn.Linear(*layer_shape, dtype=original_dtype, device="cpu") # original layer + linear = torch.nn.Linear( + *layer_shape, dtype=original_dtype, device="cpu" + ) # original layer 
# Quantizing original layer linear_q = bnb.nn.Linear4bit( @@ -36,7 +41,9 @@ def test_linear_serialization(quant_type, compress_statistics, bias, quant_stora quant_type=quant_type, device="meta", ) - new_weight = bnb.nn.Params4bit(data=linear.weight, quant_type=quant_type, requires_grad=False) + new_weight = bnb.nn.Params4bit( + data=linear.weight, quant_type=quant_type, requires_grad=False + ) linear_q.weight = new_weight if bias: linear_q.bias = torch.nn.Parameter(linear.bias) @@ -80,7 +87,12 @@ def test_linear_serialization(quant_type, compress_statistics, bias, quant_stora quant_storage=storage[quant_storage], device="meta", ) - linear_qs.weight = bnb.nn.Params4bit(data=linear.weight, requires_grad=False, quant_type=quant_type, quant_storage=storage[quant_storage]) + linear_qs.weight = bnb.nn.Params4bit( + data=linear.weight, + requires_grad=False, + quant_type=quant_type, + quant_storage=storage[quant_storage], + ) if bias: linear_qs.bias = torch.nn.Parameter(linear.bias) linear_qs = linear_qs.to(device) @@ -91,7 +103,7 @@ def test_linear_serialization(quant_type, compress_statistics, bias, quant_stora q0 = a.quant_state q1 = b.quant_state - for attr in ('code', 'dtype', 'blocksize', 'absmax'): + for attr in ("code", "dtype", "blocksize", "absmax"): c, d = getattr(q0, attr), getattr(q1, attr) if isinstance(c, torch.Tensor): assert torch.equal(c, d) @@ -99,7 +111,7 @@ def test_linear_serialization(quant_type, compress_statistics, bias, quant_stora assert c == d, f"{c} != {d}" if q0.state2 is not None: - for attr in ('code', 'dtype', 'blocksize', 'absmax'): + for attr in ("code", "dtype", "blocksize", "absmax"): c, d = getattr(q0.state2, attr), getattr(q1.state2, attr) if isinstance(c, torch.Tensor): assert torch.equal(c, d) @@ -125,7 +137,7 @@ def test_linear_serialization(quant_type, compress_statistics, bias, quant_stora assert torch.equal(a, c) # Test moving to CPU and back to GPU - linear_q2.to('cpu') + linear_q2.to("cpu") linear_q2.to(device) d = linear_qs(x) assert c.dtype == d.dtype @@ -139,10 +151,47 @@ def test_linear_serialization(quant_type, compress_statistics, bias, quant_stora torch.save(linear.state_dict(), state_path) torch.save(linear_q.state_dict(), state_path_4bit) - size_orig, size_4 = os.path.getsize(state_path), os.path.getsize( - state_path_4bit + size_orig, size_4 = ( + os.path.getsize(state_path), + os.path.getsize(state_path_4bit), ) size_ratio = size_4 / size_orig - target_compression = 0.143 if original_dtype == torch.float32 else 0.29 # these numbers get lower as weight shape increases + target_compression = ( + 0.143 if original_dtype == torch.float32 else 0.29 + ) # these numbers get lower as weight shape increases ratio_error_msg = f"quantized_size {size_4:,} is larger on disk than {target_compression:.2%} of original size {size_orig:,}" assert size_ratio < target_compression, ratio_error_msg + + +def test_copy_param(): + tensor = torch.tensor([1.0, 2.0, 3.0, 4.0]) + param = bnb.nn.Params4bit(data=tensor, requires_grad=False).cuda(0) + + shallow_copy_param = copy.copy(param) + assert param.quant_state is shallow_copy_param.quant_state + assert param.data.data_ptr() == shallow_copy_param.data.data_ptr() + + +def test_deepcopy_param(): + tensor = torch.tensor([1.0, 2.0, 3.0, 4.0]) + param = bnb.nn.Params4bit(data=tensor, requires_grad=False).cuda(0) + copy_param = copy.deepcopy(param) + assert param.quant_state is not copy_param.quant_state + assert param.data.data_ptr() != copy_param.data.data_ptr() + + +def test_params4bit_real_serialization(): + 
original_tensor = torch.tensor([1.0, 2.0, 3.0, 4.0], dtype=torch.float32) + original_param = bnb.nn.Params4bit(data=original_tensor, quant_type="fp4") + + original_param.cuda(0) # move to CUDA to trigger quantization + + serialized_param = pickle.dumps(original_param) + deserialized_param = pickle.loads(serialized_param) + + assert torch.equal(original_param.data, deserialized_param.data) + assert original_param.requires_grad == deserialized_param.requires_grad == False + assert original_param.quant_type == deserialized_param.quant_type + assert original_param.blocksize == deserialized_param.blocksize + assert original_param.compress_statistics == deserialized_param.compress_statistics + assert original_param.quant_state == deserialized_param.quant_state From e4376db4f847ddaa8c58ed0e9f8276f608bddfd5 Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Thu, 22 Feb 2024 13:59:10 +0000 Subject: [PATCH 057/112] blame-ignore-rev entry obsolete due to prior squash merge --- .git-blame-ignore-revs | 3 --- 1 file changed, 3 deletions(-) diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index c0386dc9f..f7dd01bdf 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -6,6 +6,3 @@ ea7c14f8ef64924f2d0ff80df3cdabf2c7299848 # Remove f-prefix from strings that don't use formatting 7727fa4c8c6c1ef2b109120aff4196a0a6bf3ed6 - -# format tests/linear_4bit.py -34735ba89de8235ea9da6ef409f814dcea9e2038 \ No newline at end of file From e820409c095ea7cbb5ce156992307b84352cbf90 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Thu, 22 Feb 2024 12:16:40 -0500 Subject: [PATCH 058/112] (docs) Clarify Python and CUDA Toolkit version requirement (#1076) (misc) Update CUDA download URLs --- bitsandbytes/cuda_setup/main.py | 4 ++-- docs/source/installation.mdx | 8 ++++---- install_cuda.py | 15 +++++---------- install_cuda.sh | 32 ++++++++------------------------ 4 files changed, 19 insertions(+), 40 deletions(-) diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py index 14c7abbd8..cd0d94cd7 100644 --- a/bitsandbytes/cuda_setup/main.py +++ b/bitsandbytes/cuda_setup/main.py @@ -161,7 +161,7 @@ def run_cuda_setup(self): self.add_log_entry('3. CUDA not installed') self.add_log_entry('4. You have multiple conflicting CUDA libraries') self.add_log_entry('5. Required library not pre-compiled for this bitsandbytes release!') - self.add_log_entry('CUDA SETUP: If you compiled from source, try again with `make CUDA_VERSION=DETECTED_CUDA_VERSION` for example, `make CUDA_VERSION=113`.') + self.add_log_entry('CUDA SETUP: If you compiled from source, try again with `make CUDA_VERSION=DETECTED_CUDA_VERSION` for example, `make CUDA_VERSION=118`.') self.add_log_entry('CUDA SETUP: The CUDA version for the compile might depend on your conda install. Inspect CUDA version via `conda list | grep cuda`.') self.add_log_entry('='*80) self.add_log_entry('') @@ -268,7 +268,7 @@ def warn_in_case_of_duplicates(results_paths: Set[Path]) -> None: "BNB_CUDA_VERSION=122 python ..." "OR set the environmental variable in your .bashrc: export BNB_CUDA_VERSION=122" "In the case of a manual override, make sure you set the LD_LIBRARY_PATH, e.g." 
- "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.2") + "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12.2") CUDASetup.get_instance().add_log_entry(warning_msg, is_warning=True) diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index f055e44f0..c6d1f27ca 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -1,6 +1,6 @@ # Installation -bitsandbytes is only supported on CUDA GPUs for CUDA versions **10.2 - 12.0**. Select your operating system below to see the installation instructions. +bitsandbytes is only supported on CUDA GPUs for CUDA versions **11.0 - 12.3**. Select your operating system below to see the installation instructions. @@ -21,7 +21,7 @@ To install from PyPI. pip install bitsandbytes ``` -To compile from source, you need CMake >= **3.22.1** and Python >= **3.10** installed. Make sure you have a compiler installed to compile C++ (gcc, make, headers, etc.). For example, to install a compiler and CMake on Ubuntu: +To compile from source, you need CMake >= **3.22.1** and Python >= **3.8** installed. Make sure you have a compiler installed to compile C++ (gcc, make, headers, etc.). For example, to install a compiler and CMake on Ubuntu: ```bash apt-get install -y build-essential cmake @@ -47,7 +47,7 @@ pip install . Windows systems require Visual Studio with C++ support as well as an installation of the CUDA SDK. -You'll need to build bitsandbytes from source. To compile from source, you need CMake >= **3.22.1** and Python >= **3.10** installed. You should also install CUDA Toolkit by following the [CUDA Installation Guide for Windows](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html) guide from NVIDIA. +You'll need to build bitsandbytes from source. To compile from source, you need CMake >= **3.22.1** and Python >= **3.8** installed. You should also install CUDA Toolkit by following the [CUDA Installation Guide for Windows](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html) guide from NVIDIA. 
```bash git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/ @@ -82,7 +82,7 @@ Then locally install the CUDA version you need with this script from bitsandbyte ```bash wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh # Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH -# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122} +# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122, 123} # EXPORT_TO_BASH in {0, 1} with 0=False and 1=True # For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc diff --git a/install_cuda.py b/install_cuda.py index 4b041b8d0..b41b33b39 100644 --- a/install_cuda.py +++ b/install_cuda.py @@ -4,10 +4,6 @@ from urllib.request import urlretrieve cuda_versions = { - "92": "https://developer.nvidia.com/compute/cuda/9.2/Prod2/local_installers/cuda_9.2.148_396.37_linux", - "100": "https://developer.nvidia.com/compute/cuda/10.0/Prod/local_installers/cuda_10.0.130_410.48_linux", - "101": "https://developer.nvidia.com/compute/cuda/10.1/Prod/local_installers/cuda_10.1.105_418.39_linux.run", - "102": "https://developer.download.nvidia.com/compute/cuda/10.2/Prod/local_installers/cuda_10.2.89_440.33.01_linux.run", "110": "https://developer.download.nvidia.com/compute/cuda/11.0.3/local_installers/cuda_11.0.3_450.51.06_linux.run", "111": "https://developer.download.nvidia.com/compute/cuda/11.1.1/local_installers/cuda_11.1.1_455.32.00_linux.run", "112": "https://developer.download.nvidia.com/compute/cuda/11.2.2/local_installers/cuda_11.2.2_460.32.03_linux.run", @@ -15,15 +11,14 @@ "114": "https://developer.download.nvidia.com/compute/cuda/11.4.4/local_installers/cuda_11.4.4_470.82.01_linux.run", "115": "https://developer.download.nvidia.com/compute/cuda/11.5.2/local_installers/cuda_11.5.2_495.29.05_linux.run", "116": "https://developer.download.nvidia.com/compute/cuda/11.6.2/local_installers/cuda_11.6.2_510.47.03_linux.run", - "117": "https://developer.download.nvidia.com/compute/cuda/11.7.0/local_installers/cuda_11.7.0_515.43.04_linux.run", + "117": "https://developer.download.nvidia.com/compute/cuda/11.7.1/local_installers/cuda_11.7.1_515.65.01_linux.run", "118": "https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run", - "120": "https://developer.download.nvidia.com/compute/cuda/12.0.0/local_installers/cuda_12.0.0_525.60.13_linux.run", - "121": "https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run", - "122": "https://developer.download.nvidia.com/compute/cuda/12.2.0/local_installers/cuda_12.2.0_535.54.03_linux.run", - "123": "https://developer.download.nvidia.com/compute/cuda/12.3.1/local_installers/cuda_12.3.1_545.23.08_linux.run", + "120": "https://developer.download.nvidia.com/compute/cuda/12.0.1/local_installers/cuda_12.0.1_525.85.12_linux.run", + "121": "https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda_12.1.1_530.30.02_linux.run", + "122": "https://developer.download.nvidia.com/compute/cuda/12.2.2/local_installers/cuda_12.2.2_535.104.05_linux.run", + "123": "https://developer.download.nvidia.com/compute/cuda/12.3.2/local_installers/cuda_12.3.2_545.23.08_linux.run", } - def install_cuda(version, base_path, download_path): formatted_version = f"{version[:-1]}.{version[-1]}" folder = f"cuda-{formatted_version}" diff --git a/install_cuda.sh b/install_cuda.sh index 
70263da15..8ffbc8478 100644 --- a/install_cuda.sh +++ b/install_cuda.sh @@ -1,7 +1,3 @@ -URL92=https://developer.nvidia.com/compute/cuda/9.2/Prod2/local_installers/cuda_9.2.148_396.37_linux -URL100=https://developer.nvidia.com/compute/cuda/10.0/Prod/local_installers/cuda_10.0.130_410.48_linux -URL101=https://developer.nvidia.com/compute/cuda/10.1/Prod/local_installers/cuda_10.1.105_418.39_linux.run -URL102=https://developer.download.nvidia.com/compute/cuda/10.2/Prod/local_installers/cuda_10.2.89_440.33.01_linux.run URL110=https://developer.download.nvidia.com/compute/cuda/11.0.3/local_installers/cuda_11.0.3_450.51.06_linux.run URL111=https://developer.download.nvidia.com/compute/cuda/11.1.1/local_installers/cuda_11.1.1_455.32.00_linux.run URL112=https://developer.download.nvidia.com/compute/cuda/11.2.2/local_installers/cuda_11.2.2_460.32.03_linux.run @@ -9,12 +5,12 @@ URL113=https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installer URL114=https://developer.download.nvidia.com/compute/cuda/11.4.4/local_installers/cuda_11.4.4_470.82.01_linux.run URL115=https://developer.download.nvidia.com/compute/cuda/11.5.2/local_installers/cuda_11.5.2_495.29.05_linux.run URL116=https://developer.download.nvidia.com/compute/cuda/11.6.2/local_installers/cuda_11.6.2_510.47.03_linux.run -URL117=https://developer.download.nvidia.com/compute/cuda/11.7.0/local_installers/cuda_11.7.0_515.43.04_linux.run +URL117=https://developer.download.nvidia.com/compute/cuda/11.7.1/local_installers/cuda_11.7.1_515.65.01_linux.run URL118=https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run -URL120=https://developer.download.nvidia.com/compute/cuda/12.0.0/local_installers/cuda_12.0.0_525.60.13_linux.run +URL120=https://developer.download.nvidia.com/compute/cuda/12.0.1/local_installers/cuda_12.0.1_525.85.12_linux.run URL121=https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda_12.1.1_530.30.02_linux.run -URL122=https://developer.download.nvidia.com/compute/cuda/12.2.1/local_installers/cuda_12.2.1_535.86.10_linux.run -URL123=https://developer.download.nvidia.com/compute/cuda/12.3.1/local_installers/cuda_12.3.1_545.23.08_linux.run +URL122=https://developer.download.nvidia.com/compute/cuda/12.2.2/local_installers/cuda_12.2.2_535.104.05_linux.run +URL123=https://developer.download.nvidia.com/compute/cuda/12.3.2/local_installers/cuda_12.3.2_545.23.08_linux.run CUDA_VERSION=$1 @@ -22,28 +18,16 @@ BASE_PATH=$2 EXPORT_BASHRC=$3 if [[ -n "$CUDA_VERSION" ]]; then - if [[ "$CUDA_VERSION" -eq "92" ]]; then - URL=$URL92 - FOLDER=cuda-9.2 - elif [[ "$CUDA_VERSION" -eq "100" ]]; then - URL=$URL100 - FOLDER=cuda-10.0 - elif [[ "$CUDA_VERSION" -eq "101" ]]; then - URL=$URL101 - FOLDER=cuda-10.1 - elif [[ "$CUDA_VERSION" -eq "102" ]]; then - URL=$URL102 - FOLDER=cuda-10.2 - elif [[ "$CUDA_VERSION" -eq "110" ]]; then + if [[ "$CUDA_VERSION" -eq "110" ]]; then URL=$URL110 FOLDER=cuda-11.0 - elif [[ "$CUDA_VERSION" -eq "111" ]]; then + elif [[ "$CUDA_VERSION" -eq "111" ]]; then URL=$URL111 FOLDER=cuda-11.1 - elif [[ "$CUDA_VERSION" -eq "112" ]]; then + elif [[ "$CUDA_VERSION" -eq "112" ]]; then URL=$URL112 FOLDER=cuda-11.2 - elif [[ "$CUDA_VERSION" -eq "113" ]]; then + elif [[ "$CUDA_VERSION" -eq "113" ]]; then URL=$URL113 FOLDER=cuda-11.3 elif [[ "$CUDA_VERSION" -eq "114" ]]; then From 1626374d318c1e5253bfeb8ec9ef80473a807d65 Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Fri, 23 Feb 2024 
09:44:25 +0000 Subject: [PATCH 059/112] upgrade pre-commit config --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index edcbc9b6b..c8ccfe8df 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,6 +18,6 @@ repos: args: - --fix=lf - repo: https://github.com/crate-ci/typos - rev: v1.17.2 + rev: v1.18.2 hooks: - id: typos From 5d6dfe6fb43e5aae277ec86cba20a002b34df705 Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Fri, 23 Feb 2024 09:46:11 +0000 Subject: [PATCH 060/112] fix newly found typo due to upgraded typos pkg --- csrc/kernels.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 6d15dbe64..f4673359b 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -3075,7 +3075,7 @@ template __global__ void kExtractOutliers(char *A, int *idx, char * //// 4. do dequantization from register of B into second pair of registers //// 5. store (4) into fragment //// 6. matmul aggregate into fragment C -//// 7. aggreecate files of C into shared memory block C +//// 7. aggregate files of C into shared memory block C //// 8. sum (7) //// 9. write outputs to matmul output matrix //} From 1f36bd4cf24d221e61cf2609b7c6170e955222bf Mon Sep 17 00:00:00 2001 From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Mon, 26 Feb 2024 16:12:46 +0100 Subject: [PATCH 061/112] docs: fix link text --- docs/source/integrations.mdx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/integrations.mdx b/docs/source/integrations.mdx index 0e37765c5..bcba6e5e5 100644 --- a/docs/source/integrations.mdx +++ b/docs/source/integrations.mdx @@ -2,7 +2,7 @@ With Transformers it's very easy to load any model in 4 or 8-bit, quantizing them on the fly with bitsandbytes primitives. -Please review the [bitsandbytes section in the Accelerate docs](https://huggingface.co/docs/transformers/v4.37.2/en/quantization#bitsandbytes). +Please review the [bitsandbytes section in the Transformers docs](https://huggingface.co/docs/transformers/v4.37.2/en/quantization#bitsandbytes). Details about the BitsAndBytesConfig can be found [here](https://huggingface.co/docs/transformers/v4.37.2/en/main_classes/quantization#transformers.BitsAndBytesConfig). @@ -21,7 +21,7 @@ quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dty # PEFT With `PEFT`, you can use QLoRA out of the box with `LoraConfig` and a 4-bit base model. -Please review the [bitsandbytes section in the Accelerate docs](https://huggingface.co/docs/peft/developer_guides/quantization#quantize-a-model). +Please review the [bitsandbytes section in the PEFT docs](https://huggingface.co/docs/peft/developer_guides/quantization#quantize-a-model). # Accelerate From a03df4325dfa8e25f9780d1b854870d85a972898 Mon Sep 17 00:00:00 2001 From: Sebastian Raschka Date: Mon, 26 Feb 2024 13:42:23 -0600 Subject: [PATCH 062/112] Lit-GPT integration docs (#1089) * lit-gpt integration * mention PT lightning --- docs/source/integrations.mdx | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/docs/source/integrations.mdx b/docs/source/integrations.mdx index bcba6e5e5..67d50d6a0 100644 --- a/docs/source/integrations.mdx +++ b/docs/source/integrations.mdx @@ -29,6 +29,25 @@ Bitsandbytes is also easily usable from within Accelerate. 
Please review the [bitsandbytes section in the Accelerate docs](https://huggingface.co/docs/accelerate/en/usage_guides/quantization). + + +# PyTorch Lightning and Lightning Fabric + +Bitsandbytes is available from within both +- [PyTorch Lightning](https://lightning.ai/docs/pytorch/stable/), a deep learning framework for professional AI researchers and machine learning engineers who need maximal flexibility without sacrificing performance at scale; +- and [Lightning Fabric](https://lightning.ai/docs/fabric/stable/), a fast and lightweight way to scale PyTorch models without boilerplate). + +Please review the [bitsandbytes section in the PyTorch Lightning docs](https://lightning.ai/docs/pytorch/stable/common/precision_intermediate.html#quantization-via-bitsandbytes). + + +# Lit-GPT + +Bitsandbytes is integrated into [Lit-GPT](https://github.com/Lightning-AI/lit-gpt), a hackable implementation of state-of-the-art open-source large language models, based on Lightning Fabric, where it can be used for quantization during training, finetuning, and inference. + +Please review the [bitsandbytes section in the Lit-GPT quantization docs](https://github.com/Lightning-AI/lit-gpt/blob/main/tutorials/quantize.md). + + + # Trainer for the optimizers You can use any of the 8-bit and/or paged optimizers by simple passing them to the `transformers.Trainer` class on initialization.All bnb optimizers are supported by passing the correct string in `TrainingArguments`'s `optim` attribute - e.g. (`paged_adamw_32bit`). From 433275e3791122a21900474ae0eac8150ec344d2 Mon Sep 17 00:00:00 2001 From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Tue, 27 Feb 2024 10:39:04 +0100 Subject: [PATCH 063/112] improve accelerate reference in docs (#1086) * improve accelerate reference in docs * Apply suggestions from code review Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> * fix spelling --------- Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> --- docs/source/integrations.mdx | 40 ++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/docs/source/integrations.mdx b/docs/source/integrations.mdx index 67d50d6a0..48b4d6060 100644 --- a/docs/source/integrations.mdx +++ b/docs/source/integrations.mdx @@ -1,8 +1,8 @@ # Transformers -With Transformers it's very easy to load any model in 4 or 8-bit, quantizing them on the fly with bitsandbytes primitives. +With Transformers it's very easy to load any model in 4 or 8-bit, quantizing them on the fly with `bitsandbytes` primitives. -Please review the [bitsandbytes section in the Transformers docs](https://huggingface.co/docs/transformers/v4.37.2/en/quantization#bitsandbytes). +Please review the [`bitsandbytes` section in the Transformers docs](https://huggingface.co/docs/transformers/main/en/quantization#bitsandbytes). Details about the BitsAndBytesConfig can be found [here](https://huggingface.co/docs/transformers/v4.37.2/en/main_classes/quantization#transformers.BitsAndBytesConfig). @@ -25,9 +25,37 @@ Please review the [bitsandbytes section in the PEFT docs](https://huggingface.co # Accelerate -Bitsandbytes is also easily usable from within Accelerate. +Bitsandbytes is also easily usable from within Accelerate, where you can quantize any PyTorch model simply by passing a quantization config; e.g: -Please review the [bitsandbytes section in the Accelerate docs](https://huggingface.co/docs/accelerate/en/usage_guides/quantization). 
+```py +from accelerate import init_empty_weights +from accelerate.utils import BnbQuantizationConfig, load_and_quantize_model +from mingpt.model import GPT + +model_config = GPT.get_default_config() +model_config.model_type = 'gpt2-xl' +model_config.vocab_size = 50257 +model_config.block_size = 1024 + +with init_empty_weights(): + empty_model = GPT(model_config) + +bnb_quantization_config = BnbQuantizationConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=torch.bfloat16, # optional + bnb_4bit_use_double_quant=True, # optional + bnb_4bit_quant_type="nf4" # optional +) + +quantized_model = load_and_quantize_model( + empty_model, + weights_location=weights_location, + bnb_quantization_config=bnb_quantization_config, + device_map = "auto" +) +``` + +For further details, e.g. model saving, cpu-offloading andfine-tuning, please review the [`bitsandbytes` section in the Accelerate docs](https://huggingface.co/docs/accelerate/en/usage_guides/quantization). @@ -59,5 +87,5 @@ e.g. for transformers state that you can load any model in 8-bit / 4-bit precisi # Blog posts -- [Making LLMs even more accessible with bitsandbytes, 4-bit quantization and QLoRA](https://huggingface.co/blog/4bit-transformers-bitsandbytes) -- [A Gentle Introduction to 8-bit Matrix Multiplication for transformers at scale using Hugging Face Transformers, Accelerate and bitsandbytes](https://huggingface.co/blog/hf-bitsandbytes-integration) +- [Making LLMs even more accessible with `bitsandbytes`, 4-bit quantization and QLoRA](https://huggingface.co/blog/4bit-transformers-bitsandbytes) +- [A Gentle Introduction to 8-bit Matrix Multiplication for transformers at scale using Hugging Face Transformers, Accelerate and `bitsandbytes`](https://huggingface.co/blog/hf-bitsandbytes-integration) From 753df25c7fed6683b7d4562319849192ec8d9873 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Tue, 27 Feb 2024 12:24:20 -0500 Subject: [PATCH 064/112] (cmake) Fix cuda arch selection (#1091) * (cmake) Fix generation of targets for nvcc * Typo * (ci) linux + CUDA workflow: make sure we specify target architectures * fix * fix one more time * (cmake) Default in CMAKE_CUDA_ARCHITECTURES_ALL when cmake<3.23, make sure we build only selected cubins and only ptx for latest capability * Fix static lookup for CMAKE_CUDA_ARCHITECTURES_ALL on cmake<3.23 * Remove debug setting * clarification --- .github/workflows/python-package.yml | 2 +- CMakeLists.txt | 44 ++++++++++++++++++++++++---- 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index e48c25cc5..faa30ca30 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -125,7 +125,7 @@ jobs: docker run --platform linux/$build_arch -i -w /src -v $PWD:/src $image sh -c \ "apt-get update \ && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \ - && cmake -DCOMPUTE_BACKEND=cuda -DNO_CUBLASLT=${NO_CUBLASLT} . \ + && cmake -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY=\"50;52;60;61;70;75;80;86;89;90\" -DNO_CUBLASLT=${NO_CUBLASLT} . \ && cmake --build ." else cmake -G Ninja -DCOMPUTE_BACKEND=cuda -DNO_CUBLASLT=${NO_CUBLASLT} -DCMAKE_BUILD_TYPE=Release -S . 
diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b9f1854b..7f70a089e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,7 +33,7 @@ endif() set(BNB_OUTPUT_NAME "bitsandbytes") -message(STATUS "Building with backend ${COMPUTE_BACKEND}") +message(STATUS "Configuring ${PROJECT_NAME} (Backend: ${COMPUTE_BACKEND})") if(${COMPUTE_BACKEND} STREQUAL "cuda") if(APPLE) @@ -82,6 +82,31 @@ if(BUILD_CUDA) message(FATAL_ERROR "CUDA Version > 12 is not supported") endif() + # CMake < 3.23.0 does not define CMAKE_CUDA_ARCHITECTURES_ALL. + if(CMAKE_VERSION VERSION_LESS "3.23.0") + message(STATUS "CMake < 3.23.0; determining CUDA architectures supported...") + + # 11.x and 12.x both support these at a minimum. + set(CMAKE_CUDA_ARCHITECTURES_ALL 50 52 53 60 61 62 70 72 75 80) + set(CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 50 60 70 80) + + # CUDA 11.1 adds Ampere support for GA102-GA107. + if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.1") + list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 86) + endif() + + # CUDA 11.4 adds Ampere support for GA10B. + if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.4") + list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 87) + endif() + + # CUDA 11.8 adds support for Ada and Hopper. + if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.8") + list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 89 90) + list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 90) + endif() + endif() + string(APPEND CMAKE_CUDA_FLAGS " --use_fast_math") if(PTXAS_VERBOSE) # Verbose? Outputs register usage information, and other things... @@ -103,10 +128,18 @@ if(BUILD_CUDA) message(STATUS "CUDA Capabilities Available: ${POSSIBLE_CAPABILITIES}") message(STATUS "CUDA Capabilities Selected: ${COMPUTE_CAPABILITY}") - foreach(capability ${COMPUTE_CAPABILITY}) - string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_${capability},code=sm_${capability}") - endforeach() - + # Use the "real" option to build native cubin for all selections. + # Ensure we build the PTX for the latest version. + # This behavior of adding a PTX (virtual) target for the highest architecture + # is similar to how the "all" and "all-major" options would behave in CMake >= 3.23. 
+ # TODO: Consider bumping CMake requirement and using CMAKE_CUDA_ARCHITECTURES=[all | native] by default + list(REMOVE_DUPLICATES COMPUTE_CAPABILITY) + list(SORT COMPUTE_CAPABILITY COMPARE NATURAL) + list(POP_BACK COMPUTE_CAPABILITY _LATEST_CAPABILITY) + list(TRANSFORM COMPUTE_CAPABILITY APPEND "-real" OUTPUT_VARIABLE CMAKE_CUDA_ARCHITECTURES) + list(APPEND CMAKE_CUDA_ARCHITECTURES ${_LATEST_CAPABILITY}) + + message(STATUS "CUDA Targets: ${CMAKE_CUDA_ARCHITECTURES}") message(STATUS "CUDA NVCC Flags: ${CMAKE_CUDA_FLAGS}") list(APPEND SRC_FILES ${CUDA_FILES}) @@ -149,7 +182,6 @@ endif() # Weird MSVC hacks if(MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2 /fp:fast") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2 /fp:fast") endif() set_source_files_properties(${CPP_FILES} PROPERTIES LANGUAGE CXX) From cc5f8cd8b9f6f97f30b85322780359851ee2caf1 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Tue, 27 Feb 2024 13:03:42 -0500 Subject: [PATCH 065/112] (cmake) Update library output directory (#1080) --- CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7f70a089e..62ff4e535 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -214,10 +214,10 @@ if(WIN32) endif() set_target_properties(bitsandbytes PROPERTIES OUTPUT_NAME ${BNB_OUTPUT_NAME}) if(MSVC) - set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_RELEASE bitsandbytes) - set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_DEBUG bitsandbytes) - set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE bitsandbytes) - set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG bitsandbytes) + set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_RELEASE "${PROJECT_SOURCE_DIR}/bitsandbytes") + set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_DEBUG "${PROJECT_SOURCE_DIR}/bitsandbytes") + set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE "${PROJECT_SOURCE_DIR}/bitsandbytes") + set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG "${PROJECT_SOURCE_DIR}/bitsandbytes") endif() set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY bitsandbytes) From 4b232edf8c923dab4a0059a449cd3dab3201e4d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alejandro=20Rodr=C3=ADguez=20Salamanca?= Date: Tue, 27 Feb 2024 19:46:20 +0100 Subject: [PATCH 066/112] Fix example int8_inference_huggingface.py (#414) * Fix example int8_inference_huggingface.py * Update examples/int8_inference_huggingface.py Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> --------- Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> --- examples/int8_inference_huggingface.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/int8_inference_huggingface.py b/examples/int8_inference_huggingface.py index 2cee48e8e..c89ba8d11 100644 --- a/examples/int8_inference_huggingface.py +++ b/examples/int8_inference_huggingface.py @@ -1,24 +1,24 @@ import torch -from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers import LlamaForCausalLM, LlamaTokenizer MAX_NEW_TOKENS = 128 -model_name = 'decapoda-research/llama-7b-hf' +model_name = 'meta-llama/Llama-2-7b-hf' text = 'Hamburg is in which country?\n' -tokenizer = AutoTokenizer.from_pretrained(model_name) +tokenizer 
= LlamaTokenizer.from_pretrained(model_name) input_ids = tokenizer(text, return_tensors="pt").input_ids -free_in_GB = int(torch.cuda.mem_get_info()[0]/1024**3) max_memory = f'{int(torch.cuda.mem_get_info()[0]/1024**3)-2}GB' n_gpus = torch.cuda.device_count() max_memory = {i: max_memory for i in range(n_gpus)} -model = AutoModelForCausalLM.from_pretrained( +model = LlamaForCausalLM.from_pretrained( model_name, device_map='auto', load_in_8bit=True, max_memory=max_memory ) + generated_ids = model.generate(input_ids, max_length=MAX_NEW_TOKENS) print(tokenizer.decode(generated_ids[0], skip_special_tokens=True)) From 1d709aadef94c35ff8a403ab1a65f7343c011074 Mon Sep 17 00:00:00 2001 From: Rickard Date: Tue, 27 Feb 2024 20:15:02 +0100 Subject: [PATCH 067/112] Add concurrency to not waste precious build minutes when modifying PRs frequently. (#1051) Co-authored-by: wkpark --- .github/workflows/python-package.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index faa30ca30..c85cd063d 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -15,10 +15,13 @@ on: - 'setup.py' - 'pyproject.toml' - 'pytest.ini' - - '**/*.md' release: types: [ published ] +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: ## From 0488566462c24f5016ca76f698ef9d4d95d00b11 Mon Sep 17 00:00:00 2001 From: Won-Kyu Park Date: Wed, 28 Feb 2024 04:22:37 +0900 Subject: [PATCH 068/112] fix cudart*dll for Windows (#1064) --- bitsandbytes/cuda_setup/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py index cd0d94cd7..b351f7f03 100644 --- a/bitsandbytes/cuda_setup/main.py +++ b/bitsandbytes/cuda_setup/main.py @@ -30,7 +30,7 @@ DYNAMIC_LIBRARY_SUFFIX = { "Darwin": ".dylib", "Windows": ".dll", "Linux": ".so"}.get(platform.system(), ".so") if platform.system() == "Windows": # Windows - CUDA_RUNTIME_LIBS = ["nvcuda.dll"] + CUDA_RUNTIME_LIBS = ["cudart64_110.dll", "cudart64_12.dll"] else: # Linux or other # these are the most common libs names # libcudart.so is missing by default for a conda install with PyTorch 2.0 and instead From 20f3eea787f577a5c11ad75f6be83b94c2a882ff Mon Sep 17 00:00:00 2001 From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Wed, 28 Feb 2024 15:48:54 +0100 Subject: [PATCH 069/112] docs: add header for compilation from source --- docs/source/installation.mdx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index c6d1f27ca..f701f08d0 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -21,6 +21,8 @@ To install from PyPI. pip install bitsandbytes ``` +## Alternative: Compiling from source + To compile from source, you need CMake >= **3.22.1** and Python >= **3.8** installed. Make sure you have a compiler installed to compile C++ (gcc, make, headers, etc.). 
For example, to install a compiler and CMake on Ubuntu: ```bash From f9eba9c8dd3ffc7d59036fbd16c2b0c498fd3041 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Wed, 28 Feb 2024 09:52:19 -0500 Subject: [PATCH 070/112] (ci) update apt repo before aarch64 build tools are installed (#1096) --- .github/workflows/python-package.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index c85cd063d..a25f53f46 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -56,7 +56,8 @@ jobs: build_os=${{ matrix.os }} build_arch=${{ matrix.arch }} if [ ${build_os:0:6} == ubuntu -a ${build_arch} == aarch64 ]; then - # Allow cross-compile om aarch64 + # Allow cross-compile on aarch64 + sudo apt-get update sudo apt-get install -y gcc-aarch64-linux-gnu binutils-aarch64-linux-gnu g++-aarch64-linux-gnu cmake -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++ -DCOMPUTE_BACKEND=cpu . elif [ ${build_os:0:5} == macos -a ${build_arch} == aarch64 ]; then From a1c0844bdbcabbd4354d73b9c5c9af8077d3c08b Mon Sep 17 00:00:00 2001 From: rdyro Date: Tue, 5 Mar 2024 10:10:08 -0800 Subject: [PATCH 071/112] adding whole Linear8bitLt/Linear4bit module save/load serialization (#1099) --- bitsandbytes/nn/modules.py | 4 +++- tests/test_linear4bit.py | 27 ++++++++++++++++++++++++++- tests/test_linear8bitlt.py | 34 +++++++++++++++++++++++++++++++++- 3 files changed, 62 insertions(+), 3 deletions(-) diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index bd2bd5832..16c8aa9b8 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -449,7 +449,9 @@ def __new__( cls.SCB = None if data is None: data = torch.empty(0) - return torch.Tensor._make_subclass(cls, data, requires_grad) + obj = torch.Tensor._make_subclass(cls, data, requires_grad) + obj.CB, obj.SCB = cls.CB, cls.SCB + return obj def cuda(self, device): if self.has_fp16_weights: diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py index 3e62bdf3b..d1f60423c 100644 --- a/tests/test_linear4bit.py +++ b/tests/test_linear4bit.py @@ -1,4 +1,5 @@ import copy +from io import BytesIO import os import pickle from tempfile import TemporaryDirectory @@ -16,12 +17,24 @@ "float32": torch.float32, } +def torch_save_to_buffer(obj): + buffer = BytesIO() + torch.save(obj, buffer) + buffer.seek(0) + return buffer + +def torch_load_from_buffer(buffer): + buffer.seek(0) + obj = torch.load(buffer) + buffer.seek(0) + return obj @pytest.mark.parametrize("quant_storage", ["uint8", "float16", "bfloat16", "float32"]) @pytest.mark.parametrize("bias", TRUE_FALSE) @pytest.mark.parametrize("compress_statistics", TRUE_FALSE) @pytest.mark.parametrize("quant_type", ["nf4", "fp4"]) -def test_linear_serialization(quant_type, compress_statistics, bias, quant_storage): +@pytest.mark.parametrize("save_before_forward", TRUE_FALSE) +def test_linear_serialization(quant_type, compress_statistics, bias, quant_storage, save_before_forward): original_dtype = torch.float16 compute_dtype = None device = "cuda" @@ -124,6 +137,9 @@ def test_linear_serialization(quant_type, compress_statistics, bias, quant_stora assert a.dtype == b.dtype assert torch.equal(a, b) + if save_before_forward: + bytes_4bit = torch_save_to_buffer(linear_q) + # Forward test x = torch.rand(42, layer_shape[0], device=device) a = linear_q(x) @@ -136,6 +152,10 @@ def 
test_linear_serialization(quant_type, compress_statistics, bias, quant_stora assert torch.equal(a, b) assert torch.equal(a, c) + if not save_before_forward: + bytes_4bit = torch_save_to_buffer(linear_q) + linear_q3 = torch_load_from_buffer(bytes_4bit) + # Test moving to CPU and back to GPU linear_q2.to("cpu") linear_q2.to(device) @@ -144,6 +164,11 @@ def test_linear_serialization(quant_type, compress_statistics, bias, quant_stora assert c.device == d.device assert torch.equal(c, d) + d = linear_q3(x) + assert c.dtype == d.dtype + assert c.device == d.device + assert torch.equal(c, d) + # Saved size ratio test. Target set for layer_shape == (300, 400) w/ bias with TemporaryDirectory() as tmpdir: state_path_4bit = os.path.join(tmpdir, "state_4bit.pth") diff --git a/tests/test_linear8bitlt.py b/tests/test_linear8bitlt.py index 6fa7efb8d..a996b0215 100644 --- a/tests/test_linear8bitlt.py +++ b/tests/test_linear8bitlt.py @@ -1,4 +1,5 @@ from contextlib import nullcontext +from io import BytesIO import os from tempfile import TemporaryDirectory @@ -65,12 +66,25 @@ def test_linear_no_igemmlt(): assert linear_custom.state.CB is not None assert linear_custom.state.CxB is None +def torch_save_to_buffer(obj): + buffer = BytesIO() + torch.save(obj, buffer) + buffer.seek(0) + return buffer + +def torch_load_from_buffer(buffer): + buffer.seek(0) + obj = torch.load(buffer) + buffer.seek(0) + return obj @pytest.mark.parametrize("has_fp16_weights", TRUE_FALSE, ids=id_formatter("has_fp16_weights")) @pytest.mark.parametrize("serialize_before_forward", TRUE_FALSE, ids=id_formatter("serialize_before_forward")) @pytest.mark.parametrize("deserialize_before_cuda", TRUE_FALSE, ids=id_formatter("deserialize_before_cuda")) @pytest.mark.parametrize("force_no_igemmlt", TRUE_FALSE, ids=id_formatter("force_no_igemmlt")) -def test_linear_serialization(has_fp16_weights, serialize_before_forward, deserialize_before_cuda, force_no_igemmlt): +@pytest.mark.parametrize("save_before_forward", TRUE_FALSE, ids=id_formatter("save_before_forward")) +@pytest.mark.parametrize("load_before_cuda", TRUE_FALSE, ids=id_formatter("load_before_cuda")) +def test_linear_serialization(has_fp16_weights, serialize_before_forward, deserialize_before_cuda, force_no_igemmlt, save_before_forward, load_before_cuda): linear = torch.nn.Linear(32, 96) x = torch.randn(3, 32, dtype=torch.half) @@ -93,6 +107,9 @@ def test_linear_serialization(has_fp16_weights, serialize_before_forward, deseri if serialize_before_forward: state_dict_8bit = linear_custom.state_dict() + if save_before_forward: + bytes_8bit = torch_save_to_buffer(linear_custom) + x_first = x.clone().cuda().requires_grad_(True) fx_first = linear_custom(x_first).float() grad_proj = torch.randn_like(fx_first) @@ -101,6 +118,9 @@ def test_linear_serialization(has_fp16_weights, serialize_before_forward, deseri if not serialize_before_forward: state_dict_8bit = linear_custom.state_dict() + if not save_before_forward: + bytes_8bit = torch_save_to_buffer(linear_custom) + with TemporaryDirectory() as tmpdir: state_path_8bit = os.path.join(tmpdir, "state_8bit.pth") state_path = os.path.join(tmpdir, "state.pth") @@ -127,16 +147,28 @@ def test_linear_serialization(has_fp16_weights, serialize_before_forward, deseri with nullcontext() if has_fp16_weights else pytest.raises(RuntimeError): new_linear_custom.load_state_dict(new_state_dict, strict=True) + if load_before_cuda: + new_linear_custom2 = torch_load_from_buffer(bytes_8bit) + new_linear_custom = new_linear_custom.cuda() if not deserialize_before_cuda: 
new_linear_custom.load_state_dict(new_state_dict, strict=True) + if not load_before_cuda: + new_linear_custom2 = torch_load_from_buffer(bytes_8bit) + x_second = x.clone().cuda().requires_grad_(True) fx_second = new_linear_custom(x_second).float() (fx_second * grad_proj).mean().backward() + x_third = x.clone().cuda().requires_grad_(True) + fx_third = new_linear_custom2(x_third).float() + (fx_third * grad_proj).mean().backward() + # if 8-bit weights were loaded before .cuda, state is incorrect anyway and RuntimeError was raised if has_fp16_weights or not deserialize_before_cuda: assert torch.allclose(fx_first, fx_second, atol=1e-5) assert torch.allclose(x_first.grad, x_second.grad, atol=1e-5) + assert torch.allclose(fx_first, fx_third, atol=1e-5) + assert torch.allclose(x_first.grad, x_third.grad, atol=1e-5) \ No newline at end of file From 048a2d404c6a909e6f835ba18182fbfae130ba09 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Wed, 6 Mar 2024 09:30:51 +0200 Subject: [PATCH 072/112] Deduplicate helpers & fix lint issues from #1099 (#1107) --- tests/helpers.py | 27 ++++++++++++++++++++------- tests/test_linear4bit.py | 14 +------------- tests/test_linear8bitlt.py | 21 +++++++-------------- 3 files changed, 28 insertions(+), 34 deletions(-) diff --git a/tests/helpers.py b/tests/helpers.py index f82a8631f..02cb881a3 100644 --- a/tests/helpers.py +++ b/tests/helpers.py @@ -1,3 +1,4 @@ +from io import BytesIO from itertools import product import random from typing import Any, List @@ -7,6 +8,25 @@ test_dims_rng = random.Random(42) +TRUE_FALSE = (True, False) +BOOLEAN_TRIPLES = list(product(TRUE_FALSE, repeat=3)) # all combinations of (bool, bool, bool) +BOOLEAN_TUPLES = list(product(TRUE_FALSE, repeat=2)) # all combinations of (bool, bool) + + +def torch_save_to_buffer(obj): + buffer = BytesIO() + torch.save(obj, buffer) + buffer.seek(0) + return buffer + + +def torch_load_from_buffer(buffer): + buffer.seek(0) + obj = torch.load(buffer) + buffer.seek(0) + return obj + + def get_test_dims(min: int, max: int, *, n: int) -> List[int]: return [test_dims_rng.randint(min, max) for _ in range(n)] @@ -42,10 +62,3 @@ def id_formatter(label: str): def describe_dtype(dtype: torch.dtype) -> str: return DTYPE_NAMES.get(dtype) or str(dtype).rpartition(".")[2] - - -TRUE_FALSE = (True, False) -BOOLEAN_TRIPLES = list( - product(TRUE_FALSE, repeat=3) -) # all combinations of (bool, bool, bool) -BOOLEAN_TUPLES = list(product(TRUE_FALSE, repeat=2)) # all combinations of (bool, bool) diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py index d1f60423c..567e1a466 100644 --- a/tests/test_linear4bit.py +++ b/tests/test_linear4bit.py @@ -1,5 +1,4 @@ import copy -from io import BytesIO import os import pickle from tempfile import TemporaryDirectory @@ -8,7 +7,7 @@ import torch import bitsandbytes as bnb -from tests.helpers import TRUE_FALSE +from tests.helpers import TRUE_FALSE, torch_load_from_buffer, torch_save_to_buffer storage = { "uint8": torch.uint8, @@ -17,17 +16,6 @@ "float32": torch.float32, } -def torch_save_to_buffer(obj): - buffer = BytesIO() - torch.save(obj, buffer) - buffer.seek(0) - return buffer - -def torch_load_from_buffer(buffer): - buffer.seek(0) - obj = torch.load(buffer) - buffer.seek(0) - return obj @pytest.mark.parametrize("quant_storage", ["uint8", "float16", "bfloat16", "float32"]) @pytest.mark.parametrize("bias", TRUE_FALSE) diff --git a/tests/test_linear8bitlt.py b/tests/test_linear8bitlt.py index a996b0215..edc3409cd 100644 --- a/tests/test_linear8bitlt.py +++ 
b/tests/test_linear8bitlt.py @@ -1,5 +1,4 @@ from contextlib import nullcontext -from io import BytesIO import os from tempfile import TemporaryDirectory @@ -10,7 +9,12 @@ from bitsandbytes import functional as F from bitsandbytes.autograd import get_inverse_transform_indices, undo_layout from bitsandbytes.nn.modules import Linear8bitLt -from tests.helpers import TRUE_FALSE, id_formatter +from tests.helpers import ( + TRUE_FALSE, + id_formatter, + torch_load_from_buffer, + torch_save_to_buffer, +) # contributed by Alex Borzunov, see: # https://github.com/bigscience-workshop/petals/blob/main/tests/test_linear8bitlt.py @@ -66,17 +70,6 @@ def test_linear_no_igemmlt(): assert linear_custom.state.CB is not None assert linear_custom.state.CxB is None -def torch_save_to_buffer(obj): - buffer = BytesIO() - torch.save(obj, buffer) - buffer.seek(0) - return buffer - -def torch_load_from_buffer(buffer): - buffer.seek(0) - obj = torch.load(buffer) - buffer.seek(0) - return obj @pytest.mark.parametrize("has_fp16_weights", TRUE_FALSE, ids=id_formatter("has_fp16_weights")) @pytest.mark.parametrize("serialize_before_forward", TRUE_FALSE, ids=id_formatter("serialize_before_forward")) @@ -171,4 +164,4 @@ def test_linear_serialization(has_fp16_weights, serialize_before_forward, deseri assert torch.allclose(fx_first, fx_second, atol=1e-5) assert torch.allclose(x_first.grad, x_second.grad, atol=1e-5) assert torch.allclose(fx_first, fx_third, atol=1e-5) - assert torch.allclose(x_first.grad, x_third.grad, atol=1e-5) \ No newline at end of file + assert torch.allclose(x_first.grad, x_third.grad, atol=1e-5) From 87e029bc5dd0bd728758db35125aa51e9f4a0077 Mon Sep 17 00:00:00 2001 From: MOHAMMAD ALBARHAM Date: Thu, 7 Mar 2024 01:24:06 +0400 Subject: [PATCH 073/112] fix typo on the script installation file (#1109) --- docs/source/installation.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index f701f08d0..a63a6a93e 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -89,7 +89,7 @@ wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cud # For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc -bash cuda_install.sh 117 ~/local 1 +bash install_cuda.sh 117 ~/local 1 ``` 2. Set the environment variables `BNB_CUDA_VERSION` and `LD_LIBRARY_PATH` by manually overriding the CUDA version installed by PyTorch. 
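Step 2 above applies the override through environment variables. A minimal sketch, assuming CUDA 11.7 was installed to `~/local/cuda-11.7` as in the example above:

```bash
# Load the bitsandbytes binary built for CUDA 11.7 instead of the CUDA
# version PyTorch was compiled against.
export BNB_CUDA_VERSION=117
# Make the matching CUDA runtime visible when the library is loaded.
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HOME/local/cuda-11.7
```

Adding both lines to `.bashrc` makes the override persistent across shells.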
From ac5d6ee6c2fab42229fcb7dc031240f09d55d951 Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Thu, 7 Mar 2024 13:10:17 -0800 Subject: [PATCH 074/112] [docs] implement API docs (#1075) * optims * fix path * fix path * mdx * fix path * toctree * fix * optimizer, adagrad * add init * add * more apis * params * clarify * run pre-commit hooks --------- Co-authored-by: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> --- .git-blame-ignore-revs | 3 + bitsandbytes/nn/modules.py | 83 ++++++--- bitsandbytes/optim/adagrad.py | 81 ++++++++ bitsandbytes/optim/adam.py | 174 ++++++++++++++++++ bitsandbytes/optim/adamw.py | 174 ++++++++++++++++++ bitsandbytes/optim/lamb.py | 95 ++++++++++ bitsandbytes/optim/lars.py | 77 ++++++++ bitsandbytes/optim/lion.py | 140 ++++++++++++++ bitsandbytes/optim/optimizer.py | 140 +++++++++++--- bitsandbytes/optim/rmsprop.py | 87 +++++++++ bitsandbytes/optim/sgd.py | 77 ++++++++ docs/source/_toctree.yml | 30 ++- docs/source/reference/nn/embeddings.mdx | 15 ++ docs/source/reference/nn/linear4bit.mdx | 23 +++ docs/source/reference/nn/linear8bit.mdx | 13 ++ docs/source/reference/optim/adagrad.mdx | 18 ++ docs/source/reference/optim/adam.mdx | 38 ++++ docs/source/reference/optim/adamw.mdx | 34 ++++ docs/source/reference/optim/lamb.mdx | 21 +++ docs/source/reference/optim/lars.mdx | 18 ++ docs/source/reference/optim/lion.mdx | 33 ++++ .../source/reference/optim/optim_overview.mdx | 24 +++ docs/source/reference/optim/rmsprop.mdx | 15 ++ docs/source/reference/optim/sgd.mdx | 20 ++ docs/source/{ => reference}/quantization.mdx | 0 25 files changed, 1389 insertions(+), 44 deletions(-) create mode 100644 docs/source/reference/nn/embeddings.mdx create mode 100644 docs/source/reference/nn/linear4bit.mdx create mode 100644 docs/source/reference/nn/linear8bit.mdx create mode 100644 docs/source/reference/optim/adagrad.mdx create mode 100644 docs/source/reference/optim/adam.mdx create mode 100644 docs/source/reference/optim/adamw.mdx create mode 100644 docs/source/reference/optim/lamb.mdx create mode 100644 docs/source/reference/optim/lars.mdx create mode 100644 docs/source/reference/optim/lion.mdx create mode 100644 docs/source/reference/optim/optim_overview.mdx create mode 100644 docs/source/reference/optim/rmsprop.mdx create mode 100644 docs/source/reference/optim/sgd.mdx rename docs/source/{ => reference}/quantization.mdx (100%) diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index f7dd01bdf..fc44037d8 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -6,3 +6,6 @@ ea7c14f8ef64924f2d0ff80df3cdabf2c7299848 # Remove f-prefix from strings that don't use formatting 7727fa4c8c6c1ef2b109120aff4196a0a6bf3ed6 + +# format tests/linear_4bit.py +34735ba89de8235ea9da6ef409f814dcea9e2038 diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index 16c8aa9b8..f7b96205b 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -21,16 +21,7 @@ class StableEmbedding(torch.nn.Embedding): """ - Custom embedding layer designed for stable training in NLP tasks. The stable - embedding layer improves stability during optimization for models with word - embeddings, addressing issues related to the non-uniform distribution of input - tokens. - - This stable embedding layer is initialized with Xavier uniform initialization, - followed by layer normalization. It is designed to support aggressive quantization, - addressing extreme gradient variations in non-uniform input distributions. 
The - stability of training is enhanced by using 32-bit optimizer states specifically - for this layer. + Custom embedding layer designed to improve stability during training for NLP tasks by using 32-bit optimizer states. It is designed to reduce gradient variations that can result from quantization. This embedding layer is initialized with Xavier uniform initialization followed by layer normalization. Example: @@ -47,14 +38,11 @@ class StableEmbedding(torch.nn.Embedding): ``` Attributes: - norm (torch.nn.LayerNorm): Layer normalization applied after the embedding. + norm (`torch.nn.LayerNorm`): Layer normalization applied after the embedding. Methods: reset_parameters(): Reset embedding parameters using Xavier uniform initialization. forward(input: Tensor) -> Tensor: Forward pass through the stable embedding layer. - - Reference: - - [8-bit optimizer paper](https://arxiv.org/pdf/2110.02861.pdf) """ def __init__( self, @@ -71,14 +59,22 @@ def __init__( ) -> None: """ Args: - num_embeddings (`int`): The number of unique embeddings (vocabulary size). - embedding_dim (`int`): The dimensionality of the embedding. - padding_idx (`Optional[int]`): If specified, pads the output with zeros at the given index. - max_norm (`Optional[float]`): If given, renormalizes embeddings to have a maximum L2 norm. - norm_type (`float`, defaults to `2.0`): The p-norm to compute for the max_norm option. - scale_grad_by_freq (`bool`): Scale gradient by frequency during backpropagation. - sparse (`bool`): If True, computes sparse gradients; False, computes dense gradients. - _weight (`Optional[Tensor]`): Pre-trained embeddings. + num_embeddings (`int`): + The number of unique embeddings (vocabulary size). + embedding_dim (`int`): + The dimensionality of the embedding. + padding_idx (`Optional[int]`): + Pads the output with zeros at the given index. + max_norm (`Optional[float]`): + Renormalizes embeddings to have a maximum L2 norm. + norm_type (`float`, defaults to `2.0`): + The p-norm to compute for the `max_norm` option. + scale_grad_by_freq (`bool`, defaults to `False`): + Scale gradient by frequency during backpropagation. + sparse (`bool`, defaults to `False`): + Computes dense gradients. Set to `True` to compute sparse gradients instead. + _weight (`Optional[Tensor]`): + Pretrained embeddings. """ super().__init__( num_embeddings, @@ -131,6 +127,9 @@ def forward(self, input: Tensor) -> Tensor: class Embedding(torch.nn.Embedding): + """ + Embedding class to store and retrieve word embeddings from their indices. + """ def __init__( self, num_embeddings: int, @@ -143,6 +142,25 @@ def __init__( _weight: Optional[Tensor] = None, device: Optional[device] = None, ) -> None: + """ + Args: + num_embeddings (`int`): + The number of unique embeddings (vocabulary size). + embedding_dim (`int`): + The dimensionality of the embedding. + padding_idx (`Optional[int]`): + Pads the output with zeros at the given index. + max_norm (`Optional[float]`): + Renormalizes embeddings to have a maximum L2 norm. + norm_type (`float`, defaults to `2.0`): + The p-norm to compute for the `max_norm` option. + scale_grad_by_freq (`bool`, defaults to `False`): + Scale gradient by frequency during backpropagation. + sparse (`bool`, defaults to `False`): + Computes dense gradients. Set to `True` to compute sparse gradients instead. + _weight (`Optional[Tensor]`): + Pretrained embeddings. 
+ """ super().__init__( num_embeddings, embedding_dim, @@ -416,7 +434,19 @@ def forward(self, x: torch.Tensor): class LinearFP4(Linear4bit): + """ + Implements the FP4 data type. + """ def __init__(self, input_features, output_features, bias=True, compute_dtype=None, compress_statistics=True, quant_storage=torch.uint8, device=None): + """ + Args: + input_features (`str`): + Number of input features of the linear layer. + output_features (`str`): + Number of output features of the linear layer. + bias (`bool`, defaults to `True`): + Whether the linear class uses the bias term as well. + """ super().__init__(input_features, output_features, bias, compute_dtype, compress_statistics, 'fp4', quant_storage, device) @@ -432,6 +462,15 @@ class LinearNF4(Linear4bit): the `functional.py` file: https://github.com/TimDettmers/bitsandbytes/blob/main/bitsandbytes/functional.py#L236. ''' def __init__(self, input_features, output_features, bias=True, compute_dtype=None, compress_statistics=True, quant_storage=torch.uint8, device=None): + """ + Args: + input_features (`str`): + Number of input features of the linear layer. + output_features (`str`): + Number of output features of the linear layer. + bias (`bool`, defaults to `True`): + Whether the linear class uses the bias term as well. + """ super().__init__(input_features, output_features, bias, compute_dtype, compress_statistics, 'nf4', quant_storage, device) diff --git a/bitsandbytes/optim/adagrad.py b/bitsandbytes/optim/adagrad.py index 7d8df58ac..c2ea87ab0 100644 --- a/bitsandbytes/optim/adagrad.py +++ b/bitsandbytes/optim/adagrad.py @@ -20,6 +20,33 @@ def __init__( percentile_clipping=100, block_wise=True, ): + """ + Base Adagrad optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-2): + The learning rate. + lr_decay (`int`, defaults to 0): + The learning rate decay. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + initial_accumulator_value (`int`, defaults to 0): + The initial momemtum values. + eps (`float`, defaults to 1e-10): + The epsilon value prevents division by zero in the optimizer. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + """ if not 0.0 <= lr: raise ValueError(f"Invalid learning rate: {lr}") if not 0.0 <= weight_decay: @@ -62,6 +89,33 @@ def __init__( percentile_clipping=100, block_wise=True, ): + """ + 8-bit Adagrad optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-2): + The learning rate. + lr_decay (`int`, defaults to 0): + The learning rate decay. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + initial_accumulator_value (`int`, defaults to 0): + The initial momemtum values. + eps (`float`, defaults to 1e-10): + The epsilon value prevents division by zero in the optimizer. 
+ optim_bits (`int`, defaults to 8): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + """ if not 0.0 <= lr: raise ValueError(f"Invalid learning rate: {lr}") if not 0.0 <= weight_decay: @@ -105,6 +159,33 @@ def __init__( percentile_clipping=100, block_wise=True, ): + """ + 32-bit Adagrad optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-2): + The learning rate. + lr_decay (`int`, defaults to 0): + The learning rate decay. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + initial_accumulator_value (`int`, defaults to 0): + The initial momemtum values. + eps (`float`, defaults to 1e-10): + The epsilon value prevents division by zero in the optimizer. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + """ if not 0.0 <= lr: raise ValueError(f"Invalid learning rate: {lr}") if not 0.0 <= weight_decay: diff --git a/bitsandbytes/optim/adam.py b/bitsandbytes/optim/adam.py index 86981eb86..e534c8b8f 100644 --- a/bitsandbytes/optim/adam.py +++ b/bitsandbytes/optim/adam.py @@ -16,31 +16,205 @@ class Adam(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + """ + Base Adam optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. 
+ percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. + """ super().__init__( "adam", params, lr, betas, eps, weight_decay, optim_bits, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged) class Adam8bit(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + """ + 8-bit Adam optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. + """ super().__init__( "adam", params, lr, betas, eps, weight_decay, 8, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged) class Adam32bit(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + """ + 32-bit Adam optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. 
+ min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. + """ super().__init__( "adam", params, lr, betas, eps, weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged) class PagedAdam(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + """ + Paged Adam optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. + """ super().__init__( "adam", params, lr, betas, eps, weight_decay, optim_bits, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) class PagedAdam8bit(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + """ + 8-bit paged Adam optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. 
+ args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. + """ super().__init__( "adam", params, lr, betas, eps, weight_decay, 8, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) class PagedAdam32bit(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + """ + Paged 32-bit Adam optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. + """ super().__init__( "adam", params, lr, betas, eps, weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) class AnalysisAdam(torch.optim.Optimizer): diff --git a/bitsandbytes/optim/adamw.py b/bitsandbytes/optim/adamw.py index 17383eed5..1e2dc04de 100644 --- a/bitsandbytes/optim/adamw.py +++ b/bitsandbytes/optim/adamw.py @@ -8,30 +8,204 @@ class AdamW(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + """ + Base AdamW optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. 
+ weight_decay (`float`, defaults to 1e-2): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. + """ super().__init__( "adam", params, lr, betas, eps, weight_decay, optim_bits, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged ) class AdamW8bit(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + """ + 8-bit AdamW optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 1e-2): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. + """ super().__init__( "adam", params, lr, betas, eps, weight_decay, 8, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged ) class AdamW32bit(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + """ + 32-bit AdamW optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. 
+ eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 1e-2): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. + """ super().__init__( "adam", params, lr, betas, eps, weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged) class PagedAdamW(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True): + """ + Paged AdamW optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 1e-2): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. + """ super().__init__( "adam", params, lr, betas, eps, weight_decay, optim_bits, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) class PagedAdamW8bit(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True): + """ + Paged 8-bit AdamW optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. 
+ betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 1e-2): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. + """ super().__init__( "adam", params, lr, betas, eps, weight_decay, 8, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) class PagedAdamW32bit(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True): + """ + Paged 32-bit AdamW optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 1e-2): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. + """ super().__init__( "adam", params, lr, betas, eps, weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) diff --git a/bitsandbytes/optim/lamb.py b/bitsandbytes/optim/lamb.py index 1fbb6fadc..ec829ee85 100644 --- a/bitsandbytes/optim/lamb.py +++ b/bitsandbytes/optim/lamb.py @@ -23,6 +23,39 @@ def __init__( block_wise=False, max_unorm=1.0, ): + """ + Base LAMB optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. 
+ lr (`float`, defaults to 1e-3): + The learning rate. + bias_correction (`bool`, defaults to `True`): + Whether to apply bias correction to the first and second-order moments. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 1e-2): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + adam_w_mode (`bool`, defaults to `True`): + Whether to use the AdamW variant. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + max_unorm (`float`, defaults to 1.0): + The maximum gradient norm. + """ super().__init__( "lamb", params, @@ -56,6 +89,37 @@ def __init__( block_wise=False, max_unorm=1.0, ): + """ + 8-bit LAMB optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + bias_correction (`bool`, defaults to `True`): + Whether to apply bias correction to the first and second-order moments. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 1e-2): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + adam_w_mode (`bool`, defaults to `True`): + Whether to use the AdamW variant. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + max_unorm (`float`, defaults to 1.0): + The maximum gradient norm. + """ super().__init__( "lamb", params, @@ -89,6 +153,37 @@ def __init__( block_wise=False, max_unorm=1.0, ): + """ + 32-bit LAMB optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + bias_correction (`bool`, defaults to `True`): + Whether to apply bias correction to the first and second-order moments. 
+ betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 1e-2): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + adam_w_mode (`bool`, defaults to `True`): + Whether to use the AdamW variant. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + max_unorm (`float`, defaults to 1.0): + The maximum gradient norm. + """ super().__init__( "lamb", params, diff --git a/bitsandbytes/optim/lars.py b/bitsandbytes/optim/lars.py index 73554e3cc..7449b805b 100644 --- a/bitsandbytes/optim/lars.py +++ b/bitsandbytes/optim/lars.py @@ -23,6 +23,33 @@ def __init__( percentile_clipping=100, max_unorm=0.02, ): + """ + Base LARS optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`): + The learning rate. + momentum (`float`, defaults to 0): + The momentum value speeds up the optimizer by taking bigger steps. + dampening (`float`, defaults to 0): + The dampening value reduces the momentum of the optimizer. + weight_decay (`float`, defaults to 1e-2): + The weight decay value for the optimizer. + nesterov (`bool`, defaults to `False`): + Whether to use Nesterov momentum. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + max_unorm (`float`, defaults to 0.02): + The maximum gradient norm. + """ if momentum == 0: raise NotImplementedError( "LARS without momentum is not supported!" @@ -57,6 +84,31 @@ def __init__( percentile_clipping=100, max_unorm=0.02, ): + """ + 8-bit LARS optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`): + The learning rate. + momentum (`float`, defaults to 0): + The momentum value speeds up the optimizer by taking bigger steps. + dampening (`float`, defaults to 0): + The dampening value reduces the momentum of the optimizer. + weight_decay (`float`, defaults to 1e-2): + The weight decay value for the optimizer. + nesterov (`bool`, defaults to `False`): + Whether to use Nesterov momentum. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. 
+ percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + max_unorm (`float`, defaults to 0.02): + The maximum gradient norm. + """ if momentum == 0: raise NotImplementedError( "LARS without momentum is not supported!" @@ -91,6 +143,31 @@ def __init__( percentile_clipping=100, max_unorm=0.02, ): + """ + 32-bit LARS optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`): + The learning rate. + momentum (`float`, defaults to 0): + The momentum value speeds up the optimizer by taking bigger steps. + dampening (`float`, defaults to 0): + The dampening value reduces the momentum of the optimizer. + weight_decay (`float`, defaults to 1e-2): + The weight decay value for the optimizer. + nesterov (`bool`, defaults to `False`): + Whether to use Nesterov momentum. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + max_unorm (`float`, defaults to 0.02): + The maximum gradient norm. + """ if momentum == 0: raise NotImplementedError( "LARS without momentum is not supported!" diff --git a/bitsandbytes/optim/lion.py b/bitsandbytes/optim/lion.py index b6ba4a9f1..ce185f863 100644 --- a/bitsandbytes/optim/lion.py +++ b/bitsandbytes/optim/lion.py @@ -7,25 +7,165 @@ class Lion(Optimizer1State): def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), weight_decay=0, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + """ + Base Lion optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-4): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + weight_decay (`float`, defaults to 0): + The weight decay value for the optimizer. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. + """ super().__init__("lion", params, lr, betas, 0., weight_decay, optim_bits, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged) class Lion8bit(Optimizer1State): def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), weight_decay=0, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + """ + 8-bit Lion optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. 
+ lr (`float`, defaults to 1e-4): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + weight_decay (`float`, defaults to 0): + The weight decay value for the optimizer. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. + """ super().__init__("lion", params, lr, betas, 0., weight_decay, 8, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged) class Lion32bit(Optimizer1State): def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), weight_decay=0, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + """ + 32-bit Lion optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-4): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + weight_decay (`float`, defaults to 0): + The weight decay value for the optimizer. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. + """ super().__init__("lion", params, lr, betas, 0., weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged) class PagedLion(Optimizer1State): def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), weight_decay=0, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True): + """ + Paged Lion optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-4): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + weight_decay (`float`, defaults to 0): + The weight decay value for the optimizer. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. 
+ percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + """ super().__init__("lion", params, lr, betas, 0., weight_decay, optim_bits, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) class PagedLion8bit(Optimizer1State): def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), weight_decay=0, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True): + """ + Paged 8-bit Lion optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-4): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + weight_decay (`float`, defaults to 0): + The weight decay value for the optimizer. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + """ super().__init__("lion", params, lr, betas, 0., weight_decay, 8, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) class PagedLion32bit(Optimizer1State): def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), weight_decay=0, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True): + """ + Paged 32-bit Lion optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-4): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + weight_decay (`float`, defaults to 0): + The weight decay value for the optimizer. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. 
+ """ super().__init__("lion", params, lr, betas, 0., weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) diff --git a/bitsandbytes/optim/optimizer.py b/bitsandbytes/optim/optimizer.py index 8254d16b4..a97afb026 100644 --- a/bitsandbytes/optim/optimizer.py +++ b/bitsandbytes/optim/optimizer.py @@ -18,6 +18,9 @@ def __init__(self, initial_data): class GlobalOptimManager: + """ + A global optimizer manager for enabling custom optimizer configs. + """ _instance = None def __init__(self): @@ -53,22 +56,40 @@ def override_config( self, parameters, key=None, value=None, key_value_dict=None ): """ - Overrides initial optimizer config for specific parameters. + Override initial optimizer config with specific hyperparameters. The key-values of the optimizer config for the input parameters are overridden - This can be both, optimizer parameters like "betas", or "lr" or it can be - 8-bit specific parameters like "optim_bits", "percentile_clipping". - - Parameters - ---------- - parameters : torch.Tensor or list(torch.Tensors) - The input parameters. - key : str - The hyperparamter to override. - value : object - The value for the hyperparamters. - key_value_dict : dict - A dictionary with multiple key-values to override. + This can be both, optimizer parameters like `betas` or `lr`, or it can be + 8-bit specific parameters like `optim_bits` or `percentile_clipping`. + + Arguments: + parameters (`torch.Tensor` or `list(torch.Tensors)`): + The input parameters. + key (`str`): + The hyperparamter to override. + value: + The hyperparameter values. + key_value_dict (`dict`): + A dictionary with multiple key-values to override. + + Example: + + ```py + import torch + import bitsandbytes as bnb + + mng = bnb.optim.GlobalOptimManager.get_instance() + + model = MyModel() + mng.register_parameters(model.parameters()) # 1. register parameters while still on CPU + + model = model.cuda() + # use 8-bit optimizer states for all parameters + adam = bnb.optim.Adam(model.parameters(), lr=0.001, optim_bits=8) + + # 2. override: the parameter model.fc1.weight now uses 32-bit Adam + mng.override_config(model.fc1.weight, 'optim_bits', 32) + ``` """ self.uses_config_override = True if isinstance(parameters, torch.nn.Parameter): @@ -92,6 +113,17 @@ def register_module_override(self, module, param_name, config): class Optimizer8bit(torch.optim.Optimizer): def __init__(self, params, defaults, optim_bits=32, is_paged=False): + """ + Base 8-bit optimizer class. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. + """ super().__init__(params, defaults) self.initialized = False self.name2qmap = {} @@ -125,11 +157,11 @@ def __setstate__(self, state): super().__setstate__(state) def load_state_dict(self, state_dict): - r"""Loads the optimizer state. + """Load an optimizer state. - Args: - state_dict (dict): optimizer state. Should be an object returned - from a call to :meth:`state_dict`. + Arguments: + state_dict (`dict`): + An optimizer state (should be returned from a call to `state_dict`) to load. """ # deepcopy, to be consistent with module API state_dict = deepcopy(state_dict) @@ -237,11 +269,11 @@ def check_overrides(self): @torch.no_grad() def step(self, closure=None): - """Performs a single optimization step. + """Perform a single optimization step. 
Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. + closure (`Callable`, *optional*, defaults to `None`): + A closure that reevaluates the model and returns the loss. """ loss = None if closure is not None: @@ -339,6 +371,39 @@ def __init__( skip_zeros=False, is_paged=False ): + """ + Base 2-state update optimizer class. + + Arguments: + optimizer_name (`str`): + The name of the optimizer. + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + betas (`tuple`, defaults to (0.9, 0.999)): + The beta values for the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value for the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + max_unorm (`float`, defaults to 0.0): + The maximum value to normalize each block with. + skip_zeros (`bool`, defaults to `False`): + Whether to skip zero values for sparse gradients and models to ensure correct updates. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. + """ if not 0.0 <= lr: raise ValueError(f"Invalid learning rate: {lr}") if not 0.0 <= eps: @@ -552,6 +617,39 @@ def __init__( skip_zeros=False, is_paged=False ): + """ + Base 1-state update optimizer class. + + Arguments: + optimizer_name (`str`): + The name of the optimizer. + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + betas (`tuple`, defaults to (0.9, 0.0)): + The beta values for the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value for the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + max_unorm (`float`, defaults to 0.0): + The maximum value to normalize each block with. + skip_zeros (`bool`, defaults to `False`): + Whether to skip zero values for sparse gradients and models to ensure correct updates. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. 
+ """ if not 0.0 <= lr: raise ValueError(f"Invalid learning rate: {lr}") if not 0.0 <= eps: diff --git a/bitsandbytes/optim/rmsprop.py b/bitsandbytes/optim/rmsprop.py index 2853ca723..ac371a66f 100644 --- a/bitsandbytes/optim/rmsprop.py +++ b/bitsandbytes/optim/rmsprop.py @@ -21,6 +21,35 @@ def __init__( percentile_clipping=100, block_wise=True, ): + """ + Base RMSprop optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-2): + The learning rate. + alpha (`float`, defaults to 0.99): + The alpha value is the decay rate of the squared gradients of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + momentum (`float`, defaults to 0): + The momentum value speeds up the optimizer by taking bigger steps. + centered (`bool`, defaults to `False`): + Whether the gradients are normalized by the variance. If `True`, it can help training at the expense of additional compute. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + """ if alpha == 0: raise NotImplementedError( "RMSprop with alpha==0.0 is not supported!" @@ -57,6 +86,35 @@ def __init__( percentile_clipping=100, block_wise=True, ): + """ + 8-bit RMSprop optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-2): + The learning rate. + alpha (`float`, defaults to 0.99): + The alpha value is the decay rate of the squared gradients of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + momentum (`float`, defaults to 0): + The momentum value speeds up the optimizer by taking bigger steps. + centered (`bool`, defaults to `False`): + Whether the gradients are normalized by the variance. If `True`, it can help training at the expense of additional compute. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + """ if alpha == 0: raise NotImplementedError( "RMSprop with alpha==0.0 is not supported!" @@ -93,6 +151,35 @@ def __init__( percentile_clipping=100, block_wise=True, ): + """ + 32-bit RMSprop optimizer. 
+ + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-2): + The learning rate. + alpha (`float`, defaults to 0.99): + The alpha value is the decay rate of the squared gradients of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + momentum (`float`, defaults to 0): + The momentum value speeds up the optimizer by taking bigger steps. + centered (`bool`, defaults to `False`): + Whether the gradients are normalized by the variance. If `True`, it can help training at the expense of additional compute. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + """ if alpha == 0: raise NotImplementedError( diff --git a/bitsandbytes/optim/sgd.py b/bitsandbytes/optim/sgd.py index 3c0fc2b9f..0f0b12e4b 100644 --- a/bitsandbytes/optim/sgd.py +++ b/bitsandbytes/optim/sgd.py @@ -20,6 +20,33 @@ def __init__( percentile_clipping=100, block_wise=True, ): + """ + Base SGD optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`): + The learning rate. + momentum (`float`, defaults to 0): + The momentum value speeds up the optimizer by taking bigger steps. + dampening (`float`, defaults to 0): + The dampening value reduces the momentum of the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + nesterov (`bool`, defaults to `False`): + Whether to use Nesterov momentum. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + """ if momentum == 0: raise NotImplementedError("SGD without momentum is not supported!") super().__init__( @@ -51,6 +78,31 @@ def __init__( percentile_clipping=100, block_wise=True, ): + """ + 8-bit SGD optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`): + The learning rate. + momentum (`float`, defaults to 0): + The momentum value speeds up the optimizer by taking bigger steps. + dampening (`float`, defaults to 0): + The dampening value reduces the momentum of the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + nesterov (`bool`, defaults to `False`): + Whether to use Nesterov momentum. 
+ args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + """ if momentum == 0: raise NotImplementedError("SGD without momentum is not supported!") super().__init__( @@ -82,6 +134,31 @@ def __init__( percentile_clipping=100, block_wise=True, ): + """ + 32-bit SGD optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`): + The learning rate. + momentum (`float`, defaults to 0): + The momentum value speeds up the optimizer by taking bigger steps. + dampening (`float`, defaults to 0): + The dampening value reduces the momentum of the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + nesterov (`bool`, defaults to `False`): + Whether to use Nesterov momentum. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + """ if momentum == 0: raise NotImplementedError("SGD without momentum is not supported!") super().__init__( diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 7584207d0..87c4242de 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -26,5 +26,33 @@ title: Papers, resources & how to cite - title: API reference sections: - - local: quantization + - local: reference/quantization title: Quantization + - title: Optimizers + sections: + - local: reference/optim/optim_overview + title: Overview + - local: reference/optim/adagrad + title: AdaGrad + - local: reference/optim/adam + title: Adam + - local: reference/optim/adamw + title: AdamW + - local: reference/optim/lamb + title: LAMB + - local: reference/optim/lars + title: LARS + - local: reference/optim/lion + title: Lion + - local: reference/optim/rmsprop + title: RMSprop + - local: reference/optim/sgd + title: SGD + - title: k-bit quantizers + sections: + - local: reference/nn/linear8bit + title: 8-bit quantizer + - local: reference/nn/linear4bit + title: 4-bit quantizer + - local: reference/nn/embeddings + title: Embedding diff --git a/docs/source/reference/nn/embeddings.mdx b/docs/source/reference/nn/embeddings.mdx new file mode 100644 index 000000000..e725ecb17 --- /dev/null +++ b/docs/source/reference/nn/embeddings.mdx @@ -0,0 +1,15 @@ +# Embedding + +The embedding class is used to store and retrieve word embeddings from their indices. There are two types of embeddings in bitsandbytes, the standard PyTorch [`Embedding`] class and the [`StableEmbedding`] class. 
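For illustration, a minimal sketch of how both layers are constructed; they follow the `torch.nn.Embedding` interface, and the vocabulary size and dimensions below are placeholders:

```py
import torch
import bitsandbytes as bnb

# Both layers take (num_embeddings, embedding_dim), like torch.nn.Embedding.
emb = bnb.nn.Embedding(num_embeddings=1024, embedding_dim=64)
stable_emb = bnb.nn.StableEmbedding(num_embeddings=1024, embedding_dim=64)

token_ids = torch.randint(0, 1024, (2, 8))  # dummy batch of token indices
print(emb(token_ids).shape)                 # torch.Size([2, 8, 64])
print(stable_emb(token_ids).shape)          # torch.Size([2, 8, 64])
```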
+
+The [`StableEmbedding`] class was introduced in the [8-bit Optimizers via Block-wise Quantization](https://hf.co/papers/2110.02861) paper to reduce gradient variance as a result of the non-uniform distribution of input tokens. This class is designed to support quantization.
+
+## Embedding
+
+[[autodoc]] bitsandbytes.nn.Embedding
+    - __init__
+
+## StableEmbedding
+
+[[autodoc]] bitsandbytes.nn.StableEmbedding
+    - __init__
diff --git a/docs/source/reference/nn/linear4bit.mdx b/docs/source/reference/nn/linear4bit.mdx
new file mode 100644
index 000000000..3cbf6509d
--- /dev/null
+++ b/docs/source/reference/nn/linear4bit.mdx
@@ -0,0 +1,23 @@
+# 4-bit quantization
+
+[QLoRA](https://hf.co/papers/2305.14314) is a finetuning method that quantizes a model to 4-bit, adds a set of low-rank adaptation (LoRA) weights to the model, and tunes them through the quantized weights. This method also introduces a new data type, 4-bit NormalFloat (`LinearNF4`), in addition to the standard Float4 data type (`LinearFP4`). `LinearNF4` is a quantization data type for normally distributed data and can improve performance.
+
+## Linear4bit
+
+[[autodoc]] bitsandbytes.nn.Linear4bit
+    - __init__
+
+## LinearFP4
+
+[[autodoc]] bitsandbytes.nn.LinearFP4
+    - __init__
+
+## LinearNF4
+
+[[autodoc]] bitsandbytes.nn.LinearNF4
+    - __init__
+
+## Params4bit
+
+[[autodoc]] bitsandbytes.nn.Params4bit
+    - __init__
diff --git a/docs/source/reference/nn/linear8bit.mdx b/docs/source/reference/nn/linear8bit.mdx
new file mode 100644
index 000000000..73254fe67
--- /dev/null
+++ b/docs/source/reference/nn/linear8bit.mdx
@@ -0,0 +1,13 @@
+# 8-bit quantization
+
+[LLM.int8()](https://hf.co/papers/2208.07339) is a quantization method that doesn't degrade performance, which makes large model inference more accessible. The key is to extract the outliers from the inputs and weights and multiply them in 16-bit. All other values are multiplied in 8-bit and quantized to Int8 before being dequantized back to 16-bit. The outputs from the 16-bit and 8-bit multiplication are combined to produce the final output.
+
+## Linear8bitLt
+
+[[autodoc]] bitsandbytes.nn.Linear8bitLt
+    - __init__
+
+## Int8Params
+
+[[autodoc]] bitsandbytes.nn.Int8Params
+    - __init__
diff --git a/docs/source/reference/optim/adagrad.mdx b/docs/source/reference/optim/adagrad.mdx
new file mode 100644
index 000000000..8dddba04c
--- /dev/null
+++ b/docs/source/reference/optim/adagrad.mdx
@@ -0,0 +1,18 @@
+# AdaGrad
+
+[AdaGrad (Adaptive Gradient)](https://jmlr.org/papers/v12/duchi11a.html) is an adaptive learning rate optimizer. AdaGrad stores a sum of the squared past gradients for each parameter and uses it to scale their learning rate. This allows the learning rate to be automatically lower or higher depending on the magnitude of the gradient, eliminating the need to manually tune the learning rate.
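As a rough usage sketch (toy model and placeholder hyperparameters; the 8-bit optimizer state assumes a CUDA device and a CUDA-enabled bitsandbytes build):

```py
import torch
import bitsandbytes as bnb

model = torch.nn.Linear(4096, 4096).cuda()   # toy model; assumes a CUDA device
batch = torch.randn(8, 4096).cuda()

# Drop-in replacement for torch.optim.Adagrad; parameters with fewer than
# min_8bit_size elements (default 4096) keep 32-bit optimizer state.
optimizer = bnb.optim.Adagrad8bit(model.parameters(), lr=1e-2)

loss = model(batch).sum()
loss.backward()
optimizer.step()
optimizer.zero_grad()
```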
+
+## Adagrad[[api-class]]
+
+[[autodoc]] bitsandbytes.optim.Adagrad
+    - __init__
+
+## Adagrad8bit
+
+[[autodoc]] bitsandbytes.optim.Adagrad8bit
+    - __init__
+
+## Adagrad32bit
+
+[[autodoc]] bitsandbytes.optim.Adagrad32bit
+    - __init__
diff --git a/docs/source/reference/optim/adam.mdx b/docs/source/reference/optim/adam.mdx
new file mode 100644
index 000000000..f367bc415
--- /dev/null
+++ b/docs/source/reference/optim/adam.mdx
@@ -0,0 +1,38 @@
+# Adam
+
+[Adam (Adaptive moment estimation)](https://hf.co/papers/1412.6980) is an adaptive learning rate optimizer, combining ideas from [`SGD`] with momentum and [`RMSprop`] to automatically scale the learning rate:
+
+- a weighted average of the past gradients to provide direction (first-moment)
+- a weighted average of the *squared* past gradients to adapt the learning rate to each parameter (second-moment)
+
+bitsandbytes also supports paged optimizers, which take advantage of CUDA's unified memory to transfer memory from the GPU to the CPU when GPU memory is exhausted.
+
+## Adam[[api-class]]
+
+[[autodoc]] bitsandbytes.optim.Adam
+    - __init__
+
+## Adam8bit
+
+[[autodoc]] bitsandbytes.optim.Adam8bit
+    - __init__
+
+## Adam32bit
+
+[[autodoc]] bitsandbytes.optim.Adam32bit
+    - __init__
+
+## PagedAdam
+
+[[autodoc]] bitsandbytes.optim.PagedAdam
+    - __init__
+
+## PagedAdam8bit
+
+[[autodoc]] bitsandbytes.optim.PagedAdam8bit
+    - __init__
+
+## PagedAdam32bit
+
+[[autodoc]] bitsandbytes.optim.PagedAdam32bit
+    - __init__
diff --git a/docs/source/reference/optim/adamw.mdx b/docs/source/reference/optim/adamw.mdx
new file mode 100644
index 000000000..e3dd410de
--- /dev/null
+++ b/docs/source/reference/optim/adamw.mdx
@@ -0,0 +1,34 @@
+# AdamW
+
+[AdamW](https://hf.co/papers/1711.05101) is a variant of the [`Adam`] optimizer that separates weight decay from the gradient update based on the observation that the weight decay formulation is different when applied to [`SGD`] and [`Adam`].
+
+bitsandbytes also supports paged optimizers, which take advantage of CUDA's unified memory to transfer memory from the GPU to the CPU when GPU memory is exhausted.
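A minimal training-step sketch with the 8-bit and paged variants (toy model, placeholder hyperparameters; assumes a CUDA device):

```py
import torch
import bitsandbytes as bnb

model = torch.nn.Linear(4096, 4096).cuda()  # toy model; assumes a CUDA device

# 8-bit AdamW as a drop-in replacement for torch.optim.AdamW.
optimizer = bnb.optim.AdamW8bit(model.parameters(), lr=1e-3, betas=(0.9, 0.999), weight_decay=1e-2)

# Paged variant: optimizer state lives in unified memory and can be paged
# out to CPU RAM when GPU memory runs low.
# optimizer = bnb.optim.PagedAdamW8bit(model.parameters(), lr=1e-3)

loss = model(torch.randn(8, 4096).cuda()).sum()
loss.backward()
optimizer.step()
optimizer.zero_grad()
```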
+
+## AdamW[[api-class]]
+
+[[autodoc]] bitsandbytes.optim.AdamW
+    - __init__
+
+## AdamW8bit
+
+[[autodoc]] bitsandbytes.optim.AdamW8bit
+    - __init__
+
+## AdamW32bit
+
+[[autodoc]] bitsandbytes.optim.AdamW32bit
+    - __init__
+
+## PagedAdamW
+
+[[autodoc]] bitsandbytes.optim.PagedAdamW
+    - __init__
+## PagedAdamW8bit
+
+[[autodoc]] bitsandbytes.optim.PagedAdamW8bit
+    - __init__
+
+## PagedAdamW32bit
+
+[[autodoc]] bitsandbytes.optim.PagedAdamW32bit
+    - __init__
diff --git a/docs/source/reference/optim/lamb.mdx b/docs/source/reference/optim/lamb.mdx
new file mode 100644
index 000000000..d581380ed
--- /dev/null
+++ b/docs/source/reference/optim/lamb.mdx
@@ -0,0 +1,21 @@
+# LAMB
+
+[LAMB (Layerwise adaptive large batch optimization)](https://hf.co/papers/1904.00962) is an adaptive optimizer designed for training with large batch sizes to accelerate training, combining ideas from [`LARS`] and [`Adam`] to automatically scale the learning rate for each layer:
+
+- calculates a *trust ratio* between the weight and gradient norm in a layer and clips the ratio to prevent overly large or small updates
+- updates weights with the first and second moments
+
+## LAMB[[api-class]]
+
+[[autodoc]] bitsandbytes.optim.LAMB
+    - __init__
+
+## LAMB8bit
+
+[[autodoc]] bitsandbytes.optim.LAMB8bit
+    - __init__
+
+## LAMB32bit
+
+[[autodoc]] bitsandbytes.optim.LAMB32bit
+    - __init__
diff --git a/docs/source/reference/optim/lars.mdx b/docs/source/reference/optim/lars.mdx
new file mode 100644
index 000000000..93b5c55c3
--- /dev/null
+++ b/docs/source/reference/optim/lars.mdx
@@ -0,0 +1,18 @@
+# LARS
+
+[LARS (Layer-wise Adaptive Rate Scaling)](https://hf.co/papers/1708.03888) is an optimizer designed for training with large batch sizes to accelerate training. LARS uses a separate learning rate for each *layer* instead of each parameter. The learning rate is calculated from a *trust ratio* between the weight and gradient norm in a layer. This helps calibrate a stable update size.
+
+## LARS[[api-class]]
+
+[[autodoc]] bitsandbytes.optim.LARS
+    - __init__
+
+## LARS8bit
+
+[[autodoc]] bitsandbytes.optim.LARS8bit
+    - __init__
+
+## LARS32bit
+
+[[autodoc]] bitsandbytes.optim.LARS32bit
+    - __init__
diff --git a/docs/source/reference/optim/lion.mdx b/docs/source/reference/optim/lion.mdx
new file mode 100644
index 000000000..8183c27e7
--- /dev/null
+++ b/docs/source/reference/optim/lion.mdx
@@ -0,0 +1,33 @@
+# Lion
+
+[Lion (Evolved Sign Momentum)](https://hf.co/papers/2302.06675) is a unique optimizer that uses the sign of the gradient to determine the update direction of the momentum. This makes Lion more memory-efficient and faster than [`AdamW`], which tracks and stores the first and second-order moments.
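A short sketch of the 8-bit variant (placeholder model and learning rate; Lion is typically run with a smaller learning rate than AdamW, and a CUDA device is assumed):

```py
import torch
import bitsandbytes as bnb

model = torch.nn.Linear(4096, 4096).cuda()  # toy model; assumes a CUDA device

# Lion keeps only a single momentum state per parameter; the 8-bit variant
# additionally quantizes that state block-wise.
optimizer = bnb.optim.Lion8bit(model.parameters(), lr=1e-4, weight_decay=1e-2)

loss = model(torch.randn(8, 4096).cuda()).sum()
loss.backward()
optimizer.step()
optimizer.zero_grad()
```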
+ +## Lion[[api-class]] + +[[autodoc]] bitsandbytes.optim.Lion + - __init__ + +## Lion8bit + +[[autodoc]] bitsandbytes.optim.Lion8bit + - __init__ + +## Lion32bit + +[[autodoc]] bitsandbytes.optim.Lion32bit + - __init__ + +## PagedLion + +[[autodoc]] bitsandbytes.optim.PagedLion + - __init__ + +## PagedLion8bit + +[[autodoc]] bitsandbytes.optim.PagedLion8bit + - __init__ + +## PagedLion32bit + +[[autodoc]] bitsandbytes.optim.PagedLion32bit + - __init__ diff --git a/docs/source/reference/optim/optim_overview.mdx b/docs/source/reference/optim/optim_overview.mdx new file mode 100644 index 000000000..48e12b544 --- /dev/null +++ b/docs/source/reference/optim/optim_overview.mdx @@ -0,0 +1,24 @@ +# Overview + +[8-bit optimizers](https://hf.co/papers/2110.02861) reduce the memory footprint of 32-bit optimizers without any performance degradation which means you can train large models with many parameters faster. At the core of 8-bit optimizers is block-wise quantization which enables quantization accuracy, computational efficiency, and stability. + +bitsandbytes provides 8-bit optimizers through the base [`Optimizer8bit`] class, and additionally provides [`Optimizer2State`] and [`Optimizer1State`] for 2-state (for example, [`Adam`]) and 1-state (for example, [`Adagrad`]) optimizers respectively. To provide custom optimizer hyperparameters, use the [`GlobalOptimManager`] class to configure the optimizer. + +## Optimizer8bit + +[[autodoc]] bitsandbytes.optim.optimizer.Optimizer8bit + - __init__ + +## Optimizer2State + +[[autodoc]] bitsandbytes.optim.optimizer.Optimizer2State + - __init__ + +## Optimizer1State + +[[autodoc]] bitsandbytes.optim.optimizer.Optimizer1State + - __init__ + +## Utilities + +[[autodoc]] bitsandbytes.optim.optimizer.GlobalOptimManager diff --git a/docs/source/reference/optim/rmsprop.mdx b/docs/source/reference/optim/rmsprop.mdx new file mode 100644 index 000000000..33d839f6b --- /dev/null +++ b/docs/source/reference/optim/rmsprop.mdx @@ -0,0 +1,15 @@ +# RMSprop + +RMSprop is an adaptive learning rate optimizer that is very similar to [`Adagrad`]. RMSprop stores a *weighted average* of the squared past gradients for each parameter and uses it to scale their learning rate. This allows the learning rate to be automatically lower or higher depending on the magnitude of the gradient, and it prevents the learning rate from diminishing. + +## RMSprop[[api-class]] + +[[autodoc]] bitsandbytes.optim.RMSprop + +## RMSprop8bit + +[[autodoc]] bitsandbytes.optim.RMSprop8bit + +## RMSprop32bit + +[[autodoc]] bitsandbytes.optim.RMSprop32bit diff --git a/docs/source/reference/optim/sgd.mdx b/docs/source/reference/optim/sgd.mdx new file mode 100644 index 000000000..a0d09d1e8 --- /dev/null +++ b/docs/source/reference/optim/sgd.mdx @@ -0,0 +1,20 @@ +# SGD + +Stochastic gradient descent (SGD) is a basic gradient descent optimizer to minimize loss given a set of model parameters and updates the parameters in the opposite direction of the gradient. The update is performed on a randomly sampled mini-batch of data from the dataset. + +bitsandbytes also supports momentum and Nesterov momentum to accelerate SGD by adding a weighted average of past gradients to the current gradient. 
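A minimal sketch of the 8-bit variant (toy model; assumes a CUDA device). Note that the bitsandbytes SGD implementations require a non-zero momentum:

```py
import torch
import bitsandbytes as bnb

model = torch.nn.Linear(4096, 4096).cuda()  # toy model; assumes a CUDA device

# Momentum must be non-zero (SGD without momentum is not supported by bitsandbytes).
optimizer = bnb.optim.SGD8bit(model.parameters(), lr=1e-3, momentum=0.9)

loss = model(torch.randn(8, 4096).cuda()).sum()
loss.backward()
optimizer.step()
optimizer.zero_grad()
```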
+ +## SGD[[api-class]] + +[[autodoc]] bitsandbytes.optim.SGD + - __init__ + +## SGD8bit + +[[autodoc]] bitsandbytes.optim.SGD8bit + - __init__ + +## SGD32bit + +[[autodoc]] bitsandbytes.optim.SGD32bit + - __init__ diff --git a/docs/source/quantization.mdx b/docs/source/reference/quantization.mdx similarity index 100% rename from docs/source/quantization.mdx rename to docs/source/reference/quantization.mdx From 1cfc27779f54fdc83dcaca7469a49c267ca217cf Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Thu, 7 Mar 2024 19:10:22 -0500 Subject: [PATCH 075/112] Build: Expand CUDA Toolkit Matrix (#1111) * (ci) build with wider CUDA version matrix * (ci) build with wider CUDA version matrix * (ci) skip sm_89 target on CUDA 11.7 * (ci) skip sm_90 target on CUDA 11.8 * modify workflow to publish to test.pypi * (build) Test for manylinux_2_24 build on GH actions * (build) got that backwards. * try fixing manual triggering condition for testpypi * try if Ubuntu 18.04 is an easy fix to allow for `manylinux_2_24` compatibility * hardcode publish step to run to test publishing * set ubuntu to newest supported version * try statically linking libstdc++ to achieve manylinux_2_18 * last commit only brought us to manylinux_2_34, reverse * add misssing permission for publishing to pypi * snake case deprecated in favor of kebab * downgrade cuda ubuntu aiming for manylinux_2_24 * add step to upgrade cmake due to old Ubuntu for CUDA build * adjust path to prefer pip installed cmake * (cmake) set CMAKE_BUILD_TYPE=Release if unspecified * default to CMAKE_BUILD_TYPE Release for optimized releases and better many_linux compatibility * (build) back to ubuntu22.04 docker images * verify Cmake in separte step * add clarifying comment about Python version compatibility * (build) we don't need cmake for wheel step * fixup testpypi publish to run in PR for testing * add pypi publishing when tagged on main * add functionality to rewrite platform tags * (ci) adjust platform tags for wheels * fix for windows, get order right. * fix for windows, get order right. * (build) slim down those fatbins on windows cuda * sloppy * remove broken PyPi upload for now --------- Co-authored-by: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> --- .github/workflows/python-package.yml | 43 +++++++++++++--------------- CMakeLists.txt | 11 ++++++- scripts/set_platform_tag.py | 34 ++++++++++++++++++++++ 3 files changed, 64 insertions(+), 24 deletions(-) create mode 100644 scripts/set_platform_tag.py diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index a25f53f46..c868b18d2 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -17,6 +17,7 @@ on: - 'pytest.ini' release: types: [ published ] + workflow_dispatch: {} # Allow manual trigger concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -82,10 +83,12 @@ jobs: matrix: os: [ubuntu-latest, windows-latest] arch: [x86_64, aarch64] - cuda_version: ['12.1.0'] + cuda_version: ["11.7.1", "11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2"] exclude: - os: windows-latest # This probably requires arm64 Windows agents arch: aarch64 + - os: ubuntu-latest # Temporary. Takes too long, not ready yet. + arch: aarch64 runs-on: ${{ matrix.os }} # One day, we could run them on native agents. 
Azure supports this now but it's planned only for Q3 2023 for hosted agents steps: # Check out code @@ -121,6 +124,9 @@ jobs: set -ex build_os=${{ matrix.os }} build_arch=${{ matrix.arch }} + build_capability="50;52;60;61;70;75;80;86;89;90" + [[ "${{ matrix.cuda_version }}" == 11.7.* ]] && build_capability=${build_capability%??????} + [[ "${{ matrix.cuda_version }}" == 11.8.* ]] && build_capability=${build_capability%???} [[ "${{ matrix.os }}" = windows-* ]] && python3 -m pip install ninja for NO_CUBLASLT in ON OFF; do if [ ${build_os:0:6} == ubuntu ]; then @@ -129,10 +135,10 @@ jobs: docker run --platform linux/$build_arch -i -w /src -v $PWD:/src $image sh -c \ "apt-get update \ && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \ - && cmake -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY=\"50;52;60;61;70;75;80;86;89;90\" -DNO_CUBLASLT=${NO_CUBLASLT} . \ + && cmake -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY=\"${build_capability}\" -DNO_CUBLASLT=${NO_CUBLASLT} . \ && cmake --build ." else - cmake -G Ninja -DCOMPUTE_BACKEND=cuda -DNO_CUBLASLT=${NO_CUBLASLT} -DCMAKE_BUILD_TYPE=Release -S . + cmake -G Ninja -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY="${build_capability}" -DNO_CUBLASLT=${NO_CUBLASLT} -DCMAKE_BUILD_TYPE=Release -S . cmake --build . --config Release fi done @@ -151,7 +157,10 @@ jobs: strategy: matrix: os: [ubuntu-latest, macos-latest, windows-latest] - python-version: ["3.9", "3.10", "3.11", "3.12"] + # The specific Python version is irrelevant in this context as we are only packaging non-C extension + # code. This ensures compatibility across Python versions, including Python 3.8, as compatibility is + # dictated by the packaged code itself, not the Python version used for packaging. + python-version: ["3.10"] arch: [x86_64, aarch64] exclude: - os: windows-latest # This probably requires arm64 Windows agents @@ -192,27 +201,15 @@ jobs: - name: Build wheel shell: bash run: python -m build . + - name: Determine and Set Platform Tag, then Tag Wheel + shell: bash + run: | + PLATFORM_TAG=$(python scripts/set_platform_tag.py ${{ matrix.arch }}) + echo "PLATFORM_TAG=$PLATFORM_TAG" + wheel tags --remove --abi-tag=none --python-tag=py3 --platform-tag=$PLATFORM_TAG dist/bitsandbytes-*.whl - name: Upload build artifact uses: actions/upload-artifact@v4 with: - name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.python-version }} + name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }} path: dist/bitsandbytes-*.whl retention-days: 7 - publish: - needs: build-wheels - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - name: Download build artifact - uses: actions/download-artifact@v4 - with: - path: dist/ - merge-multiple: true - pattern: "bdist_wheel_*" - - run: | - ls -lR dist/ - - name: Publish to PyPi - if: startsWith(github.ref, 'refs/tags') - uses: pypa/gh-action-pypi-publish@release/v1 - with: - password: ${{ secrets.pypi }} diff --git a/CMakeLists.txt b/CMakeLists.txt index 62ff4e535..be0d3555f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,6 +15,14 @@ cmake_minimum_required(VERSION 3.22.1) project(bitsandbytes LANGUAGES CXX) +# If run without specifying a build type, default to using the Release configuration: +# optimizing the generated binaries for performance and also adds the `-DNDEBUG` flag, +# which turns off a bunch of asserts which seem to link to new symbols in libstdc++, +# worsening our many_linux compliance.. 
+if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) +endif() + # Define included source files set(CPP_FILES csrc/common.cpp csrc/cpu_ops.cpp csrc/pythonInterface.cpp) set(CUDA_FILES csrc/ops.cu csrc/kernels.cu) @@ -108,6 +116,7 @@ if(BUILD_CUDA) endif() string(APPEND CMAKE_CUDA_FLAGS " --use_fast_math") + if(PTXAS_VERBOSE) # Verbose? Outputs register usage information, and other things... string(APPEND CMAKE_CUDA_FLAGS " -Xptxas=-v") @@ -220,4 +229,4 @@ if(MSVC) set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG "${PROJECT_SOURCE_DIR}/bitsandbytes") endif() -set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY bitsandbytes) +set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/bitsandbytes") diff --git a/scripts/set_platform_tag.py b/scripts/set_platform_tag.py new file mode 100644 index 000000000..ca561c880 --- /dev/null +++ b/scripts/set_platform_tag.py @@ -0,0 +1,34 @@ +import argparse +import platform +import sys + + +def get_platform_tag(architecture): + system = platform.system() + + if system == "Linux": + tag = ( + "manylinux_2_24_x86_64" if architecture == "x86_64" else "manylinux_2_24_aarch64" + ) + elif system == "Darwin": + tag = "macosx_13_1_x86_64" if architecture == "x86_64" else "macosx_13_1_arm64" + elif system == "Windows": + tag = "win_amd64" if architecture == "x86_64" else "win_arm64" + else: + sys.exit(f"Unsupported system: {system}") + + return tag + + +def main(): + parser = argparse.ArgumentParser(description="Determine platform tag.") + parser.add_argument("arch", type=str, help="Architecture (e.g., x86_64, aarch64)") + args = parser.parse_args() + + tag = get_platform_tag(args.arch) + + print(tag) # This will be captured by the GitHub Actions workflow + + +if __name__ == "__main__": + main() From 487632450733a1c638e690d54a36920a61918b49 Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Fri, 8 Mar 2024 00:17:10 +0000 Subject: [PATCH 076/112] Release: v0.43.0 --- CHANGELOG.md | 24 ++++++++++++++++++++++++ bitsandbytes/__init__.py | 2 +- setup.py | 2 +- 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c12443cf3..397dceb77 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -342,3 +342,27 @@ Bug fixes: - Fixed a bug where kgetColRowStats (LLM.int8()) would fail for certain dimensions @LucQueen @905 - Fixed a bug where the adjusted regular Embedding layer was not available via bnb.nn.Embedding @neel04 #563 - Fixed added missing scipy requirement @dulalbert #525 + +### 0.43.0 + +#### Improvements and New Features: +- QLoRA + FSDP official support is now live! https://github.com/TimDettmers/bitsandbytes/pull/970 by @warner-benjamin and team - with FSDP you can train very large models (70b scale) on multiple 24GB consumer-type GPUs. See https://www.answer.ai/posts/2024-03-06-fsdp-qlora.html for more details. +- Introduced improvements to the CI process for enhanced performance and efficiency during builds, specifically enabling more effective cross-compilation on Linux platforms. This was accomplished by deprecating Make and migrating to Cmake, as well as implementing new corresponding workflows. Huge thanks go to @wkpark, @rickardp, @matthewdouglas and @younesbelkada; #1055, #1050, #1111. +- Windows should be officially supported in bitsandbytes if you install the library from source. 
See: https://huggingface.co/docs/bitsandbytes/main/en/index for more details +- Updated installation instructions to provide more comprehensive guidance for users. This includes clearer explanations and additional tips for various setup scenarios, making the library more accessible to a broader audience (@rickardp, #1047). +- Enhanced the library's compatibility and setup process, including fixes for CPU-only installations and improvements in CUDA setup error messaging. This effort aims to streamline the installation process and improve user experience across different platforms and setups (@wkpark, @akx, #1038, #996, #1012). +- Setup a new documentation at https://huggingface.co/docs/bitsandbytes/main with extensive new sections and content to help users better understand and utilize the library. Especially notable are the new API docs. (big thanks to @stevhliu and @mishig25 from HuggingFace #1012). The API docs have been also addressed in #1075. + +#### Bug Fixes: +- Addressed a race condition in kEstimateQuantiles, enhancing the reliability of quantile estimation in concurrent environments (@pnunna93, #1061). +- Fixed various minor issues, including typos in code comments and documentation, to improve code clarity and prevent potential confusion (@Brian Vaughan, #1063). + +#### Internal and Build System Enhancements: +- Implemented several enhancements to the internal and build systems, including adjustments to the CI workflows, portability improvements, and build artifact management. These changes contribute to a more robust and flexible development process, ensuring the library's ongoing quality and maintainability (@rickardp, @akx, @wkpark, @matthewdouglas; #949, #1053, #1045, #1037). + +#### Contributors: +This release is made possible thanks to the many active contributors that submitted PRs and many others who contributed to discussions, reviews, and testing. Your efforts greatly enhance the library's quality and user experience. It's truly inspiring to work with such a dedicated and competent group of volunteers and professionals! + +We give a special thanks to @TimDettmers for managing to find a little bit of time for valuable consultations on critical topics, despite preparing for and touring the states applying for professor positions. We wish him the utmost success! + +We also extend our gratitude to the broader community for your continued support, feedback, and engagement, which play a crucial role in driving the library's development forward. 
diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py index e54e933d9..3b83a8d6d 100644 --- a/bitsandbytes/__init__.py +++ b/bitsandbytes/__init__.py @@ -24,6 +24,6 @@ "optim.optimizer.MockArgs": False, } -__version__ = "0.43.0.dev" +__version__ = "0.43.0" PACKAGE_GITHUB_URL = "https://github.com/TimDettmers/bitsandbytes" diff --git a/setup.py b/setup.py index 13af2a39b..57603a4cc 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ def has_ext_modules(self): setup( name="bitsandbytes", - version="0.43.0.dev0", + version="0.43.0", author="Tim Dettmers", author_email="dettmers@cs.washington.edu", description="k-bit optimizers and matrix multiplication routines.", From 17681f682592f6e4b1210bdf3b5659e87b086076 Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Fri, 8 Mar 2024 00:55:15 +0000 Subject: [PATCH 077/112] up version to next dev (needed for docs) --- bitsandbytes/__init__.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py index 3b83a8d6d..dbb267d17 100644 --- a/bitsandbytes/__init__.py +++ b/bitsandbytes/__init__.py @@ -24,6 +24,6 @@ "optim.optimizer.MockArgs": False, } -__version__ = "0.43.0" +__version__ = "0.44.0.dev" PACKAGE_GITHUB_URL = "https://github.com/TimDettmers/bitsandbytes" diff --git a/setup.py b/setup.py index 57603a4cc..a51b3867c 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ def has_ext_modules(self): setup( name="bitsandbytes", - version="0.43.0", + version="0.44.0.dev", author="Tim Dettmers", author_email="dettmers@cs.washington.edu", description="k-bit optimizers and matrix multiplication routines.", From 958dfa990b580fe93dbddaf654e8a86a4fce838d Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Fri, 8 Mar 2024 13:38:06 +0200 Subject: [PATCH 078/112] Reformat .github with Prettier --- .github/ISSUE_TEMPLATE/feature-request.yml | 2 +- .github/workflows/build_documentation.yml | 2 +- .github/workflows/python-package.yml | 312 ++++++++++----------- 3 files changed, 158 insertions(+), 158 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/feature-request.yml b/.github/ISSUE_TEMPLATE/feature-request.yml index c39f346b9..1dc2a298d 100644 --- a/.github/ISSUE_TEMPLATE/feature-request.yml +++ b/.github/ISSUE_TEMPLATE/feature-request.yml @@ -1,6 +1,6 @@ name: "\U0001F680 Feature request" description: Submit a proposal/request for a new feature -labels: [ "feature" ] +labels: ["feature"] body: - type: textarea id: feature-request diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml index f5dc1153d..10272be87 100644 --- a/.github/workflows/build_documentation.yml +++ b/.github/workflows/build_documentation.yml @@ -8,7 +8,7 @@ on: - v*-release jobs: - build: + build: uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main with: commit_sha: ${{ github.sha }} diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index c868b18d2..3245e2d58 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -3,28 +3,27 @@ name: Python package on: push: {} pull_request: - branches: [ main ] + branches: [main] paths: - - '.github/workflows/python-package.yml' - - 'bitsandbytes/**' - - 'csrc/**' - - 'include/**' - - 'tests/**' - - 'CMakeLists.txt' - - 'requirements*.txt' - - 'setup.py' - - 'pyproject.toml' - - 'pytest.ini' + - ".github/workflows/python-package.yml" + - "bitsandbytes/**" + 
- "csrc/**" + - "include/**" + - "tests/**" + - "CMakeLists.txt" + - "requirements*.txt" + - "setup.py" + - "pyproject.toml" + - "pytest.ini" release: - types: [ published ] - workflow_dispatch: {} # Allow manual trigger + types: [published] + workflow_dispatch: {} # Allow manual trigger concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true jobs: - ## # This job matrix builds the non-CUDA versions of the libraries for all supported platforms. ## @@ -39,42 +38,42 @@ jobs: runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents steps: # Check out code - - uses: actions/checkout@v4 - # On Linux we use CMake within Docker - - name: Setup cmake - uses: jwlawson/actions-setup-cmake@v1.14 - with: - cmake-version: '3.26.x' - - name: Setup MSVC - if: startsWith(matrix.os, 'windows') - #uses: microsoft/setup-msbuild@v1.1 # to use msbuild - uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl - # Compile C++ code - - name: Build C++ - shell: bash - run: | - set -ex - build_os=${{ matrix.os }} - build_arch=${{ matrix.arch }} - if [ ${build_os:0:6} == ubuntu -a ${build_arch} == aarch64 ]; then - # Allow cross-compile on aarch64 - sudo apt-get update - sudo apt-get install -y gcc-aarch64-linux-gnu binutils-aarch64-linux-gnu g++-aarch64-linux-gnu - cmake -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++ -DCOMPUTE_BACKEND=cpu . - elif [ ${build_os:0:5} == macos -a ${build_arch} == aarch64 ]; then - cmake -DCMAKE_OSX_ARCHITECTURES=arm64 -DCOMPUTE_BACKEND=cpu . - else - cmake -DCOMPUTE_BACKEND=cpu . - fi - cmake --build . --config Release - mkdir -p output/${{ matrix.os }}/${{ matrix.arch }} - ( shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }}/ ) - - name: Upload build artifact - uses: actions/upload-artifact@v4 - with: - name: shared_library_${{ matrix.os }}_${{ matrix.arch }} - path: output/* - retention-days: 7 + - uses: actions/checkout@v4 + # On Linux we use CMake within Docker + - name: Setup cmake + uses: jwlawson/actions-setup-cmake@v1.14 + with: + cmake-version: "3.26.x" + - name: Setup MSVC + if: startsWith(matrix.os, 'windows') + #uses: microsoft/setup-msbuild@v1.1 # to use msbuild + uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl + # Compile C++ code + - name: Build C++ + shell: bash + run: | + set -ex + build_os=${{ matrix.os }} + build_arch=${{ matrix.arch }} + if [ ${build_os:0:6} == ubuntu -a ${build_arch} == aarch64 ]; then + # Allow cross-compile on aarch64 + sudo apt-get update + sudo apt-get install -y gcc-aarch64-linux-gnu binutils-aarch64-linux-gnu g++-aarch64-linux-gnu + cmake -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++ -DCOMPUTE_BACKEND=cpu . + elif [ ${build_os:0:5} == macos -a ${build_arch} == aarch64 ]; then + cmake -DCMAKE_OSX_ARCHITECTURES=arm64 -DCOMPUTE_BACKEND=cpu . + else + cmake -DCOMPUTE_BACKEND=cpu . + fi + cmake --build . 
--config Release + mkdir -p output/${{ matrix.os }}/${{ matrix.arch }} + ( shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }}/ ) + - name: Upload build artifact + uses: actions/upload-artifact@v4 + with: + name: shared_library_${{ matrix.os }}_${{ matrix.arch }} + path: output/* + retention-days: 7 ## # This job matrix builds the CUDA versions of the libraries for platforms that support CUDA (Linux x64/aarch64 + Windows x64) ## @@ -83,7 +82,8 @@ jobs: matrix: os: [ubuntu-latest, windows-latest] arch: [x86_64, aarch64] - cuda_version: ["11.7.1", "11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2"] + cuda_version: + ["11.7.1", "11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2"] exclude: - os: windows-latest # This probably requires arm64 Windows agents arch: aarch64 @@ -92,68 +92,68 @@ jobs: runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents steps: # Check out code - - uses: actions/checkout@v4 - # Linux: We use Docker to build cross platform Cuda (aarch64 is built in emulation) - - name: Set up Docker multiarch - if: startsWith(matrix.os, 'ubuntu') - uses: docker/setup-qemu-action@v2 - # On Linux we use CMake within Docker - - name: Setup cmake - if: ${{ !startsWith(matrix.os, 'linux') }} - uses: jwlawson/actions-setup-cmake@v1.14 - with: - cmake-version: '3.26.x' - # Windows: We install Cuda on the agent (slow) - - uses: Jimver/cuda-toolkit@v0.2.14 - if: startsWith(matrix.os, 'windows') - id: cuda-toolkit - with: - cuda: ${{ matrix.cuda_version }} - method: 'network' - sub-packages: '["nvcc","cudart","cusparse","cublas","thrust","nvrtc_dev","cublas_dev","cusparse_dev"]' - linux-local-args: '["--toolkit"]' - use-github-cache: false - - name: Setup MSVC - if: startsWith(matrix.os, 'windows') - #uses: microsoft/setup-msbuild@v1.1 # to use msbuild - uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl - # Compile C++ code - - name: Build C++ - shell: bash - run: | - set -ex - build_os=${{ matrix.os }} - build_arch=${{ matrix.arch }} - build_capability="50;52;60;61;70;75;80;86;89;90" - [[ "${{ matrix.cuda_version }}" == 11.7.* ]] && build_capability=${build_capability%??????} - [[ "${{ matrix.cuda_version }}" == 11.8.* ]] && build_capability=${build_capability%???} - [[ "${{ matrix.os }}" = windows-* ]] && python3 -m pip install ninja - for NO_CUBLASLT in ON OFF; do - if [ ${build_os:0:6} == ubuntu ]; then - image=nvidia/cuda:${{ matrix.cuda_version }}-devel-ubuntu22.04 - echo "Using image $image" - docker run --platform linux/$build_arch -i -w /src -v $PWD:/src $image sh -c \ - "apt-get update \ - && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \ - && cmake -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY=\"${build_capability}\" -DNO_CUBLASLT=${NO_CUBLASLT} . \ - && cmake --build ." - else - cmake -G Ninja -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY="${build_capability}" -DNO_CUBLASLT=${NO_CUBLASLT} -DCMAKE_BUILD_TYPE=Release -S . - cmake --build . 
--config Release - fi - done - mkdir -p output/${{ matrix.os }}/${{ matrix.arch }} - ( shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }}/ ) - - name: Upload build artifact - uses: actions/upload-artifact@v4 - with: - name: shared_library_cuda_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.cuda_version }} - path: output/* - retention-days: 7 + - uses: actions/checkout@v4 + # Linux: We use Docker to build cross platform Cuda (aarch64 is built in emulation) + - name: Set up Docker multiarch + if: startsWith(matrix.os, 'ubuntu') + uses: docker/setup-qemu-action@v2 + # On Linux we use CMake within Docker + - name: Setup cmake + if: ${{ !startsWith(matrix.os, 'linux') }} + uses: jwlawson/actions-setup-cmake@v1.14 + with: + cmake-version: "3.26.x" + # Windows: We install Cuda on the agent (slow) + - uses: Jimver/cuda-toolkit@v0.2.14 + if: startsWith(matrix.os, 'windows') + id: cuda-toolkit + with: + cuda: ${{ matrix.cuda_version }} + method: "network" + sub-packages: '["nvcc","cudart","cusparse","cublas","thrust","nvrtc_dev","cublas_dev","cusparse_dev"]' + linux-local-args: '["--toolkit"]' + use-github-cache: false + - name: Setup MSVC + if: startsWith(matrix.os, 'windows') + #uses: microsoft/setup-msbuild@v1.1 # to use msbuild + uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl + # Compile C++ code + - name: Build C++ + shell: bash + run: | + set -ex + build_os=${{ matrix.os }} + build_arch=${{ matrix.arch }} + build_capability="50;52;60;61;70;75;80;86;89;90" + [[ "${{ matrix.cuda_version }}" == 11.7.* ]] && build_capability=${build_capability%??????} + [[ "${{ matrix.cuda_version }}" == 11.8.* ]] && build_capability=${build_capability%???} + [[ "${{ matrix.os }}" = windows-* ]] && python3 -m pip install ninja + for NO_CUBLASLT in ON OFF; do + if [ ${build_os:0:6} == ubuntu ]; then + image=nvidia/cuda:${{ matrix.cuda_version }}-devel-ubuntu22.04 + echo "Using image $image" + docker run --platform linux/$build_arch -i -w /src -v $PWD:/src $image sh -c \ + "apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \ + && cmake -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY=\"${build_capability}\" -DNO_CUBLASLT=${NO_CUBLASLT} . \ + && cmake --build ." + else + cmake -G Ninja -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY="${build_capability}" -DNO_CUBLASLT=${NO_CUBLASLT} -DCMAKE_BUILD_TYPE=Release -S . + cmake --build . 
--config Release + fi + done + mkdir -p output/${{ matrix.os }}/${{ matrix.arch }} + ( shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }}/ ) + - name: Upload build artifact + uses: actions/upload-artifact@v4 + with: + name: shared_library_cuda_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.cuda_version }} + path: output/* + retention-days: 7 build-wheels: needs: - - build-shared-libs - - build-shared-libs-cuda + - build-shared-libs + - build-shared-libs-cuda strategy: matrix: os: [ubuntu-latest, macos-latest, windows-latest] @@ -168,48 +168,48 @@ jobs: runs-on: ${{ matrix.os }} steps: # Check out code - - uses: actions/checkout@v4 - # Download shared libraries - - name: Download build artifact - uses: actions/download-artifact@v4 - with: - merge-multiple: true - pattern: "shared_library*_${{ matrix.os }}_${{ matrix.arch }}*" - path: output/ - - name: Copy correct platform shared library - shell: bash - run: | - ls -lR output/ - cp output/${{ matrix.os }}/${{ matrix.arch }}/* bitsandbytes/ - # Set up the Python version needed - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - cache: pip - - name: Install build package - shell: bash - run: pip install build - - name: Install Python test dependencies - shell: bash - run: pip install -r requirements-ci.txt - # TODO: How to run CUDA tests on GitHub actions? - #- name: Run unit tests - # if: ${{ matrix.arch == 'x86_64' }} # Tests are too slow to run in emulation. Wait for real aarch64 agents - # run: | - # PYTHONPATH=. pytest --log-cli-level=DEBUG tests - - name: Build wheel - shell: bash - run: python -m build . - - name: Determine and Set Platform Tag, then Tag Wheel - shell: bash - run: | - PLATFORM_TAG=$(python scripts/set_platform_tag.py ${{ matrix.arch }}) - echo "PLATFORM_TAG=$PLATFORM_TAG" - wheel tags --remove --abi-tag=none --python-tag=py3 --platform-tag=$PLATFORM_TAG dist/bitsandbytes-*.whl - - name: Upload build artifact - uses: actions/upload-artifact@v4 - with: - name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }} - path: dist/bitsandbytes-*.whl - retention-days: 7 + - uses: actions/checkout@v4 + # Download shared libraries + - name: Download build artifact + uses: actions/download-artifact@v4 + with: + merge-multiple: true + pattern: "shared_library*_${{ matrix.os }}_${{ matrix.arch }}*" + path: output/ + - name: Copy correct platform shared library + shell: bash + run: | + ls -lR output/ + cp output/${{ matrix.os }}/${{ matrix.arch }}/* bitsandbytes/ + # Set up the Python version needed + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: pip + - name: Install build package + shell: bash + run: pip install build + - name: Install Python test dependencies + shell: bash + run: pip install -r requirements-ci.txt + # TODO: How to run CUDA tests on GitHub actions? + #- name: Run unit tests + # if: ${{ matrix.arch == 'x86_64' }} # Tests are too slow to run in emulation. Wait for real aarch64 agents + # run: | + # PYTHONPATH=. pytest --log-cli-level=DEBUG tests + - name: Build wheel + shell: bash + run: python -m build . 
+ - name: Determine and Set Platform Tag, then Tag Wheel + shell: bash + run: | + PLATFORM_TAG=$(python scripts/set_platform_tag.py ${{ matrix.arch }}) + echo "PLATFORM_TAG=$PLATFORM_TAG" + wheel tags --remove --abi-tag=none --python-tag=py3 --platform-tag=$PLATFORM_TAG dist/bitsandbytes-*.whl + - name: Upload build artifact + uses: actions/upload-artifact@v4 + with: + name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }} + path: dist/bitsandbytes-*.whl + retention-days: 7 From 62485a346c397509bf3e2a414c1dde7cedb653ff Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Fri, 8 Mar 2024 11:21:44 +0200 Subject: [PATCH 079/112] Move build scripts to .github/scripts (from scripts/ and workflow YAML) --- .github/scripts/build-cpu.sh | 23 +++++ .github/scripts/build-cuda.sh | 29 ++++++ .../scripts}/set_platform_tag.py | 0 .github/workflows/python-package.yml | 90 +++---------------- 4 files changed, 64 insertions(+), 78 deletions(-) create mode 100644 .github/scripts/build-cpu.sh create mode 100644 .github/scripts/build-cuda.sh rename {scripts => .github/scripts}/set_platform_tag.py (100%) diff --git a/.github/scripts/build-cpu.sh b/.github/scripts/build-cpu.sh new file mode 100644 index 000000000..6dc6a8ddf --- /dev/null +++ b/.github/scripts/build-cpu.sh @@ -0,0 +1,23 @@ +#!/bin/bash +declare build_arch +declare build_os + +set -xeuo pipefail + +pip install cmake==3.28.3 + +if [ "${build_os:0:6}" == ubuntu ] && [ "${build_arch}" == aarch64 ]; then + # Allow cross-compile on aarch64 + sudo apt-get update + sudo apt-get install -y gcc-aarch64-linux-gnu binutils-aarch64-linux-gnu g++-aarch64-linux-gnu + cmake -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++ -DCOMPUTE_BACKEND=cpu . +elif [ "${build_os:0:5}" == macos ] && [ "${build_arch}" == aarch64 ]; then + cmake -DCMAKE_OSX_ARCHITECTURES=arm64 -DCOMPUTE_BACKEND=cpu . +else + cmake -DCOMPUTE_BACKEND=cpu . +fi +cmake --build . --config Release + +output_dir="output/${build_os}/${build_arch}" +mkdir -p "${output_dir}" +(shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} "${output_dir}") diff --git a/.github/scripts/build-cuda.sh b/.github/scripts/build-cuda.sh new file mode 100644 index 000000000..0f9b8d726 --- /dev/null +++ b/.github/scripts/build-cuda.sh @@ -0,0 +1,29 @@ +#!/bin/bash +declare build_arch +declare build_os +declare cuda_version + +set -xeuo pipefail +build_capability="50;52;60;61;70;75;80;86;89;90" +[[ "${cuda_version}" == 11.7.* ]] && build_capability=${build_capability%??????} +[[ "${cuda_version}" == 11.8.* ]] && build_capability=${build_capability%???} +[[ "${build_os}" = windows-* ]] && python3 -m pip install ninja +for NO_CUBLASLT in ON OFF; do + if [ "${build_os:0:6}" == ubuntu ]; then + image=nvidia/cuda:${cuda_version}-devel-ubuntu22.04 + echo "Using image $image" + docker run --platform "linux/$build_arch" -i -w /src -v "$PWD:/src" "$image" sh -c \ + "apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \ + && cmake -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY=\"${build_capability}\" -DNO_CUBLASLT=${NO_CUBLASLT} . \ + && cmake --build ." + else + pip install cmake==3.28.3 + cmake -G Ninja -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY="${build_capability}" -DNO_CUBLASLT=${NO_CUBLASLT} -DCMAKE_BUILD_TYPE=Release -S . + cmake --build . 
--config Release + fi +done + +output_dir="output/${build_os}/${build_arch}" +mkdir -p "${output_dir}" +(shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} "${output_dir}") diff --git a/scripts/set_platform_tag.py b/.github/scripts/set_platform_tag.py similarity index 100% rename from scripts/set_platform_tag.py rename to .github/scripts/set_platform_tag.py diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 3245e2d58..f02279707 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -37,37 +37,15 @@ jobs: arch: aarch64 runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents steps: - # Check out code - uses: actions/checkout@v4 - # On Linux we use CMake within Docker - - name: Setup cmake - uses: jwlawson/actions-setup-cmake@v1.14 - with: - cmake-version: "3.26.x" - name: Setup MSVC if: startsWith(matrix.os, 'windows') - #uses: microsoft/setup-msbuild@v1.1 # to use msbuild uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl - # Compile C++ code - name: Build C++ - shell: bash - run: | - set -ex - build_os=${{ matrix.os }} - build_arch=${{ matrix.arch }} - if [ ${build_os:0:6} == ubuntu -a ${build_arch} == aarch64 ]; then - # Allow cross-compile on aarch64 - sudo apt-get update - sudo apt-get install -y gcc-aarch64-linux-gnu binutils-aarch64-linux-gnu g++-aarch64-linux-gnu - cmake -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++ -DCOMPUTE_BACKEND=cpu . - elif [ ${build_os:0:5} == macos -a ${build_arch} == aarch64 ]; then - cmake -DCMAKE_OSX_ARCHITECTURES=arm64 -DCOMPUTE_BACKEND=cpu . - else - cmake -DCOMPUTE_BACKEND=cpu . - fi - cmake --build . --config Release - mkdir -p output/${{ matrix.os }}/${{ matrix.arch }} - ( shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }}/ ) + run: bash .github/scripts/build-cpu.sh + env: + build_os: ${{ matrix.os }} + build_arch: ${{ matrix.arch }} - name: Upload build artifact uses: actions/upload-artifact@v4 with: @@ -91,18 +69,11 @@ jobs: arch: aarch64 runs-on: ${{ matrix.os }} # One day, we could run them on native agents. 
Azure supports this now but it's planned only for Q3 2023 for hosted agents steps: - # Check out code - uses: actions/checkout@v4 # Linux: We use Docker to build cross platform Cuda (aarch64 is built in emulation) - name: Set up Docker multiarch if: startsWith(matrix.os, 'ubuntu') uses: docker/setup-qemu-action@v2 - # On Linux we use CMake within Docker - - name: Setup cmake - if: ${{ !startsWith(matrix.os, 'linux') }} - uses: jwlawson/actions-setup-cmake@v1.14 - with: - cmake-version: "3.26.x" # Windows: We install Cuda on the agent (slow) - uses: Jimver/cuda-toolkit@v0.2.14 if: startsWith(matrix.os, 'windows') @@ -115,35 +86,13 @@ jobs: use-github-cache: false - name: Setup MSVC if: startsWith(matrix.os, 'windows') - #uses: microsoft/setup-msbuild@v1.1 # to use msbuild uses: ilammy/msvc-dev-cmd@v1.13.0 # to use cl - # Compile C++ code - name: Build C++ - shell: bash - run: | - set -ex - build_os=${{ matrix.os }} - build_arch=${{ matrix.arch }} - build_capability="50;52;60;61;70;75;80;86;89;90" - [[ "${{ matrix.cuda_version }}" == 11.7.* ]] && build_capability=${build_capability%??????} - [[ "${{ matrix.cuda_version }}" == 11.8.* ]] && build_capability=${build_capability%???} - [[ "${{ matrix.os }}" = windows-* ]] && python3 -m pip install ninja - for NO_CUBLASLT in ON OFF; do - if [ ${build_os:0:6} == ubuntu ]; then - image=nvidia/cuda:${{ matrix.cuda_version }}-devel-ubuntu22.04 - echo "Using image $image" - docker run --platform linux/$build_arch -i -w /src -v $PWD:/src $image sh -c \ - "apt-get update \ - && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \ - && cmake -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY=\"${build_capability}\" -DNO_CUBLASLT=${NO_CUBLASLT} . \ - && cmake --build ." - else - cmake -G Ninja -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY="${build_capability}" -DNO_CUBLASLT=${NO_CUBLASLT} -DCMAKE_BUILD_TYPE=Release -S . - cmake --build . --config Release - fi - done - mkdir -p output/${{ matrix.os }}/${{ matrix.arch }} - ( shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }}/ ) + run: bash .github/scripts/build-cuda.sh + env: + build_os: ${{ matrix.os }} + build_arch: ${{ matrix.arch }} + cuda_version: ${{ matrix.cuda_version }} - name: Upload build artifact uses: actions/upload-artifact@v4 with: @@ -167,9 +116,7 @@ jobs: arch: aarch64 runs-on: ${{ matrix.os }} steps: - # Check out code - uses: actions/checkout@v4 - # Download shared libraries - name: Download build artifact uses: actions/download-artifact@v4 with: @@ -181,30 +128,17 @@ jobs: run: | ls -lR output/ cp output/${{ matrix.os }}/${{ matrix.arch }}/* bitsandbytes/ - # Set up the Python version needed - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} cache: pip - - name: Install build package - shell: bash - run: pip install build - - name: Install Python test dependencies - shell: bash - run: pip install -r requirements-ci.txt - # TODO: How to run CUDA tests on GitHub actions? - #- name: Run unit tests - # if: ${{ matrix.arch == 'x86_64' }} # Tests are too slow to run in emulation. Wait for real aarch64 agents - # run: | - # PYTHONPATH=. pytest --log-cli-level=DEBUG tests - - name: Build wheel - shell: bash - run: python -m build . + - run: pip install build wheel + - run: python -m build . 
- name: Determine and Set Platform Tag, then Tag Wheel shell: bash run: | - PLATFORM_TAG=$(python scripts/set_platform_tag.py ${{ matrix.arch }}) + PLATFORM_TAG=$(python .github/scripts/set_platform_tag.py "${{ matrix.arch }}") echo "PLATFORM_TAG=$PLATFORM_TAG" wheel tags --remove --abi-tag=none --python-tag=py3 --platform-tag=$PLATFORM_TAG dist/bitsandbytes-*.whl - name: Upload build artifact From 7af138ab57b8529a421dc3586e0ce3c700e5ede6 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Fri, 8 Mar 2024 11:26:52 +0200 Subject: [PATCH 080/112] Add audit-wheel step Closes #1114 Co-authored-by: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> --- .github/scripts/auditwheel_show.py | 31 ++++++++++++++++++++++++++++ .github/workflows/python-package.yml | 24 +++++++++++++++++++++ 2 files changed, 55 insertions(+) create mode 100755 .github/scripts/auditwheel_show.py diff --git a/.github/scripts/auditwheel_show.py b/.github/scripts/auditwheel_show.py new file mode 100755 index 000000000..c9dd09cc2 --- /dev/null +++ b/.github/scripts/auditwheel_show.py @@ -0,0 +1,31 @@ +import argparse +import subprocess + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("wheels", nargs="*") + args = ap.parse_args() + if not args.wheels: + ap.error("At least one wheel must be provided.") + for whl in args.wheels: + print(f"### `{whl}`") + + audit_wheel_output = subprocess.run( + ["auditwheel", "show", whl], + capture_output=True, + text=True, + errors="backslashreplace", + ) + + if audit_wheel_output.stdout: + print(audit_wheel_output.stdout) + + if audit_wheel_output.stderr: + print(f"**Error:**\n```{audit_wheel_output.stderr}```") + + print("---") + + +if __name__ == "__main__": + main() diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index f02279707..40671d68e 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -35,6 +35,8 @@ jobs: exclude: - os: windows-latest # This probably requires arm64 Windows agents arch: aarch64 + - os: ubuntu-latest # Temporary. Takes too long, not ready yet. + arch: aarch64 runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents steps: - uses: actions/checkout@v4 @@ -114,6 +116,8 @@ jobs: exclude: - os: windows-latest # This probably requires arm64 Windows agents arch: aarch64 + - os: ubuntu-latest # Temporary. Takes too long, not ready yet. 
+ arch: aarch64 runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v4 @@ -147,3 +151,23 @@ jobs: name: bdist_wheel_${{ matrix.os }}_${{ matrix.arch }} path: dist/bitsandbytes-*.whl retention-days: 7 + + audit-wheels: + needs: build-wheels + runs-on: ubuntu-latest + env: + PIP_DISABLE_PIP_VERSION_CHECK: 1 + steps: + - uses: actions/checkout@v4 + - name: Download all wheels + uses: actions/download-artifact@v4 + with: + merge-multiple: true + pattern: "bdist_wheel_*" + path: wheels/ + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + - run: pip install auditwheel + - run: python ./.github/scripts/auditwheel_show.py wheels/* | tee $GITHUB_STEP_SUMMARY From 782ab96eff062a6f24393742371f6442a88ae0fe Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Fri, 8 Mar 2024 14:00:36 +0200 Subject: [PATCH 081/112] Adjust requirements-ci.txt for CI --- requirements-ci.txt | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/requirements-ci.txt b/requirements-ci.txt index 46bd5b9cd..e6e375ccb 100644 --- a/requirements-ci.txt +++ b/requirements-ci.txt @@ -1,7 +1,6 @@ # Requirements used for GitHub actions pytest==7.2.2 einops==0.6.0 -wheel==0.40.0 lion-pytorch==0.0.6 -scipy==1.11.4 -pandas==2.2.0 +scipy==1.10.1; python_version < "3.9" +scipy==1.11.4; python_version >= "3.9" From 62249b4a7752ba35d75f0848a2e380e5d650da5c Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Fri, 8 Mar 2024 16:35:08 +0200 Subject: [PATCH 082/112] Soft-require `transformers` in tests --- tests/test_generation.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tests/test_generation.py b/tests/test_generation.py index b05749bf8..ef354d70a 100644 --- a/tests/test_generation.py +++ b/tests/test_generation.py @@ -3,17 +3,14 @@ import pytest import torch -import transformers -from transformers import ( - AutoModelForCausalLM, - BitsAndBytesConfig, -) from tests.helpers import TRUE_FALSE, describe_dtype, id_formatter +transformers = pytest.importorskip("transformers") + def get_4bit_config(): - return BitsAndBytesConfig( + return transformers.BitsAndBytesConfig( load_in_4bit=True, load_in_8bit=False, llm_int8_threshold=6.0, @@ -31,7 +28,7 @@ def get_model_and_tokenizer(config): bnb_config.load_in_4bit = False else: bnb_config.bnb_4bit_quant_type= quant_type - model = AutoModelForCausalLM.from_pretrained(model_name_or_path, + model = transformers.AutoModelForCausalLM.from_pretrained(model_name_or_path, quantization_config=bnb_config, max_memory={0:'48GB'}, device_map='auto', From 2416dd3693fc2f1aa93fe300b71bf14eb1184aa9 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Fri, 8 Mar 2024 17:37:38 +0200 Subject: [PATCH 083/112] Add additional guard for "no NVIDIA driver" --- tests/conftest.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index 7aee8c922..17ffd281c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -13,6 +13,11 @@ def pytest_runtest_call(item): if str(ae) == "Torch not compiled with CUDA enabled": pytest.skip("Torch not compiled with CUDA enabled") raise + except RuntimeError as re: + # CUDA-enabled Torch build, but no CUDA-capable device found + if "Found no NVIDIA driver on your system" in str(re): + pytest.skip("No NVIDIA driver found") + raise @pytest.fixture(scope="session") From ce597c639c66e0d049e0a42c33e56db07844f21f Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Fri, 8 Mar 2024 11:23:10 +0200 Subject: [PATCH 084/112] Add commented-out test step to CI --- 
.github/workflows/python-package.yml | 30 ++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 40671d68e..ba5961f72 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -171,3 +171,33 @@ jobs: python-version: "3.12" - run: pip install auditwheel - run: python ./.github/scripts/auditwheel_show.py wheels/* | tee $GITHUB_STEP_SUMMARY + +# test: +# needs: +# - build-wheels +# strategy: +# fail-fast: false +# matrix: +# include: +# - os: ubuntu-latest +# arch: x86_64 +# python-version: "3.8" +# - os: windows-latest +# arch: x86_64 +# python-version: "3.8" +# runs-on: ${{ matrix.os }} +# steps: +# - uses: actions/checkout@v4 +# - uses: actions/download-artifact@v4 +# with: +# merge-multiple: true +# pattern: "bdist_wheel_${{ matrix.os }}_${{ matrix.arch }}*" +# path: wheel/ +# - uses: actions/setup-python@v5 +# with: +# python-version: ${{ matrix.python-version }} +# cache: pip +# - shell: bash +# run: ls -lar wheel/ +# - run: pip install wheel/*.whl -r requirements-ci.txt +# - run: pytest --log-cli-level=DEBUG --continue-on-collection-errors tests From e2db55eda3f806b21eca595a790b80a2a60ab9ab Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Tue, 6 Feb 2024 01:44:10 +0200 Subject: [PATCH 085/112] Rework CUDA setup and diagnostics --- bitsandbytes/__init__.py | 9 +- bitsandbytes/__main__.py | 108 +---- bitsandbytes/cextension.py | 149 +++++-- bitsandbytes/consts.py | 12 + bitsandbytes/cuda_setup/env_vars.py | 53 --- bitsandbytes/cuda_setup/main.py | 393 ------------------ bitsandbytes/cuda_specs.py | 41 ++ .../{cuda_setup => diagnostics}/__init__.py | 0 bitsandbytes/diagnostics/cuda.py | 169 ++++++++ bitsandbytes/diagnostics/main.py | 70 ++++ bitsandbytes/diagnostics/utils.py | 12 + bitsandbytes/functional.py | 4 +- bitsandbytes/optim/__init__.py | 2 - tests/test_cuda_setup_evaluator.py | 50 ++- 14 files changed, 462 insertions(+), 610 deletions(-) create mode 100644 bitsandbytes/consts.py delete mode 100644 bitsandbytes/cuda_setup/env_vars.py delete mode 100644 bitsandbytes/cuda_setup/main.py create mode 100644 bitsandbytes/cuda_specs.py rename bitsandbytes/{cuda_setup => diagnostics}/__init__.py (100%) create mode 100644 bitsandbytes/diagnostics/cuda.py create mode 100644 bitsandbytes/diagnostics/main.py create mode 100644 bitsandbytes/diagnostics/utils.py diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py index dbb267d17..78c99355b 100644 --- a/bitsandbytes/__init__.py +++ b/bitsandbytes/__init__.py @@ -3,7 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from . import cuda_setup, research, utils +from . 
import research, utils from .autograd._functions import ( MatmulLtState, bmm_cublas, @@ -12,11 +12,8 @@ matmul_cublas, mm_cublas, ) -from .cextension import COMPILED_WITH_CUDA from .nn import modules - -if COMPILED_WITH_CUDA: - from .optim import adam +from .optim import adam __pdoc__ = { "libbitsandbytes": False, @@ -25,5 +22,3 @@ } __version__ = "0.44.0.dev" - -PACKAGE_GITHUB_URL = "https://github.com/TimDettmers/bitsandbytes" diff --git a/bitsandbytes/__main__.py b/bitsandbytes/__main__.py index 61b42e78f..e716b6f3f 100644 --- a/bitsandbytes/__main__.py +++ b/bitsandbytes/__main__.py @@ -1,108 +1,4 @@ -import glob -import os -import sys -from warnings import warn - -import torch - -HEADER_WIDTH = 60 - - -def find_dynamic_library(folder, filename): - for ext in ("so", "dll", "dylib"): - yield from glob.glob(os.path.join(folder, "**", filename + ext)) - - -def generate_bug_report_information(): - print_header("") - print_header("BUG REPORT INFORMATION") - print_header("") - print('') - - path_sources = [ - ("ANACONDA CUDA PATHS", os.environ.get("CONDA_PREFIX")), - ("/usr/local CUDA PATHS", "/usr/local"), - ("CUDA PATHS", os.environ.get("CUDA_PATH")), - ("WORKING DIRECTORY CUDA PATHS", os.getcwd()), - ] - try: - ld_library_path = os.environ.get("LD_LIBRARY_PATH") - if ld_library_path: - for path in set(ld_library_path.strip().split(os.pathsep)): - path_sources.append((f"LD_LIBRARY_PATH {path} CUDA PATHS", path)) - except Exception as e: - print(f"Could not parse LD_LIBRARY_PATH: {e}") - - for name, path in path_sources: - if path and os.path.isdir(path): - print_header(name) - print(list(find_dynamic_library(path, '*cuda*'))) - print("") - - -def print_header( - txt: str, width: int = HEADER_WIDTH, filler: str = "+" -) -> None: - txt = f" {txt} " if txt else "" - print(txt.center(width, filler)) - - -def print_debug_info() -> None: - from . import PACKAGE_GITHUB_URL - print( - "\nAbove we output some debug information. Please provide this info when " - f"creating an issue via {PACKAGE_GITHUB_URL}/issues/new/choose ...\n" - ) - - -def main(): - generate_bug_report_information() - - from . import COMPILED_WITH_CUDA - from .cuda_setup.main import get_compute_capabilities - - print_header("OTHER") - print(f"COMPILED_WITH_CUDA = {COMPILED_WITH_CUDA}") - print(f"COMPUTE_CAPABILITIES_PER_GPU = {get_compute_capabilities()}") - print_header("") - print_header("DEBUG INFO END") - print_header("") - print("Checking that the library is importable and CUDA is callable...") - print("\nWARNING: Please be sure to sanitize sensitive info from any such env vars!\n") - - try: - from bitsandbytes.optim import Adam - - p = torch.nn.Parameter(torch.rand(10, 10).cuda()) - a = torch.rand(10, 10).cuda() - - p1 = p.data.sum().item() - - adam = Adam([p]) - - out = a * p - loss = out.sum() - loss.backward() - adam.step() - - p2 = p.data.sum().item() - - assert p1 != p2 - print("SUCCESS!") - print("Installation was successful!") - except ImportError: - print() - warn( - f"WARNING: {__package__} is currently running as CPU-only!\n" - "Therefore, 8-bit optimizers and GPU quantization are unavailable.\n\n" - f"If you think that this is so erroneously,\nplease report an issue!" 
- ) - print_debug_info() - except Exception as e: - print(e) - print_debug_info() - sys.exit(1) - - if __name__ == "__main__": + from bitsandbytes.diagnostics.main import main + main() diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py index 858365f02..57ba71020 100644 --- a/bitsandbytes/cextension.py +++ b/bitsandbytes/cextension.py @@ -1,39 +1,124 @@ +""" +extract factors the build is dependent on: +[X] compute capability + [ ] TODO: Q - What if we have multiple GPUs of different makes? +- CUDA version +- Software: + - CPU-only: only CPU quantization functions (no optimizer, no matrix multiple) + - CuBLAS-LT: full-build 8-bit optimizer + - no CuBLAS-LT: no 8-bit matrix multiplication (`nomatmul`) + +evaluation: + - if paths faulty, return meaningful error + - else: + - determine CUDA version + - determine capabilities + - based on that set the default path +""" + import ctypes as ct -from warnings import warn +import logging +import os +from pathlib import Path import torch -from bitsandbytes.cuda_setup.main import CUDASetup +from bitsandbytes.consts import DYNAMIC_LIBRARY_SUFFIX, PACKAGE_DIR +from bitsandbytes.cuda_specs import CUDASpecs, get_cuda_specs + +logger = logging.getLogger(__name__) + + +def get_cuda_bnb_library_path(cuda_specs: CUDASpecs) -> Path: + """ + Get the disk path to the CUDA BNB native library specified by the + given CUDA specs, taking into account the `BNB_CUDA_VERSION` override environment variable. + + The library is not guaranteed to exist at the returned path. + """ + library_name = f"libbitsandbytes_cuda{cuda_specs.cuda_version_string}" + if not cuda_specs.has_cublaslt: + # if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt + library_name += "_nocublaslt" + library_name = f"{library_name}{DYNAMIC_LIBRARY_SUFFIX}" + + override_value = os.environ.get("BNB_CUDA_VERSION") + if override_value: + library_name_stem, _, library_name_ext = library_name.rpartition(".") + # `library_name_stem` will now be e.g. `libbitsandbytes_cuda118`; + # let's remove any trailing numbers: + library_name_stem = library_name_stem.rstrip("0123456789") + # `library_name_stem` will now be e.g. `libbitsandbytes_cuda`; + # let's tack the new version number and the original extension back on. 
+ library_name = f"{library_name_stem}{override_value}.{library_name_ext}" + logger.warning( + f"WARNING: BNB_CUDA_VERSION={override_value} environment variable detected; loading {library_name}.\n" + "This can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.\n" + "If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=\n" + "If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH\n" + "For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH: BNBNativeLibrary: + binary_path = PACKAGE_DIR / f"libbitsandbytes_cpu{DYNAMIC_LIBRARY_SUFFIX}" + cuda_specs = get_cuda_specs() + if cuda_specs: + cuda_binary_path = get_cuda_bnb_library_path(cuda_specs) + if cuda_binary_path.exists(): + binary_path = cuda_binary_path + else: + logger.warning("Could not find the bitsandbytes CUDA binary at %r", cuda_binary_path) + logger.debug(f"Loading bitsandbytes native library from: {binary_path}") + dll = ct.cdll.LoadLibrary(str(binary_path)) + + if hasattr(dll, "get_context"): # only a CUDA-built library exposes this + return CudaBNBNativeLibrary(dll) + + logger.warning( + "The installed version of bitsandbytes was compiled without GPU support. " + "8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable." + ) + return BNBNativeLibrary(dll) -setup = CUDASetup.get_instance() -if setup.initialized != True: - setup.run_cuda_setup() -lib = setup.lib try: - if lib is None and torch.cuda.is_available(): - CUDASetup.get_instance().generate_instructions() - CUDASetup.get_instance().print_log_stack() - raise RuntimeError(''' - CUDA Setup failed despite GPU being available. Please run the following command to get more information: - - python -m bitsandbytes - - Inspect the output of the command and see if you can locate CUDA libraries. You might need to add them - to your LD_LIBRARY_PATH. If you suspect a bug, please take the information from python -m bitsandbytes - and open an issue at: https://github.com/TimDettmers/bitsandbytes/issues''') - _ = lib.cadam32bit_grad_fp32 # runs on an error if the library could not be found -> COMPILED_WITH_CUDA=False - lib.get_context.restype = ct.c_void_p - lib.get_cusparse.restype = ct.c_void_p - lib.cget_managed_ptr.restype = ct.c_void_p - COMPILED_WITH_CUDA = True -except AttributeError as ex: - warn("The installed version of bitsandbytes was compiled without GPU support. " - "8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.") - COMPILED_WITH_CUDA = False - print(str(ex)) - - -# print the setup details after checking for errors so we do not print twice -#if 'BITSANDBYTES_NOWELCOME' not in os.environ or str(os.environ['BITSANDBYTES_NOWELCOME']) == '0': - #setup.print_log_stack() + lib = get_native_library() +except Exception as e: + lib = None + logger.error(f"Could not load bitsandbytes native library: {e}", exc_info=True) + if torch.cuda.is_available(): + logger.warning( + """ +CUDA Setup failed despite CUDA being available. Please run the following command to get more information: + +python -m bitsandbytes + +Inspect the output of the command and see if you can locate CUDA libraries. You might need to add them +to your LD_LIBRARY_PATH. 
If you suspect a bug, please take the information from python -m bitsandbytes +and open an issue at: https://github.com/TimDettmers/bitsandbytes/issues +""" + ) diff --git a/bitsandbytes/consts.py b/bitsandbytes/consts.py new file mode 100644 index 000000000..8242d104e --- /dev/null +++ b/bitsandbytes/consts.py @@ -0,0 +1,12 @@ +from pathlib import Path +import platform + +DYNAMIC_LIBRARY_SUFFIX = { + "Darwin": ".dylib", + "Linux": ".so", + "Windows": ".dll", +}.get(platform.system(), ".so") + +PACKAGE_DIR = Path(__file__).parent +PACKAGE_GITHUB_URL = "https://github.com/TimDettmers/bitsandbytes" +NONPYTORCH_DOC_URL = "https://github.com/TimDettmers/bitsandbytes/blob/main/docs/source/nonpytorchcuda.mdx" diff --git a/bitsandbytes/cuda_setup/env_vars.py b/bitsandbytes/cuda_setup/env_vars.py deleted file mode 100644 index 4b2549653..000000000 --- a/bitsandbytes/cuda_setup/env_vars.py +++ /dev/null @@ -1,53 +0,0 @@ -import os -from typing import Dict - - -def to_be_ignored(env_var: str, value: str) -> bool: - ignorable = { - "PWD", # PWD: this is how the shell keeps track of the current working dir - "OLDPWD", - "SSH_AUTH_SOCK", # SSH stuff, therefore unrelated - "SSH_TTY", - "GOOGLE_VM_CONFIG_LOCK_FILE", # GCP: requires elevated permissions, causing problems in VMs and Jupyter notebooks - "HOME", # Linux shell default - "TMUX", # Terminal Multiplexer - "XDG_DATA_DIRS", # XDG: Desktop environment stuff - "XDG_GREETER_DATA_DIR", # XDG: Desktop environment stuff - "XDG_RUNTIME_DIR", - "MAIL", # something related to emails - "SHELL", # binary for currently invoked shell - "DBUS_SESSION_BUS_ADDRESS", # hardware related - "PATH", # this is for finding binaries, not libraries - "LESSOPEN", # related to the `less` command - "LESSCLOSE", - "_", # current Python interpreter - } - return env_var in ignorable - - -def might_contain_a_path(candidate: str) -> bool: - return os.sep in candidate - - -def is_active_conda_env(env_var: str) -> bool: - return "CONDA_PREFIX" == env_var - - -def is_other_conda_env_var(env_var: str) -> bool: - return "CONDA" in env_var - - -def is_relevant_candidate_env_var(env_var: str, value: str) -> bool: - return is_active_conda_env(env_var) or ( - might_contain_a_path(value) and not - is_other_conda_env_var(env_var) and not - to_be_ignored(env_var, value) - ) - - -def get_potentially_lib_path_containing_env_vars() -> Dict[str, str]: - return { - env_var: value - for env_var, value in os.environ.items() - if is_relevant_candidate_env_var(env_var, value) - } diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py deleted file mode 100644 index b351f7f03..000000000 --- a/bitsandbytes/cuda_setup/main.py +++ /dev/null @@ -1,393 +0,0 @@ -""" -extract factors the build is dependent on: -[X] compute capability - [ ] TODO: Q - What if we have multiple GPUs of different makes? 
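For reference, a short sketch of how the values introduced in `bitsandbytes/consts.py` compose the CPU fallback binary path, assuming this revision of bitsandbytes is importable; the printed suffix depends on the platform:

import platform

from bitsandbytes.consts import DYNAMIC_LIBRARY_SUFFIX, PACKAGE_DIR

# ".so" on Linux, ".dylib" on macOS, ".dll" on Windows, defaulting to ".so".
print(platform.system(), DYNAMIC_LIBRARY_SUFFIX)

# The CPU-only fallback library is expected right next to the installed package,
# which is the path get_native_library() falls back to in cextension.py.
print(PACKAGE_DIR / f"libbitsandbytes_cpu{DYNAMIC_LIBRARY_SUFFIX}")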
-- CUDA version -- Software: - - CPU-only: only CPU quantization functions (no optimizer, no matrix multiply) - - CuBLAS-LT: full-build 8-bit optimizer - - no CuBLAS-LT: no 8-bit matrix multiplication (`nomatmul`) - -evaluation: - - if paths faulty, return meaningful error - - else: - - determine CUDA version - - determine capabilities - - based on that set the default path -""" - -import ctypes as ct -import errno -import os -from pathlib import Path -import platform -from typing import Set, Union -from warnings import warn - -import torch - -from .env_vars import get_potentially_lib_path_containing_env_vars - -DYNAMIC_LIBRARY_SUFFIX = { "Darwin": ".dylib", "Windows": ".dll", "Linux": ".so"}.get(platform.system(), ".so") -if platform.system() == "Windows": # Windows - CUDA_RUNTIME_LIBS = ["cudart64_110.dll", "cudart64_12.dll"] -else: # Linux or other - # these are the most common libs names - # libcudart.so is missing by default for a conda install with PyTorch 2.0 and instead - # we have libcudart.so.11.0 which causes a lot of errors before - # not sure if libcudart.so.12.0 exists in pytorch installs, but it does not hurt - CUDA_RUNTIME_LIBS = ["libcudart.so", "libcudart.so.11.0", "libcudart.so.12.0", "libcudart.so.12.1", "libcudart.so.12.2"] - - -class CUDASetup: - _instance = None - - def __init__(self): - raise RuntimeError("Call get_instance() instead") - - def generate_instructions(self): - if getattr(self, 'error', False): return - print(self.error) - self.error = True - if not self.cuda_available: - self.add_log_entry('CUDA SETUP: Problem: The main issue seems to be that the main CUDA library was not detected or CUDA not installed.') - self.add_log_entry('CUDA SETUP: Solution 1): Your paths are probably not up-to-date. You can update them via: sudo ldconfig.') - self.add_log_entry('CUDA SETUP: Solution 2): If you do not have sudo rights, you can do the following:') - self.add_log_entry('CUDA SETUP: Solution 2a): Find the cuda library via: find / -name libcuda.so 2>/dev/null') - self.add_log_entry('CUDA SETUP: Solution 2b): Once the library is found add it to the LD_LIBRARY_PATH: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:FOUND_PATH_FROM_2a') - self.add_log_entry('CUDA SETUP: Solution 2c): For a permanent solution add the export from 2b into your .bashrc file, located at ~/.bashrc') - self.add_log_entry('CUDA SETUP: Solution 3): For a missing CUDA runtime library (libcudart.so), use `find / -name libcudart.so* and follow with step (2b)') - return - - if self.cudart_path is None: - self.add_log_entry('CUDA SETUP: Problem: The main issue seems to be that the main CUDA runtime library was not detected.') - self.add_log_entry('CUDA SETUP: Solution 1: To solve the issue the libcudart.so location needs to be added to the LD_LIBRARY_PATH variable') - self.add_log_entry('CUDA SETUP: Solution 1a): Find the cuda runtime library via: find / -name libcudart.so 2>/dev/null') - self.add_log_entry('CUDA SETUP: Solution 1b): Once the library is found add it to the LD_LIBRARY_PATH: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:FOUND_PATH_FROM_1a') - self.add_log_entry('CUDA SETUP: Solution 1c): For a permanent solution add the export from 1b into your .bashrc file, located at ~/.bashrc') - self.add_log_entry('CUDA SETUP: Solution 2: If no library was found in step 1a) you need to install CUDA.') - self.add_log_entry('CUDA SETUP: Solution 2a): Download CUDA install script: wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/cuda_install.sh') - self.add_log_entry('CUDA SETUP: Solution 2b): 
Install desired CUDA version to desired location. The syntax is bash cuda_install.sh CUDA_VERSION PATH_TO_INSTALL_INTO.') - self.add_log_entry('CUDA SETUP: Solution 2b): For example, "bash cuda_install.sh 113 ~/local/" will download CUDA 11.3 and install into the folder ~/local') - - return - - make_cmd = f'CUDA_VERSION={self.cuda_version_string}' - if len(self.cuda_version_string) < 3: - make_cmd += ' make cuda92' - elif self.cuda_version_string == '110': - make_cmd += ' make cuda110' - elif self.cuda_version_string[:2] == '11' and int(self.cuda_version_string[2]) > 0: - make_cmd += ' make cuda11x' - elif self.cuda_version_string[:2] == '12' and 1 >= int(self.cuda_version_string[2]) >= 0: - make_cmd += ' make cuda12x' - elif self.cuda_version_string == '100': - self.add_log_entry('CUDA SETUP: CUDA 10.0 not supported. Please use a different CUDA version.') - self.add_log_entry('CUDA SETUP: Before you try again running bitsandbytes, make sure old CUDA 10.0 versions are uninstalled and removed from $LD_LIBRARY_PATH variables.') - return - - - has_cublaslt = is_cublasLt_compatible(self.cc) - if not has_cublaslt: - make_cmd += '_nomatmul' - - self.add_log_entry('CUDA SETUP: Something unexpected happened. Please compile from source:') - self.add_log_entry('git clone https://github.com/TimDettmers/bitsandbytes.git') - self.add_log_entry('cd bitsandbytes') - self.add_log_entry(make_cmd) - self.add_log_entry('python setup.py install') - - def initialize(self): - if not getattr(self, 'initialized', False): - self.has_printed = False - self.lib = None - self.initialized = False - self.error = False - - def manual_override(self): - if not torch.cuda.is_available(): - return - override_value = os.environ.get('BNB_CUDA_VERSION') - if not override_value: - return - - binary_name_stem, _, binary_name_ext = self.binary_name.rpartition(".") - # `binary_name_stem` will now be e.g. `/foo/bar/libbitsandbytes_cuda118`; - # let's remove any trailing numbers: - binary_name_stem = binary_name_stem.rstrip("0123456789") - # `binary_name_stem` will now be e.g. `/foo/bar/libbitsandbytes_cuda`; - # let's tack the new version number and the original extension back on. 
- self.binary_name = f"{binary_name_stem}{override_value}.{binary_name_ext}" - - warn( - f'\n\n{"=" * 80}\n' - 'WARNING: Manual override via BNB_CUDA_VERSION env variable detected!\n' - 'BNB_CUDA_VERSION=XXX can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.\n' - 'If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=\n' - 'If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH\n' - 'For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH: Set[Path]: - return {Path(ld_path) for ld_path in paths_list_candidate.split(os.pathsep) if ld_path} - - -def remove_non_existent_dirs(candidate_paths: Set[Path]) -> Set[Path]: - existent_directories: Set[Path] = set() - for path in candidate_paths: - try: - if path.exists(): - existent_directories.add(path) - except PermissionError: - # Handle the PermissionError first as it is a subtype of OSError - # https://docs.python.org/3/library/exceptions.html#exception-hierarchy - pass - except OSError as exc: - if exc.errno != errno.ENAMETOOLONG: - raise exc - - non_existent_directories: Set[Path] = candidate_paths - existent_directories - if non_existent_directories: - CUDASetup.get_instance().add_log_entry( - f"The following directories listed in your path were found to be non-existent: {non_existent_directories}", - is_warning=False, - ) - - return existent_directories - - -def get_cuda_runtime_lib_paths(candidate_paths: Set[Path]) -> Set[Path]: - paths = set() - for libname in CUDA_RUNTIME_LIBS: - for path in candidate_paths: - try: - if (path / libname).is_file(): - paths.add(path / libname) - except PermissionError: - pass - return paths - - -def resolve_paths_list(paths_list_candidate: str) -> Set[Path]: - """ - Searches a given environmental var for the CUDA runtime library, - i.e. `libcudart.so`. - """ - return remove_non_existent_dirs(extract_candidate_paths(paths_list_candidate)) - - -def find_cuda_lib_in(paths_list_candidate: str) -> Set[Path]: - return get_cuda_runtime_lib_paths( - resolve_paths_list(paths_list_candidate) - ) - - -def warn_in_case_of_duplicates(results_paths: Set[Path]) -> None: - if len(results_paths) > 1: - warning_msg = ( - f"Found duplicate {CUDA_RUNTIME_LIBS} files: {results_paths}.. " - "We select the PyTorch default libcudart.so, which is {torch.version.cuda}," - "but this might mismatch with the CUDA version that is needed for bitsandbytes." - "To override this behavior set the BNB_CUDA_VERSION= environmental variable" - "For example, if you want to use the CUDA version 122" - "BNB_CUDA_VERSION=122 python ..." - "OR set the environmental variable in your .bashrc: export BNB_CUDA_VERSION=122" - "In the case of a manual override, make sure you set the LD_LIBRARY_PATH, e.g." - "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12.2") - CUDASetup.get_instance().add_log_entry(warning_msg, is_warning=True) - - -def determine_cuda_runtime_lib_path() -> Union[Path, None]: - """ - Searches for a cuda installations, in the following order of priority: - 1. active conda env - 2. LD_LIBRARY_PATH - 3. any other env vars, while ignoring those that - - are known to be unrelated (see `bnb.cuda_setup.env_vars.to_be_ignored`) - - don't contain the path separator `/` - - If multiple libraries are found in part 3, we optimistically try one, - while giving a warning message. 
- """ - candidate_env_vars = get_potentially_lib_path_containing_env_vars() - - cuda_runtime_libs = set() - if "CONDA_PREFIX" in candidate_env_vars: - conda_libs_path = Path(candidate_env_vars["CONDA_PREFIX"]) / "lib" - - conda_cuda_libs = find_cuda_lib_in(str(conda_libs_path)) - warn_in_case_of_duplicates(conda_cuda_libs) - - if conda_cuda_libs: - cuda_runtime_libs.update(conda_cuda_libs) - - CUDASetup.get_instance().add_log_entry(f'{candidate_env_vars["CONDA_PREFIX"]} did not contain ' - f'{CUDA_RUNTIME_LIBS} as expected! Searching further paths...', is_warning=True) - - if "LD_LIBRARY_PATH" in candidate_env_vars: - lib_ld_cuda_libs = find_cuda_lib_in(candidate_env_vars["LD_LIBRARY_PATH"]) - - if lib_ld_cuda_libs: - cuda_runtime_libs.update(lib_ld_cuda_libs) - warn_in_case_of_duplicates(lib_ld_cuda_libs) - - CUDASetup.get_instance().add_log_entry(f'{candidate_env_vars["LD_LIBRARY_PATH"]} did not contain ' - f'{CUDA_RUNTIME_LIBS} as expected! Searching further paths...', is_warning=True) - - remaining_candidate_env_vars = { - env_var: value for env_var, value in candidate_env_vars.items() - if env_var not in {"CONDA_PREFIX", "LD_LIBRARY_PATH"} - } - - cuda_runtime_libs = set() - for env_var, value in remaining_candidate_env_vars.items(): - cuda_runtime_libs.update(find_cuda_lib_in(value)) - - if len(cuda_runtime_libs) == 0: - CUDASetup.get_instance().add_log_entry('CUDA_SETUP: WARNING! libcudart.so not found in any environmental path. Searching in backup paths...') - cuda_runtime_libs.update(find_cuda_lib_in('/usr/local/cuda/lib64')) - - warn_in_case_of_duplicates(cuda_runtime_libs) - - cuda_setup = CUDASetup.get_instance() - cuda_setup.add_log_entry(f'DEBUG: Possible options found for libcudart.so: {cuda_runtime_libs}') - - return next(iter(cuda_runtime_libs)) if cuda_runtime_libs else None - - -# https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART____VERSION.html#group__CUDART____VERSION -def get_cuda_version(): - major, minor = map(int, torch.version.cuda.split(".")) - - if major < 11: - CUDASetup.get_instance().add_log_entry('CUDA SETUP: CUDA version lower than 11 are currently not supported for LLM.int8(). You will be only to use 8-bit optimizers and quantization routines!!') - - return f'{major}{minor}' - -def get_compute_capabilities(): - ccs = [] - for i in range(torch.cuda.device_count()): - cc_major, cc_minor = torch.cuda.get_device_capability(torch.cuda.device(i)) - ccs.append(f"{cc_major}.{cc_minor}") - - ccs.sort(key=lambda v: tuple(map(int, str(v).split(".")))) - - return ccs - - -def evaluate_cuda_setup(): - cuda_setup = CUDASetup.get_instance() - if 'BITSANDBYTES_NOWELCOME' not in os.environ or str(os.environ['BITSANDBYTES_NOWELCOME']) == '0': - cuda_setup.add_log_entry('') - cuda_setup.add_log_entry('='*35 + 'BUG REPORT' + '='*35) - cuda_setup.add_log_entry(('Welcome to bitsandbytes. 
For bug reports, please run\n\npython -m bitsandbytes\n\n'), - ('and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues')) - cuda_setup.add_log_entry('='*80) - - if not torch.cuda.is_available(): - return f'libbitsandbytes_cpu{DYNAMIC_LIBRARY_SUFFIX}', None, None, None - - cudart_path = determine_cuda_runtime_lib_path() - cc = get_compute_capabilities()[-1] # we take the highest capability - cuda_version_string = get_cuda_version() - - cuda_setup.add_log_entry(f"CUDA SETUP: PyTorch settings found: CUDA_VERSION={cuda_version_string}, Highest Compute Capability: {cc}.") - cuda_setup.add_log_entry( - "CUDA SETUP: To manually override the PyTorch CUDA version please see:" - "https://github.com/TimDettmers/bitsandbytes/blob/main/how_to_use_nonpytorch_cuda.md" - ) - - - # 7.5 is the minimum CC vor cublaslt - has_cublaslt = is_cublasLt_compatible(cc) - - # TODO: - # (1) CUDA missing cases (no CUDA installed by CUDA driver (nvidia-smi accessible) - # (2) Multiple CUDA versions installed - - # we use ls -l instead of nvcc to determine the cuda version - # since most installations will have the libcudart.so installed, but not the compiler - - binary_name = f"libbitsandbytes_cuda{cuda_version_string}" - if not has_cublaslt: - # if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt - binary_name += "_nocublaslt" - - binary_name = f"{binary_name}{DYNAMIC_LIBRARY_SUFFIX}" - - return binary_name, cudart_path, cc, cuda_version_string diff --git a/bitsandbytes/cuda_specs.py b/bitsandbytes/cuda_specs.py new file mode 100644 index 000000000..ed19795a0 --- /dev/null +++ b/bitsandbytes/cuda_specs.py @@ -0,0 +1,41 @@ +import dataclasses +from typing import List, Optional, Tuple + +import torch + + +@dataclasses.dataclass(frozen=True) +class CUDASpecs: + highest_compute_capability: Tuple[int, int] + cuda_version_string: str + cuda_version_tuple: Tuple[int, int] + + @property + def has_cublaslt(self) -> bool: + return self.highest_compute_capability >= (7, 5) + + +def get_compute_capabilities() -> List[Tuple[int, int]]: + return sorted(torch.cuda.get_device_capability(torch.cuda.device(i)) for i in range(torch.cuda.device_count())) + + +def get_cuda_version_tuple() -> Tuple[int, int]: + # https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART____VERSION.html#group__CUDART____VERSION + major, minor = map(int, torch.version.cuda.split(".")) + return major, minor + + +def get_cuda_version_string() -> str: + major, minor = get_cuda_version_tuple() + return f"{major}{minor}" + + +def get_cuda_specs() -> Optional[CUDASpecs]: + if not torch.cuda.is_available(): + return None + + return CUDASpecs( + highest_compute_capability=(get_compute_capabilities()[-1]), + cuda_version_string=(get_cuda_version_string()), + cuda_version_tuple=get_cuda_version_tuple(), + ) diff --git a/bitsandbytes/cuda_setup/__init__.py b/bitsandbytes/diagnostics/__init__.py similarity index 100% rename from bitsandbytes/cuda_setup/__init__.py rename to bitsandbytes/diagnostics/__init__.py diff --git a/bitsandbytes/diagnostics/cuda.py b/bitsandbytes/diagnostics/cuda.py new file mode 100644 index 000000000..adb4cfde3 --- /dev/null +++ b/bitsandbytes/diagnostics/cuda.py @@ -0,0 +1,169 @@ +import logging +import os +from pathlib import Path +from typing import Dict, Iterable, Iterator + +import torch + +from bitsandbytes.cextension import get_cuda_bnb_library_path +from bitsandbytes.consts import NONPYTORCH_DOC_URL +from bitsandbytes.cuda_specs import CUDASpecs +from 
bitsandbytes.diagnostics.utils import print_dedented + +CUDART_PATH_PREFERRED_ENVVARS = ("CONDA_PREFIX", "LD_LIBRARY_PATH") + +CUDART_PATH_IGNORED_ENVVARS = { + "DBUS_SESSION_BUS_ADDRESS", # hardware related + "GOOGLE_VM_CONFIG_LOCK_FILE", # GCP: requires elevated permissions, causing problems in VMs and Jupyter notebooks + "HOME", # Linux shell default + "LESSCLOSE", + "LESSOPEN", # related to the `less` command + "MAIL", # something related to emails + "OLDPWD", + "PATH", # this is for finding binaries, not libraries + "PWD", # PWD: this is how the shell keeps track of the current working dir + "SHELL", # binary for currently invoked shell + "SSH_AUTH_SOCK", # SSH stuff, therefore unrelated + "SSH_TTY", + "TMUX", # Terminal Multiplexer + "XDG_DATA_DIRS", # XDG: Desktop environment stuff + "XDG_GREETER_DATA_DIR", # XDG: Desktop environment stuff + "XDG_RUNTIME_DIR", + "_", # current Python interpreter +} + +CUDA_RUNTIME_LIB_PATTERNS = ( + "cudart64*.dll", # Windows + "libcudart*.so*", # libcudart.so, libcudart.so.11.0, libcudart.so.12.0, libcudart.so.12.1, libcudart.so.12.2 etc. + "nvcuda*.dll", # Windows +) + +logger = logging.getLogger(__name__) + + +def find_cuda_libraries_in_path_list(paths_list_candidate: str) -> Iterable[Path]: + for dir_string in paths_list_candidate.split(os.pathsep): + if not dir_string: + continue + try: + dir = Path(dir_string) + if not dir.exists(): + logger.warning(f"The directory listed in your path is found to be non-existent: {dir}") + continue + for lib_pattern in CUDA_RUNTIME_LIB_PATTERNS: + for pth in dir.glob(lib_pattern): + if pth.is_file(): + yield pth + except PermissionError: + pass + + +def is_relevant_candidate_env_var(env_var: str, value: str) -> bool: + return ( + env_var in CUDART_PATH_PREFERRED_ENVVARS # is a preferred location + or ( + os.sep in value # might contain a path + and "CONDA" not in env_var # not another conda envvar + and env_var not in CUDART_PATH_IGNORED_ENVVARS # not ignored + ) + ) + + +def get_potentially_lib_path_containing_env_vars() -> Dict[str, str]: + return {env_var: value for env_var, value in os.environ.items() if is_relevant_candidate_env_var(env_var, value)} + + +def find_cudart_libraries() -> Iterator[Path]: + """ + Searches for a cuda installations, in the following order of priority: + 1. active conda env + 2. LD_LIBRARY_PATH + 3. any other env vars, while ignoring those that + - are known to be unrelated + - don't contain the path separator `/` + + If multiple libraries are found in part 3, we optimistically try one, + while giving a warning message. + """ + candidate_env_vars = get_potentially_lib_path_containing_env_vars() + + for envvar in CUDART_PATH_PREFERRED_ENVVARS: + if envvar in candidate_env_vars: + directory = candidate_env_vars[envvar] + yield from find_cuda_libraries_in_path_list(directory) + candidate_env_vars.pop(envvar) + + for env_var, value in candidate_env_vars.items(): + yield from find_cuda_libraries_in_path_list(value) + + +def print_cuda_diagnostics(cuda_specs: CUDASpecs) -> None: + print( + f"PyTorch settings found: CUDA_VERSION={cuda_specs.cuda_version_string}, " + f"Highest Compute Capability: {cuda_specs.highest_compute_capability}.", + ) + + binary_path = get_cuda_bnb_library_path(cuda_specs) + if not binary_path.exists(): + print_dedented( + f""" + Library not found: {binary_path}. Maybe you need to compile it from source? + If you compiled from source, try again with `make CUDA_VERSION=DETECTED_CUDA_VERSION`, + for example, `make CUDA_VERSION=113`. 
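A hedged usage sketch of the discovery helpers defined above, assuming this revision of bitsandbytes is installed; the directory passed to `find_cuda_libraries_in_path_list` is only an example, and the output depends entirely on the local environment:

from bitsandbytes.diagnostics.cuda import (
    find_cuda_libraries_in_path_list,
    find_cudart_libraries,
)

# Candidate CUDA runtimes reachable via CONDA_PREFIX, LD_LIBRARY_PATH and the
# remaining path-like environment variables, preferred sources first.
for runtime_path in find_cudart_libraries():
    print(runtime_path)

# The same glob patterns can be applied to any explicit os.pathsep-separated list.
print(list(find_cuda_libraries_in_path_list("/usr/local/cuda/lib64")))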
+ + The CUDA version for the compile might depend on your conda install, if using conda. + Inspect CUDA version via `conda list | grep cuda`. + """ + ) + + cuda_major, cuda_minor = cuda_specs.cuda_version_tuple + if cuda_major < 11: + print_dedented( + """ + WARNING: CUDA versions lower than 11 are currently not supported for LLM.int8(). + You will be only to use 8-bit optimizers and quantization routines! + """ + ) + + print(f"To manually override the PyTorch CUDA version please see: {NONPYTORCH_DOC_URL}") + + # 7.5 is the minimum CC for cublaslt + if not cuda_specs.has_cublaslt: + print_dedented( + """ + WARNING: Compute capability < 7.5 detected! Only slow 8-bit matmul is supported for your GPU! + If you run into issues with 8-bit matmul, you can try 4-bit quantization: + https://huggingface.co/blog/4bit-transformers-bitsandbytes + """, + ) + + # TODO: + # (1) CUDA missing cases (no CUDA installed by CUDA driver (nvidia-smi accessible) + # (2) Multiple CUDA versions installed + + +def print_cuda_runtime_diagnostics() -> None: + cudart_paths = list(find_cudart_libraries()) + if not cudart_paths: + print("CUDA SETUP: WARNING! CUDA runtime files not found in any environmental path.") + elif len(cudart_paths) > 1: + print_dedented( + f""" + Found duplicate CUDA runtime files (see below). + + We select the PyTorch default CUDA runtime, which is {torch.version.cuda}, + but this might mismatch with the CUDA version that is needed for bitsandbytes. + To override this behavior set the `BNB_CUDA_VERSION=` environmental variable. + + For example, if you want to use the CUDA version 122, + BNB_CUDA_VERSION=122 python ... + + OR set the environmental variable in your .bashrc: + export BNB_CUDA_VERSION=122 + + In the case of a manual override, make sure you set LD_LIBRARY_PATH, e.g. + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.2, + """ + ) + for pth in cudart_paths: + print(f"* Found CUDA runtime at: {pth}") diff --git a/bitsandbytes/diagnostics/main.py b/bitsandbytes/diagnostics/main.py new file mode 100644 index 000000000..a7f0c901e --- /dev/null +++ b/bitsandbytes/diagnostics/main.py @@ -0,0 +1,70 @@ +import sys +import traceback + +import torch + +from bitsandbytes.consts import PACKAGE_GITHUB_URL +from bitsandbytes.cuda_specs import get_cuda_specs +from bitsandbytes.diagnostics.cuda import ( + print_cuda_diagnostics, + print_cuda_runtime_diagnostics, +) +from bitsandbytes.diagnostics.utils import print_dedented, print_header + + +def sanity_check(): + from bitsandbytes.optim import Adam + + p = torch.nn.Parameter(torch.rand(10, 10).cuda()) + a = torch.rand(10, 10).cuda() + p1 = p.data.sum().item() + adam = Adam([p]) + out = a * p + loss = out.sum() + loss.backward() + adam.step() + p2 = p.data.sum().item() + assert p1 != p2 + + +def main(): + print_header("") + print_header("BUG REPORT INFORMATION") + print_header("") + + print_header("OTHER") + cuda_specs = get_cuda_specs() + print("CUDA specs:", cuda_specs) + if not torch.cuda.is_available(): + print("Torch says CUDA is not available. Possible reasons:") + print("1. CUDA driver not installed") + print("2. CUDA not installed") + print("3. 
You have multiple conflicting CUDA libraries") + if cuda_specs: + print_cuda_diagnostics(cuda_specs) + print_cuda_runtime_diagnostics() + print_header("") + print_header("DEBUG INFO END") + print_header("") + print("Checking that the library is importable and CUDA is callable...") + try: + sanity_check() + print("SUCCESS!") + print("Installation was successful!") + return + except ImportError: + print( + f"WARNING: {__package__} is currently running as CPU-only!\n" + "Therefore, 8-bit optimizers and GPU quantization are unavailable.\n\n" + f"If you think that this is so erroneously,\nplease report an issue!" + ) + except Exception: + traceback.print_exc() + print_dedented( + f""" + Above we output some debug information. + Please provide this info when creating an issue via {PACKAGE_GITHUB_URL}/issues/new/choose + WARNING: Please be sure to sanitize sensitive info from the output before posting it. + """ + ) + sys.exit(1) diff --git a/bitsandbytes/diagnostics/utils.py b/bitsandbytes/diagnostics/utils.py new file mode 100644 index 000000000..770209b9d --- /dev/null +++ b/bitsandbytes/diagnostics/utils.py @@ -0,0 +1,12 @@ +import textwrap + +HEADER_WIDTH = 60 + + +def print_header(txt: str, width: int = HEADER_WIDTH, filler: str = "+") -> None: + txt = f" {txt} " if txt else "" + print(txt.center(width, filler)) + + +def print_dedented(text): + print("\n".join(textwrap.dedent(text).strip().split("\n"))) diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index f0de962e1..61d0d83b2 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -14,7 +14,7 @@ from bitsandbytes.utils import pack_dict_to_tensor, unpack_tensor_to_dict -from .cextension import COMPILED_WITH_CUDA, lib +from .cextension import lib # math.prod not compatible with python < 3.8 @@ -23,7 +23,7 @@ def prod(iterable): name2qmap = {} -if COMPILED_WITH_CUDA: +if lib and lib.compiled_with_cuda: """C FUNCTIONS FOR OPTIMIZERS""" str2optimizer32bit = { "adam": ( diff --git a/bitsandbytes/optim/__init__.py b/bitsandbytes/optim/__init__.py index 6796b8e0e..b4c95793a 100644 --- a/bitsandbytes/optim/__init__.py +++ b/bitsandbytes/optim/__init__.py @@ -3,8 +3,6 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from bitsandbytes.cextension import COMPILED_WITH_CUDA - from .adagrad import Adagrad, Adagrad8bit, Adagrad32bit from .adam import Adam, Adam8bit, Adam32bit, PagedAdam, PagedAdam8bit, PagedAdam32bit from .adamw import ( diff --git a/tests/test_cuda_setup_evaluator.py b/tests/test_cuda_setup_evaluator.py index 189aa75b5..cb0b38fdd 100644 --- a/tests/test_cuda_setup_evaluator.py +++ b/tests/test_cuda_setup_evaluator.py @@ -1,21 +1,41 @@ -import os -from pathlib import Path +import pytest -import torch +from bitsandbytes.cextension import get_cuda_bnb_library_path +from bitsandbytes.cuda_specs import CUDASpecs -# hardcoded test. Not good, but a sanity check for now -# TODO: improve this -def test_manual_override(requires_cuda): - manual_cuda_path = str(Path('/mmfs1/home/dettmers/data/local/cuda-12.2')) +@pytest.fixture +def cuda120_spec() -> CUDASpecs: + return CUDASpecs( + cuda_version_string="120", + highest_compute_capability=(8, 6), + cuda_version_tuple=(12, 0), + ) - pytorch_version = torch.version.cuda.replace('.', '') - assert pytorch_version != 122 # TODO: this will never be true... 
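The rewritten test module below pins down the library naming scheme; the same behaviour can also be checked interactively. A minimal sketch with example capability and version values only (on Linux the suffix resolves to `.so`):

import os

from bitsandbytes.cextension import get_cuda_bnb_library_path
from bitsandbytes.cuda_specs import CUDASpecs

specs = CUDASpecs(
    highest_compute_capability=(8, 6),  # >= (7, 5), so the cuBLASLt build is selected
    cuda_version_string="120",
    cuda_version_tuple=(12, 0),
)
print(get_cuda_bnb_library_path(specs).name)  # e.g. libbitsandbytes_cuda120.so

# BNB_CUDA_VERSION replaces only the trailing version digits of the stem.
os.environ["BNB_CUDA_VERSION"] = "118"
print(get_cuda_bnb_library_path(specs).name)  # e.g. libbitsandbytes_cuda118.so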
+@pytest.fixture +def cuda111_noblas_spec() -> CUDASpecs: + return CUDASpecs( + cuda_version_string="111", + highest_compute_capability=(7, 2), + cuda_version_tuple=(11, 1), + ) - os.environ['CUDA_HOME']='{manual_cuda_path}' - os.environ['BNB_CUDA_VERSION']='122' - #assert str(manual_cuda_path) in os.environ['LD_LIBRARY_PATH'] - import bitsandbytes as bnb - loaded_lib = bnb.cuda_setup.main.CUDASetup.get_instance().binary_name - #assert loaded_lib == 'libbitsandbytes_cuda122.so' + +def test_get_cuda_bnb_library_path(monkeypatch, cuda120_spec): + monkeypatch.delenv("BNB_CUDA_VERSION", raising=False) + assert get_cuda_bnb_library_path(cuda120_spec).stem == "libbitsandbytes_cuda120" + + +def test_get_cuda_bnb_library_path_override(monkeypatch, cuda120_spec, caplog): + monkeypatch.setenv("BNB_CUDA_VERSION", "110") + assert get_cuda_bnb_library_path(cuda120_spec).stem == "libbitsandbytes_cuda110" + assert "BNB_CUDA_VERSION" in caplog.text # did we get the warning? + + +def test_get_cuda_bnb_library_path_nocublaslt(monkeypatch, cuda111_noblas_spec): + monkeypatch.delenv("BNB_CUDA_VERSION", raising=False) + assert ( + get_cuda_bnb_library_path(cuda111_noblas_spec).stem + == "libbitsandbytes_cuda111_nocublaslt" + ) From 6a5a18a1d4badcd0d888c6de752599a05c820ade Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Wed, 13 Mar 2024 10:58:24 +0200 Subject: [PATCH 086/112] Sanity check: Add check for `lib` being None --- bitsandbytes/diagnostics/main.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/bitsandbytes/diagnostics/main.py b/bitsandbytes/diagnostics/main.py index a7f0c901e..7a88bca26 100644 --- a/bitsandbytes/diagnostics/main.py +++ b/bitsandbytes/diagnostics/main.py @@ -13,6 +13,21 @@ def sanity_check(): + from bitsandbytes.cextension import lib + + if lib is None: + print_dedented( + """ + Couldn't load the bitsandbytes library, likely due to missing binaries. + Please ensure bitsandbytes is properly installed. + + For source installations, compile the binaries with `cmake -DCOMPUTE_BACKEND=cuda -S .`. + See the documentation for more details if needed. + + Trying a simple check anyway, but this will likely fail... 
+ """ + ) + from bitsandbytes.optim import Adam p = torch.nn.Parameter(torch.rand(10, 10).cuda()) From 79d1cccc7fb90eaa3d65aebabe99050e343306b3 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Wed, 13 Mar 2024 11:15:18 +0200 Subject: [PATCH 087/112] Improve filtering for values that are surely not paths --- bitsandbytes/diagnostics/cuda.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/bitsandbytes/diagnostics/cuda.py b/bitsandbytes/diagnostics/cuda.py index adb4cfde3..d65f80d8b 100644 --- a/bitsandbytes/diagnostics/cuda.py +++ b/bitsandbytes/diagnostics/cuda.py @@ -45,11 +45,16 @@ def find_cuda_libraries_in_path_list(paths_list_candidate: str) -> Iterable[Path for dir_string in paths_list_candidate.split(os.pathsep): if not dir_string: continue + if os.sep not in dir_string: + continue try: dir = Path(dir_string) - if not dir.exists(): - logger.warning(f"The directory listed in your path is found to be non-existent: {dir}") - continue + try: + if not dir.exists(): + logger.warning(f"The directory listed in your path is found to be non-existent: {dir}") + continue + except OSError: # Assume an esoteric error trying to poke at the directory + pass for lib_pattern in CUDA_RUNTIME_LIB_PATTERNS: for pth in dir.glob(lib_pattern): if pth.is_file(): @@ -63,8 +68,10 @@ def is_relevant_candidate_env_var(env_var: str, value: str) -> bool: env_var in CUDART_PATH_PREFERRED_ENVVARS # is a preferred location or ( os.sep in value # might contain a path - and "CONDA" not in env_var # not another conda envvar and env_var not in CUDART_PATH_IGNORED_ENVVARS # not ignored + and "CONDA" not in env_var # not another conda envvar + and "BASH_FUNC" not in env_var # not a bash function defined via envvar + and "\n" not in value # likely e.g. a script or something? 
) ) From 02e30ca6e4d629d9cddc87886acf06207f07f803 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Sat, 24 Feb 2024 11:54:14 +0200 Subject: [PATCH 088/112] Upgrade Ruff + configure formatting --- .pre-commit-config.yaml | 4 ++-- pyproject.toml | 9 ++++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c8ccfe8df..a859d05af 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,11 +1,11 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.2.0 + rev: v0.3.2 hooks: - id: ruff args: - --fix - # - id: ruff-format # TODO: enable when the time is right + - id: ruff-format - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.5.0 hooks: diff --git a/pyproject.toml b/pyproject.toml index f74750720..609ff84fa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,6 +8,10 @@ src = [ "tests", "benchmarking" ] +target-version = "py38" +line-length = 119 + +[tool.ruff.lint] select = [ "B", # bugbear: security warnings "E", # pycodestyle @@ -17,7 +21,6 @@ select = [ "UP", # alert you when better syntax is available in your python version "RUF", # the ruff developer's own rules ] -target-version = "py38" ignore = [ "B007", # Loop control variable not used within the loop body (TODO: enable) "B028", # Warning without stacklevel (TODO: enable) @@ -30,7 +33,7 @@ ignore = [ ] ignore-init-module-imports = true # allow to expose in __init__.py via imports -[tool.ruff.extend-per-file-ignores] +[tool.ruff.lint.extend-per-file-ignores] "**/__init__.py" = ["F401"] # allow unused imports in __init__.py "{benchmarking,tests}/**/*.py" = [ "B007", @@ -42,7 +45,7 @@ ignore-init-module-imports = true # allow to expose in __init__.py via imports "UP030", ] -[tool.ruff.isort] +[tool.ruff.lint.isort] combine-as-imports = true detect-same-package = true force-sort-within-sections = true From 5a4263f4dc05fe8f78f4111beab9f68a81deeab1 Mon Sep 17 00:00:00 2001 From: Ruff Date: Sat, 24 Feb 2024 12:01:15 +0200 Subject: [PATCH 089/112] Reformat with ruff-format --- .github/scripts/set_platform_tag.py | 4 +- .../switchback/make_plot_with_jsonl.py | 122 +- benchmarking/switchback/speed_benchmark.py | 122 +- bitsandbytes/autograd/_functions.py | 50 +- bitsandbytes/cextension.py | 6 +- bitsandbytes/diagnostics/cuda.py | 6 +- bitsandbytes/diagnostics/main.py | 6 +- bitsandbytes/functional.py | 1020 +++++++++++------ bitsandbytes/nn/modules.py | 262 +++-- bitsandbytes/nn/triton_based_modules.py | 72 +- bitsandbytes/optim/adagrad.py | 12 +- bitsandbytes/optim/adam.py | 255 ++++- bitsandbytes/optim/adamw.py | 193 +++- bitsandbytes/optim/lars.py | 20 +- bitsandbytes/optim/lion.py | 171 ++- bitsandbytes/optim/optimizer.py | 172 +-- bitsandbytes/optim/rmsprop.py | 12 +- bitsandbytes/research/autograd/_functions.py | 47 +- bitsandbytes/research/nn/modules.py | 19 +- bitsandbytes/triton/dequantize_rowwise.py | 38 +- .../triton/int8_matmul_mixed_dequantize.py | 148 ++- .../triton/int8_matmul_rowwise_dequantize.py | 147 ++- .../quantize_columnwise_and_transpose.py | 48 +- bitsandbytes/triton/quantize_global.py | 81 +- bitsandbytes/triton/quantize_rowwise.py | 37 +- bitsandbytes/utils.py | 28 +- check_bnb_install.py | 10 +- examples/int8_inference_huggingface.py | 13 +- install_cuda.py | 16 +- scripts/stale.py | 3 +- tests/test_autograd.py | 140 +-- tests/test_cuda_setup_evaluator.py | 5 +- tests/test_functional.py | 777 ++++++------- tests/test_generation.py | 75 +- tests/test_linear4bit.py | 12 +- tests/test_linear8bitlt.py | 
18 +- tests/test_modules.py | 148 ++- tests/test_optim.py | 91 +- tests/test_triton.py | 19 +- 39 files changed, 2653 insertions(+), 1772 deletions(-) diff --git a/.github/scripts/set_platform_tag.py b/.github/scripts/set_platform_tag.py index ca561c880..c82077074 100644 --- a/.github/scripts/set_platform_tag.py +++ b/.github/scripts/set_platform_tag.py @@ -7,9 +7,7 @@ def get_platform_tag(architecture): system = platform.system() if system == "Linux": - tag = ( - "manylinux_2_24_x86_64" if architecture == "x86_64" else "manylinux_2_24_aarch64" - ) + tag = "manylinux_2_24_x86_64" if architecture == "x86_64" else "manylinux_2_24_aarch64" elif system == "Darwin": tag = "macosx_13_1_x86_64" if architecture == "x86_64" else "macosx_13_1_arm64" elif system == "Windows": diff --git a/benchmarking/switchback/make_plot_with_jsonl.py b/benchmarking/switchback/make_plot_with_jsonl.py index b23f63562..fd0dd7d58 100644 --- a/benchmarking/switchback/make_plot_with_jsonl.py +++ b/benchmarking/switchback/make_plot_with_jsonl.py @@ -1,13 +1,11 @@ - import matplotlib.gridspec as gridspec import matplotlib.pyplot as plt import pandas as pd -cmap=plt.get_cmap('cool') - -if __name__ == '__main__': +cmap = plt.get_cmap("cool") - fig = plt.figure(tight_layout=True, figsize=(12,3.5)) +if __name__ == "__main__": + fig = plt.figure(tight_layout=True, figsize=(12, 3.5)) gs = gridspec.GridSpec(1, 2) dims_to_consider = [1024, 1280, 1408, 1664, 2048, 4096] @@ -19,25 +17,28 @@ ax = fig.add_subplot(gs[0, 0]) # TODO: change this to what you want. - rdf = pd.read_json('speed_benchmark/info_a100_py2.jsonl', lines=True) + rdf = pd.read_json("speed_benchmark/info_a100_py2.jsonl", lines=True) df = rdf[rdf.batch_size == batch_size_for_plot1] # first plot the time occupied by different operations for k, marker, ls, color, name in [ - ('standard_gx+standard_gw+standard_fwd', 's', '-', 'C2', 'Standard fp16 (sum of parts)'), - ('x_quantize_rowwise+g_quantize_rowwise+w_quantize_global+w_quantize_global_transpose+standard_gw+global_fwd+global_bwd', 'o', '-', 'C4', 'SwitchBack int8 (sum of parts)'), - - ('standard_fwd', '^', '--', 'C2', 'Matmul XW (standard)'), - ('standard_gw', '^', '-.', 'C2', 'Matmul GW (standard)'), - ('standard_gx', '^', ':', 'gray', 'Matmul GX (both)'), - - ('global_fwd', '^', '--', 'C4', 'Int8 Matmul XW (switchback)'), - ('global_bwd', '^', '-.', 'C4', 'Int8 Matmul GW (switchback)'), - - ('x_quantize_rowwise', 'P', '--', 'C4', 'Quantize rowwise X (switchback)'), - ('g_quantize_rowwise', 'P', '-.', 'C4', 'Quantize rowwise G (switchback)'), - ('w_quantize_global', '.', '--', 'C4', 'Quantize global W (switchback)'), - ('w_quantize_global_transpose', '.', '-.', 'C4', 'Quantize global and\ntranspose W (switchback)'), + ("standard_gx+standard_gw+standard_fwd", "s", "-", "C2", "Standard fp16 (sum of parts)"), + ( + "x_quantize_rowwise+g_quantize_rowwise+w_quantize_global+w_quantize_global_transpose+standard_gw+global_fwd+global_bwd", + "o", + "-", + "C4", + "SwitchBack int8 (sum of parts)", + ), + ("standard_fwd", "^", "--", "C2", "Matmul XW (standard)"), + ("standard_gw", "^", "-.", "C2", "Matmul GW (standard)"), + ("standard_gx", "^", ":", "gray", "Matmul GX (both)"), + ("global_fwd", "^", "--", "C4", "Int8 Matmul XW (switchback)"), + ("global_bwd", "^", "-.", "C4", "Int8 Matmul GW (switchback)"), + ("x_quantize_rowwise", "P", "--", "C4", "Quantize rowwise X (switchback)"), + ("g_quantize_rowwise", "P", "-.", "C4", "Quantize rowwise G (switchback)"), + ("w_quantize_global", ".", "--", "C4", "Quantize global W 
(switchback)"), + ("w_quantize_global_transpose", ".", "-.", "C4", "Quantize global and\ntranspose W (switchback)"), ]: xs = [] ys = [] @@ -47,40 +48,46 @@ df_ = df_[df_.dim_out == embed_dim * 4] xs.append(embed_dim) y_ = 0 - for k_ in k.split('+'): + for k_ in k.split("+"): y_ += df_[k_].values[0] df_ = df[df.dim_in == embed_dim * 4] df_ = df_[df_.dim_out == embed_dim] - for k_ in k.split('+'): + for k_ in k.split("+"): y_ += df_[k_].values[0] ys.append(y_ * 0.5) + ax.plot( + xs, + ys, + color=color, + label=name, + marker=marker, + markersize=5 if marker == "s" else 5, + linestyle=ls, + linewidth=2 if "+" in k else 1.0, + ) - ax.plot(xs, ys, color=color, label=name, marker=marker, markersize=5 if marker=='s' else 5, linestyle=ls, linewidth=2 if '+' in k else 1.) - - - ax.set_xlabel('dim', fontsize=13) - ax.set_ylabel('time (ms)', fontsize=13) + ax.set_xlabel("dim", fontsize=13) + ax.set_ylabel("time (ms)", fontsize=13) ax.grid() - ax.set_xscale('log') + ax.set_xscale("log") if logscale_plot1: - ax.set_yscale('log') + ax.set_yscale("log") - ax.tick_params(axis='x', labelsize=11) - ax.tick_params(axis='y', labelsize=11) + ax.tick_params(axis="x", labelsize=11) + ax.tick_params(axis="y", labelsize=11) ax.set_xticks(dims_to_xtick) ax.set_xticklabels(dims_to_xtick) ax.set_xticks([], minor=True) - leg = ax.legend(loc='upper center', bbox_to_anchor=(-0.64, 1.), ncol=1, fontsize=10) - leg.get_texts()[0].set_fontweight('bold') - leg.get_texts()[1].set_fontweight('bold') + leg = ax.legend(loc="upper center", bbox_to_anchor=(-0.64, 1.0), ncol=1, fontsize=10) + leg.get_texts()[0].set_fontweight("bold") + leg.get_texts()[1].set_fontweight("bold") plt.subplots_adjust(left=0.1) - ax.set_title(' Linear layer, batch * sequence length = 32k', fontsize=10, loc='left', y=1.05, pad=-20) - + ax.set_title(" Linear layer, batch * sequence length = 32k", fontsize=10, loc="left", y=1.05, pad=-20) ax = fig.add_subplot(gs[0, 1]) @@ -88,10 +95,15 @@ for j, batch_size in enumerate(batch_sizes_for_plot2): all_xs, all_ys = [], [] for k, marker, ls, color, name in [ - ('standard_gx+standard_gw+standard_fwd', 's', '-', 'C2', 'Standard fp16 (total time)'), - ('x_quantize_rowwise+g_quantize_rowwise+w_quantize_global+w_quantize_global_transpose+standard_gw+global_fwd+global_bwd', 'o', '-', 'C4', 'SwitchBack int8 (total time)'), + ("standard_gx+standard_gw+standard_fwd", "s", "-", "C2", "Standard fp16 (total time)"), + ( + "x_quantize_rowwise+g_quantize_rowwise+w_quantize_global+w_quantize_global_transpose+standard_gw+global_fwd+global_bwd", + "o", + "-", + "C4", + "SwitchBack int8 (total time)", + ), ]: - xs, ys = [], [] df = rdf[rdf.batch_size == batch_size] for embed_dim in dims_to_consider: @@ -99,11 +111,11 @@ df_ = df_[df_.dim_out == embed_dim * 4] xs.append(embed_dim) y_ = 0 - for k_ in k.split('+'): + for k_ in k.split("+"): y_ += df_[k_].values[0] df_ = df[df.dim_in == embed_dim * 4] df_ = df_[df_.dim_out == embed_dim] - for k_ in k.split('+'): + for k_ in k.split("+"): y_ += df_[k_].values[0] ys.append(y_ * 0.5) all_xs.append(xs) @@ -111,25 +123,29 @@ color = cmap(j * 0.25) real_ys = [-((all_ys[1][i] - all_ys[0][i]) / all_ys[0][i]) * 100 for i in range(len(all_ys[0]))] - markers = ['^', 'v', 'P', 'o'] - ax.plot(all_xs[0], real_ys, color=color, label=f'batch * sequence length = {batch_size}', marker=markers[j], markersize=5 if marker=='s' else 5) + markers = ["^", "v", "P", "o"] + ax.plot( + all_xs[0], + real_ys, + color=color, + label=f"batch * sequence length = {batch_size}", + marker=markers[j], + markersize=5 if 
marker == "s" else 5, + ) ax.legend() - ax.set_xlabel('dim', fontsize=13) - ax.set_xscale('log') + ax.set_xlabel("dim", fontsize=13) + ax.set_xscale("log") ax.grid() - ax.set_ylabel(r'% speedup', fontsize=13) + ax.set_ylabel(r"% speedup", fontsize=13) - - ax.tick_params(axis='x', labelsize=11) - ax.tick_params(axis='y', labelsize=11) + ax.tick_params(axis="x", labelsize=11) + ax.tick_params(axis="y", labelsize=11) ax.set_xticks(dims_to_xtick) ax.set_xticklabels(dims_to_xtick) ax.set_xticks([], minor=True) - ax.set_title(' Linear layer summary, varying dimensions', fontsize=10, loc='left', y=1.05, pad=-20) - - + ax.set_title(" Linear layer summary, varying dimensions", fontsize=10, loc="left", y=1.05, pad=-20) - plt.savefig('speed_benchmark/plot_with_info.pdf', bbox_inches='tight') + plt.savefig("speed_benchmark/plot_with_info.pdf", bbox_inches="tight") diff --git a/benchmarking/switchback/speed_benchmark.py b/benchmarking/switchback/speed_benchmark.py index c4f3cd4c6..eaba0e9cd 100644 --- a/benchmarking/switchback/speed_benchmark.py +++ b/benchmarking/switchback/speed_benchmark.py @@ -20,15 +20,15 @@ # KNOW ISSUE: need to optimize "w_quantize_colwise_transpose" when embeddim is too large. -def get_time(k, fn, info_dict): +def get_time(k, fn, info_dict): for _ in range(repeat // 2): - fn() + fn() torch.cuda.synchronize() start = time.time() for _ in range(repeat): - fn() + fn() torch.cuda.synchronize() end = time.time() @@ -36,16 +36,15 @@ def get_time(k, fn, info_dict): print(f"time {k}: {ms:.3f} ms") info_dict[k] = ms -if __name__ == '__main__': + +if __name__ == "__main__": torch.manual_seed(0) wm = 4 for dim in [1024, 1280, 1408, 1664, 2048, 4096]: # note "batch_size" is actually "batch_size * embed_dim", which is why it's large - for batch_size in [256*32, 256*64, 256*128, 256*256, 256*512]: - + for batch_size in [256 * 32, 256 * 64, 256 * 128, 256 * 256, 256 * 512]: # switch switches dim_in and dim_out for switch in [False, True]: - # hparams repeat = 64 batch_size = batch_size @@ -73,35 +72,86 @@ def get_time(k, fn, info_dict): state_w_rowwise = w.max(dim=1)[0] state_w_global = w.max() - info = {'repeat' : repeat, 'batch_size' : batch_size, 'dim_out' : dim_out, 'dim_in' : dim_in, 'wm' : wm, 'switch' : switch} - - get_time('standard_fwd', lambda : x.matmul(w.t()), info) - get_time('standard_gw', lambda : g.t().matmul(x), info) - get_time('standard_gx', lambda : g.matmul(w), info) - get_time('rowwise_fwd', lambda : int8_matmul_rowwise_dequantize(x_int8, w_int8.t(), state_x_rowwise, state_w_columnwise, None), info) - get_time('rowwise_bwd', lambda : int8_matmul_rowwise_dequantize(g_int8, wt_int8.t(), state_x_rowwise, state_w_rowwise, None), info) - get_time('global_fwd', lambda : int8_matmul_mixed_dequantize(x_int8, w_int8.t(), state_x_rowwise, state_w_global, None), info) - get_time('global_bwd', lambda : int8_matmul_mixed_dequantize(g_int8, wt_int8.t(), state_x_rowwise, state_w_global, None), info) - get_time('x_quantize_rowwise', lambda : quantize_rowwise(x), info) - get_time('g_quantize_rowwise', lambda : quantize_rowwise(g), info) - get_time('w_quantize_rowwise', lambda : quantize_rowwise(w), info) - get_time('w_quantize_colwise_transpose', lambda : quantize_columnwise_and_transpose(w), info) - get_time('w_quantize_global', lambda : quantize_global(w), info) - get_time('w_quantize_global_transpose', lambda : quantize_global_transpose(w), info) - - time_standard = info['standard_fwd'] + info['standard_gx'] + info['standard_gw'] - time_rowwise = info['x_quantize_rowwise'] + 
info['g_quantize_rowwise'] + info['w_quantize_colwise_transpose'] + info['w_quantize_rowwise'] + info['standard_gw'] + info['rowwise_fwd'] + info['rowwise_bwd'] - time_global = info['x_quantize_rowwise'] + info['g_quantize_rowwise'] + info['w_quantize_global'] + info['w_quantize_global_transpose'] + info['standard_gw'] + info['global_fwd'] + info['global_bwd'] - - print('TOTAL STANDARD', time_standard) - print('TOTAL ROWWISE', time_rowwise) - print('TOTAL GLOBAL', time_global) - - print('speedup', -100*(time_global - time_standard)/time_standard) - - info['time_standard'] = time_standard - info['time_rowwise'] = time_rowwise - info['time_global'] = time_global + info = { + "repeat": repeat, + "batch_size": batch_size, + "dim_out": dim_out, + "dim_in": dim_in, + "wm": wm, + "switch": switch, + } + + get_time("standard_fwd", lambda: x.matmul(w.t()), info) + get_time("standard_gw", lambda: g.t().matmul(x), info) + get_time("standard_gx", lambda: g.matmul(w), info) + get_time( + "rowwise_fwd", + lambda: int8_matmul_rowwise_dequantize( + x_int8, + w_int8.t(), + state_x_rowwise, + state_w_columnwise, + None, + ), + info, + ) + get_time( + "rowwise_bwd", + lambda: int8_matmul_rowwise_dequantize( + g_int8, + wt_int8.t(), + state_x_rowwise, + state_w_rowwise, + None, + ), + info, + ) + get_time( + "global_fwd", + lambda: int8_matmul_mixed_dequantize(x_int8, w_int8.t(), state_x_rowwise, state_w_global, None), + info, + ) + get_time( + "global_bwd", + lambda: int8_matmul_mixed_dequantize(g_int8, wt_int8.t(), state_x_rowwise, state_w_global, None), + info, + ) + get_time("x_quantize_rowwise", lambda: quantize_rowwise(x), info) + get_time("g_quantize_rowwise", lambda: quantize_rowwise(g), info) + get_time("w_quantize_rowwise", lambda: quantize_rowwise(w), info) + get_time("w_quantize_colwise_transpose", lambda: quantize_columnwise_and_transpose(w), info) + get_time("w_quantize_global", lambda: quantize_global(w), info) + get_time("w_quantize_global_transpose", lambda: quantize_global_transpose(w), info) + + time_standard = info["standard_fwd"] + info["standard_gx"] + info["standard_gw"] + time_rowwise = ( + info["x_quantize_rowwise"] + + info["g_quantize_rowwise"] + + info["w_quantize_colwise_transpose"] + + info["w_quantize_rowwise"] + + info["standard_gw"] + + info["rowwise_fwd"] + + info["rowwise_bwd"] + ) + time_global = ( + info["x_quantize_rowwise"] + + info["g_quantize_rowwise"] + + info["w_quantize_global"] + + info["w_quantize_global_transpose"] + + info["standard_gw"] + + info["global_fwd"] + + info["global_bwd"] + ) + + print("TOTAL STANDARD", time_standard) + print("TOTAL ROWWISE", time_rowwise) + print("TOTAL GLOBAL", time_global) + + print("speedup", -100 * (time_global - time_standard) / time_standard) + + info["time_standard"] = time_standard + info["time_rowwise"] = time_rowwise + info["time_global"] = time_global info_json = json.dumps(info) diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py index 6cbb6efd9..e9821cd36 100644 --- a/bitsandbytes/autograd/_functions.py +++ b/bitsandbytes/autograd/_functions.py @@ -14,16 +14,18 @@ def prod(iterable): return reduce(operator.mul, iterable, 1) + # The inverse transformation for the colTuring and colAmpere format were contributed by Alex Borzunov: # https://github.com/bigscience-workshop/petals/blob/main/src/petals/utils/linear8bitlt_patch.py - """ This class pools outlier dimensions across layers. 
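# --- Illustrative aside (not part of the patch): the timing pattern used by
# --- speed_benchmark.py above -- warm-up iterations, then torch.cuda.synchronize()
# --- around the timed loop so the wall-clock time covers kernel execution, not just
# --- kernel launches. A minimal sketch; assumes a CUDA device, sizes are made up.
import time
import torch

def time_ms(fn, repeat=64):
    for _ in range(repeat // 2):   # warm-up: lazy init, caches, autotuning
        fn()
    torch.cuda.synchronize()
    start = time.time()
    for _ in range(repeat):
        fn()
    torch.cuda.synchronize()
    return (time.time() - start) / repeat * 1000

x = torch.randn(8192, 4096, device="cuda", dtype=torch.float16)
w = torch.randn(4096, 4096, device="cuda", dtype=torch.float16)
print(f"standard_fwd: {time_ms(lambda: x.matmul(w.t())):.3f} ms")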
This is particularly important for small models where outlier features are less systematic and occur with low frequency. """ + + class GlobalOutlierPooler: _instance = None @@ -83,6 +85,7 @@ def get_inverse_transform_indices( break # if all indices fit in i bytes, stop early return permuted_tile_indices + def undo_layout(permuted_tensor: torch.Tensor, tile_indices: torch.LongTensor) -> torch.Tensor: """ Undo a tiled permutation such as turing or ampere layout @@ -159,20 +162,12 @@ def backward(ctx, grad_output): ) if not A.is_contiguous(): A = A.contiguous() - qA, S2 = F.vectorwise_quant( - A.view(-1, A.shape[2]), dim=0, quant_type=quant_type - ) + qA, S2 = F.vectorwise_quant(A.view(-1, A.shape[2]), dim=0, quant_type=quant_type) igrad_B = F.igemm(qA.t(), qgrad_output) - grad_B = F.vectorwise_mm_dequant( - igrad_B, S2.t(), S1, grad_output.dtype, quant_type - ) + grad_B = F.vectorwise_mm_dequant(igrad_B, S2.t(), S1, grad_output.dtype, quant_type) else: - qgrad_output, S1 = F.vectorwise_quant( - grad_output, dim=dims, quant_type=quant_type - ) - qA, S2 = F.vectorwise_quant( - A, dim=dims, quant_type=quant_type - ) + qgrad_output, S1 = F.vectorwise_quant(grad_output, dim=dims, quant_type=quant_type) + qA, S2 = F.vectorwise_quant(A, dim=dims, quant_type=quant_type) igrad_B = F.igemm(qA.permute(permute_dim), qgrad_output) grad_B = F.vectorwise_mm_dequant( igrad_B, @@ -201,9 +196,7 @@ def backward(ctx, grad_output): with torch.no_grad(): grad_A = torch.matmul(grad_output, B.permute(permute_dim)) else: - qgrad_output, S1 = F.vectorwise_quant( - grad_output, dim=dims, quant_type=quant_type - ) + qgrad_output, S1 = F.vectorwise_quant(grad_output, dim=dims, quant_type=quant_type) qB, S3 = F.vectorwise_quant(B, dim=dim_B, quant_type=quant_type) igrad_A = F.igemm(qgrad_output, qB.permute(permute_dim)) grad_A = F.vectorwise_mm_dequant( @@ -227,7 +220,7 @@ def supports_igemmlt(device: torch.device) -> bool: if torch.cuda.get_device_capability(device=device) < (7, 5): return False device_name = torch.cuda.get_device_name(device=device) - nvidia16_models = ('GTX 1630', 'GTX 1650', 'GTX 1660') # https://en.wikipedia.org/wiki/GeForce_16_series + nvidia16_models = ("GTX 1630", "GTX 1650", "GTX 1660") # https://en.wikipedia.org/wiki/GeForce_16_series if any(model_name in device_name for model_name in nvidia16_models): return False # these devices are technically cuda 7.5-capable, but they lack tensor cores return True @@ -246,6 +239,7 @@ def get_tile_inds(format, device): with torch.no_grad(): return get_inverse_transform_indices(transform, _get_tile_size(format)).to(device) + @dataclass class MatmulLtState: _tile_indices: Optional[torch.Tensor] = None @@ -510,7 +504,6 @@ def forward(ctx, A, B, out=None, bias=None, quant_state: Optional[F.QuantState] else: return torch.empty(A.shape[:-1] + B_shape[:1], dtype=A.dtype, device=A.device) - # 1. Dequantize # 2. MatmulnN output = torch.nn.functional.linear(A, F.dequantize_4bit(B, quant_state).to(A.dtype).t(), bias) @@ -532,7 +525,7 @@ def backward(ctx, grad_output): bias_grad = None if ctx.bias is None else torch.zeros_like(ctx.bias) return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, bias_grad, None - req_gradA, _, _, req_gradBias, _= ctx.needs_input_grad + req_gradA, _, _, req_gradBias, _ = ctx.needs_input_grad A, B = ctx.tensors grad_A, grad_B, grad_bias = None, None, None @@ -542,8 +535,9 @@ def backward(ctx, grad_output): grad_bias = grad_output.sum(0, dtype=ctx.dtype_bias) # not supported by PyTorch. 
TODO: create work-around - #if req_gradB: grad_B = torch.matmul(grad_output.t(), A) - if req_gradA: grad_A = torch.matmul(grad_output, F.dequantize_4bit(B, ctx.state).to(grad_output.dtype).t()) + # if req_gradB: grad_B = torch.matmul(grad_output.t(), A) + if req_gradA: + grad_A = torch.matmul(grad_output, F.dequantize_4bit(B, ctx.state).to(grad_output.dtype).t()) return grad_A, grad_B, None, grad_bias, None @@ -554,7 +548,7 @@ def matmul( out: Optional[torch.Tensor] = None, state: Optional[MatmulLtState] = None, threshold=0.0, - bias=None + bias=None, ): state = state or MatmulLtState() if threshold > 0.0: @@ -562,11 +556,19 @@ def matmul( return MatMul8bitLt.apply(A, B, out, bias, state) -def matmul_4bit(A: torch.Tensor, B: torch.Tensor, quant_state: F.QuantState, out: Optional[torch.Tensor] = None, bias=None): +def matmul_4bit( + A: torch.Tensor, + B: torch.Tensor, + quant_state: F.QuantState, + out: Optional[torch.Tensor] = None, + bias=None, +): assert quant_state is not None if A.numel() == A.shape[-1] and A.requires_grad == False: if A.shape[-1] % quant_state.blocksize != 0: - warn(f'Some matrices hidden dimension is not a multiple of {quant_state.blocksize} and efficient inference kernels are not supported for these (slow). Matrix input size found: {A.shape}') + warn( + f"Some matrices hidden dimension is not a multiple of {quant_state.blocksize} and efficient inference kernels are not supported for these (slow). Matrix input size found: {A.shape}", + ) return MatMul4Bit.apply(A, B, out, bias, quant_state) else: out = F.gemv_4bit(A, B.t(), out, state=quant_state) diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py index 57ba71020..c8ae7358d 100644 --- a/bitsandbytes/cextension.py +++ b/bitsandbytes/cextension.py @@ -56,7 +56,7 @@ def get_cuda_bnb_library_path(cuda_specs: CUDASpecs) -> Path: "This can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.\n" "If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=\n" "If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH\n" - "For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH: BNBNativeLibrary: logger.warning( "The installed version of bitsandbytes was compiled without GPU support. " - "8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable." + "8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.", ) return BNBNativeLibrary(dll) @@ -120,5 +120,5 @@ def get_native_library() -> BNBNativeLibrary: Inspect the output of the command and see if you can locate CUDA libraries. You might need to add them to your LD_LIBRARY_PATH. If you suspect a bug, please take the information from python -m bitsandbytes and open an issue at: https://github.com/TimDettmers/bitsandbytes/issues -""" +""", ) diff --git a/bitsandbytes/diagnostics/cuda.py b/bitsandbytes/diagnostics/cuda.py index d65f80d8b..f993dff7e 100644 --- a/bitsandbytes/diagnostics/cuda.py +++ b/bitsandbytes/diagnostics/cuda.py @@ -120,7 +120,7 @@ def print_cuda_diagnostics(cuda_specs: CUDASpecs) -> None: The CUDA version for the compile might depend on your conda install, if using conda. Inspect CUDA version via `conda list | grep cuda`. - """ + """, ) cuda_major, cuda_minor = cuda_specs.cuda_version_tuple @@ -129,7 +129,7 @@ def print_cuda_diagnostics(cuda_specs: CUDASpecs) -> None: """ WARNING: CUDA versions lower than 11 are currently not supported for LLM.int8(). 
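# --- Illustrative aside (not part of the patch): what the MatMul4Bit forward pass
# --- above reduces to -- (1) dequantize the packed NF4 weight, (2) run a regular
# --- fp16 linear. A minimal sketch assuming bitsandbytes is built with CUDA support;
# --- shapes and variable names are made up.
import torch
import bitsandbytes.functional as F

W = torch.randn(256, 128, dtype=torch.float16, device="cuda")   # (out_features, in_features)
x = torch.randn(4, 128, dtype=torch.float16, device="cuda")

W_4bit, qstate = F.quantize_4bit(W, blocksize=64, quant_type="nf4")   # packed uint8 + QuantState

out = torch.nn.functional.linear(x, F.dequantize_4bit(W_4bit, qstate))  # 1. dequantize  2. matmul
ref = torch.nn.functional.linear(x, W)
print((out - ref).abs().max())   # small quantization error; results are not bit-identical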
You will be only to use 8-bit optimizers and quantization routines! - """ + """, ) print(f"To manually override the PyTorch CUDA version please see: {NONPYTORCH_DOC_URL}") @@ -170,7 +170,7 @@ def print_cuda_runtime_diagnostics() -> None: In the case of a manual override, make sure you set LD_LIBRARY_PATH, e.g. export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.2, - """ + """, ) for pth in cudart_paths: print(f"* Found CUDA runtime at: {pth}") diff --git a/bitsandbytes/diagnostics/main.py b/bitsandbytes/diagnostics/main.py index 7a88bca26..1ce096f69 100644 --- a/bitsandbytes/diagnostics/main.py +++ b/bitsandbytes/diagnostics/main.py @@ -25,7 +25,7 @@ def sanity_check(): See the documentation for more details if needed. Trying a simple check anyway, but this will likely fail... - """ + """, ) from bitsandbytes.optim import Adam @@ -71,7 +71,7 @@ def main(): print( f"WARNING: {__package__} is currently running as CPU-only!\n" "Therefore, 8-bit optimizers and GPU quantization are unavailable.\n\n" - f"If you think that this is so erroneously,\nplease report an issue!" + f"If you think that this is so erroneously,\nplease report an issue!", ) except Exception: traceback.print_exc() @@ -80,6 +80,6 @@ def main(): Above we output some debug information. Please provide this info when creating an issue via {PACKAGE_GITHUB_URL}/issues/new/choose WARNING: Please be sure to sanitize sensitive info from the output before posting it. - """ + """, ) sys.exit(1) diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index 61d0d83b2..8fa8f2f60 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -21,6 +21,7 @@ def prod(iterable): return reduce(operator.mul, iterable, 1) + name2qmap = {} if lib and lib.compiled_with_cuda: @@ -127,7 +128,6 @@ def prefetch_all(self, to_cpu=False): prefetch_tensor(t, to_cpu) - class CUBLAS_Context: _instance = None @@ -169,6 +169,7 @@ def get_instance(cls): cls._instance.initialize() return cls._instance + dtype2bytes = {} dtype2bytes[torch.float32] = 4 dtype2bytes[torch.float16] = 2 @@ -176,10 +177,11 @@ def get_instance(cls): dtype2bytes[torch.uint8] = 1 dtype2bytes[torch.int8] = 1 -FIRST_CUDA_DEVICE = torch.device('cuda', index=0) +FIRST_CUDA_DEVICE = torch.device("cuda", index=0) + def get_paged(*shape, dtype=torch.float32, device=FIRST_CUDA_DEVICE): - num_bytes = dtype2bytes[dtype]*prod(shape) + num_bytes = dtype2bytes[dtype] * prod(shape) cuda_ptr = lib.cget_managed_ptr(ct.c_size_t(num_bytes)) c_ptr = ct.cast(cuda_ptr, ct.POINTER(ct.c_int)) new_array = np.ctypeslib.as_array(c_ptr, shape=shape) @@ -188,31 +190,35 @@ def get_paged(*shape, dtype=torch.float32, device=FIRST_CUDA_DEVICE): out.page_deviceid = device.index return out + def prefetch_tensor(A, to_cpu=False): - assert A.is_paged, 'Only paged tensors can be prefetched!' + assert A.is_paged, "Only paged tensors can be prefetched!" 
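# --- Illustrative aside (not part of the patch): get_paged() above allocates CUDA
# --- unified ("managed") memory, and prefetch_tensor() hints the driver to migrate
# --- those pages to the GPU or back to the host. A hypothetical usage sketch; assumes
# --- a CUDA device and the compiled bitsandbytes library.
import torch
import bitsandbytes.functional as F

buf = F.get_paged(1024, 1024, dtype=torch.float32)   # managed allocation, tracked as a paged tensor
F.fill(buf, 0.0)                                     # elementwise fill on the paged buffer
F.prefetch_tensor(buf)                               # migrate pages to buf.page_deviceid
F.prefetch_tensor(buf, to_cpu=True)                  # ...or back to host memory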
if to_cpu: deviceid = -1 else: deviceid = A.page_deviceid - num_bytes = dtype2bytes[A.dtype]*A.numel() + num_bytes = dtype2bytes[A.dtype] * A.numel() lib.cprefetch(get_ptr(A), ct.c_size_t(num_bytes), ct.c_int32(deviceid)) + def elementwise_func(func_name, A, B, value, prefetch=True): func = None if A.dtype == torch.float32: - func = getattr(lib, f'c{func_name}_fp32', None) + func = getattr(lib, f"c{func_name}_fp32", None) cvalue = ct.c_float(value) elif A.dtype == torch.uint8: - func = getattr(lib, f'c{func_name}_uint8', None) + func = getattr(lib, f"c{func_name}_uint8", None) cvalue = ct.c_uint8(value) - if func is None: raise NotImplementedError(f'Function not implemented: {func_name}') + if func is None: + raise NotImplementedError(f"Function not implemented: {func_name}") - is_managed = getattr(A, 'is_managed', False) + is_managed = getattr(A, "is_managed", False) if is_managed and prefetch: prefetch_tensor(A) - if B is not None: prefetch_tensor(B) + if B is not None: + prefetch_tensor(B) func(get_ptr(A), get_ptr(B), cvalue, ct.c_int64(A.numel())) if A.is_paged or B.is_paged: @@ -222,28 +228,36 @@ def elementwise_func(func_name, A, B, value, prefetch=True): # operation occurred. So we synchronize. torch.cuda.synchronize() -def fill(A, value, device=None, prefetch=True): elementwise_func('fill', A, None, value) -def arange(A, device=None): elementwise_func('arange', A, None, 0) -def _mul(A, B, device=None): elementwise_func('_mul', A, B, 0) + +def fill(A, value, device=None, prefetch=True): + elementwise_func("fill", A, None, value) + + +def arange(A, device=None): + elementwise_func("arange", A, None, 0) + + +def _mul(A, B, device=None): + elementwise_func("_mul", A, B, 0) def create_linear_map(signed=True, total_bits=8, add_zero=True): - sign = (-1.0 if signed else 0.0) + sign = -1.0 if signed else 0.0 total_values = 2**total_bits if add_zero or total_bits < 8: # add a zero # since we simulate less bits by having zeros in the data type, we # we need to center the quantization around zero and as such lose # a single value - total_values = (2**total_bits if not signed else 2**total_bits-1) + total_values = 2**total_bits if not signed else 2**total_bits - 1 values = torch.linspace(sign, 1.0, total_values) gap = 256 - values.numel() if gap == 0: return values else: - l = values.numel()//2 # noqa: E741 - return torch.Tensor(values[:l].tolist() + [0]*gap + values[l:].tolist()) + l = values.numel() // 2 # noqa: E741 + return torch.Tensor(values[:l].tolist() + [0] * gap + values[l:].tolist()) def create_normal_map(offset=0.9677083, use_extra_value=True): @@ -251,18 +265,17 @@ def create_normal_map(offset=0.9677083, use_extra_value=True): from scipy.stats import norm except ImportError as ie: raise ImportError( - "Scipy is required for `create_normal_map`. " - "Install `bitsandbytes` with the `[test]` extra." + "Scipy is required for `create_normal_map`. 
Install `bitsandbytes` with the `[test]` extra.", ) from ie if use_extra_value: # one more positive value, this is an asymmetric type v1 = norm.ppf(torch.linspace(offset, 0.5, 9)[:-1]).tolist() - v2 = [0]*(256-15) ## we have 15 non-zero values in this data type + v2 = [0] * (256 - 15) ## we have 15 non-zero values in this data type v3 = (-norm.ppf(torch.linspace(offset, 0.5, 8)[:-1])).tolist() else: v1 = norm.ppf(torch.linspace(offset, 0.5, 8)[:-1]).tolist() - v2 = [0]*(256-14) ## we have 14 non-zero values in this data type + v2 = [0] * (256 - 14) ## we have 14 non-zero values in this data type v3 = (-norm.ppf(torch.linspace(offset, 0.5, 8)[:-1])).tolist() v = v1 + v2 + v3 @@ -275,38 +288,37 @@ def create_normal_map(offset=0.9677083, use_extra_value=True): return values + def create_fp8_map(signed=True, exponent_bits=5, precision_bits=2, total_bits=8): e = exponent_bits p = precision_bits has_sign = 1 if signed else 0 - assert e+p == total_bits-has_sign + assert e + p == total_bits - has_sign # the exponent is biased to 2^(e-1) -1 == 0 evalues = [] pvalues = [] - for i, val in enumerate(range(-(2**(exponent_bits-has_sign)), 2**(exponent_bits-has_sign), 1)): + for i, val in enumerate(range(-(2 ** (exponent_bits - has_sign)), 2 ** (exponent_bits - has_sign), 1)): evalues.append(2**val) - values = [] lst = list(itertools.product([0, 1], repeat=precision_bits)) - #for ev in evalues: - bias = 2**(exponent_bits-1) - for evalue in range(2**(exponent_bits)): + # for ev in evalues: + bias = 2 ** (exponent_bits - 1) + for evalue in range(2 ** (exponent_bits)): for bit_pattern in lst: - value = (1 if evalue != 0 else 0) + value = 1 if evalue != 0 else 0 for i, pval in enumerate(list(bit_pattern)): - value += pval*(2**-(i+1)) + value += pval * (2 ** -(i + 1)) if evalue == 0: # subnormals - value = value*2**-(bias) + value = value * 2**-(bias) else: # normals - value = value*2**-(evalue-bias-1) + value = value * 2 ** -(evalue - bias - 1) values.append(value) if signed: values.append(-value) - assert len(values) == 2**total_bits values.sort() if total_bits < 8: @@ -320,7 +332,6 @@ def create_fp8_map(signed=True, exponent_bits=5, precision_bits=2, total_bits=8) return code - def create_dynamic_map(signed=True, max_exponent_bits=7, total_bits=8): """ Creates the dynamic quantiztion map. 
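# --- Illustrative aside (not part of the patch): create_dynamic_map() builds the
# --- default 8-bit code used by blockwise quantization -- a 256-entry lookup table in
# --- [-1, 1] with a dynamic exponent. A minimal sketch; the map itself is built in
# --- plain PyTorch and launches no CUDA kernels.
import bitsandbytes.functional as F

code = F.create_dynamic_map(signed=True)
print(code.numel(), float(code.min()), float(code.max()))   # 256 -1.0 1.0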
@@ -345,7 +356,11 @@ def create_dynamic_map(signed=True, max_exponent_bits=7, total_bits=8): non_sign_bits = total_bits - (1 if signed else 1) additional_items = 2 ** (non_sign_bits - max_exponent_bits) - 1 for i in range(max_exponent_bits): - fraction_items = int(2 ** (i + non_sign_bits - max_exponent_bits) + 1 if signed else 2 ** (i + non_sign_bits - max_exponent_bits + 1) + 1) + fraction_items = int( + 2 ** (i + non_sign_bits - max_exponent_bits) + 1 + if signed + else 2 ** (i + non_sign_bits - max_exponent_bits + 1) + 1, + ) boundaries = torch.linspace(0.1, 1, fraction_items) means = (boundaries[:-1] + boundaries[1:]) / 2.0 data += ((10 ** (-(max_exponent_bits - 1) + i)) * means).tolist() @@ -371,8 +386,9 @@ def create_dynamic_map(signed=True, max_exponent_bits=7, total_bits=8): data.sort() return Tensor(data) + def create_quantile_map(A, total_bits=8): - q = estimate_quantiles(A, num_quantiles=2**total_bits-1) + q = estimate_quantiles(A, num_quantiles=2**total_bits - 1) q = q.tolist() q.append(0) @@ -383,11 +399,13 @@ def create_quantile_map(A, total_bits=8): q.sort() q = Tensor(q) - q = q/q.abs().max() + q = q / q.abs().max() return q + def get_special_format_str(): - if not torch.cuda.is_available(): return 'col_turing' + if not torch.cuda.is_available(): + return "col_turing" major, _minor = torch.cuda.get_device_capability() if major <= 7: return "col_turing" @@ -396,20 +414,24 @@ def get_special_format_str(): return "col_turing" - def is_on_gpu(tensors): on_gpu = True gpu_ids = set() for t in tensors: - if t is None: continue # NULL pointers are fine - is_paged = getattr(t, 'is_paged', False) - on_gpu &= (t.device.type == 'cuda' or is_paged) + if t is None: + continue # NULL pointers are fine + is_paged = getattr(t, "is_paged", False) + on_gpu &= t.device.type == "cuda" or is_paged if not is_paged: gpu_ids.add(t.device.index) if not on_gpu: - raise TypeError(f'All input tensors need to be on the same GPU, but found some tensors to not be on a GPU:\n {[(t.shape, t.device) for t in tensors]}') + raise TypeError( + f"All input tensors need to be on the same GPU, but found some tensors to not be on a GPU:\n {[(t.shape, t.device) for t in tensors]}", + ) if len(gpu_ids) > 1: - raise TypeError(f'Input tensors need to be on the same GPU, but found the following tensor and device combinations:\n {[(t.shape, t.device) for t in tensors]}') + raise TypeError( + f"Input tensors need to be on the same GPU, but found the following tensor and device combinations:\n {[(t.shape, t.device) for t in tensors]}", + ) return on_gpu @@ -447,15 +469,13 @@ def get_transform_func(dtype, orderA, orderOut, transpose=False): if not hasattr(lib, name): print(name) raise ValueError( - f"Transform function not supported: {orderA} to {orderOut} for data type {dtype} and transpose={transpose}" + f"Transform function not supported: {orderA} to {orderOut} for data type {dtype} and transpose={transpose}", ) else: return getattr(lib, name) -def get_transform_buffer( - shape, dtype, device, to_order, from_order="row", transpose=False -): +def get_transform_buffer(shape, dtype, device, to_order, from_order="row", transpose=False): # init_func = torch.empty init_func = torch.zeros dims = len(shape) @@ -508,9 +528,7 @@ def nvidia_transform( else: from_order = state[1] if out is None: - out, new_state = get_transform_buffer( - state[0], A.dtype, A.device, to_order, state[1] - ) + out, new_state = get_transform_buffer(state[0], A.dtype, A.device, to_order, state[1]) else: new_state = (state[1], to_order) func = 
get_transform_func(A.dtype, from_order, to_order, transpose) @@ -534,8 +552,13 @@ def nvidia_transform( return out, new_state -def estimate_quantiles(A: Tensor, out: Optional[torch.Tensor] = None, offset: float = 1 / 512, num_quantiles=256) -> Tensor: - ''' +def estimate_quantiles( + A: Tensor, + out: Optional[torch.Tensor] = None, + offset: float = 1 / 512, + num_quantiles=256, +) -> Tensor: + """ Estimates 256 equidistant quantiles on the input tensor eCDF. Uses SRAM-Quantiles algorithm to quickly estimate 256 equidistant quantiles @@ -562,14 +585,21 @@ def estimate_quantiles(A: Tensor, out: Optional[torch.Tensor] = None, offset: fl ------- torch.Tensor: The 256 quantiles in float32 datatype. - ''' - if A.numel() < 256: raise NotImplementedError(f'Quantile estimation needs at least 256 values in the Tensor, but Tensor had only {A.numel()} values.') - if num_quantiles > 256: raise NotImplementedError(f"Currently only a maximum of 256 equally spaced quantiles are supported, but the argument num_quantiles={num_quantiles}") - if num_quantiles < 256 and offset == 1/(512): + """ + if A.numel() < 256: + raise NotImplementedError( + f"Quantile estimation needs at least 256 values in the Tensor, but Tensor had only {A.numel()} values.", + ) + if num_quantiles > 256: + raise NotImplementedError( + f"Currently only a maximum of 256 equally spaced quantiles are supported, but the argument num_quantiles={num_quantiles}", + ) + if num_quantiles < 256 and offset == 1 / (512): # override default arguments - offset = 1/(2*num_quantiles) + offset = 1 / (2 * num_quantiles) - if out is None: out = torch.zeros((256,), dtype=torch.float32, device=A.device) + if out is None: + out = torch.zeros((256,), dtype=torch.float32, device=A.device) is_on_gpu([A, out]) device = pre_call(A.device) if A.dtype == torch.float32: @@ -581,7 +611,7 @@ def estimate_quantiles(A: Tensor, out: Optional[torch.Tensor] = None, offset: fl post_call(device) if num_quantiles < 256: - step = round(256/num_quantiles) + step = round(256 / num_quantiles) idx = torch.linspace(0, 255, num_quantiles).long().to(A.device) out = out[idx] @@ -590,12 +620,35 @@ def estimate_quantiles(A: Tensor, out: Optional[torch.Tensor] = None, offset: fl class QuantState: """container for quantization state components to work with Params4bit and similar classes""" - valid_quant_types = ('fp4', 'nf4') - valid_qs_type_keys = [f"bitsandbytes__{x}" for x in valid_quant_types] - valid_qs_keys = ['absmax', 'quant_map', 'nested_absmax', 'nested_quant_map', 'quant_state', 'quant_type', - 'blocksize', 'dtype', 'shape', 'nested_blocksize', 'nested_dtype', 'nested_offset'] - def __init__(self, absmax, shape=None, code=None, blocksize=None, quant_type=None, dtype=None, offset=None, state2=None): + valid_quant_types = ("fp4", "nf4") + valid_qs_type_keys = [f"bitsandbytes__{x}" for x in valid_quant_types] + valid_qs_keys = [ + "absmax", + "quant_map", + "nested_absmax", + "nested_quant_map", + "quant_state", + "quant_type", + "blocksize", + "dtype", + "shape", + "nested_blocksize", + "nested_dtype", + "nested_offset", + ] + + def __init__( + self, + absmax, + shape=None, + code=None, + blocksize=None, + quant_type=None, + dtype=None, + offset=None, + state2=None, + ): self.absmax = absmax self.shape = shape self.code = code @@ -614,13 +667,20 @@ def __get_item__(self, idx): state2 = [absmax, input_shape, A.dtype, blocksize, None, quant_type] """ if self.nested: - list_repr = [self.absmax, self.shape, self.dtype, self.blocksize, [self.offset, self.state2], self.quant_type] + 
list_repr = [ + self.absmax, + self.shape, + self.dtype, + self.blocksize, + [self.offset, self.state2], + self.quant_type, + ] else: list_repr = [self.absmax, self.shape, self.dtype, self.blocksize, None, self.quant_type] return list_repr[idx] @classmethod - def from_dict(cls, qs_dict: Dict[str, Any], device: torch.device) -> 'QuantState': + def from_dict(cls, qs_dict: Dict[str, Any], device: torch.device) -> "QuantState": """ unpacks components of state_dict into QuantState where necessary, convert into strings, torch.dtype, ints, etc. @@ -632,37 +692,39 @@ def from_dict(cls, qs_dict: Dict[str, Any], device: torch.device) -> 'QuantState # unpacking tensor with non-tensor components qs_key = [k for k, v in qs_dict.items() if "quant_state" in k and isinstance(v, torch.Tensor)] - if not len(qs_key) and 'quant_type' not in qs_dict: + if not len(qs_key) and "quant_type" not in qs_dict: raise ValueError("Expected packed or unpacked quant_state items, found neither") elif len(qs_key) != 1 or qs_key[0].split(".")[-1] not in cls.valid_qs_type_keys: - raise ValueError(f"There should be exactly one `quant_state` item with ending from {cls.valid_qs_type_keys}.\nDetected {qs_key}.") + raise ValueError( + f"There should be exactly one `quant_state` item with ending from {cls.valid_qs_type_keys}.\nDetected {qs_key}.", + ) # unpacking minor and non-tensor quant state items if necessary if len(qs_key) == 1: first_qs_key = qs_key[0] qs_dict.update(unpack_tensor_to_dict(qs_dict.pop(first_qs_key))) - qs_dict = {k.split('.')[-1]: v for k, v in qs_dict.items()} # strip prefixes + qs_dict = {k.split(".")[-1]: v for k, v in qs_dict.items()} # strip prefixes assert set(qs_dict.keys()).issubset(cls.valid_qs_keys) - if 'nested_absmax' in qs_dict: - offset = torch.tensor(float(qs_dict['nested_offset'])).to(device) + if "nested_absmax" in qs_dict: + offset = torch.tensor(float(qs_dict["nested_offset"])).to(device) state2 = cls( - absmax=qs_dict['nested_absmax'].to(device), - blocksize=qs_dict['nested_blocksize'], - code=qs_dict['nested_quant_map'].to(device), - dtype=getattr(torch, qs_dict['nested_dtype']), + absmax=qs_dict["nested_absmax"].to(device), + blocksize=qs_dict["nested_blocksize"], + code=qs_dict["nested_quant_map"].to(device), + dtype=getattr(torch, qs_dict["nested_dtype"]), ) else: offset, state2 = None, None quant_state = cls( - quant_type=qs_dict['quant_type'], - absmax=qs_dict['absmax'].to(device), - blocksize=qs_dict['blocksize'], - code=qs_dict['quant_map'].to(device), - dtype=getattr(torch, qs_dict['dtype']), - shape=torch.Size(qs_dict['shape']) if qs_dict['shape'] is not None else None, + quant_type=qs_dict["quant_type"], + absmax=qs_dict["absmax"].to(device), + blocksize=qs_dict["blocksize"], + code=qs_dict["quant_map"].to(device), + dtype=getattr(torch, qs_dict["dtype"]), + shape=torch.Size(qs_dict["shape"]) if qs_dict["shape"] is not None else None, offset=offset, state2=state2, ) @@ -674,21 +736,23 @@ def as_dict(self, packed=False): param: packed -- returns dict[str, torch.Tensor] for state_dict fit for safetensors saving """ qs_dict = { - 'quant_type': self.quant_type, - 'absmax': self.absmax, - 'blocksize': self.blocksize, - 'quant_map': self.code, - 'dtype': str(self.dtype).strip('torch.'), - 'shape': tuple(self.shape), + "quant_type": self.quant_type, + "absmax": self.absmax, + "blocksize": self.blocksize, + "quant_map": self.code, + "dtype": str(self.dtype).strip("torch."), + "shape": tuple(self.shape), } if self.nested: - qs_dict.update({ - 'nested_absmax': self.state2.absmax, - 
'nested_blocksize': self.state2.blocksize, - 'nested_quant_map': self.state2.code.clone(), # un-shared to avoid restoring it after shared tensors are removed by safetensors - 'nested_dtype': str(self.state2.dtype).strip('torch.'), - 'nested_offset': self.offset.item(), - }) + qs_dict.update( + { + "nested_absmax": self.state2.absmax, + "nested_blocksize": self.state2.blocksize, + "nested_quant_map": self.state2.code.clone(), # un-shared to avoid restoring it after shared tensors are removed by safetensors + "nested_dtype": str(self.state2.dtype).strip("torch."), + "nested_offset": self.offset.item(), + }, + ) if not packed: return qs_dict @@ -711,14 +775,22 @@ def __eq__(self, other): return False return ( - torch.allclose(self.absmax, other.absmax, atol=1e-6) and - self.shape == other.shape and - torch.allclose(self.code, other.code, atol=1e-6) and - self.dtype == other.dtype and - self.blocksize == other.blocksize and - self.quant_type == other.quant_type and - (self.offset == other.offset if self.offset is not None and other.offset is not None else self.offset is other.offset) and - (self.state2 == other.state2 if self.state2 is not None and other.state2 is not None else self.state2 is other.state2) + torch.allclose(self.absmax, other.absmax, atol=1e-6) + and self.shape == other.shape + and torch.allclose(self.code, other.code, atol=1e-6) + and self.dtype == other.dtype + and self.blocksize == other.blocksize + and self.quant_type == other.quant_type + and ( + self.offset == other.offset + if self.offset is not None and other.offset is not None + else self.offset is other.offset + ) + and ( + self.state2 == other.state2 + if self.state2 is not None and other.state2 is not None + else self.state2 is other.state2 + ) ) @@ -756,7 +828,6 @@ def quantize_blockwise( The quantization state to undo the quantization. 
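# --- Illustrative aside (not part of the patch): an 8-bit blockwise round trip through
# --- the functions reformatted above. Each block gets its own absmax scale, which is
# --- what keeps the reconstruction error small. A minimal sketch; assumes a CUDA device.
import torch
import bitsandbytes.functional as F

A = torch.randn(4096, 4096, device="cuda")
A8, qstate = F.quantize_blockwise(A)          # uint8 codes + QuantState(absmax, code, blocksize, dtype)
A_hat = F.dequantize_blockwise(A8, qstate)    # back to float32
print((A - A_hat).abs().mean())               # small mean error for unit-variance input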
""" - if code is None: if "dynamic" not in name2qmap: name2qmap["dynamic"] = create_dynamic_map().to(A.device) @@ -771,31 +842,66 @@ def quantize_blockwise( if out is None: out = torch.zeros_like(A, dtype=torch.uint8) - if A.device.type != 'cpu': + if A.device.type != "cpu": assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64] cblocksize = ct.c_int32(blocksize) prev_device = pre_call(A.device) code = code.to(A.device) is_on_gpu([code, A, out, absmax]) if A.dtype == torch.float32: - lib.cquantize_blockwise_fp32(get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), cblocksize, ct.c_int(A.numel())) + lib.cquantize_blockwise_fp32( + get_ptr(code), + get_ptr(A), + get_ptr(absmax), + get_ptr(out), + cblocksize, + ct.c_int(A.numel()), + ) elif A.dtype == torch.float16: - lib.cquantize_blockwise_fp16(get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), cblocksize, ct.c_int(A.numel())) + lib.cquantize_blockwise_fp16( + get_ptr(code), + get_ptr(A), + get_ptr(absmax), + get_ptr(out), + cblocksize, + ct.c_int(A.numel()), + ) elif A.dtype == torch.bfloat16: - lib.cquantize_blockwise_bf16(get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), cblocksize, ct.c_int(A.numel())) + lib.cquantize_blockwise_bf16( + get_ptr(code), + get_ptr(A), + get_ptr(absmax), + get_ptr(out), + cblocksize, + ct.c_int(A.numel()), + ) else: raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}") post_call(A.device) else: # cpu code = code.cpu() - lib.cquantize_blockwise_cpu_fp32(get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_longlong(blocksize), ct.c_longlong(A.numel())) + lib.cquantize_blockwise_cpu_fp32( + get_ptr(code), + get_ptr(A), + get_ptr(absmax), + get_ptr(out), + ct.c_longlong(blocksize), + ct.c_longlong(A.numel()), + ) if nested: offset = absmax.mean() absmax -= offset qabsmax, state2 = quantize_blockwise(absmax, blocksize=blocksize, nested=False) - quant_state = QuantState(absmax=qabsmax, code=code, blocksize=blocksize, dtype=A.dtype, offset=offset, state2=state2) + quant_state = QuantState( + absmax=qabsmax, + code=code, + blocksize=blocksize, + dtype=A.dtype, + offset=offset, + state2=state2, + ) else: quant_state = QuantState(absmax=absmax, code=code, blocksize=blocksize, dtype=A.dtype) @@ -809,7 +915,7 @@ def dequantize_blockwise( code: Optional[torch.Tensor] = None, out: Optional[torch.Tensor] = None, blocksize: int = 4096, - nested=False + nested=False, ) -> Tensor: """ Dequantizes blockwise quantized values. @@ -843,43 +949,76 @@ def dequantize_blockwise( code = name2qmap["dynamic"] if quant_state is None: - quant_state = QuantState(absmax=absmax, code=code, blocksize=blocksize, dtype=torch.float32) + quant_state = QuantState(absmax=absmax, code=code, blocksize=blocksize, dtype=torch.float32) absmax = quant_state.absmax if quant_state.nested: absmax = dequantize_blockwise(quant_state.absmax, quant_state.state2) absmax += quant_state.offset - if absmax.dtype != torch.float32: absmax = absmax.float() + if absmax.dtype != torch.float32: + absmax = absmax.float() if out is None: out = torch.empty(A.shape, dtype=quant_state.dtype, device=A.device) - if A.device.type != 'cpu': + if A.device.type != "cpu": device = pre_call(A.device) code = quant_state.code.to(A.device) if quant_state.blocksize not in [2048, 4096, 1024, 512, 256, 128, 64]: - raise ValueError(f"The blockwise of {quant_state.blocksize} is not supported. 
Supported values: [2048, 4096, 1024, 512, 256, 128, 64]") + raise ValueError( + f"The blockwise of {quant_state.blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128, 64]", + ) is_on_gpu([A, absmax, out]) if out.dtype == torch.float32: - lib.cdequantize_blockwise_fp32(get_ptr(quant_state.code), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(A.numel())) + lib.cdequantize_blockwise_fp32( + get_ptr(quant_state.code), + get_ptr(A), + get_ptr(absmax), + get_ptr(out), + ct.c_int(quant_state.blocksize), + ct.c_int(A.numel()), + ) elif out.dtype == torch.float16: - lib.cdequantize_blockwise_fp16(get_ptr(quant_state.code), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(A.numel())) + lib.cdequantize_blockwise_fp16( + get_ptr(quant_state.code), + get_ptr(A), + get_ptr(absmax), + get_ptr(out), + ct.c_int(quant_state.blocksize), + ct.c_int(A.numel()), + ) elif out.dtype == torch.bfloat16: - lib.cdequantize_blockwise_bf16(get_ptr(quant_state.code), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(A.numel())) + lib.cdequantize_blockwise_bf16( + get_ptr(quant_state.code), + get_ptr(A), + get_ptr(absmax), + get_ptr(out), + ct.c_int(quant_state.blocksize), + ct.c_int(A.numel()), + ) else: raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}") post_call(A.device) else: code = quant_state.code.cpu() - lib.cdequantize_blockwise_cpu_fp32(get_ptr(code), get_ptr(A), get_ptr(quant_state.absmax), get_ptr(out), ct.c_longlong(quant_state.blocksize), ct.c_longlong(A.numel())) + lib.cdequantize_blockwise_cpu_fp32( + get_ptr(code), + get_ptr(A), + get_ptr(quant_state.absmax), + get_ptr(out), + ct.c_longlong(quant_state.blocksize), + ct.c_longlong(A.numel()), + ) return out + def get_4bit_type(typename, device=None, blocksize=64): - if device is None: device = 'cuda' + if device is None: + device = "cuda" data = None - if typename == 'nf4': - ''' Implements the NF4 data type. + if typename == "nf4": + """ Implements the NF4 data type. Constructs a quantization data type where each bin has equal area under a standard normal distribution N(0, 1) that is normalized into the range [-1, 1]. @@ -888,12 +1027,26 @@ def get_4bit_type(typename, device=None, blocksize=64): Implementation of the NF4 data type in bitsandbytes can be found in the `create_normal_map` function in the `functional.py` file: https://github.com/TimDettmers/bitsandbytes/blob/main/bitsandbytes/functional.py#L236. 
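# --- Illustrative aside (not part of the patch): the NF4 table described in the
# --- docstring above is just 16 values normalized to [-1, 1], chosen so each bin holds
# --- equal probability mass under N(0, 1); get_4bit_type() exposes it directly.
import bitsandbytes.functional as F

nf4 = F.get_4bit_type("nf4", device="cpu")
print(nf4.numel(), float(nf4.min()), float(nf4.max()))   # 16 -1.0 1.0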
- ''' - data = [-1.0, -0.6961928009986877, -0.5250730514526367, -0.39491748809814453, -0.28444138169288635, - -0.18477343022823334, -0.09105003625154495, 0.0, 0.07958029955625534, 0.16093020141124725, - 0.24611230194568634, 0.33791524171829224, 0.44070982933044434, 0.5626170039176941, - 0.7229568362236023, 1.0] - elif typename == 'fp4': + """ + data = [ + -1.0, + -0.6961928009986877, + -0.5250730514526367, + -0.39491748809814453, + -0.28444138169288635, + -0.18477343022823334, + -0.09105003625154495, + 0.0, + 0.07958029955625534, + 0.16093020141124725, + 0.24611230194568634, + 0.33791524171829224, + 0.44070982933044434, + 0.5626170039176941, + 0.7229568362236023, + 1.0, + ] + elif typename == "fp4": # 0b000 = 0 # 0b001 = 0.0625 # 0b010 = 8 @@ -904,20 +1057,35 @@ def get_4bit_type(typename, device=None, blocksize=64): # 0b111 = 3 # can also be created with bnb.functional.create_fp8_map(signed=True, exponent_bits=2, precision_bits=1, total_bits=4) data = [0, 0.0625, 8.0, 12.0, 4.0, 6.0, 2.0, 3.0, -0, -0.0625, -8.0, -12.0, -4.0, -6.0, -2.0, -3.0] - elif typename == 'int4': + elif typename == "int4": data = [7, 6, 5, 4, 3, 2, 1, 0, -0, -1, -2, -3, -4, -5, -6, -7] - elif typename == 'af4': + elif typename == "af4": # Taken from: NF4 Isn't Information Theoretically Optimal (and that's Good) # https://arxiv.org/abs/2306.06965 if blocksize == 64: - data = [-1., -0.69441008, -0.51243739, -0.3736951, -0.25607552, -0.14982478, - -0.04934812, 0., 0.04273164, 0.12934483, 0.21961274, 0.31675666, - 0.42563882, 0.55496234, 0.72424863, 1.][::-1] + data = [ + -1.0, + -0.69441008, + -0.51243739, + -0.3736951, + -0.25607552, + -0.14982478, + -0.04934812, + 0.0, + 0.04273164, + 0.12934483, + 0.21961274, + 0.31675666, + 0.42563882, + 0.55496234, + 0.72424863, + 1.0, + ][::-1] else: - raise NotImplementedError('4-bit AbnormalFloats currently only support blocksize 64.') + raise NotImplementedError("4-bit AbnormalFloats currently only support blocksize 64.") if data is None: - raise NotImplementedError(f'Typename {typename} not supported') + raise NotImplementedError(f"Typename {typename} not supported") data = Tensor(data) data /= data.abs().max() @@ -926,11 +1094,26 @@ def get_4bit_type(typename, device=None, blocksize=64): return data.to(device) -def quantize_fp4(A: Tensor, absmax: Optional[torch.Tensor] = None, out: Optional[torch.Tensor] = None, blocksize=64, compress_statistics=False, quant_storage=torch.uint8): - return quantize_4bit(A, absmax, out, blocksize, compress_statistics, 'fp4', quant_storage) +def quantize_fp4( + A: Tensor, + absmax: Optional[torch.Tensor] = None, + out: Optional[torch.Tensor] = None, + blocksize=64, + compress_statistics=False, + quant_storage=torch.uint8, +): + return quantize_4bit(A, absmax, out, blocksize, compress_statistics, "fp4", quant_storage) -def quantize_nf4(A: Tensor, absmax: Optional[torch.Tensor] = None, out: Optional[torch.Tensor] = None, blocksize=64, compress_statistics=False, quant_storage=torch.uint8): - return quantize_4bit(A, absmax, out, blocksize, compress_statistics, 'nf4', quant_storage) + +def quantize_nf4( + A: Tensor, + absmax: Optional[torch.Tensor] = None, + out: Optional[torch.Tensor] = None, + blocksize=64, + compress_statistics=False, + quant_storage=torch.uint8, +): + return quantize_4bit(A, absmax, out, blocksize, compress_statistics, "nf4", quant_storage) def quantize_4bit( @@ -939,7 +1122,7 @@ def quantize_4bit( out: Optional[torch.Tensor] = None, blocksize=64, compress_statistics=False, - quant_type='fp4', + quant_type="fp4", 
quant_storage=torch.uint8, ) -> Tuple[Tensor, QuantState]: """ @@ -967,10 +1150,10 @@ def quantize_4bit( tuple(torch.Tensor, torch.Size, torch.dtype, int): The quantization state to undo the quantization. """ - if A.device.type != 'cuda': - raise NotImplementedError(f'Device type not supported for FP4 quantization: {A.device.type}') - if quant_type not in ['fp4', 'nf4']: - raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.') + if A.device.type != "cuda": + raise NotImplementedError(f"Device type not supported for FP4 quantization: {A.device.type}") + if quant_type not in ["fp4", "nf4"]: + raise NotImplementedError(f"4-bit quantization data type {quant_type} is not implemented.") n = A.numel() input_shape = A.shape @@ -980,10 +1163,9 @@ def quantize_4bit( blocks += 1 if n % blocksize > 0 else 0 absmax = torch.zeros((blocks,), device=A.device, dtype=torch.float32) - if out is None: mod = dtype2bytes[quant_storage] * 2 - out = torch.zeros(((n+1)//mod, 1), dtype=quant_storage, device=A.device) + out = torch.zeros(((n + 1) // mod, 1), dtype=quant_storage, device=A.device) assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64] @@ -991,20 +1173,62 @@ def quantize_4bit( is_on_gpu([A, out, absmax]) if A.dtype == torch.float32: - if quant_type == 'fp4': - lib.cquantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n)) + if quant_type == "fp4": + lib.cquantize_blockwise_fp32_fp4( + get_ptr(None), + get_ptr(A), + get_ptr(absmax), + get_ptr(out), + ct.c_int32(blocksize), + ct.c_int(n), + ) else: - lib.cquantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n)) + lib.cquantize_blockwise_fp32_nf4( + get_ptr(None), + get_ptr(A), + get_ptr(absmax), + get_ptr(out), + ct.c_int32(blocksize), + ct.c_int(n), + ) elif A.dtype == torch.float16: - if quant_type == 'fp4': - lib.cquantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n)) + if quant_type == "fp4": + lib.cquantize_blockwise_fp16_fp4( + get_ptr(None), + get_ptr(A), + get_ptr(absmax), + get_ptr(out), + ct.c_int32(blocksize), + ct.c_int(n), + ) else: - lib.cquantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n)) + lib.cquantize_blockwise_fp16_nf4( + get_ptr(None), + get_ptr(A), + get_ptr(absmax), + get_ptr(out), + ct.c_int32(blocksize), + ct.c_int(n), + ) elif A.dtype == torch.bfloat16: - if quant_type == 'fp4': - lib.cquantize_blockwise_bf16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n)) + if quant_type == "fp4": + lib.cquantize_blockwise_bf16_fp4( + get_ptr(None), + get_ptr(A), + get_ptr(absmax), + get_ptr(out), + ct.c_int32(blocksize), + ct.c_int(n), + ) else: - lib.cquantize_blockwise_bf16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n)) + lib.cquantize_blockwise_bf16_nf4( + get_ptr(None), + get_ptr(A), + get_ptr(absmax), + get_ptr(out), + ct.c_int32(blocksize), + ct.c_int(n), + ) else: raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}") post_call(A.device) @@ -1016,19 +1240,57 @@ def quantize_4bit( absmax -= offset qabsmax, state2 = quantize_blockwise(absmax, blocksize=256) del absmax - state = QuantState(absmax=qabsmax, shape=input_shape, dtype=A.dtype, blocksize=blocksize, code=code, quant_type=quant_type, 
offset=offset, state2=state2) + state = QuantState( + absmax=qabsmax, + shape=input_shape, + dtype=A.dtype, + blocksize=blocksize, + code=code, + quant_type=quant_type, + offset=offset, + state2=state2, + ) else: - state = QuantState(absmax=absmax, shape=input_shape, dtype=A.dtype, blocksize=blocksize, code=code, quant_type=quant_type, ) + state = QuantState( + absmax=absmax, + shape=input_shape, + dtype=A.dtype, + blocksize=blocksize, + code=code, + quant_type=quant_type, + ) return out, state -def dequantize_fp4(A: Tensor, quant_state: Optional[QuantState] = None, absmax: Optional[torch.Tensor] = None, out: Optional[torch.Tensor] = None, blocksize: int = 64) -> Tensor: - return dequantize_4bit(A, quant_state, absmax, out, blocksize, 'fp4') -def dequantize_nf4(A: Tensor, quant_state: Optional[QuantState] = None, absmax: Optional[torch.Tensor] = None, out: Optional[torch.Tensor] = None, blocksize: int = 64) -> Tensor: - return dequantize_4bit(A, quant_state, absmax, out, blocksize, 'nf4') +def dequantize_fp4( + A: Tensor, + quant_state: Optional[QuantState] = None, + absmax: Optional[torch.Tensor] = None, + out: Optional[torch.Tensor] = None, + blocksize: int = 64, +) -> Tensor: + return dequantize_4bit(A, quant_state, absmax, out, blocksize, "fp4") -def dequantize_4bit(A: Tensor, quant_state: Optional[QuantState] = None, absmax: Optional[torch.Tensor] = None, out: Optional[torch.Tensor] = None, blocksize: int = 64, quant_type='fp4') -> Tensor: + +def dequantize_nf4( + A: Tensor, + quant_state: Optional[QuantState] = None, + absmax: Optional[torch.Tensor] = None, + out: Optional[torch.Tensor] = None, + blocksize: int = 64, +) -> Tensor: + return dequantize_4bit(A, quant_state, absmax, out, blocksize, "nf4") + + +def dequantize_4bit( + A: Tensor, + quant_state: Optional[QuantState] = None, + absmax: Optional[torch.Tensor] = None, + out: Optional[torch.Tensor] = None, + blocksize: int = 64, + quant_type="fp4", +) -> Tensor: """ Dequantizes FP4 blockwise quantized values. @@ -1056,23 +1318,31 @@ def dequantize_4bit(A: Tensor, quant_state: Optional[QuantState] = None, absmax: Dequantized tensor. """ if blocksize not in [2048, 4096, 1024, 512, 256, 128, 64]: - raise ValueError(f"The blockwise of {blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128, 64]") - if quant_type not in ['fp4', 'nf4']: - raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.') + raise ValueError( + f"The blockwise of {blocksize} is not supported. 
Supported values: [2048, 4096, 1024, 512, 256, 128, 64]", + ) + if quant_type not in ["fp4", "nf4"]: + raise NotImplementedError(f"4-bit quantization data type {quant_type} is not implemented.") if quant_state is None: assert absmax is not None and out is not None - quant_state = QuantState(absmax=absmax, shape=out.shape, dtype=out.dtype, blocksize=blocksize, quant_type=quant_type) + quant_state = QuantState( + absmax=absmax, + shape=out.shape, + dtype=out.dtype, + blocksize=blocksize, + quant_type=quant_type, + ) else: absmax = quant_state.absmax - if quant_state.nested: absmax = dequantize_blockwise(quant_state.absmax, quant_state.state2) absmax += quant_state.offset - if absmax.dtype != torch.float32: absmax = absmax.float() + if absmax.dtype != torch.float32: + absmax = absmax.float() if out is None: out = torch.empty(quant_state.shape, dtype=quant_state.dtype, device=A.device) @@ -1082,27 +1352,71 @@ def dequantize_4bit(A: Tensor, quant_state: Optional[QuantState] = None, absmax: device = pre_call(A.device) is_on_gpu([A, absmax, out]) if out.dtype == torch.float32: - if quant_state.quant_type == 'fp4': - lib.cdequantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n)) + if quant_state.quant_type == "fp4": + lib.cdequantize_blockwise_fp32_fp4( + get_ptr(None), + get_ptr(A), + get_ptr(absmax), + get_ptr(out), + ct.c_int(quant_state.blocksize), + ct.c_int(n), + ) else: - lib.cdequantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n)) + lib.cdequantize_blockwise_fp32_nf4( + get_ptr(None), + get_ptr(A), + get_ptr(absmax), + get_ptr(out), + ct.c_int(quant_state.blocksize), + ct.c_int(n), + ) elif out.dtype == torch.float16: - if quant_state.quant_type == 'fp4': - lib.cdequantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n)) + if quant_state.quant_type == "fp4": + lib.cdequantize_blockwise_fp16_fp4( + get_ptr(None), + get_ptr(A), + get_ptr(absmax), + get_ptr(out), + ct.c_int(quant_state.blocksize), + ct.c_int(n), + ) else: - lib.cdequantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n)) + lib.cdequantize_blockwise_fp16_nf4( + get_ptr(None), + get_ptr(A), + get_ptr(absmax), + get_ptr(out), + ct.c_int(quant_state.blocksize), + ct.c_int(n), + ) elif out.dtype == torch.bfloat16: - if quant_state.quant_type == 'fp4': - lib.cdequantize_blockwise_bf16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n)) + if quant_state.quant_type == "fp4": + lib.cdequantize_blockwise_bf16_fp4( + get_ptr(None), + get_ptr(A), + get_ptr(absmax), + get_ptr(out), + ct.c_int(quant_state.blocksize), + ct.c_int(n), + ) else: - lib.cdequantize_blockwise_bf16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(quant_state.blocksize), ct.c_int(n)) + lib.cdequantize_blockwise_bf16_nf4( + get_ptr(None), + get_ptr(A), + get_ptr(absmax), + get_ptr(out), + ct.c_int(quant_state.blocksize), + ct.c_int(n), + ) else: raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}") post_call(A.device) - is_transposed = (True if A.shape[0] == 1 else False) - if is_transposed: return out.t() - else: return out + is_transposed = True if A.shape[0] == 1 else False + if is_transposed: + return out.t() + else: + return out def quantize( @@ 
-1117,7 +1431,8 @@ def quantize( code = code.to(A.device) absmax = torch.abs(A).max() - if absmax.dtype != torch.float32: absmax = absmax.float() + if absmax.dtype != torch.float32: + absmax = absmax.float() inp = A / absmax out = quantize_no_absmax(inp, code, out) return out, (absmax, code) @@ -1144,7 +1459,7 @@ def dequantize( def quantize_no_absmax(A: Tensor, code: Tensor, out: Optional[torch.Tensor] = None) -> Tensor: - ''' + """ Quantizes input tensor to 8-bit. Quantizes the 32-bit input tensor `A` to the 8-bit output tensor @@ -1163,9 +1478,10 @@ def quantize_no_absmax(A: Tensor, code: Tensor, out: Optional[torch.Tensor] = No ------- torch.Tensor: Quantized 8-bit tensor. - ''' + """ prev_device = pre_call(A.device) - if out is None: out = torch.zeros_like(A, dtype=torch.uint8) + if out is None: + out = torch.zeros_like(A, dtype=torch.uint8) is_on_gpu([A, out]) lib.cquantize(get_ptr(code), get_ptr(A), get_ptr(out), ct.c_int(A.numel())) post_call(prev_device) @@ -1173,7 +1489,7 @@ def quantize_no_absmax(A: Tensor, code: Tensor, out: Optional[torch.Tensor] = No def dequantize_no_absmax(A: Tensor, code: Tensor, out: Optional[torch.Tensor] = None) -> Tensor: - ''' + """ Dequantizes the 8-bit tensor to 32-bit. Dequantizes the 8-bit tensor `A` to the 32-bit tensor `out` via @@ -1192,9 +1508,10 @@ def dequantize_no_absmax(A: Tensor, code: Tensor, out: Optional[torch.Tensor] = ------- torch.Tensor: 32-bit output tensor. - ''' + """ prev_device = pre_call(A.device) - if out is None: out = torch.zeros_like(A, dtype=torch.float32) + if out is None: + out = torch.zeros_like(A, dtype=torch.float32) is_on_gpu([code, A, out]) lib.cdequantize(get_ptr(code), get_ptr(A), get_ptr(out), ct.c_int(A.numel())) post_call(prev_device) @@ -1261,16 +1578,17 @@ def optimizer_update_32bit( if max_unorm > 0.0: param_norm = torch.norm(p.data.float()) - optim_func = None if g.dtype == torch.float32: optim_func = str2optimizer32bit[optimizer_name][0] elif g.dtype == torch.float16: optim_func = str2optimizer32bit[optimizer_name][1] - elif (g.dtype == torch.bfloat16 and len(str2optimizer32bit[optimizer_name])==3): + elif g.dtype == torch.bfloat16 and len(str2optimizer32bit[optimizer_name]) == 3: optim_func = str2optimizer32bit[optimizer_name][2] else: - raise ValueError(f"Gradient+optimizer bit data type combination not supported: grad {g.dtype}, optimizer {state1.dtype}") + raise ValueError( + f"Gradient+optimizer bit data type combination not supported: grad {g.dtype}, optimizer {state1.dtype}", + ) is_on_gpu([g, p, state1, state2, unorm_vec]) prev_device = pre_call(g.device) @@ -1290,7 +1608,8 @@ def optimizer_update_32bit( ct.c_float(lr), ct.c_float(gnorm_scale), ct.c_bool(skip_zeros), - ct.c_int32(g.numel())) + ct.c_int32(g.numel()), + ) post_call(prev_device) @@ -1422,7 +1741,7 @@ def optimizer_update_8bit( ) else: raise ValueError( - f"Gradient+optimizer bit data type combination not supported: grad {g.dtype}, optimizer {state1.dtype}" + f"Gradient+optimizer bit data type combination not supported: grad {g.dtype}, optimizer {state1.dtype}", ) post_call(prev_device) @@ -1446,7 +1765,6 @@ def optimizer_update_8bit_blockwise( gnorm_scale: float = 1.0, skip_zeros=False, ) -> None: - optim_func = None prev_device = pre_call(g.device) is_on_gpu([g, p, state1, state2, qmap1, qmap2, absmax1, absmax2]) @@ -1454,12 +1772,15 @@ def optimizer_update_8bit_blockwise( optim_func = str2optimizer8bit_blockwise[optimizer_name][0] elif g.dtype == torch.float16 and state1.dtype == torch.uint8: optim_func = 
str2optimizer8bit_blockwise[optimizer_name][1] - elif (g.dtype == torch.bfloat16 and state1.dtype == torch.uint8 and - len(str2optimizer8bit_blockwise[optimizer_name])==3): + elif ( + g.dtype == torch.bfloat16 + and state1.dtype == torch.uint8 + and len(str2optimizer8bit_blockwise[optimizer_name]) == 3 + ): optim_func = str2optimizer8bit_blockwise[optimizer_name][2] else: raise ValueError( - f"Gradient+optimizer bit data type combination not supported: grad {g.dtype}, optimizer {state1.dtype}" + f"Gradient+optimizer bit data type combination not supported: grad {g.dtype}, optimizer {state1.dtype}", ) post_call(prev_device) @@ -1487,9 +1808,8 @@ def optimizer_update_8bit_blockwise( ) post_call(prev_device) -def percentile_clipping( - grad: Tensor, gnorm_vec: Tensor, step: int, percentile: int = 5 -): + +def percentile_clipping(grad: Tensor, gnorm_vec: Tensor, step: int, percentile: int = 5): """Applies percentile clipping grad: torch.Tensor @@ -1531,9 +1851,7 @@ def percentile_clipping( return current_gnorm, clip_value, gnorm_scale -def histogram_scatter_add_2d( - histogram: Tensor, index1: Tensor, index2: Tensor, source: Tensor -): +def histogram_scatter_add_2d(histogram: Tensor, index1: Tensor, index2: Tensor, source: Tensor): assert len(histogram.shape) == 2 assert histogram.dtype == torch.float32 assert source.dtype == torch.float32 @@ -1550,12 +1868,12 @@ def histogram_scatter_add_2d( is_on_gpu([histogram, index1, index2, source]) lib.chistogram_scatter_add_2d(get_ptr(histogram), get_ptr(index1), get_ptr(index2), get_ptr(source), maxdim1, n) + def check_matmul(A, B, out, transposed_A, transposed_B, expected_type=torch.int8): - if not torch.cuda.is_initialized(): torch.cuda.init() + if not torch.cuda.is_initialized(): + torch.cuda.init() if A.dtype != expected_type or B.dtype != expected_type: - raise TypeError( - f"Expected torch.int8 input tensors A and B, but got {A.dtype} and {B.dtype}" - ) + raise TypeError(f"Expected torch.int8 input tensors A and B, but got {A.dtype} and {B.dtype}") sA = A.shape sB = B.shape @@ -1596,12 +1914,7 @@ def check_matmul(A, B, out, transposed_A, transposed_B, expected_type=torch.int8 sout = out.shape # special case common in backprop if not correct and len(sA) == 3 and len(sB) == 3: - if ( - sout[0] == sA[2] - and sout[1] == sB[2] - and sA[0] == sB[0] - and sA[1] == sB[1] - ): + if sout[0] == sA[2] and sout[1] == sB[2] and sA[0] == sB[0] and sA[1] == sB[1]: correct = True else: if len(sA) == 2 and len(sB) == 2: @@ -1634,26 +1947,29 @@ def check_matmul(A, B, out, transposed_A, transposed_B, expected_type=torch.int8 if not correct: raise ValueError( - f"Tensor dimensions incorrect for matrix mulitiplication: A x B: {sA} x {sB} with transpose for A x B: {tA} x {tB}." + f"Tensor dimensions incorrect for matrix mulitiplication: A x B: {sA} x {sB} with transpose for A x B: {tA} x {tB}.", ) return sout + def gemv_4bit( A: Tensor, B: Tensor, out: Optional[torch.Tensor] = None, transposed_A=False, transposed_B=False, - state=None + state=None, ): prev_device = pre_call(A.device) - #sout = check_matmul(A, B, out, transposed_A, transposed_B, expected_type=A.dtype) + # sout = check_matmul(A, B, out, transposed_A, transposed_B, expected_type=A.dtype) if state is None: - raise ValueError('state cannot None. gem_4bit( ) requires the state from quantize_4bit( )') + raise ValueError("state cannot None. gem_4bit( ) requires the state from quantize_4bit( )") if A.numel() != A.shape[-1]: - raise ValueError('Dimensions of A are invalid. 
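# --- Illustrative aside (not part of the patch): gemv_4bit above only accepts
# --- "single token" activations (A must be a vector such as [1, 1, k]); matmul_4bit
# --- routes such inputs to the fused kernel and falls back to dequantize + matmul for
# --- batched inputs. A minimal sketch; assumes a CUDA device and a hidden size that is
# --- a multiple of the 4-bit blocksize (64).
import torch
import bitsandbytes as bnb
import bitsandbytes.functional as F

W = torch.randn(256, 128, dtype=torch.float16, device="cuda")
W4, qstate = F.quantize_4bit(W, quant_type="nf4")

x1 = torch.randn(1, 1, 128, dtype=torch.float16, device="cuda")   # inference-shaped: fused gemv path
x8 = torch.randn(1, 8, 128, dtype=torch.float16, device="cuda")   # batched: MatMul4Bit fallback

y1 = bnb.matmul_4bit(x1, W4.t(), quant_state=qstate)
y8 = bnb.matmul_4bit(x8, W4.t(), quant_state=qstate)
print(y1.shape, y8.shape)   # torch.Size([1, 1, 256]) torch.Size([1, 8, 256])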
Must be a vector with the leading dimensions of "1", e.g. [1, 1, 2048]') + raise ValueError( + 'Dimensions of A are invalid. Must be a vector with the leading dimensions of "1", e.g. [1, 1, 2048]', + ) Bshape = state.shape bout = Bshape[0] @@ -1673,7 +1989,7 @@ def gemv_4bit( k = Bshape[1] lda = Bshape[0] ldc = Bshape[0] - ldb = (A.shape[-1]+1)//2 + ldb = (A.shape[-1] + 1) // 2 is_on_gpu([B, A, out, absmax, state.code]) m = ct.c_int32(m) n = ct.c_int32(n) @@ -1684,21 +2000,61 @@ def gemv_4bit( if B.dtype in [torch.uint8, torch.bfloat16, torch.float16, torch.float32]: if A.dtype == torch.float16: - lib.cgemm_4bit_inference_naive_fp16(m, n, k, get_ptr(A), get_ptr(B), get_ptr(absmax), get_ptr(state.code), get_ptr(out), lda, ldb, ldc, ct.c_int32(state.blocksize)) + lib.cgemm_4bit_inference_naive_fp16( + m, + n, + k, + get_ptr(A), + get_ptr(B), + get_ptr(absmax), + get_ptr(state.code), + get_ptr(out), + lda, + ldb, + ldc, + ct.c_int32(state.blocksize), + ) elif A.dtype == torch.bfloat16: - lib.cgemm_4bit_inference_naive_bf16(m, n, k, get_ptr(A), get_ptr(B), get_ptr(absmax), get_ptr(state.code), get_ptr(out), lda, ldb, ldc, ct.c_int32(state.blocksize)) + lib.cgemm_4bit_inference_naive_bf16( + m, + n, + k, + get_ptr(A), + get_ptr(B), + get_ptr(absmax), + get_ptr(state.code), + get_ptr(out), + lda, + ldb, + ldc, + ct.c_int32(state.blocksize), + ) elif A.dtype == torch.float32: - lib.cgemm_4bit_inference_naive_fp32(m, n, k, get_ptr(A), get_ptr(B), get_ptr(absmax), get_ptr(state.code), get_ptr(out), lda, ldb, ldc, ct.c_int32(state.blocksize)) + lib.cgemm_4bit_inference_naive_fp32( + m, + n, + k, + get_ptr(A), + get_ptr(B), + get_ptr(absmax), + get_ptr(state.code), + get_ptr(out), + lda, + ldb, + ldc, + ct.c_int32(state.blocksize), + ) else: - raise NotImplementedError(f'Matmul not implemented for data type {A.dtype}') + raise NotImplementedError(f"Matmul not implemented for data type {A.dtype}") else: - raise NotImplementedError(f'Matmul not implemented for data type {A.dtype}') + raise NotImplementedError(f"Matmul not implemented for data type {A.dtype}") post_call(prev_device) return out + def igemm( A: Tensor, B: Tensor, @@ -1764,7 +2120,7 @@ def igemm( assert len(sA) == 3 if not (sA[0] == sB[0] and sA[1] == sB[1]): raise ValueError( - f"Only bsi,bso->io supported for tensor contractions, but dims for A x B were: {sA} x {sB}" + f"Only bsi,bso->io supported for tensor contractions, but dims for A x B were: {sA} x {sB}", ) transposed_A = True @@ -1783,8 +2139,20 @@ def igemm( # B^T @ A^T = C^T # [km, nk -> mn] is_on_gpu([B, A, out]) - lib.cigemm(ptr, ct.c_bool(transposed_B), ct.c_bool(transposed_A), ct.c_int32(m), ct.c_int32(n), ct.c_int32(k), - get_ptr(B), get_ptr(A), get_ptr(out), ct.c_int32(lda), ct.c_int32(ldb), ct.c_int32(ldc)) + lib.cigemm( + ptr, + ct.c_bool(transposed_B), + ct.c_bool(transposed_A), + ct.c_int32(m), + ct.c_int32(n), + ct.c_int32(k), + get_ptr(B), + get_ptr(A), + get_ptr(out), + ct.c_int32(lda), + ct.c_int32(ldb), + ct.c_int32(ldc), + ) return out @@ -1796,9 +2164,7 @@ def batched_igemm( transposed_B=False, ): if not len(A.shape) == 3 or not len(B.shape) == 3: - raise ValueError( - f"Expected 3-dimensional tensors for bmm, but got shapes A and B: {A.shape} and {B.shape}" - ) + raise ValueError(f"Expected 3-dimensional tensors for bmm, but got shapes A and B: {A.shape} and {B.shape}") sout = check_matmul(A, B, out, transposed_A, transposed_B) if out is None: out = torch.zeros(size=sout, dtype=torch.int32, device=A.device) @@ -1865,9 +2231,24 @@ def batched_igemm( ptr = 
CUBLAS_Context.get_instance().get_context(A.device) is_on_gpu([B, A, out]) - lib.cbatched_igemm(ptr, ct.c_bool(transposed_B), ct.c_bool(transposed_A), ct.c_int32(m), ct.c_int32(n), ct.c_int32(k), - get_ptr(B), get_ptr(A), get_ptr(out), ct.c_int32(lda), ct.c_int32(ldb), ct.c_int32(ldc), - ct.c_long(strideA), ct.c_long(strideB), ct.c_long(strideC), ct.c_uint32(num_batch)) + lib.cbatched_igemm( + ptr, + ct.c_bool(transposed_B), + ct.c_bool(transposed_A), + ct.c_int32(m), + ct.c_int32(n), + ct.c_int32(k), + get_ptr(B), + get_ptr(A), + get_ptr(out), + ct.c_int32(lda), + ct.c_int32(ldb), + ct.c_int32(ldc), + ct.c_long(strideA), + ct.c_long(strideB), + ct.c_long(strideC), + ct.c_uint32(num_batch), + ) return out @@ -1876,14 +2257,14 @@ def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32): shapeB = SB[0] dimsA = len(shapeA) dimsB = len(shapeB) - assert dimsB == 2, 'Only two dimensional matrices are supported for argument B' + assert dimsB == 2, "Only two dimensional matrices are supported for argument B" if dimsA == 2: m = shapeA[0] elif dimsA == 3: m = shapeA[0] * shapeA[1] rows = n = shapeB[0] - assert prod(list(shapeA)) > 0, f'Input tensor dimensions need to be > 0: {shapeA}' + assert prod(list(shapeA)) > 0, f"Input tensor dimensions need to be > 0: {shapeA}" # if the tensor is empty, return a transformed empty tensor with the right dimensions if shapeA[0] == 0 and dimsA == 2: @@ -1892,13 +2273,9 @@ def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32): return torch.empty(tuple(shapeA[:2] + [shapeB[0]]), device=A.device, dtype=torch.float16) if dimsA == 2 and out is None: - out, Sout = get_transform_buffer( - (shapeA[0], shapeB[0]), dtype, A.device, "col32", "row" - ) + out, Sout = get_transform_buffer((shapeA[0], shapeB[0]), dtype, A.device, "col32", "row") elif dimsA == 3 and out is None: - out, Sout = get_transform_buffer( - (shapeA[0], shapeA[1], shapeB[0]), dtype, A.device, "col32", "row" - ) + out, Sout = get_transform_buffer((shapeA[0], shapeA[1], shapeB[0]), dtype, A.device, "col32", "row") assert dimsB != 3, "len(B.shape)==3 not supported" assert A.device.type == "cuda" @@ -1940,49 +2317,33 @@ def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32): has_error = 0 ptrRowScale = get_ptr(None) is_on_gpu([A, B, out]) - if formatB == 'col_turing': + if formatB == "col_turing": if dtype == torch.int32: - has_error = lib.cigemmlt_turing_32( - ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc - ) + has_error = lib.cigemmlt_turing_32(ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc) else: - has_error = lib.cigemmlt_turing_8( - ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc - ) + has_error = lib.cigemmlt_turing_8(ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc) elif formatB == "col_ampere": if dtype == torch.int32: - has_error = lib.cigemmlt_ampere_32( - ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc - ) + has_error = lib.cigemmlt_ampere_32(ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc) else: - has_error = lib.cigemmlt_ampere_8( - ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc - ) + has_error = lib.cigemmlt_ampere_8(ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc) if has_error == 100: # `ERR_NOT_IMPLEMENTED` is defined as 100 in `ops.cu` raise NotImplementedError("igemmlt not available (probably built with NO_CUBLASLT)") if has_error: - print(f'A: {shapeA}, B: {shapeB}, C: {Sout[0]}; (lda, ldb, ldc): {(lda, ldb, ldc)}; (m, n, k): {(m, n, k)}') - raise Exception('cublasLt 
ran into an error!') + print(f"A: {shapeA}, B: {shapeB}, C: {Sout[0]}; (lda, ldb, ldc): {(lda, ldb, ldc)}; (m, n, k): {(m, n, k)}") + raise Exception("cublasLt ran into an error!") torch.cuda.set_device(prev_device) return out, Sout -def mm_dequant( - A, - quant_state, - row_stats, - col_stats, - out=None, - new_row_stats=None, - new_col_stats=None, - bias=None -): +def mm_dequant(A, quant_state, row_stats, col_stats, out=None, new_row_stats=None, new_col_stats=None, bias=None): assert A.dtype == torch.int32 - if bias is not None: assert bias.dtype == torch.float16 + if bias is not None: + assert bias.dtype == torch.float16 out_shape = quant_state[0] if len(out_shape) == 3: out_shape = (out_shape[0] * out_shape[1], out_shape[2]) @@ -1990,19 +2351,11 @@ def mm_dequant( if out is None: out = torch.empty(out_shape, dtype=torch.float16, device=A.device) if new_row_stats is None: - new_row_stats = torch.empty( - out_shape[0], dtype=torch.float32, device=A.device - ) + new_row_stats = torch.empty(out_shape[0], dtype=torch.float32, device=A.device) if new_col_stats is None: - new_col_stats = torch.empty( - out_shape[1], dtype=torch.float32, device=A.device - ) - assert ( - new_row_stats.shape[0] == row_stats.shape[0] - ), f"{new_row_stats.shape} vs {row_stats.shape}" - assert ( - new_col_stats.shape[0] == col_stats.shape[0] - ), f"{new_col_stats.shape} vs {col_stats.shape}" + new_col_stats = torch.empty(out_shape[1], dtype=torch.float32, device=A.device) + assert new_row_stats.shape[0] == row_stats.shape[0], f"{new_row_stats.shape} vs {row_stats.shape}" + assert new_col_stats.shape[0] == col_stats.shape[0], f"{new_col_stats.shape} vs {col_stats.shape}" prev_device = pre_call(A.device) ptrA = get_ptr(A) @@ -2016,15 +2369,23 @@ def mm_dequant( numCols = ct.c_int32(out_shape[1]) is_on_gpu([A, row_stats, col_stats, out, new_row_stats, new_col_stats, bias]) - lib.cdequant_mm_int32_fp16(ptrA, ptrRowStats, ptrColStats, ptrOut, ptrNewRowStats, ptrNewColStats, ptrBias, numRows, numCols) + lib.cdequant_mm_int32_fp16( + ptrA, + ptrRowStats, + ptrColStats, + ptrOut, + ptrNewRowStats, + ptrNewColStats, + ptrBias, + numRows, + numCols, + ) post_call(prev_device) return out -def get_colrow_absmax( - A, row_stats=None, col_stats=None, nnz_block_ptr=None, threshold=0.0 -): +def get_colrow_absmax(A, row_stats=None, col_stats=None, nnz_block_ptr=None, threshold=0.0): assert A.dtype == torch.float16 device = A.device @@ -2037,18 +2398,12 @@ def get_colrow_absmax( col_tiles = (cols + 255) // 256 tiled_rows = ((rows + 15) // 16) * 16 if row_stats is None: - row_stats = torch.empty( - (rows,), dtype=torch.float32, device=device - ).fill_(-50000.0) + row_stats = torch.empty((rows,), dtype=torch.float32, device=device).fill_(-50000.0) if col_stats is None: - col_stats = torch.empty( - (cols,), dtype=torch.float32, device=device - ).fill_(-50000.0) + col_stats = torch.empty((cols,), dtype=torch.float32, device=device).fill_(-50000.0) if nnz_block_ptr is None and threshold > 0.0: - nnz_block_ptr = torch.zeros( - ((tiled_rows * col_tiles) + 1,), dtype=torch.int32, device=device - ) + nnz_block_ptr = torch.zeros(((tiled_rows * col_tiles) + 1,), dtype=torch.int32, device=device) ptrA = get_ptr(A) ptrRowStats = get_ptr(row_stats) @@ -2122,14 +2477,10 @@ def __init__(self, rows, cols, nnz, colptr, rowidx, values): def coo2csr(cooA): values, counts = torch.unique(cooA.rowidx, return_counts=True) values.add_(1) - rowptr = torch.zeros( - (cooA.rows + 1,), dtype=torch.int32, device=cooA.rowidx.device - ) + rowptr = 
torch.zeros((cooA.rows + 1,), dtype=torch.int32, device=cooA.rowidx.device) rowptr.scatter_(index=values.long(), src=counts.int(), dim=0) rowptr.cumsum_(0) - return CSRSparseTensor( - cooA.rows, cooA.cols, cooA.nnz, rowptr, cooA.colidx, cooA.values - ) + return CSRSparseTensor(cooA.rows, cooA.cols, cooA.nnz, rowptr, cooA.colidx, cooA.values) def coo2csc(cooA): @@ -2138,14 +2489,10 @@ def coo2csc(cooA): values = cooA.values[col2rowidx] colvalues, counts = torch.unique(val, return_counts=True) colvalues.add_(1) - colptr = torch.zeros( - (cooA.cols + 1,), dtype=torch.int32, device=cooA.colidx.device - ) + colptr = torch.zeros((cooA.cols + 1,), dtype=torch.int32, device=cooA.colidx.device) colptr.scatter_(index=colvalues.long(), src=counts.int(), dim=0) colptr.cumsum_(0) - return CSCSparseTensor( - cooA.rows, cooA.cols, cooA.nnz, colptr, rowidx, values - ) + return CSCSparseTensor(cooA.rows, cooA.cols, cooA.nnz, colptr, rowidx, values) def coo_zeros(rows, cols, nnz, device, dtype=torch.half): @@ -2155,9 +2502,7 @@ def coo_zeros(rows, cols, nnz, device, dtype=torch.half): return COOSparseTensor(rows, cols, nnz, rowidx, colidx, values) -def double_quant( - A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0 -): +def double_quant(A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0): device = A.device assert A.dtype == torch.half assert device.type == "cuda" @@ -2170,9 +2515,7 @@ def double_quant( rows = A.shape[0] if row_stats is None or col_stats is None: - row_stats, col_stats, nnz_row_ptr = get_colrow_absmax( - A, threshold=threshold - ) + row_stats, col_stats, nnz_row_ptr = get_colrow_absmax(A, threshold=threshold) if out_col is None: out_col = torch.zeros(A.shape, device=device, dtype=torch.int8) @@ -2190,9 +2533,7 @@ def double_quant( if threshold > 0.0: nnz = nnz_row_ptr[-1].item() if nnz > 0: - coo_tensor = coo_zeros( - A.shape[0], A.shape[1], nnz_row_ptr[-1].item(), device - ) + coo_tensor = coo_zeros(A.shape[0], A.shape[1], nnz_row_ptr[-1].item(), device) ptrRowIdx = get_ptr(coo_tensor.rowidx) ptrColIdx = get_ptr(coo_tensor.colidx) ptrVal = get_ptr(coo_tensor.values) @@ -2251,12 +2592,16 @@ def double_quant( return out_row, out_col, row_stats, col_stats, coo_tensor -def transform(A, to_order, from_order='row', out=None, transpose=False, state=None, ld=None): +def transform(A, to_order, from_order="row", out=None, transpose=False, state=None, ld=None): prev_device = pre_call(A.device) - if state is None: state = (A.shape, from_order) - else: from_order = state[1] - if out is None: out, new_state = get_transform_buffer(state[0], A.dtype, A.device, to_order, state[1], transpose) - else: new_state = (state[0], to_order) # (shape, order) + if state is None: + state = (A.shape, from_order) + else: + from_order = state[1] + if out is None: + out, new_state = get_transform_buffer(state[0], A.dtype, A.device, to_order, state[1], transpose) + else: + new_state = (state[0], to_order) # (shape, order) shape = state[0] if len(shape) == 2: @@ -2267,7 +2612,7 @@ def transform(A, to_order, from_order='row', out=None, transpose=False, state=No dim2 = ct.c_int32(shape[2]) is_on_gpu([A, out]) - if to_order == 'col32': + if to_order == "col32": if transpose: lib.ctransform_row2col32T(get_ptr(A), get_ptr(out), dim1, dim2) else: @@ -2288,7 +2633,7 @@ def transform(A, to_order, from_order='row', out=None, transpose=False, state=No elif from_order == "col_ampere": lib.ctransform_ampere2row(get_ptr(A), get_ptr(out), dim1, dim2) else: - raise 
NotImplementedError(f'Transform function not implemented: From {from_order} to {to_order}') + raise NotImplementedError(f"Transform function not implemented: From {from_order} to {to_order}") post_call(prev_device) @@ -2297,9 +2642,7 @@ def transform(A, to_order, from_order='row', out=None, transpose=False, state=No def spmm_coo(cooA, B, out=None): if out is None: - out = torch.empty( - (cooA.rows, B.shape[1]), device=B.device, dtype=B.dtype - ) + out = torch.empty((cooA.rows, B.shape[1]), device=B.device, dtype=B.dtype) nnz = cooA.nnz assert cooA.rowidx.numel() == nnz assert cooA.colidx.numel() == nnz @@ -2326,16 +2669,28 @@ def spmm_coo(cooA, B, out=None): cldc = ct.c_int32(ldc) is_on_gpu([cooA.rowidx, cooA.colidx, cooA.values, B, out]) - lib.cspmm_coo(ptr, ptrRowidx, ptrColidx, ptrValues, cnnz, crowsA, ccolsA, ccolsB, cldb, ptrB, cldc, ptrC, ct.c_bool(transposed_B)) + lib.cspmm_coo( + ptr, + ptrRowidx, + ptrColidx, + ptrValues, + cnnz, + crowsA, + ccolsA, + ccolsB, + cldb, + ptrB, + cldc, + ptrC, + ct.c_bool(transposed_B), + ) return out def spmm_coo_very_sparse(cooA, B, dequant_stats=None, out=None): if out is None: - out = torch.zeros( - (cooA.rows, B.shape[1]), device=B.device, dtype=cooA.values.dtype - ) + out = torch.zeros((cooA.rows, B.shape[1]), device=B.device, dtype=cooA.values.dtype) nnz = cooA.nnz prev_device = pre_call(B.device) assert cooA.rowidx.numel() == nnz @@ -2353,9 +2708,7 @@ def spmm_coo_very_sparse(cooA, B, dequant_stats=None, out=None): max_count, max_idx = torch.sort(counts, descending=True) max_idx = max_idx.int() max_count = max_count.int() - assert ( - max_count[0] <= 32 - ), f"Current max count per row is 8 but found {max_count[0]}." + assert max_count[0] <= 32, f"Current max count per row is 8 but found {max_count[0]}." 
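# A minimal sketch of the row-wise/column-wise int8 "double quantization" defined above, assuming
# a CUDA device; the tensor sizes are illustrative. When threshold > 0, the returned COO tensor
# holds the fp16 outliers that the sparse kernels above (spmm_coo / spmm_coo_very_sparse) can
# multiply separately while the int8 matmul handles the rest.
import torch
import bitsandbytes.functional as F

X = torch.randn(16, 64, dtype=torch.float16, device="cuda")
CA, CAt, row_stats, col_stats, coo = F.double_quant(X, threshold=6.0)
# CA is int8, quantized with per-row absmax; CAt is quantized with per-column absmax.
# coo is None unless some |x| exceeds the threshold; otherwise it stores those entries
# in fp16 together with their row/column indices.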
assert B.dtype in [torch.float16, torch.int8] ptrOffset = get_ptr(offset) ptrMaxCount = get_ptr(max_count) @@ -2443,9 +2796,7 @@ def vectorwise_quant(x, dim=1, quant_type="vector"): elif quant_type in ["vector-zeropoint", "row-zeropoint"]: dtype = x.dtype x = x.float() - dyna = torch.amax(x, dim=dim, keepdim=True) - torch.amin( - x, dim=dim, keepdim=True - ) + dyna = torch.amax(x, dim=dim, keepdim=True) - torch.amin(x, dim=dim, keepdim=True) dyna[dyna == 0] = 1 qx = 255.0 / dyna minx = torch.amin(x, dim=dim, keepdim=True) @@ -2553,9 +2904,7 @@ def extract_outliers(A, SA, idx): assert formatA in ["col_turing", "col_ampere"] assert A.device.type == "cuda" - out = torch.zeros( - (shapeA[0], idx.numel()), dtype=torch.int8, device=A.device - ) + out = torch.zeros((shapeA[0], idx.numel()), dtype=torch.int8, device=A.device) idx_size = ct.c_int32(idx.numel()) rows = ct.c_int32(shapeA[0]) @@ -2565,7 +2914,7 @@ def extract_outliers(A, SA, idx): ptrOut = get_ptr(out) prev_device = pre_call(A.device) - if formatA == 'col_turing': + if formatA == "col_turing": lib.cextractOutliers_turing(ptrA, ptrIdx, ptrOut, idx_size, rows, cols) elif formatA == "col_ampere": lib.cextractOutliers_ampere(ptrA, ptrIdx, ptrOut, idx_size, rows, cols) @@ -2573,6 +2922,7 @@ def extract_outliers(A, SA, idx): return out + def pipeline_test(A, batch_size): out = torch.zeros_like(A) lib.cpipeline_test(get_ptr(A), get_ptr(out), ct.c_size_t(A.numel()), ct.c_size_t(batch_size)) diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index f7b96205b..e1cc6600d 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -44,6 +44,7 @@ class StableEmbedding(torch.nn.Embedding): reset_parameters(): Reset embedding parameters using Xavier uniform initialization. forward(input: Tensor) -> Tensor: Forward pass through the stable embedding layer. """ + def __init__( self, num_embeddings: int, @@ -89,9 +90,7 @@ def __init__( dtype, ) self.norm = torch.nn.LayerNorm(embedding_dim, device=device) - GlobalOptimManager.get_instance().register_module_override( - self, "weight", {"optim_bits": 32} - ) + GlobalOptimManager.get_instance().register_module_override(self, "weight", {"optim_bits": 32}) def reset_parameters(self) -> None: torch.nn.init.xavier_uniform_(self.weight) @@ -130,6 +129,7 @@ class Embedding(torch.nn.Embedding): """ Embedding class to store and retrieve word embeddings from their indices. 
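# A short usage sketch for the stable embedding layer above, assuming a CUDA device: it keeps the
# call signature of torch.nn.Embedding, adds a LayerNorm, and registers its weight for 32-bit
# optimizer state via GlobalOptimManager. Vocabulary and hidden sizes are illustrative.
import torch
import bitsandbytes as bnb

emb = bnb.nn.StableEmbedding(10000, 512).cuda()
tokens = torch.randint(0, 10000, (4, 128), device="cuda")
hidden = emb(tokens)  # shape (4, 128, 512), layer-normalized embeddings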
""" + def __init__( self, num_embeddings: int, @@ -170,11 +170,9 @@ def __init__( scale_grad_by_freq, sparse, _weight, - device=device - ) - GlobalOptimManager.get_instance().register_module_override( - self, "weight", {"optim_bits": 32} + device=device, ) + GlobalOptimManager.get_instance().register_module_override(self, "weight", {"optim_bits": 32}) def reset_parameters(self) -> None: torch.nn.init.xavier_uniform_(self.weight) @@ -208,16 +206,16 @@ def forward(self, input: Tensor) -> Tensor: class Params4bit(torch.nn.Parameter): def __new__( - cls, - data: Optional[torch.Tensor] = None, - requires_grad=False, # quantized weights should be frozen by default - quant_state: Optional[QuantState] = None, - blocksize: int = 64, - compress_statistics: bool = True, - quant_type: str = 'fp4', - quant_storage: torch.dtype = torch.uint8, - module: Optional["Linear4bit"] = None, - bnb_quantized: bool = False + cls, + data: Optional[torch.Tensor] = None, + requires_grad=False, # quantized weights should be frozen by default + quant_state: Optional[QuantState] = None, + blocksize: int = 64, + compress_statistics: bool = True, + quant_type: str = "fp4", + quant_storage: torch.dtype = torch.uint8, + module: Optional["Linear4bit"] = None, + bnb_quantized: bool = False, ) -> "Params4bit": if data is None: data = torch.empty(0) @@ -250,7 +248,7 @@ def __setstate__(self, state): self.bnb_quantized = state["bnb_quantized"] self.module = state["module"] - def __deepcopy__(self,memo): + def __deepcopy__(self, memo): new_instance = type(self).__new__(type(self)) state = self.__getstate__() new_instance.__setstate__(state) @@ -265,7 +263,14 @@ def __copy__(self): return new_instance @classmethod - def from_prequantized(cls, data: torch.Tensor, quantized_stats: Dict[str, Any], requires_grad: bool = False, device='cuda', **kwargs) -> "Params4bit": + def from_prequantized( + cls, + data: torch.Tensor, + quantized_stats: Dict[str, Any], + requires_grad: bool = False, + device="cuda", + **kwargs, + ) -> "Params4bit": self = torch.Tensor._make_subclass(cls, data.to(device)) self.requires_grad = requires_grad self.quant_state = QuantState.from_dict(qs_dict=quantized_stats, device=device) @@ -292,33 +297,39 @@ def _quantize(self, device): return self def cuda(self, device: Optional[Union[int, device, str]] = None, non_blocking: bool = False): - return self.to(device='cuda' if device is None else device, non_blocking=non_blocking) + return self.to(device="cuda" if device is None else device, non_blocking=non_blocking) @overload - def to(self: T, device: Optional[Union[int, device]] = ..., dtype: Optional[Union[dtype, str]] = ..., non_blocking: bool = ...,) -> T: - ... + def to( + self: T, + device: Optional[Union[int, device]] = ..., + dtype: Optional[Union[dtype, str]] = ..., + non_blocking: bool = ..., + ) -> T: ... @overload - def to(self: T, dtype: Union[dtype, str], non_blocking: bool = ...) -> T: - ... + def to(self: T, dtype: Union[dtype, str], non_blocking: bool = ...) -> T: ... @overload - def to(self: T, tensor: Tensor, non_blocking: bool = ...) -> T: - ... + def to(self: T, tensor: Tensor, non_blocking: bool = ...) -> T: ... 
def to(self, *args, **kwargs): device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs) - if (device is not None and device.type == "cuda" and not self.bnb_quantized): + if device is not None and device.type == "cuda" and not self.bnb_quantized: return self._quantize(device) else: if self.quant_state is not None: self.quant_state.to(device) - new_param = Params4bit(super().to(device=device, dtype=dtype, non_blocking=non_blocking), - requires_grad=self.requires_grad, quant_state=self.quant_state, - blocksize=self.blocksize, compress_statistics=self.compress_statistics, - quant_type=self.quant_type) + new_param = Params4bit( + super().to(device=device, dtype=dtype, non_blocking=non_blocking), + requires_grad=self.requires_grad, + quant_state=self.quant_state, + blocksize=self.blocksize, + compress_statistics=self.compress_statistics, + quant_type=self.quant_type, + ) return new_param @@ -355,7 +366,18 @@ class Linear4bit(nn.Linear): quantized_model = quantized_model.to(0) # Quantization happens here ``` """ - def __init__(self, input_features, output_features, bias=True, compute_dtype=None, compress_statistics=True, quant_type='fp4', quant_storage=torch.uint8, device=None): + + def __init__( + self, + input_features, + output_features, + bias=True, + compute_dtype=None, + compress_statistics=True, + quant_type="fp4", + quant_storage=torch.uint8, + device=None, + ): """ Initialize Linear4bit class. @@ -368,7 +390,14 @@ def __init__(self, input_features, output_features, bias=True, compute_dtype=Non Whether the linear class uses the bias term as well. """ super().__init__(input_features, output_features, bias, device) - self.weight = Params4bit(self.weight.data, requires_grad=False, compress_statistics=compress_statistics, quant_type=quant_type, quant_storage=quant_storage, module=self) + self.weight = Params4bit( + self.weight.data, + requires_grad=False, + compress_statistics=compress_statistics, + quant_type=quant_type, + quant_storage=quant_storage, + module=self, + ) # self.persistent_buffers = [] # TODO consider as way to save quant state self.compute_dtype = compute_dtype self.compute_type_is_set = False @@ -385,11 +414,15 @@ def set_compute_type(self, x): if self.compute_dtype == torch.float32 and (x.numel() == x.shape[-1]): # single batch inference with input torch.float16 and compute_dtype float32 -> slow inference when it could be fast # warn the user about this - warnings.warn('Input type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). This will lead to slow inference.') - warnings.filterwarnings('ignore', message='.*inference.') + warnings.warn( + "Input type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). This will lead to slow inference.", + ) + warnings.filterwarnings("ignore", message=".*inference.") if self.compute_dtype == torch.float32 and (x.numel() != x.shape[-1]): - warnings.warn('Input type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). This will lead to slow inference or training speed.') - warnings.filterwarnings('ignore', message='.*inference or training') + warnings.warn( + "Input type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). 
This will lead to slow inference or training speed.", + ) + warnings.filterwarnings("ignore", message=".*inference or training") def _save_to_state_dict(self, destination, prefix, keep_vars): """ @@ -407,8 +440,8 @@ def forward(self, x: torch.Tensor): if self.bias is not None and self.bias.dtype != x.dtype: self.bias.data = self.bias.data.to(x.dtype) - if getattr(self.weight, 'quant_state', None) is None: - if getattr(self, 'quant_state', None) is not None: + if getattr(self.weight, "quant_state", None) is None: + if getattr(self, "quant_state", None) is not None: # the quant state got lost when the parameter got converted. This happens for example for fsdp # since we registered the module, we can recover the state here assert self.weight.shape[1] == 1 @@ -416,7 +449,9 @@ def forward(self, x: torch.Tensor): self.weight = Params4bit(self.weight, quant_storage=self.quant_storage) self.weight.quant_state = self.quant_state else: - print('FP4 quantization state not initialized. Please call .cuda() or .to(device) on the LinearFP4 layer first.') + print( + "FP4 quantization state not initialized. Please call .cuda() or .to(device) on the LinearFP4 layer first.", + ) if not self.compute_type_is_set: self.set_compute_type(x) self.compute_type_is_set = True @@ -437,7 +472,17 @@ class LinearFP4(Linear4bit): """ Implements the FP4 data type. """ - def __init__(self, input_features, output_features, bias=True, compute_dtype=None, compress_statistics=True, quant_storage=torch.uint8, device=None): + + def __init__( + self, + input_features, + output_features, + bias=True, + compute_dtype=None, + compress_statistics=True, + quant_storage=torch.uint8, + device=None, + ): """ Args: input_features (`str`): @@ -447,21 +492,40 @@ def __init__(self, input_features, output_features, bias=True, compute_dtype=Non bias (`bool`, defaults to `True`): Whether the linear class uses the bias term as well. """ - super().__init__(input_features, output_features, bias, compute_dtype, compress_statistics, 'fp4', quant_storage, device) + super().__init__( + input_features, + output_features, + bias, + compute_dtype, + compress_statistics, + "fp4", + quant_storage, + device, + ) class LinearNF4(Linear4bit): - ''' Implements the NF4 data type. + """Implements the NF4 data type. + + Constructs a quantization data type where each bin has equal area under a standard normal distribution N(0, 1) that + is normalized into the range [-1, 1]. - Constructs a quantization data type where each bin has equal area under a standard normal distribution N(0, 1) that - is normalized into the range [-1, 1]. + For more information read the paper: QLoRA: Efficient Finetuning of Quantized LLMs (https://arxiv.org/abs/2305.14314) - For more information read the paper: QLoRA: Efficient Finetuning of Quantized LLMs (https://arxiv.org/abs/2305.14314) + Implementation of the NF4 data type in bitsandbytes can be found in the `create_normal_map` function in + the `functional.py` file: https://github.com/TimDettmers/bitsandbytes/blob/main/bitsandbytes/functional.py#L236. + """ - Implementation of the NF4 data type in bitsandbytes can be found in the `create_normal_map` function in - the `functional.py` file: https://github.com/TimDettmers/bitsandbytes/blob/main/bitsandbytes/functional.py#L236. 
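# A hedged usage sketch for the NF4 linear layer described above; it follows the Linear4bit
# pattern (quantization happens on the device move) with illustrative sizes and dtypes.
import torch
import bitsandbytes as bnb

layer = bnb.nn.LinearNF4(768, 3072, compute_dtype=torch.float16)
layer = layer.cuda()                                   # weight is quantized to NF4 here
x = torch.randn(2, 768, dtype=torch.float16, device="cuda")
y = layer(x)                                           # (2, 3072) output in float16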
- ''' - def __init__(self, input_features, output_features, bias=True, compute_dtype=None, compress_statistics=True, quant_storage=torch.uint8, device=None): + def __init__( + self, + input_features, + output_features, + bias=True, + compute_dtype=None, + compress_statistics=True, + quant_storage=torch.uint8, + device=None, + ): """ Args: input_features (`str`): @@ -471,7 +535,16 @@ def __init__(self, input_features, output_features, bias=True, compute_dtype=Non bias (`bool`, defaults to `True`): Whether the linear class uses the bias term as well. """ - super().__init__(input_features, output_features, bias, compute_dtype, compress_statistics, 'nf4', quant_storage, device) + super().__init__( + input_features, + output_features, + bias, + compute_dtype, + compress_statistics, + "nf4", + quant_storage, + device, + ) class Int8Params(torch.nn.Parameter): @@ -514,33 +587,22 @@ def to( device: Optional[Union[int, device]] = ..., dtype: Optional[Union[dtype, str]] = ..., non_blocking: bool = ..., - ) -> T: - ... + ) -> T: ... @overload - def to(self: T, dtype: Union[dtype, str], non_blocking: bool = ...) -> T: - ... + def to(self: T, dtype: Union[dtype, str], non_blocking: bool = ...) -> T: ... @overload - def to(self: T, tensor: Tensor, non_blocking: bool = ...) -> T: - ... + def to(self: T, tensor: Tensor, non_blocking: bool = ...) -> T: ... def to(self, *args, **kwargs): - device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to( - *args, **kwargs - ) + device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs) - if ( - device is not None - and device.type == "cuda" - and self.data.device.type == "cpu" - ): + if device is not None and device.type == "cuda" and self.data.device.type == "cpu": return self.cuda(device) else: new_param = Int8Params( - super().to( - device=device, dtype=dtype, non_blocking=non_blocking - ), + super().to(device=device, dtype=dtype, non_blocking=non_blocking), requires_grad=self.requires_grad, has_fp16_weights=self.has_fp16_weights, ) @@ -593,8 +655,18 @@ class Linear8bitLt(nn.Linear): int8_model = int8_model.to(0) # Quantization happens here ``` """ - def __init__(self, input_features, output_features, bias=True, has_fp16_weights=True, - memory_efficient_backward=False, threshold=0.0, index=None, device=None): + + def __init__( + self, + input_features, + output_features, + bias=True, + has_fp16_weights=True, + memory_efficient_backward=False, + threshold=0.0, + index=None, + device=None, + ): """ Initialize Linear8bitLt class. 
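# A sketch of the int8 inference pattern from the Linear8bitLt docstring above: load fp16
# weights first, then move the module to CUDA so quantization happens on the device move.
# The sizes and the 6.0 outlier threshold are illustrative.
import torch
import bitsandbytes as bnb

fp16_linear = torch.nn.Linear(1024, 4096).half()
int8_linear = bnb.nn.Linear8bitLt(1024, 4096, has_fp16_weights=False, threshold=6.0)
int8_linear.load_state_dict(fp16_linear.state_dict())
int8_linear = int8_linear.cuda()                       # int8 quantization happens here
y = int8_linear(torch.randn(1, 1024, dtype=torch.float16, device="cuda"))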
@@ -647,19 +719,36 @@ def _save_to_state_dict(self, destination, prefix, keep_vars): destination[key_name] = param_from_state if keep_vars else param_from_state.detach() destination[format_name] = self.state.formatB - def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, - missing_keys, unexpected_keys, error_msgs): - super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, - error_msgs) + def _load_from_state_dict( + self, + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ): + super()._load_from_state_dict( + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ) unexpected_copy = list(unexpected_keys) for key in unexpected_copy: - input_name = key[len(prefix):] + input_name = key[len(prefix) :] if input_name == "SCB": if self.weight.SCB is None: # buffers not yet initialized, can't access them directly without quantizing first - raise RuntimeError("Loading a quantized checkpoint into non-quantized Linear8bitLt is " - "not supported. Please call module.cuda() before module.load_state_dict()") + raise RuntimeError( + "Loading a quantized checkpoint into non-quantized Linear8bitLt is " + "not supported. Please call module.cuda() before module.load_state_dict()", + ) input_param = state_dict[key] self.weight.SCB.copy_(input_param) @@ -702,18 +791,18 @@ def __init__(self, input_features, output_features, bias=True, device=None): self.is_quantized = False def forward_with_outliers(self, x, outlier_idx): - raise NotImplementedError('Please override the `forward_with_outliers(self, x, outlier_idx)` function') + raise NotImplementedError("Please override the `forward_with_outliers(self, x, outlier_idx)` function") def quantize_weight(self, w, outlier_idx): - raise NotImplementedError('Please override the `quantize_weights(self, w, outlier_idx)` function') + raise NotImplementedError("Please override the `quantize_weights(self, w, outlier_idx)` function") def forward(self, x): if self.outlier_dim is None: tracer = OutlierTracer.get_instance() if not tracer.is_initialized(): - print('Please use OutlierTracer.initialize(model) before using the OutlierAwareLinear layer') + print("Please use OutlierTracer.initialize(model) before using the OutlierAwareLinear layer") outlier_idx = tracer.get_outliers(self.weight) - #print(outlier_idx, tracer.get_hvalue(self.weight)) + # print(outlier_idx, tracer.get_hvalue(self.weight)) self.outlier_dim = outlier_idx if not self.is_quantized: @@ -721,6 +810,7 @@ def forward(self, x): self.weight.data.copy_(w) self.is_quantized = True + class SwitchBackLinearBnb(nn.Linear): def __init__( self, @@ -731,11 +821,9 @@ def __init__( memory_efficient_backward=False, threshold=0.0, index=None, - device=None + device=None, ): - super().__init__( - input_features, output_features, bias, device - ) + super().__init__(input_features, output_features, bias, device) self.state = bnb.MatmulLtState() self.index = index @@ -745,9 +833,7 @@ def __init__( if threshold > 0.0 and not has_fp16_weights: self.state.use_pool = True - self.weight = Int8Params( - self.weight.data, has_fp16_weights=has_fp16_weights, requires_grad=has_fp16_weights - ) + self.weight = Int8Params(self.weight.data, has_fp16_weights=has_fp16_weights, requires_grad=has_fp16_weights) def init_8bit_state(self): self.state.CB = self.weight.CB diff --git a/bitsandbytes/nn/triton_based_modules.py b/bitsandbytes/nn/triton_based_modules.py index 
9c7738c59..aa8494942 100644 --- a/bitsandbytes/nn/triton_based_modules.py +++ b/bitsandbytes/nn/triton_based_modules.py @@ -22,7 +22,6 @@ class _switchback_global(torch.autograd.Function): - @staticmethod def forward(ctx, X_3D, W, bias): # reshape input to [N * L, D] @@ -37,9 +36,7 @@ def forward(ctx, X_3D, W, bias): # matmult, fused dequant and add bias # call "mixed" because we are mixing rowwise quantized and global quantized - return int8_matmul_mixed_dequantize( - X_int8, W_int8.t(), state_X, state_W, bias - ).view(*X_3D.size()[:-1], -1) + return int8_matmul_mixed_dequantize(X_int8, W_int8.t(), state_X, state_W, bias).view(*X_3D.size()[:-1], -1) @staticmethod def backward(ctx, G_3D): @@ -56,7 +53,8 @@ def backward(ctx, G_3D): G_int8, state_G = quantize_rowwise(G) W_int8, state_W = quantize_global_transpose(W) grad_X = int8_matmul_mixed_dequantize(G_int8, W_int8.t(), state_G, state_W, None).view( - *G_3D.size()[:-1], -1 + *G_3D.size()[:-1], + -1, ) if ctx.needs_input_grad[1]: # backward pass uses standard weight grad @@ -66,8 +64,8 @@ def backward(ctx, G_3D): return grad_X, grad_W, grad_bias -class _switchback_vectorrize(torch.autograd.Function): +class _switchback_vectorrize(torch.autograd.Function): @staticmethod def forward(ctx, X_3D, W, bias): # reshape input to [N * L, D] @@ -81,9 +79,7 @@ def forward(ctx, X_3D, W, bias): # matmult, fused dequant and add bias # call kernel which expects rowwise quantized X and W - return int8_matmul_rowwise_dequantize( - X_int8, W_int8.t(), state_X, state_W, bias - ).view(*X_3D.size()[:-1], -1) + return int8_matmul_rowwise_dequantize(X_int8, W_int8.t(), state_X, state_W, bias).view(*X_3D.size()[:-1], -1) @staticmethod def backward(ctx, G_3D): @@ -99,7 +95,8 @@ def backward(ctx, G_3D): G_int8, state_G = quantize_rowwise(G) W_int8, state_W = quantize_columnwise_and_transpose(W) grad_X = int8_matmul_rowwise_dequantize(G_int8, W_int8.t(), state_G, state_W, None).view( - *G_3D.size()[:-1], -1 + *G_3D.size()[:-1], + -1, ) if ctx.needs_input_grad[1]: # backward pass uses standard weight grad @@ -109,8 +106,8 @@ def backward(ctx, G_3D): return grad_X, grad_W, grad_bias -class _switchback_global_mem_efficient(torch.autograd.Function): +class _switchback_global_mem_efficient(torch.autograd.Function): @staticmethod def forward(ctx, X_3D, W, bias): # reshape input to [N * L, D] @@ -127,9 +124,7 @@ def forward(ctx, X_3D, W, bias): # matmult, fused dequant and add bias # call "mixed" because we are mixing rowwise quantized and global quantized - return int8_matmul_mixed_dequantize( - X_int8, W_int8.t(), state_X, state_W, bias - ).view(*X_3D_sz[:-1], -1) + return int8_matmul_mixed_dequantize(X_int8, W_int8.t(), state_X, state_W, bias).view(*X_3D_sz[:-1], -1) @staticmethod def backward(ctx, G_3D): @@ -151,35 +146,34 @@ def backward(ctx, G_3D): G_int8, state_G = quantize_rowwise(G) del G W_int8 = W_int8.t().contiguous() - grad_X = int8_matmul_mixed_dequantize(G_int8, W_int8.t(), state_G, state_W, None).view( - *G_3D_sz[:-1], -1 - ) + grad_X = int8_matmul_mixed_dequantize(G_int8, W_int8.t(), state_G, state_W, None).view(*G_3D_sz[:-1], -1) return grad_X, grad_W, grad_bias + class SwitchBackLinear(nn.Linear): def __init__( - self, - in_features: int, - out_features: int, - bias: bool = True, - device=None, - dtype=None, - vector_wise_quantization: bool = False, - mem_efficient : bool = False, - ): + self, + in_features: int, + out_features: int, + bias: bool = True, + device=None, + dtype=None, + vector_wise_quantization: bool = False, + mem_efficient: bool = False, 
+ ): super().__init__(in_features, out_features, bias, device, dtype) if not is_triton_available(): - raise ImportError('''Could not import triton. Please install triton to use SwitchBackLinear. - Alternatively, you can use bnb.nn.SwitchBackLinearBnb, but it will be slower''') + raise ImportError("""Could not import triton. Please install triton to use SwitchBackLinear. + Alternatively, you can use bnb.nn.SwitchBackLinearBnb, but it will be slower""") # By default, we use the global quantization. self.vector_wise_quantization = vector_wise_quantization if self.vector_wise_quantization: self._fn = _switchback_vectorrize if mem_efficient: - print('mem efficient is not supported for vector-wise quantization.') + print("mem efficient is not supported for vector-wise quantization.") exit(1) else: if mem_efficient: @@ -195,7 +189,7 @@ def prepare_for_eval(self): # if hasattr(m, "prepare_for_eval"): # m.prepare_for_eval() # model.apply(cond_prepare) - print('=> preparing for eval.') + print("=> preparing for eval.") if self.vector_wise_quantization: W_int8, state_W = quantize_rowwise(self.weight) else: @@ -219,18 +213,22 @@ def forward(self, x): X_int8, state_X = quantize_rowwise(X) if self.vector_wise_quantization: - return int8_matmul_rowwise_dequantize( - X_int8, self.W_int8.t(), state_X, self.state_W, self.bias - ).view(*x.size()[:-1], -1) + return int8_matmul_rowwise_dequantize(X_int8, self.W_int8.t(), state_X, self.state_W, self.bias).view( + *x.size()[:-1], + -1, + ) else: - return int8_matmul_mixed_dequantize( - X_int8, self.W_int8.t(), state_X, self.state_W, self.bias - ).view(*x.size()[:-1], -1) + return int8_matmul_mixed_dequantize(X_int8, self.W_int8.t(), state_X, self.state_W, self.bias).view( + *x.size()[:-1], + -1, + ) + SwitchBackLinearGlobal = partial(SwitchBackLinear, vector_wise_quantization=False) SwitchBackLinearGlobalMemEfficient = partial(SwitchBackLinear, vector_wise_quantization=False, mem_efficient=True) SwitchBackLinearVectorwise = partial(SwitchBackLinear, vector_wise_quantization=True) + # This is just the standard linear function. 
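# A hedged usage sketch for SwitchBackLinear as defined above. It assumes a CUDA device and a
# working triton installation; vector_wise_quantization=False selects the "global" int8 path.
# Layer sizes and input shape are illustrative.
import torch
import bitsandbytes as bnb

sb = bnb.nn.SwitchBackLinear(1024, 3072, vector_wise_quantization=False).cuda().half()
x = torch.randn(8, 16, 1024, dtype=torch.float16, device="cuda")
y = sb(x)                    # rowwise-quantized input, int8 matmul, fused dequantize (+ bias)
sb.eval()
sb.prepare_for_eval()        # optionally pre-quantize the weight once for inference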
class StandardLinearFunction(torch.autograd.Function): @staticmethod @@ -260,7 +258,7 @@ def backward(ctx, grad_output_3D): return grad_input, grad_weight, grad_bias -class StandardLinear(nn.Linear): +class StandardLinear(nn.Linear): def forward(self, x): return StandardLinearFunction.apply(x, self.weight, self.bias) diff --git a/bitsandbytes/optim/adagrad.py b/bitsandbytes/optim/adagrad.py index c2ea87ab0..aace548fa 100644 --- a/bitsandbytes/optim/adagrad.py +++ b/bitsandbytes/optim/adagrad.py @@ -50,9 +50,7 @@ def __init__( if not 0.0 <= lr: raise ValueError(f"Invalid learning rate: {lr}") if not 0.0 <= weight_decay: - raise ValueError( - f"Invalid weight_decay value: {weight_decay}" - ) + raise ValueError(f"Invalid weight_decay value: {weight_decay}") if not 0.0 <= eps: raise ValueError(f"Invalid epsilon value: {eps}") if initial_accumulator_value != 0.0: @@ -119,9 +117,7 @@ def __init__( if not 0.0 <= lr: raise ValueError(f"Invalid learning rate: {lr}") if not 0.0 <= weight_decay: - raise ValueError( - f"Invalid weight_decay value: {weight_decay}" - ) + raise ValueError(f"Invalid weight_decay value: {weight_decay}") if not 0.0 <= eps: raise ValueError(f"Invalid epsilon value: {eps}") if initial_accumulator_value != 0.0: @@ -189,9 +185,7 @@ def __init__( if not 0.0 <= lr: raise ValueError(f"Invalid learning rate: {lr}") if not 0.0 <= weight_decay: - raise ValueError( - f"Invalid weight_decay value: {weight_decay}" - ) + raise ValueError(f"Invalid weight_decay value: {weight_decay}") if not 0.0 <= eps: raise ValueError(f"Invalid epsilon value: {eps}") if initial_accumulator_value != 0.0: diff --git a/bitsandbytes/optim/adam.py b/bitsandbytes/optim/adam.py index e534c8b8f..d8ffca63e 100644 --- a/bitsandbytes/optim/adam.py +++ b/bitsandbytes/optim/adam.py @@ -14,8 +14,21 @@ class Adam(Optimizer2State): - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False, optim_bits=32, - args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + def __init__( + self, + params, + lr=1e-3, + betas=(0.9, 0.999), + eps=1e-8, + weight_decay=0, + amsgrad=False, + optim_bits=32, + args=None, + min_8bit_size=4096, + percentile_clipping=100, + block_wise=True, + is_paged=False, + ): """ Base Adam optimizer. @@ -45,11 +58,38 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0 is_paged (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not. """ - super().__init__( "adam", params, lr, betas, eps, weight_decay, optim_bits, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged) + super().__init__( + "adam", + params, + lr, + betas, + eps, + weight_decay, + optim_bits, + args, + min_8bit_size, + percentile_clipping, + block_wise, + is_paged=is_paged, + ) + class Adam8bit(Optimizer2State): - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False, optim_bits=32, - args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + def __init__( + self, + params, + lr=1e-3, + betas=(0.9, 0.999), + eps=1e-8, + weight_decay=0, + amsgrad=False, + optim_bits=32, + args=None, + min_8bit_size=4096, + percentile_clipping=100, + block_wise=True, + is_paged=False, + ): """ 8-bit Adam optimizer. @@ -79,11 +119,38 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0 is_paged (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not. 
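# A drop-in training sketch for the 8-bit Adam documented above; the model and data are
# placeholders. Parameters smaller than min_8bit_size keep 32-bit state automatically.
import torch
import bitsandbytes as bnb

model = torch.nn.Linear(1024, 1024).cuda()
opt = bnb.optim.Adam8bit(model.parameters(), lr=1e-3, betas=(0.9, 0.999))
loss = model(torch.randn(16, 1024, device="cuda")).pow(2).mean()
loss.backward()
opt.step()
opt.zero_grad()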
""" - super().__init__( "adam", params, lr, betas, eps, weight_decay, 8, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged) + super().__init__( + "adam", + params, + lr, + betas, + eps, + weight_decay, + 8, + args, + min_8bit_size, + percentile_clipping, + block_wise, + is_paged=is_paged, + ) + class Adam32bit(Optimizer2State): - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False, optim_bits=32, - args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + def __init__( + self, + params, + lr=1e-3, + betas=(0.9, 0.999), + eps=1e-8, + weight_decay=0, + amsgrad=False, + optim_bits=32, + args=None, + min_8bit_size=4096, + percentile_clipping=100, + block_wise=True, + is_paged=False, + ): """ 32-bit Adam optimizer. @@ -113,11 +180,38 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0 is_paged (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not. """ - super().__init__( "adam", params, lr, betas, eps, weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged) + super().__init__( + "adam", + params, + lr, + betas, + eps, + weight_decay, + 32, + args, + min_8bit_size, + percentile_clipping, + block_wise, + is_paged=is_paged, + ) + class PagedAdam(Optimizer2State): - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False, optim_bits=32, - args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + def __init__( + self, + params, + lr=1e-3, + betas=(0.9, 0.999), + eps=1e-8, + weight_decay=0, + amsgrad=False, + optim_bits=32, + args=None, + min_8bit_size=4096, + percentile_clipping=100, + block_wise=True, + is_paged=False, + ): """ Paged Adam optimizer. @@ -147,11 +241,38 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0 is_paged (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not. """ - super().__init__( "adam", params, lr, betas, eps, weight_decay, optim_bits, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) + super().__init__( + "adam", + params, + lr, + betas, + eps, + weight_decay, + optim_bits, + args, + min_8bit_size, + percentile_clipping, + block_wise, + is_paged=True, + ) + class PagedAdam8bit(Optimizer2State): - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False, optim_bits=32, - args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + def __init__( + self, + params, + lr=1e-3, + betas=(0.9, 0.999), + eps=1e-8, + weight_decay=0, + amsgrad=False, + optim_bits=32, + args=None, + min_8bit_size=4096, + percentile_clipping=100, + block_wise=True, + is_paged=False, + ): """ 8-bit paged Adam optimizer. @@ -181,11 +302,38 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0 is_paged (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not. 
""" - super().__init__( "adam", params, lr, betas, eps, weight_decay, 8, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) + super().__init__( + "adam", + params, + lr, + betas, + eps, + weight_decay, + 8, + args, + min_8bit_size, + percentile_clipping, + block_wise, + is_paged=True, + ) + class PagedAdam32bit(Optimizer2State): - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False, optim_bits=32, - args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + def __init__( + self, + params, + lr=1e-3, + betas=(0.9, 0.999), + eps=1e-8, + weight_decay=0, + amsgrad=False, + optim_bits=32, + args=None, + min_8bit_size=4096, + percentile_clipping=100, + block_wise=True, + is_paged=False, + ): """ Paged 32-bit Adam optimizer. @@ -215,7 +363,21 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0 is_paged (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not. """ - super().__init__( "adam", params, lr, betas, eps, weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) + super().__init__( + "adam", + params, + lr, + betas, + eps, + weight_decay, + 32, + args, + min_8bit_size, + percentile_clipping, + block_wise, + is_paged=True, + ) + class AnalysisAdam(torch.optim.Optimizer): """Adam that performs 8-bit vs 32-bit error analysis. @@ -293,9 +455,7 @@ def step(self, closure=None): if grad.dtype in {torch.float16, torch.bfloat16}: grad = grad.float() if grad.is_sparse: - raise RuntimeError( - "Adam does not support sparse gradients, please consider SparseAdam instead" - ) + raise RuntimeError("Adam does not support sparse gradients, please consider SparseAdam instead") amsgrad = group.get("amsgrad", False) assert not amsgrad @@ -312,15 +472,9 @@ def step(self, closure=None): state["exp_avg"] = torch.zeros_like(p_data_fp32) # Exponential moving average of squared gradient values state["exp_avg_sq"] = torch.zeros_like(p_data_fp32) - state["abserrors"] = torch.zeros( - (256, 256), device=p_data_fp32.device - ) - state["relerrors"] = torch.zeros( - (256, 256), device=p_data_fp32.device - ) - state["counts"] = torch.zeros( - (256, 256), device=p_data_fp32.device - ) + state["abserrors"] = torch.zeros((256, 256), device=p_data_fp32.device) + state["relerrors"] = torch.zeros((256, 256), device=p_data_fp32.device) + state["counts"] = torch.zeros((256, 256), device=p_data_fp32.device) if amsgrad: # Maintains max of all exp. moving avg. of sq. grad. 
values state["max_exp_avg_sq"] = torch.zeros_like(p_data_fp32) @@ -328,25 +482,19 @@ def step(self, closure=None): state["exp_avg"] = state["exp_avg"].to(p_data_fp32) state["exp_avg_sq"] = state["exp_avg_sq"].to(p_data_fp32) if amsgrad: - state["max_exp_avg_sq"] = state["max_exp_avg_sq"].to( - p_data_fp32 - ) + state["max_exp_avg_sq"] = state["max_exp_avg_sq"].to(p_data_fp32) state["step"] += 1 beta1, beta2 = group["betas"] bias_correction1 = 1 - beta1 ** state["step"] bias_correction2 = 1 - beta2 ** state["step"] - step_size = ( - group["lr"] * math.sqrt(bias_correction2) / bias_correction1 - ) + step_size = group["lr"] * math.sqrt(bias_correction2) / bias_correction1 e = state["abserrors"] rele = state["relerrors"] counts = state["counts"] if group["weight_decay"] != 0: - p_data_fp32.add_( - p_data_fp32, alpha=-group["weight_decay"] * group["lr"] - ) + p_data_fp32.add_(p_data_fp32, alpha=-group["weight_decay"] * group["lr"]) exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"] if amsgrad: @@ -359,10 +507,7 @@ def step(self, closure=None): denom = exp_avg_sq.sqrt().add_(group["eps"]) update_fp32 = exp_avg / denom - if ( - p_data_fp32.numel() <= 8192 - or p_data_fp32.numel() > 50000 * 1000 - ): + if p_data_fp32.numel() <= 8192 or p_data_fp32.numel() > 50000 * 1000: # embedding layer or too small p_data_fp32 += -step_size * update_fp32 else: @@ -401,9 +546,7 @@ def step(self, closure=None): # 3. dequantize # Error will be calculated automatically! else: - raise ValueError( - f"Invalid analysis value: {self.analysis}!" - ) + raise ValueError(f"Invalid analysis value: {self.analysis}!") denom = state2.sqrt().add_(group["eps"]) update_8bit = state1 / denom @@ -415,9 +558,7 @@ def step(self, closure=None): F.histogram_scatter_add_2d(e, C1.int(), C2.int(), abserr) F.histogram_scatter_add_2d(rele, C1.int(), C2.int(), relerr) - F.histogram_scatter_add_2d( - counts, C1.int(), C2.int(), torch.ones_like(abserr) - ) + F.histogram_scatter_add_2d(counts, C1.int(), C2.int(), torch.ones_like(abserr)) p_data_fp32 += -step_size * update_fp32 @@ -425,18 +566,10 @@ def step(self, closure=None): if self.savedir != "" and state["step"] % 100 == 0: if not os.path.exists(self.savedir): os.makedirs(self.savedir) - shapestr = "_".join( - [str(dim) for dim in p_data_fp32.shape] - ) - pathe = os.path.join( - self.savedir, f"{p_id}_{shapestr}_abserr.pkl" - ) - pathrele = os.path.join( - self.savedir, f"{p_id}_{shapestr}_relerr.pkl" - ) - pathcounts = os.path.join( - self.savedir, f"{p_id}_{shapestr}_counts.pkl" - ) + shapestr = "_".join([str(dim) for dim in p_data_fp32.shape]) + pathe = os.path.join(self.savedir, f"{p_id}_{shapestr}_abserr.pkl") + pathrele = os.path.join(self.savedir, f"{p_id}_{shapestr}_relerr.pkl") + pathcounts = os.path.join(self.savedir, f"{p_id}_{shapestr}_counts.pkl") torch.save(e, pathe) torch.save(rele, pathrele) torch.save(counts, pathcounts) diff --git a/bitsandbytes/optim/adamw.py b/bitsandbytes/optim/adamw.py index 1e2dc04de..fa51458fd 100644 --- a/bitsandbytes/optim/adamw.py +++ b/bitsandbytes/optim/adamw.py @@ -6,8 +6,21 @@ class AdamW(Optimizer2State): - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False, optim_bits=32, - args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + def __init__( + self, + params, + lr=1e-3, + betas=(0.9, 0.999), + eps=1e-8, + weight_decay=1e-2, + amsgrad=False, + optim_bits=32, + args=None, + min_8bit_size=4096, + percentile_clipping=100, + block_wise=True, + 
is_paged=False, + ): """ Base AdamW optimizer. @@ -37,11 +50,38 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1 is_paged (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not. """ - super().__init__( "adam", params, lr, betas, eps, weight_decay, optim_bits, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged ) + super().__init__( + "adam", + params, + lr, + betas, + eps, + weight_decay, + optim_bits, + args, + min_8bit_size, + percentile_clipping, + block_wise, + is_paged=is_paged, + ) + class AdamW8bit(Optimizer2State): - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False, optim_bits=32, - args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + def __init__( + self, + params, + lr=1e-3, + betas=(0.9, 0.999), + eps=1e-8, + weight_decay=1e-2, + amsgrad=False, + optim_bits=32, + args=None, + min_8bit_size=4096, + percentile_clipping=100, + block_wise=True, + is_paged=False, + ): """ 8-bit AdamW optimizer. @@ -71,11 +111,38 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1 is_paged (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not. """ - super().__init__( "adam", params, lr, betas, eps, weight_decay, 8, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged ) + super().__init__( + "adam", + params, + lr, + betas, + eps, + weight_decay, + 8, + args, + min_8bit_size, + percentile_clipping, + block_wise, + is_paged=is_paged, + ) + class AdamW32bit(Optimizer2State): - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False, optim_bits=32, - args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + def __init__( + self, + params, + lr=1e-3, + betas=(0.9, 0.999), + eps=1e-8, + weight_decay=1e-2, + amsgrad=False, + optim_bits=32, + args=None, + min_8bit_size=4096, + percentile_clipping=100, + block_wise=True, + is_paged=False, + ): """ 32-bit AdamW optimizer. @@ -105,12 +172,37 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1 is_paged (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not. """ - super().__init__( "adam", params, lr, betas, eps, weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged) + super().__init__( + "adam", + params, + lr, + betas, + eps, + weight_decay, + 32, + args, + min_8bit_size, + percentile_clipping, + block_wise, + is_paged=is_paged, + ) class PagedAdamW(Optimizer2State): - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False, optim_bits=32, - args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True): + def __init__( + self, + params, + lr=1e-3, + betas=(0.9, 0.999), + eps=1e-8, + weight_decay=1e-2, + amsgrad=False, + optim_bits=32, + args=None, + min_8bit_size=4096, + percentile_clipping=100, + block_wise=True, + ): """ Paged AdamW optimizer. @@ -140,11 +232,37 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1 is_paged (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not. 
""" - super().__init__( "adam", params, lr, betas, eps, weight_decay, optim_bits, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) + super().__init__( + "adam", + params, + lr, + betas, + eps, + weight_decay, + optim_bits, + args, + min_8bit_size, + percentile_clipping, + block_wise, + is_paged=True, + ) + class PagedAdamW8bit(Optimizer2State): - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False, optim_bits=32, - args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True): + def __init__( + self, + params, + lr=1e-3, + betas=(0.9, 0.999), + eps=1e-8, + weight_decay=1e-2, + amsgrad=False, + optim_bits=32, + args=None, + min_8bit_size=4096, + percentile_clipping=100, + block_wise=True, + ): """ Paged 8-bit AdamW optimizer. @@ -174,11 +292,37 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1 is_paged (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not. """ - super().__init__( "adam", params, lr, betas, eps, weight_decay, 8, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) + super().__init__( + "adam", + params, + lr, + betas, + eps, + weight_decay, + 8, + args, + min_8bit_size, + percentile_clipping, + block_wise, + is_paged=True, + ) + class PagedAdamW32bit(Optimizer2State): - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False, optim_bits=32, - args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True): + def __init__( + self, + params, + lr=1e-3, + betas=(0.9, 0.999), + eps=1e-8, + weight_decay=1e-2, + amsgrad=False, + optim_bits=32, + args=None, + min_8bit_size=4096, + percentile_clipping=100, + block_wise=True, + ): """ Paged 32-bit AdamW optimizer. @@ -208,4 +352,17 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1 is_paged (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not. """ - super().__init__( "adam", params, lr, betas, eps, weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) + super().__init__( + "adam", + params, + lr, + betas, + eps, + weight_decay, + 32, + args, + min_8bit_size, + percentile_clipping, + block_wise, + is_paged=True, + ) diff --git a/bitsandbytes/optim/lars.py b/bitsandbytes/optim/lars.py index 7449b805b..63c062988 100644 --- a/bitsandbytes/optim/lars.py +++ b/bitsandbytes/optim/lars.py @@ -51,9 +51,7 @@ def __init__( The maximum gradient norm. """ if momentum == 0: - raise NotImplementedError( - "LARS without momentum is not supported!" - ) + raise NotImplementedError("LARS without momentum is not supported!") super().__init__( "lars", params, @@ -110,9 +108,7 @@ def __init__( The maximum gradient norm. """ if momentum == 0: - raise NotImplementedError( - "LARS without momentum is not supported!" - ) + raise NotImplementedError("LARS without momentum is not supported!") super().__init__( "lars", params, @@ -169,9 +165,7 @@ def __init__( The maximum gradient norm. """ if momentum == 0: - raise NotImplementedError( - "LARS without momentum is not supported!" 
- ) + raise NotImplementedError("LARS without momentum is not supported!") super().__init__( "lars", params, @@ -204,9 +198,7 @@ def __init__( if momentum < 0.0: raise ValueError(f"Invalid momentum value: {momentum}") if weight_decay < 0.0: - raise ValueError( - f"Invalid weight_decay value: {weight_decay}" - ) + raise ValueError(f"Invalid weight_decay value: {weight_decay}") defaults = dict( lr=lr, @@ -217,9 +209,7 @@ def __init__( max_unorm=max_unorm, ) if nesterov and (momentum <= 0 or dampening != 0): - raise ValueError( - "Nesterov momentum requires a momentum and zero dampening" - ) + raise ValueError("Nesterov momentum requires a momentum and zero dampening") super().__init__(params, defaults) def __setstate__(self, state): diff --git a/bitsandbytes/optim/lion.py b/bitsandbytes/optim/lion.py index ce185f863..9f0f4a8a9 100644 --- a/bitsandbytes/optim/lion.py +++ b/bitsandbytes/optim/lion.py @@ -6,7 +6,19 @@ class Lion(Optimizer1State): - def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), weight_decay=0, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + def __init__( + self, + params, + lr=1e-4, + betas=(0.9, 0.99), + weight_decay=0, + optim_bits=32, + args=None, + min_8bit_size=4096, + percentile_clipping=100, + block_wise=True, + is_paged=False, + ): """ Base Lion optimizer. @@ -32,10 +44,35 @@ def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), weight_decay=0, optim_bit is_paged (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not. """ - super().__init__("lion", params, lr, betas, 0., weight_decay, optim_bits, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged) + super().__init__( + "lion", + params, + lr, + betas, + 0.0, + weight_decay, + optim_bits, + args, + min_8bit_size, + percentile_clipping, + block_wise, + is_paged=is_paged, + ) + class Lion8bit(Optimizer1State): - def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), weight_decay=0, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + def __init__( + self, + params, + lr=1e-4, + betas=(0.9, 0.99), + weight_decay=0, + args=None, + min_8bit_size=4096, + percentile_clipping=100, + block_wise=True, + is_paged=False, + ): """ 8-bit Lion optimizer. @@ -59,10 +96,35 @@ def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), weight_decay=0, args=None is_paged (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not. """ - super().__init__("lion", params, lr, betas, 0., weight_decay, 8, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged) + super().__init__( + "lion", + params, + lr, + betas, + 0.0, + weight_decay, + 8, + args, + min_8bit_size, + percentile_clipping, + block_wise, + is_paged=is_paged, + ) + class Lion32bit(Optimizer1State): - def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), weight_decay=0, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + def __init__( + self, + params, + lr=1e-4, + betas=(0.9, 0.99), + weight_decay=0, + args=None, + min_8bit_size=4096, + percentile_clipping=100, + block_wise=True, + is_paged=False, + ): """ 32-bit Lion optimizer. @@ -86,11 +148,35 @@ def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), weight_decay=0, args=None is_paged (`bool`, defaults to `False`): Whether the optimizer is a paged optimizer or not. 
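The Lion variants above share the same constructor shape; because Lion tracks a single momentum buffer (Optimizer1State), its 8-bit state is roughly half the size of 8-bit Adam's two buffers. A short usage sketch, with the model and CUDA device as illustrative assumptions:

import torch
import bitsandbytes as bnb

model = torch.nn.Linear(1024, 1024).cuda()
opt = bnb.optim.Lion8bit(model.parameters(), lr=1e-4, betas=(0.9, 0.99), weight_decay=1e-2)

model(torch.randn(4, 1024, device="cuda")).mean().backward()
opt.step()
opt.zero_grad()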
""" - super().__init__("lion", params, lr, betas, 0., weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged) + super().__init__( + "lion", + params, + lr, + betas, + 0.0, + weight_decay, + 32, + args, + min_8bit_size, + percentile_clipping, + block_wise, + is_paged=is_paged, + ) class PagedLion(Optimizer1State): - def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), weight_decay=0, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True): + def __init__( + self, + params, + lr=1e-4, + betas=(0.9, 0.99), + weight_decay=0, + optim_bits=32, + args=None, + min_8bit_size=4096, + percentile_clipping=100, + block_wise=True, + ): """ Paged Lion optimizer. @@ -114,10 +200,34 @@ def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), weight_decay=0, optim_bit block_wise (`bool`, defaults to `True`): Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. """ - super().__init__("lion", params, lr, betas, 0., weight_decay, optim_bits, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) + super().__init__( + "lion", + params, + lr, + betas, + 0.0, + weight_decay, + optim_bits, + args, + min_8bit_size, + percentile_clipping, + block_wise, + is_paged=True, + ) + class PagedLion8bit(Optimizer1State): - def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), weight_decay=0, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True): + def __init__( + self, + params, + lr=1e-4, + betas=(0.9, 0.99), + weight_decay=0, + args=None, + min_8bit_size=4096, + percentile_clipping=100, + block_wise=True, + ): """ Paged 8-bit Lion optimizer. @@ -141,10 +251,34 @@ def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), weight_decay=0, args=None block_wise (`bool`, defaults to `True`): Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. """ - super().__init__("lion", params, lr, betas, 0., weight_decay, 8, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) + super().__init__( + "lion", + params, + lr, + betas, + 0.0, + weight_decay, + 8, + args, + min_8bit_size, + percentile_clipping, + block_wise, + is_paged=True, + ) + class PagedLion32bit(Optimizer1State): - def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), weight_decay=0, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True): + def __init__( + self, + params, + lr=1e-4, + betas=(0.9, 0.99), + weight_decay=0, + args=None, + min_8bit_size=4096, + percentile_clipping=100, + block_wise=True, + ): """ Paged 32-bit Lion optimizer. @@ -168,4 +302,17 @@ def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), weight_decay=0, args=None block_wise (`bool`, defaults to `True`): Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. """ - super().__init__("lion", params, lr, betas, 0., weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) + super().__init__( + "lion", + params, + lr, + betas, + 0.0, + weight_decay, + 32, + args, + min_8bit_size, + percentile_clipping, + block_wise, + is_paged=True, + ) diff --git a/bitsandbytes/optim/optimizer.py b/bitsandbytes/optim/optimizer.py index a97afb026..43ebbb24d 100644 --- a/bitsandbytes/optim/optimizer.py +++ b/bitsandbytes/optim/optimizer.py @@ -21,6 +21,7 @@ class GlobalOptimManager: """ A global optimizer manager for enabling custom optimizer configs. 
""" + _instance = None def __init__(self): @@ -48,13 +49,9 @@ def register_parameters(self, params): for group_index, group in enumerate(param_groups): for p_index, p in enumerate(group["params"]): if id(p) in self.pid2config: - self.index2config[(group_index, p_index)] = self.pid2config[ - id(p) - ] + self.index2config[(group_index, p_index)] = self.pid2config[id(p)] - def override_config( - self, parameters, key=None, value=None, key_value_dict=None - ): + def override_config(self, parameters, key=None, value=None, key_value_dict=None): """ Override initial optimizer config with specific hyperparameters. @@ -132,18 +129,18 @@ def __init__(self, params, defaults, optim_bits=32, is_paged=False): self.mng = GlobalOptimManager.get_instance() self.non_castable_tensor_keys = { - "qmap1", - "qmap2", - "max1", - "max2", - "new_max1", - "new_max2", - "state1", - "state2", - "gnorm_vec", - "absmax1", - "absmax2", - "unorm_vec", + "qmap1", + "qmap2", + "max1", + "max2", + "new_max1", + "new_max2", + "state1", + "state2", + "gnorm_vec", + "absmax1", + "absmax2", + "unorm_vec", } if optim_bits == 8: @@ -170,16 +167,12 @@ def load_state_dict(self, state_dict): saved_groups = state_dict["param_groups"] if len(groups) != len(saved_groups): - raise ValueError( - "loaded state dict has a different number of " - "parameter groups" - ) + raise ValueError("loaded state dict has a different number of parameter groups") param_lens = (len(g["params"]) for g in groups) saved_lens = (len(g["params"]) for g in saved_groups) if any(p_len != s_len for p_len, s_len in zip(param_lens, saved_lens)): raise ValueError( - "loaded state dict contains a parameter group " - "that doesn't match the size of optimizer's group" + "loaded state dict contains a parameter group that doesn't match the size of optimizer's group", ) # Update the state @@ -228,9 +221,7 @@ def update_group(group, new_group): new_group["params"] = group["params"] return new_group - param_groups = [ - update_group(g, ng) for g, ng in zip(groups, saved_groups) - ] + param_groups = [update_group(g, ng) for g, ng in zip(groups, saved_groups)] self.__setstate__({"state": state, "param_groups": param_groups}) def to_gpu(self): @@ -240,7 +231,7 @@ def to_gpu(self): values = self.state[p] for k, v in values.items(): if isinstance(v, torch.Tensor): - is_paged = getattr(v, 'is_paged', False) + is_paged = getattr(v, "is_paged", False) if not is_paged: self.state[p][k] = v.to(p.device) @@ -248,9 +239,7 @@ def check_overrides(self): for module, attr, config in self.mng.module_weight_config_triple: pmodule = getattr(module, attr) assert pmodule is not None - assert isinstance(pmodule, torch.Tensor) or isinstance( - pmodule, torch.Parameter - ) + assert isinstance(pmodule, torch.Tensor) or isinstance(pmodule, torch.Parameter) found = False for gindex, group in enumerate(self.param_groups): if found: @@ -262,9 +251,7 @@ def check_overrides(self): # found the matching parameter # init override self.mng.pid2config[id(p)] = config - self.mng.index2config[ - (gindex, pindex) - ] = self.mng.pid2config[id(p)] + self.mng.index2config[(gindex, pindex)] = self.mng.pid2config[id(p)] found = True @torch.no_grad() @@ -287,7 +274,7 @@ def step(self, closure=None): self.to_gpu() # needed for fairseq pure fp16 training self.initialized = True - #if self.is_paged: self.page_mng.prefetch_all() + # if self.is_paged: self.page_mng.prefetch_all() for gindex, group in enumerate(self.param_groups): for pindex, p in enumerate(group["params"]): if p.grad is None: @@ -304,7 +291,6 @@ def 
step(self, closure=None): # to sync to make sure all tensors are in the right state torch.cuda.synchronize() - return loss def get_config(self, gindex, pindex, group): @@ -328,9 +314,7 @@ def init_state(self, group, p, gindex, pindex): raise NotImplementedError("init_state method needs to be overridden") def update_step(self, group, p, gindex, pindex): - raise NotImplementedError( - "The update_step method needs to be overridden" - ) + raise NotImplementedError("The update_step method needs to be overridden") def get_state_buffer(self, p, dtype=torch.float32): if not self.is_paged or p.numel() < 1e5: @@ -345,12 +329,12 @@ def get_state_buffer(self, p, dtype=torch.float32): def prefetch_state(self, p): if self.is_paged: state = self.state[p] - s1 = state['state1'] - is_paged = getattr(s1, 'is_paged', False) + s1 = state["state1"] + is_paged = getattr(s1, "is_paged", False) if is_paged: - F.prefetch_tensor(state['state1']) - if 'state2' in state: - F.prefetch_tensor(state['state2']) + F.prefetch_tensor(state["state1"]) + if "state2" in state: + F.prefetch_tensor(state["state2"]) class Optimizer2State(Optimizer8bit): @@ -369,7 +353,7 @@ def __init__( block_wise=True, max_unorm=0.0, skip_zeros=False, - is_paged=False + is_paged=False, ): """ Base 2-state update optimizer class. @@ -414,13 +398,9 @@ def __init__( betas = [float(b) for b in betas] for i in range(len(betas)): if not 0.0 <= betas[i] < 1.0: - raise ValueError( - f"Invalid beta parameter at index {i}: {betas[i]}" - ) + raise ValueError(f"Invalid beta parameter at index {i}: {betas[i]}") if not 0.0 <= weight_decay: - raise ValueError( - f"Invalid weight_decay value: {weight_decay}" - ) + raise ValueError(f"Invalid weight_decay value: {weight_decay}") defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) super().__init__(params, defaults, optim_bits, is_paged) @@ -449,9 +429,7 @@ def init_state(self, group, p, gindex, pindex): elif config["optim_bits"] == 8: dtype = torch.uint8 else: - raise NotImplementedError( - f'Amount of optimizer bits not supported: {config["optim_bits"]}' - ) + raise NotImplementedError(f'Amount of optimizer bits not supported: {config["optim_bits"]}') if p.numel() < config["min_8bit_size"]: dtype = torch.float32 @@ -459,21 +437,15 @@ def init_state(self, group, p, gindex, pindex): state = self.state[p] state["step"] = 0 - if dtype == torch.float32 or ( - dtype == torch.uint8 and p.numel() < 4096 - ): + if dtype == torch.float32 or (dtype == torch.uint8 and p.numel() < 4096): state["state1"] = self.get_state_buffer(p, dtype=torch.float32) state["state2"] = self.get_state_buffer(p, dtype=torch.float32) elif dtype == torch.uint8: if state["step"] == 0: if "dynamic" not in self.name2qmap: self.fill_qmap() - self.name2qmap["dynamic"] = self.name2qmap["dynamic"].to( - p.device - ) - self.name2qmap["udynamic"] = self.name2qmap["udynamic"].to( - p.device - ) + self.name2qmap["dynamic"] = self.name2qmap["dynamic"].to(p.device) + self.name2qmap["udynamic"] = self.name2qmap["udynamic"].to(p.device) state["state1"] = self.get_state_buffer(p, dtype=torch.uint8) state["qmap1"] = self.name2qmap["dynamic"] @@ -486,25 +458,13 @@ def init_state(self, group, p, gindex, pindex): blocks = n // 2048 blocks += 1 if n % 2048 > 0 else 0 - state["absmax1"] = torch.zeros( - (blocks,), dtype=torch.float32, device=p.device - ) - state["absmax2"] = torch.zeros( - (blocks,), dtype=torch.float32, device=p.device - ) + state["absmax1"] = torch.zeros((blocks,), dtype=torch.float32, device=p.device) + state["absmax2"] = 
torch.zeros((blocks,), dtype=torch.float32, device=p.device) else: - state["max1"] = torch.zeros( - (1,), dtype=torch.float32, device=p.device - ) - state["new_max1"] = torch.zeros( - (1,), dtype=torch.float32, device=p.device - ) - state["max2"] = torch.zeros( - (1,), dtype=torch.float32, device=p.device - ) - state["new_max2"] = torch.zeros( - (1,), dtype=torch.float32, device=p.device - ) + state["max1"] = torch.zeros((1,), dtype=torch.float32, device=p.device) + state["new_max1"] = torch.zeros((1,), dtype=torch.float32, device=p.device) + state["max2"] = torch.zeros((1,), dtype=torch.float32, device=p.device) + state["new_max2"] = torch.zeros((1,), dtype=torch.float32, device=p.device) if config["percentile_clipping"] < 100: state["gnorm_vec"] = torch.zeros((100,), device=p.device) @@ -524,7 +484,10 @@ def update_step(self, group, p, gindex, pindex): if config["percentile_clipping"] < 100: current_gnorm, clip_value, gnorm_scale = F.percentile_clipping( - grad, state["gnorm_vec"], step, config["percentile_clipping"] + grad, + state["gnorm_vec"], + step, + config["percentile_clipping"], ) else: gnorm_scale = 1.0 @@ -568,9 +531,7 @@ def update_step(self, group, p, gindex, pindex): state["new_max2"], config["weight_decay"], gnorm_scale=gnorm_scale, - unorm_vec=state["unorm_vec"] - if config["max_unorm"] > 0.0 - else None, + unorm_vec=state["unorm_vec"] if config["max_unorm"] > 0.0 else None, max_unorm=config["max_unorm"], ) @@ -615,7 +576,7 @@ def __init__( block_wise=True, max_unorm=0.0, skip_zeros=False, - is_paged=False + is_paged=False, ): """ Base 1-state update optimizer class. @@ -656,13 +617,9 @@ def __init__( raise ValueError(f"Invalid epsilon value: {eps}") for i in range(len(betas)): if not 0.0 <= betas[i] < 1.0: - raise ValueError( - f"Invalid beta parameter at index {i}: {betas[i]}" - ) + raise ValueError(f"Invalid beta parameter at index {i}: {betas[i]}") if not 0.0 <= weight_decay: - raise ValueError( - f"Invalid weight_decay value: {weight_decay}" - ) + raise ValueError(f"Invalid weight_decay value: {weight_decay}") defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) super().__init__(params, defaults, optim_bits, is_paged) @@ -691,9 +648,7 @@ def init_state(self, group, p, gindex, pindex): elif config["optim_bits"] == 8: dtype = torch.uint8 else: - raise NotImplementedError( - f'Amount of optimizer bits not supported: {config["optim_bits"]}' - ) + raise NotImplementedError(f'Amount of optimizer bits not supported: {config["optim_bits"]}') if p.numel() < config["min_8bit_size"]: dtype = torch.float32 @@ -701,17 +656,13 @@ def init_state(self, group, p, gindex, pindex): state = self.state[p] state["step"] = 0 - if dtype == torch.float32 or ( - dtype == torch.uint8 and p.numel() < 4096 - ): + if dtype == torch.float32 or (dtype == torch.uint8 and p.numel() < 4096): state["state1"] = self.get_state_buffer(p, dtype=torch.float32) elif dtype == torch.uint8: if state["step"] == 0: if "dynamic" not in self.name2qmap: self.fill_qmap() - self.name2qmap["dynamic"] = self.name2qmap["dynamic"].to( - p.device - ) + self.name2qmap["dynamic"] = self.name2qmap["dynamic"].to(p.device) state["state1"] = self.get_state_buffer(p, dtype=torch.uint8) state["qmap1"] = self.name2qmap["dynamic"] @@ -721,16 +672,10 @@ def init_state(self, group, p, gindex, pindex): blocks = n // 2048 blocks += 1 if n % 2048 > 0 else 0 - state["absmax1"] = torch.zeros( - (blocks,), dtype=torch.float32, device=p.device - ) + state["absmax1"] = torch.zeros((blocks,), dtype=torch.float32, 
device=p.device) else: - state["max1"] = torch.zeros( - (1,), dtype=torch.float32, device=p.device - ) - state["new_max1"] = torch.zeros( - (1,), dtype=torch.float32, device=p.device - ) + state["max1"] = torch.zeros((1,), dtype=torch.float32, device=p.device) + state["new_max1"] = torch.zeros((1,), dtype=torch.float32, device=p.device) if config["percentile_clipping"] < 100: state["gnorm_vec"] = torch.zeros((100,), device=p.device) @@ -750,7 +695,10 @@ def update_step(self, group, p, gindex, pindex): if config["percentile_clipping"] < 100: current_gnorm, clip_value, gnorm_scale = F.percentile_clipping( - grad, state["gnorm_vec"], step, config["percentile_clipping"] + grad, + state["gnorm_vec"], + step, + config["percentile_clipping"], ) else: gnorm_scale = 1.0 @@ -766,7 +714,7 @@ def update_step(self, group, p, gindex, pindex): step, config["lr"], None, - config['betas'][1], + config["betas"][1], config["weight_decay"], gnorm_scale, state["unorm_vec"] if config["max_unorm"] > 0.0 else None, diff --git a/bitsandbytes/optim/rmsprop.py b/bitsandbytes/optim/rmsprop.py index ac371a66f..659617654 100644 --- a/bitsandbytes/optim/rmsprop.py +++ b/bitsandbytes/optim/rmsprop.py @@ -51,9 +51,7 @@ def __init__( Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. """ if alpha == 0: - raise NotImplementedError( - "RMSprop with alpha==0.0 is not supported!" - ) + raise NotImplementedError("RMSprop with alpha==0.0 is not supported!") if centered: raise NotImplementedError("Centered RMSprop is not supported!") super().__init__( @@ -116,9 +114,7 @@ def __init__( Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. """ if alpha == 0: - raise NotImplementedError( - "RMSprop with alpha==0.0 is not supported!" - ) + raise NotImplementedError("RMSprop with alpha==0.0 is not supported!") if centered: raise NotImplementedError("Centered RMSprop is not supported!") super().__init__( @@ -182,9 +178,7 @@ def __init__( """ if alpha == 0: - raise NotImplementedError( - "RMSprop with alpha==0.0 is not supported!" - ) + raise NotImplementedError("RMSprop with alpha==0.0 is not supported!") if centered: raise NotImplementedError("Centered RMSprop is not supported!") super().__init__( diff --git a/bitsandbytes/research/autograd/_functions.py b/bitsandbytes/research/autograd/_functions.py index 7d869e39a..b194b8777 100644 --- a/bitsandbytes/research/autograd/_functions.py +++ b/bitsandbytes/research/autograd/_functions.py @@ -195,9 +195,9 @@ def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState()): # noqa: B00 ctx.B = B ctx.bias = bias if A.shape[-1] == B.shape[0]: - return torch.empty(A.shape[:-1]+B.shape[1:], dtype=A.dtype, device=A.device) + return torch.empty(A.shape[:-1] + B.shape[1:], dtype=A.dtype, device=A.device) else: - return torch.empty(A.shape[:-1]+B.shape[:1], dtype=A.dtype, device=A.device) + return torch.empty(A.shape[:-1] + B.shape[:1], dtype=A.dtype, device=A.device) # 1. Quantize A # 2. Quantize B @@ -216,9 +216,7 @@ def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState()): # noqa: B00 # 1. 
Quantize A if len(A.shape) == 3: A = A.view(-1, A.shape[-1]).contiguous() - CA, CAt, SCA, SCAt, coo_tensorA = F.double_quant( - A.to(torch.float16), threshold=state.threshold - ) + CA, CAt, SCA, SCAt, coo_tensorA = F.double_quant(A.to(torch.float16), threshold=state.threshold) if state.threshold > 0.0 and coo_tensorA is not None: if state.has_fp16_weights: @@ -234,14 +232,14 @@ def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState()): # noqa: B00 # we also need to convert it to the turing/ampere format state.CxB, state.SB = F.transform(state.CB, to_order=formatB) else: - #print('A shape', A.shape) + # print('A shape', A.shape) if not state.has_fp16_weights and state.CxB is None: state.CxB, state.SB = F.transform(state.CB, to_order=formatB) subA = None # 2. Quantize B if state.has_fp16_weights: - #print('B shape', B.shape) + # print('B shape', B.shape) has_grad = True if (getattr(B, "grad", None) is not None) else False is_transposed = not B.is_contiguous() and B.shape[0] == B.stride(1) if is_transposed: @@ -272,12 +270,7 @@ def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState()): # noqa: B00 # else: # state.idx = outlier_idx outliers = F.extract_outliers(state.CxB, state.SB, state.idx.int()) - state.subB = ( - (outliers * state.SCB.view(-1, 1) / 127.0) - .t() - .contiguous() - .to(A.dtype) - ) + state.subB = (outliers * state.SCB.view(-1, 1) / 127.0).t().contiguous().to(A.dtype) CA[:, state.idx.long()] = 0 CAt[:, state.idx.long()] = 0 subA = A[:, state.idx.long()] @@ -320,14 +313,13 @@ def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState()): # noqa: B00 ctx.tensor_states = (None, None) ctx.save_for_backward(None, None) - - clone_func = torch.clone if len(output_shape) == 3 else lambda x : x + clone_func = torch.clone if len(output_shape) == 3 else lambda x: x return clone_func(output.view(output_shape)) @staticmethod def backward(ctx, grad_output): if ctx.is_empty: - bias_grad = (None if ctx.bias is None else torch.zeros_like(ctx.bias)) + bias_grad = None if ctx.bias is None else torch.zeros_like(ctx.bias) return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, bias_grad, None req_gradA, req_gradB, _, req_gradBias, _ = ctx.needs_input_grad CAt, subA, A = ctx.tensors @@ -342,9 +334,7 @@ def backward(ctx, grad_output): # Cast grad_output to fp16 if len(grad_output.shape) == 3: - grad_output = grad_output.reshape( - -1, grad_output.shape[-1] - ).contiguous() + grad_output = grad_output.reshape(-1, grad_output.shape[-1]).contiguous() Cgrad, Cgradt, SCgrad, SCgradt, coo_tensor = F.double_quant(grad_output.to(torch.float16)) @@ -357,25 +347,24 @@ def backward(ctx, grad_output): if state.CBt is not None: C32grad, Sgrad = F.transform(Cgrad, "col32") if state.CxBt is None: - state.CxBt, state.SBt = F.transform( - state.CBt, to_order=formatB, transpose=True - ) + state.CxBt, state.SBt = F.transform(state.CBt, to_order=formatB, transpose=True) # print('back B shape', state.CxBt.shape) # print('back grad shape', C32grad.shape) gradA32, SgradA32 = F.igemmlt(C32grad, state.CxBt, Sgrad, state.SBt) grad_A = F.mm_dequant(gradA32, SgradA32, SCgrad, state.SCBt).view(ctx.grad_shape).to(ctx.dtype_A) elif state.CB is not None: - CB = state.CB.to(ctx.dtype_A, copy=True).mul_(state.SCB.unsqueeze(1).mul(1. 
/ 127.0)) + CB = state.CB.to(ctx.dtype_A, copy=True).mul_(state.SCB.unsqueeze(1).mul(1.0 / 127.0)) grad_A = torch.matmul(grad_output, CB).view(ctx.grad_shape).to(ctx.dtype_A) else: - raise Exception('State must contain either CBt or CB matrix for backward') + raise Exception("State must contain either CBt or CB matrix for backward") return grad_A, grad_B, None, grad_bias, None + def get_block_sizes(input_matrix, weight_matrix): input_features = input_matrix.shape[-1] - output_features = (weight_matrix.shape[0] if weight_matrix.shape[1] == input_features else weight_matrix.shape[1]) + output_features = weight_matrix.shape[0] if weight_matrix.shape[1] == input_features else weight_matrix.shape[1] array = [4096, 2048, 1024, 512, 256, 128, 64, 0] bsz, bsz2 = 1024, 1024 for i, k in enumerate(array): @@ -399,7 +388,8 @@ def matmul_fp8_global( bsz: int = -1, bsz2: int = -1, ): - if bsz == -1 or bsz2 == -1: bsz, bsz2 = get_block_sizes(A, B) + if bsz == -1 or bsz2 == -1: + bsz, bsz2 = get_block_sizes(A, B) return MatMulFP8Global.apply(A, B, out, fw_code, bw_code, bsz, bsz2) @@ -412,7 +402,8 @@ def matmul_fp8_mixed( bsz: int = -1, bsz2: int = -1, ): - if bsz == -1 or bsz2 == -1: bsz, bsz2 = get_block_sizes(A, B) + if bsz == -1 or bsz2 == -1: + bsz, bsz2 = get_block_sizes(A, B) return MatMulFP8Mixed.apply(A, B, out, fw_code, bw_code, bsz, bsz2) @@ -422,7 +413,7 @@ def switchback_bnb( out: Optional[torch.Tensor] = None, state: Optional[MatmulLtState] = None, threshold=0.0, - bias=None + bias=None, ): state = state or MatmulLtState() if threshold > 0.0: diff --git a/bitsandbytes/research/nn/modules.py b/bitsandbytes/research/nn/modules.py index 7fca34d23..57c0f3358 100644 --- a/bitsandbytes/research/nn/modules.py +++ b/bitsandbytes/research/nn/modules.py @@ -28,12 +28,20 @@ def forward(self, x: torch.Tensor): self.bw_code = bnb.functional.create_fp8_map(True, 5, 2, 8).to(x.device) self.fw_code = bnb.functional.create_fp8_map(True, 4, 3, 8).to(x.device) - out = bnb.research.matmul_fp8_mixed(x, self.weight.t(), fw_code=self.fw_code, bw_code=self.bw_code, bsz=self.bsz, bsz2=self.bsz2) + out = bnb.research.matmul_fp8_mixed( + x, + self.weight.t(), + fw_code=self.fw_code, + bw_code=self.bw_code, + bsz=self.bsz, + bsz2=self.bsz2, + ) if self.bias is not None: out += self.bias return out + class LinearFP8Global(nn.Linear): def __init__(self, input_features, output_features, bias=True): super().__init__(input_features, output_features, bias) @@ -54,7 +62,14 @@ def forward(self, x: torch.Tensor): self.bw_code = bnb.functional.create_fp8_map(True, 5, 2, 8).to(x.device) self.fw_code = bnb.functional.create_fp8_map(True, 4, 3, 8).to(x.device) - out = bnb.matmul_fp8_global(x, self.weight.t(), fw_code=self.fw_code, bw_code=self.bw_code, bsz=self.bsz, bsz2=self.bsz2) + out = bnb.matmul_fp8_global( + x, + self.weight.t(), + fw_code=self.fw_code, + bw_code=self.bw_code, + bsz=self.bsz, + bsz2=self.bsz2, + ) if self.bias is not None: out += self.bias diff --git a/bitsandbytes/triton/dequantize_rowwise.py b/bitsandbytes/triton/dequantize_rowwise.py index 3d7529852..26eab84f2 100644 --- a/bitsandbytes/triton/dequantize_rowwise.py +++ b/bitsandbytes/triton/dequantize_rowwise.py @@ -5,9 +5,10 @@ from bitsandbytes.triton.triton_utils import is_triton_available if not is_triton_available(): - def dequantize_rowwise(x: torch.Tensor, state_x: torch.Tensor): return None -else: + def dequantize_rowwise(x: torch.Tensor, state_x: torch.Tensor): + return None +else: import triton import triton.language as tl @@ -15,21 +16,21 @@ def 
dequantize_rowwise(x: torch.Tensor, state_x: torch.Tensor): return None # TODO: autotune this better. @triton.autotune( - configs=[ - triton.Config({}, num_stages=1, num_warps=8), - triton.Config({}, num_stages=2, num_warps=8), - triton.Config({}, num_stages=4, num_warps=8), - triton.Config({}, num_stages=8, num_warps=8), - triton.Config({}, num_stages=1), - triton.Config({}, num_stages=2), - triton.Config({}, num_stages=4), - triton.Config({}, num_stages=8), - triton.Config({}, num_warps=1), - triton.Config({}, num_warps=2), - triton.Config({}, num_warps=4), - triton.Config({}, num_warps=8), - ], - key=['n_elements'] + configs=[ + triton.Config({}, num_stages=1, num_warps=8), + triton.Config({}, num_stages=2, num_warps=8), + triton.Config({}, num_stages=4, num_warps=8), + triton.Config({}, num_stages=8, num_warps=8), + triton.Config({}, num_stages=1), + triton.Config({}, num_stages=2), + triton.Config({}, num_stages=4), + triton.Config({}, num_stages=8), + triton.Config({}, num_warps=1), + triton.Config({}, num_warps=2), + triton.Config({}, num_warps=4), + triton.Config({}, num_warps=8), + ], + key=["n_elements"], ) @triton.jit def _dequantize_rowwise( @@ -51,7 +52,6 @@ def _dequantize_rowwise( output = max_val * x * inv_127 tl.store(output_ptr + offsets, output, mask=row_mask) - def dequantize_rowwise(x: torch.Tensor, state_x: torch.Tensor): output = torch.empty(*x.shape, device=x.device, dtype=torch.float16) @@ -60,5 +60,5 @@ def dequantize_rowwise(x: torch.Tensor, state_x: torch.Tensor): assert x.is_cuda and output.is_cuda n_elements = output.numel() grid = lambda meta: (x.shape[0],) - _dequantize_rowwise[grid](x, state_x, output, 1./127, n_elements, BLOCK_SIZE=x.shape[1], P2=P2) + _dequantize_rowwise[grid](x, state_x, output, 1.0 / 127, n_elements, BLOCK_SIZE=x.shape[1], P2=P2) return output diff --git a/bitsandbytes/triton/int8_matmul_mixed_dequantize.py b/bitsandbytes/triton/int8_matmul_mixed_dequantize.py index dc3047d7e..583371d91 100644 --- a/bitsandbytes/triton/int8_matmul_mixed_dequantize.py +++ b/bitsandbytes/triton/int8_matmul_mixed_dequantize.py @@ -3,14 +3,14 @@ from bitsandbytes.triton.triton_utils import is_triton_available if not is_triton_available(): - def int8_matmul_mixed_dequantize(a, b, state_x, state_w, bias): return None -else: + def int8_matmul_mixed_dequantize(a, b, state_x, state_w, bias): + return None +else: import triton import triton.language as tl from triton.ops.matmul_perf_model import early_config_prune, estimate_matmul_time - # This is a matmul kernel based on triton.ops.matmul # It is modified to support rowwise quantized input and global quantized weight # It's purpose is fused matmul then dequantize @@ -27,58 +27,83 @@ def get_configs_io_bound(): for block_n in [32, 64, 128, 256]: num_warps = 2 if block_n <= 64 else 4 configs.append( - triton.Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': 1}, - num_stages=num_stages, num_warps=num_warps)) + triton.Config( + {"BLOCK_M": block_m, "BLOCK_N": block_n, "BLOCK_K": block_k, "SPLIT_K": 1}, + num_stages=num_stages, + num_warps=num_warps, + ), + ) # split_k for split_k in [2, 4, 8, 16]: - configs.append(triton.Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': split_k}, - num_stages=num_stages, num_warps=num_warps, pre_hook=init_to_zero('C'))) + configs.append( + triton.Config( + {"BLOCK_M": block_m, "BLOCK_N": block_n, "BLOCK_K": block_k, "SPLIT_K": split_k}, + num_stages=num_stages, + num_warps=num_warps, + pre_hook=init_to_zero("C"), + ), + ) 
return configs - @triton.autotune( configs=[ # basic configs for compute-bound matmuls - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=5, num_warps=2), + triton.Config({"BLOCK_M": 128, "BLOCK_N": 256, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=3, num_warps=8), + triton.Config({"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=3, num_warps=8), + triton.Config({"BLOCK_M": 256, "BLOCK_N": 64, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4), + triton.Config({"BLOCK_M": 64, "BLOCK_N": 256, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4), + triton.Config({"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4), + triton.Config({"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4), + triton.Config({"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4), + triton.Config({"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4), + triton.Config({"BLOCK_M": 64, "BLOCK_N": 32, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=5, num_warps=2), # good for int8 - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=5, num_warps=2), + triton.Config({"BLOCK_M": 128, "BLOCK_N": 256, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=3, num_warps=8), + triton.Config({"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=3, num_warps=8), + triton.Config({"BLOCK_M": 256, "BLOCK_N": 64, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=4, num_warps=4), + triton.Config({"BLOCK_M": 64, "BLOCK_N": 256, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=4, num_warps=4), + triton.Config({"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=4, num_warps=4), + triton.Config({"BLOCK_M": 128, 
"BLOCK_N": 64, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=4, num_warps=4), + triton.Config({"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=4, num_warps=4), + triton.Config({"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=4, num_warps=4), + triton.Config({"BLOCK_M": 64, "BLOCK_N": 32, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=5, num_warps=2), *get_configs_io_bound(), ], - key=['M', 'N', 'K'], - prune_configs_by={ - 'early_config_prune': early_config_prune, - 'perf_model': estimate_matmul_time, - 'top_k': 10 + key=["M", "N", "K"], + prune_configs_by={"early_config_prune": early_config_prune, "perf_model": estimate_matmul_time, "top_k": 10}, + ) + @triton.heuristics( + { + "EVEN_K": lambda args: args["K"] % (args["BLOCK_K"] * args["SPLIT_K"]) == 0, }, ) - @triton.heuristics({ - 'EVEN_K': lambda args: args['K'] % (args['BLOCK_K'] * args['SPLIT_K']) == 0, - }) @triton.jit - def _int8_matmul_mixed_dequantize(A, B, C, bias, state_x_ptr, state_w_ptr, M, N, K, divfactor: tl.constexpr, has_bias : tl.constexpr, - stride_am, stride_ak, - stride_bk, stride_bn, - stride_cm, stride_cn, - BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, - GROUP_M: tl.constexpr, SPLIT_K: tl.constexpr, EVEN_K: tl.constexpr, - ACC_TYPE: tl.constexpr - ): + def _int8_matmul_mixed_dequantize( + A, + B, + C, + bias, + state_x_ptr, + state_w_ptr, + M, + N, + K, + divfactor: tl.constexpr, + has_bias: tl.constexpr, + stride_am, + stride_ak, + stride_bk, + stride_bn, + stride_cm, + stride_cn, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + GROUP_M: tl.constexpr, + SPLIT_K: tl.constexpr, + EVEN_K: tl.constexpr, + ACC_TYPE: tl.constexpr, + ): # matrix multiplication pid = tl.program_id(0) pid_z = tl.program_id(1) @@ -115,13 +140,13 @@ def _int8_matmul_mixed_dequantize(A, B, C, bias, state_x_ptr, state_w_ptr, M, N, b = tl.load(B) else: k_remaining = K - k * (BLOCK_K * SPLIT_K) - a = tl.load(A, mask=rk[None, :] < k_remaining, other=0.) - b = tl.load(B, mask=rk[:, None] < k_remaining, other=0.) + a = tl.load(A, mask=rk[None, :] < k_remaining, other=0.0) + b = tl.load(B, mask=rk[:, None] < k_remaining, other=0.0) acc += tl.dot(a, b) A += BLOCK_K * SPLIT_K * stride_ak B += BLOCK_K * SPLIT_K * stride_bk - acc = (w_factor * (x_factor * (acc * divfactor))) + acc = w_factor * (x_factor * (acc * divfactor)) acc = acc.to(C.dtype.element_ty) # conditionally add bias @@ -137,10 +162,9 @@ def _int8_matmul_mixed_dequantize(A, B, C, bias, state_x_ptr, state_w_ptr, M, N, else: tl.atomic_add(C, acc, mask=mask) - def int8_matmul_mixed_dequantize(a, b, state_x, state_w, bias): device = a.device - divfactor = 1. / (127. * 127.) 
+ divfactor = 1.0 / (127.0 * 127.0) has_bias = 0 if bias is None else 1 # handle non-contiguous inputs if necessary if a.stride(0) > 1 and a.stride(1) > 1: @@ -154,12 +178,28 @@ def int8_matmul_mixed_dequantize(a, b, state_x, state_w, bias): # allocates output c = torch.empty((M, N), device=device, dtype=torch.float16) # accumulator types - ACC_TYPE = tl.float32 #if a.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32 + ACC_TYPE = tl.float32 # if a.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32 # launch int8_matmul_mixed_dequantize kernel - grid = lambda META: (triton.cdiv(M, META['BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']), META['SPLIT_K']) - _int8_matmul_mixed_dequantize[grid](a, b, c, bias, state_x, state_w, M, N, K, divfactor, has_bias, - a.stride(0), a.stride(1), - b.stride(0), b.stride(1), - c.stride(0), c.stride(1), - GROUP_M=8, ACC_TYPE=ACC_TYPE) + grid = lambda META: (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]), META["SPLIT_K"]) + _int8_matmul_mixed_dequantize[grid]( + a, + b, + c, + bias, + state_x, + state_w, + M, + N, + K, + divfactor, + has_bias, + a.stride(0), + a.stride(1), + b.stride(0), + b.stride(1), + c.stride(0), + c.stride(1), + GROUP_M=8, + ACC_TYPE=ACC_TYPE, + ) return c diff --git a/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py b/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py index 4881e1468..e3d192ded 100644 --- a/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py +++ b/bitsandbytes/triton/int8_matmul_rowwise_dequantize.py @@ -3,7 +3,9 @@ from bitsandbytes.triton.triton_utils import is_triton_available if not is_triton_available(): - def int8_matmul_rowwise_dequantize(a, b, state_x, state_w, bias): return None + + def int8_matmul_rowwise_dequantize(a, b, state_x, state_w, bias): + return None else: import triton import triton.language as tl @@ -17,7 +19,6 @@ def int8_matmul_rowwise_dequantize(a, b, state_x, state_w, bias): return None def init_to_zero(name): return lambda nargs: nargs[name].zero_() - def get_configs_io_bound(): configs = [] for num_stages in [2, 3, 4, 5, 6]: @@ -26,58 +27,83 @@ def get_configs_io_bound(): for block_n in [32, 64, 128, 256]: num_warps = 2 if block_n <= 64 else 4 configs.append( - triton.Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': 1}, - num_stages=num_stages, num_warps=num_warps)) + triton.Config( + {"BLOCK_M": block_m, "BLOCK_N": block_n, "BLOCK_K": block_k, "SPLIT_K": 1}, + num_stages=num_stages, + num_warps=num_warps, + ), + ) # split_k for split_k in [2, 4, 8, 16]: - configs.append(triton.Config({'BLOCK_M': block_m, 'BLOCK_N': block_n, 'BLOCK_K': block_k, 'SPLIT_K': split_k}, - num_stages=num_stages, num_warps=num_warps, pre_hook=init_to_zero('C'))) + configs.append( + triton.Config( + {"BLOCK_M": block_m, "BLOCK_N": block_n, "BLOCK_K": block_k, "SPLIT_K": split_k}, + num_stages=num_stages, + num_warps=num_warps, + pre_hook=init_to_zero("C"), + ), + ) return configs - @triton.autotune( configs=[ # basic configs for compute-bound matmuls - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 
32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 32, 'SPLIT_K': 1}, num_stages=5, num_warps=2), + triton.Config({"BLOCK_M": 128, "BLOCK_N": 256, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=3, num_warps=8), + triton.Config({"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=3, num_warps=8), + triton.Config({"BLOCK_M": 256, "BLOCK_N": 64, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4), + triton.Config({"BLOCK_M": 64, "BLOCK_N": 256, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4), + triton.Config({"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4), + triton.Config({"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4), + triton.Config({"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4), + triton.Config({"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=4, num_warps=4), + triton.Config({"BLOCK_M": 64, "BLOCK_N": 32, "BLOCK_K": 32, "SPLIT_K": 1}, num_stages=5, num_warps=2), # good for int8 - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=3, num_warps=8), - triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 128, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 128, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=4, num_warps=4), - triton.Config({'BLOCK_M': 64, 'BLOCK_N': 32, 'BLOCK_K': 64, 'SPLIT_K': 1}, num_stages=5, num_warps=2), + triton.Config({"BLOCK_M": 128, "BLOCK_N": 256, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=3, num_warps=8), + triton.Config({"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=3, num_warps=8), + triton.Config({"BLOCK_M": 256, "BLOCK_N": 64, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=4, num_warps=4), + triton.Config({"BLOCK_M": 64, "BLOCK_N": 256, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=4, num_warps=4), + triton.Config({"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 128, "SPLIT_K": 1}, num_stages=4, num_warps=4), + triton.Config({"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=4, num_warps=4), + triton.Config({"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=4, num_warps=4), + triton.Config({"BLOCK_M": 128, "BLOCK_N": 32, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=4, num_warps=4), + triton.Config({"BLOCK_M": 64, "BLOCK_N": 32, "BLOCK_K": 64, "SPLIT_K": 1}, num_stages=5, num_warps=2), *get_configs_io_bound(), ], - key=['M', 'N', 'K'], - prune_configs_by={ - 'early_config_prune': early_config_prune, - 'perf_model': estimate_matmul_time, - 'top_k': 10 + key=["M", "N", "K"], 
+ prune_configs_by={"early_config_prune": early_config_prune, "perf_model": estimate_matmul_time, "top_k": 10}, + ) + @triton.heuristics( + { + "EVEN_K": lambda args: args["K"] % (args["BLOCK_K"] * args["SPLIT_K"]) == 0, }, ) - @triton.heuristics({ - 'EVEN_K': lambda args: args['K'] % (args['BLOCK_K'] * args['SPLIT_K']) == 0, - }) @triton.jit - def _int8_matmul_rowwise_dequantize(A, B, C, bias, state_x_ptr, state_w_ptr, M, N, K, divfactor, has_bias : tl.constexpr, - stride_am, stride_ak, - stride_bk, stride_bn, - stride_cm, stride_cn, - BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, - GROUP_M: tl.constexpr, SPLIT_K: tl.constexpr, EVEN_K: tl.constexpr, - ACC_TYPE: tl.constexpr - ): + def _int8_matmul_rowwise_dequantize( + A, + B, + C, + bias, + state_x_ptr, + state_w_ptr, + M, + N, + K, + divfactor, + has_bias: tl.constexpr, + stride_am, + stride_ak, + stride_bk, + stride_bn, + stride_cm, + stride_cn, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + GROUP_M: tl.constexpr, + SPLIT_K: tl.constexpr, + EVEN_K: tl.constexpr, + ACC_TYPE: tl.constexpr, + ): # matrix multiplication pid = tl.program_id(0) pid_z = tl.program_id(1) @@ -114,13 +140,13 @@ def _int8_matmul_rowwise_dequantize(A, B, C, bias, state_x_ptr, state_w_ptr, M, b = tl.load(B) else: k_remaining = K - k * (BLOCK_K * SPLIT_K) - a = tl.load(A, mask=rk[None, :] < k_remaining, other=0.) - b = tl.load(B, mask=rk[:, None] < k_remaining, other=0.) + a = tl.load(A, mask=rk[None, :] < k_remaining, other=0.0) + b = tl.load(B, mask=rk[:, None] < k_remaining, other=0.0) acc += tl.dot(a, b) A += BLOCK_K * SPLIT_K * stride_ak B += BLOCK_K * SPLIT_K * stride_bk - acc = (w_factor * (x_factor * (acc * divfactor))) + acc = w_factor * (x_factor * (acc * divfactor)) acc = acc.to(C.dtype.element_ty) if has_bias: @@ -135,9 +161,8 @@ def _int8_matmul_rowwise_dequantize(A, B, C, bias, state_x_ptr, state_w_ptr, M, else: tl.atomic_add(C, acc, mask=mask) - def int8_matmul_rowwise_dequantize(a, b, state_x, state_w, bias): - divfactor = 1. / (127. * 127.) 
+ divfactor = 1.0 / (127.0 * 127.0) has_bias = 0 if bias is None else 1 @@ -154,12 +179,28 @@ def int8_matmul_rowwise_dequantize(a, b, state_x, state_w, bias): # allocates output c = torch.empty((M, N), device=device, dtype=torch.float16) # accumulator types - ACC_TYPE = tl.float32 #if a.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32 + ACC_TYPE = tl.float32 # if a.dtype in [torch.float16, torch.bfloat16, torch.float32] else tl.int32 # launch int8_matmul_rowwise_dequantize kernel - grid = lambda META: (triton.cdiv(M, META['BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']), META['SPLIT_K']) - _int8_matmul_rowwise_dequantize[grid](a, b, c, bias, state_x, state_w, M, N, K, divfactor, has_bias, - a.stride(0), a.stride(1), - b.stride(0), b.stride(1), - c.stride(0), c.stride(1), - GROUP_M=8, ACC_TYPE=ACC_TYPE) + grid = lambda META: (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]), META["SPLIT_K"]) + _int8_matmul_rowwise_dequantize[grid]( + a, + b, + c, + bias, + state_x, + state_w, + M, + N, + K, + divfactor, + has_bias, + a.stride(0), + a.stride(1), + b.stride(0), + b.stride(1), + c.stride(0), + c.stride(1), + GROUP_M=8, + ACC_TYPE=ACC_TYPE, + ) return c diff --git a/bitsandbytes/triton/quantize_columnwise_and_transpose.py b/bitsandbytes/triton/quantize_columnwise_and_transpose.py index e7961cf53..b8eeffd0c 100644 --- a/bitsandbytes/triton/quantize_columnwise_and_transpose.py +++ b/bitsandbytes/triton/quantize_columnwise_and_transpose.py @@ -5,9 +5,10 @@ from bitsandbytes.triton.triton_utils import is_triton_available if not is_triton_available(): - def quantize_columnwise_and_transpose(x: torch.Tensor): return None -else: + def quantize_columnwise_and_transpose(x: torch.Tensor): + return None +else: import triton import triton.language as tl @@ -15,23 +16,23 @@ def quantize_columnwise_and_transpose(x: torch.Tensor): return None # TODO: autotune this better. 
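The rowwise kernels touched in this patch are the building blocks of the vector-wise SwitchBack matmul path. A rough sketch of how they compose; the shapes, dtypes, and the presence of a CUDA device with Triton installed are assumptions, not something this patch establishes:

import torch
from bitsandbytes.triton.quantize_rowwise import quantize_rowwise
from bitsandbytes.triton.int8_matmul_rowwise_dequantize import int8_matmul_rowwise_dequantize

x = torch.randn(32, 4096, device="cuda", dtype=torch.float16)    # activations (M, K)
w = torch.randn(1024, 4096, device="cuda", dtype=torch.float16)  # weight (N, K)
bias = torch.randn(1024, device="cuda", dtype=torch.float16)

x_int8, state_x = quantize_rowwise(x)   # int8 values plus one absmax scale per row
w_int8, state_w = quantize_rowwise(w)
out = int8_matmul_rowwise_dequantize(x_int8, w_int8.t(), state_x, state_w, bias)  # fp16 (M, N)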
@triton.autotune( - configs=[ - triton.Config({}, num_stages=1), - triton.Config({}, num_stages=2), - triton.Config({}, num_stages=4), - triton.Config({}, num_stages=8), - triton.Config({}, num_stages=16), - triton.Config({}, num_stages=1, num_warps=8), - triton.Config({}, num_stages=2, num_warps=8), - triton.Config({}, num_stages=4, num_warps=8), - triton.Config({}, num_stages=8, num_warps=8), - triton.Config({}, num_stages=16, num_warps=8), - triton.Config({}, num_warps=1), - triton.Config({}, num_warps=2), - triton.Config({}, num_warps=4), - triton.Config({}, num_warps=8), - ], - key=['n_elements'] + configs=[ + triton.Config({}, num_stages=1), + triton.Config({}, num_stages=2), + triton.Config({}, num_stages=4), + triton.Config({}, num_stages=8), + triton.Config({}, num_stages=16), + triton.Config({}, num_stages=1, num_warps=8), + triton.Config({}, num_stages=2, num_warps=8), + triton.Config({}, num_stages=4, num_warps=8), + triton.Config({}, num_stages=8, num_warps=8), + triton.Config({}, num_stages=16, num_warps=8), + triton.Config({}, num_warps=1), + triton.Config({}, num_warps=2), + triton.Config({}, num_warps=4), + triton.Config({}, num_warps=8), + ], + key=["n_elements"], ) @triton.jit def _quantize_columnwise_and_transpose( @@ -39,7 +40,8 @@ def _quantize_columnwise_and_transpose( output_ptr, output_maxs, n_elements, - M : tl.constexpr, N : tl.constexpr, + M: tl.constexpr, + N: tl.constexpr, BLOCK_SIZE: tl.constexpr, P2: tl.constexpr, ): @@ -47,12 +49,12 @@ def _quantize_columnwise_and_transpose( block_start = pid p2_arange = tl.arange(0, P2) p2_arange_mask = p2_arange < M - arange = p2_arange * N + arange = p2_arange * N offsets = block_start + arange x = tl.load(x_ptr + offsets, mask=p2_arange_mask) abs_x = tl.abs(x) max_val = tl.max(tl.where(p2_arange_mask, abs_x, 0), axis=0) - output = tl.libdevice.llrint(127. * (x / max_val)) + output = tl.libdevice.llrint(127.0 * (x / max_val)) new_start = pid * M new_offsets = new_start + p2_arange @@ -68,6 +70,6 @@ def quantize_columnwise_and_transpose(x: torch.Tensor): assert x.is_cuda and output.is_cuda n_elements = output.numel() - grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) + grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),) _quantize_columnwise_and_transpose[grid](x, output, output_maxs, n_elements, M, N, BLOCK_SIZE=M, P2=P2) return output, output_maxs diff --git a/bitsandbytes/triton/quantize_global.py b/bitsandbytes/triton/quantize_global.py index 5cf194744..f35bdd304 100644 --- a/bitsandbytes/triton/quantize_global.py +++ b/bitsandbytes/triton/quantize_global.py @@ -1,24 +1,25 @@ - import torch from bitsandbytes.triton.triton_utils import is_triton_available if not is_triton_available(): - def quantize_global_transpose(input): return None - def quantize_global(x: torch.Tensor): return None -else: + def quantize_global_transpose(input): + return None + + def quantize_global(x: torch.Tensor): + return None +else: import triton import triton.language as tl # global quantize @triton.autotune( - configs=[ - triton.Config({'BLOCK_SIZE': 1024,}, num_warps=4), - triton.Config({'BLOCK_SIZE': 2048,}, num_stages=1), - - ], - key=['n_elements'] + configs=[ + triton.Config({"BLOCK_SIZE": 1024}, num_warps=4), + triton.Config({"BLOCK_SIZE": 2048}, num_stages=1), + ], + key=["n_elements"], ) @triton.jit def _quantize_global( @@ -34,35 +35,43 @@ def _quantize_global( mask = offsets < n_elements x = tl.load(x_ptr + offsets, mask=mask) absmax_inv = tl.load(absmax_inv_ptr) - output = tl.libdevice.llrint(127. 
* (x * absmax_inv)) + output = tl.libdevice.llrint(127.0 * (x * absmax_inv)) tl.store(output_ptr + offsets, output, mask=mask) def quantize_global(x: torch.Tensor): absmax = x.abs().max().unsqueeze(0) - absmax_inv = 1./ absmax - output = torch.empty(*x.shape, device='cuda', dtype=torch.int8) + absmax_inv = 1.0 / absmax + output = torch.empty(*x.shape, device="cuda", dtype=torch.int8) assert x.is_cuda and output.is_cuda n_elements = output.numel() - grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) + grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),) _quantize_global[grid](x, absmax_inv, output, n_elements) return output, absmax - # global quantize and transpose @triton.autotune( - configs=[ - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'GROUP_M': 8}, num_warps=4), - triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'GROUP_M': 8}, num_warps=4), - - # ... - ], - key=['M', 'N'] + configs=[ + triton.Config({"BLOCK_M": 128, "BLOCK_N": 128, "GROUP_M": 8}, num_warps=4), + triton.Config({"BLOCK_M": 128, "BLOCK_N": 128, "GROUP_M": 8}, num_warps=4), + # ... + ], + key=["M", "N"], ) @triton.jit - def _quantize_global_transpose(A, absmax_inv_ptr, B, stride_am, stride_an, stride_bn, stride_bm, M, N, - BLOCK_M : tl.constexpr, - BLOCK_N : tl.constexpr, - GROUP_M : tl.constexpr): + def _quantize_global_transpose( + A, + absmax_inv_ptr, + B, + stride_am, + stride_an, + stride_bn, + stride_bm, + M, + N, + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + GROUP_M: tl.constexpr, + ): pid = tl.program_id(0) grid_m = (M + BLOCK_M - 1) // BLOCK_M grid_n = (N + BLOCK_N - 1) // BLOCK_N @@ -86,20 +95,30 @@ def _quantize_global_transpose(A, absmax_inv_ptr, B, stride_am, stride_an, strid B = B + (rm[:, None] * stride_bm + rn[None, :] * stride_bn) mask = (rm < M)[:, None] & (rn < N)[None, :] - output = tl.libdevice.llrint(127. * (a * absmax_inv)) + output = tl.libdevice.llrint(127.0 * (a * absmax_inv)) tl.store(B, output, mask=mask) def quantize_global_transpose(input): absmax = input.abs().max().unsqueeze(0) - absmax_inv = 1./ absmax + absmax_inv = 1.0 / absmax M, N = input.shape - out = torch.empty(N, M, device='cuda', dtype=torch.int8) + out = torch.empty(N, M, device="cuda", dtype=torch.int8) assert out.size(0) == N and out.size(1) == M assert input.stride(0) == 1 or input.stride(1) == 1 assert out.stride(0) == 1 or out.stride(1) == 1 - grid = lambda META: (triton.cdiv(M, META['BLOCK_M']) * triton.cdiv(N, META['BLOCK_N']),) - _quantize_global_transpose[grid](input, absmax_inv, out, input.stride(0), input.stride(1), out.stride(0), out.stride(1), M, N) + grid = lambda META: (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),) + _quantize_global_transpose[grid]( + input, + absmax_inv, + out, + input.stride(0), + input.stride(1), + out.stride(0), + out.stride(1), + M, + N, + ) return out, absmax diff --git a/bitsandbytes/triton/quantize_rowwise.py b/bitsandbytes/triton/quantize_rowwise.py index 078f4aa2d..f92ace02c 100644 --- a/bitsandbytes/triton/quantize_rowwise.py +++ b/bitsandbytes/triton/quantize_rowwise.py @@ -5,9 +5,10 @@ from bitsandbytes.triton.triton_utils import is_triton_available if not is_triton_available(): - def quantize_rowwise(x: torch.Tensor): return None -else: + def quantize_rowwise(x: torch.Tensor): + return None +else: import triton import triton.language as tl @@ -15,21 +16,21 @@ def quantize_rowwise(x: torch.Tensor): return None # TODO: autotune this better. 
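The rowwise quantize/dequantize pair touched by this patch is easy to sanity-check in isolation. A small round trip, again with illustrative shapes and CUDA plus Triton assumed, recovers the input up to int8 rounding error:

import torch
from bitsandbytes.triton.quantize_rowwise import quantize_rowwise
from bitsandbytes.triton.dequantize_rowwise import dequantize_rowwise

x = torch.randn(8, 1024, device="cuda", dtype=torch.float16)
x_int8, state_x = quantize_rowwise(x)        # one absmax per row
x_rec = dequantize_rowwise(x_int8, state_x)  # fp16, approximately equal to x
print((x - x_rec).abs().max())               # worst case is about row absmax / 254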
@triton.autotune( - configs=[ - triton.Config({}, num_stages=1, num_warps=8), - triton.Config({}, num_stages=2, num_warps=8), - triton.Config({}, num_stages=4, num_warps=8), - triton.Config({}, num_stages=8, num_warps=8), - triton.Config({}, num_stages=1), - triton.Config({}, num_stages=2), - triton.Config({}, num_stages=4), - triton.Config({}, num_stages=8), - triton.Config({}, num_warps=1), - triton.Config({}, num_warps=2), - triton.Config({}, num_warps=4), - triton.Config({}, num_warps=8), - ], - key=['n_elements'] + configs=[ + triton.Config({}, num_stages=1, num_warps=8), + triton.Config({}, num_stages=2, num_warps=8), + triton.Config({}, num_stages=4, num_warps=8), + triton.Config({}, num_stages=8, num_warps=8), + triton.Config({}, num_stages=1), + triton.Config({}, num_stages=2), + triton.Config({}, num_stages=4), + triton.Config({}, num_stages=8), + triton.Config({}, num_warps=1), + triton.Config({}, num_warps=2), + triton.Config({}, num_warps=4), + triton.Config({}, num_warps=8), + ], + key=["n_elements"], ) @triton.jit def _quantize_rowwise( @@ -49,7 +50,7 @@ def _quantize_rowwise( abs_x = tl.abs(x) max_val = tl.max(tl.where(row_mask, abs_x, 0), axis=0) - output = tl.libdevice.llrint(127. * (x / max_val)) + output = tl.libdevice.llrint(127.0 * (x / max_val)) tl.store(output_ptr + offsets, output, mask=row_mask) tl.store(output_maxs + pid, max_val) diff --git a/bitsandbytes/utils.py b/bitsandbytes/utils.py index 0582f7fc0..48c7fc82d 100644 --- a/bitsandbytes/utils.py +++ b/bitsandbytes/utils.py @@ -30,7 +30,7 @@ def outlier_hook(module, input): # (1) zscore test of std of hidden dimension outlier_idx = find_outlier_dims(merged, reduction_dim=1, zscore=3) # (2) magnitude > 6 test - dims = (torch.abs(input[0])> 6).sum(dim=list(range(len(input[0].shape)-1))) + dims = (torch.abs(input[0]) > 6).sum(dim=list(range(len(input[0].shape) - 1))) outlier_idx2 = torch.where(dims > 0)[0] outlier_idx = torch.cat([outlier_idx, outlier_idx2]).unique() tracer.hvalue2outlier_idx[hvalue] = outlier_idx @@ -59,14 +59,14 @@ def initialize(self, model): self.hooks.append(m.register_forward_pre_hook(outlier_hook)) def is_initialized(self): - return getattr(self, 'initialized', False) + return getattr(self, "initialized", False) def get_hvalue(self, weight): return weight.data.storage().data_ptr() def get_outliers(self, weight): if not self.is_initialized(): - print('Outlier tracer is not initialized...') + print("Outlier tracer is not initialized...") return None hvalue = self.get_hvalue(weight) if hvalue in self.hvalue2outlier_idx: @@ -80,6 +80,7 @@ def get_instance(cls): cls._instance = cls.__new__(cls) return cls._instance + def find_outlier_dims(weight, reduction_dim=0, zscore=4.0, topk=None, rdm=False): if rdm: return torch.randint(0, weight.shape[1], size=(topk,), device=weight.device).long() @@ -87,13 +88,13 @@ def find_outlier_dims(weight, reduction_dim=0, zscore=4.0, topk=None, rdm=False) m = weight.mean(reduction_dim) mm = m.mean() mstd = m.std() - zm = (m-mm)/mstd + zm = (m - mm) / mstd std = weight.std(reduction_dim) stdm = std.mean() stdstd = std.std() - zstd = (std-stdm)/stdstd + zstd = (std - stdm) / stdstd if topk is not None: val, idx = torch.topk(std.abs(), k=topk, dim=0) @@ -105,10 +106,7 @@ def find_outlier_dims(weight, reduction_dim=0, zscore=4.0, topk=None, rdm=False) def execute_and_return(command_string: str) -> Tuple[str, str]: def _decode(subprocess_err_out_tuple): - return tuple( - to_decode.decode("UTF-8").strip() - for to_decode in subprocess_err_out_tuple - ) + return 
tuple(to_decode.decode("UTF-8").strip() for to_decode in subprocess_err_out_tuple) def execute_and_return_decoded_std_streams(command_string): return _decode( @@ -116,14 +114,13 @@ def execute_and_return_decoded_std_streams(command_string): shlex.split(command_string), stdout=subprocess.PIPE, stderr=subprocess.PIPE, - ).communicate() + ).communicate(), ) std_out, std_err = execute_and_return_decoded_std_streams(command_string) return std_out, std_err - def replace_linear( model, linear_replacement, @@ -163,8 +160,9 @@ def replace_linear( model._modules[name].bias = old_module.bias if post_processing_function is not None: - func = getattr(module, post_processing_function, None) - if func is not None: func(module) + func = getattr(module, post_processing_function, None) + if func is not None: + func(module) return model @@ -179,7 +177,7 @@ def pack_dict_to_tensor(source_dict): A torch tensor containing the packed data. """ json_str = json.dumps(source_dict) - json_bytes = json_str.encode('utf-8') + json_bytes = json_str.encode("utf-8") tensor_data = torch.tensor(list(json_bytes), dtype=torch.uint8) return tensor_data @@ -196,7 +194,7 @@ def unpack_tensor_to_dict(tensor_data): A Python dictionary containing the unpacked data. """ json_bytes = bytes(tensor_data.cpu().numpy()) - json_str = json_bytes.decode('utf-8') + json_str = json_bytes.decode("utf-8") unpacked_dict = json.loads(json_str) return unpacked_dict diff --git a/check_bnb_install.py b/check_bnb_install.py index 5a7f74f89..7a9dc93fc 100644 --- a/check_bnb_install.py +++ b/check_bnb_install.py @@ -2,14 +2,14 @@ import bitsandbytes as bnb -p = torch.nn.Parameter(torch.rand(10,10).cuda()) -a = torch.rand(10,10).cuda() +p = torch.nn.Parameter(torch.rand(10, 10).cuda()) +a = torch.rand(10, 10).cuda() p1 = p.data.sum().item() adam = bnb.optim.Adam([p]) -out = a*p +out = a * p loss = out.sum() loss.backward() adam.step() @@ -17,5 +17,5 @@ p2 = p.data.sum().item() assert p1 != p2 -print('SUCCESS!') -print('Installation was successful!') +print("SUCCESS!") +print("Installation was successful!") diff --git a/examples/int8_inference_huggingface.py b/examples/int8_inference_huggingface.py index c89ba8d11..2d4c77952 100644 --- a/examples/int8_inference_huggingface.py +++ b/examples/int8_inference_huggingface.py @@ -2,23 +2,18 @@ from transformers import LlamaForCausalLM, LlamaTokenizer MAX_NEW_TOKENS = 128 -model_name = 'meta-llama/Llama-2-7b-hf' +model_name = "meta-llama/Llama-2-7b-hf" -text = 'Hamburg is in which country?\n' +text = "Hamburg is in which country?\n" tokenizer = LlamaTokenizer.from_pretrained(model_name) input_ids = tokenizer(text, return_tensors="pt").input_ids -max_memory = f'{int(torch.cuda.mem_get_info()[0]/1024**3)-2}GB' +max_memory = f"{int(torch.cuda.mem_get_info()[0]/1024**3)-2}GB" n_gpus = torch.cuda.device_count() max_memory = {i: max_memory for i in range(n_gpus)} -model = LlamaForCausalLM.from_pretrained( - model_name, - device_map='auto', - load_in_8bit=True, - max_memory=max_memory -) +model = LlamaForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True, max_memory=max_memory) generated_ids = model.generate(input_ids, max_length=MAX_NEW_TOKENS) print(tokenizer.decode(generated_ids[0], skip_special_tokens=True)) diff --git a/install_cuda.py b/install_cuda.py index b41b33b39..9e426cbd7 100644 --- a/install_cuda.py +++ b/install_cuda.py @@ -19,6 +19,7 @@ "123": "https://developer.download.nvidia.com/compute/cuda/12.3.2/local_installers/cuda_12.3.2_545.23.08_linux.run", } + def 
install_cuda(version, base_path, download_path): formatted_version = f"{version[:-1]}.{version[-1]}" folder = f"cuda-{formatted_version}" @@ -29,7 +30,7 @@ def install_cuda(version, base_path, download_path): subprocess.run(["rm", "-rf", install_path], check=True) url = cuda_versions[version] - filename = url.split('/')[-1] + filename = url.split("/")[-1] filepath = os.path.join(download_path, filename) if not os.path.exists(filepath): @@ -44,9 +45,14 @@ def install_cuda(version, base_path, download_path): # Install CUDA print(f"Installing CUDA version {version}...") install_command = [ - "bash", filepath, - "--no-drm", "--no-man-page", "--override", - "--toolkitpath=" + install_path, "--toolkit", "--silent" + "bash", + filepath, + "--no-drm", + "--no-man-page", + "--override", + "--toolkitpath=" + install_path, + "--toolkit", + "--silent", ] print(f"Running command: {' '.join(install_command)}") @@ -62,6 +68,7 @@ def install_cuda(version, base_path, download_path): print(f"CUDA version {version} installed at {install_path}") + def main(): user_base_path = os.path.expanduser("~/cuda") system_base_path = "/usr/local/cuda" @@ -93,5 +100,6 @@ def main(): print(f"Invalid CUDA version: {version}. Available versions are: {', '.join(cuda_versions.keys())}") sys.exit(1) + if __name__ == "__main__": main() diff --git a/scripts/stale.py b/scripts/stale.py index 613f5b7cb..a65652aeb 100644 --- a/scripts/stale.py +++ b/scripts/stale.py @@ -15,6 +15,7 @@ Script to close stale issue. Taken in part from the AllenNLP repository. https://github.com/allenai/allennlp. """ + from datetime import datetime as dt, timezone import os @@ -50,7 +51,7 @@ def main(): issue.create_comment( "This issue has been automatically marked as stale because it has not had " "recent activity. 
If you think this still needs to be addressed " - "please comment on this thread.\n\n" + "please comment on this thread.\n\n", ) diff --git a/tests/test_autograd.py b/tests/test_autograd.py index d01e5e9db..9da665a2d 100644 --- a/tests/test_autograd.py +++ b/tests/test_autograd.py @@ -20,7 +20,11 @@ @pytest.mark.parametrize("dim2", get_test_dims(32, 96, n=1), ids=id_formatter("dim2")) @pytest.mark.parametrize("dim3", get_test_dims(32, 96, n=1), ids=id_formatter("dim3")) @pytest.mark.parametrize("dim4", get_test_dims(32, 96, n=1), ids=id_formatter("dim4")) -@pytest.mark.parametrize("funcs", [(torch.bmm, bnb.bmm_cublas), (torch.matmul, bnb.matmul_cublas)], ids=["func=bmm", "func=matmul"]) +@pytest.mark.parametrize( + "funcs", + [(torch.bmm, bnb.bmm_cublas), (torch.matmul, bnb.matmul_cublas)], + ids=["func=bmm", "func=matmul"], +) @pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=describe_dtype) @pytest.mark.parametrize("req_grad", BOOLEAN_TUPLES, ids=id_formatter("req_grad")) @pytest.mark.parametrize("transpose", BOOLEAN_TUPLES, ids=id_formatter("transpose")) @@ -30,16 +34,13 @@ def test_matmul(dim1, dim2, dim3, dim4, funcs, dtype, req_grad: Tuple[bool, bool dim3 = dim3 - (dim3 % 16) dim4 = dim4 - (dim4 % 16) for i in range(25): - # normal multiply if funcs[0] in [torch.mm, torch.matmul]: dimA = (dim2, dim3) if not transpose[0] else (dim3, dim2) dimB = (dim3, dim4) if not transpose[1] else (dim4, dim3) A = torch.randn(size=dimA, device="cuda", requires_grad=req_grad[0]) B = torch.randn(size=dimB, device="cuda", requires_grad=req_grad[1]) - target = torch.randn( - size=(dim2, dim4), device="cuda", requires_grad=req_grad[1] - ) + target = torch.randn(size=(dim2, dim4), device="cuda", requires_grad=req_grad[1]) torch.nn.init.xavier_uniform_(B) if not transpose[0] and not transpose[1]: @@ -71,9 +72,7 @@ def test_matmul(dim1, dim2, dim3, dim4, funcs, dtype, req_grad: Tuple[bool, bool A.grad = None B.grad = None - loss_torch = torch.nn.functional.mse_loss( - out_torch, target - ).mean() + loss_torch = torch.nn.functional.mse_loss(out_torch, target).mean() loss_torch.backward() gradA2 = A.grad gradB2 = B.grad @@ -81,18 +80,14 @@ def test_matmul(dim1, dim2, dim3, dim4, funcs, dtype, req_grad: Tuple[bool, bool B.grad = None if req_grad[0]: - torch.testing.assert_close( - gradA1, gradA2, atol=0.015, rtol=0.1 - ) + torch.testing.assert_close(gradA1, gradA2, atol=0.015, rtol=0.1) if req_grad[1]: n = gradB1.numel() idx = torch.isclose(gradB1, gradB2, atol=0.06, rtol=0.3) assert (idx == 0).sum().item() < n * 0.1 idx = torch.isclose(gradB1, gradB2, atol=0.10, rtol=0.3) assert (idx == 0).sum().item() < n * 0.02 - torch.testing.assert_close( - gradB1, gradB2, atol=0.18, rtol=0.3 - ) + torch.testing.assert_close(gradB1, gradB2, atol=0.18, rtol=0.3) # batched matrix multiply if funcs[0] in [torch.bmm, torch.matmul]: @@ -119,9 +114,7 @@ def test_matmul(dim1, dim2, dim3, dim4, funcs, dtype, req_grad: Tuple[bool, bool n = out_bnb.numel() idx = torch.isclose(out_bnb, out_torch, atol=0.01, rtol=0.1) assert (idx == 0).sum().item() < n * 0.01 - torch.testing.assert_close( - out_bnb, out_torch, atol=0.027, rtol=0.2 - ) + torch.testing.assert_close(out_bnb, out_torch, atol=0.027, rtol=0.2) if any(req_grad): out_bnb.data.copy_(out_torch) @@ -133,9 +126,7 @@ def test_matmul(dim1, dim2, dim3, dim4, funcs, dtype, req_grad: Tuple[bool, bool A.grad = None B.grad = None - loss_torch = torch.nn.functional.mse_loss( - out_torch, target - ).mean() + loss_torch = torch.nn.functional.mse_loss(out_torch, 
target).mean() loss_torch.backward() gradA2 = A.grad gradB2 = B.grad @@ -143,9 +134,7 @@ def test_matmul(dim1, dim2, dim3, dim4, funcs, dtype, req_grad: Tuple[bool, bool B.grad = None if req_grad[0]: - torch.testing.assert_close( - gradA1, gradA2, atol=0.015, rtol=0.1 - ) + torch.testing.assert_close(gradA1, gradA2, atol=0.015, rtol=0.1) if req_grad[1]: n = gradB1.numel() idx = torch.isclose(gradB1, gradB2, atol=0.06, rtol=0.3) @@ -192,9 +181,7 @@ def test_matmul(dim1, dim2, dim3, dim4, funcs, dtype, req_grad: Tuple[bool, bool A.grad = None B.grad = None - loss_torch = torch.nn.functional.mse_loss( - out_torch, target - ).mean() + loss_torch = torch.nn.functional.mse_loss(out_torch, target).mean() loss_torch.backward() gradA2 = A.grad gradB2 = B.grad @@ -202,9 +189,7 @@ def test_matmul(dim1, dim2, dim3, dim4, funcs, dtype, req_grad: Tuple[bool, bool B.grad = None if req_grad[0]: - torch.testing.assert_close( - gradA1, gradA2, atol=0.015, rtol=0.1 - ) + torch.testing.assert_close(gradA1, gradA2, atol=0.015, rtol=0.1) if req_grad[1]: n = gradB1.numel() idx = torch.isclose(gradB1, gradB2, atol=0.06, rtol=0.3) @@ -218,25 +203,17 @@ def test_matmul(dim1, dim2, dim3, dim4, funcs, dtype, req_grad: Tuple[bool, bool @pytest.mark.parametrize("dim3", get_test_dims(32, 96, n=1), ids=id_formatter("dim3")) @pytest.mark.parametrize("dim4", get_test_dims(32, 96, n=1), ids=id_formatter("dim4")) @pytest.mark.parametrize("decomp", [0.0, 6.0], ids=id_formatter("decomp")) -@pytest.mark.parametrize("funcs", [(torch.matmul, bnb.matmul), (torch.matmul, bnb.research.switchback_bnb)], ids=["func=matmul", "func=switchback_bnb"]) +@pytest.mark.parametrize( + "funcs", + [(torch.matmul, bnb.matmul), (torch.matmul, bnb.research.switchback_bnb)], + ids=["func=matmul", "func=switchback_bnb"], +) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize("req_grad", BOOLEAN_TRIPLES, ids=id_formatter("req_grad")) @pytest.mark.parametrize("transpose", TRANSPOSE_VALS, ids=id_formatter("transpose")) @pytest.mark.parametrize("has_fp16_weights", TRUE_FALSE, ids=id_formatter("has_fp16_weights")) @pytest.mark.parametrize("has_bias", TRUE_FALSE, ids=id_formatter("has_bias")) -def test_matmullt( - dim1, - dim2, - dim3, - dim4, - funcs, - dtype, - req_grad, - transpose, - decomp, - has_fp16_weights, - has_bias -): +def test_matmullt(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, decomp, has_fp16_weights, has_bias): dimA = (dim2, dim3) if not transpose[0] else (dim3, dim2) dimB = (dim3, dim4) if not transpose[1] else (dim4, dim3) outlier_dim = torch.randint(0, dimA[1], size=(dimA[1] // 8,), device="cuda") @@ -245,18 +222,13 @@ def test_matmullt( req_grad[2] = False for i in range(3): - # normal multiply if funcs[0] in [torch.mm, torch.matmul]: - A = torch.randn( - size=dimA, device="cuda", requires_grad=req_grad[0], dtype=dtype - ) + A = torch.randn(size=dimA, device="cuda", requires_grad=req_grad[0], dtype=dtype) if decomp == 6.0: with torch.no_grad(): A[:, outlier_dim] = 6.0 - B = torch.randn( - size=dimB, device="cuda", requires_grad=req_grad[1], dtype=dtype - ) + B = torch.randn(size=dimB, device="cuda", requires_grad=req_grad[1], dtype=dtype) target = torch.randn( size=(dim2, dim4), device="cuda", @@ -266,7 +238,7 @@ def test_matmullt( bias = None bias2 = None if has_bias: - bias = torch.randn(dim4, device='cuda', dtype=dtype, requires_grad=req_grad[2]) + bias = torch.randn(dim4, device="cuda", dtype=dtype, requires_grad=req_grad[2]) bias2 = 
bias.clone() torch.nn.init.xavier_uniform_(B) B2 = B.clone() @@ -311,9 +283,7 @@ def test_matmullt( if any(req_grad): out_bnb.data.copy_(out_torch) torch.cuda.synchronize() - loss_bnb = torch.nn.functional.mse_loss( - out_bnb, target - ).mean() + loss_bnb = torch.nn.functional.mse_loss(out_bnb, target).mean() loss_bnb.backward() gradA1 = A.grad gradB1 = B.grad @@ -323,9 +293,7 @@ def test_matmullt( gradBias1 = bias.grad bias.grad = None - loss_torch = torch.nn.functional.mse_loss( - out_torch, target - ).mean() + loss_torch = torch.nn.functional.mse_loss(out_torch, target).mean() loss_torch.backward() gradA2 = A.grad gradB2 = B.grad @@ -336,9 +304,7 @@ def test_matmullt( bias.grad = None if req_grad[0]: - torch.testing.assert_close( - gradA1, gradA2, atol=0.015, rtol=0.1 - ) + torch.testing.assert_close(gradA1, gradA2, atol=0.015, rtol=0.1) if req_grad[1]: n = gradB1.numel() if dim2 > 0: @@ -352,9 +318,7 @@ def test_matmullt( assert (idx == 0).sum().item() <= n * 0.1 idx = torch.isclose(gradB1, gradB2, atol=0.10, rtol=0.3) assert (idx == 0).sum().item() <= n * 0.02 - torch.testing.assert_close( - gradB1, gradB2, atol=0.18, rtol=0.3 - ) + torch.testing.assert_close(gradB1, gradB2, atol=0.18, rtol=0.3) if req_grad[2]: torch.testing.assert_close(gradBias1, gradBias2) @@ -370,8 +334,20 @@ def test_matmullt( @pytest.mark.parametrize("has_bias", TRUE_FALSE, ids=id_formatter("has_bias")) @pytest.mark.parametrize("dtype", [torch.float16, torch.float32], ids=describe_dtype) @pytest.mark.parametrize("compress_statistics", TRUE_FALSE, ids=id_formatter("compress_statistics")) -@pytest.mark.parametrize("quant_type", ['fp4', 'nf4'], ids=id_formatter("quant_type")) -def test_matmul_4bit(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, has_bias, compress_statistics, quant_type): +@pytest.mark.parametrize("quant_type", ["fp4", "nf4"], ids=id_formatter("quant_type")) +def test_matmul_4bit( + dim1, + dim2, + dim3, + dim4, + funcs, + dtype, + req_grad, + transpose, + has_bias, + compress_statistics, + quant_type, +): dimA = (dim2, dim3) if not transpose[0] else (dim3, dim2) dimB = (dim3, dim4) if not transpose[1] else (dim4, dim3) if has_bias == False: @@ -387,11 +363,15 @@ def test_matmul_4bit(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, bias = None bias2 = None if has_bias: - bias = torch.randn(dim4, device='cuda', dtype=dtype, requires_grad=req_grad[2]) + bias = torch.randn(dim4, device="cuda", dtype=dtype, requires_grad=req_grad[2]) bias2 = bias.clone() torch.nn.init.xavier_uniform_(B) - B2, quant_state = bnb.functional.quantize_4bit(B, compress_statistics=compress_statistics, quant_type=quant_type) + B2, quant_state = bnb.functional.quantize_4bit( + B, + compress_statistics=compress_statistics, + quant_type=quant_type, + ) if not transpose[0] and transpose[1]: out_torch = funcs[0](A, B.t()) @@ -410,7 +390,7 @@ def test_matmul_4bit(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, if n > 0: assert err < 0.115 - #assert err < 0.20 + # assert err < 0.20 if any(req_grad): out_bnb.data.copy_(out_torch) torch.cuda.synchronize() @@ -424,7 +404,7 @@ def test_matmul_4bit(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, gradBias1 = bias.grad bias.grad = None - loss_torch = torch.nn.functional.mse_loss( out_torch, target ).mean() + loss_torch = torch.nn.functional.mse_loss(out_torch, target).mean() loss_torch.backward() gradA2 = A.grad gradB2 = B.grad @@ -435,7 +415,7 @@ def test_matmul_4bit(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, bias.grad = None if 
req_grad[0]: - torch.testing.assert_close( gradA1, gradA2, atol=0.015, rtol=0.1) + torch.testing.assert_close(gradA1, gradA2, atol=0.015, rtol=0.1) if req_grad[2]: torch.testing.assert_close(gradBias1, gradBias2) @@ -448,8 +428,12 @@ def test_matmul_4bit(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose, @pytest.mark.parametrize("req_grad", BOOLEAN_TRIPLES, ids=id_formatter("req_grad")) @pytest.mark.parametrize("transpose", TRANSPOSE_VALS, ids=id_formatter("transpose")) @pytest.mark.parametrize("dtype", [torch.float16, torch.float32], ids=describe_dtype) -@pytest.mark.parametrize("funcs", [(torch.matmul, bnb.research.matmul_fp8_mixed), (torch.matmul, bnb.research.matmul_fp8_global)], ids=["matmul_fp8_mixed", 'matmul_fp8_global']) -def test_matmul_fp8( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose): +@pytest.mark.parametrize( + "funcs", + [(torch.matmul, bnb.research.matmul_fp8_mixed), (torch.matmul, bnb.research.matmul_fp8_global)], + ids=["matmul_fp8_mixed", "matmul_fp8_global"], +) +def test_matmul_fp8(dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose): dimA = (dim2, dim3) if not transpose[0] else (dim3, dim2) dimB = (dim3, dim4) if not transpose[1] else (dim4, dim3) req_grad = list(req_grad) @@ -480,7 +464,7 @@ def test_matmul_fp8( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose): err = torch.abs(out_bnb - out_torch).float().mean().item() if n > 0: assert err < 0.115 - #assert err < 0.20 + # assert err < 0.20 if any(req_grad): out_bnb.data.copy_(out_torch) torch.cuda.synchronize() @@ -491,7 +475,7 @@ def test_matmul_fp8( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose): A.grad = None B.grad = None - loss_torch = torch.nn.functional.mse_loss( out_torch, target ).mean() + loss_torch = torch.nn.functional.mse_loss(out_torch, target).mean() loss_torch.backward() gradA2 = A.grad gradB2 = B.grad @@ -499,7 +483,7 @@ def test_matmul_fp8( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose): B.grad = None if req_grad[0]: - torch.testing.assert_close( gradA1, gradA2, atol=0.015, rtol=0.1) + torch.testing.assert_close(gradA1, gradA2, atol=0.015, rtol=0.1) if req_grad[1]: n = gradB1.numel() @@ -514,8 +498,6 @@ def test_matmul_fp8( dim1, dim2, dim3, dim4, funcs, dtype, req_grad, transpose): assert (idx == 0).sum().item() <= n * 0.1 idx = torch.isclose(gradB1, gradB2, atol=0.10, rtol=0.3) assert (idx == 0).sum().item() <= n * 0.02 - grad_err = (gradB1-gradB2).abs().mean() + grad_err = (gradB1 - gradB2).abs().mean() assert grad_err.item() < 0.003 - torch.testing.assert_close( - gradB1, gradB2, atol=0.18, rtol=0.3 - ) + torch.testing.assert_close(gradB1, gradB2, atol=0.18, rtol=0.3) diff --git a/tests/test_cuda_setup_evaluator.py b/tests/test_cuda_setup_evaluator.py index cb0b38fdd..fc79a54b0 100644 --- a/tests/test_cuda_setup_evaluator.py +++ b/tests/test_cuda_setup_evaluator.py @@ -35,7 +35,4 @@ def test_get_cuda_bnb_library_path_override(monkeypatch, cuda120_spec, caplog): def test_get_cuda_bnb_library_path_nocublaslt(monkeypatch, cuda111_noblas_spec): monkeypatch.delenv("BNB_CUDA_VERSION", raising=False) - assert ( - get_cuda_bnb_library_path(cuda111_noblas_spec).stem - == "libbitsandbytes_cuda111_nocublaslt" - ) + assert get_cuda_bnb_library_path(cuda111_noblas_spec).stem == "libbitsandbytes_cuda111_nocublaslt" diff --git a/tests/test_functional.py b/tests/test_functional.py index d4f65755f..b9f1a6ead 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -19,9 +19,7 @@ id_formatter, ) -torch.set_printoptions( - precision=5, 
sci_mode=False, linewidth=120, edgeitems=20, threshold=10000 -) +torch.set_printoptions(precision=5, sci_mode=False, linewidth=120, edgeitems=20, threshold=10000) k = 20 @@ -98,9 +96,7 @@ def teardown(): pass -@pytest.mark.parametrize( - "dtype", [torch.float32, torch.float16], ids=["float", "half"] -) +@pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=["float", "half"]) def test_estimate_quantiles(dtype): A = torch.rand(1024, 1024, device="cuda") A = A.to(dtype) @@ -136,7 +132,6 @@ def test_quantile_quantization(): assert diff < 0.001 - def test_dynamic_quantization(): diffs = [] reldiffs = [] @@ -149,8 +144,8 @@ def test_dynamic_quantization(): diffs.append(diff.mean().item()) reldiffs.append(reldiff.mean().item()) assert diff.mean().item() < 0.0135 - print(sum(diffs)/len(diffs)) - print(sum(reldiffs)/len(reldiffs)) + print(sum(diffs) / len(diffs)) + print(sum(reldiffs) / len(reldiffs)) for i in range(100): A1 = torch.rand(1024, 1024, device="cuda") @@ -161,13 +156,12 @@ def test_dynamic_quantization(): assert diff < 0.004 - @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=describe_dtype) @pytest.mark.parametrize("nested", TRUE_FALSE, ids=id_formatter("nested")) @pytest.mark.parametrize("blocksize", [4096, 2048, 1024, 512, 256, 128, 64]) @pytest.mark.parametrize("signed", TRUE_FALSE, ids=id_formatter("signed")) def test_dynamic_blockwise_quantization(dtype, nested, blocksize, signed): - #print('') + # print('') diffs = [] reldiffs = [] for i in range(100): @@ -178,10 +172,10 @@ def test_dynamic_blockwise_quantization(dtype, nested, blocksize, signed): reldiff = diff / torch.abs(A1.float() + 1e-8) diffs.append(diff.mean().item()) reldiffs.append(reldiff.mean().item()) - abserr = sum(diffs)/len(diffs) - relerr = sum(reldiffs)/len(reldiffs) - #print('nested=', nested, 'randn', blocksize, 'dtype', dtype, sum(diffs)/len(diffs)) - #print('nested=', nested, 'randn', blocksize, 'dtype', dtype, sum(reldiffs)/len(reldiffs)) + abserr = sum(diffs) / len(diffs) + relerr = sum(reldiffs) / len(reldiffs) + # print('nested=', nested, 'randn', blocksize, 'dtype', dtype, sum(diffs)/len(diffs)) + # print('nested=', nested, 'randn', blocksize, 'dtype', dtype, sum(reldiffs)/len(reldiffs)) assert abserr < 0.011 assert relerr < 0.018 assert A2.dtype == dtype @@ -196,9 +190,9 @@ def test_dynamic_blockwise_quantization(dtype, nested, blocksize, signed): reldiff = diff / torch.abs(A1.float() + 1e-8) diffs.append(diff.mean().item()) reldiffs.append(reldiff.mean().item()) - #torch.testing.assert_close(A1, A2, atol=1e-2, rtol=0) - abserr = sum(diffs)/len(diffs) - relerr = sum(reldiffs)/len(reldiffs) + # torch.testing.assert_close(A1, A2, atol=1e-2, rtol=0) + abserr = sum(diffs) / len(diffs) + relerr = sum(reldiffs) / len(reldiffs) if signed: assert abserr < 0.0035 assert relerr < 0.015 @@ -206,14 +200,11 @@ def test_dynamic_blockwise_quantization(dtype, nested, blocksize, signed): assert abserr < 0.00175 assert relerr < 0.012 assert A2.dtype == dtype - #print('signed=', signed, 'nested=', nested, 'rand', blocksize, sum(diffs)/len(diffs)) - #print('signed=', signed, 'nested=', nested, 'rand', blocksize, sum(reldiffs)/len(reldiffs)) - + # print('signed=', signed, 'nested=', nested, 'rand', blocksize, sum(diffs)/len(diffs)) + # print('signed=', signed, 'nested=', nested, 'rand', blocksize, sum(reldiffs)/len(reldiffs)) -@pytest.mark.parametrize( - "gtype", [torch.float32, torch.float16], ids=["float", "half"] -) +@pytest.mark.parametrize("gtype", [torch.float32, 
torch.float16], ids=["float", "half"]) def test_percentile_clipping(gtype): gnorm_vec1 = torch.zeros(100, device="cuda") gnorm_vec2 = torch.zeros(100, device="cuda") @@ -223,9 +214,7 @@ def test_percentile_clipping(gtype): for i in range(k): step += 1 g = torch.randn(n, n, dtype=gtype, device="cuda") - gnorm1, clip2, gnorm_scale = F.percentile_clipping( - g, gnorm_vec2, step, percentile=percentile - ) + gnorm1, clip2, gnorm_scale = F.percentile_clipping(g, gnorm_vec2, step, percentile=percentile) assert gnorm_scale == 1.0 if gnorm1 < clip2 else clip2 / gnorm1 gnorm2 = torch.norm(g.float()) @@ -309,7 +298,7 @@ def test_approx_igemm(dim1, dim2, quant_methods, batched): dim2 = dim2 - (dim2 % 32) errors = [] relerrors = [] - #print("") + # print("") for i in range(5): if batched: A = torch.normal(0, 0.5, size=(32, dim1, dim2 // 32), device="cuda") @@ -321,9 +310,7 @@ def test_approx_igemm(dim1, dim2, quant_methods, batched): B = torch.normal(0, 0.5, size=(dim2, dim1), device="cuda") maxA, Ac = quant_methods[0](A, 1) maxB, Bc = quant_methods[1](B, 0) - torch.testing.assert_close( - quant_methods[2](maxA, Ac), A, atol=0.025, rtol=0.05 - ) + torch.testing.assert_close(quant_methods[2](maxA, Ac), A, atol=0.025, rtol=0.05) if batched: out2 = torch.bmm(A, B) C = torch.bmm(Ac.float(), Bc.float()) @@ -338,8 +325,8 @@ def test_approx_igemm(dim1, dim2, quant_methods, batched): relerr = err / torch.abs(out2) errors.append(err.mean().item()) relerrors.append(relerr.mean().item()) - #print(mean(errors)) - #print(mean(relerrors)) + # print(mean(errors)) + # print(mean(relerrors)) def test_stable_embedding(): @@ -356,16 +343,8 @@ def test_igemm(hidden_dim, batch_dim, transpose, seq_dim): batch_dim = batch_dim - (batch_dim % 16) seq_dim = seq_dim - (seq_dim % 16) for i in range(k): - shapeA = ( - (batch_dim, hidden_dim) - if not transpose[0] - else (hidden_dim, batch_dim) - ) - shapeB = ( - (32 * random.randint(1, 4), hidden_dim) - if transpose[1] - else (hidden_dim, 32 * random.randint(1, 4)) - ) + shapeA = (batch_dim, hidden_dim) if not transpose[0] else (hidden_dim, batch_dim) + shapeB = (32 * random.randint(1, 4), hidden_dim) if transpose[1] else (hidden_dim, 32 * random.randint(1, 4)) A = torch.randint(-128, 127, size=shapeA, device="cuda").to(torch.int8) B = torch.randint(-128, 127, size=shapeB, device="cuda").to(torch.int8) if not transpose[0] and not transpose[1]: @@ -385,11 +364,7 @@ def test_igemm(hidden_dim, batch_dim, transpose, seq_dim): for i in range(k): shapeA = (batch_dim, seq_dim, hidden_dim) - shapeB = ( - (32 * random.randint(1, 4), hidden_dim) - if transpose[1] - else (hidden_dim, 32 * random.randint(1, 4)) - ) + shapeB = (32 * random.randint(1, 4), hidden_dim) if transpose[1] else (hidden_dim, 32 * random.randint(1, 4)) A = torch.randint(-128, 127, size=shapeA, device="cuda").to(torch.int8) B = torch.randint(-128, 127, size=shapeB, device="cuda").to(torch.int8) if not transpose[0] and not transpose[1]: @@ -410,16 +385,10 @@ def test_dim3_igemm(seq_dim, hidden_dim, batch_dim): hidden_dim = hidden_dim - (hidden_dim % 32) batch_dim = batch_dim - (batch_dim % 2) for i in range(25): - A = torch.randint( - -128, 127, size=(batch_dim, seq_dim, hidden_dim), device="cuda" - ).to(torch.int8) - B = torch.randint( - -128, 127, size=(batch_dim, seq_dim, 1024), device="cuda" - ).to(torch.int8) + A = torch.randint(-128, 127, size=(batch_dim, seq_dim, hidden_dim), device="cuda").to(torch.int8) + B = torch.randint(-128, 127, size=(batch_dim, seq_dim, 1024), device="cuda").to(torch.int8) out2 = 
torch.einsum("bsi, bso->io", A.float(), B.float()) - iout = torch.empty( - A.shape[2], B.shape[2], dtype=torch.int32, device=A.device - ) + iout = torch.empty(A.shape[2], B.shape[2], dtype=torch.int32, device=A.device) out = F.igemm(A, B, out=iout) torch.testing.assert_close(out.float(), out2) @@ -444,9 +413,7 @@ def min_max(x): errs2 = [] relerrs2 = [] for i in range(k): - A = torch.normal( - 0.0, 0.5, size=(batch_dim, seq_dim, hidden_dim), device="cuda" - ) + A = torch.normal(0.0, 0.5, size=(batch_dim, seq_dim, hidden_dim), device="cuda") if transpose: B = torch.normal(0, 0.5, size=(256, hidden_dim), device="cuda") else: @@ -523,9 +490,7 @@ def test_ibmm(dim1, dim2, dim3, dim4, transpose): out2 = torch.bmm(A.permute([0, 2, 1]).float(), B.float()) out = F.igemm(A.permute([0, 2, 1]), B) elif transpose[0] and transpose[1]: - out2 = torch.bmm( - A.permute([0, 2, 1]).float(), B.permute([0, 2, 1]).float() - ) + out2 = torch.bmm(A.permute([0, 2, 1]).float(), B.permute([0, 2, 1]).float()) out = F.igemm(A.permute([0, 2, 1]), B.permute([0, 2, 1])) torch.testing.assert_close(out.float(), out2.float()) @@ -541,7 +506,7 @@ def test_vector_quant(dim1, dim2, dim3): qA, SA = F.vectorwise_quant(A, dim=0) A1 = F.vectorwise_dequant(qA, SA) n = A1.numel() - assert_all_approx_close(A1, A, atol=0.01, rtol=0.1, count=int(n*0.002)) + assert_all_approx_close(A1, A, atol=0.01, rtol=0.1, count=int(n * 0.002)) @pytest.mark.parametrize("dim1", get_test_dims(2, 256, n=2), ids=id_formatter("dim1")) @@ -565,9 +530,7 @@ def test_nvidia_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, trans if dims == 2: A = torch.randint(-128, 127, size=(dim1, dim2), device="cuda").to(dtype) elif dims == 3: - A = torch.randint(-128, 127, size=(dim1, dim2, dim3), device="cuda").to( - dtype - ) + A = torch.randint(-128, 127, size=(dim1, dim2, dim3), device="cuda").to(dtype) out, S = F.nvidia_transform(A, to_order=orderOut) @@ -579,17 +542,11 @@ def test_nvidia_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, trans if dims == 2: n = A.shape[0] * (A.shape[1] + (32 - (A.shape[1] % 32))) elif dims == 3: - n = ( - A.shape[0] - * A.shape[1] - * (A.shape[2] + (32 - (A.shape[2] % 32))) - ) + n = A.shape[0] * A.shape[1] * (A.shape[2] + (32 - (A.shape[2] % 32))) assert out.numel() == n elif orderOut == "col_turing": # 32 col 8 row tiles - n = (A.shape[0] + (8 - A.shape[0] % 8)) * ( - A.shape[1] + (32 - (A.shape[1] % 32)) - ) + n = (A.shape[0] + (8 - A.shape[0] % 8)) * (A.shape[1] + (32 - (A.shape[1] % 32))) assert out.numel() == n total_coltile = (A.shape[1] // 32) + (1 if A.shape[1] % 32 != 0 else 0) for row in range(A.shape[0]): @@ -598,9 +555,7 @@ def test_nvidia_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, trans j = col coltile = (col // 32) + (1 if col % 32 != 0 else 0) - rowtile = ( - (row // 8) + (1 if row % 8 != 0 else 0) - ) * total_coltile + rowtile = ((row // 8) + (1 if row % 8 != 0 else 0)) * total_coltile offset = 32 * 8 * (rowtile + coltile) col2 = col % 32 row2 = (row % 8) * 32 @@ -611,9 +566,7 @@ def test_nvidia_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, trans # torch.testing.assert_close(A.flatten()[i+j], out.flatten()[row2+ col2+block_offset]) if orderOut == "col32": - out2, S = F.nvidia_transform( - out, from_order=orderOut, to_order="row", state=S - ) + out2, S = F.nvidia_transform(out, from_order=orderOut, to_order="row", state=S) torch.testing.assert_close(A, out2) @@ -626,16 +579,10 @@ def test_nvidia_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, trans def 
test_igemmlt_int(dim1, dim2, dim3, dim4, dims, ldb): for i in range(k): if dims == 2: - A = torch.randint(-128, 127, size=(dim1, dim3), device="cuda").to( - torch.int8 - ) + A = torch.randint(-128, 127, size=(dim1, dim3), device="cuda").to(torch.int8) elif dims == 3: - A = torch.randint( - -128, 127, size=(dim1, dim2, dim3), device="cuda" - ).to(torch.int8) - B = torch.randint(-128, 127, size=(dim4, dim3), device="cuda").to( - torch.int8 - ) + A = torch.randint(-128, 127, size=(dim1, dim2, dim3), device="cuda").to(torch.int8) + B = torch.randint(-128, 127, size=(dim4, dim3), device="cuda").to(torch.int8) C1 = torch.matmul(A.float(), B.t().float()) A2, SA = F.transform(A, "col32") @@ -645,9 +592,7 @@ def test_igemmlt_int(dim1, dim2, dim3, dim4, dims, ldb): torch.testing.assert_close(C1, C3.float()) # transpose - B = torch.randint(-128, 127, size=(dim3, dim4), device="cuda").to( - torch.int8 - ) + B = torch.randint(-128, 127, size=(dim3, dim4), device="cuda").to(torch.int8) C1 = torch.matmul(A.float(), B.float()) B2t, SBt = F.transform(B, "col_turing", transpose=True) @@ -667,9 +612,7 @@ def test_igemmlt_half(dim1, dim2, dim3, dim4, dims): if dims == 2: A = torch.normal(0, 0.5, size=(dim1, dim3), device="cuda").half() elif dims == 3: - A = torch.normal( - 0, 0.5, size=(dim1, dim2, dim3), device="cuda" - ).half() + A = torch.normal(0, 0.5, size=(dim1, dim2, dim3), device="cuda").half() B = torch.randn((dim4, dim3), device="cuda").half() torch.nn.init.xavier_uniform_(B) C1 = torch.matmul(A, B.t()) @@ -700,6 +643,7 @@ def test_igemmlt_half(dim1, dim2, dim3, dim4, dims): # C3, S = F.transform(C2, 'row', state=SC) # torch.testing.assert_close(C1, C3.float()) + @pytest.mark.parametrize( ("batch", "seq", "model", "hidden"), [ @@ -729,7 +673,6 @@ def test_bench_8bit_training(batch, seq, model, hidden): torch.cuda.synchronize() t0 = time.time() for i in range(k): - out1 = torch.matmul(A, w1.t()) # fc1 # out2 = torch.matmul(out1, w2.t())# fc2 @@ -866,13 +809,15 @@ def test_bench_8bit_training(batch, seq, model, hidden): def test_dequant_mm(dim1, dim4, dims, formatB, has_bias): inner = torch.randint(1, 128, size=(1,)).item() bias = None - if has_bias: bias = torch.randn(dim4, device='cuda', dtype=torch.float16) + if has_bias: + bias = torch.randn(dim4, device="cuda", dtype=torch.float16) formatB = F.get_special_format_str() for i in range(1): A = torch.randn(dim1, inner, device="cuda") B = torch.randn(dim4, inner, device="cuda") C1 = torch.matmul(A.half(), B.t().half()) - if has_bias: C1 += bias + if has_bias: + C1 += bias A1, maxA = F.vectorwise_quant(A, dim=1) B1, maxB = F.vectorwise_quant(B, dim=1) @@ -883,7 +828,8 @@ def test_dequant_mm(dim1, dim4, dims, formatB, has_bias): C3, S = F.nvidia_transform(C2, "row", state=SC) C4 = F.vectorwise_mm_dequant(C3.float(), maxA, maxB.t()) - if has_bias: C4 += bias + if has_bias: + C4 += bias # TODO: is something wrong here? 
If so, the problem goes deeper # n = C1.numel() @@ -917,9 +863,7 @@ def test_colrow_absmax(dim1, dim2, dims): else: assert False - row_stats2, col_stats2, nnz_block_ptr2 = F.get_colrow_absmax( - A, threshold=threshold - ) + row_stats2, col_stats2, nnz_block_ptr2 = F.get_colrow_absmax(A, threshold=threshold) A_blocked = einops.rearrange( torch.abs(A), @@ -939,9 +883,7 @@ def test_colrow_absmax(dim1, dim2, dims): torch.testing.assert_close(row_stats1_trunc, row_stats2) torch.testing.assert_close(nnz_block_ptr1.int(), nnz_block_ptr2) - row_stats2, col_stats2, nnz_block_ptr2 = F.get_colrow_absmax( - A, threshold=0.0 - ) + row_stats2, col_stats2, nnz_block_ptr2 = F.get_colrow_absmax(A, threshold=0.0) torch.testing.assert_close(col_stats1, col_stats2) torch.testing.assert_close(row_stats1, row_stats2) @@ -963,24 +905,16 @@ def test_double_quant(dim1, dim2): torch.testing.assert_close(CAt, out_col1, atol=1, rtol=0) n = CAt.numel() - num_not_close_rows = ( - (torch.isclose(CA, out_row1, atol=1) == 0).sum().item() - ) - num_not_close_cols = ( - (torch.isclose(CAt, out_col1, atol=1) == 0).sum().item() - ) + num_not_close_rows = (torch.isclose(CA, out_row1, atol=1) == 0).sum().item() + num_not_close_cols = (torch.isclose(CAt, out_col1, atol=1) == 0).sum().item() # allow for 1:500 error due to rounding differences min_error = 1 / 500 if num_not_close_cols > (min_error * n): - print( - f"Min error exceeded {num_not_close_cols} elements are different. Error: {num_not_close_cols/n:.4f}" - ) + print(f"Min error exceeded {num_not_close_cols} elements are different. Error: {num_not_close_cols/n:.4f}") assert False if num_not_close_rows > (min_error * n): - print( - f"Min error exceeded {num_not_close_rows} elements are different. Error: {num_not_close_rows/n:.4f}" - ) + print(f"Min error exceeded {num_not_close_rows} elements are different. 
Error: {num_not_close_rows/n:.4f}") assert False torch.testing.assert_close(Srow.flatten().float(), statsA) @@ -991,13 +925,12 @@ def test_double_quant(dim1, dim2): ("dim1", "dim4", "inner"), ( pytest.param(dim1, dim4, inner, id=f"{dim1=},{dim4=},{inner=}") - for (dim1, dim4, inner) - in zip( + for (dim1, dim4, inner) in zip( get_test_dims(1, 4 * 1024, n=4), get_test_dims(1, 4 * 1024, n=4), get_test_dims(1, 4 * 1024, n=4), ) - ) + ), ) def test_integrated_igemmlt(dim1, dim4, inner): for i in range(k): @@ -1037,13 +970,12 @@ def test_integrated_igemmlt(dim1, dim4, inner): ("dim1", "dim4", "inner"), ( pytest.param(dim1, dim4, inner, id=f"{dim1=},{dim4=},{inner=}") - for (dim1, dim4, inner) - in zip( + for (dim1, dim4, inner) in zip( get_test_dims(1, 4 * 1024, n=6), get_test_dims(1, 4 * 1024, n=6), get_test_dims(1, 4 * 1024, n=6), ) - ) + ), ) @pytest.mark.skip("Row scale has some bugs for ampere") def test_igemmlt_row_scale(dim1, dim4, inner): @@ -1067,9 +999,7 @@ def test_igemmlt_row_scale(dim1, dim4, inner): c = 10.0 * inner * scale row_scale = torch.ones_like(maxA) / c - outC32, SC = F.igemmlt( - A2, B2, SA, SB, dtype=torch.int8, row_scale=row_scale - ) + outC32, SC = F.igemmlt(A2, B2, SA, SB, dtype=torch.int8, row_scale=row_scale) C3, S = F.nvidia_transform(outC32, "row", state=SC) maxval = torch.abs(C3).max() if maxval == 127: @@ -1150,9 +1080,7 @@ def test_row_scale_bench(dim1, dim4, inner): torch.cuda.synchronize() t0 = time.time() for i in range(k): - outC32, SC = F.igemmlt( - A2, B2, SA, SB, dtype=torch.int8, row_scale=row_scale - ) + outC32, SC = F.igemmlt(A2, B2, SA, SB, dtype=torch.int8, row_scale=row_scale) torch.cuda.synchronize() print("row-wise", time.time() - t0) @@ -1177,13 +1105,9 @@ def test_row_scale_bench(dim1, dim4, inner): def test_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose): for i in range(k): if dims == 2: - A = torch.randint(10, 99, size=(dim1, dim2), device="cuda").to( - dtype - ) + A = torch.randint(10, 99, size=(dim1, dim2), device="cuda").to(dtype) elif dims == 3: - A = torch.randint( - 10, 99, size=(dim1, dim2, dim3), device="cuda" - ).to(dtype) + A = torch.randint(10, 99, size=(dim1, dim2, dim3), device="cuda").to(dtype) A.view(-1)[-1] = -1 if transpose: @@ -1224,23 +1148,17 @@ def test_coo_double_quant(dim1, dim2): idx = torch.abs(A) >= threshold CA2, CAt, statsA, statsAt, coo_tensor = F.double_quant(A) - CA, CAt, statsA, statsAt, coo_tensor = F.double_quant( - A, threshold=threshold - ) + CA, CAt, statsA, statsAt, coo_tensor = F.double_quant(A, threshold=threshold) if coo_tensor is not None: A1 = A * idx A2 = torch.zeros_like(A) - A2[ - coo_tensor.rowidx.long(), coo_tensor.colidx.long() - ] = coo_tensor.values + A2[coo_tensor.rowidx.long(), coo_tensor.colidx.long()] = coo_tensor.values torch.testing.assert_close(A1, A2) A1 = A * (idx == 0) A2 = (CA.float() * statsA.unsqueeze(1) / 127).half() - torch.testing.assert_close( - A * (idx == 0), A2, rtol=0.05, atol=1.5e-2 - ) + torch.testing.assert_close(A * (idx == 0), A2, rtol=0.05, atol=1.5e-2) @pytest.mark.parametrize("dim1", get_test_dims(1, 1 * 1024, n=2), ids=id_formatter("dim1")) @@ -1261,9 +1179,7 @@ def test_spmm_coo(dim1, dim2, transposed_B): nnz = (idx == 1).sum().item() rows, cols = torch.where(idx) values = A[idx] - cooA = F.COOSparseTensor( - A.shape[0], A.shape[1], nnz, rows.int(), cols.int(), values - ) + cooA = F.COOSparseTensor(A.shape[0], A.shape[1], nnz, rows.int(), cols.int(), values) A2 = A * idx if transposed_B: @@ -1303,9 +1219,7 @@ def test_spmm_bench(): 
print(nnz / idx.numel()) rows, cols = torch.where(idx) values = A[idx] - cooA = F.COOSparseTensor( - A.shape[0], A.shape[1], nnz, rows.int(), cols.int(), values - ) + cooA = F.COOSparseTensor(A.shape[0], A.shape[1], nnz, rows.int(), cols.int(), values) for i in range(10): out2 = F.spmm_coo(cooA, B) @@ -1339,9 +1253,7 @@ def test_integrated_sparse_decomp(dim1, dim2): out1_32, Sout1_32 = F.igemmlt(C32A, CTw1, SA, Sw1) out2 = F.mm_dequant(out1_32, Sout1_32, statsA, statsw1) - CA, CAt, statsA, statsAt, coo_tensor = F.double_quant( - A, threshold=threshold - ) + CA, CAt, statsA, statsAt, coo_tensor = F.double_quant(A, threshold=threshold) C32A, SA = F.transform(CA, "col32") out1_32, Sout1_32 = F.igemmlt(C32A, CTw1, SA, Sw1) @@ -1396,9 +1308,7 @@ def test_spmm_coo_very_sparse(dim1, dim2, dtype, out_func): nnz = (idx == 1).sum().item() rows, cols = torch.where(idx) values = A[idx] - cooA = F.COOSparseTensor( - A.shape[0], A.shape[1], nnz, rows.int(), cols.int(), values - ) + cooA = F.COOSparseTensor(A.shape[0], A.shape[1], nnz, rows.int(), cols.int(), values) A2 = A * idx out1 = torch.matmul(A2.half(), B.half()) out = out_func(out1.shape, dtype=torch.float16, device=out1.device) @@ -1413,9 +1323,7 @@ def test_spmm_coo_very_sparse(dim1, dim2, dtype, out_func): std = out1.std() out1 /= std out2 /= std - assert_all_approx_close( - out1, out2.half(), rtol=0.01, atol=3.0e-2, count=count - ) + assert_all_approx_close(out1, out2.half(), rtol=0.01, atol=3.0e-2, count=count) # assert_all_approx_close(out1, out2.half(), rtol=0.05, atol=0.01, count=count) idx_col = torch.randint(0, A2.shape[-1], size=(15,)) @@ -1443,9 +1351,7 @@ def test_coo2csr(): nnz = (idx == 1).sum().item() rows, cols = torch.where(idx) values = A[idx] - cooA = F.COOSparseTensor( - A.shape[0], A.shape[1], nnz, rows.int(), cols.int(), values - ) + cooA = F.COOSparseTensor(A.shape[0], A.shape[1], nnz, rows.int(), cols.int(), values) A2 = A * idx csrA = F.coo2csr(cooA) counts = csrA.rowptr[1:] - csrA.rowptr[:-1] @@ -1463,9 +1369,7 @@ def test_coo2csc(): nnz = (idx == 1).sum().item() rows, cols = torch.where(idx) values = A[idx] - cooA = F.COOSparseTensor( - A.shape[0], A.shape[1], nnz, rows.int(), cols.int(), values - ) + cooA = F.COOSparseTensor(A.shape[0], A.shape[1], nnz, rows.int(), cols.int(), values) A2 = A * idx cscA = F.coo2csc(cooA) counts = cscA.colptr[1:] - cscA.colptr[:-1] @@ -1499,9 +1403,7 @@ def test_spmm_coo_dequant(dim1, dim2, dtype): nnz = (idx == 1).sum().item() rows, cols = torch.where(idx) values = A[idx] - cooA = F.COOSparseTensor( - A.shape[0], A.shape[1], nnz, rows.int(), cols.int(), values - ) + cooA = F.COOSparseTensor(A.shape[0], A.shape[1], nnz, rows.int(), cols.int(), values) A2 = A * idx out2 = F.spmm_coo_very_sparse(cooA, CBt, dequant_stats=statsBt) out1 = torch.matmul(A2, B.half()) @@ -1582,7 +1484,7 @@ def test_spmm_coo_dequant(dim1, dim2, dtype): @pytest.mark.parametrize( ("batch", "seq", "model", "hidden"), - [pytest.param(1, 1, 6656, 4*6656, id="batch=1, seq=1, model=6656, hidden=26k")], + [pytest.param(1, 1, 6656, 4 * 6656, id="batch=1, seq=1, model=6656, hidden=26k")], ) @pytest.mark.benchmark def test_bench_matmul(batch, seq, model, hidden): @@ -1605,8 +1507,8 @@ def test_bench_matmul(batch, seq, model, hidden): outliers = torch.randint(0, model, size=(5,)).cuda() A[:, :, outliers] = 8.0 - linearMixedBit = (bnb.nn.Linear8bitLt(model, hidden, False, False, threshold=6.0).cuda().half()) - #linearMixedBit.eval() + linearMixedBit = bnb.nn.Linear8bitLt(model, hidden, False, False, 
threshold=6.0).cuda().half() + # linearMixedBit.eval() linear8bit_train = bnb.nn.Linear8bitLt(model, hidden, False).cuda().half() linear8bit_train_thresh = bnb.nn.Linear8bitLt(model, hidden, False, threshold=6.0).cuda().half() @@ -1623,121 +1525,123 @@ def test_bench_matmul(batch, seq, model, hidden): for i in range(iters): torch.matmul(A, B.t()) torch.cuda.synchronize() - print( f"pytorch fp16: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s" ) + print( + f"pytorch fp16: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s", + ) - #torch.cuda.synchronize() - #t0 = time.time() - #for i in range(iters): + # torch.cuda.synchronize() + # t0 = time.time() + # for i in range(iters): # bnb.matmul_4bit(A, B_fp4.t(), quant_state=state) - #torch.cuda.synchronize() - #print( f"bnb fp4: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s" ) + # torch.cuda.synchronize() + # print( f"bnb fp4: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s" ) - #torch.cuda.synchronize() - #t0 = time.time() - #for i in range(iters): + # torch.cuda.synchronize() + # t0 = time.time() + # for i in range(iters): # bnb.matmul_4bit(A, B_fp4.t(), quant_state=state_c) - #torch.cuda.synchronize() - #print( f"bnb fp4 + compressed stats: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s" ) + # torch.cuda.synchronize() + # print( f"bnb fp4 + compressed stats: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s" ) torch.cuda.synchronize() t0 = time.time() for i in range(iters): bnb.matmul_4bit(A, B_nf4.t(), quant_state=state_nf4) torch.cuda.synchronize() - print( f"bnb nf4: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s" ) + print(f"bnb nf4: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") torch.cuda.synchronize() t0 = time.time() for i in range(iters): bnb.matmul_4bit(A, B_nf4_c.t(), quant_state=state_nf4_c) torch.cuda.synchronize() - print( f"bnb nf4+DQ: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s" ) + print(f"bnb nf4+DQ: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") - - #torch.cuda.synchronize() - #t0 = time.time() - #for i in range(iters): + # torch.cuda.synchronize() + # t0 = time.time() + # for i in range(iters): # bnb.matmul(A, B) - #torch.cuda.synchronize() - #print(f"CB -> CxB conversion (each iteration): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") + # torch.cuda.synchronize() + # print(f"CB -> CxB conversion (each iteration): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") - #torch.cuda.synchronize() - #t0 = time.time() - #for i in range(iters): + # torch.cuda.synchronize() + # t0 = time.time() + # for i in range(iters): # bnb.matmul(A, B, threshold=6.0) - #torch.cuda.synchronize() - #print(f"CB -> CxB conversion + threshold: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") - - #CA, CAt, SCA, SCAt, coo_tensorA = F.double_quant(A, threshold=0.0) - #C32A, SA = F.transform(CA, "col32") - #CB, CBt, SCB, SCBt, coo_tensorB = F.double_quant(B) - #CxB, SB = F.transform(CB, to_order=formatB) - #torch.cuda.synchronize() - #t0 = time.time() - #for i in 
range(iters): + # torch.cuda.synchronize() + # print(f"CB -> CxB conversion + threshold: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") + + # CA, CAt, SCA, SCAt, coo_tensorA = F.double_quant(A, threshold=0.0) + # C32A, SA = F.transform(CA, "col32") + # CB, CBt, SCB, SCBt, coo_tensorB = F.double_quant(B) + # CxB, SB = F.transform(CB, to_order=formatB) + # torch.cuda.synchronize() + # t0 = time.time() + # for i in range(iters): # out32, Sout32 = F.igemmlt(C32A, CxB, SA, SB) - #torch.cuda.synchronize() - #print(f"no overhead matmul-lt: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") - - #BA, statsB = F.vectorwise_quant(B, dim=1) - #CxB, SB = F.nvidia_transform(CB, to_order=formatB) - #torch.cuda.synchronize() - #t0 = time.time() - #for i in range(iters): + # torch.cuda.synchronize() + # print(f"no overhead matmul-lt: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") + + # BA, statsB = F.vectorwise_quant(B, dim=1) + # CxB, SB = F.nvidia_transform(CB, to_order=formatB) + # torch.cuda.synchronize() + # t0 = time.time() + # for i in range(iters): # A2 = A.view(-1, A.shape[-1]).contiguous() # CA, statsA = F.vectorwise_quant(A2, dim=1) # C32A, SA = F.nvidia_transform(CA, "col32") # out32, Sout32 = F.igemmlt(C32A, CxB, SA, SB) # Cout, Sout = F.nvidia_transform(out32, "row", state=Sout32) # F.vectorwise_mm_dequant(Cout, statsA, statsB.t()) - #torch.cuda.synchronize() - #print(f"vector pytorch + nvidia: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") - - #BA, statsB = F.vectorwise_quant(B, dim=1, quant_type="linear") - #CxB, SB = F.nvidia_transform(CB, to_order=formatB) - #torch.cuda.synchronize() - #t0 = time.time() - #for i in range(iters): + # torch.cuda.synchronize() + # print(f"vector pytorch + nvidia: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") + + # BA, statsB = F.vectorwise_quant(B, dim=1, quant_type="linear") + # CxB, SB = F.nvidia_transform(CB, to_order=formatB) + # torch.cuda.synchronize() + # t0 = time.time() + # for i in range(iters): # A2 = A.view(-1, A.shape[-1]).contiguous() # CA, statsA = F.vectorwise_quant(A2, dim=1, quant_type="linear") # C32A, SA = F.nvidia_transform(CA, "col32") # out32, Sout32 = F.igemmlt(C32A, CxB, SA, SB) # Cout, Sout = F.nvidia_transform(out32, "row", state=Sout32) # out = Cout * statsB * statsA * (1.0 / (127 * 127)) - #torch.cuda.synchronize() - #print(f"linear pytorch + nvidia: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") + # torch.cuda.synchronize() + # print(f"linear pytorch + nvidia: [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") - #linear8bit(A) - #torch.cuda.synchronize() - #t0 = time.time() - #for i in range(iters): + # linear8bit(A) + # torch.cuda.synchronize() + # t0 = time.time() + # for i in range(iters): # linear8bit(A) - #torch.cuda.synchronize() - #print( f"bnb linear8bitlt (eval): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") + # torch.cuda.synchronize() + # print( f"bnb linear8bitlt (eval): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") - #linearMixedBit(A) - #torch.cuda.synchronize() - #t0 = time.time() - #for i in range(iters): + # linearMixedBit(A) + # torch.cuda.synchronize() + # t0 = time.time() + # for i in 
range(iters): # linearMixedBit(A) - #torch.cuda.synchronize() - #print( f"bnb linear8bitlt with threshold (eval): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") + # torch.cuda.synchronize() + # print( f"bnb linear8bitlt with threshold (eval): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") - #linear8bit_train(A) - #torch.cuda.synchronize() - #t0 = time.time() - #for i in range(iters): + # linear8bit_train(A) + # torch.cuda.synchronize() + # t0 = time.time() + # for i in range(iters): # linear8bit_train(A) - #torch.cuda.synchronize() - #print( f"bnb linear8bitlt (training): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") + # torch.cuda.synchronize() + # print( f"bnb linear8bitlt (training): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") - #linear8bit_train_thresh(A) - #torch.cuda.synchronize() - #t0 = time.time() - #for i in range(iters): + # linear8bit_train_thresh(A) + # torch.cuda.synchronize() + # t0 = time.time() + # for i in range(iters): # linear8bit_train(A) - #torch.cuda.synchronize() - #print( f"bnb linear8bitlt with threshold (training): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") + # torch.cuda.synchronize() + # print( f"bnb linear8bitlt with threshold (training): [{batch},{seq},{model}], [{model},{hidden}]->[{batch},{seq},{hidden}]: {time.time()-t0:.4f}s") + def test_zeropoint(): def quant_zp(x): @@ -1778,8 +1682,8 @@ def quant_zp(x): C2 -= A.sum(1).view(-1, 1) * zp ca, cqa, cza = quant_zp(A) - #print(ca.min(), ca.max()) - #print((ca - cza).min(), (ca - cza).max()) + # print(ca.min(), ca.max()) + # print((ca - cza).min(), (ca - cza).max()) zp = 1 scale = 2.0 @@ -1808,14 +1712,14 @@ def quant_zp(x): C7 -= zpa * zpb * A.shape[1] C7 /= qa * qb - #print("") + # print("") # print(C0.flatten()[:10]) - #print(C1.flatten()[:10]) - #print(C2.flatten()[:10]) - #print(C3.flatten()[:10]) - #print(C5.flatten()[:10]) - #print(C6.flatten()[:10]) - #print(C7.flatten()[:10]) + # print(C1.flatten()[:10]) + # print(C2.flatten()[:10]) + # print(C3.flatten()[:10]) + # print(C5.flatten()[:10]) + # print(C6.flatten()[:10]) + # print(C7.flatten()[:10]) err1 = torch.abs(C1 - C2).mean().item() err2 = torch.abs(C1 - C3).mean().item() err3 = torch.abs(C1 - C4).mean().item() @@ -1852,16 +1756,15 @@ def test_extract_outliers(): torch.testing.assert_close(outliers1, outliers2) - def test_blockwise_cpu_large(): diffs = [] reldiffs = [] batch = 128 seq = 128 - for hidden in [128]:#, 14336]: + for hidden in [128]: # , 14336]: for blocksize in [4096, 16384]: for i in range(2): - A1 = torch.randn(batch, seq, hidden, device='cpu') + A1 = torch.randn(batch, seq, hidden, device="cpu") t0 = time.time() C, S = F.quantize_blockwise(A1, blocksize=blocksize) A2 = F.dequantize_blockwise(C, S, blocksize=blocksize) @@ -1875,10 +1778,9 @@ def test_blockwise_cpu_large(): # print(sum(reldiffs)/len(reldiffs)) - def test_fp8_quant(): for e_bits in range(1, 7): - p_bits = 7-e_bits + p_bits = 7 - e_bits code = F.create_fp8_map(True, e_bits, p_bits).cuda() abserr = [] @@ -1888,12 +1790,12 @@ def test_fp8_quant(): C, SC = F.quantize_blockwise(A1, code=code) A2 = F.dequantize_blockwise(C, SC) diff = torch.abs(A1 - A2) - reldiff = diff/torch.abs(A1+1e-8) + reldiff = diff / torch.abs(A1 + 1e-8) abserr.append(diff.mean().item()) relerr.append(reldiff.mean().item()) - #assert diff < 0.0075 - 
#print(sum(abserr)/len(abserr)) - #print(sum(relerr)/len(relerr)) + # assert diff < 0.0075 + # print(sum(abserr)/len(abserr)) + # print(sum(relerr)/len(relerr)) abserr = [] relerr = [] @@ -1902,12 +1804,12 @@ def test_fp8_quant(): C, SC = F.quantize_blockwise(A1, code=code) A2 = F.dequantize_blockwise(C, SC) diff = torch.abs(A1 - A2) - reldiff = diff/torch.abs(A1+1e-8) + reldiff = diff / torch.abs(A1 + 1e-8) abserr.append(diff.mean().item()) relerr.append(reldiff.mean().item()) - #assert diff < 0.0075 - #print(sum(abserr)/len(abserr)) - #print(sum(relerr)/len(relerr)) + # assert diff < 0.0075 + # print(sum(abserr)/len(abserr)) + # print(sum(relerr)/len(relerr)) abserr = [] relerr = [] @@ -1916,50 +1818,48 @@ def test_fp8_quant(): C, SC = F.quantize_blockwise(A1) A2 = F.dequantize_blockwise(C, SC) diff = torch.abs(A1 - A2) - reldiff = diff/torch.abs(A1+1e-8) + reldiff = diff / torch.abs(A1 + 1e-8) abserr.append(diff.mean().item()) relerr.append(reldiff.mean().item()) - #assert diff < 0.0075 - #print(3, sum(abserr)/len(abserr)) - #print(3, sum(relerr)/len(relerr)) + # assert diff < 0.0075 + # print(3, sum(abserr)/len(abserr)) + # print(3, sum(relerr)/len(relerr)) def test_few_bit_quant(): - - #print('') + # print('') for bits in range(2, 9): - #print('='*30, bits, '='*30) - for method in ['linear', 'fp8', 'dynamic', 'quantile']: + # print('='*30, bits, '='*30) + for method in ["linear", "fp8", "dynamic", "quantile"]: abserrs = [] relerrs = [] code = None - if method == 'linear': + if method == "linear": code = F.create_linear_map(True, total_bits=bits).cuda() - elif method == 'fp8': - ebits = math.ceil(bits/2) - pbits = bits-ebits-1 + elif method == "fp8": + ebits = math.ceil(bits / 2) + pbits = bits - ebits - 1 code = F.create_fp8_map(True, ebits, pbits, bits).cuda() - elif method == 'dynamic': - code = F.create_dynamic_map(True, bits-0, bits).cuda() - elif method == 'quantile': - values = torch.randn(2048, 2048, device='cuda') + elif method == "dynamic": + code = F.create_dynamic_map(True, bits - 0, bits).cuda() + elif method == "quantile": + values = torch.randn(2048, 2048, device="cuda") code = F.create_quantile_map(values, bits).cuda() # for some data types we have no zero # for some data types we have one zero # for some data types we have two zeros - assert torch.unique(code).numel() in [2**bits, 2**bits-1], f'bits: {bits}, method: {method}' - #print(method, (code==0).sum()) + assert torch.unique(code).numel() in [2**bits, 2**bits - 1], f"bits: {bits}, method: {method}" + # print(method, (code==0).sum()) assert code.numel() == 256 for i in range(10): - - values = torch.randn(1, 32, device='cuda') + values = torch.randn(1, 32, device="cuda") values /= values.abs().max() - #values[values.abs() < 1e-6] += 1e-5 + # values[values.abs() < 1e-6] += 1e-5 q1 = [] v1 = [] for v in values[0]: - idx = torch.abs(v-code).argmin() + idx = torch.abs(v - code).argmin() q1.append(idx.item()) v1.append(code[idx].item()) @@ -1970,62 +1870,61 @@ def test_few_bit_quant(): v2 = F.dequantize_blockwise(q2, S2) idx = torch.isclose(q1.int(), q2.int()) - err2 = torch.abs(v2-values) + err2 = torch.abs(v2 - values) abserrs.append(err2.mean().item()) - relerrs.append((err2/(1e-10+values).abs()).mean().item()) + relerrs.append((err2 / (1e-10 + values).abs()).mean().item()) if idx.sum(): # some weird cases - err1 = torch.abs(v1-values).mean() - #assert err2.mean() <= err1 + err1 = torch.abs(v1 - values).mean() + # assert err2.mean() <= err1 else: torch.testing.assert_close(q1, q2) - #print(method, 'abserr:', 
sum(abserrs)/len(abserrs), 'relerr:', sum(relerrs)/len(relerrs)) - #assert False + # print(method, 'abserr:', sum(abserrs)/len(abserrs), 'relerr:', sum(relerrs)/len(relerrs)) + # assert False def test_kbit_quantile_estimation(): for i in range(100): - data = torch.randn(1024, 1024, device='cuda') + data = torch.randn(1024, 1024, device="cuda") for bits in range(2, 9): - p = np.linspace(1.3e-4, 1-1.3e-4, 2**bits) + p = np.linspace(1.3e-4, 1 - 1.3e-4, 2**bits) val1 = torch.Tensor(norm.ppf(p)).cuda() val2 = F.estimate_quantiles(data, offset=0, num_quantiles=2**bits) - err = torch.abs(val1-val2).mean() + err = torch.abs(val1 - val2).mean() assert err < 0.038 for i in range(100): - data = torch.randn(1024, 1024, device='cuda') + data = torch.randn(1024, 1024, device="cuda") for bits in range(2, 4): - total_values = 2**bits-1 - p = np.linspace(0, 1, 2*total_values+1) - idx = np.arange(1, 2*total_values+1, 2) + total_values = 2**bits - 1 + p = np.linspace(0, 1, 2 * total_values + 1) + idx = np.arange(1, 2 * total_values + 1, 2) p = p[idx] - offset = 1/(2*total_values) - p = np.linspace(offset, 1-offset, total_values) + offset = 1 / (2 * total_values) + p = np.linspace(offset, 1 - offset, total_values) val1 = torch.Tensor(norm.ppf(p)).cuda() - val2 = F.estimate_quantiles(data, num_quantiles=2**bits-1) - err = torch.abs(val1-val2).mean() + val2 = F.estimate_quantiles(data, num_quantiles=2**bits - 1) + err = torch.abs(val1 - val2).mean() assert err < 0.035 @pytest.mark.benchmark def test_bench_dequantization(): - a = torch.rand(1024, 1024, device='cuda').half() - code =F.create_fp8_map(True, 3, 0, 4).cuda() + a = torch.rand(1024, 1024, device="cuda").half() + code = F.create_fp8_map(True, 3, 0, 4).cuda() qa, SA = F.quantize_blockwise(a, code=code) print(qa.max()) - max_theoretical_mu = 1024*1024*2/1024**3/672*1000*1000 - #print(max_theoretical_mu) + max_theoretical_mu = 1024 * 1024 * 2 / 1024**3 / 672 * 1000 * 1000 + # print(max_theoretical_mu) torch.cuda.synchronize() t0 = time.time() for i in range(100): qa, SA = F.quantize_blockwise(a) torch.cuda.synchronize() - #print((time.time()-t0)/1e6) - + # print((time.time()-t0)/1e6) @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=describe_dtype) @@ -2037,26 +1936,28 @@ def test_fp4_quant(dtype): result = 0 bias = 3 sign, e1, e2, p1 = bits - idx = sign*8 + e1*4 + e2*2 + p1*1 + idx = sign * 8 + e1 * 4 + e2 * 2 + p1 * 1 sign = -1.0 if sign else 1.0 - exp = e1*2 + e2*1 + exp = e1 * 2 + e2 * 1 if exp == 0: # sub-normal - if p1 == 0: result = 0 - else: result = sign*0.0625 + if p1 == 0: + result = 0 + else: + result = sign * 0.0625 else: # normal - exp = 2**(-exp + bias + 1) + exp = 2 ** (-exp + bias + 1) frac = 1.5 if p1 else 1.0 - result = sign*exp*frac + result = sign * exp * frac code[idx] = result - A1 = torch.randn(1024, 1024, device='cuda', dtype=dtype) + A1 = torch.randn(1024, 1024, device="cuda", dtype=dtype) qa, SA = F.quantize_fp4(A1, blocksize=64) A2 = F.dequantize_fp4(qa, SA) err = (A1 - A2).abs().float() - relerr = (err/(A1.abs().float()+1e-8)).mean() + relerr = (err / (A1.abs().float() + 1e-8)).mean() idx = err > 1.0 err = err.mean() @@ -2065,31 +1966,29 @@ def test_fp4_quant(dtype): assert relerr.item() < 0.28 -@pytest.mark.parametrize("quant_type", ['fp4', 'nf4']) +@pytest.mark.parametrize("quant_type", ["fp4", "nf4"]) def test_4bit_compressed_stats(quant_type): for blocksize in [128, 64]: errs1 = [] errs2 = [] for i in range(10): - A1 = torch.randn(1024, 1024, device='cuda').half() + A1 = torch.randn(1024, 
1024, device="cuda").half() q2, SA2 = F.quantize_4bit(A1, blocksize=blocksize, quant_type=quant_type) - q3, SA3= F.quantize_4bit(A1, blocksize=blocksize, compress_statistics=True, quant_type=quant_type) + q3, SA3 = F.quantize_4bit(A1, blocksize=blocksize, compress_statistics=True, quant_type=quant_type) A2 = F.dequantize_4bit(q2, SA2, quant_type=quant_type) A3 = F.dequantize_4bit(q3, SA3, quant_type=quant_type) - err = (A1 - A2).abs().float() - relerr = (err/(A1.abs().float()+1e-15)).mean() + relerr = (err / (A1.abs().float() + 1e-15)).mean() err = err.mean() errs1.append(err.item()) - assert err.item() < 0.11 assert relerr.item() < 0.28 err = (A1 - A3).abs().float() - relerr = (err/(A1.abs().float()+1e-15)).mean() + relerr = (err / (A1.abs().float() + 1e-15)).mean() err = err.mean() errs2.append(err.item()) @@ -2097,70 +1996,71 @@ def test_4bit_compressed_stats(quant_type): assert err.item() < 0.11 assert relerr.item() < 0.28 - #print(sum(errs1)/len(errs1), blocksize, quant_type) - #print(sum(errs2)/len(errs2), blocksize, quant_type) - + # print(sum(errs1)/len(errs1), blocksize, quant_type) + # print(sum(errs2)/len(errs2), blocksize, quant_type) - -#@pytest.mark.parametrize("quant_type", ['fp4', 'nf4']) -@pytest.mark.parametrize("quant_type", ['nf4']) +# @pytest.mark.parametrize("quant_type", ['fp4', 'nf4']) +@pytest.mark.parametrize("quant_type", ["nf4"]) @pytest.mark.benchmark def test_bench_4bit_dequant(quant_type): blocksize = 256 - a = torch.rand(1024*12*4, 1024*12, device='cuda').half() + a = torch.rand(1024 * 12 * 4, 1024 * 12, device="cuda").half() qa, SA = F.quantize_4bit(a, blocksize=blocksize, quant_type=quant_type) - input_size = a.numel()/2 - output_size = a.numel()*2 - num_bytes = input_size+output_size - GB = num_bytes/1e9 - max_theoretical_s = GB/768 - #print(max_theoretical_s*1e6) - b = torch.randn(128, 1024*12, device='cuda').half() + input_size = a.numel() / 2 + output_size = a.numel() * 2 + num_bytes = input_size + output_size + GB = num_bytes / 1e9 + max_theoretical_s = GB / 768 + # print(max_theoretical_s*1e6) + b = torch.randn(128, 1024 * 12, device="cuda").half() iters = 100 torch.cuda.synchronize() t0 = time.time() for i in range(iters): F.dequantize_4bit(qa, SA, blocksize=blocksize, quant_type=quant_type) - #b.copy_(a) + # b.copy_(a) torch.cuda.synchronize() - #print((time.time()-t0)/iters*1e6) + # print((time.time()-t0)/iters*1e6) - #torch.cuda.synchronize() - #t0 = time.time() - #for i in range(iters): + # torch.cuda.synchronize() + # t0 = time.time() + # for i in range(iters): # torch.matmul(b, a.t()) - #torch.cuda.synchronize() - #print((time.time()-t0)/iters*1e6) - + # torch.cuda.synchronize() + # print((time.time()-t0)/iters*1e6) def test_normal_map_tree(): code = F.create_normal_map() - values =code[:8].tolist() + code[-8:].tolist() + values = code[:8].tolist() + code[-8:].tolist() num_pivots = 1 - #print(values) - while num_pivots <16: - idx = list(range(16//num_pivots//2, 16, 16//num_pivots)) - #print(idx) + # print(values) + while num_pivots < 16: + idx = list(range(16 // num_pivots // 2, 16, 16 // num_pivots)) + # print(idx) num_pivots *= 2 pivots = [] for i in idx: - pivots.append((values[i-1]+values[i])/2) - #print(pivots) + pivots.append((values[i - 1] + values[i]) / 2) + # print(pivots) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") -@pytest.mark.parametrize("storage_type", ['nf4', 'fp4']) -@pytest.mark.parametrize("kind", ['fc1', 'fc2', 'attn', 'attn_packed']) 
+@pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) +@pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) -@pytest.mark.parametrize("quant_storage", [torch.uint8, torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) +@pytest.mark.parametrize( + "quant_storage", + [torch.uint8, torch.float16, torch.bfloat16, torch.float32], + ids=describe_dtype, +) def test_gemv_4bit(dtype, storage_type, quant_storage, double_quant, kind): for dim in [128, 256, 512, 1024]: - #for dim in [4*1024]: - #for dim in [1*16]: + # for dim in [4*1024]: + # for dim in [1*16]: errs1 = [] errs2 = [] errs3 = [] @@ -2171,38 +2071,42 @@ def test_gemv_4bit(dtype, storage_type, quant_storage, double_quant, kind): max_errs2 = [] max_errs3 = [] - for i in range(100): - if kind == 'fc1': - A = torch.randn(1, dim, dtype=dtype, device='cuda') - B = torch.randn(dim*4, dim, dtype=dtype, device='cuda')/math.sqrt(dim) - elif kind == 'fc2': - A = torch.randn(1, 4*dim, dtype=dtype, device='cuda') - B = torch.randn(dim, 4*dim, dtype=dtype, device='cuda')/math.sqrt(dim) - elif kind == 'attn': - A = torch.randn(1, dim, dtype=dtype, device='cuda') - B = torch.randn(dim, dim, dtype=dtype, device='cuda')/math.sqrt(dim) - elif kind == 'attn_packed': - A = torch.randn(1, dim, dtype=dtype, device='cuda') - B = torch.randn(dim*3, dim, dtype=dtype, device='cuda')/math.sqrt(dim) - - qB, state = F.quantize_4bit(B, quant_type=storage_type, compress_statistics=double_quant, quant_storage=quant_storage) + if kind == "fc1": + A = torch.randn(1, dim, dtype=dtype, device="cuda") + B = torch.randn(dim * 4, dim, dtype=dtype, device="cuda") / math.sqrt(dim) + elif kind == "fc2": + A = torch.randn(1, 4 * dim, dtype=dtype, device="cuda") + B = torch.randn(dim, 4 * dim, dtype=dtype, device="cuda") / math.sqrt(dim) + elif kind == "attn": + A = torch.randn(1, dim, dtype=dtype, device="cuda") + B = torch.randn(dim, dim, dtype=dtype, device="cuda") / math.sqrt(dim) + elif kind == "attn_packed": + A = torch.randn(1, dim, dtype=dtype, device="cuda") + B = torch.randn(dim * 3, dim, dtype=dtype, device="cuda") / math.sqrt(dim) + + qB, state = F.quantize_4bit( + B, + quant_type=storage_type, + compress_statistics=double_quant, + quant_storage=quant_storage, + ) C3 = torch.matmul(A, B.t()) C2 = F.gemv_4bit(A, qB.t(), state=state) A.requires_grad = True C1 = bnb.matmul_4bit(A, qB.t(), state) - err1 = (C1-C2).abs().float() - err2 = (C3-C2).abs().float() - err3 = (C3-C1).abs().float() + err1 = (C1 - C2).abs().float() + err2 = (C3 - C2).abs().float() + err3 = (C3 - C1).abs().float() - mag1 = torch.abs(C1).float()+1e-5 - mag2 = torch.abs(C3).float()+1e-5 - mag3 = torch.abs(C3).float()+1e-5 + mag1 = torch.abs(C1).float() + 1e-5 + mag2 = torch.abs(C3).float() + 1e-5 + mag3 = torch.abs(C3).float() + 1e-5 - relerr1 = err1/mag1 - relerr2 = err2/mag2 - relerr3 = err3/mag3 + relerr1 = err1 / mag1 + relerr2 = err2 / mag2 + relerr3 = err3 / mag3 max_err1 = err1.max() max_err2 = err2.max() @@ -2220,34 +2124,34 @@ def test_gemv_4bit(dtype, storage_type, quant_storage, double_quant, kind): max_errs2.append(max_err2.item()) max_errs3.append(max_err3.item()) - c = int(C1.numel()*0.0014*(dim/256))+1 + c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 c = assert_all_approx_close(C1, C2, 1e-5, 0.01, count=c, throw=False) - err1 = sum(errs1)/len(errs1)/math.sqrt(dim) - err2 = sum(errs2)/len(errs2)/math.sqrt(dim) - err3 = sum(errs3)/len(errs3)/math.sqrt(dim) 
- relerr1 = sum(relerrs1)/len(relerrs1)/math.sqrt(dim) - relerr2 = sum(relerrs2)/len(relerrs2)/math.sqrt(dim) - relerr3 = sum(relerrs3)/len(relerrs3)/math.sqrt(dim) - maxerr1 = sum(max_errs1)/len(max_errs1)/math.sqrt(dim) - maxerr2 = sum(max_errs2)/len(max_errs2)/math.sqrt(dim) - maxerr3 = sum(max_errs3)/len(max_errs3)/math.sqrt(dim) - absratio = err2/err3 - relratio = relerr2/relerr3 - maxratio = relerr2/relerr3 + err1 = sum(errs1) / len(errs1) / math.sqrt(dim) + err2 = sum(errs2) / len(errs2) / math.sqrt(dim) + err3 = sum(errs3) / len(errs3) / math.sqrt(dim) + relerr1 = sum(relerrs1) / len(relerrs1) / math.sqrt(dim) + relerr2 = sum(relerrs2) / len(relerrs2) / math.sqrt(dim) + relerr3 = sum(relerrs3) / len(relerrs3) / math.sqrt(dim) + maxerr1 = sum(max_errs1) / len(max_errs1) / math.sqrt(dim) + maxerr2 = sum(max_errs2) / len(max_errs2) / math.sqrt(dim) + maxerr3 = sum(max_errs3) / len(max_errs3) / math.sqrt(dim) + absratio = err2 / err3 + relratio = relerr2 / relerr3 + maxratio = relerr2 / relerr3 # for debugging if the tests fails # - #print('='*80) - #print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') - #print(C1.flatten()[-20:]) - #print(C2.flatten()[-20:]) - #print(f'inference vs training abs: {err1}') - #print(f'inference vs training rel: {relerr1}') - #print(f'inference vs training max: {maxerr1}') - #print(f'inference vs training vs torch err ratio abs: {absratio}') - #print(f'inference vs training vs torch err ratio rel: {relratio}') - #print(f'inference vs training vs torch err ratio max: {maxratio}') + # print('='*80) + # print(f'For matmul: {A.shape}, {B.shape}, {kind}, {dtype}, {storage_type}, double_quant={double_quant}:') + # print(C1.flatten()[-20:]) + # print(C2.flatten()[-20:]) + # print(f'inference vs training abs: {err1}') + # print(f'inference vs training rel: {relerr1}') + # print(f'inference vs training max: {maxerr1}') + # print(f'inference vs training vs torch err ratio abs: {absratio}') + # print(f'inference vs training vs torch err ratio rel: {relratio}') + # print(f'inference vs training vs torch err ratio max: {maxratio}') if dtype == torch.float16: if dim <= 512: assert err1 < 7e-5 @@ -2283,56 +2187,59 @@ def test_gemv_4bit(dtype, storage_type, quant_storage, double_quant, kind): assert relratio < 1.04 and relratio > 0.96 assert maxratio < 1.02 and maxratio > 0.98 + @pytest.mark.skip("Row scale has some bugs for ampere") def test_managed(): - n = 32*10 + n = 32 * 10 A = F.get_paged(n, n, dtype=torch.float32) B = F.get_paged(n, n, dtype=torch.uint8) B2 = F.get_paged(n, n, dtype=torch.float32) assert A.is_paged assert B.is_paged - assert A.page_deviceid==0 - assert B.page_deviceid==0 + assert A.page_deviceid == 0 + assert B.page_deviceid == 0 F.fill(A, 17.0) F.fill(B, 17) F.fill(B2, 2) - assert (A==17).sum().item() == n*n - assert (B==17).sum().item() == n*n - C = A*B.float() - assert (C==289).sum().item() == n*n + assert (A == 17).sum().item() == n * n + assert (B == 17).sum().item() == n * n + C = A * B.float() + assert (C == 289).sum().item() == n * n F._mul(A, B2) F._mul(A, B2) F._mul(A, B2) - assert (A==17*(2**3)).sum().item() == n*n - # F.prefetch_tensor(A) - # F.prefetch_tensor(B) + assert (A == 17 * (2**3)).sum().item() == n * n + + +# F.prefetch_tensor(A) +# F.prefetch_tensor(B) - # F.fill(B2, 17.0) - # F._mul(A, B2) +# F.fill(B2, 17.0) +# F._mul(A, B2) - # F.prefetch_tensor(A, to_cpu=True) - # F.prefetch_tensor(B, to_cpu=True) - # F.prefetch_tensor(B2, to_cpu=True) - # torch.cuda.synchronize() +# 
F.prefetch_tensor(A, to_cpu=True) +# F.prefetch_tensor(B, to_cpu=True) +# F.prefetch_tensor(B2, to_cpu=True) +# torch.cuda.synchronize() - # assert (A==17).sum().item() == n*n +# assert (A==17).sum().item() == n*n - # torch.testing.assert_close(A, torch.ones(A.shape)*289) +# torch.testing.assert_close(A, torch.ones(A.shape)*289) -@pytest.mark.parametrize("storage_type", ['nf4', 'fp4'], ids=['nf4', 'fp4']) +@pytest.mark.parametrize("storage_type", ["nf4", "fp4"], ids=["nf4", "fp4"]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) -@pytest.mark.parametrize("double_quant", [False], ids=['DQ_True']) +@pytest.mark.parametrize("double_quant", [False], ids=["DQ_True"]) def test_gemv_eye_4bit(storage_type, dtype, double_quant): dims = 10 torch.random.manual_seed(np.random.randint(0, 412424242)) dims = get_test_dims(0, 8192, n=dims) - dims = [dim + (64-(dim % 64)) for dim in dims] - #for dim in [576, 5120, 3520, 5184, 1280, 4992, 5312, 2048]: + dims = [dim + (64 - (dim % 64)) for dim in dims] + # for dim in [576, 5120, 3520, 5184, 1280, 4992, 5312, 2048]: for dim in dims: - A = torch.normal(0, 0.1, size=(1, 1, dim), dtype=dtype, device='cuda') - B = torch.eye(dim, dtype=dtype, device='cuda') + A = torch.normal(0, 0.1, size=(1, 1, dim), dtype=dtype, device="cuda") + B = torch.eye(dim, dtype=dtype, device="cuda") qB, state = F.quantize_4bit(B, quant_type=storage_type, compress_statistics=double_quant) C3 = torch.matmul(A, B.t()) @@ -2343,5 +2250,5 @@ def test_gemv_eye_4bit(storage_type, dtype, double_quant): torch.testing.assert_close(A, C3) torch.testing.assert_close(A, C1) torch.testing.assert_close(A, C2) - #torch.testing.assert_close(A, C1, rtol=1e-5, atol=0.00001) - #torch.testing.assert_close(A, C2, rtol=1e-5, atol=0.080) + # torch.testing.assert_close(A, C1, rtol=1e-5, atol=0.00001) + # torch.testing.assert_close(A, C2, rtol=1e-5, atol=0.080) diff --git a/tests/test_generation.py b/tests/test_generation.py index ef354d70a..911aa14da 100644 --- a/tests/test_generation.py +++ b/tests/test_generation.py @@ -10,56 +10,61 @@ def get_4bit_config(): - return transformers.BitsAndBytesConfig( - load_in_4bit=True, - load_in_8bit=False, - llm_int8_threshold=6.0, - llm_int8_has_fp16_weight=False, - bnb_4bit_compute_dtype=torch.float16, - bnb_4bit_use_double_quant=True, - bnb_4bit_quant_type='nf4', - ) + return transformers.BitsAndBytesConfig( + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + ) def get_model_and_tokenizer(config): model_name_or_path, quant_type = config bnb_config = get_4bit_config() - if quant_type == '16bit': + if quant_type == "16bit": bnb_config.load_in_4bit = False else: - bnb_config.bnb_4bit_quant_type= quant_type - model = transformers.AutoModelForCausalLM.from_pretrained(model_name_or_path, + bnb_config.bnb_4bit_quant_type = quant_type + model = transformers.AutoModelForCausalLM.from_pretrained( + model_name_or_path, quantization_config=bnb_config, - max_memory={0:'48GB'}, - device_map='auto', - torch_dtype=torch.bfloat16 - ).eval() + max_memory={0: "48GB"}, + device_map="auto", + torch_dtype=torch.bfloat16, + ).eval() tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path) return model, tokenizer + def get_prompt_for_generation_eval(text, add_roles=True): description = ( "A chat between a curious human and an artificial intelligence assistant. 
" "The assistant gives helpful, detailed, and polite answers to the user's questions." ) if add_roles: - prompt = f'{description} ### Human: {text} ### Assistant:' + prompt = f"{description} ### Human: {text} ### Assistant:" else: - prompt = f'{description} {text}' + prompt = f"{description} {text}" return prompt + def generate(model, tokenizer, text, generation_config, prompt_func=get_prompt_for_generation_eval): text = prompt_func(text) - inputs = tokenizer(text, return_tensors="pt").to('cuda:0') - outputs = model.generate(inputs=inputs['input_ids'], generation_config=generation_config) + inputs = tokenizer(text, return_tensors="pt").to("cuda:0") + outputs = model.generate(inputs=inputs["input_ids"], generation_config=generation_config) return tokenizer.decode(outputs[0], skip_special_tokens=True) -models = ['huggyllama/llama-7b', 'bigscience/bloom-1b7'] -dtypes = ['nf4', 'fp4'] -@pytest.fixture(scope='session', params=product(models, dtypes)) +models = ["huggyllama/llama-7b", "bigscience/bloom-1b7"] +dtypes = ["nf4", "fp4"] + + +@pytest.fixture(scope="session", params=product(models, dtypes)) def model_and_tokenizer(request): model, tokenizer = get_model_and_tokenizer(request.param) yield request.param, model, tokenizer @@ -81,20 +86,19 @@ def test_pi(requires_cuda, model_and_tokenizer, inference_kernel, DQ, dtype): ) generation_config.max_new_tokens = 20 - - #text = 'Please write down the first 50 digits of pi.' - #text = get_prompt_for_generation_eval(text) - #text += ' Sure, here the first 50 digits of pi: 3.14159' + # text = 'Please write down the first 50 digits of pi.' + # text = get_prompt_for_generation_eval(text) + # text += ' Sure, here the first 50 digits of pi: 3.14159' n_cases = 6 - text = '3.14159' - if hasattr(model.config, 'quantization_config'): + text = "3.14159" + if hasattr(model.config, "quantization_config"): model.config.quantization_config.bnb_4bit_compute_dtype = dtype model.config.quantization_config.bnb_4bit_use_double_quant = DQ if not inference_kernel: - text = [text]*n_cases - inputs = tokenizer(text, return_tensors="pt").to('cuda:0') - x = inputs['input_ids'] + text = [text] * n_cases + inputs = tokenizer(text, return_tensors="pt").to("cuda:0") + x = inputs["input_ids"] outputs = [] if inference_kernel: for i in range(n_cases): @@ -105,15 +109,14 @@ def test_pi(requires_cuda, model_and_tokenizer, inference_kernel, DQ, dtype): outputs = model.generate(x, generation_config=generation_config) outputs = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs] - assert len(outputs) == n_cases failure_count = 0 for i in range(n_cases): - if not outputs[i][:len(str(math.pi))] == str(math.pi): + if not outputs[i][: len(str(math.pi))] == str(math.pi): failure_count += 1 - failure_max = (2 if fixture_config[0] == 'huggyllama/llama-7b' else 4) + failure_max = 2 if fixture_config[0] == "huggyllama/llama-7b" else 4 if failure_count > failure_max: print(math.pi) for out in outputs: print(out) - raise ValueError(f'Failure count: {failure_count}/{n_cases}') + raise ValueError(f"Failure count: {failure_count}/{n_cases}") diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py index 567e1a466..bbbd05335 100644 --- a/tests/test_linear4bit.py +++ b/tests/test_linear4bit.py @@ -28,9 +28,7 @@ def test_linear_serialization(quant_type, compress_statistics, bias, quant_stora device = "cuda" layer_shape = (300, 400) - linear = torch.nn.Linear( - *layer_shape, dtype=original_dtype, device="cpu" - ) # original layer + linear = torch.nn.Linear(*layer_shape, 
dtype=original_dtype, device="cpu") # original layer # Quantizing original layer linear_q = bnb.nn.Linear4bit( @@ -42,9 +40,7 @@ def test_linear_serialization(quant_type, compress_statistics, bias, quant_stora quant_type=quant_type, device="meta", ) - new_weight = bnb.nn.Params4bit( - data=linear.weight, quant_type=quant_type, requires_grad=False - ) + new_weight = bnb.nn.Params4bit(data=linear.weight, quant_type=quant_type, requires_grad=False) linear_q.weight = new_weight if bias: linear_q.bias = torch.nn.Parameter(linear.bias) @@ -172,7 +168,9 @@ def test_linear_serialization(quant_type, compress_statistics, bias, quant_stora target_compression = ( 0.143 if original_dtype == torch.float32 else 0.29 ) # these numbers get lower as weight shape increases - ratio_error_msg = f"quantized_size {size_4:,} is larger on disk than {target_compression:.2%} of original size {size_orig:,}" + ratio_error_msg = ( + f"quantized_size {size_4:,} is larger on disk than {target_compression:.2%} of original size {size_orig:,}" + ) assert size_ratio < target_compression, ratio_error_msg diff --git a/tests/test_linear8bitlt.py b/tests/test_linear8bitlt.py index edc3409cd..4b62abd6d 100644 --- a/tests/test_linear8bitlt.py +++ b/tests/test_linear8bitlt.py @@ -19,6 +19,7 @@ # contributed by Alex Borzunov, see: # https://github.com/bigscience-workshop/petals/blob/main/tests/test_linear8bitlt.py + @pytest.mark.skipif( not torch.cuda.is_available() or torch.cuda.get_device_capability() < (7, 5), reason="this test requires a turing-generation or newer GPU, see bitsandbytes docs", @@ -50,7 +51,9 @@ def test_linear_no_igemmlt(): linear_custom.state.force_no_igemmlt = True linear_custom.weight = bnb.nn.Int8Params( - linear.weight.data.clone(), requires_grad=False, has_fp16_weights=False + linear.weight.data.clone(), + requires_grad=False, + has_fp16_weights=False, ).to(linear.weight.dtype) linear_custom.bias = linear.bias linear_custom = linear_custom.cuda() @@ -77,7 +80,14 @@ def test_linear_no_igemmlt(): @pytest.mark.parametrize("force_no_igemmlt", TRUE_FALSE, ids=id_formatter("force_no_igemmlt")) @pytest.mark.parametrize("save_before_forward", TRUE_FALSE, ids=id_formatter("save_before_forward")) @pytest.mark.parametrize("load_before_cuda", TRUE_FALSE, ids=id_formatter("load_before_cuda")) -def test_linear_serialization(has_fp16_weights, serialize_before_forward, deserialize_before_cuda, force_no_igemmlt, save_before_forward, load_before_cuda): +def test_linear_serialization( + has_fp16_weights, + serialize_before_forward, + deserialize_before_cuda, + force_no_igemmlt, + save_before_forward, + load_before_cuda, +): linear = torch.nn.Linear(32, 96) x = torch.randn(3, 32, dtype=torch.half) @@ -92,7 +102,9 @@ def test_linear_serialization(has_fp16_weights, serialize_before_forward, deseri linear_custom.state.force_no_igemmlt = True linear_custom.weight = bnb.nn.Int8Params( - linear.weight.data.clone(), requires_grad=has_fp16_weights, has_fp16_weights=has_fp16_weights + linear.weight.data.clone(), + requires_grad=has_fp16_weights, + has_fp16_weights=has_fp16_weights, ) linear_custom.bias = linear.bias linear_custom = linear_custom.cuda() diff --git a/tests/test_modules.py b/tests/test_modules.py index 674620e29..db4d72410 100644 --- a/tests/test_modules.py +++ b/tests/test_modules.py @@ -19,12 +19,18 @@ class MLP8bit(torch.nn.Module): def __init__(self, dim1, dim2, has_fp16_weights=True, memory_efficient_backward=False, threshold=0.0): super().__init__() self.fc1 = bnb.nn.Linear8bitLt( - dim1, dim2, 
has_fp16_weights=has_fp16_weights, memory_efficient_backward=memory_efficient_backward, - threshold=threshold + dim1, + dim2, + has_fp16_weights=has_fp16_weights, + memory_efficient_backward=memory_efficient_backward, + threshold=threshold, ) self.fc2 = bnb.nn.Linear8bitLt( - dim2, dim1, has_fp16_weights=has_fp16_weights, memory_efficient_backward=memory_efficient_backward, - threshold=threshold + dim2, + dim1, + has_fp16_weights=has_fp16_weights, + memory_efficient_backward=memory_efficient_backward, + threshold=threshold, ) def forward(self, x): @@ -52,9 +58,7 @@ def assert_all_approx_close(a, b, atol=1e-8, rtol=1e-5, count=10): class LinearFunction(torch.autograd.Function): @staticmethod def get_8bit_linear_trimmed(x, stochastic=False, trim_value=3.0): - round_func = ( - LinearFunction.round_stoachastic if stochastic else torch.round - ) + round_func = LinearFunction.round_stoachastic if stochastic else torch.round norm = math.sqrt(math.pi) / math.sqrt(2.0) # std = torch.abs(x).mean()*norm std = torch.std(x) @@ -122,9 +126,7 @@ def dequant_min_max(xq, A, B, SA, SB, dtype): return x.to(dtype) def get_8bit_linear(x, stochastic=False): - round_func = ( - LinearFunction.round_stoachastic if stochastic else torch.round - ) + round_func = LinearFunction.round_stoachastic if stochastic else torch.round max1 = torch.abs(x).max() x = x / max1 * 127 x = round_func(x) / 127 * max1 @@ -133,9 +135,7 @@ def get_8bit_linear(x, stochastic=False): @staticmethod def get_8bit_vector_wise(x, dim, stochastic=False): - round_func = ( - LinearFunction.round_stoachastic if stochastic else torch.round - ) + round_func = LinearFunction.round_stoachastic if stochastic else torch.round max1 = torch.amax(torch.abs(x), dim=dim, keepdim=True) max1[max1 == 0] = 1.0 x = (x * 127) / max1 @@ -219,9 +219,7 @@ def forward(ctx, x, weight, bias=None, args=None): weight8, S1 = LinearFunction.quant(weight, args.quant_type, dim=1) x8, S2 = LinearFunction.quant(x, args.quant_type, dim=2) outputq = bnb.functional.igemm(x8, weight8.t()) - output = LinearFunction.dequant( - outputq, S1, S2, x.dtype, args.quant_type - ) + output = LinearFunction.dequant(outputq, S1, S2, x.dtype, args.quant_type) # if torch.rand(1) < 0.01: # output32 = torch.matmul(x, weight.t()) # err = torch.abs(output-output32).float() @@ -250,37 +248,25 @@ def backward(ctx, grad_output): # weight and x are already 8bit # -> transform grad_output to 8-bit if args.use_8bit_training == "forward+wgrad": - grad_output8, S1 = LinearFunction.quant( - grad_output, args.quant_type, dim=[0, 1] - ) + grad_output8, S1 = LinearFunction.quant(grad_output, args.quant_type, dim=[0, 1]) x8, S2 = LinearFunction.quant(x, args.quant_type, dim=[0, 1]) grad_weight8 = bnb.functional.igemm(grad_output8, x8) - grad_weight = LinearFunction.dequant( - grad_weight8, S1, S2, grad_output.dtype, args.quant_type - ) + grad_weight = LinearFunction.dequant(grad_weight8, S1, S2, grad_output.dtype, args.quant_type) # grad_weight32 = torch.einsum('bso,bsi->oi', grad_output, x) grad_input = grad_output.matmul(weight) elif args.use_8bit_training == "full": - grad_output8, S1 = LinearFunction.quant( - grad_output, args.quant_type, dim=[0, 1] - ) + grad_output8, S1 = LinearFunction.quant(grad_output, args.quant_type, dim=[0, 1]) x8, S2 = LinearFunction.quant(x, args.quant_type, dim=[0, 1]) grad_weight8 = torch.zeros_like(weight, dtype=torch.int32) bnb.functional.igemm(grad_output8, x8, out=grad_weight8) - grad_weight = LinearFunction.dequant( - grad_weight8, S1, S2, grad_output.dtype, args.quant_type - 
) + grad_weight = LinearFunction.dequant(grad_weight8, S1, S2, grad_output.dtype, args.quant_type) - grad_output8, S1 = LinearFunction.quant( - grad_output, args.quant_type, dim=2 - ) + grad_output8, S1 = LinearFunction.quant(grad_output, args.quant_type, dim=2) weight8, S3 = LinearFunction.quant(weight, args.quant_type, dim=0) grad_input8 = bnb.functional.igemm(grad_output8, weight8) - grad_input = LinearFunction.dequant( - grad_input8, S1, S3, grad_output.dtype, args.quant_type - ) + grad_input = LinearFunction.dequant(grad_input8, S1, S3, grad_output.dtype, args.quant_type) else: grad_input = grad_output.matmul(weight) @@ -356,12 +342,8 @@ def test_linear8bitlt_accumulated_gradient(): opt1.zero_grad(True) opt2.step() opt2.zero_grad(True) - assert_all_approx_close( - l1[0].weight, l2[0].weight, rtol=1.05, atol=0.01, count=2 - ) - assert_all_approx_close( - l1[1].weight, l2[1].weight, rtol=1.05, atol=0.01, count=2 - ) + assert_all_approx_close(l1[0].weight, l2[0].weight, rtol=1.05, atol=0.01, count=2) + assert_all_approx_close(l1[1].weight, l2[1].weight, rtol=1.05, atol=0.01, count=2) # we do this copy because otherwise we have small divergences over time that add up l1[0].weight.data.copy_(l2[0].weight.data) l1[1].weight.data.copy_(l2[1].weight.data) @@ -375,7 +357,17 @@ def test_linear8bitlt_accumulated_gradient(): @pytest.mark.parametrize("threshold", [0.0, 2.0]) @pytest.mark.parametrize("memory_efficient_backward", [False]) def test_linear8bitlt_no_fp16_weights(threshold, memory_efficient_backward): - l1 = (bnb.nn.Linear8bitLt( 32, 64, threshold=threshold, has_fp16_weights=False, memory_efficient_backward=memory_efficient_backward).cuda().half()) + l1 = ( + bnb.nn.Linear8bitLt( + 32, + 64, + threshold=threshold, + has_fp16_weights=False, + memory_efficient_backward=memory_efficient_backward, + ) + .cuda() + .half() + ) assert l1.weight.dtype == torch.int8 l1.eval() @@ -397,11 +389,7 @@ def test_linear8bitlt_no_fp16_weights(threshold, memory_efficient_backward): if threshold > 0: assert mlp.fc2.state.idx is not None - mlp = ( - MLP8bit(32, 64, threshold=threshold, has_fp16_weights=False) - .cuda() - .half() - ) + mlp = MLP8bit(32, 64, threshold=threshold, has_fp16_weights=False).cuda().half() assert mlp.fc1.weight.dtype == torch.int8 assert mlp.fc2.weight.dtype == torch.int8 @@ -414,11 +402,7 @@ def test_linear8bitlt_no_fp16_weights(threshold, memory_efficient_backward): if threshold > 0: assert mlp.fc2.state.idx is not None - mlp = ( - MLP8bit(32, 64, threshold=threshold, has_fp16_weights=False) - .half() - .cuda() - ) + mlp = MLP8bit(32, 64, threshold=threshold, has_fp16_weights=False).half().cuda() for i in range(100): b1 = torch.randn(16, 8, 32, device="cuda").half() @@ -431,7 +415,17 @@ def test_linear8bitlt_no_fp16_weights(threshold, memory_efficient_backward): assert mlp.fc1.weight.dtype == torch.int8 assert mlp.fc2.weight.dtype == torch.int8 - mlp = ( MLP8bit( 32, 64, threshold=threshold, has_fp16_weights=False, memory_efficient_backward=memory_efficient_backward).half().to("cuda")) + mlp = ( + MLP8bit( + 32, + 64, + threshold=threshold, + has_fp16_weights=False, + memory_efficient_backward=memory_efficient_backward, + ) + .half() + .to("cuda") + ) for i in range(100): b1 = torch.randn(16, 8, 32, device="cuda").half() @@ -447,8 +441,12 @@ def test_linear8bitlt_no_fp16_weights(threshold, memory_efficient_backward): assert mlp.fc2.weight.device.type == "cuda" mlp = MLP8bit( - 32, 64, threshold=threshold, has_fp16_weights=False, memory_efficient_backward=memory_efficient_backward 
- ) + 32, + 64, + threshold=threshold, + has_fp16_weights=False, + memory_efficient_backward=memory_efficient_backward, + ) w1, w2 = mlp.fc1.weight.clone().cuda(), mlp.fc2.weight.clone().cuda() # grab weights before quantization, mlp = mlp.cuda().half() # and this line triggers quantization @@ -489,7 +487,7 @@ def test_linear8bitlt_no_fp16_weights(threshold, memory_efficient_backward): lambda n_in, n_out, bias=True: bnb.nn.Linear8bitLt(n_in, n_out, bias=bias, has_fp16_weights=False), bnb.nn.LinearFP4, ], - ids=['Int8Lt', 'FP4'], + ids=["Int8Lt", "FP4"], ) def test_linear_kbit_fp32_bias(module): # casts model to fp16 -> int8 automatically @@ -544,7 +542,7 @@ def test_kbit_backprop(module): kbit[1].bias.detach().copy_(ref[1].bias) ref = ref.half().cuda() kbit = kbit.half().cuda() - kbit = kbit.half().to('cuda') + kbit = kbit.half().to("cuda") errs1 = [] errs2 = [] @@ -562,10 +560,10 @@ def test_kbit_backprop(module): bgrad1 = ref[0].bias.grad bgrad2 = kbit[0].bias.grad - err1 = (out1-out2).abs().float() - err2 = (grad1-grad2).abs().float() - relerr1 = (err1/(out1.abs().float()+1e-9)) - relerr2 = (err2/(grad1.abs().float()+1e-9)) + err1 = (out1 - out2).abs().float() + err2 = (grad1 - grad2).abs().float() + relerr1 = err1 / (out1.abs().float() + 1e-9) + relerr2 = err2 / (grad1.abs().float() + 1e-9) errs1.append(err1.mean().item()) errs2.append(err2.mean().item()) relerrs1.append(relerr1.mean().item()) @@ -582,20 +580,20 @@ def test_kbit_backprop(module): assert kbit[0].weight.grad is None or kbit[0].weight.grad.sum().item() == 0 assert kbit[0].weight.grad is None or kbit[0].bias.grad.sum().item() == 0 - #print('out', sum(errs1)/len(errs1)) - #print('grad', sum(errs2)/len(errs2)) - #print('rel out', sum(relerrs1)/len(relerrs1)) - #print('rel grad', sum(relerrs2)/len(relerrs2)) + # print('out', sum(errs1)/len(errs1)) + # print('grad', sum(errs2)/len(errs2)) + # print('rel out', sum(relerrs1)/len(relerrs1)) + # print('rel grad', sum(relerrs2)/len(relerrs2)) -def test_fp8linear(): +def test_fp8linear(): b = 10 h = 1024 inp = torch.randn(b, h).cuda() - fp32 = torch.nn.Linear(h, h*2).cuda() - fp8 = bnb.research.nn.LinearFP8Mixed(h, h*2).cuda() - fp32b = torch.nn.Linear(h*2, h).cuda() - fp8b = bnb.research.nn.LinearFP8Mixed(h*2, h).cuda() + fp32 = torch.nn.Linear(h, h * 2).cuda() + fp8 = bnb.research.nn.LinearFP8Mixed(h, h * 2).cuda() + fp32b = torch.nn.Linear(h * 2, h).cuda() + fp8b = bnb.research.nn.LinearFP8Mixed(h * 2, h).cuda() fp8.weight.data.copy_(fp32.weight.data) fp8.bias.data.copy_(fp32.bias.data) @@ -605,34 +603,34 @@ def test_fp8linear(): a = fp32b(torch.nn.functional.gelu(fp32(inp))) b = fp8b(torch.nn.functional.gelu(fp8(inp))) - err = (a-b).abs().mean() + err = (a - b).abs().mean() a.mean().backward() b.mean().backward() - graderr = (fp8.weight.grad-fp32.weight.grad).abs().mean() - bgraderr = (fp8.bias.grad-fp32.bias.grad).abs().mean() + graderr = (fp8.weight.grad - fp32.weight.grad).abs().mean() + bgraderr = (fp8.bias.grad - fp32.bias.grad).abs().mean() assert err < 0.05 assert graderr < 0.00002 assert bgraderr < 0.00002 + def test_4bit_warnings(): dim1 = 64 - with pytest.warns(UserWarning, match=r'inference or training'): + with pytest.warns(UserWarning, match=r"inference or training"): net = nn.Sequential(*[bnb.nn.Linear4bit(dim1, dim1, compute_dtype=torch.float32) for i in range(10)]) net = net.cuda() inp = torch.rand(10, dim1).cuda().half() net(inp) - with pytest.warns(UserWarning, match=r'inference.'): + with pytest.warns(UserWarning, match=r"inference."): net = 
nn.Sequential(*[bnb.nn.Linear4bit(dim1, dim1, compute_dtype=torch.float32) for i in range(10)]) net = net.cuda() inp = torch.rand(1, dim1).cuda().half() net(inp) with pytest.warns(UserWarning) as record: - net = nn.Sequential(*[bnb.nn.Linear4bit(dim1, dim1, compute_dtype=torch.float32) for i in range(10)]) net = net.cuda() inp = torch.rand(10, dim1).cuda().half() diff --git a/tests/test_optim.py b/tests/test_optim.py index 9395b8820..d8c46e415 100644 --- a/tests/test_optim.py +++ b/tests/test_optim.py @@ -16,6 +16,7 @@ k = 20 + def assert_most_approx_close(a, b, rtol=1e-3, atol=1e-3, max_error_count=0): idx = torch.isclose(a, b, rtol=rtol, atol=atol) error_count = (idx == 0).sum().item() @@ -33,6 +34,7 @@ def get_temp_dir(): def rm_path(path): shutil.rmtree(path) + str2optimizers = {} str2optimizers["adam_pytorch"] = (None, torch.optim.Adam, bnb.optim.Adam) str2optimizers["lion_pytorch"] = (None, Lion, bnb.optim.Lion) @@ -66,8 +68,14 @@ def rm_path(path): ) str2optimizers["adam8bit_blockwise"] = (torch.optim.Adam, lambda pxx: bnb.optim.Adam8bit(pxx, block_wise=True)) -str2optimizers["paged_adamw8bit_blockwise"] = (torch.optim.AdamW, lambda pxx: bnb.optim.PagedAdamW8bit(pxx, block_wise=True)) -str2optimizers["paged_adam8bit_blockwise"] = (torch.optim.Adam, lambda pxx: bnb.optim.PagedAdam8bit(pxx, block_wise=True)) +str2optimizers["paged_adamw8bit_blockwise"] = ( + torch.optim.AdamW, + lambda pxx: bnb.optim.PagedAdamW8bit(pxx, block_wise=True), +) +str2optimizers["paged_adam8bit_blockwise"] = ( + torch.optim.Adam, + lambda pxx: bnb.optim.PagedAdam8bit(pxx, block_wise=True), +) str2optimizers["lion8bit_blockwise"] = (Lion, lambda pxx: bnb.optim.Lion8bit(pxx, block_wise=True)) str2optimizers["paged_lion8bit_blockwise"] = (Lion, lambda pxx: bnb.optim.PagedLion8bit(pxx, block_wise=True)) str2optimizers["momentum8bit_blockwise"] = ( @@ -90,9 +98,18 @@ def rm_path(path): str2statenames["rmsprop"] = [("square_avg", "state1")] str2statenames["adam8bit"] = [("exp_avg", "state1", "qmap1", "max1"), ("exp_avg_sq", "state2", "qmap2", "max2")] str2statenames["lamb8bit"] = [("exp_avg", "state1", "qmap1", "max1"), ("exp_avg_sq", "state2", "qmap2", "max2")] -str2statenames["adam8bit_blockwise"] = [("exp_avg", "state1", "qmap1", "absmax1"), ("exp_avg_sq", "state2", "qmap2", "absmax2")] -str2statenames["paged_adam8bit_blockwise"] = [("exp_avg", "state1", "qmap1", "absmax1"), ("exp_avg_sq", "state2", "qmap2", "absmax2")] -str2statenames["paged_adamw8bit_blockwise"] = [("exp_avg", "state1", "qmap1", "absmax1"), ("exp_avg_sq", "state2", "qmap2", "absmax2")] +str2statenames["adam8bit_blockwise"] = [ + ("exp_avg", "state1", "qmap1", "absmax1"), + ("exp_avg_sq", "state2", "qmap2", "absmax2"), +] +str2statenames["paged_adam8bit_blockwise"] = [ + ("exp_avg", "state1", "qmap1", "absmax1"), + ("exp_avg_sq", "state2", "qmap2", "absmax2"), +] +str2statenames["paged_adamw8bit_blockwise"] = [ + ("exp_avg", "state1", "qmap1", "absmax1"), + ("exp_avg_sq", "state2", "qmap2", "absmax2"), +] str2statenames["momentum8bit"] = [("momentum_buffer", "state1", "qmap1", "max1")] str2statenames["lion8bit"] = [("exp_avg", "state1", "qmap1", "max1")] str2statenames["momentum8bit_blockwise"] = [("momentum_buffer", "state1", "qmap1", "absmax1")] @@ -101,7 +118,7 @@ def rm_path(path): str2statenames["lion8bit_blockwise"] = [("exp_avg", "state1", "qmap1", "absmax1")] str2statenames["paged_lion8bit_blockwise"] = [("exp_avg", "state1", "qmap1", "absmax1")] -optimizer_names_32bit = ["adam", "momentum", "rmsprop", 'paged_adamw', 'paged_adam', 
'lion', 'paged_lion'] +optimizer_names_32bit = ["adam", "momentum", "rmsprop", "paged_adamw", "paged_adam", "lion", "paged_lion"] @pytest.mark.parametrize("optim_name", optimizer_names_32bit, ids=id_formatter("opt")) @@ -109,7 +126,7 @@ def rm_path(path): @pytest.mark.parametrize("dim1", [1024], ids=id_formatter("dim1")) @pytest.mark.parametrize("dim2", [32, 1024, 4097, 1], ids=id_formatter("dim2")) def test_optimizer32bit(dim1, dim2, gtype, optim_name): - if gtype == torch.bfloat16 and optim_name in ['momentum', 'rmsprop']: + if gtype == torch.bfloat16 and optim_name in ["momentum", "rmsprop"]: pytest.skip() if dim1 == 1 and dim2 == 1: return @@ -161,9 +178,13 @@ def test_optimizer32bit(dim1, dim2, gtype, optim_name): for name1, name2 in str2statenames[optim_name]: # since Lion can have pretty noisy updates where things lie at the boundary # allow up to 10 errors for Lion - assert_most_approx_close(torch_optimizer.state[p1][name1], bnb_optimizer.state[p2][name2], - atol=atol, rtol=rtol, - max_error_count=10) + assert_most_approx_close( + torch_optimizer.state[p1][name1], + bnb_optimizer.state[p2][name2], + atol=atol, + rtol=rtol, + max_error_count=10, + ) if gtype != torch.float32: # the adam buffers should also be close because they are 32-bit @@ -193,13 +214,9 @@ def test_global_config(dim1, dim2, gtype): eps = 1e-8 bnb.optim.GlobalOptimManager.get_instance().initialize() - bnb.optim.GlobalOptimManager.get_instance().override_config( - p3, "optim_bits", 8 - ) + bnb.optim.GlobalOptimManager.get_instance().override_config(p3, "optim_bits", 8) - bnb.optim.GlobalOptimManager.get_instance().register_parameters( - [p1, p2, p3] - ) + bnb.optim.GlobalOptimManager.get_instance().register_parameters([p1, p2, p3]) p1 = p1.cuda() p2 = p2.cuda() p3 = p3.cuda() @@ -242,7 +259,8 @@ def test_global_config(dim1, dim2, gtype): @pytest.mark.parametrize("dim2", [32, 1024, 4097], ids=id_formatter("dim2")) @pytest.mark.parametrize("dim1", [1024], ids=id_formatter("dim1")) def test_optimizer8bit(dim1, dim2, gtype, optim_name): - if gtype == torch.bfloat16 and optim_name not in ['adam8bit_blockwise', 'lion8bit_blockwise']: pytest.skip() + if gtype == torch.bfloat16 and optim_name not in ["adam8bit_blockwise", "lion8bit_blockwise"]: + pytest.skip() if dim1 == 1 and dim2 == 1: return p1 = torch.randn(dim1, dim2, device="cuda", dtype=gtype) * 0.1 @@ -294,17 +312,12 @@ def test_optimizer8bit(dim1, dim2, gtype, optim_name): absmax=bnb_optimizer.state[p2][max_val], A=bnb_optimizer.state[p2][name2], ) - num_not_close = ( - torch.isclose( - torch_optimizer.state[p1][name1], s1, atol=atol, rtol=rtol - ) - == 0 - ) - #assert num_not_close.sum().item() < 20 + num_not_close = torch.isclose(torch_optimizer.state[p1][name1], s1, atol=atol, rtol=rtol) == 0 + # assert num_not_close.sum().item() < 20 dequant_states.append(s1.clone()) err = torch.abs(p1 - p2) - relerr = err / (torch.abs(p1)+1e-9) + relerr = err / (torch.abs(p1) + 1e-9) if g.dtype == torch.bfloat16: assert err.mean() < 0.00015 assert relerr.mean() < 0.0016 @@ -316,9 +329,7 @@ def test_optimizer8bit(dim1, dim2, gtype, optim_name): relerrors.append(relerr.mean().item()) if i % 10 == 0 and i > 0: - for (name1, name2, qmap, max_val), s in zip( - str2statenames[optim_name], dequant_states - ): + for (name1, name2, qmap, max_val), s in zip(str2statenames[optim_name], dequant_states): s1cpy = s.clone() raws1cpy = bnb_optimizer.state[p2][name2].clone() qmap1 = bnb_optimizer.state[p2][qmap].clone() @@ -348,7 +359,7 @@ def test_optimizer8bit(dim1, dim2, gtype, 
optim_name): ) torch.testing.assert_close(s1cpy, s1) - num_not_close = (torch.isclose(torch_optimizer.state[p1][name1], s1, atol=atol, rtol=rtol) == 0) + num_not_close = torch.isclose(torch_optimizer.state[p1][name1], s1, atol=atol, rtol=rtol) == 0 assert num_not_close.sum().item() < 20 # since Lion can have pretty noisy updates where things lie at the boundary # allow up to 5 errors for Lion @@ -395,15 +406,11 @@ def test_adam_percentile_clipping(dim1, dim2, gtype, optim_bits): for i in range(50): step += 1 - g1 = torch.randn(dim1, dim2, device="cuda", dtype=gtype) * 0.1 + ( - 0.01 * i - ) + g1 = torch.randn(dim1, dim2, device="cuda", dtype=gtype) * 0.1 + (0.01 * i) g2 = g1.clone() p2.grad = g2 - current_gnorm, clip_val, gnorm_scale = F.percentile_clipping( - g1, gnorm_vec, step, 5 - ) + current_gnorm, clip_val, gnorm_scale = F.percentile_clipping(g1, gnorm_vec, step, 5) g1 = (g1.float() * gnorm_scale).to(gtype) p1.grad = g1 @@ -497,8 +504,8 @@ def test_benchmark_blockwise(dim1, dim2, gtype, optim_name): @pytest.mark.parametrize("dim1", [2 * 1024], ids=id_formatter("dim1")) @pytest.mark.parametrize("gtype", [torch.float16], ids=describe_dtype) -@pytest.mark.parametrize("optim_name", ['paged_adamw'], ids=id_formatter("optim_name")) -@pytest.mark.parametrize("mode", ['bnb'], ids=id_formatter("mode")) +@pytest.mark.parametrize("optim_name", ["paged_adamw"], ids=id_formatter("optim_name")) +@pytest.mark.parametrize("mode", ["bnb"], ids=id_formatter("mode")) @pytest.mark.benchmark def test_stream_optimizer_bench(dim1, gtype, optim_name, mode): layers1 = torch.nn.Sequential(*torch.nn.ModuleList([torch.nn.Linear(dim1, dim1) for i in range(10)])) @@ -506,24 +513,24 @@ def test_stream_optimizer_bench(dim1, gtype, optim_name, mode): layers1 = layers1.cuda() large_tensor = None - if mode == 'torch': + if mode == "torch": optim = str2optimizers[optim_name][0](layers1.parameters()) else: optim = str2optimizers[optim_name][1](layers1.parameters()) # 12 GB - large_tensor = torch.empty((int(4.5e9),), device='cuda') + large_tensor = torch.empty((int(4.5e9),), device="cuda") torch.cuda.synchronize() time.sleep(5) num_batches = 5 - batches = torch.randn(num_batches, 128, dim1, device='cuda').to(gtype) - lbls = torch.randint(0, 10, size=(num_batches,128)).cuda() + batches = torch.randn(num_batches, 128, dim1, device="cuda").to(gtype) + lbls = torch.randint(0, 10, size=(num_batches, 128)).cuda() for i in range(num_batches): print(i) b = batches[i] - if i ==2: + if i == 2: torch.cuda.synchronize() t0 = time.time() diff --git a/tests/test_triton.py b/tests/test_triton.py index 218a533d5..3624fb5e9 100644 --- a/tests/test_triton.py +++ b/tests/test_triton.py @@ -7,15 +7,18 @@ from tests.helpers import TRUE_FALSE -@pytest.mark.skipif(not is_triton_available() or not torch.cuda.is_available() or not torch.cuda.get_device_capability()[0] >= 8, - reason="This test requires triton and a GPU with compute capability 8.0 or higher.") +@pytest.mark.skipif( + not is_triton_available() or not torch.cuda.is_available() or not torch.cuda.get_device_capability()[0] >= 8, + reason="This test requires triton and a GPU with compute capability 8.0 or higher.", +) @pytest.mark.parametrize("vector_wise_quantization", TRUE_FALSE) def test_switchback(vector_wise_quantization): for dim in [83]: for batch in [13]: - standard = torch.nn.Linear(dim, 4 * dim).cuda().half() - switchback = SwitchBackLinear(dim, 4 * dim, vector_wise_quantization=vector_wise_quantization).cuda().half() + switchback = ( + SwitchBackLinear(dim, 4 * dim, 
vector_wise_quantization=vector_wise_quantization).cuda().half() + ) baseline = Linear8bitLt(dim, 4 * dim).cuda().half() switchback.weight.data.copy_(standard.weight) switchback.bias.data.copy_(standard.bias) @@ -38,23 +41,23 @@ def test_switchback(vector_wise_quantization): err_sb = (out_standard - out_sb).abs().mean() err_baseline = (out_standard - out_baseline).abs().mean() - print('OUT', err_sb, err_baseline) + print("OUT", err_sb, err_baseline) assert err_sb < 2 * err_baseline err_sb = (standard.bias.grad - switchback.bias.grad).abs().mean() err_baseline = (standard.bias.grad - baseline.bias.grad).abs().mean() - print('GW2', err_sb, err_baseline) + print("GW2", err_sb, err_baseline) assert err_sb < 2 * err_baseline err_sb = (standard.weight.grad - switchback.weight.grad).abs().mean() err_baseline = (standard.weight.grad - baseline.weight.grad).abs().mean() - print('GW1', err_sb, err_baseline) + print("GW1", err_sb, err_baseline) assert err_sb < 2 * err_baseline err_sb = (x1.grad - x2.grad).abs().mean() err_baseline = (x1.grad - x3.grad).abs().mean() - print('GX1', err_sb, err_baseline) + print("GX1", err_sb, err_baseline) assert err_sb < 2 * err_baseline From b03ce0e0be74888e3a604ec749b761621a6c8407 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Wed, 13 Mar 2024 16:36:25 +0200 Subject: [PATCH 090/112] Update git-blame-ignore-revs --- .git-blame-ignore-revs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index fc44037d8..d953c93dd 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -9,3 +9,6 @@ ea7c14f8ef64924f2d0ff80df3cdabf2c7299848 # format tests/linear_4bit.py 34735ba89de8235ea9da6ef409f814dcea9e2038 + +# Reformat with ruff-format +5a4263f4dc05fe8f78f4111beab9f68a81deeab1 From 8706830f56e339d2f2eb4baaf43e0b08bcd75cd5 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Wed, 13 Mar 2024 17:53:28 +0200 Subject: [PATCH 091/112] Fix some bad types --- bitsandbytes/nn/modules.py | 8 ++++---- bitsandbytes/utils.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index e1cc6600d..ec14e5940 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -658,8 +658,8 @@ class Linear8bitLt(nn.Linear): def __init__( self, - input_features, - output_features, + input_features: int, + output_features: int, bias=True, has_fp16_weights=True, memory_efficient_backward=False, @@ -671,9 +671,9 @@ def __init__( Initialize Linear8bitLt class. Args: - input_features (`str`): + input_features (`int`): Number of input features of the linear layer. - output_features (`str`): + output_features (`int`): Number of output features of the linear layer. bias (`bool`, defaults to `True`): Whether the linear class uses the bias term as well. diff --git a/bitsandbytes/utils.py b/bitsandbytes/utils.py index 48c7fc82d..0229e59e2 100644 --- a/bitsandbytes/utils.py +++ b/bitsandbytes/utils.py @@ -140,7 +140,7 @@ def replace_linear( List of modules names not to convert. Defaults to `lm_head`. copy_weights (`bool`): Copy the weights from the old linear module to the new one - post_processing_fun_name (`str`): + post_processing_function (`str`): A function name of the replacement linear class that is called after processing. 
""" From 3ec3dd26655927b109b314545f3448516d69a770 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Wed, 13 Mar 2024 17:56:49 +0200 Subject: [PATCH 092/112] Fix type documentation for optimizer `args` --- bitsandbytes/optim/adagrad.py | 12 ++++++------ bitsandbytes/optim/adam.py | 24 ++++++++++++------------ bitsandbytes/optim/adamw.py | 24 ++++++++++++------------ bitsandbytes/optim/lamb.py | 12 ++++++------ bitsandbytes/optim/lars.py | 12 ++++++------ bitsandbytes/optim/lion.py | 24 ++++++++++++------------ bitsandbytes/optim/optimizer.py | 8 ++++---- bitsandbytes/optim/rmsprop.py | 12 ++++++------ bitsandbytes/optim/sgd.py | 12 ++++++------ 9 files changed, 70 insertions(+), 70 deletions(-) diff --git a/bitsandbytes/optim/adagrad.py b/bitsandbytes/optim/adagrad.py index aace548fa..7459dece1 100644 --- a/bitsandbytes/optim/adagrad.py +++ b/bitsandbytes/optim/adagrad.py @@ -38,8 +38,8 @@ def __init__( The epsilon value prevents division by zero in the optimizer. optim_bits (`int`, defaults to 32): The number of bits of the optimizer state. - args (`dict`, defaults to `None`): - A dictionary with additional arguments. + args (`object`, defaults to `None`): + An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. percentile_clipping (`int`, defaults to 100): @@ -105,8 +105,8 @@ def __init__( The epsilon value prevents division by zero in the optimizer. optim_bits (`int`, defaults to 8): The number of bits of the optimizer state. - args (`dict`, defaults to `None`): - A dictionary with additional arguments. + args (`object`, defaults to `None`): + An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. percentile_clipping (`int`, defaults to 100): @@ -173,8 +173,8 @@ def __init__( The epsilon value prevents division by zero in the optimizer. optim_bits (`int`, defaults to 32): The number of bits of the optimizer state. - args (`dict`, defaults to `None`): - A dictionary with additional arguments. + args (`object`, defaults to `None`): + An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. percentile_clipping (`int`, defaults to 100): diff --git a/bitsandbytes/optim/adam.py b/bitsandbytes/optim/adam.py index d8ffca63e..740db26ac 100644 --- a/bitsandbytes/optim/adam.py +++ b/bitsandbytes/optim/adam.py @@ -47,8 +47,8 @@ def __init__( Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. optim_bits (`int`, defaults to 32): The number of bits of the optimizer state. - args (`dict`, defaults to `None`): - A dictionary with additional arguments. + args (`object`, defaults to `None`): + An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. percentile_clipping (`int`, defaults to 100): @@ -108,8 +108,8 @@ def __init__( Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. optim_bits (`int`, defaults to 32): The number of bits of the optimizer state. - args (`dict`, defaults to `None`): - A dictionary with additional arguments. + args (`object`, defaults to `None`): + An object with additional arguments. 
min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. percentile_clipping (`int`, defaults to 100): @@ -169,8 +169,8 @@ def __init__( Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. optim_bits (`int`, defaults to 32): The number of bits of the optimizer state. - args (`dict`, defaults to `None`): - A dictionary with additional arguments. + args (`object`, defaults to `None`): + An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. percentile_clipping (`int`, defaults to 100): @@ -230,8 +230,8 @@ def __init__( Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. optim_bits (`int`, defaults to 32): The number of bits of the optimizer state. - args (`dict`, defaults to `None`): - A dictionary with additional arguments. + args (`object`, defaults to `None`): + An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. percentile_clipping (`int`, defaults to 100): @@ -291,8 +291,8 @@ def __init__( Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. optim_bits (`int`, defaults to 32): The number of bits of the optimizer state. - args (`dict`, defaults to `None`): - A dictionary with additional arguments. + args (`object`, defaults to `None`): + An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. percentile_clipping (`int`, defaults to 100): @@ -352,8 +352,8 @@ def __init__( Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. optim_bits (`int`, defaults to 32): The number of bits of the optimizer state. - args (`dict`, defaults to `None`): - A dictionary with additional arguments. + args (`object`, defaults to `None`): + An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. percentile_clipping (`int`, defaults to 100): diff --git a/bitsandbytes/optim/adamw.py b/bitsandbytes/optim/adamw.py index fa51458fd..4bf3f6436 100644 --- a/bitsandbytes/optim/adamw.py +++ b/bitsandbytes/optim/adamw.py @@ -39,8 +39,8 @@ def __init__( Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. optim_bits (`int`, defaults to 32): The number of bits of the optimizer state. - args (`dict`, defaults to `None`): - A dictionary with additional arguments. + args (`object`, defaults to `None`): + An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. percentile_clipping (`int`, defaults to 100): @@ -100,8 +100,8 @@ def __init__( Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. optim_bits (`int`, defaults to 32): The number of bits of the optimizer state. - args (`dict`, defaults to `None`): - A dictionary with additional arguments. 
+ args (`object`, defaults to `None`): + An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. percentile_clipping (`int`, defaults to 100): @@ -161,8 +161,8 @@ def __init__( Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. optim_bits (`int`, defaults to 32): The number of bits of the optimizer state. - args (`dict`, defaults to `None`): - A dictionary with additional arguments. + args (`object`, defaults to `None`): + An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. percentile_clipping (`int`, defaults to 100): @@ -221,8 +221,8 @@ def __init__( Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. optim_bits (`int`, defaults to 32): The number of bits of the optimizer state. - args (`dict`, defaults to `None`): - A dictionary with additional arguments. + args (`object`, defaults to `None`): + An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. percentile_clipping (`int`, defaults to 100): @@ -281,8 +281,8 @@ def __init__( Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. optim_bits (`int`, defaults to 32): The number of bits of the optimizer state. - args (`dict`, defaults to `None`): - A dictionary with additional arguments. + args (`object`, defaults to `None`): + An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. percentile_clipping (`int`, defaults to 100): @@ -341,8 +341,8 @@ def __init__( Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. optim_bits (`int`, defaults to 32): The number of bits of the optimizer state. - args (`dict`, defaults to `None`): - A dictionary with additional arguments. + args (`object`, defaults to `None`): + An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. percentile_clipping (`int`, defaults to 100): diff --git a/bitsandbytes/optim/lamb.py b/bitsandbytes/optim/lamb.py index ec829ee85..8d29cbbfe 100644 --- a/bitsandbytes/optim/lamb.py +++ b/bitsandbytes/optim/lamb.py @@ -45,8 +45,8 @@ def __init__( Whether to use the AdamW variant. optim_bits (`int`, defaults to 32): The number of bits of the optimizer state. - args (`dict`, defaults to `None`): - A dictionary with additional arguments. + args (`object`, defaults to `None`): + An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. percentile_clipping (`int`, defaults to 100): @@ -109,8 +109,8 @@ def __init__( Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. adam_w_mode (`bool`, defaults to `True`): Whether to use the AdamW variant. - args (`dict`, defaults to `None`): - A dictionary with additional arguments. 
+ args (`object`, defaults to `None`): + An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. percentile_clipping (`int`, defaults to 100): @@ -173,8 +173,8 @@ def __init__( Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. adam_w_mode (`bool`, defaults to `True`): Whether to use the AdamW variant. - args (`dict`, defaults to `None`): - A dictionary with additional arguments. + args (`object`, defaults to `None`): + An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. percentile_clipping (`int`, defaults to 100): diff --git a/bitsandbytes/optim/lars.py b/bitsandbytes/optim/lars.py index 63c062988..90c3686fe 100644 --- a/bitsandbytes/optim/lars.py +++ b/bitsandbytes/optim/lars.py @@ -41,8 +41,8 @@ def __init__( Whether to use Nesterov momentum. optim_bits (`int`, defaults to 32): The number of bits of the optimizer state. - args (`dict`, defaults to `None`): - A dictionary with additional arguments. + args (`object`, defaults to `None`): + An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. percentile_clipping (`int`, defaults to 100): @@ -98,8 +98,8 @@ def __init__( The weight decay value for the optimizer. nesterov (`bool`, defaults to `False`): Whether to use Nesterov momentum. - args (`dict`, defaults to `None`): - A dictionary with additional arguments. + args (`object`, defaults to `None`): + An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. percentile_clipping (`int`, defaults to 100): @@ -155,8 +155,8 @@ def __init__( The weight decay value for the optimizer. nesterov (`bool`, defaults to `False`): Whether to use Nesterov momentum. - args (`dict`, defaults to `None`): - A dictionary with additional arguments. + args (`object`, defaults to `None`): + An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. percentile_clipping (`int`, defaults to 100): diff --git a/bitsandbytes/optim/lion.py b/bitsandbytes/optim/lion.py index 9f0f4a8a9..2e4163694 100644 --- a/bitsandbytes/optim/lion.py +++ b/bitsandbytes/optim/lion.py @@ -33,8 +33,8 @@ def __init__( The weight decay value for the optimizer. optim_bits (`int`, defaults to 32): The number of bits of the optimizer state. - args (`dict`, defaults to `None`): - A dictionary with additional arguments. + args (`object`, defaults to `None`): + An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. percentile_clipping (`int`, defaults to 100): @@ -85,8 +85,8 @@ def __init__( The beta values are the decay rates of the first and second-order moment of the optimizer. weight_decay (`float`, defaults to 0): The weight decay value for the optimizer. - args (`dict`, defaults to `None`): - A dictionary with additional arguments. + args (`object`, defaults to `None`): + An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. 
percentile_clipping (`int`, defaults to 100): @@ -137,8 +137,8 @@ def __init__( The beta values are the decay rates of the first and second-order moment of the optimizer. weight_decay (`float`, defaults to 0): The weight decay value for the optimizer. - args (`dict`, defaults to `None`): - A dictionary with additional arguments. + args (`object`, defaults to `None`): + An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. percentile_clipping (`int`, defaults to 100): @@ -191,8 +191,8 @@ def __init__( The weight decay value for the optimizer. optim_bits (`int`, defaults to 32): The number of bits of the optimizer state. - args (`dict`, defaults to `None`): - A dictionary with additional arguments. + args (`object`, defaults to `None`): + An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. percentile_clipping (`int`, defaults to 100): @@ -242,8 +242,8 @@ def __init__( The weight decay value for the optimizer. optim_bits (`int`, defaults to 32): The number of bits of the optimizer state. - args (`dict`, defaults to `None`): - A dictionary with additional arguments. + args (`object`, defaults to `None`): + An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. percentile_clipping (`int`, defaults to 100): @@ -293,8 +293,8 @@ def __init__( The weight decay value for the optimizer. optim_bits (`int`, defaults to 32): The number of bits of the optimizer state. - args (`dict`, defaults to `None`): - A dictionary with additional arguments. + args (`object`, defaults to `None`): + An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. percentile_clipping (`int`, defaults to 100): diff --git a/bitsandbytes/optim/optimizer.py b/bitsandbytes/optim/optimizer.py index 43ebbb24d..f1e60e5e7 100644 --- a/bitsandbytes/optim/optimizer.py +++ b/bitsandbytes/optim/optimizer.py @@ -373,8 +373,8 @@ def __init__( The weight decay value for the optimizer. optim_bits (`int`, defaults to 32): The number of bits of the optimizer state. - args (`dict`, defaults to `None`): - A dictionary with additional arguments. + args (`object`, defaults to `None`): + An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. percentile_clipping (`int`, defaults to 100): @@ -596,8 +596,8 @@ def __init__( The weight decay value for the optimizer. optim_bits (`int`, defaults to 32): The number of bits of the optimizer state. - args (`dict`, defaults to `None`): - A dictionary with additional arguments. + args (`object`, defaults to `None`): + An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. percentile_clipping (`int`, defaults to 100): diff --git a/bitsandbytes/optim/rmsprop.py b/bitsandbytes/optim/rmsprop.py index 659617654..25611309b 100644 --- a/bitsandbytes/optim/rmsprop.py +++ b/bitsandbytes/optim/rmsprop.py @@ -41,8 +41,8 @@ def __init__( Whether the gradients are normalized by the variance. If `True`, it can help training at the expense of additional compute. 
optim_bits (`int`, defaults to 32): The number of bits of the optimizer state. - args (`dict`, defaults to `None`): - A dictionary with additional arguments. + args (`object`, defaults to `None`): + An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. percentile_clipping (`int`, defaults to 100): @@ -104,8 +104,8 @@ def __init__( Whether the gradients are normalized by the variance. If `True`, it can help training at the expense of additional compute. optim_bits (`int`, defaults to 32): The number of bits of the optimizer state. - args (`dict`, defaults to `None`): - A dictionary with additional arguments. + args (`object`, defaults to `None`): + An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. percentile_clipping (`int`, defaults to 100): @@ -167,8 +167,8 @@ def __init__( Whether the gradients are normalized by the variance. If `True`, it can help training at the expense of additional compute. optim_bits (`int`, defaults to 32): The number of bits of the optimizer state. - args (`dict`, defaults to `None`): - A dictionary with additional arguments. + args (`object`, defaults to `None`): + An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. percentile_clipping (`int`, defaults to 100): diff --git a/bitsandbytes/optim/sgd.py b/bitsandbytes/optim/sgd.py index 0f0b12e4b..ec18f036c 100644 --- a/bitsandbytes/optim/sgd.py +++ b/bitsandbytes/optim/sgd.py @@ -38,8 +38,8 @@ def __init__( Whether to use Nesterov momentum. optim_bits (`int`, defaults to 32): The number of bits of the optimizer state. - args (`dict`, defaults to `None`): - A dictionary with additional arguments. + args (`object`, defaults to `None`): + An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. percentile_clipping (`int`, defaults to 100): @@ -94,8 +94,8 @@ def __init__( The weight decay value for the optimizer. nesterov (`bool`, defaults to `False`): Whether to use Nesterov momentum. - args (`dict`, defaults to `None`): - A dictionary with additional arguments. + args (`object`, defaults to `None`): + An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. percentile_clipping (`int`, defaults to 100): @@ -150,8 +150,8 @@ def __init__( The weight decay value for the optimizer. nesterov (`bool`, defaults to `False`): Whether to use Nesterov momentum. - args (`dict`, defaults to `None`): - A dictionary with additional arguments. + args (`object`, defaults to `None`): + An object with additional arguments. min_8bit_size (`int`, defaults to 4096): The minimum number of elements of the parameter tensors for 8-bit optimization. 
percentile_clipping (`int`, defaults to 100): From 0c6dda0842a8ee463518aa547fa0e4ab36b233db Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Wed, 13 Mar 2024 18:10:10 +0200 Subject: [PATCH 093/112] Mark some optimizer update arguments as Noneable (they were being called with Nones) --- bitsandbytes/functional.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index 8fa8f2f60..bb6a04892 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -1618,18 +1618,18 @@ def optimizer_update_8bit( g: Tensor, p: Tensor, state1: Tensor, - state2: Tensor, + state2: Optional[torch.Tensor], beta1: float, beta2: float, eps: float, step: int, lr: float, qmap1: Tensor, - qmap2: Tensor, + qmap2: Optional[torch.Tensor], max1: Tensor, - max2: Tensor, + max2: Optional[torch.Tensor], new_max1: Tensor, - new_max2: Tensor, + new_max2: Optional[torch.Tensor], weight_decay: float = 0.0, gnorm_scale: float = 1.0, unorm_vec: Optional[torch.Tensor] = None, @@ -1751,16 +1751,16 @@ def optimizer_update_8bit_blockwise( g: Tensor, p: Tensor, state1: Tensor, - state2: Tensor, + state2: Optional[torch.Tensor], beta1: float, beta2: float, eps: float, step: int, lr: float, qmap1: Tensor, - qmap2: Tensor, + qmap2: Optional[torch.Tensor], absmax1: Tensor, - absmax2: Tensor, + absmax2: Optional[torch.Tensor], weight_decay: float = 0.0, gnorm_scale: float = 1.0, skip_zeros=False, From 054837684e8c4e3ad3ef74919a71b906cde77700 Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Fri, 15 Mar 2024 12:41:33 -0700 Subject: [PATCH 094/112] [docs] refine optimizers, integrations, etc (#1125) * optim, integration * toctree * feedback --- docs/source/_toctree.yml | 6 +- docs/source/explanations/optimizers.mdx | 51 ++++++ docs/source/{ => explanations}/resources.mdx | 0 docs/source/index.mdx | 18 +- docs/source/installation.mdx | 2 +- docs/source/integrations.mdx | 122 +++++++++----- docs/source/optimizers.mdx | 164 ++++--------------- docs/source/reference/quantization.mdx | 13 -- 8 files changed, 179 insertions(+), 197 deletions(-) create mode 100644 docs/source/explanations/optimizers.mdx rename docs/source/{ => explanations}/resources.mdx (100%) delete mode 100644 docs/source/reference/quantization.mdx diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 87c4242de..2184cce8c 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -22,12 +22,12 @@ title: FAQs - title: Explanation sections: - - local: resources + - local: explanations/optimizers + title: 8-bit optimizers + - local: explanations/resources title: Papers, resources & how to cite - title: API reference sections: - - local: reference/quantization - title: Quantization - title: Optimizers sections: - local: reference/optim/optim_overview diff --git a/docs/source/explanations/optimizers.mdx b/docs/source/explanations/optimizers.mdx new file mode 100644 index 000000000..327938e54 --- /dev/null +++ b/docs/source/explanations/optimizers.mdx @@ -0,0 +1,51 @@ +# 8-bit optimizers + +Stateful optimizers maintain gradient statistics over time, for example, the exponentially smoothed sum (SGD with momentum) or squared sum (Adam) of past gradient values. This state can be used to accelerate optimization compared to plain stochastic gradient descent, but uses memory that might otherwise be allocated to model parameters. As a result, this limits the maximum size of models that can be trained in practice. 
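To get a feel for the numbers, here is a rough back-of-the-envelope sketch. Adam keeps two statistics per parameter, so 32-bit states cost 8 bytes per parameter while 8-bit states cost roughly 2 bytes; the 3B parameter count below is only an illustrative assumption, not a particular model.

```py
# Back-of-the-envelope memory cost of Adam's two per-parameter states.
# The parameter count is an illustrative assumption, not a specific model.
num_params = 3_000_000_000

adam_32bit_states = num_params * 2 * 4  # two float32 states -> 8 bytes per parameter
adam_8bit_states = num_params * 2 * 1   # two int8 states    -> ~2 bytes per parameter

print(f"32-bit Adam states: {adam_32bit_states / 1024**3:.1f} GiB")
print(f" 8-bit Adam states: {adam_8bit_states / 1024**3:.1f} GiB")
```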
Now take a look at the biggest models that can be trained with 8-bit optimizers. + +
[figure: Depending on your GPU size, you can train a much larger model with an 8-bit optimizer.]
+ +bitsandbytes optimizers use 8-bit statistics, while maintaining the performance levels of using 32-bit optimizer states. + +To overcome the resulting computational, quantization and stability challenges, 8-bit optimizers have three components: + +1. Block-wise quantization: divides input tensors into smaller blocks that are independently quantized, isolating outliers and distributing the error more equally over all bits. Each block is processed in parallel across cores, yielding faster optimization and high precision quantization. +2. Dynamic quantization: quantizes both small and large values with high precision. +3. Stable embedding layer: improves stability during optimization for models with word embeddings. + +With these components, performing an optimizer update with 8-bit states is straightforward. The 8-bit optimizer states are dequantized to 32-bit before you perform the update, and then the states are quantized back to 8-bit for storage. + +The 8-bit to 32-bit conversion happens element-by-element in registers, meaning no slow copies to GPU memory or additional temporary memory are needed to perform quantization and dequantization. For GPUs, this makes 8-bit optimizers much faster than regular 32-bit optimizers. + +
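As a minimal illustration of that dequantize-update-requantize loop, here is a toy momentum-SGD step in plain PyTorch. It uses a single per-tensor absmax scale and a linear 8-bit code, whereas the real bitsandbytes kernels are fused CUDA kernels that work block-wise with a dynamic 8-bit data type, so treat this strictly as a sketch of the idea.

```py
import torch

def toy_8bit_momentum_step(p, grad, q_state, absmax, lr=0.01, beta=0.9):
    # 1. dequantize the stored 8-bit state to 32-bit
    state = q_state.to(torch.float32) / 127 * absmax
    # 2. perform the regular 32-bit update
    state = beta * state + grad
    p = p - lr * state
    # 3. requantize the state to 8-bit for storage
    absmax = state.abs().max().clamp(min=1e-8)
    q_state = torch.round(state / absmax * 127).to(torch.int8)
    return p, q_state, absmax

p, grad = torch.randn(16), torch.randn(16)
q_state, absmax = torch.zeros(16, dtype=torch.int8), torch.tensor(1.0)
p, q_state, absmax = toy_8bit_momentum_step(p, grad, q_state, absmax)
```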
[figure: A comparison of memory and time saved using 8-bit and 32-bit optimizers.]
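To make the block-wise quantization component above more concrete, here is a small, hypothetical sketch of absmax quantization over independent blocks. The real implementation uses a dynamic (non-linear) 8-bit data type and fused CUDA kernels, and the block size of 256 is just an example value.

```py
import torch

def blockwise_absmax_quantize(x: torch.Tensor, block_size: int = 256):
    flat = x.flatten()
    pad = (-flat.numel()) % block_size
    flat = torch.nn.functional.pad(flat, (0, pad))
    blocks = flat.view(-1, block_size)
    # each block gets its own scale, so an outlier only affects its own block
    absmax = blocks.abs().max(dim=1, keepdim=True).values.clamp(min=1e-8)
    q = torch.round(blocks / absmax * 127).to(torch.int8)
    return q, absmax

def blockwise_absmax_dequantize(q, absmax, numel, shape):
    return (q.to(torch.float32) / 127 * absmax).flatten()[:numel].view(shape)

x = torch.randn(1000)
q, absmax = blockwise_absmax_quantize(x)
x_hat = blockwise_absmax_dequantize(q, absmax, x.numel(), x.shape)
print((x - x_hat).abs().max())  # small quantization error
```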
+ +## Stable embedding layer + +The stable embedding layer improves the training stability of the standard word embedding layer for NLP tasks. It addresses the challenge of non-uniform input distributions and mitigates extreme gradient variations. This means the stable embedding layer can support more aggressive quantization strategies without compromising training stability, and it can help achieve stable training outcomes, which is particularly important for models dealing with diverse and complex language data. + +There are three features of the stable embedding layer: + +- Initialization: utilizes Xavier uniform initialization to maintain consistent variance, reducing the likelihood of large gradients. +- Normalization: incorporates layer normalization before adding positional embeddings, aiding in output stability. +- Optimizer states: employs 32-bit optimizer states exclusively for this layer to enhance stability, while the rest of the model may use standard 16-bit precision. + +## Paged optimizers + +Paged optimizers are built on top of the [unified memory](https://developer.nvidia.com/blog/unified-memory-cuda-beginners/) feature of CUDA. Unified memory provides a single memory space the GPU and CPU can easily access. While this feature is not supported by PyTorch, it has been added to bitsandbytes. + +Paged optimizers works like regular CPU paging, which means that it *only becomes active if you run out of GPU memory*. When that happens, memory is transferred page-by-page from GPU to CPU. The memory is mapped, meaning that pages are pre-allocated on the CPU but they are not updated automatically. Pages are only updated if the memory is accessed or a swapping operation is launched. + +The unified memory feature is less efficient than regular asynchronous memory transfers, and you usually won't be able to get full PCIe memory bandwidth utilization. If you do a manual prefetch, transfer speeds can be high but still only about half or worse than the full PCIe memory bandwidth (tested on 16x lanes PCIe 3.0). + +This means performance depends highly on the particular use-case. For example, if you evict 1 GB of memory per forward-backward-optimizer loop, then you can expect about 50% of the PCIe bandwidth as time in the best case. So, 1 GB for PCIe 3.0 with 16x lanes would run at 16 GB/s, which is `1/(16*0.5) = 1/8 = 125ms` of overhead per optimizer step. Other overhead can be estimated for the particular use-case given a PCIe interface, lanes, and the memory evicted in each iteration. + +Compared to CPU offloading, a paged optimizer has zero overhead if all the memory fits onto the device and only some overhead if some of memory needs to be evicted. For offloading, you usually offload fixed parts of the model and need to off and onload all this memory with each iteration through the model (sometimes twice for both forward and backward pass). diff --git a/docs/source/resources.mdx b/docs/source/explanations/resources.mdx similarity index 100% rename from docs/source/resources.mdx rename to docs/source/explanations/resources.mdx diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 71b3d67bd..5943e7d1d 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -1,19 +1,13 @@ -# `bitsandbytes` +# bitsandbytes -The `bitsandbytes` library is a lightweight Python wrapper around CUDA custom functions, in particular 8-bit optimizers, matrix multiplication (LLM.int8()), and 8 + 4-bit quantization functions. 
+bitsandbytes enables accessible large language models via k-bit quantization for PyTorch. bitsandbytes provides three main features for dramatically reducing memory consumption for inference and training: -The library includes quantization primitives for 8-bit & 4-bit operations, through `bitsandbytes.nn.Linear8bitLt` and `bitsandbytes.nn.Linear4bit` and 8bit optimizers through `bitsandbytes.optim` module. - -There are ongoing efforts to support further hardware backends, i.e. Intel CPU + GPU, AMD GPU, Apple Silicon. Windows support is on its way as well. - -## API documentation - -- [Quantization](quantization) -- [Integrations](integrations) -- [Optimizers](optimizers) +* 8-bit optimizers uses block-wise quantization to maintain 32-bit performance at a small fraction of the memory cost. +* LLM.Int() or 8-bit quantization enables large language model inference with only half the required memory and without any performance degradation. This method is based on vector-wise quantization to quantize most features to 8-bits and separately treating outliers with 16-bit matrix multiplication. +* QLoRA or 4-bit quantization enables large language model training with several memory-saving techniques that don't compromise performance. This method quantizes a model to 4-bits and inserts a small set of trainable low-rank adaptation (LoRA) weights to allow training. # License -The majority of bitsandbytes is licensed under MIT, however portions of the project are available under separate license terms, as the parts adapted from Pytorch are licensed under the BSD license. +bitsandbytes is MIT licensed. We thank Fabio Cannizzo for his work on [FastBinarySearch](https://github.com/fabiocannizzo/FastBinarySearch) which we use for CPU quantization. diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index a63a6a93e..49d8b4ebd 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -21,7 +21,7 @@ To install from PyPI. pip install bitsandbytes ``` -## Alternative: Compiling from source +## Compile from source To compile from source, you need CMake >= **3.22.1** and Python >= **3.8** installed. Make sure you have a compiler installed to compile C++ (gcc, make, headers, etc.). For example, to install a compiler and CMake on Ubuntu: diff --git a/docs/source/integrations.mdx b/docs/source/integrations.mdx index 48b4d6060..4badece49 100644 --- a/docs/source/integrations.mdx +++ b/docs/source/integrations.mdx @@ -1,31 +1,89 @@ -# Transformers +# Integrations -With Transformers it's very easy to load any model in 4 or 8-bit, quantizing them on the fly with `bitsandbytes` primitives. +bitsandbytes is widely integrated with many of the libraries in the Hugging Face and wider PyTorch ecosystem. This guide provides a brief overview of the integrations and how to use bitsandbytes with them. For more details, you should refer to the linked documentation for each library. -Please review the [`bitsandbytes` section in the Transformers docs](https://huggingface.co/docs/transformers/main/en/quantization#bitsandbytes). +## Transformers -Details about the BitsAndBytesConfig can be found [here](https://huggingface.co/docs/transformers/v4.37.2/en/main_classes/quantization#transformers.BitsAndBytesConfig). +> [!TIP] +> Learn more in the bitsandbytes Transformers integration [guide](https://huggingface.co/docs/transformers/quantization#bitsandbytes). + +With Transformers, it's very easy to load any model in 4 or 8-bit and quantize them on the fly. 
To configure the quantization parameters, specify them in the [`~transformers.BitsAndBytesConfig`] class. + +For example, to load and quantize a model to 4-bits and use the bfloat16 data type for compute: > [!WARNING] -> **Beware: bf16 is the optimal compute data type!** -> -> If your hardware supports it, `bf16` is the optimal compute dtype. The default is `float32` for backward compatibility and numerical stability. `float16` often leads to numerical instabilities, but `bfloat16` provides the benefits of both worlds: numerical stability equivalent to float32, but combined with the memory footprint and significant computation speedup of a 16-bit data type. Therefore, be sure to check if your hardware supports `bf16` and configure it using the `bnb_4bit_compute_dtype` parameter in BitsAndBytesConfig: +> bfloat16 is the optimal compute data type if your hardware supports it. The default is float32 for backward compatibility and numerical stability, but it can often lead to numerical instabilities. bfloat16 provides the best of both worlds, numerical stability equivalent to float32, but combined with the memory footprint and significant computation speedup of a 16-bit data type. Make sure to check if your hardware supports bfloat16 and if it does, configure it using the `bnb_4bit_compute_dtype` parameter in [`~transformers.BitsAndBytesConfig`]! ```py -import torch -from transformers import BitsAndBytesConfig +from transformers import AutoModelForCausalLM, BitsAndBytesConfig quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16) +model_4bit = AutoModelForCausalLM.from_pretrained( + "bigscience/bloom-1b7", + device_map=device_map, + quantization_config=quantization_config, +) +``` + +### 8-bit optimizers + +You can use any of the 8-bit or paged optimizers with Transformers by passing them to the [`~transformers.Trainer`] class on initialization. All bitsandbytes optimizers are supported by passing the correct string in the [`~transformers.TrainingArguments`] `optim` parameter. For example, to load a [`~bitsandbytes.optim.PagedAdamW32bit`] optimizer: + +```py +from transformers import TrainingArguments, Trainer + +training_args = TrainingArguments( + ..., + optim="paged_adamw_32bit", +) +trainer = Trainer(model, training_args, ...) +trainer.train() +``` + +## PEFT + +> [!TIP] +> Learn more in the bitsandbytes PEFT integration [guide](https://huggingface.co/docs/peft/developer_guides/quantization#quantization). + +PEFT builds on the bitsandbytes Transformers integration, and extends it for training with a few more steps. Let's prepare the 4-bit model from the section above for training. + +Call the [`~peft.prepare_model_for_kbit_training`] method to prepare the model for training. This only works for Transformers models! + +```py +from peft import prepare_model_for_kbit_training + +model_4bit = prepare_model_for_kbit_training(model_4bit) ``` -# PEFT -With `PEFT`, you can use QLoRA out of the box with `LoraConfig` and a 4-bit base model. +Setup a [`~peft.LoraConfig`] to use QLoRA: + +```py +from peft import LoraConfig + +config = LoraConfig( + r=16, + lora_alpha=8, + target_modules="all-linear", + lora_dropout=0.05 + bias="none", + task_type="CAUSAL_LM" +) +``` -Please review the [bitsandbytes section in the PEFT docs](https://huggingface.co/docs/peft/developer_guides/quantization#quantize-a-model). +Now call the [`~peft.get_peft_model`] function on your model and config to create a trainable [`PeftModel`]. 
+ +```py +from peft import get_peft_model + +model = get_peft_model(model_4bit, config) +``` -# Accelerate +## Accelerate -Bitsandbytes is also easily usable from within Accelerate, where you can quantize any PyTorch model simply by passing a quantization config; e.g: +> [!TIP] +> Learn more in the bitsandbytes Accelerate integration [guide](https://huggingface.co/docs/accelerate/usage_guides/quantization). + +bitsandbytes is also easily usable from Accelerate and you can quantize any PyTorch model by passing a [`~accelerate.utils.BnbQuantizationConfig`] with your desired settings, and then calling the [`~accelerate.utils.load_and_quantize_model`] function to quantize it. ```py from accelerate import init_empty_weights @@ -55,37 +113,25 @@ quantized_model = load_and_quantize_model( ) ``` -For further details, e.g. model saving, cpu-offloading andfine-tuning, please review the [`bitsandbytes` section in the Accelerate docs](https://huggingface.co/docs/accelerate/en/usage_guides/quantization). - - - -# PyTorch Lightning and Lightning Fabric - -Bitsandbytes is available from within both -- [PyTorch Lightning](https://lightning.ai/docs/pytorch/stable/), a deep learning framework for professional AI researchers and machine learning engineers who need maximal flexibility without sacrificing performance at scale; -- and [Lightning Fabric](https://lightning.ai/docs/fabric/stable/), a fast and lightweight way to scale PyTorch models without boilerplate). - -Please review the [bitsandbytes section in the PyTorch Lightning docs](https://lightning.ai/docs/pytorch/stable/common/precision_intermediate.html#quantization-via-bitsandbytes). - - -# Lit-GPT +## PyTorch Lightning and Lightning Fabric -Bitsandbytes is integrated into [Lit-GPT](https://github.com/Lightning-AI/lit-gpt), a hackable implementation of state-of-the-art open-source large language models, based on Lightning Fabric, where it can be used for quantization during training, finetuning, and inference. +bitsandbytes is available from: -Please review the [bitsandbytes section in the Lit-GPT quantization docs](https://github.com/Lightning-AI/lit-gpt/blob/main/tutorials/quantize.md). +- [PyTorch Lightning](https://lightning.ai/docs/pytorch/stable/), a deep learning framework for professional AI researchers and machine learning engineers who need maximal flexibility without sacrificing performance at scale. +- [Lightning Fabric](https://lightning.ai/docs/fabric/stable/), a fast and lightweight way to scale PyTorch models without boilerplate. +Learn more in the bitsandbytes PyTorch Lightning integration [guide](https://lightning.ai/docs/pytorch/stable/common/precision_intermediate.html#quantization-via-bitsandbytes). -# Trainer for the optimizers +## Lit-GPT -You can use any of the 8-bit and/or paged optimizers by simple passing them to the `transformers.Trainer` class on initialization.All bnb optimizers are supported by passing the correct string in `TrainingArguments`'s `optim` attribute - e.g. (`paged_adamw_32bit`). +bitsandbytes is integrated with [Lit-GPT](https://github.com/Lightning-AI/lit-gpt), a hackable implementation of state-of-the-art open-source large language models. Lit-GPT is based on Lightning Fabric, and it can be used for quantization during training, finetuning, and inference. -See the [official API docs for reference](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Trainer). 
+Learn more in the bitsandbytes Lit-GPT integration [guide](https://github.com/Lightning-AI/lit-gpt/blob/main/tutorials/quantize.md). -Here we point out to relevant doc sections in transformers / peft / Trainer + very briefly explain how these are integrated: -e.g. for transformers state that you can load any model in 8-bit / 4-bit precision, for PEFT, you can use QLoRA out of the box with `LoraConfig` + 4-bit base model, for Trainer: all bnb optimizers are supported by passing the correct string in `TrainingArguments`'s `optim` attribute - e.g. (`paged_adamw_32bit`): +## Blog posts -# Blog posts +To learn in more detail about some of bitsandbytes integrations, take a look at the following blog posts: -- [Making LLMs even more accessible with `bitsandbytes`, 4-bit quantization and QLoRA](https://huggingface.co/blog/4bit-transformers-bitsandbytes) -- [A Gentle Introduction to 8-bit Matrix Multiplication for transformers at scale using Hugging Face Transformers, Accelerate and `bitsandbytes`](https://huggingface.co/blog/hf-bitsandbytes-integration) +- [Making LLMs even more accessible with bitsandbytes, 4-bit quantization and QLoRA](https://huggingface.co/blog/4bit-transformers-bitsandbytes) +- [A Gentle Introduction to 8-bit Matrix Multiplication for transformers at scale using Hugging Face Transformers, Accelerate and bitsandbytes](https://huggingface.co/blog/hf-bitsandbytes-integration) diff --git a/docs/source/optimizers.mdx b/docs/source/optimizers.mdx index 734cb2211..7d04f82b1 100644 --- a/docs/source/optimizers.mdx +++ b/docs/source/optimizers.mdx @@ -1,29 +1,14 @@ -# Introduction: 8-bit optimizers +# 8-bit optimizers -With 8-bit optimizers, larger models can be finetuned with the same GPU memory compared to standard 32-bit optimizer training. 8-bit optimizers are a drop-in replacement for regular optimizers, with the following properties: +With 8-bit optimizers, large models can be finetuned with 75% less GPU memory without losing any accuracy compared to training with standard 32-bit optimizers. The reduced memory requirements means 8-bit optimizers are 4x faster than a standard optimizer, and no hyperparameter tuning is required. -- Faster (e.g. 4x faster than regular Adam) -- 75% less memory, same performance -- No hyperparameter tuning needed +This guide will show you how to use 8-bit optimizers. -8-bit optimizers are mostly useful to finetune large models that did not fit into memory before. They also make it easier to pretrain larger models and have great synergy with sharded data parallelism. 8-bit Adam, for example, is already used across multiple teams in Facebook. This optimizer saves a ton of memory at no accuracy hit. +> [!WARNING] +> 8-bit optimizers reduce memory usage and accelerate optimization on a wide range of tasks. However, since 8-bit optimizers only reduce memory proportional to the number of parameters, models that use large amounts of activation memory, such as convolutional networks, don't really benefit from 8-bit optimizers. 8-bit optimizers are most beneficial for training or finetuning models with many parameters on highly memory-constrained GPUs. -Generally, our 8-bit optimizers have three components: -1. **block-wise quantization** isolates outliers and distributes the error more equally over all bits, -2. **dynamic quantization** quantizes both small and large values with high precision, -3. a **stable embedding layer** improves stability during optimization for models with word embeddings. 
+8-bit optimizers are a drop-in replacement for regular optimizers which means they also accept the same arguments as a regular optimizer. For NLP models, it is recommended to use the [`~nn.StableEmbedding`] class to improve stability and results. -With these components, performing an optimizer update with 8-bit states is straightforward and for GPUs, this makes 8-bit optimizers way faster than regular 32-bit optimizers. [Further details below](#research-background) - -We feature 8-bit `Adagrad`, `Adam`, `AdamW`, `LAMB`, `LARS`, `Lion`, `RMSprop` and `SGD` (momentum). - -## Caveats - -8-bit optimizers reduce the memory footprint and accelerate optimization on a wide range of tasks. However, since 8-bit optimizers reduce only the memory footprint proportional to the number of parameters, **models that use large amounts of activation memory, such as convolutional networks, have few benefits from using 8-bit optimizers**. Thus, 8-bit optimizers are most beneficial for training or finetuning models with many parameters on highly memory-constrained GPUs. - -## Usage - -It only requires a two-line code change to get started. ```diff import bitsandbytes as bnb @@ -35,112 +20,29 @@ import bitsandbytes as bnb + bnb.nn.StableEmbedding(...) ``` -The arguments passed are the same as standard Adam. For NLP models we recommend to also use the StableEmbedding layers which improves results and helps with stable 8-bit optimization. +By default, all parameter tensors with less than 4096 elements are kept at 32-bits even if you initialize those parameters with 8-bit optimizers. This is done because small tensors do not save much memory and often contain highly variable parameters (biases) or parameters that require high precision (batch norm, layer norm). -Note that by default all parameter tensors with less than 4096 elements are kept at 32-bit even if you initialize those parameters with 8-bit optimizers. This is done since such small tensors do not save much memory and often contain highly variable parameters (biases) or parameters that require high precision (batch norm, layer norm). You can change this behavior like so: +You can change this value with the `min_8bit_size` parameter. For example, if you want to optimize parameters to 8-bits only if the minimum size is 16384 values (it is recommended to use multiples of 4096): ```py -# For parameter tensors with less than 16384 values are optimized in 32-bit -# it is recommended to use multiplies of 4096: +import bitsandbytes as bnb + adam = bnb.optim.Adam8bit(model.parameters(), min_8bit_size=16384) ``` -Some more examples of how you can replace your old optimizer with the 8-bit optimizer: +Other parameters you can configure include the learning rate (`lr`), the decay rates (`betas`), the number of bits of the optimizer state (`optim_bits`), and percentile clipping (`percentile_clipping`) which can increase stability. 
For example, to initialize a 32-bit [`~bitsandbytes.optim.Adam`] optimizer with 5th percentile clipping: -```diff +```py import bitsandbytes as bnb -- adam = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.995)) # comment out old optimizer -+ adam = bnb.optim.Adam8bit(model.parameters(), lr=0.001, betas=(0.9, 0.995)) # add bnb optimizer - -# use 32-bit Adam with 5th percentile clipping -+ adam = bnb.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.995), optim_bits=32, percentile_clipping=5) -- adam = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.995)) # comment out old optimizer +adam = bnb.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.995), optim_bits=32, percentile_clipping=5) ``` -## Overview of supported 8-bit optimizers - -Currently, `bitsandbytes` supports the following optimizers: - -- `Adagrad`, `Adagrad8bit`, `Adagrad32bit` -- `Adam`, `Adam8bit`, `Adam32bit`, `PagedAdam`, `PagedAdam8bit`, `PagedAdam32bit` -- `AdamW`, `AdamW8bit`, `AdamW32bit`, `PagedAdamW`, `PagedAdamW8bit`, `PagedAdamW32bit` -- `LAMB`, `LAMB8bit`, `LAMB32bit` -- `LARS`, `LARS8bit`, `LARS32bit`, `PytorchLARS` -- `Lion`, `Lion8bit`, `Lion32bit`, `PagedLion`, `PagedLion8bit`, `PagedLion32bit` -- `RMSprop`, `RMSprop8bit`, `RMSprop32bit` -- `SGD`, `SGD8bit`, `SGD32bit` - -Additionally, for cases in which you want to optimize some unstable parameters with 32-bit Adam and others with 8-bit Adam, you can use the `GlobalOptimManager`, [as explained in greater detail below](#optim_manager). - -Find the API docs [here](#optim_api_docs) (still under construction). - -## Overview of expected gains - -
-[figure: expected gains from 8-bit optimizers]
-
-See here an overview of the biggest models that can be trained based on optimizer usage:
-
-[figure: biggest models that can be trained based on optimizer usage]
- -### Research Background - -Stateful optimizers maintain gradient statistics over time, e.g. the exponentially smoothed sum (SGD with momentum) or squared sum (Adam) of past gradient values. This state can be used to accelerate optimization compared to plain stochastic gradient descent but uses memory that might otherwise be allocated to model parameters, thereby limiting the maximum size of models trained in practice. `bitsandbytes` optimizers use 8-bit statistics, while maintaining the performance levels of using 32-bit optimizer states. - -To overcome the resulting computational, quantization and stability challenges, 8-bit optimizers have three components: - -1. **Block-wise quantization** divides input tensors into smaller blocks that are independently quantized, therein isolating outliers and distributing the error more equally over all bits. Each block is processed in parallel across cores, yielding faster optimization and high precision quantization. -2. **Dynamic quantization**, which quantizes both small and large values with high precision and -3. a **stable embedding layer** improves stability during optimization for models with word embeddings. - -With these components, performing an optimizer update with 8-bit states is straightforward. We dequantize the 8-bit optimizer states to 32-bit, perform the update and then quantize the states back to 8-bit for storage. - -We do this 8-bit to 32-bit conversion element-by-element in registers, which means no slow copies to GPU memory or additional temporary memory are needed to perform quantization and dequantization. For GPUs, this makes 8-bit optimizers much faster than regular 32-bit optimizers. - -For more details, please refer to the paper [8-bit Optimizers via Block-wise Quantization](https://arxiv.org/abs/2110.02861). - -## Stable Embedding Layer +## Optimize unstable parameters -The Stable Embedding Layer enhances the standard word embedding layer for improved training stability in NLP tasks. It addresses the challenge of non-uniform input distributions and mitigates extreme gradient variations, ensuring smoother training processes. +To optimize some unstable parameters with 32-bit Adam and others with 8-bit Adam, use the [`~bitsandbytes.optim.GlobalOptimManager`] class to override the specific hyperparameters for a particular layer. You'll need to: -#### Features: - -- **Initialization**: Utilizes Xavier uniform initialization to maintain consistent variance, reducing the likelihood of large gradients. -- **Normalization**: Incorporates layer normalization before adding positional embeddings, aiding in output stability. -- **Optimizer States**: Employs 32-bit optimizer states exclusively for this layer to enhance stability, while the rest of the model may use standard 16-bit precision. - -#### Benefits: - -- Designed to support more aggressive quantization strategies without compromising training stability. -- Helps in achieving stable training outcomes, particularly important for models dealing with diverse and complex language data. - -## Paged optimizers - -Paged optimizers are build on top of the [unified memory](https://developer.nvidia.com/blog/unified-memory-cuda-beginners/) feature of CUDA. This feature is not supported by PyTorch and we added it to `bitsandbytes`. - -It works like regular CPU paging, which means that it only becomes active _if one runs out of GPU memory_. Only then will the memory be transferred, page-by-page, from GPU to CPU. 
The memory is mapped, meaning that pages are preallocated on the CPU, but they are not updated automatically. They are only updated if the memory is accessed, or a swapping operation is launched. - -The unified memory feature is less efficient than regular asynchronous memory transfers. This means, one usually will not be able to get full PCIe memory bandwidth utilization. If one does a manual prefetch, transfer speeds can be high but still about half or worse than the full PCIe memory bandwidth (tested on 16x lanes PCIe 3.0). - -This all means performance depends highly on the particular use-case. If one evicts, say, 1 GB of memory per forward-backward-optimizer loop: One can expect about 50% of the PCIe bandwidth as time in the best case. So 1 GB for PCIe 3.0 with 16x lanes, which runs at 16 GB/s, is `1/(16*0.5) = 1/8 = 125ms` overhead per optimizer step. Other overhead can be estimated for the particular use-case given a PCIe interface, lanes, and the memory that is evicted in each iteration. - -Compared to CPU offloading, this has the advantage that there is zero overhead if all the memory fits into the device and only some overhead if some of memory needs to be evicted. For offloading, one would usually offload fixed parts of the model and need to off and onload all this memory with each iteration through the model (sometimes twice for both forward and backward pass). - -[Find more details in this discussion](https://github.com/TimDettmers/bitsandbytes/issues/962). - - -## `GlobalOptimManager`: How to override config hyperparameters for particular weights/parameters[[optim_manager]] - -If you want to optimize some unstable parameters with 32-bit Adam and others with 8-bit Adam, you can use the `GlobalOptimManager`. With this, we can also configure specific hyperparameters for particular layers, such as embedding layers. To do that, we need two things: - -1. Register the parameter while they are still on the CPU. -2. Override the config with the new desired hyperparameters (anytime, anywhere). - -For global overrides in many different places in your code you can do: +1. Register the parameters while they're on the CPU. ```py import torch @@ -149,23 +51,32 @@ import bitsandbytes as bnb mng = bnb.optim.GlobalOptimManager.get_instance() model = MyModel() -mng.register_parameters(model.parameters()) # 1. register parameters while still on CPU +mng.register_parameters(model.parameters()) +``` + +2. Override the config with the new desired hyperparameters. For example, let's override the `model.fc1.weight` layer to use 32-bit Adam. +> [!TIP] +> Check the optimizer API documentation for more information about other hyperparameters you can override. + +```py model = model.cuda() # use 8-bit optimizer states for all parameters adam = bnb.optim.Adam(model.parameters(), lr=0.001, optim_bits=8) -# 2a. override: the parameter model.fc1.weight now uses 32-bit Adam -mng.override_config(model.fc1.weight, 'optim_bits', 32) +# override the parameter model.fc1.weight now uses 32-bit Adam +mng.override_config(model.fc1.weight, "optim_bits", 32) +``` -# 2b. override: the two special layers use -# sparse optimization + different learning rate + different Adam betas +You can also override multiple layers at once by passing them as a list and the new hyperparameters as a dictionary. For example, let's override the `model.special.weight` and `model.also_special.weight` layers to use sparse optimization and a lower learning and decay rate. 
+ +```py mng.override_config([model.special.weight, model.also_special.weight], key_value_dict ={'is_sparse': True, 'lr': 1e-5, 'betas'=(0.9, 0.98)}) ``` -Possible options for the config override are: `betas, eps, weight_decay, lr, optim_bits, min_8bit_size, percentile_clipping, block_wise, max_unorm`. -For overrides for particular layers, we recommend overriding locally in each module. You can do this by passing the module, the parameter, and its attribute name to the GlobalOptimManager: +For a specific layer, we recommend overriding locally in each module. Pass the module, the parameter, and its attribute name to the [`~bitsandbytes.optim.GlobalOptimManager`]: + ```py class MyModule(torch.nn.Module): def __init__(d_in, d_out): @@ -178,13 +89,6 @@ class MyModule(torch.nn.Module): ``` -## API Docs[[optim_api_docs]] - -... under construction ... - -Here we'll provide further auto-generated API docs soon. Please feel free to contribute doc-strings for the respective optimizers, as `bitsandbytes` is a community effort. - -### StableEmbedding[[stable-emb-api]] +## Next steps -[[autodoc]] bitsandbytes.nn.StableEmbedding - - __init__ +For more conceptual details and explanation about 8-bit optimizers, take a look at the [8-bit optimizers](./explanations/optimizers) guide. diff --git a/docs/source/reference/quantization.mdx b/docs/source/reference/quantization.mdx deleted file mode 100644 index 3880cc089..000000000 --- a/docs/source/reference/quantization.mdx +++ /dev/null @@ -1,13 +0,0 @@ -# Quantization primitives - -Below you will find the docstring of the quantization primitives exposed in bitsandbytes. - -## Linear4bit (QLoRA)[[linear4bit]] - -[[autodoc]] bitsandbytes.nn.Linear4bit - - __init__ - -## Linear8bitLt[[linear8bit]] - -[[autodoc]] bitsandbytes.nn.Linear8bitLt - - __init__ From 3ccb1308dc078278d8d6f98349ee823a52471955 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Mon, 18 Mar 2024 14:14:20 -0400 Subject: [PATCH 095/112] fix diagnostics error within vscode on windows --- bitsandbytes/diagnostics/cuda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bitsandbytes/diagnostics/cuda.py b/bitsandbytes/diagnostics/cuda.py index f993dff7e..8974c6400 100644 --- a/bitsandbytes/diagnostics/cuda.py +++ b/bitsandbytes/diagnostics/cuda.py @@ -59,7 +59,7 @@ def find_cuda_libraries_in_path_list(paths_list_candidate: str) -> Iterable[Path for pth in dir.glob(lib_pattern): if pth.is_file(): yield pth - except PermissionError: + except (OSError, PermissionError): pass From c6e319072f3c1817460b441aa4135ce956b54e24 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 26 Mar 2024 22:11:08 +0100 Subject: [PATCH 096/112] Bump the major group with 3 updates (#1145) Updates the requirements on [pytest](https://github.com/pytest-dev/pytest), [pandas](https://github.com/pandas-dev/pandas) and [matplotlib](https://github.com/matplotlib/matplotlib) to permit the latest version. 
Updates `pytest` from 7.2.2 to 8.1.1 - [Release notes](https://github.com/pytest-dev/pytest/releases) - [Changelog](https://github.com/pytest-dev/pytest/blob/main/CHANGELOG.rst) - [Commits](https://github.com/pytest-dev/pytest/compare/7.2.2...8.1.1) Updates `pandas` to 2.2.1 - [Release notes](https://github.com/pandas-dev/pandas/releases) - [Commits](https://github.com/pandas-dev/pandas/compare/v2.2.0...v2.2.1) Updates `matplotlib` to 3.8.3 - [Release notes](https://github.com/matplotlib/matplotlib/releases) - [Commits](https://github.com/matplotlib/matplotlib/compare/v3.8.2...v3.8.3) --- updated-dependencies: - dependency-name: pytest dependency-type: direct:production update-type: version-update:semver-major dependency-group: major - dependency-name: pandas dependency-type: direct:development dependency-group: major - dependency-name: matplotlib dependency-type: direct:development dependency-group: major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements-ci.txt | 2 +- requirements-dev.txt | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/requirements-ci.txt b/requirements-ci.txt index e6e375ccb..39fa16e08 100644 --- a/requirements-ci.txt +++ b/requirements-ci.txt @@ -1,5 +1,5 @@ # Requirements used for GitHub actions -pytest==7.2.2 +pytest==8.1.1 einops==0.6.0 lion-pytorch==0.0.6 scipy==1.10.1; python_version < "3.9" diff --git a/requirements-dev.txt b/requirements-dev.txt index 7ede5b061..e112365ea 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,9 +1,9 @@ # Requirements used for local development setuptools>=63 -pytest~=7.2.2 +pytest~=8.1.1 einops~=0.6.0 wheel~=0.40.0 lion-pytorch~=0.0.6 scipy~=1.11.4 -pandas~=2.2.0 -matplotlib~=3.8.2 +pandas~=2.2.1 +matplotlib~=3.8.3 From 040526310ed1b502647510648464d2673de8ad63 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Tue, 26 Mar 2024 17:42:25 -0400 Subject: [PATCH 097/112] Add CUDA 12.4 to docs/install helper (#1136) * Add CUDA 12.4 download to utility script, docs * (ci) Add CUDA 12.4.0 build to workflow * Apply ruff format to install_cuda.py --- docs/source/installation.mdx | 2 +- install_cuda.py | 9 +++++++-- install_cuda.sh | 7 +++++-- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index 49d8b4ebd..d0dd7ba76 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -84,7 +84,7 @@ Then locally install the CUDA version you need with this script from bitsandbyte ```bash wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh # Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH -# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122, 123} +# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122, 123, 124} # EXPORT_TO_BASH in {0, 1} with 0=False and 1=True # For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc diff --git a/install_cuda.py b/install_cuda.py index 9e426cbd7..a5d09356d 100644 --- a/install_cuda.py +++ b/install_cuda.py @@ -17,6 +17,7 @@ "121": "https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda_12.1.1_530.30.02_linux.run", "122": "https://developer.download.nvidia.com/compute/cuda/12.2.2/local_installers/cuda_12.2.2_535.104.05_linux.run", "123": 
"https://developer.download.nvidia.com/compute/cuda/12.3.2/local_installers/cuda_12.3.2_545.23.08_linux.run", + "124": "https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_550.54.14_linux.run", } @@ -76,7 +77,9 @@ def main(): download_path = "/tmp" # default download path if len(sys.argv) < 2: - print("Usage: python install_cuda.py [user/system] [download_path]") + print( + "Usage: python install_cuda.py [user/system] [download_path]" + ) sys.exit(1) version = sys.argv[1] @@ -97,7 +100,9 @@ def main(): elif version in cuda_versions: install_cuda(version, base_path, download_path) else: - print(f"Invalid CUDA version: {version}. Available versions are: {', '.join(cuda_versions.keys())}") + print( + f"Invalid CUDA version: {version}. Available versions are: {', '.join(cuda_versions.keys())}" + ) sys.exit(1) diff --git a/install_cuda.sh b/install_cuda.sh index 8ffbc8478..2e7fe8ed2 100644 --- a/install_cuda.sh +++ b/install_cuda.sh @@ -11,7 +11,7 @@ URL120=https://developer.download.nvidia.com/compute/cuda/12.0.1/local_installer URL121=https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda_12.1.1_530.30.02_linux.run URL122=https://developer.download.nvidia.com/compute/cuda/12.2.2/local_installers/cuda_12.2.2_535.104.05_linux.run URL123=https://developer.download.nvidia.com/compute/cuda/12.3.2/local_installers/cuda_12.3.2_545.23.08_linux.run - +URL124=https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_550.54.14_linux.run CUDA_VERSION=$1 BASE_PATH=$2 @@ -57,8 +57,11 @@ if [[ -n "$CUDA_VERSION" ]]; then elif [[ "$CUDA_VERSION" -eq "123" ]]; then URL=$URL123 FOLDER=cuda-12.3 + elif [[ "$CUDA_VERSION" -eq "124" ]]; then + URL=$URL124 + FOLDER=cuda-12.4 else - echo "argument error: No cuda version passed as input. Choose among versions 92 to 123" + echo "argument error: No cuda version passed as input. Choose among versions 110 to 124" fi else echo "argument error: No cuda version passed as input. Choose among versions 92 to 123" From fd9d072e02b74348004f197e686e168448883a9e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 27 Mar 2024 18:32:04 +0100 Subject: [PATCH 098/112] Bump the minor-patch group with 4 updates (#1146) Updates the requirements on [einops](https://github.com/arogozhnikov/einops), [wheel](https://github.com/pypa/wheel), [lion-pytorch](https://github.com/lucidrains/lion-pytorch) and [scipy](https://github.com/scipy/scipy) to permit the latest version. 
Updates `einops` from 0.6.0 to 0.7.0 - [Release notes](https://github.com/arogozhnikov/einops/releases) - [Commits](https://github.com/arogozhnikov/einops/compare/v0.6.0...v0.7.0) Updates `wheel` to 0.43.0 - [Release notes](https://github.com/pypa/wheel/releases) - [Changelog](https://github.com/pypa/wheel/blob/main/docs/news.rst) - [Commits](https://github.com/pypa/wheel/compare/0.40.0...0.43.0) Updates `lion-pytorch` from 0.0.6 to 0.1.2 - [Release notes](https://github.com/lucidrains/lion-pytorch/releases) - [Commits](https://github.com/lucidrains/lion-pytorch/compare/0.0.6...0.1.2) Updates `scipy` from 1.11.4 to 1.12.0 - [Release notes](https://github.com/scipy/scipy/releases) - [Commits](https://github.com/scipy/scipy/compare/v1.11.4...v1.12.0) --- updated-dependencies: - dependency-name: einops dependency-type: direct:production update-type: version-update:semver-minor dependency-group: minor-patch - dependency-name: wheel dependency-type: direct:development dependency-group: minor-patch - dependency-name: lion-pytorch dependency-type: direct:production update-type: version-update:semver-minor dependency-group: minor-patch - dependency-name: scipy dependency-type: direct:production update-type: version-update:semver-minor dependency-group: minor-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements-ci.txt | 6 +++--- requirements-dev.txt | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/requirements-ci.txt b/requirements-ci.txt index 39fa16e08..61f92018a 100644 --- a/requirements-ci.txt +++ b/requirements-ci.txt @@ -1,6 +1,6 @@ # Requirements used for GitHub actions pytest==8.1.1 -einops==0.6.0 -lion-pytorch==0.0.6 +einops==0.7.0 +lion-pytorch==0.1.2 scipy==1.10.1; python_version < "3.9" -scipy==1.11.4; python_version >= "3.9" +scipy==1.12.0; python_version >= "3.9" diff --git a/requirements-dev.txt b/requirements-dev.txt index e112365ea..fc5449ba7 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,9 +1,9 @@ # Requirements used for local development setuptools>=63 pytest~=8.1.1 -einops~=0.6.0 -wheel~=0.40.0 -lion-pytorch~=0.0.6 -scipy~=1.11.4 +einops~=0.7.0 +wheel~=0.43.0 +lion-pytorch~=0.1.2 +scipy~=1.12.0 pandas~=2.2.1 matplotlib~=3.8.3 From ff3337148ea23642f1d2af9782854998bd132915 Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Tue, 9 Apr 2024 20:37:54 +0000 Subject: [PATCH 099/112] Update README.md --- README.md | 140 ++++++++++-------------------------------------------- 1 file changed, 24 insertions(+), 116 deletions(-) diff --git a/README.md b/README.md index 81d1f40bb..9a741d22f 100644 --- a/README.md +++ b/README.md @@ -1,31 +1,13 @@ -# bitsandbytes-rocm +# `bitsandbytes` -The bitsandbytes is a lightweight wrapper around CUDA custom functions, in particular 8-bit optimizers, matrix multiplication (LLM.int8()), and quantization functions. -This fork is the ROCm adaptation of bitsandbytes. The repo is inspired by [agrocylo/bitsandbytes-rocm](https://github.com/agrocylo/bitsandbytes-rocm/tree/main/bitsandbytes), which is a ROCm version of bitsandbytes 0.37. This fork incorporates the majority of features from bitsandbytes 0.44, including the crucial 4 bit quantization feature. - -The library includes quantization primitives for 8-bit & 4-bit operations, through `bitsandbytes.nn.Linear8bitLt` and `bitsandbytes.nn.Linear4bit` and 8-bit optimizers through `bitsandbytes.optim` module. 
- -Resources: -- [8-bit Optimizer Paper](https://arxiv.org/abs/2110.02861) -- [Video](https://www.youtube.com/watch?v=IxrlHAJtqKE) -- [Docs](https://bitsandbytes.readthedocs.io/en/latest/) - -- [LLM.int8() Paper](https://arxiv.org/abs/2208.07339) -- [LLM.int8() Software Blog Post](https://huggingface.co/blog/hf-bitsandbytes-integration) -- [LLM.int8() Emergent Features Blog Post](https://timdettmers.com/2022/08/17/llm-int8-and-emergent-features/) - -## TL;DR -**Requirements** -Python >=3.8. Linux distribution (Ubuntu, MacOS, etc.) + ROCm >= 6.0 or CUDA > 10.0 +[![Downloads](https://static.pepy.tech/badge/bitsandbytes)](https://pepy.tech/project/bitsandbytes) [![Downloads](https://static.pepy.tech/badge/bitsandbytes/month)](https://pepy.tech/project/bitsandbytes) [![Downloads](https://static.pepy.tech/badge/bitsandbytes/week)](https://pepy.tech/project/bitsandbytes) +The `bitsandbytes` library is a lightweight Python wrapper around CUDA custom functions, in particular 8-bit optimizers, matrix multiplication (LLM.int8()), and 8 & 4-bit quantization functions. -**Installation**: - - -You need to compile from source for ROCm. - -Compilation quickstart: -```bash -# Run Docker -docker run -it --network=host --device=/dev/kfd --device=/dev/dri --name=bnb_test --shm-size=8g --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --group-add video rocm/pytorch:latest +The library includes quantization primitives for 8-bit & 4-bit operations, through `bitsandbytes.nn.Linear8bitLt` and `bitsandbytes.nn.Linear4bit` and 8-bit optimizers through `bitsandbytes.optim` module. -# Install BitsandBytes +**Installation for ROCm:** +To install latest bitsandbytes (supported on ROCm 6.2): git clone --recurse https://github.com/ROCm/bitsandbytes cd bitsandbytes git checkout rocm_enabled @@ -34,101 +16,27 @@ cmake -DCOMPUTE_BACKEND=hip -S . make pip install . +For ROCm specific versions: +Install Dependencies: +#hipblaslt installation needed only for rocm<6.0 +apt install hipblaslt +pip install --upgrade pip +pip install einops lion_pytorch accelerate +pip install git+https://github.com/ROCm/transformers.git -# Run this script to check if its installed successfully -python check_bnb_install.py -``` - -**Using Int8 inference with HuggingFace Transformers** - -```python -from transformers import AutoModelForCausalLM -model = AutoModelForCausalLM.from_pretrained( - 'decapoda-research/llama-7b-hf', - device_map='auto', - load_in_8bit=True, - max_memory=f'{int(torch.cuda.mem_get_info()[0]/1024**3)-2}GB') -``` - -A more detailed example, can be found in [examples/int8_inference_huggingface.py](examples/int8_inference_huggingface.py). - -**Using 8-bit optimizer**: -1. Comment out optimizer: ``#torch.optim.Adam(....)`` -2. Add 8-bit optimizer of your choice ``bnb.optim.Adam8bit(....)`` (arguments stay the same) -3. Replace embedding layer if necessary: ``torch.nn.Embedding(..) -> bnb.nn.Embedding(..)`` - - -**Using 8-bit Inference**: -1. Comment out torch.nn.Linear: ``#linear = torch.nn.Linear(...)`` -2. Add bnb 8-bit linear light module: ``linear = bnb.nn.Linear8bitLt(...)`` (base arguments stay the same) -3. There are two modes: - - Mixed 8-bit training with 16-bit main weights. Pass the argument ``has_fp16_weights=True`` (default) - - Int8 inference. Pass the argument ``has_fp16_weights=False`` -4. To use the full LLM.int8() method, use the ``threshold=k`` argument. We recommend ``k=6.0``. 
-```python -# LLM.int8() -linear = bnb.nn.Linear8bitLt(dim1, dim2, bias=True, has_fp16_weights=False, threshold=6.0) -# inputs need to be fp16 -out = linear(x.to(torch.float16)) -``` - - -## Features -- 8-bit Matrix multiplication with mixed precision decomposition -- LLM.int8() inference -- 8-bit Optimizers: Adam, AdamW, RMSProp, LARS, LAMB, Lion (saves 75% memory) -- Stable Embedding Layer: Improved stability through better initialization, and normalization -- 8-bit quantization: Quantile, Linear, and Dynamic quantization -- Fast quantile estimation: Up to 100x faster than other algorithms - -## Using bitsandbytes - -### Using Int8 Matrix Multiplication - -For straight Int8 matrix multiplication with mixed precision decomposition you can use ``bnb.matmul(...)``. To enable mixed precision decomposition, use the threshold parameter: -```python -bnb.matmul(..., threshold=6.0) -``` - -For instructions how to use LLM.int8() inference layers in your own code, see the TL;DR above or for extended instruction see [this blog post](https://huggingface.co/blog/hf-bitsandbytes-integration). - -### Using the 8-bit Optimizers - -With bitsandbytes 8-bit optimizers can be used by changing a single line of code in your codebase. For NLP models we recommend also to use the StableEmbedding layers (see below) which improves results and helps with stable 8-bit optimization. To get started with 8-bit optimizers, it is sufficient to replace your old optimizer with the 8-bit optimizer in the following way: -```python -import bitsandbytes as bnb - -# adam = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.995)) # comment out old optimizer -adam = bnb.optim.Adam8bit(model.parameters(), lr=0.001, betas=(0.9, 0.995)) # add bnb optimizer -adam = bnb.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.995), optim_bits=8) # equivalent - - -torch.nn.Embedding(...) -> bnb.nn.StableEmbedding(...) # recommended for NLP models -``` - -Note that by default all parameter tensors with less than 4096 elements are kept at 32-bit even if you initialize those parameters with 8-bit optimizers. This is done since such small tensors do not save much memory and often contain highly variable parameters (biases) or parameters that require high precision (batch norm, layer norm). You can change this behavior like so: -```python -# parameter tensors with less than 16384 values are optimized in 32-bit -# it is recommended to use multiplies of 4096 -adam = bnb.optim.Adam8bit(model.parameters(), min_8bit_size=16384) -``` - -### Change Bits and other Hyperparameters for Individual Parameters - -If you want to optimize some unstable parameters with 32-bit Adam and others with 8-bit Adam, you can use the `GlobalOptimManager`. With this, we can also configure specific hyperparameters for particular layers, such as embedding layers. To do that, we need two things: (1) register the parameter while they are still on the CPU, (2) override the config with the new desired hyperparameters (anytime, anywhere). See our [guide](howto_config_override.md) for more details - -### Fairseq Users - -To use the Stable Embedding Layer, override the respective `build_embedding(...)` function of your model. Make sure to also use the `--no-scale-embedding` flag to disable scaling of the word embedding layer (nor replaced with layer norm). You can use the optimizers by replacing the optimizer in the respective file (`adam.py` etc.). - -## Release and Feature History - -For upcoming features and changes and full history see [Patch Notes](CHANGELOG.md). 
+Install Bitsandbytes: +git clone --recurse https://github.com/ROCm/bitsandbytes +cd bitsandbytes +# Checkout branch as needed +# for rocm 5.7 - rocm5.7_internal_testing +# for rocm 6.2 - rocm6.2_internal_testing +git checkout +make hip +python setup.py install -## Errors +**For more details, please head to the official documentation page:** -1. RuntimeError: CUDA error: no kernel image is available for execution on the device. [Solution](errors_and_solutions.md#No-kernel-image-available) -2. __fatbinwrap_.. [Solution](errors_and_solutions.md#fatbinwrap_) +**[https://huggingface.co/docs/bitsandbytes/main](https://huggingface.co/docs/bitsandbytes/main)** ## License From 702ca1ae32e022314f766a16b34888314f294570 Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Tue, 9 Apr 2024 22:41:26 +0000 Subject: [PATCH 100/112] fix PEP errors --- Makefile | 154 ----- benchmarking/accuracy/bnb_accuracy.py | 9 +- bitsandbytes/archive_functional.py | 641 ++++++++++--------- bitsandbytes/cuda_setup/main.py | 296 +++++---- bitsandbytes/functional.py | 28 +- bitsandbytes/nn/modules.py | 7 +- bitsandbytes/research/autograd/_functions.py | 2 +- csrc/kernels.hip | 94 +-- csrc/pythonInterface.cpp | 4 +- install_cuda.py | 8 +- tests/helpers.py | 2 +- tests/test_autograd.py | 2 - tests/test_cuda_setup_evaluator.py | 2 +- tests/test_functional.py | 18 +- tests/test_generation.py | 4 +- tests/test_linear8bitlt.py | 2 +- tests/test_optim.py | 1 - tests/test_triton.py | 3 +- 18 files changed, 625 insertions(+), 652 deletions(-) delete mode 100644 Makefile diff --git a/Makefile b/Makefile deleted file mode 100644 index 00f5869b3..000000000 --- a/Makefile +++ /dev/null @@ -1,154 +0,0 @@ -MKFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) -ROOT_DIR := $(patsubst %/,%,$(dir $(MKFILE_PATH))) - -GPP:= /usr/bin/g++ -#GPP:= /sw/gcc/11.2.0/bin/g++ -ifeq ($(CUDA_HOME),) - CUDA_HOME:= $(shell which nvcc | rev | cut -d'/' -f3- | rev) -endif - -ROCM_HOME := /opt/rocm - -ifndef CUDA_VERSION -ifneq ($(MAKECMDGOALS),clean) -$(warning WARNING: CUDA_VERSION not set. 
Call make with CUDA string, for example: make cuda11x CUDA_VERSION=115 or make cpuonly CUDA_VERSION=CPU) -CUDA_VERSION:= -endif -endif - - - -NVCC := $(CUDA_HOME)/bin/nvcc -HIPCC := $(ROCM_HOME)/bin/hipcc - -########################################### - -CSRC := $(ROOT_DIR)/csrc -BUILD_DIR:= $(ROOT_DIR)/build - -FILES_CUDA := $(CSRC)/ops.cu $(CSRC)/kernels.cu -FILES_CPP := $(CSRC)/common.cpp $(CSRC)/cpu_ops.cpp $(CSRC)/pythonInterface.c - -INCLUDE := -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(CONDA_PREFIX)/include -I $(ROOT_DIR)/include -LIB := -L $(CUDA_HOME)/lib64 -lcudart -lcublas -lcublasLt -lcusparse -L $(CONDA_PREFIX)/lib - -INCLUDE_ROCM := -I $(ROCM_HOME)/include -I $(ROOT_DIR)/csrc -I $(CONDA_PREFIX)/include -I $(ROOT_DIR)/include -LIB_ROCM := -L $(ROCM_HOME)/lib -lhipblas -lhipblaslt -lhiprand -lhipsparse -L $(CONDA_PREFIX)/lib - -# NVIDIA NVCC compilation flags -COMPUTE_CAPABILITY += -gencode arch=compute_50,code=sm_50 # Maxwell -COMPUTE_CAPABILITY += -gencode arch=compute_52,code=sm_52 # Maxwell -COMPUTE_CAPABILITY += -gencode arch=compute_60,code=sm_60 # Pascal -COMPUTE_CAPABILITY += -gencode arch=compute_61,code=sm_61 # Pascal -COMPUTE_CAPABILITY += -gencode arch=compute_70,code=sm_70 # Volta - -CC_KEPLER := -gencode arch=compute_35,code=sm_35 # Kepler -CC_KEPLER += -gencode arch=compute_37,code=sm_37 # Kepler - -# Later versions of CUDA support the new architectures -CC_CUDA11x := -gencode arch=compute_75,code=sm_75 -CC_CUDA11x += -gencode arch=compute_80,code=sm_80 -CC_CUDA11x += -gencode arch=compute_86,code=sm_86 - - -CC_cublasLt110 := -gencode arch=compute_75,code=sm_75 -CC_cublasLt110 += -gencode arch=compute_80,code=sm_80 - -CC_cublasLt111 := -gencode arch=compute_75,code=sm_75 -CC_cublasLt111 += -gencode arch=compute_80,code=sm_80 -CC_cublasLt111 += -gencode arch=compute_86,code=sm_86 - -CC_ADA_HOPPER := -gencode arch=compute_89,code=sm_89 -CC_ADA_HOPPER += -gencode arch=compute_90,code=sm_90 - - -all: $(BUILD_DIR) env - $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) - $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) - -cuda110_nomatmul_kepler: $(BUILD_DIR) env - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) $(CC_KEPLER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) $(CC_KEPLER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) - -cuda11x_nomatmul_kepler: $(BUILD_DIR) env - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_KEPLER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_KEPLER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o 
./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) - - -cuda110_nomatmul: $(BUILD_DIR) env - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) - -cuda11x_nomatmul: $(BUILD_DIR) env - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) - -cuda118_nomatmul: $(BUILD_DIR) env - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) - -cuda12x_nomatmul: $(BUILD_DIR) env - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) - -cuda110: $(BUILD_DIR) env - $(NVCC) $(CC_cublasLt110) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) - $(NVCC) $(CC_cublasLt110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) - -cuda11x: $(BUILD_DIR) env - $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) - $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) - -cuda118: $(BUILD_DIR) env - $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) - $(NVCC) 
$(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) - -hip: $(BUILD_DIR) env - $(HIPCC) -std=c++14 -fPIC -c $(INCLUDE_ROCM) $(LIB_ROCM) $(CSRC)/ops.hip -o $(BUILD_DIR)/ops.o - $(HIPCC) -std=c++14 -fPIC -c $(INCLUDE_ROCM) $(LIB_ROCM) $(CSRC)/kernels.hip -o $(BUILD_DIR)/kernels.o - $(GPP) -std=c++14 -D__HIP_PLATFORM_AMD__ -D__HIP_PLATFORM_HCC__ -DBUILD_HIP -shared -fPIC $(INCLUDE_ROCM) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_hip_nohipblaslt.so $(LIB_ROCM) - -cuda12x: $(BUILD_DIR) env - $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) - $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++20 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) - -cpuonly: $(BUILD_DIR) env - $(GPP) -std=c++14 -shared -fPIC -I $(ROOT_DIR)/csrc -I $(ROOT_DIR)/include $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cpu.so - -env: - @echo "ENVIRONMENT" - @echo "============================" - @echo "CUDA_VERSION: $(CUDA_VERSION)" - @echo "============================" - @echo "NVCC path: $(NVCC)" - @echo "HIPCC path: $(HIPCC)" - @echo "GPP path: $(GPP) VERSION: `$(GPP) --version | head -n 1`" - @echo "CUDA_HOME: $(CUDA_HOME)" - @echo "HIP_HOME: $(HIP_HOME)" - @echo "CONDA_PREFIX: $(CONDA_PREFIX)" - @echo "PATH: $(PATH)" - @echo "LD_LIBRARY_PATH: $(LD_LIBRARY_PATH)" - @echo "============================" - -$(BUILD_DIR): - mkdir -p build - mkdir -p dependencies - -$(ROOT_DIR)/dependencies/cub: - git clone https://github.com/NVlabs/cub $(ROOT_DIR)/dependencies/cub - cd dependencies/cub; git checkout 1.11.0 - -clean: - rm -rf build/* *.egg* - rm -f bitsandbytes/libbitsandbytes*.so diff --git a/benchmarking/accuracy/bnb_accuracy.py b/benchmarking/accuracy/bnb_accuracy.py index bd3b81db4..2860338ec 100644 --- a/benchmarking/accuracy/bnb_accuracy.py +++ b/benchmarking/accuracy/bnb_accuracy.py @@ -1,8 +1,6 @@ import torch -import bitsandbytes as bnb -from bitsandbytes import functional as F - +from bitsandbytes import functional as F def debug_blocksize(block): @@ -11,6 +9,7 @@ def debug_blocksize(block): dq = F.dequantize_fp4(qx, qstate) return torch.sum(torch.linalg.norm(x - dq, ord="fro")) + def test_blocksize(block): x = torch.randn(10, 10).cuda() qx, qstate = F.quantize_fp4(x, blocksize=block) @@ -20,10 +19,8 @@ def test_blocksize(block): print("---------------") print(qstate) - - for block in [128, 256, 512, 1024, 2048]: print(debug_blocksize(block)) -#test_blocksize(2048) +# test_blocksize(2048) diff --git a/bitsandbytes/archive_functional.py b/bitsandbytes/archive_functional.py index 226c9e51f..dac7430ed 100644 --- a/bitsandbytes/archive_functional.py +++ b/bitsandbytes/archive_functional.py @@ -3,17 +3,14 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import ctypes as ct +from functools import reduce # Required in Python 3 import itertools import operator -import random -import torch -import itertools -import math -from scipy.stats import norm -import numpy as np - -from functools import reduce # Required in Python 3 from typing import Tuple + +import numpy as np +from scipy.stats import norm +import torch from torch import Tensor from .cextension import COMPILED_WITH_CUDA, lib @@ -23,12 +20,13 @@ def prod(iterable): return reduce(operator.mul, iterable, 1) + name2qmap = {} if COMPILED_WITH_CUDA: """C FUNCTIONS FOR OPTIMIZERS""" str2optimizer32bit = {} - str2optimizer32bit["adam"] = (lib.cadam32bit_grad_fp32, lib.cadam32bit_grad_fp16) #, lib.cadam32bit_grad_bf16) + str2optimizer32bit["adam"] = (lib.cadam32bit_grad_fp32, lib.cadam32bit_grad_fp16) # , lib.cadam32bit_grad_bf16) str2optimizer32bit["momentum"] = ( lib.cmomentum32bit_grad_32, lib.cmomentum32bit_grad_16, @@ -37,7 +35,7 @@ def prod(iterable): lib.crmsprop32bit_grad_32, lib.crmsprop32bit_grad_16, ) - str2optimizer32bit["lion"] = (lib.clion32bit_grad_fp32, lib.clion32bit_grad_fp16) #, lib.clion32bit_grad_bf16) + str2optimizer32bit["lion"] = (lib.clion32bit_grad_fp32, lib.clion32bit_grad_fp16) # , lib.clion32bit_grad_bf16) str2optimizer32bit["adagrad"] = ( lib.cadagrad32bit_grad_32, lib.cadagrad32bit_grad_16, @@ -73,7 +71,7 @@ def prod(iterable): str2optimizer8bit_blockwise["adam"] = ( lib.cadam_8bit_blockwise_grad_fp32, lib.cadam_8bit_blockwise_grad_fp16, - #lib.cadam_8bit_blockwise_grad_bf16, + # lib.cadam_8bit_blockwise_grad_bf16, ) str2optimizer8bit_blockwise["momentum"] = ( lib.cmomentum_8bit_blockwise_grad_fp32, @@ -86,13 +84,14 @@ def prod(iterable): str2optimizer8bit_blockwise["lion"] = ( lib.clion_8bit_blockwise_grad_fp32, lib.clion_8bit_blockwise_grad_fp16, - #lib.clion_8bit_blockwise_grad_bf16, + # lib.clion_8bit_blockwise_grad_bf16, ) str2optimizer8bit_blockwise["adagrad"] = ( lib.cadagrad_8bit_blockwise_grad_fp32, lib.cadagrad_8bit_blockwise_grad_fp16, ) + class GlobalPageManager: _instance = None @@ -110,14 +109,13 @@ def get_instance(cls): return cls._instance def prefetch_all(self, to_cpu=False): - # assume the first added, will be hte + # assume the first added, will be the # ones that are used first, so swap them in last # in the case they are evicted again for t in self.paged_tensors[::-1]: prefetch_tensor(t, to_cpu) - class CUBLAS_Context: _instance = None @@ -150,7 +148,7 @@ def __init__(self): raise RuntimeError("Call get_instance() instead") def initialize(self): - #self.context = ct.c_void_p(lib.get_cusparse()) + # self.context = ct.c_void_p(lib.get_cusparse()) if torch.version.cuda: self.context = ct.c_void_p(lib.get_cusparse()) elif torch.version.hip: @@ -163,6 +161,7 @@ def get_instance(cls): cls._instance.initialize() return cls._instance + dtype2bytes = {} dtype2bytes[torch.float32] = 4 dtype2bytes[torch.float16] = 2 @@ -170,8 +169,9 @@ def get_instance(cls): dtype2bytes[torch.uint8] = 1 dtype2bytes[torch.int8] = 1 -def get_paged(*shape, dtype=torch.float32, device=torch.device('cuda', index=0)): - num_bytes = dtype2bytes[dtype]*prod(shape) + +def get_paged(*shape, dtype=torch.float32, device=torch.device("cuda", index=0)): + num_bytes = dtype2bytes[dtype] * prod(shape) cuda_ptr = lib.cget_managed_ptr(ct.c_size_t(num_bytes)) c_ptr = ct.cast(cuda_ptr, ct.POINTER(ct.c_int)) new_array = np.ctypeslib.as_array(c_ptr, shape=shape) @@ -180,74 +180,86 @@ def get_paged(*shape, dtype=torch.float32, device=torch.device('cuda', index=0)) out.page_deviceid = 
device.index return out + def prefetch_tensor(A, to_cpu=False): - assert A.is_paged, 'Only paged tensors can be prefetched!' + assert A.is_paged, "Only paged tensors can be prefetched!" if to_cpu: deviceid = -1 else: deviceid = A.page_deviceid - num_bytes = dtype2bytes[A.dtype]*A.numel() + num_bytes = dtype2bytes[A.dtype] * A.numel() lib.cprefetch(get_ptr(A), ct.c_size_t(num_bytes), ct.c_int32(deviceid)) + def elementwise_func(func_name, A, B, value, prefetch=True): func = None if A.dtype == torch.float32: - func = getattr(lib, f'c{func_name}_fp32', None) + func = getattr(lib, f"c{func_name}_fp32", None) cvalue = ct.c_float(value) elif A.dtype == torch.uint8: - func = getattr(lib, f'c{func_name}_uint8', None) + func = getattr(lib, f"c{func_name}_uint8", None) cvalue = ct.c_uint8(value) - if func is None: raise NotImplementedError(f'Function not implemented: {func_name}') + if func is None: + raise NotImplementedError(f"Function not implemented: {func_name}") - is_managed = getattr(A, 'is_managed', False) + is_managed = getattr(A, "is_managed", False) if is_managed and prefetch: prefetch_tensor(A) - if B is not None: prefetch_tensor(B) + if B is not None: + prefetch_tensor(B) func(get_ptr(A), get_ptr(B), cvalue, ct.c_int64(A.numel())) if A.is_paged or B.is_paged: # paged function are fully asynchronous # if we return from this function, we want to the tensor # to be in the correct state, that is the final state after the - # operation occured. So we synchronize. + # operation occurred. So we synchronize. torch.cuda.synchronize() -def fill(A, value, device=None, prefetch=True): elementwise_func('fill', A, None, value) -def arange(A, device=None): elementwise_func('arange', A, None, 0) -def _mul(A, B, device=None): elementwise_func('_mul', A, B, 0) + +def fill(A, value, device=None, prefetch=True): + elementwise_func("fill", A, None, value) + + +def arange(A, device=None): + elementwise_func("arange", A, None, 0) + + +def _mul(A, B, device=None): + elementwise_func("_mul", A, B, 0) def create_linear_map(signed=True, total_bits=8, add_zero=True): - sign = (-1.0 if signed else 0.0) + sign = -1.0 if signed else 0.0 total_values = 2**total_bits if add_zero or total_bits < 8: # add a zero # since we simulate less bits by having zeros in the data type, we # we need to center the quantization around zero and as such lose # a single value - total_values = (2**total_bits if not signed else 2**total_bits-1) + total_values = 2**total_bits if not signed else 2**total_bits - 1 values = torch.linspace(sign, 1.0, total_values) gap = 256 - values.numel() if gap == 0: return values else: - l = values.numel()//2 - return torch.Tensor(values[:l].tolist() + [0]*gap + values[l:].tolist()) + l = values.numel() // 2 + return torch.Tensor(values[:l].tolist() + [0] * gap + values[l:].tolist()) -def create_normal_map(offset=0.9677083, use_extra_value=True): +def create_normal_map(offset=0.9677083, use_extra_value=True): if use_extra_value: # one more positive value, this is an asymmetric type v1 = norm.ppf(torch.linspace(offset, 0.5, 9)[:-1]).tolist() - v2 = [0]*(256-15) ## we have 15 non-zero values in this data type + v2 = [0] * (256 - 15) ## we have 15 non-zero values in this data type v3 = (-norm.ppf(torch.linspace(offset, 0.5, 8)[:-1])).tolist() v = v1 + v2 + v3 else: v1 = norm.ppf(torch.linspace(offset, 0.5, 8)[:-1]).tolist() - v2 = [0]*(256-14) ## we have 14 non-zero values in this data type + v2 = [0] * (256 - 14) ## we have 14 non-zero values in this data type v3 = (-norm.ppf(torch.linspace(offset, 0.5, 
8)[:-1])).tolist() v = v1 + v2 + v3 @@ -257,38 +269,37 @@ def create_normal_map(offset=0.9677083, use_extra_value=True): assert values.numel() == 256 return values + def create_fp8_map(signed=True, exponent_bits=5, precision_bits=2, total_bits=8): e = exponent_bits p = precision_bits has_sign = 1 if signed else 0 - assert e+p == total_bits-has_sign + assert e + p == total_bits - has_sign # the exponent is biased to 2^(e-1) -1 == 0 evalues = [] pvalues = [] - for i, val in enumerate(range(-((2**(exponent_bits-has_sign))), 2**(exponent_bits-has_sign), 1)): + for i, val in enumerate(range(-(2 ** (exponent_bits - has_sign)), 2 ** (exponent_bits - has_sign), 1)): evalues.append(2**val) - values = [] lst = list(itertools.product([0, 1], repeat=precision_bits)) - #for ev in evalues: - bias = 2**(exponent_bits-1) - for evalue in range(2**(exponent_bits)): + # for ev in evalues: + bias = 2 ** (exponent_bits - 1) + for evalue in range(2 ** (exponent_bits)): for bit_pattern in lst: - value = (1 if evalue != 0 else 0) + value = 1 if evalue != 0 else 0 for i, pval in enumerate(list(bit_pattern)): - value += pval*(2**-(i+1)) + value += pval * (2 ** -(i + 1)) if evalue == 0: # subnormals - value = value*2**-(bias) + value = value * 2**-(bias) else: # normals - value = value*2**-(evalue-bias-1) + value = value * 2 ** -(evalue - bias - 1) values.append(value) if signed: values.append(-value) - assert len(values) == 2**total_bits values.sort() if total_bits < 8: @@ -302,7 +313,6 @@ def create_fp8_map(signed=True, exponent_bits=5, precision_bits=2, total_bits=8) return code - def create_dynamic_map(signed=True, max_exponent_bits=7, total_bits=8): """ Creates the dynamic quantiztion map. @@ -329,7 +339,11 @@ def create_dynamic_map(signed=True, max_exponent_bits=7, total_bits=8): if not signed: additional_items = 2 * additional_items for i in range(max_exponent_bits): - fraction_items = int((2 ** (i + non_sign_bits - max_exponent_bits) + 1 if signed else 2 ** (i + non_sign_bits - max_exponent_bits + 1) + 1)) + fraction_items = int( + 2 ** (i + non_sign_bits - max_exponent_bits) + 1 + if signed + else 2 ** (i + non_sign_bits - max_exponent_bits + 1) + 1 + ) boundaries = torch.linspace(0.1, 1, fraction_items) means = (boundaries[:-1] + boundaries[1:]) / 2.0 data += ((10 ** (-(max_exponent_bits - 1) + i)) * means).tolist() @@ -353,8 +367,9 @@ def create_dynamic_map(signed=True, max_exponent_bits=7, total_bits=8): data.sort() return Tensor(data) + def create_quantile_map(A, total_bits=8): - q = estimate_quantiles(A, num_quantiles=2**total_bits-1) + q = estimate_quantiles(A, num_quantiles=2**total_bits - 1) q = q.tolist() q.append(0) @@ -365,11 +380,13 @@ def create_quantile_map(A, total_bits=8): q.sort() q = Tensor(q) - q = q/q.abs().max() + q = q / q.abs().max() return q + def get_special_format_str(): - if not torch.cuda.is_available(): return 'col_turing' + if not torch.cuda.is_available(): + return "col_turing" major, _minor = torch.cuda.get_device_capability() if major <= 7: return "col_turing" @@ -378,22 +395,27 @@ def get_special_format_str(): return "col_turing" - def is_on_gpu(tensors): on_gpu = True gpu_ids = set() for t in tensors: - if t is None: continue # NULL pointers are fine - is_paged = getattr(t, 'is_paged', False) - on_gpu &= (t.device.type == 'cuda' or is_paged) + if t is None: + continue # NULL pointers are fine + is_paged = getattr(t, "is_paged", False) + on_gpu &= t.device.type == "cuda" or is_paged if not is_paged: gpu_ids.add(t.device.index) if not on_gpu: - raise TypeError(f'All input 
tensors need to be on the same GPU, but found some tensors to not be on a GPU:\n {[(t.shape, t.device) for t in tensors]}') + raise TypeError( + f"All input tensors need to be on the same GPU, but found some tensors to not be on a GPU:\n {[(t.shape, t.device) for t in tensors]}" + ) if len(gpu_ids) > 1: - raise TypeError(f'Input tensors need to be on the same GPU, but found the following tensor and device combinations:\n {[(t.shape, t.device) for t in tensors]}') + raise TypeError( + f"Input tensors need to be on the same GPU, but found the following tensor and device combinations:\n {[(t.shape, t.device) for t in tensors]}" + ) return on_gpu + def get_ptr(A: Tensor) -> ct.c_void_p: """ Get the ctypes pointer from a PyTorch Tensor. @@ -434,9 +456,7 @@ def get_transform_func(dtype, orderA, orderOut, transpose=False): return getattr(lib, name) -def get_transform_buffer( - shape, dtype, device, to_order, from_order="row", transpose=False -): +def get_transform_buffer(shape, dtype, device, to_order, from_order="row", transpose=False): # init_func = torch.empty init_func = torch.zeros dims = len(shape) @@ -489,9 +509,7 @@ def nvidia_transform( else: from_order = state[1] if out is None: - out, new_state = get_transform_buffer( - state[0], A.dtype, A.device, to_order, state[1] - ) + out, new_state = get_transform_buffer(state[0], A.dtype, A.device, to_order, state[1]) else: new_state = (state[1], to_order) func = get_transform_func(A.dtype, from_order, to_order, transpose) @@ -516,7 +534,7 @@ def nvidia_transform( def estimate_quantiles(A: Tensor, out: Tensor = None, offset: float = 1 / 512, num_quantiles=256) -> Tensor: - ''' + """ Estimates 256 equidistant quantiles on the input tensor eCDF. Uses SRAM-Quantiles algorithm to quickly estimate 256 equidistant quantiles @@ -543,14 +561,21 @@ def estimate_quantiles(A: Tensor, out: Tensor = None, offset: float = 1 / 512, n ------- torch.Tensor: The 256 quantiles in float32 datatype. - ''' - if A.numel() < 256: raise NotImplementedError(f'Quantile estimation needs at least 256 values in the Tensor, but Tensor had only {A.numel()} values.') - if num_quantiles > 256: raise NotImplementedError(f"Currently only a maximum of 256 equally spaced quantiles are supported, but the argument num_quantiles={num_quantiles}") - if num_quantiles < 256 and offset == 1/(512): + """ + if A.numel() < 256: + raise NotImplementedError( + f"Quantile estimation needs at least 256 values in the Tensor, but Tensor had only {A.numel()} values." 
+ ) + if num_quantiles > 256: + raise NotImplementedError( + f"Currently only a maximum of 256 equally spaced quantiles are supported, but the argument num_quantiles={num_quantiles}" + ) + if num_quantiles < 256 and offset == 1 / (512): # override default arguments - offset = 1/(2*num_quantiles) + offset = 1 / (2 * num_quantiles) - if out is None: out = torch.zeros((256,), dtype=torch.float32, device=A.device) + if out is None: + out = torch.zeros((256,), dtype=torch.float32, device=A.device) is_on_gpu([A, out]) device = pre_call(A.device) if A.dtype == torch.float32: @@ -562,14 +587,16 @@ def estimate_quantiles(A: Tensor, out: Tensor = None, offset: float = 1 / 512, n post_call(device) if num_quantiles < 256: - step = round(256/num_quantiles) + step = round(256 / num_quantiles) idx = torch.linspace(0, 255, num_quantiles).long().to(A.device) out = out[idx] return out -def quantize_blockwise(A: Tensor, code: Tensor = None, absmax: Tensor = None, out: Tensor = None, blocksize=4096, nested=False) -> Tensor: +def quantize_blockwise( + A: Tensor, code: Tensor = None, absmax: Tensor = None, out: Tensor = None, blocksize=4096, nested=False +) -> Tensor: """ Quantize tensor A in blocks of size 4096 values. @@ -596,7 +623,6 @@ def quantize_blockwise(A: Tensor, code: Tensor = None, absmax: Tensor = None, ou The quantization state to undo the quantization. """ - if code is None: if "dynamic" not in name2qmap: name2qmap["dynamic"] = create_dynamic_map().to(A.device) @@ -611,23 +637,34 @@ def quantize_blockwise(A: Tensor, code: Tensor = None, absmax: Tensor = None, ou if out is None: out = torch.zeros_like(A, dtype=torch.uint8) - if A.device.type != 'cpu': + if A.device.type != "cpu": assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64] cblocksize = ct.c_int32(blocksize) prev_device = pre_call(A.device) code = code.to(A.device) is_on_gpu([code, A, out, absmax]) if A.dtype == torch.float32: - lib.cquantize_blockwise_fp32(get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), cblocksize, ct.c_int(A.numel())) + lib.cquantize_blockwise_fp32( + get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), cblocksize, ct.c_int(A.numel()) + ) elif A.dtype == torch.float16: - lib.cquantize_blockwise_fp16(get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), cblocksize, ct.c_int(A.numel())) + lib.cquantize_blockwise_fp16( + get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), cblocksize, ct.c_int(A.numel()) + ) else: raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}") post_call(A.device) else: # cpu code = code.cpu() - lib.cquantize_blockwise_cpu_fp32(get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_longlong(blocksize), ct.c_longlong(A.numel())) + lib.cquantize_blockwise_cpu_fp32( + get_ptr(code), + get_ptr(A), + get_ptr(absmax), + get_ptr(out), + ct.c_longlong(blocksize), + ct.c_longlong(A.numel()), + ) if nested: offset = absmax.mean() @@ -637,8 +674,6 @@ def quantize_blockwise(A: Tensor, code: Tensor = None, absmax: Tensor = None, ou else: state = [absmax, code, blocksize, nested, None, None] - - return out, state @@ -649,7 +684,7 @@ def dequantize_blockwise( code: Tensor = None, out: Tensor = None, blocksize: int = 4096, - nested=False + nested=False, ) -> Tensor: """ Dequantizes blockwise quantized values. 
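As a quick reference for the blockwise API whose formatting the hunks above touch, here is a minimal round-trip sketch using the signatures shown in the diff; the shape and blocksize are arbitrary and a CUDA device is assumed:

```python
import torch
from bitsandbytes import functional as F

x = torch.randn(1024, 1024, device="cuda")  # fp32 input of arbitrary shape

# quantize_blockwise returns the uint8 payload plus the state needed to invert it:
# per-block absmax, the quantization code, the blocksize, and nesting info
q, state = F.quantize_blockwise(x, blocksize=4096)

# dequantize_blockwise reconstructs an approximation of x from (q, state)
x_hat = F.dequantize_blockwise(q, state)

print((x - x_hat).abs().max())  # round-trip quantization error
```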
@@ -686,41 +721,58 @@ def dequantize_blockwise( out = torch.zeros_like(A, dtype=torch.float32) if quant_state is None: - quant_state = (absmax, code, blocksize) - assert absmax is not None and out is not None + quant_state = (absmax, code, blocksize) + assert absmax is not None and out is not None else: - absmax, code, blocksize, nested, offset, state2 = quant_state - if nested: - absmax = dequantize_blockwise(absmax, state2) - absmax += offset - + absmax, code, blocksize, nested, offset, state2 = quant_state + if nested: + absmax = dequantize_blockwise(absmax, state2) + absmax += offset - if A.device.type != 'cpu': + if A.device.type != "cpu": device = pre_call(A.device) code = code.to(A.device) if blocksize not in [2048, 4096, 1024, 512, 256, 128, 64]: - raise ValueError(f"The blockwise of {blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128, 64]") + raise ValueError( + f"The blockwise of {blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128, 64]" + ) is_on_gpu([A, absmax, out]) if out.dtype == torch.float32: - lib.cdequantize_blockwise_fp32(get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(A.numel())) + lib.cdequantize_blockwise_fp32( + get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(A.numel()) + ) elif out.dtype == torch.float16: - lib.cdequantize_blockwise_fp16(get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(A.numel())) + lib.cdequantize_blockwise_fp16( + get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(A.numel()) + ) else: raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}") post_call(A.device) else: code = code.cpu() - lib.cdequantize_blockwise_cpu_fp32(get_ptr(quant_state[1]), get_ptr(A), get_ptr(quant_state[0]), get_ptr(out), ct.c_longlong(blocksize), ct.c_longlong(A.numel())) + lib.cdequantize_blockwise_cpu_fp32( + get_ptr(quant_state[1]), + get_ptr(A), + get_ptr(quant_state[0]), + get_ptr(out), + ct.c_longlong(blocksize), + ct.c_longlong(A.numel()), + ) return out + def quantize_fp4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False): - return quantize_4bit(A, absmax, out, blocksize, compress_statistics, 'fp4') + return quantize_4bit(A, absmax, out, blocksize, compress_statistics, "fp4") + def quantize_nf4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False): - return quantize_4bit(A, absmax, out, blocksize, compress_statistics, 'nf4') + return quantize_4bit(A, absmax, out, blocksize, compress_statistics, "nf4") + -def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False, quant_type='fp4') -> Tensor: +def quantize_4bit( + A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False, quant_type="fp4" +) -> Tensor: """ Quantize tensor A in blocks of 4-bit values. @@ -746,10 +798,10 @@ def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksiz tuple(torch.Tensor, torch.Size, torch.dtype, int): The quantization state to undo the quantization. 
""" - if A.device.type != 'cuda': - raise NotImplementedError(f'Device type not supported for FP4 quantization: {A.device.type}') - if quant_type not in ['fp4', 'nf4']: - raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.') + if A.device.type != "cuda": + raise NotImplementedError(f"Device type not supported for FP4 quantization: {A.device.type}") + if quant_type not in ["fp4", "nf4"]: + raise NotImplementedError(f"4-bit quantization data type {quant_type} is not implemented.") n = A.numel() input_shape = A.shape @@ -759,9 +811,8 @@ def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksiz blocks += 1 if n % blocksize > 0 else 0 absmax = torch.zeros((blocks,), device=A.device) - if out is None: - out = torch.zeros(((n+1)//2, 1), dtype=torch.uint8, device=A.device) + out = torch.zeros(((n + 1) // 2, 1), dtype=torch.uint8, device=A.device) assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64] @@ -769,15 +820,23 @@ def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksiz is_on_gpu([A, out, absmax]) if A.dtype == torch.float32: - if quant_type == 'fp4': - lib.cquantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n)) + if quant_type == "fp4": + lib.cquantize_blockwise_fp32_fp4( + get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n) + ) else: - lib.cquantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n)) + lib.cquantize_blockwise_fp32_nf4( + get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n) + ) elif A.dtype == torch.float16: - if quant_type == 'fp4': - lib.cquantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n)) + if quant_type == "fp4": + lib.cquantize_blockwise_fp16_fp4( + get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n) + ) else: - lib.cquantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n)) + lib.cquantize_blockwise_fp16_nf4( + get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n) + ) else: raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}") post_call(A.device) @@ -785,8 +844,8 @@ def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksiz if compress_statistics: offset = absmax.mean() absmax -= offset - #code = create_custom_map().to(absmax.device) - #qabsmax, state2 = quantize_blockwise(absmax, code=code, blocksize=256) + # code = create_custom_map().to(absmax.device) + # qabsmax, state2 = quantize_blockwise(absmax, code=code, blocksize=256) qabsmax, state2 = quantize_blockwise(absmax, blocksize=256) del absmax state = [qabsmax, input_shape, A.dtype, blocksize, [offset, state2], quant_type] @@ -795,13 +854,35 @@ def quantize_4bit(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksiz return out, state -def dequantize_fp4(A: Tensor, quant_state: Tuple[Tensor, Tensor] = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64) -> Tensor: - return dequantize_4bit(A, quant_state, absmax, out, blocksize, 'fp4') -def dequantize_nf4(A: Tensor, quant_state: Tuple[Tensor, Tensor] = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64) -> Tensor: - return dequantize_4bit(A, quant_state, 
absmax, out, blocksize, 'nf4') +def dequantize_fp4( + A: Tensor, + quant_state: Tuple[Tensor, Tensor] = None, + absmax: Tensor = None, + out: Tensor = None, + blocksize: int = 64, +) -> Tensor: + return dequantize_4bit(A, quant_state, absmax, out, blocksize, "fp4") -def dequantize_4bit(A: Tensor,quant_state: Tuple[Tensor, Tensor] = None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64, quant_type='fp4') -> Tensor: + +def dequantize_nf4( + A: Tensor, + quant_state: Tuple[Tensor, Tensor] = None, + absmax: Tensor = None, + out: Tensor = None, + blocksize: int = 64, +) -> Tensor: + return dequantize_4bit(A, quant_state, absmax, out, blocksize, "nf4") + + +def dequantize_4bit( + A: Tensor, + quant_state: Tuple[Tensor, Tensor] = None, + absmax: Tensor = None, + out: Tensor = None, + blocksize: int = 64, + quant_type="fp4", +) -> Tensor: """ Dequantizes FP4 blockwise quantized values. @@ -829,9 +910,11 @@ def dequantize_4bit(A: Tensor,quant_state: Tuple[Tensor, Tensor] = None, absmax: Dequantized tensor. """ if blocksize not in [2048, 4096, 1024, 512, 256, 128, 64]: - raise ValueError(f"The blockwise of {blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128, 64]") - if quant_type not in ['fp4', 'nf4']: - raise NotImplementedError(f'4-bit quantization data type {quant_type} is not implemented.') + raise ValueError( + f"The blockwise of {blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128, 64]" + ) + if quant_type not in ["fp4", "nf4"]: + raise NotImplementedError(f"4-bit quantization data type {quant_type} is not implemented.") if quant_state is None: assert absmax is not None and out is not None @@ -840,7 +923,6 @@ def dequantize_4bit(A: Tensor,quant_state: Tuple[Tensor, Tensor] = None, absmax: else: absmax, shape, dtype, blocksize, compressed_stats, quant_type = quant_state - if compressed_stats is not None: offset, state2 = compressed_stats absmax = dequantize_blockwise(absmax, state2) @@ -851,26 +933,35 @@ def dequantize_4bit(A: Tensor,quant_state: Tuple[Tensor, Tensor] = None, absmax: n = out.numel() - device = pre_call(A.device) is_on_gpu([A, absmax, out]) if out.dtype == torch.float32: - if quant_type == 'fp4': - lib.cdequantize_blockwise_fp32_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n)) + if quant_type == "fp4": + lib.cdequantize_blockwise_fp32_fp4( + get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n) + ) else: - lib.cdequantize_blockwise_fp32_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n)) + lib.cdequantize_blockwise_fp32_nf4( + get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n) + ) elif out.dtype == torch.float16: - if quant_type == 'fp4': - lib.cdequantize_blockwise_fp16_fp4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n)) + if quant_type == "fp4": + lib.cdequantize_blockwise_fp16_fp4( + get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n) + ) else: - lib.cdequantize_blockwise_fp16_nf4(get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n)) + lib.cdequantize_blockwise_fp16_nf4( + get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n) + ) else: raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}") post_call(A.device) - is_transposed = (True if A.shape[0] 
== 1 else False) - if is_transposed: return out.t() - else: return out + is_transposed = True if A.shape[0] == 1 else False + if is_transposed: + return out.t() + else: + return out def quantize(A: Tensor, code: Tensor = None, out: Tensor = None) -> Tensor: @@ -907,7 +998,7 @@ def dequantize( def quantize_no_absmax(A: Tensor, code: Tensor, out: Tensor = None) -> Tensor: - ''' + """ Quantizes input tensor to 8-bit. Quantizes the 32-bit input tensor `A` to the 8-bit output tensor @@ -926,9 +1017,10 @@ def quantize_no_absmax(A: Tensor, code: Tensor, out: Tensor = None) -> Tensor: ------- torch.Tensor: Quantized 8-bit tensor. - ''' + """ prev_device = pre_call(A.device) - if out is None: out = torch.zeros_like(A, dtype=torch.uint8) + if out is None: + out = torch.zeros_like(A, dtype=torch.uint8) is_on_gpu([A, out]) lib.cquantize(get_ptr(code), get_ptr(A), get_ptr(out), ct.c_int(A.numel())) post_call(prev_device) @@ -936,7 +1028,7 @@ def quantize_no_absmax(A: Tensor, code: Tensor, out: Tensor = None) -> Tensor: def dequantize_no_absmax(A: Tensor, code: Tensor, out: Tensor = None) -> Tensor: - ''' + """ Dequantizes the 8-bit tensor to 32-bit. Dequantizes the 8-bit tensor `A` to the 32-bit tensor `out` via @@ -955,9 +1047,10 @@ def dequantize_no_absmax(A: Tensor, code: Tensor, out: Tensor = None) -> Tensor: ------- torch.Tensor: 32-bit output tensor. - ''' + """ prev_device = pre_call(A.device) - if out is None: out = torch.zeros_like(A, dtype=torch.float32) + if out is None: + out = torch.zeros_like(A, dtype=torch.float32) is_on_gpu([code, A, out]) lib.cdequantize(get_ptr(code), get_ptr(A), get_ptr(out), ct.c_int(A.numel())) post_call(prev_device) @@ -1024,16 +1117,17 @@ def optimizer_update_32bit( if max_unorm > 0.0: param_norm = torch.norm(p.data.float()) - optim_func = None if g.dtype == torch.float32: optim_func = str2optimizer32bit[optimizer_name][0] elif g.dtype == torch.float16: optim_func = str2optimizer32bit[optimizer_name][1] - elif (g.dtype == torch.bfloat16 and len(str2optimizer32bit[optimizer_name])==3): + elif g.dtype == torch.bfloat16 and len(str2optimizer32bit[optimizer_name]) == 3: optim_func = str2optimizer32bit[optimizer_name][2] else: - raise ValueError(f"Gradient+optimizer bit data type combination not supported: grad {g.dtype}, optimizer {state1.dtype}") + raise ValueError( + f"Gradient+optimizer bit data type combination not supported: grad {g.dtype}, optimizer {state1.dtype}" + ) is_on_gpu([g, p, state1, state2, unorm_vec]) prev_device = pre_call(g.device) @@ -1053,7 +1147,8 @@ def optimizer_update_32bit( ct.c_float(lr), ct.c_float(gnorm_scale), ct.c_bool(skip_zeros), - ct.c_int32(g.numel())) + ct.c_int32(g.numel()), + ) post_call(prev_device) @@ -1209,7 +1304,6 @@ def optimizer_update_8bit_blockwise( gnorm_scale: float = 1.0, skip_zeros=False, ) -> None: - optim_func = None prev_device = pre_call(g.device) is_on_gpu([g, p, state1, state2, qmap1, qmap2, absmax1, absmax2]) @@ -1217,8 +1311,11 @@ def optimizer_update_8bit_blockwise( optim_func = str2optimizer8bit_blockwise[optimizer_name][0] elif g.dtype == torch.float16 and state1.dtype == torch.uint8: optim_func = str2optimizer8bit_blockwise[optimizer_name][1] - elif (g.dtype == torch.bfloat16 and state1.dtype == torch.uint8 and - len(str2optimizer8bit_blockwise[optimizer_name])==3): + elif ( + g.dtype == torch.bfloat16 + and state1.dtype == torch.uint8 + and len(str2optimizer8bit_blockwise[optimizer_name]) == 3 + ): optim_func = str2optimizer8bit_blockwise[optimizer_name][2] else: raise ValueError( @@ -1250,9 +1347,8 @@ 
def optimizer_update_8bit_blockwise( ) post_call(prev_device) -def percentile_clipping( - grad: Tensor, gnorm_vec: Tensor, step: int, percentile: int = 5 -): + +def percentile_clipping(grad: Tensor, gnorm_vec: Tensor, step: int, percentile: int = 5): """Applies percentile clipping grad: torch.Tensor @@ -1294,9 +1390,7 @@ def percentile_clipping( return current_gnorm, clip_value, gnorm_scale -def histogram_scatter_add_2d( - histogram: Tensor, index1: Tensor, index2: Tensor, source: Tensor -): +def histogram_scatter_add_2d(histogram: Tensor, index1: Tensor, index2: Tensor, source: Tensor): assert len(histogram.shape) == 2 assert histogram.dtype == torch.float32 assert source.dtype == torch.float32 @@ -1313,12 +1407,12 @@ def histogram_scatter_add_2d( is_on_gpu([histogram, index1, index2, source]) lib.chistogram_scatter_add_2d(get_ptr(histogram), get_ptr(index1), get_ptr(index2), get_ptr(source), maxdim1, n) + def check_matmul(A, B, out, transposed_A, transposed_B, expected_type=torch.int8): - if not torch.cuda.is_initialized(): torch.cuda.init() + if not torch.cuda.is_initialized(): + torch.cuda.init() if A.dtype != expected_type or B.dtype != expected_type: - raise TypeError( - f"Expected torch.int8 input tensors A and B, but got {A.dtype} and {B.dtype}" - ) + raise TypeError(f"Expected torch.int8 input tensors A and B, but got {A.dtype} and {B.dtype}") sA = A.shape sB = B.shape @@ -1359,12 +1453,7 @@ def check_matmul(A, B, out, transposed_A, transposed_B, expected_type=torch.int8 sout = out.shape # special case common in backprop if not correct and len(sA) == 3 and len(sB) == 3: - if ( - sout[0] == sA[2] - and sout[1] == sB[2] - and sA[0] == sB[0] - and sA[1] == sB[1] - ): + if sout[0] == sA[2] and sout[1] == sB[2] and sA[0] == sB[0] and sA[1] == sB[1]: correct = True else: if len(sA) == 2 and len(sB) == 2: @@ -1402,15 +1491,9 @@ def check_matmul(A, B, out, transposed_A, transposed_B, expected_type=torch.int8 return sout -def cutlass3_gemm( - A: Tensor, - B: Tensor, - out: Tensor = None, - transposed_A=False, - transposed_B=False, - state=None -): - #sout = check_matmul(A, B, out, transposed_A, transposed_B, expected_type=A.dtype) + +def cutlass3_gemm(A: Tensor, B: Tensor, out: Tensor = None, transposed_A=False, transposed_B=False, state=None): + # sout = check_matmul(A, B, out, transposed_A, transposed_B, expected_type=A.dtype) if state is None: Bshape = B.shape bout = Bshape[1] @@ -1489,15 +1572,15 @@ def cutlass3_gemm( # B^T @ A^T = C^T # [km, nk -> mn] - #lda = ldb = ldc = 1 - #lda = 1 + # lda = ldb = ldc = 1 + # lda = 1 if state is not None: m = Bshape[0] k = Bshape[1] lda = Bshape[0] ldc = Bshape[0] - ldb = (ldb+1)//2 - #print(m, n, k, lda, ldb, ldc) + ldb = (ldb + 1) // 2 + # print(m, n, k, lda, ldb, ldc) is_on_gpu([B, A, out]) m = ct.c_int32(m) n = ct.c_int32(n) @@ -1507,19 +1590,19 @@ def cutlass3_gemm( ldc = ct.c_int32(ldc) if B.dtype == torch.uint8: - lib.cgemm_4bit_inference(m, n, k, get_ptr(A), get_ptr(B), get_ptr(state[0]), get_ptr(out), lda, ldb, ldc, ct.c_int32(state[3])) + lib.cgemm_4bit_inference( + m, n, k, get_ptr(A), get_ptr(B), get_ptr(state[0]), get_ptr(out), lda, ldb, ldc, ct.c_int32(state[3]) + ) elif A.dtype == torch.float32: lib.cgemm_host_fp32(m, n, k, get_ptr(A), get_ptr(B), get_ptr(out), lda, ldb, ldc) elif A.dtype == torch.float16: lib.cgemm_host_fp16(m, n, k, get_ptr(A), get_ptr(B), get_ptr(out), lda, ldb, ldc) else: - raise NotImplementedError(f'Matmul not implemented for data type {A.dtype}') + raise NotImplementedError(f"Matmul not implemented for data 
type {A.dtype}") return out - - def igemm( A: Tensor, B: Tensor, @@ -1604,8 +1687,20 @@ def igemm( # B^T @ A^T = C^T # [km, nk -> mn] is_on_gpu([B, A, out]) - lib.cigemm(ptr, ct.c_bool(transposed_B), ct.c_bool(transposed_A), ct.c_int32(m), ct.c_int32(n), ct.c_int32(k), - get_ptr(B), get_ptr(A), get_ptr(out), ct.c_int32(lda), ct.c_int32(ldb), ct.c_int32(ldc)) + lib.cigemm( + ptr, + ct.c_bool(transposed_B), + ct.c_bool(transposed_A), + ct.c_int32(m), + ct.c_int32(n), + ct.c_int32(k), + get_ptr(B), + get_ptr(A), + get_ptr(out), + ct.c_int32(lda), + ct.c_int32(ldb), + ct.c_int32(ldc), + ) return out @@ -1617,9 +1712,7 @@ def batched_igemm( transposed_B=False, ): if not len(A.shape) == 3 or not len(B.shape) == 3: - raise ValueError( - f"Expected 3-dimensional tensors for bmm, but got shapes A and B: {A.shape} and {B.shape}" - ) + raise ValueError(f"Expected 3-dimensional tensors for bmm, but got shapes A and B: {A.shape} and {B.shape}") sout = check_matmul(A, B, out, transposed_A, transposed_B) if out is None: out = torch.zeros(size=sout, dtype=torch.int32, device=A.device) @@ -1686,9 +1779,24 @@ def batched_igemm( ptr = CUBLAS_Context.get_instance().get_context(A.device) is_on_gpu([B, A, out]) - lib.cbatched_igemm(ptr, ct.c_bool(transposed_B), ct.c_bool(transposed_A), ct.c_int32(m), ct.c_int32(n), ct.c_int32(k), - get_ptr(B), get_ptr(A), get_ptr(out), ct.c_int32(lda), ct.c_int32(ldb), ct.c_int32(ldc), - ct.c_long(strideA), ct.c_long(strideB), ct.c_long(strideC), ct.c_uint32(num_batch)) + lib.cbatched_igemm( + ptr, + ct.c_bool(transposed_B), + ct.c_bool(transposed_A), + ct.c_int32(m), + ct.c_int32(n), + ct.c_int32(k), + get_ptr(B), + get_ptr(A), + get_ptr(out), + ct.c_int32(lda), + ct.c_int32(ldb), + ct.c_int32(ldc), + ct.c_long(strideA), + ct.c_long(strideB), + ct.c_long(strideC), + ct.c_uint32(num_batch), + ) return out @@ -1697,14 +1805,14 @@ def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32): shapeB = SB[0] dimsA = len(shapeA) dimsB = len(shapeB) - assert dimsB == 2, 'Only two dimensional matrices are supported for argument B' + assert dimsB == 2, "Only two dimensional matrices are supported for argument B" if dimsA == 2: m = shapeA[0] elif dimsA == 3: m = shapeA[0] * shapeA[1] rows = n = shapeB[0] - assert prod(list(shapeA)) > 0, f'Input tensor dimensions need to be > 0: {shapeA}' + assert prod(list(shapeA)) > 0, f"Input tensor dimensions need to be > 0: {shapeA}" # if the tensor is empty, return a transformed empty tensor with the right dimensions if shapeA[0] == 0 and dimsA == 2: @@ -1713,13 +1821,9 @@ def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32): return torch.empty(tuple(shapeA[:2] + [shapeB[0]]), device=A.device, dtype=torch.float16) if dimsA == 2 and out is None: - out, Sout = get_transform_buffer( - (shapeA[0], shapeB[0]), dtype, A.device, "col32", "row" - ) + out, Sout = get_transform_buffer((shapeA[0], shapeB[0]), dtype, A.device, "col32", "row") elif dimsA == 3 and out is None: - out, Sout = get_transform_buffer( - (shapeA[0], shapeA[1], shapeB[0]), dtype, A.device, "col32", "row" - ) + out, Sout = get_transform_buffer((shapeA[0], shapeA[1], shapeB[0]), dtype, A.device, "col32", "row") assert dimsB != 3, "len(B.shape)==3 not supported" assert A.device.type == "cuda" @@ -1761,46 +1865,30 @@ def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32): has_error = 0 ptrRowScale = get_ptr(None) is_on_gpu([A, B, out]) - if formatB == 'col_turing': + if formatB == "col_turing": if dtype == torch.int32: - has_error = lib.cigemmlt_turing_32( - 
ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc - ) + has_error = lib.cigemmlt_turing_32(ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc) else: - has_error = lib.cigemmlt_turing_8( - ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc - ) + has_error = lib.cigemmlt_turing_8(ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc) elif formatB == "col_ampere": if dtype == torch.int32: - has_error = lib.cigemmlt_ampere_32( - ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc - ) + has_error = lib.cigemmlt_ampere_32(ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc) else: - has_error = lib.cigemmlt_ampere_8( - ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc - ) + has_error = lib.cigemmlt_ampere_8(ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc) if has_error == 1: - print(f'A: {shapeA}, B: {shapeB}, C: {Sout[0]}; (lda, ldb, ldc): {(lda, ldb, ldc)}; (m, n, k): {(m, n, k)}') - raise Exception('cublasLt ran into an error!') + print(f"A: {shapeA}, B: {shapeB}, C: {Sout[0]}; (lda, ldb, ldc): {(lda, ldb, ldc)}; (m, n, k): {(m, n, k)}") + raise Exception("cublasLt ran into an error!") torch.cuda.set_device(prev_device) return out, Sout -def mm_dequant( - A, - quant_state, - row_stats, - col_stats, - out=None, - new_row_stats=None, - new_col_stats=None, - bias=None -): +def mm_dequant(A, quant_state, row_stats, col_stats, out=None, new_row_stats=None, new_col_stats=None, bias=None): assert A.dtype == torch.int32 - if bias is not None: assert bias.dtype == torch.float16 + if bias is not None: + assert bias.dtype == torch.float16 out_shape = quant_state[0] if len(out_shape) == 3: out_shape = (out_shape[0] * out_shape[1], out_shape[2]) @@ -1808,19 +1896,11 @@ def mm_dequant( if out is None: out = torch.empty(out_shape, dtype=torch.float16, device=A.device) if new_row_stats is None: - new_row_stats = torch.empty( - out_shape[0], dtype=torch.float32, device=A.device - ) + new_row_stats = torch.empty(out_shape[0], dtype=torch.float32, device=A.device) if new_col_stats is None: - new_col_stats = torch.empty( - out_shape[1], dtype=torch.float32, device=A.device - ) - assert ( - new_row_stats.shape[0] == row_stats.shape[0] - ), f"{new_row_stats.shape} vs {row_stats.shape}" - assert ( - new_col_stats.shape[0] == col_stats.shape[0] - ), f"{new_col_stats.shape} vs {col_stats.shape}" + new_col_stats = torch.empty(out_shape[1], dtype=torch.float32, device=A.device) + assert new_row_stats.shape[0] == row_stats.shape[0], f"{new_row_stats.shape} vs {row_stats.shape}" + assert new_col_stats.shape[0] == col_stats.shape[0], f"{new_col_stats.shape} vs {col_stats.shape}" prev_device = pre_call(A.device) ptrA = get_ptr(A) @@ -1834,15 +1914,15 @@ def mm_dequant( numCols = ct.c_int32(out_shape[1]) is_on_gpu([A, row_stats, col_stats, out, new_row_stats, new_col_stats, bias]) - lib.cdequant_mm_int32_fp16(ptrA, ptrRowStats, ptrColStats, ptrOut, ptrNewRowStats, ptrNewColStats, ptrBias, numRows, numCols) + lib.cdequant_mm_int32_fp16( + ptrA, ptrRowStats, ptrColStats, ptrOut, ptrNewRowStats, ptrNewColStats, ptrBias, numRows, numCols + ) post_call(prev_device) return out -def get_colrow_absmax( - A, row_stats=None, col_stats=None, nnz_block_ptr=None, threshold=0.0 -): +def get_colrow_absmax(A, row_stats=None, col_stats=None, nnz_block_ptr=None, threshold=0.0): assert A.dtype == torch.float16 device = A.device @@ -1855,18 +1935,12 @@ def get_colrow_absmax( col_tiles = (cols + 255) // 256 tiled_rows = ((rows + 15) // 16) * 16 if row_stats is None: - row_stats = 
torch.empty( - (rows,), dtype=torch.float32, device=device - ).fill_(-50000.0) + row_stats = torch.empty((rows,), dtype=torch.float32, device=device).fill_(-50000.0) if col_stats is None: - col_stats = torch.empty( - (cols,), dtype=torch.float32, device=device - ).fill_(-50000.0) + col_stats = torch.empty((cols,), dtype=torch.float32, device=device).fill_(-50000.0) if nnz_block_ptr is None and threshold > 0.0: - nnz_block_ptr = torch.zeros( - ((tiled_rows * col_tiles) + 1,), dtype=torch.int32, device=device - ) + nnz_block_ptr = torch.zeros(((tiled_rows * col_tiles) + 1,), dtype=torch.int32, device=device) ptrA = get_ptr(A) ptrRowStats = get_ptr(row_stats) @@ -1940,14 +2014,10 @@ def __init__(self, rows, cols, nnz, colptr, rowidx, values): def coo2csr(cooA): values, counts = torch.unique(cooA.rowidx, return_counts=True) values.add_(1) - rowptr = torch.zeros( - (cooA.rows + 1,), dtype=torch.int32, device=cooA.rowidx.device - ) + rowptr = torch.zeros((cooA.rows + 1,), dtype=torch.int32, device=cooA.rowidx.device) rowptr.scatter_(index=values.long(), src=counts.int(), dim=0) rowptr.cumsum_(0) - return CSRSparseTensor( - cooA.rows, cooA.cols, cooA.nnz, rowptr, cooA.colidx, cooA.values - ) + return CSRSparseTensor(cooA.rows, cooA.cols, cooA.nnz, rowptr, cooA.colidx, cooA.values) def coo2csc(cooA): @@ -1956,14 +2026,10 @@ def coo2csc(cooA): values = cooA.values[col2rowidx] colvalues, counts = torch.unique(val, return_counts=True) colvalues.add_(1) - colptr = torch.zeros( - (cooA.cols + 1,), dtype=torch.int32, device=cooA.colidx.device - ) + colptr = torch.zeros((cooA.cols + 1,), dtype=torch.int32, device=cooA.colidx.device) colptr.scatter_(index=colvalues.long(), src=counts.int(), dim=0) colptr.cumsum_(0) - return CSCSparseTensor( - cooA.rows, cooA.cols, cooA.nnz, colptr, rowidx, values - ) + return CSCSparseTensor(cooA.rows, cooA.cols, cooA.nnz, colptr, rowidx, values) def coo_zeros(rows, cols, nnz, device, dtype=torch.half): @@ -1973,9 +2039,7 @@ def coo_zeros(rows, cols, nnz, device, dtype=torch.half): return COOSparseTensor(rows, cols, nnz, rowidx, colidx, values) -def double_quant( - A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0 -): +def double_quant(A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0): device = A.device assert A.dtype == torch.half assert device.type == "cuda" @@ -1988,9 +2052,7 @@ def double_quant( rows = A.shape[0] if row_stats is None or col_stats is None: - row_stats, col_stats, nnz_row_ptr = get_colrow_absmax( - A, threshold=threshold - ) + row_stats, col_stats, nnz_row_ptr = get_colrow_absmax(A, threshold=threshold) if out_col is None: out_col = torch.zeros(A.shape, device=device, dtype=torch.int8) @@ -2008,9 +2070,7 @@ def double_quant( if threshold > 0.0: nnz = nnz_row_ptr[-1].item() if nnz > 0: - coo_tensor = coo_zeros( - A.shape[0], A.shape[1], nnz_row_ptr[-1].item(), device - ) + coo_tensor = coo_zeros(A.shape[0], A.shape[1], nnz_row_ptr[-1].item(), device) ptrRowIdx = get_ptr(coo_tensor.rowidx) ptrColIdx = get_ptr(coo_tensor.colidx) ptrVal = get_ptr(coo_tensor.values) @@ -2069,12 +2129,16 @@ def double_quant( return out_row, out_col, row_stats, col_stats, coo_tensor -def transform(A, to_order, from_order='row', out=None, transpose=False, state=None, ld=None): +def transform(A, to_order, from_order="row", out=None, transpose=False, state=None, ld=None): prev_device = pre_call(A.device) - if state is None: state = (A.shape, from_order) - else: from_order = state[1] - if out is None: out, new_state = 
get_transform_buffer(state[0], A.dtype, A.device, to_order, state[1], transpose) - else: new_state = (state[0], to_order) # (shape, order) + if state is None: + state = (A.shape, from_order) + else: + from_order = state[1] + if out is None: + out, new_state = get_transform_buffer(state[0], A.dtype, A.device, to_order, state[1], transpose) + else: + new_state = (state[0], to_order) # (shape, order) shape = state[0] if len(shape) == 2: @@ -2085,7 +2149,7 @@ def transform(A, to_order, from_order='row', out=None, transpose=False, state=No dim2 = ct.c_int32(shape[2]) is_on_gpu([A, out]) - if to_order == 'col32': + if to_order == "col32": if transpose: lib.ctransform_row2col32T(get_ptr(A), get_ptr(out), dim1, dim2) else: @@ -2106,7 +2170,7 @@ def transform(A, to_order, from_order='row', out=None, transpose=False, state=No elif from_order == "col_ampere": lib.ctransform_ampere2row(get_ptr(A), get_ptr(out), dim1, dim2) else: - raise NotImplementedError(f'Transform function not implemented: From {from_order} to {to_order}') + raise NotImplementedError(f"Transform function not implemented: From {from_order} to {to_order}") post_call(prev_device) @@ -2115,9 +2179,7 @@ def transform(A, to_order, from_order='row', out=None, transpose=False, state=No def spmm_coo(cooA, B, out=None): if out is None: - out = torch.empty( - (cooA.rows, B.shape[1]), device=B.device, dtype=B.dtype - ) + out = torch.empty((cooA.rows, B.shape[1]), device=B.device, dtype=B.dtype) nnz = cooA.nnz assert cooA.rowidx.numel() == nnz assert cooA.colidx.numel() == nnz @@ -2144,16 +2206,28 @@ def spmm_coo(cooA, B, out=None): cldc = ct.c_int32(ldc) is_on_gpu([cooA.rowidx, cooA.colidx, cooA.values, B, out]) - lib.cspmm_coo(ptr, ptrRowidx, ptrColidx, ptrValues, cnnz, crowsA, ccolsA, ccolsB, cldb, ptrB, cldc, ptrC, ct.c_bool(transposed_B)) + lib.cspmm_coo( + ptr, + ptrRowidx, + ptrColidx, + ptrValues, + cnnz, + crowsA, + ccolsA, + ccolsB, + cldb, + ptrB, + cldc, + ptrC, + ct.c_bool(transposed_B), + ) return out def spmm_coo_very_sparse(cooA, B, dequant_stats=None, out=None): if out is None: - out = torch.zeros( - (cooA.rows, B.shape[1]), device=B.device, dtype=cooA.values.dtype - ) + out = torch.zeros((cooA.rows, B.shape[1]), device=B.device, dtype=cooA.values.dtype) nnz = cooA.nnz prev_device = pre_call(B.device) assert cooA.rowidx.numel() == nnz @@ -2171,9 +2245,7 @@ def spmm_coo_very_sparse(cooA, B, dequant_stats=None, out=None): max_count, max_idx = torch.sort(counts, descending=True) max_idx = max_idx.int() max_count = max_count.int() - assert ( - max_count[0] <= 32 - ), f"Current max count per row is 8 but found {max_count[0]}." + assert max_count[0] <= 32, f"Current max count per row is 8 but found {max_count[0]}." 
assert B.dtype in [torch.float16, torch.int8] ptrOffset = get_ptr(offset) ptrMaxCount = get_ptr(max_count) @@ -2261,9 +2333,7 @@ def vectorwise_quant(x, dim=1, quant_type="vector"): elif quant_type in ["vector-zeropoint", "row-zeropoint"]: dtype = x.dtype x = x.float() - dyna = torch.amax(x, dim=dim, keepdim=True) - torch.amin( - x, dim=dim, keepdim=True - ) + dyna = torch.amax(x, dim=dim, keepdim=True) - torch.amin(x, dim=dim, keepdim=True) dyna[dyna == 0] = 1 qx = 255.0 / dyna minx = torch.amin(x, dim=dim, keepdim=True) @@ -2371,9 +2441,7 @@ def extract_outliers(A, SA, idx): assert formatA in ["col_turing", "col_ampere"] assert A.device.type == "cuda" - out = torch.zeros( - (shapeA[0], idx.numel()), dtype=torch.int8, device=A.device - ) + out = torch.zeros((shapeA[0], idx.numel()), dtype=torch.int8, device=A.device) idx_size = ct.c_int32(idx.numel()) rows = ct.c_int32(shapeA[0]) @@ -2383,7 +2451,7 @@ def extract_outliers(A, SA, idx): ptrOut = get_ptr(out) prev_device = pre_call(A.device) - if formatA == 'col_turing': + if formatA == "col_turing": lib.cextractOutliers_turing(ptrA, ptrIdx, ptrOut, idx_size, rows, cols) elif formatA == "col_ampere": lib.cextractOutliers_ampere(ptrA, ptrIdx, ptrOut, idx_size, rows, cols) @@ -2391,6 +2459,7 @@ def extract_outliers(A, SA, idx): return out + def pipeline_test(A, batch_size): out = torch.zeros_like(A) lib.cpipeline_test(get_ptr(A), get_ptr(out), ct.c_size_t(A.numel()), ct.c_size_t(batch_size)) diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py index b4962c1a0..b0a790e70 100644 --- a/bitsandbytes/cuda_setup/main.py +++ b/bitsandbytes/cuda_setup/main.py @@ -17,25 +17,32 @@ """ import ctypes as ct -import os import errno -import torch -from warnings import warn -from itertools import product - +import os from pathlib import Path from typing import Set, Union +from warnings import warn + +import torch + from .env_vars import get_potentially_lib_path_containing_env_vars # these are the most common libs names # libcudart.so is missing by default for a conda install with PyTorch 2.0 and instead # we have libcudart.so.11.0 which causes a lot of errors before # not sure if libcudart.so.12.0 exists in pytorch installs, but it does not hurt -CUDA_RUNTIME_LIBS: list = ["libcudart.so", 'libcudart.so.11.0', 'libcudart.so.12.0', 'libcudart.so.12.1', 'libcudart.so.12.2'] +CUDA_RUNTIME_LIBS: list = [ + "libcudart.so", + "libcudart.so.11.0", + "libcudart.so.12.0", + "libcudart.so.12.1", + "libcudart.so.12.2", +] # this is a order list of backup paths to search CUDA in, if it cannot be found in the main environmental paths backup_paths = [] -backup_paths.append('$CONDA_PREFIX/lib/libcudart.so.11.0') +backup_paths.append("$CONDA_PREFIX/lib/libcudart.so.11.0") + class CUDASetup: _instance = None @@ -44,59 +51,89 @@ def __init__(self): raise RuntimeError("Call get_instance() instead") def generate_instructions(self): - if getattr(self, 'error', False): return + if getattr(self, "error", False): + return print(self.error) self.error = True if not self.cuda_available: - self.add_log_entry('CUDA SETUP: Problem: The main issue seems to be that the main CUDA library was not detected or CUDA not installed.') - self.add_log_entry('CUDA SETUP: Solution 1): Your paths are probably not up-to-date. 
You can update them via: sudo ldconfig.') - self.add_log_entry('CUDA SETUP: Solution 2): If you do not have sudo rights, you can do the following:') - self.add_log_entry('CUDA SETUP: Solution 2a): Find the cuda library via: find / -name libcuda.so 2>/dev/null') - self.add_log_entry('CUDA SETUP: Solution 2b): Once the library is found add it to the LD_LIBRARY_PATH: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:FOUND_PATH_FROM_2a') - self.add_log_entry('CUDA SETUP: Solution 2c): For a permanent solution add the export from 2b into your .bashrc file, located at ~/.bashrc') - self.add_log_entry('CUDA SETUP: Solution 3): For a missing CUDA runtime library (libcudart.so), use `find / -name libcudart.so* and follow with step (2b)') + self.add_log_entry( + "CUDA SETUP: Problem: The main issue seems to be that the main CUDA library was not detected or CUDA not installed." + ) + self.add_log_entry( + "CUDA SETUP: Solution 1): Your paths are probably not up-to-date. You can update them via: sudo ldconfig." + ) + self.add_log_entry("CUDA SETUP: Solution 2): If you do not have sudo rights, you can do the following:") + self.add_log_entry( + "CUDA SETUP: Solution 2a): Find the cuda library via: find / -name libcuda.so 2>/dev/null" + ) + self.add_log_entry( + "CUDA SETUP: Solution 2b): Once the library is found add it to the LD_LIBRARY_PATH: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:FOUND_PATH_FROM_2a" + ) + self.add_log_entry( + "CUDA SETUP: Solution 2c): For a permanent solution add the export from 2b into your .bashrc file, located at ~/.bashrc" + ) + self.add_log_entry( + "CUDA SETUP: Solution 3): For a missing CUDA runtime library (libcudart.so), use `find / -name libcudart.so* and follow with step (2b)" + ) return if self.cudart_path is None: - self.add_log_entry('CUDA SETUP: Problem: The main issue seems to be that the main CUDA runtime library was not detected.') - self.add_log_entry('CUDA SETUP: Solution 1: To solve the issue the libcudart.so location needs to be added to the LD_LIBRARY_PATH variable') - self.add_log_entry('CUDA SETUP: Solution 1a): Find the cuda runtime library via: find / -name libcudart.so 2>/dev/null') - self.add_log_entry('CUDA SETUP: Solution 1b): Once the library is found add it to the LD_LIBRARY_PATH: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:FOUND_PATH_FROM_1a') - self.add_log_entry('CUDA SETUP: Solution 1c): For a permanent solution add the export from 1b into your .bashrc file, located at ~/.bashrc') - self.add_log_entry('CUDA SETUP: Solution 2: If no library was found in step 1a) you need to install CUDA.') - self.add_log_entry('CUDA SETUP: Solution 2a): Download CUDA install script: wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/cuda_install.sh') - self.add_log_entry('CUDA SETUP: Solution 2b): Install desired CUDA version to desired location. The syntax is bash cuda_install.sh CUDA_VERSION PATH_TO_INSTALL_INTO.') - self.add_log_entry('CUDA SETUP: Solution 2b): For example, "bash cuda_install.sh 113 ~/local/" will download CUDA 11.3 and install into the folder ~/local') + self.add_log_entry( + "CUDA SETUP: Problem: The main issue seems to be that the main CUDA runtime library was not detected." 
+ ) + self.add_log_entry( + "CUDA SETUP: Solution 1: To solve the issue the libcudart.so location needs to be added to the LD_LIBRARY_PATH variable" + ) + self.add_log_entry( + "CUDA SETUP: Solution 1a): Find the cuda runtime library via: find / -name libcudart.so 2>/dev/null" + ) + self.add_log_entry( + "CUDA SETUP: Solution 1b): Once the library is found add it to the LD_LIBRARY_PATH: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:FOUND_PATH_FROM_1a" + ) + self.add_log_entry( + "CUDA SETUP: Solution 1c): For a permanent solution add the export from 1b into your .bashrc file, located at ~/.bashrc" + ) + self.add_log_entry("CUDA SETUP: Solution 2: If no library was found in step 1a) you need to install CUDA.") + self.add_log_entry( + "CUDA SETUP: Solution 2a): Download CUDA install script: wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/cuda_install.sh" + ) + self.add_log_entry( + "CUDA SETUP: Solution 2b): Install desired CUDA version to desired location. The syntax is bash cuda_install.sh CUDA_VERSION PATH_TO_INSTALL_INTO." + ) + self.add_log_entry( + 'CUDA SETUP: Solution 2b): For example, "bash cuda_install.sh 113 ~/local/" will download CUDA 11.3 and install into the folder ~/local' + ) return - make_cmd = f'CUDA_VERSION={self.cuda_version_string}' + make_cmd = f"CUDA_VERSION={self.cuda_version_string}" if len(self.cuda_version_string) < 3: - make_cmd += ' make cuda92' - elif self.cuda_version_string == '110': - make_cmd += ' make cuda110' - elif self.cuda_version_string[:2] == '11' and int(self.cuda_version_string[2]) > 0: - make_cmd += ' make cuda11x' - elif self.cuda_version_string[:2] == '12' and 1 >= int(self.cuda_version_string[2]) >= 0: - make_cmd += ' make cuda12x' - elif self.cuda_version_string == '100': - self.add_log_entry('CUDA SETUP: CUDA 10.0 not supported. Please use a different CUDA version.') - self.add_log_entry('CUDA SETUP: Before you try again running bitsandbytes, make sure old CUDA 10.0 versions are uninstalled and removed from $LD_LIBRARY_PATH variables.') + make_cmd += " make cuda92" + elif self.cuda_version_string == "110": + make_cmd += " make cuda110" + elif self.cuda_version_string[:2] == "11" and int(self.cuda_version_string[2]) > 0: + make_cmd += " make cuda11x" + elif self.cuda_version_string[:2] == "12" and 1 >= int(self.cuda_version_string[2]) >= 0: + make_cmd += " make cuda12x" + elif self.cuda_version_string == "100": + self.add_log_entry("CUDA SETUP: CUDA 10.0 not supported. Please use a different CUDA version.") + self.add_log_entry( + "CUDA SETUP: Before you try again running bitsandbytes, make sure old CUDA 10.0 versions are uninstalled and removed from $LD_LIBRARY_PATH variables." + ) return - has_cublaslt = is_cublasLt_compatible(self.cc) if not has_cublaslt: - make_cmd += '_nomatmul' + make_cmd += "_nomatmul" - self.add_log_entry('CUDA SETUP: Something unexpected happened. Please compile from source:') - self.add_log_entry('git clone https://github.com/TimDettmers/bitsandbytes.git') - self.add_log_entry('cd bitsandbytes') + self.add_log_entry("CUDA SETUP: Something unexpected happened. 
Please compile from source:") + self.add_log_entry("git clone https://github.com/TimDettmers/bitsandbytes.git") + self.add_log_entry("cd bitsandbytes") self.add_log_entry(make_cmd) - self.add_log_entry('python setup.py install') + self.add_log_entry("python setup.py install") def initialize(self): - if not getattr(self, 'initialized', False): + if not getattr(self, "initialized", False): self.has_printed = False self.lib = None self.initialized = False @@ -104,16 +141,18 @@ def initialize(self): def manual_override(self): if torch.cuda.is_available(): - if 'BNB_CUDA_VERSION' in os.environ: - if len(os.environ['BNB_CUDA_VERSION']) > 0: - warn((f'\n\n{"="*80}\n' - 'WARNING: Manual override via BNB_CUDA_VERSION env variable detected!\n' - 'BNB_CUDA_VERSION=XXX can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.\n' - 'If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=\n' - 'If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH\n' - 'For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH: 0: + warn( + f'\n\n{"="*80}\n' + 'WARNING: Manual override via BNB_CUDA_VERSION env variable detected!\n' + 'BNB_CUDA_VERSION=XXX can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.\n' + 'If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=\n' + 'If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH\n' + 'For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH: Set[Path]: return {Path(ld_path) for ld_path in paths_list_candidate.split(":") if ld_path} @@ -202,7 +253,7 @@ def remove_non_existent_dirs(candidate_paths: Set[Path]) -> Set[Path]: if path.exists(): existent_directories.add(path) except PermissionError as pex: - # Handle the PermissionError first as it is a subtype of OSError + # Handle the PermissionError first as it is a subtype of OSError # https://docs.python.org/3/library/exceptions.html#exception-hierarchy pass except OSError as exc: @@ -211,8 +262,11 @@ def remove_non_existent_dirs(candidate_paths: Set[Path]) -> Set[Path]: non_existent_directories: Set[Path] = candidate_paths - existent_directories if non_existent_directories: - CUDASetup.get_instance().add_log_entry("The following directories listed in your path were found to " - f"be non-existent: {non_existent_directories}", is_warning=False) + CUDASetup.get_instance().add_log_entry( + "The following directories listed in your path were found to " + f"be non-existent: {non_existent_directories}", + is_warning=False, + ) return existent_directories @@ -238,9 +292,7 @@ def resolve_paths_list(paths_list_candidate: str) -> Set[Path]: def find_cuda_lib_in(paths_list_candidate: str) -> Set[Path]: - return get_cuda_runtime_lib_paths( - resolve_paths_list(paths_list_candidate) - ) + return get_cuda_runtime_lib_paths(resolve_paths_list(paths_list_candidate)) def warn_in_case_of_duplicates(results_paths: Set[Path]) -> None: @@ -248,27 +300,28 @@ def warn_in_case_of_duplicates(results_paths: Set[Path]) -> None: warning_msg = ( f"Found duplicate {CUDA_RUNTIME_LIBS} files: {results_paths}.. " "We select the PyTorch default libcudart.so, which is {torch.version.cuda}," - "but this might missmatch with the CUDA version that is needed for bitsandbytes." + "but this might mismatch with the CUDA version that is needed for bitsandbytes." 
"To override this behavior set the BNB_CUDA_VERSION= environmental variable" "For example, if you want to use the CUDA version 122" "BNB_CUDA_VERSION=122 python ..." "OR set the environmental variable in your .bashrc: export BNB_CUDA_VERSION=122" "In the case of a manual override, make sure you set the LD_LIBRARY_PATH, e.g." - "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.2") + "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.2" + ) CUDASetup.get_instance().add_log_entry(warning_msg, is_warning=True) def determine_cuda_runtime_lib_path() -> Union[Path, None]: """ - Searches for a cuda installations, in the following order of priority: - 1. active conda env - 2. LD_LIBRARY_PATH - 3. any other env vars, while ignoring those that - - are known to be unrelated (see `bnb.cuda_setup.env_vars.to_be_ignored`) - - don't contain the path separator `/` - - If multiple libraries are found in part 3, we optimistically try one, - while giving a warning message. + Searches for a cuda installations, in the following order of priority: + 1. active conda env + 2. LD_LIBRARY_PATH + 3. any other env vars, while ignoring those that + - are known to be unrelated (see `bnb.cuda_setup.env_vars.to_be_ignored`) + - don't contain the path separator `/` + + If multiple libraries are found in part 3, we optimistically try one, + while giving a warning message. """ candidate_env_vars = get_potentially_lib_path_containing_env_vars() @@ -282,8 +335,11 @@ def determine_cuda_runtime_lib_path() -> Union[Path, None]: if conda_cuda_libs: cuda_runtime_libs.update(conda_cuda_libs) - CUDASetup.get_instance().add_log_entry(f'{candidate_env_vars["CONDA_PREFIX"]} did not contain ' - f'{CUDA_RUNTIME_LIBS} as expected! Searching further paths...', is_warning=True) + CUDASetup.get_instance().add_log_entry( + f'{candidate_env_vars["CONDA_PREFIX"]} did not contain ' + f'{CUDA_RUNTIME_LIBS} as expected! Searching further paths...', + is_warning=True, + ) if "LD_LIBRARY_PATH" in candidate_env_vars: lib_ld_cuda_libs = find_cuda_lib_in(candidate_env_vars["LD_LIBRARY_PATH"]) @@ -292,11 +348,15 @@ def determine_cuda_runtime_lib_path() -> Union[Path, None]: cuda_runtime_libs.update(lib_ld_cuda_libs) warn_in_case_of_duplicates(lib_ld_cuda_libs) - CUDASetup.get_instance().add_log_entry(f'{candidate_env_vars["LD_LIBRARY_PATH"]} did not contain ' - f'{CUDA_RUNTIME_LIBS} as expected! Searching further paths...', is_warning=True) + CUDASetup.get_instance().add_log_entry( + f'{candidate_env_vars["LD_LIBRARY_PATH"]} did not contain ' + f'{CUDA_RUNTIME_LIBS} as expected! Searching further paths...', + is_warning=True, + ) remaining_candidate_env_vars = { - env_var: value for env_var, value in candidate_env_vars.items() + env_var: value + for env_var, value in candidate_env_vars.items() if env_var not in {"CONDA_PREFIX", "LD_LIBRARY_PATH"} } @@ -305,13 +365,15 @@ def determine_cuda_runtime_lib_path() -> Union[Path, None]: cuda_runtime_libs.update(find_cuda_lib_in(value)) if len(cuda_runtime_libs) == 0: - CUDASetup.get_instance().add_log_entry('CUDA_SETUP: WARNING! libcudart.so not found in any environmental path. Searching in backup paths...') - cuda_runtime_libs.update(find_cuda_lib_in('/usr/local/cuda/lib64')) + CUDASetup.get_instance().add_log_entry( + "CUDA_SETUP: WARNING! libcudart.so not found in any environmental path. Searching in backup paths..." 
+ ) + cuda_runtime_libs.update(find_cuda_lib_in("/usr/local/cuda/lib64")) warn_in_case_of_duplicates(cuda_runtime_libs) cuda_setup = CUDASetup.get_instance() - cuda_setup.add_log_entry(f'DEBUG: Possible options found for libcudart.so: {cuda_runtime_libs}') + cuda_setup.add_log_entry(f"DEBUG: Possible options found for libcudart.so: {cuda_runtime_libs}") return next(iter(cuda_runtime_libs)) if cuda_runtime_libs else None @@ -321,9 +383,12 @@ def get_cuda_version(): major, minor = map(int, torch.version.cuda.split(".")) if major < 11: - CUDASetup.get_instance().add_log_entry('CUDA SETUP: CUDA version lower than 11 are currently not supported for LLM.int8(). You will be only to use 8-bit optimizers and quantization routines!!') + CUDASetup.get_instance().add_log_entry( + "CUDA SETUP: CUDA version lower than 11 are currently not supported for LLM.int8(). You will be only to use 8-bit optimizers and quantization routines!!" + ) + + return f"{major}{minor}" - return f'{major}{minor}' def get_compute_capabilities(): ccs = [] @@ -338,25 +403,34 @@ def get_compute_capabilities(): def evaluate_cuda_setup(): cuda_setup = CUDASetup.get_instance() - if 'BITSANDBYTES_NOWELCOME' not in os.environ or str(os.environ['BITSANDBYTES_NOWELCOME']) == '0': - cuda_setup.add_log_entry('') - cuda_setup.add_log_entry('='*35 + 'BUG REPORT' + '='*35) - cuda_setup.add_log_entry(('Welcome to bitsandbytes. For bug reports, please run\n\npython -m bitsandbytes\n\n'), - ('and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues')) - cuda_setup.add_log_entry('='*80) - if not torch.cuda.is_available(): return 'libbitsandbytes_cpu.so', None, None, None - if torch.version.hip: return 'libbitsandbytes_hip_nohipblaslt.so', None, None, None + if "BITSANDBYTES_NOWELCOME" not in os.environ or str(os.environ["BITSANDBYTES_NOWELCOME"]) == "0": + cuda_setup.add_log_entry("") + cuda_setup.add_log_entry("=" * 35 + "BUG REPORT" + "=" * 35) + cuda_setup.add_log_entry( + ("Welcome to bitsandbytes. For bug reports, please run\n\npython -m bitsandbytes\n\n"), + ( + "and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues" + ), + ) + cuda_setup.add_log_entry("=" * 80) + if not torch.cuda.is_available(): + return "libbitsandbytes_cpu.so", None, None, None + if torch.version.hip: + return "libbitsandbytes_hip_nohipblaslt.so", None, None, None cudart_path = determine_cuda_runtime_lib_path() ccs = get_compute_capabilities() ccs.sort() - cc = ccs[-1] # we take the highest capability + cc = ccs[-1] # we take the highest capability cuda_version_string = get_cuda_version() - cuda_setup.add_log_entry(f"CUDA SETUP: PyTorch settings found: CUDA_VERSION={cuda_version_string}, Highest Compute Capability: {cc}.") - cuda_setup.add_log_entry(f"CUDA SETUP: To manually override the PyTorch CUDA version please see:" - "https://github.com/TimDettmers/bitsandbytes/blob/main/how_to_use_nonpytorch_cuda.md") - + cuda_setup.add_log_entry( + f"CUDA SETUP: PyTorch settings found: CUDA_VERSION={cuda_version_string}, Highest Compute Capability: {cc}." 
+ ) + cuda_setup.add_log_entry( + "CUDA SETUP: To manually override the PyTorch CUDA version please see:" + "https://github.com/TimDettmers/bitsandbytes/blob/main/how_to_use_nonpytorch_cuda.md" + ) # 7.5 is the minimum CC vor cublaslt has_cublaslt = is_cublasLt_compatible(cc) diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index 4f7bba4ee..37728bb4a 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -14,7 +14,7 @@ from bitsandbytes.utils import pack_dict_to_tensor, unpack_tensor_to_dict -from .cextension import lib, HIP_ENVIRONMENT +from .cextension import HIP_ENVIRONMENT, lib # math.prod not compatible with python < 3.8 @@ -160,7 +160,7 @@ def __init__(self): raise RuntimeError("Call get_instance() instead") def initialize(self): - #self.context = ct.c_void_p(lib.get_cusparse()) + # self.context = ct.c_void_p(lib.get_cusparse()) if torch.version.cuda: self.context = ct.c_void_p(lib.get_cusparse()) elif torch.version.hip: @@ -528,8 +528,8 @@ def nvidia_transform( ld=None, ): if HIP_ENVIRONMENT: - to_order = "col" if to_order in ["col32","col_turing","col_ampere"] else to_order - from_order = "col" if from_order in ["col32","col_turing","col_ampere"] else from_order + to_order = "col" if to_order in ["col32", "col_turing", "col_ampere"] else to_order + from_order = "col" if from_order in ["col32", "col_turing", "col_ampere"] else from_order if state is None: state = (A.shape, from_order) @@ -850,7 +850,7 @@ def quantize_blockwise( if out is None: out = torch.zeros_like(A, dtype=torch.uint8) - if A.device.type != 'cpu': + if A.device.type != "cpu": if not HIP_ENVIRONMENT: assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64] else: @@ -1291,7 +1291,7 @@ def dequantize_fp4( quant_state: Optional[QuantState] = None, absmax: Optional[torch.Tensor] = None, out: Optional[torch.Tensor] = None, - blocksize: int = None, + blocksize: Optional[int] = None, ) -> Tensor: if blocksize is None: blocksize = 64 if not HIP_ENVIRONMENT else 128 @@ -1304,7 +1304,7 @@ def dequantize_nf4( quant_state: Optional[QuantState] = None, absmax: Optional[torch.Tensor] = None, out: Optional[torch.Tensor] = None, - blocksize: int = None, + blocksize: Optional[int] = None, ) -> Tensor: if blocksize is None: blocksize = 64 if not HIP_ENVIRONMENT else 128 @@ -1317,7 +1317,7 @@ def dequantize_4bit( quant_state: Optional[QuantState] = None, absmax: Optional[torch.Tensor] = None, out: Optional[torch.Tensor] = None, - blocksize: int = None, + blocksize: Optional[int] = None, quant_type="fp4", ) -> Tensor: """ @@ -1348,7 +1348,7 @@ def dequantize_4bit( """ if blocksize is None: blocksize = 64 if not HIP_ENVIRONMENT else 128 - + supported_blocksizes = [2048, 4096, 1024, 512, 256, 128, 64] if HIP_ENVIRONMENT: supported_blocksizes = supported_blocksizes[:-1] @@ -2368,7 +2368,7 @@ def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32): has_error = 0 ptrRowScale = get_ptr(None) is_on_gpu([A, B, out]) - if formatB == 'col_turing' or HIP_ENVIRONMENT: + if formatB == "col_turing" or HIP_ENVIRONMENT: if dtype == torch.int32: has_error = lib.cigemmlt_turing_32(ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc) else: @@ -2393,7 +2393,7 @@ def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32): def mm_dequant(A, quant_state, row_stats, col_stats, out=None, new_row_stats=None, new_col_stats=None, bias=None): if HIP_ENVIRONMENT: - A, quant_state = nvidia_transform(A, "row", state = quant_state) + A, quant_state = nvidia_transform(A, "row", state=quant_state) assert 
A.dtype == torch.int32 if bias is not None: assert bias.dtype == torch.float16 @@ -2645,9 +2645,9 @@ def double_quant(A, col_stats=None, row_stats=None, out_col=None, out_row=None, return out_row, out_col, row_stats, col_stats, coo_tensor -def transform(A, to_order, from_order='row', out=None, transpose=False, state=None, ld=None): +def transform(A, to_order, from_order="row", out=None, transpose=False, state=None, ld=None): if HIP_ENVIRONMENT: - return nvidia_transform(A,to_order,from_order,out,transpose,state,ld) + return nvidia_transform(A, to_order, from_order, out, transpose, state, ld) prev_device = pre_call(A.device) if state is None: @@ -2973,7 +2973,7 @@ def extract_outliers(A, SA, idx): ptrOut = get_ptr(out) prev_device = pre_call(A.device) - if formatA == 'col_turing' or HIP_ENVIRONMENT: + if formatA == "col_turing" or HIP_ENVIRONMENT: lib.cextractOutliers_turing(ptrA, ptrIdx, ptrOut, idx_size, rows, cols) elif formatA == "col_ampere": lib.cextractOutliers_ampere(ptrA, ptrIdx, ptrOut, idx_size, rows, cols) diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index ad2579664..3684badf6 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -12,12 +12,11 @@ import bitsandbytes as bnb from bitsandbytes.autograd._functions import get_tile_inds, undo_layout +from bitsandbytes.cextension import HIP_ENVIRONMENT from bitsandbytes.functional import QuantState from bitsandbytes.optim import GlobalOptimManager from bitsandbytes.utils import OutlierTracer -from bitsandbytes.cextension import HIP_ENVIRONMENT - T = TypeVar("T", bound="torch.nn.Module") @@ -212,7 +211,7 @@ def __new__( data: Optional[torch.Tensor] = None, requires_grad=False, # quantized weights should be frozen by default quant_state: Optional[QuantState] = None, - blocksize: int = None, + blocksize: Optional[int] = None, compress_statistics: bool = True, quant_type: str = "fp4", quant_storage: torch.dtype = torch.uint8, @@ -221,7 +220,7 @@ def __new__( ) -> "Params4bit": if data is None: data = torch.empty(0) - + if blocksize is None: blocksize = 64 if not HIP_ENVIRONMENT else 128 diff --git a/bitsandbytes/research/autograd/_functions.py b/bitsandbytes/research/autograd/_functions.py index 9598bb1e3..e5655b546 100644 --- a/bitsandbytes/research/autograd/_functions.py +++ b/bitsandbytes/research/autograd/_functions.py @@ -6,9 +6,9 @@ import torch from bitsandbytes.autograd._functions import GlobalOutlierPooler, MatmulLtState +from bitsandbytes.cextension import HIP_ENVIRONMENT import bitsandbytes.functional as F -from bitsandbytes.cextension import HIP_ENVIRONMENT # math.prod not compatible with python < 3.8 def prod(iterable): diff --git a/csrc/kernels.hip b/csrc/kernels.hip index dd7011f6b..6ff643a07 100644 --- a/csrc/kernels.hip +++ b/csrc/kernels.hip @@ -22,7 +22,7 @@ // source: https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda -// Luckily we have atomicmax and atomicmin in ROCm +// Luckily we have atomicmax and atomicmin in ROCm __device__ float dDequantizeFP4(unsigned char val, float absmax) { @@ -86,7 +86,7 @@ __device__ float dDequantizeFP4Tree(unsigned char val, float absmax) return 1.00000000f*absmax*sign; // 1011 else return 0.66666667f*absmax*sign; // 1010 - else + else if((val & 0b0001) == 1) // 100 return 5.208333333e-03f*absmax*sign; // 1001 else @@ -110,7 +110,7 @@ __device__ unsigned char dQuantizeFP4(float x) // we do a binary search // the pivots are divided by 12 (the FP4 absmax) - // since we assum input data is in 
[-1.0, 1.0] + // since we assume input data is in [-1.0, 1.0] // !be careful here, its easy to make a mistake // that is difficult to noice if you add an extra @@ -150,36 +150,36 @@ __device__ half dhDequantizeNF4(unsigned char val) if((val & 0b0100) == 4) // 1 if((val & 0b0010) == 2) // 11 if((val & 0b0001) == 1) // 111 - return 1.0f; + return 1.0f; else return 0.7229568362236023f; else if((val & 0b0001) == 1) // 110 - return 0.5626170039176941f; + return 0.5626170039176941f; else - return 0.44070982933044434f; + return 0.44070982933044434f; else if((val & 0b0010) == 2) //10 if((val & 0b0001) == 1) // 101 - return 0.33791524171829224f; + return 0.33791524171829224f; else - return 0.24611230194568634f; - else + return 0.24611230194568634f; + else if((val & 0b0001) == 1) // 100 - return 0.16093020141124725f; + return 0.16093020141124725f; else - return 0.07958029955625534f; + return 0.07958029955625534f; else if((val & 0b0100) == 4) // 0 if((val & 0b0010) == 2) //01 if((val & 0b0001) == 1) // 011 - return 0.0f; + return 0.0f; else - return -0.09105003625154495f; + return -0.09105003625154495f; else if((val & 0b0001) == 1) // 010 - return -0.18477343022823334f; + return -0.18477343022823334f; else return -0.28444138169288635f; else @@ -187,12 +187,12 @@ __device__ half dhDequantizeNF4(unsigned char val) if((val & 0b0001) == 1) // 001 return -0.39491748809814453f; else - return -0.5250730514526367f; - else + return -0.5250730514526367f; + else if((val & 0b0001) == 1) // 000 - return -0.6961928009986877f; + return -0.6961928009986877f; else - return -1.0f; + return -1.0f; } @@ -205,36 +205,36 @@ __device__ float dDequantizeNF4(unsigned char val) if((val & 0b0100) == 4) // 1 if((val & 0b0010) == 2) // 11 if((val & 0b0001) == 1) // 111 - return 1.0f; + return 1.0f; else return 0.7229568362236023f; else if((val & 0b0001) == 1) // 110 - return 0.5626170039176941f; + return 0.5626170039176941f; else - return 0.44070982933044434f; + return 0.44070982933044434f; else if((val & 0b0010) == 2) //10 if((val & 0b0001) == 1) // 101 - return 0.33791524171829224f; + return 0.33791524171829224f; else - return 0.24611230194568634f; - else + return 0.24611230194568634f; + else if((val & 0b0001) == 1) // 100 - return 0.16093020141124725f; + return 0.16093020141124725f; else - return 0.07958029955625534f; + return 0.07958029955625534f; else if((val & 0b0100) == 4) // 0 if((val & 0b0010) == 2) //01 if((val & 0b0001) == 1) // 011 - return 0.0f; + return 0.0f; else - return -0.09105003625154495f; + return -0.09105003625154495f; else if((val & 0b0001) == 1) // 010 - return -0.18477343022823334f; + return -0.18477343022823334f; else return -0.28444138169288635f; else @@ -242,12 +242,12 @@ __device__ float dDequantizeNF4(unsigned char val) if((val & 0b0001) == 1) // 001 return -0.39491748809814453f; else - return -0.5250730514526367f; - else + return -0.5250730514526367f; + else if((val & 0b0001) == 1) // 000 - return -0.6961928009986877f; + return -0.6961928009986877f; else - return -1.0f; + return -1.0f; } @@ -1841,7 +1841,7 @@ kOptimizerStatic8bit2StateBlockwise(T* p, T* __restrict__ const g, unsigned char //float ratio = (g_val*g_val)/fmaxf(s2_vals[j], eps*eps); //g_val = ratio > 2.0f ? 2.0f*g_val/ratio : g_val; g_val *= gnorm_scale; - + s2_vals[j] = (s2_vals[j]*beta2) + (((1.0f-beta2)*g_val*g_val)); s1_vals[j] = smem_quantiles1[lane_id][c1s[j]]*absmax1[i/BLOCK_SIZE]; @@ -2237,8 +2237,8 @@ template__global__ void kd // data is in 32 column-tile major with tile width 32 columns and numRows rows // L1. 
Load sub-tile row/col statistics. Each thread only holds 1 col, load rows into shared memory. - // L2. Load data in warp-striped arangement (t0 holds colidx [0, 0, 0, 0], rowidx [0, 1, 2, 3]) + // L2. Load data in warp-striped arrangement (t0 holds colidx [0, 0, 0, 0], rowidx [0, 1, 2, 3]) // C1. Compute val(row_stat*col_stat)/(127*127) (load 1/(127*127 into register)) // C2. Compute normalization values and store col values in register // S1. Store C1 into 16-bit output @@ -2367,7 +2367,7 @@ template __global__ void kd #pragma unroll ITEMS_PER_THREAD for(int j = 0; j < ITEMS_PER_THREAD; j++) local_output[j] = __float2half((local_values[j]*MM_DEQUANT_CONST*rowStat[j]*colStat[j]) + local_biasValue[j]); - + // each block processes SUBTILE_ROWS*32 elements #pragma unroll ITEMS_PER_THREAD for(int j = 0; j < ITEMS_PER_THREAD; j++) @@ -2390,14 +2390,14 @@ template __global__ void kd if(valid_items <= 0) // the sub-tile might have more elements than the tile itself break; - // L2. Load data in warp-striped arangement (t0 holds colidx [0, 0, 0, 0], rowidx [0, 1, 2, 3]) + // L2. Load data in warp-striped arrangement (t0 holds colidx [0, 0, 0, 0], rowidx [0, 1, 2, 3]) LoadInt32(loadint32).Load(&(A[subtile_idx]), local_values, valid_items, 0); ExchangeInt32(exchangeint32).BlockedToWarpStriped(local_values, local_values); #pragma unroll ITEMS_PER_THREAD for(int j = 0; j < ITEMS_PER_THREAD; j++) local_rowStats[j] = smem_rowStats[subtile_base_row+row_offset+j]; - + #pragma unroll ITEMS_PER_THREAD for(int j = 0; j < ITEMS_PER_THREAD; j++) local_output[j] = __float2half((local_values[j]*MM_DEQUANT_CONST*local_rowStats[j]*colStat) + local_biasValue); @@ -2657,7 +2657,7 @@ template __global__ void kExtractOutliers(char *A, int *idx, char * { //col-major offset int offset = local_colidx * rowsA + row; - + char val = A[offset]; int out_idx = (row*idx_size) + blockIdx.x; out[out_idx] = val; @@ -3087,11 +3087,11 @@ template __global__ void kExtractOutliers(char *A, int *idx, char * //// use k warps per thread block //// 1. threadblock use read-only cache to read in register tile for A into shared memory //// 2. each warp loops over shared memory tiles of A of size 8x16 and loads them into fragments -//// 3. each warp reads a segment of values 16x32 from B +//// 3. each warp reads a segment of values 16x32 from B //// 4. do dequantization from register of B into second pair of registers //// 5. store (4) into fragment //// 6. matmul aggregate into fragment C -//// 7. aggreecate files of C into shared memroy block C +//// 7. aggregate files of C into shared memory block C //// 8. sum (7) //// 9. 
write outputs to matmul output matrix //} @@ -3549,7 +3549,7 @@ template __global__ void kgemm_4bit_inference(int M, i template __global__ void kgemm_4bit_inference_naive(int M, int N, int K, T * __restrict__ const A, unsigned char *B, float *absmax, const float *datatype, T * out, int lda, int ldb, int ldc, int blocksize) { - // per threadblock: + // per threadblock: // load step-by-step in chunks of [32,warps]: 1x32 * [32,warps] -> [1,warps] // 4 warps -> 4 loads per iter // 1x32 * 32x4 -> 1x4 outputs per thread block @@ -3782,7 +3782,7 @@ template __global__ void kfunc(T *A, T *B, T value, long { switch(FUNC) { - case FILL: + case FILL: A[i] = (T)value; break; case ARANGE: diff --git a/csrc/pythonInterface.cpp b/csrc/pythonInterface.cpp index f03636e47..be6abc070 100644 --- a/csrc/pythonInterface.cpp +++ b/csrc/pythonInterface.cpp @@ -429,7 +429,7 @@ extern "C" { \ transform_##fbits##_##fsrc##_to_##ftrgt##_##ftranspose((cublasLtHandle_t) context->m_handle, A, out, dim1, dim2); \ } \ - + #endif #if BUILD_HIP @@ -572,7 +572,7 @@ extern "C" int hasPrefetch = 0; CUDA_CHECK_RETURN(hipDeviceGetAttribute(&hasPrefetch, hipDeviceAttributeConcurrentManagedAccess, device)); // 40ns overhead if (hasPrefetch == 0) return; - + CUDA_CHECK_RETURN(hipMemPrefetchAsync(ptr, bytes, device, 0)); CUDA_CHECK_RETURN(hipPeekAtLastError()); } diff --git a/install_cuda.py b/install_cuda.py index a5d09356d..cf7c8ee71 100644 --- a/install_cuda.py +++ b/install_cuda.py @@ -77,9 +77,7 @@ def main(): download_path = "/tmp" # default download path if len(sys.argv) < 2: - print( - "Usage: python install_cuda.py [user/system] [download_path]" - ) + print("Usage: python install_cuda.py [user/system] [download_path]") sys.exit(1) version = sys.argv[1] @@ -100,9 +98,7 @@ def main(): elif version in cuda_versions: install_cuda(version, base_path, download_path) else: - print( - f"Invalid CUDA version: {version}. Available versions are: {', '.join(cuda_versions.keys())}" - ) + print(f"Invalid CUDA version: {version}. 
Available versions are: {', '.join(cuda_versions.keys())}") sys.exit(1) diff --git a/tests/helpers.py b/tests/helpers.py index fc7ce1acb..e93c11b70 100644 --- a/tests/helpers.py +++ b/tests/helpers.py @@ -63,9 +63,9 @@ def id_formatter(label: str): def describe_dtype(dtype: torch.dtype) -> str: return DTYPE_NAMES.get(dtype) or str(dtype).rpartition(".")[2] + def get_blocksizes(hip_env: bool) -> List[int]: if not hip_env: return [4096, 2048, 1024, 512, 256, 128, 64] else: return [4096, 2048, 1024, 512, 256, 128] - diff --git a/tests/test_autograd.py b/tests/test_autograd.py index 8c9acb31d..9da665a2d 100644 --- a/tests/test_autograd.py +++ b/tests/test_autograd.py @@ -4,8 +4,6 @@ import torch import bitsandbytes as bnb -from bitsandbytes.cextension import HIP_ENVIRONMENT - from tests.helpers import ( BOOLEAN_TRIPLES, BOOLEAN_TUPLES, diff --git a/tests/test_cuda_setup_evaluator.py b/tests/test_cuda_setup_evaluator.py index e01a15b94..53dd25044 100644 --- a/tests/test_cuda_setup_evaluator.py +++ b/tests/test_cuda_setup_evaluator.py @@ -1,6 +1,6 @@ import pytest -from bitsandbytes.cextension import get_cuda_bnb_library_path, HIP_ENVIRONMENT +from bitsandbytes.cextension import HIP_ENVIRONMENT, get_cuda_bnb_library_path from bitsandbytes.cuda_specs import CUDASpecs diff --git a/tests/test_functional.py b/tests/test_functional.py index a729ecebe..0f817d1dc 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -12,14 +12,7 @@ import bitsandbytes as bnb from bitsandbytes import functional as F from bitsandbytes.cextension import HIP_ENVIRONMENT -from tests.helpers import ( - BOOLEAN_TUPLES, - TRUE_FALSE, - describe_dtype, - get_test_dims, - id_formatter, - get_blocksizes -) +from tests.helpers import BOOLEAN_TUPLES, TRUE_FALSE, describe_dtype, get_blocksizes, get_test_dims, id_formatter torch.set_printoptions(precision=5, sci_mode=False, linewidth=120, edgeitems=20, threshold=10000) k = 20 @@ -115,6 +108,7 @@ def test_estimate_quantiles(dtype): diff = torch.abs(code - quantiles) assert (diff > 5e-02).sum().item() == 0 + def test_quantile_quantization(): for i in range(100): A1 = torch.randn(1024, 1024, device="cuda") @@ -516,7 +510,9 @@ def test_vector_quant(dim1, dim2, dim3): @pytest.mark.parametrize("dim3", get_test_dims(2, 256, n=2), ids=id_formatter("dim3")) @pytest.mark.parametrize("dtype", [torch.int8, torch.int32], ids=describe_dtype) @pytest.mark.parametrize("orderA", ["row"], ids=id_formatter("orderA")) -@pytest.mark.parametrize("orderOut", ["col", "row"] if HIP_ENVIRONMENT else ["col", "row", "col32"], ids=id_formatter("orderOut")) +@pytest.mark.parametrize( + "orderOut", ["col", "row"] if HIP_ENVIRONMENT else ["col", "row", "col32"], ids=id_formatter("orderOut") +) @pytest.mark.parametrize("transpose", [False], ids=id_formatter("transpose")) @pytest.mark.parametrize("dims", [2, 3], ids=id_formatter("dims")) def test_nvidia_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose): @@ -2058,7 +2054,9 @@ def test_normal_map_tree(): # print(pivots) -@pytest.mark.skipif(HIP_ENVIRONMENT, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64") +@pytest.mark.skipif( + HIP_ENVIRONMENT, reason="gemv 4bit tests are partially enabled on MI300, others being fixed for warpsize 64" +) @pytest.mark.parametrize("double_quant", TRUE_FALSE, ids=lambda double_quant: f"DQ_{double_quant}") @pytest.mark.parametrize("storage_type", ["nf4", "fp4"]) @pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) diff --git 
a/tests/test_generation.py b/tests/test_generation.py index 20490ea33..8e689261b 100644 --- a/tests/test_generation.py +++ b/tests/test_generation.py @@ -4,10 +4,8 @@ import pytest import torch -from tests.helpers import TRUE_FALSE, describe_dtype, id_formatter - -import bitsandbytes as bnb from bitsandbytes.cextension import HIP_ENVIRONMENT +from tests.helpers import TRUE_FALSE, describe_dtype, id_formatter transformers = pytest.importorskip("transformers") diff --git a/tests/test_linear8bitlt.py b/tests/test_linear8bitlt.py index cef765dad..ca52f312e 100644 --- a/tests/test_linear8bitlt.py +++ b/tests/test_linear8bitlt.py @@ -8,8 +8,8 @@ import bitsandbytes as bnb from bitsandbytes import functional as F from bitsandbytes.autograd import get_inverse_transform_indices, undo_layout -from bitsandbytes.nn.modules import Linear8bitLt from bitsandbytes.cextension import HIP_ENVIRONMENT +from bitsandbytes.nn.modules import Linear8bitLt from tests.helpers import ( TRUE_FALSE, id_formatter, diff --git a/tests/test_optim.py b/tests/test_optim.py index 362f037f1..d8c46e415 100644 --- a/tests/test_optim.py +++ b/tests/test_optim.py @@ -10,7 +10,6 @@ import bitsandbytes as bnb import bitsandbytes.functional as F -from bitsandbytes.cextension import HIP_ENVIRONMENT from tests.helpers import describe_dtype, id_formatter # import apex diff --git a/tests/test_triton.py b/tests/test_triton.py index 8d9e15c4d..1c5422c0d 100644 --- a/tests/test_triton.py +++ b/tests/test_triton.py @@ -1,9 +1,8 @@ import pytest import torch -from bitsandbytes.nn import Linear8bitLt from bitsandbytes.cextension import HIP_ENVIRONMENT - +from bitsandbytes.nn import Linear8bitLt from bitsandbytes.nn.triton_based_modules import SwitchBackLinear from bitsandbytes.triton.triton_utils import is_triton_available from tests.helpers import TRUE_FALSE From 8c23dc0100e1d610cedb6ea13d6489d20690974f Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Tue, 9 Apr 2024 22:47:05 +0000 Subject: [PATCH 101/112] Fix typos --- bitsandbytes/cuda_setup/main.py | 2 +- csrc/kernels.hip | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py index b0a790e70..b2f9214a4 100644 --- a/bitsandbytes/cuda_setup/main.py +++ b/bitsandbytes/cuda_setup/main.py @@ -4,7 +4,7 @@ [ ] TODO: Q - What if we have multiple GPUs of different makes? - CUDA version - Software: - - CPU-only: only CPU quantization functions (no optimizer, no matrix multipl) + - CPU-only: only CPU quantization functions (no optimizer, no matrix multiplication) - CuBLAS-LT: full-build 8-bit optimizer - no CuBLAS-LT: no 8-bit matrix multiplication (`nomatmul`) diff --git a/csrc/kernels.hip b/csrc/kernels.hip index 6ff643a07..ca77dceda 100644 --- a/csrc/kernels.hip +++ b/csrc/kernels.hip @@ -113,7 +113,7 @@ __device__ unsigned char dQuantizeFP4(float x) // since we assume input data is in [-1.0, 1.0] // !be careful here, its easy to make a mistake - // that is difficult to noice if you add an extra + // that is difficult to notice if you add an extra // zero somewhere! int sign = x < 0 ? 
0b1000 : 0b0000; @@ -2118,7 +2118,7 @@ template Date: Tue, 9 Apr 2024 22:16:13 -0500 Subject: [PATCH 102/112] Fix formatting in README file --- README.md | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9a741d22f..415679df9 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,9 @@ The `bitsandbytes` library is a lightweight Python wrapper around CUDA custom fu The library includes quantization primitives for 8-bit & 4-bit operations, through `bitsandbytes.nn.Linear8bitLt` and `bitsandbytes.nn.Linear4bit` and 8-bit optimizers through `bitsandbytes.optim` module. **Installation for ROCm:** + To install latest bitsandbytes (supported on ROCm 6.2): +```bash git clone --recurse https://github.com/ROCm/bitsandbytes cd bitsandbytes git checkout rocm_enabled @@ -15,16 +17,20 @@ pip install -r requirements-dev.txt cmake -DCOMPUTE_BACKEND=hip -S . make pip install . +``` For ROCm specific versions: + Install Dependencies: -#hipblaslt installation needed only for rocm<6.0 +```bash +# hipblaslt installation needed only for rocm<6.0 apt install hipblaslt pip install --upgrade pip pip install einops lion_pytorch accelerate pip install git+https://github.com/ROCm/transformers.git - +``` Install Bitsandbytes: +```bash git clone --recurse https://github.com/ROCm/bitsandbytes cd bitsandbytes # Checkout branch as needed @@ -33,6 +39,7 @@ cd bitsandbytes git checkout make hip python setup.py install +``` **For more details, please head to the official documentation page:** From 79cb5548c7cbe8a129dea42e7e38feb1c1251979 Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Thu, 18 Apr 2024 23:11:04 +0000 Subject: [PATCH 103/112] Update gpu arch setting --- CMakeLists.txt | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4b776005f..6b9b2dbe6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -174,7 +174,16 @@ if(BUILD_CUDA) elseif(BUILD_HIP) enable_language(HIP) message(STATUS "HIP Compiler: ${CMAKE_HIP_COMPILER}") - message(STATUS "HIP Targets: ${AMDGPU_TARGETS}") + if(DEFINED BNB_ROCM_ARCH) + set(CMAKE_HIP_ARCHITECTURES ${BNB_ROCM_ARCH}) + else() + if (NOT AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES) + set(CMAKE_HIP_ARCHITECTURES "gfx908;gfx90a;gfx942") + elseif (AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES) + set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS}) + endif() + endif() + message(STATUS "HIP Targets: ${CMAKE_HIP_ARCHITECTURES}") list(APPEND SRC_FILES ${HIP_FILES}) From 5c0414e20545c3ae9162ab8428d10e290e2047f6 Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Thu, 18 Apr 2024 23:13:00 +0000 Subject: [PATCH 104/112] Add ROCM_PATH variable --- CMakeLists.txt | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6b9b2dbe6..113c3d037 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -250,7 +250,12 @@ if(BUILD_CUDA) ) endif() if(BUILD_HIP) - list(APPEND CMAKE_PREFIX_PATH /opt/rocm) + if(NOT DEFINED ENV{ROCM_PATH}) + set(ROCM_PATH /opt/rocm) + else() + set(ROCM_PATH $ENV{ROCM_PATH}) + endif() + list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH}) macro(find_package_and_print_version PACKAGE_NAME) find_package("${PACKAGE_NAME}" ${ARGN}) message("${PACKAGE_NAME} VERSION: ${${PACKAGE_NAME}_VERSION}") @@ -264,8 +269,8 @@ if(BUILD_HIP) set_target_properties(hip-lang::host PROPERTIES INTERFACE_LINK_LIBRARIES "") set(CMAKE_HIP_IMPLICIT_LINK_LIBRARIES "") - target_include_directories(bitsandbytes PRIVATE ${CMAKE_SOURCE_DIR} 
${CMAKE_SOURCE_DIR}/include /opt/rocm/include /include) - target_link_directories(bitsandbytes PRIVATE /opt/rocm/lib /lib) + target_include_directories(bitsandbytes PRIVATE ${CMAKE_SOURCE_DIR} ${CMAKE_SOURCE_DIR}/include ${ROCM_PATH}/include /include) + target_link_directories(bitsandbytes PRIVATE ${ROCM_PATH}/lib /lib) target_link_libraries(bitsandbytes PUBLIC roc::hipblas hip::hiprand roc::hipsparse) target_compile_definitions(bitsandbytes PUBLIC BNB_USE_HIP) From 47795f5586661bfe79558c975e163fc0a38e8b47 Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Thu, 18 Apr 2024 23:13:47 +0000 Subject: [PATCH 105/112] Add HIP_VERSION variable --- CMakeLists.txt | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 113c3d037..373db6550 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -188,7 +188,12 @@ elseif(BUILD_HIP) list(APPEND SRC_FILES ${HIP_FILES}) string(APPEND BNB_OUTPUT_NAME "_hip") - if(NO_CUBLASLT) + + # get hip version + execute_process(COMMAND hipconfig --version OUTPUT_VARIABLE HIP_CONFIG_VERSION) + string(REGEX MATCH "[0-9]+\\.[0-9]+" HIP_VERSION "${HIP_CONFIG_VERSION}") + + if(NO_CUBLASLT OR HIP_VERSION VERSION_LESS "6.1") string(APPEND BNB_OUTPUT_NAME "_nohipblaslt") endif() add_compile_definitions(__HIP_PLATFORM_AMD__) @@ -277,7 +282,7 @@ if(BUILD_HIP) set_source_files_properties(${HIP_FILES} PROPERTIES LANGUAGE HIP) set_target_properties(bitsandbytes PROPERTIES LINKER_LANGUAGE CXX) - if(NO_CUBLASLT) + if(NO_CUBLASLT OR HIP_VERSION VERSION_LESS "6.1") target_compile_definitions(bitsandbytes PUBLIC NO_HIPBLASLT) else() find_package(hipblaslt) From 6d9045241e61d2a8f29a5ad48325c1f25a347be9 Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Thu, 18 Apr 2024 23:14:42 +0000 Subject: [PATCH 106/112] Add BNB_HIP_VERSION variable --- bitsandbytes/cextension.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py index 157f3a65a..69cf0b15f 100644 --- a/bitsandbytes/cextension.py +++ b/bitsandbytes/cextension.py @@ -37,7 +37,10 @@ def get_cuda_bnb_library_path(cuda_specs: CUDASpecs) -> Path: The library is not guaranteed to exist at the returned path. 
""" if torch.version.hip: - return PACKAGE_DIR / f"libbitsandbytes_hip{DYNAMIC_LIBRARY_SUFFIX}" + if BNB_HIP_VERSION < 601: + return PACKAGE_DIR / f"libbitsandbytes_hip_nohipblaslt{DYNAMIC_LIBRARY_SUFFIX}" + else: + return PACKAGE_DIR / f"libbitsandbytes_hip{DYNAMIC_LIBRARY_SUFFIX}" library_name = f"libbitsandbytes_cuda{cuda_specs.cuda_version_string}" if not cuda_specs.has_cublaslt: # if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt @@ -111,8 +114,12 @@ def get_native_library() -> BNBNativeLibrary: try: + if torch.version.hip: + hip_major, hip_minor = map(int, torch.version.hip.split(".")[0:2]) + HIP_ENVIRONMENT, BNB_HIP_VERSION = True, hip_major * 100 + hip_minor + else: + HIP_ENVIRONMENT, BNB_HIP_VERSION = False, 0 lib = get_native_library() - HIP_ENVIRONMENT = True if torch.version.hip else False except Exception as e: lib = None logger.error(f"Could not load bitsandbytes native library: {e}", exc_info=True) From 049a2dc5147a2d7c179a5dc202546f518f5475a1 Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Thu, 18 Apr 2024 23:15:37 +0000 Subject: [PATCH 107/112] Update supports igemmlt based on HIP version --- bitsandbytes/autograd/_functions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py index 15574d702..3eafd502a 100644 --- a/bitsandbytes/autograd/_functions.py +++ b/bitsandbytes/autograd/_functions.py @@ -8,7 +8,7 @@ import torch import bitsandbytes.functional as F - +from bitsandbytes.cextension import BNB_HIP_VERSION # math.prod not compatible with python < 3.8 def prod(iterable): @@ -218,7 +218,7 @@ def backward(ctx, grad_output): def supports_igemmlt(device: torch.device) -> bool: """check if this device supports the optimized int8 kernel""" if torch.version.hip: - return True + return False if BNB_HIP_VERSION < 601 else True if torch.cuda.get_device_capability(device=device) < (7, 5): return False device_name = torch.cuda.get_device_name(device=device) From 47a0bc3b63d0f9dcbdb97696af317f335ca2b2d4 Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Thu, 18 Apr 2024 23:16:17 +0000 Subject: [PATCH 108/112] Skip failing tests based on HIP version --- tests/test_autograd.py | 2 ++ tests/test_functional.py | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/test_autograd.py b/tests/test_autograd.py index 9da665a2d..eafa01f0e 100644 --- a/tests/test_autograd.py +++ b/tests/test_autograd.py @@ -4,6 +4,7 @@ import torch import bitsandbytes as bnb +from bitsandbytes.cextension import BNB_HIP_VERSION from tests.helpers import ( BOOLEAN_TRIPLES, BOOLEAN_TUPLES, @@ -198,6 +199,7 @@ def test_matmul(dim1, dim2, dim3, dim4, funcs, dtype, req_grad: Tuple[bool, bool assert (idx == 0).sum().item() < n * 0.02 +@pytest.mark.skipif(0 < BNB_HIP_VERSION < 601, reason="this test is supported on ROCm from 6.1") @pytest.mark.parametrize("dim1", get_test_dims(16, 64, n=1), ids=id_formatter("dim1")) @pytest.mark.parametrize("dim2", [*get_test_dims(32, 96, n=1), 0], ids=id_formatter("dim2")) @pytest.mark.parametrize("dim3", get_test_dims(32, 96, n=1), ids=id_formatter("dim3")) diff --git a/tests/test_functional.py b/tests/test_functional.py index 0f817d1dc..13a43cb70 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -11,7 +11,7 @@ import bitsandbytes as bnb from bitsandbytes import functional as F -from bitsandbytes.cextension import HIP_ENVIRONMENT +from bitsandbytes.cextension import HIP_ENVIRONMENT, BNB_HIP_VERSION from tests.helpers import 
BOOLEAN_TUPLES, TRUE_FALSE, describe_dtype, get_blocksizes, get_test_dims, id_formatter torch.set_printoptions(precision=5, sci_mode=False, linewidth=120, edgeitems=20, threshold=10000) @@ -505,6 +505,7 @@ def test_vector_quant(dim1, dim2, dim3): assert_all_approx_close(A1, A, atol=0.01, rtol=0.1, count=int(n * 0.002)) +@pytest.mark.skipif(0 < BNB_HIP_VERSION < 601, reason="this test is supported on ROCm from 6.1") @pytest.mark.parametrize("dim1", get_test_dims(2, 256, n=2), ids=id_formatter("dim1")) @pytest.mark.parametrize("dim2", get_test_dims(2, 256, n=2), ids=id_formatter("dim2")) @pytest.mark.parametrize("dim3", get_test_dims(2, 256, n=2), ids=id_formatter("dim3")) @@ -1733,6 +1734,7 @@ def quant_zp(x): print(err1, err2, err3, err4, err5, err6) +@pytest.mark.skipif(0 < BNB_HIP_VERSION < 601, reason="this test is supported on ROCm from 6.1") def test_extract_outliers(): for i in range(k): shapeA = (4096, 4096 * 4) From 1b2a0951e227a349188c4dc74ebcf7029362bc35 Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Thu, 18 Apr 2024 23:18:46 +0000 Subject: [PATCH 109/112] pre-commit fixes --- bitsandbytes/autograd/_functions.py | 3 ++- tests/test_functional.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py index 3eafd502a..18ca66b17 100644 --- a/bitsandbytes/autograd/_functions.py +++ b/bitsandbytes/autograd/_functions.py @@ -7,8 +7,9 @@ import torch -import bitsandbytes.functional as F from bitsandbytes.cextension import BNB_HIP_VERSION +import bitsandbytes.functional as F + # math.prod not compatible with python < 3.8 def prod(iterable): diff --git a/tests/test_functional.py b/tests/test_functional.py index 13a43cb70..04a898d4b 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -11,7 +11,7 @@ import bitsandbytes as bnb from bitsandbytes import functional as F -from bitsandbytes.cextension import HIP_ENVIRONMENT, BNB_HIP_VERSION +from bitsandbytes.cextension import BNB_HIP_VERSION, HIP_ENVIRONMENT from tests.helpers import BOOLEAN_TUPLES, TRUE_FALSE, describe_dtype, get_blocksizes, get_test_dims, id_formatter torch.set_printoptions(precision=5, sci_mode=False, linewidth=120, edgeitems=20, threshold=10000) From 4515a2186a997edd8e4d91cc9b371e89322906bd Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Thu, 18 Apr 2024 23:54:57 +0000 Subject: [PATCH 110/112] Update README file --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 415679df9..9503ff1ff 100644 --- a/README.md +++ b/README.md @@ -8,13 +8,13 @@ The library includes quantization primitives for 8-bit & 4-bit operations, throu **Installation for ROCm:** -To install latest bitsandbytes (supported on ROCm 6.2): +To install develop version: ```bash git clone --recurse https://github.com/ROCm/bitsandbytes cd bitsandbytes git checkout rocm_enabled pip install -r requirements-dev.txt -cmake -DCOMPUTE_BACKEND=hip -S . +cmake -DCOMPUTE_BACKEND=hip -S . (Use -DBNB_ROCM_ARCH="gfx90a;gfx942" to target specific gpu arch) make pip install . 
``` From e7ef75fc8481ecb83f312a7c7a842b5d3c434000 Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Fri, 19 Apr 2024 14:27:20 +0000 Subject: [PATCH 111/112] Update default arch list --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 373db6550..3bedefd51 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -178,7 +178,7 @@ elseif(BUILD_HIP) set(CMAKE_HIP_ARCHITECTURES ${BNB_ROCM_ARCH}) else() if (NOT AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES) - set(CMAKE_HIP_ARCHITECTURES "gfx908;gfx90a;gfx942") + set(CMAKE_HIP_ARCHITECTURES "gfx908;gfx90a;gfx940;gfx941;gfx942") elseif (AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES) set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS}) endif() From c0d244c99bb169b19c313ede468234ef513776d7 Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Fri, 19 Apr 2024 16:08:50 +0000 Subject: [PATCH 112/112] update readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9503ff1ff..377ca2e86 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ git clone --recurse https://github.com/ROCm/bitsandbytes cd bitsandbytes # Checkout branch as needed # for rocm 5.7 - rocm5.7_internal_testing -# for rocm 6.2 - rocm6.2_internal_testing +# for rocm 6.x - rocm6.2_internal_testing git checkout make hip python setup.py install
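
Taken together, the CMake and `cextension.py` changes above gate every ROCm-specific code path on the detected HIP version: CMake selects a `_nohipblaslt` build for HIP older than 6.1, `cextension.py` exposes the version as `BNB_HIP_VERSION = major * 100 + minor`, and `supports_igemmlt` plus the test skips use the `0 < BNB_HIP_VERSION < 601` cut-off. The snippet below is a minimal standalone sketch of that gating, not the library's actual module; the `.so` suffix and the helper-function names are assumptions made for the example, while the version encoding, the library-name fallback, and the 6.1 threshold mirror the diffs above.

```python
# Standalone sketch (assumed names, not bitsandbytes' real module) of the
# HIP-version gating introduced in the patches above.
import torch

DYNAMIC_LIBRARY_SUFFIX = ".so"  # assumption: a Linux build


def hip_version_code() -> int:
    """Return major*100 + minor for a HIP build of PyTorch, else 0."""
    if torch.version.hip:
        major, minor = map(int, torch.version.hip.split(".")[:2])
        return major * 100 + minor
    return 0


def hip_library_name(version_code: int) -> str:
    # Below HIP 6.1 the hipBLASLt-enabled build is unavailable, so the
    # "_nohipblaslt" binary is chosen instead.
    if 0 < version_code < 601:
        return f"libbitsandbytes_hip_nohipblaslt{DYNAMIC_LIBRARY_SUFFIX}"
    return f"libbitsandbytes_hip{DYNAMIC_LIBRARY_SUFFIX}"


def hip_supports_igemmlt(version_code: int) -> bool:
    # The optimized int8 matmul (igemmlt) path is only enabled from ROCm 6.1.
    return not (0 < version_code < 601)


if __name__ == "__main__":
    code = hip_version_code()
    print(code, hip_library_name(code), hip_supports_igemmlt(code))
```

On a ROCm 6.1+ build this reports `libbitsandbytes_hip.so` with the int8 matmul path enabled; on ROCm 5.7 it falls back to the `_nohipblaslt` library and disables igemmlt, which is the same condition the `@pytest.mark.skipif(0 < BNB_HIP_VERSION < 601, ...)` markers use to skip the affected tests.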