Skip to content
Permalink

Comparing changes

This is a direct comparison between two commits made in this repository or its related repositories. View the default comparison for this range or learn more about diff comparisons.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also . Learn more about diff comparisons here.
base repository: bitsandbytes-foundation/bitsandbytes
Failed to load repositories. Confirm that selected base ref is valid, then try again.
Loading
base: b6c2712b9df04950ee289593d2339d5a0645d042
Choose a base ref
..
head repository: bitsandbytes-foundation/bitsandbytes
Failed to load repositories. Confirm that selected head ref is valid, then try again.
Loading
compare: fbc0385fef90f613650d1fbbc53e9963ee49f1bb
Choose a head ref
Showing with 866 additions and 751 deletions.
  1. +4 −4 .github/ISSUE_TEMPLATE/bug-report.yml
  2. +2 −2 .github/ISSUE_TEMPLATE/feature-request.yml
  3. +1 −1 .github/workflows/build_pr_documentation.yml
  4. +173 −0 .github/workflows/cmake.yml
  5. +1 −1 .github/workflows/stale.yml.disabled
  6. +16 −0 .github/workflows/upload_pr_documentation.yml
  7. +11 −0 .pre-commit-config.yaml
  8. +1 −1 .style.yapf
  9. +132 −0 CMakeLists.txt
  10. +2 −2 README.md
  11. +1 −1 benchmarking/switchback/README.md
  12. +4 −5 benchmarking/switchback/make_plot_with_jsonl.py
  13. +2 −2 benchmarking/switchback/speed_benchmark.py
  14. +1 −1 bitsandbytes/__init__.py
  15. +1 −1 bitsandbytes/cuda_setup/main.py
  16. +0 −1 bitsandbytes/optim/adamw.py
  17. +2 −2 bitsandbytes/research/autograd/_functions.py
  18. +1 −1 bitsandbytes/triton/dequantize_rowwise.py
  19. +1 −1 bitsandbytes/triton/int8_matmul_mixed_dequantize.py
  20. +1 −1 bitsandbytes/triton/int8_matmul_rowwise_dequantize.py
  21. +1 −2 bitsandbytes/triton/quantize_columnwise_and_transpose.py
  22. +8 −9 bitsandbytes/triton/quantize_global.py
  23. +1 −2 bitsandbytes/triton/quantize_rowwise.py
  24. +2 −3 compile_from_source.md
  25. +18 −1 csrc/cpu_ops.cpp
  26. +41 −41 csrc/kernels.cu
  27. +0 −1 csrc/ops.cuh
  28. +1 −1 csrc/pythonInterface.c
  29. +2 −2 docs/source/_toctree.yml
  30. +3 −3 docs/source/index.mdx
  31. +41 −1 docs/source/installation.mdx
  32. +2 −2 docs/source/quickstart.mdx
  33. +21 −0 environment-bnb.yml
  34. +1 −1 environment.yml
  35. +0 −3 examples/int8_inference_huggingface.py
  36. +1 −1 how_to_use_nonpytorch_cuda.md
  37. +10 −0 include/SIMD.h
  38. +4 −4 install_cuda.py
  39. +4 −1 pytest.ini
  40. +1 −1 scripts/stale.py
  41. +5 −2 setup.py
  42. +4 −0 tests/conftest.py
  43. +51 −0 tests/helpers.py
  44. +56 −157 tests/test_autograd.py
  45. +0 −8 tests/test_cuda_setup_evaluator.py
  46. +155 −372 tests/test_functional.py
  47. +9 −14 tests/test_generation.py
  48. +5 −5 tests/test_linear4bit.py
  49. +5 −3 tests/test_linear8bitlt.py
  50. +17 −21 tests/test_modules.py
  51. +38 −61 tests/test_optim.py
  52. +2 −2 tests/test_triton.py
8 changes: 4 additions & 4 deletions .github/ISSUE_TEMPLATE/bug-report.yml
Original file line number Diff line number Diff line change
@@ -18,15 +18,15 @@ body:
label: Reproduction
description: |
Please provide a code sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet.
Please provide the simplest reproducer as possible so that we can quickly fix the issue.
Please provide the simplest reproducer as possible so that we can quickly fix the issue.
placeholder: |
Reproducer:
Reproducer:
- type: textarea
id: expected-behavior
validations:
required: true
attributes:
label: Expected behavior
description: "A clear and concise description of what you would expect to happen."
description: "A clear and concise description of what you would expect to happen."
4 changes: 2 additions & 2 deletions .github/ISSUE_TEMPLATE/feature-request.yml
Original file line number Diff line number Diff line change
@@ -18,7 +18,7 @@ body:
attributes:
label: Motivation
description: |
Please outline the motivation for the proposal. Is your feature request related to a problem?
Please outline the motivation for the proposal. Is your feature request related to a problem?
- type: textarea
id: contribution
@@ -27,4 +27,4 @@ body:
attributes:
label: Your contribution
description: |
Is there any way that you could help, e.g. by submitting a PR?
Is there any way that you could help, e.g. by submitting a PR?
2 changes: 1 addition & 1 deletion .github/workflows/build_pr_documentation.yml
Original file line number Diff line number Diff line change
@@ -14,4 +14,4 @@ jobs:
commit_sha: ${{ github.event.pull_request.head.sha }}
pr_number: ${{ github.event.number }}
package: bitsandbytes
repo_owner: TimDettmers
repo_owner: TimDettmers
173 changes: 173 additions & 0 deletions .github/workflows/cmake.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
name: CMake on multiple platforms

on:
push:
branches: [ "main" ]
pull_request:
branches: [ "main" ]

jobs:
build:
runs-on: ${{ matrix.os }}

strategy:
# Set fail-fast to false to ensure that feedback is delivered for all matrix combinations. Consider changing this to true when your workflow is stable.
fail-fast: false

matrix:
os: [ubuntu-latest, windows-latest]
python-version: ['3.10', '3.11']
cuda-version: ['11.8', '12.1']
build_type: [Release]
c_compiler: [gcc, cl]
include:
- os: windows-latest
c_compiler: cl
cpp_compiler: cl
- os: ubuntu-latest
c_compiler: gcc
cpp_compiler: g++
exclude:
- os: ubuntu-latest
c_compiler: cl
- os: windows-latest
c_compiler: gcc

steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}

- name: Set up MSVC
if: matrix.os == 'windows-latest'
uses: ilammy/msvc-dev-cmd@v1.12.1
with:
arch: amd64

- name: Setup Mambaforge
uses: conda-incubator/setup-miniconda@v3.0.1
with:
miniforge-variant: Mambaforge
miniforge-version: latest
activate-environment: bnb-env
use-mamba: true

- uses: conda-incubator/setup-miniconda@v3.0.1
with:
auto-update-conda: true
activate-environment: bnb-env
environment-file: environment-bnb.yml
use-only-tar-bz2: false
auto-activate-base: true
python-version: ${{ matrix.python-version }}
mamba-version: "*"

- name: Set reusable strings
# Turn repeated input strings (such as the build output directory) into step outputs. These step outputs can be used throughout the workflow file.
id: strings
shell: bash
run: |
echo "build-output-dir=${{ github.workspace }}/build" >> "$GITHUB_OUTPUT"
- name: CUDA Toolkit
shell: bash -el {0}
run: |
if [ "${{ matrix.os }}" = "ubuntu-latest" ]; then
# to prepare space
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf /usr/local/share/boost
fi
addon=""
cuda_version=${{ matrix.cuda-version }}
[ "$cuda_version" = "12.1" ] && [ "${{ matrix.os }}" = "ubuntu-latest" ] && addon="cuda-cudart-static cuda-nvrtc"
[ "$cuda_version" = "12.1" ] && [ "${{ matrix.os }}" = "windows-latest" ] && addon="cuda-nvrtc"
[ "$cuda_version" = "11.8" ] && cuda_version="11.8.0"
[ "$cuda_version" = "12.1" ] && cuda_version="12.1.1"
conda install pytorch-cuda=${{ matrix.cuda-version }} -c pytorch # it's dependency not correctly resolved sometime
conda install cuda-python=${{ matrix.cuda-version }} cuda-libraries-dev cuda-nvcc cuda-nvtx cuda-cupti cuda-cudart cuda-cudart-dev cuda-runtime cuda-libraries $addon -c "nvidia/label/cuda-$cuda_version"
[ "${{ matrix.os }}" = "windows-latest" ] && conda install "clang>=17.0.6" "clangxx>=17.0.6" -c conda-forge
CUDA_HOME="${{ env.CONDA }}/envs/bnb-env"
echo CUDA_HOME=$CUDA_HOME >> "$GITHUB_ENV"
echo CUDA_PATH=$CUDA_HOME >> "$GITHUB_ENV"
if [ "${{ matrix.os }}" = "windows-latest" ]; then
# without -DCMAKE_CUDA_COMPILER=nvcc, cmake config always fail for cuda-11.8
echo DCMAKE_CUDA_COMPILER=-DCMAKE_CUDA_COMPILER=nvcc >> "$GITHUB_ENV"
fi
nvcc --version
- name: Update environment
run: mamba env update -n bnb-env -f environment-bnb.yml

- name: Prep build
run: python -m pip install cmake==3.27.9 ninja setuptools wheel

- name: Configure CMake
run: >
cmake -B ${{ steps.strings.outputs.build-output-dir }}
-G Ninja ${{ env.DCMAKE_CUDA_COMPILER }}
-DCMAKE_CXX_COMPILER=${{ matrix.cpp_compiler }}
-DCMAKE_C_COMPILER=${{ matrix.c_compiler }}
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-DCOMPUTE_CAPABILITY="50;52;60;61;62;70;72;75;80;86;87;89;90"
-S ${{ github.workspace }}
- name: Build
# Build your program with the given configuration. Note that --config is needed because the default Windows generator is a multi-config generator (Visual Studio generator).
run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }}

- name: Configure NOBLASLT
run: >
cmake -B ${{ steps.strings.outputs.build-output-dir }}
-G Ninja ${{ env.DCMAKE_CUDA_COMPILER }}
-DCMAKE_CXX_COMPILER=${{ matrix.cpp_compiler }}
-DCMAKE_C_COMPILER=${{ matrix.c_compiler }}
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-DCOMPUTE_CAPABILITY="50;52;60;61;62;70;72;75;80;86;87;89;90"
-DNO_CUBLASLT=ON
-S ${{ github.workspace }}
- name: Build NOBLASLT
run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }}

- name: Configure CPU
run: >
cmake -B ${{ steps.strings.outputs.build-output-dir }}
-G Ninja ${{ env.DCMAKE_CUDA_COMPILER }}
-DCMAKE_CXX_COMPILER=${{ matrix.cpp_compiler }}
-DCMAKE_C_COMPILER=${{ matrix.c_compiler }}
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-DNO_CUBLASLT=ON
-DBUILD_CUDA=OFF
-S ${{ github.workspace }}
- name: Build CPU
run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }}

- name: Test
working-directory: ${{ steps.strings.outputs.build-output-dir }}
# Execute tests defined by the CMake configuration. Note that --build-config is needed because the default Windows generator is a multi-config generator (Visual Studio generator).
# See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail
run: ctest --build-config ${{ matrix.build_type }}

- name: Build dist
shell: bash -el {0}
run: |
python -m pip install build
python -m build --wheel
mkdir dist/cu${{ matrix.cuda-version }}
mv dist/bitsandbytes*.* dist/cu${{ matrix.cuda-version }}/
- name: Upload Build Artifacts
uses: actions/upload-artifact@v4.3.0
with:
name: bitsandbytes-${{ matrix.os }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
path: |
${{ github.workspace }}/dist/
2 changes: 1 addition & 1 deletion .github/workflows/stale.yml.disabled
Original file line number Diff line number Diff line change
@@ -24,4 +24,4 @@ jobs:
pip install PyGithub
- name: Close stale issues
run: |
python scripts/stale.py
python scripts/stale.py
16 changes: 16 additions & 0 deletions .github/workflows/upload_pr_documentation.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
name: Upload PR Documentation

on:
workflow_run:
workflows: ["Build PR Documentation"]
types:
- completed

jobs:
build:
uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main
with:
package_name: bitsandbytes
secrets:
hf_token: ${{ secrets.HUGGINGFACE_PUSH }}
comment_bot_token: ${{ secrets.GITHUB_TOKEN }}
11 changes: 11 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -6,3 +6,14 @@ repos:
args:
- --fix
# - id: ruff-format # TODO: enable when the time is right
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.5.0
hooks:
- id: check-merge-conflict
- id: check-yaml
- id: end-of-file-fixer
- id: fix-byte-order-marker
- id: trailing-whitespace
- id: mixed-line-ending
args:
- --fix=lf
2 changes: 1 addition & 1 deletion .style.yapf
Original file line number Diff line number Diff line change
@@ -10,4 +10,4 @@ SPLIT_BEFORE_BITWISE_OPERATOR = True
SPLIT_BEFORE_FIRST_ARGUMENT = True
SPLIT_BEFORE_LOGICAL_OPERATOR = True
SPLIT_BEFORE_NAMED_ASSIGNS = True
SPLIT_COMPLEX_COMPREHENSION = True
SPLIT_COMPLEX_COMPREHENSION = True
132 changes: 132 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
# This CMake config hopefully makes it easier to compile.
# Ensure the CUDA Toolkit is available on your path. Then run:
# For GCC: `cmake -B build . && cmake --build build`
# For MSVC: `cmake -B build . && cmake --build build --config Release`
# You can also use the following options
# - BUILD_CUDA: Default ON, will build with CUDA
# - NO_CUBLASLT: Default OFF, will skip building/linking CUBLASLT support
# - CUDA_VERSION: The expected CUDA version, for sanity checking. The actual version
# is whatever CMake finds on your path.
# - COMPUTE_CAPABILITY: Which GPU Arch/Compute codes to provide to NVCC.
# Separate by semicolons, i.e. `-DCOMPUTE_CAPABILITY=89;90`
# Check your compute capability here: https://developer.nvidia.com/cuda-gpus
# - PTXAS_VERBOSE: Pass the `-v` option to the PTX Assembler
cmake_minimum_required(VERSION 3.18)

project(bitsandbytes LANGUAGES C CXX)

option(BUILD_CUDA "Build bitsandbytes with CUDA support" ON)
option(NO_CUBLASLT "Disable CUBLAS" OFF)
option(PTXAS_VERBOSE "Pass through -v flag to PTX Assembler" OFF)

set(CPP_FILES csrc/common.cpp csrc/cpu_ops.cpp csrc/pythonInterface.c)
list(APPEND CUDA_FILES csrc/ops.cu csrc/kernels.cu)
list(APPEND SRC_FILES ${CPP_FILES})

message(STATUS "BUILD_CUDA := ${BUILD_CUDA}")
message(STATUS "NO_CUBLASLT := ${NO_CUBLASLT}")

set(BNB_OUTPUT_NAME "bitsandbytes")

if(BUILD_CUDA)
enable_language(CUDA) # This will fail if CUDA is not found

# Convert the CUDA version from X.Y.z to XY. There's probably a shorter way of doing this
string(REGEX MATCH "^[0-9]+.[0-9]+" _CUDA_VERSION_FIRST_TWO "${CMAKE_CUDA_COMPILER_VERSION}")
string(REPLACE "." "" CUDA_VERSION_SHORT "${_CUDA_VERSION_FIRST_TWO}")

# Expose a cache variable that the user can set to ensure the correct version of CUDA is found
set(CUDA_VERSION "${CUDA_VERSION_SHORT}" CACHE STRING "Expected CUDA Version Shortcode")

message(STATUS "CUDA Version: ${CUDA_VERSION_SHORT} (${CMAKE_CUDA_COMPILER_VERSION})")
message(STATUS "CUDA Compiler: ${CMAKE_CUDA_COMPILER}")

# It should match the discovered version
if(NOT CUDA_VERSION STREQUAL "${CUDA_VERSION_SHORT}")
message(FATAL_ERROR "You've specified CUDA version ${CUDA_VERSION} however the CUDA compiler found is ${CUDA_VERSION_SHORT}."
" Ensure the desired CUDA compiler is the first one available on your PATH."
)
endif()

if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "11.0")
message(FATAL_ERROR "CUDA Version < 11 is not supported")
elseif(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0")
message(FATAL_ERROR "CUDA Version > 12 is not supported")
endif()

string(APPEND CMAKE_CUDA_FLAGS " --use_fast_math")
if(PTXAS_VERBOSE)
# Verbose? Outputs register usage information, and other things...
string(APPEND CMAKE_CUDA_FLAGS " -Xptxas=-v")
endif()

foreach(capability ${CMAKE_CUDA_ARCHITECTURES_ALL})
# Most of the items here are like: `xx-real`, so we just extract the `xx` portion
string(REGEX MATCH "[0-9]+" capability_id "${capability}")
if(capability_id GREATER 0)
list(APPEND POSSIBLE_CAPABILITIES ${capability_id})
endif()
endforeach()

# This can be changed via -D argument to CMake
# By default all possible capabilities are compiled
set(COMPUTE_CAPABILITY "${POSSIBLE_CAPABILITIES}" CACHE STRING "Compute Capabilities Targeted")

message(STATUS "CUDA Capabilities Available: ${POSSIBLE_CAPABILITIES}")
message(STATUS "CUDA Capabilities Selected: ${COMPUTE_CAPABILITY}")

foreach(capability ${COMPUTE_CAPABILITY})
string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_${capability},code=sm_${capability}")
endforeach()

message(STATUS "CUDA NVCC Flags: ${CMAKE_CUDA_FLAGS}")

list(APPEND SRC_FILES ${CUDA_FILES})

string(APPEND BNB_OUTPUT_NAME "_cuda${CUDA_VERSION_SHORT}")
if(NO_CUBLASLT)
string(APPEND BNB_OUTPUT_NAME "_nocublaslt")
endif()
else()
message(STATUS "Building CPU Only")
string(APPEND BNB_OUTPUT_NAME "_cpu")
if(NO_CUBLASLT)
message(WARNING "We're building in CPU only mode but NO_CUBLASLT is enabled. It will have no effect.")
endif()
endif()

set_source_files_properties(${CPP_FILES} PROPERTIES LANGUAGE CXX)
add_library(bitsandbytes SHARED ${SRC_FILES})
include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
target_include_directories(bitsandbytes PUBLIC csrc include)
target_compile_features(bitsandbytes PUBLIC cxx_std_14)


if(BUILD_CUDA)
target_compile_definitions(bitsandbytes PUBLIC BUILD_CUDA)
target_link_libraries(bitsandbytes PUBLIC cudart cublas cusparse)
if(NO_CUBLASLT)
target_compile_definitions(bitsandbytes PUBLIC NO_CUBLASLT)
else()
target_link_libraries(bitsandbytes PUBLIC cublasLt)
endif()

set_target_properties(bitsandbytes
PROPERTIES
CUDA_SEPARABLE_COMPILATION ON
)
endif()

if(WIN32)
set_target_properties(bitsandbytes PROPERTIES PREFIX "lib")
endif()

set_target_properties(bitsandbytes
PROPERTIES
OUTPUT_NAME ${BNB_OUTPUT_NAME}
# We have to use a generator expression to prevent MSVC Debug/Release subdirs being made
RUNTIME_OUTPUT_DIRECTORY "$<1:${CMAKE_SOURCE_DIR}/bitsandbytes>"
LIBRARY_OUTPUT_DIRECTORY "$<1:${CMAKE_SOURCE_DIR}/bitsandbytes>"
POSITION_INDEPENDENT_CODE ON # The `-fPIC` commands for non-windows compilers
WINDOWS_EXPORT_ALL_SYMBOLS ON # On Windows, export all c methods as DLL exports
)
Loading