Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Enable crate-ci/typos lint; fix typos
Browse files Browse the repository at this point in the history
Co-authored-by: Titus von Koeller <titus@vonkoeller.com>

fix erroneous correction
akx committed Feb 5, 2024
1 parent 259ad44 commit b7360d9
Showing 14 changed files with 68 additions and 46 deletions.
4 changes: 4 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -17,3 +17,7 @@ repos:
- id: mixed-line-ending
args:
- --fix=lf
- repo: https://github.com/crate-ci/typos
rev: v1.17.2
hooks:
- id: typos
11 changes: 11 additions & 0 deletions _typos.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[files]

[default.extend-identifiers]

[type.py.extend-words]
"BA" = "BA" # used as a commented-out variable in tests

[type.cuda.extend-words]
"subtile" = "subtile"
"subtiles" = "subtiles"
"transation" = "transation" # TODO: is this transition, transaction, translation..?
4 changes: 2 additions & 2 deletions benchmarking/switchback/make_plot_with_jsonl.py
Original file line number Diff line number Diff line change
@@ -36,8 +36,8 @@

('x_quantize_rowwise', 'P', '--', 'C4', 'Quantize rowwise X (switchback)'),
('g_quantize_rowwise', 'P', '-.', 'C4', 'Quantize rowwise G (switchback)'),
('w_quantize_global', '.', '--', 'C4', 'Quatnize global W (switchback)'),
('w_quantize_global_transpose', '.', '-.', 'C4', 'Quantize gloabl and\ntranspose W (switchback)'),
('w_quantize_global', '.', '--', 'C4', 'Quantize global W (switchback)'),
('w_quantize_global_transpose', '.', '-.', 'C4', 'Quantize global and\ntranspose W (switchback)'),
]:
xs = []
ys = []
4 changes: 2 additions & 2 deletions bitsandbytes/cuda_setup/main.py
Original file line number Diff line number Diff line change
@@ -4,7 +4,7 @@
[ ] TODO: Q - What if we have multiple GPUs of different makes?
- CUDA version
- Software:
- CPU-only: only CPU quantization functions (no optimizer, no matrix multipl)
- CPU-only: only CPU quantization functions (no optimizer, no matrix multiply)
- CuBLAS-LT: full-build 8-bit optimizer
- no CuBLAS-LT: no 8-bit matrix multiplication (`nomatmul`)
@@ -263,7 +263,7 @@ def warn_in_case_of_duplicates(results_paths: Set[Path]) -> None:
warning_msg = (
f"Found duplicate {CUDA_RUNTIME_LIBS} files: {results_paths}.. "
"We select the PyTorch default libcudart.so, which is {torch.version.cuda},"
"but this might missmatch with the CUDA version that is needed for bitsandbytes."
"but this might mismatch with the CUDA version that is needed for bitsandbytes."
"To override this behavior set the BNB_CUDA_VERSION=<version string, e.g. 122> environmental variable"
"For example, if you want to use the CUDA version 122"
"BNB_CUDA_VERSION=122 python ..."
6 changes: 3 additions & 3 deletions bitsandbytes/functional.py
Original file line number Diff line number Diff line change
@@ -120,7 +120,7 @@ def get_instance(cls):
return cls._instance

def prefetch_all(self, to_cpu=False):
# assume the first added, will be hte
# assume the first added, will be the
# ones that are used first, so swap them in last
# in the case they are evicted again
for t in self.paged_tensors[::-1]:
@@ -219,7 +219,7 @@ def elementwise_func(func_name, A, B, value, prefetch=True):
# paged function are fully asynchronous
# if we return from this function, we want the tensor
# to be in the correct state, that is the final state after the
# operation occured. So we synchronize.
# operation occurred. So we synchronize.
torch.cuda.synchronize()

def fill(A, value, device=None, prefetch=True): elementwise_func('fill', A, None, value)
@@ -589,7 +589,7 @@ def estimate_quantiles(A: Tensor, out: Optional[torch.Tensor] = None, offset: fl


class QuantState:
"""container for quantization state components to work with Params4bit and similar clases"""
"""container for quantization state components to work with Params4bit and similar classes"""
valid_quant_types = ('fp4', 'nf4')
valid_qs_type_keys = [f"bitsandbytes__{x}" for x in valid_quant_types]
valid_qs_keys = ['absmax', 'quant_map', 'nested_absmax', 'nested_quant_map', 'quant_state', 'quant_type',
18 changes: 9 additions & 9 deletions csrc/kernels.cu
Original file line number Diff line number Diff line change
@@ -134,10 +134,10 @@ __device__ unsigned char dQuantizeFP4(float x)

// we do a binary search
// the pivots are divided by 12 (the FP4 absmax)
// since we assum input data is in [-1.0, 1.0]
// since we assume input data is in [-1.0, 1.0]

// !be careful here, its easy to make a mistake
// that is difficult to noice if you add an extra
// that is difficult to notice if you add an extra
// zero somewhere!

int sign = x < 0 ? 0b1000 : 0b0000;
@@ -2259,8 +2259,8 @@ template<typename T, int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_
}

// 4. store data via atomicMax
// to store col data efficienctly we need to rewrite the smem blocked data [0, 1, 2, 3...] for t0
// into a striped arangement: [0, 8, 16, 24, ..] for t0
// to store col data efficiently we need to rewrite the smem blocked data [0, 1, 2, 3...] for t0
// into a striped arrangement: [0, 8, 16, 24, ..] for t0
__syncthreads();
BlockExchange(temp_storage.exchange).BlockedToStriped(local_col_absmax_values);

@@ -2310,7 +2310,7 @@ template <int ITEMS_PER_THREAD, int SUBTILE_ROWS, int THREADS>__global__ void kd

// data is in 32 column-tile major with tile width 32 columns and numRows rows
// L1. Load sub-tile row/col statistics. Each thread only holds 1 col, load rows into shared memory.
// L2. Load data in warp-striped arangement (t0 holds colidx [0, 0, 0, 0], rowidx [0, 1, 2, 3])
// L2. Load data in warp-striped arrangement (t0 holds colidx [0, 0, 0, 0], rowidx [0, 1, 2, 3])
// C1. Compute val(row_stat*col_stat)/(127*127) (load 1/(127*127 into register))
// C2. Compute normalization values and store col values in register
// S1. Store C1 into 16-bit output
@@ -2383,7 +2383,7 @@ template <int ITEMS_PER_THREAD, int SUBTILE_ROWS, int THREADS>__global__ void kd
if(valid_items <= 0) // the sub-tile might have more elements than the tile itself
break;

// L2. Load data in warp-striped arangement (t0 holds colidx [0, 0, 0, 0], rowidx [0, 1, 2, 3])
// L2. Load data in warp-striped arrangement (t0 holds colidx [0, 0, 0, 0], rowidx [0, 1, 2, 3])
LoadInt32(loadint32).Load(&(A[subtile_idx]), local_values, valid_items, 0);
ExchangeInt32(exchangeint32).BlockedToWarpStriped(local_values, local_values);

@@ -2650,7 +2650,7 @@ template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int T
// row1 [col0 col1 ... col31]
// ...
//
// As such we read consequtive entries with 256 threads (8rows x 32 columns)
// As such we read consecutive entries with 256 threads (8rows x 32 columns)
// as j increase, the row increase by a factor of 8
// We load 8 rows per subrow loop, and subrow increase by 8 per loop
// so we have an offset of 8 rows every loop or (subrow/warps)*8 = (subrow/8)*8
@@ -2747,7 +2747,7 @@ template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int T
// each of these has 32 values in total for 32*4 = 128 as offset if odd
// every set of 4 columns increases the total offset by 16
// each even row increase the offset by 4, for example row 2 is offset by 4, 4 by 6 etc so: subrow/2*4 = subrow*2
// this happends every 8 rows anew (subrow % 8)
// this happens every 8 rows anew (subrow % 8)
// one writes 4 columns at once that is (col % 4) for the particular index in the subtile
int subcol = warp_lane;

@@ -3073,7 +3073,7 @@ template <int FORMAT> __global__ void kExtractOutliers(char *A, int *idx, char *
//// 4. do dequantization from register of B into second pair of registers
//// 5. store (4) into fragment
//// 6. matmul aggregate into fragment C
//// 7. aggreecate files of C into shared memroy block C
//// 7. aggregate files of C into shared memory block C
//// 8. sum (7)
//// 9. write outputs to matmul output matrix
//}
44 changes: 22 additions & 22 deletions deploy.sh
Original file line number Diff line number Diff line change
@@ -5,7 +5,7 @@ echo "MAKE SURE LD_LIBRARY_PATH IS EMPTY!"
echo $LD_LIBRARY_PATH

if [[ ! -z "${LD_LIBRARY_PATH}" ]]; then
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

@@ -24,7 +24,7 @@ make cpuonly CUDA_VERSION="CPU"

if [ ! -f "./bitsandbytes/libbitsandbytes_cpu.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

@@ -34,7 +34,7 @@ make cuda110 CUDA_VERSION=110

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda110.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

@@ -44,7 +44,7 @@ make cuda11x CUDA_VERSION=111

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda111.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

@@ -54,7 +54,7 @@ make cuda11x CUDA_VERSION=114

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda114.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

@@ -64,7 +64,7 @@ make cuda11x CUDA_VERSION=115

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda115.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

@@ -74,7 +74,7 @@ make cuda11x CUDA_VERSION=117

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda117.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

@@ -84,7 +84,7 @@ make cuda118 CUDA_VERSION=118

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda118.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

@@ -94,7 +94,7 @@ make cuda12x CUDA_VERSION=120

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda120.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

@@ -104,7 +104,7 @@ make cuda12x CUDA_VERSION=121

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda121.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

@@ -114,7 +114,7 @@ make cuda12x CUDA_VERSION=122

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda122.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

@@ -124,7 +124,7 @@ make cuda12x CUDA_VERSION=123

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda123.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

@@ -138,7 +138,7 @@ make cuda110_nomatmul CUDA_VERSION=110

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda110_nocublaslt.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

@@ -149,7 +149,7 @@ make cuda11x_nomatmul CUDA_VERSION=111

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda111_nocublaslt.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

@@ -159,7 +159,7 @@ make cuda11x_nomatmul CUDA_VERSION=114

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda114_nocublaslt.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

@@ -169,7 +169,7 @@ make cuda11x_nomatmul CUDA_VERSION=115

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda115_nocublaslt.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

@@ -179,7 +179,7 @@ make cuda11x_nomatmul CUDA_VERSION=117

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda117_nocublaslt.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

@@ -189,7 +189,7 @@ make cuda118_nomatmul CUDA_VERSION=118

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda118_nocublaslt.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

@@ -199,7 +199,7 @@ make cuda12x_nomatmul CUDA_VERSION=120

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda120_nocublaslt.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

@@ -209,7 +209,7 @@ make cuda12x_nomatmul CUDA_VERSION=121

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda121_nocublaslt.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

@@ -219,7 +219,7 @@ make cuda12x_nomatmul CUDA_VERSION=122

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda122_nocublaslt.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

@@ -229,7 +229,7 @@ make cuda12x_nomatmul CUDA_VERSION=123

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda123_nocublaslt.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

2 changes: 1 addition & 1 deletion docs/source/contributing.mdx
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Contributors guidelines
... stil under construction ... (feel free to propose materials, `bitsandbytes` is a community project)
... still under construction ... (feel free to propose materials, `bitsandbytes` is a community project)

## Setup pre-commit hooks
- Install pre-commit hooks with `pip install pre-commit`.
2 changes: 1 addition & 1 deletion docs/source/integrations.mdx
Original file line number Diff line number Diff line change
@@ -29,7 +29,7 @@ Please review the [bitsandbytes section in the Accelerate docs](https://huggingf

# Trainer for the optimizers

You can use any of the 8-bit and/or paged optimizers by simple passing them to the `transformers.Trainer` class on intialization.All bnb optimizers are supported by passing the correct string in `TrainingArguments`'s `optim` attribute - e.g. (`paged_adamw_32bit`).
You can use any of the 8-bit and/or paged optimizers by simply passing them to the `transformers.Trainer` class on initialization. All bnb optimizers are supported by passing the correct string in `TrainingArguments`'s `optim` attribute - e.g. (`paged_adamw_32bit`).

See the [official API docs for reference](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Trainer).

4 changes: 2 additions & 2 deletions docs/source/optimizers.mdx
Original file line number Diff line number Diff line change
@@ -168,9 +168,9 @@ Possible options for the config override are: `betas, eps, weight_decay, lr, opt
For overrides for particular layers, we recommend overriding locally in each module. You can do this by passing the module, the parameter, and its attribute name to the GlobalOptimManager:
```py
class MyModule(torch.nn.Module):
def __init__(din, dout):
def __init__(d_in, d_out):
super(MyModule, self).__init__()
self.linear = torch.nn.Linear(din, dout)
self.linear = torch.nn.Linear(d_in, d_out)
# optimization will happen in 32-bit and
# learning rate will be set to 0.0001 independent of the main learning rate
config = {'optim_bits': 32, 'lr' : 0.0001}
2 changes: 1 addition & 1 deletion include/Algo-Direct2.h
Original file line number Diff line number Diff line change
@@ -157,7 +157,7 @@ struct AlgoVecBase<I, T, A, typename std::enable_if<DirectAux::IsDirect2<A>::val
FVec<AVX, float> vxp = _mm256_i32gather_ps(xi, idxp, sizeof(float));
IVec<AVX, float> ip = idxm;

#else // do not use gather instrucions
#else // do not use gather instructions

union U {
__m256i vec;
2 changes: 1 addition & 1 deletion include/Portable.h
Original file line number Diff line number Diff line change
@@ -147,5 +147,5 @@ inline T prev(T x)
return x;
}

} // namepsace Details
} // namespace Details
} // namespace BinSearch
2 changes: 1 addition & 1 deletion include/SIMD.h
Original file line number Diff line number Diff line change
@@ -568,5 +568,5 @@ FORCE_INLINE FVec<AVX, double> mulSub(const FVec<AVX, double>& a, const FVec<AVX

#endif

} // namepsace Details
} // namespace Details
} // namespace BinSearch
Loading

0 comments on commit b7360d9

Please sign in to comment.