From a5ec850c9e36a58bf5e611a05a60108bfe909260 Mon Sep 17 00:00:00 2001
From: Aarni Koskela <akx@iki.fi>
Date: Thu, 1 Feb 2024 09:50:43 +0200
Subject: [PATCH] Enable crate-ci/typos lint; fix typos

---
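To run the new lint locally (assuming pre-commit is installed), the hook can be invoked over the whole tree:

    pre-commit run typos --all-files

The _typos.toml entries below whitelist domain terms and identifiers (e.g. "subtile") that the checker would otherwise flag.
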
 .pre-commit-config.yaml                       |  4 ++
 _typos.toml                                   | 11 +++++
 .../switchback/make_plot_with_jsonl.py        |  4 +-
 bitsandbytes/cuda_setup/main.py               |  4 +-
 bitsandbytes/functional.py                    |  6 +--
 csrc/kernels.cu                               | 18 ++++----
 deploy.sh                                     | 44 +++++++++----------
 howto_config_override.md                      |  4 +-
 include/Algo-Direct2.h                        |  2 +-
 include/Portable.h                            |  2 +-
 include/SIMD.h                                |  2 +-
 tests/test_modules.py                         |  9 +++-
 12 files changed, 66 insertions(+), 44 deletions(-)
 create mode 100644 _typos.toml

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d568a849f..d65f892e8 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -6,3 +6,7 @@ repos:
         args:
           - --fix
       # - id: ruff-format  # TODO: enable when the time is right
+  - repo: https://github.com/crate-ci/typos
+    rev: v1.17.2
+    hooks:
+      - id: typos
diff --git a/_typos.toml b/_typos.toml
new file mode 100644
index 000000000..a04206b8d
--- /dev/null
+++ b/_typos.toml
@@ -0,0 +1,11 @@
+[files]
+
+[default.extend-identifiers]
+
+[type.py.extend-words]
+"BA" = "BA"  # used as a commented-out variable in tests
+
+[type.cuda.extend-words]
+"subtile" = "subtile"
+"subtiles" = "subtiles"
+"transation" = "transation"  # TODO: is this transition, transaction, translation..?
diff --git a/benchmarking/switchback/make_plot_with_jsonl.py b/benchmarking/switchback/make_plot_with_jsonl.py
index 3ef87d6b2..335a626c5 100644
--- a/benchmarking/switchback/make_plot_with_jsonl.py
+++ b/benchmarking/switchback/make_plot_with_jsonl.py
@@ -36,8 +36,8 @@
         
         ('x_quantize_rowwise', 'P', '--', 'C4', 'Quantize rowwise X (switchback)'),
         ('g_quantize_rowwise', 'P', '-.', 'C4', 'Quantize rowwise G (switchback)'),
-        ('w_quantize_global', '.', '--', 'C4', 'Quatnize global W (switchback)'),
-        ('w_quantize_global_transpose', '.', '-.', 'C4', 'Quantize gloabl and\ntranspose W (switchback)'),
+        ('w_quantize_global', '.', '--', 'C4', 'Quantize global W (switchback)'),
+        ('w_quantize_global_transpose', '.', '-.', 'C4', 'Quantize global and\ntranspose W (switchback)'),
     ]:
         xs = []
         ys = []
diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py
index a34385b1f..3c111b725 100644
--- a/bitsandbytes/cuda_setup/main.py
+++ b/bitsandbytes/cuda_setup/main.py
@@ -4,7 +4,7 @@
     [ ] TODO: Q - What if we have multiple GPUs of different makes?
 - CUDA version
 - Software:
-    - CPU-only: only CPU quantization functions (no optimizer, no matrix multipl)
+    - CPU-only: only CPU quantization functions (no optimizer, no matrix multiply)
     - CuBLAS-LT: full-build 8-bit optimizer
     - no CuBLAS-LT: no 8-bit matrix multiplication (`nomatmul`)
 
@@ -258,7 +258,7 @@ def warn_in_case_of_duplicates(results_paths: Set[Path]) -> None:
         warning_msg = (
             f"Found duplicate {CUDA_RUNTIME_LIBS} files: {results_paths}.. "
             "We select the PyTorch default libcudart.so, which is {torch.version.cuda},"
-            "but this might missmatch with the CUDA version that is needed for bitsandbytes."
+            "but this might mismatch with the CUDA version that is needed for bitsandbytes."
             "To override this behavior set the BNB_CUDA_VERSION=<version string, e.g. 122> environmental variable"
             "For example, if you want to use the CUDA version 122"
             "BNB_CUDA_VERSION=122 python ..."
diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 11db74859..9fc5e08f0 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -120,7 +120,7 @@ def get_instance(cls):
         return cls._instance
 
     def prefetch_all(self, to_cpu=False):
-        # assume the first added, will be hte
+        # assume the first added will be the
         # ones that are used first, so swap them in last
         # in the case they are evicted again
         for t in self.paged_tensors[::-1]:
@@ -219,7 +219,7 @@ def elementwise_func(func_name, A, B, value, prefetch=True):
         # paged function are fully asynchronous
         # if we return from this function, we want to the tensor
         # to be in the correct state, that is the final state after the
-        # operation occured. So we synchronize.
+        # operation occurred. So we synchronize.
         torch.cuda.synchronize()
 
 def fill(A, value, device=None, prefetch=True): elementwise_func('fill', A, None, value)
@@ -589,7 +589,7 @@ def estimate_quantiles(A: Tensor, out: Optional[torch.Tensor] = None, offset: fl
 
 
 class QuantState:
-    """container for quantization state components to work with Params4bit and similar clases"""
+    """container for quantization state components to work with Params4bit and similar classes"""
     valid_quant_types = ('fp4', 'nf4')
     valid_qs_type_keys = [f"bitsandbytes__{x}" for x in valid_quant_types]
     valid_qs_keys = ['absmax', 'quant_map', 'nested_absmax', 'nested_quant_map', 'quant_state', 'quant_type',
diff --git a/csrc/kernels.cu b/csrc/kernels.cu
index 0fff83665..625476be7 100644
--- a/csrc/kernels.cu
+++ b/csrc/kernels.cu
@@ -134,10 +134,10 @@ __device__ unsigned char dQuantizeFP4(float x)
 
   // we do a binary search
   // the pivots are divided by 12 (the FP4 absmax)
-  // since we assum input data is in [-1.0, 1.0]
+  // since we assume input data is in [-1.0, 1.0]
 
   // !be careful here, its easy to make a mistake
-  // that is difficult to noice if you add an extra
+  // that is difficult to notice if you add an extra
   // zero somewhere!
 
   int sign = x < 0 ? 0b1000 : 0b0000;
@@ -2259,8 +2259,8 @@ template<typename T, int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_
   }
 
   // 4. store data via atomicMax
-  // to store col data efficienctly we need to rewrite the smem blocked data [0, 1, 2, 3...] for t0
-  // into a striped arangement: [0, 8, 16, 24, ..] for t0
+  // to store col data efficiently we need to rewrite the smem blocked data [0, 1, 2, 3...] for t0
+  // into a striped arrangement: [0, 8, 16, 24, ..] for t0
   __syncthreads();
   BlockExchange(temp_storage.exchange).BlockedToStriped(local_col_absmax_values);
 
@@ -2310,7 +2310,7 @@ template <int ITEMS_PER_THREAD, int SUBTILE_ROWS, int THREADS>__global__ void kd
 
   // data is in 32 column-tile major with tile width 32 columns and numRows rows
   // L1. Load sub-tile row/col statistics. Each thread only holds 1 col, load rows into shared memory.
-  // L2. Load data in warp-striped arangement (t0 holds colidx [0, 0, 0, 0], rowidx [0, 1, 2, 3])
+  // L2. Load data in warp-striped arrangement (t0 holds colidx [0, 0, 0, 0], rowidx [0, 1, 2, 3])
   // C1. Compute val(row_stat*col_stat)/(127*127) (load 1/(127*127 into register))
   // C2. Compute normalization values and store col values in register
   // S1. Store C1 into 16-bit output
@@ -2383,7 +2383,7 @@ template <int ITEMS_PER_THREAD, int SUBTILE_ROWS, int THREADS>__global__ void kd
     if(valid_items <= 0) // the sub-tile might have more elements than the tile itself
       break;
 
-    // L2. Load data in warp-striped arangement (t0 holds colidx [0, 0, 0, 0], rowidx [0, 1, 2, 3])
+    // L2. Load data in warp-striped arrangement (t0 holds colidx [0, 0, 0, 0], rowidx [0, 1, 2, 3])
     LoadInt32(loadint32).Load(&(A[subtile_idx]), local_values, valid_items, 0);
     ExchangeInt32(exchangeint32).BlockedToWarpStriped(local_values, local_values);
 
@@ -2650,7 +2650,7 @@ template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int T
                   // row1 [col0 col1 ... col31]
                   // ...
                   //
-                  // As such we read consequtive entries with 256 threads (8rows x 32 columns)
+                  // As such we read consecutive entries with 256 threads (8rows x 32 columns)
                   // as j increase, the row increase by a factor of 8
                   // We load 8 rows per subrow loop, and subrow increase by 8 per loop
                   // so we have an offset of 8 rows every loop or (subrow/warps)*8 = (subrow/8)*8
@@ -2747,7 +2747,7 @@ template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int T
                     // each of these has 32 values in total for 32*4 = 128 as offset if odd
                     // every set of 4 columns increases the total offset by 16
                     // each even row increase the offset by 4, for example row 2 is offset by 4, 4 by 6 etc so: subrow/2*4 = subrow*2
-                    // this happends every 8 rows anew (subrow % 8)
+                    // this happens every 8 rows anew (subrow % 8)
                     // one writes 4 columns at once that is (col % 4) for the particular index in the subtile
                     int subcol = warp_lane;
 
@@ -3073,7 +3073,7 @@ template <int FORMAT> __global__ void kExtractOutliers(char *A, int *idx, char *
 //// 4. do dequantization from register of B into second pair of registers
 //// 5. store (4) into fragment
 //// 6. matmul aggregate into fragment C
-//// 7. aggreecate files of C into shared memroy block C
+//// 7. aggregate files of C into shared memory block C
 //// 8. sum (7)
 //// 9. write outputs to matmul output matrix
 //}
diff --git a/deploy.sh b/deploy.sh
index c261ee9a9..e60373627 100644
--- a/deploy.sh
+++ b/deploy.sh
@@ -5,7 +5,7 @@ echo "MAKE SURE LD_LIBRARY_PATH IS EMPTY!"
 echo $LD_LIBRARY_PATH
 
 if [[ ! -z "${LD_LIBRARY_PATH}" ]]; then
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -24,7 +24,7 @@ make cpuonly CUDA_VERSION="CPU"
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cpu.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -34,7 +34,7 @@ make cuda110 CUDA_VERSION=110
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda110.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -44,7 +44,7 @@ make cuda11x CUDA_VERSION=111
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda111.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -54,7 +54,7 @@ make cuda11x CUDA_VERSION=114
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda114.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -64,7 +64,7 @@ make cuda11x CUDA_VERSION=115
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda115.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -74,7 +74,7 @@ make cuda11x CUDA_VERSION=117
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda117.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -84,7 +84,7 @@ make cuda118 CUDA_VERSION=118
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda118.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -94,7 +94,7 @@ make cuda12x CUDA_VERSION=120
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda120.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -104,7 +104,7 @@ make cuda12x CUDA_VERSION=121
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda121.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -114,7 +114,7 @@ make cuda12x CUDA_VERSION=122
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda122.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -124,7 +124,7 @@ make cuda12x CUDA_VERSION=123
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda123.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -138,7 +138,7 @@ make cuda110_nomatmul CUDA_VERSION=110
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda110_nocublaslt.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -149,7 +149,7 @@ make cuda11x_nomatmul CUDA_VERSION=111
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda111_nocublaslt.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -159,7 +159,7 @@ make cuda11x_nomatmul CUDA_VERSION=114
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda114_nocublaslt.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -169,7 +169,7 @@ make cuda11x_nomatmul CUDA_VERSION=115
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda115_nocublaslt.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -179,7 +179,7 @@ make cuda11x_nomatmul CUDA_VERSION=117
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda117_nocublaslt.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -189,7 +189,7 @@ make cuda118_nomatmul CUDA_VERSION=118
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda118_nocublaslt.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -199,7 +199,7 @@ make cuda12x_nomatmul CUDA_VERSION=120
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda120_nocublaslt.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -209,7 +209,7 @@ make cuda12x_nomatmul CUDA_VERSION=121
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda121_nocublaslt.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -219,7 +219,7 @@ make cuda12x_nomatmul CUDA_VERSION=122
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda122_nocublaslt.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -229,7 +229,7 @@ make cuda12x_nomatmul CUDA_VERSION=123
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda123_nocublaslt.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
diff --git a/howto_config_override.md b/howto_config_override.md
index 55b24e3ab..88fa7d4b1 100644
--- a/howto_config_override.md
+++ b/howto_config_override.md
@@ -29,9 +29,9 @@ Possible options for the config override are: `betas, eps, weight_decay, lr, opt
 For overrides for particular layers we recommend overriding locally in each module. You can do this by passing the module, the parameter, and its attribute name to the GlobalOptimManager:
 ```python
 class MyModule(torch.nn.Module):
-  def __init__(din, dout):
+  def __init__(self, d_in, d_out):
     super(MyModule, self).__init__()
-    self.linear = torch.nn.Linear(din, dout)
+    self.linear = torch.nn.Linear(d_in, d_out)
     # optimization will happen in 32-bit and
     # learning rate will be set to 0.0001 independent of the main learning rate
     config = {'optim_bits': 32, 'lr' : 0.0001}
diff --git a/include/Algo-Direct2.h b/include/Algo-Direct2.h
index d5fa58d12..4211c77bd 100644
--- a/include/Algo-Direct2.h
+++ b/include/Algo-Direct2.h
@@ -157,7 +157,7 @@ struct AlgoVecBase<I, T, A, typename std::enable_if<DirectAux::IsDirect2<A>::val
         FVec<AVX, float> vxp = _mm256_i32gather_ps(xi, idxp, sizeof(float));
         IVec<AVX, float> ip = idxm;
 
-#else // do not use gather instrucions
+#else // do not use gather instructions
 
         union U {
             __m256i vec;
diff --git a/include/Portable.h b/include/Portable.h
index 1710b0502..2cec1e7de 100644
--- a/include/Portable.h
+++ b/include/Portable.h
@@ -147,5 +147,5 @@ inline T prev(T x)
     return x;
 }
 
-} // namepsace Details
+} // namespace Details
 } // namespace BinSearch
diff --git a/include/SIMD.h b/include/SIMD.h
index d559e9f55..a2639d3ac 100644
--- a/include/SIMD.h
+++ b/include/SIMD.h
@@ -568,5 +568,5 @@ FORCE_INLINE FVec<AVX, double> mulSub(const FVec<AVX, double>& a, const FVec<AVX
 
 #endif
 
-} // namepsace Details
+} // namespace Details
 } // namespace BinSearch
diff --git a/tests/test_modules.py b/tests/test_modules.py
index 1cb04044f..a59ae9e68 100644
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -483,7 +483,14 @@ def test_linear8bitlt_no_fp16_weights(threshold, memory_efficient_backward):
         assert (idx == 0).sum().item() <= b1.numel() * 0.005
 
 
-@pytest.mark.parametrize("module", [lambda nin, nout, bias=True: bnb.nn.Linear8bitLt(nin, nout, bias=bias, has_fp16_weights=False), bnb.nn.LinearFP4], ids=['Int8Lt', 'FP4'])
+@pytest.mark.parametrize(
+    "module",
+    [
+        lambda n_in, n_out, bias=True: bnb.nn.Linear8bitLt(n_in, n_out, bias=bias, has_fp16_weights=False),
+        bnb.nn.LinearFP4,
+    ],
+    ids=['Int8Lt', 'FP4'],
+)
 def test_linear_kbit_fp32_bias(module):
     # casts model to fp16 -> int8 automatically
     l1 = module(32, 64).cuda()