diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index 44116cc63..fbd756464 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -2,13 +2,13 @@
 #   
 # This source code is licensed under the MIT license found in the 
 # LICENSE file in the root directory of this source tree.
+import ctypes as ct
 import os
 import random
-import math
-import ctypes as ct
+from typing import Tuple
+
 import torch
 from torch import Tensor
-from typing import Tuple
 
 lib = ct.cdll.LoadLibrary(os.path.dirname(__file__) + '/libbitsandbytes.so')
 name2qmap = {}
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index dc0a1713a..c5460fb83 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -7,7 +7,6 @@
 from typing import Optional
 
 from torch import Tensor
-from torch.nn.parameter import Parameter
 import torch.nn.functional as F
 
 from bitsandbytes.optim import GlobalOptimManager
diff --git a/bitsandbytes/optim/adagrad.py b/bitsandbytes/optim/adagrad.py
index 84ade3c78..4f51250ed 100644
--- a/bitsandbytes/optim/adagrad.py
+++ b/bitsandbytes/optim/adagrad.py
@@ -2,11 +2,8 @@
 #   
 # This source code is licensed under the MIT license found in the 
 # LICENSE file in the root directory of this source tree.
-import torch
 from bitsandbytes.optim.optimizer import Optimizer1State
 
-torch.optim.Adagrad
-
 class Adagrad(Optimizer1State):
     def __init__(self, params, lr=1e-2, lr_decay=0, weight_decay=0, initial_accumulator_value=0, eps=1e-10,
             optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True):
diff --git a/bitsandbytes/optim/adamw.py b/bitsandbytes/optim/adamw.py
index 7761f3b7a..c4f0355c4 100644
--- a/bitsandbytes/optim/adamw.py
+++ b/bitsandbytes/optim/adamw.py
@@ -2,9 +2,7 @@
 #   
 # This source code is licensed under the MIT license found in the 
 # LICENSE file in the root directory of this source tree.
-import torch
 from bitsandbytes.optim.optimizer import Optimizer2State
-import bitsandbytes.functional as F
 
 class AdamW(Optimizer2State):
     def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
diff --git a/bitsandbytes/optim/lars.py b/bitsandbytes/optim/lars.py
index 40dede706..912520df5 100644
--- a/bitsandbytes/optim/lars.py
+++ b/bitsandbytes/optim/lars.py
@@ -12,7 +12,7 @@ def __init__(self, params, lr, momentum=0, dampening=0,
                  weight_decay=0, nesterov=False, optim_bits=32, args=None,
                  min_8bit_size=4096, percentile_clipping=100, max_unorm=0.02):
         if momentum == 0:
-            raise NotImplementError(f'LARS without momentum is not supported!')
+            raise NotImplementedError(f'LARS without momentum is not supported!')
         super(LARS, self).__init__('lars', params, lr, (momentum, dampening), 0.0,
                 weight_decay, optim_bits, args, min_8bit_size, percentile_clipping, max_unorm=max_unorm, block_wise=False)
 
@@ -21,7 +21,7 @@ def __init__(self, params, lr, momentum=0, dampening=0,
                  weight_decay=0, nesterov=False, args=None,
                  min_8bit_size=4096, percentile_clipping=100, max_unorm=0.02):
         if momentum == 0:
-            raise NotImplementError(f'LARS without momentum is not supported!')
+            raise NotImplementedError(f'LARS without momentum is not supported!')
         super(LARS8bit, self).__init__('lars', params, lr, (momentum, dampening), 0.0,
                 weight_decay, 8, args, min_8bit_size, percentile_clipping, max_unorm=max_unorm, block_wise=False)
 
@@ -30,7 +30,7 @@ def __init__(self, params, lr, momentum=0, dampening=0,
                  weight_decay=0, nesterov=False, args=None,
                  min_8bit_size=4096, percentile_clipping=100, max_unorm=0.02):
         if momentum == 0:
-            raise NotImplementError(f'LARS without momentum is not supported!')
+            raise NotImplementedError(f'LARS without momentum is not supported!')
         super(LARS32bit, self).__init__('lars', params, lr, (momentum, dampening), 0.0,
                 weight_decay, 32, args, min_8bit_size, percentile_clipping, max_unorm=max_unorm, block_wise=False)
 
diff --git a/bitsandbytes/optim/rmsprop.py b/bitsandbytes/optim/rmsprop.py
index 99b718e43..7909d5daf 100644
--- a/bitsandbytes/optim/rmsprop.py
+++ b/bitsandbytes/optim/rmsprop.py
@@ -2,16 +2,15 @@
 #   
 # This source code is licensed under the MIT license found in the 
 # LICENSE file in the root directory of this source tree.
-import torch
 from bitsandbytes.optim.optimizer import Optimizer1State
 
 class RMSprop(Optimizer1State):
     def __init__(self, params, lr=1e-2, alpha=0.99, eps=1e-8, weight_decay=0, momentum=0, centered=False, optim_bits=32, args=None,
             min_8bit_size=4096, percentile_clipping=100, block_wise=True):
         if alpha == 0:
-            raise NotImplementError(f'RMSprop with alpha==0.0 is not supported!')
+            raise NotImplementedError(f'RMSprop with alpha==0.0 is not supported!')
         if centered:
-            raise NotImplementError(f'Centered RMSprop is not supported!')
+            raise NotImplementedError(f'Centered RMSprop is not supported!')
         super(RMSprop, self).__init__('rmsprop', params, lr, (alpha, momentum), eps,
                 weight_decay, optim_bits, args, min_8bit_size, percentile_clipping, block_wise)
 
@@ -19,9 +18,9 @@ class RMSprop8bit(Optimizer1State):
     def __init__(self, params, lr=1e-2, alpha=0.99, eps=1e-8, weight_decay=0, momentum=0, centered=False, args=None,
             min_8bit_size=4096, percentile_clipping=100, block_wise=True):
         if alpha == 0:
-            raise NotImplementError(f'RMSprop with alpha==0.0 is not supported!')
+            raise NotImplementedError(f'RMSprop with alpha==0.0 is not supported!')
         if centered:
-            raise NotImplementError(f'Centered RMSprop is not supported!')
+            raise NotImplementedError(f'Centered RMSprop is not supported!')
         super(RMSprop8bit, self).__init__('rmsprop', params, lr, (alpha, momentum), eps,
                 weight_decay, 8, args, min_8bit_size, percentile_clipping, block_wise)
 
@@ -30,7 +29,7 @@ def __init__(self, params, lr=1e-2, alpha=0.99, eps=1e-8, weight_decay=0, moment
             min_8bit_size=4096, percentile_clipping=100, block_wise=True):
 
         if alpha == 0:
-            raise NotImplementError(f'RMSprop with alpha==0.0 is not supported!')
+            raise NotImplementedError(f'RMSprop with alpha==0.0 is not supported!')
         if centered:
             raise NotImplementError(f'Centered RMSprop is not supported!')
         super(RMSprop32bit, self).__init__('rmsprop', params, lr, (alpha, momentum), eps,
diff --git a/bitsandbytes/optim/sgd.py b/bitsandbytes/optim/sgd.py
index 926d8046a..052987977 100644
--- a/bitsandbytes/optim/sgd.py
+++ b/bitsandbytes/optim/sgd.py
@@ -9,7 +9,7 @@ def __init__(self, params, lr, momentum=0, dampening=0,
                  weight_decay=0, nesterov=False, optim_bits=32, args=None,
             min_8bit_size=4096, percentile_clipping=100, block_wise=True):
         if momentum == 0:
-            raise NotImplementError(f'SGD without momentum is not supported!')
+            raise NotImplementedError(f'SGD without momentum is not supported!')
         super(SGD, self).__init__('momentum', params, lr, (momentum, dampening), 0.0,
                 weight_decay, optim_bits, args, min_8bit_size, percentile_clipping, block_wise)
 
@@ -18,7 +18,7 @@ def __init__(self, params, lr, momentum=0, dampening=0,
                  weight_decay=0, nesterov=False, args=None,
             min_8bit_size=4096, percentile_clipping=100, block_wise=True):
         if momentum == 0:
-            raise NotImplementError(f'SGD without momentum is not supported!')
+            raise NotImplementedError(f'SGD without momentum is not supported!')
         super(SGD8bit, self).__init__('momentum', params, lr, (momentum, dampening), 0.0,
                 weight_decay, 8, args, min_8bit_size, percentile_clipping, block_wise)
 
@@ -27,6 +27,6 @@ def __init__(self, params, lr, momentum=0, dampening=0,
                  weight_decay=0, nesterov=False, args=None,
             min_8bit_size=4096, percentile_clipping=100, block_wise=True):
         if momentum == 0:
-            raise NotImplementError(f'SGD without momentum is not supported!')
+            raise NotImplementedError(f'SGD without momentum is not supported!')
         super(SGD32bit, self).__init__('momentum', params, lr, (momentum, dampening), 0.0,
                 weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise)
diff --git a/tests/test_modules.py b/tests/test_modules.py
index 6cbee7ba8..a0379cb52 100644
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -6,10 +6,6 @@
 import torch
 import bitsandbytes as bnb
 
-from itertools import product
-
-from bitsandbytes import functional as F
-
 
 @pytest.mark.parametrize("embcls", [bnb.nn.Embedding, bnb.nn.StableEmbedding], ids=['Embedding', 'StableEmbedding'])
 def test_embeddings(embcls):
diff --git a/tests/test_optim.py b/tests/test_optim.py
index 5464043b2..c80fe5107 100644
--- a/tests/test_optim.py
+++ b/tests/test_optim.py
@@ -7,7 +7,6 @@
 import shutil
 import uuid
 import pytest
-import ctypes
 import torch
 import bitsandbytes as bnb
 import bitsandbytes.functional as F