quantization lifecycle - disable forward pass override + helper for we…

…ight quant param updates (vllm-project#111)
markmc · Jul 18, 2024 · 1d4a39f · 1d4a39f
1 parent 0c2d88b
commit 1d4a39f
Show file tree

Hide file tree

Showing 4 changed files with 116 additions and 0 deletions.
diff --git a/src/compressed_tensors/quantization/lifecycle/__init__.py b/src/compressed_tensors/quantization/lifecycle/__init__.py
@@ -21,3 +21,4 @@
 from .initialize import *
 from .compressed import *
 from .apply import *
+from .helpers import *
diff --git a/src/compressed_tensors/quantization/lifecycle/forward.py b/src/compressed_tensors/quantization/lifecycle/forward.py
@@ -245,6 +245,11 @@ def wrap_module_forward_quantized(module: Module, scheme: QuantizationScheme):
 
     @wraps(forward_func_orig)  # ensures docstring, names, etc are propagated
     def wrapped_forward(self, *args, **kwargs):
+        if not getattr(module, "quantization_enabled", True):
+            # quantization is disabled on forward passes, return baseline
+            # forward call
+            return forward_func_orig.__get__(module, module.__class__)(*args, **kwargs)
+
         input_ = args[0]
 
         if scheme.input_activations is not None:

diff --git a/src/compressed_tensors/quantization/lifecycle/helpers.py b/src/compressed_tensors/quantization/lifecycle/helpers.py
@@ -0,0 +1,53 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Miscellaneous helpers for the quantization lifecycle
+"""
+
+
+from torch.nn import Module
+
+
+__all__ = [
+    "update_layer_weight_quant_params",
+    "enable_quantization",
+    "disable_quantization",
+]
+
+
+def update_layer_weight_quant_params(layer: Module):
+    weight = getattr(layer, "weight", None)
+    scale = getattr(layer, "weight_scale", None)
+    zero_point = getattr(layer, "weight_zero_point", None)
+    observer = getattr(layer, "weight_observer", None)
+
+    if weight is None or observer is None or scale is None or zero_point is None:
+        # scale, zp, or observer not calibratable or weight not available
+        return
+
+    updated_scale, updated_zero_point = observer(weight)
+
+    # update scale and zero point
+    device = next(layer.parameters()).device
+    scale.data = updated_scale.to(device)
+    zero_point.data = updated_zero_point.to(device)
+
+
+def enable_quantization(module: Module):
+    module.quantization_enabled = True
+
+
+def disable_quantization(module: Module):
+    module.quantization_enabled = False
diff --git a/tests/test_quantization/lifecycle/test_enabled.py b/tests/test_quantization/lifecycle/test_enabled.py
@@ -0,0 +1,57 @@
+# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from copy import deepcopy
+
+import torch
+from compressed_tensors.quantization import (
+    QuantizationConfig,
+    apply_quantization_config,
+    disable_quantization,
+    enable_quantization,
+)
+from torch.nn import Linear
+
+
+def test_quantization_enabled_disabled():
+    inp = torch.randn(16)
+    model = Linear(16, 16)
+    quantized_model = deepcopy(model)
+    apply_quantization_config(
+        model=quantized_model,
+        config=QuantizationConfig(
+            config_groups=dict(W4A16=["Linear"]),
+            quantization_status="calibration",
+        ),
+    )
+
+    # run one calibration pass
+    quantized_model(inp)
+
+    model_output = model(inp)
+    quantized_model_output = quantized_model(inp)
+
+    # quantized and non quantized outputs should be different
+    assert not torch.all(model_output == quantized_model_output)
+
+    # disable quantization
+    quantized_model.apply(disable_quantization)
+    # check that quantized model now matches model output
+    assert torch.all(model_output == quantized_model(inp))
+
+    # re-enable quantization
+    quantized_model.apply(enable_quantization)
+    # check that quantized model matches original quantized output
+    assert torch.all(quantized_model_output == quantized_model(inp))