
Commit 9a3e631

Remove static token quantization (#487)
* remove static per token quantization
* earlier validation
* fix tests
* dummy commit
* dummy commit 2
* dummy commit 3
* dummy commit 4
* fix test

Signed-off-by: Kyle Sayers <[email protected]>
1 parent ef898c4 commit 9a3e631

6 files changed: +8, -61 lines changed

src/compressed_tensors/quantization/lifecycle/initialize.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -199,7 +199,7 @@ def initialize_qparams(
         expected_shape = (1,)
 
     elif strategy == QuantizationStrategy.TOKEN:
-        expected_shape = (1, 1)
+        raise ValueError("Cannot perform static token quantization")
 
     elif strategy == QuantizationStrategy.CHANNEL:
         if len(observed_shape) < 2:
```
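Why a hard error instead of a `(1, 1)` placeholder: per-token scales depend on how many tokens arrive in each batch, so they cannot be materialized as a fixed-shape parameter at initialization time. The sketch below illustrates dynamic per-token scale computation; it is illustrative only, and the function name and symmetric int8 assumption are not from this repo:

```python
import torch

def per_token_scale(x: torch.Tensor, qmax: float = 127.0) -> torch.Tensor:
    # x: activations of shape (num_tokens, hidden_dim); num_tokens varies
    # per batch, which is why no static scale parameter can be allocated.
    # Returns one symmetric scale per token, shape (num_tokens, 1).
    return x.abs().amax(dim=-1, keepdim=True) / qmax
```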

src/compressed_tensors/quantization/quant_args.py

Lines changed: 7 additions & 0 deletions
```diff
@@ -264,6 +264,7 @@ def validate_model_after(model: "QuantizationArgs") -> "QuantizationArgs":
         actorder = model.actorder
         dynamic = model.dynamic
         observer = model.observer
+        dynamic = model.dynamic
 
         # infer strategy
         if strategy is None:
@@ -279,6 +280,12 @@ def validate_model_after(model: "QuantizationArgs") -> "QuantizationArgs":
                 "strategy='group' and group_size = -1 for 'channel'"
             )
 
+        # validate token strategy
+        if strategy == QuantizationStrategy.TOKEN and not dynamic:
+            raise ValueError(
+                "Cannot perform static token quantization, please use `dynamic=True`"
+            )
+
         # validate group strategy
         if strategy == QuantizationStrategy.GROUP:
             if group_size is None or group_size <= 0:
```
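With this check, an unsupported configuration fails at `QuantizationArgs` construction rather than later during parameter initialization. A sketch of the expected behavior, assuming the package re-exports `QuantizationArgs` (pydantic's `ValidationError` subclasses `ValueError`, so the raised error is catchable as below):

```python
from compressed_tensors.quantization import QuantizationArgs

# Dynamic per-token activation quantization is still supported:
QuantizationArgs(strategy="token", dynamic=True)

# Static per-token quantization is now rejected up front:
try:
    QuantizationArgs(strategy="token", dynamic=False)
except ValueError as err:
    print(err)  # message includes "Cannot perform static token quantization"
```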

tests/conftest.py

Lines changed: 0 additions & 21 deletions
```diff
@@ -29,27 +29,6 @@ def _get_dim(dim: int, value: torch.Tensor):
     return reduce_dims
 
 
-@pytest.fixture
-def mock_per_token_calibration():
-    def update_scale_zp(module: torch.nn.Module, base_name: str, value: torch.Tensor):
-        quantization_scheme = getattr(module, "quantization_scheme", None)
-        if not quantization_scheme:
-            # no quantization scheme nothing to do
-            return
-
-        arg_name = "weights" if base_name == "weight" else f"{base_name}_activations"
-        args = getattr(quantization_scheme, arg_name, None)
-
-        dim = _get_dim({0, 1}, value)
-        min_val = torch.amin(value, dim=dim, keepdims=True)
-        max_val = torch.amax(value, dim=dim, keepdims=True)
-        scale, zp = calculate_qparams(min_val, max_val, args)
-        update_parameter_data(module, scale, f"{base_name}_scale")
-        update_parameter_data(module, zp, f"{base_name}_zero_point")
-
-    return update_scale_zp
-
-
 @pytest.fixture
 def mock_per_group_calibration():
     def update_scale_zp(
```
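Note what the removed fixture actually computed: reducing over both dimensions of a 2-D activation yields a single `(1, 1)` min/max pair, so the "static token" scale was one value for the whole tensor, indistinguishable from per-tensor quantization. A quick illustration in plain PyTorch (not repo code):

```python
import torch

x = torch.randn(300, 200)  # (num_tokens, hidden_dim)
min_val = torch.amin(x, dim=(0, 1), keepdim=True)
max_val = torch.amax(x, dim=(0, 1), keepdim=True)
print(min_val.shape, max_val.shape)  # torch.Size([1, 1]) twice: per-tensor, not per-token
```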

tests/test_quantization/lifecycle/test_initialize.py

Lines changed: 0 additions & 7 deletions
```diff
@@ -176,10 +176,6 @@ def test_initialize_module_for_quantization_offloaded(
             QuantizationArgs(strategy="block", block_structure=[2, 4]),
             None,
         ),
-        (
-            QuantizationArgs(strategy="token"),
-            QuantizationArgs(strategy="token"),
-        ),
     ],
 )
 def test_initialize_quantization_parameters(weights, input_activations):
@@ -238,9 +234,6 @@ def test_initialize_quantization_parameters(weights, input_activations):
             # For activations or when block_structure is None
            expected_shape = (1,)
 
-        elif args.strategy == QuantizationStrategy.TOKEN:
-            expected_shape = (1, 1)
-
         if not args.dynamic:
             assert getattr(layer, f"{q_param_name}_scale").shape == expected_shape
             assert getattr(layer, f"{q_param_name}_zero_point").shape == expected_shape
```

tests/test_quantization/test_configs/test_strategies.py

Lines changed: 0 additions & 31 deletions
```diff
@@ -105,34 +105,3 @@ def test_group(
         model_shape[1],
         int(model_shape[0] / group_size),
     )
-
-
-@torch.no_grad
-@pytest.mark.parametrize("input_symmetry", [True, False])
-@pytest.mark.parametrize("weight_symmetry", [True, False])
-@pytest.mark.parametrize("input_shape", [(32, 256), (300, 200), (400, 400)])
-def test_token(
-    mock_per_channel_calibration,
-    mock_per_token_calibration,
-    input_symmetry,
-    weight_symmetry,
-    input_shape,
-):
-    model = Linear(input_shape[1], 256)
-    quant_config = create_config(
-        input_symmetry,
-        weight_symmetry,
-        w_strategy=QuantizationStrategy.CHANNEL,
-        i_strategy=QuantizationStrategy.TOKEN,
-    )
-    apply_quantization_config(model, quant_config)
-
-    inputs = torch.randn(input_shape)
-    mock_per_channel_calibration(model, base_name="weight", value=model.weight)
-    mock_per_token_calibration(model, base_name="input", value=inputs)
-
-    assert model.input_scale.shape == (1, 1)
-    assert model.input_zero_point.shape == (1, 1)
-
-    assert model.weight_scale.shape == (256, 1)
-    assert model.weight_zero_point.shape == (256, 1)
```

tests/test_quantization/test_utils/test_helpers.py

Lines changed: 0 additions & 1 deletion
```diff
@@ -50,7 +50,6 @@
                 ]
             ),
         ),
-        (True, "token", torch.Size([1, 1])),
     ],
 )
 def test_calculate_qparams(keepdims, strategy, exp_shape):
```
