Add comment

rafvasq · rafvasq · commit bb17bbf74563 · 2025-07-23T15:55:39.000-04:00
Signed-off-by: Rafael Vasquez <rafvasq21@gmail.com>
diff --git a/tests/e2e/test_spyre_online.py b/tests/e2e/test_spyre_online.py
@@ -83,31 +83,6 @@ def test_openai_serving_gptq(remote_openai_server, model, backend,
     assert len(completion.choices[0].text) > 0
 
 
-@pytest.mark.quantized
-@pytest.mark.parametrize("model", get_spyre_model_list(quantization="fp8"))
-@pytest.mark.parametrize("backend", ["sendnn"])
-@pytest.mark.parametrize("warmup_shape", [[(64, 20, 1)]])
-def test_openai_serving_fp8(remote_openai_server, model, backend,
-                            warmup_shape):
-    """Test online serving a GPTQ model with the sendnn backend only"""
-
-    client = remote_openai_server.get_client()
-    completion = client.completions.create(model=model,
-                                           prompt="Hello World!",
-                                           max_tokens=5,
-                                           temperature=0.0)
-    assert len(completion.choices) == 1
-    assert len(completion.choices[0].text) > 0
-
-    completion = client.completions.create(model=model,
-                                           prompt="Hello World!",
-                                           max_tokens=5,
-                                           temperature=1.0,
-                                           n=2)
-    assert len(completion.choices) == 2
-    assert len(completion.choices[0].text) > 0
-
-
 @pytest.mark.parametrize("model", get_spyre_model_list())
 @pytest.mark.parametrize("cb",
                          [pytest.param(1, marks=pytest.mark.cb, id="cb")])
diff --git a/vllm_spyre/platform.py b/vllm_spyre/platform.py
@@ -40,6 +40,8 @@ class SpyrePlatform(Platform):
     # "spyre" device_name no longer worked due to https://github.com/vllm-project/vllm/pull/16464
     device_name: str = "cpu"
     device_type: str = "cpu"
+    # "compressed-tensors" is supported via fms-model-optimizer's AIU add-ons:
+    # https://github.com/foundation-model-stack/fms-model-optimizer/blob/main/fms_mo/aiu_addons/__init__.py
     supported_quantization: list[str] = ["gptq", "fp8", "compressed-tensors"]
     _warmup_shapes: Optional[tuple[dict[str, int], ...]] = None
     _block_size: int = 64  # hardcoded Spyre constraint for now