44"""
55
66import pytest
7- from spyre_util import (VLLM_VERSIONS , compare_results , create_random_request ,
7+ from spyre_util import (compare_results , create_random_request ,
88 generate_hf_output , generate_spyre_vllm_output ,
99 get_spyre_backend_list , get_spyre_model_list )
1010from vllm import EngineArgs , SamplingParams
3333 "warmup_shape" , [(64 , 20 , 4 ), (64 , 20 , 8 ), (128 , 20 , 4 ),
3434 (128 , 20 , 8 )]) # (prompt_length/new_tokens/batch_size)
3535@pytest .mark .parametrize ("backend" , get_spyre_backend_list ())
36- @pytest .mark .parametrize ("vllm_version" , VLLM_VERSIONS )
37- def test_output (
38- model : str ,
39- prompts : list [str ],
40- warmup_shape : tuple [int , int , int ],
41- backend : str ,
42- vllm_version : str ,
43- ) -> None :
36+ def test_output (model : str , prompts : list [str ],
37+ warmup_shape : tuple [int , int , int ], backend : str ) -> None :
4438 '''
4539 The warmup is based on a single shape. After the warmup,
4640 one request with the provided prompts is input to vLLM.
@@ -70,8 +64,7 @@ def test_output(
         block_size=2048,
         sampling_params=vllm_sampling_params,
         tensor_parallel_size=1,
-        backend=backend,
-        vllm_version=vllm_version)
+        backend=backend)
 
     hf_results = generate_hf_output(model=model,
                                     prompts=prompts,
@@ -94,13 +87,11 @@ def test_output(
 @pytest.mark.parametrize(
     "warmup_shape", [(64, 20, 4)])  # (prompt_length/new_tokens/batch_size)
 @pytest.mark.parametrize("backend", ["sendnn_decoder"])
-@pytest.mark.parametrize("vllm_version", VLLM_VERSIONS)
 def test_output_sendnn_decoder(
     model: str,
     prompts: list[str],
     warmup_shape: tuple[int, int, int],
     backend: str,
-    vllm_version: str,
 ) -> None:
     '''
     Tests the deprecated sendnn_decoder backend, which should fall-back to
@@ -123,8 +114,7 @@ def test_output_sendnn_decoder(
         block_size=2048,
         sampling_params=vllm_sampling_params,
         tensor_parallel_size=1,
-        backend=backend,
-        vllm_version=vllm_version)
+        backend=backend)
 
     hf_results = generate_hf_output(model=model,
                                     prompts=prompts,
@@ -141,11 +131,9 @@ def test_output_sendnn_decoder(
 
 @pytest.mark.parametrize("model", get_spyre_model_list())
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
-@pytest.mark.parametrize("vllm_version", VLLM_VERSIONS)
 def test_batch_handling(
     model: str,
     backend: str,
-    vllm_version: str,
 ):
     """Test that the spyre worker correctly handles batches of requests that
     finish after different numbers of forward passes"""
@@ -178,8 +166,7 @@ def test_batch_handling(
         block_size=2048,
         sampling_params=vllm_sampling_params,
         tensor_parallel_size=1,
-        backend=backend,
-        vllm_version=vllm_version)
+        backend=backend)
 
     assert vllm_results[0]["text"] == " 3 2 "
     assert vllm_results[1]["text"] == " 6 5 4 3 2 "
@@ -189,10 +176,7 @@ def test_batch_handling(
 
 @pytest.mark.parametrize("model", get_spyre_model_list())
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
-@pytest.mark.parametrize("vllm_version",
-                         [pytest.param("V1", marks=pytest.mark.v1, id="v1")])
-def test_full_batch_scheduling(model: str, backend: str, vllm_version: str,
-                               monkeypatch):
+def test_full_batch_scheduling(model: str, backend: str, monkeypatch):
     """Test that we can schedule a full batch of prompts."""
 
     # We need to ensure here that the max number of tokens in a full batch
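For orientation, here is a minimal sketch of the post-change test flow: generate output through vLLM on the selected Spyre backend, generate a Hugging Face reference, and compare, with no vllm_version parameter anywhere. Only the keyword arguments visible in the hunks above are taken from the source; the prompt values and the argument names not shown there (warmup_shapes, max_new_tokens) are assumptions for illustration, and the compare_results call is left as a comment because its signature does not appear in this diff.

# Hedged sketch only -- not the repository's exact test. Names marked
# "assumed" below do not appear in the diff above.
import pytest
from spyre_util import (generate_hf_output, generate_spyre_vllm_output,
                        get_spyre_backend_list, get_spyre_model_list)
from vllm import SamplingParams


@pytest.mark.parametrize("model", get_spyre_model_list())
@pytest.mark.parametrize("backend", get_spyre_backend_list())
def test_output_sketch(model: str, backend: str) -> None:
    prompts = ["Hello"]         # illustrative prompt
    warmup_shape = (64, 20, 4)  # (prompt_length, new_tokens, batch_size)
    sampling_params = SamplingParams(max_tokens=warmup_shape[1], temperature=0)

    # Spyre-backed vLLM generation; the call no longer takes vllm_version.
    vllm_results = generate_spyre_vllm_output(
        model=model,
        prompts=prompts,
        warmup_shapes=[warmup_shape],  # assumed keyword
        block_size=2048,
        sampling_params=sampling_params,
        tensor_parallel_size=1,
        backend=backend)

    # Hugging Face reference generation for the same prompts.
    hf_results = generate_hf_output(model=model,
                                    prompts=prompts,
                                    max_new_tokens=warmup_shape[1])  # assumed

    # Sanity check; the real tests call compare_results(...) from spyre_util
    # to check the vLLM text and logprobs against the Hugging Face reference.
    assert len(vllm_results) == len(hf_results) == len(prompts)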