44"""
55
66import pytest
7- from spyre_util import (VLLM_VERSIONS , compare_results , create_random_request ,
7+ from spyre_util import (compare_results , create_random_request ,
88 generate_hf_output , generate_spyre_vllm_output ,
99 get_spyre_backend_list , get_spyre_model_list )
1010from vllm import EngineArgs , SamplingParams
3333 "warmup_shape" , [(64 , 20 , 4 ), (64 , 20 , 8 ), (128 , 20 , 4 ),
3434 (128 , 20 , 8 )]) # (prompt_length/new_tokens/batch_size)
3535@pytest .mark .parametrize ("backend" , get_spyre_backend_list ())
36- @pytest .mark .parametrize ("vllm_version" , VLLM_VERSIONS )
3736def test_output (
3837 model : str ,
3938 prompts : list [str ],
4039 warmup_shape : tuple [int , int , int ],
4140 backend : str ,
42- vllm_version : str ,
4341 monkeypatch : pytest .MonkeyPatch ,
4442) -> None :
4543 '''
@@ -72,7 +70,6 @@ def test_output(
         sampling_params=vllm_sampling_params,
         tensor_parallel_size=1,
         backend=backend,
-        vllm_version=vllm_version,
         monkeypatch=monkeypatch)
 
     hf_results = generate_hf_output(model=model,
@@ -96,13 +93,11 @@ def test_output(
 @pytest.mark.parametrize(
     "warmup_shape", [(64, 20, 4)])  # (prompt_length/new_tokens/batch_size)
 @pytest.mark.parametrize("backend", ["sendnn_decoder"])
-@pytest.mark.parametrize("vllm_version", VLLM_VERSIONS)
 def test_output_sendnn_decoder(
     model: str,
     prompts: list[str],
     warmup_shape: tuple[int, int, int],
     backend: str,
-    vllm_version: str,
     monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     '''
@@ -127,7 +122,6 @@ def test_output_sendnn_decoder(
         sampling_params=vllm_sampling_params,
         tensor_parallel_size=1,
         backend=backend,
-        vllm_version=vllm_version,
         monkeypatch=monkeypatch)
 
     hf_results = generate_hf_output(model=model,
@@ -145,11 +139,9 @@ def test_output_sendnn_decoder(
 
 @pytest.mark.parametrize("model", get_spyre_model_list())
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
-@pytest.mark.parametrize("vllm_version", VLLM_VERSIONS)
 def test_batch_handling(
     model: str,
     backend: str,
-    vllm_version: str,
     monkeypatch: pytest.MonkeyPatch,
 ):
     """Test that the spyre worker correctly handles batches of requests that
@@ -184,7 +176,6 @@ def test_batch_handling(
         sampling_params=vllm_sampling_params,
         tensor_parallel_size=1,
         backend=backend,
-        vllm_version=vllm_version,
         monkeypatch=monkeypatch)
 
     assert vllm_results[0]["text"] == " 3 2 "
@@ -195,10 +186,7 @@ def test_batch_handling(
 
 @pytest.mark.parametrize("model", get_spyre_model_list())
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
-@pytest.mark.parametrize("vllm_version",
-                         [pytest.param("V1", marks=pytest.mark.v1, id="v1")])
-def test_full_batch_scheduling(model: str, backend: str, vllm_version: str,
-                               monkeypatch):
+def test_full_batch_scheduling(model: str, backend: str, monkeypatch):
     """Test that we can schedule a full batch of prompts."""
 
     # We need to ensure here that the max number of tokens in a full batch
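
Note on what the removed decorators did: stacked @pytest.mark.parametrize decorators combine as a cross-product, so the deleted "vllm_version" axis multiplied every other parameter combination. A minimal sketch of the mechanism, using illustrative names ("engine", "shape") that are not from this repo:

    import pytest

    # Stacked parametrize decorators multiply: 2 engines x 3 shapes = 6 cases.
    @pytest.mark.parametrize("engine", ["V0", "V1"])
    @pytest.mark.parametrize("shape", [(64, 20, 4), (64, 20, 8), (128, 20, 4)])
    def test_matrix(engine: str, shape: tuple[int, int, int]) -> None:
        prompt_length, new_tokens, batch_size = shape
        assert engine in ("V0", "V1")
        assert batch_size in (4, 8)

Dropping the "engine" axis here would cut the collected cases from 6 to 3 without touching the remaining parameters, which is the effect this commit has by removing the "vllm_version" axis.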
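
Each test also threads pytest's built-in monkeypatch fixture through to generate_spyre_vllm_output, presumably so the helper can set per-test environment or configuration without leaking state across tests. A generic sketch of that pattern, assuming a hypothetical variable name (the real helper in spyre_util decides what it actually sets):

    import os
    import pytest

    def configure_backend(backend: str, monkeypatch: pytest.MonkeyPatch) -> None:
        # Hypothetical variable name, for illustration only; monkeypatch
        # undoes the change automatically when the test finishes.
        monkeypatch.setenv("SPYRE_BACKEND", backend)

    def test_backend_env(monkeypatch: pytest.MonkeyPatch) -> None:
        configure_backend("eager", monkeypatch)
        assert os.environ["SPYRE_BACKEND"] == "eager"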