Commit 4a86784

fix conflicts

Signed-off-by: Sophie du Couédic <[email protected]>

Merge of 2 parents: af7f3c7 + 9511982

16 files changed: +70 -169 lines changed

.github/workflows/test.yml

Lines changed: 9 additions & 9 deletions
@@ -39,17 +39,17 @@ jobs:
         - name: "vLLM:main"
           repo: "git+https://github.com/vllm-project/vllm --branch main"
       test_suite:
-        - name: "V0-e2e"
-          markers: "v0 and cpu and e2e"
-          flags: "--timeout=300"
-        - name: "V1-e2e"
-          markers: "v1 and cpu and e2e and not cb"
+        - name: "static batching"
+          markers: "cpu and decoder and not cb"
+          flags: "--timeout=300 --forked"
+        - name: "embedding"
+          markers: "cpu and embedding"
           flags: "--timeout=300 --forked"
-        - name: "V1-cb"
-          markers: "v1 and cpu and cb"
+        - name: "continuous batching"
+          markers: "cpu and cb"
           flags: "--timeout=300 --forked"
-        - name: "V1-worker and utils"
-          markers: "v1 and not e2e or utils"
+        - name: "worker and utils"
+          markers: "not e2e"
           flags: "--timeout=300"
 
     name: "${{ matrix.test_suite.name }} (${{ matrix.vllm_version.name }})"

docs/contributing/README.md

Lines changed: 1 addition & 1 deletion
@@ -92,7 +92,7 @@ uv pip install --group dev
 Now, you can run the tests:
 
 ```sh
-python -m pytest -v -x tests -m "v1 and cpu and e2e"
+python -m pytest -v -x tests -m "cpu and e2e"
 ```
 
 Here is a list of `pytest` markers you can use to filter them:

pyproject.toml

Lines changed: 0 additions & 2 deletions
@@ -121,8 +121,6 @@ pythonpath = ["."]
 markers = [
     "skip_global_cleanup",
     "e2e: Tests using end-to-end engine spin-up",
-    "v0: Tests using vLLM v0 engine",
-    "v1: Tests using vLLM v1 engine",
     "cb: Continuous batching tests",
     "cpu: Tests using CPU (i.e. eager) backend",
     "spyre: Tests using Spyre hardware backend",

tests/conftest.py

Lines changed: 1 addition & 3 deletions
@@ -68,7 +68,6 @@ def remote_openai_server(request):
         model = params['model']
         warmup_shape = params['warmup_shape']
         backend = params['backend']
-        vllm_version = params['vllm_version']
     except KeyError as e:
         raise pytest.UsageError(
             "Error setting up remote_openai_server params") from e
@@ -79,14 +78,13 @@ def remote_openai_server(request):
     warmup_prompt_length = [t[0] for t in warmup_shape]
     warmup_new_tokens = [t[1] for t in warmup_shape]
     warmup_batch_size = [t[2] for t in warmup_shape]
-    v1_flag = "1" if vllm_version == "V1" else "0"
     env_dict = {
         "VLLM_SPYRE_WARMUP_PROMPT_LENS":
         ','.join(map(str, warmup_prompt_length)),
         "VLLM_SPYRE_WARMUP_NEW_TOKENS": ','.join(map(str, warmup_new_tokens)),
         "VLLM_SPYRE_WARMUP_BATCH_SIZES": ','.join(map(str, warmup_batch_size)),
         "VLLM_SPYRE_DYNAMO_BACKEND": backend,
-        "VLLM_USE_V1": v1_flag
+        "VLLM_USE_V1": "1"
     }
 
     # Add extra server args if present in test
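For context, a standalone sketch of how the fixture above flattens the warmup shapes into the comma-separated environment variables. The `warmup_shape` value is an assumed example; with the V0 code path removed, `VLLM_USE_V1` is now always `"1"`:

```python
# Sketch of the env_dict construction above, with an assumed warmup_shape value.
warmup_shape = [(64, 20, 4), (128, 20, 8)]  # (prompt_length, new_tokens, batch_size)

warmup_prompt_length = [t[0] for t in warmup_shape]
warmup_new_tokens = [t[1] for t in warmup_shape]
warmup_batch_size = [t[2] for t in warmup_shape]

env_dict = {
    "VLLM_SPYRE_WARMUP_PROMPT_LENS": ','.join(map(str, warmup_prompt_length)),
    "VLLM_SPYRE_WARMUP_NEW_TOKENS": ','.join(map(str, warmup_new_tokens)),
    "VLLM_SPYRE_WARMUP_BATCH_SIZES": ','.join(map(str, warmup_batch_size)),
    "VLLM_USE_V1": "1",  # always V1 now that the v1_flag toggle is gone
}

print(env_dict["VLLM_SPYRE_WARMUP_PROMPT_LENS"])  # -> 64,128
```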

tests/e2e/test_spyre_basic.py

Lines changed: 2 additions & 14 deletions
@@ -4,7 +4,7 @@
 """
 
 import pytest
-from spyre_util import (VLLM_VERSIONS, compare_results, create_random_request,
+from spyre_util import (compare_results, create_random_request,
                         generate_hf_output, generate_spyre_vllm_output,
                         get_spyre_backend_list, get_spyre_model_list)
 from vllm import EngineArgs, SamplingParams
@@ -33,13 +33,11 @@
     "warmup_shape", [(64, 20, 4), (64, 20, 8), (128, 20, 4),
                      (128, 20, 8)])  # (prompt_length/new_tokens/batch_size)
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
-@pytest.mark.parametrize("vllm_version", VLLM_VERSIONS)
 def test_output(
     model: str,
     prompts: list[str],
     warmup_shape: tuple[int, int, int],
     backend: str,
-    vllm_version: str,
     monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     '''
@@ -72,7 +70,6 @@ def test_output(
         sampling_params=vllm_sampling_params,
         tensor_parallel_size=1,
         backend=backend,
-        vllm_version=vllm_version,
         monkeypatch=monkeypatch)
 
     hf_results = generate_hf_output(model=model,
@@ -96,13 +93,11 @@ def test_output(
 @pytest.mark.parametrize(
     "warmup_shape", [(64, 20, 4)])  # (prompt_length/new_tokens/batch_size)
 @pytest.mark.parametrize("backend", ["sendnn_decoder"])
-@pytest.mark.parametrize("vllm_version", VLLM_VERSIONS)
 def test_output_sendnn_decoder(
     model: str,
     prompts: list[str],
     warmup_shape: tuple[int, int, int],
     backend: str,
-    vllm_version: str,
     monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     '''
@@ -127,7 +122,6 @@ def test_output_sendnn_decoder(
         sampling_params=vllm_sampling_params,
         tensor_parallel_size=1,
         backend=backend,
-        vllm_version=vllm_version,
         monkeypatch=monkeypatch)
 
     hf_results = generate_hf_output(model=model,
@@ -145,11 +139,9 @@ def test_output_sendnn_decoder(
 
 @pytest.mark.parametrize("model", get_spyre_model_list())
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
-@pytest.mark.parametrize("vllm_version", VLLM_VERSIONS)
 def test_batch_handling(
     model: str,
     backend: str,
-    vllm_version: str,
     monkeypatch: pytest.MonkeyPatch,
 ):
     """Test that the spyre worker correctly handles batches of requests that
@@ -184,7 +176,6 @@ def test_batch_handling(
         sampling_params=vllm_sampling_params,
         tensor_parallel_size=1,
         backend=backend,
-        vllm_version=vllm_version,
         monkeypatch=monkeypatch)
 
     assert vllm_results[0]["text"] == " 3 2 "
@@ -195,10 +186,7 @@ def test_batch_handling(
 
 @pytest.mark.parametrize("model", get_spyre_model_list())
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
-@pytest.mark.parametrize("vllm_version",
-                         [pytest.param("V1", marks=pytest.mark.v1, id="v1")])
-def test_full_batch_scheduling(model: str, backend: str, vllm_version: str,
-                               monkeypatch):
+def test_full_batch_scheduling(model: str, backend: str, monkeypatch):
     """Test that we can schedule a full batch of prompts."""
 
     # We need to ensure here that the max number of tokens in a full batch
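After dropping the `vllm_version` axis, each decoder test is parametrized only over model, backend, prompts, and warmup shape. A minimal sketch of the resulting pattern (the model and backend values below are hypothetical placeholders standing in for `get_spyre_model_list()` and `get_spyre_backend_list()`):

```python
import pytest

# Placeholder values; the real tests use get_spyre_model_list()/get_spyre_backend_list().
@pytest.mark.parametrize("model", ["example/decoder-model"])
@pytest.mark.parametrize("backend", ["eager", "sendnn_decoder"])
@pytest.mark.parametrize(
    "warmup_shape", [(64, 20, 4)])  # (prompt_length/new_tokens/batch_size)
def test_output_sketch(model: str, backend: str,
                       warmup_shape: tuple[int, int, int]) -> None:
    # The vllm_version parameter and vllm_version= keyword are gone; only the
    # remaining axes vary per test case.
    assert model and backend and warmup_shape
```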

tests/e2e/test_spyre_cb.py

Lines changed: 11 additions & 54 deletions
@@ -8,14 +8,9 @@
 from typing import Any
 
 import pytest
-<<<<<<< HEAD
 from spyre_util import (compare_results, create_random_request,
                         generate_hf_output, generate_spyre_vllm_output,
-                        get_spyre_model_list)
-=======
-from spyre_util import (create_random_request, generate_cb_spyre_vllm_output,
                         get_spyre_backend_list, get_spyre_model_list)
->>>>>>> origin/main
 from vllm import EngineArgs, SamplingParams
 from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.engine.core import EngineCore
@@ -28,16 +23,12 @@
     "appropriately completes the request. Be polite in your response to the "
     "user.\n\n### Instruction:\n{}\n\n### Response:")
 
-<<<<<<< HEAD
 
 @pytest.mark.cb
 @pytest.mark.parametrize("max_num_seqs", [2, 3, 4],
                          ids=lambda val: f"max_num_seqs({val})")
 @pytest.mark.parametrize("model", get_spyre_model_list())
-@pytest.mark.parametrize(
-    "backend", [pytest.param("eager", marks=pytest.mark.cpu, id="eager")])
-# commenting v1 since we don't want this test to run with v1 marker yet
-# @pytest.mark.v1
+@pytest.mark.parametrize("backend", get_spyre_backend_list())
 @pytest.mark.parametrize("prompts", [[
     template.format("Provide a list of instructions "
                     "for preparing chicken soup."),
@@ -47,35 +38,6 @@
         "how do I add multiple new columns in m for power query or power bi?"),
     template.format("Convert char to string in Java."),
 ]])
-=======
-@pytest.mark.cb
-@pytest.mark.v1
-@pytest.mark.parametrize("max_num_seqs", [2, 3, 4],
-                         ids=lambda val: f"max_num_seqs({val})")
-@pytest.mark.parametrize("model", get_spyre_model_list())
-@pytest.mark.parametrize("backend", get_spyre_backend_list())
-@pytest.mark.parametrize(
-    "prompts",
-    [
-        [
-            "7 6 5 4",
-            "10 9 8 7",
-        ],
-        [
-            "7 6 5 4",
-            "10 9 8 7",
-            "8 7 6 5",
-        ],
-        [
-            "7 6 5 4",
-            "10 9 8 7",
-            "8 7 6 5",
-            "9 8 7 6",
-        ],
-    ],
-    ids=lambda val: f"num_prompts({len(val)})",
-)
->>>>>>> origin/main
 def test_cb_handling(
     model: str,
     backend: str,
@@ -107,7 +69,6 @@ def test_cb_handling(
         backend=backend,
         max_num_seqs=max_num_seqs,
         use_cb=True,
-        vllm_version="V1",  # CB runs in V1 only
         monkeypatch=monkeypatch)
 
     hf_results = generate_hf_output(model=model,
@@ -124,7 +85,6 @@ def test_cb_handling(
 
 
 @pytest.mark.cb
-# @pytest.mark.v1
 @pytest.mark.parametrize("max_num_seqs", [2])
 @pytest.mark.parametrize("model", get_spyre_model_list())
 @pytest.mark.parametrize(
@@ -149,18 +109,16 @@ def test_cb_max_tokens(
                                          logprobs=0)
 
     with pytest.raises(ValueError, match="max model context length"):
-        generate_spyre_vllm_output(
-            model=model,
-            prompts=overflow_prompt,
-            max_model_len=max_model_len,
-            block_size=max_model_len,
-            sampling_params=vllm_sampling_params,
-            tensor_parallel_size=1,
-            backend=backend,
-            max_num_seqs=max_num_seqs,
-            use_cb=True,
-            vllm_version="V1",  # CB runs in V1 only
-            monkeypatch=monkeypatch)
+        generate_spyre_vllm_output(model=model,
+                                   prompts=overflow_prompt,
+                                   max_model_len=max_model_len,
+                                   block_size=max_model_len,
+                                   sampling_params=vllm_sampling_params,
+                                   tensor_parallel_size=1,
+                                   backend=backend,
+                                   max_num_seqs=max_num_seqs,
+                                   use_cb=True,
+                                   monkeypatch=monkeypatch)
 
 
 def get_params_test_blocks_borders_aligned_prompts():
@@ -683,7 +641,6 @@ def augment_checked_steps(
 
 
 @pytest.mark.cb
-@pytest.mark.v1
 @pytest.mark.parametrize("model", get_spyre_model_list())
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
 @pytest.mark.parametrize("max_num_seqs", [2])

tests/e2e/test_spyre_embeddings.py

Lines changed: 2 additions & 4 deletions
@@ -20,10 +20,8 @@
                          [(64, 4), (64, 8), (128, 4),
                           (128, 8)])  # (prompt_length/batch_size)
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
-@pytest.mark.parametrize(
-    "vllm_version",
-    [pytest.param("V0", marks=pytest.mark.v0, id="v0")
-     ])  # TODO: Replace with VLLM_VERSIONS when v1 is supported.
+# TODO: Add it when v1 is supported.
+@pytest.mark.parametrize("vllm_version", ["V0"])
 def test_output(
     model: str,
     prompts: list[str],

tests/e2e/test_spyre_max_new_tokens.py

Lines changed: 1 addition & 4 deletions
@@ -4,7 +4,7 @@
 """
 
 import pytest
-from spyre_util import (VLLM_VERSIONS, compare_results, generate_hf_output,
+from spyre_util import (compare_results, generate_hf_output,
                         generate_spyre_vllm_output, get_spyre_backend_list,
                         get_spyre_model_list)
 from vllm import SamplingParams
@@ -27,14 +27,12 @@
 @pytest.mark.parametrize(
     "warmup_shape", [(64, 10, 4)])  # (prompt_length/new_tokens/batch_size)
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
-@pytest.mark.parametrize("vllm_version", VLLM_VERSIONS)
 def test_output(
     model: str,
     prompts: list[str],
     stop_last: bool,
     warmup_shape: tuple[int, int, int],
     backend: str,
-    vllm_version: str,
     monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     '''
@@ -88,7 +86,6 @@ def test_output(
         sampling_params=vllm_sampling_params,
         tensor_parallel_size=1,
         backend=backend,
-        vllm_version=vllm_version,
         monkeypatch=monkeypatch)
 
     hf_results = generate_hf_output(model=model,

tests/e2e/test_spyre_online.py

Lines changed: 3 additions & 11 deletions
@@ -1,17 +1,14 @@
 import openai
 import pytest
-from spyre_util import (VLLM_VERSIONS, get_spyre_backend_list,
-                        get_spyre_model_list)
+from spyre_util import get_spyre_backend_list, get_spyre_model_list
 
 
 @pytest.mark.parametrize("model", get_spyre_model_list())
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
 @pytest.mark.parametrize("warmup_shape", [[
     (64, 20, 4),
 ]])
-@pytest.mark.parametrize("vllm_version", VLLM_VERSIONS)
-def test_openai_serving(remote_openai_server, model, warmup_shape, backend,
-                        vllm_version):
+def test_openai_serving(remote_openai_server, model, warmup_shape, backend):
     """Test online serving using the `vllm serve` CLI"""
 
     client = remote_openai_server.get_client()
@@ -48,10 +45,6 @@ def test_openai_serving(remote_openai_server, model, warmup_shape, backend,
         completion = client.completions.create(model=model,
                                                prompt="Hello World!",
                                                max_tokens=25)
-        # V1 should raise
-        assert vllm_version == "V0"
-        assert len(completion.choices) == 1
-        assert len(completion.choices[0].text) == 0
     except openai.BadRequestError as e:
         assert "warmup" in str(e)
 
@@ -60,9 +53,8 @@ def test_openai_serving(remote_openai_server, model, warmup_shape, backend,
 @pytest.mark.parametrize("backend", ["sendnn"])
 @pytest.mark.parametrize("quantization", ["gptq"])
 @pytest.mark.parametrize("warmup_shape", [[(64, 20, 4)]])
-@pytest.mark.parametrize("vllm_version", VLLM_VERSIONS)
 def test_openai_serving_gptq(remote_openai_server, model, backend,
-                             warmup_shape, vllm_version, quantization):
+                             warmup_shape, quantization):
     """Test online serving a GPTQ model with the sendnn backend only"""
 
     client = remote_openai_server.get_client()

tests/e2e/test_spyre_online_multi.py

Lines changed: 2 additions & 4 deletions
@@ -1,6 +1,5 @@
 import pytest
-from spyre_util import (VLLM_VERSIONS, get_spyre_backend_list,
-                        get_spyre_model_list)
+from spyre_util import get_spyre_backend_list, get_spyre_model_list
 
 
 @pytest.mark.multi
@@ -11,9 +10,8 @@
 @pytest.mark.parametrize(
     "backend", [b for b in get_spyre_backend_list() if "eager" not in str(b)])
 @pytest.mark.parametrize("tensor_parallel_size", ["2", "4", "8"])
-@pytest.mark.parametrize("vllm_version", VLLM_VERSIONS)
 def test_openai_tp_serving(remote_openai_server, model, warmup_shape, backend,
-                           vllm_version, tensor_parallel_size):
+                           tensor_parallel_size):
     """Test online serving with tensor parallelism using the `vllm serve` CLI"""
 
     client = remote_openai_server.get_client()
