12 changes: 9 additions & 3 deletions tests/e2e/test_spyre_basic.py
@@ -40,6 +40,7 @@ def test_output(
warmup_shape: tuple[int, int, int],
backend: str,
vllm_version: str,
monkeypatch: pytest.MonkeyPatch,
) -> None:
'''
The warmup is based on a single shape. After the warmup,
@@ -71,7 +72,8 @@ def test_output(
sampling_params=vllm_sampling_params,
tensor_parallel_size=1,
backend=backend,
vllm_version=vllm_version)
vllm_version=vllm_version,
monkeypatch=monkeypatch)

hf_results = generate_hf_output(model=model,
prompts=prompts,
@@ -101,6 +103,7 @@ def test_output_sendnn_decoder(
warmup_shape: tuple[int, int, int],
backend: str,
vllm_version: str,
monkeypatch: pytest.MonkeyPatch,
) -> None:
'''
Tests the deprecated sendnn_decoder backend, which should fall-back to
@@ -124,7 +127,8 @@ def test_output_sendnn_decoder(
sampling_params=vllm_sampling_params,
tensor_parallel_size=1,
backend=backend,
vllm_version=vllm_version)
vllm_version=vllm_version,
monkeypatch=monkeypatch)

hf_results = generate_hf_output(model=model,
prompts=prompts,
@@ -146,6 +150,7 @@ def test_batch_handling(
model: str,
backend: str,
vllm_version: str,
monkeypatch: pytest.MonkeyPatch,
):
"""Test that the spyre worker correctly handles batches of requests that
finish after different numbers of forward passes"""
@@ -179,7 +184,8 @@ def test_batch_handling(
sampling_params=vllm_sampling_params,
tensor_parallel_size=1,
backend=backend,
vllm_version=vllm_version)
vllm_version=vllm_version,
monkeypatch=monkeypatch)

assert vllm_results[0]["text"] == " 3 2 "
assert vllm_results[1]["text"] == " 6 5 4 3 2 "
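Across this file the change is mechanical: each test signature gains a monkeypatch: pytest.MonkeyPatch parameter and forwards it to generate_spyre_vllm_output. Below is a minimal sketch of what the updated helper in tests/spyre_util.py presumably does with it; the simplified signature and the environment variable names (VLLM_SPYRE_DYNAMO_BACKEND, VLLM_USE_V1) are assumptions for illustration, not taken from this diff.

import pytest
from vllm import LLM

def generate_spyre_vllm_output(model, prompts, sampling_params,
                               tensor_parallel_size, backend, vllm_version,
                               monkeypatch: pytest.MonkeyPatch, **kwargs):
    # Setting env vars through monkeypatch (rather than os.environ) means
    # they are reverted at test teardown, so one test's backend/engine
    # selection cannot leak into later tests in the session.
    monkeypatch.setenv("VLLM_SPYRE_DYNAMO_BACKEND", backend)  # assumed name
    monkeypatch.setenv("VLLM_USE_V1", "1" if vllm_version == "V1" else "0")
    llm = LLM(model=model,
              max_model_len=kwargs.get("max_model_len", 2048),
              tensor_parallel_size=tensor_parallel_size)
    outputs = llm.generate(prompts, sampling_params)
    return [{"text": out.outputs[0].text} for out in outputs]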
75 changes: 55 additions & 20 deletions tests/e2e/test_spyre_cb.py
@@ -8,16 +8,46 @@
from typing import Any

import pytest
<<<<<<< HEAD

Check failure on line 11 (GitHub Actions / lint-code (3.12), Ruff): tests/e2e/test_spyre_cb.py:11: SyntaxError: Expected a statement (reported at columns 1, 3, 5, and 7)
from spyre_util import (compare_results, create_random_request,
generate_hf_output, generate_spyre_vllm_output,
get_spyre_model_list)
=======

Check failure on line 15 (GitHub Actions / lint-code (3.12), Ruff): tests/e2e/test_spyre_cb.py:15: SyntaxError: Expected a statement (reported at columns 1, 3, 5, and 7)
from spyre_util import (create_random_request, generate_cb_spyre_vllm_output,

Check failure on line 16 (GitHub Actions / lint-code (3.12), Ruff): tests/e2e/test_spyre_cb.py:15:8: SyntaxError: Expected a statement
get_spyre_backend_list, get_spyre_model_list)
>>>>>>> origin/main

Check failure on line 18 (GitHub Actions / lint-code (3.12), Ruff): tests/e2e/test_spyre_cb.py:18:1: SyntaxError: Expected a statement
from vllm import EngineArgs, SamplingParams
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.core import EngineCore
from vllm.v1.executor.abstract import Executor

from vllm_spyre.v1.core.scheduler import ContinuousBatchingSpyreScheduler

template = (
"Below is an instruction that describes a task. Write a response that "
"appropriately completes the request. Be polite in your response to the "
"user.\n\n### Instruction:\n{}\n\n### Response:")

<<<<<<< HEAD

@pytest.mark.cb
@pytest.mark.parametrize("max_num_seqs", [2, 3, 4],
ids=lambda val: f"max_num_seqs({val})")
@pytest.mark.parametrize("model", get_spyre_model_list())
@pytest.mark.parametrize(
"backend", [pytest.param("eager", marks=pytest.mark.cpu, id="eager")])
# commenting v1 since we don't want this test to run with v1 marker yet
# @pytest.mark.v1
@pytest.mark.parametrize("prompts", [[
template.format("Provide a list of instructions "
"for preparing chicken soup."),
template.format("Provide me a list of things that I can do with my "
"new found wealth."),
template.format(
"how do I add multiple new columns in m for power query or power bi?"),
template.format("Convert char to string in Java."),
]])
=======
@pytest.mark.cb
@pytest.mark.v1
@pytest.mark.parametrize("max_num_seqs", [2, 3, 4],
@@ -45,6 +75,7 @@
],
ids=lambda val: f"num_prompts({len(val)})",
)
>>>>>>> origin/main
def test_cb_handling(
model: str,
backend: str,
@@ -56,16 +87,17 @@
continuous batches of requests that
finish after different numbers of forward passes"""

vllm_sampling_params = SamplingParams(max_tokens=20,
max_tokens = 20

vllm_sampling_params = SamplingParams(max_tokens=max_tokens,
temperature=0,
stop="1",
ignore_eos=True,
logprobs=0)

# Ensure that both:
# - The model doesn't crash
# - The output sequences are correct
vllm_results = generate_cb_spyre_vllm_output(
vllm_results = generate_spyre_vllm_output(
model=model,
prompts=prompts,
max_model_len=2048,
@@ -74,30 +106,33 @@
tensor_parallel_size=1,
backend=backend,
max_num_seqs=max_num_seqs,
use_cb=1,
monkeypatch=monkeypatch,
)
use_cb=True,
vllm_version="V1", # CB runs in V1 only
monkeypatch=monkeypatch)

hf_results = generate_hf_output(model=model,
prompts=prompts,
max_new_tokens=max_tokens)

for i, prompt in enumerate(prompts):
assert (vllm_results[i]["text"] == [
" " + " ".join(
str(i)
for i in range(int(prompt.split()[-1]) - 1, 1, -1)) + " "
][0])
compare_results(model=model,
prompts=prompts,
warmup_shapes=[],
tensor_parallel_size=1,
backend=backend,
vllm_results=vllm_results,
hf_results=hf_results)


@pytest.mark.cb
# @pytest.mark.v1
@pytest.mark.parametrize("max_num_seqs", [2])
@pytest.mark.parametrize("model", get_spyre_model_list())
@pytest.mark.parametrize(
"backend", [pytest.param("eager", marks=pytest.mark.cpu, id="eager")])
@pytest.mark.parametrize("cb",
[pytest.param(1, marks=pytest.mark.cb, id="cb")])
# @pytest.mark.v1
def test_cb_max_tokens(
model: str,
backend: str,
max_num_seqs: int,
cb: int,
monkeypatch: pytest.MonkeyPatch,
):
"""Test that continuous batches of requests that
@@ -114,7 +149,7 @@
logprobs=0)

with pytest.raises(ValueError, match="max model context length"):
generate_cb_spyre_vllm_output(
generate_spyre_vllm_output(
model=model,
prompts=overflow_prompt,
max_model_len=max_model_len,
@@ -123,9 +158,9 @@
tensor_parallel_size=1,
backend=backend,
max_num_seqs=max_num_seqs,
use_cb=cb,
monkeypatch=monkeypatch,
)
use_cb=True,
vllm_version="V1", # CB runs in V1 only
monkeypatch=monkeypatch)


def get_params_test_blocks_borders_aligned_prompts():
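Once the conflict above is resolved, the net change in this file is that the dedicated generate_cb_spyre_vllm_output helper disappears: continuous-batching tests call the shared generate_spyre_vllm_output with use_cb=True and vllm_version="V1" (continuous batching runs only on the V1 engine), and test_cb_handling validates against HuggingFace reference output via compare_results instead of hand-rolled string assertions. A sketch of how the flag might be honored inside the helper; the branch and the env var name are assumptions for illustration, not taken from this diff:

# Hypothetical wiring inside generate_spyre_vllm_output:
if use_cb:
    # Continuous batching is V1-only, so guard against mismatched arguments.
    assert vllm_version == "V1", "use_cb=True requires vllm_version='V1'"
    monkeypatch.setenv("VLLM_SPYRE_USE_CB", "1")  # assumed variable name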
4 changes: 3 additions & 1 deletion tests/e2e/test_spyre_max_new_tokens.py
@@ -35,6 +35,7 @@ def test_output(
warmup_shape: tuple[int, int, int],
backend: str,
vllm_version: str,
monkeypatch: pytest.MonkeyPatch,
) -> None:
'''
The warmup is based on a single shape. After the warmup,
@@ -87,7 +88,8 @@ def test_output(
sampling_params=vllm_sampling_params,
tensor_parallel_size=1,
backend=backend,
vllm_version=vllm_version)
vllm_version=vllm_version,
monkeypatch=monkeypatch)

hf_results = generate_hf_output(model=model,
prompts=prompts,
4 changes: 3 additions & 1 deletion tests/e2e/test_spyre_seed.py
@@ -31,6 +31,7 @@ def test_seed(
warmup_shape: tuple[int, int, int],
backend: str,
vllm_version: str,
monkeypatch: pytest.MonkeyPatch,
) -> None:
'''
The warmup is based on a single shape. After the warmup,
@@ -60,7 +61,8 @@ def test_seed(
sampling_params=vllm_sampling_params,
tensor_parallel_size=1,
backend=backend,
vllm_version=vllm_version)
vllm_version=vllm_version,
monkeypatch=monkeypatch)

# compare all generated outputs against the first generated output
for vllm_result in vllm_results:
4 changes: 3 additions & 1 deletion tests/e2e/test_spyre_tensor_parallel.py
@@ -32,6 +32,7 @@ def test_output(
tp_size: int,
backend: str,
vllm_version: str,
monkeypatch: pytest.MonkeyPatch,
) -> None:
'''
The warmup is based on one or multiple shapes. After the warmup,
@@ -65,7 +66,8 @@ def test_output(
sampling_params=vllm_sampling_params,
tensor_parallel_size=tp_size,
backend=backend,
vllm_version=vllm_version)
vllm_version=vllm_version,
monkeypatch=monkeypatch)

hf_results = generate_hf_output(model=model,
prompts=prompts,
8 changes: 6 additions & 2 deletions tests/e2e/test_spyre_warmup_shapes.py
@@ -34,6 +34,7 @@ def test_output(
warmup_shapes: list[tuple[int, int, int]],
backend: str,
vllm_version: str,
monkeypatch: pytest.MonkeyPatch,
) -> None:
'''
The warmup is based on two shapes, that 'overlap' each
@@ -72,7 +73,8 @@ def test_output(
sampling_params=vllm_sampling_params,
tensor_parallel_size=1,
backend=backend,
vllm_version=vllm_version)
vllm_version=vllm_version,
monkeypatch=monkeypatch)

hf_results = generate_hf_output(model=model,
prompts=prompts,
@@ -99,6 +101,7 @@ def test_invalid_prompt_len(
warmup_shapes: list[tuple[int, int, int]],
backend: str,
vllm_version: str,
monkeypatch: pytest.MonkeyPatch,
) -> None:
'''
Expects an error to be raised if the warmup prompt length
@@ -119,4 +122,5 @@
sampling_params=vllm_sampling_params,
tensor_parallel_size=1,
backend=backend,
vllm_version=vllm_version)
vllm_version=vllm_version,
monkeypatch=monkeypatch)
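The remaining files (test_spyre_max_new_tokens.py, test_spyre_seed.py, test_spyre_tensor_parallel.py, test_spyre_warmup_shapes.py) receive the same mechanical change: each test signature gains monkeypatch: pytest.MonkeyPatch and forwards it to the helper. The isolation this buys is standard pytest behavior, sketched below with an assumed variable name; nothing here is specific to this repository.

import os
import pytest

def test_env_change_is_scoped(monkeypatch: pytest.MonkeyPatch):
    # monkeypatch.setenv is undone automatically at teardown, so backend
    # settings chosen by one e2e test cannot leak into the next.
    monkeypatch.setenv("VLLM_SPYRE_DYNAMO_BACKEND", "eager")  # assumed name
    assert os.environ["VLLM_SPYRE_DYNAMO_BACKEND"] == "eager"

def test_env_restored_afterwards():
    # Runs after the test above; the variable is gone again (assuming it
    # was not already set in the outer environment).
    assert "VLLM_SPYRE_DYNAMO_BACKEND" not in os.environ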