20 changes: 15 additions & 5 deletions tests/e2e/test_spyre_basic.py
@@ -33,8 +33,13 @@
"warmup_shape", [(64, 20, 4), (64, 20, 8), (128, 20, 4),
(128, 20, 8)]) # (prompt_length/new_tokens/batch_size)
@pytest.mark.parametrize("backend", get_spyre_backend_list())
def test_output(model: str, prompts: list[str],
warmup_shape: tuple[int, int, int], backend: str) -> None:
def test_output(
model: str,
prompts: list[str],
warmup_shape: tuple[int, int, int],
backend: str,
monkeypatch: pytest.MonkeyPatch,
) -> None:
'''
The warmup is based on a single shape. After the warmup,
one request with the provided prompts is input to vLLM.
@@ -64,7 +69,8 @@ def test_output(model: str, prompts: list[str],
block_size=2048,
sampling_params=vllm_sampling_params,
tensor_parallel_size=1,
backend=backend)
backend=backend,
monkeypatch=monkeypatch)

hf_results = generate_hf_output(model=model,
prompts=prompts,
@@ -92,6 +98,7 @@ def test_output_sendnn_decoder(
prompts: list[str],
warmup_shape: tuple[int, int, int],
backend: str,
monkeypatch: pytest.MonkeyPatch,
) -> None:
'''
Tests the deprecated sendnn_decoder backend, which should fall-back to
@@ -114,7 +121,8 @@ def test_output_sendnn_decoder(
block_size=2048,
sampling_params=vllm_sampling_params,
tensor_parallel_size=1,
backend=backend)
backend=backend,
monkeypatch=monkeypatch)

hf_results = generate_hf_output(model=model,
prompts=prompts,
@@ -134,6 +142,7 @@ def test_output_sendnn_decoder(
def test_batch_handling(
model: str,
backend: str,
monkeypatch: pytest.MonkeyPatch,
):
"""Test that the spyre worker correctly handles batches of requests that
finish after different numbers of forward passes"""
@@ -166,7 +175,8 @@ def test_batch_handling(
block_size=2048,
sampling_params=vllm_sampling_params,
tensor_parallel_size=1,
backend=backend)
backend=backend,
monkeypatch=monkeypatch)

assert vllm_results[0]["text"] == " 3 2 "
assert vllm_results[1]["text"] == " 6 5 4 3 2 "
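The common thread in this file is that each test now takes pytest's monkeypatch fixture and forwards it to generate_spyre_vllm_output. A minimal sketch of how such a helper could consume the fixture is below, assuming the backend is selected through an environment variable (the variable name and the helper body are illustrative, not the repository's actual implementation):

from vllm import LLM

def generate_spyre_vllm_output(model, prompts, max_model_len, block_size,
                               sampling_params, tensor_parallel_size, backend,
                               monkeypatch, **kwargs):
    # Assumed: the Spyre backend is chosen via an env var. Setting it through
    # monkeypatch scopes the change to this test and undoes it at teardown.
    monkeypatch.setenv("VLLM_SPYRE_DYNAMO_BACKEND", backend)
    llm = LLM(model=model,
              max_model_len=max_model_len,
              block_size=block_size,
              tensor_parallel_size=tensor_parallel_size)
    outputs = llm.generate(prompts, sampling_params)
    return [{"text": out.outputs[0].text} for out in outputs]

Passing the fixture in explicitly, rather than mutating os.environ inside the helper, is what keeps one parametrized backend choice from leaking into the next test.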
93 changes: 44 additions & 49 deletions tests/e2e/test_spyre_cb.py
@@ -8,7 +8,8 @@
from typing import Any

import pytest
from spyre_util import (create_random_request, generate_cb_spyre_vllm_output,
from spyre_util import (compare_results, create_random_request,
generate_hf_output, generate_spyre_vllm_output,
get_spyre_backend_list, get_spyre_model_list)
from vllm import EngineArgs, SamplingParams
from vllm.v1.engine import EngineCoreRequest
@@ -17,33 +18,26 @@

from vllm_spyre.v1.core.scheduler import ContinuousBatchingSpyreScheduler

template = (
"Below is an instruction that describes a task. Write a response that "
"appropriately completes the request. Be polite in your response to the "
"user.\n\n### Instruction:\n{}\n\n### Response:")


@pytest.mark.cb
@pytest.mark.parametrize("max_num_seqs", [2, 3, 4],
ids=lambda val: f"max_num_seqs({val})")
@pytest.mark.parametrize("model", get_spyre_model_list())
@pytest.mark.parametrize("backend", get_spyre_backend_list())
@pytest.mark.parametrize(
"prompts",
[
[
"7 6 5 4",
"10 9 8 7",
],
[
"7 6 5 4",
"10 9 8 7",
"8 7 6 5",
],
[
"7 6 5 4",
"10 9 8 7",
"8 7 6 5",
"9 8 7 6",
],
],
ids=lambda val: f"num_prompts({len(val)})",
)
@pytest.mark.parametrize("prompts", [[
template.format("Provide a list of instructions "
"for preparing chicken soup."),
template.format("Provide me a list of things that I can do with my "
"new found wealth."),
template.format(
"how do I add multiple new columns in m for power query or power bi?"),
template.format("Convert char to string in Java."),
]])
def test_cb_handling(
model: str,
backend: str,
@@ -55,16 +49,17 @@ def test_cb_handling(
continuous batches of requests that
finish after different numbers of forward passes"""

vllm_sampling_params = SamplingParams(max_tokens=20,
max_tokens = 20

vllm_sampling_params = SamplingParams(max_tokens=max_tokens,
temperature=0,
stop="1",
ignore_eos=True,
logprobs=0)

# Ensure that both:
# - The model doesn't crash
# - The output sequences are correct
vllm_results = generate_cb_spyre_vllm_output(
vllm_results = generate_spyre_vllm_output(
model=model,
prompts=prompts,
max_model_len=2048,
@@ -73,29 +68,31 @@ def test_cb_handling(
tensor_parallel_size=1,
backend=backend,
max_num_seqs=max_num_seqs,
use_cb=1,
monkeypatch=monkeypatch,
)
use_cb=True,
monkeypatch=monkeypatch)

hf_results = generate_hf_output(model=model,
prompts=prompts,
max_new_tokens=max_tokens)

for i, prompt in enumerate(prompts):
assert (vllm_results[i]["text"] == [
" " + " ".join(
str(i)
for i in range(int(prompt.split()[-1]) - 1, 1, -1)) + " "
][0])
compare_results(model=model,
prompts=prompts,
warmup_shapes=[],
tensor_parallel_size=1,
backend=backend,
vllm_results=vllm_results,
hf_results=hf_results)


@pytest.mark.cb
@pytest.mark.parametrize("max_num_seqs", [2])
@pytest.mark.parametrize("model", get_spyre_model_list())
@pytest.mark.parametrize(
"backend", [pytest.param("eager", marks=pytest.mark.cpu, id="eager")])
@pytest.mark.parametrize("cb",
[pytest.param(1, marks=pytest.mark.cb, id="cb")])
def test_cb_max_tokens(
model: str,
backend: str,
max_num_seqs: int,
cb: int,
monkeypatch: pytest.MonkeyPatch,
):
"""Test that continuous batches of requests that
@@ -112,18 +109,16 @@ def test_cb_max_tokens(
logprobs=0)

with pytest.raises(ValueError, match="max model context length"):
generate_cb_spyre_vllm_output(
model=model,
prompts=overflow_prompt,
max_model_len=max_model_len,
block_size=max_model_len,
sampling_params=vllm_sampling_params,
tensor_parallel_size=1,
backend=backend,
max_num_seqs=max_num_seqs,
use_cb=cb,
monkeypatch=monkeypatch,
)
generate_spyre_vllm_output(model=model,
prompts=overflow_prompt,
max_model_len=max_model_len,
block_size=max_model_len,
sampling_params=vllm_sampling_params,
tensor_parallel_size=1,
backend=backend,
max_num_seqs=max_num_seqs,
use_cb=True,
monkeypatch=monkeypatch)


def get_params_test_blocks_borders_aligned_prompts():
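test_spyre_cb.py replaces the hard-coded countdown assertions with a comparison against a Hugging Face reference, via generate_hf_output and compare_results from spyre_util. A hedged sketch of what such a reference helper might look like (greedy decoding with transformers; not necessarily how spyre_util implements it):

from transformers import AutoModelForCausalLM, AutoTokenizer

def generate_hf_output(model, prompts, max_new_tokens):
    tokenizer = AutoTokenizer.from_pretrained(model)
    hf_model = AutoModelForCausalLM.from_pretrained(model)
    results = []
    for prompt in prompts:
        inputs = tokenizer(prompt, return_tensors="pt")
        generated = hf_model.generate(**inputs,
                                      max_new_tokens=max_new_tokens,
                                      do_sample=False)
        # Drop the prompt tokens so only the completion is compared.
        completion = generated[0][inputs["input_ids"].shape[1]:]
        results.append(
            {"text": tokenizer.decode(completion, skip_special_tokens=True)})
    return results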
4 changes: 3 additions & 1 deletion tests/e2e/test_spyre_max_new_tokens.py
@@ -33,6 +33,7 @@ def test_output(
stop_last: bool,
warmup_shape: tuple[int, int, int],
backend: str,
monkeypatch: pytest.MonkeyPatch,
) -> None:
'''
The warmup is based on a single shape. After the warmup,
@@ -84,7 +85,8 @@ def test_output(
block_size=2048,
sampling_params=vllm_sampling_params,
tensor_parallel_size=1,
backend=backend)
backend=backend,
monkeypatch=monkeypatch)

hf_results = generate_hf_output(model=model,
prompts=prompts,
4 changes: 3 additions & 1 deletion tests/e2e/test_spyre_seed.py
@@ -29,6 +29,7 @@ def test_seed(
seed: int,
warmup_shape: tuple[int, int, int],
backend: str,
monkeypatch: pytest.MonkeyPatch,
) -> None:
'''
The warmup is based on a single shape. After the warmup,
@@ -57,7 +58,8 @@ def test_seed(
block_size=2048,
sampling_params=vllm_sampling_params,
tensor_parallel_size=1,
backend=backend)
backend=backend,
monkeypatch=monkeypatch)

# compare all generated outputs against the first generated output
for vllm_result in vllm_results:
4 changes: 3 additions & 1 deletion tests/e2e/test_spyre_tensor_parallel.py
@@ -30,6 +30,7 @@ def test_output(
warmup_shapes: list[tuple[int, int, int]],
tp_size: int,
backend: str,
monkeypatch: pytest.MonkeyPatch,
) -> None:
'''
The warmup is based on one or multiple shapes. After the warmup,
@@ -62,7 +63,8 @@ def test_output(
block_size=2048,
sampling_params=vllm_sampling_params,
tensor_parallel_size=tp_size,
backend=backend)
backend=backend,
monkeypatch=monkeypatch)

hf_results = generate_hf_output(model=model,
prompts=prompts,
8 changes: 6 additions & 2 deletions tests/e2e/test_spyre_warmup_shapes.py
@@ -31,6 +31,7 @@ def test_output(
prompts: list[str],
warmup_shapes: list[tuple[int, int, int]],
backend: str,
monkeypatch: pytest.MonkeyPatch,
) -> None:
'''
The warmup is based on two shapes, that 'overlap' each
@@ -68,7 +69,8 @@ def test_output(
block_size=2048,
sampling_params=vllm_sampling_params,
tensor_parallel_size=1,
backend=backend)
backend=backend,
monkeypatch=monkeypatch)

hf_results = generate_hf_output(model=model,
prompts=prompts,
@@ -92,6 +94,7 @@ def test_invalid_prompt_len(
prompts: list[str],
warmup_shapes: list[tuple[int, int, int]],
backend: str,
monkeypatch: pytest.MonkeyPatch,
) -> None:
'''
Expects an error to be raised if the warmup prompt length
@@ -111,4 +114,5 @@
block_size=64,
sampling_params=vllm_sampling_params,
tensor_parallel_size=1,
backend=backend)
backend=backend,
monkeypatch=monkeypatch)