Merged

31 commits
27d5a25
final fms API
yannicks1 May 16, 2025
de1ccf5
Merge branch 'main' into ysc-final-fms-api
yannicks1 May 27, 2025
242284e
Merge branch 'main' into ysc-final-fms-api
yannicks1 Jun 3, 2025
1f3940d
Merge branch 'main' into ysc-final-fms-api
yannicks1 Jun 4, 2025
18b3ed1
clearer separation of attention kwargs and explicitly naming attn_name
yannicks1 Jun 4, 2025
c69b208
Merge branch 'main' into ysc-final-fms-api
yannicks1 Jun 5, 2025
825df86
update cb test fms branch
yannicks1 Jun 5, 2025
1528140
merge decode batch size 2 (#215)
yannicks1 Jun 5, 2025
1d70fed
name change in fms
yannicks1 Jun 5, 2025
e7be2e9
fix import after name change
yannicks1 Jun 5, 2025
f63c361
:arrow_up: Update locked packages (#213)
joerunde Jun 5, 2025
5906e89
[CB] refactor left padding removal (#211)
yannicks1 Jun 5, 2025
d6b7735
fixed issue with warmup_context not capturing full generate (#219)
JRosenkranz Jun 6, 2025
5dce376
fix formatting
yannicks1 Jun 6, 2025
cd06756
apply to v0: fixed issue with warmup_context not capturing full generate
yannicks1 Jun 6, 2025
4fb1f42
:rewind: revert uv.lock to main branch changes
joerunde Jun 13, 2025
59fb0ff
:arrow_up: bump fms lower bound
joerunde Jun 13, 2025
c026cd2
:arrow_up: bump for vulnerability with httpcore
joerunde Jun 13, 2025
ddfa119
:arrow_up: bump for setuptools, tornado vulnerabilities
joerunde Jun 13, 2025
4da37ff
Merge branch 'main' into ysc-final-fms-api
joerunde Jun 13, 2025
e618501
:alembic: enable cb tests on main
joerunde Jun 13, 2025
14ede42
:art: fmt
joerunde Jun 13, 2025
90eb379
:zap: rollup utils tests to reduce number of jobs
joerunde Jun 13, 2025
8126efe
:fire: whoops, remove utils
joerunde Jun 13, 2025
50bdf30
Merge branch 'main' into ysc-final-fms-api
joerunde Jun 17, 2025
91be547
:bug: unmark num_blocks as dynamic for prefill
joerunde Jun 17, 2025
6203b9e
:alembic: try reverting graph changes
joerunde Jun 17, 2025
859ad9b
:bug: oops
joerunde Jun 17, 2025
9e7f017
:art: fmt
joerunde Jun 17, 2025
6ac36c5
remove obsolete comments
yannicks1 Jun 18, 2025
2a02d8c
set free blocks for warmup consistent with KV cache dimension
yannicks1 Jun 18, 2025
19 changes: 6 additions & 13 deletions .github/workflows/test.yml
@@ -43,17 +43,14 @@ jobs:
markers: "v0 and cpu and e2e"
flags: "--timeout=300"
- name: "V1-e2e"
markers: "v1 and cpu and e2e"
markers: "v1 and cpu and e2e and not cb"
flags: "--timeout=300 --forked"
- name: "V1-worker"
markers: "v1 and not e2e"
flags: "--timeout=300"
- name: "utils"
markers: "utils"
flags: "--timeout=300"
- name: "cb"
markers: "cb"
- name: "V1-cb"
markers: "v1 and cpu and cb"
flags: "--timeout=300 --forked"
- name: "V1-worker and utils"
markers: "v1 and not e2e or utils"
flags: "--timeout=300"

name: "${{ matrix.test_suite.name }} (${{ matrix.vllm_version.name }})"

@@ -163,10 +160,6 @@ jobs:
# `uv run`, to avoid having `uv run` re-sync any dependencies or
# re-install the vllm_sypre package from source
source .venv/bin/activate
if [ ${{ matrix.test_suite.markers }} == "cb" ]; then
# install custom fms branch
uv pip install git+https://github.com/foundation-model-stack/foundation-model-stack@paged_attn_mock --force-reinstall
fi
# commands to run if condition is true
python3 -m pytest ${{ matrix.test_suite.flags }} \
tests -v -m "${{ matrix.test_suite.markers }}"
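
One note on the consolidated "V1-worker and utils" suite: in pytest "-m" expressions, "and" binds tighter than "or", so the filter "v1 and not e2e or utils" selects "(v1 and not e2e) or utils". Below is a minimal, self-contained sketch of how that expression drives test selection; the marker names are assumed to be registered in the project's pytest configuration.

# marker_selection_sketch.py: illustrates the marker expression used by the
# "V1-worker and utils" CI suite (markers assumed to be registered).
import pytest

@pytest.mark.v1
def test_worker_unit():
    # selected: matches "v1 and not e2e"
    assert True

@pytest.mark.v1
@pytest.mark.e2e
def test_worker_e2e():
    # excluded from this suite: carries the e2e marker
    assert True

@pytest.mark.utils
def test_util_helper():
    # selected via the "or utils" branch
    assert True

if __name__ == "__main__":
    # roughly equivalent to the workflow invocation:
    #   pytest --timeout=300 tests -v -m "v1 and not e2e or utils"
    pytest.main(["-v", "-m", "v1 and not e2e or utils", __file__])
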
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -12,7 +12,7 @@ readme = "README.md"
license = {text = "Apache 2"}
dependencies = [
"fms-model-optimizer>=0.2.0",
"ibm-fms==1.0.0",
"ibm-fms==1.1.0",
"vllm>=0.9.0,!=0.9.1",
]
requires-python = ">=3.9"
20 changes: 7 additions & 13 deletions tests/e2e/test_spyre_cb.py
@@ -9,7 +9,7 @@

import pytest
from spyre_util import (create_random_request, generate_cb_spyre_vllm_output,
get_spyre_model_list)
get_spyre_backend_list, get_spyre_model_list)
from vllm import EngineArgs, SamplingParams
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.core import EngineCore
@@ -18,16 +18,12 @@
from vllm_spyre.v1.core.scheduler import ContinuousBatchingSpyreScheduler


@pytest.mark.cb
@pytest.mark.v1
@pytest.mark.parametrize("max_num_seqs", [2, 3, 4],
ids=lambda val: f"max_num_seqs({val})")
@pytest.mark.parametrize("model", get_spyre_model_list())
@pytest.mark.parametrize(
"backend", [pytest.param("eager", marks=pytest.mark.cpu, id="eager")])
@pytest.mark.parametrize("cb",
[pytest.param(1, marks=pytest.mark.cb, id="cb")])
# commenting v1 since we don't want this test to run with v1 marker yet
# @pytest.mark.parametrize("vllm_version",
# [pytest.param("V1", marks=pytest.mark.v1, id="v1")])
@pytest.mark.parametrize("backend", get_spyre_backend_list())
@pytest.mark.parametrize(
"prompts",
[
@@ -53,9 +49,7 @@ def test_cb_handling(
model: str,
backend: str,
max_num_seqs: int,
cb: int,
prompts: list[str],
# vllm_version: str,
monkeypatch: pytest.MonkeyPatch,
):
"""Test that the spyre worker correctly handles
@@ -80,7 +74,7 @@
tensor_parallel_size=1,
backend=backend,
max_num_seqs=max_num_seqs,
use_cb=cb,
use_cb=1,
monkeypatch=monkeypatch,
)

@@ -654,9 +648,9 @@ def augment_checked_steps(


@pytest.mark.cb
@pytest.mark.v1
@pytest.mark.parametrize("model", get_spyre_model_list())
@pytest.mark.parametrize(
"backend", [pytest.param("eager", marks=pytest.mark.cpu, id="eager")])
@pytest.mark.parametrize("backend", get_spyre_backend_list())
@pytest.mark.parametrize("max_num_seqs", [2])
@pytest.mark.parametrize(
"seqs_max_tokens,prompts_lengths,steps_add_reqs,checked_steps,"
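
Switching the backend axis to get_spyre_backend_list() lets each backend entry carry its own markers instead of hard-coding the eager/cpu case. A hypothetical sketch of the shape such a helper could have follows; the real implementation lives in tests/spyre_util.py and may differ, and the backend names and the "spyre" marker used here are assumptions.

import pytest

def get_spyre_backend_list():
    # Hypothetical contents for illustration only.
    return [
        # CPU-capable eager backend: picked up by suites filtered with "... and cpu"
        pytest.param("eager", marks=pytest.mark.cpu, id="eager"),
        # Compiled backend for Spyre hardware: only selected on hardware pipelines
        pytest.param("sendnn", marks=pytest.mark.spyre, id="sendnn"),
    ]

With entries shaped like this, the CI filter "v1 and cpu and cb" still runs only the eager case, while a hardware pipeline can select the compiled backend without touching the test bodies.
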
49 changes: 25 additions & 24 deletions uv.lock

Some generated files are not rendered by default.

32 changes: 23 additions & 9 deletions vllm_spyre/model_executor/model_loader/spyre.py
@@ -314,13 +316,16 @@ def __init__(

# set num_blocks to the minimal value of 4 required for warmup
# is reset to the value returned by the Spyre compiler after warmup
self._set_past_key_value_states(num_blocks=4)
# self._set_past_key_value_states(num_blocks=4)
num_blocks = scheduler_config.max_num_seqs * max_model_len // BLOCK_SIZE
self._set_past_key_value_states(num_blocks=num_blocks)

# mark the num_blocks dimension dynamic for Spyre compiler for warmup
# only, compiler will return the number of blocks it can accommodate
for layer in self.past_key_value_states:
for tensor in layer:
torch._dynamo.mark_dynamic(tensor, 0)
# only, compiler will return the number of blocks it can accommodate.
# (This is not yet supported by the compiler)
# for layer in self.past_key_value_states:
# for tensor in layer:
# torch._dynamo.mark_dynamic(tensor, 0)

def _set_past_key_value_states(self, num_blocks) -> None:
# List[layers] of Tuple[k,v] of
@@ -353,17 +356,25 @@ def forward(
**extra_kwargs,
) -> torch.Tensor:

# import will be not be needed/ handled by FMS soon
import fms.utils.spyre.paged # noqa # pylint: disable=unused-import

# specify attention type for continuous batching
extra_kwargs['attn_name'] = "spyre_paged_attn"

# additional (paged) attention arguments
extra_kwargs['current_tkv_mask'] = current_tkv_mask
extra_kwargs['left_padded_prompt_mask'] = left_padded_prompt_mask
extra_kwargs['block_table'] = block_table
extra_kwargs['slot_mapping'] = slot_mapping

output = self.model(
input_ids,
position_ids=position_ids,
mask=mask,
past_key_value_states=self.past_key_value_states,
use_cache=use_cache,
only_last_token=only_last_token,
current_tkv_mask=current_tkv_mask,
left_padded_prompt_mask=left_padded_prompt_mask,
block_table=block_table,
slot_mapping=slot_mapping,
**extra_kwargs,
)
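
Passing attn_name through extra_kwargs means the attention variant is chosen by name on the FMS side rather than implied by which tensors are supplied. The following is a purely illustrative sketch of a name-based dispatch of this kind; it is not the actual FMS API (the real registration happens inside fms.utils.spyre.paged), and all names in it are assumptions.

from typing import Callable, Dict

import torch

# Illustrative registry keyed by attention name, e.g. "spyre_paged_attn"
# or "sdpa_bidirectional". Not the real FMS implementation.
ATTENTION_IMPLS: Dict[str, Callable[..., torch.Tensor]] = {}

def register_attention(name: str):
    def wrap(fn: Callable[..., torch.Tensor]):
        ATTENTION_IMPLS[name] = fn
        return fn
    return wrap

@register_attention("sdpa_bidirectional")
def sdpa_attention(q, k, v, **kwargs) -> torch.Tensor:
    # dense bidirectional attention, as used for static batching
    return torch.nn.functional.scaled_dot_product_attention(q, k, v)

@register_attention("spyre_paged_attn")
def paged_attention(q, k, v, *, block_table=None, slot_mapping=None, **kwargs) -> torch.Tensor:
    # a real implementation would gather KV-cache blocks via block_table and
    # slot_mapping; this stub just falls back to dense attention
    return torch.nn.functional.scaled_dot_product_attention(q, k, v)

def run_attention(q, k, v, *, attn_name: str, **extra_kwargs) -> torch.Tensor:
    return ATTENTION_IMPLS[attn_name](q, k, v, **extra_kwargs)
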

@@ -401,6 +412,9 @@ def forward(
**extra_kwargs,
) -> torch.Tensor:

# specify attention type for static batching
extra_kwargs['attn_name'] = "sdpa_bidirectional"

output = self.model(
input_ids,
position_ids=position_ids,
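Because the compiler cannot yet treat the num_blocks dimension as dynamic, the warmup KV cache is sized up front from the scheduler and model configuration, and the model runner diff below applies the same formula to its free-block accounting. A worked example with assumed values (illustrative numbers, not the defaults of any particular model):

# num_blocks sizing sketch; the values below are assumptions for illustration.
max_num_seqs = 4       # scheduler_config.max_num_seqs
max_model_len = 2048   # model_config.max_model_len
BLOCK_SIZE = 64        # tokens held by one KV-cache block (assumed)

# Worst case: every schedulable sequence grows to the full model length,
# so each needs max_model_len // BLOCK_SIZE blocks.
num_blocks = max_num_seqs * max_model_len // BLOCK_SIZE
assert num_blocks == 128  # 4 * 2048 // 64
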
8 changes: 7 additions & 1 deletion vllm_spyre/v1/worker/spyre_model_runner.py
@@ -578,7 +578,13 @@ def __init__(
self.tkv: int = 0
# set self.free_blocks to the minimal value of 4 required for warmup
# is reset to the value returned by the Spyre compiler after warmup
self._set_free_blocks(num_blocks=4)
# self._set_free_blocks(num_blocks=4)
# for the time being we set this to num_blocks consistent with the
# cache dimension of ContinuousBatchingFmsModel.past_key_value_states
num_blocks = (vllm_config.scheduler_config.max_num_seqs *
vllm_config.model_config.max_model_len //
self.BLOCK_SIZE)
self._set_free_blocks(num_blocks=num_blocks)
self.dummy_req_ids2blocks: list[int] = []

# TODO: Remove this once we can prefill and decode
Expand Down
Loading