
Commit a6b8e25

joerunde authored and rafvasq committed
🐛 fix batch handling in V1 runner (#33)
* 🐛 fix batch handling in V1 runner
* ⚗️ try v1 test only
* ⚗️ add a bit more prompt
* ⚗️ unclear why CI won't count to 0
* ♻️ rename map_output_indices

---------

Signed-off-by: Joe Runde <[email protected]>
1 parent 4a48c7b commit a6b8e25

File tree: 4 files changed, +88 −3 lines changed

tests/test_spyre_basic.py

Lines changed: 49 additions & 1 deletion
@@ -72,4 +72,52 @@ def test_output(
                     tensor_parallel_size=1,
                     backend=backend,
                     vllm_results=vllm_results,
-                    hf_results=hf_results)
+                    hf_results=hf_results)
+
+
+@pytest.mark.parametrize("model", get_spyre_model_list())
+@pytest.mark.parametrize("backend", get_spyre_backend_list())
+@pytest.mark.parametrize("vllm_version", ["V0", "V1"])
+def test_batch_handling(
+    model: str,
+    backend: str,
+    vllm_version: str,
+):
+    """Test that the spyre worker correctly handles batches of requests that
+    finish after different numbers of forward passes"""
+
+    # Test with batch size 4
+    warmup_shape = (64, 20, 4)
+
+    # Have the model count down to one and stop
+    vllm_sampling_params = SamplingParams(max_tokens=20,
+                                          temperature=0,
+                                          stop="1",
+                                          logprobs=0)
+    # Importantly, these prompts are ordered so that they don't finish in the
+    # order given
+    prompts = [
+        "7 6 5 4",
+        "10 9 8 7",
+        "8 7 6 5",
+        "9 8 7 6",
+    ]
+
+    # Ensure that both:
+    # - The model doesn't crash
+    # - The output sequences are correct
+    vllm_results = generate_spyre_vllm_output(
+        model=model,
+        prompts=prompts,
+        warmup_shapes=[warmup_shape],
+        max_model_len=2048,
+        block_size=2048,
+        sampling_params=vllm_sampling_params,
+        tensor_parallel_size=1,
+        backend=backend,
+        vllm_version=vllm_version)
+
+    assert vllm_results[0]["text"] == " 3 2 "
+    assert vllm_results[1]["text"] == " 6 5 4 3 2 "
+    assert vllm_results[2]["text"] == " 4 3 2 "
+    assert vllm_results[3]["text"] == " 5 4 3 2 "
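
Note that the expected strings follow mechanically from the prompts: the model continues the countdown, generation halts at the stop string "1", and vLLM strips the stop string from the returned text, leaving a trailing space. A small hypothetical helper (not part of the PR) that derives the expected outputs under those assumptions:

def expected_countdown(prompt: str) -> str:
    # Continue counting down from the last number in the prompt, stopping
    # before "1"; the stop string is stripped, leaving a trailing space.
    start = int(prompt.split()[-1]) - 1
    return "".join(f" {n}" for n in range(start, 1, -1)) + " "

assert expected_countdown("7 6 5 4") == " 3 2 "
assert expected_countdown("10 9 8 7") == " 6 5 4 3 2 "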

vllm_spyre/platform.py

Lines changed: 9 additions & 1 deletion
@@ -102,7 +102,15 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         # the scheduler always thinks there's a block available
         model_config.max_model_len = max_seq_len
         cache_config.block_size = model_config.max_model_len
-        cache_config.num_gpu_blocks_override = scheduler_config.max_num_seqs
+
+        if envs.VLLM_USE_V1:
+            # The V1 scheduler actually needs 2 blocks for each sequence...
+            cache_config.num_gpu_blocks_override = \
+                scheduler_config.max_num_seqs * 2
+        else:
+            cache_config.num_gpu_blocks_override = \
+                scheduler_config.max_num_seqs
+
         logger.info(
             "Overriding configurations based on warmup shapes. "
             "max_model_len=%d, max_num_seqs=%d, block_size=%d, "

vllm_spyre/v1/core/scheduler.py

Lines changed: 7 additions & 0 deletions
@@ -106,6 +106,13 @@ def schedule(self) -> "SchedulerOutput":
                 # can work with the batch we have
                 break

+            logger.debug(
+                "Scheduling a new batch of %d requests, holding back %d "
+                "requests", len(self.waiting), len(self.holdback_queue))
+        else:
+            logger.debug("Scheduling a running batch of %d requests",
+                         len(self.running))
+
         outputs = super().schedule()
         return outputs


vllm_spyre/v1/worker/spyre_model_runner.py

Lines changed: 23 additions & 1 deletion
@@ -366,7 +366,7 @@ def execute_model(

         model_output = ModelRunnerOutput(
             req_ids=list(self._req_ids2idx.keys()),
-            req_id_to_index=self._req_ids2idx,
+            req_id_to_index=self._get_unpadded_output_indices(),
             sampled_token_ids=output.sampled_token_ids.tolist(),
             spec_token_ids=None,
             logprobs=output.logprobs_tensors.tolists()
@@ -378,6 +378,28 @@ def execute_model(
         )
         return model_output

+    def _get_unpadded_output_indices(self) -> dict[str, int]:
+        """The inputs to the model are all padded to a constant batch size, and
+        self.req_id2idx is the map of request id -> padded index.
+        However, finished requests and padded requests are stripped from the
+        output, so the mapping of request id -> unpadded output index needs to
+        be created to be returned in `ModelRunnerOutput`.
+
+        For example if:
+        - self.model.indices = [F, T, T, F]
+        - self.req_ids2ix = {"A": 0, "B": 1, "C": 2, "D": 3}
+        This will output: {"B": 0, "C": 1}
+        """
+        remapped_indices = {}
+        for req_id, idx in self._req_ids2idx.items():
+            if self.model.indices[idx]:
+                # Sum up all the requests to the left of this one that are still
+                # processing. That should be this requests' index in the output
+                # tensor.
+                remapped_indices[req_id] = self.model.indices[0:idx].sum(
+                ).item()
+        return remapped_indices
+
     def _prepare_pad_input_ids(
         self,
         input_ids_list: List[torch.Tensor],
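
The docstring's example can be reproduced with a standalone sketch of the same idea (names here are illustrative, not the runner's actual attributes): an active request's position in the stripped output is simply the count of still-active slots to its left in the padded batch.

import torch

def unpadded_output_indices(req_ids2idx: dict[str, int],
                            active: torch.Tensor) -> dict[str, int]:
    # `active` marks which padded batch slots are still generating; counting
    # the active slots to the left of each active request gives its index in
    # the stripped output.
    return {
        req_id: int(active[:idx].sum().item())
        for req_id, idx in req_ids2idx.items() if active[idx]
    }

active = torch.tensor([False, True, True, False])
assert unpadded_output_indices({"A": 0, "B": 1, "C": 2, "D": 3},
                               active) == {"B": 0, "C": 1}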
