
Commit 0ceea20

🐛 fixed static batch warmup (#246)
# Description

This changes the warmup for static batching back to how it originally was, only warming up a single pass. This fixes a bug where the compiled model graphs were incorrect: we would invoke the model with one batch size and the model would output a different batch size.

Signed-off-by: Joe Runde <[email protected]>
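Structurally, the change moves the warmup context from around the whole fixed-size warmup (which runs two forward passes) to around just the first pass. Below is a minimal, self-contained sketch of that before/after shape; the helpers are placeholder stand-ins, not the real vllm_spyre code (only the names `_maybe_warmup_context` and the two-pass structure come from the diff below).

```python
from contextlib import contextmanager

# Illustrative stand-ins only; the real _maybe_warmup_context and warmup
# helpers live in vllm_spyre/v1/worker/spyre_worker.py and do more than this.

@contextmanager
def _maybe_warmup_context():
    # Placeholder for whatever backend warmup mode the real helper enables.
    yield

def _forward_pass(label: str) -> None:
    print(f"warmup forward pass {label}")

def warmup_before_fix() -> None:
    # Before this commit: both warmup forward passes ran inside the
    # warmup context.
    with _maybe_warmup_context():
        _forward_pass("1/2")
        _forward_pass("2/2")

def warmup_after_fix() -> None:
    # After this commit: only the first forward pass runs inside the
    # warmup context; the second pass executes normally.
    with _maybe_warmup_context():
        _forward_pass("1/2")
    _forward_pass("2/2")
```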
1 parent 9511982 commit 0ceea20

File tree

1 file changed, +6 -6 lines changed


vllm_spyre/v1/worker/spyre_worker.py

Lines changed: 6 additions & 6 deletions
```diff
@@ -90,10 +90,8 @@ def compile_or_warm_up_model(self) -> None:
             logger.info(
                 "Warming up for prompt length %d, decoding %d tokens with "
                 "batch size %d", prompt_len, num_decode_tokens, batch_size)
-            with _maybe_warmup_context():
-                self._warmup_spyre_fixed_size(prompt_len, num_decode_tokens,
-                                              self.restricted_tokens,
-                                              batch_size)
+            self._warmup_spyre_fixed_size(prompt_len, num_decode_tokens,
+                                          self.restricted_tokens, batch_size)
         all_warmup_end_t = time.time()
         all_warmup_total_t = all_warmup_end_t - all_warmup_start_t
         self.perf_metrics.log("total warmup time", all_warmup_total_t)
@@ -537,8 +535,10 @@ def _warmup_spyre_fixed_size(self, prompt_len, num_decode_tokens,
 
         # First full forward pass
         logger.info("Warmup forward pass 1/2...")
-        self._warmup_model_forward_pass(scheduler_output, dummy_requests,
-                                        cached_requests, num_decode_tokens)
+        # The fixed size warmup needs to happen only in here
+        with _maybe_warmup_context():
+            self._warmup_model_forward_pass(scheduler_output, dummy_requests,
+                                            cached_requests, num_decode_tokens)
         self.perf_metrics.log("warmup 1 time",
                               time.time() - warmup_start_t,
                               batch_size=batch_size,
```
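The fix hinges on `_maybe_warmup_context()`, whose implementation is not part of this diff. Purely as an assumption, helpers with this naming convention are often conditional context managers that enter a backend-specific warmup mode only when it applies and otherwise act as a no-op; the sketch below illustrates that generic pattern, not the actual vllm_spyre implementation.

```python
from contextlib import contextmanager, nullcontext

# Hypothetical sketch of the "maybe" context-manager pattern; the actual
# _maybe_warmup_context() in vllm_spyre is not shown in this diff and may
# differ.
def maybe_warmup_context(enable_warmup_mode: bool):
    if enable_warmup_mode:
        return _warmup_mode()
    # No-op context manager when warmup mode does not apply.
    return nullcontext()

@contextmanager
def _warmup_mode():
    print("entering warmup mode")
    try:
        yield
    finally:
        print("exiting warmup mode")

# Usage: only the code under the `with` statement runs in warmup mode.
with maybe_warmup_context(enable_warmup_mode=True):
    print("first warmup forward pass")
```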
