
Commit c0269a3

Don't allow warmup shapes that exceed the max sequence length (#185)
In V0, warmup shapes that result in sequence lengths longer than the maximum sequence length that the model supports are not validated. When a request whose length falls between the two values comes in, it crashes the server:

```
WARNING 04-23 02:30:31 [scheduler.py:717] Input prompt (306 tokens) is too long and exceeds limit of 256
CRITICAL 04-23 02:30:31 [launcher.py:116] MQLLMEngine is already dead, terminating server process
INFO:     127.0.0.1:54294 - "POST /v1/embeddings HTTP/1.1" 500 Internal Server Error
ERROR 04-23 02:30:31 [engine.py:160] ValueError('Sampling parameters are missing for a CompletionRequest.')
ERROR 04-23 02:30:31 [engine.py:160] Traceback (most recent call last):
ERROR 04-23 02:30:31 [engine.py:160]   File "/opt/vllm/lib64/python3.11/site-packages/vllm/engine/multiprocessing/engine.py", line 158, in start
ERROR 04-23 02:30:31 [engine.py:160]     self.run_engine_loop()
ERROR 04-23 02:30:31 [engine.py:160]   File "/opt/vllm/lib64/python3.11/site-packages/vllm/engine/multiprocessing/engine.py", line 221, in run_engine_loop
ERROR 04-23 02:30:31 [engine.py:160]     request_outputs = self.engine_step()
ERROR 04-23 02:30:31 [engine.py:160]                       ^^^^^^^^^^^^^^^^^^
ERROR 04-23 02:30:31 [engine.py:160]   File "/opt/vllm/lib64/python3.11/site-packages/vllm/engine/multiprocessing/engine.py", line 247, in engine_step
ERROR 04-23 02:30:31 [engine.py:160]     raise e
ERROR 04-23 02:30:31 [engine.py:160]   File "/opt/vllm/lib64/python3.11/site-packages/vllm/engine/multiprocessing/engine.py", line 230, in engine_step
ERROR 04-23 02:30:31 [engine.py:160]     return self.engine.step()
ERROR 04-23 02:30:31 [engine.py:160]            ^^^^^^^^^^^^^^^^^^
ERROR 04-23 02:30:31 [engine.py:160]   File "/opt/vllm/lib64/python3.11/site-packages/vllm/engine/llm_engine.py", line 1493, in step
ERROR 04-23 02:30:31 [engine.py:160]     self._process_model_outputs(ctx=ctx)
ERROR 04-23 02:30:31 [engine.py:160]   File "/opt/vllm/lib64/python3.11/site-packages/vllm/engine/llm_engine.py", line 1220, in _process_model_outputs
ERROR 04-23 02:30:31 [engine.py:160]     request_output = RequestOutputFactory.create(
ERROR 04-23 02:30:31 [engine.py:160]                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 04-23 02:30:31 [engine.py:160]   File "/opt/vllm/lib64/python3.11/site-packages/vllm/outputs.py", line 392, in create
ERROR 04-23 02:30:31 [engine.py:160]     return RequestOutput.from_seq_group(seq_group, use_cache,
ERROR 04-23 02:30:31 [engine.py:160]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 04-23 02:30:31 [engine.py:160]   File "/opt/vllm/lib64/python3.11/site-packages/vllm/outputs.py", line 181, in from_seq_group
ERROR 04-23 02:30:31 [engine.py:160]     raise ValueError(
ERROR 04-23 02:30:31 [engine.py:160] ValueError: Sampling parameters are missing for a CompletionRequest.
INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [705]
```

---------

Signed-off-by: Max de Bayser <[email protected]>
Co-authored-by: Joe Runde <[email protected]>
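For context, the invariant this change enforces can be sketched in isolation. The helper below is hypothetical (not vllm-spyre's actual API); it mirrors the check the commit adds: a warmup shape admits prompts up to `prompt_length` tokens that may then generate up to `new_tokens` more, so the total must fit in the model's maximum sequence length.

```python
# Minimal sketch of the validation idea; validate_warmup_shapes is a
# hypothetical name introduced here for illustration only.

def validate_warmup_shapes(warmup_shapes: list[dict[str, int]],
                           max_model_len: int) -> None:
    """Fail fast at startup instead of crashing on a live request."""
    for shape in warmup_shapes:
        # Worst case: a full-length prompt that generates all new tokens.
        max_seq_len = shape["prompt_length"] + shape["new_tokens"]
        if max_seq_len > max_model_len:
            raise RuntimeError(
                f"Warmup shape [{shape['batch_size']}, "
                f"{shape['prompt_length']}, {shape['new_tokens']}] results "
                f"in a maximum sequence length of {max_seq_len}, which is "
                f"longer than what the model supports ({max_model_len})")

# Example: a model with a 256-token limit cannot serve a 288+64 warmup shape.
validate_warmup_shapes(
    [{"batch_size": 1, "prompt_length": 288, "new_tokens": 64}],
    max_model_len=256)  # raises RuntimeError at startup, not a 500 later
```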
1 parent 98dca8d · commit c0269a3

File tree

1 file changed: +18 −2 lines


vllm_spyre/platform.py

Lines changed: 18 additions & 2 deletions
```diff
@@ -77,7 +77,10 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         # Override --max-num-seqs to the biggest warmup batch size
         # And override --max-model-len to the biggest warmup sequence
         cls._warmup_shapes = None
-        spyre_warmup_shapes = cls.get_warmup_shapes(scheduler_config)
+        max_model_len = model_config.max_model_len \
+            if model_config is not None else sys.maxsize
+        spyre_warmup_shapes = cls.get_warmup_shapes(
+            scheduler_config, max_model_len)
         max_batch_size = 0
         max_seq_len = 0
         for shape in spyre_warmup_shapes:
@@ -168,7 +171,10 @@ def inference_mode(cls):
         return torch.no_grad()
 
     @classmethod
-    def get_warmup_shapes(cls, scheduler_config) -> tuple[dict[str, int], ...]:
+    def get_warmup_shapes(
+            cls,
+            scheduler_config,
+            max_model_len: int = sys.maxsize) -> tuple[dict[str, int], ...]:
         if cls._warmup_shapes is not None:
             return cls._warmup_shapes
         # load warmup shapes and sort by "speed"
@@ -204,6 +210,16 @@ def get_warmup_shapes(cls, scheduler_config) -> tuple[dict[str, int], ...]:
             } for pl, nt, bs in zip(wup_prompt_lens, wup_new_tokens,
                                     wup_batch_sizes)],
                    key=operator.itemgetter('batch_size', 'prompt_length')))
+
+        for shape in cls._warmup_shapes:
+            max_seq_len = shape["prompt_length"] + shape["new_tokens"]
+            if max_seq_len > max_model_len:
+                raise RuntimeError(
+                    f"Warmup shape [{shape['batch_size']},"
+                    f" {shape['prompt_length']}, {shape['new_tokens']}]"
+                    " results in a maximum sequence length of "
+                    f"{max_seq_len} which is longer than what the model "
+                    f"supports ({max_model_len})")
         return cls._warmup_shapes
 
     @classmethod
```
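A note on the `sys.maxsize` default in the new `get_warmup_shapes` signature: when `model_config` is `None` there is no known length limit, and the sentinel makes the validation a no-op. A minimal sketch of why that works:

```python
import sys

# With no model config, max_model_len falls back to sys.maxsize, an
# effectively infinite ceiling: no realistic prompt_length + new_tokens
# total can exceed it, so every warmup shape passes the new check.
max_model_len = sys.maxsize
shape = {"batch_size": 4, "prompt_length": 2048, "new_tokens": 1024}
assert shape["prompt_length"] + shape["new_tokens"] <= max_model_len
```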

Comments (0)