
Commit c0269a3

Don't allow warmup shapes that exceed the max sequence length (#185)
In V0, warmup shapes that result in sequence lengths longer than the maximum sequence length that the model supports are not validated. When a request whose length falls between the two values comes in, it crashes the server:

```
WARNING 04-23 02:30:31 [scheduler.py:717] Input prompt (306 tokens) is too long and exceeds limit of 256
CRITICAL 04-23 02:30:31 [launcher.py:116] MQLLMEngine is already dead, terminating server process
INFO:     127.0.0.1:54294 - "POST /v1/embeddings HTTP/1.1" 500 Internal Server Error
ERROR 04-23 02:30:31 [engine.py:160] ValueError('Sampling parameters are missing for a CompletionRequest.')
ERROR 04-23 02:30:31 [engine.py:160] Traceback (most recent call last):
ERROR 04-23 02:30:31 [engine.py:160]   File "/opt/vllm/lib64/python3.11/site-packages/vllm/engine/multiprocessing/engine.py", line 158, in start
ERROR 04-23 02:30:31 [engine.py:160]     self.run_engine_loop()
ERROR 04-23 02:30:31 [engine.py:160]   File "/opt/vllm/lib64/python3.11/site-packages/vllm/engine/multiprocessing/engine.py", line 221, in run_engine_loop
ERROR 04-23 02:30:31 [engine.py:160]     request_outputs = self.engine_step()
ERROR 04-23 02:30:31 [engine.py:160]                       ^^^^^^^^^^^^^^^^^^
ERROR 04-23 02:30:31 [engine.py:160]   File "/opt/vllm/lib64/python3.11/site-packages/vllm/engine/multiprocessing/engine.py", line 247, in engine_step
ERROR 04-23 02:30:31 [engine.py:160]     raise e
ERROR 04-23 02:30:31 [engine.py:160]   File "/opt/vllm/lib64/python3.11/site-packages/vllm/engine/multiprocessing/engine.py", line 230, in engine_step
ERROR 04-23 02:30:31 [engine.py:160]     return self.engine.step()
ERROR 04-23 02:30:31 [engine.py:160]            ^^^^^^^^^^^^^^^^^^
ERROR 04-23 02:30:31 [engine.py:160]   File "/opt/vllm/lib64/python3.11/site-packages/vllm/engine/llm_engine.py", line 1493, in step
ERROR 04-23 02:30:31 [engine.py:160]     self._process_model_outputs(ctx=ctx)
ERROR 04-23 02:30:31 [engine.py:160]   File "/opt/vllm/lib64/python3.11/site-packages/vllm/engine/llm_engine.py", line 1220, in _process_model_outputs
ERROR 04-23 02:30:31 [engine.py:160]     request_output = RequestOutputFactory.create(
ERROR 04-23 02:30:31 [engine.py:160]                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 04-23 02:30:31 [engine.py:160]   File "/opt/vllm/lib64/python3.11/site-packages/vllm/outputs.py", line 392, in create
ERROR 04-23 02:30:31 [engine.py:160]     return RequestOutput.from_seq_group(seq_group, use_cache,
ERROR 04-23 02:30:31 [engine.py:160]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 04-23 02:30:31 [engine.py:160]   File "/opt/vllm/lib64/python3.11/site-packages/vllm/outputs.py", line 181, in from_seq_group
ERROR 04-23 02:30:31 [engine.py:160]     raise ValueError(
ERROR 04-23 02:30:31 [engine.py:160] ValueError: Sampling parameters are missing for a CompletionRequest.
INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [705]
```

---------

Signed-off-by: Max de Bayser <[email protected]>
Co-authored-by: Joe Runde <[email protected]>
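For context, the invariant this change enforces can be sketched in isolation. The helper below is hypothetical (not vllm-spyre's actual API); it mirrors the check the commit adds: a warmup shape admits prompts up to `prompt_length` tokens that may then generate up to `new_tokens` more, so the total must fit in the model's maximum sequence length.

```python
# Minimal sketch of the validation idea; validate_warmup_shapes is a
# hypothetical name introduced here for illustration only.

def validate_warmup_shapes(warmup_shapes: list[dict[str, int]],
                           max_model_len: int) -> None:
    """Fail fast at startup instead of crashing on a live request."""
    for shape in warmup_shapes:
        # Worst case: a full-length prompt that generates all new tokens.
        max_seq_len = shape["prompt_length"] + shape["new_tokens"]
        if max_seq_len > max_model_len:
            raise RuntimeError(
                f"Warmup shape [{shape['batch_size']}, "
                f"{shape['prompt_length']}, {shape['new_tokens']}] results "
                f"in a maximum sequence length of {max_seq_len}, which is "
                f"longer than what the model supports ({max_model_len})")

# Example: a model with a 256-token limit cannot serve a 288+64 warmup shape.
validate_warmup_shapes(
    [{"batch_size": 1, "prompt_length": 288, "new_tokens": 64}],
    max_model_len=256)  # raises RuntimeError at startup, not a 500 later
```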
1 parent 98dca8d · commit c0269a3

File tree

1 file changed: +18 −2 lines


vllm_spyre/platform.py

Lines changed: 18 additions & 2 deletions
```diff
@@ -77,7 +77,10 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         # Override --max-num-seqs to the biggest warmup batch size
         # And override --max-model-len to the biggest warmup sequence
         cls._warmup_shapes = None
-        spyre_warmup_shapes = cls.get_warmup_shapes(scheduler_config)
+        max_model_len = model_config.max_model_len \
+            if model_config is not None else sys.maxsize
+        spyre_warmup_shapes = cls.get_warmup_shapes(
+            scheduler_config, max_model_len)
         max_batch_size = 0
         max_seq_len = 0
         for shape in spyre_warmup_shapes:
@@ -168,7 +171,10 @@ def inference_mode(cls):
         return torch.no_grad()
 
     @classmethod
-    def get_warmup_shapes(cls, scheduler_config) -> tuple[dict[str, int], ...]:
+    def get_warmup_shapes(
+            cls,
+            scheduler_config,
+            max_model_len: int = sys.maxsize) -> tuple[dict[str, int], ...]:
         if cls._warmup_shapes is not None:
             return cls._warmup_shapes
         # load warmup shapes and sort by "speed"
@@ -204,6 +210,16 @@ def get_warmup_shapes(cls, scheduler_config) -> tuple[dict[str, int], ...]:
             } for pl, nt, bs in zip(wup_prompt_lens, wup_new_tokens,
                                     wup_batch_sizes)],
                    key=operator.itemgetter('batch_size', 'prompt_length')))
+
+        for shape in cls._warmup_shapes:
+            max_seq_len = shape["prompt_length"] + shape["new_tokens"]
+            if max_seq_len > max_model_len:
+                raise RuntimeError(
+                    f"Warmup shape [{shape['batch_size']},"
+                    f" {shape['prompt_length']}, {shape['new_tokens']}]"
+                    " results in a maximum sequence length of "
+                    f"{max_seq_len} which is longer than what the model "
+                    f"supports ({max_model_len})")
         return cls._warmup_shapes
 
     @classmethod
```
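A note on the `sys.maxsize` default in the new `get_warmup_shapes` signature: when `model_config` is `None` there is no known length limit, and the sentinel makes the validation a no-op. A minimal sketch of why that works:

```python
import sys

# With no model config, max_model_len falls back to sys.maxsize, an
# effectively infinite ceiling: no realistic prompt_length + new_tokens
# total can exceed it, so every warmup shape passes the new check.
max_model_len = sys.maxsize
shape = {"batch_size": 4, "prompt_length": 2048, "new_tokens": 1024}
assert shape["prompt_length"] + shape["new_tokens"] <= max_model_len
```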

Comments (0)