32 changes: 7 additions & 25 deletions tests/e2e/test_spyre_basic.py
@@ -160,32 +160,14 @@ def test_batch_handling(model: str, backend: str, cb: int,
 
     prompts = get_chicken_soup_prompts(4)
 
-    sampling_params1 = SamplingParams(max_tokens=5,
-                                      min_tokens=5,
-                                      temperature=0,
-                                      ignore_eos=True,
-                                      logprobs=0)
-    sampling_params2 = SamplingParams(max_tokens=20,
-                                      min_tokens=20,
-                                      temperature=0,
-                                      ignore_eos=True,
-                                      logprobs=0)
-    sampling_params3 = SamplingParams(max_tokens=10,
-                                      min_tokens=10,
-                                      temperature=0,
-                                      ignore_eos=True,
-                                      logprobs=0)
-    sampling_params4 = SamplingParams(max_tokens=5,
-                                      min_tokens=5,
-                                      temperature=0,
-                                      ignore_eos=True,
-                                      logprobs=0)
+    max_new_tokens = [5, 20, 10, 5]
 
     vllm_sampling_params = [
-        sampling_params1,
-        sampling_params2,
-        sampling_params3,
-        sampling_params4,
+        SamplingParams(max_tokens=max_new_tokens[i],
+                       min_tokens=max_new_tokens[i],
+                       temperature=0,
+                       ignore_eos=True,
+                       logprobs=0) for i in range(len(max_new_tokens))
     ]
 
     kwargs = {
@@ -207,7 +189,7 @@ def test_batch_handling(model: str, backend: str, cb: int,
                                               **kwargs)
     hf_results = generate_hf_output(model=model,
                                     prompts=prompts,
-                                    max_new_tokens=[5, 20, 10, 5])
+                                    max_new_tokens=max_new_tokens)
 
     compare_results(
         model=model,
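Note: the test refactor above replaces four hand-written, near-identical SamplingParams objects with a single comprehension driven by one max_new_tokens list, which the generate_hf_output reference call then reuses, so the vLLM and HuggingFace sides of the comparison can no longer drift apart. A minimal standalone sketch of the pattern (SamplingParams is vLLM's real class; the final assert is purely illustrative):

from vllm import SamplingParams

# Single source of truth for the per-prompt decode budgets.
max_new_tokens = [5, 20, 10, 5]

vllm_sampling_params = [
    SamplingParams(
        max_tokens=n,
        min_tokens=n,      # force exactly n generated tokens
        temperature=0,     # greedy decoding, deterministic output
        ignore_eos=True,   # do not stop early at EOS
        logprobs=0)        # only the sampled token's logprob
    for n in max_new_tokens
]

# Both sides of the comparison now derive from the same list.
assert [p.max_tokens for p in vllm_sampling_params] == max_new_tokens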
4 changes: 2 additions & 2 deletions vllm_spyre/v1/worker/spyre_worker.py
@@ -79,9 +79,9 @@ def compile_or_warm_up_model(self) -> None:
             if self.model_config.task != "embed":
                 # TODO: remove if spyre supports
                 # lower number of output tokens
-                assert num_decode_tokens >= 3, (
+                assert num_decode_tokens >= 2, (
                     "VLLM_SPYRE_WARMUP_NEW_TOKENS must be "
-                    "at least 3 (spyre requirement).")
+                    "at least 2 (spyre requirement).")
                 # warmup individual combination
                 logger.info(
                     "[WARMUP] (%d/%d) for prompt length %d, decoding %d tokens "
4 changes: 2 additions & 2 deletions vllm_spyre/worker/spyre_worker.py
@@ -208,9 +208,9 @@ def load_model(self):
             if self.model_config.task != "embed":
                 # TODO: remove if spyre supports
                 # lower number of output tokens
-                assert num_decode_tokens >= 3, (
+                assert num_decode_tokens >= 2, (
                     "VLLM_SPYRE_WARMUP_NEW_TOKENS must be "
-                    "at least 3 (spyre requirement).")
+                    "at least 2 (spyre requirement).")
                 # warmup individual combination
                 print(f"[SpyreWorker] Warmup {i+1}/"
                       f"{len(wup_new_tokens)} "
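Note: both workers relax the same warmup check, so each entry of VLLM_SPYRE_WARMUP_NEW_TOKENS must now be at least 2 rather than 3. A hedged standalone sketch of that check, assuming (as the plural name suggests) the variable holds a comma-separated list of integers; parse_warmup_new_tokens is a hypothetical helper, not vllm-spyre's actual parsing code:

import os

def parse_warmup_new_tokens(
        env_var: str = "VLLM_SPYRE_WARMUP_NEW_TOKENS") -> list[int]:
    # Hypothetical parser: comma-separated ints, e.g. "64,128".
    raw = os.environ.get(env_var, "")
    tokens = [int(part) for part in raw.split(",") if part.strip()]
    for num_decode_tokens in tokens:
        # Mirrors the relaxed assertion in both workers: >= 2, not >= 3.
        assert num_decode_tokens >= 2, (
            f"{env_var} must be at least 2 (spyre requirement).")
    return tokens

os.environ["VLLM_SPYRE_WARMUP_NEW_TOKENS"] = "2,20"
print(parse_warmup_new_tokens())  # [2, 20] -- 2 is now accepted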