-
Notifications
You must be signed in to change notification settings - Fork 26
[cb] scheduler heuristic 2: unblock long prompts #440
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
5a2d843
104bdeb
2c50ffc
489daa4
039c674
f1c96d6
95a4cf4
ea7edcf
c302cfb
324f827
2ff9717
eb5c35a
ab9a8ee
8877a6e
9a5c444
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -107,6 +107,12 @@ def _backend_backwards_compat() -> str: | |
| lambda: bool(int(os.getenv("VLLM_SPYRE_ENABLE_PREFILL_OPTIMIZATION", "0")) | ||
| ), | ||
|
|
||
| # scheduling heuristic: maximal waiting (blocking) time for prefill | ||
| # Prefills waiting longer than VLLM_SPYRE_MAX_WAITING_TIME_PREFILL | ||
| # seconds will have priority after the current decode batch has finished. | ||
| "VLLM_SPYRE_MAX_WAITING_TIME_PREFILL": | ||
|
||
| lambda: int(os.getenv("VLLM_SPYRE_MAX_WAITING_TIME_PREFILL", "-1")), | ||
|
||
|
|
||
| # Allow vllm-spyre to update env vars related to multi-threading (eg. OMP) | ||
| # based on the detected CPU cores and server configuration | ||
| "VLLM_SPYRE_UPDATE_THREAD_CONFIG": | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2,6 +2,7 @@ | |
|
|
||
| import math | ||
| import os | ||
| import time | ||
| from collections import deque | ||
| from typing import TYPE_CHECKING | ||
|
|
||
|
|
@@ -156,6 +157,9 @@ def __init__(self, *args, **kwargs) -> None: | |
| assert self.max_batch_tkv_limit != '-1', ( | ||
| "Expecting the env var VLLM_DT_MAX_BATCH_TKV_LIMIT to be set in " | ||
| "platform.py") | ||
| # if batch_is_locked: finish current decode batch to serve a request | ||
| # that waited for longer than VLLM_SPYRE_MAX_WAITING_TIME_PREFILL | ||
| self.batch_is_locked = False | ||
|
|
||
| def update_from_output( | ||
| self, | ||
|
|
@@ -179,14 +183,20 @@ def schedule(self) -> "SchedulerOutput": | |
| To avoid additional specialization, some requests are held back from the | ||
| base scheduler but are restored after. | ||
| """ | ||
| # unlock the current decode batch if no requests are in running queue | ||
| if len(self.running) == 0: | ||
| self.batch_is_locked = False | ||
| logger.debug("Unlocking the current decode batch as no requests " | ||
| "are in running queue") | ||
tdoublep marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| # First purge the full waiting queue into our holdback queue, preserving | ||
| # priority | ||
| while self.waiting: | ||
| self.holdback_queue.append(self.waiting.popleft()) | ||
|
|
||
| # Check if new requests can be scheduled. | ||
| while self.holdback_queue: | ||
| if self.can_schedule(self.holdback_queue[0]): | ||
| if not self.batch_is_locked and self.can_schedule( | ||
| self.holdback_queue[0]): | ||
|
||
| # Add request to the waiting queue | ||
| self.waiting.append(self.holdback_queue.popleft()) | ||
| else: | ||
|
|
@@ -226,6 +236,18 @@ def can_schedule(self, request) -> bool: | |
| if len(self.running) + len(self.waiting) == 0: | ||
| return True | ||
|
|
||
| # scheduling heuristic: maximal waiting (blocking) time for prefill | ||
| if envs_spyre.VLLM_SPYRE_MAX_WAITING_TIME_PREFILL > 0: | ||
| waiting_time = (time.time() - request.arrival_time) | ||
| if waiting_time > envs_spyre.VLLM_SPYRE_MAX_WAITING_TIME_PREFILL: | ||
| self.batch_is_locked = True | ||
| logger.debug("Request %s waited longer (%ds) than " \ | ||
| "VLLM_SPYRE_MAX_WAITING_TIME_PREFILL (%ds): locking current" \ | ||
| " decode batch and schedule this request afterwards.", | ||
| request.request_id, waiting_time, | ||
| envs_spyre.VLLM_SPYRE_MAX_WAITING_TIME_PREFILL | ||
| ) | ||
|
|
||
| # check that there is space in the current decode batch | ||
| cond1 = len(self.running) + len( | ||
| self.waiting) < self.max_num_running_reqs | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would suggest using
`time.monotonic()` instead, to avoid issues with system clock adjustments (e.g. NTP corrections or manual clock changes), since wall-clock time can jump backwards.