-
Notifications
You must be signed in to change notification settings - Fork 26
[cb] scheduler heuristic 2: unblock long prompts #440
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
5a2d843
104bdeb
2c50ffc
489daa4
039c674
f1c96d6
95a4cf4
ea7edcf
c302cfb
324f827
2ff9717
eb5c35a
ab9a8ee
8877a6e
9a5c444
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -107,6 +107,12 @@ def _backend_backwards_compat() -> str: | |
| lambda: bool(int(os.getenv("VLLM_SPYRE_ENABLE_PREFILL_OPTIMIZATION", "0")) | ||
| ), | ||
|
|
||
| # scheduling heuristic: maximal waiting (blocking) time for prefill | ||
| # Prefills waiting longer than VLLM_SPYRE_MAX_WAITING_TIME_PREFILL | ||
| # seconds will have priority after the current decode batch has finished. | ||
| "VLLM_SPYRE_MAX_WAITING_TIME_PREFILL": | ||
|
||
| lambda: int(os.getenv("VLLM_SPYRE_MAX_WAITING_TIME_PREFILL", "-1")), | ||
|
||
|
|
||
| # Allow vllm-spyre to update env vars related to multi-threading (eg. OMP) | ||
| # based on the detected CPU cores and server configuration | ||
| "VLLM_SPYRE_UPDATE_THREAD_CONFIG": | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2,6 +2,7 @@ | |
|
|
||
| import math | ||
| import os | ||
| import time | ||
| from collections import deque | ||
| from typing import TYPE_CHECKING | ||
|
|
||
|
|
@@ -156,6 +157,9 @@ def __init__(self, *args, **kwargs) -> None: | |
| assert self.max_batch_tkv_limit != '-1', ( | ||
| "Expecting the env var VLLM_DT_MAX_BATCH_TKV_LIMIT to be set in " | ||
| "platform.py") | ||
| # if batch_is_locked: finish current decode batch to serve a request | ||
| # that waited for longer than VLLM_SPYRE_MAX_WAITING_TIME_PREFILL | ||
| self.batch_is_locked = False | ||
|
|
||
| def update_from_output( | ||
| self, | ||
|
|
@@ -179,14 +183,20 @@ def schedule(self) -> "SchedulerOutput": | |
| To avoid additional specialization, some requests are held back from the | ||
| base scheduler but are restored after. | ||
| """ | ||
| # unlock the current decode batch if no requests are in running queue | ||
| if len(self.running) == 0: | ||
| self.batch_is_locked = False | ||
| logger.debug("Unlocking the current decode batch as no requests " | ||
| "are in running queue") | ||
tdoublep marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| # First purge the full waiting queue into our holdback queue, preserving | ||
| # priority | ||
| while self.waiting: | ||
| self.holdback_queue.append(self.waiting.popleft()) | ||
|
|
||
| # Check if new requests can be scheduled. | ||
| while self.holdback_queue: | ||
| if self.can_schedule(self.holdback_queue[0]): | ||
| if not self.batch_is_locked and self.can_schedule( | ||
| self.holdback_queue[0]): | ||
|
||
| # Add request to the waiting queue | ||
| self.waiting.append(self.holdback_queue.popleft()) | ||
| else: | ||
|
|
@@ -226,6 +236,18 @@ def can_schedule(self, request) -> bool: | |
| if len(self.running) + len(self.waiting) == 0: | ||
| return True | ||
|
|
||
| # scheduling heuristic: maximal waiting (blocking) time for prefill | ||
| if envs_spyre.VLLM_SPYRE_MAX_WAITING_TIME_PREFILL > 0: | ||
| waiting_time = (time.time() - request.arrival_time) | ||
| if waiting_time > envs_spyre.VLLM_SPYRE_MAX_WAITING_TIME_PREFILL: | ||
| self.batch_is_locked = True | ||
| logger.debug("Request %s waited longer (%ds) than " \ | ||
| "VLLM_SPYRE_MAX_WAITING_TIME_PREFILL (%ds): locking current" \ | ||
| " decode batch and schedule this request afterwards.", | ||
| request.request_id, waiting_time, | ||
| envs_spyre.VLLM_SPYRE_MAX_WAITING_TIME_PREFILL | ||
| ) | ||
|
|
||
| # check that there is space in the current decode batch | ||
| cond1 = len(self.running) + len( | ||
| self.waiting) < self.max_num_running_reqs | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would suggest using
`time.monotonic()` instead, to avoid issues with system clock adjustments (e.g. NTP corrections or manual clock changes), since wall-clock time can jump backwards.