-
Couldn't load subscription status.
- Fork 26
[cb] scheduler heuristic 2: unblock long prompts #440
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
5a2d843
104bdeb
2c50ffc
489daa4
039c674
f1c96d6
95a4cf4
ea7edcf
c302cfb
324f827
2ff9717
eb5c35a
ab9a8ee
8877a6e
9a5c444
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2,6 +2,7 @@ | |
|
|
||
| import math | ||
| import os | ||
| import time | ||
| from collections import deque | ||
| from typing import TYPE_CHECKING | ||
|
|
||
|
|
@@ -159,6 +160,9 @@ def __init__(self, *args, **kwargs) -> None: | |
| # cache for self.check_batch_tkv_limit() outer key: tuple(request_ids), | ||
| # inner key: (request_id, max_batch_tkv_limit), value: (lower, upper) | ||
| self._cache_check_batch_tkv_limit: dict[tuple, dict[tuple, tuple]] = {} | ||
| # if batch_is_locked: finish current decode batch to serve a request | ||
| # that waited for longer than VLLM_SPYRE_MAX_WAITING_TIME_SECONDS | ||
| self.batch_is_locked = False | ||
|
|
||
| def update_from_output( | ||
| self, | ||
|
|
@@ -182,6 +186,11 @@ def schedule(self) -> "SchedulerOutput": | |
| To avoid additional specialization, some requests are held back from the | ||
| base scheduler but are restored after. | ||
| """ | ||
| # unlock the current decode batch if no requests are in running queue | ||
| if len(self.running) == 0 and self.batch_is_locked: | ||
| self.batch_is_locked = False | ||
| logger.debug("Unlocking the current decode batch as no requests " | ||
| "are in running queue") | ||
| # First purge the full waiting queue into our holdback queue, preserving | ||
| # priority | ||
| while self.waiting: | ||
|
|
@@ -224,11 +233,29 @@ def can_schedule(self, request) -> bool: | |
| max_prompt_batch_size = 1 | ||
| max_context_len = self.scheduler_config.max_model_len | ||
|
|
||
| # if the batch is locked by a request which has been waiting for | ||
| # longer than VLLM_SPYRE_MAX_WAITING_TIME_SECONDS, we cannot | ||
| # schedule the current sequence until we have served this request | ||
| if self.batch_is_locked: | ||
| return False | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. instead of locking the batch entirely, shouldn't we just disallow any skipping of requests in the queue until the request at the head of the waiting queue schedules? I haven't followed super closely but my assumption is that the blocked request may be able to be scheduled before the full batch finishes. E.g. with the 128k limit, a 64k request could potentially schedule once the batch has drained down to a single other request, so we wouldn't need to wait for the last one to finish. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. great idea! I will certainly address that in a follow up. We wanted to keep the first version as simple and fail-proof as possible. |
||
|
|
||
| # running and waiting queues are both empty -> start a new batch | ||
| # which can always be scheduled | ||
| if len(self.running) + len(self.waiting) == 0: | ||
| return True | ||
|
|
||
| # scheduling heuristic: maximal waiting (blocking) time for prefill | ||
| waiting_time = (time.monotonic() - request.arrival_time) | ||
| if waiting_time > envs_spyre.VLLM_SPYRE_MAX_WAITING_TIME_SECONDS: | ||
| self.batch_is_locked = True | ||
| logger.debug("Request %s waited longer (%ss) than " \ | ||
| "VLLM_SPYRE_MAX_WAITING_TIME_SECONDS (%ss): locking current " \ | ||
| "decode batch and schedule this request either as part of " \ | ||
| "the current batch or in an exclusive subsequent new batch.", | ||
| request.request_id, round(waiting_time, 2), | ||
| envs_spyre.VLLM_SPYRE_MAX_WAITING_TIME_SECONDS | ||
| ) | ||
|
|
||
| # check that there is space in the current decode batch | ||
| cond1 = len(self.running) + len( | ||
| self.waiting) < self.max_num_running_reqs | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.