Merged
Changes from 6 commits
2 changes: 1 addition & 1 deletion .github/workflows/test-spyre.yml
@@ -25,5 +25,5 @@ jobs:
export DISTRIBUTED_STRATEGY_IGNORE_MODULES=WordEmbedding && \
cd vllm-spyre && \
python -m pytest --timeout=300 tests -v -k "V0 and eager" && \
python -m pytest --forked --timeout=300 tests -v -k "V1 and eager"
python -m pytest --forked --timeout=300 tests -v -k "V1- and eager"
'''
8 changes: 4 additions & 4 deletions Dockerfile.spyre
@@ -18,13 +18,13 @@ RUN ln -sf $(which python${PYTHON_VERSION}) /usr/bin/python && \
# Download and install vllm ###########################################################
RUN git clone --depth 1 https://github.com/vllm-project/vllm.git \
&& cd vllm \
&& git fetch origin pull/14242/head:spyre-workarounds \
&& git checkout spyre-workarounds \
&& git fetch --tags \
&& git checkout v0.8.0 \
&& python -m pip install --upgrade pip \
&& pip3 install torch=="2.5.1+cpu" --index-url https://download.pytorch.org/whl/cpu \
&& python use_existing_torch.py \
&& pip install -r requirements-build.txt \
&& SETUPTOOLS_SCM_PRETEND_VERSION=0.7.3 VLLM_TARGET_DEVICE=empty pip install --verbose . --no-build-isolation
&& pip install -r requirements/build.txt \
&& SETUPTOOLS_SCM_PRETEND_VERSION=0.8.0 VLLM_TARGET_DEVICE=empty pip install --verbose . --no-build-isolation

# Install vllm Spyre plugin ##################################################################
RUN mkdir /workspace/vllm-spyre
160 changes: 114 additions & 46 deletions vllm_spyre/v1/core/scheduler.py
@@ -1,14 +1,15 @@
# SPDX-License-Identifier: Apache-2.0

from collections import deque
from typing import Deque, Optional
from typing import Deque

from vllm.config import (CacheConfig, LoRAConfig, ModelConfig, SchedulerConfig,
SpeculativeConfig)
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.sampling_params import SamplingParams
from vllm.v1.core.scheduler import Scheduler
from vllm.v1.core.scheduler_output import SchedulerOutput
from vllm.v1.engine import EngineCoreOutput, EngineCoreOutputs, FinishReason
from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request, RequestStatus

logger = init_logger(__name__)
@@ -20,22 +21,9 @@ class SpyreScheduler(Scheduler):
- Only schedules batches of requests that fit a common warmup shape
"""

def __init__(
self,
scheduler_config: SchedulerConfig,
model_config: ModelConfig,
cache_config: CacheConfig,
lora_config: Optional[LoRAConfig],
speculative_config: Optional[SpeculativeConfig],
log_stats: bool,
) -> None:
def __init__(self, *args, **kwargs) -> None:
# Initialize vLLM scheduler
super().__init__(scheduler_config=scheduler_config,
model_config=model_config,
cache_config=cache_config,
lora_config=lora_config,
speculative_config=speculative_config,
log_stats=log_stats)
super().__init__(*args, **kwargs)

# Add our own state for handling Spyre constraints

@@ -49,6 +37,51 @@ def __init__(
# scheduler sees have at least one common warmup shape.
self.holdback_queue: Deque[Request] = deque()

self.rejected_requests: set[str] = set()

def add_request(self, request: Request) -> None:
"""This override rejects requests that fit no warmup shape"""
if len(
self._get_matching_warmup_shapes(request=request,
warmup_shapes=list(
self.spyre_warmup_shapes),
current_batch_size=0)) == 0:
logger.warning(
"No applicable warmup shape exists for "
"combination of prompt length (%d tokens) "
"and maximum number of output tokens to be "
"generated (%d tokens) from request id %s",
request.num_prompt_tokens, request.sampling_params.max_tokens,
request.request_id)
# TODO: There are open PRs that should enable raising an error for
# a single request like this, which will gracefully return an error
# for the request, instead of shutting down the engine.
# See https://github.com/vllm-project/vllm/pull/11737
# raise ValueError("Request does not fit any spyre warmup shape")

# For now, we'll insert a dummy request and manually reject it when
# we construct the outputs later
self.rejected_requests.add(request.request_id)
request.prompt_token_ids = [0]
request.num_prompt_tokens = 1
request.sampling_params = SamplingParams(max_tokens=1)

# delegate to super
super().add_request(request=request)

def update_from_output(
self,
scheduler_output: SchedulerOutput,
model_runner_output: ModelRunnerOutput,
) -> EngineCoreOutputs:
"""Temporary override to handle rejected requests that were too large
to schedule."""
reject_outputs = self._handle_rejects()
outputs = super().update_from_output(scheduler_output,
model_runner_output)
outputs.outputs.extend(reject_outputs)
return outputs

def schedule(self) -> "SchedulerOutput":
"""This override adds constraints and then delegates most of the work
to the base scheduler"""
@@ -71,40 +104,19 @@ def schedule(self) -> "SchedulerOutput":

# prune the possible shapes to only those that fit this request
# and the growing batch size
max_tokens = 0
if request.sampling_params is not None and\
request.sampling_params.max_tokens is not None:
max_tokens = request.sampling_params.max_tokens

available_warmup_shapes = [
shape for shape in available_warmup_shapes
if request.num_prompt_tokens <= shape['prompt_length']
and max_tokens <= shape['new_tokens']
and len(self.waiting) < shape['batch_size']
]
available_warmup_shapes = self._get_matching_warmup_shapes(
request=request,
warmup_shapes=available_warmup_shapes,
current_batch_size=len(self.waiting))

if len(available_warmup_shapes) > 0:
# There is still at least one valid shape, so add to the
# waiting queue
self.waiting.append(self.holdback_queue.popleft())
else:
# We can't schedule this one.
# If it's the first request, then it fits _no_ shapes at all
# So we reject it entirely
if len(self.waiting) == 0:
logger.warning(
"No applicable warmup shape exists for "
"combination of prompt length (%d tokens) "
"and maximum number of output tokens to be "
"generated (%d tokens)", request.num_prompt_tokens,
request.sampling_params.max_tokens)

request.status = RequestStatus.FINISHED_IGNORED
self._free_request(self.holdback_queue.popleft())
else:
# Otherwise, we simply stop here so that the scheduler
# can work with the batch we have
break
# Otherwise, we simply stop here so that the scheduler
# can work with the batch we have
break

logger.debug(
"Scheduling a new batch of %d requests, holding back %d "
@@ -119,3 +131,59 @@ def schedule(self) -> "SchedulerOutput":
def get_num_unfinished_requests(self) -> int:
# Override this to include our extra queue
return len(self.waiting) + len(self.running) + len(self.holdback_queue)

def _get_matching_warmup_shapes(
self, request: Request, warmup_shapes: list[dict[str, int]],
current_batch_size: int) -> list[dict[str, int]]:
"""Return the subset of shapes that match this request"""
max_tokens = 0
if request.sampling_params is not None and\
request.sampling_params.max_tokens is not None:
max_tokens = request.sampling_params.max_tokens

return [
shape for shape in warmup_shapes
if request.num_prompt_tokens <= shape['prompt_length']
and max_tokens <= shape['new_tokens']
and current_batch_size < shape['batch_size']
]

def _handle_rejects(self) -> list[EngineCoreOutput]:
"""Temporary solution to reject requests that were too large to
schedule. This removes the rejected requests from the scheduler, and
returns empty outputs for them with finish reason `abort`.
Comment on lines +152 to +154
Member:

The behaviour in V0 was that if the request didn't match any warmup shape, it would be added to the ignore queue. This resulted in finish reason 'length' and an empty string as output. While we didn't really like this behaviour, it was consistent with how vLLM handled similar cases (e.g., prompt length being longer than max model len) in V0. Has this changed in V1?

Collaborator Author:

@tdoublep Yeah, as far as I can tell the v1 scheduler doesn't have the ability to ignore requests. The new flow in the v1 engine is that scheduling is broken down into two passes, schedule() and update_from_output(). The engine expects every request to eventually schedule at least one token via schedule(), and if no tokens are scheduled for an iteration it'll just go idle waiting for more inputs. Once the model has been invoked, update_from_output() is used to construct the outputs for the engine's callers. If we look at the simple case where we get only a single request and it doesn't fit a warmup shape, we can't just not schedule it, or the engine will go idle and we never get a chance to pass a result back to the caller. So we have to schedule at least one dummy token and then pass back an empty result from update_from_output(), which is a bit sad.
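
Roughly, the two passes fit together like this. This is a simplified sketch with made-up names, not the actual vLLM engine code:

# Simplified sketch of the v1 engine core loop (hypothetical names, not vLLM internals).
# If schedule() emits no tokens, the loop idles and the caller never receives an output,
# which is why a rejected request still has to schedule one dummy token.
def engine_step(scheduler, executor):
    scheduler_output = scheduler.schedule()  # pass 1: pick requests/tokens to run
    if scheduler_output.total_num_scheduled_tokens == 0:
        return []  # nothing scheduled, so the engine just waits for more inputs
    model_output = executor.execute_model(scheduler_output)
    # pass 2: turn the model output into per-request engine outputs
    return scheduler.update_from_output(scheduler_output, model_output)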

As far as I can tell, all of the request validation has been moved ahead of the scheduler, so that for cases like len(prompt) > max_model_len, the user gets an error and the scheduler is never invoked for that request. For online requests, that's handled in the api server for both V0 and V1:

{
  "object": "error",
  "message": "This model's maximum context length is 100 tokens. However, you requested 153 tokens (137 in the messages, 16 in the completion). Please reduce the length of the messages or completion.",
  "type": "BadRequestError",
  "param": null,
  "code": 400
}

But the offline entrypoint doesn't do as much request validation; with V1, the engine handles this during "input processing" and will now raise a ValueError on any invalid inputs:

model = LLM("/models/llama-194m", max_model_len=100)
model.generate("hello 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 hello 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 hello 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 hello 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7")
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/home/senuser/my-vllm/lib64/python3.11/site-packages/vllm/utils.py", line 1080, in inner
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/home/senuser/my-vllm/lib64/python3.11/site-packages/vllm/entrypoints/llm.py", line 462, in generate
    self._validate_and_add_requests(
  File "/home/senuser/my-vllm/lib64/python3.11/site-packages/vllm/entrypoints/llm.py", line 1310, in _validate_and_add_requests
    self._add_request(
  File "/home/senuser/my-vllm/lib64/python3.11/site-packages/vllm/entrypoints/llm.py", line 1328, in _add_request
    self.llm_engine.add_request(
  File "/home/senuser/my-vllm/lib64/python3.11/site-packages/vllm/v1/engine/llm_engine.py", line 164, in add_request
    request = self.processor.process_inputs(request_id, prompt, params,
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/senuser/my-vllm/lib64/python3.11/site-packages/vllm/v1/engine/processor.py", line 141, in process_inputs
    self._validate_model_inputs(processed_inputs)
  File "/home/senuser/my-vllm/lib64/python3.11/site-packages/vllm/v1/engine/processor.py", line 266, in _validate_model_inputs
    raise ValueError(
ValueError: Prompt length of 137 is longer than the maximum model length of 100.

IIUC, the input processing step was removed from the main engine loop in V1 so that it could better handle these cases. Previously, with V0, any errors would kill the engine completely; now V1 can raise any validation errors it wants before the request gets to the main engine loop. But I do see with Robert's incoming changes here that there should soon be per-request failure handling from inside the engine loop, which may allow us to raise a ValueError from our scheduler; that would be a lot better than this hack.

However, I think the best approach would be to let plugins hook into the v1 engine's input processing step so that we can properly enforce the new assumption that every request that makes it to the scheduler is schedulable. I can start talking to maintainers about that, but I still wanted to get this temporary fix in so we can keep moving forward.
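
For illustration only, such a hook could look roughly like the sketch below. The hook itself doesn't exist yet and all names are hypothetical, but the shape check mirrors _get_matching_warmup_shapes from this PR:

def validate_request_against_warmup_shapes(prompt_token_ids, sampling_params,
                                           warmup_shapes):
    # Hypothetical pre-scheduler validation: reject the request during input
    # processing, before it ever reaches the engine loop, the same way V1
    # rejects prompts that exceed max_model_len.
    max_tokens = sampling_params.max_tokens or 0
    if not any(
            len(prompt_token_ids) <= shape['prompt_length']
            and max_tokens <= shape['new_tokens'] for shape in warmup_shapes):
        raise ValueError("Request does not fit any Spyre warmup shape")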

"""
if len(self.rejected_requests) == 0:
return []

# Remove rejected requests from all queues
reject_outputs = self._reject_from_queue(self.running)
reject_outputs.extend(self._reject_from_queue(self.waiting))
reject_outputs.extend(self._reject_from_queue(self.holdback_queue))
self.rejected_requests.clear()

return reject_outputs

def _reject_from_queue(self,
queue: Deque[Request]) -> list[EngineCoreOutput]:
"""Remove rejected requests from a given queue and return a list of
engine core outputs to return for them"""
reject_outputs: list[EngineCoreOutput] = []
rejected_requests: list[Request] = [
request for request in queue
if request.request_id in self.rejected_requests
]

for request in rejected_requests:
queue.remove(request)
reject_outputs.append(
EngineCoreOutput(request.request_id,
new_token_ids=[],
finish_reason=FinishReason.ABORT,
stop_reason="Request did not fit any warmup "
"shape"))
request.status = RequestStatus.FINISHED_ABORTED
Comment on lines +179 to +185
Collaborator:

@joerunde Would your solution still work when you put:

            reject_outputs.append(
                EngineCoreOutput(request.request_id,
                                 new_token_ids=[],
                                 finish_reason=FinishReason.LENGTH,
                                 stop_reason="Request did not fit any warmup "
                                 "shape"))
            request.status = RequestStatus.FINISHED_IGNORED

here? As @tdoublep mentioned, this would be more consistent with how we handled it previously.
A question regarding your comment: does this mean that for both online and offline inference the RequestStatus and FinishReason are never set, and an error is raised earlier instead? Are FinishReason.LENGTH and RequestStatus.FINISHED_IGNORED therefore obsolete here? If that is the case, I don't have any preference...

Collaborator Author:
Yeah, as far as I can tell the RequestStatus.FINISHED_IGNORED: FinishReason.LENGTH entry in that map for v1 is unused. It looks like the new behavior is to raise an error when a prompt is too long. I guess I don't really care either way what the finish_reason is here, so I can change it to length. I'd like to get this in, and then moving forward we can figure out how best to match v1's behavior of raising an error. I'll check on the vLLM Slack, though, to make sure that's the intended behavior.

Collaborator:
Sounds good. Also, no strong opinion here; I just asked for better understanding.

self._free_request(request)
self.rejected_requests.remove(request.request_id)

return reject_outputs
29 changes: 21 additions & 8 deletions vllm_spyre/v1/worker/spyre_model_runner.py
@@ -305,6 +305,12 @@ def prepare_model_input(
# TODO: Build the rest of the SamplingMetadata correctly
dummy_tensors = lambda v: torch.full(
(num_reqs, ), v, device=self.device)

# vllm 0.7.3 backwards compatibility
extra_kwargs: dict = {}
if "bad_words_token_ids" in SamplingMetadata.__dataclass_fields__:
extra_kwargs["bad_words_token_ids"] = {}

dummy_metadata = SamplingMetadata(
temperature=dummy_tensors(0.0),
all_greedy=False,
Expand All @@ -323,7 +329,7 @@ def prepare_model_input(
min_tokens={},
logit_bias=[None for _ in range(num_reqs)],
allowed_token_ids_mask=None,
)
**extra_kwargs)

return ModelInputForSpyre(input_tokens=input_tokens,
input_positions=input_positions,
@@ -499,10 +505,17 @@ class in the modeling code. Every attention layer populates an entry
"""
# We do at least use the real size from the cache config.
block_size = self.vllm_config.cache_config.block_size
return {
"foo":
FullAttentionSpec(block_size=block_size,
num_kv_heads=1,
head_size=1,
dtype=torch.float16)
}

# vllm 0.7.3 backwards compatibility
try:
attn_spec = FullAttentionSpec(block_size=block_size,
num_kv_heads=1,
head_size=1,
dtype=torch.float16)
except TypeError:
attn_spec = FullAttentionSpec(block_size=block_size,
num_kv_heads=1,
head_size=1,
dtype=torch.float16,
use_mla=False)
return {"foo": attn_spec}