Skip to content
3 changes: 2 additions & 1 deletion Dockerfile.spyre
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,11 @@ RUN pip install torch==2.5.1+cpu --index-url https://download.pytorch.org/whl/cp
# Install uv
RUN pip install uv
# Install the plugin in a new venv, along with dev deps to test with
ENV VLLM_TARGET_DEVICE=empty
RUN cd /workspace/vllm-spyre \
&& uv venv .venv --system-site-packages \
&& source .venv/bin/activate \
&& VLLM_TARGET_DEVICE=empty uv pip install -v -e . --system \
&& uv pip install -v -e . --system \
&& uv sync --frozen --group dev
ENV VLLM_PLUGINS=spyre

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ override-dependencies = [
]

[tool.uv.sources]
vllm = { git = "https://github.com/vllm-project/vllm", rev = "v0.8.0" }
vllm = { git = "https://github.com/vllm-project/vllm", rev = "v0.8.3" }

[tool.ruff]
# Allow lines to be as long as 80.
Expand Down
6 changes: 5 additions & 1 deletion tests/spyre_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,10 +163,14 @@ def generate_spyre_vllm_output(model: str, prompts: List[str],
vllm_outputs = vllm_model.generate(prompts, sampling_params)

results = []

for req_output in vllm_outputs:
result = {}
result['text'] = req_output.outputs[0].text
result['token_ids'] = tuple(req_output.outputs[0].token_ids)
# TODO: Workaround for V1: if a request does not fit in a warmup shape,
# token_ids may be filled with -1.
token_ids = [t for t in req_output.outputs[0].token_ids if t >= 0]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This makes me sad D:
(But so does all the code in the scheduler that does this dummy scheduling anyway that I wrote)

Approved with sadness lol

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you with sadness 😅

result['token_ids'] = tuple(token_ids)
result['tokens'] = tuple([
req_output.outputs[0].logprobs[i][t].decoded_token
for i, t in enumerate(result['token_ids'])
Expand Down
8 changes: 8 additions & 0 deletions tests/test_spyre_online.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from tests.spyre_util import (RemoteOpenAIServer, get_spyre_backend_list,
get_spyre_model_list)
from vllm_spyre.v1.core.scheduler import NO_WARMUP_FIT_STOP_REASON


@pytest.mark.parametrize("model", get_spyre_model_list())
Expand Down Expand Up @@ -71,4 +72,11 @@ def test_openai_serving(model, warmup_shape, backend, vllm_version):
max_tokens=25)

assert len(completion.choices) == 1

# TODO: V0 and V1 have slightly different behavior for requests
# that do not fit in a warmup shape

assert len(completion.choices[0].text) == 0
if vllm_version == 'V1':
assert completion.choices[0].stop_reason == \
NO_WARMUP_FIT_STOP_REASON
14 changes: 9 additions & 5 deletions vllm_spyre/v1/core/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@

logger = init_logger(__name__)

NO_WARMUP_FIT_STOP_REASON = "Request did not fit any warmup shape"


class SpyreScheduler(Scheduler):
"""Small extension of the V1 scheduler that adds constraints for Sypre:
Expand Down Expand Up @@ -185,11 +187,13 @@ def _reject_from_queue(self,
for request in rejected_requests:
queue.remove(request)
reject_outputs.append(
EngineCoreOutput(request.request_id,
new_token_ids=[],
finish_reason=FinishReason.ABORT,
stop_reason="Request did not fit any warmup "
"shape"))
EngineCoreOutput(
request.request_id,
# TODO: FIXME
# Dummy token to prevent a stats collection crash
new_token_ids=[-1],
finish_reason=FinishReason.ABORT,
stop_reason=NO_WARMUP_FIT_STOP_REASON))
request.status = RequestStatus.FINISHED_ABORTED
self._free_request(request)
self.rejected_requests.remove(request.request_id)
Expand Down
14 changes: 14 additions & 0 deletions vllm_spyre/v1/worker/spyre_model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,20 @@ def execute_model(

t0 = time.time()

# TODO: change to EMPTY_MODEL_RUNNER_OUTPUT; right now this
# would be a breaking change, or clumsy to make backward-compatible
# with a conditional import
if not scheduler_output.total_num_scheduled_tokens:
# Return an empty ModelRunnerOutput if there's no work to do.
return ModelRunnerOutput(
req_ids=[],
req_id_to_index={},
sampled_token_ids=[],
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
)

self._update_states(scheduler_output)

model_input = self.prepare_model_input(scheduler_output)
Expand Down