Skip to content
3 changes: 2 additions & 1 deletion Dockerfile.spyre
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,11 @@ RUN pip install torch==2.5.1+cpu --index-url https://download.pytorch.org/whl/cp
# Install uv
RUN pip install uv
# Install the plugin in a new venv, along with dev deps to test with
ENV VLLM_TARGET_DEVICE=empty
RUN cd /workspace/vllm-spyre \
&& uv venv .venv --system-site-packages \
&& source .venv/bin/activate \
&& VLLM_TARGET_DEVICE=empty uv pip install -v -e . --system \
&& uv pip install -v -e . --system \
&& uv sync --frozen --group dev
ENV VLLM_PLUGINS=spyre

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ override-dependencies = [
]

[tool.uv.sources]
vllm = { git = "https://github.com/vllm-project/vllm", rev = "v0.8.0" }
vllm = { git = "https://github.com/vllm-project/vllm", rev = "v0.8.3" }

[tool.ruff]
# Allow lines to be as long as 80.
Expand Down
6 changes: 5 additions & 1 deletion tests/spyre_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,10 +163,14 @@ def generate_spyre_vllm_output(model: str, prompts: List[str],
vllm_outputs = vllm_model.generate(prompts, sampling_params)

results = []

for req_output in vllm_outputs:
result = {}
result['text'] = req_output.outputs[0].text
result['token_ids'] = tuple(req_output.outputs[0].token_ids)
# TODO: Workaround for V1: if a request does not fit in a warmup shape,
# token_ids may be filled with -1.
token_ids = [t for t in req_output.outputs[0].token_ids if t >= 0]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This makes me sad D:
(But so does all the code in the scheduler that does this dummy scheduling anyway that I wrote)

Approved with sadness lol

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you with sadness 😅

result['token_ids'] = tuple(token_ids)
result['tokens'] = tuple([
req_output.outputs[0].logprobs[i][t].decoded_token
for i, t in enumerate(result['token_ids'])
Expand Down
8 changes: 8 additions & 0 deletions tests/test_spyre_online.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from tests.spyre_util import (RemoteOpenAIServer, get_spyre_backend_list,
get_spyre_model_list)
from vllm_spyre.v1.core.scheduler import NO_WARMUP_FIT_STOP_REASON


@pytest.mark.parametrize("model", get_spyre_model_list())
Expand Down Expand Up @@ -71,4 +72,11 @@ def test_openai_serving(model, warmup_shape, backend, vllm_version):
max_tokens=25)

assert len(completion.choices) == 1

# TODO: V0 and V1 have slightly different behavior for requests
# that do not fit in a warmup shape

assert len(completion.choices[0].text) == 0
if vllm_version == 'V1':
assert completion.choices[0].stop_reason == \
NO_WARMUP_FIT_STOP_REASON
14 changes: 9 additions & 5 deletions vllm_spyre/v1/core/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@

logger = init_logger(__name__)

NO_WARMUP_FIT_STOP_REASON = "Request did not fit any warmup shape"


class SpyreScheduler(Scheduler):
"""Small extension of the V1 scheduler that adds constraints for Sypre:
Expand Down Expand Up @@ -185,11 +187,13 @@ def _reject_from_queue(self,
for request in rejected_requests:
queue.remove(request)
reject_outputs.append(
EngineCoreOutput(request.request_id,
new_token_ids=[],
finish_reason=FinishReason.ABORT,
stop_reason="Request did not fit any warmup "
"shape"))
EngineCoreOutput(
request.request_id,
# TODO: FIXME
# Dummy token to prevent a stats collection crash
new_token_ids=[-1],
finish_reason=FinishReason.ABORT,
stop_reason=NO_WARMUP_FIT_STOP_REASON))
request.status = RequestStatus.FINISHED_ABORTED
self._free_request(request)
self.rejected_requests.remove(request.request_id)
Expand Down
14 changes: 14 additions & 0 deletions vllm_spyre/v1/worker/spyre_model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,20 @@ def execute_model(

t0 = time.time()

# TODO: change to EMPTY_MODEL_RUNNER_OUTPUT; right now this
# would be a breaking change, or clumsy to make backward-compatible
# with a conditional import
if not scheduler_output.total_num_scheduled_tokens:
# Return an empty ModelRunnerOutput if there's no work to do.
return ModelRunnerOutput(
req_ids=[],
req_id_to_index={},
sampled_token_ids=[],
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
)

self._update_states(scheduler_output)

model_input = self.prepare_model_input(scheduler_output)
Expand Down