From 73584aa9957d029325aecb6887dc1081fbaf92f2 Mon Sep 17 00:00:00 2001
From: Wallas Santos
Date: Fri, 28 Mar 2025 12:21:28 -0300
Subject: [PATCH 01/11] [V1] Fix assert failure when finishing a batch

Signed-off-by: Wallas Santos
---
 vllm_spyre/v1/worker/spyre_model_runner.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/vllm_spyre/v1/worker/spyre_model_runner.py b/vllm_spyre/v1/worker/spyre_model_runner.py
index 8ab175574..46c0cc83c 100644
--- a/vllm_spyre/v1/worker/spyre_model_runner.py
+++ b/vllm_spyre/v1/worker/spyre_model_runner.py
@@ -305,6 +305,20 @@ def execute_model(
 
         t0 = time.time()
 
+        # TODO: change to EMPTY_MODEL_RUNNER_OUTPUT, right now this
+        # will be a breaking change, or clumsy to make retrocompatible
+        # with conditional import
+        if not scheduler_output.total_num_scheduled_tokens:
+            # Return empty ModelRunnerOutput if there's no work to do.
+            return ModelRunnerOutput(
+                req_ids=[],
+                req_id_to_index={},
+                sampled_token_ids=[],
+                spec_token_ids=None,
+                logprobs=None,
+                prompt_logprobs_dict={},
+            )
+
         self._update_states(scheduler_output)
 
         model_input = self.prepare_model_input(scheduler_output)

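The TODO in PATCH 01 above refers to an EMPTY_MODEL_RUNNER_OUTPUT constant that
newer vLLM releases are expected to export. A minimal sketch of the
backward-compatible conditional import the comment alludes to could look like
the following; the module path vllm.v1.outputs and the constant name are
assumptions here, not something this series introduces:

    # Sketch only: tolerate vLLM versions that do not yet export the constant.
    try:
        from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT
    except ImportError:
        from vllm.v1.outputs import ModelRunnerOutput
        EMPTY_MODEL_RUNNER_OUTPUT = ModelRunnerOutput(
            req_ids=[],
            req_id_to_index={},
            sampled_token_ids=[],
            spec_token_ids=None,
            logprobs=None,
            prompt_logprobs_dict={},
        )

    # execute_model() could then simply do:
    #     if not scheduler_output.total_num_scheduled_tokens:
    #         return EMPTY_MODEL_RUNNER_OUTPUT
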
From b5a0e25822e67da1e58d27f1634f43e2da33c5d6 Mon Sep 17 00:00:00 2001
From: Wallas Santos
Date: Fri, 28 Mar 2025 17:05:34 -0300
Subject: [PATCH 02/11] update Dockerfile to use current latest vllm

Signed-off-by: Wallas Santos
---
 Dockerfile.spyre | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile.spyre b/Dockerfile.spyre
index cfdbfcc60..a96c5017c 100644
--- a/Dockerfile.spyre
+++ b/Dockerfile.spyre
@@ -19,7 +19,7 @@ RUN ln -sf $(which python${PYTHON_VERSION}) /usr/bin/python && \
 RUN git clone --depth 1 https://github.com/vllm-project/vllm.git \
     && cd vllm \
     && git fetch --tags \
-    && git checkout v0.8.0 \
+    && git checkout 5f063a8 \
     && python -m pip install --upgrade pip \
     && pip3 install torch=="2.5.1+cpu" --index-url https://download.pytorch.org/whl/cpu \
     && python use_existing_torch.py \

From e31a13d43b802a646006e58070f15ea5e88736c0 Mon Sep 17 00:00:00 2001
From: Wallas Santos
Date: Fri, 28 Mar 2025 17:17:13 -0300
Subject: [PATCH 03/11] fix dockerfile

Signed-off-by: Wallas Santos
---
 Dockerfile.spyre | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/Dockerfile.spyre b/Dockerfile.spyre
index a96c5017c..852bc19d9 100644
--- a/Dockerfile.spyre
+++ b/Dockerfile.spyre
@@ -16,10 +16,11 @@ RUN ln -sf $(which python${PYTHON_VERSION}) /usr/bin/python && \
     ln -sf $(which pip${PYTHON_VERSION}) /usr/bin/pip
 
 # Download and install vllm ###########################################################
-RUN git clone --depth 1 https://github.com/vllm-project/vllm.git \
+RUN VLLM_REVISION=5f063a80bda621da09e73fc63d4c59320259131d \
+    git clone --depth 1 https://github.com/vllm-project/vllm.git \
     && cd vllm \
-    && git fetch --tags \
-    && git checkout 5f063a8 \
+    && git fetch --depth 1 origin $VLLM_REVISION \
+    && git checkout $VLLM_REVISION \
     && python -m pip install --upgrade pip \
     && pip3 install torch=="2.5.1+cpu" --index-url https://download.pytorch.org/whl/cpu \
     && python use_existing_torch.py \

From 9c56bf0bb37a27ab512592990f8e5e4184bc1f73 Mon Sep 17 00:00:00 2001
From: Wallas Santos
Date: Mon, 31 Mar 2025 15:25:56 -0300
Subject: [PATCH 04/11] fix: dockerfile build

Signed-off-by: Wallas Santos
---
 Dockerfile.spyre | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Dockerfile.spyre b/Dockerfile.spyre
index 852bc19d9..2711ca87e 100644
--- a/Dockerfile.spyre
+++ b/Dockerfile.spyre
@@ -15,12 +15,12 @@ RUN microdnf update -y && microdnf install -y \
 RUN ln -sf $(which python${PYTHON_VERSION}) /usr/bin/python && \
     ln -sf $(which pip${PYTHON_VERSION}) /usr/bin/pip
 
+ENV VLLM_REVISION=5f063a80bda621da09e73fc63d4c59320259131d
 # Download and install vllm ###########################################################
-RUN VLLM_REVISION=5f063a80bda621da09e73fc63d4c59320259131d \
-    git clone --depth 1 https://github.com/vllm-project/vllm.git \
+RUN git clone --depth 1 https://github.com/vllm-project/vllm.git \
     && cd vllm \
-    && git fetch --depth 1 origin $VLLM_REVISION \
-    && git checkout $VLLM_REVISION \
+    && git fetch --depth 1 origin ${VLLM_REVISION} \
+    && git checkout ${VLLM_REVISION} \
     && python -m pip install --upgrade pip \
     && pip3 install torch=="2.5.1+cpu" --index-url https://download.pytorch.org/whl/cpu \
     && python use_existing_torch.py \

From 03cc587ec587d005a12d63a6312254fb47afd613 Mon Sep 17 00:00:00 2001
From: Wallas Santos
Date: Tue, 1 Apr 2025 15:10:08 -0300
Subject: [PATCH 05/11] disable stats for test and warn users

Signed-off-by: Wallas Santos
---
 tests/spyre_util.py             | 4 ++++
 vllm_spyre/v1/core/scheduler.py | 7 +++++++
 2 files changed, 11 insertions(+)

diff --git a/tests/spyre_util.py b/tests/spyre_util.py
index d1cde33fc..cf5544f81 100644
--- a/tests/spyre_util.py
+++ b/tests/spyre_util.py
@@ -64,6 +64,10 @@ def __init__(self,
         env = os.environ.copy()
         if env_dict is not None:
             env.update(env_dict)
+
+        # TODO: Re-enable stats for vllm-spyre plugin
+        # See: https://github.com/vllm-project/vllm-spyre/issues/68
+        vllm_serve_args.append("--disable-log-stats")
         self.proc = subprocess.Popen(
             ["vllm", "serve", model, *vllm_serve_args],
             env=env,
diff --git a/vllm_spyre/v1/core/scheduler.py b/vllm_spyre/v1/core/scheduler.py
index f31502479..2a9c79208 100644
--- a/vllm_spyre/v1/core/scheduler.py
+++ b/vllm_spyre/v1/core/scheduler.py
@@ -47,6 +47,13 @@ def __init__(self, *args, **kwargs) -> None:
 
         self.rejected_requests: set[str] = set()
 
+        if self.log_stats:
+            logger.warning_once(
+                "Log stats for V1 is not working properly. Requests that do "
+                "not fit in warmup shapes will crash the engine. "
+                "Pass --disable-log-stats to disable stats and this message. "
+                "See https://github.com/vllm-project/vllm-spyre/issues/68")
+
     def add_request(self, request: Request) -> None:
         """This override rejects requests that fit no warmup shape"""
         if len(

From 5fb35ad67c8fc66c033a1d9576a64948568c8b86 Mon Sep 17 00:00:00 2001
From: Wallas Santos
Date: Wed, 2 Apr 2025 15:42:27 -0300
Subject: [PATCH 06/11] Add workaround for requests that do not fit in warmup shapes

Revert "disable stats for test and warn users"

This reverts commit 03cc587ec587d005a12d63a6312254fb47afd613.

Signed-off-by: Wallas Santos
---
 tests/spyre_util.py             |  4 ----
 tests/test_spyre_online.py      | 10 +++++++++-
 vllm_spyre/v1/core/scheduler.py | 21 +++++++++------------
 3 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/tests/spyre_util.py b/tests/spyre_util.py
index cf5544f81..d1cde33fc 100644
--- a/tests/spyre_util.py
+++ b/tests/spyre_util.py
@@ -64,10 +64,6 @@ def __init__(self,
         env = os.environ.copy()
         if env_dict is not None:
             env.update(env_dict)
-
-        # TODO: Re-enable stats for vllm-spyre plugin
-        # See: https://github.com/vllm-project/vllm-spyre/issues/68
-        vllm_serve_args.append("--disable-log-stats")
         self.proc = subprocess.Popen(
             ["vllm", "serve", model, *vllm_serve_args],
             env=env,
diff --git a/tests/test_spyre_online.py b/tests/test_spyre_online.py
index 2e0dc9fba..a39582433 100644
--- a/tests/test_spyre_online.py
+++ b/tests/test_spyre_online.py
@@ -3,6 +3,7 @@
 
 from tests.spyre_util import (RemoteOpenAIServer, get_spyre_backend_list,
                               get_spyre_model_list)
+from vllm_spyre.v1.core.scheduler import NO_WARMUP_FIT_STOP_REASON
 
 
 @pytest.mark.parametrize("model", get_spyre_model_list())
@@ -71,4 +72,11 @@ def test_openai_serving(model, warmup_shape, backend, vllm_version):
                                         max_tokens=25)
 
     assert len(completion.choices) == 1
-    assert len(completion.choices[0].text) == 0
+
+    # TODO: V0 and V1 have slightly different behavior for requests
+    # that do not fit in a warmup shape
+    if vllm_version == 'V0':
+        assert len(completion.choices[0].text) == 0
+    elif vllm_version == 'V1':
+        assert completion.choices[0].stop_reason == \
+            NO_WARMUP_FIT_STOP_REASON
diff --git a/vllm_spyre/v1/core/scheduler.py b/vllm_spyre/v1/core/scheduler.py
index 2a9c79208..a90904bee 100644
--- a/vllm_spyre/v1/core/scheduler.py
+++ b/vllm_spyre/v1/core/scheduler.py
@@ -22,6 +22,8 @@
 
 logger = init_logger(__name__)
 
+NO_WARMUP_FIT_STOP_REASON = "Request did not fit any warmup shape"
+
 
 class SpyreScheduler(Scheduler):
     """Small extension of the V1 scheduler that adds constraints for Spyre:
@@ -47,13 +49,6 @@ def __init__(self, *args, **kwargs) -> None:
 
         self.rejected_requests: set[str] = set()
 
-        if self.log_stats:
-            logger.warning_once(
-                "Log stats for V1 is not working properly. Requests that do "
-                "not fit in warmup shapes will crash the engine. "
-                "Pass --disable-log-stats to disable stats and this message. "
-                "See https://github.com/vllm-project/vllm-spyre/issues/68")
-
     def add_request(self, request: Request) -> None:
         """This override rejects requests that fit no warmup shape"""
         if len(
@@ -192,11 +187,13 @@ def _reject_from_queue(self,
         for request in rejected_requests:
             queue.remove(request)
             reject_outputs.append(
-                EngineCoreOutput(request.request_id,
-                                 new_token_ids=[],
-                                 finish_reason=FinishReason.ABORT,
-                                 stop_reason="Request did not fit any warmup "
-                                 "shape"))
+                EngineCoreOutput(
+                    request.request_id,
+                    # TODO: FIXME
+                    # Dummy token to prevent stats collection crash
+                    new_token_ids=[0],
+                    finish_reason=FinishReason.ABORT,
+                    stop_reason=NO_WARMUP_FIT_STOP_REASON))
             request.status = RequestStatus.FINISHED_ABORTED
             self._free_request(request)
             self.rejected_requests.remove(request.request_id)

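With PATCH 06, V1 no longer returns an empty completion for a request that fits
no warmup shape; the scheduler aborts it with a dedicated stop reason. A short
illustrative client-side check, mirroring the assertions added to
tests/test_spyre_online.py (the server address, model name and prompt below are
placeholders, not part of the patch):

    # Sketch: distinguishing a rejected request from a normal completion on V1.
    import openai

    from vllm_spyre.v1.core.scheduler import NO_WARMUP_FIT_STOP_REASON

    client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
    completion = client.completions.create(
        model="<served-model-name>",
        prompt="a prompt longer than every configured warmup shape",
        max_tokens=25)

    if completion.choices[0].stop_reason == NO_WARMUP_FIT_STOP_REASON:
        # SpyreScheduler aborted the request instead of scheduling it.
        print("rejected: request did not fit any warmup shape")
    else:
        print(completion.choices[0].text)
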
" - "See https://github.com/vllm-project/vllm-spyre/issues/68") - def add_request(self, request: Request) -> None: """This override rejects requests that fit no warmup shape""" if len( @@ -192,11 +187,13 @@ def _reject_from_queue(self, for request in rejected_requests: queue.remove(request) reject_outputs.append( - EngineCoreOutput(request.request_id, - new_token_ids=[], - finish_reason=FinishReason.ABORT, - stop_reason="Request did not fit any warmup " - "shape")) + EngineCoreOutput( + request.request_id, + # TODO: FIXME + # Dummy token prevent stats collection crash + new_token_ids=[0], + finish_reason=FinishReason.ABORT, + stop_reason=NO_WARMUP_FIT_STOP_REASON)) request.status = RequestStatus.FINISHED_ABORTED self._free_request(request) self.rejected_requests.remove(request.request_id) From 1eba6e3a37aa5c0bcebf0a3441aab338f6bef007 Mon Sep 17 00:00:00 2001 From: Wallas Santos Date: Fri, 4 Apr 2025 15:39:57 -0300 Subject: [PATCH 07/11] fix: change dummy token id Signed-off-by: Wallas Santos --- vllm_spyre/v1/core/scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_spyre/v1/core/scheduler.py b/vllm_spyre/v1/core/scheduler.py index a90904bee..f0f8f3537 100644 --- a/vllm_spyre/v1/core/scheduler.py +++ b/vllm_spyre/v1/core/scheduler.py @@ -191,7 +191,7 @@ def _reject_from_queue(self, request.request_id, # TODO: FIXME # Dummy token prevent stats collection crash - new_token_ids=[0], + new_token_ids=[-1], finish_reason=FinishReason.ABORT, stop_reason=NO_WARMUP_FIT_STOP_REASON)) request.status = RequestStatus.FINISHED_ABORTED From 1ae7d614a95d6b320de7f940600e62ac72ed38fb Mon Sep 17 00:00:00 2001 From: Wallas Santos Date: Fri, 4 Apr 2025 15:52:13 -0300 Subject: [PATCH 08/11] fix: more workarounds Signed-off-by: Wallas Santos --- tests/spyre_util.py | 6 +++++- tests/test_spyre_online.py | 8 ++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/spyre_util.py b/tests/spyre_util.py index d1cde33fc..e8305c8a0 100644 --- a/tests/spyre_util.py +++ b/tests/spyre_util.py @@ -163,10 +163,14 @@ def generate_spyre_vllm_output(model: str, prompts: List[str], vllm_outputs = vllm_model.generate(prompts, sampling_params) results = [] + for req_output in vllm_outputs: result = {} result['text'] = req_output.outputs[0].text - result['token_ids'] = tuple(req_output.outputs[0].token_ids) + # TODO: Workaround for V1, if request does not fit in a warmup shape + # token_ids may be filled with -1. 
+ token_ids = [t for t in req_output.outputs[0].token_ids if t >=0] + result['token_ids'] = tuple(token_ids) result['tokens'] = tuple([ req_output.outputs[0].logprobs[i][t].decoded_token for i, t in enumerate(result['token_ids']) diff --git a/tests/test_spyre_online.py b/tests/test_spyre_online.py index a39582433..0fe8b495c 100644 --- a/tests/test_spyre_online.py +++ b/tests/test_spyre_online.py @@ -75,8 +75,8 @@ def test_openai_serving(model, warmup_shape, backend, vllm_version): # TODO: V0 and V1 have slight different behavior for requests # that do not fit in a warmup shape - if vllm_version == 'V0': - assert len(completion.choices[0].text) == 0 - elif vllm_version == 'V1': + + assert len(completion.choices[0].text) == 0 + if vllm_version == 'V1': assert completion.choices[0].stop_reason == \ - NO_WARMUP_FIT_STOP_REASON + NO_WARMUP_FIT_STOP_REASON From db229e6cac12ff0a18e35f58bc4a5e57693b0b03 Mon Sep 17 00:00:00 2001 From: Wallas Santos Date: Fri, 4 Apr 2025 16:04:47 -0300 Subject: [PATCH 09/11] fix linting Signed-off-by: Wallas Santos --- tests/spyre_util.py | 4 ++-- tests/test_spyre_online.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/spyre_util.py b/tests/spyre_util.py index e8305c8a0..bf7bce20b 100644 --- a/tests/spyre_util.py +++ b/tests/spyre_util.py @@ -163,13 +163,13 @@ def generate_spyre_vllm_output(model: str, prompts: List[str], vllm_outputs = vllm_model.generate(prompts, sampling_params) results = [] - + for req_output in vllm_outputs: result = {} result['text'] = req_output.outputs[0].text # TODO: Workaround for V1, if request does not fit in a warmup shape # token_ids may be filled with -1. - token_ids = [t for t in req_output.outputs[0].token_ids if t >=0] + token_ids = [t for t in req_output.outputs[0].token_ids if t >= 0] result['token_ids'] = tuple(token_ids) result['tokens'] = tuple([ req_output.outputs[0].logprobs[i][t].decoded_token diff --git a/tests/test_spyre_online.py b/tests/test_spyre_online.py index 0fe8b495c..6de7bfa73 100644 --- a/tests/test_spyre_online.py +++ b/tests/test_spyre_online.py @@ -75,8 +75,8 @@ def test_openai_serving(model, warmup_shape, backend, vllm_version): # TODO: V0 and V1 have slight different behavior for requests # that do not fit in a warmup shape - + assert len(completion.choices[0].text) == 0 if vllm_version == 'V1': assert completion.choices[0].stop_reason == \ - NO_WARMUP_FIT_STOP_REASON + NO_WARMUP_FIT_STOP_REASON From 538cbce9cc06ace8fd54495960fa3afedd8ad2ef Mon Sep 17 00:00:00 2001 From: Wallas Santos Date: Fri, 4 Apr 2025 16:11:15 -0300 Subject: [PATCH 10/11] feat: upgrade vllm Signed-off-by: Wallas Santos --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e4992d60e..88ce135c2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,7 @@ override-dependencies = [ ] [tool.uv.sources] -vllm = { git = "https://github.com/vllm-project/vllm", rev = "v0.8.0" } +vllm = { git = "https://github.com/vllm-project/vllm", rev = "v0.8.3" } [tool.ruff] # Allow lines to be as long as 80. 
From 359d06b79573afc4b215d3a241118b7a067dd0c5 Mon Sep 17 00:00:00 2001
From: Wallas Santos
Date: Fri, 4 Apr 2025 16:22:58 -0300
Subject: [PATCH 11/11] trying to fix docker build

Signed-off-by: Wallas Santos
---
 Dockerfile.spyre | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Dockerfile.spyre b/Dockerfile.spyre
index af79dcfa5..802d5cfdb 100644
--- a/Dockerfile.spyre
+++ b/Dockerfile.spyre
@@ -23,10 +23,11 @@ RUN pip install torch==2.5.1+cpu --index-url https://download.pytorch.org/whl/cp
 # Install uv
 RUN pip install uv
 # Install the plugin in a new venv, along with dev deps to test with
+ENV VLLM_TARGET_DEVICE=empty
 RUN cd /workspace/vllm-spyre \
     && uv venv .venv --system-site-packages \
     && source .venv/bin/activate \
-    && VLLM_TARGET_DEVICE=empty uv pip install -v -e . --system \
+    && uv pip install -v -e . --system \
     && uv sync --frozen --group dev
 
 ENV VLLM_PLUGINS=spyre

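PATCH 11 exports VLLM_TARGET_DEVICE=empty at the image level instead of
prefixing a single command, so the following uv install and sync steps all run
with it set. A purely illustrative sanity check that can be run inside the
finished image (not part of the series):

    # Sketch: confirm the editable install and the plugin env var inside the image.
    import os

    import vllm_spyre

    print("vllm_spyre installed at:", vllm_spyre.__file__)
    print("VLLM_PLUGINS =", os.environ.get("VLLM_PLUGINS"))  # expected: "spyre"
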