From 73584aa9957d029325aecb6887dc1081fbaf92f2 Mon Sep 17 00:00:00 2001
From: Wallas Santos
Date: Fri, 28 Mar 2025 12:21:28 -0300
Subject: [PATCH 01/11] [V1] Fix assert failure when finishing a batch

Signed-off-by: Wallas Santos
---
 vllm_spyre/v1/worker/spyre_model_runner.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/vllm_spyre/v1/worker/spyre_model_runner.py b/vllm_spyre/v1/worker/spyre_model_runner.py
index 8ab175574..46c0cc83c 100644
--- a/vllm_spyre/v1/worker/spyre_model_runner.py
+++ b/vllm_spyre/v1/worker/spyre_model_runner.py
@@ -305,6 +305,20 @@ def execute_model(
 
         t0 = time.time()
 
+        # TODO: change to EMPTY_MODEL_RUNNER_OUTPUT, right now this
+        # will be a breaking change, or clumsy to make retrocompatible
+        # with conditional import
+        if not scheduler_output.total_num_scheduled_tokens:
+            # Return empty ModelRunnerOutput if there's no work to do.
+            return ModelRunnerOutput(
+                req_ids=[],
+                req_id_to_index={},
+                sampled_token_ids=[],
+                spec_token_ids=None,
+                logprobs=None,
+                prompt_logprobs_dict={},
+            )
+
         self._update_states(scheduler_output)
 
         model_input = self.prepare_model_input(scheduler_output)

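The TODO in PATCH 01 above refers to an EMPTY_MODEL_RUNNER_OUTPUT constant that
newer vLLM releases are expected to export. A minimal sketch of the
backward-compatible conditional import the comment alludes to could look like
the following; the module path vllm.v1.outputs and the constant name are
assumptions here, not something this series introduces:

    # Sketch only: tolerate vLLM versions that do not yet export the constant.
    try:
        from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT
    except ImportError:
        from vllm.v1.outputs import ModelRunnerOutput
        EMPTY_MODEL_RUNNER_OUTPUT = ModelRunnerOutput(
            req_ids=[],
            req_id_to_index={},
            sampled_token_ids=[],
            spec_token_ids=None,
            logprobs=None,
            prompt_logprobs_dict={},
        )

    # execute_model() could then simply do:
    #     if not scheduler_output.total_num_scheduled_tokens:
    #         return EMPTY_MODEL_RUNNER_OUTPUT
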
From b5a0e25822e67da1e58d27f1634f43e2da33c5d6 Mon Sep 17 00:00:00 2001
From: Wallas Santos
Date: Fri, 28 Mar 2025 17:05:34 -0300
Subject: [PATCH 02/11] update Dockerfile to use current latest vllm

Signed-off-by: Wallas Santos
---
 Dockerfile.spyre | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile.spyre b/Dockerfile.spyre
index cfdbfcc60..a96c5017c 100644
--- a/Dockerfile.spyre
+++ b/Dockerfile.spyre
@@ -19,7 +19,7 @@ RUN ln -sf $(which python${PYTHON_VERSION}) /usr/bin/python && \
 RUN git clone --depth 1 https://github.com/vllm-project/vllm.git \
     && cd vllm \
     && git fetch --tags \
-    && git checkout v0.8.0 \
+    && git checkout 5f063a8 \
     && python -m pip install --upgrade pip \
     && pip3 install torch=="2.5.1+cpu" --index-url https://download.pytorch.org/whl/cpu \
     && python use_existing_torch.py \

From e31a13d43b802a646006e58070f15ea5e88736c0 Mon Sep 17 00:00:00 2001
From: Wallas Santos
Date: Fri, 28 Mar 2025 17:17:13 -0300
Subject: [PATCH 03/11] fix dockerfile

Signed-off-by: Wallas Santos
---
 Dockerfile.spyre | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/Dockerfile.spyre b/Dockerfile.spyre
index a96c5017c..852bc19d9 100644
--- a/Dockerfile.spyre
+++ b/Dockerfile.spyre
@@ -16,10 +16,11 @@ RUN ln -sf $(which python${PYTHON_VERSION}) /usr/bin/python && \
     ln -sf $(which pip${PYTHON_VERSION}) /usr/bin/pip
 
 # Download and install vllm ###########################################################
-RUN git clone --depth 1 https://github.com/vllm-project/vllm.git \
+RUN VLLM_REVISION=5f063a80bda621da09e73fc63d4c59320259131d \
+    git clone --depth 1 https://github.com/vllm-project/vllm.git \
     && cd vllm \
-    && git fetch --tags \
-    && git checkout 5f063a8 \
+    && git fetch --depth 1 origin $VLLM_REVISION \
+    && git checkout $VLLM_REVISION \
     && python -m pip install --upgrade pip \
     && pip3 install torch=="2.5.1+cpu" --index-url https://download.pytorch.org/whl/cpu \
     && python use_existing_torch.py \

From 9c56bf0bb37a27ab512592990f8e5e4184bc1f73 Mon Sep 17 00:00:00 2001
From: Wallas Santos
Date: Mon, 31 Mar 2025 15:25:56 -0300
Subject: [PATCH 04/11] fix: dockerfile build

Signed-off-by: Wallas Santos
---
 Dockerfile.spyre | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Dockerfile.spyre b/Dockerfile.spyre
index 852bc19d9..2711ca87e 100644
--- a/Dockerfile.spyre
+++ b/Dockerfile.spyre
@@ -15,12 +15,12 @@ RUN microdnf update -y && microdnf install -y \
 RUN ln -sf $(which python${PYTHON_VERSION}) /usr/bin/python && \
     ln -sf $(which pip${PYTHON_VERSION}) /usr/bin/pip
 
+ENV VLLM_REVISION=5f063a80bda621da09e73fc63d4c59320259131d
 # Download and install vllm ###########################################################
-RUN VLLM_REVISION=5f063a80bda621da09e73fc63d4c59320259131d \
-    git clone --depth 1 https://github.com/vllm-project/vllm.git \
+RUN git clone --depth 1 https://github.com/vllm-project/vllm.git \
     && cd vllm \
-    && git fetch --depth 1 origin $VLLM_REVISION \
-    && git checkout $VLLM_REVISION \
+    && git fetch --depth 1 origin ${VLLM_REVISION} \
+    && git checkout ${VLLM_REVISION} \
     && python -m pip install --upgrade pip \
     && pip3 install torch=="2.5.1+cpu" --index-url https://download.pytorch.org/whl/cpu \
     && python use_existing_torch.py \

From 03cc587ec587d005a12d63a6312254fb47afd613 Mon Sep 17 00:00:00 2001
From: Wallas Santos
Date: Tue, 1 Apr 2025 15:10:08 -0300
Subject: [PATCH 05/11] disable stats for test and warn users

Signed-off-by: Wallas Santos
---
 tests/spyre_util.py             | 4 ++++
 vllm_spyre/v1/core/scheduler.py | 7 +++++++
 2 files changed, 11 insertions(+)

diff --git a/tests/spyre_util.py b/tests/spyre_util.py
index d1cde33fc..cf5544f81 100644
--- a/tests/spyre_util.py
+++ b/tests/spyre_util.py
@@ -64,6 +64,10 @@ def __init__(self,
         env = os.environ.copy()
         if env_dict is not None:
             env.update(env_dict)
+
+        # TODO: Re-enable stats for vllm-spyre plugin
+        # See: https://github.com/vllm-project/vllm-spyre/issues/68
+        vllm_serve_args.append("--disable-log-stats")
         self.proc = subprocess.Popen(
             ["vllm", "serve", model, *vllm_serve_args],
             env=env,
diff --git a/vllm_spyre/v1/core/scheduler.py b/vllm_spyre/v1/core/scheduler.py
index f31502479..2a9c79208 100644
--- a/vllm_spyre/v1/core/scheduler.py
+++ b/vllm_spyre/v1/core/scheduler.py
@@ -47,6 +47,13 @@ def __init__(self, *args, **kwargs) -> None:
 
         self.rejected_requests: set[str] = set()
 
+        if self.log_stats:
+            logger.warning_once(
+                "Log stats for V1 is not working properly. Requests that do "
+                "not fit in warmup shapes will crash the engine. "
+                "Pass --disable-log-stats to disable stats and this message. "
+                "See https://github.com/vllm-project/vllm-spyre/issues/68")
+
     def add_request(self, request: Request) -> None:
         """This override rejects requests that fit no warmup shape"""
         if len(

From 5fb35ad67c8fc66c033a1d9576a64948568c8b86 Mon Sep 17 00:00:00 2001
From: Wallas Santos
Date: Wed, 2 Apr 2025 15:42:27 -0300
Subject: [PATCH 06/11] Add workaround for requests that do not fit in warmup shapes

Revert "disable stats for test and warn users"

This reverts commit 03cc587ec587d005a12d63a6312254fb47afd613.

Signed-off-by: Wallas Santos
---
 tests/spyre_util.py             |  4 ----
 tests/test_spyre_online.py      | 10 +++++++++-
 vllm_spyre/v1/core/scheduler.py | 21 +++++++++------------
 3 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/tests/spyre_util.py b/tests/spyre_util.py
index cf5544f81..d1cde33fc 100644
--- a/tests/spyre_util.py
+++ b/tests/spyre_util.py
@@ -64,10 +64,6 @@ def __init__(self,
         env = os.environ.copy()
         if env_dict is not None:
             env.update(env_dict)
-
-        # TODO: Re-enable stats for vllm-spyre plugin
-        # See: https://github.com/vllm-project/vllm-spyre/issues/68
-        vllm_serve_args.append("--disable-log-stats")
         self.proc = subprocess.Popen(
             ["vllm", "serve", model, *vllm_serve_args],
             env=env,
diff --git a/tests/test_spyre_online.py b/tests/test_spyre_online.py
index 2e0dc9fba..a39582433 100644
--- a/tests/test_spyre_online.py
+++ b/tests/test_spyre_online.py
@@ -3,6 +3,7 @@
 
 from tests.spyre_util import (RemoteOpenAIServer, get_spyre_backend_list,
                               get_spyre_model_list)
+from vllm_spyre.v1.core.scheduler import NO_WARMUP_FIT_STOP_REASON
 
 
 @pytest.mark.parametrize("model", get_spyre_model_list())
@@ -71,4 +72,11 @@ def test_openai_serving(model, warmup_shape, backend, vllm_version):
                                         max_tokens=25)
 
     assert len(completion.choices) == 1
-    assert len(completion.choices[0].text) == 0
+
+    # TODO: V0 and V1 have slightly different behavior for requests
+    # that do not fit in a warmup shape
+    if vllm_version == 'V0':
+        assert len(completion.choices[0].text) == 0
+    elif vllm_version == 'V1':
+        assert completion.choices[0].stop_reason == \
+            NO_WARMUP_FIT_STOP_REASON
diff --git a/vllm_spyre/v1/core/scheduler.py b/vllm_spyre/v1/core/scheduler.py
index 2a9c79208..a90904bee 100644
--- a/vllm_spyre/v1/core/scheduler.py
+++ b/vllm_spyre/v1/core/scheduler.py
@@ -22,6 +22,8 @@
 
 logger = init_logger(__name__)
 
+NO_WARMUP_FIT_STOP_REASON = "Request did not fit any warmup shape"
+
 
 class SpyreScheduler(Scheduler):
     """Small extension of the V1 scheduler that adds constraints for Spyre:
@@ -47,13 +49,6 @@ def __init__(self, *args, **kwargs) -> None:
 
         self.rejected_requests: set[str] = set()
 
-        if self.log_stats:
-            logger.warning_once(
-                "Log stats for V1 is not working properly. Requests that do "
-                "not fit in warmup shapes will crash the engine. "
-                "Pass --disable-log-stats to disable stats and this message. "
-                "See https://github.com/vllm-project/vllm-spyre/issues/68")
-
     def add_request(self, request: Request) -> None:
         """This override rejects requests that fit no warmup shape"""
         if len(
@@ -192,11 +187,13 @@ def _reject_from_queue(self,
         for request in rejected_requests:
             queue.remove(request)
             reject_outputs.append(
-                EngineCoreOutput(request.request_id,
-                                 new_token_ids=[],
-                                 finish_reason=FinishReason.ABORT,
-                                 stop_reason="Request did not fit any warmup "
-                                 "shape"))
+                EngineCoreOutput(
+                    request.request_id,
+                    # TODO: FIXME
+                    # Dummy token to prevent stats collection crash
+                    new_token_ids=[0],
+                    finish_reason=FinishReason.ABORT,
+                    stop_reason=NO_WARMUP_FIT_STOP_REASON))
             request.status = RequestStatus.FINISHED_ABORTED
             self._free_request(request)
             self.rejected_requests.remove(request.request_id)

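With PATCH 06, V1 no longer returns an empty completion for a request that fits
no warmup shape; the scheduler aborts it with a dedicated stop reason. A short
illustrative client-side check, mirroring the assertions added to
tests/test_spyre_online.py (the server address, model name and prompt below are
placeholders, not part of the patch):

    # Sketch: distinguishing a rejected request from a normal completion on V1.
    import openai

    from vllm_spyre.v1.core.scheduler import NO_WARMUP_FIT_STOP_REASON

    client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
    completion = client.completions.create(
        model="<served-model-name>",
        prompt="a prompt longer than every configured warmup shape",
        max_tokens=25)

    if completion.choices[0].stop_reason == NO_WARMUP_FIT_STOP_REASON:
        # SpyreScheduler aborted the request instead of scheduling it.
        print("rejected: request did not fit any warmup shape")
    else:
        print(completion.choices[0].text)
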
" - "See https://github.com/vllm-project/vllm-spyre/issues/68") - def add_request(self, request: Request) -> None: """This override rejects requests that fit no warmup shape""" if len( @@ -192,11 +187,13 @@ def _reject_from_queue(self, for request in rejected_requests: queue.remove(request) reject_outputs.append( - EngineCoreOutput(request.request_id, - new_token_ids=[], - finish_reason=FinishReason.ABORT, - stop_reason="Request did not fit any warmup " - "shape")) + EngineCoreOutput( + request.request_id, + # TODO: FIXME + # Dummy token prevent stats collection crash + new_token_ids=[0], + finish_reason=FinishReason.ABORT, + stop_reason=NO_WARMUP_FIT_STOP_REASON)) request.status = RequestStatus.FINISHED_ABORTED self._free_request(request) self.rejected_requests.remove(request.request_id) From 1eba6e3a37aa5c0bcebf0a3441aab338f6bef007 Mon Sep 17 00:00:00 2001 From: Wallas Santos Date: Fri, 4 Apr 2025 15:39:57 -0300 Subject: [PATCH 07/11] fix: change dummy token id Signed-off-by: Wallas Santos --- vllm_spyre/v1/core/scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_spyre/v1/core/scheduler.py b/vllm_spyre/v1/core/scheduler.py index a90904bee..f0f8f3537 100644 --- a/vllm_spyre/v1/core/scheduler.py +++ b/vllm_spyre/v1/core/scheduler.py @@ -191,7 +191,7 @@ def _reject_from_queue(self, request.request_id, # TODO: FIXME # Dummy token prevent stats collection crash - new_token_ids=[0], + new_token_ids=[-1], finish_reason=FinishReason.ABORT, stop_reason=NO_WARMUP_FIT_STOP_REASON)) request.status = RequestStatus.FINISHED_ABORTED From 1ae7d614a95d6b320de7f940600e62ac72ed38fb Mon Sep 17 00:00:00 2001 From: Wallas Santos Date: Fri, 4 Apr 2025 15:52:13 -0300 Subject: [PATCH 08/11] fix: more workarounds Signed-off-by: Wallas Santos --- tests/spyre_util.py | 6 +++++- tests/test_spyre_online.py | 8 ++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/spyre_util.py b/tests/spyre_util.py index d1cde33fc..e8305c8a0 100644 --- a/tests/spyre_util.py +++ b/tests/spyre_util.py @@ -163,10 +163,14 @@ def generate_spyre_vllm_output(model: str, prompts: List[str], vllm_outputs = vllm_model.generate(prompts, sampling_params) results = [] + for req_output in vllm_outputs: result = {} result['text'] = req_output.outputs[0].text - result['token_ids'] = tuple(req_output.outputs[0].token_ids) + # TODO: Workaround for V1, if request does not fit in a warmup shape + # token_ids may be filled with -1. 
+ token_ids = [t for t in req_output.outputs[0].token_ids if t >=0] + result['token_ids'] = tuple(token_ids) result['tokens'] = tuple([ req_output.outputs[0].logprobs[i][t].decoded_token for i, t in enumerate(result['token_ids']) diff --git a/tests/test_spyre_online.py b/tests/test_spyre_online.py index a39582433..0fe8b495c 100644 --- a/tests/test_spyre_online.py +++ b/tests/test_spyre_online.py @@ -75,8 +75,8 @@ def test_openai_serving(model, warmup_shape, backend, vllm_version): # TODO: V0 and V1 have slight different behavior for requests # that do not fit in a warmup shape - if vllm_version == 'V0': - assert len(completion.choices[0].text) == 0 - elif vllm_version == 'V1': + + assert len(completion.choices[0].text) == 0 + if vllm_version == 'V1': assert completion.choices[0].stop_reason == \ - NO_WARMUP_FIT_STOP_REASON + NO_WARMUP_FIT_STOP_REASON From db229e6cac12ff0a18e35f58bc4a5e57693b0b03 Mon Sep 17 00:00:00 2001 From: Wallas Santos Date: Fri, 4 Apr 2025 16:04:47 -0300 Subject: [PATCH 09/11] fix linting Signed-off-by: Wallas Santos --- tests/spyre_util.py | 4 ++-- tests/test_spyre_online.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/spyre_util.py b/tests/spyre_util.py index e8305c8a0..bf7bce20b 100644 --- a/tests/spyre_util.py +++ b/tests/spyre_util.py @@ -163,13 +163,13 @@ def generate_spyre_vllm_output(model: str, prompts: List[str], vllm_outputs = vllm_model.generate(prompts, sampling_params) results = [] - + for req_output in vllm_outputs: result = {} result['text'] = req_output.outputs[0].text # TODO: Workaround for V1, if request does not fit in a warmup shape # token_ids may be filled with -1. - token_ids = [t for t in req_output.outputs[0].token_ids if t >=0] + token_ids = [t for t in req_output.outputs[0].token_ids if t >= 0] result['token_ids'] = tuple(token_ids) result['tokens'] = tuple([ req_output.outputs[0].logprobs[i][t].decoded_token diff --git a/tests/test_spyre_online.py b/tests/test_spyre_online.py index 0fe8b495c..6de7bfa73 100644 --- a/tests/test_spyre_online.py +++ b/tests/test_spyre_online.py @@ -75,8 +75,8 @@ def test_openai_serving(model, warmup_shape, backend, vllm_version): # TODO: V0 and V1 have slight different behavior for requests # that do not fit in a warmup shape - + assert len(completion.choices[0].text) == 0 if vllm_version == 'V1': assert completion.choices[0].stop_reason == \ - NO_WARMUP_FIT_STOP_REASON + NO_WARMUP_FIT_STOP_REASON From 538cbce9cc06ace8fd54495960fa3afedd8ad2ef Mon Sep 17 00:00:00 2001 From: Wallas Santos Date: Fri, 4 Apr 2025 16:11:15 -0300 Subject: [PATCH 10/11] feat: upgrade vllm Signed-off-by: Wallas Santos --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e4992d60e..88ce135c2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,7 @@ override-dependencies = [ ] [tool.uv.sources] -vllm = { git = "https://github.com/vllm-project/vllm", rev = "v0.8.0" } +vllm = { git = "https://github.com/vllm-project/vllm", rev = "v0.8.3" } [tool.ruff] # Allow lines to be as long as 80. 
From 359d06b79573afc4b215d3a241118b7a067dd0c5 Mon Sep 17 00:00:00 2001
From: Wallas Santos
Date: Fri, 4 Apr 2025 16:22:58 -0300
Subject: [PATCH 11/11] trying to fix docker build

Signed-off-by: Wallas Santos
---
 Dockerfile.spyre | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Dockerfile.spyre b/Dockerfile.spyre
index af79dcfa5..802d5cfdb 100644
--- a/Dockerfile.spyre
+++ b/Dockerfile.spyre
@@ -23,10 +23,11 @@ RUN pip install torch==2.5.1+cpu --index-url https://download.pytorch.org/whl/cp
 # Install uv
 RUN pip install uv
 # Install the plugin in a new venv, along with dev deps to test with
+ENV VLLM_TARGET_DEVICE=empty
 RUN cd /workspace/vllm-spyre \
     && uv venv .venv --system-site-packages \
     && source .venv/bin/activate \
-    && VLLM_TARGET_DEVICE=empty uv pip install -v -e . --system \
+    && uv pip install -v -e . --system \
     && uv sync --frozen --group dev
 
 ENV VLLM_PLUGINS=spyre

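PATCH 11 exports VLLM_TARGET_DEVICE=empty at the image level instead of
prefixing a single command, so the following uv install and sync steps all run
with it set. A purely illustrative sanity check that can be run inside the
finished image (not part of the series):

    # Sketch: confirm the editable install and the plugin env var inside the image.
    import os

    import vllm_spyre

    print("vllm_spyre installed at:", vllm_spyre.__file__)
    print("VLLM_PLUGINS =", os.environ.get("VLLM_PLUGINS"))  # expected: "spyre"
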