4 changes: 2 additions & 2 deletions docs/user_guide/configuration.md
@@ -13,8 +13,8 @@ To run inference on IBM Spyre Accelerators, the backend should be set as:
| --- | --- | --- | --- |
| Decoder | v0 | sendnn | V0 support for decoder models is deprecated |
| Decoder | v1 | sendnn | |
- | Embedding | v0 | sendnn | |
- | Embedding | v1 | N/A | Embedding models are not yet supported on V1 |
+ | Embedding | v0 | sendnn | V0 support for embedding models is deprecated|
+ | Embedding | v1 | sendnn | |

## Batching Modes

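
To make the updated table concrete, here is a minimal offline sketch of running an embedding model on the now-default V1 engine, with the backend selected through `VLLM_SPYRE_DYNAMO_BACKEND`. The model name is only an illustration, and `eager` stands in for `sendnn` so the sketch can run without Spyre hardware:

```python
import os

# Backend selection per the table above. "eager" is a stand-in so this sketch
# runs without Spyre hardware; on IBM Spyre it would be "sendnn".
os.environ.setdefault("VLLM_SPYRE_DYNAMO_BACKEND", "eager")

from vllm import LLM  # noqa: E402  (environment is configured first)

# Illustrative embedding model; VLLM_USE_V1 is not set because V1 is the default.
llm = LLM(model="sentence-transformers/all-roberta-large-v1",
          max_model_len=256,
          tensor_parallel_size=1)

# For embedding models, encode() returns one pooled output per prompt.
outputs = llm.encode(["Hello from the Spyre backend guide."])
print(f"got {len(outputs)} pooled output(s)")
```
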
2 changes: 1 addition & 1 deletion docs/user_guide/supported_features.md
@@ -10,7 +10,7 @@ This table summarize the status of features on Spyre. By default, those features
| Prompt Adapter | ⛔ | Being deprecated in vLLM [vllm#13981](https://github.com/vllm-project/vllm/issues/13981) |
| Speculative Decoding | 🗓️ | |
| Guided Decoding | 🗓️ | |
- | Pooling | ⚠️ | Works with V0. V1 still being developed in vLLM [vllm#18052](https://github.com/vllm-project/vllm/issues/18052) |
+ | Pooling | | |
Review thread on this row:

Collaborator:
We already have Embedding models at the end of this table - is that still needed?

Collaborator (Author):
question for @maxdebayser

Collaborator:
Since we don't support all pooling applications, I think it's better to remove this and leave just Embedding below.

| Enc-dec | ⛔ | No plans for now |
| Multi Modality | 🗓️ | |
| LogProbs | ✅ | |
1 change: 0 additions & 1 deletion examples/offline_inference/cb_spyre_inference.py
@@ -42,7 +42,6 @@
if "VLLM_SPYRE_DYNAMO_BACKEND" not in os.environ:
os.environ['VLLM_SPYRE_DYNAMO_BACKEND'] = 'eager'
os.environ['VLLM_SPYRE_USE_CB'] = '1'
os.environ['VLLM_USE_V1'] = '1'

template = (
"Below is an instruction that describes a task. Write a response that "
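
With `VLLM_USE_V1` dropped, the example only needs the Spyre-specific variables before constructing the engine. A condensed sketch of the resulting flow is below; the model name, prompt, and sampling settings are placeholders rather than the example's actual values:

```python
import os

from vllm import LLM, SamplingParams

# Spyre-specific settings only; the V1 engine is selected by default.
if "VLLM_SPYRE_DYNAMO_BACKEND" not in os.environ:
    os.environ["VLLM_SPYRE_DYNAMO_BACKEND"] = "eager"
os.environ["VLLM_SPYRE_USE_CB"] = "1"  # continuous batching

# Placeholder model and prompt, used purely for illustration.
llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", max_model_len=2048)
params = SamplingParams(max_tokens=20, temperature=0.0)

prompt = ("Below is an instruction that describes a task. Write a response "
          "that appropriately completes the request.\n\n"
          "### Instruction:\nProvide a recipe for chicken soup.\n\n"
          "### Response:")
for out in llm.generate([prompt], params):
    print(out.outputs[0].text)
```
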
1 change: 0 additions & 1 deletion examples/offline_inference/long_context.py
@@ -68,7 +68,6 @@
if "VLLM_SPYRE_DYNAMO_BACKEND" not in os.environ:
os.environ['VLLM_SPYRE_DYNAMO_BACKEND'] = 'eager'
os.environ['VLLM_SPYRE_USE_CB'] = '1'
os.environ['VLLM_USE_V1'] = '1'

template = ("Summarize the following code: \n\n{}")

21 changes: 2 additions & 19 deletions tests/conftest.py
@@ -1,19 +1,5 @@
# 🌶️🌶️🌶️ Hack to allow testing of both engines
import os

# If `VLLM_USE_V1=1` is set upon first vLLM import, then there is a side effect
# that will cause the V1 engine to always be selected. This is intentionally
# done for backwards-compatibility of code that was using the AsyncLLMEngine
# constructor directly, instead of using the `.from_engine_args` construction
# methods that will select the appropriate v0 or v1 engine. See:
# https://github.com/vllm-project/vllm/blob/v0.8.4/vllm/engine/llm_engine.py#L2169-L2171
# Deleting VLLM_USE_V1 here before importing vLLM allows us to continue testing
# both engines.
if "VLLM_USE_V1" in os.environ:
del os.environ["VLLM_USE_V1"]
# 🌶️🌶️🌶️ end hack

import hashlib
import os
import random

import pytest
@@ -98,8 +84,7 @@ def remote_openai_server(request):
max_num_seqs = params["max_num_seqs"]
env_dict = {
"VLLM_SPYRE_USE_CB": "1",
"VLLM_SPYRE_DYNAMO_BACKEND": backend,
"VLLM_USE_V1": "1"
"VLLM_SPYRE_DYNAMO_BACKEND": backend
}
server_args = [
"--max_num_seqs",
@@ -121,8 +106,6 @@ def remote_openai_server(request):
','.join(map(str, warmup_batch_size)),
"VLLM_SPYRE_DYNAMO_BACKEND":
backend,
"VLLM_USE_V1":
"1"
}

# Default to None if not present
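
With the import-time hack removed, fixtures only have to manage the Spyre-specific variables. A hedged sketch of that pattern follows; the `spyre_env` fixture name and the trivial test are hypothetical, not part of this PR:

```python
import os

import pytest


@pytest.fixture
def spyre_env(monkeypatch):
    # Only Spyre-specific variables are needed now; nothing touches
    # VLLM_USE_V1, so vLLM's default (V1) engine selection applies.
    monkeypatch.setenv("VLLM_SPYRE_DYNAMO_BACKEND", "eager")  # "sendnn" on hardware
    monkeypatch.setenv("VLLM_SPYRE_USE_CB", "1")


def test_spyre_env_is_set(spyre_env):
    assert os.environ["VLLM_SPYRE_DYNAMO_BACKEND"] == "eager"
    assert os.environ["VLLM_SPYRE_USE_CB"] == "1"
```
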
1 change: 0 additions & 1 deletion tests/e2e/test_spyre_async_llm.py
@@ -65,7 +65,6 @@ async def test_abort(
):
"""Test handling of cancelled requests"""
with monkeypatch.context() as m, ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
m.setenv("VLLM_SPYRE_DYNAMO_BACKEND", backend)
if cb == 1:
m.setenv("VLLM_SPYRE_USE_CB", "1")
1 change: 0 additions & 1 deletion tests/e2e/test_spyre_basic.py
@@ -205,7 +205,6 @@ def test_full_batch_scheduling(model: str, backend: str, monkeypatch):
f"{max_batched_tokens}")
monkeypatch.setenv("VLLM_SPYRE_WARMUP_NEW_TOKENS", "20")

monkeypatch.setenv("VLLM_USE_V1", "1")
monkeypatch.setenv("VLLM_SPYRE_DYNAMO_BACKEND", backend)

# Setup the engine
9 changes: 1 addition & 8 deletions tests/e2e/test_spyre_embeddings.py
@@ -18,12 +18,10 @@
[(64, 4), (64, 8), (128, 4),
(128, 8)]) # (prompt_length/batch_size)
@pytest.mark.parametrize("backend", get_spyre_backend_list())
@pytest.mark.parametrize("vllm_version", ["V0", "V1"])
def test_output(
model: str,
warmup_shape: tuple[int, int],
backend: str,
vllm_version: str,
monkeypatch,
) -> None:
'''
@@ -34,7 +32,6 @@ '''
'''

monkeypatch.setenv("VLLM_SPYRE_DYNAMO_BACKEND", backend)
monkeypatch.setenv("VLLM_USE_V1", "1" if vllm_version == "V1" else "0")
patch_warmup_shapes([warmup_shape], monkeypatch)

prompts = get_chicken_soup_prompts(1)
@@ -44,8 +41,7 @@ def test_output(
max_model_len=256,
block_size=256,
tensor_parallel_size=1,
backend=backend,
vllm_version=vllm_version)
backend=backend)

hf_results = st_embeddings(model=model, prompts=prompts)

@@ -65,12 +61,10 @@ ])
]) # (prompt_length/batch_size)
@pytest.mark.parametrize("backend", get_spyre_backend_list())
@pytest.mark.parametrize("model", get_spyre_model_list(isEmbeddings=True))
@pytest.mark.parametrize("vllm_version", ["V0", "V1"])
def test_scheduling_invariance(
model,
backend,
warmup_shape: tuple[int, int],
vllm_version,
monkeypatch,
) -> None:
'''
@@ -83,7 +77,6 @@ '''
'''

monkeypatch.setenv("VLLM_SPYRE_DYNAMO_BACKEND", backend)
monkeypatch.setenv("VLLM_USE_V1", "1" if vllm_version == "V1" else "0")
patch_warmup_shapes([warmup_shape], monkeypatch)

prompts = get_chicken_soup_prompts(4)
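
Dropping `vllm_version` halves the embedding test matrix: backend and warmup shape are the only remaining dimensions. Below is a standalone sketch of the reduced parametrization; the backend list and shapes are illustrative stand-ins for `get_spyre_backend_list()` and the repository's warmup-shape helpers:

```python
import os

import pytest

# Illustrative stand-ins for the helpers used by the real test module.
BACKENDS = ["eager"]
WARMUP_SHAPES = [(64, 4), (64, 8), (128, 4), (128, 8)]  # (prompt_length, batch_size)


@pytest.mark.parametrize("warmup_shape", WARMUP_SHAPES)
@pytest.mark.parametrize("backend", BACKENDS)
def test_output_shapes(warmup_shape, backend, monkeypatch):
    # The engine version is no longer a parameter: embedding tests always run
    # on V1, so only the Spyre backend needs to be configured.
    monkeypatch.setenv("VLLM_SPYRE_DYNAMO_BACKEND", backend)
    assert os.environ["VLLM_SPYRE_DYNAMO_BACKEND"] == backend
```
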
1 change: 0 additions & 1 deletion tests/e2e/test_spyre_prompt_logprobs.py
@@ -40,7 +40,6 @@ def test_prompt_logprobs(

prompts = get_chicken_soup_prompts(4)

monkeypatch.setenv("VLLM_USE_V1", "1")
monkeypatch.setenv("VLLM_SPYRE_DYNAMO_BACKEND", backend)
monkeypatch.setenv("VLLM_SPYRE_ENABLE_PROMPT_LOGPROBS", "1")
llm = LLM(model, tensor_parallel_size=tp_size, tokenizer=model)
1 change: 0 additions & 1 deletion tests/e2e/test_spyre_static_batching_limits.py
@@ -30,7 +30,6 @@ def test_max_prompt_len_and_new_tokens(model: str,
'''
monkeypatch.setenv("VLLM_SPYRE_DYNAMO_BACKEND", backend)
patch_warmup_shapes(warmup_shapes, monkeypatch)
monkeypatch.setenv("VLLM_USE_V1", "1")

max_prompt_length = max([t[0] for t in warmup_shapes])
max_new_tokens = max([t[1] for t in warmup_shapes])
1 change: 0 additions & 1 deletion tests/scheduling_utils.py
@@ -56,7 +56,6 @@ def check_scheduler_inference_steps(
"""

# set env vars
monkeypatch.setenv("VLLM_USE_V1", "1")
monkeypatch.setenv("VLLM_SPYRE_DYNAMO_BACKEND", backend)
if use_cb:
monkeypatch.setenv("VLLM_SPYRE_USE_CB", "1")
4 changes: 1 addition & 3 deletions tests/spyre_util.py
@@ -203,7 +203,6 @@ def generate_spyre_vllm_output(
",".join(str(val) for val in warmup_batch_size))
# --------------
monkeypatch.setenv("VLLM_SPYRE_USE_CB", "1" if use_cb else "0")
monkeypatch.setenv("VLLM_USE_V1", "1")
monkeypatch.setenv("VLLM_SPYRE_DYNAMO_BACKEND", backend)

# Allows to run multiprocess V1 engine without dumping meaningless logs at
@@ -417,8 +416,7 @@ def check_output_against_hf(model, backend, max_new_tokens, vllm_results,
# vLLM / Spyre
def spyre_vllm_embeddings(model: str, prompts: list[str], max_model_len: int,
block_size: int, tensor_parallel_size: int,
backend: str,
vllm_version: str) -> list[dict[str, Any]]:
backend: str) -> list[dict[str, Any]]:

vllm_model = LLM(model=model,
tokenizer=model,
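
Call sites of this helper now simply drop the `vllm_version` argument. A hedged sketch of the updated call, assuming the helper can be imported from `tests/spyre_util.py`; the model name and values mirror the embedding tests but are only illustrative:

```python
# Import path assumes the tests directory is on sys.path (e.g. running pytest
# from within tests/); adjust as needed.
from spyre_util import spyre_vllm_embeddings

# No vllm_version argument anymore: the helper always runs on the default (V1)
# engine.
results = spyre_vllm_embeddings(
    model="sentence-transformers/all-roberta-large-v1",
    prompts=["Hello, Spyre!"],
    max_model_len=256,
    block_size=256,
    tensor_parallel_size=1,
    backend="eager",
)
print(f"got {len(results)} embedding result(s)")
```
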
1 change: 0 additions & 1 deletion tests/v1/worker/test_spyre_input_batch.py
@@ -211,7 +211,6 @@ def same(t1: Optional[torch.Tensor], t2: Optional[torch.Tensor]) -> bool:
sampling_metadata.bad_words_token_ids


@pytest.mark.v1
@pytest.mark.worker
@pytest.mark.parametrize("batch_size", [1, 2, 32, 64])
def test_sampling_metadata_in_input_batch(batch_size: int):
Empty file removed vllm_spyre/core/__init__.py
Empty file.