2 changes: 1 addition & 1 deletion docker/Dockerfile.amd64
@@ -74,7 +74,7 @@ ENV COMPILATION_MODE=offline_decoder \
FLEX_UNLINK_DEVMEM=false \
FLEX_RDMA_MODE_FULL=1 \
TOKENIZERS_PARALLELISM=false \
TORCH_SENDNN_LOG=CRITICAL
TORCH_SENDNN_LOG=WARNING

# Required configuration file
COPY docker/.senlib.json /home/senuser
4 changes: 2 additions & 2 deletions docs/user_guide/configuration.md
@@ -11,8 +11,8 @@ To run inference on IBM Spyre Accelerators, the backend should be set as:

| Model type | vLLM backend | `VLLM_SPYRE_DYNAMO_BACKEND` configuration | Notes |
| --- | --- | --- | --- |
| Decoder | v0 | sendnn_decoder | V0 support for decoder models is deprecated |
| Decoder | v1 | sendnn_decoder | |
| Decoder | v0 | sendnn | V0 support for decoder models is deprecated |
| Decoder | v1 | sendnn | |
| Embedding | v0 | sendnn | |
Review comment (Collaborator):

I think I may have been mistaken when I wrote this and we actually need the sendnn_decoder backend for v0 embeddings. At least I think it looks like that's how our internal CI is set up to run them right now.

Would be good to double check that, though hopefully we deprecate soon anyway.

Reply (tjohnson31415, Collaborator and PR author, Jun 3, 2025):

Yeah, my understanding is that decoder models can be used for embeddings as well (typically via "pooling").

I'm not following what needs to be checked. This PR is removing use of the sendnn_decoder backend (with a fallback to sendnn), so all embedding models would use sendnn.

| Embedding | v1 | N/A | Embedding models are not yet supported on V1 |

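Taken together with the table above, selecting the backend is a one-line environment setting. A minimal, hypothetical sketch (the variable name and values come from this PR; vLLM startup itself is omitted):

import os

# Decoder and embedding models on Spyre hardware now use the same backend name.
# "sendnn_decoder" is deprecated and falls back to "sendnn" with a warning.
os.environ["VLLM_SPYRE_DYNAMO_BACKEND"] = "sendnn"

# For CPU-only debugging, "eager" or "inductor" can be used instead (see envs.py below).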
2 changes: 1 addition & 1 deletion examples/offline_inference_spyre.ipynb
@@ -113,7 +113,7 @@
"0 / 1 : FLEX_RDMA_WORLD_RANK=0\n",
"0 / 1 : FLEX_RDMA_WORLD_SIZE=1\n",
"0 / 1 : Spyre: Enabled (0) (offset=0)\n",
"0 / 1 : Dynamo Backend : sendnn_decoder\n",
"0 / 1 : Dynamo Backend : sendnn\n",
"0 / 1 : CPU Cores : 56 x 2 HW threads\n",
"------------------------------------------------------------\n",
"NOTICE: Adjusting torch._dynamo.config.accumulated_cache_size_limit from 64 to 160 to accommodate prompt size of 64 and decode tokens of 5\n",
53 changes: 53 additions & 0 deletions tests/e2e/test_spyre_basic.py
@@ -87,6 +87,59 @@ def test_output(
hf_results=hf_results)


@pytest.mark.parametrize("model", get_spyre_model_list())
@pytest.mark.parametrize("prompts", [[
template.format("Provide a list of instructions "
"for preparing chicken soup."),
]])
@pytest.mark.parametrize(
"warmup_shape", [(64, 20, 4)]) # (prompt_length/new_tokens/batch_size)
@pytest.mark.parametrize("backend", ["sendnn_decoder"])
@pytest.mark.parametrize("vllm_version", VLLM_VERSIONS)
def test_output_sendnn_decoder(
model: str,
prompts: list[str],
warmup_shape: tuple[int, int, int],
backend: str,
vllm_version: str,
) -> None:
'''
    Tests the deprecated sendnn_decoder backend, which should fall back to
    sendnn
'''

max_new_tokens = warmup_shape[1]

vllm_sampling_params = SamplingParams(
max_tokens=max_new_tokens,
temperature=0,
logprobs=0, # return logprobs of generated tokens only
ignore_eos=True)

vllm_results = generate_spyre_vllm_output(
model=model,
prompts=prompts,
warmup_shapes=[warmup_shape],
max_model_len=2048,
block_size=2048,
sampling_params=vllm_sampling_params,
tensor_parallel_size=1,
backend=backend,
vllm_version=vllm_version)

hf_results = generate_hf_output(model=model,
prompts=prompts,
max_new_tokens=max_new_tokens)

compare_results(model=model,
prompts=prompts,
warmup_shapes=[warmup_shape],
tensor_parallel_size=1,
backend=backend,
vllm_results=vllm_results,
hf_results=hf_results)


@pytest.mark.parametrize("model", get_spyre_model_list())
@pytest.mark.parametrize("backend", get_spyre_backend_list())
@pytest.mark.parametrize("vllm_version", VLLM_VERSIONS)
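As a usage note, the new fallback test can be run on its own; a hypothetical invocation (test path and name are taken from the diff above, and real Spyre hardware plus the usual warmup environment variables are assumed to be available):

import pytest

# Select only the deprecated-backend fallback test added in this file.
pytest.main(["tests/e2e/test_spyre_basic.py", "-k", "test_output_sendnn_decoder"])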
2 changes: 1 addition & 1 deletion tests/e2e/test_spyre_embeddings.py
@@ -18,7 +18,7 @@
]])
@pytest.mark.parametrize("warmup_shape",
[(64, 4), (64, 8), (128, 4),
(128, 8)]) # (prompt_length/new_tokens/batch_size)
(128, 8)]) # (prompt_length/batch_size)
@pytest.mark.parametrize("backend", get_spyre_backend_list())
@pytest.mark.parametrize(
"vllm_version",
4 changes: 2 additions & 2 deletions tests/e2e/test_spyre_online.py
@@ -57,13 +57,13 @@ def test_openai_serving(remote_openai_server, model, warmup_shape, backend,


@pytest.mark.parametrize("model", get_spyre_model_list(quantization="gptq"))
@pytest.mark.parametrize("backend", ["sendnn_decoder"])
@pytest.mark.parametrize("backend", ["sendnn"])
@pytest.mark.parametrize("quantization", ["gptq"])
@pytest.mark.parametrize("warmup_shape", [[(64, 20, 4)]])
@pytest.mark.parametrize("vllm_version", VLLM_VERSIONS)
def test_openai_serving_gptq(remote_openai_server, model, backend,
warmup_shape, vllm_version, quantization):
"""Test online serving a GPTQ model with the sendnn_decoder backend only"""
"""Test online serving a GPTQ model with the sendnn backend only"""

client = remote_openai_server.get_client()
completion = client.completions.create(model=model,
10 changes: 5 additions & 5 deletions tests/spyre_util.py
@@ -324,7 +324,7 @@ def compare_results(model: str, prompts: list[str],
print(f" vLLM: {repr(vllm_result['text']):s}{err_msg}")
print()

assert DISABLE_ASSERTS or backend == 'sendnn_decoder' or\
assert DISABLE_ASSERTS or backend == 'sendnn' or\
hf_result['token_ids'] == vllm_result['token_ids']

if len(hf_result['tokens']) > 0:
@@ -351,13 +351,13 @@
f"{vllm_logprob:14f} ",
end='')

if backend == 'sendnn_decoder':
if backend == 'sendnn':
rel_tol = ISCLOSE_REL_TOL_SPYRE
else:
rel_tol = ISCLOSE_REL_TOL_CPU

if hf_token_id != vllm_token_id: # different tokens
if backend == 'sendnn_decoder' and math.isclose(
if backend == 'sendnn' and math.isclose(
hf_logprob, vllm_logprob, rel_tol=rel_tol):
# probably still OK
print('DIVERGING')
@@ -477,15 +477,15 @@ def get_spyre_model_dir_path() -> Path:
# get model backends from env or default to all and add pytest markers
def get_spyre_backend_list():
user_backend_list = os.environ.get("VLLM_SPYRE_TEST_BACKEND_LIST",
"eager,inductor,sendnn_decoder,sendnn")
"eager,inductor,sendnn")

backends = []
for backend in user_backend_list.split(","):
backend = backend.strip()
marks = []
if backend == "eager":
marks = [pytest.mark.cpu]
elif backend == "sendnn_decoder":
elif backend == "sendnn":
marks = [pytest.mark.spyre]

backends.append(pytest.param(backend, marks=marks, id=backend))
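The default backend list above drops sendnn_decoder, so test selection is now driven purely by the three remaining names. A small sketch of steering the helper from the environment (the import path is an assumption; in this repo the helper lives in tests/spyre_util.py):

import os

# Restrict parametrized tests to the CPU "eager" backend plus Spyre hardware.
os.environ["VLLM_SPYRE_TEST_BACKEND_LIST"] = "eager,sendnn"

from spyre_util import get_spyre_backend_list  # assumed import path within tests/

for param in get_spyre_backend_list():
    # Each entry is a pytest.param holding the backend name and its marker
    # (pytest.mark.cpu for "eager", pytest.mark.spyre for "sendnn").
    print(param.values[0], [mark.name for mark in param.marks])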
5 changes: 2 additions & 3 deletions tests/utils/test_spyre_backend_list.py
@@ -8,12 +8,11 @@ def test_get_spyre_backend_list(monkeypatch):
Ensure we return the backend list correctly
'''
with monkeypatch.context() as m:
m.setenv("VLLM_SPYRE_TEST_BACKEND_LIST",
"eager,inductor,sendnn_decoder")
m.setenv("VLLM_SPYRE_TEST_BACKEND_LIST", "eager,inductor,sendnn")
backend_list = get_spyre_backend_list()
assert backend_list[0].values[0] == "eager"
assert backend_list[1].values[0] == "inductor"
assert backend_list[2].values[0] == "sendnn_decoder"
assert backend_list[2].values[0] == "sendnn"

with monkeypatch.context() as m:
m.setenv("VLLM_SPYRE_TEST_BACKEND_LIST", "sendnn")
26 changes: 20 additions & 6 deletions vllm_spyre/envs.py
@@ -1,8 +1,10 @@
import os
from typing import TYPE_CHECKING, Any, Callable, Optional

from vllm.logger import init_logger

if TYPE_CHECKING:
VLLM_SPYRE_DYNAMO_BACKEND: str = "sendnn_decoder"
VLLM_SPYRE_DYNAMO_BACKEND: str = "sendnn"
VLLM_SPYRE_WARMUP_PROMPT_LENS: Optional[list[int]] = None
VLLM_SPYRE_WARMUP_NEW_TOKENS: Optional[list[int]] = None
VLLM_SPYRE_WARMUP_BATCH_SIZES: Optional[list[int]] = None
@@ -12,6 +14,19 @@
VLLM_SPYRE_PERF_METRIC_LOGGING_DIR: str = "/tmp"
VLLM_SPYRE_OVERRIDE_SIGNALS_HANDLER: bool = False

logger = init_logger(__name__)


def _backend_backwards_compat() -> str:
val = os.getenv("VLLM_SPYRE_DYNAMO_BACKEND", "sendnn")
if val == "sendnn_decoder":
logger.warning_once(
"Using 'sendnn_decoder' for "
"VLLM_SPYRE_DYNAMO_BACKEND is deprecated. Use 'sendnn' instead")
val = 'sendnn'
return val


# --8<-- [start:env-vars-definition]
environment_variables: dict[str, Callable[[], Any]] = {
# Defines the prompt lengths the Spyre accelerator should be prepared
@@ -41,14 +56,13 @@

# Defines the backend that torch.compile will use when using Spyre
# Available options:
# - "sendnn_decoder": Compile for execution on Spyre hardware for
# decoder models
# - "sendnn": Compile for execution on Spyre hardware for
# encoder models
# - "sendnn": Compile for execution on Spyre hardware
# - "inductor": Compile for execution on CPU (for debug and testing)
# - "eager": Skip compile entirely (for debug and testing)
#
# - "sendnn_decoder": Deprecated in favor of "sendnn"
"VLLM_SPYRE_DYNAMO_BACKEND":
lambda: os.getenv("VLLM_SPYRE_DYNAMO_BACKEND", "sendnn_decoder"),
_backend_backwards_compat,

# If set, use the V1 continuous batching implementation. Otherwise, static
# batching mode will be enabled.
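The shim above keeps existing deployments working: the deprecated value is accepted, warned about once, and rewritten. A minimal sketch of the expected behaviour, assuming the envs module resolves these attributes lazily through the environment_variables table (as the typed TYPE_CHECKING block suggests):

import os

# An existing deployment still exporting the old name...
os.environ["VLLM_SPYRE_DYNAMO_BACKEND"] = "sendnn_decoder"

import vllm_spyre.envs as envs_spyre

# ...now resolves to "sendnn" and logs the deprecation warning once.
assert envs_spyre.VLLM_SPYRE_DYNAMO_BACKEND == "sendnn"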
10 changes: 5 additions & 5 deletions vllm_spyre/model_executor/model_loader/spyre.py
@@ -29,7 +29,7 @@
print("WARNING: Disabled: dynamo_tracer")
pass

BACKEND_LIST = ['sendnn_decoder', 'inductor']
BACKEND_LIST = ['sendnn', 'inductor']

logger = init_logger(__name__)

@@ -88,7 +88,7 @@ def forward(
self.model.past_key_value_states = None # type: ignore

extra_kwargs: dict[str, Any] = {}
if envs_spyre.VLLM_SPYRE_DYNAMO_BACKEND != "sendnn_decoder":
if envs_spyre.VLLM_SPYRE_DYNAMO_BACKEND != "sendnn":
# Bug in 2.3.1 fixed in 2.4.1 for SDPA flash
# cpu impl when padding too much
extra_kwargs["attn_algorithm"] = "math"
@@ -153,7 +153,7 @@ def __init__(

self.config: PretrainedConfig = model_config.hf_config
self.dtype = torch.float16 if envs_spyre.VLLM_SPYRE_DYNAMO_BACKEND == \
'sendnn_decoder' else torch.float32
'sendnn' else torch.float32

# Actual FMS model
self.model: nn.Module
@@ -177,7 +177,7 @@ def load_weights(
) -> None:

if model_config.quantization == "gptq":
if envs_spyre.VLLM_SPYRE_DYNAMO_BACKEND == "sendnn_decoder":
if envs_spyre.VLLM_SPYRE_DYNAMO_BACKEND == "sendnn":
from fms_mo.aiu_addons.gptq import ( # noqa: F401
gptq_aiu_adapter, gptq_aiu_linear)
linear_type = "gptq_aiu"
@@ -215,7 +215,7 @@ def load_weights(
revision=model_config.revision)

# we can use fused weights unless running on Spyre
fused_weights = envs_spyre.VLLM_SPYRE_DYNAMO_BACKEND != "sendnn_decoder"
fused_weights = envs_spyre.VLLM_SPYRE_DYNAMO_BACKEND != "sendnn"

self.model = get_model(architecture="hf_configured",
variant=model_config.model,
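To summarize the backend-dependent switches touched in this file, a condensed, illustrative sketch (the names mirror the code above; this is not the module's actual structure):

import torch

backend = "sendnn"  # value of VLLM_SPYRE_DYNAMO_BACKEND after the fallback shim

# Spyre hardware runs the FMS model in fp16; CPU backends stay in fp32.
dtype = torch.float16 if backend == "sendnn" else torch.float32

# Fused weights can only be used when not running on Spyre.
fused_weights = backend != "sendnn"

# Off-Spyre, force the "math" SDPA kernel to avoid the padding bug in torch 2.3.1.
attn_algorithm = None if backend == "sendnn" else "math"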