2 changes: 1 addition & 1 deletion docker/Dockerfile.amd64
@@ -74,7 +74,7 @@ ENV COMPILATION_MODE=offline_decoder \
FLEX_UNLINK_DEVMEM=false \
FLEX_RDMA_MODE_FULL=1 \
TOKENIZERS_PARALLELISM=false \
TORCH_SENDNN_LOG=CRITICAL
TORCH_SENDNN_LOG=WARNING

# Required configuration file
COPY docker/.senlib.json /home/senuser
4 changes: 2 additions & 2 deletions docs/user_guide/configuration.md
@@ -11,8 +11,8 @@ To run inference on IBM Spyre Accelerators, the backend should be set as:

| Model type | vLLM backend | `VLLM_SPYRE_DYNAMO_BACKEND` configuration | Notes |
| --- | --- | --- | --- |
| Decoder | v0 | sendnn_decoder | V0 support for decoder models is deprecated |
| Decoder | v1 | sendnn_decoder | |
| Decoder | v0 | sendnn | V0 support for decoder models is deprecated |
| Decoder | v1 | sendnn | |
| Embedding | v0 | sendnn | |
Review comment (Collaborator):

I think I may have been mistaken when I wrote this and we actually need the sendnn_decoder backend for v0 embeddings. At least I think it looks like that's how our internal CI is set up to run them right now.

Would be good to double check that, though hopefully we deprecate soon anyway.

Reply (tjohnson31415, Collaborator and PR author, Jun 3, 2025):

Yeah, my understanding is that decoder models can be used for embeddings as well (typically via "pooling").

I'm not following what needs to be checked. This PR is removing use of the sendnn_decoder backend (with a fallback to sendnn), so all embedding models would use sendnn.

| Embedding | v1 | N/A | Embedding models are not yet supported on V1 |

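Taken together with the table above, selecting the backend is a one-line environment setting. A minimal, hypothetical sketch (the variable name and values come from this PR; vLLM startup itself is omitted):

import os

# Decoder and embedding models on Spyre hardware now use the same backend name.
# "sendnn_decoder" is deprecated and falls back to "sendnn" with a warning.
os.environ["VLLM_SPYRE_DYNAMO_BACKEND"] = "sendnn"

# For CPU-only debugging, "eager" or "inductor" can be used instead (see envs.py below).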
2 changes: 1 addition & 1 deletion examples/offline_inference_spyre.ipynb
@@ -113,7 +113,7 @@
"0 / 1 : FLEX_RDMA_WORLD_RANK=0\n",
"0 / 1 : FLEX_RDMA_WORLD_SIZE=1\n",
"0 / 1 : Spyre: Enabled (0) (offset=0)\n",
"0 / 1 : Dynamo Backend : sendnn_decoder\n",
"0 / 1 : Dynamo Backend : sendnn\n",
"0 / 1 : CPU Cores : 56 x 2 HW threads\n",
"------------------------------------------------------------\n",
"NOTICE: Adjusting torch._dynamo.config.accumulated_cache_size_limit from 64 to 160 to accommodate prompt size of 64 and decode tokens of 5\n",
53 changes: 53 additions & 0 deletions tests/e2e/test_spyre_basic.py
@@ -87,6 +87,59 @@ def test_output(
hf_results=hf_results)


@pytest.mark.parametrize("model", get_spyre_model_list())
@pytest.mark.parametrize("prompts", [[
template.format("Provide a list of instructions "
"for preparing chicken soup."),
]])
@pytest.mark.parametrize(
"warmup_shape", [(64, 20, 4)]) # (prompt_length/new_tokens/batch_size)
@pytest.mark.parametrize("backend", ["sendnn_decoder"])
@pytest.mark.parametrize("vllm_version", VLLM_VERSIONS)
def test_output_sendnn_decoder(
model: str,
prompts: list[str],
warmup_shape: tuple[int, int, int],
backend: str,
vllm_version: str,
) -> None:
'''
    Tests the deprecated sendnn_decoder backend, which should fall back to
    sendnn
'''

max_new_tokens = warmup_shape[1]

vllm_sampling_params = SamplingParams(
max_tokens=max_new_tokens,
temperature=0,
logprobs=0, # return logprobs of generated tokens only
ignore_eos=True)

vllm_results = generate_spyre_vllm_output(
model=model,
prompts=prompts,
warmup_shapes=[warmup_shape],
max_model_len=2048,
block_size=2048,
sampling_params=vllm_sampling_params,
tensor_parallel_size=1,
backend=backend,
vllm_version=vllm_version)

hf_results = generate_hf_output(model=model,
prompts=prompts,
max_new_tokens=max_new_tokens)

compare_results(model=model,
prompts=prompts,
warmup_shapes=[warmup_shape],
tensor_parallel_size=1,
backend=backend,
vllm_results=vllm_results,
hf_results=hf_results)


@pytest.mark.parametrize("model", get_spyre_model_list())
@pytest.mark.parametrize("backend", get_spyre_backend_list())
@pytest.mark.parametrize("vllm_version", VLLM_VERSIONS)
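As a usage note, the new fallback test can be run on its own; a hypothetical invocation (test path and name are taken from the diff above, and real Spyre hardware plus the usual warmup environment variables are assumed to be available):

import pytest

# Select only the deprecated-backend fallback test added in this file.
pytest.main(["tests/e2e/test_spyre_basic.py", "-k", "test_output_sendnn_decoder"])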
2 changes: 1 addition & 1 deletion tests/e2e/test_spyre_embeddings.py
@@ -18,7 +18,7 @@
]])
@pytest.mark.parametrize("warmup_shape",
[(64, 4), (64, 8), (128, 4),
(128, 8)]) # (prompt_length/new_tokens/batch_size)
(128, 8)]) # (prompt_length/batch_size)
@pytest.mark.parametrize("backend", get_spyre_backend_list())
@pytest.mark.parametrize(
"vllm_version",
4 changes: 2 additions & 2 deletions tests/e2e/test_spyre_online.py
@@ -57,13 +57,13 @@ def test_openai_serving(remote_openai_server, model, warmup_shape, backend,


@pytest.mark.parametrize("model", get_spyre_model_list(quantization="gptq"))
@pytest.mark.parametrize("backend", ["sendnn_decoder"])
@pytest.mark.parametrize("backend", ["sendnn"])
@pytest.mark.parametrize("quantization", ["gptq"])
@pytest.mark.parametrize("warmup_shape", [[(64, 20, 4)]])
@pytest.mark.parametrize("vllm_version", VLLM_VERSIONS)
def test_openai_serving_gptq(remote_openai_server, model, backend,
warmup_shape, vllm_version, quantization):
"""Test online serving a GPTQ model with the sendnn_decoder backend only"""
"""Test online serving a GPTQ model with the sendnn backend only"""

client = remote_openai_server.get_client()
completion = client.completions.create(model=model,
10 changes: 5 additions & 5 deletions tests/spyre_util.py
@@ -324,7 +324,7 @@ def compare_results(model: str, prompts: list[str],
print(f" vLLM: {repr(vllm_result['text']):s}{err_msg}")
print()

assert DISABLE_ASSERTS or backend == 'sendnn_decoder' or\
assert DISABLE_ASSERTS or backend == 'sendnn' or\
hf_result['token_ids'] == vllm_result['token_ids']

if len(hf_result['tokens']) > 0:
@@ -351,13 +351,13 @@
f"{vllm_logprob:14f} ",
end='')

if backend == 'sendnn_decoder':
if backend == 'sendnn':
rel_tol = ISCLOSE_REL_TOL_SPYRE
else:
rel_tol = ISCLOSE_REL_TOL_CPU

if hf_token_id != vllm_token_id: # different tokens
if backend == 'sendnn_decoder' and math.isclose(
if backend == 'sendnn' and math.isclose(
hf_logprob, vllm_logprob, rel_tol=rel_tol):
# probably still OK
print('DIVERGING')
@@ -477,15 +477,15 @@ def get_spyre_model_dir_path() -> Path:
# get model backends from env or default to all and add pytest markers
def get_spyre_backend_list():
user_backend_list = os.environ.get("VLLM_SPYRE_TEST_BACKEND_LIST",
"eager,inductor,sendnn_decoder,sendnn")
"eager,inductor,sendnn")

backends = []
for backend in user_backend_list.split(","):
backend = backend.strip()
marks = []
if backend == "eager":
marks = [pytest.mark.cpu]
elif backend == "sendnn_decoder":
elif backend == "sendnn":
marks = [pytest.mark.spyre]

backends.append(pytest.param(backend, marks=marks, id=backend))
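The default backend list above drops sendnn_decoder, so test selection is now driven purely by the three remaining names. A small sketch of steering the helper from the environment (the import path is an assumption; in this repo the helper lives in tests/spyre_util.py):

import os

# Restrict parametrized tests to the CPU "eager" backend plus Spyre hardware.
os.environ["VLLM_SPYRE_TEST_BACKEND_LIST"] = "eager,sendnn"

from spyre_util import get_spyre_backend_list  # assumed import path within tests/

for param in get_spyre_backend_list():
    # Each entry is a pytest.param holding the backend name and its marker
    # (pytest.mark.cpu for "eager", pytest.mark.spyre for "sendnn").
    print(param.values[0], [mark.name for mark in param.marks])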
5 changes: 2 additions & 3 deletions tests/utils/test_spyre_backend_list.py
@@ -8,12 +8,11 @@ def test_get_spyre_backend_list(monkeypatch):
Ensure we return the backend list correctly
'''
with monkeypatch.context() as m:
m.setenv("VLLM_SPYRE_TEST_BACKEND_LIST",
"eager,inductor,sendnn_decoder")
m.setenv("VLLM_SPYRE_TEST_BACKEND_LIST", "eager,inductor,sendnn")
backend_list = get_spyre_backend_list()
assert backend_list[0].values[0] == "eager"
assert backend_list[1].values[0] == "inductor"
assert backend_list[2].values[0] == "sendnn_decoder"
assert backend_list[2].values[0] == "sendnn"

with monkeypatch.context() as m:
m.setenv("VLLM_SPYRE_TEST_BACKEND_LIST", "sendnn")
26 changes: 20 additions & 6 deletions vllm_spyre/envs.py
@@ -1,8 +1,10 @@
import os
from typing import TYPE_CHECKING, Any, Callable, Optional

from vllm.logger import init_logger

if TYPE_CHECKING:
VLLM_SPYRE_DYNAMO_BACKEND: str = "sendnn_decoder"
VLLM_SPYRE_DYNAMO_BACKEND: str = "sendnn"
VLLM_SPYRE_WARMUP_PROMPT_LENS: Optional[list[int]] = None
VLLM_SPYRE_WARMUP_NEW_TOKENS: Optional[list[int]] = None
VLLM_SPYRE_WARMUP_BATCH_SIZES: Optional[list[int]] = None
@@ -12,6 +14,19 @@
VLLM_SPYRE_PERF_METRIC_LOGGING_DIR: str = "/tmp"
VLLM_SPYRE_OVERRIDE_SIGNALS_HANDLER: bool = False

logger = init_logger(__name__)


def _backend_backwards_compat() -> str:
val = os.getenv("VLLM_SPYRE_DYNAMO_BACKEND", "sendnn")
if val == "sendnn_decoder":
logger.warning_once(
"Using 'sendnn_decoder' for "
"VLLM_SPYRE_DYNAMO_BACKEND is deprecated. Use 'sendnn' instead")
val = 'sendnn'
return val


# --8<-- [start:env-vars-definition]
environment_variables: dict[str, Callable[[], Any]] = {
# Defines the prompt lengths the Spyre accelerator should be prepared
@@ -41,14 +56,13 @@

# Defines the backend that torch.compile will use when using Spyre
# Available options:
# - "sendnn_decoder": Compile for execution on Spyre hardware for
# decoder models
# - "sendnn": Compile for execution on Spyre hardware for
# encoder models
# - "sendnn": Compile for execution on Spyre hardware
# - "inductor": Compile for execution on CPU (for debug and testing)
# - "eager": Skip compile entirely (for debug and testing)
#
# - "sendnn_decoder": Deprecated in favor of "sendnn"
"VLLM_SPYRE_DYNAMO_BACKEND":
lambda: os.getenv("VLLM_SPYRE_DYNAMO_BACKEND", "sendnn_decoder"),
_backend_backwards_compat,

# If set, use the V1 continuous batching implementation. Otherwise, static
# batching mode will be enabled.
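The shim above keeps existing deployments working: the deprecated value is accepted, warned about once, and rewritten. A minimal sketch of the expected behaviour, assuming the envs module resolves these attributes lazily through the environment_variables table (as the typed TYPE_CHECKING block suggests):

import os

# An existing deployment still exporting the old name...
os.environ["VLLM_SPYRE_DYNAMO_BACKEND"] = "sendnn_decoder"

import vllm_spyre.envs as envs_spyre

# ...now resolves to "sendnn" and logs the deprecation warning once.
assert envs_spyre.VLLM_SPYRE_DYNAMO_BACKEND == "sendnn"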
10 changes: 5 additions & 5 deletions vllm_spyre/model_executor/model_loader/spyre.py
@@ -29,7 +29,7 @@
print("WARNING: Disabled: dynamo_tracer")
pass

BACKEND_LIST = ['sendnn_decoder', 'inductor']
BACKEND_LIST = ['sendnn', 'inductor']

logger = init_logger(__name__)

@@ -88,7 +88,7 @@ def forward(
self.model.past_key_value_states = None # type: ignore

extra_kwargs: dict[str, Any] = {}
if envs_spyre.VLLM_SPYRE_DYNAMO_BACKEND != "sendnn_decoder":
if envs_spyre.VLLM_SPYRE_DYNAMO_BACKEND != "sendnn":
# Bug in 2.3.1 fixed in 2.4.1 for SDPA flash
# cpu impl when padding too much
extra_kwargs["attn_algorithm"] = "math"
@@ -153,7 +153,7 @@ def __init__(

self.config: PretrainedConfig = model_config.hf_config
self.dtype = torch.float16 if envs_spyre.VLLM_SPYRE_DYNAMO_BACKEND == \
'sendnn_decoder' else torch.float32
'sendnn' else torch.float32

# Actual FMS model
self.model: nn.Module
@@ -177,7 +177,7 @@ def load_weights(
) -> None:

if model_config.quantization == "gptq":
if envs_spyre.VLLM_SPYRE_DYNAMO_BACKEND == "sendnn_decoder":
if envs_spyre.VLLM_SPYRE_DYNAMO_BACKEND == "sendnn":
from fms_mo.aiu_addons.gptq import ( # noqa: F401
gptq_aiu_adapter, gptq_aiu_linear)
linear_type = "gptq_aiu"
@@ -215,7 +215,7 @@ def load_weights(
revision=model_config.revision)

# we can use fused weights unless running on Spyre
fused_weights = envs_spyre.VLLM_SPYRE_DYNAMO_BACKEND != "sendnn_decoder"
fused_weights = envs_spyre.VLLM_SPYRE_DYNAMO_BACKEND != "sendnn"

self.model = get_model(architecture="hf_configured",
variant=model_config.model,
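To summarize the backend-dependent switches touched in this file, a condensed, illustrative sketch (the names mirror the code above; this is not the module's actual structure):

import torch

backend = "sendnn"  # value of VLLM_SPYRE_DYNAMO_BACKEND after the fallback shim

# Spyre hardware runs the FMS model in fp16; CPU backends stay in fp32.
dtype = torch.float16 if backend == "sendnn" else torch.float32

# Fused weights can only be used when not running on Spyre.
fused_weights = backend != "sendnn"

# Off-Spyre, force the "math" SDPA kernel to avoid the padding bug in torch 2.3.1.
attn_algorithm = None if backend == "sendnn" else "math"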