4 changes: 2 additions & 2 deletions docs/user_guide/configuration.md
@@ -13,8 +13,8 @@ To run inference on IBM Spyre Accelerators, the backend should be set as:
| --- | --- | --- | --- |
| Decoder | v0 | sendnn | V0 support for decoder models is deprecated |
| Decoder | v1 | sendnn | |
- | Embedding | v0 | sendnn | |
- | Embedding | v1 | N/A | Embedding models are not yet supported on V1 |
+ | Embedding | v0 | sendnn | V0 support for embedding models is deprecated|
+ | Embedding | v1 | sendnn | |

## Batching Modes

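
To make the updated table concrete, here is a minimal offline sketch of running an embedding model on the now-default V1 engine, with the backend selected through `VLLM_SPYRE_DYNAMO_BACKEND`. The model name is only an illustration, and `eager` stands in for `sendnn` so the sketch can run without Spyre hardware:

```python
import os

# Backend selection per the table above. "eager" is a stand-in so this sketch
# runs without Spyre hardware; on IBM Spyre it would be "sendnn".
os.environ.setdefault("VLLM_SPYRE_DYNAMO_BACKEND", "eager")

from vllm import LLM  # noqa: E402  (environment is configured first)

# Illustrative embedding model; VLLM_USE_V1 is not set because V1 is the default.
llm = LLM(model="sentence-transformers/all-roberta-large-v1",
          max_model_len=256,
          tensor_parallel_size=1)

# For embedding models, encode() returns one pooled output per prompt.
outputs = llm.encode(["Hello from the Spyre backend guide."])
print(f"got {len(outputs)} pooled output(s)")
```
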
2 changes: 1 addition & 1 deletion docs/user_guide/supported_features.md
@@ -10,7 +10,7 @@ This table summarize the status of features on Spyre. By default, those features
| Prompt Adapter | ⛔ | Being deprecated in vLLM [vllm#13981](https://github.com/vllm-project/vllm/issues/13981) |
| Speculative Decoding | 🗓️ | |
| Guided Decoding | 🗓️ | |
- | Pooling | ⚠️ | Works with V0. V1 still being developed in vLLM [vllm#18052](https://github.com/vllm-project/vllm/issues/18052) |
+ | Pooling | | |
Review thread on this row:

Collaborator:
We already have Embedding models at the end of this table - is that still needed?

Collaborator (Author):
question for @maxdebayser

Collaborator:
Since we don't support all pooling applications, I think it's better to remove this and leave just Embedding below.

| Enc-dec | ⛔ | No plans for now |
| Multi Modality | 🗓️ | |
| LogProbs | ✅ | |
1 change: 0 additions & 1 deletion examples/offline_inference/cb_spyre_inference.py
@@ -42,7 +42,6 @@
if "VLLM_SPYRE_DYNAMO_BACKEND" not in os.environ:
os.environ['VLLM_SPYRE_DYNAMO_BACKEND'] = 'eager'
os.environ['VLLM_SPYRE_USE_CB'] = '1'
os.environ['VLLM_USE_V1'] = '1'

template = (
"Below is an instruction that describes a task. Write a response that "
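
With `VLLM_USE_V1` dropped, the example only needs the Spyre-specific variables before constructing the engine. A condensed sketch of the resulting flow is below; the model name, prompt, and sampling settings are placeholders rather than the example's actual values:

```python
import os

from vllm import LLM, SamplingParams

# Spyre-specific settings only; the V1 engine is selected by default.
if "VLLM_SPYRE_DYNAMO_BACKEND" not in os.environ:
    os.environ["VLLM_SPYRE_DYNAMO_BACKEND"] = "eager"
os.environ["VLLM_SPYRE_USE_CB"] = "1"  # continuous batching

# Placeholder model and prompt, used purely for illustration.
llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", max_model_len=2048)
params = SamplingParams(max_tokens=20, temperature=0.0)

prompt = ("Below is an instruction that describes a task. Write a response "
          "that appropriately completes the request.\n\n"
          "### Instruction:\nProvide a recipe for chicken soup.\n\n"
          "### Response:")
for out in llm.generate([prompt], params):
    print(out.outputs[0].text)
```
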
1 change: 0 additions & 1 deletion examples/offline_inference/long_context.py
@@ -68,7 +68,6 @@
if "VLLM_SPYRE_DYNAMO_BACKEND" not in os.environ:
os.environ['VLLM_SPYRE_DYNAMO_BACKEND'] = 'eager'
os.environ['VLLM_SPYRE_USE_CB'] = '1'
os.environ['VLLM_USE_V1'] = '1'

template = ("Summarize the following code: \n\n{}")

21 changes: 2 additions & 19 deletions tests/conftest.py
@@ -1,19 +1,5 @@
# 🌶️🌶️🌶️ Hack to allow testing of both engines
import os

# If `VLLM_USE_V1=1` is set upon first vLLM import, then there is a side effect
# that will cause the V1 engine to always be selected. This is intentionally
# done for backwards-compatibility of code that was using the AsyncLLMEngine
# constructor directly, instead of using the `.from_engine_args` construction
# methods that will select the appropriate v0 or v1 engine. See:
# https://github.com/vllm-project/vllm/blob/v0.8.4/vllm/engine/llm_engine.py#L2169-L2171
# Deleting VLLM_USE_V1 here before importing vLLM allows us to continue testing
# both engines.
if "VLLM_USE_V1" in os.environ:
del os.environ["VLLM_USE_V1"]
# 🌶️🌶️🌶️ end hack

import hashlib
import os
import random

import pytest
@@ -98,8 +84,7 @@ def remote_openai_server(request):
max_num_seqs = params["max_num_seqs"]
env_dict = {
"VLLM_SPYRE_USE_CB": "1",
"VLLM_SPYRE_DYNAMO_BACKEND": backend,
"VLLM_USE_V1": "1"
"VLLM_SPYRE_DYNAMO_BACKEND": backend
}
server_args = [
"--max_num_seqs",
@@ -121,8 +106,6 @@ def remote_openai_server(request):
','.join(map(str, warmup_batch_size)),
"VLLM_SPYRE_DYNAMO_BACKEND":
backend,
"VLLM_USE_V1":
"1"
}

# Default to None if not present
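
With the import-time hack removed, fixtures only have to manage the Spyre-specific variables. A hedged sketch of that pattern follows; the `spyre_env` fixture name and the trivial test are hypothetical, not part of this PR:

```python
import os

import pytest


@pytest.fixture
def spyre_env(monkeypatch):
    # Only Spyre-specific variables are needed now; nothing touches
    # VLLM_USE_V1, so vLLM's default (V1) engine selection applies.
    monkeypatch.setenv("VLLM_SPYRE_DYNAMO_BACKEND", "eager")  # "sendnn" on hardware
    monkeypatch.setenv("VLLM_SPYRE_USE_CB", "1")


def test_spyre_env_is_set(spyre_env):
    assert os.environ["VLLM_SPYRE_DYNAMO_BACKEND"] == "eager"
    assert os.environ["VLLM_SPYRE_USE_CB"] == "1"
```
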
1 change: 0 additions & 1 deletion tests/e2e/test_spyre_async_llm.py
@@ -65,7 +65,6 @@ async def test_abort(
):
"""Test handling of cancelled requests"""
with monkeypatch.context() as m, ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
m.setenv("VLLM_SPYRE_DYNAMO_BACKEND", backend)
if cb == 1:
m.setenv("VLLM_SPYRE_USE_CB", "1")
1 change: 0 additions & 1 deletion tests/e2e/test_spyre_basic.py
@@ -205,7 +205,6 @@ def test_full_batch_scheduling(model: str, backend: str, monkeypatch):
f"{max_batched_tokens}")
monkeypatch.setenv("VLLM_SPYRE_WARMUP_NEW_TOKENS", "20")

monkeypatch.setenv("VLLM_USE_V1", "1")
monkeypatch.setenv("VLLM_SPYRE_DYNAMO_BACKEND", backend)

# Setup the engine
9 changes: 1 addition & 8 deletions tests/e2e/test_spyre_embeddings.py
@@ -18,12 +18,10 @@
[(64, 4), (64, 8), (128, 4),
(128, 8)]) # (prompt_length/batch_size)
@pytest.mark.parametrize("backend", get_spyre_backend_list())
@pytest.mark.parametrize("vllm_version", ["V0", "V1"])
def test_output(
model: str,
warmup_shape: tuple[int, int],
backend: str,
vllm_version: str,
monkeypatch,
) -> None:
'''
@@ -34,7 +32,6 @@ '''
'''

monkeypatch.setenv("VLLM_SPYRE_DYNAMO_BACKEND", backend)
monkeypatch.setenv("VLLM_USE_V1", "1" if vllm_version == "V1" else "0")
patch_warmup_shapes([warmup_shape], monkeypatch)

prompts = get_chicken_soup_prompts(1)
@@ -44,8 +41,7 @@ def test_output(
max_model_len=256,
block_size=256,
tensor_parallel_size=1,
backend=backend,
vllm_version=vllm_version)
backend=backend)

hf_results = st_embeddings(model=model, prompts=prompts)

@@ -65,12 +61,10 @@ ])
]) # (prompt_length/batch_size)
@pytest.mark.parametrize("backend", get_spyre_backend_list())
@pytest.mark.parametrize("model", get_spyre_model_list(isEmbeddings=True))
@pytest.mark.parametrize("vllm_version", ["V0", "V1"])
def test_scheduling_invariance(
model,
backend,
warmup_shape: tuple[int, int],
vllm_version,
monkeypatch,
) -> None:
'''
@@ -83,7 +77,6 @@ '''
'''

monkeypatch.setenv("VLLM_SPYRE_DYNAMO_BACKEND", backend)
monkeypatch.setenv("VLLM_USE_V1", "1" if vllm_version == "V1" else "0")
patch_warmup_shapes([warmup_shape], monkeypatch)

prompts = get_chicken_soup_prompts(4)
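
Dropping `vllm_version` halves the embedding test matrix: backend and warmup shape are the only remaining dimensions. Below is a standalone sketch of the reduced parametrization; the backend list and shapes are illustrative stand-ins for `get_spyre_backend_list()` and the repository's warmup-shape helpers:

```python
import os

import pytest

# Illustrative stand-ins for the helpers used by the real test module.
BACKENDS = ["eager"]
WARMUP_SHAPES = [(64, 4), (64, 8), (128, 4), (128, 8)]  # (prompt_length, batch_size)


@pytest.mark.parametrize("warmup_shape", WARMUP_SHAPES)
@pytest.mark.parametrize("backend", BACKENDS)
def test_output_shapes(warmup_shape, backend, monkeypatch):
    # The engine version is no longer a parameter: embedding tests always run
    # on V1, so only the Spyre backend needs to be configured.
    monkeypatch.setenv("VLLM_SPYRE_DYNAMO_BACKEND", backend)
    assert os.environ["VLLM_SPYRE_DYNAMO_BACKEND"] == backend
```
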
1 change: 0 additions & 1 deletion tests/e2e/test_spyre_prompt_logprobs.py
@@ -40,7 +40,6 @@ def test_prompt_logprobs(

prompts = get_chicken_soup_prompts(4)

monkeypatch.setenv("VLLM_USE_V1", "1")
monkeypatch.setenv("VLLM_SPYRE_DYNAMO_BACKEND", backend)
monkeypatch.setenv("VLLM_SPYRE_ENABLE_PROMPT_LOGPROBS", "1")
llm = LLM(model, tensor_parallel_size=tp_size, tokenizer=model)
1 change: 0 additions & 1 deletion tests/e2e/test_spyre_static_batching_limits.py
@@ -30,7 +30,6 @@ def test_max_prompt_len_and_new_tokens(model: str,
'''
monkeypatch.setenv("VLLM_SPYRE_DYNAMO_BACKEND", backend)
patch_warmup_shapes(warmup_shapes, monkeypatch)
monkeypatch.setenv("VLLM_USE_V1", "1")

max_prompt_length = max([t[0] for t in warmup_shapes])
max_new_tokens = max([t[1] for t in warmup_shapes])
1 change: 0 additions & 1 deletion tests/scheduling_utils.py
@@ -56,7 +56,6 @@ def check_scheduler_inference_steps(
"""

# set env vars
monkeypatch.setenv("VLLM_USE_V1", "1")
monkeypatch.setenv("VLLM_SPYRE_DYNAMO_BACKEND", backend)
if use_cb:
monkeypatch.setenv("VLLM_SPYRE_USE_CB", "1")
4 changes: 1 addition & 3 deletions tests/spyre_util.py
@@ -203,7 +203,6 @@ def generate_spyre_vllm_output(
",".join(str(val) for val in warmup_batch_size))
# --------------
monkeypatch.setenv("VLLM_SPYRE_USE_CB", "1" if use_cb else "0")
monkeypatch.setenv("VLLM_USE_V1", "1")
monkeypatch.setenv("VLLM_SPYRE_DYNAMO_BACKEND", backend)

# Allows to run multiprocess V1 engine without dumping meaningless logs at
@@ -417,8 +416,7 @@ def check_output_against_hf(model, backend, max_new_tokens, vllm_results,
# vLLM / Spyre
def spyre_vllm_embeddings(model: str, prompts: list[str], max_model_len: int,
block_size: int, tensor_parallel_size: int,
backend: str,
vllm_version: str) -> list[dict[str, Any]]:
backend: str) -> list[dict[str, Any]]:

vllm_model = LLM(model=model,
tokenizer=model,
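
Call sites of this helper now simply drop the `vllm_version` argument. A hedged sketch of the updated call, assuming the helper can be imported from `tests/spyre_util.py`; the model name and values mirror the embedding tests but are only illustrative:

```python
# Import path assumes the tests directory is on sys.path (e.g. running pytest
# from within tests/); adjust as needed.
from spyre_util import spyre_vllm_embeddings

# No vllm_version argument anymore: the helper always runs on the default (V1)
# engine.
results = spyre_vllm_embeddings(
    model="sentence-transformers/all-roberta-large-v1",
    prompts=["Hello, Spyre!"],
    max_model_len=256,
    block_size=256,
    tensor_parallel_size=1,
    backend="eager",
)
print(f"got {len(results)} embedding result(s)")
```
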
1 change: 0 additions & 1 deletion tests/v1/worker/test_spyre_input_batch.py
@@ -211,7 +211,6 @@ def same(t1: Optional[torch.Tensor], t2: Optional[torch.Tensor]) -> bool:
sampling_metadata.bad_words_token_ids


@pytest.mark.v1
@pytest.mark.worker
@pytest.mark.parametrize("batch_size", [1, 2, 32, 64])
def test_sampling_metadata_in_input_batch(batch_size: int):
Empty file removed vllm_spyre/core/__init__.py
Empty file.