Merged
4 changes: 2 additions & 2 deletions docs/user_guide/configuration.md
@@ -13,8 +13,8 @@ To run inference on IBM Spyre Accelerators, the backend should be set as:
| --- | --- | --- | --- |
| Decoder | v0 | sendnn | V0 support for decoder models is deprecated |
| Decoder | v1 | sendnn | |
-| Embedding | v0 | sendnn | |
-| Embedding | v1 | N/A | Embedding models are not yet supported on V1 |
+| Embedding | v0 | sendnn | V0 support for embedding models is deprecated|
+| Embedding | v1 | sendnn | |

## Batching Modes

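To make the table concrete, here is a minimal sketch of how the backend setting is applied before constructing an engine; it mirrors the pattern used in the offline-inference examples changed below, and the model path is a placeholder rather than anything from this repo.

```python
import os

# Per the table above, "sendnn" is the backend for IBM Spyre Accelerators.
# The examples in this PR fall back to "eager" for non-Spyre development.
if "VLLM_SPYRE_DYNAMO_BACKEND" not in os.environ:
    os.environ["VLLM_SPYRE_DYNAMO_BACKEND"] = "sendnn"

from vllm import LLM, SamplingParams

llm = LLM(model="/models/my-decoder-model")  # placeholder model path
outputs = llm.generate(["Hello, Spyre!"], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)
```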
2 changes: 1 addition & 1 deletion docs/user_guide/supported_features.md
@@ -10,7 +10,7 @@ This table summarize the status of features on Spyre. By default, those features
| Prompt Adapter | ⛔ | Being deprecated in vLLM [vllm#13981](https://github.com/vllm-project/vllm/issues/13981) |
| Speculative Decoding | 🗓️ | |
| Guided Decoding | 🗓️ | |
-| Pooling | ⚠️ | Works with V0. V1 still being developed in vLLM [vllm#18052](https://github.com/vllm-project/vllm/issues/18052) |
+| Pooling | | |

Collaborator:
We already have Embedding models at the end of this table - is that still needed?

Collaborator Author:
question for @maxdebayser

Collaborator:
Since we don't support all pooling applications, I think it's better to remove this and leave just Embedding below.

| Enc-dec | ⛔ | No plans for now |
| Multi Modality | 🗓️ | |
| LogProbs | ✅ | |
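For context on the thread above, a rough sketch of the embedding path that stays in the table, assuming `LLM.encode` as the pooling entry point (the tests in this PR go through the `spyre_vllm_embeddings` helper instead, and the model path here is a placeholder).

```python
import os

os.environ.setdefault("VLLM_SPYRE_DYNAMO_BACKEND", "eager")  # CPU sketch; use "sendnn" on Spyre

from vllm import LLM

# Placeholder embedding model path; Spyre warmup shapes / batching config omitted.
llm = LLM(model="/models/my-embedding-model", max_model_len=256)
outputs = llm.encode(["Below is an instruction that describes a task."])
print(f"got {len(outputs)} pooling output(s)")
```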
1 change: 0 additions & 1 deletion examples/offline_inference/cb_spyre_inference.py
@@ -42,7 +42,6 @@
if "VLLM_SPYRE_DYNAMO_BACKEND" not in os.environ:
os.environ['VLLM_SPYRE_DYNAMO_BACKEND'] = 'eager'
os.environ['VLLM_SPYRE_USE_CB'] = '1'
-os.environ['VLLM_USE_V1'] = '1'

template = (
"Below is an instruction that describes a task. Write a response that "
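After this deletion, the example's environment setup reduces to the lines below (assembled from the surrounding context; `VLLM_USE_V1` is dropped on the assumption that the V1 engine is now the default path).

```python
import os

# Continuous batching on Spyre: only the backend and the CB flag remain.
if "VLLM_SPYRE_DYNAMO_BACKEND" not in os.environ:
    os.environ["VLLM_SPYRE_DYNAMO_BACKEND"] = "eager"
os.environ["VLLM_SPYRE_USE_CB"] = "1"
```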
1 change: 0 additions & 1 deletion examples/offline_inference/long_context.py
@@ -68,7 +68,6 @@
if "VLLM_SPYRE_DYNAMO_BACKEND" not in os.environ:
os.environ['VLLM_SPYRE_DYNAMO_BACKEND'] = 'eager'
os.environ['VLLM_SPYRE_USE_CB'] = '1'
-os.environ['VLLM_USE_V1'] = '1'

template = ("Summarize the following code: \n\n{}")

21 changes: 2 additions & 19 deletions tests/conftest.py
@@ -1,19 +1,5 @@
-# 🌶️🌶️🌶️ Hack to allow testing of both engines
-import os
-
-# If `VLLM_USE_V1=1` is set upon first vLLM import, then there is a side effect
-# that will cause the V1 engine to always be selected. This is intentionally
-# done for backwards-compatibility of code that was using the AsyncLLMEngine
-# constructor directly, instead of using the `.from_engine_args` construction
-# methods that will select the appropriate v0 or v1 engine. See:
-# https://github.com/vllm-project/vllm/blob/v0.8.4/vllm/engine/llm_engine.py#L2169-L2171
-# Deleting VLLM_USE_V1 here before importing vLLM allows us to continue testing
-# both engines.
-if "VLLM_USE_V1" in os.environ:
-    del os.environ["VLLM_USE_V1"]
-# 🌶️🌶️🌶️ end hack
-
import hashlib
import os
import random

import pytest
@@ -98,8 +84,7 @@ def remote_openai_server(request):
max_num_seqs = params["max_num_seqs"]
env_dict = {
"VLLM_SPYRE_USE_CB": "1",
"VLLM_SPYRE_DYNAMO_BACKEND": backend,
"VLLM_USE_V1": "1"
"VLLM_SPYRE_DYNAMO_BACKEND": backend
}
server_args = [
"--max_num_seqs",
@@ -121,8 +106,6 @@
','.join(map(str, warmup_batch_size)),
"VLLM_SPYRE_DYNAMO_BACKEND":
backend,
"VLLM_USE_V1":
"1"
}

# Default to None if not present
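With the import-time hack removed, per-test environment control can rely entirely on pytest's `monkeypatch`, as the e2e tests below already do. A simplified sketch (the fixture name is illustrative; only the env vars appear in this PR):

```python
import pytest


@pytest.fixture
def spyre_cb_env(monkeypatch):
    """Scope Spyre env vars to a single test instead of mutating
    os.environ at import time."""
    monkeypatch.setenv("VLLM_SPYRE_DYNAMO_BACKEND", "eager")
    monkeypatch.setenv("VLLM_SPYRE_USE_CB", "1")
    # No VLLM_USE_V1 handling is needed any more.
```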
1 change: 0 additions & 1 deletion tests/e2e/test_spyre_async_llm.py
@@ -65,7 +65,6 @@ async def test_abort(
):
"""Test handling of cancelled requests"""
with monkeypatch.context() as m, ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
m.setenv("VLLM_SPYRE_DYNAMO_BACKEND", backend)
if cb == 1:
m.setenv("VLLM_SPYRE_USE_CB", "1")
1 change: 0 additions & 1 deletion tests/e2e/test_spyre_basic.py
@@ -205,7 +205,6 @@ def test_full_batch_scheduling(model: str, backend: str, monkeypatch):
f"{max_batched_tokens}")
monkeypatch.setenv("VLLM_SPYRE_WARMUP_NEW_TOKENS", "20")

monkeypatch.setenv("VLLM_USE_V1", "1")
monkeypatch.setenv("VLLM_SPYRE_DYNAMO_BACKEND", backend)

# Setup the engine
9 changes: 1 addition & 8 deletions tests/e2e/test_spyre_embeddings.py
@@ -18,12 +18,10 @@
[(64, 4), (64, 8), (128, 4),
(128, 8)]) # (prompt_length/batch_size)
@pytest.mark.parametrize("backend", get_spyre_backend_list())
@pytest.mark.parametrize("vllm_version", ["V0", "V1"])
def test_output(
model: str,
warmup_shape: tuple[int, int],
backend: str,
-vllm_version: str,
monkeypatch,
) -> None:
'''
@@ -34,7 +32,6 @@ '''
'''

monkeypatch.setenv("VLLM_SPYRE_DYNAMO_BACKEND", backend)
monkeypatch.setenv("VLLM_USE_V1", "1" if vllm_version == "V1" else "0")
patch_warmup_shapes([warmup_shape], monkeypatch)

prompts = get_chicken_soup_prompts(1)
@@ -44,8 +41,7 @@
max_model_len=256,
block_size=256,
tensor_parallel_size=1,
-backend=backend,
-vllm_version=vllm_version)
+backend=backend)

hf_results = st_embeddings(model=model, prompts=prompts)

@@ -65,12 +61,10 @@
]) # (prompt_length/batch_size)
@pytest.mark.parametrize("backend", get_spyre_backend_list())
@pytest.mark.parametrize("model", get_spyre_model_list(isEmbeddings=True))
@pytest.mark.parametrize("vllm_version", ["V0", "V1"])
def test_scheduling_invariance(
model,
backend,
warmup_shape: tuple[int, int],
-vllm_version,
monkeypatch,
) -> None:
'''
@@ -83,7 +77,6 @@ '''
'''

monkeypatch.setenv("VLLM_SPYRE_DYNAMO_BACKEND", backend)
monkeypatch.setenv("VLLM_USE_V1", "1" if vllm_version == "V1" else "0")
patch_warmup_shapes([warmup_shape], monkeypatch)

prompts = get_chicken_soup_prompts(4)
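For reference, the post-PR shape of the helper call exercised by these tests, reconstructed from the context lines above and the `spyre_vllm_embeddings` signature further down (import path and argument values are illustrative):

```python
from spyre_util import spyre_vllm_embeddings  # assumed import path within tests/

results = spyre_vllm_embeddings(
    model="/models/my-embedding-model",  # placeholder
    prompts=["Write a response that completes the request."],
    max_model_len=256,
    block_size=256,
    tensor_parallel_size=1,
    backend="eager",  # or a value from get_spyre_backend_list()
)  # note: no vllm_version argument after this PR
```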
1 change: 0 additions & 1 deletion tests/e2e/test_spyre_prompt_logprobs.py
@@ -40,7 +40,6 @@ def test_prompt_logprobs(

prompts = get_chicken_soup_prompts(4)

monkeypatch.setenv("VLLM_USE_V1", "1")
monkeypatch.setenv("VLLM_SPYRE_DYNAMO_BACKEND", backend)
monkeypatch.setenv("VLLM_SPYRE_ENABLE_PROMPT_LOGPROBS", "1")
llm = LLM(model, tensor_parallel_size=tp_size, tokenizer=model)
1 change: 0 additions & 1 deletion tests/e2e/test_spyre_static_batching_limits.py
@@ -30,7 +30,6 @@ def test_max_prompt_len_and_new_tokens(model: str,
'''
monkeypatch.setenv("VLLM_SPYRE_DYNAMO_BACKEND", backend)
patch_warmup_shapes(warmup_shapes, monkeypatch)
monkeypatch.setenv("VLLM_USE_V1", "1")

max_prompt_length = max([t[0] for t in warmup_shapes])
max_new_tokens = max([t[1] for t in warmup_shapes])
1 change: 0 additions & 1 deletion tests/scheduling_utils.py
@@ -56,7 +56,6 @@ def check_scheduler_inference_steps(
"""

# set env vars
monkeypatch.setenv("VLLM_USE_V1", "1")
monkeypatch.setenv("VLLM_SPYRE_DYNAMO_BACKEND", backend)
if use_cb:
monkeypatch.setenv("VLLM_SPYRE_USE_CB", "1")
4 changes: 1 addition & 3 deletions tests/spyre_util.py
@@ -203,7 +203,6 @@ def generate_spyre_vllm_output(
",".join(str(val) for val in warmup_batch_size))
# --------------
monkeypatch.setenv("VLLM_SPYRE_USE_CB", "1" if use_cb else "0")
monkeypatch.setenv("VLLM_USE_V1", "1")
monkeypatch.setenv("VLLM_SPYRE_DYNAMO_BACKEND", backend)

# Allows to run multiprocess V1 engine without dumping meaningless logs at
@@ -417,8 +416,7 @@ def check_output_against_hf(model, backend, max_new_tokens, vllm_results,
# vLLM / Spyre
def spyre_vllm_embeddings(model: str, prompts: list[str], max_model_len: int,
block_size: int, tensor_parallel_size: int,
-backend: str,
-vllm_version: str) -> list[dict[str, Any]]:
+backend: str) -> list[dict[str, Any]]:

vllm_model = LLM(model=model,
tokenizer=model,
Empty file removed vllm_spyre/core/__init__.py