
Commit 14dd264

[refact] Remove V0 tests (#241)
Remove V0 tests, except for embedding models.

Signed-off-by: Wallas Santos <[email protected]>
Signed-off-by: Prashant Gupta <[email protected]>
Co-authored-by: Prashant Gupta <[email protected]>
1 parent eeedaf4 commit 14dd264

15 files changed: +39 -100 lines

.github/workflows/test.yml

Lines changed: 9 additions & 9 deletions
```diff
@@ -39,17 +39,17 @@ jobs:
           - name: "vLLM:main"
             repo: "git+https://github.com/vllm-project/vllm --branch main"
         test_suite:
-          - name: "V0-e2e"
-            markers: "v0 and cpu and e2e"
-            flags: "--timeout=300"
-          - name: "V1-e2e"
-            markers: "v1 and cpu and e2e and not cb"
+          - name: "static batching"
+            markers: "cpu and decoder and not cb"
+            flags: "--timeout=300 --forked"
+          - name: "embedding"
+            markers: "cpu and embedding"
             flags: "--timeout=300 --forked"
-          - name: "V1-cb"
-            markers: "v1 and cpu and cb"
+          - name: "continuous batching"
+            markers: "cpu and cb"
             flags: "--timeout=300 --forked"
-          - name: "V1-worker and utils"
-            markers: "v1 and not e2e or utils"
+          - name: "worker and utils"
+            markers: "not e2e"
             flags: "--timeout=300"
 
     name: "${{ matrix.test_suite.name }} (${{ matrix.vllm_version.name }})"
```

docs/contributing/README.md

Lines changed: 1 addition & 1 deletion
````diff
@@ -92,7 +92,7 @@ uv pip install --group dev
 Now, you can run the tests:
 
 ```sh
-python -m pytest -v -x tests -m "v1 and cpu and e2e"
+python -m pytest -v -x tests -m "cpu and e2e"
 ```
 
 Here is a list of `pytest` markers you can use to filter them:
````

pyproject.toml

Lines changed: 0 additions & 2 deletions
```diff
@@ -121,8 +121,6 @@ pythonpath = ["."]
 markers = [
     "skip_global_cleanup",
     "e2e: Tests using end-to-end engine spin-up",
-    "v0: Tests using vLLM v0 engine",
-    "v1: Tests using vLLM v1 engine",
     "cb: Continuous batching tests",
     "cpu: Tests using CPU (i.e. eager) backend",
     "spyre: Tests using Spyre hardware backend",
```

tests/conftest.py

Lines changed: 1 addition & 3 deletions
```diff
@@ -68,7 +68,6 @@ def remote_openai_server(request):
         model = params['model']
         warmup_shape = params['warmup_shape']
         backend = params['backend']
-        vllm_version = params['vllm_version']
     except KeyError as e:
         raise pytest.UsageError(
             "Error setting up remote_openai_server params") from e
@@ -79,14 +78,13 @@ def remote_openai_server(request):
     warmup_prompt_length = [t[0] for t in warmup_shape]
     warmup_new_tokens = [t[1] for t in warmup_shape]
     warmup_batch_size = [t[2] for t in warmup_shape]
-    v1_flag = "1" if vllm_version == "V1" else "0"
     env_dict = {
         "VLLM_SPYRE_WARMUP_PROMPT_LENS":
         ','.join(map(str, warmup_prompt_length)),
         "VLLM_SPYRE_WARMUP_NEW_TOKENS": ','.join(map(str, warmup_new_tokens)),
         "VLLM_SPYRE_WARMUP_BATCH_SIZES": ','.join(map(str, warmup_batch_size)),
         "VLLM_SPYRE_DYNAMO_BACKEND": backend,
-        "VLLM_USE_V1": v1_flag
+        "VLLM_USE_V1": "1"
     }
 
     # Add extra server args if present in test
```
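The fixture now always pins `VLLM_USE_V1` to `"1"`. A standalone sketch of how a warmup-shape list expands into the server environment, mirroring the fixture code above (the concrete shapes here are illustrative):

```python
# Sketch mirroring the fixture: one tuple per warmup shape,
# (prompt_length, new_tokens, batch_size), joined into comma-separated strings.
warmup_shape = [(64, 20, 4), (128, 20, 8)]

env_dict = {
    "VLLM_SPYRE_WARMUP_PROMPT_LENS": ",".join(str(t[0]) for t in warmup_shape),  # "64,128"
    "VLLM_SPYRE_WARMUP_NEW_TOKENS": ",".join(str(t[1]) for t in warmup_shape),   # "20,20"
    "VLLM_SPYRE_WARMUP_BATCH_SIZES": ",".join(str(t[2]) for t in warmup_shape),  # "4,8"
    "VLLM_USE_V1": "1",  # always V1 now; the old v1_flag toggle is gone
}
print(env_dict)
```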

tests/e2e/test_spyre_basic.py

Lines changed: 7 additions & 23 deletions
```diff
@@ -4,7 +4,7 @@
 """
 
 import pytest
-from spyre_util import (VLLM_VERSIONS, compare_results, create_random_request,
+from spyre_util import (compare_results, create_random_request,
                         generate_hf_output, generate_spyre_vllm_output,
                         get_spyre_backend_list, get_spyre_model_list)
 from vllm import EngineArgs, SamplingParams
@@ -33,14 +33,8 @@
     "warmup_shape", [(64, 20, 4), (64, 20, 8), (128, 20, 4),
                      (128, 20, 8)])  # (prompt_length/new_tokens/batch_size)
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
-@pytest.mark.parametrize("vllm_version", VLLM_VERSIONS)
-def test_output(
-    model: str,
-    prompts: list[str],
-    warmup_shape: tuple[int, int, int],
-    backend: str,
-    vllm_version: str,
-) -> None:
+def test_output(model: str, prompts: list[str],
+                warmup_shape: tuple[int, int, int], backend: str) -> None:
     '''
     The warmup is based on a single shape. After the warmup,
     one request with the provided prompts is input to vLLM.
@@ -70,8 +64,7 @@ def test_output(
         block_size=2048,
         sampling_params=vllm_sampling_params,
         tensor_parallel_size=1,
-        backend=backend,
-        vllm_version=vllm_version)
+        backend=backend)
 
     hf_results = generate_hf_output(model=model,
                                     prompts=prompts,
@@ -94,13 +87,11 @@
 @pytest.mark.parametrize(
     "warmup_shape", [(64, 20, 4)])  # (prompt_length/new_tokens/batch_size)
 @pytest.mark.parametrize("backend", ["sendnn_decoder"])
-@pytest.mark.parametrize("vllm_version", VLLM_VERSIONS)
 def test_output_sendnn_decoder(
     model: str,
     prompts: list[str],
     warmup_shape: tuple[int, int, int],
     backend: str,
-    vllm_version: str,
 ) -> None:
     '''
     Tests the deprecated sendnn_decoder backend, which should fall-back to
@@ -123,8 +114,7 @@ def test_output_sendnn_decoder(
         block_size=2048,
         sampling_params=vllm_sampling_params,
         tensor_parallel_size=1,
-        backend=backend,
-        vllm_version=vllm_version)
+        backend=backend)
 
     hf_results = generate_hf_output(model=model,
                                     prompts=prompts,
@@ -141,11 +131,9 @@ def test_output_sendnn_decoder(
 
 @pytest.mark.parametrize("model", get_spyre_model_list())
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
-@pytest.mark.parametrize("vllm_version", VLLM_VERSIONS)
 def test_batch_handling(
     model: str,
     backend: str,
-    vllm_version: str,
 ):
     """Test that the spyre worker correctly handles batches of requests that
     finish after different numbers of forward passes"""
@@ -178,8 +166,7 @@ def test_batch_handling(
         block_size=2048,
         sampling_params=vllm_sampling_params,
         tensor_parallel_size=1,
-        backend=backend,
-        vllm_version=vllm_version)
+        backend=backend)
 
     assert vllm_results[0]["text"] == " 3 2 "
     assert vllm_results[1]["text"] == " 6 5 4 3 2 "
@@ -189,10 +176,7 @@
 
 @pytest.mark.parametrize("model", get_spyre_model_list())
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
-@pytest.mark.parametrize("vllm_version",
-                         [pytest.param("V1", marks=pytest.mark.v1, id="v1")])
-def test_full_batch_scheduling(model: str, backend: str, vllm_version: str,
-                               monkeypatch):
+def test_full_batch_scheduling(model: str, backend: str, monkeypatch):
     """Test that we can schedule a full batch of prompts."""
 
     # We need to ensure here that the max number of tokens in a full batch
```
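The same mechanical change repeats across the decoder tests: the `vllm_version` parametrize axis and the matching argument are dropped. A minimal, hypothetical before/after of the pattern (names here are illustrative, not from this commit):

```python
import pytest

# Before (pattern removed by this commit):
#   @pytest.mark.parametrize("vllm_version", VLLM_VERSIONS)
#   def test_something(backend: str, vllm_version: str) -> None:
#       ...
#
# After: the engine-version axis disappears and V1 is simply assumed.
@pytest.mark.parametrize("backend", ["eager", "sendnn_decoder"])
def test_something(backend: str) -> None:
    assert backend in ("eager", "sendnn_decoder")
```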

tests/e2e/test_spyre_cb.py

Lines changed: 0 additions & 3 deletions
```diff
@@ -19,7 +19,6 @@
 
 
 @pytest.mark.cb
-@pytest.mark.v1
 @pytest.mark.parametrize("max_num_seqs", [2, 3, 4],
                          ids=lambda val: f"max_num_seqs({val})")
 @pytest.mark.parametrize("model", get_spyre_model_list())
@@ -92,7 +91,6 @@ def test_cb_handling(
     "backend", [pytest.param("eager", marks=pytest.mark.cpu, id="eager")])
 @pytest.mark.parametrize("cb",
                          [pytest.param(1, marks=pytest.mark.cb, id="cb")])
-# @pytest.mark.v1
 def test_cb_max_tokens(
     model: str,
     backend: str,
@@ -648,7 +646,6 @@ def augment_checked_steps(
 
 
 @pytest.mark.cb
-@pytest.mark.v1
 @pytest.mark.parametrize("model", get_spyre_model_list())
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
 @pytest.mark.parametrize("max_num_seqs", [2])
```

tests/e2e/test_spyre_embeddings.py

Lines changed: 2 additions & 4 deletions
```diff
@@ -20,10 +20,8 @@
     [(64, 4), (64, 8), (128, 4),
      (128, 8)])  # (prompt_length/batch_size)
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
-@pytest.mark.parametrize(
-    "vllm_version",
-    [pytest.param("V0", marks=pytest.mark.v0, id="v0")
-     ])  # TODO: Replace with VLLM_VERSIONS when v1 is supported.
+# TODO: Add it when v1 is supported.
+@pytest.mark.parametrize("vllm_version", ["V0"])
 def test_output(
     model: str,
     prompts: list[str],
```

tests/e2e/test_spyre_max_new_tokens.py

Lines changed: 2 additions & 5 deletions
```diff
@@ -4,7 +4,7 @@
 """
 
 import pytest
-from spyre_util import (VLLM_VERSIONS, compare_results, generate_hf_output,
+from spyre_util import (compare_results, generate_hf_output,
                         generate_spyre_vllm_output, get_spyre_backend_list,
                         get_spyre_model_list)
 from vllm import SamplingParams
@@ -27,14 +27,12 @@
 @pytest.mark.parametrize(
     "warmup_shape", [(64, 10, 4)])  # (prompt_length/new_tokens/batch_size)
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
-@pytest.mark.parametrize("vllm_version", VLLM_VERSIONS)
 def test_output(
     model: str,
     prompts: list[str],
     stop_last: bool,
     warmup_shape: tuple[int, int, int],
     backend: str,
-    vllm_version: str,
 ) -> None:
     '''
     The warmup is based on a single shape. After the warmup,
@@ -86,8 +84,7 @@ def test_output(
         block_size=2048,
         sampling_params=vllm_sampling_params,
         tensor_parallel_size=1,
-        backend=backend,
-        vllm_version=vllm_version)
+        backend=backend)
 
     hf_results = generate_hf_output(model=model,
                                     prompts=prompts,
```

tests/e2e/test_spyre_online.py

Lines changed: 3 additions & 11 deletions
```diff
@@ -1,17 +1,14 @@
 import openai
 import pytest
-from spyre_util import (VLLM_VERSIONS, get_spyre_backend_list,
-                        get_spyre_model_list)
+from spyre_util import get_spyre_backend_list, get_spyre_model_list
 
 
 @pytest.mark.parametrize("model", get_spyre_model_list())
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
 @pytest.mark.parametrize("warmup_shape", [[
     (64, 20, 4),
 ]])
-@pytest.mark.parametrize("vllm_version", VLLM_VERSIONS)
-def test_openai_serving(remote_openai_server, model, warmup_shape, backend,
-                        vllm_version):
+def test_openai_serving(remote_openai_server, model, warmup_shape, backend):
     """Test online serving using the `vllm serve` CLI"""
 
     client = remote_openai_server.get_client()
@@ -48,10 +45,6 @@ def test_openai_serving(remote_openai_server, model, warmup_shape, backend,
         completion = client.completions.create(model=model,
                                                prompt="Hello World!",
                                                max_tokens=25)
-        # V1 should raise
-        assert vllm_version == "V0"
-        assert len(completion.choices) == 1
-        assert len(completion.choices[0].text) == 0
     except openai.BadRequestError as e:
         assert "warmup" in str(e)
 
@@ -60,9 +53,8 @@ def test_openai_serving(remote_openai_server, model, warmup_shape, backend,
 @pytest.mark.parametrize("backend", ["sendnn"])
 @pytest.mark.parametrize("quantization", ["gptq"])
 @pytest.mark.parametrize("warmup_shape", [[(64, 20, 4)]])
-@pytest.mark.parametrize("vllm_version", VLLM_VERSIONS)
 def test_openai_serving_gptq(remote_openai_server, model, backend,
-                             warmup_shape, vllm_version, quantization):
+                             warmup_shape, quantization):
     """Test online serving a GPTQ model with the sendnn backend only"""
 
     client = remote_openai_server.get_client()
```
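Outside of pytest, the same OpenAI-compatible endpoint can be exercised directly. A hedged sketch assuming a `vllm serve` instance is already listening on localhost:8000; the base URL, model name, and prompt are illustrative, while the `completions.create` call matches the one used in the test above:

```python
import openai

# Hypothetical standalone usage against a running `vllm serve` instance.
client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.completions.create(model="my-model",
                                        prompt="Hello World!",
                                        max_tokens=25)
print(completion.choices[0].text)
```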

tests/e2e/test_spyre_online_multi.py

Lines changed: 2 additions & 4 deletions
```diff
@@ -1,6 +1,5 @@
 import pytest
-from spyre_util import (VLLM_VERSIONS, get_spyre_backend_list,
-                        get_spyre_model_list)
+from spyre_util import get_spyre_backend_list, get_spyre_model_list
 
 
 @pytest.mark.multi
@@ -11,9 +10,8 @@
 @pytest.mark.parametrize(
     "backend", [b for b in get_spyre_backend_list() if "eager" not in str(b)])
 @pytest.mark.parametrize("tensor_parallel_size", ["2", "4", "8"])
-@pytest.mark.parametrize("vllm_version", VLLM_VERSIONS)
 def test_openai_tp_serving(remote_openai_server, model, warmup_shape, backend,
-                           vllm_version, tensor_parallel_size):
+                           tensor_parallel_size):
     """Test online serving with tensor parallelism using the `vllm serve` CLI"""
 
     client = remote_openai_server.get_client()
```
