20 changes: 15 additions & 5 deletions tests/e2e/test_spyre_basic.py
@@ -33,8 +33,13 @@
"warmup_shape", [(64, 20, 4), (64, 20, 8), (128, 20, 4),
(128, 20, 8)]) # (prompt_length/new_tokens/batch_size)
@pytest.mark.parametrize("backend", get_spyre_backend_list())
def test_output(model: str, prompts: list[str],
warmup_shape: tuple[int, int, int], backend: str) -> None:
def test_output(
model: str,
prompts: list[str],
warmup_shape: tuple[int, int, int],
backend: str,
monkeypatch: pytest.MonkeyPatch,
) -> None:
'''
The warmup is based on a single shape. After the warmup,
one request with the provided prompts is input to vLLM.
@@ -64,7 +69,8 @@ def test_output(model: str, prompts: list[str],
block_size=2048,
sampling_params=vllm_sampling_params,
tensor_parallel_size=1,
backend=backend)
backend=backend,
monkeypatch=monkeypatch)

hf_results = generate_hf_output(model=model,
prompts=prompts,
@@ -92,6 +98,7 @@ def test_output_sendnn_decoder(
prompts: list[str],
warmup_shape: tuple[int, int, int],
backend: str,
monkeypatch: pytest.MonkeyPatch,
) -> None:
'''
Tests the deprecated sendnn_decoder backend, which should fall-back to
@@ -114,7 +121,8 @@ def test_output_sendnn_decoder(
block_size=2048,
sampling_params=vllm_sampling_params,
tensor_parallel_size=1,
backend=backend)
backend=backend,
monkeypatch=monkeypatch)

hf_results = generate_hf_output(model=model,
prompts=prompts,
@@ -134,6 +142,7 @@ def test_output_sendnn_decoder(
def test_batch_handling(
model: str,
backend: str,
monkeypatch: pytest.MonkeyPatch,
):
"""Test that the spyre worker correctly handles batches of requests that
finish after different numbers of forward passes"""
@@ -166,7 +175,8 @@ def test_batch_handling(
block_size=2048,
sampling_params=vllm_sampling_params,
tensor_parallel_size=1,
backend=backend)
backend=backend,
monkeypatch=monkeypatch)

assert vllm_results[0]["text"] == " 3 2 "
assert vllm_results[1]["text"] == " 6 5 4 3 2 "
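The common thread in this file is that each test now takes pytest's monkeypatch fixture and forwards it to generate_spyre_vllm_output. A minimal sketch of how such a helper could consume the fixture is below, assuming the backend is selected through an environment variable (the variable name and the helper body are illustrative, not the repository's actual implementation):

from vllm import LLM

def generate_spyre_vllm_output(model, prompts, max_model_len, block_size,
                               sampling_params, tensor_parallel_size, backend,
                               monkeypatch, **kwargs):
    # Assumed: the Spyre backend is chosen via an env var. Setting it through
    # monkeypatch scopes the change to this test and undoes it at teardown.
    monkeypatch.setenv("VLLM_SPYRE_DYNAMO_BACKEND", backend)
    llm = LLM(model=model,
              max_model_len=max_model_len,
              block_size=block_size,
              tensor_parallel_size=tensor_parallel_size)
    outputs = llm.generate(prompts, sampling_params)
    return [{"text": out.outputs[0].text} for out in outputs]

Passing the fixture in explicitly, rather than mutating os.environ inside the helper, is what keeps one parametrized backend choice from leaking into the next test.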
93 changes: 44 additions & 49 deletions tests/e2e/test_spyre_cb.py
@@ -8,7 +8,8 @@
from typing import Any

import pytest
from spyre_util import (create_random_request, generate_cb_spyre_vllm_output,
from spyre_util import (compare_results, create_random_request,
generate_hf_output, generate_spyre_vllm_output,
get_spyre_backend_list, get_spyre_model_list)
from vllm import EngineArgs, SamplingParams
from vllm.v1.engine import EngineCoreRequest
@@ -17,33 +18,26 @@

from vllm_spyre.v1.core.scheduler import ContinuousBatchingSpyreScheduler

template = (
"Below is an instruction that describes a task. Write a response that "
"appropriately completes the request. Be polite in your response to the "
"user.\n\n### Instruction:\n{}\n\n### Response:")


@pytest.mark.cb
@pytest.mark.parametrize("max_num_seqs", [2, 3, 4],
ids=lambda val: f"max_num_seqs({val})")
@pytest.mark.parametrize("model", get_spyre_model_list())
@pytest.mark.parametrize("backend", get_spyre_backend_list())
@pytest.mark.parametrize(
"prompts",
[
[
"7 6 5 4",
"10 9 8 7",
],
[
"7 6 5 4",
"10 9 8 7",
"8 7 6 5",
],
[
"7 6 5 4",
"10 9 8 7",
"8 7 6 5",
"9 8 7 6",
],
],
ids=lambda val: f"num_prompts({len(val)})",
)
@pytest.mark.parametrize("prompts", [[
template.format("Provide a list of instructions "
"for preparing chicken soup."),
template.format("Provide me a list of things that I can do with my "
"new found wealth."),
template.format(
"how do I add multiple new columns in m for power query or power bi?"),
template.format("Convert char to string in Java."),
]])
def test_cb_handling(
model: str,
backend: str,
@@ -55,16 +49,17 @@ def test_cb_handling(
continuous batches of requests that
finish after different numbers of forward passes"""

vllm_sampling_params = SamplingParams(max_tokens=20,
max_tokens = 20

vllm_sampling_params = SamplingParams(max_tokens=max_tokens,
temperature=0,
stop="1",
ignore_eos=True,
logprobs=0)

# Ensure that both:
# - The model doesn't crash
# - The output sequences are correct
vllm_results = generate_cb_spyre_vllm_output(
vllm_results = generate_spyre_vllm_output(
model=model,
prompts=prompts,
max_model_len=2048,
@@ -73,29 +68,31 @@ def test_cb_handling(
tensor_parallel_size=1,
backend=backend,
max_num_seqs=max_num_seqs,
use_cb=1,
monkeypatch=monkeypatch,
)
use_cb=True,
monkeypatch=monkeypatch)

hf_results = generate_hf_output(model=model,
prompts=prompts,
max_new_tokens=max_tokens)

for i, prompt in enumerate(prompts):
assert (vllm_results[i]["text"] == [
" " + " ".join(
str(i)
for i in range(int(prompt.split()[-1]) - 1, 1, -1)) + " "
][0])
compare_results(model=model,
prompts=prompts,
warmup_shapes=[],
tensor_parallel_size=1,
backend=backend,
vllm_results=vllm_results,
hf_results=hf_results)


@pytest.mark.cb
@pytest.mark.parametrize("max_num_seqs", [2])
@pytest.mark.parametrize("model", get_spyre_model_list())
@pytest.mark.parametrize(
"backend", [pytest.param("eager", marks=pytest.mark.cpu, id="eager")])
@pytest.mark.parametrize("cb",
[pytest.param(1, marks=pytest.mark.cb, id="cb")])
def test_cb_max_tokens(
model: str,
backend: str,
max_num_seqs: int,
cb: int,
monkeypatch: pytest.MonkeyPatch,
):
"""Test that continuous batches of requests that
@@ -112,18 +109,16 @@ def test_cb_max_tokens(
logprobs=0)

with pytest.raises(ValueError, match="max model context length"):
generate_cb_spyre_vllm_output(
model=model,
prompts=overflow_prompt,
max_model_len=max_model_len,
block_size=max_model_len,
sampling_params=vllm_sampling_params,
tensor_parallel_size=1,
backend=backend,
max_num_seqs=max_num_seqs,
use_cb=cb,
monkeypatch=monkeypatch,
)
generate_spyre_vllm_output(model=model,
prompts=overflow_prompt,
max_model_len=max_model_len,
block_size=max_model_len,
sampling_params=vllm_sampling_params,
tensor_parallel_size=1,
backend=backend,
max_num_seqs=max_num_seqs,
use_cb=True,
monkeypatch=monkeypatch)


def get_params_test_blocks_borders_aligned_prompts():
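test_spyre_cb.py replaces the hard-coded countdown assertions with a comparison against a Hugging Face reference, via generate_hf_output and compare_results from spyre_util. A hedged sketch of what such a reference helper might look like (greedy decoding with transformers; not necessarily how spyre_util implements it):

from transformers import AutoModelForCausalLM, AutoTokenizer

def generate_hf_output(model, prompts, max_new_tokens):
    tokenizer = AutoTokenizer.from_pretrained(model)
    hf_model = AutoModelForCausalLM.from_pretrained(model)
    results = []
    for prompt in prompts:
        inputs = tokenizer(prompt, return_tensors="pt")
        generated = hf_model.generate(**inputs,
                                      max_new_tokens=max_new_tokens,
                                      do_sample=False)
        # Drop the prompt tokens so only the completion is compared.
        completion = generated[0][inputs["input_ids"].shape[1]:]
        results.append(
            {"text": tokenizer.decode(completion, skip_special_tokens=True)})
    return results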
4 changes: 3 additions & 1 deletion tests/e2e/test_spyre_max_new_tokens.py
@@ -33,6 +33,7 @@ def test_output(
stop_last: bool,
warmup_shape: tuple[int, int, int],
backend: str,
monkeypatch: pytest.MonkeyPatch,
) -> None:
'''
The warmup is based on a single shape. After the warmup,
@@ -84,7 +85,8 @@ def test_output(
block_size=2048,
sampling_params=vllm_sampling_params,
tensor_parallel_size=1,
backend=backend)
backend=backend,
monkeypatch=monkeypatch)

hf_results = generate_hf_output(model=model,
prompts=prompts,
4 changes: 3 additions & 1 deletion tests/e2e/test_spyre_seed.py
@@ -29,6 +29,7 @@ def test_seed(
seed: int,
warmup_shape: tuple[int, int, int],
backend: str,
monkeypatch: pytest.MonkeyPatch,
) -> None:
'''
The warmup is based on a single shape. After the warmup,
@@ -57,7 +58,8 @@ def test_seed(
block_size=2048,
sampling_params=vllm_sampling_params,
tensor_parallel_size=1,
backend=backend)
backend=backend,
monkeypatch=monkeypatch)

# compare all generated outputs against the first generated output
for vllm_result in vllm_results:
4 changes: 3 additions & 1 deletion tests/e2e/test_spyre_tensor_parallel.py
@@ -30,6 +30,7 @@ def test_output(
warmup_shapes: list[tuple[int, int, int]],
tp_size: int,
backend: str,
monkeypatch: pytest.MonkeyPatch,
) -> None:
'''
The warmup is based on one or multiple shapes. After the warmup,
@@ -62,7 +63,8 @@ def test_output(
block_size=2048,
sampling_params=vllm_sampling_params,
tensor_parallel_size=tp_size,
backend=backend)
backend=backend,
monkeypatch=monkeypatch)

hf_results = generate_hf_output(model=model,
prompts=prompts,
8 changes: 6 additions & 2 deletions tests/e2e/test_spyre_warmup_shapes.py
@@ -31,6 +31,7 @@ def test_output(
prompts: list[str],
warmup_shapes: list[tuple[int, int, int]],
backend: str,
monkeypatch: pytest.MonkeyPatch,
) -> None:
'''
The warmup is based on two shapes, that 'overlap' each
@@ -68,7 +69,8 @@ def test_output(
block_size=2048,
sampling_params=vllm_sampling_params,
tensor_parallel_size=1,
backend=backend)
backend=backend,
monkeypatch=monkeypatch)

hf_results = generate_hf_output(model=model,
prompts=prompts,
@@ -92,6 +94,7 @@ def test_invalid_prompt_len(
prompts: list[str],
warmup_shapes: list[tuple[int, int, int]],
backend: str,
monkeypatch: pytest.MonkeyPatch,
) -> None:
'''
Expects an error to be raised if the warmup prompt length
@@ -111,4 +114,5 @@
block_size=64,
sampling_params=vllm_sampling_params,
tensor_parallel_size=1,
backend=backend)
backend=backend,
monkeypatch=monkeypatch)