Merged (changes from 10 commits)
5 changes: 2 additions & 3 deletions tests/conftest.py
@@ -152,12 +152,11 @@ def remote_openai_server(request):
"VLLM_SPYRE_USE_CB": "1",
"VLLM_SPYRE_DYNAMO_BACKEND": backend
}
-server_args = [
+server_args.extend([
Review comment (Collaborator, PR author): found a 🐛 !

"--max_num_seqs",
str(max_num_seqs), "--max-model-len",
str(max_model_len)
-]
+])
else:
warmup_shape = params['warmup_shape']
warmup_prompt_length = [t[0] for t in warmup_shape]
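
The 🐛 flagged above: in the continuous-batching branch, `server_args` was reassigned instead of extended, so any server arguments collected earlier in the fixture were silently dropped. A minimal sketch of the two patterns, using made-up flag values rather than the fixture's real ones:

    # Illustrative sketch only; the flags are placeholders, not conftest.py's real arguments.
    server_args = ["--model", "some-model", "--tensor-parallel-size", "2"]

    # Buggy pattern: reassignment replaces the whole list and loses the flags above.
    # server_args = ["--max_num_seqs", "4", "--max-model-len", "256"]

    # Fixed pattern: extend() appends the CB-specific flags to the existing list.
    server_args.extend(["--max_num_seqs", "4", "--max-model-len", "256"])
    assert "--model" in server_args and "--max-model-len" in server_args
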
180 changes: 179 additions & 1 deletion tests/e2e/test_spyre_cb_scheduler_steps.py
@@ -9,7 +9,7 @@
import pytest
from scheduling_utils import check_scheduler_inference_steps
from spyre_util import (check_output_against_hf, get_spyre_backend_list,
-get_spyre_model_list)
+get_spyre_model_list, skip_unsupported_tp_size)


@pytest.mark.cb
@@ -1855,3 +1855,181 @@ def test_requests_use_more_than_available_blocks(

check_output_against_hf(model, backend, seqs_max_tokens, cb_outputs,
prompts)


@pytest.mark.cb
@pytest.mark.parametrize("model", ["ibm-granite/granite-3.3-8b-instruct"])
@pytest.mark.parametrize(
"backend", [pytest.param("sendnn", marks=pytest.mark.spyre, id="sendnn")])
@pytest.mark.parametrize(
"tp_size",
[
pytest.param(4, marks=pytest.mark.multi),
],
ids=lambda val: f"TP({val})",
)
def test_staggered_requests(
model: str,
backend: str,
monkeypatch: pytest.MonkeyPatch,
tp_size: int,
set_random_seed,
):
"""Scenario where some request arrive later than others.

Configuration:
* max_num_seqs: 4
* number of prompts: 12
* 1: len = 10, max tokens = 15, step joining = 0
* 2: len = 10, max tokens = 10, step joining = 0
* 3: len = 10, max tokens = 5, step joining = 0
* 4: len = 10, max tokens = 20, step joining = 0
* 5: len = 10, max tokens = 15, step joining = 10
* 6: len = 10, max tokens = 10, step joining = 10
* 7: len = 10, max tokens = 5, step joining = 10
* 8: len = 10, max tokens = 20, step joining = 10
* 9: len = 10, max tokens = 15, step joining = 20
* 10: len = 10, max tokens = 10, step joining = 20
* 11: len = 10, max tokens = 5, step joining = 20
* 12: len = 10, max tokens = 20, step joining = 20
"""

skip_unsupported_tp_size(int(tp_size), backend)

seqs_max_tokens = [15, 10, 5, 20] * 3
prompts_lengths = [10] * 12
steps_add_reqs = [0, 0, 0, 0, 10, 10, 10, 10, 20, 20, 20, 20]
available_blocks = 1000
max_num_seqs = 4
max_model_len = 256

checked_steps = [
{
"step": 0,
"tkv": 0,
"waiting": ["0", "1", "2", "3"],
"running": [],
"request_outputs": [],
"n_reserved_blocks": 0,
"n_used_blocks": 0,
},
{
"step": 1,
"tkv": 64,
"waiting": ["1", "2", "3"],
"running": ["0"],
"request_outputs": ["0"],
"n_reserved_blocks": 2,
"n_used_blocks": 1,
},
{
"step": 2,
"tkv": 64,
"waiting": ["2", "3"],
"running": ["1", "0"],
"request_outputs": ["1"],
"n_reserved_blocks": 4,
"n_used_blocks": 2,
},
{
"step": 3,
"tkv": 64,
"waiting": ["3"],
"running": ["2", "1", "0"],
"request_outputs": ["2"],
"n_reserved_blocks": 6,
"n_used_blocks": 3,
},
{
"step": 4,
"tkv": 64,
"waiting": [],
"running": ["3", "2", "1", "0"],
"request_outputs": ["3"],
"n_reserved_blocks": 8,
"n_used_blocks": 4,
},
{
"step": 5,
"tkv": 65,
"waiting": [],
"running": ["3", "2", "1", "0"],
"request_outputs": ["3", "2", "1", "0"],
"n_reserved_blocks": 8,
"n_used_blocks": 8,
},
{
"step": 6,
"tkv": 66,
"waiting": [],
"running": ["3", "2", "1", "0"],
"request_outputs": ["3", "2", "1", "0"],
"n_reserved_blocks": 8,
"n_used_blocks": 8,
},
{
"step": 8,
"tkv": 68,
"waiting": [],
"running": ["3", "1", "0"],
"request_outputs": ["3", "2", "1", "0"],
"n_reserved_blocks": 8,
"finished_requests": ["2"],
"n_used_blocks": 8,
},
{
"step": 9,
"tkv": 69,
"waiting": [],
"running": ["3", "1", "0"],
"request_outputs": ["3", "1", "0"],
"n_reserved_blocks": 6,
"n_used_blocks": 6,
},
{
"step": 10,
"tkv": 70,
"waiting": ["4", "5", "6", "7"],
"running": ["3", "1", "0"],
"request_outputs": ["3", "1", "0"],
"n_reserved_blocks": 6,
"n_used_blocks": 6,
},
{
"step": 11,
"tkv": 70,
"waiting": ["5", "6", "7"],
"running": ["4", "3", "1", "0"],
"request_outputs": ["4"],
"n_reserved_blocks": 8,
"n_used_blocks": 8,
},
{
"step": 100,
"tkv": 66,
"waiting": ["2", "3"],
"running": ["1", "0"],
"request_outputs": ["1", "0"],
"n_reserved_blocks": 4,
"n_used_blocks": 4,
},
]
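# Hedged reading of the expectations above, inferred from this test's numbers
# rather than from documented scheduler internals: tkv starts at 64 because the
# length-10 prompts appear to be padded to a full 64-token block, so each
# admitted request reserves ceil((64 + max_tokens) / 64) = 2 blocks and
# n_reserved_blocks grows by 2 per running request (8 once all four run).
# n_used_blocks jumps from 4 to 8 at step 5, when tkv passes 64 and every
# sequence starts writing into its second block.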

cb_outputs, prompts = check_scheduler_inference_steps(
model=model,
backend=backend,
monkeypatch=monkeypatch,
seqs_max_tokens=seqs_max_tokens,
prompts_lengths=prompts_lengths,
steps_add_reqs=steps_add_reqs,
checked_steps=checked_steps,
max_num_seqs=max_num_seqs,
max_model_len=max_model_len,
available_blocks=available_blocks,
use_cb=True,
tensor_parallel_size=tp_size,
)

check_output_against_hf(model, backend, seqs_max_tokens, cb_outputs,
prompts)
77 changes: 61 additions & 16 deletions tests/e2e/test_spyre_online.py
@@ -1,8 +1,64 @@
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import openai
import pytest
from spyre_util import get_spyre_backend_list, get_spyre_model_list


def _check_result(client, model, max_tokens=8, temperature=0.0, n=1) -> None:
Review comment (Collaborator): 🌶️ !

completion = client.completions.create(
model=model,
prompt="Hello World!",
max_tokens=max_tokens,
temperature=temperature,
n=n,
)
assert len(completion.choices) == n
assert len(completion.choices[0].text) > 0


@pytest.mark.parametrize("model", get_spyre_model_list())
@pytest.mark.parametrize("backend", get_spyre_backend_list())
@pytest.mark.parametrize("cb",
[pytest.param(1, marks=pytest.mark.cb, id="cb")])
@pytest.mark.parametrize("max_num_seqs", [4],
ids=lambda val: f"max_num_seqs({val})")
@pytest.mark.parametrize("max_model_len", [256],
ids=lambda val: f"max_model_len({val})")
@pytest.mark.parametrize("max_tokens", [32, 64],
ids=lambda val: f"max_tokens({val})")
def test_online_output(
remote_openai_server,
model,
backend,
cb,
max_num_seqs,
max_model_len,
max_tokens,
):
"""Test online serving using batches of requests at different times
using the `vllm serve` CLI"""

client = remote_openai_server.get_client()

total_requests = 20
batch_size = 5
delay = 1
futures = []
with ThreadPoolExecutor(max_workers=20) as executor:
for batch_start in range(1, total_requests + 1, batch_size):
batch_end = min(batch_start + batch_size, total_requests + 1)
for _ in range(batch_start, batch_end):
futures.append(
executor.submit(_check_result, client, model, max_tokens))
time.sleep(delay) # Wait before launching the next batch
# Wait for all requests to finish
for future in as_completed(futures):
future.result()


@pytest.mark.parametrize("model", get_spyre_model_list())
@pytest.mark.parametrize(
"tp_size",
@@ -40,20 +96,9 @@ def test_openai_serving(
"""Test online serving using the `vllm serve` CLI"""

client = remote_openai_server.get_client()
-completion = client.completions.create(model=model,
-prompt="Hello World!",
-max_tokens=5,
-temperature=0.0)
-assert len(completion.choices) == 1
-assert len(completion.choices[0].text) > 0
-
-completion = client.completions.create(model=model,
-prompt="Hello World!",
-max_tokens=5,
-temperature=1.0,
-n=2)
-assert len(completion.choices) == 2
-assert len(completion.choices[0].text) > 0
+_check_result(client, model, n=1)
+_check_result(client, model, temperature=1.0, n=2)

# the rest are SB (static batching) tests
if cb:
@@ -73,8 +118,8 @@ def test_openai_serving(
# Short prompt under context length but requesting too many tokens for
# the warmup shape should return an empty result
try:
-completion = client.completions.create(model=model,
-prompt="Hello World!",
-max_tokens=25)
+client.completions.create(model=model,
+prompt="Hello World!",
+max_tokens=25)
except openai.BadRequestError as e:
assert "warmup" in str(e)