diff --git a/tests/e2e/test_spyre_cb_scheduler_steps.py b/tests/e2e/test_spyre_cb_scheduler_steps.py index f7ae22f9c..30cae6c59 100644 --- a/tests/e2e/test_spyre_cb_scheduler_steps.py +++ b/tests/e2e/test_spyre_cb_scheduler_steps.py @@ -33,6 +33,7 @@ def test_prompts_aligned_with_tkv_boundaries(model: str, backend: str, steps_add_reqs = [0, 0, 0] # add all requests in the beginning available_blocks = -1 # no restriction max_num_seqs = 2 + max_model_len = 256 checked_steps = [ { @@ -170,6 +171,7 @@ def test_prompts_aligned_with_tkv_boundaries(model: str, backend: str, steps_add_reqs=steps_add_reqs, checked_steps=checked_steps, max_num_seqs=max_num_seqs, + max_model_len=max_model_len, available_blocks=available_blocks, use_cb=True, ) @@ -197,6 +199,7 @@ def test_prompts_misaligned_with_tkv_boundaries( steps_add_reqs = [0, 0, 0] # add all requests in the beginning available_blocks = -1 # no restriction max_num_seqs = 2 + max_model_len = 256 checked_steps = [ { @@ -332,6 +335,7 @@ def test_prompts_misaligned_with_tkv_boundaries( steps_add_reqs=steps_add_reqs, checked_steps=checked_steps, max_num_seqs=max_num_seqs, + max_model_len=max_model_len, available_blocks=available_blocks, use_cb=True, ) @@ -358,6 +362,7 @@ def test_two_sequences_finish_same_time_as_new_arrive( steps_add_reqs = [0, 0, 31] available_blocks = -1 # no restriction max_num_seqs = 2 + max_model_len = 256 checked_steps = [ { @@ -470,6 +475,270 @@ def test_two_sequences_finish_same_time_as_new_arrive( steps_add_reqs=steps_add_reqs, checked_steps=checked_steps, max_num_seqs=max_num_seqs, + max_model_len=max_model_len, + available_blocks=available_blocks, + use_cb=True, + ) + + +@pytest.mark.cb +@pytest.mark.parametrize("model", get_spyre_model_list()) +@pytest.mark.parametrize("backend", get_spyre_backend_list()) +def test_new_sequence_joins_during_decode(model: str, backend: str, + monkeypatch: pytest.MonkeyPatch): + """ Scenario where a new sequence joins while decoding other sequences + + 
Configuration: + * max_num_seqs: 4 + * number of prompts: 4 + * 1: len = 49, max tokens = 119, step joining = 0 + * 2: len = 14, max tokens = 52, step joining = 0 + * 3: len = 89, max tokens = 104, step joining = 32 + * 4: len = 9, max tokens = 64, step joining = 131 + """ + # TODO change to 65 max_tokens for last prompt if ever possible + + seqs_max_tokens = [119, 52, 104, 64] + prompts_lengths = [49, 14, 89, 9] + steps_add_reqs = [0, 0, 32, 131] + available_blocks = -1 # no restriction + max_num_seqs = 4 + max_model_len = 256 + + checked_steps = [ + { + "step": 0, + "tkv": 0, + "waiting": ["0", "1"], + "running": [], + "request_outputs": [], + "n_reserved_blocks": 0, + "n_used_blocks": 0 + }, + { + # Prefill sequence 0 + "step": 1, + "tkv": 64, + "waiting": ["1"], + "running": ["0"], + "request_outputs": ["0"], + "n_reserved_blocks": 3, # prefill (1 block) + 118 decodes (2 blocks) + "n_used_blocks": 1 + }, + { + # Prefill sequence 1 + "step": 2, + "tkv": 64, + "waiting": [], + "running": ["1", "0"], + "request_outputs": ["1"], + "n_reserved_blocks": 5, # prefill (1 block) + 51 decodes (1 block) + "n_used_blocks": 2 + }, + { + # Decode sequences 0 and 1 + "step": 3, + "tkv": 65, + "waiting": [], + "running": ["1", "0"], + "request_outputs": ["1", "0"], + "n_reserved_blocks": 5, + "n_used_blocks": 4 # 2 blocks extended, one for each sequence + }, + { + # Sequence 2 joins: one iteration in waiting queue + "step": 32, + "tkv": 94, + "waiting": ["2"], + "running": ["1", "0"], + "request_outputs": ["1", "0"], + "n_reserved_blocks": 5, + "n_used_blocks": 4 + }, + { + # Prefill sequence 2 + "step": 33, + "tkv": 94, + "waiting": [], + "running": ["2", "1", "0"], + "request_outputs": ["2"], + "n_reserved_blocks": 9, # prefill (2 block) + 103 decode (2 block) + "n_used_blocks": 6 + }, + { + # Decode sequences 0, 1, and 2 + "step": 34, + "tkv": 95, + "waiting": [], + "running": ["2", "1", "0"], + "request_outputs": ["2", "1", "0"], + "n_reserved_blocks": 9, + "n_used_blocks": 
6 + }, + { + # Sequence 1 finishes at step 54 + # (start step + 2 prefills + 51 decodes - 1) = 2 + 2 + 51 - 1 = 54 + "step": 54, + "tkv": 115, + "waiting": [], + "running": ["2", "0"], + "request_outputs": ["2", "1", "0"], + "finished_requests": ["1"], + "n_reserved_blocks": 9, + "n_used_blocks": 6 + }, + { + # Decode sequences 0 and 2 + "step": 55, + "tkv": 116, + "waiting": [], + "running": ["2", "0"], + "request_outputs": ["2", "0"], + "n_reserved_blocks": 7, # two blocks released + "n_used_blocks": 4 # two blocks released + }, + { + # Decode sequences 0 and 2, tkv arrives to new block + "step": 68, + "tkv": 129, + "waiting": [], + "running": ["2", "0"], + "request_outputs": ["2", "0"], + "n_reserved_blocks": 7, + "n_used_blocks": 6 # 2 blocks extended, one for each sequence + }, + { + # Sequence 0 finishes at step 121 + # (start step + 3 prefills + 118 decode - 1) = 1 + 3 + 118 - 1 = 121 + "step": 121, + "tkv": 182, + "waiting": [], + "running": ["2"], + "request_outputs": ["2", "0"], + "finished_requests": ["0"], + "n_reserved_blocks": 7, + "n_used_blocks": 6 + }, + { + # Decode sequence 2 + "step": 122, + "tkv": 183, + "waiting": [], + "running": ["2"], + "request_outputs": ["2"], + "n_reserved_blocks": 4, # 3 blocks released + "n_used_blocks": 3 # 3 blocks released + }, + { + # Sequence 3 joins: one iteration in waiting queue + "step": 131, + "tkv": 192, + "waiting": ["3"], + "running": ["2"], + "request_outputs": ["2"], + "n_reserved_blocks": 4, + "n_used_blocks": 3 + }, + { + # Prefill sequence 3 + "step": 132, + "tkv": 192, + "waiting": [], + "running": ["3", "2"], + "request_outputs": ["3"], + "n_reserved_blocks": 8, # prefill (3 blocks) + 63 decode (1 block) + "n_used_blocks": 6 # prefill (3 block) + }, + { + # Decode sequences 2 and 3 + "step": 133, + "tkv": 193, + "waiting": [], + "running": ["3", "2"], + "request_outputs": ["3", "2"], + "n_reserved_blocks": 8, + "n_used_blocks": 8 # 2 blocks extended, one for each sequence + }, + { + # Sequence 2 
finishes at step 137 + # (start step + 2 prefills + 103 decodes - 1) = 33 + 2 + 103 - 1 = 137 + "step": 137, + "tkv": 197, + "waiting": [], + "running": ["3"], + "request_outputs": ["3", "2"], + "finished_requests": ["2"], + "n_reserved_blocks": 8, + "n_used_blocks": 8 + }, + { + # Decode sequence 3 + "step": 138, + "tkv": 70, + "waiting": [], + "running": ["3"], + "request_outputs": ["3"], + # 6 blocks freed: finished sequence (4) + left padding stripping (2) + "n_reserved_blocks": 2, + "n_used_blocks": 2 + }, + { + # Sequence 3 finishes at step 195 + # (start step + 1 prefill + 63 decodes - 1) = 132 + 1 + 63 - 1 = 195 + "step": 195, + "tkv": 127, + "waiting": [], + "running": [], + "request_outputs": ["3"], + "finished_requests": ["3"], + "n_reserved_blocks": 2, + "n_used_blocks": 2 + }, + { + # Tkv should be cleared one step later + "step": 196, + "tkv": 0, + "waiting": [], + "running": [], + "request_outputs": [], + "n_reserved_blocks": 0, + "n_used_blocks": 0 + }, + # TODO this is when max_tokens = 65 for last prompt + # { + # # Sequence 3 finishes at step 196 + # # (start step + 1 prefill + 64 decodes - 1) = 132 + 1 + 64 - 1 = 196 + # "step": 196, + # "tkv": 128, + # "waiting": [], + # "running": [], + # "request_outputs": ["3"], + # "finished_requests": ["3"], + # "n_reserved_blocks": 2, + # "n_used_blocks": 2 + # }, + # { + # # Tkv should be cleared one step later + # "step": 197, + # "tkv": 0, + # "waiting": [], + # "running": [], + # "request_outputs": [], + # "n_reserved_blocks": 0, + # "n_used_blocks": 0 + # }, + ] + + check_scheduler_inference_steps( + model=model, + backend=backend, + monkeypatch=monkeypatch, + seqs_max_tokens=seqs_max_tokens, + prompts_lengths=prompts_lengths, + steps_add_reqs=steps_add_reqs, + checked_steps=checked_steps, + max_num_seqs=max_num_seqs, + max_model_len=max_model_len, available_blocks=available_blocks, use_cb=True, ) @@ -494,6 +763,7 @@ def test_prompt_too_long_for_current_tkv(model: str, backend: str, steps_add_reqs = [0, 0] 
available_blocks = -1 # no restriction max_num_seqs = 2 + max_model_len = 256 checked_steps = [ { @@ -617,6 +887,7 @@ def test_prompt_too_long_for_current_tkv(model: str, backend: str, steps_add_reqs=steps_add_reqs, checked_steps=checked_steps, max_num_seqs=max_num_seqs, + max_model_len=max_model_len, available_blocks=available_blocks, use_cb=True, ) @@ -642,6 +913,7 @@ def test_requested_tokens_not_fitting_remaining_space( steps_add_reqs = [0, 0, 0] available_blocks = -1 # no restriction max_num_seqs = 2 + max_model_len = 256 checked_steps = [ { @@ -802,6 +1074,7 @@ def test_requested_tokens_not_fitting_remaining_space( steps_add_reqs=steps_add_reqs, checked_steps=checked_steps, max_num_seqs=max_num_seqs, + max_model_len=max_model_len, available_blocks=available_blocks, use_cb=True, ) @@ -830,6 +1103,8 @@ def test_requests_use_all_available_blocks(model: str, backend: str, # total number of blocks needed if scheduled together : 4 * (1 + 1) = 8 available_blocks = 8 max_num_seqs = 4 + max_model_len = 256 + checked_steps = [ { "step": 0, @@ -933,6 +1208,7 @@ def test_requests_use_all_available_blocks(model: str, backend: str, steps_add_reqs=steps_add_reqs, checked_steps=checked_steps, max_num_seqs=max_num_seqs, + max_model_len=max_model_len, available_blocks=available_blocks, use_cb=True, ) @@ -962,6 +1238,8 @@ def test_requests_use_more_than_available_blocks( # total number of blocks needed if scheduled together : 4 * (1 + 1) = 8 available_blocks = 4 max_num_seqs = 4 + max_model_len = 256 + checked_steps = [ { "step": 0, @@ -1090,6 +1368,7 @@ def test_requests_use_more_than_available_blocks( steps_add_reqs=steps_add_reqs, checked_steps=checked_steps, max_num_seqs=max_num_seqs, + max_model_len=max_model_len, available_blocks=available_blocks, use_cb=True, ) diff --git a/tests/scheduling_utils.py b/tests/scheduling_utils.py index 0bf00fad6..6a49c7a44 100644 --- a/tests/scheduling_utils.py +++ b/tests/scheduling_utils.py @@ -41,6 +41,7 @@ def 
check_scheduler_inference_steps( steps_add_reqs: list[int], checked_steps: list[dict[str, Any]], max_num_seqs: int, + max_model_len: int, available_blocks: int, use_cb: bool = True, ):