Skip to content

Commit 11562e9

Browse files
authored
[CB] remove VLLM_SPYRE_RM_PADDED_BLOCKS, enable the feature by default (#231)
### [CB] remove VLLM_SPYRE_RM_PADDED_BLOCKS, enable the feature by default
This PR removes the `VLLM_SPYRE_RM_PADDED_BLOCKS` environment variable and enables the removal of padded blocks by default. The feature works on AIU Spyre, so there is no longer any need to turn it off. --------- Signed-off-by: Yannick Schnider <[email protected]>
1 parent e7ae34a commit 11562e9

File tree

3 files changed

+9
-144
lines changed

3 files changed

+9
-144
lines changed

tests/e2e/test_spyre_cb.py

Lines changed: 8 additions & 134 deletions
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,6 @@ def get_params_test_blocks_borders_aligned_prompts():
136136
prompts_lengths = [49, 41, 47]
137137
steps_add_reqs = [0, 0, 0] # add all requests in the beginning
138138
max_model_len = 2048
139-
remove_left_padding = False
140139

141140
checked_steps = [
142141
{
@@ -203,7 +202,7 @@ def get_params_test_blocks_borders_aligned_prompts():
203202
},
204203
{
205204
"step": 70, # Decode sequence 2
206-
"tkv": 131,
205+
"tkv": 67, # tkv is reduced by 64 (one block) because the padded block is removed
207206
"waiting": [],
208207
"running": ["2"],
209208
"request_outputs": ["2"]
@@ -212,7 +211,7 @@ def get_params_test_blocks_borders_aligned_prompts():
212211
# Sequence 2 finishes at step 73
213212
# (start step + 1 prefill + 6 decodes - 1) = 67 + 1 + 6 - 1 = 73
214213
"step": 73,
215-
"tkv": 134,
214+
"tkv": 70,
216215
"waiting": [],
217216
"running": [],
218217
"request_outputs": ["2"],
@@ -229,7 +228,7 @@ def get_params_test_blocks_borders_aligned_prompts():
229228
]
230229

231230
return (seqs_max_tokens, prompts_lengths, steps_add_reqs, checked_steps,
232-
max_model_len, remove_left_padding)
231+
max_model_len)
233232

234233

235234
def get_params_test_blocks_borders_misaligned_prompts():
@@ -241,7 +240,6 @@ def get_params_test_blocks_borders_misaligned_prompts():
241240
prompts_lengths = [49, 41, 47]
242241
steps_add_reqs = [0, 0, 0] # add all requests in the beginning
243242
max_model_len = 2048
244-
remove_left_padding = False
245243

246244
checked_steps = [
247245
{
@@ -334,7 +332,7 @@ def get_params_test_blocks_borders_misaligned_prompts():
334332
]
335333

336334
return (seqs_max_tokens, prompts_lengths, steps_add_reqs, checked_steps,
337-
max_model_len, remove_left_padding)
335+
max_model_len)
338336

339337

340338
def get_params_test_special_finish():
@@ -345,7 +343,6 @@ def get_params_test_special_finish():
345343
prompts_lengths = [49, 30, 20]
346344
steps_add_reqs = [0, 0, 31]
347345
max_model_len = 2048
348-
remove_left_padding = False
349346

350347
checked_steps = [
351348
{
@@ -426,7 +423,7 @@ def get_params_test_special_finish():
426423
]
427424

428425
return (seqs_max_tokens, prompts_lengths, steps_add_reqs, checked_steps,
429-
max_model_len, remove_left_padding)
426+
max_model_len)
430427

431428

432429
def get_params_test_scheduler_constraints_tkv():
@@ -436,7 +433,6 @@ def get_params_test_scheduler_constraints_tkv():
436433
prompts_lengths = [49, 70]
437434
steps_add_reqs = [0, 0]
438435
max_model_len = 2048
439-
remove_left_padding = False
440436

441437
checked_steps = [
442438
{
@@ -518,7 +514,7 @@ def get_params_test_scheduler_constraints_tkv():
518514
]
519515

520516
return (seqs_max_tokens, prompts_lengths, steps_add_reqs, checked_steps,
521-
max_model_len, remove_left_padding)
517+
max_model_len)
522518

523519

524520
def get_params_test_scheduler_constraints_max_prompt_len():
@@ -528,7 +524,6 @@ def get_params_test_scheduler_constraints_max_prompt_len():
528524
prompts_lengths = [70, 49, 41]
529525
steps_add_reqs = [0, 0, 0]
530526
max_model_len = 256
531-
remove_left_padding = False
532527

533528
checked_steps = [
534529
{
@@ -628,124 +623,7 @@ def get_params_test_scheduler_constraints_max_prompt_len():
628623
]
629624

630625
return (seqs_max_tokens, prompts_lengths, steps_add_reqs, checked_steps,
631-
max_model_len, remove_left_padding)
632-
633-
634-
def get_params_test_remove_left_padding():
635-
"""" Test the stripping of repeated left padding in continuous batching """
636-
637-
seqs_max_tokens = [40, 20, 11]
638-
prompts_lengths = [20, 14, 5]
639-
steps_add_reqs = [0, 30, 31]
640-
max_model_len = 2048
641-
remove_left_padding = True
642-
643-
checked_steps = [
644-
{
645-
"step": 0,
646-
"tkv": 0,
647-
"waiting": ["0"],
648-
"running": [],
649-
"request_outputs": []
650-
},
651-
{
652-
# Prefill sequence 0
653-
"step": 1,
654-
"tkv": 64,
655-
"waiting": [],
656-
"running": ["0"],
657-
"request_outputs": ["0"]
658-
},
659-
{
660-
# Decode sequence 0
661-
"step": 2,
662-
"tkv": 65,
663-
"waiting": [],
664-
"running": ["0"],
665-
"request_outputs": ["0"]
666-
},
667-
{
668-
# Decode sequence 0, sequence 1 enters
669-
"step": 30,
670-
"tkv": 93,
671-
"waiting": ["1"],
672-
"running": ["0"],
673-
"request_outputs": ["0"]
674-
},
675-
{
676-
# Prefill sequence 1, sequence 2 enters
677-
"step": 31,
678-
"tkv": 93,
679-
"waiting": ["2"],
680-
"running": ["1", "0"],
681-
"request_outputs": ["1"]
682-
},
683-
{
684-
# Decode sequences 0 and 1
685-
"step": 32,
686-
"tkv": 94,
687-
"waiting": ["2"],
688-
"running": ["1", "0"],
689-
"request_outputs": ["1", "0"]
690-
},
691-
{
692-
# Sequence 0 finishes at step 41
693-
# (start step + 2 prefills + 39 decodes - 1) = 1 + 2 + 39 - 1 = 41
694-
"step": 41,
695-
"tkv": 103,
696-
"waiting": ["2"],
697-
"running": ["1"],
698-
"request_outputs": ["1", "0"],
699-
"finished_requests": ["0"]
700-
},
701-
{
702-
# Prefill sequence 2
703-
"step": 42,
704-
"tkv": 39, # left padding reduction: 103 - 64 (block size)
705-
"waiting": [],
706-
"running": ["2", "1"],
707-
"request_outputs": ["2"]
708-
},
709-
{
710-
# Decode sequences 1 and 2
711-
"step": 43,
712-
"tkv": 40,
713-
"waiting": [],
714-
"running": ["2", "1"],
715-
"request_outputs": ["2", "1"]
716-
},
717-
{
718-
# Sequences 1 finishes at step 51
719-
# (start step + 2 prefill + 19 decodes - 1) = 31 + 2 + 19 - 1 = 51
720-
"step": 51,
721-
"tkv": 48,
722-
"waiting": [],
723-
"running": ["2"],
724-
"request_outputs": ["2", "1"],
725-
"finished_requests": ["1"]
726-
},
727-
{
728-
# Sequences 2 finishes at step 52
729-
# (start step + 1 prefill + 10 decodes - 1) = 42 + 1 + 10 - 1 = 52
730-
"step": 52,
731-
"tkv": 49,
732-
"waiting": [],
733-
"running": [],
734-
"request_outputs": ["2"],
735-
"finished_requests": ["2"]
736-
},
737-
{
738-
# Tkv should be cleared one step later
739-
"step": 53,
740-
"tkv": 0,
741-
"waiting": [],
742-
"running": [],
743-
"request_outputs": [],
744-
},
745-
]
746-
747-
return (seqs_max_tokens, prompts_lengths, steps_add_reqs, checked_steps,
748-
max_model_len, remove_left_padding)
626+
max_model_len)
749627

750628

751629
def augment_checked_steps(
@@ -775,13 +653,12 @@ def augment_checked_steps(
775653
@pytest.mark.parametrize("max_num_seqs", [2])
776654
@pytest.mark.parametrize(
777655
"seqs_max_tokens,prompts_lengths,steps_add_reqs,checked_steps,"
778-
"max_model_len,remove_left_padding", [
656+
"max_model_len", [
779657
get_params_test_blocks_borders_aligned_prompts(),
780658
get_params_test_blocks_borders_misaligned_prompts(),
781659
get_params_test_special_finish(),
782660
get_params_test_scheduler_constraints_tkv(),
783661
get_params_test_scheduler_constraints_max_prompt_len(),
784-
get_params_test_remove_left_padding(),
785662
])
786663
def test_scheduler_cb_steps_tkv(
787664
model: str,
@@ -793,7 +670,6 @@ def test_scheduler_cb_steps_tkv(
793670
steps_add_reqs: list[int],
794671
checked_steps: list[dict[str, Any]],
795672
max_model_len: int,
796-
remove_left_padding: bool,
797673
):
798674
"""
799675
Test the scheduler execution by comparing the scheduler attributes at each
@@ -809,8 +685,6 @@ def test_scheduler_cb_steps_tkv(
809685
monkeypatch.setenv("VLLM_SPYRE_USE_CB", "1")
810686
monkeypatch.setenv("VLLM_USE_V1", "1")
811687
monkeypatch.setenv("VLLM_SPYRE_DYNAMO_BACKEND", backend)
812-
monkeypatch.setenv("VLLM_SPYRE_RM_PADDED_BLOCKS",
813-
"1" if remove_left_padding else "0")
814688

815689
# To get deterministic execution in V1
816690
# and to enable InprocClient

vllm_spyre/envs.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
VLLM_SPYRE_WARMUP_NEW_TOKENS: Optional[list[int]] = None
1010
VLLM_SPYRE_WARMUP_BATCH_SIZES: Optional[list[int]] = None
1111
VLLM_SPYRE_USE_CB: bool = False
12-
VLLM_SPYRE_RM_PADDED_BLOCKS: bool = False
1312
VLLM_SPYRE_PERF_METRIC_LOGGING_ENABLED: int = 0
1413
VLLM_SPYRE_PERF_METRIC_LOGGING_DIR: str = "/tmp"
1514
VLLM_SPYRE_OVERRIDE_SIGNALS_HANDLER: bool = False
@@ -69,11 +68,6 @@ def _backend_backwards_compat() -> str:
6968
"VLLM_SPYRE_USE_CB":
7069
lambda: bool(int(os.getenv("VLLM_SPYRE_USE_CB", "0"))),
7170

72-
# If set, remove redundant (left) padded blocks. Only applicable in
73-
# continuous batching mode.
74-
"VLLM_SPYRE_RM_PADDED_BLOCKS":
75-
lambda: bool(int(os.getenv("VLLM_SPYRE_RM_PADDED_BLOCKS", "0"))),
76-
7771
# Enable performance metric logging. This captures startup information
7872
# such as warmup times, and loading times. It is turned off by default.
7973
"VLLM_SPYRE_PERF_METRIC_LOGGING_ENABLED":

vllm_spyre/v1/worker/spyre_model_runner.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,6 @@
2828

2929
from vllm.v1.outputs import ModelRunnerOutput
3030

31-
import vllm_spyre.envs as envs_spyre
32-
3331
logger = init_logger(__name__)
3432

3533

@@ -906,8 +904,7 @@ def prepare_model_input(
906904
self, scheduler_output: SchedulerOutput) -> ModelForwardInputs:
907905

908906
# remove left padding if applicable before next prefill/decode step
909-
if envs_spyre.VLLM_SPYRE_RM_PADDED_BLOCKS:
910-
self.reduce_left_padding()
907+
self.reduce_left_padding()
911908

912909
# NOTE: We assume that all sequences in the group are all prompts or
913910
# all decodes.

0 commit comments

Comments
 (0)