[CB] remove VLLM_SPYRE_RM_PADDED_BLOCKS, enable the feature by default (#231)

yannicks1 · web-flow · commit 11562e92d782 · 2025-06-13T12:52:01.000+02:00
### [CB] remove VLLM_SPYRE_RM_PADDED_BLOCKS, enable the feature by default

This PR removes the `VLLM_SPYRE_RM_PADDED_BLOCKS` environment variable and
enables the removal of padded blocks by default. The feature now works on
AIU Spyre, so there is no longer any need to turn it off.

---------

Signed-off-by: Yannick Schnider <yannick.schnider1@ibm.com>
diff --git a/tests/e2e/test_spyre_cb.py b/tests/e2e/test_spyre_cb.py
@@ -136,7 +136,6 @@ def get_params_test_blocks_borders_aligned_prompts():
     prompts_lengths = [49, 41, 47]
     steps_add_reqs = [0, 0, 0]  # add all requests in the beginning
     max_model_len = 2048
-    remove_left_padding = False
 
     checked_steps = [
         {
@@ -203,7 +202,7 @@ def get_params_test_blocks_borders_aligned_prompts():
         },
         {
             "step": 70,  # Decode sequence 2
-            "tkv": 131,
+            "tkv": 67,  # tkv is reset by 64 due to removing the padded block
             "waiting": [],
             "running": ["2"],
             "request_outputs": ["2"]
@@ -212,7 +211,7 @@ def get_params_test_blocks_borders_aligned_prompts():
             # Sequence 2 finishes at step 73
             # (start step + 1 prefill + 6 decodes - 1) = 67 + 1 + 6 - 1 = 73
             "step": 73,
-            "tkv": 134,
+            "tkv": 70,
             "waiting": [],
             "running": [],
             "request_outputs": ["2"],
@@ -229,7 +228,7 @@ def get_params_test_blocks_borders_aligned_prompts():
     ]
 
     return (seqs_max_tokens, prompts_lengths, steps_add_reqs, checked_steps,
-            max_model_len, remove_left_padding)
+            max_model_len)
 
 
 def get_params_test_blocks_borders_misaligned_prompts():
@@ -241,7 +240,6 @@ def get_params_test_blocks_borders_misaligned_prompts():
     prompts_lengths = [49, 41, 47]
     steps_add_reqs = [0, 0, 0]  # add all requests in the beginning
     max_model_len = 2048
-    remove_left_padding = False
 
     checked_steps = [
         {
@@ -334,7 +332,7 @@ def get_params_test_blocks_borders_misaligned_prompts():
     ]
 
     return (seqs_max_tokens, prompts_lengths, steps_add_reqs, checked_steps,
-            max_model_len, remove_left_padding)
+            max_model_len)
 
 
 def get_params_test_special_finish():
@@ -345,7 +343,6 @@ def get_params_test_special_finish():
     prompts_lengths = [49, 30, 20]
     steps_add_reqs = [0, 0, 31]
     max_model_len = 2048
-    remove_left_padding = False
 
     checked_steps = [
         {
@@ -426,7 +423,7 @@ def get_params_test_special_finish():
     ]
 
     return (seqs_max_tokens, prompts_lengths, steps_add_reqs, checked_steps,
-            max_model_len, remove_left_padding)
+            max_model_len)
 
 
 def get_params_test_scheduler_constraints_tkv():
@@ -436,7 +433,6 @@ def get_params_test_scheduler_constraints_tkv():
     prompts_lengths = [49, 70]
     steps_add_reqs = [0, 0]
     max_model_len = 2048
-    remove_left_padding = False
 
     checked_steps = [
         {
@@ -518,7 +514,7 @@ def get_params_test_scheduler_constraints_tkv():
     ]
 
     return (seqs_max_tokens, prompts_lengths, steps_add_reqs, checked_steps,
-            max_model_len, remove_left_padding)
+            max_model_len)
 
 
 def get_params_test_scheduler_constraints_max_prompt_len():
@@ -528,7 +524,6 @@ def get_params_test_scheduler_constraints_max_prompt_len():
     prompts_lengths = [70, 49, 41]
     steps_add_reqs = [0, 0, 0]
     max_model_len = 256
-    remove_left_padding = False
 
     checked_steps = [
         {
@@ -628,124 +623,7 @@ def get_params_test_scheduler_constraints_max_prompt_len():
     ]
 
     return (seqs_max_tokens, prompts_lengths, steps_add_reqs, checked_steps,
-            max_model_len, remove_left_padding)
-
-
-def get_params_test_remove_left_padding():
-    """" Test the stripping of repeated left padding in continuous batching """
-
-    seqs_max_tokens = [40, 20, 11]
-    prompts_lengths = [20, 14, 5]
-    steps_add_reqs = [0, 30, 31]
-    max_model_len = 2048
-    remove_left_padding = True
-
-    checked_steps = [
-        {
-            "step": 0,
-            "tkv": 0,
-            "waiting": ["0"],
-            "running": [],
-            "request_outputs": []
-        },
-        {
-            # Prefill sequence 0
-            "step": 1,
-            "tkv": 64,
-            "waiting": [],
-            "running": ["0"],
-            "request_outputs": ["0"]
-        },
-        {
-            # Decode sequence 0
-            "step": 2,
-            "tkv": 65,
-            "waiting": [],
-            "running": ["0"],
-            "request_outputs": ["0"]
-        },
-        {
-            # Decode sequence 0, sequence 1 enters
-            "step": 30,
-            "tkv": 93,
-            "waiting": ["1"],
-            "running": ["0"],
-            "request_outputs": ["0"]
-        },
-        {
-            # Prefill sequence 1, sequence 2 enters
-            "step": 31,
-            "tkv": 93,
-            "waiting": ["2"],
-            "running": ["1", "0"],
-            "request_outputs": ["1"]
-        },
-        {
-            # Decode sequences 0 and 1
-            "step": 32,
-            "tkv": 94,
-            "waiting": ["2"],
-            "running": ["1", "0"],
-            "request_outputs": ["1", "0"]
-        },
-        {
-            # Sequence 0 finishes at step 41
-            # (start step + 2 prefills + 39 decodes - 1) = 1 + 2 + 39 - 1 = 41
-            "step": 41,
-            "tkv": 103,
-            "waiting": ["2"],
-            "running": ["1"],
-            "request_outputs": ["1", "0"],
-            "finished_requests": ["0"]
-        },
-        {
-            # Prefill sequence 2
-            "step": 42,
-            "tkv": 39,  # left padding reduction: 103 - 64 (block size)
-            "waiting": [],
-            "running": ["2", "1"],
-            "request_outputs": ["2"]
-        },
-        {
-            # Decode sequences 1 and 2
-            "step": 43,
-            "tkv": 40,
-            "waiting": [],
-            "running": ["2", "1"],
-            "request_outputs": ["2", "1"]
-        },
-        {
-            # Sequences 1 finishes at step 51
-            # (start step + 2 prefill + 19 decodes - 1) = 31 + 2 + 19 - 1 = 51
-            "step": 51,
-            "tkv": 48,
-            "waiting": [],
-            "running": ["2"],
-            "request_outputs": ["2", "1"],
-            "finished_requests": ["1"]
-        },
-        {
-            # Sequences 2 finishes at step 52
-            # (start step + 1 prefill + 10 decodes - 1) = 42 + 1 + 10 - 1 = 52
-            "step": 52,
-            "tkv": 49,
-            "waiting": [],
-            "running": [],
-            "request_outputs": ["2"],
-            "finished_requests": ["2"]
-        },
-        {
-            # Tkv should be cleared one step later
-            "step": 53,
-            "tkv": 0,
-            "waiting": [],
-            "running": [],
-            "request_outputs": [],
-        },
-    ]
-
-    return (seqs_max_tokens, prompts_lengths, steps_add_reqs, checked_steps,
-            max_model_len, remove_left_padding)
+            max_model_len)
 
 
 def augment_checked_steps(
@@ -775,13 +653,12 @@ def augment_checked_steps(
 @pytest.mark.parametrize("max_num_seqs", [2])
 @pytest.mark.parametrize(
     "seqs_max_tokens,prompts_lengths,steps_add_reqs,checked_steps,"
-    "max_model_len,remove_left_padding", [
+    "max_model_len", [
         get_params_test_blocks_borders_aligned_prompts(),
         get_params_test_blocks_borders_misaligned_prompts(),
         get_params_test_special_finish(),
         get_params_test_scheduler_constraints_tkv(),
         get_params_test_scheduler_constraints_max_prompt_len(),
-        get_params_test_remove_left_padding(),
     ])
 def test_scheduler_cb_steps_tkv(
     model: str,
@@ -793,7 +670,6 @@ def test_scheduler_cb_steps_tkv(
     steps_add_reqs: list[int],
     checked_steps: list[dict[str, Any]],
     max_model_len: int,
-    remove_left_padding: bool,
 ):
     """
     Test the scheduler execution by comparing the scheduler attributes at each 
@@ -809,8 +685,6 @@ def test_scheduler_cb_steps_tkv(
     monkeypatch.setenv("VLLM_SPYRE_USE_CB", "1")
     monkeypatch.setenv("VLLM_USE_V1", "1")
     monkeypatch.setenv("VLLM_SPYRE_DYNAMO_BACKEND", backend)
-    monkeypatch.setenv("VLLM_SPYRE_RM_PADDED_BLOCKS",
-                       "1" if remove_left_padding else "0")
 
     # To get deterministic execution in V1
     # and to enable InprocClient
diff --git a/vllm_spyre/envs.py b/vllm_spyre/envs.py
@@ -9,7 +9,6 @@
     VLLM_SPYRE_WARMUP_NEW_TOKENS: Optional[list[int]] = None
     VLLM_SPYRE_WARMUP_BATCH_SIZES: Optional[list[int]] = None
     VLLM_SPYRE_USE_CB: bool = False
-    VLLM_SPYRE_RM_PADDED_BLOCKS: bool = False
     VLLM_SPYRE_PERF_METRIC_LOGGING_ENABLED: int = 0
     VLLM_SPYRE_PERF_METRIC_LOGGING_DIR: str = "/tmp"
     VLLM_SPYRE_OVERRIDE_SIGNALS_HANDLER: bool = False
@@ -69,11 +68,6 @@ def _backend_backwards_compat() -> str:
     "VLLM_SPYRE_USE_CB":
     lambda: bool(int(os.getenv("VLLM_SPYRE_USE_CB", "0"))),
 
-    # If set, remove redundant (left) padded blocks. Only applicable in
-    # continuous batching mode.
-    "VLLM_SPYRE_RM_PADDED_BLOCKS":
-    lambda: bool(int(os.getenv("VLLM_SPYRE_RM_PADDED_BLOCKS", "0"))),
-
     # Enable performance metric logging. This captures startup information
     # such as warmup times, and loading times. It is turned off by default.
     "VLLM_SPYRE_PERF_METRIC_LOGGING_ENABLED":
diff --git a/vllm_spyre/v1/worker/spyre_model_runner.py b/vllm_spyre/v1/worker/spyre_model_runner.py
@@ -28,8 +28,6 @@
 
 from vllm.v1.outputs import ModelRunnerOutput
 
-import vllm_spyre.envs as envs_spyre
-
 logger = init_logger(__name__)
 
 
@@ -906,8 +904,7 @@ def prepare_model_input(
             self, scheduler_output: SchedulerOutput) -> ModelForwardInputs:
 
         # remove left padding if applicable before next prefil/decode step
-        if envs_spyre.VLLM_SPYRE_RM_PADDED_BLOCKS:
-            self.reduce_left_padding()
+        self.reduce_left_padding()
 
         # NOTE: We assume that all sequences in the group are all prompts or
         # all decodes.