[CB] hard code number of spyre blocks to 2080 (#362)

yannicks1 · web-flow · commit 443cb83b916d · 2025-08-08T11:05:07.000-06:00
### [CB] hard code number of Spyre blocks to 2080 We have received a number from the compiler team for the [model](https://huggingface.co/ibm-granite/granite-3.3-8b-instruct) of interest and hard code it here. --------- Signed-off-by: Yannick Schnider <yannick.schnider1@ibm.com> Signed-off-by: Yannick Schnider <Yannick.Schnider1@ibm.com>
diff --git a/vllm_spyre/v1/worker/spyre_model_runner.py b/vllm_spyre/v1/worker/spyre_model_runner.py
@@ -840,15 +840,27 @@ def _get_num_blocks_available(self) -> int:
         max_model_len = self.vllm_config.scheduler_config.max_model_len
 
         min_req_num_blocks = max_model_len // self.block_size
-        # min_req_num_blocks is not enough blocks for the following test:
-        # tests/e2e/test_spyre_cb.py::test_scheduler_cb_steps_tkv
-        # [seqs_max_tokens4-prompts_lengths4-steps_add_reqs4-
-        # checked_steps4-256-False-2-eager-llama-194m]
+
+        # TODO: replace the hard coded NUM_BLOCKS_SPYRE by calling a function
+        # in torch_sendnn which returns the value set by the Spyre compiler.
+
+        if ('granite-3.3-8b-instruct' in self.model_config.model
+                and self.parallel_config.world_size == 4):
+            # hard coded value for tensor parallel size 4 with the below model
+            # https://huggingface.co/ibm-granite/granite-3.3-8b-instruct
+            NUM_BLOCKS_SPYRE = 2080
+            logger.info("Model granite-3.3-8b-instruct and tensor parallel " \
+            "size 4 detected. Using NUM_BLOCKS_SPYRE = %d", 2080)
+        else:
+            # default value for any other model/ tensor parallel size
+            NUM_BLOCKS_SPYRE = max_batch_size * min_req_num_blocks
+            logger.info("No model / tensor parallel size specific value for" \
+            "the number of KV cache blocks available on Spyre found. Using " \
+            "default value (max_batch_size * max_model_len / block_size): %d",
+              NUM_BLOCKS_SPYRE)
 
         if envs_spyre.VLLM_SPYRE_DYNAMO_BACKEND == 'sendnn':
-            # TODO: replace num_blocks_spyre by calling a function in
-            # torch_sendnn which returns the value set by the Spyre compiler
-            num_blocks_spyre = max_batch_size * min_req_num_blocks
+            num_blocks_spyre = NUM_BLOCKS_SPYRE
             assert num_blocks_spyre >= min_req_num_blocks, (
                 "Number of pages available on Spyre (%d) is not enough to "
                 "serve the current model (need at least %d pages)." %
@@ -861,7 +873,8 @@ def _get_num_blocks_available(self) -> int:
                         str(max_model_len), max_concurrency_spyre)
             return num_blocks_spyre
         else:  # dynamo backend 'eager'
-            num_blocks_cpu = max_batch_size * min_req_num_blocks
+            # for debugging purposes we also put the spyre value here for cpu
+            num_blocks_cpu = NUM_BLOCKS_SPYRE
             assert num_blocks_cpu >= min_req_num_blocks, (
                 "Number of pages available on CPU (%d) is not enough to "
                 "serve the current model (need at least %d pages)." %