Commit e16c1be

[fix] Add 1 and draft_token_num to seq_len when overlap scheduling is enabled during memory estimation (#5343)
Signed-off-by: Hui Gao <[email protected]>
1 parent 58a8a8f commit e16c1be

File tree: 1 file changed, +6 −0 lines


tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 6 additions & 0 deletions

```diff
@@ -151,7 +151,13 @@ def _get_token_num_for_estimation(self) -> int:
         # estimate_max_kv_cache_tokens submits self._dummy_reqs
         num_cache_blocks = 0
         num_extra_tokens_per_seq = 1  # account for generated tokens
+        pytorch_backend_config = executor_config.pytorch_backend_config
         spec_cfg = executor_config.speculative_config
+        if not pytorch_backend_config.disable_overlap_scheduler:
+            num_extra_tokens_per_seq = num_extra_tokens_per_seq + 1
+            if spec_cfg is not None:
+                num_extra_tokens_per_seq += spec_cfg.max_draft_tokens
+
         if spec_cfg is not None:
             num_extra_tokens_per_seq += spec_cfg.max_draft_tokens
             num_extra_tokens_per_seq += spec_cfg.num_extra_kv_tokens
```
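To make the fix's arithmetic concrete, here is a minimal standalone sketch of the per-sequence extra-token accounting the commit introduces. It mirrors the names in the diff, but the function signature and the plain arguments standing in for `pytorch_backend_config` and `spec_cfg` are illustrative stand-ins, not the real TensorRT-LLM API.

```python
from typing import Optional


def extra_tokens_per_seq(disable_overlap_scheduler: bool,
                         max_draft_tokens: Optional[int] = None,
                         num_extra_kv_tokens: int = 0) -> int:
    """Sketch of the per-sequence token padding used during KV-cache
    memory estimation (names follow the diff; config objects are
    replaced by plain arguments for illustration)."""
    extra = 1  # account for generated tokens
    # With overlap scheduling enabled, one additional decoding step can
    # be in flight, so reserve one more token per sequence, plus the
    # draft tokens when speculative decoding is configured.
    if not disable_overlap_scheduler:
        extra += 1
        if max_draft_tokens is not None:
            extra += max_draft_tokens
    # Baseline speculative-decoding accounting (pre-existing code path).
    if max_draft_tokens is not None:
        extra += max_draft_tokens
        extra += num_extra_kv_tokens
    return extra


# No overlap, no speculation: just the one generated token.
print(extra_tokens_per_seq(disable_overlap_scheduler=True))        # 1
# Overlap enabled, no speculation: the commit's extra +1.
print(extra_tokens_per_seq(disable_overlap_scheduler=False))       # 2
# Overlap + speculation: draft tokens are counted twice (once for the
# in-flight overlap step), plus the extra KV tokens.
print(extra_tokens_per_seq(False, max_draft_tokens=3,
                           num_extra_kv_tokens=2))                  # 10
```

The double-counting of `max_draft_tokens` under overlap scheduling is the point of the fix: without it, the estimator under-reserved KV-cache tokens for the step that runs concurrently with the scheduler.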
