Commit 7d9d7af
Added workspace check and reflected this in test (#1991)
## 📌 Description

This PR attempts to fix #1986 (to be confirmed by the requester).

The issue is that `num_tokens` was larger than `MAX_TOKEN_NUM`, which results in an illegal memory access (IMA) or even a hang. To address this, I added a validation check. This required a non-breaking API change:

* `create_ipc_workspace_for_all_reduce_fusion` now has an optional `create_metadata` bool, which results in an additional return value.
  * It is made optional because an additional return value could break the API.
* `trtllm_allreduce_fusion` now takes an optional `metadata` dictionary.
  * When provided, this will run the validation check.
  * Again, this is optional to avoid breaking the API.

In addition, this PR deprecates the older AllReduce functionality so it can be removed in a major version bump.

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [ ] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [ ] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [x] Tests have been added or updated as needed.
- [ ] All tests are passing (`unittest`, etc.).

## Summary by CodeRabbit

* **API Changes**
  * Workspace creation can optionally return metadata describing the workspace configuration (`create_metadata` flag).
  * Allreduce fusion operations accept optional metadata to validate runtime parameters against the workspace and raise clear errors on mismatch.
  * A workspace destruction endpoint was renamed for naming consistency.
  * Legacy wrappers were marked deprecated and now point users toward the newer fusion variants.
1 parent c857f09 commit 7d9d7af
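To make the description above concrete, here is a minimal sketch of how a caller might opt into the new metadata path. The numeric sizes, the helper name `make_fusion_workspace`, and the `from flashinfer import comm` import style are assumptions for illustration; the `comm.trtllm_create_ipc_workspace_for_all_reduce_fusion` parameters are the ones touched by this commit.

```python
from typing import Optional

import torch.distributed as dist

from flashinfer import comm


def make_fusion_workspace(
    rank: int, world_size: int, group: Optional[dist.ProcessGroup] = None
):
    # create_metadata=True adds a third return value describing the workspace.
    ipc_handles, workspace_tensor, metadata = (
        comm.trtllm_create_ipc_workspace_for_all_reduce_fusion(
            rank,
            world_size,
            1024,  # max_token_num, illustrative
            4096,  # hidden_dim, illustrative
            use_fp32_lamport=False,
            group=group,
            create_metadata=True,
        )
    )
    # metadata records max_token_num, tp_size, hidden_dim, use_fp32_lamport and
    # the computed buffer sizes. Passing it later as metadata=... to
    # comm.trtllm_allreduce_fusion turns an out-of-bounds launch (the IMA/hang
    # from #1986) into a clear ValueError.
    return ipc_handles, workspace_tensor, metadata
```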

2 files changed: 98 additions, 7 deletions


flashinfer/comm/trtllm_ar.py

Lines changed: 90 additions & 4 deletions
```diff
@@ -19,6 +19,7 @@
 from ctypes import c_void_p, cast
 from types import SimpleNamespace
 from typing import List, Optional, Tuple, Union
+from typing_extensions import deprecated
 
 import torch
 import torch.distributed as dist
@@ -121,6 +122,9 @@ def trtllm_lamport_initialize_all(
         buffer_0_ptr, buffer_1_ptr, buffer_2_ptr, size, dtype
     )
 
+@deprecated(
+    "trtllm_create_ipc_workspace_for_all_reduce and trtllm_custom_all_reduce are deprecated, use trtllm_create_ipc_workspace_for_all_reduce_fusion and trtllm_allreduce_fusion instead"
+)
 @register_custom_op(
     "flashinfer::trtllm_custom_all_reduce",
     mutates_args=[
@@ -393,6 +397,9 @@ def trtllm_moe_finalize_allreduce_fusion(
 LamportTokenNumThreshold = 16
 
 
+@deprecated(
+    "trtllm_create_ipc_workspace_for_all_reduce and trtllm_custom_all_reduce are deprecated, use trtllm_create_ipc_workspace_for_all_reduce_fusion and trtllm_allreduce_fusion instead"
+)
 def trtllm_create_ipc_workspace_for_all_reduce(
     rank: int,
     tp_size: int,
@@ -493,14 +500,18 @@ def trtllm_destroy_ipc_workspace_for_all_reduce(
 MAX_COMM_SIZE = 2147483647 & ~((1 << 21) - 1)  # MAX_INT32 rounded down to 2MB
 
 
+# @TODO(nvmbreughe): on a next major bump, remove create_metadata and make create_metadata=True the default behavior
 def trtllm_create_ipc_workspace_for_all_reduce_fusion(
     tp_rank: int,
     tp_size: int,
     max_token_num: int,
     hidden_dim,
     use_fp32_lamport: bool = False,
     group: Optional[ProcessGroup] = None,
-) -> Tuple[List[List[int]], torch.Tensor]:
+    create_metadata: bool = False,
+) -> Union[
+    Tuple[List[List[int]], torch.Tensor], Tuple[List[List[int]], torch.Tensor, dict]
+]:
     """
     Parameters:
     - tp_rank: the rank of the current process.
@@ -509,6 +520,13 @@ def trtllm_create_ipc_workspace_for_all_reduce_fusion(
     - hidden_dim: the dimension of the hidden states.
     - use_fp32_lamport: if True, we will use fp32 datatype in allreduce fusion.
     - group: the process group to use.
+    - create_metadata: if True, return metadata dict as third element (default: False).
+
+    Returns:
+    - If create_metadata=False: (ipc_handles, workspace_tensor)
+    - If create_metadata=True: (ipc_handles, workspace_tensor, metadata)
+      where metadata contains: tp_rank, tp_size, max_token_num, hidden_dim,
+      use_fp32_lamport, buffer_size, flag_size, lamport_comm_size, lamport_buffer_size
 
     Note:
     We would init 3 IPC buffers for trtllm_custom_all_reduce_fusion.
@@ -517,8 +535,8 @@
     where:
     - buffer_size: tp_size * max_token_num * hidden_dim * sizeof(half)
     - flag_size: tp_size * BarrierFlagCount * sizeof(int)
-    - lamport_buffer_size: tp_size * max(max_token_num, OneShotMaxToken) * tp_size * hidden_dim * sizeof(half)
-
+    - lamport_buffer_size: tp_size * max_token_num * tp_size * hidden_dim * sizeof(half)
+      where sizeof(elem) = 2 (fp16/bf16) or 4 (fp32 when use_fp32_lamport=True)
     The workspace is passed as workspace field in AllReduceFusionParams.
 
     We use tp_size and world_size here interchangeably (allReduceFusion).
@@ -608,7 +626,21 @@
 
     dist.barrier(group=group)  # must sync after create_workspace
 
-    return ipc_handles, workspace_tensor
+    if create_metadata:
+        metadata = {
+            "tp_rank": tp_rank,
+            "tp_size": tp_size,
+            "max_token_num": max_token_num,
+            "hidden_dim": hidden_dim,
+            "use_fp32_lamport": use_fp32_lamport,
+            "buffer_size": buffer_size,
+            "flag_size": flag_size,
+            "lamport_comm_size": lamport_comm_size,
+            "lamport_buffer_size": lamport_buffer_size,
+        }
+        return ipc_handles, workspace_tensor, metadata
+    else:
+        return ipc_handles, workspace_tensor
 
 
 def trtllm_destroy_ipc_workspace_for_all_reduce_fusion(
@@ -675,6 +707,9 @@ def trtllm_lamport_initialize_all(
     )
 
 
+@deprecated(
+    "trtllm_create_ipc_workspace_for_all_reduce and trtllm_custom_all_reduce are deprecated, use trtllm_create_ipc_workspace_for_all_reduce_fusion and trtllm_allreduce_fusion instead"
+)
 def trtllm_custom_all_reduce(
     inp: torch.Tensor,
     out: torch.Tensor,
@@ -791,6 +826,7 @@ def trtllm_allreduce_fusion(
     rms_eps: Optional[float],
     scale_factor: Optional[Union[torch.Tensor, float]],
     layout_code: Optional[QuantizationSFLayout],
+    metadata: Optional[dict] = None,
 ) -> None:
     """
     Parameters:
@@ -815,8 +851,58 @@
     - rms_eps: the rms epsilon value.
     - scale_factor: the scale factor. For cudaGraphs safety, it should be a tensor.
     - layout_code: the layout code.
+    - metadata: optional workspace metadata dict from create_ipc_workspace_for_all_reduce_fusion.
+      If provided, validates that token_num <= max_token_num, world_size == tp_size,
+      and hidden_dim == workspace hidden_dim. Raises ValueError if validation fails.
     """
 
+    # Validate against workspace metadata if provided
+    if metadata is not None:
+        errors = []
+        required_keys = ["max_token_num", "tp_size", "hidden_dim", "use_fp32_lamport"]
+        for key in required_keys:
+            if key not in metadata:
+                errors.append(f"Workspace metadata is missing required key: {key}")
+        if errors:
+            error_msg = "Workspace metadata validation failed:\n" + "\n".join(
+                f"  - {e}" for e in errors
+            )
+            raise ValueError(error_msg)
+
+        # Check 1: token_num must not exceed max_token_num
+        if token_num > metadata["max_token_num"]:
+            errors.append(
+                f"token_num ({token_num}) exceeds workspace max_token_num ({metadata['max_token_num']}). "
+                f"This may cause Illegal Memory Access."
+            )
+
+        # Check 2: world_size must match tp_size
+        if world_size != metadata["tp_size"]:
+            errors.append(
+                f"world_size ({world_size}) does not match workspace tp_size ({metadata['tp_size']}). "
+                f"Workspace was created for tp_size={metadata['tp_size']}."
+            )
+
+        # Check 3: hidden_dim must match
+        if hidden_dim != metadata["hidden_dim"]:
+            errors.append(
+                f"hidden_dim ({hidden_dim}) does not match workspace hidden_dim ({metadata['hidden_dim']}). "
+                f"Workspace was created for hidden_dim={metadata['hidden_dim']}."
+            )
+
+        # Check 4: use_fp32_lamport must match
+        if metadata["use_fp32_lamport"] != (allreduce_in.dtype == torch.float32):
+            errors.append(
+                f"use_fp32_lamport ({metadata['use_fp32_lamport']}) does not match allreduce_in.dtype ({allreduce_in.dtype}). "
+                f"Workspace was created for use_fp32_lamport={metadata['use_fp32_lamport']}."
+            )
+
+        if errors:
+            error_msg = "Workspace validation failed:\n" + "\n".join(
+                f"  - {e}" for e in errors
+            )
+            raise ValueError(error_msg)
+
     if use_oneshot is None:
         use_oneshot = _should_use_oneshot(
             token_num, hidden_dim, allreduce_in.dtype, world_size
```
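As a quick sanity check on the sizing formulas in the updated docstring, here is an illustrative calculation. The tp_size, max_token_num, and hidden_dim values are made up, and flag_size is omitted because BarrierFlagCount is not shown in this diff.

```python
# Illustrative workspace sizing using the docstring formulas; values are made up.
tp_size = 2
max_token_num = 1024
hidden_dim = 4096
elem_size = 2  # bytes per element for fp16/bf16; 4 when use_fp32_lamport=True

buffer_size = tp_size * max_token_num * hidden_dim * elem_size
lamport_buffer_size = tp_size * max_token_num * tp_size * hidden_dim * elem_size

print(buffer_size)          # 16777216 bytes, i.e. 16 MiB
print(lamport_buffer_size)  # 33554432 bytes, i.e. 32 MiB
```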

tests/comm/test_trtllm_allreduce_fusion.py

Lines changed: 8 additions & 3 deletions
```diff
@@ -57,15 +57,16 @@ def _run_correctness_worker(world_size, rank, dtype, hidden_dim, distributed_ini
 
     lamport_use_fp32 = dtype == torch.float32
 
-    # create workspace for allreduce fusion
-    ipc_handles, workspace_tensor = (
+    # create workspace for allreduce fusion with metadata
+    ipc_handles, workspace_tensor, workspace_metadata = (
         comm.trtllm_create_ipc_workspace_for_all_reduce_fusion(
             rank,
             world_size,
             MAX_TOKEN_NUM,
             hidden_dim,
             group=group,
             use_fp32_lamport=lamport_use_fp32,
+            create_metadata=True,  # Get metadata for validation
         )
     )
 
@@ -184,6 +185,7 @@ def _run_correctness_worker(world_size, rank, dtype, hidden_dim, distributed_ini
                 rms_eps=rms_eps,
                 scale_factor=scale_factor,
                 layout_code=swizzled_layout_code,
+                metadata=workspace_metadata,
             )
 
             # NOTE: in real case, you dont have to set all optional params. You could set those required by fusion pattern.
@@ -213,6 +215,7 @@ def _run_correctness_worker(world_size, rank, dtype, hidden_dim, distributed_ini
                 rms_eps=rms_eps,
                 scale_factor=scale_factor,
                 layout_code=swizzled_layout_code,
+                metadata=workspace_metadata,
             )
             # replay
             g.replay()
@@ -304,7 +307,9 @@ def _run_correctness_worker(world_size, rank, dtype, hidden_dim, distributed_ini
     finally:
         dist.barrier(group=group)
 
-        comm.trtllm_destroy_ipc_workspace_for_all_reduce(ipc_handles, group=group)
+        comm.trtllm_destroy_ipc_workspace_for_all_reduce_fusion(
+            ipc_handles, group=group
+        )
 
         dist.destroy_process_group(group=group)
```
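For callers who would rather fail before reaching the kernel entry point at all, the same checks can be run as a pre-flight step against the metadata dict alone. This helper is a hypothetical sketch, not part of the PR.

```python
def check_fits_workspace(
    token_num: int, hidden_dim: int, world_size: int, metadata: dict
) -> None:
    """Raise early if a launch would not fit the fusion workspace (hypothetical helper)."""
    if token_num > metadata["max_token_num"]:
        raise ValueError(
            f"token_num ({token_num}) exceeds workspace max_token_num "
            f"({metadata['max_token_num']}); split the batch or create a larger workspace."
        )
    if world_size != metadata["tp_size"]:
        raise ValueError(
            f"world_size ({world_size}) != workspace tp_size ({metadata['tp_size']})"
        )
    if hidden_dim != metadata["hidden_dim"]:
        raise ValueError(
            f"hidden_dim ({hidden_dim}) != workspace hidden_dim ({metadata['hidden_dim']})"
        )
```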