
Commit 8e30117

[TRTLLM-8129][feat] Apply AutoTuner to AllReduce Op for strategy tuning.
* Added an AUTOTUNE strategy for AllReduce operations that automatically
  selects and applies the best reduction tactic based on tensor
  characteristics.
* Enhanced the AutoTuner's distributed synchronization with improved MPI
  coordination across multiple ranks, so profiling and operation execution
  stay reliable in multi-rank runs. This makes the AutoTuner usable for
  distributed operations.

Signed-off-by: Yukun He <[email protected]>
1 parent f57dc01 commit 8e30117
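
The new strategy is opt-in. Below is a minimal sketch of enabling it through
the PyTorch-backend LLM API; the model path and parallel size are
placeholders, allreduce_strategy is the TorchLlmArgs field extended later in
this commit, and tuning only matters when tensor_parallel_size > 1:

    from tensorrt_llm import LLM

    # Sketch only: placeholder checkpoint path and tensor-parallel size.
    llm = LLM(model="/path/to/model",
              tensor_parallel_size=4,
              allreduce_strategy="AUTOTUNE")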

File tree: 8 files changed, +312 −60 lines

tensorrt_llm/_torch/autotuner.py

Lines changed: 25 additions & 2 deletions

@@ -15,6 +15,7 @@
 import torch

 import tensorrt_llm
+from tensorrt_llm._utils import mpi_barrier, mpi_broadcast
 from tensorrt_llm.bindings.internal.runtime import delay_kernel
 from tensorrt_llm.logger import logger

@@ -534,8 +535,6 @@ def __init__(self, warmup=3, repeat=10, stream_delay_micro_secs=1000):
         # Add statistics tracking
         self.stats = AutoTunerStatistics()

-        self.profiling_debug = True
-
         # Current captured choose_one() contexts
         self._active_capture: Optional['AutoTuner.TacticsCapture'] = None
         # Last captured choose_one() contexts

@@ -786,6 +785,13 @@ def _profile_runners(
         tuning_config: TuningConfig,
         **kwargs,
     ) -> float:
+        """Profile runners and select the best tactic.
+
+        For multi-rank profiling, only rank 0 performs the actual profiling
+        to avoid sync issues when different ranks select different tactics.
+        The results are then broadcasted to all other ranks.
+        """
+
         min_time = float('inf')
         has_tuning_failure_occured = False
         best_runner_id, best_tactic = None, None

@@ -836,6 +842,13 @@ def _profile_runners(
                     min_time = time_measured
                     best_runner_id, best_tactic = runner_id, tac

+        if self._is_sync_op(runner):
+            profiling_results = (best_runner_id, best_tactic, min_time,
+                                 has_tuning_failure_occured)
+            # Broadcast profiling results from rank 0 to all other ranks
+            profiling_results = mpi_broadcast(profiling_results, root=0)
+            best_runner_id, best_tactic, min_time, has_tuning_failure_occured = profiling_results
+
         return best_runner_id, best_tactic, min_time, has_tuning_failure_occured

@@ -871,6 +884,10 @@ def _profile_single_kernel(
         are used to ensure accurate timing.
         """
         stream = torch.cuda.current_stream()
+
+        if self._is_sync_op(runner):
+            mpi_barrier()
+
         # warm up, no timing
         for _ in range(self.warmup):
             runner(inputs, tactic=tactic, **kwargs)

@@ -883,6 +900,9 @@ def _profile_single_kernel(
         start = torch.cuda.Event(enable_timing=True)
         end = torch.cuda.Event(enable_timing=True)

+        if self._is_sync_op(runner):
+            mpi_barrier()
+
         start.record(stream=stream)
         for _ in range(self.repeat):
             runner(inputs, tactic=tactic, **kwargs)

@@ -1065,6 +1085,9 @@ def _prepare_input_tensors(
             tensors.append(tensor)
         return tensors

+    def _is_sync_op(self, runner: TunableRunner) -> bool:
+        return runner.__class__.__name__ in ["AllReduceRunner"]
+
     def clear_cache(self) -> None:
         """Clear the profiling cache."""
         self.profiling_cache.clear()
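
The synchronization contract behind these hooks is easier to see in
isolation: every rank must execute each candidate collective together (a
collective op deadlocks otherwise), but only rank 0's timing is
authoritative, and its verdict is broadcast to the rest. A standalone
sketch, assuming the mpi_barrier/mpi_broadcast/mpi_rank helpers in
tensorrt_llm._utils behave as thin wrappers over COMM_WORLD:

    from tensorrt_llm._utils import mpi_barrier, mpi_broadcast, mpi_rank

    def choose_collective_tactic(tactics, measure):
        # Sketch of the rank-0-decides pattern; `measure` is a hypothetical
        # callable that runs and times one candidate collective op.
        best_tactic, best_time = None, float("inf")
        for tactic in tactics:
            mpi_barrier()              # align ranks before the collective runs
            elapsed = measure(tactic)  # all ranks run it; rank 0's time decides
            if mpi_rank() == 0 and elapsed < best_time:
                best_tactic, best_time = tactic, elapsed
        # Every rank adopts rank 0's choice, so later calls stay in lockstep.
        return mpi_broadcast((best_tactic, best_time), root=0)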

tensorrt_llm/_torch/custom_ops/torch_custom_ops.py

Lines changed: 156 additions & 0 deletions

@@ -8,6 +8,7 @@
 import tensorrt_llm.quantization.utils.fp8_utils as fp8_utils
 from tensorrt_llm import deep_gemm
 from tensorrt_llm._utils import get_sm_version
+from tensorrt_llm.functional import AllReduceFusionOp, AllReduceStrategy

 from ..autotuner import (AutoTuner, ConstraintSpec, DynamicTensorSpec,
                          OptimizationProfile, TunableRunner, TuningConfig)

@@ -1122,6 +1123,161 @@ def _(
     return x.new_empty((b, d), dtype=o_dtype)


+class AllReduceRunner(TunableRunner):
+    all_support_ops = {
+        AllReduceFusionOp.NONE.value,
+        AllReduceFusionOp.RESIDUAL_RMS_NORM.value,
+    }
+
+    tuning_config = TuningConfig(
+        dynamic_tensor_specs=(DynamicTensorSpec(
+            0, 0,
+            (8192, 4096, 2048, 1024, 512, 256, 128, 64, 32, 16, 8, 4, 2, 1),
+            last_positive_power_of_2), ),
+        constraint_specs=(ConstraintSpec(1, 0, lambda shapes: shapes[0][0]), ),
+    )
+
+    def __init__(
+        self,
+        tp_size: int,
+        group: List[int],
+        op: int,
+        eps: float,
+        trigger_completion_at_end: bool,
+    ):
+        self.tp_size = tp_size
+        self.op = op
+        self._group = group
+        self._eps = eps
+        self._trigger_completion_at_end = trigger_completion_at_end
+
+    def __hash__(self):
+        return hash((self.tp_size, self.op))
+
+    def get_valid_tactics(
+        self,
+        inputs: List[torch.Tensor],
+        profile: OptimizationProfile,
+        **kwargs,
+    ) -> List[int]:
+        valid_tactics = [
+            AllReduceStrategy.NCCL.value,
+            AllReduceStrategy.ONESHOT.value,
+        ]
+        if inputs[0].shape[0] >= self.tp_size:
+            valid_tactics.append(AllReduceStrategy.TWOSHOT.value)
+        return valid_tactics
+
+    def forward(
+        self,
+        inputs: List[torch.Tensor],
+        tactic: int = -1,
+    ) -> torch.Tensor:
+        input, residual, norm_weight, scale, bias, workspace = inputs
+        if tactic == -1:
+            tactic = AllReduceStrategy.NCCL.value
+
+        return torch.ops.trtllm.allreduce(
+            input,
+            residual,
+            norm_weight,
+            scale,
+            bias,
+            workspace,
+            self._group,
+            tactic,
+            self.op,
+            self._eps,
+            self._trigger_completion_at_end,
+        )
+
+
+@torch.library.custom_op("trtllm::tunable_allreduce", mutates_args=())
+def tunable_allreduce(
+    input: torch.Tensor,
+    residual: Optional[torch.Tensor],
+    norm_weight: Optional[torch.Tensor],
+    scale: Optional[torch.Tensor],
+    bias: Optional[torch.Tensor],
+    workspace: Optional[torch.Tensor],
+    group: List[int],
+    strategy: int,
+    op: int,
+    eps: float,
+    tp_size: int,
+    trigger_completion_at_end: bool,
+) -> List[torch.Tensor]:
+
+    tuner = AutoTuner.get()
+
+    allreduce_runner = AllReduceRunner(
+        tp_size,
+        group,
+        op,
+        eps,
+        trigger_completion_at_end,
+    )
+
+    _, best_tactic = tuner.choose_one(
+        "trtllm::tunable_allreduce::allreduce",
+        [allreduce_runner],
+        AllReduceRunner.tuning_config,
+        [input, residual, norm_weight, scale, bias, workspace],
+    )
+
+    return allreduce_runner(
+        [input, residual, norm_weight, scale, bias, workspace],
+        tactic=best_tactic,
+    )
+
+
+@tunable_allreduce.register_fake
+def _(
+    input: torch.Tensor,
+    residual: Optional[torch.Tensor],
+    norm_weight: Optional[torch.Tensor],
+    scale: Optional[torch.Tensor],
+    bias: Optional[torch.Tensor],
+    workspace: Optional[torch.Tensor],
+    group: List[int],
+    strategy: int,
+    op: int,
+    eps: float,
+    tp_size: int,
+    trigger_completion_at_end: bool,
+) -> torch.Tensor:
+    if op == int(AllReduceFusionOp.NONE):
+        return [torch.empty_like(input)]
+    elif op == int(AllReduceFusionOp.RESIDUAL_RMS_NORM):
+        norm_out = torch.empty_like(input)
+        residual_out = torch.empty_like(input)
+        return [norm_out, residual_out]
+    elif op == int(AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_FP8):
+        quant_out = torch.empty_like(input, dtype=torch.float8_e4m3fn)
+        residual_out = torch.empty_like(input)
+        return [quant_out, residual_out]
+    elif op == int(AllReduceFusionOp.RESIDUAL_RMS_NORM_OUT_QUANT_FP8):
+        norm_out = torch.empty_like(input)
+        quant_out = torch.empty_like(input, dtype=torch.float8_e4m3fn)
+        residual_out = torch.empty_like(input)
+        return [norm_out, quant_out, residual_out]
+    elif op == int(AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4):
+        fp4_shape, scale_shape = fp4_utils.get_fp4_shape(input.shape, 16)
+        quant_fp4 = input.new_empty(fp4_shape, dtype=torch.uint8)
+        scale_fp4 = input.new_empty(scale_shape, dtype=torch.uint8)
+        residual_out = torch.empty_like(input)
+        return [quant_fp4, scale_fp4, residual_out]
+    elif op == int(AllReduceFusionOp.RESIDUAL_RMS_NORM_OUT_QUANT_NVFP4):
+        fp4_shape, scale_shape = fp4_utils.get_fp4_shape(input.shape, 16)
+        quant_fp4 = input.new_empty(fp4_shape, dtype=torch.uint8)
+        scale_fp4 = input.new_empty(scale_shape, dtype=torch.uint8)
+        norm_out = torch.empty_like(input)
+        residual_out = torch.empty_like(input)
+        return [norm_out, quant_fp4, scale_fp4, residual_out]
+    else:
+        return [torch.empty_like(input)]
+
+
 def get_event(event_idx: int):
     from ..utils import get_model_extra_attrs
     extra_attrs = get_model_extra_attrs()
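
The tuning_config above profiles a fixed ladder of token counts and relies
on last_positive_power_of_2 to collapse arbitrary runtime shapes onto that
ladder. A sketch of the bucketing, under the assumption that the helper
rounds a dimension down to the nearest power of two (the real helper ships
with the autotuner; this stand-in is illustrative only):

    def round_down_pow2(x: int) -> int:
        # Map a runtime first-dimension size onto the profiled
        # ladder 8192, 4096, ..., 2, 1 by rounding down to a power of two.
        p = 1
        while p * 2 <= x:
            p *= 2
        return p

    assert round_down_pow2(1000) == 512    # falls into the 512 bucket
    assert round_down_pow2(8192) == 8192   # ladder endpoint, exact match

The ConstraintSpec(1, 0, lambda shapes: shapes[0][0]) entry then pins the
residual's first dimension to whatever bucket the input landed in, so both
tensors are profiled with consistent shapes.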

tensorrt_llm/_torch/distributed/ops.py

Lines changed: 38 additions & 16 deletions

@@ -505,7 +505,6 @@ def __init__(self,
         self._disable_mpi = mpi_disabled()

         self.all_reduce_op = torch.ops.trtllm.allreduce_pg if self._disable_mpi else torch.ops.trtllm.allreduce
-
         if self.mapping.tp_size > 1:
             # When Strategy is UB, it is guaranteed that the workspace is not used.
             if self.strategy != AllReduceStrategy.UB:

@@ -574,6 +573,7 @@ def forward(
         input = input.contiguous()  # Underlying op requires contiguous input

         allreduce_strategy = self.strategy
+
         if all_reduce_params is None:
             all_reduce_params = AllReduceParams()

@@ -598,21 +598,43 @@
                 "pg": pg.boxed(),
             }

-        output = self.all_reduce_op(
-            input=input,
-            residual=all_reduce_params.residual,
-            norm_weight=all_reduce_params.norm_weight,
-            scale=all_reduce_params.scale,
-            bias=all_reduce_params.bias,
-            workspace=self.workspace,
-            group=self.mapping.tp_group,
-            strategy=allreduce_strategy,
-            op=all_reduce_params.fusion_op,
-            eps=all_reduce_params.eps,
-            trigger_completion_at_end=all_reduce_params.
-            trigger_completion_at_end,
-            **additional_args,
-        )
+        # TODO: args for non mpi version are not supported by Python side custom op
+        # for now, we just fallback to AUTO
+        if self.strategy == AllReduceStrategy.AUTOTUNE:
+            self.strategy = AllReduceStrategy.AUTO
+
+        if self.strategy == AllReduceStrategy.AUTOTUNE:
+            output = torch.ops.trtllm.tunable_allreduce(
+                input=input,
+                residual=all_reduce_params.residual,
+                norm_weight=all_reduce_params.norm_weight,
+                scale=all_reduce_params.scale,
+                bias=all_reduce_params.bias,
+                workspace=self.workspace,
+                group=self.mapping.tp_group,
+                strategy=allreduce_strategy,
+                op=all_reduce_params.fusion_op,
+                eps=all_reduce_params.eps,
+                tp_size=self.mapping.tp_size,
+                trigger_completion_at_end=all_reduce_params.
+                trigger_completion_at_end,
+            )
+        else:
+            output = self.all_reduce_op(
+                input=input,
+                residual=all_reduce_params.residual,
+                norm_weight=all_reduce_params.norm_weight,
+                scale=all_reduce_params.scale,
+                bias=all_reduce_params.bias,
+                workspace=self.workspace,
+                group=self.mapping.tp_group,
+                strategy=allreduce_strategy,
+                op=all_reduce_params.fusion_op,
+                eps=all_reduce_params.eps,
+                trigger_completion_at_end=all_reduce_params.
+                trigger_completion_at_end,
+                **additional_args,
+            )

         return output if len(output) > 1 else output[0]
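
For module authors, opting in is a one-argument change, mirroring the
modeling_llama hookup below. A sketch, assuming AllReduce is importable from
tensorrt_llm._torch.distributed and that mapping and hidden_states already
exist in the caller:

    from tensorrt_llm._torch.distributed import AllReduce
    from tensorrt_llm.functional import AllReduceStrategy

    all_reduce = AllReduce(mapping=mapping,  # a Mapping with tp_size > 1
                           strategy=AllReduceStrategy.AUTOTUNE)
    # forward() routes through trtllm::tunable_allreduce when the strategy
    # is AUTOTUNE (subject to the TODO fallback above).
    output = all_reduce(hidden_states)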

tensorrt_llm/_torch/model_config.py

Lines changed: 2 additions & 1 deletion

@@ -193,7 +193,8 @@ def get_all_reduce_strategy(strategy: str = "AUTO"):
         "TWOSHOT": AllReduceStrategy.TWOSHOT,
         "LOWPRECISION": AllReduceStrategy.LOWPRECISION,
         "MNNVL": AllReduceStrategy.MNNVL,
-        "NCCL_SYMMETRIC": AllReduceStrategy.NCCL_SYMMETRIC
+        "NCCL_SYMMETRIC": AllReduceStrategy.NCCL_SYMMETRIC,
+        "AUTOTUNE": AllReduceStrategy.AUTOTUNE,
     }
     key = strategy.upper()
     return maps[key] if key in maps else AllReduceStrategy.AUTO
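
The lookup is case-insensitive and falls back to AUTO for unrecognized
names, so for example:

    get_all_reduce_strategy("autotune")  # -> AllReduceStrategy.AUTOTUNE
    get_all_reduce_strategy("AUTOTUNE")  # -> AllReduceStrategy.AUTOTUNE
    get_all_reduce_strategy("bogus")     # -> AllReduceStrategy.AUTO (fallback)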

tensorrt_llm/_torch/models/modeling_llama.py

Lines changed: 2 additions & 1 deletion

@@ -650,7 +650,8 @@ def __init__(
             eps=config.rms_norm_eps,
             dtype=config.torch_dtype)

-        self.all_reduce = AllReduce(mapping=model_config.mapping)
+        self.all_reduce = AllReduce(mapping=model_config.mapping,
+                                    strategy=model_config.allreduce_strategy)

         self.next_layer_layernorm: RMSNorm = None
         self.next_attn: LlamaAttention = None

tensorrt_llm/functional.py

Lines changed: 1 addition & 0 deletions

@@ -3883,6 +3883,7 @@ class AllReduceStrategy(IntEnum):
     LOWPRECISION = 6
     MNNVL = 7
     NCCL_SYMMETRIC = 8
+    AUTOTUNE = 9


 class AllReduceFusionOp(IntEnum):
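
Because AllReduceStrategy is an IntEnum, the new member compares equal to
its integer value, which is the form a chosen strategy takes when it crosses
the custom-op boundary as a plain int tactic:

    from tensorrt_llm.functional import AllReduceStrategy

    assert AllReduceStrategy.AUTOTUNE == 9
    assert AllReduceStrategy(9) is AllReduceStrategy.AUTOTUNE
    assert AllReduceStrategy.AUTOTUNE.value == 9  # passed as a tactic id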

tensorrt_llm/llmapi/llm_args.py

Lines changed: 6 additions & 6 deletions

@@ -2535,12 +2535,12 @@ class TorchLlmArgs(BaseLlmArgs):
         status="prototype",
     )

-    allreduce_strategy: Optional[Literal[
-        'AUTO', 'NCCL', 'UB', 'MINLATENCY', 'ONESHOT', 'TWOSHOT',
-        'LOWPRECISION', 'MNNVL',
-        'NCCL_SYMMETRIC']] = Field(default='AUTO',
-                                   description="Allreduce strategy to use.",
-                                   status="beta")
+    allreduce_strategy: Optional[
+        Literal['AUTO', 'NCCL', 'UB', 'MINLATENCY', 'ONESHOT', 'TWOSHOT',
+                'LOWPRECISION', 'MNNVL', 'NCCL_SYMMETRIC',
+                'AUTOTUNE']] = Field(default='AUTO',
+                                     description="Allreduce strategy to use.",
+                                     status="beta")
     checkpoint_loader: Optional[object] = Field(
         default=None,
         description=
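
Because allreduce_strategy is a validated Literal field, a misspelled value
now fails at argument construction rather than deep inside the runtime. A
hedged sketch, assuming the field is validated by pydantic like its
neighbors and that direct TorchLlmArgs construction is permitted (the model
path is a placeholder):

    import pydantic
    from tensorrt_llm.llmapi.llm_args import TorchLlmArgs

    args = TorchLlmArgs(model="/path/to/model",
                        allreduce_strategy="AUTOTUNE")  # accepted

    try:
        TorchLlmArgs(model="/path/to/model",
                     allreduce_strategy="AUTOTUNED")    # not in the Literal
    except pydantic.ValidationError:
        print("rejected: not one of the allowed strategy names")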
