|
15 | 15 | import torch |
16 | 16 |
|
17 | 17 | import tensorrt_llm |
18 | | -from tensorrt_llm._utils import mpi_barrier |
| 18 | +from tensorrt_llm._utils import mpi_barrier, mpi_broadcast |
19 | 19 | from tensorrt_llm.bindings.internal.runtime import delay_kernel |
20 | 20 | from tensorrt_llm.logger import logger |
21 | 21 |
|
@@ -659,6 +659,13 @@ def _profile_runners( |
659 | 659 | tuning_config: TuningConfig, |
660 | 660 | **kwargs, |
661 | 661 | ) -> float: |
| 662 | + """Profile runners and select the best tactic. |
| 663 | +
|
| 664 | + For multi-rank profiling, only rank 0 performs the actual profiling |
| 665 | + to avoid sync issues when different ranks select different tactics. |
| 666 | + The results are then broadcast to all other ranks. |
| 667 | + """ |
| 668 | + |
662 | 669 | min_time = float('inf') |
663 | 670 | has_tuning_failure_occured = False |
664 | 671 | best_runner_id, best_tactic = None, None |
@@ -709,6 +716,13 @@ def _profile_runners( |
709 | 716 | min_time = time_measured |
710 | 717 | best_runner_id, best_tactic = runner_id, tac |
711 | 718 |
|
| 719 | + if self._is_sync_op(runner): |
| 720 | + profiling_results = (best_runner_id, best_tactic, min_time, |
| 721 | + has_tuning_failure_occured) |
| 722 | + # Broadcast profiling results from rank 0 to all other ranks |
| 723 | + profiling_results = mpi_broadcast(profiling_results, root=0) |
| 724 | + best_runner_id, best_tactic, min_time, has_tuning_failure_occured = profiling_results |
| 725 | + |
712 | 726 | return best_runner_id, best_tactic, min_time, has_tuning_failure_occured |
713 | 727 |
|
714 | 728 | def _get_input_sizes(self, inputs: List[torch.Tensor]) -> List[torch.Size]: |
|
0 commit comments