Reverse allreduceOp.cpp changes.

hyukn · hyukn · commit 6535fa5c78f2 · 2025-10-23T01:52:03.000-07:00
Signed-off-by: Yukun He &lt;23156053+hyukn@users.noreply.github.com&gt;
diff --git a/cpp/tensorrt_llm/thop/allreduceOp.cpp b/cpp/tensorrt_llm/thop/allreduceOp.cpp
@@ -1048,6 +1048,29 @@ class AllreduceOp
             return AllReduceStrategyType::NCCL;
         }
 
+        // This rule based heuristic only chooses between NCCL and MIN_LATENCY strategies.
+
+        // Heurisitic will only be applied on NONE and RESIDUAL_RMS_NORM fusion types.
+        // Because NCCL might be faster on some large messageSize cases.
+        // Otherwise, MIN_LATENCY strategy will be directly returned due to more fusions it can support.
+        // TODO: NCCL AllReduce + subsequent quantization ops (as fallback) can also support the fusion types.
+        // This should be compared with MIN_LATENCY fused kernels to determine the best strategy.
+        switch (mOp)
+        {
+        case AllReduceFusionOp::NONE:
+        case AllReduceFusionOp::RESIDUAL_RMS_NORM: break;
+        case AllReduceFusionOp::RESIDUAL_RMS_NORM_QUANT_FP8:
+        case AllReduceFusionOp::RESIDUAL_RMS_NORM_OUT_QUANT_FP8:
+        case AllReduceFusionOp::RESIDUAL_RMS_NORM_QUANT_NVFP4:
+        case AllReduceFusionOp::RESIDUAL_RMS_NORM_OUT_QUANT_NVFP4: return AllReduceStrategyType::MIN_LATENCY;
+        // Suppose NCCL has fallback implementations for all fusion types.
+        default: return AllReduceStrategyType::NCCL;
+        }
+
+        // Check mOp to be supported by the heuristic.
+        TORCH_CHECK(mOp == AllReduceFusionOp::NONE || mOp == AllReduceFusionOp::RESIDUAL_RMS_NORM,
+            "Only NONE and RESIDUAL_RMS_NORM are supported for NCCL/MIN_LATENCY heuristic.");
+
         // Default to NCCL.
         AllReduceStrategyType strategy = AllReduceStrategyType::NCCL;