File tree Expand file tree Collapse file tree 1 file changed +23
-0
lines changed Expand file tree Collapse file tree 1 file changed +23
-0
lines changed Original file line number Diff line number Diff line change @@ -1048,6 +1048,29 @@ class AllreduceOp
10481048 return AllReduceStrategyType::NCCL;
10491049 }
10501050
1051+ // This rule based heuristic only chooses between NCCL and MIN_LATENCY strategies.
1052+
1053+ // The heuristic is only applied to the NONE and RESIDUAL_RMS_NORM fusion types,
1054+ // because NCCL might be faster in some large messageSize cases.
1055+ // Otherwise, MIN_LATENCY strategy will be directly returned due to more fusions it can support.
1056+ // TODO: NCCL AllReduce + subsequent quantization ops (as fallback) can also support the fusion types.
1057+ // This should be compared with MIN_LATENCY fused kernels to determine the best strategy.
1058+ switch (mOp )
1059+ {
1060+ case AllReduceFusionOp::NONE:
1061+ case AllReduceFusionOp::RESIDUAL_RMS_NORM: break ;
1062+ case AllReduceFusionOp::RESIDUAL_RMS_NORM_QUANT_FP8:
1063+ case AllReduceFusionOp::RESIDUAL_RMS_NORM_OUT_QUANT_FP8:
1064+ case AllReduceFusionOp::RESIDUAL_RMS_NORM_QUANT_NVFP4:
1065+ case AllReduceFusionOp::RESIDUAL_RMS_NORM_OUT_QUANT_NVFP4: return AllReduceStrategyType::MIN_LATENCY;
1066+ // Assume NCCL has fallback implementations for all other fusion types.
1067+ default : return AllReduceStrategyType::NCCL;
1068+ }
1069+
1070+ // Check mOp to be supported by the heuristic.
1071+ TORCH_CHECK (mOp == AllReduceFusionOp::NONE || mOp == AllReduceFusionOp::RESIDUAL_RMS_NORM,
1072+ " Only NONE and RESIDUAL_RMS_NORM are supported for NCCL/MIN_LATENCY heuristic." );
1073+
10511074 // Default to NCCL.
10521075 AllReduceStrategyType strategy = AllReduceStrategyType::NCCL;
10531076
You can’t perform that action at this time.
0 commit comments