Commit

TP communication overlap: enable the overlap between GEMM chunks at Hopper BF16

Signed-off-by: Sangkug Lym <[email protected]>
erhoo82 committed Nov 4, 2024
1 parent 05c0fb0 commit c4dcd95
Showing 1 changed file with 4 additions and 1 deletion.
transformer_engine/pytorch/cpp_extensions/gemm.py (4 additions, 1 deletion):

@@ -284,10 +284,13 @@ def gemm(
         assert (
             extra_output_tensor is not None
         ), "SPLIT_PIPELINED_RS requires extra output tensor"
+        # Disable the overlap between GEMM chunks at ampere and below
+        major, _ = torch.cuda.get_device_capability()
+        overlap_gemm_chunks = True if major >= 9 else False
         args = tuple(
             args
             + (
-                False,
+                overlap_gemm_chunks,
                 extra_output_tensor,
             )
         )
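The gating logic added by this commit can be sketched in isolation as follows. This is a minimal illustration, not the Transformer Engine code itself: `gemm_chunk_overlap_enabled` is a hypothetical helper name, and the `(major, minor)` tuple it takes mirrors what the real PyTorch API `torch.cuda.get_device_capability()` returns (e.g. `(9, 0)` for Hopper H100, `(8, 0)` for Ampere A100).

```python
def gemm_chunk_overlap_enabled(capability):
    """Decide whether GEMM chunks may overlap, given a CUDA compute
    capability tuple as returned by torch.cuda.get_device_capability().

    Hypothetical helper illustrating the commit's gating rule: overlap is
    enabled only on Hopper (compute capability 9.x) and newer GPUs.
    """
    major, _ = capability
    # Ampere (8.x) and below: disable overlap between GEMM chunks.
    return major >= 9

# Examples of the gate:
print(gemm_chunk_overlap_enabled((9, 0)))  # Hopper  -> True
print(gemm_chunk_overlap_enabled((8, 0)))  # Ampere  -> False
```

As a side note, the committed expression `True if major >= 9 else False` is equivalent to the plain comparison `major >= 9` used above; the behavior is identical.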
