
Commit 159d0a0

Feature: Add support for L40 FusedMoE in cutlass path (#1973)

## 📌 Description

Fixed a few compilation issues for L40, and removed one GEMM tactic for `sm == 89` that crashes due to:

```
Assertion failed: GPU lacks the shared memory resources to run GroupedGEMM kernel
```

## 🧪 Tests

Ran `pytest tests/moe/test_trtllm_cutlass_fused_moe.py` manually on an L40 GPU and verified all tests passed.

<!-- This is an auto-generated comment: release notes by coderabbit.ai -->
## Summary by CodeRabbit

* **New Features**
  * Official support for the SM89 target: build/JIT flags and a public generation path to target it.
* **Bug Fixes / Compatibility**
  * Clarified FP8/FP4 dispatch: FP8 paths are enabled for SM89; FP4 usage remains gated and now requires explicit enablement.
* **Performance**
  * Adjusted kernel/tile selection order for certain FP8 paths to prefer SM89-optimized options.
* **Chores**
  * Reduced logging severity for failed tactic profiling to warn/debug.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: Amir Klein <[email protected]>

1 parent 9ce1af7 · commit 159d0a0
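The tests above were run on an L40. As a quick, illustrative sanity check (standard PyTorch API, not part of this commit) that a device will actually exercise the new SM89 path:

```python
import torch

# L40 / L40S GPUs report compute capability (8, 9), i.e. SM89,
# which is the target this commit adds to the cutlass fused-MoE path.
major, minor = torch.cuda.get_device_capability()
assert (major, minor) == (8, 9), f"Expected an SM89 GPU, got SM{major}{minor}"
```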

File tree

7 files changed: +55 −22 lines changed


csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp

Lines changed: 4 additions & 4 deletions
```diff
@@ -158,14 +158,14 @@ std::vector<CutlassTileConfig> get_candidate_tiles(
               CutlassTileConfig::CtaShape256x128x64_WarpShape64x64x64};
     case CutlassGemmType::Fp8:
       if (config_type_param & CutlassGemmConfig::GROUPED_GEMM) {
-        if (sm == 89 || sm >= 120) {
-          return {CutlassTileConfig::CtaShape16x256x128_WarpShape16x64x128,
-                  CutlassTileConfig::CtaShape32x128x64_WarpShape32x32x64,
+        if (sm == 89 || sm == 120) {
+          return {CutlassTileConfig::CtaShape32x128x64_WarpShape32x32x64,
                   CutlassTileConfig::CtaShape64x128x64_WarpShape64x32x64,
                   CutlassTileConfig::CtaShape64x64x128_WarpShape32x64x64,
                   CutlassTileConfig::CtaShape128x64x64_WarpShape64x32x64,
                   CutlassTileConfig::CtaShape128x256x64_WarpShape64x64x64,
-                  CutlassTileConfig::CtaShape256x128x64_WarpShape64x64x64};
+                  CutlassTileConfig::CtaShape256x128x64_WarpShape64x64x64,
+                  CutlassTileConfig::CtaShape16x256x128_WarpShape16x64x128};
       } else {
         // no valid ampere style fp8 configs for sm90
         return {};
```

csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h

Lines changed: 14 additions & 14 deletions
```diff
@@ -688,28 +688,28 @@ void MoeGemmRunner<T, WeightType, OutputType, ScaleBiasType>::dispatchToArch(
     TLLM_THROW("FP4 data type is not supported on SM < 90");
 #endif
   } else if (sm_ >= 80 && sm_ < 90) {
-#ifdef ENABLE_FP4
-    if constexpr (!std::is_same_v<WeightType, __nv_fp4_e2m1>) {
-      if constexpr (use_fp8 || use_w4afp8) {
+    if constexpr (use_fp8 || use_w4afp8) {
 #if defined(ENABLE_FP8)
-        static_assert(!std::is_same_v<OutputType, __nv_fp8_e4m3> &&
-                          !std::is_same_v<OutputType, __nv_fp8_e5m2>,
-                      "FP8 GEMM Output not supported");
+      static_assert(
+          !std::is_same_v<OutputType, __nv_fp8_e4m3> && !std::is_same_v<OutputType, __nv_fp8_e5m2>,
+          "FP8 GEMM Output not supported");
 #endif
-        TLLM_CHECK_WITH_INFO(sm_ == 89,
-                             "For sm >= 80 and < 90, fp8 is only supported with sm == 89");
-        dispatchMoeGemmToCutlass<T, WeightType, ScaleBiasType, cutlass::arch::Sm89, EpilogueTag>(
-            inputs, multi_processor_count_);
+      TLLM_CHECK_WITH_INFO(sm_ == 89, "For sm >= 80 and < 90, fp8 is only supported with sm == 89");
+      dispatchMoeGemmToCutlass<T, WeightType, ScaleBiasType, cutlass::arch::Sm89, EpilogueTag>(
+          inputs, multi_processor_count_);
+    } else {
+#ifdef ENABLE_FP4
+      if constexpr (std::is_same_v<WeightType, __nv_fp4_e2m1>) {
+        TLLM_THROW("FP4 data type is not supported on SM < 90");
       } else {
         dispatchMoeGemmToCutlass<T, WeightType, ScaleBiasType, cutlass::arch::Sm80, EpilogueTag>(
             inputs, multi_processor_count_);
       }
-    } else {
-      TLLM_THROW("FP4 data type is not supported on SM < 90");
-    }
 #else
-    TLLM_THROW("FP4 data type is not supported on SM < 90");
+      dispatchMoeGemmToCutlass<T, WeightType, ScaleBiasType, cutlass::arch::Sm80, EpilogueTag>(
+          inputs, multi_processor_count_);
 #endif
+    }
   } else if (sm_ >= 90) {
     // For SM120+ FP8 MoE, redirect to SM89 (Ada) FP8 kernel implementations.
     if constexpr (use_fp8) {
```

csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_tma_warp_specialized_traits.h

Lines changed: 8 additions & 2 deletions
```diff
@@ -32,7 +32,8 @@ template <typename T, typename WeightType,
           TmaWarpSpecializedGroupedGemmInput::EpilogueFusion Fusion =
               TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::NONE>
 constexpr bool isValidSM120MOESpecialisation() {
-#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED)  // TODO Is there a better choice
+#if defined(CUTLASS_ARCH_MMA_SM120_SUPPORTED) && \
+    defined(ENABLE_FP4)  // TODO Is there a better choice
   return cutlass::platform::is_same<T, __nv_fp4_e2m1>::value &&
          cutlass::platform::is_same<T, WeightType>::value &&
          cutlass::platform::is_same<EpilogueTag, cutlass_extensions::EpilogueOpDefault>::value &&
@@ -49,8 +50,13 @@ template <typename T, typename WeightType,
 constexpr bool isValidBlackwellMOESpecialisation() {
 #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)  // TODO Is there a better choice
   return (cutlass::platform::is_same<T, WeightType>::value ||
+#if defined(ENABLE_FP4)
          (cutlass::platform::is_same<T, __nv_fp8_e4m3>::value &&
-          cutlass::platform::is_same<WeightType, __nv_fp4_e2m1>::value)) &&
+          cutlass::platform::is_same<WeightType, __nv_fp4_e2m1>::value)
+#else
+          false
+#endif
+              ) &&
          cutlass::platform::is_same<EpilogueTag, cutlass_extensions::EpilogueOpDefault>::value &&
          Fusion == TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::NONE;
 #else
```

flashinfer/autotuner.py

Lines changed: 5 additions & 1 deletion
```diff
@@ -482,8 +482,12 @@ def choose_one(
                 )
             except Exception as e:
                 shapes = self._get_input_sizes(tensors)
+                logger.warning(
+                    f"[Autotuner]: Skipping tactic {r} {tac}, due to failure while profiling."
+                )
 
-                logger.error(
+                # Log stacktrace as debug to not spam log
+                logger.debug(
                     f"[Autotuner]: Failed when profiling {r} {tac}, shapes={shapes}. Error occurred: {e}"
                 )
 
```
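For context, a minimal sketch of the logging pattern this diff adopts (illustrative only; `tactics` and `run_tactic` are hypothetical placeholders, not flashinfer APIs): the skip notice is logged at warning level, while the noisy error detail drops to debug level.

```python
import logging

logger = logging.getLogger("autotuner_sketch")


def profile_tactics(tactics, run_tactic):
    """Time each tactic, skipping any that fail to profile."""
    timings = {}
    for tac in tactics:
        try:
            timings[tac] = run_tactic(tac)
        except Exception as e:
            # Short, visible notice that the tactic was skipped.
            logger.warning(f"[Autotuner]: Skipping tactic {tac}, due to failure while profiling.")
            # Full error detail at debug level, so routine failures do not spam the log.
            logger.debug(f"[Autotuner]: Failed when profiling {tac}. Error occurred: {e}")
    return timings
```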

flashinfer/fused_moe/core.py

Lines changed: 3 additions & 0 deletions
```diff
@@ -37,6 +37,7 @@
     gen_cutlass_fused_moe_sm120_module,
     gen_cutlass_fused_moe_sm100_module,
     gen_cutlass_fused_moe_sm90_module,
+    gen_cutlass_fused_moe_sm89_module,
     gen_trtllm_gen_fused_moe_sm100_module,
 )
 from ..utils import (
@@ -285,6 +286,8 @@ def get_cutlass_fused_moe_module(backend: str = "100", use_fast_build: bool = Fa
         module = gen_cutlass_fused_moe_sm100_module(use_fast_build).build_and_load()
     elif backend == "90":
         module = gen_cutlass_fused_moe_sm90_module(use_fast_build).build_and_load()
+    elif backend == "89":
+        module = gen_cutlass_fused_moe_sm89_module(use_fast_build).build_and_load()
     else:
         raise ValueError(f"Invalid backend: {backend}")
 
```
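A hedged usage sketch of the new dispatch branch, assuming the module path shown above (`flashinfer.fused_moe.core`); the capability-to-backend mapping is illustrative and only covers the backends handled by this function.

```python
import torch

from flashinfer.fused_moe.core import get_cutlass_fused_moe_module

# Map the detected compute capability to the backend string expected above;
# on an L40 (compute capability 8.9) this selects the new "89" branch.
major, minor = torch.cuda.get_device_capability()
backend = f"{major}{minor}"

# JIT-compiles on first use and loads the cutlass fused-MoE module for this arch.
# Unsupported capabilities fall through to the ValueError branch shown in the diff.
module = get_cutlass_fused_moe_module(backend=backend)
```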

flashinfer/jit/core.py

Lines changed: 4 additions & 0 deletions
```diff
@@ -90,6 +90,10 @@ def clear_cache_dir():
     "-DFLASHINFER_ENABLE_FP8_E8M0",
     "-DFLASHINFER_ENABLE_FP4_E2M1",
 ]
+sm89_nvcc_flags = [
+    "-gencode=arch=compute_89,code=sm_89",
+    "-DFLASHINFER_ENABLE_FP8_E8M0",
+]
 sm90a_nvcc_flags = ["-gencode=arch=compute_90a,code=sm_90a"] + common_nvcc_flags
 sm100a_nvcc_flags = ["-gencode=arch=compute_100a,code=sm_100a"] + common_nvcc_flags
 sm103a_nvcc_flags = ["-gencode=arch=compute_103a,code=sm_103a"] + common_nvcc_flags
```

flashinfer/jit/fused_moe.py

Lines changed: 17 additions & 1 deletion
```diff
@@ -18,7 +18,13 @@
 
 from . import env as jit_env
 from ..artifacts import ArtifactPath, CheckSumHash
-from .core import JitSpec, gen_jit_spec, current_compilation_context, sm90a_nvcc_flags
+from .core import (
+    JitSpec,
+    gen_jit_spec,
+    current_compilation_context,
+    sm90a_nvcc_flags,
+    sm89_nvcc_flags,
+)
 from .cpp_ext import is_cuda_version_at_least
 from .cubin_loader import get_cubin, get_meta_hash
 from .gemm.cutlass.generate_kernels import generate_gemm_operations
@@ -71,6 +77,16 @@ def gen_cutlass_fused_moe_sm90_module(use_fast_build: bool = False) -> JitSpec:
     return gen_cutlass_fused_moe_module(nvcc_flags, "90", use_fast_build)
 
 
+def gen_cutlass_fused_moe_sm89_module(use_fast_build: bool = False) -> JitSpec:
+    nvcc_flags = sm89_nvcc_flags + [
+        "-DENABLE_BF16",
+        "-DENABLE_FP8",
+        "-DENABLE_FP8_BLOCK_SCALE" if is_cuda_version_at_least("12.8") else "",
+        "-DUSING_OSS_CUTLASS_MOE_GEMM",
+    ]
+    return gen_cutlass_fused_moe_module(nvcc_flags, "89", use_fast_build)
+
+
 def gen_cutlass_fused_moe_module(
     nvcc_flags: List[str], device_arch: str, use_fast_build: bool = False
 ) -> JitSpec:
```
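Alternatively, the new generation path can be exercised directly; a minimal sketch assuming an SM89 device and the names introduced in this diff:

```python
from flashinfer.jit.fused_moe import gen_cutlass_fused_moe_sm89_module

# Builds the JitSpec with the SM89 nvcc flags (BF16/FP8 enabled, FP4 omitted)
# and loads the resulting extension, mirroring what backend="89" does above.
spec = gen_cutlass_fused_moe_sm89_module(use_fast_build=False)
module = spec.build_and_load()
```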
