@@ -18,7 +18,6 @@
#include "moe_kernels.h"

namespace tensorrt_llm::kernels::cutlass_kernels {
// ==================== Variable batched GEMM specializations ==================================
template class CutlassMoeFCRunner<float, float>;

#ifdef ENABLE_BF16
@@ -38,6 +37,7 @@ template class CutlassMoeFCRunner<__nv_fp8_e4m3, cutlass::uint4b_t, half, half>;
template class CutlassMoeFCRunner<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16>;
template class CutlassMoeFCRunner<__nv_bfloat16, __nv_fp8_e4m3, __nv_bfloat16>;
template class CutlassMoeFCRunner<__nv_fp8_e4m3, cutlass::uint4b_t, __nv_bfloat16, __nv_bfloat16>;
template class CutlassMoeFCRunner<__nv_fp8_e4m3, cutlass::uint4b_t, __nv_bfloat16, __nv_fp8_e4m3>;
#endif
#endif
#ifdef ENABLE_FP4
@@ -54,4 +54,4 @@ template class CutlassMoeFCRunner<__nv_fp8_e4m3, __nv_fp4_e2m1, __nv_bfloat16, _
template class CutlassMoeFCRunner<__nv_bfloat16, __nv_fp4_e2m1>;
#endif
#endif
}; // namespace tensorrt_llm::kernels::cutlass_kernels
} // namespace tensorrt_llm::kernels::cutlass_kernels
853 changes: 416 additions & 437 deletions csrc/fused_moe/cutlass_backend/cutlass_fused_moe_kernels.cuh

Large diffs are not rendered by default.

csrc/fused_moe/cutlass_backend/flashinfer_cutlass_fused_moe_sm100_binding.cu
@@ -72,6 +72,8 @@ class DtypeUtils {
default:
TVM_FFI_ICHECK(false) << "unsupported data type";
}

return nvinfer1::DataType::kFLOAT; // suppress compiler warning
}

private:
@@ -111,6 +113,8 @@ class FusedMoeRunner : public tvm::ffi::ModuleObj {
TVM_FFI_ICHECK(false) << "Invalid output type " << DLDataTypeToString(output_type)
<< " specified for " << DLDataTypeToString(mActivationDtype);
}

return nullptr; // suppress compiler warning
};

FusedMoeRunner(DLDataType activation_dtype, DLDataType weight_dtype, DLDataType output_dtype,
@@ -219,7 +223,11 @@ class FusedMoeRunner : public tvm::ffi::ModuleObj {
}

mProfiler = std::make_shared<kernels::GemmProfilerBackend>();
mAllProfiles = mKernelRunner->getTactics();
// Get tactics for both GEMM1 and GEMM2, combine them
auto gemm1_tactics = mKernelRunner->getTactics(kernels::MoeGemmId::GEMM_1);
auto gemm2_tactics = mKernelRunner->getTactics(kernels::MoeGemmId::GEMM_2);
mAllProfiles = gemm1_tactics;
mAllProfiles.insert(mAllProfiles.end(), gemm2_tactics.begin(), gemm2_tactics.end());
Comment on lines +226 to +230

⚠️ Potential issue | πŸ”΄ Critical

🧩 Analysis chain

Verify tactic selection logic for GEMM1 vs GEMM2.

The tactics from GEMM1 and GEMM2 are now concatenated into a single mAllProfiles vector. Ensure that when selecting tactics by profile_id (e.g., at Line 618), the correct tactics are applied to the corresponding GEMM. The setRunnerProfiles method expects 2 profile IDs, so verify the indexing logic correctly distinguishes between GEMM1 and GEMM2 tactics.


🏁 Script executed:

#!/bin/bash
# Verify how profile_id is used to select tactics and ensure GEMM1/GEMM2 separation

# Check setRunnerProfiles implementation and how it maps profile IDs
rg -n -A 10 -B 3 'setRunnerProfiles' --type cpp

# Look for getTactics usage and how tactics are distinguished
rg -n -A 5 'getTactics.*GEMM' --type cpp

Length of output: 0


🏁 Script executed:

#!/bin/bash
# First, verify the file exists and examine the context around the modified lines
echo "=== Examining the target file around lines 226-230 ==="
head -n 240 csrc/fused_moe/cutlass_backend/flashinfer_cutlass_fused_moe_sm100_binding.cu | tail -n 30

echo ""
echo "=== Search for setRunnerProfiles (broader search) ==="
rg -n 'setRunnerProfiles' csrc/fused_moe/cutlass_backend/

echo ""
echo "=== Search for how mAllProfiles is used ==="
rg -n -C 5 'mAllProfiles' csrc/fused_moe/cutlass_backend/flashinfer_cutlass_fused_moe_sm100_binding.cu

echo ""
echo "=== Search for profile_id usage around line 618 ==="
sed -n '610,630p' csrc/fused_moe/cutlass_backend/flashinfer_cutlass_fused_moe_sm100_binding.cu

echo ""
echo "=== Search for getTactics calls ==="
rg -n 'getTactics' csrc/fused_moe/cutlass_backend/flashinfer_cutlass_fused_moe_sm100_binding.cu

Length of output: 806


🏁 Script executed:

#!/bin/bash
# Examine the target file and trace tactic selection logic

echo "=== Examining lines around 226-230 (mAllProfiles concatenation) ==="
sed -n '220,240p' csrc/fused_moe/cutlass_backend/flashinfer_cutlass_fused_moe_sm100_binding.cu

echo ""
echo "=== Search for mAllProfiles usage throughout the file ==="
rg -n 'mAllProfiles' csrc/fused_moe/cutlass_backend/flashinfer_cutlass_fused_moe_sm100_binding.cu

echo ""
echo "=== Examining setRunnerProfiles method (lines 757-778 mentioned in scratchpad) ==="
sed -n '750,790p' csrc/fused_moe/cutlass_backend/flashinfer_cutlass_fused_moe_sm100_binding.cu

echo ""
echo "=== Search for getTactics calls and GEMM references ==="
rg -n 'getTactics|GEMM_1|GEMM_2' csrc/fused_moe/cutlass_backend/flashinfer_cutlass_fused_moe_sm100_binding.cu

echo ""
echo "=== Examine profile_id usage pattern ==="
sed -n '610,630p' csrc/fused_moe/cutlass_backend/flashinfer_cutlass_fused_moe_sm100_binding.cu

Length of output: 5633


🏁 Script executed:

#!/bin/bash
# Examine the context around line 618 and how profile is used
echo "=== Full context around line 618 (profile selection) ==="
sed -n '600,650p' csrc/fused_moe/cutlass_backend/flashinfer_cutlass_fused_moe_sm100_binding.cu

echo ""
echo "=== Search for where setRunnerProfiles is called ==="
rg -n 'setRunnerProfiles' csrc/fused_moe/cutlass_backend/flashinfer_cutlass_fused_moe_sm100_binding.cu -A 2 -B 2

echo ""
echo "=== Search for mKernelRunner->setTactic calls ==="
rg -n 'setTactic' csrc/fused_moe/cutlass_backend/flashinfer_cutlass_fused_moe_sm100_binding.cu -A 2 -B 2

echo ""
echo "=== Check runMoe signature and how it's called ==="
rg -n 'void runMoe' csrc/fused_moe/cutlass_backend/flashinfer_cutlass_fused_moe_sm100_binding.cu -A 8

Length of output: 5456


🏁 Script executed:

#!/bin/bash
# Search for where profile_ids is generated/populated and if there's any offset logic
echo "=== Search for profile_ids origin and computation ==="
rg -n 'profile_ids' csrc/fused_moe/cutlass_backend/flashinfer_cutlass_fused_moe_sm100_binding.cu | head -30

echo ""
echo "=== Search for getNumProfiles or similar methods ==="
rg -n 'getNumProfiles|getNumTactics|size()' csrc/fused_moe/cutlass_backend/flashinfer_cutlass_fused_moe_sm100_binding.cu

echo ""
echo "=== Check class member getters and how tactics are exposed ==="
sed -n '580,595p' csrc/fused_moe/cutlass_backend/flashinfer_cutlass_fused_moe_sm100_binding.cu

echo ""
echo "=== Look for any comments or logic about profile indexing/offset ==="
rg -n -B 3 -A 3 'mAllProfiles\[' csrc/fused_moe/cutlass_backend/flashinfer_cutlass_fused_moe_sm100_binding.cu

Length of output: 12101


Apply offset correction when indexing GEMM2 tactics from concatenated mAllProfiles.

The tactics from GEMM1 and GEMM2 are concatenated into a single mAllProfiles vector (lines 229–230), but when selecting tactics by profile_ids in setRunnerProfiles (lines 772–775), the GEMM2 profile index is not offset. This causes GEMM2 to incorrectly select from GEMM1's tactics range.

After concatenation, GEMM2 tactics start at index gemm1_tactics.size(), but this offset is not applied when accessing mAllProfiles.at(profile_ids.value()[1]).

Fix: Apply offset for GEMM2 tactic selection:

best_gemm2_profile = profile_ids.value()[1] == -1 ? best_gemm2_profile
                                                  : mAllProfiles.at(profile_ids.value()[1] + gemm1_size);

where gemm1_size = mKernelRunner->getTactics(kernels::MoeGemmId::GEMM_1).size() (stored during initialization).
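
To make the indexing concrete, here is a small standalone sketch of the scheme only, not the real FusedMoeRunner API (Profile, ProfileSelectorSketch, and main() are illustrative stand-ins): after concatenation, a GEMM2 profile id has to be shifted by the number of GEMM1 tactics before indexing the combined vector, while the -1 sentinel keeps the default.

#include <cassert>
#include <cstdint>
#include <optional>
#include <string>
#include <utility>
#include <vector>

// Stand-in for a tactic entry; the real type comes from the cutlass kernel runner.
struct Profile {
  std::string name;
};

// Standalone model of the concatenated-profiles indexing; not the real binding class.
struct ProfileSelectorSketch {
  std::vector<Profile> all_profiles;  // GEMM1 tactics followed by GEMM2 tactics
  std::size_t gemm1_count = 0;        // split point captured when the vectors are combined

  void build(std::vector<Profile> gemm1, std::vector<Profile> const& gemm2) {
    gemm1_count = gemm1.size();
    all_profiles = std::move(gemm1);
    all_profiles.insert(all_profiles.end(), gemm2.begin(), gemm2.end());
  }

  // profile_ids[0] picks a GEMM1 tactic, profile_ids[1] a GEMM2 tactic; -1 keeps the default.
  std::pair<Profile, Profile> select(std::optional<std::vector<int64_t>> const& profile_ids) const {
    Profile gemm1 = all_profiles.front();
    Profile gemm2 = all_profiles.at(gemm1_count);  // first GEMM2 entry as the default
    if (profile_ids.has_value()) {
      if ((*profile_ids)[0] != -1) {
        gemm1 = all_profiles.at((*profile_ids)[0]);
      }
      if ((*profile_ids)[1] != -1) {
        // The stored GEMM2 id is relative to the GEMM2 tactic list, so shift it past GEMM1.
        gemm2 = all_profiles.at((*profile_ids)[1] + gemm1_count);
      }
    }
    return {gemm1, gemm2};
  }
};

int main() {
  ProfileSelectorSketch s;
  s.build({{"g1_a"}, {"g1_b"}}, {{"g2_a"}, {"g2_b"}});
  auto [g1, g2] = s.select(std::vector<int64_t>{1, 0});
  assert(g1.name == "g1_b");
  assert(g2.name == "g2_a");  // without the offset this would wrongly pick g1_a
  return 0;
}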

πŸ€– Prompt for AI Agents
In csrc/fused_moe/cutlass_backend/flashinfer_cutlass_fused_moe_sm100_binding.cu
around lines 226–230 (and referencing setRunnerProfiles around lines 772–775),
the concatenation of GEMM1 and GEMM2 tactics into mAllProfiles does not account
for the GEMM1 length when later indexing GEMM2 tactics, causing wrong
selections; fix by capturing and storing gemm1_size during initialization (e.g.,
gemm1_size = mKernelRunner->getTactics(kernels::MoeGemmId::GEMM_1).size()) and
then when selecting GEMM2 use profile index + gemm1_size (apply this offset
where mAllProfiles.at(profile_ids.value()[1]) is used), keeping the -1 sentinel
handling unchanged so that if profile_ids.value()[1] == -1 you keep the default,
otherwise index mAllProfiles at profile_ids.value()[1] + gemm1_size.

TVM_FFI_ICHECK(!mAllProfiles.empty())
<< "No valid tactics available for fused moe op with the requested input combination "
"Activation: "
@@ -368,25 +376,29 @@ class FusedMoeRunner : public tvm::ffi::ModuleObj {

// TODO: support lora in the future
::tensorrt_llm::kernels::LoraParams lora_params{};
// HACK Define default values for parameters we don't have good values for
bool const swizzled_input_sf = false; // Assume input_sf is not swizzled by default
int64_t const unpadded_hidden_size = hidden_size; // Assume no padding by default
bool const use_lora = false; // No lora support yet
Comment on lines +379 to +382

⚠️ Potential issue | 🟑 Minor

Document assumptions for default parameter values.

The hardcoded defaults for swizzled_input_sf, unpadded_hidden_size, and use_lora may not hold in all scenarios. In particular, setting unpadded_hidden_size = hidden_size assumes no padding, which may be incorrect if padding occurs upstream. Consider:

  1. Validating these assumptions with assertions when these features are actually used (see the sketch after this list)
  2. Adding TODO comments linking to future work that will provide proper values
  3. Documenting these limitations in user-facing documentation
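For item 1, a minimal standalone sketch of the kind of checks intended, under the stated assumptions (check_moe_default_assumptions is a hypothetical helper, and the real binding would more likely report failures via TVM_FFI_ICHECK than assert):

#include <cassert>
#include <cstdint>

// Hypothetical helper; names and the TODO placeholder are not from the PR.
inline void check_moe_default_assumptions(bool swizzled_input_sf, int64_t unpadded_hidden_size,
                                          int64_t hidden_size, bool use_lora) {
  // TODO(tracking issue TBD): derive unpadded_hidden_size from upstream tensor metadata
  // or accept it as an explicit parameter instead of assuming hidden_size.
  assert(unpadded_hidden_size <= hidden_size && "upstream padding would break this default");
  // Hard assumptions of the current code path; fail loudly if they are ever violated.
  assert(!swizzled_input_sf && "swizzled input_sf is not handled here");
  assert(!use_lora && "LoRA parameters are defaulted; enabling LoRA here is unsupported");
  (void)swizzled_input_sf;
  (void)unpadded_hidden_size;
  (void)hidden_size;
  (void)use_lora;
}

int main() {
  int64_t const hidden_size = 4096;
  check_moe_default_assumptions(/*swizzled_input_sf=*/false,
                                /*unpadded_hidden_size=*/hidden_size, hidden_size,
                                /*use_lora=*/false);
  return 0;
}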
πŸ€– Prompt for AI Agents
In csrc/fused_moe/cutlass_backend/flashinfer_cutlass_fused_moe_sm100_binding.cu
around lines 372-375 the code hardcodes defaults for swizzled_input_sf,
unpadded_hidden_size and use_lora which may be incorrect in some configurations;
update the code to (1) add a TODO comment with a link to the tracking issue/PR
for providing proper values, (2) add runtime assertions or checks where these
flags/values are actually used (e.g., assert unpadded_hidden_size <= hidden_size
and verify swizzled_input_sf only when the input layout indicates swizzling),
and (3) if possible derive unpadded_hidden_size from upstream tensor metadata or
add it as an explicit parameter to the caller API and fall back to the current
default only with a clear warning log; also add a brief note in the module’s
user-facing docs describing this limitation and the expectation until full
support is implemented.

#ifdef USING_OSS_CUTLASS_MOE_GEMM
mKernelRunner->runMoe(input->data, input_sf.has_value() ? input_sf.value()->data : nullptr,
reinterpret_cast<int const*>(token_selected_experts->data),
token_final_scales.has_value()
? reinterpret_cast<float const*>(token_final_scales.value()->data)
: nullptr,
fc1_expert_weights->data,
fc1_expert_biases.has_value() ? fc1_expert_biases.value()->data : nullptr,
activation_params, fc2_expert_weights->data,
fc2_expert_biases.has_value() ? fc2_expert_biases.value()->data : nullptr,
quant_params, num_rows, hidden_size, inter_size, num_experts_total,
static_cast<int>(experts_per_token),
static_cast<char*>(workspace_info.workspace->data), output->data,
static_cast<int*>(workspace_info.src_to_dest_map), parallelism_config,
enable_alltoall, false, lora_params, mUseDeepSeekFP8BlockScaling,
min_latency_mode, min_latency_params, enable_pdl, stream);
mKernelRunner->runMoe(
input->data, input_sf.has_value() ? input_sf.value()->data : nullptr, swizzled_input_sf,
reinterpret_cast<int const*>(token_selected_experts->data),
token_final_scales.has_value()
? reinterpret_cast<float const*>(token_final_scales.value()->data)
: nullptr,
fc1_expert_weights->data,
fc1_expert_biases.has_value() ? fc1_expert_biases.value()->data : nullptr,
activation_params, fc2_expert_weights->data,
fc2_expert_biases.has_value() ? fc2_expert_biases.value()->data : nullptr, quant_params,
num_rows, hidden_size, unpadded_hidden_size, inter_size, num_experts_total,
static_cast<int>(experts_per_token), static_cast<char*>(workspace_info.workspace->data),
output->data, static_cast<int*>(workspace_info.src_to_dest_map), parallelism_config,
enable_alltoall, use_lora, lora_params, mUseDeepSeekFP8BlockScaling, min_latency_mode,
min_latency_params, enable_pdl, stream);
#else
mKernelRunner->runMoe(
input->data, input_sf.has_value() ? input_sf.value()->data : nullptr,
input->data, input_sf.has_value() ? input_sf.value()->data : nullptr, swizzled_input_sf,
reinterpret_cast<int const*>(token_selected_experts->data),
token_final_scales.has_value()
? reinterpret_cast<float const*>(token_final_scales.value()->data)
@@ -395,10 +407,11 @@
fc1_expert_biases.has_value() ? fc1_expert_biases.value()->data : nullptr,
activation_params, fc2_expert_weights->data,
fc2_expert_biases.has_value() ? fc2_expert_biases.value()->data : nullptr, quant_params,
num_rows, hidden_size, inter_size, num_experts_total, static_cast<int>(experts_per_token),
static_cast<char*>(workspace_info.workspace), output->data,
static_cast<int*>(workspace_info.src_to_dest_map), parallelism_config, false, lora_params,
mUseDeepSeekFP8BlockScaling, min_latency_mode, min_latency_params, enable_pdl, stream);
num_rows, hidden_size, unpadded_hidden_size, inter_size, num_experts_total,
static_cast<int>(experts_per_token), static_cast<char*>(workspace_info.workspace),
output->data, static_cast<int*>(workspace_info.src_to_dest_map), parallelism_config, false,
use_lora, lora_params, mUseDeepSeekFP8BlockScaling, min_latency_mode, min_latency_params,
enable_pdl, stream);
#endif
}

@@ -544,25 +557,29 @@ class FusedMoeRunner : public tvm::ffi::ModuleObj {

// TODO: support lora in the future
::tensorrt_llm::kernels::LoraParams lora_params{};
// HACK Define default values for parameters we don't have good values for
bool const swizzled_input_sf_ml = false; // Assume input_sf is not swizzled by default
int64_t const unpadded_hidden_size_ml = hidden_size; // Assume no padding by default
bool const use_lora_ml = false; // No lora support yet
#ifdef USING_OSS_CUTLASS_MOE_GEMM
mKernelRunner->runMoe(input->data, input_sf.has_value() ? input_sf.value()->data : nullptr,
reinterpret_cast<int const*>(token_selected_experts->data),
token_final_scales.has_value()
? reinterpret_cast<float const*>(token_final_scales.value()->data)
: nullptr,
fc1_expert_weights->data,
fc1_expert_biases.has_value() ? fc1_expert_biases.value()->data : nullptr,
activation_params, fc2_expert_weights->data,
fc2_expert_biases.has_value() ? fc2_expert_biases.value()->data : nullptr,
quant_params, num_rows, hidden_size, inter_size, num_experts_total,
static_cast<int>(experts_per_token),
static_cast<char*>(workspace_info.workspace->data), output->data,
static_cast<int*>(workspace_info.src_to_dest_map), parallelism_config,
enable_alltoall, false, lora_params, mUseDeepSeekFP8BlockScaling,
min_latency_mode, min_latency_params, enable_pdl, stream);
mKernelRunner->runMoe(
input->data, input_sf.has_value() ? input_sf.value()->data : nullptr, swizzled_input_sf_ml,
reinterpret_cast<int const*>(token_selected_experts->data),
token_final_scales.has_value()
? reinterpret_cast<float const*>(token_final_scales.value()->data)
: nullptr,
fc1_expert_weights->data,
fc1_expert_biases.has_value() ? fc1_expert_biases.value()->data : nullptr,
activation_params, fc2_expert_weights->data,
fc2_expert_biases.has_value() ? fc2_expert_biases.value()->data : nullptr, quant_params,
num_rows, hidden_size, unpadded_hidden_size_ml, inter_size, num_experts_total,
static_cast<int>(experts_per_token), static_cast<char*>(workspace_info.workspace->data),
output->data, static_cast<int*>(workspace_info.src_to_dest_map), parallelism_config,
enable_alltoall, use_lora_ml, lora_params, mUseDeepSeekFP8BlockScaling, min_latency_mode,
min_latency_params, enable_pdl, stream);
#else
mKernelRunner->runMoe(
input->data, input_sf.has_value() ? input_sf.value()->data : nullptr,
input->data, input_sf.has_value() ? input_sf.value()->data : nullptr, swizzled_input_sf_ml,
reinterpret_cast<int const*>(token_selected_experts->data),
token_final_scales.has_value()
? reinterpret_cast<float const*>(token_final_scales.value()->data)
@@ -571,10 +588,11 @@
fc1_expert_biases.has_value() ? fc1_expert_biases.value()->data : nullptr,
activation_params, fc2_expert_weights->data,
fc2_expert_biases.has_value() ? fc2_expert_biases.value()->data : nullptr, quant_params,
num_rows, hidden_size, inter_size, num_experts_total, static_cast<int>(experts_per_token),
static_cast<char*>(workspace_info.workspace), output->data,
static_cast<int*>(workspace_info.src_to_dest_map), parallelism_config, false, lora_params,
mUseDeepSeekFP8BlockScaling, min_latency_mode, min_latency_params, enable_pdl, stream);
num_rows, hidden_size, unpadded_hidden_size_ml, inter_size, num_experts_total,
static_cast<int>(experts_per_token), static_cast<char*>(workspace_info.workspace),
output->data, static_cast<int*>(workspace_info.src_to_dest_map), parallelism_config, false,
use_lora_ml, lora_params, mUseDeepSeekFP8BlockScaling, min_latency_mode, min_latency_params,
enable_pdl, stream);
#endif
}

@@ -636,19 +654,20 @@ class FusedMoeRunner : public tvm::ffi::ModuleObj {
auto activation_dtype =
(mUseW4GroupScaling && !isWFP4A16Quant()) ? dl_float8_e4m3fn : mActivationDtype;
activation_dtype = isNvfp4Quant() ? dl_int64 : activation_dtype;
int64_t const unpadded_hidden_size_profiler = hidden_size; // HACK no padding by default
#ifdef USING_OSS_CUTLASS_MOE_GEMM
mProfiler->init(*mKernelRunner.get(), mProfiler->mGemmToProfile,
DtypeUtils::dataType(activation_dtype), DtypeUtils::dataType(mWeightDtype),
DtypeUtils::dataType(mOutputDtype), num_experts, static_cast<int>(top_k),
hidden_size, inter_size, group_size, activation_type, USE_BIAS, USE_LORA,
min_latency_mode,
hidden_size, unpadded_hidden_size_profiler, inter_size, group_size,
activation_type, USE_BIAS, USE_LORA, min_latency_mode,
/*need_weights*/ false, parallelism_config, enable_alltoall);
#else
mProfiler->init(*mKernelRunner.get(), mProfiler->mGemmToProfile,
DtypeUtils::dataType(activation_dtype), DtypeUtils::dataType(mWeightDtype),
DtypeUtils::dataType(mOutputDtype), num_experts, static_cast<int>(top_k),
hidden_size, inter_size, group_size, activation_type, USE_BIAS, USE_LORA,
min_latency_mode,
hidden_size, unpadded_hidden_size_profiler, inter_size, group_size,
activation_type, USE_BIAS, USE_LORA, min_latency_mode,
/*need_weights*/ false, parallelism_config);
#endif

3 changes: 3 additions & 0 deletions csrc/nv_internal/include/tensorrt_llm/common/cudaUtils.h
@@ -1181,6 +1181,9 @@ using Int = ConstExprWrapper<int, VALUE>;
template <bool VALUE>
using Bool = ConstExprWrapper<bool, VALUE>;

template <bool VALUE>
using ConstBool = ConstExprWrapper<bool, VALUE>;

template <typename T>
struct TmaDescType;
