 #include <numeric>
 #include <random>
 #include <sstream>
+#include <type_traits>
 
 #include "tensorrt_llm/common/memoryUtils.h"
 #include "tensorrt_llm/common/workspace.h"
@@ -865,7 +866,7 @@ void threeStepBuildExpertMapsSortFirstToken(
 // ============================== Infer GEMM sizes =================================
 // TODO Could linear search be better for small # experts
 template <class T>
-__device__ inline int64_t findTotalEltsLessThanTarget(T const* sorted_indices,
+__device__ inline int64_t findTotalEltsLessThanTarget_v1(T const* sorted_indices,
                                                       int64_t const arr_length, T const target) {
   int64_t low = 0, high = arr_length - 1, target_location = -1;
   while (low <= high) {
@@ -881,6 +882,48 @@ __device__ inline int64_t findTotalEltsLessThanTarget(T const* sorted_indices,
   return target_location + 1;
 }
 
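+// Warp-parallel replacement for the binary search above. In a sorted array the
+// number of elements strictly less than `target` equals the insertion index the
+// binary search computes, so each lane counts the elements of its strided slice
+// that compare below target, a shuffle tree-reduction sums the 32 lane counts
+// into lane 0, and the result is broadcast back to every lane. Assumes
+// ARR_LENGTH_CONST is a multiple of 32 and that all lanes of a warp reach this
+// call together with identical arguments; traps if the runtime length differs.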
+template <int ARR_LENGTH_CONST, class T>
+__device__ inline int64_t findTotalEltsLessThanTarget_v2(T const* sorted_indices, int64_t const arr_length, T const target) {
+  if (arr_length != ARR_LENGTH_CONST) {
+    asm("trap;");
+  }
+
+  constexpr unsigned full_mask = 0xffffffffu;
+  constexpr int WARP_SZ = 32;
+  const int lane_id = threadIdx.x & (WARP_SZ - 1);
+
+  int local_count = 0;
+#pragma unroll
+  for (int k = 0; k < ARR_LENGTH_CONST / WARP_SZ; ++k) {
+    const int idx = lane_id + k * WARP_SZ;
+    T v = sorted_indices[idx];
+    local_count += (v < target) ? 1 : 0;
+  }
+
+#pragma unroll
+  for (int offset = 16; offset > 0; offset >>= 1) {
+    local_count += __shfl_down_sync(full_mask, local_count, offset);
+  }
+  int total = __shfl_sync(full_mask, local_count, 0);
+
+  return (int64_t)total;
+}
+
+template <int ARR_LENGTH_CONST, class T>
+__device__ inline int64_t findTotalEltsLessThanTarget(T const* sorted_indices, int64_t const arr_length, T const target) {
+  // return findTotalEltsLessThanTarget_v1(sorted_indices, arr_length, target);
+
+  return findTotalEltsLessThanTarget_v2<ARR_LENGTH_CONST>(sorted_indices, arr_length, target);
+
+  // int64_t out_v1 = findTotalEltsLessThanTarget_v1(sorted_indices, arr_length, target);
+  // int64_t out_v2 = findTotalEltsLessThanTarget_v2<ARR_LENGTH_CONST>(sorted_indices, arr_length, target);
+  // if (out_v1 != out_v2) {
+  //   printf("different output! v1=%lld v2=%lld\n", (long long)out_v1, (long long)out_v2);
+  //   asm("trap;");
+  // }
+  // return out_v1;
+}
+
 template <class T>
 using sizeof_bits = cutlass::sizeof_bits<
     typename cutlass_kernels::TllmToCutlassTypeAdapter<std::remove_cv_t<T>>::type>;
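For intuition, here is a minimal host-side sketch (illustrative only, not part of the commit) of the invariant the `_v2` rewrite relies on: in a sorted, non-decreasing array, the count of elements strictly less than `target` equals the insertion index that the `_v1` binary search (equivalently `std::lower_bound`) computes.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // Shaped like expert_first_token_offset: a sorted prefix-sum array.
  std::vector<int64_t> offsets = {0, 3, 3, 7, 12};
  for (int64_t target = 0; target <= 13; ++target) {
    int64_t count = 0;  // what _v2 computes, lane-parallel on the GPU
    for (int64_t v : offsets) count += (v < target) ? 1 : 0;
    // what _v1 computes via binary search
    int64_t idx = std::lower_bound(offsets.begin(), offsets.end(), target) - offsets.begin();
    assert(count == idx);
  }
  return 0;
}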
@@ -1418,16 +1461,19 @@ constexpr static int EXPAND_THREADS_PER_BLOCK = 256;
 
 template <class InputActivationsType, class ExpandedActivationsType,
           TmaWarpSpecializedGroupedGemmInput::FpXBlockScalingType BlockScalingType,
-          bool PRE_QUANT_AWQ>
+          bool PRE_QUANT_AWQ, int NUM_EXPERTS_PER_NODE_CONST = 128>
 __global__ void expandInputRowsKernel(
     InputActivationsType const* unpermuted_input, ExpandedActivationsType* permuted_output,
     float const* unpermuted_scales, float* permuted_scales,
-    int const* permuted_row_to_unpermuted_row, int64_t const num_tokens, int64_t const hidden_size,
+    int const* permuted_row_to_unpermuted_row, int64_t const num_tokens, int64_t const hidden_size_real_,
     int64_t const k, float const* fc1_act_global_scale, bool use_per_expert_act_scale,
     int64_t const* expert_first_token_offset,
     TmaWarpSpecializedGroupedGemmInput::ElementSF* fc1_act_sf_flat,
     TmaWarpSpecializedGroupedGemmInput::ElementSF const* input_sf,
     int64_t const num_experts_per_node, InputActivationsType const* prequant_scales = nullptr) {
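+  // Hard-code the hidden size for the target model shape so the per-row loop
+  // bounds and strides below become compile-time constants; trap if any other
+  // shape reaches this specialized kernel.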
+  constexpr int hidden_size = 7168;
+  if (hidden_size != hidden_size_real_) { asm("trap;"); }
+
   static_assert(BlockScalingType == TmaWarpSpecializedGroupedGemmInput::FpXBlockScalingType::NONE ||
                     !PRE_QUANT_AWQ,
                 "AWQ and Block Scaling are mutually exclusive");
@@ -1503,14 +1549,14 @@ __global__ void expandInputRowsKernel(
                          permuted_row * hidden_size / ELEM_PER_THREAD;
 
     int64_t const start_offset = threadIdx.x;
-    int64_t const stride = EXPAND_THREADS_PER_BLOCK;
-    int64_t const num_elems_in_col = hidden_size / ELEM_PER_THREAD;
+    constexpr int64_t stride = EXPAND_THREADS_PER_BLOCK;
+    constexpr int64_t num_elems_in_col = hidden_size / ELEM_PER_THREAD;
     assert(hidden_size % ELEM_PER_THREAD == 0);
     assert(hidden_size % VecSize == 0);
 
     if constexpr (is_nvfp4 || is_mxfp8) {
       static_assert(ELEM_PER_THREAD == 8, "Expecting 8 elements per thread for quantized types");
-      int64_t expert = findTotalEltsLessThanTarget(expert_first_token_offset, num_experts_per_node,
+      int64_t expert = findTotalEltsLessThanTarget<NUM_EXPERTS_PER_NODE_CONST>(expert_first_token_offset, num_experts_per_node,
                                                    (int64_t)permuted_row + 1) -
                        1;
@@ -1519,6 +1565,7 @@ __global__ void expandInputRowsKernel(
       float global_scale_val = fc1_act_global_scale ? fc1_act_global_scale[act_scale_idx] : 1.0f;
       int64_t num_tokens_before_expert = expert_first_token_offset[expert];
 
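+      // num_elems_in_col and stride are compile-time constants now, so the
+      // copy/quantize loop below can be fully unrolled.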
+#pragma unroll
       for (int elem_index = start_offset; elem_index < num_elems_in_col; elem_index += stride) {
         auto in_vec = source_row_ptr[elem_index];
         if constexpr (need_nvfp4_quant || need_mxfp8_quant) {
@@ -1687,9 +1734,20 @@ void expandInputRowsKernelLauncher(
       TLLM_CHECK_WITH_INFO(quant_params.fp4.fc1.weight_block_scale,
                            "NVFP4 block scaling is expected for FP4xFP4");
       TLLM_CHECK_WITH_INFO(!prequant_scales, "NVFP4 is not supported for AWQ");
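+      // Host-side dispatch: instantiate the kernel per supported expert count
+      // so the warp-count search sees a compile-time array length.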
-      return &expandInputRowsKernel<InputActivationsType, ExpandedActivationsType,
+      if (num_experts_per_node == 128) {
+        constexpr int NUM_EXPERTS_PER_NODE_CONST = 128;
+        return &expandInputRowsKernel<InputActivationsType, ExpandedActivationsType,
                                     TmaWarpSpecializedGroupedGemmInput::FpXBlockScalingType::NVFP4,
-                                     false>;
+                                     false, NUM_EXPERTS_PER_NODE_CONST>;
+      }
+      if (num_experts_per_node == 64) {
+        constexpr int NUM_EXPERTS_PER_NODE_CONST = 64;
+        return &expandInputRowsKernel<InputActivationsType, ExpandedActivationsType,
+                                      TmaWarpSpecializedGroupedGemmInput::FpXBlockScalingType::NVFP4,
+                                      false, NUM_EXPERTS_PER_NODE_CONST>;
+      }
+      printf("unsupported num_experts_per_node\n");
+      exit(1);
     } else
 #endif
     {
@@ -1748,11 +1806,16 @@ constexpr static int FINALIZE_THREADS_PER_BLOCK = 256;
 // This kernel unpermutes the original data, does the k-way reduction and performs the final skip
 // connection.
 template <typename OutputType, class GemmOutputType, class ScaleBiasType, ScaleMode SCALE_MODE>
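+// __maxnreg__ caps this kernel at 64 registers per thread; the reduction is
+// bandwidth-bound, so the cap trades potential spills for higher occupancy.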
-__global__ void finalizeMoeRoutingKernel(
+__global__
+__maxnreg__(64)
+void finalizeMoeRoutingKernel(
     GemmOutputType const* expanded_permuted_rows, OutputType* reduced_unpermuted_output,
     ScaleBiasType const* bias, float const* scales, int const* unpermuted_row_to_permuted_row,
-    int const* token_selected_experts, int64_t const orig_cols, int64_t const experts_per_token,
+    int const* token_selected_experts, int64_t const orig_cols, int64_t const experts_per_token_real_,
     int const num_experts_per_node, int const start_expert_id) {
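+  // Hard-code top-k = 8 so the k-way reduction has a compile-time trip count;
+  // trap if a different experts_per_token reaches this specialized kernel.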
+  constexpr int experts_per_token = 8;
+  if (experts_per_token != experts_per_token_real_) { asm("trap;"); }
+
   int64_t const original_row = blockIdx.x;
   int64_t const num_rows = gridDim.x;
   auto const offset = original_row * orig_cols;
@@ -2078,7 +2141,7 @@ __global__ void doGatedActivationKernel(ActivationOutputType* output,
   float gate_bias = 0.0f;
   float gate_limit = std::numeric_limits<float>::infinity();
   if (activation_type.swiglu_alpha || activation_type.swiglu_beta || activation_type.swiglu_limit) {
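+    // NOTE: hard-coded to 128 experts; this path has no 64-expert dispatch, so
+    // the warp-count search traps at runtime if num_experts_per_node differs.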
-    int expert = findTotalEltsLessThanTarget(expert_first_token_offset, num_experts_per_node,
+    int expert = findTotalEltsLessThanTarget<128>(expert_first_token_offset, num_experts_per_node,
                                              (int64_t)token + 1) -
                  1;
     gate_alpha = activation_type.swiglu_alpha ? activation_type.swiglu_alpha[expert] : 1.0f;
@@ -2126,14 +2189,17 @@ void doGatedActivation(ActivationOutputType* output, GemmOutputType const* gemm_
 // ============================== Activation =================================
 
 template <class T, class GemmOutputType, class ScaleBiasType, class ActFn,
-          TmaWarpSpecializedGroupedGemmInput::FpXBlockScalingType BlockScalingType>
+          TmaWarpSpecializedGroupedGemmInput::FpXBlockScalingType BlockScalingType, int NUM_EXPERTS_PER_NODE_CONST = 128>
 __global__ void doActivationKernel(T* output, GemmOutputType const* gemm_result,
                                    float const* fp8_quant, ScaleBiasType const* bias_ptr,
                                    bool bias_is_broadcast, int64_t const* expert_first_token_offset,
-                                   int num_experts_per_node, int64_t inter_size,
+                                   int num_experts_per_node, int64_t inter_size_real_,
                                    float const* fc2_act_global_scale, bool use_per_expert_act_scale,
                                    TmaWarpSpecializedGroupedGemmInput::ElementSF* fc2_act_sf_flat,
                                    ActivationParams activation_params) {
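+  // Hard-code the FFN intermediate size so the per-thread loop bounds below
+  // are compile-time constants; trap if any other shape reaches this kernel.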
+  constexpr int inter_size = 2048;
+  if (inter_size != inter_size_real_) { asm("trap;"); }
+
 #ifdef ENABLE_FP4
   constexpr bool IsNVFP4 =
       std::is_same_v<T, __nv_fp4_e2m1> &&
@@ -2186,7 +2252,7 @@ __global__ void doActivationKernel(T* output, GemmOutputType const* gemm_result,
         activation_params.swiglu_limit) {
       // TODO this is almost certainly faster as a linear scan
       expert =
-          findTotalEltsLessThanTarget(expert_first_token_offset, num_experts_per_node, token + 1) -
+          findTotalEltsLessThanTarget<NUM_EXPERTS_PER_NODE_CONST>(expert_first_token_offset, num_experts_per_node, token + 1) -
           1;
       gate_alpha = activation_params.swiglu_alpha ? activation_params.swiglu_alpha[expert] : 1.0f;
       gate_beta = activation_params.swiglu_beta ? activation_params.swiglu_beta[expert] : 0.0f;
@@ -2218,16 +2284,18 @@ __global__ void doActivationKernel(T* output, GemmOutputType const* gemm_result,
     auto output_vec = reinterpret_cast<OutputElem*>(safe_inc_ptr(output, output_offset));
     auto bias_ptr_vec = reinterpret_cast<BiasElem const*>(bias_ptr + bias_offset);
     int64_t const start_offset = tid;
-    int64_t const stride = ACTIVATION_THREADS_PER_BLOCK;
+    constexpr int64_t stride = ACTIVATION_THREADS_PER_BLOCK;
     assert(inter_size % ACTIVATION_ELEM_PER_THREAD == 0);
-    int64_t const num_elems_in_col = inter_size / ACTIVATION_ELEM_PER_THREAD;
+    constexpr int64_t num_elems_in_col = inter_size / ACTIVATION_ELEM_PER_THREAD;
     assert(gated_off % ACTIVATION_ELEM_PER_THREAD == 0);
     int64_t const gated_off_vec = gated_off / ACTIVATION_ELEM_PER_THREAD;
 
     ActFn fn{};
     fn.alpha = gate_alpha;
     fn.beta = gate_beta;
     fn.limit = gate_limit;
+
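+    // With constexpr bounds and stride, the activation loop can unroll fully.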
+#pragma unroll
     for (int64_t elem_index = start_offset; elem_index < num_elems_in_col; elem_index += stride) {
       auto fc1_value =
           arrayConvert<GemmResultElem, ComputeElem>(gemm_result_vec[elem_index + gated_off_vec]);
@@ -2358,30 +2426,62 @@ void doActivation(T* output, GemmOutputType const* gemm_result, float const* fp8
 
   auto fn = [&]() {
     auto fn = [&](auto block_scaling_type) {
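+      // Same expert-count dispatch as in expandInputRowsKernelLauncher: one
+      // kernel table per supported NUM_EXPERTS_PER_NODE_CONST value.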
-      auto fn_list = std::array{
-          &doActivationKernel<T, GemmOutputType, ScaleBiasType,
-                              IdentityAdaptor<cutlass::epilogue::thread::GELU>,
-                              decltype(block_scaling_type)::value>,  // Gelu
-          &doActivationKernel<T, GemmOutputType, ScaleBiasType,
-                              IdentityAdaptor<cutlass::epilogue::thread::ReLu>,
-                              decltype(block_scaling_type)::value>,  // Relu
-          &doActivationKernel<T, GemmOutputType, ScaleBiasType,
-                              IdentityAdaptor<cutlass::epilogue::thread::SiLu>,
-                              decltype(block_scaling_type)::value>,  // Silu
-          &doActivationKernel<T, GemmOutputType, ScaleBiasType,
-                              GLUAdaptor<cutlass::epilogue::thread::SiLu>,
-                              decltype(block_scaling_type)::value>,  // Swiglu
-          &doActivationKernel<T, GemmOutputType, ScaleBiasType,
-                              GLUAdaptor<cutlass::epilogue::thread::GELU>,
-                              decltype(block_scaling_type)::value>,  // Geglu
-          &doActivationKernel<T, GemmOutputType, ScaleBiasType, SwigluBiasAdaptor,
-                              decltype(block_scaling_type)::value>,  // SwigluBias
-          &doActivationKernel<T, GemmOutputType, ScaleBiasType,
-                              IdentityAdaptor<cutlass::epilogue::thread::Identity>,
-                              decltype(block_scaling_type)::value>  // Identity
-
-      };
-      return fn_list[static_cast<int>(activation_type.activation_type)];
+      if (num_experts_per_node == 128) {
+        constexpr int NUM_EXPERTS_PER_NODE_CONST = 128;
+        auto fn_list = std::array{
+            &doActivationKernel<T, GemmOutputType, ScaleBiasType,
+                                IdentityAdaptor<cutlass::epilogue::thread::GELU>,
+                                decltype(block_scaling_type)::value, NUM_EXPERTS_PER_NODE_CONST>,  // Gelu
+            &doActivationKernel<T, GemmOutputType, ScaleBiasType,
+                                IdentityAdaptor<cutlass::epilogue::thread::ReLu>,
+                                decltype(block_scaling_type)::value, NUM_EXPERTS_PER_NODE_CONST>,  // Relu
+            &doActivationKernel<T, GemmOutputType, ScaleBiasType,
+                                IdentityAdaptor<cutlass::epilogue::thread::SiLu>,
+                                decltype(block_scaling_type)::value, NUM_EXPERTS_PER_NODE_CONST>,  // Silu
+            &doActivationKernel<T, GemmOutputType, ScaleBiasType,
+                                GLUAdaptor<cutlass::epilogue::thread::SiLu>,
+                                decltype(block_scaling_type)::value, NUM_EXPERTS_PER_NODE_CONST>,  // Swiglu
+            &doActivationKernel<T, GemmOutputType, ScaleBiasType,
+                                GLUAdaptor<cutlass::epilogue::thread::GELU>,
+                                decltype(block_scaling_type)::value, NUM_EXPERTS_PER_NODE_CONST>,  // Geglu
+            &doActivationKernel<T, GemmOutputType, ScaleBiasType, SwigluBiasAdaptor,
+                                decltype(block_scaling_type)::value, NUM_EXPERTS_PER_NODE_CONST>,  // SwigluBias
+            &doActivationKernel<T, GemmOutputType, ScaleBiasType,
+                                IdentityAdaptor<cutlass::epilogue::thread::Identity>,
+                                decltype(block_scaling_type)::value, NUM_EXPERTS_PER_NODE_CONST>  // Identity
+        };
+        return fn_list[static_cast<int>(activation_type.activation_type)];
+      }
+      if (num_experts_per_node == 64) {
+        constexpr int NUM_EXPERTS_PER_NODE_CONST = 64;
+        auto fn_list = std::array{
+            &doActivationKernel<T, GemmOutputType, ScaleBiasType,
+                                IdentityAdaptor<cutlass::epilogue::thread::GELU>,
+                                decltype(block_scaling_type)::value, NUM_EXPERTS_PER_NODE_CONST>,  // Gelu
+            &doActivationKernel<T, GemmOutputType, ScaleBiasType,
+                                IdentityAdaptor<cutlass::epilogue::thread::ReLu>,
+                                decltype(block_scaling_type)::value, NUM_EXPERTS_PER_NODE_CONST>,  // Relu
+            &doActivationKernel<T, GemmOutputType, ScaleBiasType,
+                                IdentityAdaptor<cutlass::epilogue::thread::SiLu>,
+                                decltype(block_scaling_type)::value, NUM_EXPERTS_PER_NODE_CONST>,  // Silu
+            &doActivationKernel<T, GemmOutputType, ScaleBiasType,
+                                GLUAdaptor<cutlass::epilogue::thread::SiLu>,
+                                decltype(block_scaling_type)::value, NUM_EXPERTS_PER_NODE_CONST>,  // Swiglu
+            &doActivationKernel<T, GemmOutputType, ScaleBiasType,
+                                GLUAdaptor<cutlass::epilogue::thread::GELU>,
+                                decltype(block_scaling_type)::value, NUM_EXPERTS_PER_NODE_CONST>,  // Geglu
+            &doActivationKernel<T, GemmOutputType, ScaleBiasType, SwigluBiasAdaptor,
+                                decltype(block_scaling_type)::value, NUM_EXPERTS_PER_NODE_CONST>,  // SwigluBias
+            &doActivationKernel<T, GemmOutputType, ScaleBiasType,
+                                IdentityAdaptor<cutlass::epilogue::thread::Identity>,
+                                decltype(block_scaling_type)::value, NUM_EXPERTS_PER_NODE_CONST>  // Identity
+        };
+        return fn_list[static_cast<int>(activation_type.activation_type)];
+      }
+      printf("unsupported num_experts_per_node\n");
+      exit(1);
     };
     auto NVFP4 = tensorrt_llm::common::ConstExprWrapper<
         TmaWarpSpecializedGroupedGemmInput::FpXBlockScalingType,
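An aside on the duplication above (a sketch under assumptions, not part of the commit): the 128- and 64-expert tables differ only in the trailing template argument, so a helper templated on the expert count could build the table once. `makeActivationKernelTable` is a hypothetical name; it reuses the adaptor types already defined in this file and takes the block-scaling enum value directly instead of `decltype(block_scaling_type)::value`.

template <class T, class GemmOutputType, class ScaleBiasType,
          TmaWarpSpecializedGroupedGemmInput::FpXBlockScalingType BlockScalingType,
          int NUM_EXPERTS_PER_NODE_CONST>
auto makeActivationKernelTable() {
  // Index order must match ActivationType: Gelu, Relu, Silu, Swiglu, Geglu,
  // SwigluBias, Identity.
  return std::array{
      &doActivationKernel<T, GemmOutputType, ScaleBiasType,
                          IdentityAdaptor<cutlass::epilogue::thread::GELU>,
                          BlockScalingType, NUM_EXPERTS_PER_NODE_CONST>,
      &doActivationKernel<T, GemmOutputType, ScaleBiasType,
                          IdentityAdaptor<cutlass::epilogue::thread::ReLu>,
                          BlockScalingType, NUM_EXPERTS_PER_NODE_CONST>,
      &doActivationKernel<T, GemmOutputType, ScaleBiasType,
                          IdentityAdaptor<cutlass::epilogue::thread::SiLu>,
                          BlockScalingType, NUM_EXPERTS_PER_NODE_CONST>,
      &doActivationKernel<T, GemmOutputType, ScaleBiasType,
                          GLUAdaptor<cutlass::epilogue::thread::SiLu>,
                          BlockScalingType, NUM_EXPERTS_PER_NODE_CONST>,
      &doActivationKernel<T, GemmOutputType, ScaleBiasType,
                          GLUAdaptor<cutlass::epilogue::thread::GELU>,
                          BlockScalingType, NUM_EXPERTS_PER_NODE_CONST>,
      &doActivationKernel<T, GemmOutputType, ScaleBiasType, SwigluBiasAdaptor,
                          BlockScalingType, NUM_EXPERTS_PER_NODE_CONST>,
      &doActivationKernel<T, GemmOutputType, ScaleBiasType,
                          IdentityAdaptor<cutlass::epilogue::thread::Identity>,
                          BlockScalingType, NUM_EXPERTS_PER_NODE_CONST>};
}

The dispatch in both launchers would then reduce to two one-line branches selecting `makeActivationKernelTable<..., 128>()` or `makeActivationKernelTable<..., 64>()` and indexing with `static_cast<int>(activation_type.activation_type)`.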