
Commit e644e26

facebook-github-bot authored and committed
fix fbgemm build issues after upgrading CK (#4517)
Summary:
Pull Request resolved: #4517

X-link: facebookresearch/FBGEMM#1565

Some files are copied over from CK, so they need to be updated after the CK upgrade.

Reviewed By: q10

Differential Revision: D78455742
1 parent fa50579 commit e644e26

File tree

5 files changed: +285 −113 lines changed

fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fused_moe/fused_moe.hpp

Lines changed: 3 additions & 0 deletions
@@ -16,6 +16,7 @@ struct fused_moe_args {
     const void*
         y_smooth_scale_ptr; // [e, 1, n], smooth-quant-scale for 2nd gemm input
     const void* local_expert_mask_ptr; // [e], local_expert_mask_ptr for EP
+    const void* local_tokens; // [1] if not nullptr, tokens read from here
     void* o_ptr;  // [m, k], output token (no need to do zeroing)
     void* ws_ptr; // size is moe_sorting_get_workspace_size()
                   // if return zero, then could be nullptr
@@ -58,6 +59,8 @@ struct fused_moe_traits {
     bool local_expert_masking; // if mask experts as local expert
 };

+// if return zero, no ws needed
+int fused_moe_get_workspace_size(int tokens, int num_experts, int topk);
 float fused_moe(
     fused_moe_traits,
     fused_moe_args,
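
For orientation, here is a minimal caller-side sketch of the new workspace helper. This is a sketch only: the function name setup_workspace and the variable names are illustrative, and it assumes the returned size counts elements of the topk_ids dtype, which is how the kernel caller in fused_moe_kernel.hip below allocates it.

#include <ATen/ATen.h>
#include "fused_moe.hpp"

// Sketch: allocate the optional sorting workspace for one launch.
// Returns the tensor so the buffer outlives the kernel; args.ws_ptr may
// legitimately stay nullptr ("if return zero, no ws needed").
at::Tensor setup_workspace(const at::Tensor& topk_ids,
                           int num_tokens, int num_experts, int topk,
                           fused_moe_args& args) {
    at::Tensor ws;
    args.ws_ptr = nullptr;
    const int workspace_size =
        fused_moe_get_workspace_size(num_tokens, num_experts, topk);
    if (workspace_size > 0) {
        // Same recipe as fused_moe_kernel.hip: workspace_size elements of
        // the topk_ids dtype, zero-initialized, on the same device.
        ws = at::zeros({workspace_size},
                       at::TensorOptions()
                           .dtype(topk_ids.dtype())
                           .device(device_of(topk_ids)));
        args.ws_ptr = ws.data_ptr();
    }
    return ws;
}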

fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fused_moe/fused_moe_kernel.hip

Lines changed: 4 additions & 3 deletions
@@ -77,14 +77,14 @@ at::Tensor fused_moe_impl(
   auto prec_o = get_prec_str(output);
   auto prec_tkw = get_prec_str(topk_weights);

-  int workspace_size = ck_tile::moe_sorting_get_workspace_size(tokens, experts);
+  int workspace_size = ck_tile::moe_sorting_get_workspace_size(tokens, experts, topk, 0 /*dispatch policy*/);
   void *ws_ptr = nullptr;
   if (workspace_size > 0)
   {
     auto ws = at::zeros({workspace_size}, at::TensorOptions().dtype(topk_ids.dtype()).device(device_of(topk_ids)));
     ws_ptr = ws.data_ptr();
   }
-
+

   // Set up traits structure
   fused_moe_traits traits{
@@ -109,7 +109,8 @@ at::Tensor fused_moe_impl(
       gate_up_scales.has_value() ? gate_up_scales->data_ptr() : nullptr,
       down_scales.has_value() ? down_scales->data_ptr() : nullptr,
       smooth_scales.has_value() ? smooth_scales->data_ptr() : nullptr, // expert_mask
-      nullptr,
+      nullptr, // local_expert_mask_ptr
+      nullptr, // local_tokens
       output.data_ptr(),
       ws_ptr,
       topk_ids.data_ptr(),
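
Why the extra nullptr in this hunk: the argument struct is brace-initialized positionally, so the new local_tokens member (declared right after local_expert_mask_ptr) needs an explicit placeholder at every call site, or every later argument silently shifts by one slot. As an aside, a hedged sketch of a C++20 alternative that avoids this class of breakage (not what this commit does; field names taken from fused_moe.hpp, earlier members elided for brevity):

// Sketch (C++20): designated initializers must follow declaration order,
// and omitted members are value-initialized (nullptr for pointers), so a
// newly added member cannot silently misalign the rest of the list.
fused_moe_args args{
    // ... earlier input/scale pointer members would be designated here ...
    .local_expert_mask_ptr = nullptr,
    .local_tokens          = nullptr, // member added in this commit
    .o_ptr                 = output.data_ptr(),
    .ws_ptr                = ws_ptr,
    // ... remaining members ...
};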

fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fused_moe/fused_moesorting.hpp

Lines changed: 1 addition & 6 deletions
@@ -15,13 +15,8 @@ struct fused_moesorting_trait {

 struct fused_moesorting_args : public ck_tile::MoeSortingHostArgs {};

+int fused_moe_get_workspace_size(int tokens, int num_experts, int topk);
 float fused_moesorting(
     fused_moesorting_trait t,
     fused_moesorting_args a,
     ck_tile::stream_config s);
-
-int moe_sorting_get_workspace_size(int tokens, int num_experts);
-float moe_sorting_mp(
-    fused_moesorting_trait t,
-    fused_moesorting_args a,
-    ck_tile::stream_config s);

fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fused_moe/instances/fused_moe_api.hip

Lines changed: 28 additions & 15 deletions
@@ -3,6 +3,12 @@

 #include "fused_moe.hpp"

+int fused_moe_get_workspace_size(int tokens, int num_experts, int topk)
+{
+    return ck_tile::moe_sorting_get_workspace_size(
+        tokens, num_experts, topk, 0 /*dispatch policy*/);
+}
+
 float fused_moe(fused_moe_traits t, fused_moe_args a, const ck_tile::stream_config& s)
 {
     auto s_sub = ck_tile::stream_config{s.stream_id_, false, s.log_level_, 0, 1};
@@ -18,21 +24,28 @@ float fused_moe(fused_moe_traits t, fused_moe_args a, const ck_tile::stream_conf
     }();

     auto t0 = fused_moesorting_trait{"int32", "fp32", t.local_expert_masking};
-    auto a0 = fused_moesorting_args{
-        a.topk_ids_ptr,          // const void* p_topk_ids;
-        a.topk_weight_ptr,       // const void* p_weights;
-        a.local_expert_mask_ptr, // const void* p_local_expert_mask;
-        a.sorted_token_ids_ptr,  // void* p_sorted_token_ids;
-        a.sorted_weight_ptr,     // void* p_sorted_weights;
-        a.sorted_expert_ids_ptr, // void* p_sorted_expert_ids;
-        a.num_sorted_tiles_ptr,  // void* p_total_tokens_post_pad;
-        a.o_ptr,                 // void* p_moe_buf;
-        a.ws_ptr,                // moe_sorting_ws
-        a.num_tokens,            // index_t tokens;
-        a.block_m,               // index_t unit_size;
-        a.num_experts,           // index_t num_experts;
-        a.topk,                  // index_t topk;
-        a.num_tokens * a.stride_token * o_data_bytes // index_t moe_buf_bytes;
+    auto a0 = fused_moesorting_args
+    {
+        a.topk_ids_ptr,          // const void* p_topk_ids;
+        a.topk_weight_ptr,       // const void* p_weights;
+        a.local_expert_mask_ptr, // const void* p_local_expert_mask;
+        a.local_tokens,
+        a.sorted_token_ids_ptr,  // void* p_sorted_token_ids;
+        a.sorted_weight_ptr,     // void* p_sorted_weights;
+        a.sorted_expert_ids_ptr, // void* p_sorted_expert_ids;
+        a.num_sorted_tiles_ptr,  // void* p_total_tokens_post_pad;
+        a.o_ptr,                 // void* p_moe_buf;
+        a.ws_ptr,                // void* p_ws;
+        a.num_tokens,            // index_t tokens;
+        a.block_m,               // index_t unit_size;
+        a.num_experts,           // index_t num_experts;
+        a.topk,                  // index_t topk;
+#if MOE_SORTING_FMOE_2D_BUF
+        a.stride_token, o_data_bytes,
+#else
+        static_cast<ck_tile::long_index_t>(a.num_tokens) *
+            a.stride_token * o_data_bytes // index_t moe_buf_bytes;
+#endif
     };

     auto t1 = fused_moegemm_traits{t.prec_i,
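
One detail in the fallback branch above: the new static_cast<ck_tile::long_index_t> widens the moe_buf_bytes product before it can overflow. All three factors are int-sized, and for large batches their product exceeds 32-bit range. A minimal sketch of the hazard, with illustrative numbers; long long stands in for ck_tile::long_index_t, which is assumed here to be a 64-bit type:

#include <cstdio>

int main() {
    int num_tokens   = 70000; // illustrative
    int stride_token = 8192;  // elements between consecutive token rows
    int o_data_bytes = 4;     // e.g. fp32 output
    // 70000 * 8192 * 4 = 2,293,760,000 > INT32_MAX (2,147,483,647),
    // so the all-int product overflows (undefined behavior for signed int).
    long long moe_buf_bytes =
        static_cast<long long>(num_tokens) * stride_token * o_data_bytes;
    std::printf("%lld\n", moe_buf_bytes); // prints 2293760000
    return 0;
}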
