@@ -31,6 +31,17 @@ __inline__ constexpr int ceil_of_ratio(int a, int b) {
   return (a + b - 1) / b;
 };
 
+template <typename T>
+__inline__ T* get_ptr(std::optional<at::Tensor> tensor) {
+  return reinterpret_cast<T*>(
+      tensor.has_value() ? tensor->data_ptr() : nullptr);
+};
+
+template <typename T>
+__inline__ __device__ T get_item(const T* ptr, const T& default_value) {
+  return ptr != nullptr ? *ptr : default_value;
+};
+
 #ifdef USE_ROCM
 __device__ __forceinline__ int atomic_add_relaxed(int* addr, int inc) {
   return __hip_atomic_fetch_add(
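For reference, a sketch of how the two new helpers compose; both usage lines are taken from later hunks in this patch and are not part of the diff at this point:

    // Host side: unwrap the optional tensor into a raw pointer, or nullptr if absent.
    .valid_token_count = get_ptr<IndexType>(valid_token_count),
    // Device side: dereference the pointer if non-null, otherwise fall back to the
    // default, so std::optional never reaches device code.
    const int num_valid_tokens = get_item(params.valid_token_count, num_total_tokens);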
@@ -71,24 +82,29 @@ __device__ __forceinline__ int load_aquire(int* addr) {
 
 template <class DataType, class IndexType, int NumExperts, int NumTokensPerTile>
 struct SharedStorage {
-  DataType scores[NumTokensPerTile * NumExperts];
+  DataType routing_scores[NumTokensPerTile * NumExperts];
   IndexType expert_indices[NumTokensPerTile * NumExperts];
-  IndexType expert_count_cumsums[NumExperts];
+  IndexType token_count_cumsums[NumExperts];
 };
 
 template <class DataType, class IndexType>
 struct Params {
-  const DataType* scores;
-  int stride_t_;
-  int stride_e_;
-  int num_tokens;
-  int num_tokens_per_cta;
-  IndexType* counts;
-  IndexType* expert_indices;
-  IndexType* token_indices;
+  // Inputs
+  const DataType* routing_scores;
+  const int stride_t;
+  const int stride_e;
+  const IndexType* valid_token_count;
+  const int num_tokens;
+  const int num_tokens_per_cta;
+
+  // Buffer
+  IndexType* buffered_expert_indices;
+  IndexType* buffered_token_indices;
+
+  // Outputs
+  IndexType* token_count_per_expert;
   IndexType* shuffled_expert_indices;
   IndexType* shuffled_token_indices;
-  IndexType* num_valid_tokens;
 };
 
 template <class DataType, class IndexType, int NumExperts, int NumTokensPerTile>
@@ -106,7 +122,7 @@ __global__ void index_shuffling_kernel(Params<DataType, IndexType> params) {
 
   const int num_total_tokens = params.num_tokens;
   const int num_valid_tokens =
-      params.num_valid_tokens ? *params.num_valid_tokens : num_total_tokens;
+      get_item(params.valid_token_count, num_total_tokens);
 
   const int token_index_offset_start = bidx * params.num_tokens_per_cta;
   const int token_index_offset_end = std::min(
@@ -116,8 +132,8 @@ __global__ void index_shuffling_kernel(Params<DataType, IndexType> params) {
     return;
   }
 
-  const int stride_t_ = params.stride_t_;
-  const int stride_e_ = params.stride_e_;
+  const int stride_t = params.stride_t;
+  const int stride_e = params.stride_e;
 
   for (int token_index_offset = token_index_offset_start;
        token_index_offset < token_index_offset_end;
@@ -129,8 +145,9 @@ __global__ void index_shuffling_kernel(Params<DataType, IndexType> params) {
       int token_index = token_index_offset + i / NumExperts;
       int expert_index = i % NumExperts;
 
-      smem.scores[i] = token_index < num_valid_tokens
-          ? params.scores[token_index * stride_t_ + expert_index * stride_e_]
+      smem.routing_scores[i] = token_index < num_valid_tokens
+          ? params.routing_scores
+                [token_index * stride_t + expert_index * stride_e]
          : static_cast<DataType>(0.0f);
       smem.expert_indices[i] = expert_index;
     }
@@ -160,13 +177,14 @@ __global__ void index_shuffling_kernel(Params<DataType, IndexType> params) {
           (tidx % kNumParallelReductionThreads) * 2;
       int rhs_smem_index = lhs_smem_index + num_reduced_threads;
 
-      auto lhs_score = smem.scores[lhs_smem_index];
-      auto rhs_score = smem.scores[rhs_smem_index];
+      auto lhs_score = smem.routing_scores[lhs_smem_index];
+      auto rhs_score = smem.routing_scores[rhs_smem_index];
       auto lhs_expert_index = smem.expert_indices[lhs_smem_index];
       auto rhs_expert_index = smem.expert_indices[rhs_smem_index];
 
       bool lhs_larger = lhs_score >= rhs_score;
-      smem.scores[lhs_smem_index] = lhs_larger ? lhs_score : rhs_score;
+      smem.routing_scores[lhs_smem_index] =
+          lhs_larger ? lhs_score : rhs_score;
       smem.expert_indices[lhs_smem_index] =
           lhs_larger ? lhs_expert_index : rhs_expert_index;
     }
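For context, a descriptive note (not part of the diff): this loop is a pairwise argmax reduction in shared memory. Each step compares the (score, expert index) pair at lhs_smem_index with its partner at rhs_smem_index and keeps the winner at lhs_smem_index, so after the reduction finishes, slot [local_token_index * NumExperts] holds the top-1 expert for that token, which is what the scatter step below reads.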
@@ -193,17 +211,17 @@ __global__ void index_shuffling_kernel(Params<DataType, IndexType> params) {
       if (token_index < num_valid_tokens) {
         auto expert_index = smem.expert_indices[local_token_index * NumExperts];
         auto token_index_in_expert =
-            atomic_add_relaxed(&params.counts[expert_index], 1);
-        params.expert_indices[token_index] = expert_index;
-        params.token_indices[token_index] = token_index_in_expert;
+            atomic_add_relaxed(&params.token_count_per_expert[expert_index], 1);
+        params.buffered_expert_indices[token_index] = expert_index;
+        params.buffered_token_indices[token_index] = token_index_in_expert;
       }
     }
     __syncthreads();
   }
 
   if (tidx == 0) {
     int processed_tokens = 0;
-    int* processed_tokens_addr = &params.counts[NumExperts];
+    int* processed_tokens_addr = &params.token_count_per_expert[NumExperts];
 
     int inc = token_index_offset_end - token_index_offset_start;
     atomic_add_release(processed_tokens_addr, inc);
@@ -217,15 +235,15 @@ __global__ void index_shuffling_kernel(Params<DataType, IndexType> params) {
   // 4. Scan
   static_assert(kNumThreads >= NumExperts, "");
   if (tidx < NumExperts) {
-    smem.expert_count_cumsums[tidx] = params.counts[tidx];
+    smem.token_count_cumsums[tidx] = params.token_count_per_expert[tidx];
   }
   __syncthreads();
 
   if (tidx == 0) {
     // TODO(shikaili): parallel.
 #pragma unroll
     for (int i = 1; i < NumExperts; ++i) {
-      smem.expert_count_cumsums[i] += smem.expert_count_cumsums[i - 1];
+      smem.token_count_cumsums[i] += smem.token_count_cumsums[i - 1];
     }
   }
   __syncthreads();
@@ -236,11 +254,10 @@ __global__ void index_shuffling_kernel(Params<DataType, IndexType> params) {
        global_token_offset += kNumThreads) {
     int token_index = global_token_offset + tidx;
     if (token_index < num_valid_tokens) {
-      int expert_index = params.expert_indices[token_index];
-      int token_index_in_expert = params.token_indices[token_index];
+      int expert_index = params.buffered_expert_indices[token_index];
+      int token_index_in_expert = params.buffered_token_indices[token_index];
       int new_token_index =
-          (expert_index == 0 ? 0
-                             : smem.expert_count_cumsums[expert_index - 1]) +
+          (expert_index == 0 ? 0 : smem.token_count_cumsums[expert_index - 1]) +
           token_index_in_expert;
       params.shuffled_expert_indices[new_token_index] = expert_index;
       params.shuffled_token_indices[new_token_index] = token_index;
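For reference, a worked example of the scatter above with hypothetical counts (the numbers are illustrative, not from this patch):

    // With NumExperts = 4 and token_count_per_expert = {3, 1, 2, 4}, the inclusive
    // cumsum held in smem.token_count_cumsums is {3, 4, 6, 10}. A token routed to
    // expert_index = 2 with token_index_in_expert = 1 lands at
    //   new_token_index = token_count_cumsums[1] + 1 = 4 + 1 = 5,
    // i.e. the second slot of expert 2's contiguous block [4, 6).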
@@ -255,42 +272,37 @@ __global__ void index_shuffling_kernel(Params<DataType, IndexType> params) {
 } // namespace
 
 std::tuple<at::Tensor, at::Tensor, at::Tensor> index_shuffling_torch(
-    const at::Tensor& scores,
-    std::optional<at::Tensor> num_valid_tokens) {
-  TORCH_CHECK(scores.dtype() == torch::kBFloat16);
+    const at::Tensor& routing_scores,
+    std::optional<at::Tensor> valid_token_count) {
+  TORCH_CHECK(routing_scores.dtype() == torch::kBFloat16);
   using DataType = __nv_bfloat16;
   using IndexType = int32_t;
 
-  TORCH_CHECK(scores.dim() == 2);
-  const int num_tokens = scores.size(0);
-  const int num_experts = scores.size(1);
+  TORCH_CHECK(routing_scores.dim() == 2);
+  const int num_tokens = routing_scores.size(0);
+  const int num_experts = routing_scores.size(1);
   TORCH_CHECK(num_experts == 16 || num_experts == 128);
 
   auto allocate_index_tensor = [&](int size) {
     return at::empty(
-        {size}, at::TensorOptions().dtype(at::kInt).device(scores.device()));
+        {size},
+        at::TensorOptions().dtype(at::kInt).device(routing_scores.device()));
   };
-  at::Tensor counts = allocate_index_tensor(num_experts + 1);
-  at::Tensor expert_indices = allocate_index_tensor(num_tokens);
-  at::Tensor token_indices = allocate_index_tensor(num_tokens);
+  at::Tensor token_count_per_expert = allocate_index_tensor(num_experts + 1);
   at::Tensor shuffled_expert_indices = allocate_index_tensor(num_tokens);
   at::Tensor shuffled_token_indices = allocate_index_tensor(num_tokens);
+  at::Tensor buffered_expert_indices = allocate_index_tensor(num_tokens);
+  at::Tensor buffered_token_indices = allocate_index_tensor(num_tokens);
 
 #ifdef USE_ROCM
-  counts.zero_();
   // TODO(shikaili): hipMetsetAsync is more expensive than ATen set zero.
-  /*
-  hipMemsetAsync(
-      counts.data_ptr(),
-      0,
-      counts.numel() * counts.dtype().itemsize(),
-      at::cuda::getCurrentCUDAStream());
-  */
+  token_count_per_expert.zero_();
 #else
   cudaMemsetAsync(
-      counts.data_ptr(),
+      token_count_per_expert.data_ptr(),
       0,
-      counts.numel() * counts.dtype().itemsize(),
+      token_count_per_expert.numel() *
+          token_count_per_expert.dtype().itemsize(),
       at::cuda::getCurrentCUDAStream());
 #endif
 
@@ -323,26 +335,30 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> index_shuffling_torch(
 
   const int num_tiles = ceil_of_ratio(num_tokens, kNumTokensPerTile);
   const int num_ctas = std::min(num_tiles, num_sms);
-
-  int num_tokens_per_cta = ceil_of_ratio(num_tokens, num_ctas);
   const int num_tiles_per_cta =
-      ceil_of_ratio(num_tokens_per_cta, kNumTokensPerTile);
-  num_tokens_per_cta = num_tiles_per_cta * kNumTokensPerTile;
+      ceil_of_ratio(ceil_of_ratio(num_tokens, num_ctas), kNumTokensPerTile);
+  const int num_tokens_per_cta = num_tiles_per_cta * kNumTokensPerTile;
 
   Params<DataType, IndexType> params = {
-      reinterpret_cast<DataType*>(scores.data_ptr()),
-      static_cast<int>(scores.stride(0)),
-      static_cast<int>(scores.stride(1)),
-      num_tokens,
-      num_tokens_per_cta,
-      reinterpret_cast<IndexType*>(counts.data_ptr()),
-      reinterpret_cast<IndexType*>(expert_indices.data_ptr()),
-      reinterpret_cast<IndexType*>(token_indices.data_ptr()),
-      reinterpret_cast<IndexType*>(shuffled_expert_indices.data_ptr()),
-      reinterpret_cast<IndexType*>(shuffled_token_indices.data_ptr()),
-      reinterpret_cast<IndexType*>(
-          num_valid_tokens.has_value() ? num_valid_tokens->data_ptr()
-                                       : nullptr)};
+      // Inputs
+      .routing_scores = reinterpret_cast<DataType*>(routing_scores.data_ptr()),
+      .stride_t = static_cast<int>(routing_scores.stride(0)),
+      .stride_e = static_cast<int>(routing_scores.stride(1)),
+      .valid_token_count = get_ptr<IndexType>(valid_token_count),
+      .num_tokens = num_tokens,
+      .num_tokens_per_cta = num_tokens_per_cta,
+      // Buffer
+      .buffered_expert_indices =
+          reinterpret_cast<IndexType*>(buffered_expert_indices.data_ptr()),
+      .buffered_token_indices =
+          reinterpret_cast<IndexType*>(buffered_token_indices.data_ptr()),
+      // Outputs
+      .token_count_per_expert =
+          reinterpret_cast<IndexType*>(token_count_per_expert.data_ptr()),
+      .shuffled_expert_indices =
+          reinterpret_cast<IndexType*>(shuffled_expert_indices.data_ptr()),
+      .shuffled_token_indices =
+          reinterpret_cast<IndexType*>(shuffled_token_indices.data_ptr())};
 
   dim3 grids(num_ctas);
   dim3 blocks(kNumThreads);
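For reference, the tiling arithmetic above with hypothetical values; kNumTokensPerTile = 16, num_sms = 8, and num_tokens = 1000 are assumptions for illustration only:

    // num_tiles          = ceil_of_ratio(1000, 16)                  = 63
    // num_ctas           = std::min(63, 8)                          = 8
    // num_tiles_per_cta  = ceil_of_ratio(ceil_of_ratio(1000, 8), 16)
    //                    = ceil_of_ratio(125, 16)                   = 8
    // num_tokens_per_cta = 8 * 16                                   = 128
    // so 8 CTAs x 128 tokens per CTA covers all 1000 tokens in whole tiles.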
@@ -360,7 +376,7 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> index_shuffling_torch(
 #endif
 
   return std::make_tuple(
-      counts, shuffled_expert_indices, shuffled_token_indices);
+      token_count_per_expert, shuffled_expert_indices, shuffled_token_indices);
 }
 
 } // namespace fbgemm_gpu
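For reference, a hypothetical caller of the updated entry point; the tensor shape, the at::randn call, and passing std::nullopt are illustrative assumptions, not taken from this patch:

    // Build a bf16 [num_tokens, num_experts] routing-score tensor on the GPU.
    auto routing_scores =
        at::randn({4096, 16}, at::dtype(at::kBFloat16).device(at::kCUDA));
    // Run index shuffling without a valid-token-count tensor: all tokens are valid.
    auto [token_count_per_expert, shuffled_expert_indices, shuffled_token_indices] =
        index_shuffling_torch(routing_scores, /*valid_token_count=*/std::nullopt);
    // token_count_per_expert has num_experts + 1 entries; the extra slot [num_experts]
    // is the kernel's internal cross-CTA "processed tokens" counter.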