Migrate TBE cache kernels to FBGEMM_LAUNCH_KERNEL (#4127)

q10 · facebook-github-bot · commit 127848a12618 · 2025-05-15T16:19:25.000-07:00
Summary: X-link: facebookresearch/FBGEMM#1208 Pull Request resolved: #4127 - Migrate TBE cache kernels to `FBGEMM_LAUNCH_KERNEL` Reviewed By: spcyppt Differential Revision: D74272500 fbshipit-source-id: 98c71b6286d3d7aad565cb1ae51111fac37069a5
diff --git a/fbgemm_gpu/src/split_embeddings_cache/common.cuh b/fbgemm_gpu/src/split_embeddings_cache/common.cuh
@@ -34,6 +34,7 @@
 #include "fbgemm_gpu/utils/cuda_prelude.cuh"
 #include "fbgemm_gpu/utils/find_qparams.cuh"
 #include "fbgemm_gpu/utils/fixed_divisor.cuh"
+#include "fbgemm_gpu/utils/kernel_launcher.cuh"
 #include "fbgemm_gpu/utils/stochastic_rounding.cuh"
 #include "fbgemm_gpu/utils/vec4.cuh"
 #include "fbgemm_gpu/utils/vec4acc.cuh"
diff --git a/fbgemm_gpu/src/split_embeddings_cache/lfu_cache_populate.cu b/fbgemm_gpu/src/split_embeddings_cache/lfu_cache_populate.cu
@@ -198,35 +198,27 @@ void lfu_cache_insert_cuda(
                                   ->philox_cuda_state(4);
         }
 
-#ifdef FBGEMM_GPU_MEMCHECK
-        const char* func_name = "lfu_cache_insert_kernel";
-#endif
-
-        lfu_cache_insert_kernel<emb_t, cache_t>
-            <<<std::min(
-                   div_round_up(N, kCacheMaxThreads / kWarpSize),
-                   get_max_thread_blocks_for_cache_kernels_()),
-               dim3(kWarpSize, kCacheMaxThreads / kWarpSize),
-               0,
-               at::cuda::getCurrentCUDAStream()>>>(
-                MAKE_PTA_WITH_NAME(func_name, weights, emb_t, 1, 64),
-                MAKE_PTA_WITH_NAME(
-                    func_name, cache_hash_size_cumsum, int64_t, 1, 32),
-                MAKE_PTA_WITH_NAME(
-                    func_name, cache_index_table_map, int32_t, 1, 64),
-                MAKE_PTA_WITH_NAME(func_name, weights_offsets, int64_t, 1, 32),
-                MAKE_PTA_WITH_NAME(func_name, D_offsets, int32_t, 1, 32),
-                (uint64_t*)sorted_cache_sets.data_ptr<int64_t>(),
-                MAKE_PTA_WITH_NAME(
-                    func_name, cache_set_sorted_unique_indices, int64_t, 1, 32),
-                unique_indices_length.data_ptr<int32_t>(),
-                MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32),
-                MAKE_PTA_WITH_NAME(
-                    func_name, lxu_cache_weights, cache_t, 2, 64),
-                MAKE_PTA_WITH_NAME(func_name, lfu_state, int64_t, 1, 64),
-                stochastic_rounding_,
-                rng_engine_inputs);
-        C10_CUDA_KERNEL_LAUNCH_CHECK();
+        FBGEMM_LAUNCH_KERNEL(
+            (lfu_cache_insert_kernel<emb_t, cache_t>),
+            std::min(
+                div_round_up(N, kCacheMaxThreads / kWarpSize),
+                get_max_thread_blocks_for_cache_kernels_()),
+            dim3(kWarpSize, kCacheMaxThreads / kWarpSize),
+            0,
+            at::cuda::getCurrentCUDAStream(),
+            PTA_B(weights, emb_t, 1, 64),
+            PTA_B(cache_hash_size_cumsum, int64_t, 1, 32),
+            PTA_B(cache_index_table_map, int32_t, 1, 64),
+            PTA_B(weights_offsets, int64_t, 1, 32),
+            PTA_B(D_offsets, int32_t, 1, 32),
+            (uint64_t*)sorted_cache_sets.data_ptr<int64_t>(),
+            PTA_B(cache_set_sorted_unique_indices, int64_t, 1, 32),
+            unique_indices_length.data_ptr<int32_t>(),
+            PTA_B(lxu_cache_state, int64_t, 2, 32),
+            PTA_B(lxu_cache_weights, cache_t, 2, 64),
+            PTA_B(lfu_state, int64_t, 1, 64),
+            stochastic_rounding_,
+            rng_engine_inputs);
       }));
 }
 
diff --git a/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate.cu b/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate.cu
@@ -228,39 +228,30 @@ void lru_cache_insert_cuda(
             ? div_round_up(get_device_sm_cnt_(), ALL_TO_PREFETCH_SM_RATIO)
             : div_round_up(N, kMaxThreads / kWarpSize);
 
-#ifdef FBGEMM_GPU_MEMCHECK
-        const char* func_name = "lru_cache_insert_kernel";
-#endif
-        lru_cache_insert_kernel<emb_t, cache_t>
-            <<<grid_size,
-               dim3(kWarpSize, kMaxThreads / kWarpSize),
-               0,
-               at::cuda::getCurrentCUDAStream()>>>(
-                MAKE_PTA_WITH_NAME(func_name, weights, emb_t, 1, 64),
-                MAKE_PTA_WITH_NAME(
-                    func_name, cache_hash_size_cumsum, int64_t, 1, 32),
-                MAKE_PTA_WITH_NAME(
-                    func_name, cache_index_table_map, int32_t, 1, 64),
-                MAKE_PTA_WITH_NAME(func_name, weights_offsets, int64_t, 1, 32),
-                MAKE_PTA_WITH_NAME(func_name, D_offsets, int32_t, 1, 32),
-                MAKE_PTA_WITH_NAME(
-                    func_name, sorted_cache_sets, int32_t, 1, 32),
-                MAKE_PTA_WITH_NAME(
-                    func_name, cache_set_sorted_unique_indices, int64_t, 1, 32),
-                unique_indices_length.data_ptr<int32_t>(),
-                MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32),
-                MAKE_PTA_WITH_NAME(
-                    func_name, lxu_cache_weights, cache_t, 2, 64),
-                time_stamp,
-                MAKE_PTA_WITH_NAME(func_name, lru_state, int64_t, 2, 32),
-                stochastic_rounding_,
-                rng_engine_inputs,
-                gather_cache_stats,
-                MAKE_PTA_WITH_NAME(func_name, uvm_cache_stats, int32_t, 1, 32),
-                lock_cache_line,
-                MAKE_PTA_WITH_NAME(
-                    func_name, lxu_cache_locking_counter, int32_t, 2, 32));
-        C10_CUDA_KERNEL_LAUNCH_CHECK();
+        FBGEMM_LAUNCH_KERNEL(
+            (lru_cache_insert_kernel<emb_t, cache_t>),
+            grid_size,
+            dim3(kWarpSize, kMaxThreads / kWarpSize),
+            0,
+            at::cuda::getCurrentCUDAStream(),
+            PTA_B(weights, emb_t, 1, 64),
+            PTA_B(cache_hash_size_cumsum, int64_t, 1, 32),
+            PTA_B(cache_index_table_map, int32_t, 1, 64),
+            PTA_B(weights_offsets, int64_t, 1, 32),
+            PTA_B(D_offsets, int32_t, 1, 32),
+            PTA_B(sorted_cache_sets, int32_t, 1, 32),
+            PTA_B(cache_set_sorted_unique_indices, int64_t, 1, 32),
+            unique_indices_length.data_ptr<int32_t>(),
+            PTA_B(lxu_cache_state, int64_t, 2, 32),
+            PTA_B(lxu_cache_weights, cache_t, 2, 64),
+            time_stamp,
+            PTA_B(lru_state, int64_t, 2, 32),
+            stochastic_rounding_,
+            rng_engine_inputs,
+            gather_cache_stats,
+            PTA_B(uvm_cache_stats, int32_t, 1, 32),
+            lock_cache_line,
+            PTA_B(lxu_cache_locking_counter, int32_t, 2, 32));
       }));
 }
 
diff --git a/fbgemm_gpu/src/split_embeddings_cache/lxu_cache.cu b/fbgemm_gpu/src/split_embeddings_cache/lxu_cache.cu
@@ -126,24 +126,22 @@ DLL_PUBLIC void lxu_cache_flush_cuda(
           rng_engine_inputs = at::check_generator<at::CUDAGeneratorImpl>(gen)
                                   ->philox_cuda_state(4);
         }
-#ifdef FBGEMM_GPU_MEMCHECK
-        const char* func_name = "lxu_cache_flush_kernel";
-#endif
-        lxu_cache_flush_kernel<emb_t, cache_t>
-            <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
-                MAKE_PTA_WITH_NAME(func_name, uvm_weights, emb_t, 1, 64),
-                MAKE_PTA_WITH_NAME(
-                    func_name, cache_hash_size_cumsum, int64_t, 1, 32),
-                MAKE_PTA_WITH_NAME(
-                    func_name, cache_index_table_map, int32_t, 1, 64),
-                MAKE_PTA_WITH_NAME(func_name, weights_offsets, int64_t, 1, 32),
-                MAKE_PTA_WITH_NAME(func_name, D_offsets, int32_t, 1, 32),
-                MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32),
-                MAKE_PTA_WITH_NAME(
-                    func_name, lxu_cache_weights, cache_t, 2, 64),
-                stochastic_rounding_,
-                rng_engine_inputs);
-        C10_CUDA_KERNEL_LAUNCH_CHECK();
+
+        FBGEMM_LAUNCH_KERNEL(
+            (lxu_cache_flush_kernel<emb_t, cache_t>),
+            blocks,
+            threads,
+            0,
+            at::cuda::getCurrentCUDAStream(),
+            PTA_B(uvm_weights, emb_t, 1, 64),
+            PTA_B(cache_hash_size_cumsum, int64_t, 1, 32),
+            PTA_B(cache_index_table_map, int32_t, 1, 64),
+            PTA_B(weights_offsets, int64_t, 1, 32),
+            PTA_B(D_offsets, int32_t, 1, 32),
+            PTA_B(lxu_cache_state, int64_t, 2, 32),
+            PTA_B(lxu_cache_weights, cache_t, 2, 64),
+            stochastic_rounding_,
+            rng_engine_inputs);
       }));
 }
 
@@ -211,34 +209,26 @@ void lxu_cache_locking_counter_decrement_cuda(
       div_round_up(N, kMaxThreads),
       get_max_thread_blocks_for_cache_kernels_()));
 
-#ifdef FBGEMM_GPU_MEMCHECK
-  const char* func_name = "lxu_cache_locations_count_kernel";
-#endif
-
-  lxu_cache_locations_count_kernel<<<
+  FBGEMM_LAUNCH_KERNEL(
+      lxu_cache_locations_count_kernel,
       blocks,
       kMaxThreads,
       0,
-      at::cuda::getCurrentCUDAStream()>>>(
-      MAKE_PTA_WITH_NAME(func_name, lxu_cache_locations, int32_t, 1, 32),
-      MAKE_PTA_WITH_NAME(func_name, count, int32_t, 2, 32),
+      at::cuda::getCurrentCUDAStream(),
+      PTA_B(lxu_cache_locations, int32_t, 1, 32),
+      PTA_B(count, int32_t, 2, 32),
       fd);
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
-
-#ifdef FBGEMM_GPU_MEMCHECK
-  const char* func_name2 = "lxu_cache_locking_counter_decrement_kernel";
-#endif
 
-  lxu_cache_locking_counter_decrement_kernel<<<
+  FBGEMM_LAUNCH_KERNEL(
+      lxu_cache_locking_counter_decrement_kernel,
       std::min(
           div_round_up(C, kMaxThreads / kWarpSize),
           get_max_thread_blocks_for_cache_kernels_()),
       dim3(kWarpSize, kMaxThreads / kWarpSize),
       0,
-      at::cuda::getCurrentCUDAStream()>>>(
-      MAKE_PTA_WITH_NAME(func_name2, lxu_cache_locking_counter, int32_t, 2, 32),
-      MAKE_PTA_WITH_NAME(func_name2, count, int32_t, 2, 32));
+      at::cuda::getCurrentCUDAStream(),
+      PTA_B(lxu_cache_locking_counter, int32_t, 2, 32),
+      PTA_B(count, int32_t, 2, 32));
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
 }
 
 namespace {
@@ -445,14 +435,12 @@ DLL_PUBLIC Tensor lxu_cache_lookup_cuda(
 
   AT_DISPATCH_INDEX_TYPES(
       linear_cache_indices.scalar_type(), "lxu_cache_lookup_cuda", [&] {
-#ifdef FBGEMM_GPU_MEMCHECK
-        const char* func_name = "lxu_cache_lookup_kernel";
-#endif
-        lxu_cache_lookup_kernel<<<
+        FBGEMM_LAUNCH_KERNEL(
+            (lxu_cache_lookup_kernel<index_t>),
             blocks,
             threads,
             0,
-            at::cuda::getCurrentCUDAStream()>>>(
+            at::cuda::getCurrentCUDAStream(),
             MAKE_PTA_WITH_NAME(func_name, linear_cache_indices, index_t, 1, 32),
             MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32),
             invalid_index,
@@ -462,7 +450,6 @@ DLL_PUBLIC Tensor lxu_cache_lookup_cuda(
             num_uniq_cache_indices.has_value()
                 ? num_uniq_cache_indices.value().data_ptr<int32_t>()
                 : nullptr);
-        C10_CUDA_KERNEL_LAUNCH_CHECK();
       });
   return lxu_cache_locations;
 }
@@ -499,21 +486,18 @@ DLL_PUBLIC Tensor direct_mapped_lxu_cache_lookup_cuda(
       linear_cache_indices.scalar_type(),
       "direct_mapped_lxu_cache_lookup_cuda",
       [&] {
-#ifdef FBGEMM_GPU_MEMCHECK
-        const char* func_name = "direct_mapped_lxu_cache_lookup_kernel";
-#endif
-        direct_mapped_lxu_cache_lookup_kernel<<<
+        FBGEMM_LAUNCH_KERNEL(
+            (direct_mapped_lxu_cache_lookup_kernel<index_t>),
             blocks,
             kMaxThreads,
             0,
-            at::cuda::getCurrentCUDAStream()>>>(
-            MAKE_PTA_WITH_NAME(func_name, linear_cache_indices, index_t, 1, 32),
-            MAKE_PTA_WITH_NAME(func_name, lxu_cache_state, int64_t, 2, 32),
+            at::cuda::getCurrentCUDAStream(),
+            PTA_B(linear_cache_indices, index_t, 1, 32),
+            PTA_B(lxu_cache_state, int64_t, 2, 32),
             invalid_index,
-            MAKE_PTA_WITH_NAME(func_name, lxu_cache_locations, int32_t, 1, 32),
+            PTA_B(lxu_cache_locations, int32_t, 1, 32),
             gather_cache_stats,
-            MAKE_PTA_WITH_NAME(func_name, uvm_cache_stats_, int32_t, 1, 32));
-        C10_CUDA_KERNEL_LAUNCH_CHECK();
+            PTA_B(uvm_cache_stats_, int32_t, 1, 32));
       });
 
   return lxu_cache_locations;
@@ -559,21 +543,17 @@ DLL_PUBLIC void lxu_cache_locations_update_cuda(
       div_round_up(N, kMaxThreads),
       get_max_thread_blocks_for_cache_kernels_()));
 
-#ifdef FBGEMM_GPU_MEMCHECK
-  const char* func_name = "lxu_cache_locations_update_kernel";
-#endif
-
-  lxu_cache_locations_update_kernel<<<
+  FBGEMM_LAUNCH_KERNEL(
+      lxu_cache_locations_update_kernel,
       blocks,
       kMaxThreads,
       0,
-      at::cuda::getCurrentCUDAStream()>>>(
-      MAKE_PTA_WITH_NAME(func_name, lxu_cache_locations, int32_t, 1, 32),
-      MAKE_PTA_WITH_NAME(func_name, lxu_cache_locations_new, int32_t, 1, 32),
+      at::cuda::getCurrentCUDAStream(),
+      PTA_B(lxu_cache_locations, int32_t, 1, 32),
+      PTA_B(lxu_cache_locations_new, int32_t, 1, 32),
       num_uniq_cache_indices.has_value()
           ? num_uniq_cache_indices.value().data_ptr<int32_t>()
           : nullptr);
 
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
   return;
 }
diff --git a/fbgemm_gpu/src/split_embeddings_cache/reset_weight_momentum.cu b/fbgemm_gpu/src/split_embeddings_cache/reset_weight_momentum.cu
@@ -35,8 +35,8 @@ __global__ __launch_bounds__(kMaxThreads) void get_cache_indices_kernel(
         linear_cache_indices) {
   const int64_t index = blockIdx.x * blockDim.x + threadIdx.x;
 
-  const int32_t t_i = blockIdx.x / blocks_per_table;
-  const int32_t threads_per_table = blocks_per_table * blockDim.x;
+  const auto t_i = blockIdx.x / blocks_per_table;
+  const auto threads_per_table = blocks_per_table * blockDim.x;
   const int32_t idx_table = index % threads_per_table;
   const int32_t logical_id = logical_table_ids[t_i];
   const int32_t buffer_id = buffer_ids[t_i];
@@ -112,7 +112,7 @@ __global__ __launch_bounds__(kMaxThreads) void reset_weight_momentum_kernel(
         lxu_cache_locations) {
   const int64_t index = blockIdx.x * blockDim.x + threadIdx.x;
 
-  const int32_t t_i = blockIdx.x / blocks_per_table;
+  const auto t_i = blockIdx.x / blocks_per_table;
   const int32_t buffer_id = buffer_ids[t_i];
   const int64_t num_indices =
       pruned_indices_offsets[buffer_id + 1] - pruned_indices_offsets[buffer_id];
@@ -126,7 +126,7 @@ __global__ __launch_bounds__(kMaxThreads) void reset_weight_momentum_kernel(
   const int32_t chunk4s_per_row = D / 4;
   const int64_t total_chunk4s_per_table = num_indices * chunk4s_per_row;
 
-  const int32_t threads_per_table = blocks_per_table * blockDim.x;
+  const auto threads_per_table = blocks_per_table * blockDim.x;
   const int64_t chunk4s_per_thread =
       div_round_up(total_chunk4s_per_table, threads_per_table);
   const int32_t idx_table = index % threads_per_table;
@@ -249,23 +249,19 @@ DLL_PUBLIC void reset_weight_momentum_cuda(
     auto linear_cache_indices = at::zeros(
         {num_pruned_indices}, pruned_indices.options().dtype(at::kLong));
 
-#ifdef FBGEMM_GPU_MEMCHECK
-    const char* func_name = "get_cache_indices_kernel";
-#endif
-
-    get_cache_indices_kernel<<<
+    FBGEMM_LAUNCH_KERNEL(
+        get_cache_indices_kernel,
         num_pruned_tables * blocks_per_table,
         kMaxThreads,
         0,
-        at::cuda::getCurrentCUDAStream()>>>(
+        at::cuda::getCurrentCUDAStream(),
         blocks_per_table,
-        MAKE_PTA_WITH_NAME(func_name, cache_hash_size_cumsum, int64_t, 1, 32),
-        MAKE_PTA_WITH_NAME(func_name, pruned_indices, int64_t, 1, 32),
-        MAKE_PTA_WITH_NAME(func_name, pruned_indices_offsets, int64_t, 1, 32),
-        MAKE_PTA_WITH_NAME(func_name, logical_table_ids, int32_t, 1, 32),
-        MAKE_PTA_WITH_NAME(func_name, buffer_ids, int32_t, 1, 32),
-        MAKE_PTA_WITH_NAME(func_name, linear_cache_indices, int64_t, 1, 32));
-    C10_CUDA_KERNEL_LAUNCH_CHECK();
+        PTA_B(cache_hash_size_cumsum, int64_t, 1, 32),
+        PTA_B(pruned_indices, int64_t, 1, 32),
+        PTA_B(pruned_indices_offsets, int64_t, 1, 32),
+        PTA_B(logical_table_ids, int32_t, 1, 32),
+        PTA_B(buffer_ids, int32_t, 1, 32),
+        PTA_B(linear_cache_indices, int64_t, 1, 32));
 
     // Look up cache locations
     Tensor uvm_cache_stats =