diff --git a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_indice_weights_template.cu b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_indice_weights_template.cu
index 01cabc779d..1afb2943bb 100644
--- a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_indice_weights_template.cu
+++ b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_indice_weights_template.cu
@@ -21,6 +21,7 @@
 #include "fbgemm_gpu/utils/ops_utils.h"
 #include "fbgemm_gpu/utils/tensor_utils.h"
 #include "fbgemm_gpu/utils/assert_macros.h"
+#include "fbgemm_gpu/utils/kernel_launcher.cuh"
 
 using Tensor = at::Tensor;
 using namespace fbgemm_gpu;
@@ -405,53 +406,49 @@ Tensor {{ mdesc }}_embedding_codegen_grad_indice_weights{{ vdesc }}_cuda(
         {%- set kernel_name =
             "{}_embedding_codegen_grad_indice_weights{}_{}kernel".format(
                 mdesc, vdesc, vbdesc)
-        %}
-#ifdef FBGEMM_GPU_MEMCHECK
-        const auto func_name = "{{ kernel_name }}";
-#endif
-        {{ kernel_name }}<
-          emb_t,
-          grad_t,
-          cache_t,
-          index_t,
-          kFixedMaxVecsPerThread><<<
+        %}
+        FBGEMM_LAUNCH_KERNEL(
+          ({{ kernel_name }}<
+            emb_t,
+            grad_t,
+            cache_t,
+            index_t,
+            kFixedMaxVecsPerThread>),
             div_round_up(total_B, kForwardMaxThreads / kWarpSize),
             dim3(kWarpSize, kForwardMaxThreads / kWarpSize),
             0,
-            at::cuda::getCurrentCUDAStream()>>>(
-            MAKE_PTA_WITH_NAME(func_name, grad_output_reshaped, grad_t, 2, 64),
-            MAKE_PTA_WITH_NAME(func_name, dev_weights, emb_t, 1, 64),
+            at::cuda::getCurrentCUDAStream(),
+            PTA_B(grad_output_reshaped, grad_t, 2, 64),
+            PTA_B(dev_weights, emb_t, 1, 64),
             {%- if not dense %}
-            MAKE_PTA_WITH_NAME(func_name, uvm_weights, emb_t, 1, 64),
-            MAKE_PTA_WITH_NAME(func_name, lxu_cache_weights, cache_t, 2, 64),
-            MAKE_PTA_WITH_NAME(func_name, weights_placements, int32_t, 1, 32),
+            PTA_B(uvm_weights, emb_t, 1, 64),
+            PTA_B(lxu_cache_weights, cache_t, 2, 64),
+            PTA_B(weights_placements, int32_t, 1, 32),
             {%- endif %}
-            MAKE_PTA_WITH_NAME(func_name, weights_offsets, int64_t, 1, 32),
-            MAKE_PTA_WITH_NAME(func_name, D_offsets, int32_t, 1, 32),
-            MAKE_PTA_WITH_NAME(func_name, indices, index_t, 1, 32),
-            MAKE_PTA_WITH_NAME(func_name, offsets, index_t, 1, 32),
+            PTA_B(weights_offsets, int64_t, 1, 32),
+            PTA_B(D_offsets, int32_t, 1, 32),
+            PTA_B(indices, index_t, 1, 32),
+            PTA_B(offsets, index_t, 1, 32),
             {%- if not dense %}
-            MAKE_PTA_WITH_NAME(func_name, {{ locs_or_addrs_tensor }}, {{ locs_or_addrs_type }}, 1, 32),
+            PTA_B({{ locs_or_addrs_tensor }}, {{ locs_or_addrs_type }}, 1, 32),
             {%- endif %}
-            MAKE_PTA_WITH_NAME(func_name, feature_requires_grad_, int32_t, 1, 32),
-            MAKE_PTA_ACC_WITH_NAME(func_name, grad_indice_weights, grad_t, 1, 32),
+            PTA_B(feature_requires_grad_, int32_t, 1, 32),
+            PTA_ACC_B(grad_indice_weights, grad_t, 1, 32),
             {%- if vbe %}
-            MAKE_PTA_WITH_NAME(func_name, vbe_row_output_offsets, int64_t, 1, 32),
-            MAKE_PTA_WITH_NAME(func_name, vbe_b_t_map, int32_t, 1, 32),
+            PTA_B(vbe_row_output_offsets, int64_t, 1, 32),
+            PTA_B(vbe_b_t_map, int32_t, 1, 32),
             info_B_num_bits,
             info_B_mask
             {%- else %}
             FixedDivisor(total_B / T)
             {%- endif %}
         );
-        C10_CUDA_KERNEL_LAUNCH_CHECK();
         return;
       });
 
     {%- endfor %} {# /* for use_vec_blocking */ #}
     });
   });
 
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
   return grad_indice_weights;
 }
diff --git a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
index 30909db277..225e983648 100644
--- a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
+++ b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
@@ -794,34 +794,22 @@ Tensor {{ embedding_cuda_op }}(
 
       // {{ locs_or_addrs_tensor }} run ids and sorted_linear_indices run ids.
       auto dev_or_uvm_unique_indices = at::zeros_like(weights_placements);
-      DEBUG_KERNEL_BARRIER_ISOLATE([&] {
-#ifdef FBGEMM_GPU_MEMCHECK
-        const auto func_name = "split_embedding_backward_count_unique_indices_kernel";
-#endif
-        split_embedding_backward_count_unique_indices_kernel<
-          {{ "int64_t" if nobag else "int32_t" }},
-          {{ "int64_t" if nobag else "uint32_t" }},
-          {{ "true" if nobag else "false" }}
-        ><<<
-          div_round_up(total_unique_indices, kMaxThreads),
-          kMaxThreads,
-          0,
-          at::cuda::getCurrentCUDAStream()
-        >>>(
-          MAKE_PTA_WITH_NAME(
-            func_name, sorted_linear_indices_num_runs, int32_t, 1, 32),
-          MAKE_PTA_WITH_NAME(
-            func_name, sorted_linear_indices_cumulative_run_lengths, int32_t, 1, 32),
-          MAKE_PTA_WITH_NAME(
-            func_name, infos_sorted, {{ "int64_t" if nobag else "int32_t" }}, 1, 32),
-          MAKE_PTA_WITH_NAME(
-            func_name, weights_placements, int32_t, 1, 32),
-          MAKE_PTA_WITH_NAME(
-            func_name, dev_or_uvm_unique_indices, int32_t, 1, 32),
-          info_B_num_bits
-        );
-        C10_CUDA_KERNEL_LAUNCH_CHECK();
-      }); // DEBUG_KERNEL_BARRIER_ISOLATE
+      FBGEMM_LAUNCH_KERNEL(
+        (split_embedding_backward_count_unique_indices_kernel<
+          {{ "int64_t" if nobag else "int32_t" }},
+          {{ "int64_t" if nobag else "uint32_t" }},
+          {{ "true" if nobag else "false" }}
+        >),
+        div_round_up(total_unique_indices, kMaxThreads),
+        kMaxThreads,
+        0,
+        at::cuda::getCurrentCUDAStream(),
+        PTA_B(sorted_linear_indices_num_runs, int32_t, 1, 32),
+        PTA_B(sorted_linear_indices_cumulative_run_lengths, int32_t, 1, 32),
+        PTA_B(infos_sorted, {{ "int64_t" if nobag else "int32_t" }}, 1, 32),
+        PTA_B(weights_placements, int32_t, 1, 32),
+        PTA_B(dev_or_uvm_unique_indices, int32_t, 1, 32), info_B_num_bits
+      );
 
       table_unique_indices_offsets =
           fbgemm_gpu::asynchronous_complete_cumsum_gpu(dev_or_uvm_unique_indices).to(at::kInt);
@@ -944,34 +932,27 @@ Tensor {{ embedding_cuda_op }}(
     Tensor grad_output_mean;
     if (static_cast<PoolingMode>(pooling_mode) == PoolingMode::MEAN) {
       grad_output_mean = at::empty_like(grad_output_reshaped);
-      {%- if not dense or not vbe %}
-      DEBUG_KERNEL_BARRIER_ISOLATE([&] {
-#ifdef FBGEMM_GPU_MEMCHECK
-        const auto func_name1 = "grad_mean{{ vdesc }}_kernel";
-#endif
-        grad_mean{{ vdesc }}_kernel<<<
-            div_round_up(total_B, kMaxThreads / kWarpSize),
-            dim3(kWarpSize, kMaxThreads / kWarpSize),
-            0,
-            at::cuda::getCurrentCUDAStream()>>>
-            (
-              MAKE_PTA_WITH_NAME(func_name1, grad_output_mean, grad_t, 2, 64),
-              MAKE_PTA_WITH_NAME(func_name1, grad_output_reshaped, grad_t, 2, 64),
-              MAKE_PTA_WITH_NAME(func_name1, D_offsets, int32_t, 1, 32),
-              MAKE_PTA_WITH_NAME(func_name1, offsets, index_t, 1, 32),
-              {%- if vbe %}
-              MAKE_PTA_WITH_NAME(func_name1, vbe_row_output_offsets, int64_t, 1, 32),
-              MAKE_PTA_WITH_NAME(func_name1, vbe_b_t_map, int32_t, 1, 32),
-              info_B_num_bits,
-              info_B_mask
-              {%- else %}
-              FixedDivisor(total_B / T)
-              {%- endif %}
-            );
-
-        C10_CUDA_KERNEL_LAUNCH_CHECK();
-      }); // DEBUG_KERNEL_BARRIER_ISOLATE
+      {%- if not dense or not vbe %}
+      FBGEMM_LAUNCH_KERNEL(
+          (grad_mean{{ vdesc }}_kernel),
+          div_round_up(total_B, kMaxThreads / kWarpSize),
+          dim3(kWarpSize, kMaxThreads / kWarpSize),
+          0,
+          at::cuda::getCurrentCUDAStream(),
+          PTA_B(grad_output_mean, grad_t, 2, 64),
+          PTA_B(grad_output_reshaped, grad_t, 2, 64),
+          PTA_B(D_offsets, int32_t, 1, 32),
+          PTA_B(offsets, index_t, 1, 32),
+          {%- if vbe %}
+          PTA_B(vbe_row_output_offsets, int64_t, 1, 32),
+          PTA_B(vbe_b_t_map, int32_t, 1, 32),
+          info_B_num_bits,
+          info_B_mask
+          {%- else %}
+          FixedDivisor(total_B / T)
+          {%- endif %}
+      );
     {%- endif %} // if not dense or not vbe
 
       grad_output_accessor = MAKE_PTA_WITH_NAME(
          "{{ embedding_cuda_op }}.2", grad_output_mean, grad_t, 2, 64);
@@ -1012,29 +993,23 @@ Tensor {{ embedding_cuda_op }}(
         use_deterministic_algorithms ? 0 : (indices.numel() / max_segment_length_per_cta),
         indices.options().dtype(at::kInt));
 
-    DEBUG_KERNEL_BARRIER_ISOLATE([&] {
-#ifdef FBGEMM_GPU_MEMCHECK
-      const auto func_name2 = "split_embedding_backward_codegen_find_long_segments";
-#endif
-
-      split_embedding_backward_codegen_find_long_segments<<<
-          div_round_up(total_unique_indices, kMaxThreads),
-          kMaxThreads,
-          0,
-          at::cuda::getCurrentCUDAStream()
-      >>>(
-          MAKE_PTA_WITH_NAME(func_name2, sorted_linear_indices_num_runs, int32_t, 1, 32),
-          MAKE_PTA_WITH_NAME(func_name2, sorted_linear_indices_run_lengths, int32_t, 1, 32),
-          MAKE_PTA_WITH_NAME(func_name2, long_run_ids, int32_t, 1, 32),
-          MAKE_PTA_WITH_NAME(func_name2, num_long_run_ids, int32_t, 1, 32),
-          MAKE_PTA_WITH_NAME(func_name2, long_run_id_to_really_long_run_ids, int32_t, 1, 32),
-          MAKE_PTA_WITH_NAME(func_name2, num_really_long_run_ids, int32_t, 1, 32),
-          MAKE_PTA_WITH_NAME(func_name2, grad_accum_counter, int32_t, 1, 32),
-          max_segment_length_per_warp,
-          max_segment_length_per_cta,
-          use_deterministic_algorithms);
-      C10_CUDA_KERNEL_LAUNCH_CHECK();
-    }); // DEBUG_KERNEL_BARRIER_ISOLATE
+    FBGEMM_LAUNCH_KERNEL(
+        split_embedding_backward_codegen_find_long_segments,
+        div_round_up(total_unique_indices, kMaxThreads),
+        kMaxThreads,
+        0,
+        at::cuda::getCurrentCUDAStream(),
+        PTA_B(sorted_linear_indices_num_runs, int32_t, 1, 32),
+        PTA_B(sorted_linear_indices_run_lengths, int32_t, 1, 32),
+        PTA_B(long_run_ids, int32_t, 1, 32),
+        PTA_B(num_long_run_ids, int32_t, 1, 32),
+        PTA_B(long_run_id_to_really_long_run_ids, int32_t, 1, 32),
+        PTA_B(num_really_long_run_ids, int32_t, 1, 32),
+        PTA_B(grad_accum_counter, int32_t, 1, 32),
+        max_segment_length_per_warp,
+        max_segment_length_per_cta,
+        use_deterministic_algorithms
+    );
 
     // A temp buffer to accumulate gradients with atomics.
     auto temp_grad_accum = at::zeros(
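
Note on the launch pattern this patch converges on (illustrative commentary, not part of the diff): as the hunks above show, FBGEMM_LAUNCH_KERNEL (from fbgemm_gpu/utils/kernel_launcher.cuh) takes the kernel (parenthesized when its template arguments contain commas), grid and block dimensions, shared-memory bytes, and stream, followed by the kernel arguments, and it subsumes both the per-call-site FBGEMM_GPU_MEMCHECK func_name plumbing (PTA_B/PTA_ACC_B replacing MAKE_PTA_WITH_NAME/MAKE_PTA_ACC_WITH_NAME) and the trailing C10_CUDA_KERNEL_LAUNCH_CHECK(). The sketch below is not the FBGEMM implementation; launch_checked and scale_kernel are hypothetical names used only to show the general shape of such a wrapper: launch on the given stream, then surface launch errors immediately.

// Illustrative sketch only; the real FBGEMM_LAUNCH_KERNEL does more than this.
#include <cuda_runtime.h>
#include <cstdio>
#include <utility>

template <typename Kernel, typename... Args>
void launch_checked(
    Kernel kernel,
    dim3 grid,
    dim3 block,
    size_t smem_bytes,
    cudaStream_t stream,
    Args&&... args) {
  // Forward the configuration and arguments to the raw CUDA launch.
  kernel<<<grid, block, smem_bytes, stream>>>(std::forward<Args>(args)...);
  // Same intent as C10_CUDA_KERNEL_LAUNCH_CHECK(): report launch failures at
  // the call site instead of at the next synchronizing CUDA call.
  const cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    std::fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
  }
}

// Hypothetical kernel used only to exercise the wrapper.
__global__ void scale_kernel(float* x, float a, int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    x[i] *= a;
  }
}

void scale(float* d_x, float a, int n, cudaStream_t stream) {
  constexpr int kThreads = 256;
  const int blocks = (n + kThreads - 1) / kThreads;
  launch_checked(scale_kernel, dim3(blocks), dim3(kThreads), 0, stream, d_x, a, n);
}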