Fix integer overflow (pytorch#5051)

Chuanwei Yi · meta-codesync[bot] · commit f849dcd7b79c · 2025-10-27T05:28:22.000-07:00
Summary: Pull Request resolved: pytorch#5051 X-link: https://github.com/facebookresearch/FBGEMM/pull/2055 When the token size is very large, the integer multiplication overflows, which in turn leads to a CUDA memory access error. Reviewed By: royren622 Differential Revision: D85330676 fbshipit-source-id: 51a5f61ed52f43358b61f4e689b33b95d051f987
diff --git a/fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/device/fmha_device_bwd.hpp b/fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/device/fmha_device_bwd.hpp
@@ -293,7 +293,7 @@ class Sm100FmhaBwd {
     ElementAccumulator* scaled_lse = reinterpret_cast<ElementAccumulator*>(workspace_scaled_lse);
     ElementAccumulator* dQ_acc = reinterpret_cast<ElementAccumulator*>(workspace_dQ);
     params_.dQ_acc = dQ_acc;
-    params_.dQ_acc_size = B*H*Q*D * sizeof(ElementAccumulator);
+    params_.dQ_acc_size = static_cast<size_t>(B)*H*Q*D * sizeof(ElementAccumulator);
     auto args_sum_OdO = to_sum_OdO_arguments(args, sum_OdO, scaled_lse);
     auto args_convert = to_convert_arguments(args, dQ_acc);
     params_.op_sum_OdO.initialize(args_sum_OdO, nullptr, stream);
@@ -320,9 +320,9 @@ class Sm100FmhaBwd {
     int Q = cutlass::round_up(static_cast<int>(Q_), 8);  // Alignment
     char* workspace_chr = reinterpret_cast<char*>(workspace);
     ElementAccumulator* sum_OdO = reinterpret_cast<ElementAccumulator*>(workspace_chr);
-    workspace_chr += B*H*Q * sizeof(ElementAccumulator);
+    workspace_chr += static_cast<size_t>(B)*H*Q * sizeof(ElementAccumulator);
     ElementAccumulator* scaled_lse = reinterpret_cast<ElementAccumulator*>(workspace_chr);
-    workspace_chr += B*H*Q * sizeof(ElementAccumulator);
+    workspace_chr += static_cast<size_t>(B)*H*Q * sizeof(ElementAccumulator);
     ElementAccumulator* dQ_acc = reinterpret_cast<ElementAccumulator*>(workspace_chr);
     return initialize_split(args, dQ_acc, sum_OdO, scaled_lse, stream);
   }