Commit 52487ea

jwfromm authored and facebook-github-bot committed
Do FP8 rowwise bias addition in higher precision (#4095)
Summary:
X-link: facebookresearch/FBGEMM#1179

Previously, when a bias was used in our FP8 rowwise kernel, it was added to the accumulator in its native precision. For example, if the bias is bf16, we would do a bf16 + bf16 addition. However, it is a bit more efficient and a bit more accurate to leave the accumulator in fp32, cast the bias to fp32, and then do an fp32 addition.

Reviewed By: jianyuh

Differential Revision: D74408348
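To see why the single fp32 addition loses less precision, here is a small host-side illustration (not the FBGEMM/CUTLASS kernel code) that emulates bf16 round-to-nearest-even on plain floats. The to_bf16 helper and the sample values are hypothetical, chosen only to make the two rounding paths visible:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Hypothetical illustration, not the FBGEMM/CUTLASS kernel code.
// Emulate rounding a float to the nearest bf16 value (round-to-nearest-even),
// returned as a float so the two paths below are easy to compare on the host.
float to_bf16(float x) {
  uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  bits += 0x7fffu + ((bits >> 16) & 1u);  // round to nearest, ties to even
  bits &= 0xffff0000u;                    // keep bf16's 8 mantissa bits
  float out;
  std::memcpy(&out, &bits, sizeof(out));
  return out;
}

int main() {
  float accum = 0.33333334f;          // fp32 GEMM accumulator
  float bias  = to_bf16(0.0012345f);  // bias as stored, in bf16

  // Old epilogue: round the accumulator to bf16 first, then do a
  // bf16 + bf16 add, i.e. two roundings before the result is produced.
  float old_path = to_bf16(to_bf16(accum) + bias);

  // New epilogue: keep the accumulator in fp32, add the (upcast) bias
  // in fp32, and round to the bf16 output exactly once.
  float new_path = to_bf16(accum + bias);

  printf("old: %.8f  new: %.8f  exact: %.8f\n", old_path, new_path,
         accum + bias);
  return 0;
}

The printed values depend on the inputs, but the new path can never do worse: rounding the full-precision sum once yields the bf16 value nearest that sum, whereas rounding the accumulator before the add can land on a more distant one.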
1 parent 2c0e915 commit 52487ea

File tree

1 file changed: +3 -3 lines


fbgemm_gpu/experimental/gen_ai/src/quantize/cutlass_extensions/f8f8bf16_rowwise/f8f8bf16_rowwise_common.cuh

Lines changed: 3 additions & 3 deletions
@@ -138,7 +138,7 @@ at::Tensor f8f8bf16_rowwise_impl(
       0,
       TileShape,
       ElementBias,
-      ElementBias,
+      ElementComputeEpilogue,
       cute::Stride<cute::Int<1>, cute::Int<0>, cute::Int<0>>>;
 
   using Accum = cutlass::epilogue::fusion::Sm90AccFetch;
@@ -156,7 +156,7 @@ at::Tensor f8f8bf16_rowwise_impl(
       cutlass::multiplies,
       cute::conditional_t< // Second stage output type.
           USE_BIAS,
-          ElementBias,
+          ElementComputeEpilogue,
           ElementOutput>,
       ElementComputeEpilogue, // Second stage input types.
       cutlass::FloatRoundStyle::round_to_nearest>;
@@ -167,7 +167,7 @@ at::Tensor f8f8bf16_rowwise_impl(
   using ComputeBias = cutlass::epilogue::fusion::Sm90Compute<
       cutlass::plus,
       ElementOutput, // Final (optional) stage output type.
-      ElementBias, // Final stage input types.
+      ElementComputeEpilogue, // Final stage input types.
       cutlass::FloatRoundStyle::round_to_nearest>;
 
   using EVTComputeBias =
