diff --git a/examples/77_blackwell_fmha/collective/sm100_fmha_fwd_mainloop_tma_warpspecialized.hpp b/examples/77_blackwell_fmha/collective/sm100_fmha_fwd_mainloop_tma_warpspecialized.hpp index 1e094bf42d..523631279f 100644 --- a/examples/77_blackwell_fmha/collective/sm100_fmha_fwd_mainloop_tma_warpspecialized.hpp +++ b/examples/77_blackwell_fmha/collective/sm100_fmha_fwd_mainloop_tma_warpspecialized.hpp @@ -598,6 +598,8 @@ struct Sm100FmhaFwdMainloopTmaWarpspecialized { tTMEM_STOREVrS(kIdxNewRowMax) = row_max_safe; copy(tiled_tmem_storev, tTMEM_STOREVrS, tTMEM_STOREVtS); + cutlass::arch::fence_view_async_tmem_store(); + pipeline_c.producer_commit(pipeline_c_producer_state); ++pipeline_c_producer_state; @@ -707,6 +709,8 @@ struct Sm100FmhaFwdMainloopTmaWarpspecialized { tTMEM_STOREVrS(kIdxFinalRowMax) = row_max; tTMEM_STOREVrS(kIdxFinalRowSum) = row_sum; copy(tiled_tmem_storev, tTMEM_STOREVrS, tTMEM_STOREVtS); + + cutlass::arch::fence_view_async_tmem_store(); } }