Commit 51210b8

henrylhtsang authored and meta-codesync[bot] committed
Bring 4.2.1 changes to FBGEMM blackwell cutlass fmha (#5052)
Summary:
Pull Request resolved: #5052
X-link: https://github.com/facebookresearch/FBGEMM/pull/2061

As titled. Mostly grabbing changes from 4.2.1. This also removes the changes from D84954166.

One thing is still TBD: how to incorporate the equivalent of D79534034, which was also made upstream. I would prefer to stay close to upstream, but I tested that version and it fails, so I am settling for Aya-ZIbra's D84970563 for now.

Reviewed By: Aya-ZIbra

Differential Revision: D84961921

fbshipit-source-id: ff3ab1746951d3b91126d7a1b75e4d7cf2c92b69
1 parent 9b5af57 commit 51210b8

16 files changed: +233, −99 lines

fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/77_blackwell_fmha.cu

Lines changed: 10 additions & 9 deletions
```diff
@@ -426,16 +426,16 @@ struct FwdRunner {
   using ElementOut = cutlass::half_t;
 #endif
 
-  // Q K D (B H)
+  // Q K D ((H_R, H_K) B)
   using ProblemShapeRegular = cute::tuple<int, int, int, cute::tuple<cute::tuple<int, int>, int>>;
   using ProblemShapeVarlen = cute::tuple<VariableLength, VariableLength, int, cute::tuple<cute::tuple<int, int>, int>>;
   using ProblemShapeType = std::conditional_t<kIsVarlen, ProblemShapeVarlen, ProblemShapeRegular>;
 
-  using StrideQ = cute::tuple<int, _1, cute::tuple<cute::tuple<int, int>, int>>;  // Q D (H_G H_R B)
-  using StrideK = cute::tuple<int, _1, cute::tuple<cute::tuple<_0, int>, int>>;   // K D (H_G H_R B)
+  using StrideQ = cute::tuple<int, _1, cute::tuple<cute::tuple<int, int>, int>>;  // Q D ((H_R, H_K), B)
+  using StrideK = cute::tuple<int, _1, cute::tuple<cute::tuple<_0, int>, int>>;   // K D ((H_R, H_K), B)
   using StrideV = StrideK;
   using StrideO = StrideQ;
-  using StrideLSE = cute::tuple<_1, cute::tuple<cute::tuple<int, int>, int>>;     // Q (H_G H_R B)
+  using StrideLSE = cute::tuple<_1, cute::tuple<cute::tuple<int, int>, int>>;     // Q ((H_R, H_K), B)
 
   static constexpr bool kIsPersistent = find_option_t<Tag::kIsPersistent, true_type, KernelOptions...>::value;
   using TileScheduler = std::conditional_t<kIsPersistent, cutlass::fmha::kernel::PersistentTileScheduler, cutlass::fmha::kernel::IndividualTileScheduler>;
@@ -618,8 +618,8 @@ struct FwdRunner {
 
     ProblemShapeType problem_size_for_launch;
 
-    get<0>(problem_size_for_launch) = VariableLength{max_seqlen_q};
-    get<1>(problem_size_for_launch) = VariableLength{max_seqlen_kv};
+    get<0>(problem_size_for_launch) = VariableLength{max_seqlen_q, nullptr, total_seqlen_q};
+    get<1>(problem_size_for_launch) = VariableLength{max_seqlen_kv, nullptr, total_seqlen_kv};
     get<2>(problem_size_for_launch) = get<2>(problem_size);
     get<3>(problem_size_for_launch) = get<3>(problem_size);
 
@@ -676,9 +676,9 @@ struct FwdRunner {
     }
 
     auto buffer_init_fn = [&](auto& buffer) {
-      buffer.block_Q.reset(size(shape_QO), kIsVarlen ? D*SQ*H : 0);
-      buffer.block_K.reset(size(shape_KV), kIsVarlen ? D*SK*H_K : 0);
-      buffer.block_V.reset(size(shape_KV), kIsVarlen ? D*SK*H_K : 0);
+      buffer.block_Q.reset(size(shape_QO));
+      buffer.block_K.reset(size(shape_KV));
+      buffer.block_V.reset(size(shape_KV));
       buffer.block_O.reset(size(shape_QO), kIsVarlen ? D*SQ*H : 0);
       buffer.block_LSE.reset(size(shape_LSE));
       buffer.block_ref_O.reset(size(shape_QO), kIsVarlen ? D*SQ*H : 0);
@@ -725,6 +725,7 @@ struct FwdRunner {
     }
     typename Operation::Arguments arguments{
       problem_shape_,
+      // local changes
       nullptr,
       {{ buffers[buffer_index]->block_Q.get(), stride_Q,
          buffers[buffer_index]->block_K.get(), stride_K,
```

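The comment relabeling above is the substance of the first hunk: in `((H_R, H_K), B)`, `H_K` indexes the KV heads and `H_R` the query heads that share each of them, and the `_0` stride in `StrideK` is what broadcasts one KV head across its group. A minimal sketch of how such a stride could be assembled on the host (the helper and its packing assumptions are illustrative, not part of this commit):

```cpp
#include <cute/tensor.hpp>

using namespace cute;

// Hypothetical stride builder for a packed [B, K, H_K, D] key tensor
// (names and packing are illustrative). The _0 over H_R means every
// query head in a group reads the same KV head: GQA falls out of the
// layout with no data duplication.
auto make_gqa_stride_K(int seqlen_k, int h_k, int d) {
  return make_tuple(
      h_k * d,                          // next K position
      _1{},                             // contiguous along D
      make_tuple(make_tuple(_0{},       // H_R: broadcast across the group
                            d),         // H_K: next KV head
                 seqlen_k * h_k * d));  // B: next sequence in the batch
}
```

The same `_0` trick covers V as well, since `StrideV = StrideK`.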
fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/77_blackwell_mla_fwd.cu

Lines changed: 7 additions & 6 deletions
```diff
@@ -591,8 +591,8 @@ struct MlaFwdRunner {
 
     ProblemShapeType problem_size_for_launch;
 
-    get<0>(problem_size_for_launch) = VariableLength{max_seqlen_q};
-    get<1>(problem_size_for_launch) = VariableLength{max_seqlen_kv};
+    get<0>(problem_size_for_launch) = VariableLength{max_seqlen_q, nullptr, total_seqlen_q};
+    get<1>(problem_size_for_launch) = VariableLength{max_seqlen_kv, nullptr, total_seqlen_kv};
     get<2>(problem_size_for_launch) = get<2>(problem_size);
     get<3>(problem_size_for_launch) = get<3>(problem_size);
 
@@ -652,9 +652,9 @@ struct MlaFwdRunner {
     }
 
     auto buffer_init_fn = [&](auto& buffer) {
-      buffer.block_Q.reset(size(shape_Q), kIsVarlen ? D_latent_rope*SQ*H : 0);
-      buffer.block_K.reset(size(shape_K), kIsVarlen ? D_latent_rope*SK*H_K : 0);
-      buffer.block_V.reset(size(shape_V), kIsVarlen ? D*SK*H_K : 0);
+      buffer.block_Q.reset(size(shape_Q));
+      buffer.block_K.reset(size(shape_K));
+      buffer.block_V.reset(size(shape_V));
       buffer.block_O.reset(size(shape_O), kIsVarlen ? D*SQ*H : 0);
       buffer.block_LSE.reset(size(shape_LSE));
       buffer.block_ref_O.reset(size(shape_O), kIsVarlen ? D*SQ*H : 0);
@@ -850,7 +850,8 @@ struct MlaFwdRunner {
       flops *= static_cast<double>(size<3,1>(problem_shape));
     }
 
-    flops *= 2.0 * (std::is_same_v<ActiveMask, CausalMask<false>> ? 0.5 : 1.0);
+    flops *= 2.0 * (std::is_same_v<ActiveMask, CausalMask<false>> ||
+                    std::is_same_v<ActiveMask, CausalMask<true>> ? 0.5 : 1.0);
     flops *= static_cast<double>(size<3,0>(problem_shape));
 
     double flops0 = flops * static_cast<double>(size<2, 0>(problem_shape) + size<2, 1>(problem_shape));
```

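The FLOPs fix in the last hunk is easy to misread: the 0.5 causal discount now applies to both `CausalMask` specializations instead of only `CausalMask<false>`. A self-contained sketch of the intended factor (the helper and the stand-in mask type are illustrative, not code from this commit):

```cpp
#include <type_traits>

// Stand-in for the real mask type in fmha_fusion.hpp.
template <bool B> struct CausalMask {};

// A causal mask of either alignment computes roughly half of the
// S = Q*K^T score matrix, so the FLOP estimate is halved for both.
template <class ActiveMask>
constexpr double causal_flops_factor() {
  return (std::is_same_v<ActiveMask, CausalMask<false>> ||
          std::is_same_v<ActiveMask, CausalMask<true>>) ? 0.5 : 1.0;
}

static_assert(causal_flops_factor<CausalMask<true>>() == 0.5);
static_assert(causal_flops_factor<CausalMask<false>>() == 0.5);
static_assert(causal_flops_factor<int>() == 1.0);  // any non-causal mask
```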
fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/CMakeLists.txt

Lines changed: 98 additions & 2 deletions
```diff
@@ -33,12 +33,14 @@ set_property(
   77_blackwell_fmha_gen.cu
   77_blackwell_mla.cu
   77_blackwell_fmha_bwd.cu
+  77_blackwell_mla_fwd.cu
   PROPERTY
   COMPILE_FLAGS "--use_fast_math -ftemplate-backtrace-limit=0"
 )
 
 set(TEST_BASIC --b=1 --h=4 --q=512 --k=512 --d=128 --verify --mask=no)
-set(TEST_CAUSAL --b=1 --h=4 --q=512 --k=512 --d=128 --verify --mask=causal)
+set(TEST_CAUSAL_00 --b=1 --h=4 --q=512 --k=512 --d=128 --verify --mask=causal)
+set(TEST_CAUSAL_01 --verify --iterations=0 --b=1 --h=1 --h_k=1 --q=1013 --k=1024 --d=128 --mask=causal --causal-type=qend)
 set(TEST_VARLEN --b=1 --h=4 --q=512 --k=512 --d=128 --verify --mask=residual --varlen)
 set(TEST_HDIM64 --b=2 --h=4 --q=512 --k=512 --d=64 --verify)
 set(TEST_GQA --b=2 --h=4 --h_k=2 --q=512 --k=512 --d=64 --verify)
@@ -58,6 +60,41 @@ set(TEST_VARLEN_11 --verify --varlen --mask=causal,residual --d=64 --h=4 --h_k=2
 set(TEST_VARLEN_12 --verify --varlen --mask=causal,residual --d=64 --h=4 --h_k=4 --varlen-q=177:845 --varlen-k=257:766)
 set(TEST_VARLEN_13 --verify --varlen --mask=causal,residual --d=64 --h=4 --h_k=2 --varlen-q=177:366:479 --varlen-k=257:0:766)
 set(TEST_VARLEN_14 --verify --varlen --mask=causal,residual --d=64 --h=4 --h_k=4 --varlen-q=1 --varlen-k=1)
+set(TEST_VARLEN_15 --verify --varlen --mask=causal --causal-type=qbegin --d=128 --h=4 --h_k=4 --varlen-q=128 --varlen-k=128)
+set(TEST_VARLEN_16 --verify --varlen --mask=causal --causal-type=qbegin --d=128 --h=4 --h_k=4 --varlen-q=17 --varlen-k=257)
+set(TEST_VARLEN_17 --verify --varlen --mask=causal --causal-type=qbegin --d=128 --h=4 --h_k=4 --varlen-q=17 --varlen-k=25)
+set(TEST_VARLEN_18 --verify --varlen --mask=causal --causal-type=qend --d=128 --h=4 --h_k=4 --varlen-q=128 --varlen-k=128)
+set(TEST_VARLEN_19 --verify --varlen --mask=causal --causal-type=qend --d=128 --h=4 --h_k=4 --varlen-q=17 --varlen-k=257)
+set(TEST_VARLEN_20 --verify --varlen --mask=causal --causal-type=qend --d=128 --h=4 --h_k=4 --varlen-q=17 --varlen-k=25)
+set(TEST_VARLEN_21 --verify --varlen --mask=causal --causal-type=qend --d=128 --h=4 --h_k=4 --varlen-q=1013 --varlen-k=1024)
+set(TEST_VARLEN_22 --verify --varlen --mask=causal --causal-type=qend --d=128 --h=4 --h_k=4 --varlen-q=1024 --varlen-k=1035)
+
+
+
+set(TEST_MLA_FWD_VARLEN_00 --verify --varlen --mask=causal,residual --dl=128 --dr=64 --h=8 --h_k=4 --varlen-q=128 --varlen-k=128)
+set(TEST_MLA_FWD_VARLEN_01 --verify --varlen --mask=causal,residual --dl=128 --dr=64 --h=4 --h_k=4 --varlen-q=128 --varlen-k=128)
+set(TEST_MLA_FWD_VARLEN_02 --verify --varlen --mask=causal,residual --dl=128 --dr=64 --h=4 --h_k=2 --varlen-q=128 --varlen-k=128)
+set(TEST_MLA_FWD_VARLEN_03 --verify --varlen --mask=causal,residual --dl=128 --dr=64 --h=8 --h_k=8 --varlen-q=256:256 --varlen-k=512:512)
+set(TEST_MLA_FWD_VARLEN_04 --verify --varlen --mask=causal,residual --dl=128 --dr=64 --h=8 --h_k=4 --varlen-q=256:256 --varlen-k=512:512)
+set(TEST_MLA_FWD_VARLEN_05 --verify --varlen --mask=causal,residual --dl=128 --dr=64 --h=8 --h_k=1 --varlen-q=256:256 --varlen-k=512:512)
+set(TEST_MLA_FWD_VARLEN_06 --verify --varlen --mask=causal,residual --dl=128 --dr=64 --h=8 --h_k=2 --varlen-q=256:256:256:256 --varlen-k=256:768:512:512)
+set(TEST_MLA_FWD_VARLEN_07 --verify --varlen --mask=causal,residual --dl=128 --dr=64 --h=8 --h_k=2 --varlen-q=256:256:256:256 --varlen-k=256:0:1280:512)
+set(TEST_MLA_FWD_VARLEN_08 --verify --varlen --mask=causal,residual --dl=128 --dr=64 --h=8 --h_k=2 --varlen-q=256:0:512:256 --varlen-k=256:256:1024:512)
+set(TEST_MLA_FWD_VARLEN_09 --verify --varlen --mask=causal,residual --dl=128 --dr=64 --h=16 --h_k=16 --varlen-q=100:300 --varlen-k=100:300)
+set(TEST_MLA_FWD_VARLEN_10 --verify --varlen --mask=causal,residual --dl=128 --dr=64 --h=4 --h_k=4 --varlen-q=2:3 --varlen-k=2:5)
+set(TEST_MLA_FWD_VARLEN_11 --verify --varlen --mask=causal,residual --dl=128 --dr=64 --h=4 --h_k=2 --varlen-q=11:10 --varlen-k=13:10)
+set(TEST_MLA_FWD_VARLEN_12 --verify --varlen --mask=causal,residual --dl=128 --dr=64 --h=4 --h_k=4 --varlen-q=177:766 --varlen-k=257:845)
+set(TEST_MLA_FWD_VARLEN_13 --verify --varlen --mask=causal,residual --dl=128 --dr=64 --h=4 --h_k=2 --varlen-q=177:0:479 --varlen-k=257:0:766)
+set(TEST_MLA_FWD_VARLEN_14 --verify --varlen --mask=causal,residual --dl=128 --dr=64 --h=4 --h_k=4 --varlen-q=1 --varlen-k=1)
+set(TEST_MLA_FWD_VARLEN_15 --verify --varlen --mask=causal --causal-type=qbegin --d=128 --h=4 --h_k=4 --varlen-q=128 --varlen-k=128)
+set(TEST_MLA_FWD_VARLEN_16 --verify --varlen --mask=causal --causal-type=qbegin --d=128 --h=4 --h_k=4 --varlen-q=17 --varlen-k=257)
+set(TEST_MLA_FWD_VARLEN_17 --verify --varlen --mask=causal --causal-type=qbegin --d=128 --h=4 --h_k=4 --varlen-q=17 --varlen-k=25)
+set(TEST_MLA_FWD_VARLEN_18 --verify --varlen --mask=causal --causal-type=qend --d=128 --h=4 --h_k=4 --varlen-q=128 --varlen-k=128)
+set(TEST_MLA_FWD_VARLEN_19 --verify --varlen --mask=causal --causal-type=qend --d=128 --h=4 --h_k=4 --varlen-q=17 --varlen-k=257)
+set(TEST_MLA_FWD_VARLEN_20 --verify --varlen --mask=causal --causal-type=qend --d=128 --h=4 --h_k=4 --varlen-q=17 --varlen-k=25)
+set(TEST_MLA_FWD_VARLEN_21 --verify --varlen --mask=causal --causal-type=qend --d=128 --h=4 --h_k=4 --varlen-q=1013 --varlen-k=1024)
+set(TEST_MLA_FWD_VARLEN_22 --verify --varlen --mask=causal --causal-type=qend --d=128 --h=4 --h_k=4 --varlen-q=1024 --varlen-k=1035)
+
 
 set(TEST_GEN_BASIC --b=1 --h=4 --k=512 --d=128 --verify)
 set(TEST_GEN_VARLEN --b=1 --h=4 --k=512 --d=128 --verify --varlen)
@@ -67,6 +104,11 @@ set(TEST_GEN_REMAP --b=2 --h=4 --h_k=2 --k=512 --d=128 --verify --remap)
 set(TEST_GEN_CACHEONLY --b=2 --h=4 --h_k=2 --k=512 --d=128 --verify --cache-only)
 
 set(TEST_MLA_BASIC --b=1 --k=512 --page=128 --verify)
+set(TEST_BWD_MLA_BASIC --b=1 --h=4 --q=512 --k=512 --d=192 --d_vo=128 --verify --mask=no)
+set(TEST_BWD_MLA_VARLEN --b=1 --h=4 --q=512 --k=512 --d=192 --d_vo=128 --verify --mask=residual --varlen)
+
+set(TEST_MLA_SEP_REDUCTION --b=1 --k=4096 --split_kv=8 --page=128 --verify)
+set(TEST_MLA_FUSE_REDUCTION --b=1 --k=4096 --split_kv=8 --page=128 --fuse_reduction --verify)
 
 if(NOT WIN32 AND (NOT (CMAKE_CXX_COMPILER_ID MATCHES "Clang")) AND (CUTLASS_NVCC_ARCHS MATCHES 100a))
 
@@ -78,7 +120,8 @@ if(NOT WIN32 AND (NOT (CMAKE_CXX_COMPILER_ID MATCHES "Clang")) AND (CUTLASS_NVCC
     77_blackwell_fmha.cu
     TEST_COMMAND_OPTIONS
     TEST_BASIC
-    TEST_CAUSAL
+    TEST_CAUSAL_00
+    TEST_CAUSAL_01
     TEST_VARLEN
     TEST_HDIM64
     TEST_GQA
@@ -97,6 +140,14 @@ if(NOT WIN32 AND (NOT (CMAKE_CXX_COMPILER_ID MATCHES "Clang")) AND (CUTLASS_NVCC
     TEST_VARLEN_12
     TEST_VARLEN_13
     TEST_VARLEN_14
+    TEST_VARLEN_15
+    TEST_VARLEN_16
+    TEST_VARLEN_17
+    TEST_VARLEN_18
+    TEST_VARLEN_19
+    TEST_VARLEN_20
+    TEST_VARLEN_21
+    TEST_VARLEN_22
   )
   target_include_directories(77_blackwell_fmha_${PREC} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
   target_compile_definitions(77_blackwell_fmha_${PREC} PRIVATE ${PREC_MACRO})
@@ -120,6 +171,8 @@ if(NOT WIN32 AND (NOT (CMAKE_CXX_COMPILER_ID MATCHES "Clang")) AND (CUTLASS_NVCC
     77_blackwell_mla.cu
     TEST_COMMAND_OPTIONS
     TEST_MLA_BASIC
+    TEST_MLA_SEP_REDUCTION
+    TEST_MLA_FUSE_REDUCTION
   )
   target_include_directories(77_blackwell_mla_2sm_${PREC} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
   target_compile_definitions(77_blackwell_mla_2sm_${PREC} PRIVATE ${PREC_MACRO})
@@ -130,6 +183,8 @@ if(NOT WIN32 AND (NOT (CMAKE_CXX_COMPILER_ID MATCHES "Clang")) AND (CUTLASS_NVCC
     77_blackwell_mla.cu
     TEST_COMMAND_OPTIONS
     TEST_MLA_BASIC
+    TEST_MLA_SEP_REDUCTION
+    TEST_MLA_FUSE_REDUCTION
   )
   target_include_directories(77_blackwell_mla_2sm_cpasync_${PREC} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
   target_compile_definitions(77_blackwell_mla_2sm_cpasync_${PREC} PRIVATE ${PREC_MACRO} CPASYNC)
@@ -157,10 +212,49 @@ if(NOT WIN32 AND (NOT (CMAKE_CXX_COMPILER_ID MATCHES "Clang")) AND (CUTLASS_NVCC
     TEST_VARLEN_12
     TEST_VARLEN_13
     TEST_VARLEN_14
+    TEST_BWD_MLA_BASIC
+    TEST_BWD_MLA_VARLEN
   )
   target_include_directories(77_blackwell_fmha_bwd_${PREC} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
   target_compile_definitions(77_blackwell_fmha_bwd_${PREC} PRIVATE ${PREC_MACRO})
   target_compile_options(77_blackwell_fmha_bwd_${PREC} PRIVATE -Xptxas -v)
+
+  cutlass_example_add_executable(
+    77_blackwell_mla_fwd_${PREC}
+    77_blackwell_mla_fwd.cu
+    TEST_COMMAND_OPTIONS
+    TEST_BASIC
+    TEST_CAUSAL_00
+    TEST_VARLEN
+    TEST_HDIM64
+    TEST_GQA
+    TEST_MLA_FWD_VARLEN_00
+    TEST_MLA_FWD_VARLEN_01
+    TEST_MLA_FWD_VARLEN_02
+    TEST_MLA_FWD_VARLEN_03
+    TEST_MLA_FWD_VARLEN_04
+    TEST_MLA_FWD_VARLEN_05
+    TEST_MLA_FWD_VARLEN_06
+    TEST_MLA_FWD_VARLEN_07
+    TEST_MLA_FWD_VARLEN_08
+    TEST_MLA_FWD_VARLEN_09
+    TEST_MLA_FWD_VARLEN_10
+    TEST_MLA_FWD_VARLEN_11
+    TEST_MLA_FWD_VARLEN_12
+    TEST_MLA_FWD_VARLEN_13
+    TEST_MLA_FWD_VARLEN_14
+    TEST_MLA_FWD_VARLEN_15
+    TEST_MLA_FWD_VARLEN_16
+    TEST_MLA_FWD_VARLEN_17
+    TEST_MLA_FWD_VARLEN_18
+    TEST_MLA_FWD_VARLEN_19
+    TEST_MLA_FWD_VARLEN_20
+    TEST_MLA_FWD_VARLEN_21
+    TEST_MLA_FWD_VARLEN_22
+  )
+  target_include_directories(77_blackwell_mla_fwd_${PREC} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
+  target_compile_definitions(77_blackwell_mla_fwd_${PREC} PRIVATE ${PREC_MACRO})
+  target_compile_options(77_blackwell_mla_fwd_${PREC} PRIVATE -Xptxas -v)
 endforeach()
 
 # Add a target that builds all examples
@@ -176,5 +270,7 @@ if(NOT WIN32 AND (NOT (CMAKE_CXX_COMPILER_ID MATCHES "Clang")) AND (CUTLASS_NVCC
     77_blackwell_mla_2sm_cpasync_fp16
     77_blackwell_fmha_bwd_fp8
     77_blackwell_fmha_bwd_fp16
+    77_blackwell_mla_fwd_fp8
+    77_blackwell_mla_fwd_fp16
   )
 endif()
```

fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/README.md

Lines changed: 10 additions & 2 deletions
```diff
@@ -8,7 +8,7 @@ For generation usage, use an M-blocking (Num-Groups) of 128 (although the limit
 
 Context loads are done via TMA, whereas generation usage utilized `cp.async` and is thus more amenable to complex load patterns.
 
-For variable sequence lenght, the code requires a batch of valid (but never used) padding memory ahead of the first input batch. This is achieved with least overhead by leaving one batch free and then arranging QKV consecutively.
+For variable sequence length, the code requires a batch of valid (but never used) padding memory ahead of the first output batch. No padding is needed for the input tensor, but it requires that the input tensor contain no NaN or Inf values. Note that users should set `total_length` to the `problem_shape`.
 
 The approach of this implementation is to reuse the selection logic of the collective gemm builder and recombine the result into an FMHA kernel.
 The kernel and collective layer are then formulated to be fmha-specific.
@@ -37,13 +37,19 @@ There are three kernels to compute backwards:
 
 `Sm100FmhaBwdKernelTmaWarpSpecialized` is the main point of this sample, as it demonstrates how to use tensor cores to achieve a high performance fused kernel.
 
+## MLA Blackwell Backward
+
+The sample also provides the feature of MLA backward (d=192, d_vo=128). To enable MLA backward, please specify `--d=192 --d_vo=128` when running the bwd sample.
+
+`Sm100FmhaBwdMlaKernelTmaWarpSpecialized` is the main point for MLA backward. The MLA approach is slightly different from the original one to enable high performance with the MLA shape.
+
 # MLA Inference for Blackwell
 
 This sample provides code for fused multi-head latent attention inference in
 the weight-absorbed regime, i.e. for latent head dim 512, and rope head dim 64.
 It supports fp16, bf16, and fp8 input and output types.
 
-To accomodate the large output accumulator due to the large latent head dimension,
+To accommodate the large output accumulator due to the large latent head dimension,
 the sample demonstrates how to leverage 2Sm Blackwell tensor cores.
 
 Loading can be done via TMA (either without paging or with page size 128), or using `cp.async`
@@ -61,6 +67,8 @@ For detailed information on how to invoke them, check out either the tests in `C
 to simplify the sample, clarified that `fmha_gen` sample only supports head
 dim 128.
 
+* 4.3.0: For variable sequence length, the code requires a batch of valid (but never used) padding memory ahead of the first output batch. No padding is needed for the input tensor, but it requires that the input tensor contain no NaN or Inf values. Note that users should set `total_length` to the `problem_shape`.
+
 # Copyright
 
 Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
```
fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/collective/fmha_fusion.hpp

Lines changed: 3 additions & 4 deletions
```diff
@@ -317,9 +317,8 @@ struct CausalMask : NoMask {
     if constexpr (IsQBegin) {
       return std::min(trip_count, int(ceil_div(size<0>(tile_shape), size<1>(tile_shape))));
     } else {
-      // Local changes (to be upstreamed https://github.com/NVIDIA/cutlass/pull/2480)
-      const int corner_count = int((get<1>(problem_size) % get<1>(tile_shape) || get<0>(problem_size) % get<0>(tile_shape))) ;
-      return std::min(trip_count, int(ceil_div(get<0>(tile_shape), get<1>(tile_shape))) + corner_count);
+      const int offset_tile_q = (get<1>(problem_size) - get<0>(problem_size)) % get<1>(tile_shape);
+      return std::min(trip_count, int(ceil_div(get<0>(tile_shape) + offset_tile_q, get<1>(tile_shape))));
     }
   }
 
```
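To see what the new qend bound computes, it helps to plug in the shape that the new `TEST_CAUSAL_01` case exercises; the standalone sketch below mirrors the expression in the hunk:

```cpp
#include <cstdio>

// Worked example for the qend bound above (1013/1024 is the shape that
// TEST_CAUSAL_01 exercises). With qend alignment the causal diagonal is
// shifted by (K - Q) % tile_k, so a Q tile can straddle one extra K tile.
int main() {
  auto ceil_div = [](int a, int b) { return (a + b - 1) / b; };
  const int seqlen_q = 1013, seqlen_k = 1024;  // get<0>, get<1>(problem_size)
  const int tile_q = 128, tile_k = 128;        // get<0>, get<1>(tile_shape)
  const int offset_tile_q = (seqlen_k - seqlen_q) % tile_k;            // 11
  const int masked_trips  = ceil_div(tile_q + offset_tile_q, tile_k);  // 2
  std::printf("masked K-tile trips per Q tile: %d\n", masked_trips);
  return 0;
}
```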
```diff
@@ -676,7 +675,7 @@ struct LocalMaskForBackward : LocalMask<kIsQBegin>, ResidualMaskForBackward {
 };
 
 struct VariableLength {
-  int max_length = 0;
+  int max_length;
   int* cumulative_length = nullptr;
   int total_length = -1;
```

fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/collective/sm100_fmha_fwd_mainloop_tma_warpspecialized.hpp

Lines changed: 2 additions & 0 deletions
```diff
@@ -199,6 +199,7 @@ struct Sm100FmhaFwdMainloopTmaWarpspecialized {
     // scaling factor to quantize O
     float inv_scale_o = 1.0f;
 
+    // local changes
     int window_size_left = -1;
     int window_size_right = -1;
   };
@@ -211,6 +212,7 @@ struct Sm100FmhaFwdMainloopTmaWarpspecialized {
 
     float scale_output;
 
+    // local changes
    int window_size_left;
    int window_size_right;
   };
```

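`window_size_left` and `window_size_right` are the local-attention bounds threaded through these argument structs; the `-1` defaults read as the usual "unbounded on this side" convention, though that semantic is an assumption here rather than something the diff states. An illustrative setting (`mainloop_args` is hypothetical):

```cpp
// Hypothetical setup of the mainloop arguments shown above; assumes the
// common convention that -1 leaves a side unbounded. A sliding-window
// pattern bounds both sides of the diagonal:
mainloop_args.window_size_left  = 128;  // look back at most 128 tokens
mainloop_args.window_size_right = 0;    // nothing ahead of the diagonal
```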
fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/collective/sm100_fmha_gen_mainloop_warpspecialized.hpp

Lines changed: 3 additions & 1 deletion
```diff
@@ -86,6 +86,7 @@ struct Sm100FmhaGenMainloopWarpspecialized {
   using Mask = Mask_;
 
   static constexpr int StageCountQ = get<1>(TileShape{}) == 256 ? 1 : 2;
+  // local changes
   static constexpr int StageCountKV = StageCountQ * (sizeof(Element) == 1 ? 11 : 5) ;
 
   using StagesQ = cutlass::gemm::collective::StageCount<StageCountQ>;
@@ -540,6 +541,7 @@ struct Sm100FmhaGenMainloopWarpspecialized {
     tStS_P.data() = warp_uniform(uint32_t(stage == _0{} ? TmemAllocation::P0 : TmemAllocation::P1));
     Tensor tScS_P = tScS.compose(make_layout(make_shape(_128{}, tilePlikeFP32)));
 
+    // local changes
     // Each thread owns a single row
     using TMEM_LOAD = conditional_t<
       size<1>(TileShapeQK{}) < _128{},
@@ -802,7 +804,7 @@ struct Sm100FmhaGenMainloopWarpspecialized {
     // good values would be either 32 or 64
     const int kCorrectionTileSize = 32 / sizeof(ElementOut);
 
-    using TMEM_LOAD = std::conditional_t<kCorrectionTileSize == 32, SM100_TMEM_LOAD_32dp32b32x, SM100_TMEM_LOAD_32dp32b16x>; // 4x32 threads with 64 cols of 32b elem
+    using TMEM_LOAD = std::conditional_t<kCorrectionTileSize == 32, SM100_TMEM_LOAD_32dp32b32x, SM100_TMEM_LOAD_32dp32b16x>;  // 4x32 threads with 64 cols of 32b elem
 
     typename CollectiveMmaPV::TiledMma mma;
     Tensor tOtO = partition_fragment_C(mma, select<0,1>(TileShapePV{}));
```

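The staging constants in the first hunk reward a quick back-of-envelope check: one-byte elements leave enough shared memory to more than double the KV stage count. A small sketch (the helper is illustrative; the constants are the diff's own):

```cpp
// Mirrors StageCountKV = StageCountQ * (sizeof(Element) == 1 ? 11 : 5).
constexpr int stage_count_kv(int stage_count_q, int elem_bytes) {
  return stage_count_q * (elem_bytes == 1 ? 11 : 5);
}

static_assert(stage_count_kv(2, 1) == 22, "fp8, TileShape N != 256");
static_assert(stage_count_kv(2, 2) == 10, "fp16/bf16, TileShape N != 256");
static_assert(stage_count_kv(1, 1) == 11, "fp8, TileShape N == 256");
```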