
Commit 9ce1af7

qsang-nv and yzh119 authored
minor fix for xqa (#1994)
## 📌 Description

1. Change xqa_mla comments to be consistent with MLA instead of MHA.
2. Move cudaMemcpyFromSymbol/cudaFuncSetAttribute outside of the launch function to avoid breaking CUDA graph capture.
3. Use int32 as the page table index.

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [x] Tests have been added or updated as needed.
- [ ] All tests are passing (`unittest`, etc.).

## Summary by CodeRabbit

* **New Features**
  * Added MLA variant documentation clarifying the SM120 GPU requirement and the fixed head group ratio configuration.
* **Documentation**
  * Updated data type specifications for XQA operations; the page table now requires int32 instead of uint32.
  * Added max sequence length derivation notes for page-table-based configurations.
  * Clarified MLA variant input/output data types (float8_e4m3fn and bfloat16).
* **Bug Fixes**
  * Corrected data type handling in page table processing to ensure compatibility.

Signed-off-by: Qidi Sang <[email protected]>
Co-authored-by: yzh119 <[email protected]>
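For readers following change 2 above: the diffs below move the one-time shared-memory configuration out of the launch functions and into namespace-scope static initializers, since host-synchronous runtime calls such as cudaMemcpyFromSymbol are not allowed while a stream is being captured into a CUDA graph. Below is a minimal, self-contained sketch of the same pattern; `toyKernel`, `launchToy`, and the 64 KiB size are illustrative stand-ins, not FlashInfer's actual symbols.

```cuda
#include <cuda_runtime.h>
#include <cstdint>

// Device-side symbol holding the dynamic shared memory size the kernel needs.
__device__ uint32_t const smemSize = 64 * 1024;

__global__ void toyKernel(float* out) {
  extern __shared__ float smem[];
  smem[threadIdx.x] = static_cast<float>(threadIdx.x);
  __syncthreads();
  out[threadIdx.x] = smem[threadIdx.x];
}

// Runs once at program load, before any stream capture can be active.
// cudaMemcpyFromSymbol is a host-synchronous call, so doing it lazily inside
// the launch function would invalidate an ongoing CUDA graph capture.
static uint32_t configureToyKernel() {
  uint32_t size = 0;
  cudaMemcpyFromSymbol(&size, smemSize, sizeof(smemSize));
  // Opt in to more than 48 KiB of dynamic shared memory (on GPUs that support
  // it); this configuration call also belongs outside the capture path.
  cudaFuncSetAttribute(toyKernel, cudaFuncAttributeMaxDynamicSharedMemorySize, size);
  return size;
}
static uint32_t const hostSmemSize = configureToyKernel();

// The launch path now only enqueues the kernel, so it is safe to capture
// between cudaStreamBeginCapture and cudaStreamEndCapture.
void launchToy(float* out, cudaStream_t stream) {
  toyKernel<<<1, 128, hostSmemSize, stream>>>(out);
}
```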
1 parent 7d9d7af commit 9ce1af7

5 files changed: +42 −36 lines changed


csrc/xqa/mha.cu

Lines changed: 9 additions & 7 deletions
@@ -2655,6 +2655,15 @@ void launchMHA(
 }
 #endif
 
+static uint32_t configureKernel() {
+  uint32_t size;
+  cudaMemcpyFromSymbol(&size, smemSize, sizeof(smemSize));
+  cudaFuncSetAttribute(kernel_mha, cudaFuncAttributeMaxDynamicSharedMemorySize, size);
+  return size;
+}
+
+static uint32_t const hostSmemSize = configureKernel();
+
 void launchMHAFlashInfer(uint32_t multiProcessorCount, uint32_t nbKHeads, uint32_t slidingWinSize,
                          float qScale, OutputHead* output,
 #if LOW_PREC_OUTPUT
@@ -2673,13 +2682,6 @@ void launchMHAFlashInfer(uint32_t multiProcessorCount, uint32_t nbKHeads, uint32
                          uint32_t qSeqLen, uint32_t const* qCuSeqLens, MaskType const* mask,
 #endif
                          uint32_t* semaphores, void* scratch, cudaStream_t stream) {
-  static uint32_t const hostSmemSize = [&]() {
-    uint32_t size;
-    checkCuda(cudaMemcpyFromSymbol(&size, smemSize, sizeof(smemSize)));
-    checkCuda(cudaFuncSetAttribute(kernel_mha, cudaFuncAttributeMaxDynamicSharedMemorySize, size));
-    return size;
-  }();
-
   uint32_t const nbSubSeqPerSeq = [&]() -> uint32_t {
     if (!allowMultiBlockMode) {
       return 1;
csrc/xqa/mha_sm90.cu

Lines changed: 9 additions & 6 deletions
@@ -3165,6 +3165,15 @@ void launchHopperF8MHA(
 }
 #endif
 
+static uint32_t configureKernel() {
+  uint32_t size;
+  cudaMemcpyFromSymbol(&size, smemSize, sizeof(smemSize));
+  cudaFuncSetAttribute(kernel_mha, cudaFuncAttributeMaxDynamicSharedMemorySize, size);
+  return size;
+}
+
+static uint32_t const hostSmemSize = configureKernel();
+
 void launchHopperF8MHAFlashInfer(uint32_t multiProcessorCount, uint32_t nbKHeads,
                                  uint32_t slidingWinSize, float qScale, OutputHead* output,
 #if LOW_PREC_OUTPUT
@@ -3183,12 +3192,6 @@ void launchHopperF8MHAFlashInfer(uint32_t multiProcessorCount, uint32_t nbKHeads
                                  uint32_t qSeqLen, uint32_t const* qCuSeqLens, MaskType const* mask,
 #endif
                                  uint32_t* semaphores, void* scratch, cudaStream_t stream) {
-  static uint32_t const hostSmemSize = [&]() {
-    uint32_t size;
-    checkCuda(cudaMemcpyFromSymbol(&size, smemSize, sizeof(smemSize)));
-    checkCuda(cudaFuncSetAttribute(kernel_mha, cudaFuncAttributeMaxDynamicSharedMemorySize, size));
-    return size;
-  }();
   uint32_t const nbSubSeqPerSeq = [&]() -> uint32_t {
     float const factor = 0.25f;
     return mha::min<uint32_t>(

csrc/xqa/mla_sm120.cu

Lines changed: 9 additions & 7 deletions
@@ -1835,6 +1835,15 @@ void launchMLA(
 #endif
 }
 
+static uint32_t configureKernel() {
+  uint32_t size;
+  cudaMemcpyFromSymbol(&size, smemSize, sizeof(smemSize));
+  cudaFuncSetAttribute(kernel_mha, cudaFuncAttributeMaxDynamicSharedMemorySize, size);
+  return size;
+}
+
+static uint32_t const hostSmemSize = configureKernel();
+
 void launchMLAFlashInfer(
     uint32_t multiProcessorCount,
     uint32_t inputSeqLen,  // uniform for all requests and causal mask is assumed
@@ -1860,13 +1869,6 @@ void launchMLAFlashInfer(
   if (beamWidth != 1) {
     throw std::runtime_error("not implemented");
   }
-  static uint32_t const hostSmemSize = [&]() {
-    // printf("smemSize = %u\n", smemSize);
-    uint32_t size;
-    checkCuda(cudaMemcpyFromSymbol(&size, smemSize, sizeof(smemSize)));
-    checkCuda(cudaFuncSetAttribute(kernel_mha, cudaFuncAttributeMaxDynamicSharedMemorySize, size));
-    return size;
-  }();
   uint32_t const nbKHeads = 1;
   uint32_t const nbVHeads = nbKHeads;
   uint32_t const nbQHeads = nbKHeads * headGrpSize;

flashinfer/xqa.py

Lines changed: 11 additions & 12 deletions
@@ -153,7 +153,7 @@ def xqa(
         Should be the same data type as k_cache.
     page_table : torch.Tensor
         Page table tensor with shape ``batch_size, nb_pages_per_seq``.
-        Data type should be torch.uint32.
+        Data type should be torch.int32.
         K and V share the same table.
     seq_lens : torch.Tensor
         Sequence lengths tensor with shape ``[batch_size, beam_width]``.
@@ -195,6 +195,7 @@
     - input_dtype from q.dtype
     - kv_cache_dtype from k.dtype
     - head_group_ratio from num_q_heads // num_kv_heads
+    - max_seq_len from page_table.shape[-1] * page_size
     """
     # Handle optional parameters
     if sm_count is None:
@@ -352,31 +353,29 @@ def xqa_mla(
     kv_scale: Optional[torch.Tensor] = None,
     sm_count: Optional[int] = None,
 ) -> None:
-    r"""Apply attention with paged KV cache using XQA kernel.
+    r"""Apply attention with paged KV cache using XQA MLA (Multi-Head Latent Attention) kernel.
     Parameters
     ----------
     q : torch.Tensor
         Query tensor with shape ``[batch_size, beam_width, num_q_heads, head_dim]``.
-        Data type should be torch.float16 or torch.bfloat16.
+        Data type should be torch.float8_e4m3fn.
         Now only beam_width 1 is supported.
     k_cache: torch.Tensor
         Paged K cache tensor with shape ``[total_num_cache_heads, head_dim]``.
-        Data type should match query tensor or be torch.float8_e4m3fn, in which case xqa will run fp8 calculation.
-        Should be the same data type as v_cache.
+        Data type should be torch.float8_e4m3fn
     v_cache: torch.Tensor
         Paged V cache tensor with shape ``[total_num_cache_heads, head_dim]``.
-        Data type should match query tensor or be torch.float8_e4m3fn, in which case xqa will run fp8 calculation.
-        Should be the same data type as k_cache.
+        Data type should be torch.float8_e4m3fn
     page_table : torch.Tensor
         Page table tensor with shape ``batch_size, nb_pages_per_seq``.
-        Data type should be torch.uint32.
+        Data type should be torch.int32.
         K and V share the same table.
     seq_lens : torch.Tensor
         Sequence lengths tensor with shape ``[batch_size, beam_width]``.
         Data type should be torch.uint32.
     output : torch.Tensor
         Output tensor with shape ``[batch_size, beam_width, num_q_heads, head_dim]``.
-        Data type should match query tensor. This tensor will be modified in-place.
+        Data type should be torch.bfloat16. This tensor will be modified in-place.
     workspace_buffer : torch.Tensor
         Workspace buffer for temporary computations.
         Data type should be torch.uint8.
@@ -399,8 +398,8 @@
     The function automatically infers several parameters from tensor shapes:
     - batch_size from q.shape[0]
     - head_dim from q.shape[-1]
-    - input_dtype from q.dtype
-    - kv_cache_dtype from k.dtype
+    - head_group_ratio is fixed to 128 for MLA
+    - max_seq_len from page_table.shape[-1] * page_size
     """
     # Handle optional parameters
     if sm_count is None:
@@ -423,7 +422,7 @@
     assert k_cache.dtype == v_cache.dtype, "K and V cache must have the same dtype"
 
     if get_compute_capability(torch.device(device="cuda"))[0] not in [12]:
-        raise RuntimeError("XQA is only supported on SM120 GPUs")
+        raise RuntimeError("XQA MLA is only supported on SM120 GPUs")
 
     xqa_module = get_xqa_module_mla(
         q.dtype,
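The docstring and dtype changes above boil down to two conventions: page indices are plain int32, and the longest addressable sequence follows from the page-table width times the page size (max_seq_len = page_table.shape[-1] * page_size). The sketch below mirrors both in a toy paged-K gather; the kernel, names, and page layout are illustrative assumptions, not FlashInfer's actual implementation.

```cuda
#include <cuda_runtime.h>
#include <cstdint>

// Toy paged-K gather. pageTable holds int32 page ids (matching the new
// torch.int32 requirement) with shape [batchSize, nbPagesPerSeq].
// Assumed launch shape: grid = (seqLen, batchSize), block = headDim.
__global__ void gatherPagedK(float const* kPool, int32_t const* pageTable,
                             uint32_t nbPagesPerSeq, uint32_t pageSize,
                             uint32_t headDim, float* out) {
  uint32_t const batch = blockIdx.y;
  uint32_t const token = blockIdx.x;  // token position within the sequence
  int32_t const page = pageTable[batch * nbPagesPerSeq + token / pageSize];
  uint32_t const slot = token % pageSize;
  // Each page stores pageSize tokens of headDim floats, contiguously.
  float const* src = kPool + (uint64_t(page) * pageSize + slot) * headDim;
  out[(uint64_t(batch) * gridDim.x + token) * headDim + threadIdx.x] = src[threadIdx.x];
}

// Host side: the maximum sequence length implied by the page table, as the
// updated docstrings describe it.
uint32_t maxSeqLen(uint32_t nbPagesPerSeq, uint32_t pageSize) {
  return nbPagesPerSeq * pageSize;
}
```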

tests/attention/test_xqa.py

Lines changed: 4 additions & 4 deletions
@@ -253,7 +253,7 @@ def test_xqa(
     cache_k_heads /= 4.0
     cache_v_heads /= 4.0
     page_list_arg = torch.zeros(
-        batch_size, nb_pages_per_seq, dtype=torch.uint32, device="cuda"
+        batch_size, nb_pages_per_seq, dtype=torch.int32, device="cuda"
     )
 
     # Initialize page list sequentially
@@ -265,7 +265,7 @@
 
     flattened = page_list_arg.flatten()
     indices = torch.randperm(flattened.numel())
-    shuffled_flat = flattened.to(torch.int32)[indices].to(torch.uint32)
+    shuffled_flat = flattened[indices]
     page_list_arg = shuffled_flat.view(page_list_arg.shape)
 
     def cache_head_at(
@@ -470,7 +470,7 @@ def test_xqa_mla(
     cache_v_heads /= 4.0
 
     page_list_arg = torch.zeros(
-        batch_size, nb_pages_per_seq, dtype=torch.uint32, device="cuda"
+        batch_size, nb_pages_per_seq, dtype=torch.int32, device="cuda"
     )
 
     # Initialize page list sequentially
@@ -482,7 +482,7 @@
 
     flattened = page_list_arg.flatten()
     indices = torch.randperm(flattened.numel())
-    shuffled_flat = flattened.to(torch.int32)[indices].to(torch.uint32)
+    shuffled_flat = flattened[indices]
     page_list_arg = shuffled_flat.view(page_list_arg.shape)
 
     def cache_head_at(
