Commit d4a3ff4

fix: correct PDL parameter handling in RopeQuantize kernel (#1982)
## 📌 Description

### 1. Fixed Parameter Alignment
- **Issue**: The `stream` argument was being passed to the wrong position in the `RopeQuantize` call because the `enable_pdl` parameter was missing; SGLang would hang before this PR.
- **Fix**: Added the `enable_pdl` parameter to the function signature and aligned all arguments accordingly.

### 2. Fixed PDL Launch Configuration
- **Issue**: With `enable_pdl=true`, the kernel would throw CUDA errors due to incorrect PDL attribute handling.
- **Fix**: Aligned the implementation with `csrc/fmhaReduction.cu`.

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [x] Tests have been added or updated as needed.
- [x] All tests are passing (`unittest`, etc.).

## Summary by CodeRabbit

* **New Features**
  * Added PDL (Programmatic Dependent Launch) benchmarking for rope quantization operations.
  * Extended configuration options to enable or disable PDL.
* **Tests**
  * Updated the test suite to cover rope quantization with PDL enabled and disabled.
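The core of the second fix is visible in the `include/flashinfer/pos_enc.cuh` diff below: instead of attaching the PDL launch attribute only when `enable_pdl` is true, the attribute is always attached and `enable_pdl` merely toggles its value, matching `csrc/fmhaReduction.cu`. In the old code the `cudaLaunchAttribute` array was declared inside the `if (enable_pdl)` block, so `config.attrs` pointed at an out-of-scope local by the time `cudaLaunchKernelEx` ran, which likely explains the CUDA errors. Below is a minimal, self-contained sketch of the adopted launch pattern; the kernel, grid/block dimensions, and the `launch_with_optional_pdl` wrapper are hypothetical and only illustrate the attribute handling (PDL itself takes effect only on GPUs that support Programmatic Dependent Launch):

```cuda
// Sketch only: mirrors the launch-attribute handling adopted in this commit.
#include <cuda_runtime.h>
#include <cstdio>

__global__ void dummy_kernel(float* out) {
  // Placeholder body; the real RopeQuantize kernel applies RoPE and FP8 quantization.
  out[threadIdx.x] = static_cast<float>(threadIdx.x);
}

cudaError_t launch_with_optional_pdl(float* out, bool enable_pdl, cudaStream_t stream) {
  // The attribute is always populated and stays in scope until launch;
  // enable_pdl only flips its value, so the launch path is identical either way.
  cudaLaunchAttribute attribute[1];
  attribute[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
  attribute[0].val.programmaticStreamSerializationAllowed = enable_pdl ? 1 : 0;

  cudaLaunchConfig_t config;
  config.gridDim = dim3(1);
  config.blockDim = dim3(32);
  config.dynamicSmemBytes = 0;
  config.stream = stream;
  config.attrs = attribute;
  config.numAttrs = 1;

  return cudaLaunchKernelEx(&config, dummy_kernel, out);
}

int main() {
  float* out = nullptr;
  cudaMalloc(&out, 32 * sizeof(float));
  cudaError_t status = launch_with_optional_pdl(out, /*enable_pdl=*/true, /*stream=*/nullptr);
  printf("launch status: %s\n", cudaGetErrorString(status));
  cudaDeviceSynchronize();
  cudaFree(out);
  return 0;
}
```

Keeping a single launch path (always setting `config.attrs` and `config.numAttrs`) also means the non-PDL case exercises the same code as the PDL case, which is the design the tests below parametrize over.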
1 parent d225332 commit d4a3ff4

File tree

6 files changed (+49, -20 lines)


benchmarks/bench_rope_quantize_fp8.py

Lines changed: 23 additions & 1 deletion
@@ -88,7 +88,7 @@ def _apply_rotary_emb(
     return torch.stack((o1, o2), dim=-1).flatten(-2)
 
 
-def benchmark_config(config_name, num_tokens, provider):
+def benchmark_config(config_name, num_tokens, provider, enable_pdl=False):
     """Benchmark a specific attention configuration."""
     input_dtype = torch.bfloat16
     device = "cuda"
@@ -177,6 +177,7 @@ def execute():
             k_nope_out=k_nope_out,
             quant_scale_q=1.0,
             quant_scale_kv=1.0,
+            enable_pdl=enable_pdl,
         )
 
         if mode_ncu and run_idx == 20:
@@ -278,6 +279,23 @@ def benchmark_mha(provider, num_tokens):
     return benchmark_config("mha", num_tokens, provider)
 
 
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["num_tokens"],
+        x_vals=[768] if mode_ncu else [1, 2, 4, 8, 16, 32, 64, 128, 256, 384, 512, 768],
+        line_arg="enable_pdl",
+        line_vals=[False, True],
+        line_names=["enable_pdl=False", "enable_pdl=True"],
+        styles=[("blue", "-"), ("red", "-")],
+        ylabel="Latency (ms)",
+        plot_name="rope-pdl-benchmark",
+        args={},
+    )
+)
+def benchmark_pdl(enable_pdl, num_tokens):
+    return benchmark_config("mla", num_tokens, "flashinfer", enable_pdl=enable_pdl)
+
+
 if __name__ == "__main__":
     # Run all benchmarks and generate individual plots
     print("Running MLA benchmark...")
@@ -289,6 +307,9 @@ def benchmark_mha(provider, num_tokens):
     print("Running MHA benchmark...")
     benchmark_mha.run(print_data=False, show_plots=True, save_path=".")
 
+    print("Running PDL benchmark...")
+    benchmark_pdl.run(print_data=False, show_plots=True, save_path=".")
+
     # Collect results for summary table
     token_counts = (
         [1, 2, 4, 8, 16, 32, 64, 128, 256, 384, 512, 768] if not mode_ncu else [768]
@@ -319,3 +340,4 @@ def benchmark_mha(provider, num_tokens):
     print(" mla-rope-benchmark.png (FlashInfer vs PyTorch)")
     print(" gqa-rope-benchmark.png (FlashInfer vs PyTorch)")
     print(" mha-rope-benchmark.png (FlashInfer vs PyTorch)")
+    print(" rope-pdl-benchmark.png (enable_pdl=False vs enable_pdl=True)")

csrc/flashinfer_rope_binding.cu

Lines changed: 2 additions & 2 deletions
@@ -42,8 +42,8 @@ void apply_rope_pos_ids_cos_sin_cache(TensorView q, TensorView k, TensorView q_r
 void rope_quantize(TensorView q_rope_in, TensorView k_rope_in, TensorView q_nope_in,
                    TensorView k_nope_in, TensorView q_rope_out, TensorView k_rope_out,
                    TensorView q_nope_out, TensorView k_nope_out, TensorView cos_sin_cache,
-                   TensorView pos_ids, double quant_scale_q, double quant_scale_kv,
-                   bool interleave);
+                   TensorView pos_ids, double quant_scale_q, double quant_scale_kv, bool interleave,
+                   bool enable_pdl);
 
 TVM_FFI_DLL_EXPORT_TYPED_FUNC(apply_rope, apply_rope);
 TVM_FFI_DLL_EXPORT_TYPED_FUNC(apply_llama31_rope, apply_llama31_rope);

csrc/rope.cu

Lines changed: 3 additions & 3 deletions
@@ -271,8 +271,8 @@ void apply_llama31_rope_pos_ids(TensorView q, TensorView k, TensorView q_rope, T
 void rope_quantize(TensorView q_rope_in, TensorView k_rope_in, TensorView q_nope_in,
                    TensorView k_nope_in, TensorView q_rope_out, TensorView k_rope_out,
                    TensorView q_nope_out, TensorView k_nope_out, TensorView cos_sin_cache,
-                   TensorView pos_ids, double quant_scale_q, double quant_scale_kv,
-                   bool interleave) {
+                   TensorView pos_ids, double quant_scale_q, double quant_scale_kv, bool interleave,
+                   bool enable_pdl) {
   CHECK_LAST_DIM_CONTIGUOUS_INPUT(q_rope_in);
   CHECK_LAST_DIM_CONTIGUOUS_INPUT(k_rope_in);
   CHECK_LAST_DIM_CONTIGUOUS_INPUT(q_nope_in);
@@ -411,7 +411,7 @@ void rope_quantize(TensorView q_rope_in, TensorView k_rope_in, TensorView q_nope
       q_nope_in_stride_h, q_rope_out_stride_n, q_rope_out_stride_h, q_nope_out_stride_n,
       q_nope_out_stride_h, k_rope_in_stride, k_rope_in_stride_h, k_nope_in_stride,
       k_nope_in_stride_h, k_rope_out_stride, k_rope_out_stride_h, k_nope_out_stride,
-      k_nope_out_stride_h, quant_scale_q, quant_scale_kv, interleave, stream);
+      k_nope_out_stride_h, quant_scale_q, quant_scale_kv, interleave, enable_pdl, stream);
 
   TVM_FFI_ICHECK(status == cudaSuccess)
       << "RopeQuantize failed with error code " << cudaGetErrorString(status);

flashinfer/rope.py

Lines changed: 9 additions & 0 deletions
@@ -181,6 +181,7 @@ def _rope_quantize(
     quant_scale_q: float,
     quant_scale_kv: float,
     interleave: bool,
+    enable_pdl: bool,
 ) -> None:
     r"""Custom operator that routes to the CUDA kernel implementation.
 
@@ -201,6 +202,7 @@ def _rope_quantize(
         quant_scale_q,
         quant_scale_kv,
         interleave,
+        enable_pdl,
     )
 
 
@@ -219,6 +221,7 @@ def _fake_rope_quantize(
     quant_scale_q: float,
     quant_scale_kv: float,
     interleave: bool,
+    enable_pdl: bool,
 ) -> None:
     pass
 
@@ -1159,6 +1162,7 @@ def mla_rope_quantize_fp8(
     k_rope_out: Optional[torch.Tensor] = None,
     q_nope_out: Optional[torch.Tensor] = None,
     k_nope_out: Optional[torch.Tensor] = None,
+    enable_pdl: bool = False,
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
     return rope_quantize_fp8(
         q_rope,
@@ -1175,6 +1179,7 @@ def mla_rope_quantize_fp8(
         k_rope_out,
         q_nope_out,
         k_nope_out,
+        enable_pdl,
     )
 
 
@@ -1193,6 +1198,7 @@ def rope_quantize_fp8(
     k_rope_out: Optional[torch.Tensor] = None,
     q_nope_out: Optional[torch.Tensor] = None,
     k_nope_out: Optional[torch.Tensor] = None,
+    enable_pdl: bool = False,
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
     r"""Apply RoPE (Rotary Positional Embeddings) and quantize to FP8 format.
 
@@ -1237,6 +1243,8 @@ def rope_quantize_fp8(
         Pre-allocated output tensor for quantized query (non-rotary). If ``None``, allocated automatically.
     k_nope_out : Optional[torch.Tensor]
        Pre-allocated output tensor for quantized key (non-rotary). If ``None``, allocated automatically.
+    enable_pdl : bool
+        Whether to enable PDL (Programmatic Dependent Launch). Default: ``False``.
 
     Returns
     -------
@@ -1291,6 +1299,7 @@ def rope_quantize_fp8(
         quant_scale_q,
         quant_scale_kv,
         not is_neox,  # interleave
+        enable_pdl,
     )
 
     return q_rope_out, k_rope_out, q_nope_out, k_nope_out

include/flashinfer/pos_enc.cuh

Lines changed: 6 additions & 14 deletions
@@ -813,24 +813,16 @@ cudaError_t RopeQuantize(
   dim3 nblks(nblks_x, total_blocks_y);
   dim3 nthrs(bdx, bdy);
 
+  cudaLaunchAttribute attribute[1];
+  attribute[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
+  attribute[0].val.programmaticStreamSerializationAllowed = enable_pdl ? 1 : 0;
   cudaLaunchConfig_t config;
   config.gridDim = nblks;
   config.blockDim = nthrs;
-  config.dynamicSmemBytes = 0;
   config.stream = stream;
-
-  if (enable_pdl) {
-    // PDL launch config
-    cudaLaunchAttribute attribute[1];
-    attribute[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
-    attribute[0].val.programmaticStreamSerializationAllowed = 1;
-    config.attrs = attribute;
-    config.numAttrs = 1;
-  } else {
-    // Regular launch config
-    config.attrs = nullptr;
-    config.numAttrs = 0;
-  }
+  config.dynamicSmemBytes = 0;
+  config.attrs = attribute;
+  config.numAttrs = 1;
 
   FLASHINFER_CUDA_CALL(cudaLaunchKernelEx(
       &config, kernel, q_rope_in, k_rope_in, q_nope_in, k_nope_in, q_rope_out, k_rope_out,

tests/attention/test_rope.py

Lines changed: 6 additions & 0 deletions
@@ -380,6 +380,7 @@ def test_rope_cos_sin_cache(
 @pytest.mark.parametrize("num_tokens", [1, 19, 128, 199, 899, 2047])
 @pytest.mark.parametrize("input_dtype", [torch.float16, torch.bfloat16])
 @pytest.mark.parametrize("quant_dtype", [torch.float8_e4m3fn, torch.float8_e5m2])
+@pytest.mark.parametrize("enable_pdl", [True, False])
 def test_generalized_rope_quantize(
     attention_type,
     num_qo_heads,
@@ -389,6 +390,7 @@ def test_generalized_rope_quantize(
     num_tokens,
     input_dtype,
     quant_dtype,
+    enable_pdl,
 ):
     """Test generalized rope + quantization for MLA, GQA, and MHA architectures."""
     device = "cuda:0"
@@ -459,6 +461,7 @@ def test_generalized_rope_quantize(
         k_nope_out=k_nope_out,
         quant_scale_q=1.0,
         quant_scale_kv=1.0,
+        enable_pdl=enable_pdl,
     )
 
     # Verify results
@@ -481,10 +484,12 @@
 @pytest.mark.parametrize("num_tokens", [1, 19, 128, 199, 899, 2047])
 @pytest.mark.parametrize("input_dtype", [torch.float16, torch.bfloat16])
 @pytest.mark.parametrize("quant_dtype", [torch.float8_e4m3fn, torch.float8_e5m2])
+@pytest.mark.parametrize("enable_pdl", [True, False])
 def test_mla_rope_quantize(
     num_tokens,
     input_dtype,
     quant_dtype,
+    enable_pdl,
 ):
     device = "cuda:0"
     num_qo_heads = 128
@@ -525,6 +530,7 @@ def test_mla_rope_quantize(
         k_nope_out=k_out[..., 64:],
         quant_scale_q=1.0,
         quant_scale_kv=1.0,
+        enable_pdl=enable_pdl,
     )
 
     torch.testing.assert_close(
