Commit 309e0b8

rootwenscarl authored and committed

wip
able to run ok

1 parent ec6acfd commit 309e0b8

7 files changed: +139 −65 lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion

@@ -1007,7 +1007,7 @@ endif()
 # For CUDA we also build and ship some external projects.
 if (VLLM_GPU_LANG STREQUAL "CUDA")
   include(cmake/external_projects/flashmla.cmake)
-  include(cmake/external_projects/qutlass.cmake)
+  # include(cmake/external_projects/qutlass.cmake)

   # vllm-flash-attn should be last as it overwrites some CMake functions
   include(cmake/external_projects/vllm_flash_attn.cmake)

setup.py

Lines changed: 1 addition & 1 deletion

@@ -636,7 +636,7 @@ def _read_requirements(filename: str) -> list[str]:
         ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C"))
         if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.3"):
             # FA3 requires CUDA 12.3 or later
-            ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
+            # ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
         # Optional since this doesn't get built (produce an .so file) when
         # not targeting a hopper system
         ext_modules.append(CMakeExtension(name="vllm._flashmla_C", optional=True))

tests/kernels/moe/test_cutedsl_moe.py

Lines changed: 10 additions & 9 deletions

@@ -9,10 +9,11 @@
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe.flashinfer_cutedsl_moe import (
     flashinfer_cutedsl_moe_masked,
-    scaled_fp4_grouped_quant,
 )
 from vllm.utils.flashinfer import (
     flashinfer_cutedsl_grouped_gemm_nt_masked as cutedsl_gmm_masked,
+    scaled_fp4_grouped_quantize,
+    silu_and_mul_scaled_nvfp4_experts_quantize,
 )

 if torch.cuda.get_device_capability() < (10, 0):

@@ -219,16 +220,16 @@ def flashinfer_cutedsl_grouped_gemm_nt_masked(
 ):
     # hidden_states: [l, m, k]
     # weights: [l, n, k]
-    aq, aq_sf = scaled_fp4_grouped_quant(
+    aq, aq_sf = scaled_fp4_grouped_quantize(
         hidden_states,
-        input_global_scale,
         masked_m.to(hidden_states.device),
+        input_global_scale,
     )
     num_experts, n, k = weights.shape
-    bq, bq_sf = scaled_fp4_grouped_quant(
+    bq, bq_sf = scaled_fp4_grouped_quantize(
         weights,
-        w_global_scale,
         torch.full((num_experts,), n, device=weights.device, dtype=torch.int32),
+        w_global_scale,
     )

     out = torch.zeros(

@@ -316,15 +317,15 @@ def test_flashinfer_cutedsl_moe_masked(
         (num_experts,), dtype=torch.float32, device=hidden_states.device
     ) # assume intermediate scale is 1.0

-    w1_fp4, w1_blockscale = scaled_fp4_grouped_quant(
+    w1_fp4, w1_blockscale = scaled_fp4_grouped_quantize(
         w1,
+        torch.ones(num_experts, dtype=torch.int32, device=w1.device) * 2 * inter_dim,
         w1_global_scale,
-        torch.ones(num_experts, dtype=torch.int32, device=w1.device) * 2 * inter_dim,
     )
-    w2_fp4, w2_blockscale = scaled_fp4_grouped_quant(
+    w2_fp4, w2_blockscale = scaled_fp4_grouped_quantize(
         w2,
+        torch.ones(num_experts, dtype=torch.int32, device=w2.device) * hidden_dim,
         w2_global_scale,
-        torch.ones(num_experts, dtype=torch.int32, device=w2.device) * hidden_dim,
     )

     w1_alpha = 1.0 / (input_global_scale * w1_global_scale)
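
Note: the test now imports the quantize helper from vllm.utils.flashinfer rather than from flashinfer_cutedsl_moe, and masked_m moves ahead of the global scale in the call. Below is a minimal compatibility shim (hypothetical, not part of this commit) that maps the old call order onto the new helper, assuming the (tensor, masked_m, global_scale) order shown in the hunks above.

import torch

from vllm.utils.flashinfer import scaled_fp4_grouped_quantize


def scaled_fp4_grouped_quant_compat(
    tensor: torch.Tensor,
    global_scale: torch.Tensor,
    masked_m: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]:
    # Old-style (tensor, global_scale, masked_m) order, forwarded to the
    # renamed helper, which now expects masked_m before the global scale.
    return scaled_fp4_grouped_quantize(tensor, masked_m, global_scale)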

vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py

Lines changed: 57 additions & 27 deletions

@@ -111,33 +111,46 @@ def _do_quant(
             x_fp8, x_scales = x
             x = dequant_fp8(x_fp8, x_scales).to(dtype=a1_dtype)

-        assert isinstance(x, torch.Tensor)
-
-        num_experts, max_tokens, hidden_dim = x.size()
-
-        # TODO (varun): Optimization - Use a batched version of quant
-        x = x.view((-1, hidden_dim))
+        assert isinstance(x, (torch.Tensor, tuple))
         q_dtype = quant_config.quant_dtype

-        if envs.VLLM_FLASHINFER_MOE_BACKEND == "cutedsl":
-            logger.info_once(
-                "Skip quantization when using FlashInfer CUTEDSL for "
-                "ModelOptNvFp4FusedMoE."
+        if q_dtype == "nvfp4":
+            assert isinstance(x, tuple)
+            # num_experts, max_tokens, hidden_dim_by_4 = x[0].size()
+            # print(f"nvfp4 quantization input shape: {x[0].size()}, {x[0].dtype}, {x[0].is_contiguous()}")
+            # print(f"nvfp4 quantization input shape: {x[1].size()}, {x[1].dtype}, {x[1].is_contiguous()}")
+            # print("after permute")
+            x_scales = x[1]
+            x = x[0].permute(2, 0, 1)
+            # print(f"nvfp4 quantization input shape: {x.size()}, {x.dtype}, {x.is_contiguous()}")
+            num_experts, max_tokens, hidden_dim_by_2 = x.shape
+            hidden_dim = hidden_dim_by_2 * 2
+            assert(envs.VLLM_FLASHINFER_MOE_BACKEND == "cutedsl")
+            logger.info_once("skip nvfp4 quant since done by deepep!!")
+            # logger.info_once(
+            #     "Skip quantization when using FlashInfer CUTEDSL for "
+            #     "ModelOptNvFp4FusedMoE."
+            # )
+        else:
+            assert isinstance(x, torch.Tensor)
+            num_experts, max_tokens, hidden_dim = x.size()
+
+            # TODO (varun): Optimization - Use a batched version of quant
+            x = x.view((-1, hidden_dim))
+            x, x_scales = moe_kernel_quantize_input(
+                x,
+                quant_config.a1_scale,
+                q_dtype,
+                quant_config.per_act_token_quant,
+                quant_config.block_shape,
             )
-            q_dtype = None
-
-        x, x_scales = moe_kernel_quantize_input(
-            x,
-            quant_config.a1_scale,
-            q_dtype,
-            quant_config.per_act_token_quant,
-            quant_config.block_shape,
-        )
-        x = x.view((num_experts, -1, hidden_dim))
+            x = x.view((num_experts, -1, hidden_dim))
+

         if q_dtype is not None:
             assert x_scales is not None
-            x_scales = normalize_batched_scales_shape(x_scales, num_experts)
+            if q_dtype != "nvfp4":
+                x_scales = normalize_batched_scales_shape(x_scales, num_experts)

         return x, x_scales
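
Note: the nvfp4 branch of _do_quant above assumes the dispatched activations arrive already packed as FP4, two values per uint8 byte, with the expert dimension last until the permute. A standalone sketch of that shape bookkeeping with dummy tensors follows; the shapes and the blockscale layout are assumptions for illustration, not taken from DeepEP.

import torch

num_experts, max_tokens, hidden_dim = 4, 8, 256

# Dispatched payload as the branch assumes it: packed fp4 bytes plus
# per-16-element e4m3 blockscales, expert dimension last before the permute.
packed = torch.zeros(max_tokens, hidden_dim // 2, num_experts, dtype=torch.uint8)
blockscales = torch.zeros(max_tokens, hidden_dim // 16, num_experts).to(torch.float8_e4m3fn)

x = packed.permute(2, 0, 1)  # -> [num_experts, max_tokens, hidden_dim // 2]
_, _, hidden_dim_by_2 = x.shape
assert hidden_dim_by_2 * 2 == hidden_dim  # logical width recovered from packed bytes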

@@ -167,18 +180,26 @@ def prepare_async(
             "DeepEP kernels quantize the inputs in blocks of shape 128"
         )

+        use_nvfp4 = False
+        # print("mm"*100, quant_config.quant_dtype)
+        if quant_config.quant_dtype == "nvfp4":
+            # print("gg"*100)
+            # print(quant_config.a1_gscale)
+            use_nvfp4 = True
+        qc_a1_gscale_or_scale = quant_config.a1_gscale if quant_config.quant_dtype == "nvfp4" else quant_config.a1_scale
         has_per_token_scales = (
-            quant_config.a1_scale.numel() != 1
-            if quant_config.a1_scale is not None
+            qc_a1_gscale_or_scale.numel() != 1
+            if qc_a1_gscale_or_scale is not None
             else (
                 quant_config.a2_scale.numel() != 1
                 if quant_config.a2_scale is not None
                 else False
             )
         )
-        assert not has_per_token_scales, (
-            "low_latency kernels doesn't support dispatching per-token scales"
-        )
+        if not use_nvfp4:
+            assert not has_per_token_scales, (
+                "low_latency kernels doesn't support dispatching per-token scales"
+            )

         if apply_router_weight_on_input:
             topk = topk_ids.size(1)

@@ -189,12 +210,19 @@ def prepare_async(
             a1 = a1 * topk_weights.to(a1.dtype)

         # Dispatch
+        # print("qwerwqrq"*100, use_nvfp4, qc_a1_gscale_or_scale.shape, a1.shape, a1.dtype)
         expert_x, expert_num_tokens, handle, _, hook = self.buffer.low_latency_dispatch(
             a1,
             topk_ids,
             self.max_tokens_per_rank,
             num_experts,
             use_fp8=self.use_fp8_dispatch,
+            **(dict(use_nvfp4=True) if use_nvfp4 else dict()),
+            **(
+                dict(x_global_scale=qc_a1_gscale_or_scale)
+                if qc_a1_gscale_or_scale is not None
+                else dict()
+            ),
             async_finish=False,
             return_recv_hook=True,
         )

@@ -220,7 +248,7 @@ def _receiver(
         quant_config: FusedMoEQuantConfig,
     ) -> mk.PrepareResultType:
         expert_x, expert_x_scale = self._do_quant(expert_x, a1_dtype, quant_config)
-
+
         expert_tokens_meta = mk.ExpertTokensMetadata(
             expert_num_tokens=expert_num_tokens, expert_num_tokens_cpu=None
         )

@@ -275,6 +303,8 @@ def _finalize(

         # TODO (varun) : Enable zero copy mode
         dbo_maybe_run_recv_hook()
+        # print("combine"*100)
+        # print(fused_expert_output.shape, fused_expert_output.dtype)
         _, _, recv_hook = self.buffer.low_latency_combine(
             fused_expert_output,
             topk_ids,
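
Note: prepare_async forwards use_nvfp4 and x_global_scale through conditional dict unpacking, so the keywords are only passed when the nvfp4 path is active. An isolated sketch of that pattern against a stub callee follows; the stub is hypothetical, the real call targets DeepEP's Buffer.low_latency_dispatch.

from typing import Any


def low_latency_dispatch_stub(x: str, *, use_fp8: bool = False, **extra: Any) -> None:
    # Stand-in for the real dispatch entry point; just shows which kwargs arrive.
    print(f"use_fp8={use_fp8}, extra={extra}")


use_nvfp4 = True
x_global_scale = 1.0  # stand-in for quant_config.a1_gscale

low_latency_dispatch_stub(
    "tokens",
    use_fp8=False,
    **(dict(use_nvfp4=True) if use_nvfp4 else dict()),
    **(dict(x_global_scale=x_global_scale) if x_global_scale is not None else dict()),
)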

vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py

Lines changed: 62 additions & 20 deletions

@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
+from typing import Union
 import torch

 import vllm.model_executor.layers.fused_moe.modular_kernel as mk

@@ -18,6 +18,7 @@

 logger = init_logger(__name__)

+CUTEDSL_MOE_NVFP4_DISPATCH = True

 def is_valid_flashinfer_cutedsl_fused_moe(
     hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor

@@ -109,7 +110,12 @@ def workspace_shapes(
         - Note: in order for activation chunking to work, the first dimension
           of each tuple must be the number of tokens.
         """
-        output_shape = (local_num_experts, M, K)
+        if CUTEDSL_MOE_NVFP4_DISPATCH:
+            # since it sees quantized a1q
+            K_dim = K * 2
+        else:
+            K_dim = K
+        output_shape = (local_num_experts, M, K_dim)
         workspace2 = (local_num_experts, M, N)
         workspace1 = output_shape
         return (workspace1, workspace2, output_shape)

@@ -145,9 +151,10 @@ def apply(
         assert self.w1_scale.ndim == 3
         assert self.w2_scale.ndim == 3

+        # TODO(shuw): replace True by CUTEDSL_MOE_NVFP4_DISPATCH
         flashinfer_cutedsl_moe_masked(
-            hidden_states=hidden_states,
-            input_global_scale=self.a1_gscale,
+            hidden_states=(hidden_states, a1q_scale) if CUTEDSL_MOE_NVFP4_DISPATCH else hidden_states,
+            input_global_scale=(None if CUTEDSL_MOE_NVFP4_DISPATCH else self.a1_gscale),
             w1=w1,
             w1_blockscale=self.w1_scale,
             w1_alpha=self.g1_alphas,

@@ -173,7 +180,7 @@ def get_cute_dtype(input: torch.Tensor) -> str:


 def flashinfer_cutedsl_moe_masked(
-    hidden_states: torch.Tensor,
+    hidden_states: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
     input_global_scale: torch.Tensor,
     w1: torch.Tensor,
     w1_blockscale: torch.Tensor,

@@ -191,7 +198,9 @@ def flashinfer_cutedsl_moe_masked(
     kernels.

     Args:
-        hidden_states (torch.Tensor): [num_experts, m, k], bf16
+        hidden_states: Either of the following case
+            * torch.Tensor: [num_experts, m, k], bf16
+            * tuple[torch.Tensor, torch.Tensor]: [num_experts, m, k // 2], uint8, [num_experts, m, k // 16], float8_e4m3fn
         input_global_scale (torch.Tensor): (l,)
         w1 (torch.Tensor): fp4 weights, [l, 2 * n, k // 2], uint8
         w1_blockscale (torch.Tensor): blockscale factors, e4m3,

@@ -208,9 +217,9 @@ def flashinfer_cutedsl_moe_masked(
     """

     # === Assertions on dtypes ===
-    assert input_global_scale.dtype == torch.float32, (
-        f"input_global_scale must be float32, got {input_global_scale.dtype}"
-    )
+    # assert input_global_scale.dtype == torch.float32, (
+    #     f"input_global_scale must be float32, got {input_global_scale.dtype}"
+    # )
     assert w1.dtype == torch.uint8, f"w1 must be uint8, got {w1.dtype}"
     assert w1_blockscale.dtype == torch.float8_e4m3fn, (
         f"w1_blockscale must be float8_e4m3fn, got {w1_blockscale.dtype}"

@@ -231,7 +240,32 @@ def flashinfer_cutedsl_moe_masked(

     # === Assertions on shapes ===
     n = w2.shape[-1] * 2 # intermediate dimension
-    num_experts, m, k = hidden_states.shape
+    if isinstance(hidden_states, tuple):
+        assert (
+            input_global_scale is None
+        ), "input_global_scale is needed when input needs quant"
+
+        aq = hidden_states[0].view(torch.uint8)
+        aq_sf = hidden_states[1].view(torch.float8_e4m3fn)
+        # m, k_by_2, num_experts = aq.shape
+        num_experts, m, k_by_2 = aq.shape
+        k = k_by_2 * 2
+        aq = aq.permute(1,2,0)
+    else:
+        num_experts, m, k = hidden_states.shape
+
+        assert (
+            input_global_scale.dtype == torch.float32
+        ), f"input_global_scale must be float32, got {input_global_scale.dtype}"
+        assert input_global_scale.shape == (
+            num_experts,
+        ), f"input_global_scale must be (l,), got {input_global_scale.shape}"
+
+        aq, aq_sf = scaled_fp4_grouped_quantize(
+            hidden_states,
+            masked_m,
+            input_global_scale,
+        )

     assert w1.shape[-2] == 2 * n, f"w1 last-2 dim must be 2*n, got {w1.shape}"
     assert w1.shape[-1] * 2 == k, (

@@ -242,9 +276,9 @@ def flashinfer_cutedsl_moe_masked(
         n // 2,
     ), f"w2 shape mismatch, got {w2.shape[-2:]}, expected {(k, n // 2)}"

-    assert input_global_scale.shape == (num_experts,), (
-        f"input_global_scale must be (l,), got {input_global_scale.shape}"
-    )
+    # assert input_global_scale.shape == (num_experts,), (
+    #     f"input_global_scale must be (l,), got {input_global_scale.shape}"
+    # )
     assert w1_alpha.shape == (num_experts,), (
         f"w1_alpha must be (l,), got {w1_alpha.shape}"
     )

@@ -254,23 +288,31 @@ def flashinfer_cutedsl_moe_masked(
     assert w2_alpha.shape == (num_experts,), (
         f"w2_alpha must be (l,), got {w2_alpha.shape}"
     )
+    # return

-    aq, aq_sf = scaled_fp4_grouped_quantize(
-        hidden_states,
-        masked_m,
-        input_global_scale,
-    )
+    # aq, aq_sf = scaled_fp4_grouped_quantize(
+    #     hidden_states,
+    #     masked_m,
+    #     input_global_scale,
+    # )

+    # workspace = workspace.permute(1, 2, 0) # requirement of kernel
+    # workspace = torch.empty(
+    #     (num_experts, m, n * 2), dtype=torch.bfloat16, device=aq.device
+    # )
     workspace = workspace.permute(1, 2, 0) # requirement of kernel
     sf_vec_size = 16
     assert aq_sf.dtype == torch.float8_e4m3fn
     assert aq.dtype == torch.uint8
     ab_dtype = "float4_e2m1fn"
     sf_dtype = "float8_e4m3fn"

-    c_dtype = get_cute_dtype(hidden_states)
+    # c_dtype = get_cute_dtype(hidden_states)
+    c_dtype = "bfloat16"

     # Gemm1
+    # print(aq.shape, aq.dtype)
+    # print(aq_sf.shape, aq_sf.dtype)
     flashinfer_cutedsl_grouped_gemm_nt_masked(
         (aq, aq_sf),
         (w1.permute(1, 2, 0), w1_blockscale),

@@ -290,7 +332,7 @@ def flashinfer_cutedsl_moe_masked(
         masked_m,
         a2_global_scale,
     )
-
+    # return
     # Gemm2
     out = out.permute(1, 2, 0) # requirement of kernel
     flashinfer_cutedsl_grouped_gemm_nt_masked(
vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py

Lines changed: 1 addition & 0 deletions

@@ -179,6 +179,7 @@ def prepare(
             quant_config.block_shape,
             is_fp4_scale_swizzled=not self.use_dp,
         )
+
         if self.use_dp:
             topk_weights, topk_ids, a1q, a1q_scale = get_dp_group().all_gatherv(
                 [topk_weights, topk_ids, a1q, a1q_scale],
