
Commit 925c15a: env var works
1 parent 309e0b8

3 files changed: +27 -16 lines

vllm/envs.py
Lines changed: 6 additions & 0 deletions

@@ -1057,6 +1057,12 @@ def get_vllm_port() -> int | None:
     "VLLM_MXFP4_USE_MARLIN": lambda: maybe_convert_bool(
         os.environ.get("VLLM_MXFP4_USE_MARLIN", None)
     ),
+    # Whether to use the DeepEP low-latency (DeepEPLL) kernels for the
+    # NVFP4 quantization-and-dispatch method. Only supported on Blackwell
+    # GPUs and requires https://github.com/deepseek-ai/DeepEP/pull/341
+    "VLLM_DEEPEPLL_NVFP4_DISPATCH": lambda: bool(
+        int(os.getenv("VLLM_DEEPEPLL_NVFP4_DISPATCH", "0"))
+    ),
     # Whether to turn on the outlines cache for V0
     # This cache is unbounded and on disk, so it's not safe to use in
     # an environment with potentially malicious users.
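For reference, the new lambda treats the variable as a strict integer flag: unset or "0" disables the NVFP4 dispatch path, "1" enables it, and a non-integer value such as "true" raises ValueError rather than silently enabling it. A minimal standalone sketch of the same parsing (the helper name _env_flag is illustrative, not vLLM API):

import os

def _env_flag(name: str, default: str = "0") -> bool:
    # Same shape as the envs.py entry: bool(int(os.getenv(name, "0"))).
    # "0"/unset -> False, any nonzero integer string -> True,
    # non-integer strings raise ValueError instead of enabling the path.
    return bool(int(os.getenv(name, default)))

os.environ["VLLM_DEEPEPLL_NVFP4_DISPATCH"] = "1"
assert _env_flag("VLLM_DEEPEPLL_NVFP4_DISPATCH") is True
del os.environ["VLLM_DEEPEPLL_NVFP4_DISPATCH"]
assert _env_flag("VLLM_DEEPEPLL_NVFP4_DISPATCH") is False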

vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
Lines changed: 15 additions & 10 deletions

@@ -114,7 +114,7 @@ def _do_quant(
     assert isinstance(x, (torch.Tensor, tuple))
     q_dtype = quant_config.quant_dtype
 
-    if q_dtype == "nvfp4":
+    if q_dtype == "nvfp4" and envs.VLLM_DEEPEPLL_NVFP4_DISPATCH:
         assert isinstance(x, tuple)
         # num_experts, max_tokens, hidden_dim_by_4 = x[0].size()
         # print(f"nvfp4 quantization input shape: {x[0].size()}, {x[0].dtype}, {x[0].is_contiguous()}")
@@ -125,13 +125,18 @@ def _do_quant(
         # print(f"nvfp4 quantization input shape: {x.size()}, {x.dtype}, {x.is_contiguous()}")
         num_experts, max_tokens, hidden_dim_by_2 = x.shape
         hidden_dim = hidden_dim_by_2 * 2
-        assert(envs.VLLM_FLASHINFER_MOE_BACKEND == "cutedsl")
-        logger.info_once("skip nvfp4 quant since done by deepep!!")
-        # logger.info_once(
-        #     "Skip quantization when using FlashInfer CUTEDSL for "
-        #     "ModelOptNvFp4FusedMoE."
-        # )
+        assert envs.VLLM_FLASHINFER_MOE_BACKEND == "cutedsl"
+        logger.info_once(
+            "Quantization is fused with DeepEP nvfp4 dispatch for "
+            "FlashInfer CUTEDSL since VLLM_DEEPEPLL_NVFP4_DISPATCH=1"
+        )
     else:
+        if q_dtype == "nvfp4":
+            q_dtype = None
+            logger.info_once(
+                "Using DeepEP bfloat16 dispatch for FlashInfer CUTEDSL since "
+                "VLLM_DEEPEPLL_NVFP4_DISPATCH=0"
+            )
         assert isinstance(x, torch.Tensor)
         num_experts, max_tokens, hidden_dim = x.size()
 
@@ -146,7 +151,6 @@ def _do_quant(
         )
         x = x.view((num_experts, -1, hidden_dim))
 
-
     if q_dtype is not None:
         assert x_scales is not None
         if q_dtype != "nvfp4":
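The net effect of these hunks: with the flag on, tokens arrive already NVFP4-quantized by DeepEP and _do_quant only reshapes them; with the flag off, an nvfp4 q_dtype is downgraded to None so tokens stay bfloat16 through dispatch and are quantized later. A standalone sketch of that decision (resolve_dispatch_dtype is an illustrative name, not vLLM API):

def resolve_dispatch_dtype(q_dtype: str | None, nvfp4_dispatch: bool) -> str | None:
    # Flag on: DeepEP already quantized during dispatch, keep "nvfp4"
    # and skip local quantization.
    if q_dtype == "nvfp4" and nvfp4_dispatch:
        return "nvfp4"
    # Flag off: fall back to bfloat16 dispatch; quantization happens later.
    if q_dtype == "nvfp4":
        return None
    # Other dtypes are unaffected by the flag.
    return q_dtype

assert resolve_dispatch_dtype("nvfp4", True) == "nvfp4"
assert resolve_dispatch_dtype("nvfp4", False) is None
assert resolve_dispatch_dtype("fp8", False) == "fp8"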
@@ -182,11 +186,12 @@ def prepare_async(
 
     use_nvfp4 = False
     # print("mm"*100, quant_config.quant_dtype)
-    if quant_config.quant_dtype == "nvfp4":
+    nvfp4_dispatch = quant_config.quant_dtype == "nvfp4" and envs.VLLM_DEEPEPLL_NVFP4_DISPATCH
+    if nvfp4_dispatch:
         # print("gg"*100)
         # print(quant_config.a1_gscale)
         use_nvfp4 = True
-    qc_a1_gscale_or_scale = quant_config.a1_gscale if quant_config.quant_dtype == "nvfp4" else quant_config.a1_scale
+    qc_a1_gscale_or_scale = quant_config.a1_gscale if nvfp4_dispatch else quant_config.a1_scale
     has_per_token_scales = (
         qc_a1_gscale_or_scale.numel() != 1
         if qc_a1_gscale_or_scale is not None
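Scale selection in prepare_async now keys off the combined gate: the global scale a1_gscale is used only when NVFP4 dispatch is actually active, a1_scale otherwise, and per-token scales are inferred from the chosen tensor having more than one element. A sketch, assuming the condition truncated in the diff falls back to False when no scale is present:

import torch

def has_per_token_scales(scale: torch.Tensor | None) -> bool:
    # More than one element implies one scale per token rather than a
    # single tensor-wide scale; None is assumed to mean no per-token scales.
    return scale.numel() != 1 if scale is not None else False

assert has_per_token_scales(torch.ones(128)) is True
assert has_per_token_scales(torch.ones(1)) is False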

vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py
Lines changed: 6 additions & 6 deletions

@@ -5,6 +5,7 @@
 
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm.logger import init_logger
+from vllm import envs
 from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
 from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
     TopKWeightAndReduceDelegate,
@@ -18,8 +19,6 @@
 
 logger = init_logger(__name__)
 
-CUTEDSL_MOE_NVFP4_DISPATCH = True
-
 def is_valid_flashinfer_cutedsl_fused_moe(
     hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor
 ) -> bool:
@@ -110,7 +109,7 @@ def workspace_shapes(
     - Note: in order for activation chunking to work, the first dimension
       of each tuple must be the number of tokens.
     """
-    if CUTEDSL_MOE_NVFP4_DISPATCH:
+    if envs.VLLM_DEEPEPLL_NVFP4_DISPATCH:
         # since it sees quantized a1q
         K_dim = K * 2
     else:
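The K * 2 widening follows from NVFP4 packing: with the flag on, workspace_shapes sees the already-quantized a1q, whose last dimension holds two 4-bit values per byte, so the logical hidden size is twice the packed size (the same hidden_dim_by_2 * 2 arithmetic as in _do_quant). A tiny sketch of the relationship, with an illustrative hidden size:

def logical_hidden_dim(packed_k: int, nvfp4_dispatch: bool) -> int:
    # Packed fp4 tensors store two 4-bit values per byte, so the packed
    # width is half the logical width; bf16 tensors are already logical.
    return packed_k * 2 if nvfp4_dispatch else packed_k

assert logical_hidden_dim(3584, True) == 7168
assert logical_hidden_dim(7168, False) == 7168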
@@ -151,10 +150,11 @@ def apply(
         assert self.w1_scale.ndim == 3
         assert self.w2_scale.ndim == 3
 
-        # TODO(shuw): replace True by CUTEDSL_MOE_NVFP4_DISPATCH
+        input_global_scale = (None if envs.VLLM_DEEPEPLL_NVFP4_DISPATCH else self.a1_gscale)
+        flashinfer_hidden_states = (hidden_states, a1q_scale) if envs.VLLM_DEEPEPLL_NVFP4_DISPATCH else hidden_states
         flashinfer_cutedsl_moe_masked(
-            hidden_states=(hidden_states, a1q_scale) if CUTEDSL_MOE_NVFP4_DISPATCH else hidden_states,
-            input_global_scale=(None if CUTEDSL_MOE_NVFP4_DISPATCH else self.a1_gscale),
+            hidden_states=flashinfer_hidden_states,
+            input_global_scale=input_global_scale,
             w1=w1,
             w1_blockscale=self.w1_scale,
             w1_alpha=self.g1_alphas,
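In apply(), the flag now chooses what flashinfer_cutedsl_moe_masked receives: with dispatch-side quantization, the packed fp4 hidden states travel with their blockscales as a tuple and no input global scale is needed; without it, bf16 hidden states plus a1_gscale are passed so the kernel quantizes internally. A standalone sketch of that selection (select_kernel_inputs is illustrative, not vLLM API):

def select_kernel_inputs(hidden_states, a1q_scale, a1_gscale, nvfp4_dispatch: bool):
    if nvfp4_dispatch:
        # Already quantized by DeepEP: pass (packed fp4, blockscales);
        # the kernel needs no global input scale.
        return (hidden_states, a1q_scale), None
    # bf16 path: the kernel quantizes internally using a1_gscale.
    return hidden_states, a1_gscale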
