Rename use_ue8m0_for_nvfp4_sf to use_ue8m0_for_sf; add per_token_cast_back test helper

shifangx · shifangx · commit cb1757ad734b · 2025-09-02T01:01:37.000-07:00
diff --git a/csrc/deep_ep.cpp b/csrc/deep_ep.cpp
@@ -1094,7 +1094,7 @@ Buffer::low_latency_dispatch(const torch::Tensor& x, const torch::Tensor& topk_i
                              const std::optional<torch::Tensor>& x_sf_scale,
                              int num_max_dispatch_tokens_per_rank, int num_experts,
                              bool use_fp8, bool round_scale, bool use_ue8m0,
-                             bool use_nvfp4, bool use_ue8m0_for_nvfp4_sf,
+                             bool use_nvfp4, bool use_ue8m0_for_sf,
                              bool async, bool return_recv_hook) {
 #ifndef DISABLE_NVSHMEM
     EP_HOST_ASSERT(low_latency_mode);
@@ -1200,7 +1200,7 @@ Buffer::low_latency_dispatch(const torch::Tensor& x, const torch::Tensor& topk_i
                                num_tokens, hidden, num_max_dispatch_tokens_per_rank,
                                num_topk, num_experts, rank, num_ranks,
                                use_fp8, round_scale, use_ue8m0,
-                               use_nvfp4, use_ue8m0_for_nvfp4_sf,
+                               use_nvfp4, use_ue8m0_for_sf,
                                workspace, num_device_sms,
                                launch_stream, phases);
     };
diff --git a/csrc/deep_ep.hpp b/csrc/deep_ep.hpp
@@ -150,7 +150,7 @@ struct Buffer {
                          const std::optional<torch::Tensor>& x_sf_scale,
                          int num_max_dispatch_tokens_per_rank, int num_experts,
                          bool use_fp8, bool round_scale, bool use_ue8m0,
-                         bool use_nvfp4, bool use_ue8m0_for_nvfp4_sf,
+                         bool use_nvfp4, bool use_ue8m0_for_sf,
                          bool async, bool return_recv_hook);
 
     std::tuple<torch::Tensor, std::optional<EventHandle>, std::optional<std::function<void()>>>
diff --git a/csrc/kernels/api.cuh b/csrc/kernels/api.cuh
@@ -151,7 +151,7 @@ void dispatch(void* packed_recv_x, void* packed_recv_x_scales,
               int num_tokens, int hidden, int num_max_dispatch_tokens_per_rank,
               int num_topk, int num_experts, int rank, int num_ranks,
               bool use_fp8, bool round_scale, bool use_ue8m0,
-              bool use_nvfp4, bool use_ue8m0_for_nvfp4_sf,
+              bool use_nvfp4, bool use_ue8m0_for_sf,
               void* workspace, int num_device_sms,
               cudaStream_t stream, int phases);
 
diff --git a/csrc/kernels/internode_ll.cu b/csrc/kernels/internode_ll.cu
@@ -545,7 +545,7 @@ void dispatch(void* packed_recv_x, void* packed_recv_x_scales,
               int num_tokens, int hidden, int num_max_dispatch_tokens_per_rank,
               int num_topk, int num_experts, int rank, int num_ranks,
               bool use_fp8, bool round_scale, bool use_ue8m0,
-              bool use_nvfp4, bool use_ue8m0_for_nvfp4_sf,
+              bool use_nvfp4, bool use_ue8m0_for_sf,
               void* workspace, int num_device_sms,
               cudaStream_t stream, int phases) {
     constexpr int kNumMaxTopK = 9;
@@ -573,9 +573,9 @@ if (use_fp8 and not use_ue8m0) \
     dispatch_func = dispatch<true, false, false, false, hidden>; \
 if (use_fp8 and use_ue8m0) \
     dispatch_func = dispatch<true, true, false, false, hidden>; \
-if (use_nvfp4 and not use_ue8m0_for_nvfp4_sf) \
+if (use_nvfp4 and not use_ue8m0_for_sf) \
     dispatch_func = dispatch<false, false, true, false, hidden>; \
-if (use_nvfp4 and use_ue8m0_for_nvfp4_sf) \
+if (use_nvfp4 and use_ue8m0_for_sf) \
     dispatch_func = dispatch<false, false, true, true, hidden>; \
 LAUNCH_KERNEL(&cfg, dispatch_func, \
               packed_recv_x, packed_recv_x_scales, \
diff --git a/deep_ep/buffer.py b/deep_ep/buffer.py
@@ -530,7 +530,7 @@ def low_latency_dispatch(self, x: torch.Tensor, topk_idx: torch.Tensor,
                              dispatch_wait_recv_cost_stats: Optional[torch.Tensor] = None,
                              x_sf_scale: Optional[torch.Tensor] = None,
                              use_fp8: bool = True, round_scale: bool = False, use_ue8m0: bool = False,
-                             use_nvfp4: bool = False, use_ue8m0_for_nvfp4_sf: bool = False,
+                             use_nvfp4: bool = False, use_ue8m0_for_sf: bool = False,
                              async_finish: bool = False, return_recv_hook: bool = False) -> \
             Tuple[Tuple[torch.Tensor, torch.Tensor], torch.Tensor, Tuple, EventOverlap, Callable]:
         """
@@ -558,7 +558,7 @@ def low_latency_dispatch(self, x: torch.Tensor, topk_idx: torch.Tensor,
             round_scale: whether round the scaling factors into power of 2.
             use_ue8m0: whether use UE8M0 as scaling factor format (available only with `round_scale=True`).
             use_nvfp4: whether to enable NVFP4 casting, with this, the received data will be a tuple of NVFP4 tensor and scaling factors.
-            use_ue8m0_for_nvfp4_sf: whether use UE8M0 as NVFP4 scaling factor format (available only with `use_nvfp4=True`).
+            use_ue8m0_for_sf: whether to use UE8M0 as the NVFP4 scaling factor format (available only with `use_nvfp4=True`).
             async_finish: the current stream will not wait for the communication kernels to be finished if set.
             return_recv_hook: return a receiving hook if set. If set, the kernel will just do the RDMA request issues,
                 but **without actually receiving the data**. You must call the received hook to make sure the data's arrival.
@@ -590,7 +590,7 @@ def low_latency_dispatch(self, x: torch.Tensor, topk_idx: torch.Tensor,
                                               x_sf_scale,
                                               num_max_dispatch_tokens_per_rank, num_experts,
                                               use_fp8, round_scale, use_ue8m0,
-                                              use_nvfp4, use_ue8m0_for_nvfp4_sf,
+                                              use_nvfp4, use_ue8m0_for_sf,
                                               async_finish, return_recv_hook)
         handle = (packed_recv_src_info, packed_recv_layout_range, num_max_dispatch_tokens_per_rank, x.size(1), num_experts)
         tensors_to_record = (x, topk_idx,
diff --git a/tests/test_low_latency.py b/tests/test_low_latency.py
@@ -9,7 +9,7 @@
 from typing import Optional
 
 import deep_ep
-from utils import init_dist, bench, bench_kineto, calc_diff, hash_tensor, cast_fp8_to_fp32, cast_nvfp4_to_fp32
+from utils import init_dist, bench, bench_kineto, calc_diff, hash_tensor, per_token_cast_back
 
 MAX_E4M3 = 448
 MAX_NVFP4 = 6.0
@@ -54,7 +54,7 @@ def test_main(num_tokens: int, hidden: int, num_experts: int, num_topk: int,
             for dispatch_data_type in ('bf16', 'fp8', 'nvfp4'):
                 dispatch_use_fp8 = dispatch_data_type == 'fp8'
                 dispatch_use_nvfp4 = dispatch_data_type == 'nvfp4'
-                use_ue8m0_for_nvfp4_sf = False
+                use_ue8m0_for_sf = False
                 for round_scale in (False, True) if dispatch_use_fp8 else (False, ):
                     for use_ue8m0 in (False, True) if round_scale else (False, ):
                         num_times += 1
@@ -66,20 +66,20 @@ def test_main(num_tokens: int, hidden: int, num_experts: int, num_topk: int,
                             packed_recv_x, packed_recv_count, handle, event, hook = \
                                 buffer.low_latency_dispatch(current_x, topk_idx, num_tokens, num_experts,
                                                             use_fp8=dispatch_use_fp8, round_scale=round_scale, use_ue8m0=use_ue8m0,
-                                                            use_nvfp4=dispatch_use_nvfp4, use_ue8m0_for_nvfp4_sf=use_ue8m0_for_nvfp4_sf,
+                                                            use_nvfp4=dispatch_use_nvfp4, use_ue8m0_for_sf=use_ue8m0_for_sf,
                                                             cumulative_local_expert_recv_stats=cumulative_local_expert_recv_stats,
                                                             x_sf_scale=x_sf_scale,
                                                             async_finish=not return_recv_hook, return_recv_hook=return_recv_hook)
                             hook() if return_recv_hook else event.current_stream_wait()
                         if dispatch_use_fp8:
                             packed_recv_x = (packed_recv_x[0], packed_recv_x[1].contiguous())
-                            simulated_gemm_x = cast_fp8_to_fp32(packed_recv_x[0].view(-1, hidden), packed_recv_x[1].view(-1, hidden // 128)).view(packed_recv_x[0].shape)
+                            simulated_gemm_x = per_token_cast_back(packed_recv_x[0].view(-1, hidden), packed_recv_x[1].view(-1, hidden // 128)).view(packed_recv_x[0].shape)
                         elif dispatch_use_nvfp4:
                             recv_x_scale_view = packed_recv_x[1]
                             recv_x_scale_view = recv_x_scale_view.permute(5, 2, 0, 1, 4, 3)
                             recv_x_scale_view = recv_x_scale_view.contiguous().view(num_local_experts, int(num_ranks * num_tokens), hidden // 16)
                             packed_recv_x = (packed_recv_x[0], recv_x_scale_view)
-                            simulated_gemm_x = cast_nvfp4_to_fp32(packed_recv_x[0], packed_recv_x[1], x_sf_scale, use_ue8m0_for_nvfp4_sf=use_ue8m0_for_nvfp4_sf)
+                            simulated_gemm_x = per_token_cast_back(packed_recv_x[0], packed_recv_x[1], x_sf_scale, use_ue8m0_for_sf=use_ue8m0_for_sf, src_data_format='nvfp4')
                         else:
                             packed_recv_x = packed_recv_x
                             simulated_gemm_x = packed_recv_x.clone()
diff --git a/tests/utils.py b/tests/utils.py
@@ -90,10 +90,10 @@ def int32_to_8floats_lookup(tensor: torch.Tensor, table: torch.Tensor) -> torch.
     return out
 
 
-def cast_nvfp4_to_fp32(x_nvfp4: torch.Tensor, x_scales: torch.Tensor, x_sf_scale: float, use_ue8m0_for_nvfp4_sf: bool = False):
+def cast_nvfp4_to_fp32(x_nvfp4: torch.Tensor, x_scales: torch.Tensor, x_sf_scale: float, use_ue8m0_for_sf: bool = False):
     assert(x_sf_scale.dim() == 0, f"expect x_sf_scale.dim() == 0, but got {x_sf_scale.dim()}")
     NVFP4_TABLE = torch.tensor([0, 0.5, 1, 1.5, 2, 3, 4, 6, 0, -0.5, -1.0, -1.5, -2, -3, -4, -6], dtype=torch.float32, device='cuda')
-    if use_ue8m0_for_nvfp4_sf:
+    if use_ue8m0_for_sf:
         x_scales = x_scales.view(dtype=torch.int8).to(torch.int) << 23
         x_scales = x_scales.view(dtype=torch.float)
     else:
@@ -111,6 +111,15 @@ def cast_nvfp4_to_fp32(x_nvfp4: torch.Tensor, x_scales: torch.Tensor, x_sf_scale
     return x_fp32
 
 
+def per_token_cast_back(x: torch.Tensor, x_scales: torch.Tensor, x_sf_scale: torch.Tensor = None, use_ue8m0_for_sf: bool = False, src_data_format: str = 'fp8'):
+    """Dispatch on `src_data_format` to `cast_fp8_to_fp32` or `cast_nvfp4_to_fp32`; `x_sf_scale`/`use_ue8m0_for_sf` are forwarded only for 'nvfp4'."""
+    if src_data_format == 'fp8':
+        return cast_fp8_to_fp32(x, x_scales)
+    if src_data_format == 'nvfp4':
+        return cast_nvfp4_to_fp32(x, x_scales, x_sf_scale, use_ue8m0_for_sf)
+    raise ValueError(f"Unsupported src_data_format: {src_data_format}")
+
+
 def inplace_unique(x: torch.Tensor, num_slots: int):
     assert x.dim() == 2
     mask = x < 0