Draft pull request: changes from all commits (236 commits).
Commit history: 236 commits by fzyzcjy, Jun 20 to Jun 26, 2025. Most are incremental "more" checkpoints; the distinctively named commits are:

- 681bdc5  cherry pick (Jun 20)
- 9a8d98a  Merge branch 'feat/test_detailed_time' into feat/dev_20250621 (Jun 21)
- 7421672  Merge branch 'feat/num_processes' into feat/dev_20250621 (Jun 21)
- 1610620  hook+async (Jun 21)
- 7d658ca  temp revert mv-finishing-flag and reorder-loops (Jun 21)
- 34d3dcb  Revert "temp revert mv-finishing-flag and reorder-loops" (Jun 21)
- fbb2470  revert only reorder-loops (Jun 21)
- 18e6a66  Revert "revert only reorder-loops" (Jun 21)
- 0829e23  prefetch layout (Jun 21)
- d6d25bf  minor (Jun 21)
- 81c554f  Revert "minor" (Jun 21)
- c367a11  Revert "prefetch layout" (Jun 21)
- fd0d6af  again "revert only reorder-loops" (Jun 21)
- a1ba116  Revert "again "revert only reorder-loops"" (Jun 21)
- ef164be  Revert "more" (Jun 22)
- 02fc99b  rm support of hook+async (Jun 22)
- 1d8aa74  dispatch 12 group (Jun 25)
- bff1a00  extract (Jun 26)
13 changes: 11 additions & 2 deletions csrc/deep_ep.cpp
@@ -1159,7 +1159,8 @@ Buffer::low_latency_combine(const torch::Tensor& x, const torch::Tensor& topk_id
 const torch::Tensor& src_info, const torch::Tensor& layout_range,
 int num_max_dispatch_tokens_per_rank, int num_experts,
 bool zero_copy, bool async, bool return_recv_hook,
-const std::optional<torch::Tensor>& out) {
+const std::optional<torch::Tensor>& out,
+const std::optional<torch::Tensor>& src_signals, uint32_t src_signal_expect_value) {
 #ifndef DISABLE_NVSHMEM
 EP_HOST_ASSERT(low_latency_mode);

@@ -1220,7 +1221,8 @@ Buffer::low_latency_combine(const torch::Tensor& x, const torch::Tensor& topk_id
 num_combined_tokens, hidden, num_max_dispatch_tokens_per_rank,
 num_topk, num_experts, rank, num_ranks,
 workspace, num_device_sms,
-launch_stream, phases, zero_copy);
+launch_stream, phases, zero_copy,
+src_signals.has_value() ? src_signals->data_ptr<uint32_t>() : nullptr, src_signal_expect_value);
 };
 launcher(return_recv_hook ? LOW_LATENCY_SEND_PHASE : (LOW_LATENCY_SEND_PHASE | LOW_LATENCY_RECV_PHASE));

@@ -1247,6 +1249,12 @@ Buffer::low_latency_combine(const torch::Tensor& x, const torch::Tensor& topk_id
 #endif
 }

+void Buffer::notify_src_signals(const torch::Tensor& src_signals, int index) {
+const uint32_t* addr = src_signals.data_ptr<uint32_t>() + index;
+// TODO comm stream or current stream or whatever stream?
+CU_CHECK(cuStreamWriteValue32(at::cuda::getCurrentCUDAStream(), (CUdeviceptr) addr, 1, 0));
+}
+
 torch::Tensor
 Buffer::get_next_low_latency_combine_buffer(int num_max_dispatch_tokens_per_rank, int hidden, int num_experts) const {
 #ifndef DISABLE_NVSHMEM

@@ -1312,6 +1320,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
 .def("clean_low_latency_buffer", &deep_ep::Buffer::clean_low_latency_buffer)
 .def("low_latency_dispatch", &deep_ep::Buffer::low_latency_dispatch)
 .def("low_latency_combine", &deep_ep::Buffer::low_latency_combine)
+.def("notify_src_signals", &deep_ep::Buffer::notify_src_signals)
 .def("get_next_low_latency_combine_buffer", &deep_ep::Buffer::get_next_low_latency_combine_buffer);

 m.def("is_sm90_compiled", deep_ep::is_sm90_compiled);
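The new `notify_src_signals` binding enqueues a 32-bit stream memory write (`cuStreamWriteValue32`) of the value 1 into `src_signals[index]` on the current stream. Unlike an elementwise fill kernel, a stream memory operation is executed by the GPU front-end and needs no free SM, which presumably is the point: it cannot be starved by a persistent combine kernel already occupying the device. For intuition only, a rough PyTorch-level equivalent (a hypothetical sketch, not part of this PR):

```python
import torch

def notify_src_signals_sketch(src_signals: torch.Tensor, index: int) -> None:
    # Rough functional equivalent of Buffer::notify_src_signals: enqueue a
    # write of 1 into src_signals[index] on the current CUDA stream. The real
    # binding issues cuStreamWriteValue32(stream, &src_signals[index], 1, 0)
    # instead of launching a small kernel like this assignment does.
    src_signals[index] = 1
```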
5 changes: 4 additions & 1 deletion csrc/deep_ep.hpp
@@ -144,7 +144,10 @@ struct Buffer {
 const torch::Tensor& src_info, const torch::Tensor& layout_range,
 int num_max_dispatch_tokens_per_rank, int num_experts,
 bool zero_copy, bool async, bool return_recv_hook,
-const std::optional<torch::Tensor>& out = std::nullopt);
+const std::optional<torch::Tensor>& out = std::nullopt,
+const std::optional<torch::Tensor>& src_signals = std::nullopt, uint32_t src_signal_expect_value = 0);
+
+void notify_src_signals(const torch::Tensor& src_signals, int index);

 torch::Tensor
 get_next_low_latency_combine_buffer(int num_max_dispatch_tokens_per_rank, int hidden, int num_experts) const;
3 changes: 2 additions & 1 deletion csrc/kernels/api.cuh
@@ -158,7 +158,8 @@ void combine(void* combined_x,
 int num_combined_tokens, int hidden, int num_max_dispatch_tokens_per_rank,
 int num_topk, int num_experts, int rank, int num_ranks,
 void* workspace, int num_device_sms,
-cudaStream_t stream, int phases, bool zero_copy);
+cudaStream_t stream, int phases, bool zero_copy,
+uint32_t* src_signals, uint32_t src_signal_expect_value);

 } // namespace internode_ll
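Note that the kernel API takes a raw `uint32_t*` here: the host wrapper above passes `nullptr` when the optional `src_signals` tensor is absent, so existing callers presumably fall through to the old, ungated combine path.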
12 changes: 12 additions & 0 deletions csrc/kernels/exception.cuh
@@ -31,6 +31,18 @@ do { \
 } while (0)
 #endif

+#ifndef CU_CHECK
+#define CU_CHECK(cmd) \
+do { \
+CUresult e = (cmd); \
+if (e != CUDA_SUCCESS) { \
+const char *error_str = NULL; \
+cuGetErrorString(e, &error_str); \
+throw EPException("CU", __FILE__, __LINE__, std::string(error_str)); \
+} \
+} while (0)
+#endif
+
 #ifndef EP_HOST_ASSERT
 #define EP_HOST_ASSERT(cond) \
 do { \
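`CU_CHECK` follows the same pattern as the existing check macros in this header, but for driver-API calls returning `CUresult`: failures are translated through `cuGetErrorString` into an `EPException`. It guards the new `cuStreamWriteValue32` call in `deep_ep.cpp` above.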
678 changes: 413 additions & 265 deletions csrc/kernels/internode_ll.cu

Large diffs are not rendered by default.
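Judging from the host-side plumbing, this unrendered kernel diff presumably carries the device-side half of the feature: the combine send phase spin-waiting on each `src_signals` slot until it reaches `src_signal_expect_value` before that expert's tokens are sent.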

5 changes: 3 additions & 2 deletions deep_ep/buffer.py
@@ -539,7 +539,8 @@ def low_latency_dispatch(self, x: torch.Tensor, topk_idx: torch.Tensor,
 # noinspection PyTypeChecker
 def low_latency_combine(self, x: torch.Tensor, topk_idx: torch.Tensor, topk_weights: torch.Tensor,
 handle: tuple, zero_copy: bool = False, async_finish: bool = False,
-return_recv_hook: bool = False, out: Optional[torch.Tensor] = None) -> \
+return_recv_hook: bool = False, out: Optional[torch.Tensor] = None,
+src_signals: Optional[torch.Tensor] = None, src_signal_expect_value: int = 0) -> \
 Tuple[torch.Tensor, EventOverlap, Callable]:
 """
 A low-latency implementation for combining tokens (reduce **with weights**) with IBGDA.

@@ -573,7 +574,7 @@ def low_latency_combine(self, x: torch.Tensor, topk_idx: torch.Tensor, topk_weig
 src_info, layout_range, num_max_dispatch_tokens_per_rank, hidden, num_experts = handle
 combined_x, event, hook = self.runtime.low_latency_combine(x, topk_idx, topk_weights, src_info, layout_range,
 num_max_dispatch_tokens_per_rank, num_experts,
-zero_copy, async_finish, return_recv_hook, out)
+zero_copy, async_finish, return_recv_hook, out, src_signals, src_signal_expect_value)
 tensors_to_record = (x, topk_idx, topk_weights, src_info, layout_range, combined_x)
 return combined_x, EventOverlap(event, tensors_to_record if async_finish else None), hook
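Putting the pieces together, the intended flow appears to be a producer/consumer handshake between per-expert computation and the combine send phase. Below is a minimal usage sketch, not taken from this PR's tests: `num_local_experts` and `run_expert` are illustrative placeholders, it assumes a PyTorch build with `torch.uint32` support, and it assumes the send phase waits until each signal slot equals `src_signal_expect_value`:

```python
import torch
import deep_ep

# ... buffer, handle, x, topk_idx, topk_weights prepared as in the existing
# low-latency tests ...

num_local_experts = 8  # illustrative placeholder
src_signals = torch.zeros(num_local_experts, dtype=torch.uint32, device='cuda')

# Producer: raise expert i's signal as soon as its output is ready; this
# enqueues a 32-bit write of 1 into src_signals[i] on the current stream.
for i in range(num_local_experts):
    run_expert(i)  # hypothetical per-expert computation
    buffer.notify_src_signals(src_signals, i)

# Consumer: combine with per-expert gating; the expected value 1 matches
# what notify_src_signals writes.
combined_x, event, hook = buffer.low_latency_combine(
    x, topk_idx, topk_weights, handle,
    src_signals=src_signals, src_signal_expect_value=1)
```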
6 changes: 5 additions & 1 deletion setup.py
@@ -19,7 +19,11 @@
 include_dirs = ['csrc/']
 library_dirs = []
 nvcc_dlink = []
-extra_link_args = []
+# NOTE MODIFIED
+extra_link_args = ['-lcuda']
+
+# NOTE MODIFIED
+nvcc_flags += ['-lineinfo']

 # NVSHMEM flags
 if disable_nvshmem:
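The `-lcuda` flag links the CUDA driver library, presumably required because `cuStreamWriteValue32` is a driver-API entry point that the runtime library alone does not provide; `-lineinfo` embeds source-line information in the generated device code so profilers such as Nsight Compute can map SASS back to CUDA source.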
2 changes: 1 addition & 1 deletion tests/test_internode.py
@@ -248,5 +248,5 @@ def test_loop(local_rank: int, num_local_ranks: int):


 if __name__ == '__main__':
-num_processes = 8
+num_processes = int(os.getenv("DEEPEP_TEST_NUM_PROCESSES", "8"))
 torch.multiprocessing.spawn(test_loop, args=(num_processes, ), nprocs=num_processes)
22 changes: 15 additions & 7 deletions tests/test_intranode.py
@@ -1,3 +1,4 @@
+import os
 import time
 import torch
 import torch.distributed as dist

@@ -12,7 +13,12 @@

 def test_main(num_sms: int, local_rank: int, num_ranks: int, rank: int, buffer: deep_ep.Buffer, group: dist.ProcessGroup):
 # Settings
-num_tokens, hidden, num_topk, num_experts = 4096, 7168, 8, (256 // num_ranks) * num_ranks
+# num_tokens, hidden, num_topk, num_experts = 4096, 7168, 8, (256 // num_ranks) * num_ranks
+num_tokens = int(os.environ.get("DEEPEP_TEST_NUM_TOKENS", "4096"))
+hidden = int(os.environ.get("DEEPEP_TEST_HIDDEN", "7168"))
+num_topk = int(os.environ.get("DEEPEP_TEST_NUM_TOPK", "8"))
+num_experts = int(os.environ.get("DEEPEP_TEST_NUM_EXPERTS", str((256 // num_ranks) * num_ranks)))
+
 assert num_experts % num_ranks == 0
 if local_rank == 0:
 print(f'[config] num_tokens={num_tokens}, hidden={hidden}, num_topk={num_topk}', flush=True)

@@ -184,9 +190,9 @@ def check_data(check_x, rank_prefix_matrix):
 best_time, best_results = t, (num_sms, nvl_chunk_size)
 if local_rank == 0:
 print(f'[tuning] SMs {num_sms}, NVL chunk {nvl_chunk_size if nvl_chunk_size else "default"}: '
-f'{nvl_recv_bytes / 1e9 / t:.2f} GB/s (NVL) ', flush=True)
+f'{nvl_recv_bytes / 1e9 / t:.2f} GB/s (NVL) t={t * 1e3}ms', flush=True)
 if local_rank == 0:
-print(f'[tuning] Best dispatch ({"FP8" if isinstance(current_x, tuple) else "BF16"}): SMs {best_results[0]}, NVL chunk {best_results[1]}, {nvl_recv_bytes / 1e9 / best_time:.2f} GB/s (NVL)', flush=True)
+print(f'[tuning] Best dispatch ({"FP8" if isinstance(current_x, tuple) else "BF16"}): SMs {best_results[0]}, NVL chunk {best_results[1]}, {nvl_recv_bytes / 1e9 / best_time:.2f} GB/s (NVL) t={best_time * 1e3}ms', flush=True)
 print('', flush=True)

 # Gather the best config from rank 0 and the first test setting

@@ -215,12 +221,12 @@ def check_data(check_x, rank_prefix_matrix):
 t = bench(lambda: buffer.combine(**tune_args))[0]
 if local_rank == 0:
 print(f'[tuning] SMs {num_sms}, NVL chunk {nvl_chunk_size if nvl_chunk_size else "default"}: '
-f'{combine_bf16_nvl_send_bytes / 1e9 / t:.2f} GB/s (NVL) ', flush=True)
+f'{combine_bf16_nvl_send_bytes / 1e9 / t:.2f} GB/s (NVL) t={t * 1e3}ms', flush=True)
 if t < best_time and nvl_chunk_size > 0:
 best_time, best_results = t, (num_sms, nvl_chunk_size)

 if local_rank == 0:
-print(f'[tuning] Best combine: SMs {best_results[0]}, NVL chunk {best_results[1]}: {combine_bf16_nvl_send_bytes / 1e9 / best_time:.2f} GB/s (NVL)', flush=True)
+print(f'[tuning] Best combine: SMs {best_results[0]}, NVL chunk {best_results[1]}: {combine_bf16_nvl_send_bytes / 1e9 / best_time:.2f} GB/s (NVL) t={best_time * 1e3}ms', flush=True)
 print('', flush=True)


@@ -236,7 +242,9 @@ def test_loop(local_rank: int, num_local_ranks: int):
 num_qps_per_rank=(ll_num_experts // num_ranks if test_ll_compatibility else 1))
 torch.manual_seed(rank)

-for i in (24, ):
+num_sms = int(os.environ.get("DEEPEP_TEST_NUM_SMS", "24"))
+
+for i in (num_sms, ):
 test_main(i, local_rank, num_ranks, rank, buffer, group)
 if local_rank == 0:
 print('', flush=True)

@@ -252,5 +260,5 @@ def test_loop(local_rank: int, num_local_ranks: int):


 if __name__ == '__main__':
-num_processes = 8
+num_processes = int(os.getenv("DEEPEP_TEST_NUM_PROCESSES", "8"))
 torch.multiprocessing.spawn(test_loop, args=(num_processes, ), nprocs=num_processes)
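Both test scripts now take their settings from `DEEPEP_TEST_*` environment variables instead of hard-coded constants. A small sketch of driving the overrides programmatically (values are arbitrary; the shell form in the comment is equivalent):

```python
import os
import subprocess

# Equivalent shell invocation:
#   DEEPEP_TEST_NUM_TOKENS=2048 DEEPEP_TEST_HIDDEN=4096 \
#   DEEPEP_TEST_NUM_SMS=32 DEEPEP_TEST_NUM_PROCESSES=4 \
#   python tests/test_intranode.py
env = dict(os.environ,
           DEEPEP_TEST_NUM_TOKENS="2048",   # tokens per rank
           DEEPEP_TEST_HIDDEN="4096",       # hidden dimension
           DEEPEP_TEST_NUM_SMS="32",        # SM count passed to test_main
           DEEPEP_TEST_NUM_PROCESSES="4")   # number of spawned ranks
subprocess.run(["python", "tests/test_intranode.py"], env=env, check=True)
```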