Commit f3df4f1

Author: huzhiyi.hzy (committed)

add arg return_recv_hook for get_dispatch_layout, so the kernel will run on the compute stream in hook mode
1 parent 4ce931d commit f3df4f1

File tree (3 files changed, +20, -13 lines):

csrc/deep_ep.cpp
csrc/deep_ep.hpp
deep_ep/buffer.py


csrc/deep_ep.cpp

Lines changed: 17 additions & 10 deletions

@@ -241,24 +241,31 @@ void Buffer::sync(const std::vector<int> &device_ids,
 
 std::tuple<torch::Tensor, std::optional<torch::Tensor>, torch::Tensor, torch::Tensor, std::optional<EventHandle>>
 Buffer::get_dispatch_layout(const torch::Tensor& topk_idx, int num_experts,
-                            std::optional<EventHandle>& previous_event, bool async, bool allocate_on_comm_stream) {
+                            std::optional<EventHandle>& previous_event, bool async, bool allocate_on_comm_stream, bool return_recv_hook) {
+    if (return_recv_hook) {
+        EP_HOST_ASSERT(not async);
+    }
+
     EP_HOST_ASSERT(topk_idx.dim() == 2);
     EP_HOST_ASSERT(topk_idx.is_contiguous());
     EP_HOST_ASSERT(num_experts > 0);
 
     // Allocate all tensors on comm stream if set
     // NOTES: do not allocate tensors upfront!
     auto compute_stream = at::cuda::getCurrentCUDAStream();
+    auto launch_stream = return_recv_hook ? compute_stream : comm_stream;
     if (allocate_on_comm_stream) {
         EP_HOST_ASSERT(previous_event.has_value() and async);
         at::cuda::setCurrentCUDAStream(comm_stream);
     }
 
     // Wait previous tasks to be finished
-    if (previous_event.has_value()) {
-        stream_wait(comm_stream, previous_event.value());
-    } else {
-        stream_wait(comm_stream, compute_stream);
+    if(not return_recv_hook) {
+        if (previous_event.has_value()) {
+            stream_wait(launch_stream, previous_event.value());
+        } else {
+            stream_wait(launch_stream, compute_stream);
+        }
     }
 
     auto num_tokens = static_cast<int>(topk_idx.size(0)), num_topk = static_cast<int>(topk_idx.size(1));
@@ -275,14 +282,14 @@ Buffer::get_dispatch_layout(const torch::Tensor& topk_idx, int num_experts,
                                 num_tokens_per_expert.data_ptr<int>(),
                                 is_token_in_rank.data_ptr<bool>(),
                                 num_tokens, num_topk, num_ranks, num_experts,
-                                comm_stream);
+                                launch_stream);
 
     // Wait streams
     std::optional<EventHandle> event;
     if (async) {
-        event = EventHandle(comm_stream);
+        event = EventHandle(launch_stream);
         for (auto& t: {topk_idx, num_tokens_per_rank, num_tokens_per_expert, is_token_in_rank}) {
-            t.record_stream(comm_stream);
+            t.record_stream(launch_stream);
             if (allocate_on_comm_stream)
                 t.record_stream(compute_stream);
         }
@@ -291,8 +298,8 @@ Buffer::get_dispatch_layout(const torch::Tensor& topk_idx, int num_experts,
             if (allocate_on_comm_stream)
                 to.has_value() ? to->record_stream(compute_stream) : void();
         }
-    } else {
-        stream_wait(compute_stream, comm_stream);
+    } else if (not return_recv_hook) {
+        stream_wait(compute_stream, launch_stream);
     }
 
     // Switch back compute stream
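For context, a minimal caller-side sketch of the default (non-hook) path that the change above modifies: the layout kernel is launched on comm_stream, so an asynchronous caller waits on the returned event before using the results. This is not from the commit; the names buffer, topk_idx, and num_experts are illustrative, and it assumes the deep_ep package exports Buffer and that EventOverlap provides current_stream_wait(), as in DeepEP's existing usage examples.

import torch
from deep_ep import Buffer  # assumed export, backed by deep_ep/buffer.py

def layout_on_comm_stream(buffer: Buffer, topk_idx: torch.Tensor, num_experts: int):
    # Default path: the kernel runs on comm_stream (launch_stream == comm_stream),
    # so the caller overlaps it with compute and waits on the returned event.
    num_tokens_per_rank, _, num_tokens_per_expert, is_token_in_rank, event = \
        buffer.get_dispatch_layout(topk_idx, num_experts, async_finish=True)
    # ... independent compute work can be enqueued here to overlap with the comm stream ...
    event.current_stream_wait()  # assumed EventOverlap API: compute stream waits for the comm-stream kernel
    return num_tokens_per_rank, num_tokens_per_expert, is_token_in_rank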

csrc/deep_ep.hpp

Lines changed: 1 addition & 1 deletion

@@ -107,7 +107,7 @@ struct Buffer {
 
     std::tuple<torch::Tensor, std::optional<torch::Tensor>, torch::Tensor, torch::Tensor, std::optional<EventHandle>>
     get_dispatch_layout(const torch::Tensor& topk_idx, int num_experts, std::optional<EventHandle>& previous_event,
-                        bool async, bool allocate_on_comm_stream);
+                        bool async, bool allocate_on_comm_stream, bool return_recv_hook);
 
     std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<torch::Tensor>, std::optional<torch::Tensor>, std::vector<int>, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor, std::optional<EventHandle>>
     intranode_dispatch(const torch::Tensor& x, const std::optional<torch::Tensor>& x_scales,

deep_ep/buffer.py

Lines changed: 2 additions & 2 deletions

@@ -291,7 +291,7 @@ def get_combine_config(num_ranks: int) -> Config:
     # noinspection PyTypeChecker
     def get_dispatch_layout(self, topk_idx: torch.Tensor, num_experts: int,
                             previous_event: Optional[EventOverlap] = None, async_finish: bool = False,
-                            allocate_on_comm_stream: bool = False) -> \
+                            allocate_on_comm_stream: bool = False, return_recv_hook: bool = False) -> \
             Tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor, torch.Tensor, EventOverlap]:
         """
         Calculate the layout required for later communication.
@@ -314,7 +314,7 @@ def get_dispatch_layout(self, topk_idx: torch.Tensor, num_experts: int,
         """
         num_tokens_per_rank, num_tokens_per_rdma_rank, num_tokens_per_expert, is_token_in_rank, event = \
             self.runtime.get_dispatch_layout(topk_idx, num_experts, getattr(previous_event, 'event', None),
-                                             async_finish, allocate_on_comm_stream)
+                                             async_finish, allocate_on_comm_stream, return_recv_hook)
         return num_tokens_per_rank, num_tokens_per_rdma_rank, num_tokens_per_expert, is_token_in_rank, EventOverlap(event)
 
     # noinspection PyTypeChecker
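By contrast, a hedged sketch of the new hook mode added by this commit: with return_recv_hook=True the kernel is launched on the compute stream (launch_stream == compute_stream), both stream_wait calls are skipped, and the added EP_HOST_ASSERT(not async) requires async_finish to stay False. The names buffer, topk_idx, and num_experts are again illustrative, not part of the commit.

import torch
from deep_ep import Buffer  # assumed export, as above

def layout_on_compute_stream(buffer: Buffer, topk_idx: torch.Tensor, num_experts: int):
    # Hook mode: the kernel is launched on the current compute stream, so no
    # comm-stream event exists and no cross-stream synchronization is needed.
    num_tokens_per_rank, _, num_tokens_per_expert, is_token_in_rank, _ = \
        buffer.get_dispatch_layout(topk_idx, num_experts,
                                   async_finish=False,            # required: EP_HOST_ASSERT(not async)
                                   allocate_on_comm_stream=False,
                                   return_recv_hook=True)
    # Later kernels enqueued on the compute stream are ordered after the layout kernel.
    return num_tokens_per_rank, num_tokens_per_expert, is_token_in_rank

With the default return_recv_hook=False, the existing comm-stream behavior is unchanged.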
