fix DeepSeek-3.2 failures when ACL Graph is enabled.

DongheJin · DongheJin · commit b80113a354be · 2026-04-03T18:11:18.000+08:00
diff --git a/third_party/xllm_atb_layers b/third_party/xllm_atb_layers
@@ -1 +1 @@
-Subproject commit 918c03d2abc4c9996196a797aefe743863b7e0ae
+Subproject commit d6aa214ce69acac8a3061ee8f0ef48b94dd3f5f6
diff --git a/xllm/core/framework/model_context.cpp b/xllm/core/framework/model_context.cpp
@@ -17,7 +17,9 @@ limitations under the License.
 
 #include <torch/torch.h>
 
+#include "common/global_flags.h"
 #include "platform/device.h"
+#include "util/env_var.h"
 #if defined(USE_NPU)
 #ifdef TORCH_HIGHER_THAN_PTA6
 // #include <torch_npu/csrc/core/npu/NPUFormat.h>
@@ -30,6 +32,21 @@ limitations under the License.
 #endif
 
 namespace xllm {
+
+namespace {
+
+bool should_enable_async_tiling_copy_stream() {
+  // Graph mode (FLAGS_enable_graph) always disables this. ATB copy-stream
+  // teardown is not reversible for the same context on the current CANN/PTA
+  // stack, so the stream must not be pre-created; else read the env opt-in.
+  if (FLAGS_enable_graph) {
+    return false;
+  }
+  return util::get_bool_env("ATB_USE_TILING_COPY_STREAM", false);
+}
+
+}  // namespace
+
 ModelContext::ModelContext(const ParallelArgs& input_parallel_args,
                            const ModelArgs& model_args,
                            const QuantArgs& quant_args,
@@ -44,7 +61,9 @@ ModelContext::ModelContext(const ParallelArgs& input_parallel_args,
   atb::CreateContext(&context_);
   void* stream = c10_npu::getCurrentNPUStream(device_id).stream();
   context_->SetExecuteStream(stream);
-  context_->SetAsyncTilingCopyStatus(true);
+  if (should_enable_async_tiling_copy_stream()) {
+    context_->SetAsyncTilingCopyStatus(true);
+  }
   atb_workspace_ = std::make_shared<AtbWorkspace>(tensor_options.device());
 #endif
   derive_optimization_config();
diff --git a/xllm/core/framework/parallel_state/mapping_npu.cpp b/xllm/core/framework/parallel_state/mapping_npu.cpp
@@ -42,8 +42,8 @@ MappingNPU::MappingNPU(const std::string rank_table_file,
   num_nodes_ = get_num_nodes();
   world_size_ = world_size;
   local_world_size_ = world_size / num_nodes_;
-  attn_o_proj_tp_.backend("lccl");
-  attn_inner_sp_.backend("lccl");
+  attn_o_proj_tp_.backend("hccl");
+  attn_inner_sp_.backend("hccl");
   parse_parallel_info();
   validate();
   get_tp_group(word_embed_tp_);