intel · yucai-intel · Mar 26, 2025 · Mar 26, 2025 · Mar 27, 2025 · Apr 18, 2025
diff --git a/cmake/ONEMKL.cmake b/cmake/ONEMKL.cmake
@@ -21,3 +21,5 @@ set(TORCH_XPU_OPS_ONEMKL_LIBRARIES ${ONEMKL_LIBRARIES})
 
 list(INSERT TORCH_XPU_OPS_ONEMKL_LIBRARIES 1 "-Wl,--start-group")
 list(APPEND TORCH_XPU_OPS_ONEMKL_LIBRARIES "-Wl,--end-group")
+list(INSERT TORCH_XPU_OPS_ONEMKL_LIBRARIES 0 "-Wl,--no-as-needed")  # placed in front of the first list entry
+list(INSERT TORCH_XPU_OPS_ONEMKL_LIBRARIES 2 "-Wl,--as-needed")  # index 2: right after that entry (now at index 1), ahead of --start-group
diff --git a/src/ATen/CMakeLists.txt b/src/ATen/CMakeLists.txt
@@ -7,7 +7,9 @@ file(GLOB xpu_native_cpp "native/xpu/*.cpp" "native/sparse/*.cpp" "native/sparse
 file(GLOB xpu_sycl "native/xpu/sycl/*.cpp" "native/sparse/xpu/sycl/*.cpp" "native/nested/xpu/sycl/*.cpp" "native/transformers/sycl/*.cpp" "native/quantized/sycl/*.cpp")
 
 list(APPEND ATen_XPU_CPP_SRCS ${xpu_cpp})
-list(APPEND ATen_XPU_MKL_SRCS ${xpu_mkl})
+if(USE_ONEMKL)  # xpu_mkl sources are only added to the build when USE_ONEMKL is true
+  list(APPEND ATen_XPU_MKL_SRCS ${xpu_mkl})
+endif()
 list(APPEND ATen_XPU_NATIVE_CPP_SRCS ${xpu_native_cpp})
 list(APPEND ATen_XPU_SYCL_SRCS ${xpu_sycl})
 

diff --git a/src/ATen/native/xpu/BatchLinearAlgebra.cpp b/src/ATen/native/xpu/BatchLinearAlgebra.cpp
@@ -0,0 +1,61 @@
+#include <ATen/core/Tensor.h>
+#include <ATen/native/BatchLinearAlgebra.h>
+#include <ATen/native/DispatchStub.h>
+#include <ATen/ops/empty_like.h>
+#if defined(USE_ONEMKL)
+#include <ATen/native/xpu/mkl/BatchLinearAlgebra.h>
+#endif // USE_ONEMKL
+
+namespace at::native {
+// XPU SVD: svd_mkl when USE_ONEMKL is defined, else CPU stub + copy back.
+void svd_kernel_xpu(
+    const Tensor& A,
+    const bool full_matrices,
+    const bool compute_uv,
+    const c10::optional<c10::string_view>& driver,
+    const Tensor& U,
+    const Tensor& S,
+    const Tensor& Vh,
+    const Tensor& info) {
+#if defined(USE_ONEMKL)
+  native::xpu::svd_mkl(A, full_matrices, compute_uv, driver, U, S, Vh, info);
+#else
+  const auto A_cpu = A.to(
+      A.options().device(kCPU).memory_format(at::MemoryFormat::Contiguous));
+  // U, S, Vh, info already have the right sizes and strides but are not on
+  // the CPU, so CPU-side buffers shaped like them are created for the stub.
+  const auto empty_like_cpu = [](const Tensor& t) {
+    return at::empty_like(t, t.options().device(kCPU));
+  };
+
+  auto U_cpu = compute_uv ? empty_like_cpu(U) : Tensor{};
+  auto S_cpu = empty_like_cpu(S);
+  auto Vh_cpu = compute_uv ? empty_like_cpu(Vh) : Tensor{};
+  auto info_cpu = empty_like_cpu(info);
+
+  svd_stub(
+      at::kCPU,
+      A_cpu,
+      full_matrices,
+      compute_uv,
+      driver,
+      U_cpu,
+      S_cpu,
+      Vh_cpu,
+      info_cpu);
+
+  // Copy the results from the CPU buffers back into the XPU outputs.
+  // NOTE: these copy_() calls pass no non_blocking argument; the CPU buffers
+  // are function-local temporaries, so confirm lifetimes before changing that.
+  if (compute_uv) {
+    U.copy_(U_cpu);
+    Vh.copy_(Vh_cpu);
+  }
+  S.copy_(S_cpu);
+  info.copy_(info_cpu);
+#endif // USE_ONEMKL
+}
+
+REGISTER_XPU_DISPATCH(svd_stub, &svd_kernel_xpu);
+
+} // namespace at::native
diff --git a/src/ATen/native/xpu/XPUFallback.template b/src/ATen/native/xpu/XPUFallback.template
@@ -211,7 +211,6 @@ TORCH_LIBRARY_IMPL(aten, XPU, m) {
     "_linalg_slogdet.sign",
     "_linalg_solve_ex.result",
     "linalg_solve_triangular",
-    "_linalg_svd.U",
     "lu_unpack.out",
     "ormqr",
     "_scaled_mm",