Skip to content

Commit

Permalink
Fix delay load for WebGPU EP and DML EP (microsoft#23111)
Browse files Browse the repository at this point in the history
### Description

This change fixes the DLL delay load problem for the WebGPU EP and
DirectML EP. See detailed explanation below.

### Problem

When onnxruntime.dll uses delay loading for its dependencies, the
dependencies are loaded using `LoadLibraryEx()`, which searches the
directory of the process (.exe) instead of the directory of this library
(onnxruntime.dll). This is a problem for the Node.js and Python bindings,
because Windows will try to find the dependencies in the directory of
node.exe or python.exe, which is not the directory of onnxruntime.dll.

There was a previous attempt to fix this by loading DirectML.dll during the
initialization of the onnxruntime Node.js binding. That works for the DML EP,
but it is not a good solution because it does not really "delay" the load.

For WebGPU, the situation is worse because webgpu_dawn.dll depends
on dxil.dll and dxcompiler.dll, which are explicitly loaded at runtime
in the code using `LoadLibraryA()`. This suffers from the same DLL
search problem.

### Solutions

For onnxruntime.dll loading its direct dependencies, the problem can be
resolved by setting the [`__pfnDliNotifyHook2`
hook](https://learn.microsoft.com/en-us/cpp/build/reference/understanding-the-helper-function?view=msvc-170#structure-and-constant-definitions)
to load from an absolute path that is constructed from the onnxruntime.dll
folder and the DLL name.

For webgpu_dawn.dll loading dxil.dll and dxcompiler.dll, the hook does not
work because the two DLLs are explicitly loaded in the code. Instead, the
problem can be resolved by ~~using WIN32 API `SetDllDirectory()` to add the
onnxruntime.dll folder to the search path.~~ preloading the two DLLs from
the onnxruntime.dll folder.
  • Loading branch information
fs-eire authored Dec 19, 2024
1 parent 7807350 commit 8680244
Show file tree
Hide file tree
Showing 16 changed files with 324 additions and 66 deletions.
1 change: 1 addition & 0 deletions cmake/onnxruntime.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ if(WIN32)
# Windows variant of the onnxruntime shared library. Besides the symbol file it is built from
# the sources under core/dll: the DLL entry point (dllmain.cc), the delay-load hook
# (delay_load_hook.cc) and the resource script (onnxruntime.rc).
onnxruntime_add_shared_library(onnxruntime
  ${SYMBOL_FILE}
  "${ONNXRUNTIME_ROOT}/core/dll/dllmain.cc"
  "${ONNXRUNTIME_ROOT}/core/dll/delay_load_hook.cc"
  "${ONNXRUNTIME_ROOT}/core/dll/onnxruntime.rc"
)
elseif(onnxruntime_BUILD_APPLE_FRAMEWORK)
Expand Down
20 changes: 17 additions & 3 deletions cmake/onnxruntime_nodejs.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -60,15 +60,26 @@ else()
endif()
endif()

# DLLs the Node.js binding needs at runtime. Entries may be generator expressions; the list is
# handed to the binding build through the --dll_deps argument.
unset(NODEJS_DLL_DEPS)

# Execution providers: each enabled provider contributes its build switch and, where it
# applies, the DLLs it depends on.
if (onnxruntime_USE_CUDA)
  set(NODEJS_BINDING_USE_CUDA "--use_cuda")
endif()

if (onnxruntime_USE_DML)
  set(NODEJS_BINDING_USE_DML "--use_dml")
  list(APPEND NODEJS_DLL_DEPS "$<TARGET_FILE_DIR:onnxruntime>/DirectML.dll")
endif()

if (onnxruntime_USE_WEBGPU)
  set(NODEJS_BINDING_USE_WEBGPU "--use_webgpu")

  # The D3D12 backend of Dawn needs the DirectX shader compiler DLLs.
  if (WIN32 AND onnxruntime_ENABLE_DAWN_BACKEND_D3D12)
    list(APPEND NODEJS_DLL_DEPS
      "$<TARGET_FILE_DIR:dxcompiler>/dxil.dll"
      "$<TARGET_FILE_DIR:dxcompiler>/dxcompiler.dll")
  endif()

  # A monolithic Dawn build produces a separate webgpu_dawn library file.
  if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY)
    list(APPEND NODEJS_DLL_DEPS "$<TARGET_FILE:dawn::webgpu_dawn>")
  endif()
endif()
if (onnxruntime_USE_TENSORRT)
set(NODEJS_BINDING_USE_TENSORRT "--use_tensorrt")
Expand All @@ -94,9 +105,12 @@ add_custom_target(js_common_npm_ci ALL

# Builds the OnnxRuntime Node.js binding with cmake-js, from the JS_NODE_ROOT directory:
#   1. `npm ci` installs the JavaScript dependencies;
#   2. `npm run build` builds the binding once, receiving the build directory, configuration,
#      generator, target architecture, the enabled-provider switches and the list of
#      dependency DLLs.
# The superseded build command that lacked --dll_deps is dropped, so the binding is no longer
# built twice. Arguments that may contain spaces or semicolons are quoted so that each one
# reaches npm as a single argument; in particular NODEJS_DLL_DEPS is a ';'-separated list.
add_custom_target(nodejs_binding_wrapper ALL
  COMMAND ${NPM_CLI} ci
  COMMAND ${NPM_CLI} run build -- "--onnxruntime-build-dir=${CMAKE_CURRENT_BINARY_DIR}"
          --config=${CMAKE_BUILD_TYPE}
          "--onnxruntime-generator=${CMAKE_GENERATOR}"
          "--dll_deps=${NODEJS_DLL_DEPS}"
          --arch=${NODEJS_BINDING_ARCH} ${NODEJS_BINDING_USE_CUDA} ${NODEJS_BINDING_USE_DML} ${NODEJS_BINDING_USE_WEBGPU}
          ${NODEJS_BINDING_USE_TENSORRT} ${NODEJS_BINDING_USE_COREML} ${NODEJS_BINDING_USE_QNN}
  WORKING_DIRECTORY ${JS_NODE_ROOT}
  COMMENT "Using cmake-js to build OnnxRuntime Node.js binding")

Expand Down
36 changes: 27 additions & 9 deletions cmake/onnxruntime_providers_webgpu.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -23,24 +23,42 @@
onnxruntime_add_include_to_target(onnxruntime_providers_webgpu
  onnxruntime_common dawn::dawncpp_headers dawn::dawn_headers onnx onnx_proto flatbuffers::flatbuffers Boost::mp11 safeint_interface)

# Runtime DLLs that have to be placed next to the WebGPU provider binary. The list is filled
# in below and consumed by the POST_BUILD copy step further down in this file.
set(onnxruntime_providers_webgpu_dll_deps)

if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY)
  # Monolithic Dawn: everything comes from the single webgpu_dawn library.
  target_link_libraries(onnxruntime_providers_webgpu dawn::webgpu_dawn)

  if (WIN32)
    # Delay loading is a Windows linker feature, so the flag is added exactly once and only here.
    if (onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS)
      list(APPEND onnxruntime_DELAYLOAD_FLAGS "/DELAYLOAD:webgpu_dawn.dll")
    endif()

    # webgpu_dawn.dll is copied together with the other dependency DLLs instead of through a
    # dedicated copy command, so it is copied only once.
    list(APPEND onnxruntime_providers_webgpu_dll_deps "$<TARGET_FILE:dawn::webgpu_dawn>")
  endif()
else()
  # Non-monolithic Dawn: link dawn_native unless an external Dawn is used, and always dawn_proc.
  if (NOT onnxruntime_USE_EXTERNAL_DAWN)
    target_link_libraries(onnxruntime_providers_webgpu dawn::dawn_native)
  endif()
  target_link_libraries(onnxruntime_providers_webgpu dawn::dawn_proc)
endif()

if (WIN32 AND onnxruntime_ENABLE_DAWN_BACKEND_D3D12)
  # dxil.dll and dxcompiler.dll have to exist in $<TARGET_FILE_DIR:dxcompiler> before the
  # provider is built, so make the provider depend on the targets that put them there.
  add_dependencies(onnxruntime_providers_webgpu copy_dxil_dll dxcompiler)

  # Register both DLLs as runtime dependencies of the provider.
  list(APPEND onnxruntime_providers_webgpu_dll_deps
    "$<TARGET_FILE_DIR:dxcompiler>/dxil.dll"
    "$<TARGET_FILE_DIR:dxcompiler>/dxcompiler.dll")
endif()

# After the provider is built, copy every collected dependency DLL into the directory of the
# provider binary. The list is passed as one quoted argument; COMMAND_EXPAND_LISTS splits it
# into one source path per element when the command line is generated.
if (onnxruntime_providers_webgpu_dll_deps)
  add_custom_command(
    TARGET onnxruntime_providers_webgpu
    POST_BUILD
    COMMAND
      ${CMAKE_COMMAND} -E copy_if_different
      "${onnxruntime_providers_webgpu_dll_deps}"
      "$<TARGET_FILE_DIR:onnxruntime_providers_webgpu>"
    COMMAND_EXPAND_LISTS
    VERBATIM)
endif()

# Group the provider under the "ONNXRuntime" folder in IDE project views.
set_property(TARGET onnxruntime_providers_webgpu PROPERTY FOLDER "ONNXRuntime")
12 changes: 12 additions & 0 deletions cmake/onnxruntime_unittests.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -525,6 +525,9 @@ set (onnxruntime_global_thread_pools_test_SRC
# Entry point of the WebGPU external-Dawn test executable.
set(onnxruntime_webgpu_external_dawn_test_SRC
  ${TEST_SRC_DIR}/webgpu/external_dawn/main.cc)

# Entry point of the WebGPU delay-load test executable.
set(onnxruntime_webgpu_delay_load_test_SRC
  ${TEST_SRC_DIR}/webgpu/delay_load/main.cc)

# tests from lowest level library up.
# the order of libraries should be maintained, with higher libraries being added first in the list

Expand Down Expand Up @@ -1864,4 +1867,13 @@ if (onnxruntime_USE_WEBGPU AND onnxruntime_USE_EXTERNAL_DAWN)
onnxruntime_add_include_to_target(onnxruntime_webgpu_external_dawn_test dawn::dawncpp_headers dawn::dawn_headers)
endif()

# Delay-load test for the WebGPU EP. It is only registered for Windows shared-library builds
# with WebGPU enabled, and never for Emscripten or minimal builds.
if (onnxruntime_USE_WEBGPU
    AND WIN32
    AND onnxruntime_BUILD_SHARED_LIB
    AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten"
    AND NOT onnxruntime_MINIMAL_BUILD)
  AddTest(DYN
    TARGET onnxruntime_webgpu_delay_load_test
    SOURCES ${onnxruntime_webgpu_delay_load_test_SRC}
    LIBS ${SYS_PATH_LIB}
    DEPENDS ${all_dependencies}
  )
endif()

include(onnxruntime_fuzz_test.cmake)
10 changes: 6 additions & 4 deletions js/node/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -113,10 +113,12 @@ endif()
if (WIN32)
file(COPY ${ONNXRUNTIME_WIN_BIN_DIR}/onnxruntime.dll
DESTINATION ${dist_folder})
# Copy every dependency DLL listed in ORT_NODEJS_DLL_DEPS into the distribution folder.
# The list is the single source of dependency DLLs, so the former DirectML-only copy is not
# kept: DirectML.dll is expected to arrive through this list. file(COPY) accepts any number
# of inputs, so one call covers the whole list.
if (ORT_NODEJS_DLL_DEPS)
  file(COPY ${ORT_NODEJS_DLL_DEPS} DESTINATION ${dist_folder})
endif()

elseif (APPLE)
file(COPY ${ONNXRUNTIME_BUILD_DIR}/libonnxruntime.dylib
DESTINATION ${dist_folder} FOLLOW_SYMLINK_CHAIN)
Expand Down
5 changes: 5 additions & 0 deletions js/node/script/build.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ const USE_TENSORRT = !!buildArgs.use_tensorrt;
const USE_COREML = !!buildArgs.use_coreml;
// --use_qnn
const USE_QNN = !!buildArgs.use_qnn;
// --dll_deps=<value>
// Dependency DLL list supplied by the caller. Unlike the boolean switches above it is kept as
// passed (no `!!` coercion) and, when truthy, forwarded to CMake as ORT_NODEJS_DLL_DEPS.
const DLL_DEPS = buildArgs.dll_deps;

// build path
const ROOT_FOLDER = path.join(__dirname, '..');
Expand Down Expand Up @@ -82,6 +84,9 @@ if (USE_COREML) {
// Turn on USE_QNN in the generated CMake project when --use_qnn was given.
if (USE_QNN) {
  args.push('--CDUSE_QNN=ON');
}
// Hand the dependency DLL list over to CMake unchanged; nothing is added when it is absent.
if (DLL_DEPS) {
  args.push('--CDORT_NODEJS_DLL_DEPS=' + DLL_DEPS);
}

// set CMAKE_OSX_ARCHITECTURES for macOS build
if (os.platform() === 'darwin') {
Expand Down
37 changes: 0 additions & 37 deletions js/node/src/directml_load_helper.cc

This file was deleted.

6 changes: 0 additions & 6 deletions js/node/src/directml_load_helper.h

This file was deleted.

4 changes: 0 additions & 4 deletions js/node/src/inference_session_wrap.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
#include "onnxruntime_cxx_api.h"

#include "common.h"
#include "directml_load_helper.h"
#include "inference_session_wrap.h"
#include "run_options_helper.h"
#include "session_options_helper.h"
Expand All @@ -19,9 +18,6 @@ Napi::FunctionReference& InferenceSessionWrap::GetTensorConstructor() {
}

Napi::Object InferenceSessionWrap::Init(Napi::Env env, Napi::Object exports) {
#if defined(USE_DML) && defined(_WIN32)
LoadDirectMLDll(env);
#endif
// create ONNX runtime env
Ort::InitApi();
ORT_NAPI_THROW_ERROR_IF(
Expand Down
83 changes: 83 additions & 0 deletions onnxruntime/core/dll/delay_load_hook.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

// == workaround for delay loading of dependencies of onnxruntime.dll ==
//
// Problem:
//
// When onnxruntime.dll uses delay loading for its dependencies, the dependencies are loaded using LoadLibraryEx,
// which searches the directory of the process (.exe) instead of the directory of this library (onnxruntime.dll).
// This is a problem for the Node.js and Python bindings, because Windows will try to find the dependencies in the
// directory of node.exe or python.exe, which is not the directory of onnxruntime.dll.
//
// Solution:
//
// By using the delay load hook `__pfnDliNotifyHook2`, we can intervene in the loading procedure and load from an
// absolute path. The absolute path is constructed by appending the name of the DLL to load to the directory of
// onnxruntime.dll. This way, we can ensure that the dependencies are loaded from the same directory as onnxruntime.dll.
//
// See also:
// - https://learn.microsoft.com/en-us/cpp/build/reference/understanding-the-helper-function?view=msvc-170#structure-and-constant-definitions
// - https://learn.microsoft.com/en-us/windows/win32/dlls/dynamic-link-library-search-order#alternate-search-order-for-unpackaged-apps
//
// The DLL DelayLoad hook is only enabled when the compiler is MSVC and at least one of the following is True:
// - both USE_WEBGPU and BUILD_DAWN_MONOLITHIC_LIBRARY are defined
// - USE_DML is defined
//
// Feature flags that say which DLLs are delay loaded. Each one is always defined, as 1 or 0.
//
// The conditions are evaluated here instead of defining the macros as `defined(...)` expressions:
// a `defined` operator produced by macro expansion inside #if has undefined behavior (MSVC reports
// it as warning C5105 with the conforming preprocessor), and plain 0/1 values can also be used
// outside preprocessor conditionals.
#if defined(USE_WEBGPU) && defined(BUILD_DAWN_MONOLITHIC_LIBRARY)
#define ORT_DELAY_LOAD_WEBGPU_DAWN_DLL 1
#else
#define ORT_DELAY_LOAD_WEBGPU_DAWN_DLL 0
#endif
#if defined(USE_DML)
#define ORT_DELAY_LOAD_DIRECTML_DLL 1
#else
#define ORT_DELAY_LOAD_DIRECTML_DLL 0
#endif
#if defined(_MSC_VER) && (ORT_DELAY_LOAD_WEBGPU_DAWN_DLL || ORT_DELAY_LOAD_DIRECTML_DLL)

#include <Windows.h>
#include <delayimp.h>
#include <stdlib.h>
#include <string>

#include "core/platform/env.h"

namespace {

// Builds one `known_dlls` entry for `name`: the narrow and the wide spelling of "<name>.dll".
//
// The wide literal is formed by concatenating adjacent string literals (L"" "<name>" L".dll"),
// which yields a wide string. Putting the L prefix directly in front of the stringizing operator
// (L#name) only works with MSVC's traditional preprocessor.
#define DEFINE_KNOWN_DLL(name) {#name ".dll", L"" #name L".dll"}

// DLLs whose delay loading is redirected to the directory of onnxruntime.dll.
constexpr struct {
  const char* str;      // narrow name, compared with the DLL name reported by the delay-load helper
  const wchar_t* wstr;  // wide name, appended to the runtime directory to form the path to load
} known_dlls[] = {
#if ORT_DELAY_LOAD_WEBGPU_DAWN_DLL
    DEFINE_KNOWN_DLL(webgpu_dawn),
#endif
#if ORT_DELAY_LOAD_DIRECTML_DLL
    DEFINE_KNOWN_DLL(DirectML),
#endif
};
}  // namespace

// Delay-load notification hook.
//
// For a dliNotePreLoadLibrary notification that names one of `known_dlls` (compared without
// regard to case), the DLL is loaded from the directory returned by
// Env::Default().GetRuntimePath() and the module handle is returned. In every other case,
// including an empty runtime path, NULL is returned so that the default loading takes place.
FARPROC WINAPI delay_load_hook(unsigned dliNotify, PDelayLoadInfo pdli) {
  // Only the notification sent right before a library is loaded is of interest.
  if (dliNotify != dliNotePreLoadLibrary) {
    return NULL;
  }

  for (const auto& known_dll : known_dlls) {
    if (_stricmp(pdli->szDll, known_dll.str) != 0) {
      continue;
    }

    // The requested DLL is one of ours: resolve it next to onnxruntime.dll.
    auto dll_path = Env::Default().GetRuntimePath();
    if (dll_path.empty()) {
      // The directory of onnxruntime.dll is unknown. Return NULL and let the system search for
      // the DLL in the default search order.
      return NULL;
    }

    // Directory of onnxruntime.dll + DLL file name = absolute path of the DLL to load.
    dll_path.append(known_dll.wstr);

    const HMODULE module = LoadLibraryExW(dll_path.c_str(), NULL,
                                          LOAD_LIBRARY_SEARCH_DEFAULT_DIRS | LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR);
    return reinterpret_cast<FARPROC>(module);
  }

  return NULL;
}

// Install the hook: the delay-load helper picks it up through this well-known symbol.
extern "C" const PfnDliHook __pfnDliNotifyHook2 = delay_load_hook;

#endif
2 changes: 1 addition & 1 deletion onnxruntime/core/dll/dllmain.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
#pragma GCC diagnostic pop
#endif

// dllmain.cpp : Defines the entry point for the DLL application.
// dllmain.cc : Defines the entry point for the DLL application.
BOOL APIENTRY DllMain(HMODULE /*hModule*/,
DWORD ul_reason_for_call,
LPVOID /*lpReserved*/
Expand Down
26 changes: 26 additions & 0 deletions onnxruntime/core/providers/webgpu/webgpu_context.cc
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
#endif

#include "core/common/common.h"
#include "core/common/path_string.h"
#include "core/platform/env.h"

#include "core/providers/webgpu/compute_context.h"
#include "core/providers/webgpu/webgpu_context.h"
Expand Down Expand Up @@ -50,6 +52,30 @@ void WebGpuContext::Initialize(const WebGpuExecutionProviderInfo& webgpu_ep_info

// Initialization.Step.2 - Create wgpu::Adapter
if (adapter_ == nullptr) {
#if !defined(__EMSCRIPTEN__) && defined(_MSC_VER) && defined(DAWN_ENABLE_D3D12) && !defined(USE_EXTERNAL_DAWN)
// When the D3D12 backend is used on Windows and the build does not use an external Dawn, dxil.dll and
// dxcompiler.dll are required.
//
// Dawn will try to load them later, but if they are in a different directory from the executable, it may fail to
// find them. To avoid this issue, we try to load them up front from the same directory as the current module
// (usually onnxruntime.dll).
//
// Preloading is best effort: when the runtime path is empty or a load fails, nothing is reported here.
auto runtime_path = Env::Default().GetRuntimePath();
if (!runtime_path.empty()) {
Status status;
void* module_handle = nullptr;

// dxil.dll is loaded first, then dxcompiler.dll.
PathString dxil_path = runtime_path + ToPathString(L"dxil.dll");
status = Env::Default().LoadDynamicLibrary(dxil_path, false, &module_handle);
if (status.IsOK() && module_handle != nullptr) {
// Record the handle in modules_ (NOTE(review): presumably so the library is released with this context - confirm).
modules_.Add(dxil_path, module_handle);
}

PathString dxcompiler_path = runtime_path + ToPathString(L"dxcompiler.dll");
status = Env::Default().LoadDynamicLibrary(dxcompiler_path, false, &module_handle);
if (status.IsOK() && module_handle != nullptr) {
modules_.Add(dxcompiler_path, module_handle);
}
}
#endif

wgpu::RequestAdapterOptions req_adapter_options = {};
wgpu::DawnTogglesDescriptor adapter_toggles_desc = {};
req_adapter_options.nextInChain = &adapter_toggles_desc;
Expand Down
3 changes: 3 additions & 0 deletions onnxruntime/core/providers/webgpu/webgpu_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <webgpu/webgpu_cpp.h>

#include "core/common/common.h"
#include "core/framework/library_handles.h"
#include "core/providers/webgpu/webgpu_execution_provider.h"
#include "core/providers/webgpu/buffer_manager.h"
#include "core/providers/webgpu/program_manager.h"
Expand Down Expand Up @@ -153,6 +154,8 @@ class WebGpuContext final {

std::once_flag init_flag_;

LibraryHandles modules_;

wgpu::Instance instance_;
wgpu::Adapter adapter_;
wgpu::Device device_;
Expand Down
Loading

0 comments on commit 8680244

Please sign in to comment.