From 7b9a0f752a17a9ae2d1fff1301027ebfc90bb1e5 Mon Sep 17 00:00:00 2001 From: Jhalak Patel Date: Tue, 13 Aug 2024 11:16:42 -0700 Subject: [PATCH] Add IGpuAllocator to MLIR-TensorRT --- mlir-tensorrt/README.md | 4 +- .../include/mlir-executor-c/Runtime/Runtime.h | 34 +++++++- .../include/mlir-executor/Runtime/API/API.h | 7 +- .../Runtime/Backend/Lua/LuaRegistration.h | 2 +- .../Runtime/Backend/Lua/LuaRuntime.h | 6 +- .../Lua/Modules/TensorRT/TensorRTModule.h | 2 +- .../mlir-executor/Support/Allocators.h | 73 +++++++++++++++++ .../executor/lib/CAPI/Runtime/Runtime.cpp | 56 ++++++++++++- .../executor/lib/Runtime/API/API.cpp | 7 +- .../lib/Runtime/Backend/Lua/LuaRuntime.cpp | 22 +++--- .../Lua/Modules/TensorRT/TensorRTModule.cpp | 79 +++++++++++++++++-- .../executor/lib/Support/Allocators.cpp | 55 +++++++++++++ .../executor/lib/Tools/ExecutorRunnerMain.cpp | 15 +++- .../python/bindings/Runtime/RuntimePyBind.cpp | 54 ++++++++++++- 14 files changed, 378 insertions(+), 38 deletions(-) diff --git a/mlir-tensorrt/README.md b/mlir-tensorrt/README.md index 3916be7b0..206d1e32e 100644 --- a/mlir-tensorrt/README.md +++ b/mlir-tensorrt/README.md @@ -23,7 +23,7 @@ We currently support only building on Linux x86 systems. We support building several different ways (only via CMake) depending on use-case. In each case, the LLVM-Project version that we are currently aligned to is -given in `build_tools/cmake/LLVMCommit.txt`. +given in `build_tools/cmake/LLVMCommit.cmake`. Note that currently we provide an LLVM patch which essentially cherry-picks the bug fixes from [this open MLIR PR](https://github.com/llvm/llvm-project/pull/91524). @@ -82,7 +82,7 @@ git clone https://github.com/llvm/llvm-project.git llvm-project # Checkout the right commit. Of course, you may try # a newer commit or your own modified LLVM-Project. cd llvm-project -git checkout $(cat build_tools/cmake/LLVMCommit.cmake | grep -Po '(?<=").*(?=")') +git checkout $(cat ../build_tools/cmake/LLVMCommit.cmake | grep -Po '(?<=").*(?=")') # Apply patch from llvm-project PR 91524 git apply ../build_tools/llvm-project.patch diff --git a/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h b/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h index 11ae93519..65377da35 100644 --- a/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h +++ b/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h @@ -319,6 +319,38 @@ mtrtScalarValueCastToRuntimeValue(MTRT_ScalarValue v); MLIR_CAPI_EXPORTED MTRT_Status mtrtScalarValueGetType(MTRT_ScalarValue scalar, MTRT_ScalarTypeCode *code); +//===----------------------------------------------------------------------===// +// MTRT_GpuAllocator +//===----------------------------------------------------------------------===// + +typedef struct MTRT_GpuAllocator { + void *ptr; +} MTRT_GpuAllocator; + +/// Checks nullity of `GpuAllocator`. +MTRT_CAPI_EXPORTED bool mtrtGpuAllocatorIsNull(MTRT_GpuAllocator gpuAllocator); + +/// Returns null `GpuAllocator`. 
+MTRT_CAPI_EXPORTED MTRT_GpuAllocator mtrtGpuAllocatorGetNull(); + +MTRT_CAPI_EXPORTED MTRT_Status +mtrtGpuAllocatorDestroy(MTRT_GpuAllocator executable); + +MTRT_CAPI_EXPORTED MTRT_Status +mtrtGpuAllocatorCreate(MTRT_GpuAllocator *allocator); + +//===----------------------------------------------------------------------===// +// MTRT_GpuAllocator +//===----------------------------------------------------------------------===// + +MTRT_CAPI_EXPORTED MTRT_Status mtrtGpuAllocatorAllocate( + MTRT_GpuAllocator gpuAllocator, uint64_t size, uint64_t alignment, + uint32_t flags, MTRT_Stream stream, void **memory); + +MTRT_CAPI_EXPORTED MTRT_Status +mtrtGpuAllocatorDeallocate(MTRT_GpuAllocator gpuAllocator, void *memory, + MTRT_Stream stream, bool *result); + //===----------------------------------------------------------------------===// // MTRT_RuntimeSessionOptions //===----------------------------------------------------------------------===// @@ -359,7 +391,7 @@ typedef struct MTRT_RuntimeSession { /// that the session only has a read-only view in to the Executable for code and /// constant data. Therefore the Executable must outlive the RuntimeSession. MLIR_CAPI_EXPORTED MTRT_Status mtrtRuntimeSessionCreate( - MTRT_RuntimeSessionOptions options, MTRT_Executable executable, + MTRT_RuntimeSessionOptions options, MTRT_Executable executable, MTRT_GpuAllocator allocator, MTRT_RuntimeSession *result); /// Destory the session. This does not destroy the associated Executable, which diff --git a/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h b/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h index d3672c149..b35d0b777 100644 --- a/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h +++ b/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h @@ -867,7 +867,8 @@ class RuntimeSession { sol::state state, std::unique_ptr pinnedMemoryAllocator, std::unique_ptr allocTracker, - std::unique_ptr resourceTracker); + std::unique_ptr resourceTracker, + GpuAllocator* gpuAllocator); ExecutableView getExecutable() const { return executable; } @@ -881,6 +882,8 @@ class RuntimeSession { ResourceTracker &getResourceTracker() { return *resourceTracker; } + GpuAllocator* getGpuAllocator() { return gpuAllocator; } + private: RuntimeSessionOptions options; ExecutableView executable; @@ -888,7 +891,7 @@ class RuntimeSession { std::unique_ptr pinnedMemoryAllocator; std::unique_ptr allocTracker; std::unique_ptr resourceTracker; - + GpuAllocator* gpuAllocator; sol::state state; }; diff --git a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRegistration.h b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRegistration.h index b5fed9c3d..922e964d4 100644 --- a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRegistration.h +++ b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRegistration.h @@ -37,6 +37,6 @@ void registerLuaRuntimeMethods(lua_State *state, const RuntimeSessionOptions &options, PinnedMemoryAllocator *pinnedMemoryAllocator, AllocTracker *allocTracker, - ResourceTracker *resourceTracker); + ResourceTracker *resourceTracker, GpuAllocator* allocator); } // namespace mlirtrt::runtime diff --git a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRuntime.h b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRuntime.h index f39eabd7b..0635e8cb4 100644 --- a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRuntime.h +++ 
b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRuntime.h
@@ -36,7 +36,7 @@ namespace mlirtrt::runtime {
 /// `main` function. It is assumed that `main` takes no arguments and returns an
 /// integer result (which is returned if the execution is successful).
 /// TODO: this should take a handle to a function for streaming output/errors.
-StatusOr<int64_t> runExecutorLuaScript(std::string_view luaScript);
+StatusOr<int64_t> runExecutorLuaScript(std::string_view luaScript, GpuAllocator* allocator);
 /// Synchronously run a serialized executor Executable one time. An `Executable`
 /// is essentially a Lua script packaged with metadata and serialized constants
 /// (e.g. weights data)
@@ -48,12 +48,12 @@ StatusOr<int64_t> runExecutorLuaScript(std::string_view luaScript);
 /// execution is successful).
 /// TODO: this should take a handle to a function for
 /// streaming output/errors.
-StatusOr<int64_t> runExecutorExecutable(std::unique_ptr<Executable> executable);
+StatusOr<int64_t> runExecutorExecutable(std::unique_ptr<Executable> executable, GpuAllocator* allocator);

 /// Create an execution state. This will setup a Lua environment and invoke
 /// global initialization.
 StatusOr<std::unique_ptr<RuntimeSession>>
-createRuntimeSessionWithLuaBackend(ExecutableView executable,
+createRuntimeSessionWithLuaBackend(ExecutableView executable, GpuAllocator* allocator,
                                    const RuntimeSessionOptions &options);

 /// Set the primary stream for the loaded executable to use.
diff --git a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.h b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.h
index 37d8de629..1ceb91367 100644
--- a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.h
+++ b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.h
@@ -37,7 +37,7 @@ class ResourceTracker;
 /// Lua state.
 void registerExecutorTensorRTModuleLuaRuntimeMethods(
     lua_State *luaState, PinnedMemoryAllocator *pinnedMemoryAllocator,
-    AllocTracker *allocTracker, ResourceTracker *resourceTracker);
+    AllocTracker *allocTracker, ResourceTracker *resourceTracker, GpuAllocator* allocator);

 } // namespace mlirtrt::runtime

diff --git a/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h b/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h
index 180dbf09e..66f54fb46 100644
--- a/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h
+++ b/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h
@@ -32,6 +32,79 @@ namespace mlirtrt {

 struct EventPool;

+// Abstract allocator to be implemented by consumers.
+using AllocatorFlags = uint32_t;
+
+class GpuAllocator {
+public:
+  GpuAllocator() = default;
+  virtual ~GpuAllocator() = default;
+
+  virtual StatusOr<void *> reallocate(void *baseAddr, uint64_t alignment,
+                                      uint64_t newSize,
+                                      std::optional<cudaStream_t> stream) = 0;
+
+  virtual StatusOr<void *> allocate(uint64_t const size,
+                                    uint64_t const alignment,
+                                    AllocatorFlags const flags,
+                                    std::optional<cudaStream_t> stream) = 0;
+
+  virtual StatusOr<bool> deallocate(void *const memory,
+                                    std::optional<cudaStream_t> stream) = 0;
+
+protected:
+  GpuAllocator(GpuAllocator const &) = delete;
+  GpuAllocator(GpuAllocator &&) = delete;
+  GpuAllocator &operator=(GpuAllocator const &) & = delete;
+  GpuAllocator &operator=(GpuAllocator &&) & = delete;
+};
+
+class StubAllocator : public GpuAllocator {
+public:
+  StubAllocator() = default;
+  ~StubAllocator() = default;
+
+  StatusOr<void *> reallocate(void *baseAddr, uint64_t alignment,
+                              uint64_t newSize,
+                              std::optional<cudaStream_t> stream) override {
+    return getStatusWithMsg(
+        StatusCode::InternalError,
+        "[StubAllocator][reallocate]: Must be overridden in Python");
+  }
+
+  StatusOr<void *> allocate(uint64_t const size, uint64_t const alignment,
+                            AllocatorFlags const flags,
+                            std::optional<cudaStream_t> stream) override {
+    return getStatusWithMsg(
+        StatusCode::InternalError,
+        "[StubAllocator][allocate]: Must be overridden in Python");
+  }
+
+  StatusOr<bool> deallocate(void *const memory,
+                            std::optional<cudaStream_t> stream) override {
+    return getStatusWithMsg(
+        StatusCode::InternalError,
+        "[StubAllocator][deallocate]: Must be overridden in Python");
+  }
+};
+
+class CustomTensorRTAllocator : public GpuAllocator {
+public:
+  CustomTensorRTAllocator() = default;
+  ~CustomTensorRTAllocator() = default;
+
+  StatusOr<void *> reallocate(void *baseAddr, uint64_t alignment,
+                              uint64_t newSize,
+                              std::optional<cudaStream_t> stream) override;
+
+  StatusOr<void *> allocate(uint64_t const size, uint64_t const alignment,
+                            AllocatorFlags const flags,
+                            std::optional<cudaStream_t> stream) override;
+
+  StatusOr<bool> deallocate(void *const memory,
+                            std::optional<cudaStream_t> stream) override;
+};
+
 //===----------------------------------------------------------------------===//
 // PoolTrackedCudaEvent
 //===----------------------------------------------------------------------===//
diff --git a/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp b/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp
index 2b0fbc35c..8d9b16e64 100644
--- a/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp
+++ b/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp
@@ -27,6 +27,7 @@
 #include "mlir-executor/Runtime/API/API.h"
 #include "mlir-executor/Runtime/API/ExecutableFlatbuffer.h"
 #include "mlir-executor/Runtime/Backend/Lua/LuaRuntime.h"
+#include "mlir-executor/Support/Allocators.h"
 #include "mlir-executor/Support/Status.h"
 #include "mlir/Support/FileUtilities.h"
 #include "llvm/ADT/SmallVectorExtras.h"
@@ -48,6 +49,8 @@ DEFINE_C_API_PTR_METHODS(MTRT_RuntimeSession,
                          ::mlirtrt::runtime::RuntimeSession)
 DEFINE_C_API_PTR_METHODS(MTRT_RuntimeSessionOptions,
                          ::mlirtrt::runtime::RuntimeSessionOptions)
+DEFINE_C_API_PTR_METHODS(MTRT_GpuAllocator,
+                         ::mlirtrt::GpuAllocator)
 DEFINE_C_API_PTR_METHODS(MTRT_Executable, ::mlirtrt::runtime::Executable)
 DEFINE_C_API_PTR_METHODS(MTRT_Stream, MTRT_StreamImpl)
 DEFINE_C_API_PTR_METHODS(MTRT_RuntimeValue, ::mlirtrt::runtime::RuntimeValue)
@@ -598,6 +601,55 @@ MTRT_ScalarValue mtrtRuntimeValueDynCastToScalar(MTRT_RuntimeValue v) {
   return wrap(static_cast<ScalarValue *>(x));
 }

+//===----------------------------------------------------------------------===//
+// MTRT_GpuAllocator
+//===----------------------------------------------------------------------===//
+
+bool mtrtGpuAllocatorIsNull(MTRT_GpuAllocator gpuAllocator) {
+  return !gpuAllocator.ptr;
+}
+
+MTRT_GpuAllocator mtrtGpuAllocatorGetNull() { return MTRT_GpuAllocator{nullptr}; }
+
+MTRT_Status mtrtGpuAllocatorDestroy(MTRT_GpuAllocator allocator) {
+  delete unwrap(allocator);
+  return mtrtStatusGetOk();
+}
+
+MTRT_Status mtrtGpuAllocatorCreate(MTRT_GpuAllocator *allocator) {
+  *allocator = MTRT_GpuAllocator{/*ptr=*/new StubAllocator()};
+  return mtrtStatusGetOk();
+}
+
+MTRT_Status mtrtGpuAllocatorAllocate(MTRT_GpuAllocator gpuAllocator,
+                                     uint64_t size, uint64_t alignment,
+                                     uint32_t flags, MTRT_Stream stream,
+                                     void **memory) {
+  GpuAllocator *cppGpuAllocator = unwrap(gpuAllocator);
+  StatusOr<void *> status = cppGpuAllocator->allocate(
+      size, alignment, flags,
+      !mtrtStreamIsNull(stream) ? std::optional(unwrap(stream)->getRawStream())
+                                : std::nullopt);
+  if (!status.isOk())
+    return wrap(status.getStatus());
+  *memory = *status;
+  return mtrtStatusGetOk();
+}
+
+MTRT_Status mtrtGpuAllocatorDeallocate(MTRT_GpuAllocator gpuAllocator,
+                                       void *memory, MTRT_Stream stream,
+                                       bool *result) {
+  GpuAllocator *cppGpuAllocator = unwrap(gpuAllocator);
+  StatusOr<bool> status = cppGpuAllocator->deallocate(
+      memory, !mtrtStreamIsNull(stream)
+                  ? std::optional(unwrap(stream)->getRawStream())
+                  : std::nullopt);
+  if (!status.isOk())
+    return wrap(status.getStatus());
+  *result = *status;
+  return mtrtStatusGetOk();
+}
+
 //===----------------------------------------------------------------------===//
 // MTRT_RuntimeSessionOptions
 //===----------------------------------------------------------------------===//
@@ -625,12 +677,14 @@ mtrtRuntimeSessionOptionsDestroy(MTRT_RuntimeSessionOptions options) {
 MTRT_Status mtrtRuntimeSessionCreate(MTRT_RuntimeSessionOptions options,
                                      MTRT_Executable executable,
+                                     MTRT_GpuAllocator gpuAllocator,
                                      MTRT_RuntimeSession *result) {
   RuntimeSessionOptions *cppOptions = unwrap(options);
   Executable *cppExecutable = unwrap(executable);
+  GpuAllocator *cppGpuAllocator = unwrap(gpuAllocator);

   StatusOr<std::unique_ptr<RuntimeSession>> session =
-      createRuntimeSessionWithLuaBackend(cppExecutable->getView(), *cppOptions);
+      createRuntimeSessionWithLuaBackend(cppExecutable->getView(), cppGpuAllocator, *cppOptions);
   if (session.isError())
     return wrap(session.getStatus());

diff --git a/mlir-tensorrt/executor/lib/Runtime/API/API.cpp b/mlir-tensorrt/executor/lib/Runtime/API/API.cpp
index e1243cf78..cb185f5f7 100644
--- a/mlir-tensorrt/executor/lib/Runtime/API/API.cpp
+++ b/mlir-tensorrt/executor/lib/Runtime/API/API.cpp
@@ -349,16 +349,17 @@ RuntimeSessionOptions::createUsingSingleHostMpi() {
 //===----------------------------------------------------------------------===//
 // RuntimeSession
 //===----------------------------------------------------------------------===//
-
 RuntimeSession::RuntimeSession(
     RuntimeSessionOptions options, ExecutableView exe, sol::state state,
     std::unique_ptr<PinnedMemoryAllocator> pinnedMemoryAllocator,
     std::unique_ptr<AllocTracker> allocTracker,
-    std::unique_ptr<ResourceTracker> resourceTracker)
+    std::unique_ptr<ResourceTracker> resourceTracker,
+    GpuAllocator *gpuAllocator)
     : options(std::move(options)), executable(exe),
       pinnedMemoryAllocator(std::move(pinnedMemoryAllocator)),
       allocTracker(std::move(allocTracker)),
-      resourceTracker(std::move(resourceTracker)), state(std::move(state)) {}
+      resourceTracker(std::move(resourceTracker)), gpuAllocator(gpuAllocator),
+      state(std::move(state)) {}

 //===----------------------------------------------------------------------===//
 // AllocTracker
diff --git
a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp index 7596c9da7..b19541977 100644 --- a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp +++ b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp @@ -72,7 +72,7 @@ static void registerDefaultDeviceDependentMethods(lua_State *state, static void registerLuaRuntimeMethodsCommon( lua_State *state, PinnedMemoryAllocator *pinnedMemoryAllocator, - AllocTracker *allocTracker, ResourceTracker *resourceTracker) { + AllocTracker *allocTracker, ResourceTracker *resourceTracker, GpuAllocator* allocator) { registerExecutorCoreModuleLuaRuntimeMethods(state, pinnedMemoryAllocator, allocTracker); registerExecutorCUDAModuleLuaRuntimeMethods( @@ -84,15 +84,15 @@ static void registerLuaRuntimeMethodsCommon( #endif registerExecutorTensorRTModuleLuaRuntimeMethods( - state, pinnedMemoryAllocator, allocTracker, resourceTracker); + state, pinnedMemoryAllocator, allocTracker, resourceTracker, allocator); } void mlirtrt::runtime::registerLuaRuntimeMethods( lua_State *state, const RuntimeSessionOptions &options, PinnedMemoryAllocator *pinnedMemoryAllocator, AllocTracker *allocTracker, - ResourceTracker *resourceTracker) { + ResourceTracker *resourceTracker, GpuAllocator* allocator) { registerLuaRuntimeMethodsCommon(state, pinnedMemoryAllocator, allocTracker, - resourceTracker); + resourceTracker, allocator); #ifdef MLIR_EXECUTOR_ENABLE_NCCL registerExecutorNCCLModuleLuaRuntimeMethods(state, resourceTracker); registerDeviceDependentNCCLMethods(state, options.getNumDevices(), @@ -108,7 +108,7 @@ void mlirtrt::runtime::registerLuaRuntimeMethods( } StatusOr -mlirtrt::runtime::runExecutorLuaScript(std::string_view luaScript) { +mlirtrt::runtime::runExecutorLuaScript(std::string_view luaScript, GpuAllocator* allocator) { ADD_RUNTIME_MODULE_RANGE("runtime_runExecutorLuaScript"); StatusOr> client = RuntimeClient::create(); @@ -120,7 +120,7 @@ mlirtrt::runtime::runExecutorLuaScript(std::string_view luaScript) { registerLuaRuntimeMethods(lua.lua_state(), RuntimeSessionOptions(), &(*client)->getPinnedMemorAllocator(), &(*client)->getAllocTracker(), - &(*client)->getResourceTracker()); + &(*client)->getResourceTracker(), allocator); sol::protected_function_result result = lua.script(luaScript); if (!result.valid()) { @@ -171,7 +171,7 @@ static Status maybeCheckForValidNcclUuid(const RuntimeSessionOptions &options) { /// global initialization. StatusOr> mlirtrt::runtime::createRuntimeSessionWithLuaBackend( - ExecutableView executable, const RuntimeSessionOptions &options) { + ExecutableView executable, GpuAllocator* allocator, const RuntimeSessionOptions &options) { ADD_RUNTIME_MODULE_RANGE("runtime_loadExecutable"); MTRT_RETURN_IF_ERROR(maybeCheckForValidNcclUuid(options)); @@ -184,7 +184,7 @@ mlirtrt::runtime::createRuntimeSessionWithLuaBackend( lua.open_libraries(sol::lib::base, sol::lib::string); registerLuaRuntimeMethods(lua.lua_state(), options, pinnedMemoryAllocator.get(), allocTracker.get(), - resourceTracker.get()); + resourceTracker.get(), allocator); // Load globals into the context. // TODO: eliminate this copy, we already own the executable. 
@@ -225,11 +225,11 @@ mlirtrt::runtime::createRuntimeSessionWithLuaBackend(
   }
   return std::make_unique<RuntimeSession>(
       options, executable, std::move(lua), std::move(pinnedMemoryAllocator),
-      std::move(allocTracker), std::move(resourceTracker));
+      std::move(allocTracker), std::move(resourceTracker), allocator);
 }

 StatusOr<int64_t> mlirtrt::runtime::runExecutorExecutable(
-    std::unique_ptr<Executable> executable) {
+    std::unique_ptr<Executable> executable, GpuAllocator* allocator) {

   StatusOr<std::unique_ptr<RuntimeClient>> client = RuntimeClient::create();
   if (!client.isOk())
@@ -245,7 +245,7 @@ StatusOr<int64_t> mlirtrt::runtime::runExecutorExecutable(
     return options.getStatus();

   StatusOr<std::unique_ptr<RuntimeSession>> session =
-      createRuntimeSessionWithLuaBackend(executable->getView(), *options);
+      createRuntimeSessionWithLuaBackend(executable->getView(), allocator, *options);
   if (!session.isOk())
     return session.getStatus();

diff --git a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp
index 1b96eac44..bac6942a1 100644
--- a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp
+++ b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp
@@ -64,6 +64,43 @@ class StdioLogger : public nvinfer1::ILogger {
   bool verbose;
 };

+//===----------------------------------------------------------------------===//
+// TensorRTCallBackAllocator
+//===----------------------------------------------------------------------===//
+
+class TensorRTCallBackAllocator final : public nvinfer1::IGpuAllocator {
+public:
+  TensorRTCallBackAllocator(GpuAllocator *gpuAllocator)
+      : nvinfer1::IGpuAllocator(), mGpuAllocatorCallBack(gpuAllocator) {}
+
+  void *allocate(uint64_t size, uint64_t alignment,
+                 nvinfer1::AllocatorFlags flags) noexcept final {
+    return allocateAsync(size, alignment, flags, nullptr);
+  }
+
+  bool deallocate(void *memory) noexcept final {
+    return deallocateAsync(memory, nullptr);
+  }
+
+  void *allocateAsync(uint64_t const size, uint64_t const alignment,
+                      uint32_t flags, cudaStream_t stream) noexcept final {
+    StatusOr<void *> status =
+        mGpuAllocatorCallBack->allocate(size, alignment, flags, stream);
+    assert(status.isOk());
+    return *status;
+  }
+
+  bool deallocateAsync(void *const memory,
+                       cudaStream_t stream) noexcept override {
+    StatusOr<bool> status = mGpuAllocatorCallBack->deallocate(memory, stream);
+    assert(status.isOk());
+    return *status;
+  }
+
+private:
+  GpuAllocator *mGpuAllocatorCallBack;
+};
+
 } // namespace

 static StdioLogger logger(/*verbose=*/false);
@@ -88,13 +125,38 @@ struct Signature {
   }
 };

+class NvInferRuntimeWrapper {
+public:
+  explicit NvInferRuntimeWrapper(GpuAllocator* gpuAllocator) {
+    runtime = std::shared_ptr<nvinfer1::IRuntime>(
+        nvinfer1::createInferRuntime(logger), [](nvinfer1::IRuntime *runtime) {
+          MTRT_DBGF("freeing tensorrt runtime at %lu",
+                    reinterpret_cast<uintptr_t>(runtime));
+          delete runtime;
+        });
+    // GpuAllocator is optional.
+ if (gpuAllocator) { + callbackAllocator = std::shared_ptr( + new TensorRTCallBackAllocator(gpuAllocator)); + runtime->setGpuAllocator(callbackAllocator.get()); + } + } + + nvinfer1::IRuntime *operator*() { return runtime.get(); } + nvinfer1::IRuntime *operator->() { return runtime.get(); } + + std::shared_ptr runtime; + std::shared_ptr callbackAllocator; +}; + class NvInferEngineWrapper { public: - explicit NvInferEngineWrapper(std::shared_ptr &runtime, + explicit NvInferEngineWrapper(std::shared_ptr runtime, uintptr_t pointer, size_t size) : runtime(runtime) { engine = std::shared_ptr( - runtime->deserializeCudaEngine(reinterpret_cast(pointer), size), + runtime->runtime->deserializeCudaEngine( + reinterpret_cast(pointer), size), [](nvinfer1::ICudaEngine *engine) { MTRT_DBGF("freeing cuda engine at %lu", reinterpret_cast(engine)); @@ -105,7 +167,7 @@ class NvInferEngineWrapper { nvinfer1::ICudaEngine *operator*() { return engine.get(); } nvinfer1::ICudaEngine *operator->() { return engine.get(); } - std::shared_ptr runtime; + std::shared_ptr runtime; std::shared_ptr engine; }; @@ -375,19 +437,20 @@ static Status enqueueV3Wrapper(AllocTracker &tracker, //===----------------------------------------------------------------------===// void mlirtrt::runtime::registerExecutorTensorRTModuleLuaRuntimeMethods( lua_State *luaState, PinnedMemoryAllocator *pinnedMemoryAllocator, - AllocTracker *allocTracker, ResourceTracker *resourceTracker) { + AllocTracker *allocTracker, ResourceTracker *resourceTracker, GpuAllocator* allocator) { sol::state_view lua(luaState); - lua["_trtrt_create_runtime"] = [](sol::this_state state) { + lua["_trtrt_create_runtime"] = + [allocator](sol::this_state state) -> std::shared_ptr { ADD_TENSORRT_MODULE_RANGE("trtrt_create_runtime"); MTRT_DBGF("%s", "creating nvinfer runtime"); - return std::shared_ptr( - nvinfer1::createInferRuntime(logger)); + return std::make_shared(allocator); }; lua["_trtrt_load"] = [allocTracker]( - sol::this_state state, std::shared_ptr &runtime, + sol::this_state state, + std::shared_ptr &runtime, uintptr_t pointer) -> std::shared_ptr { ADD_TENSORRT_MODULE_RANGE("trtrt_load"); const AllocTracker &tracker = *allocTracker; diff --git a/mlir-tensorrt/executor/lib/Support/Allocators.cpp b/mlir-tensorrt/executor/lib/Support/Allocators.cpp index 50dd7a5d8..1fa34325d 100644 --- a/mlir-tensorrt/executor/lib/Support/Allocators.cpp +++ b/mlir-tensorrt/executor/lib/Support/Allocators.cpp @@ -22,11 +22,15 @@ /// //===----------------------------------------------------------------------===// #include "mlir-executor/Support/Allocators.h" +#include "mlir-executor/Support/Status.h" +#include "mlir-executor/Runtime/Support/Support.h" +#include "cuda_runtime_api.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" +#include #include #include #include @@ -38,6 +42,57 @@ using namespace mlirtrt; DEBUG_WITH_TYPE("allocators", fprintf(stderr, "%s:%d " fmt "\n", __FILE__, \ __LINE__, __VA_ARGS__)) +//===----------------------------------------------------------------------===// +// CustomTensorRTAllocator +//===----------------------------------------------------------------------===// + +StatusOr +CustomTensorRTAllocator::allocate(uint64_t const size, uint64_t const alignment, + uint32_t /*flags*/, + std::optional stream) { + uint8_t *memory; + assert(alignment > 0 && (alignment & (alignment - 1)) == 0 && + "Memory alignment has to be power of 2"); + if 
(stream && *stream != nullptr) {
+    RETURN_ERROR_IF_CUDART_ERROR(
+        cudaMallocAsync(reinterpret_cast<void **>(&memory), size, *stream));
+    MTRT_DBGF("[CustomTensorRTAllocator][allocate]: Asynchronously allocated %lx bytes at 0x%lx on stream %lx", size,
+              reinterpret_cast<uintptr_t>(memory),
+              reinterpret_cast<uintptr_t>(*stream));
+  } else {
+    RETURN_ERROR_IF_CUDART_ERROR(
+        cudaMalloc(reinterpret_cast<void **>(&memory), size));
+    MTRT_DBGF("[CustomTensorRTAllocator][allocate]: Synchronously allocated %lx bytes at 0x%lx", size,
+              reinterpret_cast<uintptr_t>(memory));
+  }
+  assert(reinterpret_cast<uintptr_t>(memory) % alignment == 0);
+  return memory;
+}
+
+StatusOr<bool>
+CustomTensorRTAllocator::deallocate(void *const memory,
+                                    std::optional<cudaStream_t> stream) {
+  if (stream && *stream != nullptr) {
+    MTRT_DBGF("[CustomTensorRTAllocator][deallocate]: Asynchronously freeing CUDA device memory 0x%lx on stream %lx",
+              reinterpret_cast<uintptr_t>(memory),
+              reinterpret_cast<uintptr_t>(*stream));
+    RETURN_ERROR_IF_CUDART_ERROR(cudaFreeAsync(memory, *stream));
+  } else {
+    MTRT_DBGF("[CustomTensorRTAllocator][deallocate]: Synchronously freeing CUDA device/pinned host memory 0x%lx",
+              reinterpret_cast<uintptr_t>(memory));
+    RETURN_ERROR_IF_CUDART_ERROR(cudaFree(memory));
+  }
+  return true;
+}
+
+StatusOr<void *> CustomTensorRTAllocator::reallocate(
+    void * /* baseAddr */, uint64_t /* alignment */, uint64_t /* newSize */,
+    std::optional<cudaStream_t> /* stream */) {
+  return nullptr;
+}
+
 //===----------------------------------------------------------------------===//
 // PoolTrackedCudaEvent
 //===----------------------------------------------------------------------===//
diff --git a/mlir-tensorrt/executor/lib/Tools/ExecutorRunnerMain.cpp b/mlir-tensorrt/executor/lib/Tools/ExecutorRunnerMain.cpp
index 3241de0da..c8ab9269b 100644
--- a/mlir-tensorrt/executor/lib/Tools/ExecutorRunnerMain.cpp
+++ b/mlir-tensorrt/executor/lib/Tools/ExecutorRunnerMain.cpp
@@ -98,7 +98,12 @@ struct Options {
       cl::values(clEnumValN(Lua, "lua", "interpret the input as Lua code")),
       cl::values(clEnumValN(ExecutorRuntimeExecutable, "rtexe",
                             "load the input file as an Executor executable"))};
+
+  cl::opt<bool> useCustomAllocator{"use-custom-allocator",
+                                   cl::desc("Use custom allocator"),
+                                   cl::init(false)};
 };
+
 } // namespace

 LogicalResult
@@ -168,13 +173,19 @@ executor::ExecutorRunnerMain(int argc, char **argv,
   if (result != cudaSuccess)
     return emitError(loc) << "cudaFree failed: " << cudaGetErrorString(result);

+  std::unique_ptr<mlirtrt::GpuAllocator> allocator{nullptr};
+  if (options.useCustomAllocator) {
+    // Create an optional runtime GPU allocator.
+    allocator.reset(new CustomTensorRTAllocator());
+  }
+
   // Read the buffer as a Lua script and execute.
if (options.inputType == Lua) { assert(!options.dumpFunctionSignature && "Can not dump function signature for Lua input type."); mlirtrt::StatusOr result = - mlirtrt::runtime::runExecutorLuaScript(input->getBuffer()); + mlirtrt::runtime::runExecutorLuaScript(input->getBuffer(), allocator.get()); if (!result.isOk()) return emitError(UnknownLoc::get(&context)) << result.getString(); return success(*result == 0); @@ -202,7 +213,7 @@ executor::ExecutorRunnerMain(int argc, char **argv, } mlirtrt::StatusOr executionResult = - mlirtrt::runtime::runExecutorExecutable(std::move(*executable)); + mlirtrt::runtime::runExecutorExecutable(std::move(*executable), allocator.get()); if (!executionResult.isOk()) return emitError(UnknownLoc::get(&context)) << "failed to load and run executable: " diff --git a/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp b/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp index fef5ad868..e6e6bdda5 100644 --- a/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp +++ b/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp @@ -184,6 +184,16 @@ class PyRuntimeValue : public PyMTRTWrapper { mtrtPythonCapsuleToRuntimeValue, mtrtPythonRuntimeValueToCapsule}; }; +/// Python object type wrapper for `MTRT_GpuAllocator`. +class PyGpuAllocator : public PyMTRTWrapper { +public: + using Base::Base; + DECLARE_WRAPPER_CONSTRUCTORS(PyGpuAllocator); + + static constexpr auto kMethodTable = CAPITable{ + mtrtGpuAllocatorIsNull, mtrtGpuAllocatorDestroy}; +}; + /// Python object type wrapper for `MTRT_StableHLOToExecutableOptions`. class PyRuntimeSessionOptions : public PyMTRTWrapper(m, "GpuAllocator", py::module_local()) + .def(py::init<>([]() -> PyGpuAllocator * { + MTRT_GpuAllocator allocator; + MTRT_Status s = mtrtGpuAllocatorCreate(&allocator); + THROW_IF_MTRT_ERROR(s); + return new PyGpuAllocator(allocator); + })) + .def( + "allocate", + [](PyGpuAllocator &self, uint64_t size, uint64_t alignment, + std::optional flags, std::optional stream) { + void *memory{nullptr}; + MTRT_Status s = mtrtGpuAllocatorAllocate( + self, size, alignment, flags ? *flags : 0, + stream ? *stream : mtrtStreamGetNull(), &memory); + THROW_IF_MTRT_ERROR(s); + // Add changes to ensure memory is not released prematurely. + return memory; + }, + py::arg("size"), py::arg("alignment"), py::arg("flags") = py::none(), + py::arg("stream") = py::none()) + .def( + "deallocate", + [](PyGpuAllocator &self, void *memory, + std::optional stream) { + bool result; + MTRT_Status s = mtrtGpuAllocatorDeallocate( + self, memory, stream ? *stream : mtrtStreamGetNull(), &result); + THROW_IF_MTRT_ERROR(s); + // Add changes to ensure memory is not released prematurely. + return result; + }, + py::arg("memory"), py::arg("stream") = py::none()); + py::class_(m, "RuntimeSession", py::module_local()) - .def(py::init<>([](PyRuntimeSessionOptions &options, PyExecutable &exe) { + .def(py::init<>([](PyRuntimeSessionOptions &options, PyExecutable &exe, + std::optional allocator) { MTRT_RuntimeSession session; - MTRT_Status s = mtrtRuntimeSessionCreate(options, exe, &session); + MTRT_Status s = mtrtRuntimeSessionCreate( + options, exe, + allocator ? *allocator : mtrtGpuAllocatorGetNull(), &session); THROW_IF_MTRT_ERROR(s); return new PyRuntimeSession(session); }), - py::arg("options"), py::arg("executable")) + py::arg("options"), py::arg("executable"), + py::arg("gpu_allocator") = py::none()) .def( "execute_function", [](PyRuntimeSession &self, std::string name,
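
For reviewers, below is a minimal consumer-side sketch of the `GpuAllocator` extension point this patch introduces: a wrapper that counts the bytes requested while delegating the actual CUDA work to the `CustomTensorRTAllocator` added above. It is illustrative only and not part of the patch; the `TrackingGpuAllocator` name and the byte counter are hypothetical, while the `GpuAllocator` interface, `AllocatorFlags`, `StatusOr`, and `CustomTensorRTAllocator` come from the headers in this change.

// Illustrative sketch (not part of this patch). Assumes the Allocators.h
// interface added above; TrackingGpuAllocator and its counter are made up.
#include <atomic>
#include <cstdint>
#include <optional>

#include "cuda_runtime_api.h"
#include "mlir-executor/Support/Allocators.h"
#include "mlir-executor/Support/Status.h"

namespace {
class TrackingGpuAllocator : public mlirtrt::GpuAllocator {
public:
  mlirtrt::StatusOr<void *>
  allocate(uint64_t const size, uint64_t const alignment,
           mlirtrt::AllocatorFlags const flags,
           std::optional<cudaStream_t> stream) override {
    // Delegate to the allocator from this patch and record successful requests.
    mlirtrt::StatusOr<void *> result =
        impl.allocate(size, alignment, flags, stream);
    if (result.isOk())
      totalRequestedBytes += size;
    return result;
  }

  mlirtrt::StatusOr<bool>
  deallocate(void *const memory,
             std::optional<cudaStream_t> stream) override {
    return impl.deallocate(memory, stream);
  }

  mlirtrt::StatusOr<void *>
  reallocate(void *baseAddr, uint64_t alignment, uint64_t newSize,
             std::optional<cudaStream_t> stream) override {
    return impl.reallocate(baseAddr, alignment, newSize, stream);
  }

  uint64_t getTotalRequestedBytes() const { return totalRequestedBytes; }

private:
  // Delegate that performs the actual cudaMalloc/cudaMallocAsync calls.
  mlirtrt::CustomTensorRTAllocator impl;
  std::atomic<uint64_t> totalRequestedBytes{0};
};
} // namespace

An instance of such a class can be handed to any of the new `GpuAllocator*` parameters threaded through this patch, e.g. `createRuntimeSessionWithLuaBackend` (or `mtrtRuntimeSessionCreate` from the C API), in the same way `CustomTensorRTAllocator` is wired up in `ExecutorRunnerMain.cpp` above.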