From 99c3080c54586147bb2df6166d53650c445e068b Mon Sep 17 00:00:00 2001 From: Jhalak Patel Date: Tue, 13 Aug 2024 11:16:42 -0700 Subject: [PATCH 1/6] Add IGpuAllocator to MLIR-TensorRT --- mlir-tensorrt/README.md | 4 +- .../include/mlir-executor-c/Runtime/Runtime.h | 34 +++++++- .../include/mlir-executor/Runtime/API/API.h | 7 +- .../Runtime/Backend/Lua/LuaRegistration.h | 2 +- .../Runtime/Backend/Lua/LuaRuntime.h | 6 +- .../Lua/Modules/TensorRT/TensorRTModule.h | 2 +- .../mlir-executor/Support/Allocators.h | 73 +++++++++++++++++ .../executor/lib/CAPI/Runtime/Runtime.cpp | 56 ++++++++++++- .../executor/lib/Runtime/API/API.cpp | 7 +- .../lib/Runtime/Backend/Lua/LuaRuntime.cpp | 22 +++--- .../Lua/Modules/TensorRT/TensorRTModule.cpp | 79 +++++++++++++++++-- .../executor/lib/Support/Allocators.cpp | 54 +++++++++++++ .../executor/lib/Tools/ExecutorRunnerMain.cpp | 15 +++- .../python/bindings/Runtime/RuntimePyBind.cpp | 54 ++++++++++++- 14 files changed, 377 insertions(+), 38 deletions(-) diff --git a/mlir-tensorrt/README.md b/mlir-tensorrt/README.md index 3916be7b0..206d1e32e 100644 --- a/mlir-tensorrt/README.md +++ b/mlir-tensorrt/README.md @@ -23,7 +23,7 @@ We currently support only building on Linux x86 systems. We support building several different ways (only via CMake) depending on use-case. In each case, the LLVM-Project version that we are currently aligned to is -given in `build_tools/cmake/LLVMCommit.txt`. +given in `build_tools/cmake/LLVMCommit.cmake`. Note that currently we provide an LLVM patch which essentially cherry-picks the bug fixes from [this open MLIR PR](https://github.com/llvm/llvm-project/pull/91524). @@ -82,7 +82,7 @@ git clone https://github.com/llvm/llvm-project.git llvm-project # Checkout the right commit. Of course, you may try # a newer commit or your own modified LLVM-Project. cd llvm-project -git checkout $(cat build_tools/cmake/LLVMCommit.cmake | grep -Po '(?<=").*(?=")') +git checkout $(cat ../build_tools/cmake/LLVMCommit.cmake | grep -Po '(?<=").*(?=")') # Apply patch from llvm-project PR 91524 git apply ../build_tools/llvm-project.patch diff --git a/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h b/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h index 11ae93519..65377da35 100644 --- a/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h +++ b/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h @@ -319,6 +319,38 @@ mtrtScalarValueCastToRuntimeValue(MTRT_ScalarValue v); MLIR_CAPI_EXPORTED MTRT_Status mtrtScalarValueGetType(MTRT_ScalarValue scalar, MTRT_ScalarTypeCode *code); +//===----------------------------------------------------------------------===// +// MTRT_GpuAllocator +//===----------------------------------------------------------------------===// + +typedef struct MTRT_GpuAllocator { + void *ptr; +} MTRT_GpuAllocator; + +/// Checks nullity of `GpuAllocator`. +MTRT_CAPI_EXPORTED bool mtrtGpuAllocatorIsNull(MTRT_GpuAllocator gpuAllocator); + +/// Returns null `GpuAllocator`. 
+MTRT_CAPI_EXPORTED MTRT_GpuAllocator mtrtGpuAllocatorGetNull(); + +MTRT_CAPI_EXPORTED MTRT_Status +mtrtGpuAllocatorDestroy(MTRT_GpuAllocator executable); + +MTRT_CAPI_EXPORTED MTRT_Status +mtrtGpuAllocatorCreate(MTRT_GpuAllocator *allocator); + +//===----------------------------------------------------------------------===// +// MTRT_GpuAllocator +//===----------------------------------------------------------------------===// + +MTRT_CAPI_EXPORTED MTRT_Status mtrtGpuAllocatorAllocate( + MTRT_GpuAllocator gpuAllocator, uint64_t size, uint64_t alignment, + uint32_t flags, MTRT_Stream stream, void **memory); + +MTRT_CAPI_EXPORTED MTRT_Status +mtrtGpuAllocatorDeallocate(MTRT_GpuAllocator gpuAllocator, void *memory, + MTRT_Stream stream, bool *result); + //===----------------------------------------------------------------------===// // MTRT_RuntimeSessionOptions //===----------------------------------------------------------------------===// @@ -359,7 +391,7 @@ typedef struct MTRT_RuntimeSession { /// that the session only has a read-only view in to the Executable for code and /// constant data. Therefore the Executable must outlive the RuntimeSession. MLIR_CAPI_EXPORTED MTRT_Status mtrtRuntimeSessionCreate( - MTRT_RuntimeSessionOptions options, MTRT_Executable executable, + MTRT_RuntimeSessionOptions options, MTRT_Executable executable, MTRT_GpuAllocator allocator, MTRT_RuntimeSession *result); /// Destory the session. This does not destroy the associated Executable, which diff --git a/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h b/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h index d3672c149..b35d0b777 100644 --- a/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h +++ b/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h @@ -867,7 +867,8 @@ class RuntimeSession { sol::state state, std::unique_ptr pinnedMemoryAllocator, std::unique_ptr allocTracker, - std::unique_ptr resourceTracker); + std::unique_ptr resourceTracker, + GpuAllocator* gpuAllocator); ExecutableView getExecutable() const { return executable; } @@ -881,6 +882,8 @@ class RuntimeSession { ResourceTracker &getResourceTracker() { return *resourceTracker; } + GpuAllocator* getGpuAllocator() { return gpuAllocator; } + private: RuntimeSessionOptions options; ExecutableView executable; @@ -888,7 +891,7 @@ class RuntimeSession { std::unique_ptr pinnedMemoryAllocator; std::unique_ptr allocTracker; std::unique_ptr resourceTracker; - + GpuAllocator* gpuAllocator; sol::state state; }; diff --git a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRegistration.h b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRegistration.h index b5fed9c3d..922e964d4 100644 --- a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRegistration.h +++ b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRegistration.h @@ -37,6 +37,6 @@ void registerLuaRuntimeMethods(lua_State *state, const RuntimeSessionOptions &options, PinnedMemoryAllocator *pinnedMemoryAllocator, AllocTracker *allocTracker, - ResourceTracker *resourceTracker); + ResourceTracker *resourceTracker, GpuAllocator* allocator); } // namespace mlirtrt::runtime diff --git a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRuntime.h b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRuntime.h index f39eabd7b..0635e8cb4 100644 --- a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRuntime.h +++ 
b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRuntime.h @@ -36,7 +36,7 @@ namespace mlirtrt::runtime { /// `main` function. It is assumed that `main` takes no arguments and returns an /// integer result (which is returned if the execution is successful). /// TODO: this should take a handle to a function for streaming output/errors. -StatusOr runExecutorLuaScript(std::string_view luaScript); +StatusOr runExecutorLuaScript(std::string_view luaScript, GpuAllocator* allocator); /// Synchronously run a serialized executor Executable one time. An `Executable` /// is essentially a Lua script packaged with metadata and serialized constants @@ -48,12 +48,12 @@ StatusOr runExecutorLuaScript(std::string_view luaScript); /// execution is successful). /// TODO: this should take a handle to a function for /// streaming output/errors. -StatusOr runExecutorExecutable(std::unique_ptr executable); +StatusOr runExecutorExecutable(std::unique_ptr executable, GpuAllocator* allocator); /// Create an execution state. This will setup a Lua environment and invoke /// global initialization. StatusOr> -createRuntimeSessionWithLuaBackend(ExecutableView executable, +createRuntimeSessionWithLuaBackend(ExecutableView executable, GpuAllocator* allocator, const RuntimeSessionOptions &options); /// Set the primary stream for the loaded executable to use. diff --git a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.h b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.h index 37d8de629..1ceb91367 100644 --- a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.h +++ b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.h @@ -37,7 +37,7 @@ class ResourceTracker; /// Lua state. void registerExecutorTensorRTModuleLuaRuntimeMethods( lua_State *luaState, PinnedMemoryAllocator *pinnedMemoryAllocator, - AllocTracker *allocTracker, ResourceTracker *resourceTracker); + AllocTracker *allocTracker, ResourceTracker *resourceTracker, GpuAllocator* allocator); } // namespace mlirtrt::runtime diff --git a/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h b/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h index 180dbf09e..66f54fb46 100644 --- a/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h +++ b/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h @@ -32,6 +32,79 @@ namespace mlirtrt { struct EventPool; +// Abstract allocator to be implemented by consumers. 
+using AllocatorFlags = uint32_t; + +class GpuAllocator { +public: + GpuAllocator() = default; + virtual ~GpuAllocator() = default; + + virtual StatusOr reallocate(void *baseAddr, uint64_t alignment, + uint64_t newSize, + std::optional stream) = 0; + + virtual StatusOr allocate(uint64_t const size, + uint64_t const alignment, + AllocatorFlags const flags, + std::optional stream) = 0; + + virtual StatusOr deallocate(void *const memory, + std::optional stream) = 0; + +protected: + GpuAllocator(GpuAllocator const &) = delete; + GpuAllocator(GpuAllocator &&) = delete; + GpuAllocator &operator=(GpuAllocator const &) & = delete; + GpuAllocator &operator=(GpuAllocator &&) & = delete; +}; + +class StubAllocator : public GpuAllocator { +public: + StubAllocator() = default; + ~StubAllocator() = default; + + StatusOr reallocate(void *baseAddr, uint64_t alignment, + uint64_t newSize, + std::optional stream) override { + return getStatusWithMsg( + StatusCode::InternalError, + "[StubAllocator][reallocate]: Must be overriden in Python"); + } + + StatusOr allocate(uint64_t const size, uint64_t const alignment, + AllocatorFlags const flags, + std::optional stream) override { + return getStatusWithMsg( + StatusCode::InternalError, + "[StubAllocator][allocate]: Must be overriden in Python"); + } + + StatusOr deallocate(void *const memory, + std::optional stream) override { + return getStatusWithMsg( + StatusCode::InternalError, + "[StubAllocator][deallocate]: Must be overriden in Python"); + } +}; + +class CustomTensorRTAllocator : public GpuAllocator { +public: + CustomTensorRTAllocator() = default; + ~CustomTensorRTAllocator() = default; + + StatusOr reallocate(void *baseAddr, uint64_t alignment, + uint64_t newSize, + std::optional stream) override; + + StatusOr allocate(uint64_t const size, uint64_t const alignment, + AllocatorFlags const flags, + std::optional stream) override; + + StatusOr deallocate(void *const memory, + std::optional stream) override; +}; + //===----------------------------------------------------------------------===// // PoolTrackedCudaEvent //===----------------------------------------------------------------------===// diff --git a/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp b/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp index 41b4db1b2..7c412a290 100644 --- a/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp +++ b/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp @@ -27,6 +27,7 @@ #include "mlir-executor/Runtime/API/API.h" #include "mlir-executor/Runtime/API/ExecutableFlatbuffer.h" #include "mlir-executor/Runtime/Backend/Lua/LuaRuntime.h" +#include "mlir-executor/Support/Allocators.h" #include "mlir-executor/Support/Status.h" #include "mlir/Support/FileUtilities.h" #include "llvm/ADT/SmallVectorExtras.h" @@ -48,6 +49,8 @@ DEFINE_C_API_PTR_METHODS(MTRT_RuntimeSession, ::mlirtrt::runtime::RuntimeSession) DEFINE_C_API_PTR_METHODS(MTRT_RuntimeSessionOptions, ::mlirtrt::runtime::RuntimeSessionOptions) +DEFINE_C_API_PTR_METHODS(MTRT_GpuAllocator, + ::mlirtrt::GpuAllocator) DEFINE_C_API_PTR_METHODS(MTRT_Executable, ::mlirtrt::runtime::Executable) DEFINE_C_API_PTR_METHODS(MTRT_Stream, MTRT_StreamImpl) DEFINE_C_API_PTR_METHODS(MTRT_RuntimeValue, ::mlirtrt::runtime::RuntimeValue) @@ -600,6 +603,55 @@ MTRT_ScalarValue mtrtRuntimeValueDynCastToScalar(MTRT_RuntimeValue v) { return wrap(static_cast(x)); } +//===----------------------------------------------------------------------===// +// MTRT_GpuAllocator 
+//===----------------------------------------------------------------------===// + +bool mtrtGpuAllocatorIsNull(MTRT_GpuAllocator gpuAllocator) { + return !gpuAllocator.ptr; +} + +MTRT_GpuAllocator mtrtGpuAllocatorGetNull() { return MTRT_GpuAllocator{nullptr}; } + +MTRT_Status mtrtGpuAllocatorDestroy(MTRT_GpuAllocator executable) { + delete unwrap(executable); + return mtrtStatusGetOk(); +} + +MTRT_Status mtrtGpuAllocatorCreate(MTRT_GpuAllocator *allocator) { + *allocator = MTRT_GpuAllocator{/*ptr=*/new StubAllocator()}; + return mtrtStatusGetOk(); +} + +MTRT_Status mtrtGpuAllocatorAllocate(MTRT_GpuAllocator gpuAllocator, + uint64_t size, uint64_t alignment, + uint32_t flags, MTRT_Stream stream, + void **memory) { + GpuAllocator *cppGpuAllocator = unwrap(gpuAllocator); + StatusOr status = cppGpuAllocator->allocate( + size, alignment, flags, + !mtrtStreamIsNull(stream) ? std::optional(unwrap(stream)->getRawStream()) + : std::nullopt); + if (status.isOk()) { + *memory = *status; + } + return mtrtStatusGetOk(); +} + +MTRT_Status mtrtGpuAllocatorDeallocate(MTRT_GpuAllocator gpuAllocator, + void *memory, MTRT_Stream stream, + bool *result) { + GpuAllocator *cppGpuAllocator = unwrap(gpuAllocator); + StatusOr status = cppGpuAllocator->deallocate( + memory, !mtrtStreamIsNull(stream) + ? std::optional(unwrap(stream)->getRawStream()) + : std::nullopt); + if (status.isOk()) { + *result = *status; + } + return mtrtStatusGetOk(); +} + //===----------------------------------------------------------------------===// // MTRT_RuntimeSessionOptions //===----------------------------------------------------------------------===// @@ -627,12 +679,14 @@ mtrtRuntimeSessionOptionsDestroy(MTRT_RuntimeSessionOptions options) { MTRT_Status mtrtRuntimeSessionCreate(MTRT_RuntimeSessionOptions options, MTRT_Executable executable, + MTRT_GpuAllocator gpuAllocator, MTRT_RuntimeSession *result) { RuntimeSessionOptions *cppOptions = unwrap(options); Executable *cppExecutable = unwrap(executable); + GpuAllocator *cppGpuAllocator = unwrap(gpuAllocator); StatusOr> session = - createRuntimeSessionWithLuaBackend(cppExecutable->getView(), *cppOptions); + createRuntimeSessionWithLuaBackend(cppExecutable->getView(), cppGpuAllocator, *cppOptions); if (session.isError()) return wrap(session.getStatus()); diff --git a/mlir-tensorrt/executor/lib/Runtime/API/API.cpp b/mlir-tensorrt/executor/lib/Runtime/API/API.cpp index 6c10d1f99..b53ab2096 100644 --- a/mlir-tensorrt/executor/lib/Runtime/API/API.cpp +++ b/mlir-tensorrt/executor/lib/Runtime/API/API.cpp @@ -353,16 +353,17 @@ RuntimeSessionOptions::createUsingSingleHostMpi() { //===----------------------------------------------------------------------===// // RuntimeSession //===----------------------------------------------------------------------===// - RuntimeSession::RuntimeSession( RuntimeSessionOptions options, ExecutableView exe, sol::state state, std::unique_ptr pinnedMemoryAllocator, std::unique_ptr allocTracker, - std::unique_ptr resourceTracker) + std::unique_ptr resourceTracker, + GpuAllocator *gpuAllocator) : options(std::move(options)), executable(exe), pinnedMemoryAllocator(std::move(pinnedMemoryAllocator)), allocTracker(std::move(allocTracker)), - resourceTracker(std::move(resourceTracker)), state(std::move(state)) {} + resourceTracker(std::move(resourceTracker)), gpuAllocator(gpuAllocator), + state(std::move(state)) {} //===----------------------------------------------------------------------===// // AllocTracker diff --git 
a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp index 7596c9da7..b19541977 100644 --- a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp +++ b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp @@ -72,7 +72,7 @@ static void registerDefaultDeviceDependentMethods(lua_State *state, static void registerLuaRuntimeMethodsCommon( lua_State *state, PinnedMemoryAllocator *pinnedMemoryAllocator, - AllocTracker *allocTracker, ResourceTracker *resourceTracker) { + AllocTracker *allocTracker, ResourceTracker *resourceTracker, GpuAllocator* allocator) { registerExecutorCoreModuleLuaRuntimeMethods(state, pinnedMemoryAllocator, allocTracker); registerExecutorCUDAModuleLuaRuntimeMethods( @@ -84,15 +84,15 @@ static void registerLuaRuntimeMethodsCommon( #endif registerExecutorTensorRTModuleLuaRuntimeMethods( - state, pinnedMemoryAllocator, allocTracker, resourceTracker); + state, pinnedMemoryAllocator, allocTracker, resourceTracker, allocator); } void mlirtrt::runtime::registerLuaRuntimeMethods( lua_State *state, const RuntimeSessionOptions &options, PinnedMemoryAllocator *pinnedMemoryAllocator, AllocTracker *allocTracker, - ResourceTracker *resourceTracker) { + ResourceTracker *resourceTracker, GpuAllocator* allocator) { registerLuaRuntimeMethodsCommon(state, pinnedMemoryAllocator, allocTracker, - resourceTracker); + resourceTracker, allocator); #ifdef MLIR_EXECUTOR_ENABLE_NCCL registerExecutorNCCLModuleLuaRuntimeMethods(state, resourceTracker); registerDeviceDependentNCCLMethods(state, options.getNumDevices(), @@ -108,7 +108,7 @@ void mlirtrt::runtime::registerLuaRuntimeMethods( } StatusOr -mlirtrt::runtime::runExecutorLuaScript(std::string_view luaScript) { +mlirtrt::runtime::runExecutorLuaScript(std::string_view luaScript, GpuAllocator* allocator) { ADD_RUNTIME_MODULE_RANGE("runtime_runExecutorLuaScript"); StatusOr> client = RuntimeClient::create(); @@ -120,7 +120,7 @@ mlirtrt::runtime::runExecutorLuaScript(std::string_view luaScript) { registerLuaRuntimeMethods(lua.lua_state(), RuntimeSessionOptions(), &(*client)->getPinnedMemorAllocator(), &(*client)->getAllocTracker(), - &(*client)->getResourceTracker()); + &(*client)->getResourceTracker(), allocator); sol::protected_function_result result = lua.script(luaScript); if (!result.valid()) { @@ -171,7 +171,7 @@ static Status maybeCheckForValidNcclUuid(const RuntimeSessionOptions &options) { /// global initialization. StatusOr> mlirtrt::runtime::createRuntimeSessionWithLuaBackend( - ExecutableView executable, const RuntimeSessionOptions &options) { + ExecutableView executable, GpuAllocator* allocator, const RuntimeSessionOptions &options) { ADD_RUNTIME_MODULE_RANGE("runtime_loadExecutable"); MTRT_RETURN_IF_ERROR(maybeCheckForValidNcclUuid(options)); @@ -184,7 +184,7 @@ mlirtrt::runtime::createRuntimeSessionWithLuaBackend( lua.open_libraries(sol::lib::base, sol::lib::string); registerLuaRuntimeMethods(lua.lua_state(), options, pinnedMemoryAllocator.get(), allocTracker.get(), - resourceTracker.get()); + resourceTracker.get(), allocator); // Load globals into the context. // TODO: eliminate this copy, we already own the executable. 
@@ -225,11 +225,11 @@ mlirtrt::runtime::createRuntimeSessionWithLuaBackend( } return std::make_unique( options, executable, std::move(lua), std::move(pinnedMemoryAllocator), - std::move(allocTracker), std::move(resourceTracker)); + std::move(allocTracker), std::move(resourceTracker), allocator); } StatusOr mlirtrt::runtime::runExecutorExecutable( - std::unique_ptr executable) { + std::unique_ptr executable, GpuAllocator* allocator) { StatusOr> client = RuntimeClient::create(); if (!client.isOk()) @@ -245,7 +245,7 @@ StatusOr mlirtrt::runtime::runExecutorExecutable( return options.getStatus(); StatusOr> session = - createRuntimeSessionWithLuaBackend(executable->getView(), *options); + createRuntimeSessionWithLuaBackend(executable->getView(), allocator, *options); if (!session.isOk()) return session.getStatus(); diff --git a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp index 1b96eac44..bac6942a1 100644 --- a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp +++ b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp @@ -64,6 +64,43 @@ class StdioLogger : public nvinfer1::ILogger { bool verbose; }; +//===----------------------------------------------------------------------===// +// TensorRTCallBackAllocator +//===----------------------------------------------------------------------===// + +class TensorRTCallBackAllocator final : public nvinfer1::IGpuAllocator { +public: + TensorRTCallBackAllocator(GpuAllocator *gpuAllocator) + : nvinfer1::IGpuAllocator(), mGpuAllocatorCallBack(gpuAllocator) {} + + void *allocate(uint64_t size, uint64_t alignment, + nvinfer1::AllocatorFlags flags) noexcept final { + return allocateAsync(size, alignment, flags, nullptr); + } + + bool deallocate(void *memory) noexcept final { + return deallocateAsync(memory, nullptr); + } + + void *allocateAsync(uint64_t const size, uint64_t const alignment, + uint32_t flags, cudaStream_t stream) noexcept final { + StatusOr status = + mGpuAllocatorCallBack->allocate(size, alignment, flags, stream); + assert(status.isOk()); + return *status; + } + + bool deallocateAsync(void *const memory, + cudaStream_t stream) noexcept override { + StatusOr status = mGpuAllocatorCallBack->deallocate(memory, stream); + assert(status.isOk()); + return *status; + } + +private: + GpuAllocator *mGpuAllocatorCallBack; +}; + } // namespace static StdioLogger logger(/*verbose=*/false); @@ -88,13 +125,38 @@ struct Signature { } }; +class NvInferRuntimeWrapper { +public: + explicit NvInferRuntimeWrapper(GpuAllocator* gpuAllocator) { + runtime = std::shared_ptr( + nvinfer1::createInferRuntime(logger), [](nvinfer1::IRuntime *runtime) { + MTRT_DBGF("freeing tensorrt runtime at %lu", + reinterpret_cast(runtime)); + delete runtime; + }); + // GpuAllocator is optional. 
+ if (gpuAllocator) { + callbackAllocator = std::shared_ptr( + new TensorRTCallBackAllocator(gpuAllocator)); + runtime->setGpuAllocator(callbackAllocator.get()); + } + } + + nvinfer1::IRuntime *operator*() { return runtime.get(); } + nvinfer1::IRuntime *operator->() { return runtime.get(); } + + std::shared_ptr runtime; + std::shared_ptr callbackAllocator; +}; + class NvInferEngineWrapper { public: - explicit NvInferEngineWrapper(std::shared_ptr &runtime, + explicit NvInferEngineWrapper(std::shared_ptr runtime, uintptr_t pointer, size_t size) : runtime(runtime) { engine = std::shared_ptr( - runtime->deserializeCudaEngine(reinterpret_cast(pointer), size), + runtime->runtime->deserializeCudaEngine( + reinterpret_cast(pointer), size), [](nvinfer1::ICudaEngine *engine) { MTRT_DBGF("freeing cuda engine at %lu", reinterpret_cast(engine)); @@ -105,7 +167,7 @@ class NvInferEngineWrapper { nvinfer1::ICudaEngine *operator*() { return engine.get(); } nvinfer1::ICudaEngine *operator->() { return engine.get(); } - std::shared_ptr runtime; + std::shared_ptr runtime; std::shared_ptr engine; }; @@ -375,19 +437,20 @@ static Status enqueueV3Wrapper(AllocTracker &tracker, //===----------------------------------------------------------------------===// void mlirtrt::runtime::registerExecutorTensorRTModuleLuaRuntimeMethods( lua_State *luaState, PinnedMemoryAllocator *pinnedMemoryAllocator, - AllocTracker *allocTracker, ResourceTracker *resourceTracker) { + AllocTracker *allocTracker, ResourceTracker *resourceTracker, GpuAllocator* allocator) { sol::state_view lua(luaState); - lua["_trtrt_create_runtime"] = [](sol::this_state state) { + lua["_trtrt_create_runtime"] = + [allocator](sol::this_state state) -> std::shared_ptr { ADD_TENSORRT_MODULE_RANGE("trtrt_create_runtime"); MTRT_DBGF("%s", "creating nvinfer runtime"); - return std::shared_ptr( - nvinfer1::createInferRuntime(logger)); + return std::make_shared(allocator); }; lua["_trtrt_load"] = [allocTracker]( - sol::this_state state, std::shared_ptr &runtime, + sol::this_state state, + std::shared_ptr &runtime, uintptr_t pointer) -> std::shared_ptr { ADD_TENSORRT_MODULE_RANGE("trtrt_load"); const AllocTracker &tracker = *allocTracker; diff --git a/mlir-tensorrt/executor/lib/Support/Allocators.cpp b/mlir-tensorrt/executor/lib/Support/Allocators.cpp index 2eadd2cca..2ee7c28f4 100644 --- a/mlir-tensorrt/executor/lib/Support/Allocators.cpp +++ b/mlir-tensorrt/executor/lib/Support/Allocators.cpp @@ -23,11 +23,14 @@ //===----------------------------------------------------------------------===// #include "mlir-executor/Support/Allocators.h" #include "mlir-executor/Support/Status.h" +#include "mlir-executor/Runtime/Support/Support.h" +#include "cuda_runtime_api.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" +#include #include #include #include @@ -39,6 +42,57 @@ using namespace mlirtrt; DEBUG_WITH_TYPE("allocators", fprintf(stderr, "%s:%d " fmt "\n", __FILE__, \ __LINE__, __VA_ARGS__)) +//===----------------------------------------------------------------------===// +// CustomTensorRTAllocator +//===----------------------------------------------------------------------===// + +StatusOr +CustomTensorRTAllocator::allocate(uint64_t const size, uint64_t const alignment, + uint32_t /*flags*/, + std::optional stream) { + uint8_t *memory; + assert(alignment > 0 && (alignment & (alignment - 1)) == 0 && + "Memory alignment has to be power of 2"); + if 
(stream && *stream != nullptr) { + RETURN_ERROR_IF_CUDART_ERROR( + cudaMallocAsync(reinterpret_cast(&memory), size, *stream)); + MTRT_DBGF("[CustomTensorRTAllocator][allocate]: Asynchronously allocated %lx bytes at 0x%lx on stream %lx", size, + reinterpret_cast(memory), + reinterpret_cast(*stream)); + } else { + RETURN_ERROR_IF_CUDART_ERROR( + cudaMalloc(reinterpret_cast(&memory), size)); + MTRT_DBGF("[CustomTensorRTAllocator][allocate]: Synchronously allocated %lx bytes at 0x%lx", size, + reinterpret_cast(memory)); + } + assert(reinterpret_cast(memory) % alignment == 0); + return memory; +} + +StatusOr +CustomTensorRTAllocator::deallocate(void *const memory, + std::optional stream) { + if (stream && *stream != nullptr) { + MTRT_DBGF("[CustomTensorRTAllocator][deallocate]: Asynchronously freeing CUDA device memory 0x%lx on stream %lx", + reinterpret_cast(memory), + reinterpret_cast(*stream)); + RETURN_ERROR_IF_CUDART_ERROR(cudaFreeAsync(memory, *stream)); + } else { + MTRT_DBGF("[CustomTensorRTAllocator][deallocate]: Synchronously freeing CUDA device/pinned host memory 0x%lx ptr " + "on stream %lx", + reinterpret_cast(memory), + reinterpret_cast(*stream)); + RETURN_ERROR_IF_CUDART_ERROR(cudaFree(memory)); + } + return true; +} + +StatusOr CustomTensorRTAllocator::reallocate( + void * /* baseAddr */, uint64_t /* alignment */, uint64_t /* newSize */, + std::optional /* stream */) { + return nullptr; +} + //===----------------------------------------------------------------------===// // PoolTrackedCudaEvent //===----------------------------------------------------------------------===// diff --git a/mlir-tensorrt/executor/lib/Tools/ExecutorRunnerMain.cpp b/mlir-tensorrt/executor/lib/Tools/ExecutorRunnerMain.cpp index 3241de0da..c8ab9269b 100644 --- a/mlir-tensorrt/executor/lib/Tools/ExecutorRunnerMain.cpp +++ b/mlir-tensorrt/executor/lib/Tools/ExecutorRunnerMain.cpp @@ -98,7 +98,12 @@ struct Options { cl::values(clEnumValN(Lua, "lua", "interpret the input as Lua code")), cl::values(clEnumValN(ExecutorRuntimeExecutable, "rtexe", "load the input file as an Executor executable"))}; + + cl::opt useCustomAllocator{"use-custom-allocator", + cl::desc("Use custom allocator"), + cl::init(false)}; }; + } // namespace LogicalResult @@ -168,13 +173,19 @@ executor::ExecutorRunnerMain(int argc, char **argv, if (result != cudaSuccess) return emitError(loc) << "cudaFree failed: " << cudaGetErrorString(result); + std::unique_ptr allocator{nullptr}; + if (options.useCustomAllocator) { + // Create an optional runtime GPU allocator + allocator.reset(new CustomTensorRTAllocator()); + } + // Read the buffer as a Lua script and execute. 
if (options.inputType == Lua) { assert(!options.dumpFunctionSignature && "Can not dump function signature for Lua input type."); mlirtrt::StatusOr result = - mlirtrt::runtime::runExecutorLuaScript(input->getBuffer()); + mlirtrt::runtime::runExecutorLuaScript(input->getBuffer(), allocator.get()); if (!result.isOk()) return emitError(UnknownLoc::get(&context)) << result.getString(); return success(*result == 0); @@ -202,7 +213,7 @@ executor::ExecutorRunnerMain(int argc, char **argv, } mlirtrt::StatusOr executionResult = - mlirtrt::runtime::runExecutorExecutable(std::move(*executable)); + mlirtrt::runtime::runExecutorExecutable(std::move(*executable), allocator.get()); if (!executionResult.isOk()) return emitError(UnknownLoc::get(&context)) << "failed to load and run executable: " diff --git a/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp b/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp index fef5ad868..e6e6bdda5 100644 --- a/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp +++ b/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp @@ -184,6 +184,16 @@ class PyRuntimeValue : public PyMTRTWrapper { mtrtPythonCapsuleToRuntimeValue, mtrtPythonRuntimeValueToCapsule}; }; +/// Python object type wrapper for `MTRT_GpuAllocator`. +class PyGpuAllocator : public PyMTRTWrapper { +public: + using Base::Base; + DECLARE_WRAPPER_CONSTRUCTORS(PyGpuAllocator); + + static constexpr auto kMethodTable = CAPITable{ + mtrtGpuAllocatorIsNull, mtrtGpuAllocatorDestroy}; +}; + /// Python object type wrapper for `MTRT_StableHLOToExecutableOptions`. class PyRuntimeSessionOptions : public PyMTRTWrapper(m, "GpuAllocator", py::module_local()) + .def(py::init<>([]() -> PyGpuAllocator * { + MTRT_GpuAllocator allocator; + MTRT_Status s = mtrtGpuAllocatorCreate(&allocator); + THROW_IF_MTRT_ERROR(s); + return new PyGpuAllocator(allocator); + })) + .def( + "allocate", + [](PyGpuAllocator &self, uint64_t size, uint64_t alignment, + std::optional flags, std::optional stream) { + void *memory{nullptr}; + MTRT_Status s = mtrtGpuAllocatorAllocate( + self, size, alignment, flags ? *flags : 0, + stream ? *stream : mtrtStreamGetNull(), &memory); + THROW_IF_MTRT_ERROR(s); + // Add changes to ensure memory is not released prematurely. + return memory; + }, + py::arg("size"), py::arg("alignment"), py::arg("flags") = py::none(), + py::arg("stream") = py::none()) + .def( + "deallocate", + [](PyGpuAllocator &self, void *memory, + std::optional stream) { + bool result; + MTRT_Status s = mtrtGpuAllocatorDeallocate( + self, memory, stream ? *stream : mtrtStreamGetNull(), &result); + THROW_IF_MTRT_ERROR(s); + // Add changes to ensure memory is not released prematurely. + return result; + }, + py::arg("memory"), py::arg("stream") = py::none()); + py::class_(m, "RuntimeSession", py::module_local()) - .def(py::init<>([](PyRuntimeSessionOptions &options, PyExecutable &exe) { + .def(py::init<>([](PyRuntimeSessionOptions &options, PyExecutable &exe, + std::optional allocator) { MTRT_RuntimeSession session; - MTRT_Status s = mtrtRuntimeSessionCreate(options, exe, &session); + MTRT_Status s = mtrtRuntimeSessionCreate( + options, exe, + allocator ? 
*allocator : mtrtGpuAllocatorGetNull(), &session); THROW_IF_MTRT_ERROR(s); return new PyRuntimeSession(session); }), - py::arg("options"), py::arg("executable")) + py::arg("options"), py::arg("executable"), + py::arg("gpu_allocator") = py::none()) .def( "execute_function", [](PyRuntimeSession &self, std::string name, From 3f3768ecb0ce52272b912578c17a5e37bb13bdac Mon Sep 17 00:00:00 2001 From: Jhalak Patel Date: Mon, 19 Aug 2024 14:45:55 -0700 Subject: [PATCH 2/6] Add a trampoline class to enable method overriding --- .../include/mlir-executor-c/Runtime/Runtime.h | 32 ++--- .../include/mlir-executor/Runtime/API/API.h | 6 +- .../Runtime/Backend/Lua/LuaRuntime.h | 4 +- .../mlir-executor/Support/Allocators.h | 65 +-------- .../executor/lib/CAPI/Runtime/Runtime.cpp | 84 +++++------ .../executor/lib/Runtime/API/API.cpp | 4 +- .../lib/Runtime/Backend/Lua/LuaRuntime.cpp | 10 +- .../Lua/Modules/TensorRT/TensorRTModule.cpp | 17 +-- .../executor/lib/Support/Allocators.cpp | 44 +----- .../executor/lib/Tools/ExecutorRunnerMain.cpp | 2 +- .../python/bindings/Runtime/RuntimePyBind.cpp | 130 ++++++++++++------ .../IntegrationTests/test_stablehlo_add.py | 30 +++- 12 files changed, 197 insertions(+), 231 deletions(-) diff --git a/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h b/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h index 65377da35..dada8ea67 100644 --- a/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h +++ b/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h @@ -323,33 +323,23 @@ mtrtScalarValueGetType(MTRT_ScalarValue scalar, MTRT_ScalarTypeCode *code); // MTRT_GpuAllocator //===----------------------------------------------------------------------===// + +// Function pointer types for the allocate and deallocate callbacks +typedef void* (*AllocateFunc)(void* self, uint64_t size); +typedef bool (*DeallocateFunc)(void* self, void* memory); + +// The MTRT_GpuAllocator struct typedef struct MTRT_GpuAllocator { - void *ptr; + void* ptr; // Pointer to the implementation (PyGpuAllocatorTrampoline in our case) + AllocateFunc allocate; // Function pointer for allocation + DeallocateFunc deallocate; // Function pointer for deallocation } MTRT_GpuAllocator; /// Checks nullity of `GpuAllocator`. -MTRT_CAPI_EXPORTED bool mtrtGpuAllocatorIsNull(MTRT_GpuAllocator gpuAllocator); - -/// Returns null `GpuAllocator`. 
-MTRT_CAPI_EXPORTED MTRT_GpuAllocator mtrtGpuAllocatorGetNull(); - -MTRT_CAPI_EXPORTED MTRT_Status -mtrtGpuAllocatorDestroy(MTRT_GpuAllocator executable); - -MTRT_CAPI_EXPORTED MTRT_Status -mtrtGpuAllocatorCreate(MTRT_GpuAllocator *allocator); - -//===----------------------------------------------------------------------===// -// MTRT_GpuAllocator -//===----------------------------------------------------------------------===// - -MTRT_CAPI_EXPORTED MTRT_Status mtrtGpuAllocatorAllocate( - MTRT_GpuAllocator gpuAllocator, uint64_t size, uint64_t alignment, - uint32_t flags, MTRT_Stream stream, void **memory); +MTRT_CAPI_EXPORTED bool GpuAllocatorIsNull(MTRT_GpuAllocator gpuAllocator); MTRT_CAPI_EXPORTED MTRT_Status -mtrtGpuAllocatorDeallocate(MTRT_GpuAllocator gpuAllocator, void *memory, - MTRT_Stream stream, bool *result); +GpuAllocatorDestroy(MTRT_GpuAllocator executable); //===----------------------------------------------------------------------===// // MTRT_RuntimeSessionOptions diff --git a/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h b/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h index b35d0b777..70384c60d 100644 --- a/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h +++ b/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h @@ -868,7 +868,7 @@ class RuntimeSession { std::unique_ptr pinnedMemoryAllocator, std::unique_ptr allocTracker, std::unique_ptr resourceTracker, - GpuAllocator* gpuAllocator); + std::unique_ptr gpuAllocator); ExecutableView getExecutable() const { return executable; } @@ -882,7 +882,7 @@ class RuntimeSession { ResourceTracker &getResourceTracker() { return *resourceTracker; } - GpuAllocator* getGpuAllocator() { return gpuAllocator; } + GpuAllocator &getGpuAllocator() { return *gpuAllocator; } private: RuntimeSessionOptions options; @@ -891,7 +891,7 @@ class RuntimeSession { std::unique_ptr pinnedMemoryAllocator; std::unique_ptr allocTracker; std::unique_ptr resourceTracker; - GpuAllocator* gpuAllocator; + std::unique_ptr gpuAllocator; sol::state state; }; diff --git a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRuntime.h b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRuntime.h index 0635e8cb4..d4f07f13a 100644 --- a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRuntime.h +++ b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRuntime.h @@ -48,12 +48,12 @@ StatusOr runExecutorLuaScript(std::string_view luaScript, GpuAllocator* /// execution is successful). /// TODO: this should take a handle to a function for /// streaming output/errors. -StatusOr runExecutorExecutable(std::unique_ptr executable, GpuAllocator* allocator); +StatusOr runExecutorExecutable(std::unique_ptr executable, std::unique_ptr allocator); /// Create an execution state. This will setup a Lua environment and invoke /// global initialization. StatusOr> -createRuntimeSessionWithLuaBackend(ExecutableView executable, GpuAllocator* allocator, +createRuntimeSessionWithLuaBackend(ExecutableView executable, std::unique_ptr allocator, const RuntimeSessionOptions &options); /// Set the primary stream for the loaded executable to use. 
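A minimal, illustrative sketch (not part of the patch) of how a C/C++ embedder could populate the callback struct declared above and hand it to a session. The helper names `exampleAllocate`, `exampleDeallocate`, and `createSessionWithExampleAllocator` are hypothetical; `MTRT_GpuAllocator`, `mtrtRuntimeSessionCreate`, and the CUDA runtime calls come from the surrounding diffs, and the include path follows the directory layout shown there. Note that `ptr` must be non-null, since the session only installs the callback wrapper when `ptr` is set.

    // Illustrative sketch only; assumes the headers from this patch are installed.
    #include "mlir-executor-c/Runtime/Runtime.h" // MTRT_GpuAllocator, mtrtRuntimeSessionCreate
    #include <cuda_runtime_api.h>

    // Hypothetical callbacks matching the AllocateFunc/DeallocateFunc typedefs above.
    static void *exampleAllocate(void *self, uint64_t size) {
      (void)self; // no per-allocator state is used in this sketch
      void *memory = nullptr;
      cudaMalloc(&memory, size); // assumes the CUDA runtime is already initialized
      return memory;
    }

    static bool exampleDeallocate(void *self, void *memory) {
      (void)self;
      return cudaFree(memory) == cudaSuccess;
    }

    // `options` and `exe` are obtained elsewhere through the C API.
    static MTRT_Status createSessionWithExampleAllocator(
        MTRT_RuntimeSessionOptions options, MTRT_Executable exe,
        MTRT_RuntimeSession *session) {
      static int token; // any non-null value: the wrapper is only created when ptr != nullptr
      MTRT_GpuAllocator allocator{&token, exampleAllocate, exampleDeallocate};
      return mtrtRuntimeSessionCreate(options, exe, allocator, session);
    }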
diff --git a/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h b/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h index 66f54fb46..393a5a091 100644 --- a/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h +++ b/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h @@ -32,77 +32,20 @@ namespace mlirtrt { struct EventPool; -// Abstract allocator to be implemented by consumers. -using AllocatorFlags = uint32_t; - class GpuAllocator { public: GpuAllocator() = default; virtual ~GpuAllocator() = default; - - virtual StatusOr reallocate(void *baseAddr, uint64_t alignment, - uint64_t newSize, - std::optional stream) = 0; - - virtual StatusOr allocate(uint64_t const size, - uint64_t const alignment, - AllocatorFlags const flags, - std::optional stream) = 0; - - virtual StatusOr deallocate(void *const memory, - std::optional stream) = 0; - -protected: - GpuAllocator(GpuAllocator const &) = delete; - GpuAllocator(GpuAllocator &&) = delete; - GpuAllocator &operator=(GpuAllocator const &) & = delete; - GpuAllocator &operator=(GpuAllocator &&) & = delete; -}; - -class StubAllocator : public GpuAllocator { -public: - StubAllocator() = default; - ~StubAllocator() = default; - - StatusOr reallocate(void *baseAddr, uint64_t alignment, - uint64_t newSize, - std::optional stream) override { - return getStatusWithMsg( - StatusCode::InternalError, - "[StubAllocator][reallocate]: Must be overriden in Python"); - } - - StatusOr allocate(uint64_t const size, uint64_t const alignment, - AllocatorFlags const flags, - std::optional stream) override { - return getStatusWithMsg( - StatusCode::InternalError, - "[StubAllocator][allocate]: Must be overriden in Python"); - } - - StatusOr deallocate(void *const memory, - std::optional stream) override { - return getStatusWithMsg( - StatusCode::InternalError, - "[StubAllocator][deallocate]: Must be overriden in Python"); - } + virtual void* allocate(uint64_t const size) { return nullptr; } + virtual bool deallocate(void *const memory) { return false; } }; class CustomTensorRTAllocator : public GpuAllocator { public: CustomTensorRTAllocator() = default; ~CustomTensorRTAllocator() = default; - - StatusOr reallocate(void *baseAddr, uint64_t alignment, - uint64_t newSize, - std::optional stream) override; - - StatusOr allocate(uint64_t const size, uint64_t const alignment, - AllocatorFlags const flags, - std::optional stream) override; - - StatusOr deallocate(void *const memory, - std::optional stream) override; + void* allocate(uint64_t const size) override; + bool deallocate(void *const memory) override; }; //===----------------------------------------------------------------------===// diff --git a/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp b/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp index 7c412a290..74eac79c1 100644 --- a/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp +++ b/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp @@ -49,8 +49,6 @@ DEFINE_C_API_PTR_METHODS(MTRT_RuntimeSession, ::mlirtrt::runtime::RuntimeSession) DEFINE_C_API_PTR_METHODS(MTRT_RuntimeSessionOptions, ::mlirtrt::runtime::RuntimeSessionOptions) -DEFINE_C_API_PTR_METHODS(MTRT_GpuAllocator, - ::mlirtrt::GpuAllocator) DEFINE_C_API_PTR_METHODS(MTRT_Executable, ::mlirtrt::runtime::Executable) DEFINE_C_API_PTR_METHODS(MTRT_Stream, MTRT_StreamImpl) DEFINE_C_API_PTR_METHODS(MTRT_RuntimeValue, ::mlirtrt::runtime::RuntimeValue) @@ -607,50 +605,24 @@ MTRT_ScalarValue mtrtRuntimeValueDynCastToScalar(MTRT_RuntimeValue v) { // 
MTRT_GpuAllocator //===----------------------------------------------------------------------===// -bool mtrtGpuAllocatorIsNull(MTRT_GpuAllocator gpuAllocator) { +bool GpuAllocatorIsNull(MTRT_GpuAllocator gpuAllocator) { return !gpuAllocator.ptr; } -MTRT_GpuAllocator mtrtGpuAllocatorGetNull() { return MTRT_GpuAllocator{nullptr}; } - -MTRT_Status mtrtGpuAllocatorDestroy(MTRT_GpuAllocator executable) { - delete unwrap(executable); - return mtrtStatusGetOk(); -} - -MTRT_Status mtrtGpuAllocatorCreate(MTRT_GpuAllocator *allocator) { - *allocator = MTRT_GpuAllocator{/*ptr=*/new StubAllocator()}; - return mtrtStatusGetOk(); -} - -MTRT_Status mtrtGpuAllocatorAllocate(MTRT_GpuAllocator gpuAllocator, - uint64_t size, uint64_t alignment, - uint32_t flags, MTRT_Stream stream, - void **memory) { - GpuAllocator *cppGpuAllocator = unwrap(gpuAllocator); - StatusOr status = cppGpuAllocator->allocate( - size, alignment, flags, - !mtrtStreamIsNull(stream) ? std::optional(unwrap(stream)->getRawStream()) - : std::nullopt); - if (status.isOk()) { - *memory = *status; - } +MTRT_Status GpuAllocatorDestroy(MTRT_GpuAllocator gpuAllocator) { + // delete unwrap(gpuAllocator); return mtrtStatusGetOk(); } -MTRT_Status mtrtGpuAllocatorDeallocate(MTRT_GpuAllocator gpuAllocator, - void *memory, MTRT_Stream stream, - bool *result) { - GpuAllocator *cppGpuAllocator = unwrap(gpuAllocator); - StatusOr status = cppGpuAllocator->deallocate( - memory, !mtrtStreamIsNull(stream) - ? std::optional(unwrap(stream)->getRawStream()) - : std::nullopt); - if (status.isOk()) { - *result = *status; - } - return mtrtStatusGetOk(); -} +// TODO: Implement destroy method to release resources. +// void mtrtGpuAllocatorDestroy(MTRT_GpuAllocator* allocator) { +// if (allocator && allocator->ptr) { +// delete static_cast(allocator->ptr); +// allocator->ptr = nullptr; +// allocator->allocate = nullptr; +// allocator->deallocate = nullptr; +// } +// } //===----------------------------------------------------------------------===// // MTRT_RuntimeSessionOptions @@ -677,16 +649,44 @@ mtrtRuntimeSessionOptionsDestroy(MTRT_RuntimeSessionOptions options) { // MTRT_RuntimeSession //===----------------------------------------------------------------------===// +// A wrapper class for MTRT_GpuAllocator implementing the GpuAllocator +// interface. It encapsulates GPU memory allocation and deallocation operations, +// ensuring correct routing of callbacks from C++ to Python. +class GpuAllocatorWrapper : public GpuAllocator { +private: + MTRT_GpuAllocator mPyGpuAllocator; + +public: + GpuAllocatorWrapper(MTRT_GpuAllocator gpuAllocator) + : mPyGpuAllocator(gpuAllocator) {} + + void *allocate(uint64_t size) override { + return mPyGpuAllocator.allocate(mPyGpuAllocator.ptr, size); + } + + bool deallocate(void *ptr) override { + return mPyGpuAllocator.deallocate(mPyGpuAllocator.ptr, ptr); + } + + // Static method to create a GpuAllocator from MTRT_GpuAllocator + static std::unique_ptr create(MTRT_GpuAllocator gpuAllocator) { + return std::make_unique(gpuAllocator); + } +}; + MTRT_Status mtrtRuntimeSessionCreate(MTRT_RuntimeSessionOptions options, MTRT_Executable executable, MTRT_GpuAllocator gpuAllocator, MTRT_RuntimeSession *result) { RuntimeSessionOptions *cppOptions = unwrap(options); Executable *cppExecutable = unwrap(executable); - GpuAllocator *cppGpuAllocator = unwrap(gpuAllocator); + + std::unique_ptr allocator = + gpuAllocator.ptr ? 
GpuAllocatorWrapper::create(gpuAllocator) : nullptr; StatusOr> session = - createRuntimeSessionWithLuaBackend(cppExecutable->getView(), cppGpuAllocator, *cppOptions); + createRuntimeSessionWithLuaBackend(cppExecutable->getView(), + std::move(allocator), *cppOptions); if (session.isError()) return wrap(session.getStatus()); diff --git a/mlir-tensorrt/executor/lib/Runtime/API/API.cpp b/mlir-tensorrt/executor/lib/Runtime/API/API.cpp index b53ab2096..52b02f72a 100644 --- a/mlir-tensorrt/executor/lib/Runtime/API/API.cpp +++ b/mlir-tensorrt/executor/lib/Runtime/API/API.cpp @@ -358,11 +358,11 @@ RuntimeSession::RuntimeSession( std::unique_ptr pinnedMemoryAllocator, std::unique_ptr allocTracker, std::unique_ptr resourceTracker, - GpuAllocator *gpuAllocator) + std::unique_ptr gpuAllocator) : options(std::move(options)), executable(exe), pinnedMemoryAllocator(std::move(pinnedMemoryAllocator)), allocTracker(std::move(allocTracker)), - resourceTracker(std::move(resourceTracker)), gpuAllocator(gpuAllocator), + resourceTracker(std::move(resourceTracker)), gpuAllocator(std::move(gpuAllocator)), state(std::move(state)) {} //===----------------------------------------------------------------------===// diff --git a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp index b19541977..17af64a91 100644 --- a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp +++ b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp @@ -171,7 +171,7 @@ static Status maybeCheckForValidNcclUuid(const RuntimeSessionOptions &options) { /// global initialization. StatusOr> mlirtrt::runtime::createRuntimeSessionWithLuaBackend( - ExecutableView executable, GpuAllocator* allocator, const RuntimeSessionOptions &options) { + ExecutableView executable, std::unique_ptr allocator, const RuntimeSessionOptions &options) { ADD_RUNTIME_MODULE_RANGE("runtime_loadExecutable"); MTRT_RETURN_IF_ERROR(maybeCheckForValidNcclUuid(options)); @@ -184,7 +184,7 @@ mlirtrt::runtime::createRuntimeSessionWithLuaBackend( lua.open_libraries(sol::lib::base, sol::lib::string); registerLuaRuntimeMethods(lua.lua_state(), options, pinnedMemoryAllocator.get(), allocTracker.get(), - resourceTracker.get(), allocator); + resourceTracker.get(), allocator.get()); // Load globals into the context. // TODO: eliminate this copy, we already own the executable. 
@@ -225,11 +225,11 @@ mlirtrt::runtime::createRuntimeSessionWithLuaBackend( } return std::make_unique( options, executable, std::move(lua), std::move(pinnedMemoryAllocator), - std::move(allocTracker), std::move(resourceTracker), allocator); + std::move(allocTracker), std::move(resourceTracker), std::move(allocator)); } StatusOr mlirtrt::runtime::runExecutorExecutable( - std::unique_ptr executable, GpuAllocator* allocator) { + std::unique_ptr executable, std::unique_ptr allocator) { StatusOr> client = RuntimeClient::create(); if (!client.isOk()) @@ -245,7 +245,7 @@ StatusOr mlirtrt::runtime::runExecutorExecutable( return options.getStatus(); StatusOr> session = - createRuntimeSessionWithLuaBackend(executable->getView(), allocator, *options); + createRuntimeSessionWithLuaBackend(executable->getView(), std::move(allocator), *options); if (!session.isOk()) return session.getStatus(); diff --git a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp index bac6942a1..fed3a573e 100644 --- a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp +++ b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp @@ -82,19 +82,16 @@ class TensorRTCallBackAllocator final : public nvinfer1::IGpuAllocator { return deallocateAsync(memory, nullptr); } - void *allocateAsync(uint64_t const size, uint64_t const alignment, - uint32_t flags, cudaStream_t stream) noexcept final { - StatusOr status = - mGpuAllocatorCallBack->allocate(size, alignment, flags, stream); - assert(status.isOk()); - return *status; + void *allocateAsync(uint64_t const size, uint64_t const /*alignment*/, + uint32_t /*flags*/, cudaStream_t /*stream*/) noexcept final { + void* result = mGpuAllocatorCallBack->allocate(size); + return result; } bool deallocateAsync(void *const memory, - cudaStream_t stream) noexcept override { - StatusOr status = mGpuAllocatorCallBack->deallocate(memory, stream); - assert(status.isOk()); - return *status; + cudaStream_t /*stream*/) noexcept override { + bool result = mGpuAllocatorCallBack->deallocate(memory); + return result; } private: diff --git a/mlir-tensorrt/executor/lib/Support/Allocators.cpp b/mlir-tensorrt/executor/lib/Support/Allocators.cpp index 2ee7c28f4..70f21b9f7 100644 --- a/mlir-tensorrt/executor/lib/Support/Allocators.cpp +++ b/mlir-tensorrt/executor/lib/Support/Allocators.cpp @@ -46,53 +46,17 @@ using namespace mlirtrt; // CustomTensorRTAllocator //===----------------------------------------------------------------------===// -StatusOr -CustomTensorRTAllocator::allocate(uint64_t const size, uint64_t const alignment, - uint32_t /*flags*/, - std::optional stream) { +void *CustomTensorRTAllocator::allocate(uint64_t const size) { uint8_t *memory; - assert(alignment > 0 && (alignment & (alignment - 1)) == 0 && - "Memory alignment has to be power of 2"); - if (stream && *stream != nullptr) { - RETURN_ERROR_IF_CUDART_ERROR( - cudaMallocAsync(reinterpret_cast(&memory), size, *stream)); - MTRT_DBGF("[CustomTensorRTAllocator][allocate]: Asynchronously allocated %lx bytes at 0x%lx on stream %lx", size, - reinterpret_cast(memory), - reinterpret_cast(*stream)); - } else { - RETURN_ERROR_IF_CUDART_ERROR( - cudaMalloc(reinterpret_cast(&memory), size)); - MTRT_DBGF("[CustomTensorRTAllocator][allocate]: Synchronously allocated %lx bytes at 0x%lx", size, - reinterpret_cast(memory)); - } - assert(reinterpret_cast(memory) % alignment == 0); + 
cudaMalloc(reinterpret_cast(&memory), size); return memory; } -StatusOr -CustomTensorRTAllocator::deallocate(void *const memory, - std::optional stream) { - if (stream && *stream != nullptr) { - MTRT_DBGF("[CustomTensorRTAllocator][deallocate]: Asynchronously freeing CUDA device memory 0x%lx on stream %lx", - reinterpret_cast(memory), - reinterpret_cast(*stream)); - RETURN_ERROR_IF_CUDART_ERROR(cudaFreeAsync(memory, *stream)); - } else { - MTRT_DBGF("[CustomTensorRTAllocator][deallocate]: Synchronously freeing CUDA device/pinned host memory 0x%lx ptr " - "on stream %lx", - reinterpret_cast(memory), - reinterpret_cast(*stream)); - RETURN_ERROR_IF_CUDART_ERROR(cudaFree(memory)); - } +bool CustomTensorRTAllocator::deallocate(void *const memory) { + cudaFree(memory); return true; } -StatusOr CustomTensorRTAllocator::reallocate( - void * /* baseAddr */, uint64_t /* alignment */, uint64_t /* newSize */, - std::optional /* stream */) { - return nullptr; -} - //===----------------------------------------------------------------------===// // PoolTrackedCudaEvent //===----------------------------------------------------------------------===// diff --git a/mlir-tensorrt/executor/lib/Tools/ExecutorRunnerMain.cpp b/mlir-tensorrt/executor/lib/Tools/ExecutorRunnerMain.cpp index c8ab9269b..dc14db16f 100644 --- a/mlir-tensorrt/executor/lib/Tools/ExecutorRunnerMain.cpp +++ b/mlir-tensorrt/executor/lib/Tools/ExecutorRunnerMain.cpp @@ -213,7 +213,7 @@ executor::ExecutorRunnerMain(int argc, char **argv, } mlirtrt::StatusOr executionResult = - mlirtrt::runtime::runExecutorExecutable(std::move(*executable), allocator.get()); + mlirtrt::runtime::runExecutorExecutable(std::move(*executable), std::move(allocator)); if (!executionResult.isOk()) return emitError(UnknownLoc::get(&context)) << "failed to load and run executable: " diff --git a/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp b/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp index e6e6bdda5..5a4d10096 100644 --- a/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp +++ b/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp @@ -31,6 +31,8 @@ #include #include +#include "cuda_runtime.h" + namespace py = pybind11; using namespace mlirtrt; @@ -184,14 +186,69 @@ class PyRuntimeValue : public PyMTRTWrapper { mtrtPythonCapsuleToRuntimeValue, mtrtPythonRuntimeValueToCapsule}; }; -/// Python object type wrapper for `MTRT_GpuAllocator`. -class PyGpuAllocator : public PyMTRTWrapper { +// Abstract base class for Python-implemented GPU allocators. +// Provides a C++ interface for Python classes and handles C-style callback +// routing. +class PyGpuAllocator { public: - using Base::Base; - DECLARE_WRAPPER_CONSTRUCTORS(PyGpuAllocator); + virtual ~PyGpuAllocator() = default; + virtual std::uintptr_t allocate(uint64_t size) = 0; + virtual bool deallocate(std::uintptr_t ptr) = 0; + + // Creates a C-compatible struct for interfacing with lower-level APIs. + MTRT_GpuAllocator getCApiObject() { return createWithPythonCallbacks(this); } + +private: + // Trampoline function: Routes C-style allocation calls to C++ virtual method. + static void *pyGpuAllocatorAllocate(void *self, uint64_t size) { + auto *allocator = static_cast(self); + std::uintptr_t ptr = allocator->allocate(size); + return reinterpret_cast(ptr); + } + + // Trampoline function: Routes C-style deallocation calls to C++ virtual + // method. 
+ static bool pyGpuAllocatorDeallocate(void *self, void *memory) { + auto *allocator = static_cast(self); + return allocator->deallocate(reinterpret_cast(memory)); + } + + // Constructs MTRT_GpuAllocator with this instance's methods as callbacks. + static MTRT_GpuAllocator + createWithPythonCallbacks(PyGpuAllocator *allocator) { + MTRT_GpuAllocator capi_allocator; + capi_allocator.ptr = allocator; + capi_allocator.allocate = pyGpuAllocatorAllocate; + capi_allocator.deallocate = pyGpuAllocatorDeallocate; + return capi_allocator; + } +}; + +// Pybind11 trampoline class for PyGpuAllocator. +// Enables Python subclasses to override virtual methods of PyGpuAllocator. +class PyGpuAllocatorTrampoline : public PyGpuAllocator { +public: + using PyGpuAllocator::PyGpuAllocator; // Inherit constructors + + // Trampoline for allocate: Dispatches call to Python implementation if + // overridden. + uintptr_t allocate(uint64_t size) override { + PYBIND11_OVERRIDE_PURE(uintptr_t, // Return type + PyGpuAllocator, // Parent class + allocate, // Name of function in C++ + size // Arguments + ); + } - static constexpr auto kMethodTable = CAPITable{ - mtrtGpuAllocatorIsNull, mtrtGpuAllocatorDestroy}; + // Trampoline for deallocate: Dispatches call to Python implementation if + // overridden. + bool deallocate(uintptr_t ptr) override { + PYBIND11_OVERRIDE_PURE(bool, // Return type + PyGpuAllocator, // Parent class + deallocate, // Name of function in C++ + ptr // Arguments + ); + } }; /// Python object type wrapper for `MTRT_StableHLOToExecutableOptions`. @@ -911,47 +968,34 @@ PYBIND11_MODULE(_api, m) { py::arg("num_devices") = 1, py::arg("device_id") = 0, py::arg("nccl_uuid") = py::str("")); - py::class_(m, "GpuAllocator", py::module_local()) - .def(py::init<>([]() -> PyGpuAllocator * { - MTRT_GpuAllocator allocator; - MTRT_Status s = mtrtGpuAllocatorCreate(&allocator); - THROW_IF_MTRT_ERROR(s); - return new PyGpuAllocator(allocator); - })) - .def( - "allocate", - [](PyGpuAllocator &self, uint64_t size, uint64_t alignment, - std::optional flags, std::optional stream) { - void *memory{nullptr}; - MTRT_Status s = mtrtGpuAllocatorAllocate( - self, size, alignment, flags ? *flags : 0, - stream ? *stream : mtrtStreamGetNull(), &memory); - THROW_IF_MTRT_ERROR(s); - // Add changes to ensure memory is not released prematurely. - return memory; - }, - py::arg("size"), py::arg("alignment"), py::arg("flags") = py::none(), - py::arg("stream") = py::none()) - .def( - "deallocate", - [](PyGpuAllocator &self, void *memory, - std::optional stream) { - bool result; - MTRT_Status s = mtrtGpuAllocatorDeallocate( - self, memory, stream ? *stream : mtrtStreamGetNull(), &result); - THROW_IF_MTRT_ERROR(s); - // Add changes to ensure memory is not released prematurely. - return result; - }, - py::arg("memory"), py::arg("stream") = py::none()); + py::class_(m, "GpuAllocator") + .def(py::init<>()) + .def("allocate", &PyGpuAllocator::allocate) + .def("deallocate", &PyGpuAllocator::deallocate) + .def("get_capi_object", &PyGpuAllocator::getCApiObject); py::class_(m, "RuntimeSession", py::module_local()) .def(py::init<>([](PyRuntimeSessionOptions &options, PyExecutable &exe, - std::optional allocator) { + py::object gpu_allocator = py::none()) { MTRT_RuntimeSession session; - MTRT_Status s = mtrtRuntimeSessionCreate( - options, exe, - allocator ? 
*allocator : mtrtGpuAllocatorGetNull(), &session); + MTRT_Status s; + + if (gpu_allocator.is_none()) { + // Create session without custom allocator + s = mtrtRuntimeSessionCreate( + options, exe, MTRT_GpuAllocator{nullptr}, &session); + } else { + try { + PyGpuAllocator &allocator = + gpu_allocator.cast(); + MTRT_GpuAllocator capi_allocator = allocator.getCApiObject(); + s = mtrtRuntimeSessionCreate(options, exe, capi_allocator, + &session); + } catch (const py::cast_error &) { + throw py::type_error( + "gpu_allocator must be a GpuAllocator object or None"); + } + } THROW_IF_MTRT_ERROR(s); return new PyRuntimeSession(session); }), diff --git a/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py b/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py index 2c95a3081..a10957162 100644 --- a/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py +++ b/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py @@ -5,6 +5,7 @@ import mlir_tensorrt.compiler.ir as ir import mlir_tensorrt.runtime.api as runtime import numpy as np +import cupy as cp ASM = """ func.func @main(%arg0: tensor<2x3x4xf32>) -> tensor<2x3x4xf32> { @@ -14,6 +15,31 @@ """ +class AppGpuAllocator(runtime.GpuAllocator): + def __init__(self): + # Initialize the base class + super().__init__() + + def allocate(self, size, alignment, flags=None, stream=None): + # Implement memory allocation using CuPy + # Allocate memory on the GPU + import pdb + + pdb.set_trace() + memory = cp.empty(size, dtype=cp.uint8) + print( + f"Allocated memory: {memory.data.ptr}" + ) # Print pointer to allocated GPU memory + return memory # Return the CuPy array + + def deallocate(self, memory, stream=None): + # Deallocate memory in CuPy + # CuPy handles memory automatically, so explicit deallocation is not necessary + print(f"Deallocating memory: {memory}") + del memory # Explicitly delete the CuPy array if needed + return True + + def stablehlo_add(): # Build/parse the main function. 
with ir.Context() as context: @@ -36,8 +62,10 @@ def stablehlo_add(): if len(devices) == 0: return + allocator = AppGpuAllocator() + session_options = runtime.RuntimeSessionOptions(num_devices=1, device_id=0) - session = runtime.RuntimeSession(session_options, exe) + session = runtime.RuntimeSession(session_options, exe, gpu_allocator=allocator) arg0 = client.create_memref( np.arange(0.0, 24.0, dtype=np.float32).reshape(2, 3, 4).data, From d75c66a449c166aad69f156a976f7353f4fc2ff4 Mon Sep 17 00:00:00 2001 From: Jhalak Patel Date: Tue, 20 Aug 2024 11:13:47 -0700 Subject: [PATCH 3/6] Update tests to use custom allocator --- .../end-to-end-binary.mlir | 2 +- .../end-to-end-unary.mlir | 2 +- .../Lua/IntegrationTests/buffer-ops-bf16.mlir | 2 +- .../IntegrationTests/buffer-ops-dynamic.mlir | 2 +- .../Lua/IntegrationTests/buffer-ops-f16.mlir | 2 +- .../Lua/IntegrationTests/buffer-ops-f32.mlir | 2 +- .../IntegrationTests/buffer-ops-f8E4M3FN.mlir | 2 +- .../Lua/IntegrationTests/buffer-ops-i1.mlir | 2 +- .../Lua/IntegrationTests/buffer-ops-i4.mlir | 2 +- .../Lua/IntegrationTests/memcpy-strided.mlir | 2 +- .../Target/Lua/IntegrationTests/memcpy.mlir | 2 +- .../TRT10/test_stablehlo_add.py | 2 +- .../IntegrationTests/test_call_validation.py | 2 +- .../IntegrationTests/test_stablehlo_add.py | 38 +++++++++---------- .../test_stablehlo_dynamic.py | 2 +- 15 files changed, 32 insertions(+), 34 deletions(-) diff --git a/mlir-tensorrt/test/Target/Lua/IntegrationTests/ClusteringDynamicShape/end-to-end-binary.mlir b/mlir-tensorrt/test/Target/Lua/IntegrationTests/ClusteringDynamicShape/end-to-end-binary.mlir index 84d8e714a..0d3688a17 100644 --- a/mlir-tensorrt/test/Target/Lua/IntegrationTests/ClusteringDynamicShape/end-to-end-binary.mlir +++ b/mlir-tensorrt/test/Target/Lua/IntegrationTests/ClusteringDynamicShape/end-to-end-binary.mlir @@ -3,7 +3,7 @@ // RUN: stablehlo-clustering-pipeline, \ // RUN: post-clustering-pipeline, \ // RUN: executor-lowering-pipeline)" \ -// RUN: | mlir-tensorrt-translate -mlir-to-runtime-executable -allow-unregistered-dialect | mlir-tensorrt-runner -input-type=rtexe +// RUN: | mlir-tensorrt-translate -mlir-to-runtime-executable -allow-unregistered-dialect | mlir-tensorrt-runner -input-type=rtexe --use-custom-allocator #profile = #tensorrt.shape_profile #profile1 = #tensorrt.shape_profile diff --git a/mlir-tensorrt/test/Target/Lua/IntegrationTests/ClusteringDynamicShape/end-to-end-unary.mlir b/mlir-tensorrt/test/Target/Lua/IntegrationTests/ClusteringDynamicShape/end-to-end-unary.mlir index 28718291a..949ce1b7d 100644 --- a/mlir-tensorrt/test/Target/Lua/IntegrationTests/ClusteringDynamicShape/end-to-end-unary.mlir +++ b/mlir-tensorrt/test/Target/Lua/IntegrationTests/ClusteringDynamicShape/end-to-end-unary.mlir @@ -3,7 +3,7 @@ // RUN: stablehlo-clustering-pipeline, \ // RUN: post-clustering-pipeline, \ // RUN: executor-lowering-pipeline)" \ -// RUN: | mlir-tensorrt-translate -mlir-to-runtime-executable -allow-unregistered-dialect | mlir-tensorrt-runner -input-type=rtexe +// RUN: | mlir-tensorrt-translate -mlir-to-runtime-executable -allow-unregistered-dialect | mlir-tensorrt-runner -input-type=rtexe --use-custom-allocator #profile0 = #tensorrt.shape_profile #profile1 = #tensorrt.shape_profile diff --git a/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-bf16.mlir b/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-bf16.mlir index 15f652aac..f825f236d 100644 --- a/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-bf16.mlir +++ 
b/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-bf16.mlir @@ -1,7 +1,7 @@ // REQUIRES: host-has-at-least-1-gpus // RUN: mlir-tensorrt-opt %s -convert-memref-to-cuda -convert-plan-to-executor -convert-cuda-to-executor -executor-lowering-pipeline \ // RUN: | mlir-tensorrt-translate -mlir-to-runtime-executable \ -// RUN: | mlir-tensorrt-runner -input-type=rtexe | FileCheck %s +// RUN: | mlir-tensorrt-runner -input-type=rtexe --use-custom-allocator | FileCheck %s !descriptor1D = !executor.table, !executor.ptr, index, index, index> !hostMemRef = memref<4xbf16, #plan.memory_space> diff --git a/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-dynamic.mlir b/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-dynamic.mlir index 73c1cd690..0b0230b83 100644 --- a/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-dynamic.mlir +++ b/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-dynamic.mlir @@ -1,7 +1,7 @@ // REQUIRES: host-has-at-least-1-gpus // RUN: mlir-tensorrt-opt %s -convert-memref-to-cuda -convert-cuda-to-executor -executor-lowering-pipeline \ // RUN: | mlir-tensorrt-translate -mlir-to-runtime-executable \ -// RUN: | mlir-tensorrt-runner -input-type=rtexe | FileCheck %s +// RUN: | mlir-tensorrt-runner -input-type=rtexe --use-custom-allocator | FileCheck %s func.func @run_with_shape_2d(%arg0: memref, %arg1: memref<2xindex>) { %c0 = arith.constant 0 : index diff --git a/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-f16.mlir b/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-f16.mlir index 448b88c6f..526c8162f 100644 --- a/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-f16.mlir +++ b/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-f16.mlir @@ -1,7 +1,7 @@ // REQUIRES: host-has-at-least-1-gpus // RUN: mlir-tensorrt-opt %s -convert-memref-to-cuda -convert-plan-to-executor -convert-cuda-to-executor -executor-lowering-pipeline \ // RUN: | mlir-tensorrt-translate -mlir-to-runtime-executable \ -// RUN: | mlir-tensorrt-runner -input-type=rtexe | FileCheck %s +// RUN: | mlir-tensorrt-runner -input-type=rtexe --use-custom-allocator | FileCheck %s !descriptor1D = !executor.table, !executor.ptr, index, index, index> !hostMemRef = memref<4xf16, #plan.memory_space> diff --git a/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-f32.mlir b/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-f32.mlir index 0d16f189a..6196e5317 100644 --- a/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-f32.mlir +++ b/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-f32.mlir @@ -1,7 +1,7 @@ // REQUIRES: host-has-at-least-1-gpus // RUN: mlir-tensorrt-opt %s -convert-memref-to-cuda -convert-plan-to-executor -convert-cuda-to-executor -executor-lowering-pipeline \ // RUN: | mlir-tensorrt-translate -mlir-to-runtime-executable \ -// RUN: | mlir-tensorrt-runner -input-type=rtexe | FileCheck %s +// RUN: | mlir-tensorrt-runner -input-type=rtexe --use-custom-allocator | FileCheck %s !descriptor1D = !executor.table, !executor.ptr, index, index, index> !hostMemRef = memref<4xf32, #plan.memory_space> diff --git a/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-f8E4M3FN.mlir b/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-f8E4M3FN.mlir index 7b3ae4765..6e93ac265 100644 --- a/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-f8E4M3FN.mlir +++ b/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-f8E4M3FN.mlir @@ -2,7 +2,7 @@ // REQUIRES: all-gpus-support-fp8 // RUN: mlir-tensorrt-opt %s 
-convert-memref-to-cuda -convert-plan-to-executor -convert-cuda-to-executor -executor-lowering-pipeline \ // RUN: | mlir-tensorrt-translate -mlir-to-runtime-executable \ -// RUN: | mlir-tensorrt-runner -input-type=rtexe | FileCheck %s +// RUN: | mlir-tensorrt-runner -input-type=rtexe --use-custom-allocator | FileCheck %s !descriptor1D = !executor.table, !executor.ptr, index, index, index> !hostMemRef = memref<4xf8E4M3FN, #plan.memory_space> diff --git a/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-i1.mlir b/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-i1.mlir index f44da93c5..61a74dfdf 100644 --- a/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-i1.mlir +++ b/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-i1.mlir @@ -1,7 +1,7 @@ // REQUIRES: host-has-at-least-1-gpus // RUN: mlir-tensorrt-opt %s -convert-memref-to-cuda -convert-plan-to-executor -convert-cuda-to-executor -executor-lowering-pipeline \ // RUN: | mlir-tensorrt-translate -mlir-to-runtime-executable \ -// RUN: | mlir-tensorrt-runner -input-type=rtexe | FileCheck %s +// RUN: | mlir-tensorrt-runner -input-type=rtexe --use-custom-allocator | FileCheck %s !descriptor1D = !executor.table, !executor.ptr, index, index, index> !hostMemRef = memref<4xi1, #plan.memory_space> diff --git a/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-i4.mlir b/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-i4.mlir index 766bec84f..5d917af19 100644 --- a/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-i4.mlir +++ b/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-i4.mlir @@ -1,7 +1,7 @@ // REQUIRES: host-has-at-least-1-gpus // RUN: mlir-tensorrt-opt %s -convert-memref-to-cuda -convert-plan-to-executor -convert-cuda-to-executor -executor-lowering-pipeline \ // RUN: | mlir-tensorrt-translate -mlir-to-runtime-executable \ -// RUN: | mlir-tensorrt-runner -input-type=rtexe | FileCheck %s +// RUN: | mlir-tensorrt-runner -input-type=rtexe --use-custom-allocator | FileCheck %s !descriptor1D = !executor.table, !executor.ptr, index, index, index> !hostMemRef = memref<4xi4, #plan.memory_space> diff --git a/mlir-tensorrt/test/Target/Lua/IntegrationTests/memcpy-strided.mlir b/mlir-tensorrt/test/Target/Lua/IntegrationTests/memcpy-strided.mlir index 0abcfec01..8cab67749 100644 --- a/mlir-tensorrt/test/Target/Lua/IntegrationTests/memcpy-strided.mlir +++ b/mlir-tensorrt/test/Target/Lua/IntegrationTests/memcpy-strided.mlir @@ -1,7 +1,7 @@ // REQUIRES: host-has-at-least-1-gpus // RUN: mlir-tensorrt-opt %s -convert-memref-to-cuda -convert-plan-to-executor -convert-cuda-to-executor -executor-lowering-pipeline \ // RUN: | mlir-tensorrt-translate -mlir-to-runtime-executable \ -// RUN: | mlir-tensorrt-runner -input-type=rtexe | FileCheck %s +// RUN: | mlir-tensorrt-runner -input-type=rtexe --use-custom-allocator | FileCheck %s func.func @main() -> index { %c0 = arith.constant 0 : index diff --git a/mlir-tensorrt/test/Target/Lua/IntegrationTests/memcpy.mlir b/mlir-tensorrt/test/Target/Lua/IntegrationTests/memcpy.mlir index f750810c8..dd336d8ce 100644 --- a/mlir-tensorrt/test/Target/Lua/IntegrationTests/memcpy.mlir +++ b/mlir-tensorrt/test/Target/Lua/IntegrationTests/memcpy.mlir @@ -1,7 +1,7 @@ // REQUIRES: host-has-at-least-1-gpus // RUN: mlir-tensorrt-opt %s -convert-memref-to-cuda -convert-plan-to-executor -convert-cuda-to-executor -executor-lowering-pipeline \ // RUN: | mlir-tensorrt-translate -mlir-to-runtime-executable \ -// RUN: | mlir-tensorrt-runner -input-type=rtexe | FileCheck %s +// 
RUN: | mlir-tensorrt-runner -input-type=rtexe --use-custom-allocator | FileCheck %s func.func @main() -> i32 { %c0_i32 = arith.constant 0 : i32 diff --git a/mlir-tensorrt/test/python/IntegrationTests/TRT10/test_stablehlo_add.py b/mlir-tensorrt/test/python/IntegrationTests/TRT10/test_stablehlo_add.py index 480ce74d4..fce25bac3 100644 --- a/mlir-tensorrt/test/python/IntegrationTests/TRT10/test_stablehlo_add.py +++ b/mlir-tensorrt/test/python/IntegrationTests/TRT10/test_stablehlo_add.py @@ -33,7 +33,7 @@ def test_stablehlo_add( exe = compiler.compiler_stablehlo_to_executable(client, m.operation, opts) session_options = runtime.RuntimeSessionOptions(num_devices=1, device_id=0) - session = runtime.RuntimeSession(session_options, exe) + session = runtime.RuntimeSession(session_options, exe, None) session.execute_function( "main", in_args=test.in_args, out_args=test.out_args, stream=stream diff --git a/mlir-tensorrt/test/python/IntegrationTests/test_call_validation.py b/mlir-tensorrt/test/python/IntegrationTests/test_call_validation.py index 1687a8f1b..8ebc2d3c4 100644 --- a/mlir-tensorrt/test/python/IntegrationTests/test_call_validation.py +++ b/mlir-tensorrt/test/python/IntegrationTests/test_call_validation.py @@ -62,7 +62,7 @@ def create_scalar(self, value): return self.client.create_scalar(value, runtime.ScalarTypeCode.i64) def execute(self, arg: runtime.RuntimeValue): - session = runtime.RuntimeSession(self.session_options, self.exe) + session = runtime.RuntimeSession(self.session_options, self.exe, None) try: session.execute_function( "main", in_args=[arg], out_args=[arg], stream=self.stream diff --git a/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py b/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py index a10957162..721f6960c 100644 --- a/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py +++ b/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py @@ -15,29 +15,26 @@ """ -class AppGpuAllocator(runtime.GpuAllocator): +class CupyGPUAllocator(runtime.GpuAllocator): def __init__(self): - # Initialize the base class super().__init__() + self.allocations = {} # Keep track of allocations - def allocate(self, size, alignment, flags=None, stream=None): - # Implement memory allocation using CuPy - # Allocate memory on the GPU - import pdb + def allocate(self, size): + # Allocate memory on the GPU using CuPy + mem = cp.cuda.alloc(size) + ptr = int(mem.ptr) # Convert to integer + # Store the CuPy memory object + self.allocations[ptr] = mem + return ptr - pdb.set_trace() - memory = cp.empty(size, dtype=cp.uint8) - print( - f"Allocated memory: {memory.data.ptr}" - ) # Print pointer to allocated GPU memory - return memory # Return the CuPy array - - def deallocate(self, memory, stream=None): - # Deallocate memory in CuPy - # CuPy handles memory automatically, so explicit deallocation is not necessary - print(f"Deallocating memory: {memory}") - del memory # Explicitly delete the CuPy array if needed - return True + def deallocate(self, ptr): + if ptr in self.allocations: + # Remove the reference to the CuPy memory object + # This will trigger deallocation if there are no other references + del self.allocations[ptr] + return True + return False def stablehlo_add(): @@ -62,7 +59,8 @@ def stablehlo_add(): if len(devices) == 0: return - allocator = AppGpuAllocator() + # Create an instance of the custom allocator + allocator = CupyGPUAllocator() session_options = runtime.RuntimeSessionOptions(num_devices=1, device_id=0) session = 
runtime.RuntimeSession(session_options, exe, gpu_allocator=allocator) diff --git a/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_dynamic.py b/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_dynamic.py index 35515e054..82c31a05a 100644 --- a/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_dynamic.py +++ b/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_dynamic.py @@ -115,7 +115,7 @@ def test_program(program: str, input_shape: Iterable[int], debug: bool = True): return session_options = runtime.RuntimeSessionOptions(num_devices=1, device_id=0) - session = runtime.RuntimeSession(session_options, exe) + session = runtime.RuntimeSession(session_options, exe, gpu_allocator=None) arg0 = client.create_memref( np.ones(input_shape, dtype=np.float32).data, From 2185f9e0f03062a4c9f91e66b1d1f2e925de4253 Mon Sep 17 00:00:00 2001 From: Jhalak Patel Date: Tue, 20 Aug 2024 11:26:48 -0700 Subject: [PATCH 4/6] Fix segmentaiton fault --- mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp | 11 +++++++++-- .../python/bindings/Runtime/RuntimePyBind.cpp | 11 +++++++++-- .../python/IntegrationTests/test_stablehlo_add.py | 2 +- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp b/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp index 74eac79c1..253817934 100644 --- a/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp +++ b/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp @@ -670,6 +670,11 @@ class GpuAllocatorWrapper : public GpuAllocator { // Static method to create a GpuAllocator from MTRT_GpuAllocator static std::unique_ptr create(MTRT_GpuAllocator gpuAllocator) { + if (!gpuAllocator.ptr || !gpuAllocator.allocate || + !gpuAllocator.deallocate) { + llvm::errs() << "Invalid MTRT_GpuAllocator passed to create()"; + return nullptr; + } return std::make_unique(gpuAllocator); } }; @@ -681,8 +686,10 @@ MTRT_Status mtrtRuntimeSessionCreate(MTRT_RuntimeSessionOptions options, RuntimeSessionOptions *cppOptions = unwrap(options); Executable *cppExecutable = unwrap(executable); - std::unique_ptr allocator = - gpuAllocator.ptr ? GpuAllocatorWrapper::create(gpuAllocator) : nullptr; + std::unique_ptr allocator; + if (gpuAllocator.ptr) { + allocator.reset(GpuAllocatorWrapper::create(gpuAllocator).release()); + } StatusOr> session = createRuntimeSessionWithLuaBackend(cppExecutable->getView(), diff --git a/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp b/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp index 5a4d10096..94ebfb2d4 100644 --- a/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp +++ b/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp @@ -191,6 +191,9 @@ class PyRuntimeValue : public PyMTRTWrapper { // routing. class PyGpuAllocator { public: + py::object pySelf; + PyGpuAllocator(py::object self) : pySelf(self) {} + virtual ~PyGpuAllocator() = default; virtual std::uintptr_t allocate(uint64_t size) = 0; virtual bool deallocate(std::uintptr_t ptr) = 0; @@ -201,6 +204,7 @@ class PyGpuAllocator { private: // Trampoline function: Routes C-style allocation calls to C++ virtual method. static void *pyGpuAllocatorAllocate(void *self, uint64_t size) { + py::gil_scoped_acquire acquire; auto *allocator = static_cast(self); std::uintptr_t ptr = allocator->allocate(size); return reinterpret_cast(ptr); @@ -209,6 +213,7 @@ class PyGpuAllocator { // Trampoline function: Routes C-style deallocation calls to C++ virtual // method. 
static bool pyGpuAllocatorDeallocate(void *self, void *memory) { + py::gil_scoped_acquire acquire; auto *allocator = static_cast(self); return allocator->deallocate(reinterpret_cast(memory)); } @@ -969,7 +974,8 @@ PYBIND11_MODULE(_api, m) { py::arg("nccl_uuid") = py::str("")); py::class_(m, "GpuAllocator") - .def(py::init<>()) + .def(py::init<>( + [](py::object self) { return new PyGpuAllocatorTrampoline(self); })) .def("allocate", &PyGpuAllocator::allocate) .def("deallocate", &PyGpuAllocator::deallocate) .def("get_capi_object", &PyGpuAllocator::getCApiObject); @@ -983,7 +989,8 @@ PYBIND11_MODULE(_api, m) { if (gpu_allocator.is_none()) { // Create session without custom allocator s = mtrtRuntimeSessionCreate( - options, exe, MTRT_GpuAllocator{nullptr}, &session); + options, exe, MTRT_GpuAllocator{nullptr, nullptr, nullptr}, + &session); } else { try { PyGpuAllocator &allocator = diff --git a/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py b/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py index 721f6960c..ebf31bde7 100644 --- a/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py +++ b/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py @@ -17,7 +17,7 @@ class CupyGPUAllocator(runtime.GpuAllocator): def __init__(self): - super().__init__() + super().__init__(self) self.allocations = {} # Keep track of allocations def allocate(self, size): From 4d377a1b6a3693b2e20ebbcdd50e5e594ad6a5d3 Mon Sep 17 00:00:00 2001 From: Jhalak Patel Date: Tue, 20 Aug 2024 16:53:44 -0700 Subject: [PATCH 5/6] Update allocate/deallocate interface --- .../include/mlir-executor-c/Runtime/Runtime.h | 23 +++++------ .../mlir-executor/Support/Allocators.h | 16 ++++++-- .../executor/lib/CAPI/Runtime/Runtime.cpp | 31 ++------------ .../Lua/Modules/TensorRT/TensorRTModule.cpp | 24 ++++------- .../executor/lib/Support/Allocators.cpp | 40 +++++++++++++++++-- .../python/bindings/Runtime/RuntimePyBind.cpp | 22 +++++----- .../IntegrationTests/test_stablehlo_add.py | 2 +- 7 files changed, 83 insertions(+), 75 deletions(-) diff --git a/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h b/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h index dada8ea67..e2a3fa88c 100644 --- a/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h +++ b/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h @@ -32,6 +32,8 @@ #include #include +#include "cuda_runtime.h" + #ifdef __cplusplus extern "C" { #endif @@ -323,24 +325,17 @@ mtrtScalarValueGetType(MTRT_ScalarValue scalar, MTRT_ScalarTypeCode *code); // MTRT_GpuAllocator //===----------------------------------------------------------------------===// +// Function pointer types for the allocate and deallocate callbacks. +typedef void *(*AllocateFunc)(void *self, uint64_t size, uint64_t alignment, uint32_t flags, cudaStream_t* stream); +typedef bool (*DeallocateFunc)(void *self, void *memory, cudaStream_t* stream); -// Function pointer types for the allocate and deallocate callbacks -typedef void* (*AllocateFunc)(void* self, uint64_t size); -typedef bool (*DeallocateFunc)(void* self, void* memory); - -// The MTRT_GpuAllocator struct typedef struct MTRT_GpuAllocator { - void* ptr; // Pointer to the implementation (PyGpuAllocatorTrampoline in our case) - AllocateFunc allocate; // Function pointer for allocation - DeallocateFunc deallocate; // Function pointer for deallocation + void *ptr; // Pointer to the implementation (PyGpuAllocatorTrampoline in our + // case.) 
+ AllocateFunc allocate; // Function pointer for allocation + DeallocateFunc deallocate; // Function pointer for deallocation } MTRT_GpuAllocator; -/// Checks nullity of `GpuAllocator`. -MTRT_CAPI_EXPORTED bool GpuAllocatorIsNull(MTRT_GpuAllocator gpuAllocator); - -MTRT_CAPI_EXPORTED MTRT_Status -GpuAllocatorDestroy(MTRT_GpuAllocator executable); - //===----------------------------------------------------------------------===// // MTRT_RuntimeSessionOptions //===----------------------------------------------------------------------===// diff --git a/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h b/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h index 393a5a091..536619ba7 100644 --- a/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h +++ b/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h @@ -36,16 +36,24 @@ class GpuAllocator { public: GpuAllocator() = default; virtual ~GpuAllocator() = default; - virtual void* allocate(uint64_t const size) { return nullptr; } - virtual bool deallocate(void *const memory) { return false; } + virtual void *allocate(uint64_t const size, uint64_t const alignment, + uint32_t flags, cudaStream_t* stream) { + return nullptr; + } + virtual bool deallocate(void *const memory, + cudaStream_t* stream) { + return false; + } }; class CustomTensorRTAllocator : public GpuAllocator { public: CustomTensorRTAllocator() = default; ~CustomTensorRTAllocator() = default; - void* allocate(uint64_t const size) override; - bool deallocate(void *const memory) override; + void *allocate(uint64_t const size, uint64_t const alignment, uint32_t flags, + cudaStream_t* stream) override; + bool deallocate(void *const memory, + cudaStream_t* stream) override; }; //===----------------------------------------------------------------------===// diff --git a/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp b/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp index 253817934..7e7eb73be 100644 --- a/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp +++ b/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp @@ -601,29 +601,6 @@ MTRT_ScalarValue mtrtRuntimeValueDynCastToScalar(MTRT_RuntimeValue v) { return wrap(static_cast(x)); } -//===----------------------------------------------------------------------===// -// MTRT_GpuAllocator -//===----------------------------------------------------------------------===// - -bool GpuAllocatorIsNull(MTRT_GpuAllocator gpuAllocator) { - return !gpuAllocator.ptr; -} - -MTRT_Status GpuAllocatorDestroy(MTRT_GpuAllocator gpuAllocator) { - // delete unwrap(gpuAllocator); - return mtrtStatusGetOk(); -} - -// TODO: Implement destroy method to release resources. 
-// void mtrtGpuAllocatorDestroy(MTRT_GpuAllocator* allocator) { -// if (allocator && allocator->ptr) { -// delete static_cast(allocator->ptr); -// allocator->ptr = nullptr; -// allocator->allocate = nullptr; -// allocator->deallocate = nullptr; -// } -// } - //===----------------------------------------------------------------------===// // MTRT_RuntimeSessionOptions //===----------------------------------------------------------------------===// @@ -660,12 +637,12 @@ class GpuAllocatorWrapper : public GpuAllocator { GpuAllocatorWrapper(MTRT_GpuAllocator gpuAllocator) : mPyGpuAllocator(gpuAllocator) {} - void *allocate(uint64_t size) override { - return mPyGpuAllocator.allocate(mPyGpuAllocator.ptr, size); + void *allocate(uint64_t size, uint64_t alignment, uint32_t flags, cudaStream_t* stream) override { + return mPyGpuAllocator.allocate(mPyGpuAllocator.ptr, size, alignment, flags, stream); } - bool deallocate(void *ptr) override { - return mPyGpuAllocator.deallocate(mPyGpuAllocator.ptr, ptr); + bool deallocate(void *ptr, cudaStream_t* stream) override { + return mPyGpuAllocator.deallocate(mPyGpuAllocator.ptr, ptr, stream); } // Static method to create a GpuAllocator from MTRT_GpuAllocator diff --git a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp index fed3a573e..1ae3b4440 100644 --- a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp +++ b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp @@ -68,29 +68,21 @@ class StdioLogger : public nvinfer1::ILogger { // TensorRTCallBackAllocator //===----------------------------------------------------------------------===// -class TensorRTCallBackAllocator final : public nvinfer1::IGpuAllocator { +class TensorRTCallBackAllocator final : public nvinfer1::IGpuAsyncAllocator { public: TensorRTCallBackAllocator(GpuAllocator *gpuAllocator) - : nvinfer1::IGpuAllocator(), mGpuAllocatorCallBack(gpuAllocator) {} + : nvinfer1::IGpuAsyncAllocator(), mGpuAllocatorCallBack(gpuAllocator) {} - void *allocate(uint64_t size, uint64_t alignment, - nvinfer1::AllocatorFlags flags) noexcept final { - return allocateAsync(size, alignment, flags, nullptr); - } - - bool deallocate(void *memory) noexcept final { - return deallocateAsync(memory, nullptr); - } - - void *allocateAsync(uint64_t const size, uint64_t const /*alignment*/, - uint32_t /*flags*/, cudaStream_t /*stream*/) noexcept final { - void* result = mGpuAllocatorCallBack->allocate(size); + void *allocateAsync(uint64_t const size, uint64_t const alignment, + uint32_t flags, cudaStream_t stream) noexcept final { + void *result = + mGpuAllocatorCallBack->allocate(size, alignment, flags, &stream); return result; } bool deallocateAsync(void *const memory, - cudaStream_t /*stream*/) noexcept override { - bool result = mGpuAllocatorCallBack->deallocate(memory); + cudaStream_t stream) noexcept override { + bool result = mGpuAllocatorCallBack->deallocate(memory, &stream); return result; } diff --git a/mlir-tensorrt/executor/lib/Support/Allocators.cpp b/mlir-tensorrt/executor/lib/Support/Allocators.cpp index 70f21b9f7..100cb0361 100644 --- a/mlir-tensorrt/executor/lib/Support/Allocators.cpp +++ b/mlir-tensorrt/executor/lib/Support/Allocators.cpp @@ -46,14 +46,46 @@ using namespace mlirtrt; // CustomTensorRTAllocator //===----------------------------------------------------------------------===// -void 
*CustomTensorRTAllocator::allocate(uint64_t const size) { + +void* +CustomTensorRTAllocator::allocate(uint64_t const size, uint64_t const alignment, + uint32_t /*flags*/, + cudaStream_t* stream) { uint8_t *memory; - cudaMalloc(reinterpret_cast(&memory), size); + assert(alignment > 0 && (alignment & (alignment - 1)) == 0 && + "Memory alignment has to be power of 2"); + if (stream && *stream != nullptr) { + auto status = cudaMallocAsync(reinterpret_cast(&memory), size, *stream); + assert(status == cudaSuccess); + MTRT_DBGF("[CustomTensorRTAllocator][allocate]: Asynchronously allocated %lx bytes at 0x%lx on stream %lx", size, + reinterpret_cast(memory), + reinterpret_cast(*stream)); + } else { + auto status = cudaMalloc(reinterpret_cast(&memory), size); + assert(status == cudaSuccess); + MTRT_DBGF("[CustomTensorRTAllocator][allocate]: Synchronously allocated %lx bytes at 0x%lx", size, + reinterpret_cast(memory)); + } + assert(reinterpret_cast(memory) % alignment == 0); return memory; } -bool CustomTensorRTAllocator::deallocate(void *const memory) { - cudaFree(memory); +bool CustomTensorRTAllocator::deallocate(void *const memory, + cudaStream_t* stream) { + if (stream && *stream != nullptr) { + MTRT_DBGF("[CustomTensorRTAllocator][deallocate]: Asynchronously freeing CUDA device memory 0x%lx on stream %lx", + reinterpret_cast(memory), + reinterpret_cast(*stream)); + cudaError_t status = cudaFreeAsync(memory, *stream); + assert(status == cudaSuccess); + } else { + MTRT_DBGF("[CustomTensorRTAllocator][deallocate]: Synchronously freeing CUDA device/pinned host memory 0x%lx ptr " + "on stream %lx", + reinterpret_cast(memory), + reinterpret_cast(*stream)); + cudaError_t status = cudaFree(memory); + assert(status == cudaSuccess); + } return true; } diff --git a/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp b/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp index 94ebfb2d4..3f45138c8 100644 --- a/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp +++ b/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp @@ -128,6 +128,7 @@ class PyStream : public PyMTRTWrapper { public: using Base::Base; DECLARE_WRAPPER_CONSTRUCTORS(PyStream); + static constexpr auto kMethodTable = CAPITable{ mtrtStreamIsNull, mtrtStreamDestroy, mtrtPythonCapsuleToStream, mtrtPythonStreamToCapsule}; @@ -195,7 +196,8 @@ class PyGpuAllocator { PyGpuAllocator(py::object self) : pySelf(self) {} virtual ~PyGpuAllocator() = default; - virtual std::uintptr_t allocate(uint64_t size) = 0; + virtual std::uintptr_t allocate(uint64_t size, uint64_t alignment, + uint32_t flags) = 0; virtual bool deallocate(std::uintptr_t ptr) = 0; // Creates a C-compatible struct for interfacing with lower-level APIs. @@ -203,16 +205,19 @@ class PyGpuAllocator { private: // Trampoline function: Routes C-style allocation calls to C++ virtual method. - static void *pyGpuAllocatorAllocate(void *self, uint64_t size) { + static void *pyGpuAllocatorAllocate(void *self, uint64_t size, + uint64_t alignment, uint32_t flags, + cudaStream_t* /*stream*/) { py::gil_scoped_acquire acquire; auto *allocator = static_cast(self); - std::uintptr_t ptr = allocator->allocate(size); + std::uintptr_t ptr = allocator->allocate(size, alignment, flags); return reinterpret_cast(ptr); } // Trampoline function: Routes C-style deallocation calls to C++ virtual // method. 
- static bool pyGpuAllocatorDeallocate(void *self, void *memory) { + static bool pyGpuAllocatorDeallocate(void *self, void *memory, + cudaStream_t* /*stream*/) { py::gil_scoped_acquire acquire; auto *allocator = static_cast(self); return allocator->deallocate(reinterpret_cast(memory)); @@ -237,12 +242,12 @@ class PyGpuAllocatorTrampoline : public PyGpuAllocator { // Trampoline for allocate: Dispatches call to Python implementation if // overridden. - uintptr_t allocate(uint64_t size) override { + uintptr_t allocate(uint64_t size, uint64_t alignment, uint32_t flags) override { PYBIND11_OVERRIDE_PURE(uintptr_t, // Return type PyGpuAllocator, // Parent class allocate, // Name of function in C++ - size // Arguments - ); + size, // Arguments + alignment, flags); } // Trampoline for deallocate: Dispatches call to Python implementation if @@ -251,8 +256,7 @@ class PyGpuAllocatorTrampoline : public PyGpuAllocator { PYBIND11_OVERRIDE_PURE(bool, // Return type PyGpuAllocator, // Parent class deallocate, // Name of function in C++ - ptr // Arguments - ); + ptr); // Arguments } }; diff --git a/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py b/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py index ebf31bde7..1fa932fec 100644 --- a/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py +++ b/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py @@ -20,7 +20,7 @@ def __init__(self): super().__init__(self) self.allocations = {} # Keep track of allocations - def allocate(self, size): + def allocate(self, size, alignment, flags): # Allocate memory on the GPU using CuPy mem = cp.cuda.alloc(size) ptr = int(mem.ptr) # Convert to integer From fc7ca69f826ce512a04ad76cb49340221cd071f0 Mon Sep 17 00:00:00 2001 From: Jhalak Patel Date: Wed, 21 Aug 2024 00:22:47 -0700 Subject: [PATCH 6/6] Add IOutputAllocator --- .../include/mlir-executor-c/Runtime/Runtime.h | 70 +++++-- .../include/mlir-executor/Runtime/API/API.h | 22 +++ .../Runtime/Backend/Lua/LuaRegistration.h | 4 +- .../Runtime/Backend/Lua/LuaRuntime.h | 10 +- .../Lua/Modules/TensorRT/TensorRTModule.h | 3 +- .../mlir-executor/Support/Allocators.h | 109 +++++++++++ .../executor/lib/CAPI/Runtime/Runtime.cpp | 89 ++++++++- .../executor/lib/Runtime/API/API.cpp | 6 +- .../lib/Runtime/Backend/Lua/LuaRuntime.cpp | 49 +++-- .../Lua/Modules/TensorRT/TensorRTModule.cpp | 183 ++++++++++++++++-- .../executor/lib/Support/Allocators.cpp | 67 +++++++ .../executor/lib/Tools/ExecutorRunnerMain.cpp | 5 +- .../python/bindings/Runtime/RuntimePyBind.cpp | 144 +++++++++++++- .../IntegrationTests/test_stablehlo_add.py | 37 +++- .../test_stablehlo_dynamic.py | 38 ++++ 15 files changed, 768 insertions(+), 68 deletions(-) diff --git a/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h b/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h index e2a3fa88c..22c87c56e 100644 --- a/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h +++ b/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h @@ -95,6 +95,50 @@ static inline bool mtrtDeviceIsNull(MTRT_Device device) { return !device.ptr; } /// arguments are optional in functions below. static inline MTRT_Device mtrtDeviceGetNull() { return MTRT_Device{nullptr}; } +//===----------------------------------------------------------------------===// +// MTRT_GpuAllocator +//===----------------------------------------------------------------------===// + +// Function pointer types for the allocate and deallocate callbacks. 
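+// Note: `stream` may be null or point to a null stream; in that case allocation
+// and deallocation are expected to happen synchronously (CustomTensorRTAllocator,
+// for example, falls back to cudaMalloc/cudaFree when no stream is provided).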
+typedef void *(*AllocateFunc)(void *self, uint64_t size, uint64_t alignment, uint32_t flags, cudaStream_t* stream); +typedef bool (*DeallocateFunc)(void *self, void *memory, cudaStream_t* stream); + +typedef struct MTRT_GpuAllocator { + void *ptr; // Pointer to the implementation (PyGpuAllocatorTrampoline in our + // case.) + // Function pointers to methods. + AllocateFunc allocate; + DeallocateFunc deallocate; +} MTRT_GpuAllocator; + +//===----------------------------------------------------------------------===// +// MTRT_OutputAllocator +//===----------------------------------------------------------------------===// + +// Function pointer types for the allocate and deallocate callbacks. +typedef void (*SetGpuAllocator)(void *self, MTRT_GpuAllocator gpuAllocator); +typedef void (*SetTensorName)(void *self, const char *tensorName); +typedef void (*SetCurrentMemory)(void *self, void *currentMemory); +typedef void (*SetOutputSize)(void *self, const int64_t outputSize); +typedef void *(*ReallocateOutputAsync)(void *self, char const *tensorName, + void *currentMemory, uint64_t size, + uint64_t alignment, + cudaStream_t *stream); +typedef void (*NotifyShape)(void *self, char const *tensorName, const int64_t *dims, + int64_t nbDims); + +typedef struct MTRT_OutputAllocator { + void *ptr; // Pointer to the implementation (PyOutputAllocatorTrampoline in + // our case.) + // Function pointers to methods. + SetGpuAllocator setGpuAllocator; + SetTensorName setTensorName; + SetCurrentMemory setCurrentMemory; + SetOutputSize setOutputSize; + ReallocateOutputAsync reallocateOutputAsync; + NotifyShape notifyShape; +} MTRT_OutputAllocator; + //===----------------------------------------------------------------------===// // MTRT_MemRefValue //===----------------------------------------------------------------------===// @@ -172,6 +216,9 @@ typedef struct MTRT_MemRefValueInfo { MLIR_CAPI_EXPORTED MTRT_Status mtrtMemRefValueGetInfo(MTRT_MemRefValue memref, MTRT_MemRefValueInfo *info); +MLIR_CAPI_EXPORTED MTRT_Status mtrtMemRefValueSetOutputAllocator( + MTRT_MemRefValue memrefValue, MTRT_OutputAllocator pyOutputAllocator); + /// Create DL Managed tensor from MemRefValue. MLIR_CAPI_EXPORTED MTRT_Status mtrtMemRefValueGetDLPackManagedTensor( MTRT_MemRefValue memrefValue, MTRT_DLPackManagedTensor *outTensor); @@ -321,21 +368,6 @@ mtrtScalarValueCastToRuntimeValue(MTRT_ScalarValue v); MLIR_CAPI_EXPORTED MTRT_Status mtrtScalarValueGetType(MTRT_ScalarValue scalar, MTRT_ScalarTypeCode *code); -//===----------------------------------------------------------------------===// -// MTRT_GpuAllocator -//===----------------------------------------------------------------------===// - -// Function pointer types for the allocate and deallocate callbacks. -typedef void *(*AllocateFunc)(void *self, uint64_t size, uint64_t alignment, uint32_t flags, cudaStream_t* stream); -typedef bool (*DeallocateFunc)(void *self, void *memory, cudaStream_t* stream); - -typedef struct MTRT_GpuAllocator { - void *ptr; // Pointer to the implementation (PyGpuAllocatorTrampoline in our - // case.) 
- AllocateFunc allocate; // Function pointer for allocation - DeallocateFunc deallocate; // Function pointer for deallocation -} MTRT_GpuAllocator; - //===----------------------------------------------------------------------===// // MTRT_RuntimeSessionOptions //===----------------------------------------------------------------------===// @@ -376,8 +408,8 @@ typedef struct MTRT_RuntimeSession { /// that the session only has a read-only view in to the Executable for code and /// constant data. Therefore the Executable must outlive the RuntimeSession. MLIR_CAPI_EXPORTED MTRT_Status mtrtRuntimeSessionCreate( - MTRT_RuntimeSessionOptions options, MTRT_Executable executable, MTRT_GpuAllocator allocator, - MTRT_RuntimeSession *result); + MTRT_RuntimeSessionOptions options, MTRT_Executable executable, + MTRT_GpuAllocator allocator, MTRT_RuntimeSession *result); /// Destory the session. This does not destroy the associated Executable, which /// may be shared among many sessions. @@ -389,6 +421,10 @@ static inline bool mtrtRuntimeSessionIsNull(MTRT_RuntimeSession session) { return !session.ptr; } +MLIR_CAPI_EXPORTED MTRT_Status mtrtAddMemRefOutputAllocatorSessionRegistry( + MTRT_MemRefValue memrefValue, + MTRT_OutputAllocator pyOutputAllocator); + /// Using `session`, execute the pubic function with the specified name. /// The `inArgs` and `outArgs` are arrays for input arguments and destination /// arguments, respectively. Input arguments may be MemRefs or scalars, but diff --git a/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h b/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h index 70384c60d..d48b80d51 100644 --- a/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h +++ b/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h @@ -667,6 +667,12 @@ class MemRefValue : public RuntimeValue { return v->getKind() == Kind::MemRef; } + void setOutputAllocator(OutputAllocator* _outputAllocator) { + outputAllocator = _outputAllocator; + } + + OutputAllocator *getOutputAllocator() { return outputAllocator; } + const std::optional &getScalarType() const { return scalarType; } RuntimeClient *getClient() { return client; } @@ -691,6 +697,7 @@ class MemRefValue : public RuntimeValue { /// address. 
std::optional device; std::optional scalarType{}; + OutputAllocator *outputAllocator{nullptr}; }; //===----------------------------------------------------------------------===// @@ -868,6 +875,7 @@ class RuntimeSession { std::unique_ptr pinnedMemoryAllocator, std::unique_ptr allocTracker, std::unique_ptr resourceTracker, + std::unique_ptr outputAllocatorTracker, std::unique_ptr gpuAllocator); ExecutableView getExecutable() const { return executable; } @@ -882,6 +890,10 @@ class RuntimeSession { ResourceTracker &getResourceTracker() { return *resourceTracker; } + OutputAllocatorTracker &getOutputAllocatorTracker() { + return *outputAllocatorTracker; + } + GpuAllocator &getGpuAllocator() { return *gpuAllocator; } private: @@ -891,6 +903,7 @@ class RuntimeSession { std::unique_ptr pinnedMemoryAllocator; std::unique_ptr allocTracker; std::unique_ptr resourceTracker; + std::unique_ptr outputAllocatorTracker; std::unique_ptr gpuAllocator; sol::state state; }; @@ -973,6 +986,14 @@ class RuntimeClient { return pinnedMemoryAllocator; } + void addOutputAllocator(std::unique_ptr outputAllocator) { + outputAllocators.emplace_back(std::move(outputAllocator)); + } + + OutputAllocator* getLastOutputAllocator() { + return outputAllocators.back().get(); + } + private: RuntimeClient(llvm::SmallVector> devices) : devices(std::move(devices)) {} @@ -981,6 +1002,7 @@ class RuntimeClient { PinnedMemoryAllocator pinnedMemoryAllocator; AllocTracker allocTracker; ResourceTracker resourceTracker; + std::vector> outputAllocators; }; //===----------------------------------------------------------------------===// diff --git a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRegistration.h b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRegistration.h index 922e964d4..9dd689de8 100644 --- a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRegistration.h +++ b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRegistration.h @@ -37,6 +37,8 @@ void registerLuaRuntimeMethods(lua_State *state, const RuntimeSessionOptions &options, PinnedMemoryAllocator *pinnedMemoryAllocator, AllocTracker *allocTracker, - ResourceTracker *resourceTracker, GpuAllocator* allocator); + ResourceTracker *resourceTracker, + OutputAllocatorTracker *outputAllocatorTracker, + GpuAllocator *allocator); } // namespace mlirtrt::runtime diff --git a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRuntime.h b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRuntime.h index d4f07f13a..e7251580f 100644 --- a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRuntime.h +++ b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRuntime.h @@ -36,7 +36,8 @@ namespace mlirtrt::runtime { /// `main` function. It is assumed that `main` takes no arguments and returns an /// integer result (which is returned if the execution is successful). /// TODO: this should take a handle to a function for streaming output/errors. -StatusOr runExecutorLuaScript(std::string_view luaScript, GpuAllocator* allocator); +StatusOr runExecutorLuaScript(std::string_view luaScript, + GpuAllocator *allocator); /// Synchronously run a serialized executor Executable one time. An `Executable` /// is essentially a Lua script packaged with metadata and serialized constants @@ -48,12 +49,15 @@ StatusOr runExecutorLuaScript(std::string_view luaScript, GpuAllocator* /// execution is successful). 
/// TODO: this should take a handle to a function for /// streaming output/errors. -StatusOr runExecutorExecutable(std::unique_ptr executable, std::unique_ptr allocator); +StatusOr +runExecutorExecutable(std::unique_ptr executable, + std::unique_ptr allocator); /// Create an execution state. This will setup a Lua environment and invoke /// global initialization. StatusOr> -createRuntimeSessionWithLuaBackend(ExecutableView executable, std::unique_ptr allocator, +createRuntimeSessionWithLuaBackend(ExecutableView executable, + std::unique_ptr allocator, const RuntimeSessionOptions &options); /// Set the primary stream for the loaded executable to use. diff --git a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.h b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.h index 1ceb91367..54655ddf7 100644 --- a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.h +++ b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.h @@ -37,7 +37,8 @@ class ResourceTracker; /// Lua state. void registerExecutorTensorRTModuleLuaRuntimeMethods( lua_State *luaState, PinnedMemoryAllocator *pinnedMemoryAllocator, - AllocTracker *allocTracker, ResourceTracker *resourceTracker, GpuAllocator* allocator); + AllocTracker *allocTracker, ResourceTracker *resourceTracker, + OutputAllocatorTracker *outputAllocatorTracker, GpuAllocator *allocator); } // namespace mlirtrt::runtime diff --git a/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h b/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h index 536619ba7..054bbcf04 100644 --- a/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h +++ b/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h @@ -32,6 +32,10 @@ namespace mlirtrt { struct EventPool; +//===----------------------------------------------------------------------===// +// GpuAllocator and CustomTensorRTAllocator +//===----------------------------------------------------------------------===// + class GpuAllocator { public: GpuAllocator() = default; @@ -56,6 +60,111 @@ class CustomTensorRTAllocator : public GpuAllocator { cudaStream_t* stream) override; }; +//===----------------------------------------------------------------------===// +// OutputAllocator and CustomTensorRTOuputAllocator +//===----------------------------------------------------------------------===// + +//! +//! Class to allocate memory for outputs with data-dependent shapes. The sizes +//! of those are unknown so pre-allocation is not possible. +//! 
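+//! Implementations back TensorRT's nvinfer1::IOutputAllocator: the runtime
+//! forwards its reallocateOutput and notifyShape callbacks here once the real
+//! output size and shape become known at enqueue time.
+//!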
+class OutputAllocator { +public: + virtual ~OutputAllocator() = default; + virtual void setGpuAllocator(GpuAllocator* gpuAllocator) = 0; + virtual void setTensorName(const char *tensorName) = 0; + virtual void setCurrentMemory(void *currentMemory) = 0; + virtual void setOutputSize(const int64_t outputSize) = 0; + virtual void *reallocateOutputAsync(char const *tensorName, + void *currentMemory, uint64_t size, + uint64_t alignment, + cudaStream_t * /*stream*/) = 0; + virtual void notifyShape(char const *tensorName, const int64_t *dims, + int64_t nbDims) = 0; +}; + +class CustomTensorRTOuputAllocator : public OutputAllocator { +public: + CustomTensorRTOuputAllocator() = default; + ~CustomTensorRTOuputAllocator() { + if (mOutputPtr != nullptr) { + cudaFree(mOutputPtr); + } + } + + void setGpuAllocator(GpuAllocator* gpuAllocator) override { + mGpuAllocator = gpuAllocator; + } + + //! Methods are called just after construction. TODO: can they be called + //! during construction? + void setTensorName(const char *tensorName) override { + mTensorName = tensorName; + } + + void setCurrentMemory(void *currentMemory) override { + mCurrentMemory = currentMemory; + } + + void setOutputSize(int64_t outputSize) override { mOutputSize = outputSize; } + + void *reallocateOutputAsync(char const *tensorName, void *currentMemory, + uint64_t size, uint64_t alignment, + cudaStream_t * /*stream*/) override; + + void notifyShape(char const *tensorName, const int64_t *dims, + int64_t nbDims) override; + + //! nullptr if memory could not be allocated + void *mOutputPtr{nullptr}; + + //! Size of allocation pointed to by output. + uint64_t mOutputSize{0}; + + bool mReallocateOutputCalled{false}; + + bool mNotifyShapeCalled{false}; + + //! Dimensions of tensor. + std::vector mOutputDims; + +private: + GpuAllocator* mGpuAllocator; + const char *mTensorName; + void *mCurrentMemory; +}; + +class OutputAllocatorTracker { +public: + OutputAllocatorTracker() = default; + ~OutputAllocatorTracker() = default; + + OutputAllocatorTracker(const OutputAllocatorTracker &) = delete; + OutputAllocatorTracker &operator=(const OutputAllocatorTracker &) = delete; + OutputAllocatorTracker(OutputAllocatorTracker &&) = default; + OutputAllocatorTracker &operator=(OutputAllocatorTracker &&) = default; + + // Add a new OutputAllocator + void addAllocator(void *ptr, OutputAllocator *allocator) { + mOutputAllocatorRegistry.emplace_back(std::make_pair(ptr, allocator)); + } + + // Get a reference to an OutputAllocator + OutputAllocator *getAllocator(void *ptr) { + auto it = std::find_if( + mOutputAllocatorRegistry.begin(), mOutputAllocatorRegistry.end(), + [ptr](const auto &pair) { return pair.first == ptr; }); + + if (it != mOutputAllocatorRegistry.end()) { + return it->second; + } + return nullptr; + } + +private: + std::vector> mOutputAllocatorRegistry; +}; + //===----------------------------------------------------------------------===// // PoolTrackedCudaEvent //===----------------------------------------------------------------------===// diff --git a/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp b/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp index 7e7eb73be..abd3b06c7 100644 --- a/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp +++ b/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp @@ -415,7 +415,64 @@ static void dlpackManagedTensorDeleter(DLManagedTensor *tensor) { } } -MLIR_CAPI_EXPORTED MTRT_Status mtrtMemRefValueGetDLPackManagedTensor( + +class OutputAllocatorWrapper : public OutputAllocator { +private: + 
MTRT_OutputAllocator mPyOutputAllocator; + +public: + OutputAllocatorWrapper(MTRT_OutputAllocator outputAllocator) + : mPyOutputAllocator(outputAllocator) {} + + void setGpuAllocator(GpuAllocator *gpuAllocator) override { + return mPyOutputAllocator.setGpuAllocator( + mPyOutputAllocator.ptr, + MTRT_GpuAllocator{gpuAllocator, nullptr, nullptr}); + } + + void setTensorName(const char *tensorName) override { + return mPyOutputAllocator.setTensorName(mPyOutputAllocator.ptr, tensorName); + } + + void setCurrentMemory(void *currentMemory) override { + return mPyOutputAllocator.setCurrentMemory(mPyOutputAllocator.ptr, + currentMemory); + } + + void setOutputSize(const int64_t outputSize) override { + return mPyOutputAllocator.setOutputSize(mPyOutputAllocator.ptr, outputSize); + } + + void *reallocateOutputAsync(char const *tensorName, void *currentMemory, + uint64_t size, uint64_t alignment, + cudaStream_t *stream) override { + return mPyOutputAllocator.reallocateOutputAsync(mPyOutputAllocator.ptr, + tensorName, currentMemory, + size, alignment, stream); + } + + void notifyShape(char const *tensorName, const int64_t *dims, + int64_t nbDims) override { + return mPyOutputAllocator.notifyShape(mPyOutputAllocator.ptr, tensorName, + dims, nbDims); + } + + // Static method to create a OutputAllocator from MTRT_OutputAllocator + static std::unique_ptr + create(MTRT_OutputAllocator outputAllocator) { + if (!outputAllocator.ptr || !outputAllocator.setGpuAllocator || + !outputAllocator.setTensorName || !outputAllocator.setCurrentMemory || + !outputAllocator.setOutputSize || + !outputAllocator.reallocateOutputAsync || + !outputAllocator.notifyShape) { + llvm::errs() << "Invalid MTRT_OutputAllocator passed to create()"; + return nullptr; + } + return std::make_unique(outputAllocator); + } +}; + +MTRT_Status mtrtMemRefValueGetDLPackManagedTensor( MTRT_MemRefValue memrefValue, MTRT_DLPackManagedTensor *outTensor) { MemRefValue memref = *unwrap(memrefValue); @@ -462,7 +519,7 @@ MLIR_CAPI_EXPORTED MTRT_Status mtrtMemRefValueGetDLPackManagedTensor( return mtrtStatusGetOk(); } -MLIR_CAPI_EXPORTED MTRT_Status mtrtMemRefValueGetDLPackDevice( +MTRT_Status mtrtMemRefValueGetDLPackDevice( MTRT_MemRefValue memrefValue, int32_t *device_type, int32_t *device_id) { MemRefValue memref = *unwrap(memrefValue); int device = memref.getDevice().has_value() @@ -626,6 +683,25 @@ mtrtRuntimeSessionOptionsDestroy(MTRT_RuntimeSessionOptions options) { // MTRT_RuntimeSession //===----------------------------------------------------------------------===// +MTRT_Status mtrtAddMemRefOutputAllocatorSessionRegistry( + MTRT_MemRefValue memrefValue, MTRT_OutputAllocator pyOutputAllocator) { + auto memref = unwrap(memrefValue); + + std::unique_ptr outputAllocator; + if (pyOutputAllocator.ptr) { + outputAllocator.reset( + OutputAllocatorWrapper::create(pyOutputAllocator).release()); + } + + // Client should own the output allocator. + memref->getClient()->addOutputAllocator(std::move(outputAllocator)); + + // Store the output allocator reference. + memref->setOutputAllocator(memref->getClient()->getLastOutputAllocator()); + + return mtrtStatusGetOk(); +} + // A wrapper class for MTRT_GpuAllocator implementing the GpuAllocator // interface. It encapsulates GPU memory allocation and deallocation operations, // ensuring correct routing of callbacks from C++ to Python. 
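For context, and not part of the patch itself: a C-API client has to populate every field of MTRT_OutputAllocator before handing it to the runtime, since the wrapper above rejects structs with null callbacks. The following is a minimal sketch of how that could look; `MyOutputState`, the growth policy, and the include paths are illustrative assumptions rather than code from this change.

#include <cstdint>
#include <vector>
#include "cuda_runtime.h"
#include "mlir-executor-c/Runtime/Runtime.h" // assumed include path

struct MyOutputState {            // hypothetical caller-owned per-tensor state
  void *mem = nullptr;
  uint64_t size = 0;
  std::vector<int64_t> shape;
};

static void mySetGpuAllocator(void *, MTRT_GpuAllocator) {}
static void mySetTensorName(void *, const char *) {}
static void mySetCurrentMemory(void *self, void *mem) {
  static_cast<MyOutputState *>(self)->mem = mem;
}
static void mySetOutputSize(void *self, const int64_t size) {
  static_cast<MyOutputState *>(self)->size = size;
}

static void *myReallocateOutputAsync(void *self, const char *, void *currentMemory,
                                     uint64_t size, uint64_t /*alignment*/,
                                     cudaStream_t *stream) {
  auto *state = static_cast<MyOutputState *>(self);
  if (size <= state->size)            // existing buffer is already large enough
    return state->mem ? state->mem : currentMemory;
  void *newMem = nullptr;
  cudaError_t err = (stream && *stream) ? cudaMallocAsync(&newMem, size, *stream)
                                        : cudaMalloc(&newMem, size);
  if (err != cudaSuccess)
    return nullptr;                   // null signals allocation failure
  // (a real implementation would also free the previous buffer here)
  state->mem = newMem;
  state->size = size;
  return newMem;
}

static void myNotifyShape(void *self, const char *, const int64_t *dims,
                          int64_t nbDims) {
  static_cast<MyOutputState *>(self)->shape.assign(dims, dims + nbDims);
}

MTRT_OutputAllocator makeOutputAllocator(MyOutputState *state) {
  MTRT_OutputAllocator cb;
  cb.ptr = state;                     // opaque `self` passed back to each callback
  cb.setGpuAllocator = mySetGpuAllocator;
  cb.setTensorName = mySetTensorName;
  cb.setCurrentMemory = mySetCurrentMemory;
  cb.setOutputSize = mySetOutputSize;
  cb.reallocateOutputAsync = myReallocateOutputAsync;
  cb.notifyShape = myNotifyShape;
  return cb;
}

A memref intended to receive a data-dependent-shape result would then be registered via mtrtAddMemRefOutputAllocatorSessionRegistry(memref, makeOutputAllocator(&state)) before the function is executed.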
@@ -656,10 +732,11 @@ class GpuAllocatorWrapper : public GpuAllocator { } }; -MTRT_Status mtrtRuntimeSessionCreate(MTRT_RuntimeSessionOptions options, - MTRT_Executable executable, - MTRT_GpuAllocator gpuAllocator, - MTRT_RuntimeSession *result) { +MTRT_Status +mtrtRuntimeSessionCreate(MTRT_RuntimeSessionOptions options, + MTRT_Executable executable, + MTRT_GpuAllocator gpuAllocator, + MTRT_RuntimeSession *result) { RuntimeSessionOptions *cppOptions = unwrap(options); Executable *cppExecutable = unwrap(executable); diff --git a/mlir-tensorrt/executor/lib/Runtime/API/API.cpp b/mlir-tensorrt/executor/lib/Runtime/API/API.cpp index 52b02f72a..583dc344b 100644 --- a/mlir-tensorrt/executor/lib/Runtime/API/API.cpp +++ b/mlir-tensorrt/executor/lib/Runtime/API/API.cpp @@ -358,12 +358,14 @@ RuntimeSession::RuntimeSession( std::unique_ptr pinnedMemoryAllocator, std::unique_ptr allocTracker, std::unique_ptr resourceTracker, + std::unique_ptr outputAllocatorTracker, std::unique_ptr gpuAllocator) : options(std::move(options)), executable(exe), pinnedMemoryAllocator(std::move(pinnedMemoryAllocator)), allocTracker(std::move(allocTracker)), - resourceTracker(std::move(resourceTracker)), gpuAllocator(std::move(gpuAllocator)), - state(std::move(state)) {} + resourceTracker(std::move(resourceTracker)), + outputAllocatorTracker(std::move(outputAllocatorTracker)), + gpuAllocator(std::move(gpuAllocator)), state(std::move(state)) {} //===----------------------------------------------------------------------===// // AllocTracker diff --git a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp index 17af64a91..81897d8c2 100644 --- a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp +++ b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp @@ -72,7 +72,8 @@ static void registerDefaultDeviceDependentMethods(lua_State *state, static void registerLuaRuntimeMethodsCommon( lua_State *state, PinnedMemoryAllocator *pinnedMemoryAllocator, - AllocTracker *allocTracker, ResourceTracker *resourceTracker, GpuAllocator* allocator) { + AllocTracker *allocTracker, ResourceTracker *resourceTracker, + GpuAllocator *allocator, OutputAllocatorTracker *outputAllocatorTracker) { registerExecutorCoreModuleLuaRuntimeMethods(state, pinnedMemoryAllocator, allocTracker); registerExecutorCUDAModuleLuaRuntimeMethods( @@ -84,15 +85,15 @@ static void registerLuaRuntimeMethodsCommon( #endif registerExecutorTensorRTModuleLuaRuntimeMethods( - state, pinnedMemoryAllocator, allocTracker, resourceTracker, allocator); + state, pinnedMemoryAllocator, allocTracker, resourceTracker, outputAllocatorTracker, allocator); } void mlirtrt::runtime::registerLuaRuntimeMethods( lua_State *state, const RuntimeSessionOptions &options, PinnedMemoryAllocator *pinnedMemoryAllocator, AllocTracker *allocTracker, - ResourceTracker *resourceTracker, GpuAllocator* allocator) { + ResourceTracker *resourceTracker, OutputAllocatorTracker* outputAllocatorTracker, GpuAllocator* allocator) { registerLuaRuntimeMethodsCommon(state, pinnedMemoryAllocator, allocTracker, - resourceTracker, allocator); + resourceTracker, allocator, outputAllocatorTracker); #ifdef MLIR_EXECUTOR_ENABLE_NCCL registerExecutorNCCLModuleLuaRuntimeMethods(state, resourceTracker); registerDeviceDependentNCCLMethods(state, options.getNumDevices(), @@ -107,8 +108,8 @@ void mlirtrt::runtime::registerLuaRuntimeMethods( #endif } -StatusOr -mlirtrt::runtime::runExecutorLuaScript(std::string_view luaScript, GpuAllocator* 
allocator) { +StatusOr mlirtrt::runtime::runExecutorLuaScript( + std::string_view luaScript, GpuAllocator *allocator) { ADD_RUNTIME_MODULE_RANGE("runtime_runExecutorLuaScript"); StatusOr> client = RuntimeClient::create(); @@ -117,10 +118,11 @@ mlirtrt::runtime::runExecutorLuaScript(std::string_view luaScript, GpuAllocator* sol::state lua; lua.open_libraries(sol::lib::base, sol::lib::string); - registerLuaRuntimeMethods(lua.lua_state(), RuntimeSessionOptions(), - &(*client)->getPinnedMemorAllocator(), - &(*client)->getAllocTracker(), - &(*client)->getResourceTracker(), allocator); + registerLuaRuntimeMethods( + lua.lua_state(), RuntimeSessionOptions(), + &(*client)->getPinnedMemorAllocator(), &(*client)->getAllocTracker(), + &(*client)->getResourceTracker(), nullptr /* Output allocator */, + allocator /* can this be nullptr as well */); sol::protected_function_result result = lua.script(luaScript); if (!result.valid()) { @@ -171,7 +173,8 @@ static Status maybeCheckForValidNcclUuid(const RuntimeSessionOptions &options) { /// global initialization. StatusOr> mlirtrt::runtime::createRuntimeSessionWithLuaBackend( - ExecutableView executable, std::unique_ptr allocator, const RuntimeSessionOptions &options) { + ExecutableView executable, std::unique_ptr allocator, + const RuntimeSessionOptions &options) { ADD_RUNTIME_MODULE_RANGE("runtime_loadExecutable"); MTRT_RETURN_IF_ERROR(maybeCheckForValidNcclUuid(options)); @@ -179,12 +182,13 @@ mlirtrt::runtime::createRuntimeSessionWithLuaBackend( auto pinnedMemoryAllocator = std::make_unique(); auto allocTracker = std::make_unique(); auto resourceTracker = std::make_unique(); + auto outputAllocatorTracker = std::make_unique(); sol::state lua; lua.open_libraries(sol::lib::base, sol::lib::string); - registerLuaRuntimeMethods(lua.lua_state(), options, - pinnedMemoryAllocator.get(), allocTracker.get(), - resourceTracker.get(), allocator.get()); + registerLuaRuntimeMethods( + lua.lua_state(), options, pinnedMemoryAllocator.get(), allocTracker.get(), + resourceTracker.get(), outputAllocatorTracker.get(), allocator.get()); // Load globals into the context. // TODO: eliminate this copy, we already own the executable. @@ -225,11 +229,13 @@ mlirtrt::runtime::createRuntimeSessionWithLuaBackend( } return std::make_unique( options, executable, std::move(lua), std::move(pinnedMemoryAllocator), - std::move(allocTracker), std::move(resourceTracker), std::move(allocator)); + std::move(allocTracker), std::move(resourceTracker), + std::move(outputAllocatorTracker), std::move(allocator)); } StatusOr mlirtrt::runtime::runExecutorExecutable( - std::unique_ptr executable, std::unique_ptr allocator) { + std::unique_ptr executable, + std::unique_ptr allocator) { StatusOr> client = RuntimeClient::create(); if (!client.isOk()) @@ -245,7 +251,8 @@ StatusOr mlirtrt::runtime::runExecutorExecutable( return options.getStatus(); StatusOr> session = - createRuntimeSessionWithLuaBackend(executable->getView(), std::move(allocator), *options); + createRuntimeSessionWithLuaBackend(executable->getView(), + std::move(allocator), *options); if (!session.isOk()) return session.getStatus(); @@ -465,6 +472,8 @@ runtime::executeFunctionWithLuaBackend( // Call the main function, if present. 
sol::state_view lua(session.getLuaState()); AllocTracker &tracker = session.getAllocTracker(); + OutputAllocatorTracker &outputAllocatorTracker = session.getOutputAllocatorTracker(); + sol::protected_function funcObj = lua[name]; if (funcObj.get_type() != sol::type::function) return getStatusWithMsg(StatusCode::InternalError, "no function named \"", @@ -523,6 +532,12 @@ runtime::executeFunctionWithLuaBackend( for (auto [idx, rv] : llvm::enumerate(outputArgs)) { if (MemRefValue *memref = llvm::dyn_cast(rv)) { MTRT_RETURN_IF_ERROR(pushMemRefTableArg(lua, tracker, args, *memref)); + + // Creating a mapping from memref pointer to output allocator tracker. + if (memref->getOutputAllocator()) { + outputAllocatorTracker.addAllocator(memref->getVoidPtr(), memref->getOutputAllocator()); + } + continue; } return getInvalidArgStatus("output (destination) argument #{0} to function " diff --git a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp index 1ae3b4440..1aed00592 100644 --- a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp +++ b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp @@ -64,6 +64,105 @@ class StdioLogger : public nvinfer1::ILogger { bool verbose; }; +//===----------------------------------------------------------------------===// +// TensorRTCallBackOutputAllocator +//===----------------------------------------------------------------------===// + +static bool isSubByte(nvinfer1::DataType t) { + return t == nvinfer1::DataType::kINT4; +} + +static int32_t elementSizeInBits(nvinfer1::DataType t) { + switch (t) { + case nvinfer1::DataType::kINT64: + return 64; + case nvinfer1::DataType::kINT32: + return 32; + case nvinfer1::DataType::kFLOAT: + return 32; + case nvinfer1::DataType::kHALF: + return 16; + case nvinfer1::DataType::kBF16: + return 16; + case nvinfer1::DataType::kINT8: + return 8; + case nvinfer1::DataType::kBOOL: + return 8; + case nvinfer1::DataType::kUINT8: + return 8; + case nvinfer1::DataType::kFP8: + return 8; + case nvinfer1::DataType::kINT4: + return 4; + } + return 0; +} + +static int32_t elementeSizeInBytes(nvinfer1::DataType dtype) { + if (!isSubByte(dtype)) { + auto bits = elementSizeInBits(dtype); + assert(bits % 8 == 0); + return bits / 8; + } + if (dtype == nvinfer1::DataType::kINT4) { + return 1; + } + return -1; +} + +static int64_t volume(nvinfer1::Dims64 const& d) +{ + int64_t v = 1; + for (int64_t i = 0; i < d.nbDims; i++) + { + v *= d.d[i]; + } + return v; +} + +class TensorRTCallBackOutputAllocator final + : public nvinfer1::IOutputAllocator { +public: + TensorRTCallBackOutputAllocator(GpuAllocator* gpuAllocator, OutputAllocator *outputAllocator, + const char *tensorName, void *currentMemory, + nvinfer1::Dims64 dims, + nvinfer1::DataType dtype) + : nvinfer1::IOutputAllocator(), + mOutputAllocatorCallBack(outputAllocator) { + mOutputAllocatorCallBack->setGpuAllocator(gpuAllocator); + mOutputAllocatorCallBack->setTensorName(tensorName); + mOutputAllocatorCallBack->setCurrentMemory(currentMemory); + mOutputAllocatorCallBack->setOutputSize(volume(dims) * + elementeSizeInBytes(dtype)); + } + + void *reallocateOutput(char const *tensorName, void *currentMemory, + uint64_t size, uint64_t alignment) noexcept override { + return mOutputAllocatorCallBack->reallocateOutputAsync( + tensorName, currentMemory, size, alignment, nullptr); + } + + //! 
IMirroredBuffer does not implement Async allocation, hence this is just a + //! wrap around + void *reallocateOutputAsync(char const *tensorName, void *currentMemory, + uint64_t size, uint64_t alignment, + cudaStream_t stream) noexcept override { + + return mOutputAllocatorCallBack->reallocateOutputAsync( + tensorName, currentMemory, size, alignment, &stream); + } + + void notifyShape(char const *tensorName, + nvinfer1::Dims const &dims) noexcept override { + return mOutputAllocatorCallBack->notifyShape(tensorName, &dims.d[0], dims.nbDims); + } + + ~TensorRTCallBackOutputAllocator() override {} + +private: + OutputAllocator *mOutputAllocatorCallBack; +}; + //===----------------------------------------------------------------------===// // TensorRTCallBackAllocator //===----------------------------------------------------------------------===// @@ -125,9 +224,11 @@ class NvInferRuntimeWrapper { }); // GpuAllocator is optional. if (gpuAllocator) { - callbackAllocator = std::shared_ptr( - new TensorRTCallBackAllocator(gpuAllocator)); - runtime->setGpuAllocator(callbackAllocator.get()); + callbackAllocatorPair = + std::make_pair(std::shared_ptr( + new TensorRTCallBackAllocator(gpuAllocator)), + gpuAllocator); + runtime->setGpuAllocator(callbackAllocatorPair.first.get()); } } @@ -135,7 +236,7 @@ class NvInferRuntimeWrapper { nvinfer1::IRuntime *operator->() { return runtime.get(); } std::shared_ptr runtime; - std::shared_ptr callbackAllocator; + std::pair, GpuAllocator*> callbackAllocatorPair; }; class NvInferEngineWrapper { @@ -234,6 +335,22 @@ class NvInferExecContextWrapper { /// Returned the pre-allocated host staging buffers. std::vector &getHostIOBuffers() { return hostIOBuffers; } + /// Add a call back output allocator. + void addCallBackAllocators( + std::unique_ptr allocator) { + outputAllocators.emplace_back(std::move(allocator)); + } + + /// Return the last call back output allocator pointer. + TensorRTCallBackOutputAllocator *getLastCallBackAllocatorPtr() { + return outputAllocators.back().get(); + } + + /// Return registered callback gpu allocator. + GpuAllocator *getGpuAllocator() { + return engine->runtime->callbackAllocatorPair.second; + } + private: // We keep a reference to the cuda engine to keep it from going out of scope. // The standard TensorRTRuntime-to-Executor lowering only creates globals for @@ -247,13 +364,14 @@ class NvInferExecContextWrapper { /// A set of pinned host buffers one per input host buffer (shape tensor) to /// the TRT network. 
 std::vector hostIOBuffers;
+  std::vector<std::unique_ptr<TensorRTCallBackOutputAllocator>> outputAllocators;
 };
 } // namespace
 
-static Status setTensorAddressesOrReport(
+static Status setTensorAddressesAndOutputAllocatorsOrReport(
     NvInferExecContextWrapper &context,
     const std::vector>
-        &buffers) {
+        &buffers, OutputAllocatorTracker &outputAllocatorTracker) {
   ADD_TENSORRT_MODULE_RANGE("set_tensor_addresses");
   unsigned idx = 0;
   for (auto &[name, ptr, dims] : buffers) {
@@ -266,9 +384,10 @@
     bool result =
         context->setTensorAddress(name.c_str(), reinterpret_cast<void *>(ptr));
 
+    const nvinfer1::ICudaEngine &engine = context->getEngine();
+
     if (!result) {
       std::stringstream ss;
-      const nvinfer1::ICudaEngine &engine = context->getEngine();
       ss << "Failed to set tensor address for IO tensor: " << name
          << " at position " << idx << "; the IO tensors are:\n";
       for (int64_t i = 0; i < engine.getNbIOTensors(); i++) {
@@ -289,6 +408,37 @@
       return getInternalErrorStatus("failed to set input shape");
     }
 
+    // Set output allocators for device-resident output tensors.
+    if (engine.getTensorIOMode(name.c_str()) ==
+            nvinfer1::TensorIOMode::kOUTPUT &&
+        engine.getTensorLocation(name.c_str()) ==
+            nvinfer1::TensorLocation::kDEVICE) {
+
+      // Setting an output allocator is optional; only do so if one is registered for this pointer.
+      if (outputAllocatorTracker.getAllocator(reinterpret_cast<void *>(ptr)) !=
+          nullptr) {
+        context.addCallBackAllocators(
+            std::make_unique<TensorRTCallBackOutputAllocator>(
+                context.getGpuAllocator(),
+                outputAllocatorTracker.getAllocator(
+                    reinterpret_cast<void *>(ptr)),
+                name.c_str(), reinterpret_cast<void *>(ptr), dims,
+                engine.getTensorDataType(name.c_str())));
+        context->setOutputAllocator(name.c_str(),
+                                    static_cast<nvinfer1::IOutputAllocator *>(
+                                        context.getLastCallBackAllocatorPtr()));
+      } else {
+        // A previous call with the same output name but a different memref
+        // pointer may have set an output allocator. Because of the naming
+        // scheme, outputs are always named "result0", "result1", and so on.
+        // If no tracker entry is found for the given pointer, unset the
+        // output allocator.
+        if (context->getOutputAllocator(name.c_str())) {
+          context->setOutputAllocator(name.c_str(), nullptr);
+        }
+      }
+    }
+
     MTRT_DBGF("Set tensor address [%d] = %lu", idx, ptr);
     idx++;
   }
@@ -390,6 +540,7 @@ prepareBuffers(const AllocTracker &allocTracker,
 
 static Status enqueueV3Wrapper(AllocTracker &tracker,
                                ResourceTracker &resourceTracker,
+                               OutputAllocatorTracker &outputAllocatorTracker,
                                NvInferExecContextWrapper &context,
                                CudaStreamPtr stream, sol::table &va) {
   StatusOr>>
@@ -398,8 +549,8 @@ static Status enqueueV3Wrapper(AllocTracker &tracker,
     return getStatusWithMsg(StatusCode::InternalError,
                             "failed to prepare buffers: ", buffers.getString());
 
-  MTRT_RETURN_IF_ERROR(setTensorAddressesOrReport(context, *buffers));
+  MTRT_RETURN_IF_ERROR(setTensorAddressesAndOutputAllocatorsOrReport(context, *buffers, outputAllocatorTracker));
 
   // Create an event that we can wait on for releasing any host-pinned staging
   // allocations we made.
 MTRT_ASSIGN_OR_RETURN(CudaEventPtr inputConsumedEvent,
@@ -426,7 +577,8 @@ static Status enqueueV3Wrapper(AllocTracker &tracker,
 //===----------------------------------------------------------------------===//
 void mlirtrt::runtime::registerExecutorTensorRTModuleLuaRuntimeMethods(
     lua_State *luaState, PinnedMemoryAllocator *pinnedMemoryAllocator,
-    AllocTracker *allocTracker, ResourceTracker *resourceTracker, GpuAllocator* allocator) {
+    AllocTracker *allocTracker, ResourceTracker *resourceTracker,
+    OutputAllocatorTracker *outputAllocatorTracker, GpuAllocator *allocator) {
   sol::state_view lua(luaState);
 
   lua["_trtrt_create_runtime"] =
@@ -463,16 +615,17 @@ void mlirtrt::runtime::registerExecutorTensorRTModuleLuaRuntimeMethods(
       };
 
   lua["_trtrt_enqueue"] =
-      [allocTracker,
-       resourceTracker](sol::this_state state,
-                        std::shared_ptr<NvInferExecContextWrapper> context,
-                        CudaStreamPtr stream, sol::table va) {
+      [allocTracker, resourceTracker, outputAllocatorTracker](
+          sol::this_state state,
+          std::shared_ptr<NvInferExecContextWrapper> context,
+          CudaStreamPtr stream, sol::table va) {
        ADD_TENSORRT_MODULE_RANGE("trtrt_enqueue");
        sol::state_view luaState(state);
        assert(context != nullptr);
        assert(stream != nullptr && "expected valid stream");
-        Status result = enqueueV3Wrapper(*allocTracker, *resourceTracker,
-                                         *context, stream, va);
+        Status result =
+            enqueueV3Wrapper(*allocTracker, *resourceTracker,
+                             *outputAllocatorTracker, *context, stream, va);
         SET_LUA_ERROR_IF_ERROR(result, state);
       };
 }
diff --git a/mlir-tensorrt/executor/lib/Support/Allocators.cpp b/mlir-tensorrt/executor/lib/Support/Allocators.cpp
index 100cb0361..8def6eb90 100644
--- a/mlir-tensorrt/executor/lib/Support/Allocators.cpp
+++ b/mlir-tensorrt/executor/lib/Support/Allocators.cpp
@@ -42,6 +42,73 @@ using namespace mlirtrt;
   DEBUG_WITH_TYPE("allocators", fprintf(stderr, "%s:%d " fmt "\n", __FILE__, \
                                         __LINE__, __VA_ARGS__))
 
+//===----------------------------------------------------------------------===//
+// CustomTensorRTOutputAllocator
+//===----------------------------------------------------------------------===//
+
+inline uint64_t roundUp(uint64_t m, uint64_t n) {
+  return ((m + n - 1) / n) * n;
+}
+
+void *CustomTensorRTOuputAllocator::reallocateOutputAsync(
+    char const *tensorName, void *currentMemory, uint64_t size,
+    uint64_t alignment, cudaStream_t *stream) {
+
+  assert(currentMemory == mCurrentMemory && "output buffer mismatch");
+  assert(strcmp(tensorName, mTensorName) == 0 && "tensor name mismatch");
+  assert(!mReallocateOutputCalled && "duplicate call to reallocateOutput");
+  mReallocateOutputCalled = true;
+  // Some memory allocators return nullptr when allocating zero bytes, but
+  // TensorRT requires a non-null ptr even for empty tensors, so allocate a
+  // dummy byte.
+  size = std::max(size, static_cast<uint64_t>(1));
+
+  // Check if reallocation is required.
+  if (size > mOutputSize) {
+    size = roundUp(size, alignment);
+
+    if (mOutputPtr) {
+      if (mGpuAllocator) {
+        // Use the registered callback GPU allocator for output deallocations.
+        mGpuAllocator->deallocate(mOutputPtr, stream);
+      } else {
+        // Fall back to local memory management.
+        cudaFree(mOutputPtr);
+      }
+    }
+
+    mOutputPtr = nullptr;
+    mOutputSize = 0;
+
+    void *memory;
+    if (mGpuAllocator) {
+      // Use the registered callback GPU allocator for output allocations.
+      memory = mGpuAllocator->allocate(size, alignment, 0 /* flags */, stream);
+    } else {
+      // Fall back to local memory management.
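+      // Note: cudaMalloc/cudaFree are not stream-ordered, so this fallback
+      // may synchronize the device, unlike the stream-aware GpuAllocator
+      // path above.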
+      cudaMalloc(&memory, size);
+    }
+    mOutputPtr = memory;
+    if (mOutputPtr != nullptr) {
+      mOutputSize = size;
+    }
+    return mOutputPtr;
+  }
+  return mCurrentMemory;
+}
+
+void CustomTensorRTOuputAllocator::notifyShape(char const *tensorName,
+                                               const int64_t *dims,
+                                               int64_t nbDims) {
+  assert(mReallocateOutputCalled &&
+         "TensorRT must invoke reallocateOutput first");
+  assert(!mNotifyShapeCalled && "duplicate call to notifyShape");
+  assert(strcmp(tensorName, mTensorName) == 0 && "tensor name mismatch");
+
+  mNotifyShapeCalled = true;
+  mOutputDims.resize(nbDims);
+  std::copy_n(dims, nbDims, mOutputDims.begin());
+}
+
 //===----------------------------------------------------------------------===//
 // CustomTensorRTAllocator
 //===----------------------------------------------------------------------===//
diff --git a/mlir-tensorrt/executor/lib/Tools/ExecutorRunnerMain.cpp b/mlir-tensorrt/executor/lib/Tools/ExecutorRunnerMain.cpp
index dc14db16f..d06b59618 100644
--- a/mlir-tensorrt/executor/lib/Tools/ExecutorRunnerMain.cpp
+++ b/mlir-tensorrt/executor/lib/Tools/ExecutorRunnerMain.cpp
@@ -179,8 +179,6 @@ executor::ExecutorRunnerMain(int argc, char **argv,
     allocator.reset(new CustomTensorRTAllocator());
   }
 
-  // Read the buffer as a Lua script and execute.
-
   if (options.inputType == Lua) {
     assert(!options.dumpFunctionSignature &&
            "Can not dump function signature for Lua input type.");
@@ -213,7 +211,8 @@ executor::ExecutorRunnerMain(int argc, char **argv,
   }
 
   mlirtrt::StatusOr executionResult =
-      mlirtrt::runtime::runExecutorExecutable(std::move(*executable), std::move(allocator));
+      mlirtrt::runtime::runExecutorExecutable(std::move(*executable),
+                                              std::move(allocator));
   if (!executionResult.isOk())
     return emitError(UnknownLoc::get(&context))
            << "failed to load and run executable: "
diff --git a/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp b/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp
index 3f45138c8..efe2bf7ec 100644
--- a/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp
+++ b/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp
@@ -193,6 +193,7 @@ class PyRuntimeValue : public PyMTRTWrapper {
 class PyGpuAllocator {
 public:
   py::object pySelf;
+  // This ensures the PyGpuAllocator is not deallocated while the corresponding Python object is still alive.
   PyGpuAllocator(py::object self) : pySelf(self) {}
 
   virtual ~PyGpuAllocator() = default;
@@ -260,6 +261,127 @@ class PyGpuAllocatorTrampoline : public PyGpuAllocator {
   }
 };
 
+class PyOutputAllocator {
+public:
+  py::object pySelf;
+  // This ensures the PyOutputAllocator is not deallocated while the
+  // corresponding Python object is still alive.
+  PyOutputAllocator(py::object self) : pySelf(self) {}
+
+  virtual ~PyOutputAllocator() = default;
+  virtual void setTensorName(const char *tensorName) = 0;
+  virtual void setCurrentMemory(uintptr_t currentMemory) = 0;
+  virtual void setOutputSize(const int64_t outputSize) = 0;
+  virtual uintptr_t reallocateOutputAsync(char const *tensorName,
+                                          uintptr_t currentMemory, uint64_t size,
+                                          uint64_t alignment) = 0;
+  virtual void notifyShape(char const *tensorName, const int64_t *dims,
+                           int64_t nbDims) = 0;
+  // Creates a C-compatible struct for interfacing with lower-level APIs.
+  MTRT_OutputAllocator getCApiObject() { return createWithPythonCallbacks(this); }
+
+private:
+  static void PySetGpuAllocator(void *self, MTRT_GpuAllocator gpuAllocator) {
+    // For now, let the user rely on the default GPU allocator.
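+    // Intentionally a no-op: the MTRT_GpuAllocator handle passed in is
+    // ignored, so output reallocation falls back to the default GPU
+    // allocator on the C++ side.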
+  }
+
+  static void PySetTensorName(void *self, const char *tensorName) {
+    py::gil_scoped_acquire acquire;
+    auto *allocator = static_cast<PyOutputAllocator *>(self);
+    return allocator->setTensorName(tensorName);
+  }
+
+  static void PySetCurrentMemory(void *self, void *currentMemory) {
+    py::gil_scoped_acquire acquire;
+    auto *allocator = static_cast<PyOutputAllocator *>(self);
+    return allocator->setCurrentMemory(
+        reinterpret_cast<uintptr_t>(currentMemory));
+  }
+
+  static void PySetOutputSize(void *self, const int64_t outputSize) {
+    py::gil_scoped_acquire acquire;
+    auto *allocator = static_cast<PyOutputAllocator *>(self);
+    return allocator->setOutputSize(outputSize);
+  }
+
+  static void *PyReallocateOutputAsync(void *self, char const *tensorName,
+                                       void *currentMemory, uint64_t size,
+                                       uint64_t alignment,
+                                       cudaStream_t * /*stream*/) {
+    py::gil_scoped_acquire acquire;
+    auto *allocator = static_cast<PyOutputAllocator *>(self);
+    return reinterpret_cast<void *>(allocator->reallocateOutputAsync(
+        tensorName, reinterpret_cast<uintptr_t>(currentMemory), size,
+        alignment));
+  }
+
+  static void PyNotifyShape(void *self, char const *tensorName,
+                            const int64_t *dims, int64_t nbDims) {
+    py::gil_scoped_acquire acquire;
+    auto *allocator = static_cast<PyOutputAllocator *>(self);
+    return allocator->notifyShape(tensorName, dims, nbDims);
+  }
+
+  // Constructs an MTRT_OutputAllocator with this instance's methods as
+  // callbacks.
+  static MTRT_OutputAllocator
+  createWithPythonCallbacks(PyOutputAllocator *allocator) {
+    MTRT_OutputAllocator capi_allocator;
+    capi_allocator.ptr = allocator;
+    capi_allocator.setGpuAllocator = PySetGpuAllocator;
+    capi_allocator.setTensorName = PySetTensorName;
+    capi_allocator.setCurrentMemory = PySetCurrentMemory;
+    capi_allocator.setOutputSize = PySetOutputSize;
+    capi_allocator.reallocateOutputAsync = PyReallocateOutputAsync;
+    capi_allocator.notifyShape = PyNotifyShape;
+    return capi_allocator;
+  }
+};
+
+// Pybind11 trampoline class for PyOutputAllocator.
+// Enables Python subclasses to override virtual methods of PyOutputAllocator.
+class PyOutputAllocatorTrampoline : public PyOutputAllocator {
+public:
+  using PyOutputAllocator::PyOutputAllocator; // Inherit constructors
+
+  // Trampoline for setTensorName: dispatches the call to the Python
+  // implementation if overridden.
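+  // The Python-side override is looked up by its snake_case name
+  // (set_tensor_name, set_current_memory, set_output_size,
+  // reallocate_output, notify_shape).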
+  void setTensorName(const char *tensorName) override {
+    PYBIND11_OVERRIDE_PURE(void,              // Return type
+                           PyOutputAllocator, // Parent class
+                           set_tensor_name,   // Name of function in Python
+                           tensorName);       // Arguments
+  }
+  void setCurrentMemory(uintptr_t currentMemory) override {
+    PYBIND11_OVERRIDE_PURE(void,               // Return type
+                           PyOutputAllocator,  // Parent class
+                           set_current_memory, // Name of function in Python
+                           currentMemory);     // Arguments
+  }
+  void setOutputSize(const int64_t outputSize) override {
+    PYBIND11_OVERRIDE_PURE(void,              // Return type
+                           PyOutputAllocator, // Parent class
+                           set_output_size,   // Name of function in Python
+                           outputSize);       // Arguments
+  }
+  uintptr_t reallocateOutputAsync(char const *tensorName,
+                                  uintptr_t currentMemory, uint64_t size,
+                                  uint64_t alignment) override {
+    PYBIND11_OVERRIDE_PURE(uintptr_t,         // Return type
+                           PyOutputAllocator, // Parent class
+                           reallocate_output, // Name of function in Python
+                           tensorName,        // Arguments
+                           currentMemory, size, alignment);
+  }
+  void notifyShape(char const *tensorName, const int64_t *dims,
+                   int64_t nbDims) override {
+    PYBIND11_OVERRIDE_PURE(void,              // Return type
+                           PyOutputAllocator, // Parent class
+                           notify_shape,      // Name of function in Python
+                           tensorName,        // Arguments
+                           dims, nbDims);
+  }
+};
+
 /// Python object type wrapper for `MTRT_StableHLOToExecutableOptions`.
 class PyRuntimeSessionOptions
     : public PyMTRTWrapper
+  py::class_<PyOutputAllocator, PyOutputAllocatorTrampoline>(m,
+                                                             "OutputAllocator")
+      .def(py::init<>(
+          [](py::object self) { return new PyOutputAllocatorTrampoline(self); }))
+      .def("set_tensor_name", &PyOutputAllocator::setTensorName)
+      .def("set_current_memory", &PyOutputAllocator::setCurrentMemory)
+      .def("set_output_size", &PyOutputAllocator::setOutputSize)
+      .def("reallocate_output_async", &PyOutputAllocator::reallocateOutputAsync)
+      .def("notify_shape", &PyOutputAllocator::notifyShape)
+      .def("get_capi_object", &PyOutputAllocator::getCApiObject);
+
   py::class_(m, "RuntimeSession", py::module_local())
       .def(py::init<>([](PyRuntimeSessionOptions &options, PyExecutable &exe,
                          py::object gpu_allocator = py::none()) {
diff --git a/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py b/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py
index 1fa932fec..1e7289b71 100644
--- a/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py
+++ b/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py
@@ -37,6 +37,35 @@ def deallocate(self, ptr):
         return False
 
 
+class CupyOutputAllocator(runtime.OutputAllocator):
+    def __init__(self):
+        super().__init__(self)
+
+    def set_tensor_name(self, tensor_name):
+        self.tensor_name = tensor_name
+
+    def set_current_memory(self, memory):
+        self.memory = memory
+
+    def set_output_size(self, size):
+        self.size = size
+
+    def reallocate_output(self, tensor_name, memory, size, alignment):
+        assert self.tensor_name == tensor_name
+        assert self.memory == memory
+
+        if size > self.size:
+            # For now, just fail if reallocation is required.
+            assert 0
+
+        return self.memory
+
+    def notify_shape(self, tensor_name, dims, nb_dims):
+        assert self.tensor_name == tensor_name
+        self.dims = dims
+        self.nb_dims = nb_dims
+
+
 def stablehlo_add():
     # Build/parse the main function.
with ir.Context() as context: @@ -75,6 +104,10 @@ def stablehlo_add(): device=devices[0], stream=stream, ) + + output_allocator = CupyOutputAllocator() + arg1.set_output_allocator(output_allocator) + session.execute_function("main", in_args=[arg0], out_args=[arg1], stream=stream) data = np.asarray(client.copy_to_host(arg1, stream=stream)) @@ -88,12 +121,12 @@ def stablehlo_add(): start_time = time.time() for _ in range(0, num_iter): session.execute_function("main", in_args=[arg0], out_args=[arg0], stream=stream) - data = np.asarray(client.copy_to_host(arg1, stream=stream)) + data = np.asarray(client.copy_to_host(arg0, stream=stream)) stream.sync() end_time = time.time() elapsed = end_time - start_time - print(np.asarray(client.copy_to_host(arg0))) + print(np.asarray(data)) print(f"1000 iterations avg { (elapsed/num_iter)/1000.0} msec per iteration") diff --git a/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_dynamic.py b/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_dynamic.py index 82c31a05a..b8c56a6df 100644 --- a/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_dynamic.py +++ b/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_dynamic.py @@ -86,6 +86,35 @@ def infer_output_shape(client, session, exe, input_shape): return output_shape +class CupyOutputAllocator(runtime.OutputAllocator): + def __init__(self): + super().__init__(self) + + def set_tensor_name(self, tensor_name): + self.tensor_name = tensor_name + + def set_current_memory(self, memory): + self.memory = memory + + def set_output_size(self, size): + self.size = size + + def reallocate_output(self, tensor_name, memory, size, alignment): + assert self.tensor_name == tensor_name + assert self.memory == memory + + if size > self.size: + # For now just fail if reallocation is required. + assert 0 + + return self.memory + + def notify_shape(self, tensor_name, dims, nb_dims): + assert self.tensor_name == tensor_name + self.dims = dims + self.nb_dims = nb_dims + + def test_program(program: str, input_shape: Iterable[int], debug: bool = True): # Build/parse the main function. with ir.Context() as context: @@ -134,6 +163,15 @@ def test_program(program: str, input_shape: Iterable[int], debug: bool = True): stream=stream, ) + # # Preallocate dummy memory for 1 element. + # arg2 = client.create_memref( + # np.zeros((1, 1, 1), dtype=np.float32).data, + # device=devices[0], + # stream=stream, + # ) + # output_allocator = CupyOutputAllocator() + # arg1.set_output_allocator(output_allocator) + session.execute_function( "main", in_args=[arg0, arg1], out_args=[arg2], stream=stream )
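The CupyOutputAllocator used in these tests deliberately fails (assert 0) when TensorRT requests a larger output buffer than was preallocated. For reference, a variant that grows the buffer on demand might look like the sketch below. It is illustrative only: the class name and the use of cupy's cp.cuda.alloc are assumptions, not part of this patch, and it presumes the bindings dispatch to the Python override registered as reallocate_output, as in the trampoline above.

import cupy as cp


class GrowingCupyOutputAllocator(runtime.OutputAllocator):
    """Illustrative only: grows the output buffer instead of asserting."""

    def __init__(self):
        super().__init__(self)
        self._owned = None  # Keep the cupy allocation alive.

    def set_tensor_name(self, tensor_name):
        self.tensor_name = tensor_name

    def set_current_memory(self, memory):
        self.memory = memory

    def set_output_size(self, size):
        self.size = size

    def reallocate_output(self, tensor_name, memory, size, alignment):
        assert self.tensor_name == tensor_name
        if size <= self.size:
            return self.memory
        # Allocate a larger device buffer and hand its address back to
        # TensorRT; the returned pointer is assumed to satisfy `alignment`.
        self._owned = cp.cuda.alloc(size)
        self.memory = self._owned.ptr
        self.size = size
        return self.memory

    def notify_shape(self, tensor_name, dims, nb_dims):
        assert self.tensor_name == tensor_name
        self.dims = dims
        self.nb_dims = nb_dims

Such an allocator would be attached the same way as in the tests, e.g. arg1.set_output_allocator(GrowingCupyOutputAllocator()).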