Skip to content

Commit

Permalink
Add IGpuAllocator to MLIR-TensorRT
Browse files Browse the repository at this point in the history
  • Loading branch information
jhalakpatel committed Aug 16, 2024
1 parent 7705c34 commit 7b9a0f7
Show file tree
Hide file tree
Showing 14 changed files with 378 additions and 38 deletions.
4 changes: 2 additions & 2 deletions mlir-tensorrt/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ We currently support only building on Linux x86 systems.
We support building several different ways (only via CMake) depending on use-case.

In each case, the LLVM-Project version that we are currently aligned to is
given in `build_tools/cmake/LLVMCommit.cmake`.

Note that currently we provide an LLVM patch which essentially cherry-picks the
bug fixes from [this open MLIR PR](https://github.com/llvm/llvm-project/pull/91524).
Expand Down Expand Up @@ -82,7 +82,7 @@ git clone https://github.com/llvm/llvm-project.git llvm-project
# Checkout the right commit. Of course, you may try
# a newer commit or your own modified LLVM-Project.
cd llvm-project
git checkout $(cat build_tools/cmake/LLVMCommit.cmake | grep -Po '(?<=").*(?=")')
git checkout $(cat ../build_tools/cmake/LLVMCommit.cmake | grep -Po '(?<=").*(?=")')

# Apply patch from llvm-project PR 91524
git apply ../build_tools/llvm-project.patch
Expand Down
34 changes: 33 additions & 1 deletion mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,38 @@ mtrtScalarValueCastToRuntimeValue(MTRT_ScalarValue v);
MLIR_CAPI_EXPORTED MTRT_Status
mtrtScalarValueGetType(MTRT_ScalarValue scalar, MTRT_ScalarTypeCode *code);

//===----------------------------------------------------------------------===//
// MTRT_GpuAllocator
//===----------------------------------------------------------------------===//

typedef struct MTRT_GpuAllocator {
void *ptr;
} MTRT_GpuAllocator;

/// Checks nullity of `GpuAllocator`.
MTRT_CAPI_EXPORTED bool mtrtGpuAllocatorIsNull(MTRT_GpuAllocator gpuAllocator);

/// Returns null `GpuAllocator`.
MTRT_CAPI_EXPORTED MTRT_GpuAllocator mtrtGpuAllocatorGetNull();

MTRT_CAPI_EXPORTED MTRT_Status
mtrtGpuAllocatorDestroy(MTRT_GpuAllocator executable);

MTRT_CAPI_EXPORTED MTRT_Status
mtrtGpuAllocatorCreate(MTRT_GpuAllocator *allocator);

//===----------------------------------------------------------------------===//
// MTRT_GpuAllocator: allocation / deallocation
//===----------------------------------------------------------------------===//

MTRT_CAPI_EXPORTED MTRT_Status mtrtGpuAllocatorAllocate(
MTRT_GpuAllocator gpuAllocator, uint64_t size, uint64_t alignment,
uint32_t flags, MTRT_Stream stream, void **memory);

MTRT_CAPI_EXPORTED MTRT_Status
mtrtGpuAllocatorDeallocate(MTRT_GpuAllocator gpuAllocator, void *memory,
MTRT_Stream stream, bool *result);

//===----------------------------------------------------------------------===//
// MTRT_RuntimeSessionOptions
//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -359,7 +391,7 @@ typedef struct MTRT_RuntimeSession {
/// that the session only has a read-only view in to the Executable for code and
/// constant data. Therefore the Executable must outlive the RuntimeSession.
MLIR_CAPI_EXPORTED MTRT_Status mtrtRuntimeSessionCreate(
MTRT_RuntimeSessionOptions options, MTRT_Executable executable,
MTRT_RuntimeSessionOptions options, MTRT_Executable executable, MTRT_GpuAllocator allocator,
MTRT_RuntimeSession *result);

/// Destroy the session. This does not destroy the associated Executable, which
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -867,7 +867,8 @@ class RuntimeSession {
sol::state state,
std::unique_ptr<PinnedMemoryAllocator> pinnedMemoryAllocator,
std::unique_ptr<AllocTracker> allocTracker,
std::unique_ptr<ResourceTracker> resourceTracker);
std::unique_ptr<ResourceTracker> resourceTracker,
GpuAllocator* gpuAllocator);

ExecutableView getExecutable() const { return executable; }

Expand All @@ -881,14 +882,16 @@ class RuntimeSession {

ResourceTracker &getResourceTracker() { return *resourceTracker; }

GpuAllocator* getGpuAllocator() { return gpuAllocator; }

private:
RuntimeSessionOptions options;
ExecutableView executable;

std::unique_ptr<PinnedMemoryAllocator> pinnedMemoryAllocator;
std::unique_ptr<AllocTracker> allocTracker;
std::unique_ptr<ResourceTracker> resourceTracker;

GpuAllocator* gpuAllocator;
sol::state state;
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,6 @@ void registerLuaRuntimeMethods(lua_State *state,
const RuntimeSessionOptions &options,
PinnedMemoryAllocator *pinnedMemoryAllocator,
AllocTracker *allocTracker,
ResourceTracker *resourceTracker);
ResourceTracker *resourceTracker, GpuAllocator* allocator);

} // namespace mlirtrt::runtime
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ namespace mlirtrt::runtime {
/// `main` function. It is assumed that `main` takes no arguments and returns an
/// integer result (which is returned if the execution is successful).
/// TODO: this should take a handle to a function for streaming output/errors.
StatusOr<int64_t> runExecutorLuaScript(std::string_view luaScript);
StatusOr<int64_t> runExecutorLuaScript(std::string_view luaScript, GpuAllocator* allocator);

/// Synchronously run a serialized executor Executable one time. An `Executable`
/// is essentially a Lua script packaged with metadata and serialized constants
Expand All @@ -48,12 +48,12 @@ StatusOr<int64_t> runExecutorLuaScript(std::string_view luaScript);
/// execution is successful).
/// TODO: this should take a handle to a function for
/// streaming output/errors.
StatusOr<int64_t> runExecutorExecutable(std::unique_ptr<Executable> executable);
StatusOr<int64_t> runExecutorExecutable(std::unique_ptr<Executable> executable, GpuAllocator* allocator);

/// Create an execution state. This will setup a Lua environment and invoke
/// global initialization.
StatusOr<std::unique_ptr<RuntimeSession>>
createRuntimeSessionWithLuaBackend(ExecutableView executable,
createRuntimeSessionWithLuaBackend(ExecutableView executable, GpuAllocator* allocator,
const RuntimeSessionOptions &options);

/// Set the primary stream for the loaded executable to use.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ class ResourceTracker;
/// Lua state.
void registerExecutorTensorRTModuleLuaRuntimeMethods(
lua_State *luaState, PinnedMemoryAllocator *pinnedMemoryAllocator,
AllocTracker *allocTracker, ResourceTracker *resourceTracker);
AllocTracker *allocTracker, ResourceTracker *resourceTracker, GpuAllocator* allocator);

} // namespace mlirtrt::runtime

Expand Down
73 changes: 73 additions & 0 deletions mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,79 @@ namespace mlirtrt {

struct EventPool;

// Abstract allocator to be implemented by consumers.
using AllocatorFlags = uint32_t;

/// Interface for GPU memory allocation used by the runtime. Implementations
/// provide allocate/deallocate/reallocate; each method takes an optional CUDA
/// stream for asynchronous operation and reports failure through StatusOr.
class GpuAllocator {
public:
  GpuAllocator() = default;
  // Virtual dtor: instances are deleted through GpuAllocator* (see the C API).
  virtual ~GpuAllocator() = default;

  /// Resize the allocation at `baseAddr` to `newSize` bytes with the given
  /// `alignment`. NOTE(review): exact semantics (in-place growth vs.
  /// copy-to-new-block) are implementation-defined — confirm with implementers.
  virtual StatusOr<void *> reallocate(void *baseAddr, uint64_t alignment,
                                      uint64_t newSize,
                                      std::optional<cudaStream_t> stream) = 0;

  /// Allocate `size` bytes with `alignment` and implementation-defined
  /// `flags`; returns the new pointer or an error status.
  virtual StatusOr<void *> allocate(uint64_t const size,
                                    uint64_t const alignment,
                                    AllocatorFlags const flags,
                                    std::optional<cudaStream_t> stream) = 0;

  /// Release `memory`; returns whether a deallocation actually took place.
  virtual StatusOr<bool> deallocate(void *const memory,
                                    std::optional<cudaStream_t> stream) = 0;

protected:
  // Non-copyable and non-movable: implementations are shared by pointer.
  GpuAllocator(GpuAllocator const &) = delete;
  GpuAllocator(GpuAllocator &&) = delete;
  GpuAllocator &operator=(GpuAllocator const &) & = delete;
  GpuAllocator &operator=(GpuAllocator &&) & = delete;
};

class StubAllocator : public GpuAllocator {
public:
StubAllocator() = default;
~StubAllocator() = default;

StatusOr<void *> reallocate(void *baseAddr, uint64_t alignment,
uint64_t newSize,
std::optional<cudaStream_t> stream) override {
return getStatusWithMsg(
StatusCode::InternalError,
"[StubAllocator][reallocate]: Must be overriden in Python");
}

StatusOr<void *> allocate(uint64_t const size, uint64_t const alignment,
AllocatorFlags const flags,
std::optional<cudaStream_t> stream) override {
return getStatusWithMsg(
StatusCode::InternalError,
"[StubAllocator][allocate]: Must be overriden in Python");
}

StatusOr<bool> deallocate(void *const memory,
std::optional<cudaStream_t> stream) override {
return getStatusWithMsg(
StatusCode::InternalError,
"[StubAllocator][deallocate]: Must be overriden in Python");
}
};

class CustomTensorRTAllocator : public GpuAllocator {
public:
CustomTensorRTAllocator() = default;
~CustomTensorRTAllocator() = default;

StatusOr<void *> reallocate(void *baseAddr, uint64_t alignment,
uint64_t newSize,
std::optional<cudaStream_t> stream) override;

StatusOr<void *> allocate(uint64_t const size, uint64_t const alignment,
AllocatorFlags const flags,
std::optional<cudaStream_t> stream) override;

StatusOr<bool> deallocate(void *const memory,
std::optional<cudaStream_t> stream) override;
};

//===----------------------------------------------------------------------===//
// PoolTrackedCudaEvent
//===----------------------------------------------------------------------===//
Expand Down
56 changes: 55 additions & 1 deletion mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include "mlir-executor/Runtime/API/API.h"
#include "mlir-executor/Runtime/API/ExecutableFlatbuffer.h"
#include "mlir-executor/Runtime/Backend/Lua/LuaRuntime.h"
#include "mlir-executor/Support/Allocators.h"
#include "mlir-executor/Support/Status.h"
#include "mlir/Support/FileUtilities.h"
#include "llvm/ADT/SmallVectorExtras.h"
Expand All @@ -48,6 +49,8 @@ DEFINE_C_API_PTR_METHODS(MTRT_RuntimeSession,
::mlirtrt::runtime::RuntimeSession)
DEFINE_C_API_PTR_METHODS(MTRT_RuntimeSessionOptions,
::mlirtrt::runtime::RuntimeSessionOptions)
DEFINE_C_API_PTR_METHODS(MTRT_GpuAllocator,
::mlirtrt::GpuAllocator)
DEFINE_C_API_PTR_METHODS(MTRT_Executable, ::mlirtrt::runtime::Executable)
DEFINE_C_API_PTR_METHODS(MTRT_Stream, MTRT_StreamImpl)
DEFINE_C_API_PTR_METHODS(MTRT_RuntimeValue, ::mlirtrt::runtime::RuntimeValue)
Expand Down Expand Up @@ -598,6 +601,55 @@ MTRT_ScalarValue mtrtRuntimeValueDynCastToScalar(MTRT_RuntimeValue v) {
return wrap(static_cast<ScalarValue *>(x));
}

//===----------------------------------------------------------------------===//
// MTRT_GpuAllocator
//===----------------------------------------------------------------------===//

/// Return true if the handle wraps no allocator object.
bool mtrtGpuAllocatorIsNull(MTRT_GpuAllocator gpuAllocator) {
  return gpuAllocator.ptr == nullptr;
}

MTRT_GpuAllocator mtrtGpuAllocatorGetNull() { return MTRT_GpuAllocator{nullptr}; }

/// Destroy the allocator owned by the given handle. (Parameter renamed from
/// `executable`, a copy-paste leftover, to `allocator`.)
MTRT_Status mtrtGpuAllocatorDestroy(MTRT_GpuAllocator allocator) {
  delete unwrap(allocator);
  return mtrtStatusGetOk();
}

/// Create a new allocator handle backed by a `StubAllocator` (whose methods
/// all fail until a real implementation replaces them, e.g. from Python).
/// The caller owns the result and must call `mtrtGpuAllocatorDestroy`.
MTRT_Status mtrtGpuAllocatorCreate(MTRT_GpuAllocator *allocator) {
  *allocator = MTRT_GpuAllocator{/*ptr=*/new StubAllocator()};
  return mtrtStatusGetOk();
}

/// Allocate `size` bytes with `alignment`/`flags` through the wrapped
/// allocator, optionally asynchronously on `stream`; stores the pointer in
/// `*memory` on success.
MTRT_Status mtrtGpuAllocatorAllocate(MTRT_GpuAllocator gpuAllocator,
                                     uint64_t size, uint64_t alignment,
                                     uint32_t flags, MTRT_Stream stream,
                                     void **memory) {
  GpuAllocator *cppGpuAllocator = unwrap(gpuAllocator);
  StatusOr<void *> status = cppGpuAllocator->allocate(
      size, alignment, flags,
      !mtrtStreamIsNull(stream) ? std::optional(unwrap(stream)->getRawStream())
                                : std::nullopt);
  // Propagate allocation failure to the caller; the original returned OK even
  // on error, leaving `*memory` unset.
  if (!status.isOk())
    return wrap(status.getStatus());
  *memory = *status;
  return mtrtStatusGetOk();
}

/// Deallocate `memory` through the wrapped allocator, optionally
/// asynchronously on `stream`; `*result` receives whether a deallocation was
/// actually performed.
MTRT_Status mtrtGpuAllocatorDeallocate(MTRT_GpuAllocator gpuAllocator,
                                       void *memory, MTRT_Stream stream,
                                       bool *result) {
  GpuAllocator *cppGpuAllocator = unwrap(gpuAllocator);
  StatusOr<bool> status = cppGpuAllocator->deallocate(
      memory, !mtrtStreamIsNull(stream)
                  ? std::optional(unwrap(stream)->getRawStream())
                  : std::nullopt);
  // Propagate deallocation failure to the caller; the original returned OK
  // even on error, leaving `*result` unset.
  if (!status.isOk())
    return wrap(status.getStatus());
  *result = *status;
  return mtrtStatusGetOk();
}

//===----------------------------------------------------------------------===//
// MTRT_RuntimeSessionOptions
//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -625,12 +677,14 @@ mtrtRuntimeSessionOptionsDestroy(MTRT_RuntimeSessionOptions options) {

MTRT_Status mtrtRuntimeSessionCreate(MTRT_RuntimeSessionOptions options,
MTRT_Executable executable,
MTRT_GpuAllocator gpuAllocator,
MTRT_RuntimeSession *result) {
RuntimeSessionOptions *cppOptions = unwrap(options);
Executable *cppExecutable = unwrap(executable);
GpuAllocator *cppGpuAllocator = unwrap(gpuAllocator);

StatusOr<std::unique_ptr<RuntimeSession>> session =
createRuntimeSessionWithLuaBackend(cppExecutable->getView(), *cppOptions);
createRuntimeSessionWithLuaBackend(cppExecutable->getView(), cppGpuAllocator, *cppOptions);
if (session.isError())
return wrap(session.getStatus());

Expand Down
7 changes: 4 additions & 3 deletions mlir-tensorrt/executor/lib/Runtime/API/API.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -349,16 +349,17 @@ RuntimeSessionOptions::createUsingSingleHostMpi() {
//===----------------------------------------------------------------------===//
// RuntimeSession
//===----------------------------------------------------------------------===//

/// Construct a session. The trackers are owned by the session; `gpuAllocator`
/// is a non-owning raw pointer (may be null) — presumably it must outlive the
/// session; TODO confirm the ownership contract at call sites.
/// (The rendered diff had a stale duplicate of the `resourceTracker`
/// initializer line; only the post-commit initializer list is kept.)
RuntimeSession::RuntimeSession(
    RuntimeSessionOptions options, ExecutableView exe, sol::state state,
    std::unique_ptr<PinnedMemoryAllocator> pinnedMemoryAllocator,
    std::unique_ptr<AllocTracker> allocTracker,
    std::unique_ptr<ResourceTracker> resourceTracker,
    GpuAllocator *gpuAllocator)
    : options(std::move(options)), executable(exe),
      pinnedMemoryAllocator(std::move(pinnedMemoryAllocator)),
      allocTracker(std::move(allocTracker)),
      resourceTracker(std::move(resourceTracker)), gpuAllocator(gpuAllocator),
      state(std::move(state)) {}

//===----------------------------------------------------------------------===//
// AllocTracker
Expand Down
Loading

0 comments on commit 7b9a0f7

Please sign in to comment.