From 7b9a0f752a17a9ae2d1fff1301027ebfc90bb1e5 Mon Sep 17 00:00:00 2001 From: Jhalak Patel Date: Tue, 13 Aug 2024 11:16:42 -0700 Subject: [PATCH] Add IGpuAllocator to MLIR-TensorRT --- mlir-tensorrt/README.md | 4 +- .../include/mlir-executor-c/Runtime/Runtime.h | 34 +++++++- .../include/mlir-executor/Runtime/API/API.h | 7 +- .../Runtime/Backend/Lua/LuaRegistration.h | 2 +- .../Runtime/Backend/Lua/LuaRuntime.h | 6 +- .../Lua/Modules/TensorRT/TensorRTModule.h | 2 +- .../mlir-executor/Support/Allocators.h | 73 +++++++++++++++++ .../executor/lib/CAPI/Runtime/Runtime.cpp | 56 ++++++++++++- .../executor/lib/Runtime/API/API.cpp | 7 +- .../lib/Runtime/Backend/Lua/LuaRuntime.cpp | 22 +++--- .../Lua/Modules/TensorRT/TensorRTModule.cpp | 79 +++++++++++++++++-- .../executor/lib/Support/Allocators.cpp | 55 +++++++++++++ .../executor/lib/Tools/ExecutorRunnerMain.cpp | 15 +++- .../python/bindings/Runtime/RuntimePyBind.cpp | 54 ++++++++++++- 14 files changed, 378 insertions(+), 38 deletions(-) diff --git a/mlir-tensorrt/README.md b/mlir-tensorrt/README.md index 3916be7b0..206d1e32e 100644 --- a/mlir-tensorrt/README.md +++ b/mlir-tensorrt/README.md @@ -23,7 +23,7 @@ We currently support only building on Linux x86 systems. We support building several different ways (only via CMake) depending on use-case. In each case, the LLVM-Project version that we are currently aligned to is -given in `build_tools/cmake/LLVMCommit.txt`. +given in `build_tools/cmake/LLVMCommit.cmake`. Note that currently we provide an LLVM patch which essentially cherry-picks the bug fixes from [this open MLIR PR](https://github.com/llvm/llvm-project/pull/91524). @@ -82,7 +82,7 @@ git clone https://github.com/llvm/llvm-project.git llvm-project # Checkout the right commit. Of course, you may try # a newer commit or your own modified LLVM-Project. cd llvm-project -git checkout $(cat build_tools/cmake/LLVMCommit.cmake | grep -Po '(?<=").*(?=")') +git checkout $(cat ../build_tools/cmake/LLVMCommit.cmake | grep -Po '(?<=").*(?=")') # Apply patch from llvm-project PR 91524 git apply ../build_tools/llvm-project.patch diff --git a/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h b/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h index 11ae93519..65377da35 100644 --- a/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h +++ b/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h @@ -319,6 +319,38 @@ mtrtScalarValueCastToRuntimeValue(MTRT_ScalarValue v); MLIR_CAPI_EXPORTED MTRT_Status mtrtScalarValueGetType(MTRT_ScalarValue scalar, MTRT_ScalarTypeCode *code); +//===----------------------------------------------------------------------===// +// MTRT_GpuAllocator +//===----------------------------------------------------------------------===// + +typedef struct MTRT_GpuAllocator { + void *ptr; +} MTRT_GpuAllocator; + +/// Checks nullity of `GpuAllocator`. +MTRT_CAPI_EXPORTED bool mtrtGpuAllocatorIsNull(MTRT_GpuAllocator gpuAllocator); + +/// Returns null `GpuAllocator`. 
+MTRT_CAPI_EXPORTED MTRT_GpuAllocator mtrtGpuAllocatorGetNull(); + +MTRT_CAPI_EXPORTED MTRT_Status +mtrtGpuAllocatorDestroy(MTRT_GpuAllocator executable); + +MTRT_CAPI_EXPORTED MTRT_Status +mtrtGpuAllocatorCreate(MTRT_GpuAllocator *allocator); + +//===----------------------------------------------------------------------===// +// MTRT_GpuAllocator +//===----------------------------------------------------------------------===// + +MTRT_CAPI_EXPORTED MTRT_Status mtrtGpuAllocatorAllocate( + MTRT_GpuAllocator gpuAllocator, uint64_t size, uint64_t alignment, + uint32_t flags, MTRT_Stream stream, void **memory); + +MTRT_CAPI_EXPORTED MTRT_Status +mtrtGpuAllocatorDeallocate(MTRT_GpuAllocator gpuAllocator, void *memory, + MTRT_Stream stream, bool *result); + //===----------------------------------------------------------------------===// // MTRT_RuntimeSessionOptions //===----------------------------------------------------------------------===// @@ -359,7 +391,7 @@ typedef struct MTRT_RuntimeSession { /// that the session only has a read-only view in to the Executable for code and /// constant data. Therefore the Executable must outlive the RuntimeSession. MLIR_CAPI_EXPORTED MTRT_Status mtrtRuntimeSessionCreate( - MTRT_RuntimeSessionOptions options, MTRT_Executable executable, + MTRT_RuntimeSessionOptions options, MTRT_Executable executable, MTRT_GpuAllocator allocator, MTRT_RuntimeSession *result); /// Destory the session. This does not destroy the associated Executable, which diff --git a/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h b/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h index d3672c149..b35d0b777 100644 --- a/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h +++ b/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h @@ -867,7 +867,8 @@ class RuntimeSession { sol::state state, std::unique_ptr pinnedMemoryAllocator, std::unique_ptr allocTracker, - std::unique_ptr resourceTracker); + std::unique_ptr resourceTracker, + GpuAllocator* gpuAllocator); ExecutableView getExecutable() const { return executable; } @@ -881,6 +882,8 @@ class RuntimeSession { ResourceTracker &getResourceTracker() { return *resourceTracker; } + GpuAllocator* getGpuAllocator() { return gpuAllocator; } + private: RuntimeSessionOptions options; ExecutableView executable; @@ -888,7 +891,7 @@ class RuntimeSession { std::unique_ptr pinnedMemoryAllocator; std::unique_ptr allocTracker; std::unique_ptr resourceTracker; - + GpuAllocator* gpuAllocator; sol::state state; }; diff --git a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRegistration.h b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRegistration.h index b5fed9c3d..922e964d4 100644 --- a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRegistration.h +++ b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRegistration.h @@ -37,6 +37,6 @@ void registerLuaRuntimeMethods(lua_State *state, const RuntimeSessionOptions &options, PinnedMemoryAllocator *pinnedMemoryAllocator, AllocTracker *allocTracker, - ResourceTracker *resourceTracker); + ResourceTracker *resourceTracker, GpuAllocator* allocator); } // namespace mlirtrt::runtime diff --git a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRuntime.h b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRuntime.h index f39eabd7b..0635e8cb4 100644 --- a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRuntime.h +++ 
b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRuntime.h
@@ -36,7 +36,7 @@ namespace mlirtrt::runtime {
 /// `main` function. It is assumed that `main` takes no arguments and returns an
 /// integer result (which is returned if the execution is successful).
 /// TODO: this should take a handle to a function for streaming output/errors.
-StatusOr<int64_t> runExecutorLuaScript(std::string_view luaScript);
+StatusOr<int64_t> runExecutorLuaScript(std::string_view luaScript, GpuAllocator* allocator);
 /// Synchronously run a serialized executor Executable one time. An `Executable`
 /// is essentially a Lua script packaged with metadata and serialized constants
 /// (e.g. weights data)
@@ -48,12 +48,12 @@ StatusOr<int64_t> runExecutorLuaScript(std::string_view luaScript);
 /// execution is successful).
 /// TODO: this should take a handle to a function for
 /// streaming output/errors.
-StatusOr<int64_t> runExecutorExecutable(std::unique_ptr<Executable> executable);
+StatusOr<int64_t> runExecutorExecutable(std::unique_ptr<Executable> executable, GpuAllocator* allocator);

 /// Create an execution state. This will setup a Lua environment and invoke
 /// global initialization.
 StatusOr<std::unique_ptr<RuntimeSession>>
-createRuntimeSessionWithLuaBackend(ExecutableView executable,
+createRuntimeSessionWithLuaBackend(ExecutableView executable, GpuAllocator* allocator,
                                    const RuntimeSessionOptions &options);

 /// Set the primary stream for the loaded executable to use.
diff --git a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.h b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.h
index 37d8de629..1ceb91367 100644
--- a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.h
+++ b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.h
@@ -37,7 +37,7 @@ class ResourceTracker;
 /// Lua state.
 void registerExecutorTensorRTModuleLuaRuntimeMethods(
     lua_State *luaState, PinnedMemoryAllocator *pinnedMemoryAllocator,
-    AllocTracker *allocTracker, ResourceTracker *resourceTracker);
+    AllocTracker *allocTracker, ResourceTracker *resourceTracker, GpuAllocator* allocator);

 } // namespace mlirtrt::runtime

diff --git a/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h b/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h
index 180dbf09e..66f54fb46 100644
--- a/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h
+++ b/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h
@@ -32,6 +32,79 @@ namespace mlirtrt {

 struct EventPool;

+// Abstract allocator to be implemented by consumers.
+using AllocatorFlags = uint32_t;
+
+class GpuAllocator {
+public:
+  GpuAllocator() = default;
+  virtual ~GpuAllocator() = default;
+
+  virtual StatusOr<void *> reallocate(void *baseAddr, uint64_t alignment,
+                                      uint64_t newSize,
+                                      std::optional<cudaStream_t> stream) = 0;
+
+  virtual StatusOr<void *> allocate(uint64_t const size,
+                                    uint64_t const alignment,
+                                    AllocatorFlags const flags,
+                                    std::optional<cudaStream_t> stream) = 0;
+
+  virtual StatusOr<bool> deallocate(void *const memory,
+                                    std::optional<cudaStream_t> stream) = 0;
+
+protected:
+  GpuAllocator(GpuAllocator const &) = delete;
+  GpuAllocator(GpuAllocator &&) = delete;
+  GpuAllocator &operator=(GpuAllocator const &) & = delete;
+  GpuAllocator &operator=(GpuAllocator &&) & = delete;
+};
+
+class StubAllocator : public GpuAllocator {
+public:
+  StubAllocator() = default;
+  ~StubAllocator() = default;
+
+  StatusOr<void *> reallocate(void *baseAddr, uint64_t alignment,
+                              uint64_t newSize,
+                              std::optional<cudaStream_t> stream) override {
+    return getStatusWithMsg(
+        StatusCode::InternalError,
+        "[StubAllocator][reallocate]: Must be overridden in Python");
+  }
+
+  StatusOr<void *> allocate(uint64_t const size, uint64_t const alignment,
+                            AllocatorFlags const flags,
+                            std::optional<cudaStream_t> stream) override {
+    return getStatusWithMsg(
+        StatusCode::InternalError,
+        "[StubAllocator][allocate]: Must be overridden in Python");
+  }
+
+  StatusOr<bool> deallocate(void *const memory,
+                            std::optional<cudaStream_t> stream) override {
+    return getStatusWithMsg(
+        StatusCode::InternalError,
+        "[StubAllocator][deallocate]: Must be overridden in Python");
+  }
+};
+
+class CustomTensorRTAllocator : public GpuAllocator {
+public:
+  CustomTensorRTAllocator() = default;
+  ~CustomTensorRTAllocator() = default;
+
+  StatusOr<void *> reallocate(void *baseAddr, uint64_t alignment,
+                              uint64_t newSize,
+                              std::optional<cudaStream_t> stream) override;
+
+  StatusOr<void *> allocate(uint64_t const size, uint64_t const alignment,
+                            AllocatorFlags const flags,
+                            std::optional<cudaStream_t> stream) override;
+
+  StatusOr<bool> deallocate(void *const memory,
+                            std::optional<cudaStream_t> stream) override;
+};
+
 //===----------------------------------------------------------------------===//
 // PoolTrackedCudaEvent
 //===----------------------------------------------------------------------===//
diff --git a/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp b/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp
index 2b0fbc35c..8d9b16e64 100644
--- a/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp
+++ b/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp
@@ -27,6 +27,7 @@
 #include "mlir-executor/Runtime/API/API.h"
 #include "mlir-executor/Runtime/API/ExecutableFlatbuffer.h"
 #include "mlir-executor/Runtime/Backend/Lua/LuaRuntime.h"
+#include "mlir-executor/Support/Allocators.h"
 #include "mlir-executor/Support/Status.h"
 #include "mlir/Support/FileUtilities.h"
 #include "llvm/ADT/SmallVectorExtras.h"
@@ -48,6 +49,8 @@ DEFINE_C_API_PTR_METHODS(MTRT_RuntimeSession,
                          ::mlirtrt::runtime::RuntimeSession)
 DEFINE_C_API_PTR_METHODS(MTRT_RuntimeSessionOptions,
                          ::mlirtrt::runtime::RuntimeSessionOptions)
+DEFINE_C_API_PTR_METHODS(MTRT_GpuAllocator,
+                         ::mlirtrt::GpuAllocator)
 DEFINE_C_API_PTR_METHODS(MTRT_Executable, ::mlirtrt::runtime::Executable)
 DEFINE_C_API_PTR_METHODS(MTRT_Stream, MTRT_StreamImpl)
 DEFINE_C_API_PTR_METHODS(MTRT_RuntimeValue, ::mlirtrt::runtime::RuntimeValue)
@@ -598,6 +601,55 @@ MTRT_ScalarValue mtrtRuntimeValueDynCastToScalar(MTRT_RuntimeValue v) {
   return wrap(static_cast<ScalarValue *>(x));
 }

+//===----------------------------------------------------------------------===//
+// MTRT_GpuAllocator
+//===----------------------------------------------------------------------===//
+
+bool mtrtGpuAllocatorIsNull(MTRT_GpuAllocator gpuAllocator) {
+  return !gpuAllocator.ptr;
+}
+
+MTRT_GpuAllocator mtrtGpuAllocatorGetNull() { return MTRT_GpuAllocator{nullptr}; }
+
+MTRT_Status mtrtGpuAllocatorDestroy(MTRT_GpuAllocator allocator) {
+  delete unwrap(allocator);
+  return mtrtStatusGetOk();
+}
+
+MTRT_Status mtrtGpuAllocatorCreate(MTRT_GpuAllocator *allocator) {
+  *allocator = MTRT_GpuAllocator{/*ptr=*/new StubAllocator()};
+  return mtrtStatusGetOk();
+}
+
+MTRT_Status mtrtGpuAllocatorAllocate(MTRT_GpuAllocator gpuAllocator,
+                                     uint64_t size, uint64_t alignment,
+                                     uint32_t flags, MTRT_Stream stream,
+                                     void **memory) {
+  GpuAllocator *cppGpuAllocator = unwrap(gpuAllocator);
+  StatusOr<void *> status = cppGpuAllocator->allocate(
+      size, alignment, flags,
+      !mtrtStreamIsNull(stream) ? std::optional(unwrap(stream)->getRawStream())
+                                : std::nullopt);
+  if (!status.isOk())
+    return wrap(status.getStatus());
+  *memory = *status;
+  return mtrtStatusGetOk();
+}
+
+MTRT_Status mtrtGpuAllocatorDeallocate(MTRT_GpuAllocator gpuAllocator,
+                                       void *memory, MTRT_Stream stream,
+                                       bool *result) {
+  GpuAllocator *cppGpuAllocator = unwrap(gpuAllocator);
+  StatusOr<bool> status = cppGpuAllocator->deallocate(
+      memory, !mtrtStreamIsNull(stream)
+                  ? std::optional(unwrap(stream)->getRawStream())
+                  : std::nullopt);
+  if (!status.isOk())
+    return wrap(status.getStatus());
+  *result = *status;
+  return mtrtStatusGetOk();
+}
+
 //===----------------------------------------------------------------------===//
 // MTRT_RuntimeSessionOptions
 //===----------------------------------------------------------------------===//
@@ -625,12 +677,14 @@ mtrtRuntimeSessionOptionsDestroy(MTRT_RuntimeSessionOptions options) {
 MTRT_Status mtrtRuntimeSessionCreate(MTRT_RuntimeSessionOptions options,
                                      MTRT_Executable executable,
+                                     MTRT_GpuAllocator gpuAllocator,
                                      MTRT_RuntimeSession *result) {
   RuntimeSessionOptions *cppOptions = unwrap(options);
   Executable *cppExecutable = unwrap(executable);
+  GpuAllocator *cppGpuAllocator = unwrap(gpuAllocator);

   StatusOr<std::unique_ptr<RuntimeSession>> session =
-      createRuntimeSessionWithLuaBackend(cppExecutable->getView(), *cppOptions);
+      createRuntimeSessionWithLuaBackend(cppExecutable->getView(), cppGpuAllocator, *cppOptions);
   if (session.isError())
     return wrap(session.getStatus());

diff --git a/mlir-tensorrt/executor/lib/Runtime/API/API.cpp b/mlir-tensorrt/executor/lib/Runtime/API/API.cpp
index e1243cf78..cb185f5f7 100644
--- a/mlir-tensorrt/executor/lib/Runtime/API/API.cpp
+++ b/mlir-tensorrt/executor/lib/Runtime/API/API.cpp
@@ -349,16 +349,17 @@ RuntimeSessionOptions::createUsingSingleHostMpi() {
 //===----------------------------------------------------------------------===//
 // RuntimeSession
 //===----------------------------------------------------------------------===//
-
 RuntimeSession::RuntimeSession(
     RuntimeSessionOptions options, ExecutableView exe, sol::state state,
     std::unique_ptr<PinnedMemoryAllocator> pinnedMemoryAllocator,
     std::unique_ptr<AllocTracker> allocTracker,
-    std::unique_ptr<ResourceTracker> resourceTracker)
+    std::unique_ptr<ResourceTracker> resourceTracker,
+    GpuAllocator *gpuAllocator)
     : options(std::move(options)), executable(exe),
       pinnedMemoryAllocator(std::move(pinnedMemoryAllocator)),
       allocTracker(std::move(allocTracker)),
-      resourceTracker(std::move(resourceTracker)), state(std::move(state)) {}
+      resourceTracker(std::move(resourceTracker)), gpuAllocator(gpuAllocator),
+      state(std::move(state)) {}

 //===----------------------------------------------------------------------===//
 // AllocTracker
diff --git
a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp index 7596c9da7..b19541977 100644 --- a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp +++ b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp @@ -72,7 +72,7 @@ static void registerDefaultDeviceDependentMethods(lua_State *state, static void registerLuaRuntimeMethodsCommon( lua_State *state, PinnedMemoryAllocator *pinnedMemoryAllocator, - AllocTracker *allocTracker, ResourceTracker *resourceTracker) { + AllocTracker *allocTracker, ResourceTracker *resourceTracker, GpuAllocator* allocator) { registerExecutorCoreModuleLuaRuntimeMethods(state, pinnedMemoryAllocator, allocTracker); registerExecutorCUDAModuleLuaRuntimeMethods( @@ -84,15 +84,15 @@ static void registerLuaRuntimeMethodsCommon( #endif registerExecutorTensorRTModuleLuaRuntimeMethods( - state, pinnedMemoryAllocator, allocTracker, resourceTracker); + state, pinnedMemoryAllocator, allocTracker, resourceTracker, allocator); } void mlirtrt::runtime::registerLuaRuntimeMethods( lua_State *state, const RuntimeSessionOptions &options, PinnedMemoryAllocator *pinnedMemoryAllocator, AllocTracker *allocTracker, - ResourceTracker *resourceTracker) { + ResourceTracker *resourceTracker, GpuAllocator* allocator) { registerLuaRuntimeMethodsCommon(state, pinnedMemoryAllocator, allocTracker, - resourceTracker); + resourceTracker, allocator); #ifdef MLIR_EXECUTOR_ENABLE_NCCL registerExecutorNCCLModuleLuaRuntimeMethods(state, resourceTracker); registerDeviceDependentNCCLMethods(state, options.getNumDevices(), @@ -108,7 +108,7 @@ void mlirtrt::runtime::registerLuaRuntimeMethods( } StatusOr -mlirtrt::runtime::runExecutorLuaScript(std::string_view luaScript) { +mlirtrt::runtime::runExecutorLuaScript(std::string_view luaScript, GpuAllocator* allocator) { ADD_RUNTIME_MODULE_RANGE("runtime_runExecutorLuaScript"); StatusOr> client = RuntimeClient::create(); @@ -120,7 +120,7 @@ mlirtrt::runtime::runExecutorLuaScript(std::string_view luaScript) { registerLuaRuntimeMethods(lua.lua_state(), RuntimeSessionOptions(), &(*client)->getPinnedMemorAllocator(), &(*client)->getAllocTracker(), - &(*client)->getResourceTracker()); + &(*client)->getResourceTracker(), allocator); sol::protected_function_result result = lua.script(luaScript); if (!result.valid()) { @@ -171,7 +171,7 @@ static Status maybeCheckForValidNcclUuid(const RuntimeSessionOptions &options) { /// global initialization. StatusOr> mlirtrt::runtime::createRuntimeSessionWithLuaBackend( - ExecutableView executable, const RuntimeSessionOptions &options) { + ExecutableView executable, GpuAllocator* allocator, const RuntimeSessionOptions &options) { ADD_RUNTIME_MODULE_RANGE("runtime_loadExecutable"); MTRT_RETURN_IF_ERROR(maybeCheckForValidNcclUuid(options)); @@ -184,7 +184,7 @@ mlirtrt::runtime::createRuntimeSessionWithLuaBackend( lua.open_libraries(sol::lib::base, sol::lib::string); registerLuaRuntimeMethods(lua.lua_state(), options, pinnedMemoryAllocator.get(), allocTracker.get(), - resourceTracker.get()); + resourceTracker.get(), allocator); // Load globals into the context. // TODO: eliminate this copy, we already own the executable. 
@@ -225,11 +225,11 @@ mlirtrt::runtime::createRuntimeSessionWithLuaBackend(
   }
   return std::make_unique<RuntimeSession>(
       options, executable, std::move(lua), std::move(pinnedMemoryAllocator),
-      std::move(allocTracker), std::move(resourceTracker));
+      std::move(allocTracker), std::move(resourceTracker), allocator);
 }

 StatusOr<int64_t> mlirtrt::runtime::runExecutorExecutable(
-    std::unique_ptr<Executable> executable) {
+    std::unique_ptr<Executable> executable, GpuAllocator* allocator) {

   StatusOr<std::unique_ptr<RuntimeClient>> client = RuntimeClient::create();
   if (!client.isOk())
@@ -245,7 +245,7 @@ StatusOr<int64_t> mlirtrt::runtime::runExecutorExecutable(
     return options.getStatus();

   StatusOr<std::unique_ptr<RuntimeSession>> session =
-      createRuntimeSessionWithLuaBackend(executable->getView(), *options);
+      createRuntimeSessionWithLuaBackend(executable->getView(), allocator, *options);
   if (!session.isOk())
     return session.getStatus();

diff --git a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp
index 1b96eac44..bac6942a1 100644
--- a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp
+++ b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp
@@ -64,6 +64,43 @@ class StdioLogger : public nvinfer1::ILogger {
   bool verbose;
 };

+//===----------------------------------------------------------------------===//
+// TensorRTCallBackAllocator
+//===----------------------------------------------------------------------===//
+
+class TensorRTCallBackAllocator final : public nvinfer1::IGpuAllocator {
+public:
+  TensorRTCallBackAllocator(GpuAllocator *gpuAllocator)
+      : nvinfer1::IGpuAllocator(), mGpuAllocatorCallBack(gpuAllocator) {}
+
+  void *allocate(uint64_t size, uint64_t alignment,
+                 nvinfer1::AllocatorFlags flags) noexcept final {
+    return allocateAsync(size, alignment, flags, nullptr);
+  }
+
+  bool deallocate(void *memory) noexcept final {
+    return deallocateAsync(memory, nullptr);
+  }
+
+  void *allocateAsync(uint64_t const size, uint64_t const alignment,
+                      uint32_t flags, cudaStream_t stream) noexcept final {
+    StatusOr<void *> status =
+        mGpuAllocatorCallBack->allocate(size, alignment, flags, stream);
+    assert(status.isOk());
+    return *status;
+  }
+
+  bool deallocateAsync(void *const memory,
+                       cudaStream_t stream) noexcept override {
+    StatusOr<bool> status = mGpuAllocatorCallBack->deallocate(memory, stream);
+    assert(status.isOk());
+    return *status;
+  }
+
+private:
+  GpuAllocator *mGpuAllocatorCallBack;
+};
+
 } // namespace

 static StdioLogger logger(/*verbose=*/false);
@@ -88,13 +125,38 @@ struct Signature {
   }
 };

+class NvInferRuntimeWrapper {
+public:
+  explicit NvInferRuntimeWrapper(GpuAllocator* gpuAllocator) {
+    runtime = std::shared_ptr<nvinfer1::IRuntime>(
+        nvinfer1::createInferRuntime(logger), [](nvinfer1::IRuntime *runtime) {
+          MTRT_DBGF("freeing tensorrt runtime at %lu",
+                    reinterpret_cast<uintptr_t>(runtime));
+          delete runtime;
+        });
+    // GpuAllocator is optional.
+ if (gpuAllocator) { + callbackAllocator = std::shared_ptr( + new TensorRTCallBackAllocator(gpuAllocator)); + runtime->setGpuAllocator(callbackAllocator.get()); + } + } + + nvinfer1::IRuntime *operator*() { return runtime.get(); } + nvinfer1::IRuntime *operator->() { return runtime.get(); } + + std::shared_ptr runtime; + std::shared_ptr callbackAllocator; +}; + class NvInferEngineWrapper { public: - explicit NvInferEngineWrapper(std::shared_ptr &runtime, + explicit NvInferEngineWrapper(std::shared_ptr runtime, uintptr_t pointer, size_t size) : runtime(runtime) { engine = std::shared_ptr( - runtime->deserializeCudaEngine(reinterpret_cast(pointer), size), + runtime->runtime->deserializeCudaEngine( + reinterpret_cast(pointer), size), [](nvinfer1::ICudaEngine *engine) { MTRT_DBGF("freeing cuda engine at %lu", reinterpret_cast(engine)); @@ -105,7 +167,7 @@ class NvInferEngineWrapper { nvinfer1::ICudaEngine *operator*() { return engine.get(); } nvinfer1::ICudaEngine *operator->() { return engine.get(); } - std::shared_ptr runtime; + std::shared_ptr runtime; std::shared_ptr engine; }; @@ -375,19 +437,20 @@ static Status enqueueV3Wrapper(AllocTracker &tracker, //===----------------------------------------------------------------------===// void mlirtrt::runtime::registerExecutorTensorRTModuleLuaRuntimeMethods( lua_State *luaState, PinnedMemoryAllocator *pinnedMemoryAllocator, - AllocTracker *allocTracker, ResourceTracker *resourceTracker) { + AllocTracker *allocTracker, ResourceTracker *resourceTracker, GpuAllocator* allocator) { sol::state_view lua(luaState); - lua["_trtrt_create_runtime"] = [](sol::this_state state) { + lua["_trtrt_create_runtime"] = + [allocator](sol::this_state state) -> std::shared_ptr { ADD_TENSORRT_MODULE_RANGE("trtrt_create_runtime"); MTRT_DBGF("%s", "creating nvinfer runtime"); - return std::shared_ptr( - nvinfer1::createInferRuntime(logger)); + return std::make_shared(allocator); }; lua["_trtrt_load"] = [allocTracker]( - sol::this_state state, std::shared_ptr &runtime, + sol::this_state state, + std::shared_ptr &runtime, uintptr_t pointer) -> std::shared_ptr { ADD_TENSORRT_MODULE_RANGE("trtrt_load"); const AllocTracker &tracker = *allocTracker; diff --git a/mlir-tensorrt/executor/lib/Support/Allocators.cpp b/mlir-tensorrt/executor/lib/Support/Allocators.cpp index 50dd7a5d8..1fa34325d 100644 --- a/mlir-tensorrt/executor/lib/Support/Allocators.cpp +++ b/mlir-tensorrt/executor/lib/Support/Allocators.cpp @@ -22,11 +22,15 @@ /// //===----------------------------------------------------------------------===// #include "mlir-executor/Support/Allocators.h" +#include "mlir-executor/Support/Status.h" +#include "mlir-executor/Runtime/Support/Support.h" +#include "cuda_runtime_api.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" +#include #include #include #include @@ -38,6 +42,57 @@ using namespace mlirtrt; DEBUG_WITH_TYPE("allocators", fprintf(stderr, "%s:%d " fmt "\n", __FILE__, \ __LINE__, __VA_ARGS__)) +//===----------------------------------------------------------------------===// +// CustomTensorRTAllocator +//===----------------------------------------------------------------------===// + +StatusOr +CustomTensorRTAllocator::allocate(uint64_t const size, uint64_t const alignment, + uint32_t /*flags*/, + std::optional stream) { + uint8_t *memory; + assert(alignment > 0 && (alignment & (alignment - 1)) == 0 && + "Memory alignment has to be power of 2"); + if 
(stream && *stream != nullptr) {
+    RETURN_ERROR_IF_CUDART_ERROR(
+        cudaMallocAsync(reinterpret_cast<void **>(&memory), size, *stream));
+    MTRT_DBGF("[CustomTensorRTAllocator][allocate]: Asynchronously allocated %lx bytes at 0x%lx on stream %lx", size,
+              reinterpret_cast<uintptr_t>(memory),
+              reinterpret_cast<uintptr_t>(*stream));
+  } else {
+    RETURN_ERROR_IF_CUDART_ERROR(
+        cudaMalloc(reinterpret_cast<void **>(&memory), size));
+    MTRT_DBGF("[CustomTensorRTAllocator][allocate]: Synchronously allocated %lx bytes at 0x%lx", size,
+              reinterpret_cast<uintptr_t>(memory));
+  }
+  assert(reinterpret_cast<uintptr_t>(memory) % alignment == 0);
+  return memory;
+}
+
+StatusOr<bool>
+CustomTensorRTAllocator::deallocate(void *const memory,
+                                    std::optional<cudaStream_t> stream) {
+  if (stream && *stream != nullptr) {
+    MTRT_DBGF("[CustomTensorRTAllocator][deallocate]: Asynchronously freeing CUDA device memory 0x%lx on stream %lx",
+              reinterpret_cast<uintptr_t>(memory),
+              reinterpret_cast<uintptr_t>(*stream));
+    RETURN_ERROR_IF_CUDART_ERROR(cudaFreeAsync(memory, *stream));
+  } else {
+    MTRT_DBGF("[CustomTensorRTAllocator][deallocate]: Synchronously freeing CUDA device/pinned host memory 0x%lx",
+              reinterpret_cast<uintptr_t>(memory));
+    RETURN_ERROR_IF_CUDART_ERROR(cudaFree(memory));
+  }
+  return true;
+}
+
+StatusOr<void *> CustomTensorRTAllocator::reallocate(
+    void * /* baseAddr */, uint64_t /* alignment */, uint64_t /* newSize */,
+    std::optional<cudaStream_t> /* stream */) {
+  return nullptr;
+}
+
 //===----------------------------------------------------------------------===//
 // PoolTrackedCudaEvent
 //===----------------------------------------------------------------------===//
diff --git a/mlir-tensorrt/executor/lib/Tools/ExecutorRunnerMain.cpp b/mlir-tensorrt/executor/lib/Tools/ExecutorRunnerMain.cpp
index 3241de0da..c8ab9269b 100644
--- a/mlir-tensorrt/executor/lib/Tools/ExecutorRunnerMain.cpp
+++ b/mlir-tensorrt/executor/lib/Tools/ExecutorRunnerMain.cpp
@@ -98,7 +98,12 @@ struct Options {
       cl::values(clEnumValN(Lua, "lua", "interpret the input as Lua code")),
       cl::values(clEnumValN(ExecutorRuntimeExecutable, "rtexe",
                             "load the input file as an Executor executable"))};
+
+  cl::opt<bool> useCustomAllocator{"use-custom-allocator",
+                                   cl::desc("Use custom allocator"),
+                                   cl::init(false)};
 };
+
 } // namespace

 LogicalResult
@@ -168,13 +173,19 @@ executor::ExecutorRunnerMain(int argc, char **argv,
   if (result != cudaSuccess)
     return emitError(loc) << "cudaFree failed: " << cudaGetErrorString(result);

+  std::unique_ptr<mlirtrt::GpuAllocator> allocator{nullptr};
+  if (options.useCustomAllocator) {
+    // Create an optional runtime GPU allocator.
+    allocator.reset(new CustomTensorRTAllocator());
+  }
+
   // Read the buffer as a Lua script and execute.
if (options.inputType == Lua) { assert(!options.dumpFunctionSignature && "Can not dump function signature for Lua input type."); mlirtrt::StatusOr result = - mlirtrt::runtime::runExecutorLuaScript(input->getBuffer()); + mlirtrt::runtime::runExecutorLuaScript(input->getBuffer(), allocator.get()); if (!result.isOk()) return emitError(UnknownLoc::get(&context)) << result.getString(); return success(*result == 0); @@ -202,7 +213,7 @@ executor::ExecutorRunnerMain(int argc, char **argv, } mlirtrt::StatusOr executionResult = - mlirtrt::runtime::runExecutorExecutable(std::move(*executable)); + mlirtrt::runtime::runExecutorExecutable(std::move(*executable), allocator.get()); if (!executionResult.isOk()) return emitError(UnknownLoc::get(&context)) << "failed to load and run executable: " diff --git a/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp b/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp index fef5ad868..e6e6bdda5 100644 --- a/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp +++ b/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp @@ -184,6 +184,16 @@ class PyRuntimeValue : public PyMTRTWrapper { mtrtPythonCapsuleToRuntimeValue, mtrtPythonRuntimeValueToCapsule}; }; +/// Python object type wrapper for `MTRT_GpuAllocator`. +class PyGpuAllocator : public PyMTRTWrapper { +public: + using Base::Base; + DECLARE_WRAPPER_CONSTRUCTORS(PyGpuAllocator); + + static constexpr auto kMethodTable = CAPITable{ + mtrtGpuAllocatorIsNull, mtrtGpuAllocatorDestroy}; +}; + /// Python object type wrapper for `MTRT_StableHLOToExecutableOptions`. class PyRuntimeSessionOptions : public PyMTRTWrapper(m, "GpuAllocator", py::module_local()) + .def(py::init<>([]() -> PyGpuAllocator * { + MTRT_GpuAllocator allocator; + MTRT_Status s = mtrtGpuAllocatorCreate(&allocator); + THROW_IF_MTRT_ERROR(s); + return new PyGpuAllocator(allocator); + })) + .def( + "allocate", + [](PyGpuAllocator &self, uint64_t size, uint64_t alignment, + std::optional flags, std::optional stream) { + void *memory{nullptr}; + MTRT_Status s = mtrtGpuAllocatorAllocate( + self, size, alignment, flags ? *flags : 0, + stream ? *stream : mtrtStreamGetNull(), &memory); + THROW_IF_MTRT_ERROR(s); + // Add changes to ensure memory is not released prematurely. + return memory; + }, + py::arg("size"), py::arg("alignment"), py::arg("flags") = py::none(), + py::arg("stream") = py::none()) + .def( + "deallocate", + [](PyGpuAllocator &self, void *memory, + std::optional stream) { + bool result; + MTRT_Status s = mtrtGpuAllocatorDeallocate( + self, memory, stream ? *stream : mtrtStreamGetNull(), &result); + THROW_IF_MTRT_ERROR(s); + // Add changes to ensure memory is not released prematurely. + return result; + }, + py::arg("memory"), py::arg("stream") = py::none()); + py::class_(m, "RuntimeSession", py::module_local()) - .def(py::init<>([](PyRuntimeSessionOptions &options, PyExecutable &exe) { + .def(py::init<>([](PyRuntimeSessionOptions &options, PyExecutable &exe, + std::optional allocator) { MTRT_RuntimeSession session; - MTRT_Status s = mtrtRuntimeSessionCreate(options, exe, &session); + MTRT_Status s = mtrtRuntimeSessionCreate( + options, exe, + allocator ? *allocator : mtrtGpuAllocatorGetNull(), &session); THROW_IF_MTRT_ERROR(s); return new PyRuntimeSession(session); }), - py::arg("options"), py::arg("executable")) + py::arg("options"), py::arg("executable"), + py::arg("gpu_allocator") = py::none()) .def( "execute_function", [](PyRuntimeSession &self, std::string name,
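
For reviewers, below is a minimal consumer-side sketch of the `GpuAllocator` extension point this patch introduces: a wrapper that counts the bytes requested while delegating the actual CUDA work to the `CustomTensorRTAllocator` added above. It is illustrative only and not part of the patch; the `TrackingGpuAllocator` name and the byte counter are hypothetical, while the `GpuAllocator` interface, `AllocatorFlags`, `StatusOr`, and `CustomTensorRTAllocator` come from the headers in this change.

// Illustrative sketch (not part of this patch). Assumes the Allocators.h
// interface added above; TrackingGpuAllocator and its counter are made up.
#include <atomic>
#include <cstdint>
#include <optional>

#include "cuda_runtime_api.h"
#include "mlir-executor/Support/Allocators.h"
#include "mlir-executor/Support/Status.h"

namespace {
class TrackingGpuAllocator : public mlirtrt::GpuAllocator {
public:
  mlirtrt::StatusOr<void *>
  allocate(uint64_t const size, uint64_t const alignment,
           mlirtrt::AllocatorFlags const flags,
           std::optional<cudaStream_t> stream) override {
    // Delegate to the allocator from this patch and record successful requests.
    mlirtrt::StatusOr<void *> result =
        impl.allocate(size, alignment, flags, stream);
    if (result.isOk())
      totalRequestedBytes += size;
    return result;
  }

  mlirtrt::StatusOr<bool>
  deallocate(void *const memory,
             std::optional<cudaStream_t> stream) override {
    return impl.deallocate(memory, stream);
  }

  mlirtrt::StatusOr<void *>
  reallocate(void *baseAddr, uint64_t alignment, uint64_t newSize,
             std::optional<cudaStream_t> stream) override {
    return impl.reallocate(baseAddr, alignment, newSize, stream);
  }

  uint64_t getTotalRequestedBytes() const { return totalRequestedBytes; }

private:
  // Delegate that performs the actual cudaMalloc/cudaMallocAsync calls.
  mlirtrt::CustomTensorRTAllocator impl;
  std::atomic<uint64_t> totalRequestedBytes{0};
};
} // namespace

An instance of such a class can be handed to any of the new `GpuAllocator*` parameters threaded through this patch, e.g. `createRuntimeSessionWithLuaBackend` (or `mtrtRuntimeSessionCreate` from the C API), in the same way `CustomTensorRTAllocator` is wired up in `ExecutorRunnerMain.cpp` above.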