From 99c3080c54586147bb2df6166d53650c445e068b Mon Sep 17 00:00:00 2001 From: Jhalak Patel Date: Tue, 13 Aug 2024 11:16:42 -0700 Subject: [PATCH 1/6] Add IGpuAllocator to MLIR-TensorRT --- mlir-tensorrt/README.md | 4 +- .../include/mlir-executor-c/Runtime/Runtime.h | 34 +++++++- .../include/mlir-executor/Runtime/API/API.h | 7 +- .../Runtime/Backend/Lua/LuaRegistration.h | 2 +- .../Runtime/Backend/Lua/LuaRuntime.h | 6 +- .../Lua/Modules/TensorRT/TensorRTModule.h | 2 +- .../mlir-executor/Support/Allocators.h | 73 +++++++++++++++++ .../executor/lib/CAPI/Runtime/Runtime.cpp | 56 ++++++++++++- .../executor/lib/Runtime/API/API.cpp | 7 +- .../lib/Runtime/Backend/Lua/LuaRuntime.cpp | 22 +++--- .../Lua/Modules/TensorRT/TensorRTModule.cpp | 79 +++++++++++++++++-- .../executor/lib/Support/Allocators.cpp | 54 +++++++++++++ .../executor/lib/Tools/ExecutorRunnerMain.cpp | 15 +++- .../python/bindings/Runtime/RuntimePyBind.cpp | 54 ++++++++++++- 14 files changed, 377 insertions(+), 38 deletions(-) diff --git a/mlir-tensorrt/README.md b/mlir-tensorrt/README.md index 3916be7b0..206d1e32e 100644 --- a/mlir-tensorrt/README.md +++ b/mlir-tensorrt/README.md @@ -23,7 +23,7 @@ We currently support only building on Linux x86 systems. We support building several different ways (only via CMake) depending on use-case. In each case, the LLVM-Project version that we are currently aligned to is -given in `build_tools/cmake/LLVMCommit.txt`. +given in `build_tools/cmake/LLVMCommit.cmake`. Note that currently we provide an LLVM patch which essentially cherry-picks the bug fixes from [this open MLIR PR](https://github.com/llvm/llvm-project/pull/91524). @@ -82,7 +82,7 @@ git clone https://github.com/llvm/llvm-project.git llvm-project # Checkout the right commit. Of course, you may try # a newer commit or your own modified LLVM-Project. cd llvm-project -git checkout $(cat build_tools/cmake/LLVMCommit.cmake | grep -Po '(?<=").*(?=")') +git checkout $(cat ../build_tools/cmake/LLVMCommit.cmake | grep -Po '(?<=").*(?=")') # Apply patch from llvm-project PR 91524 git apply ../build_tools/llvm-project.patch diff --git a/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h b/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h index 11ae93519..65377da35 100644 --- a/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h +++ b/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h @@ -319,6 +319,38 @@ mtrtScalarValueCastToRuntimeValue(MTRT_ScalarValue v); MLIR_CAPI_EXPORTED MTRT_Status mtrtScalarValueGetType(MTRT_ScalarValue scalar, MTRT_ScalarTypeCode *code); +//===----------------------------------------------------------------------===// +// MTRT_GpuAllocator +//===----------------------------------------------------------------------===// + +typedef struct MTRT_GpuAllocator { + void *ptr; +} MTRT_GpuAllocator; + +/// Checks nullity of `GpuAllocator`. +MTRT_CAPI_EXPORTED bool mtrtGpuAllocatorIsNull(MTRT_GpuAllocator gpuAllocator); + +/// Returns null `GpuAllocator`. 
+MTRT_CAPI_EXPORTED MTRT_GpuAllocator mtrtGpuAllocatorGetNull(); + +MTRT_CAPI_EXPORTED MTRT_Status +mtrtGpuAllocatorDestroy(MTRT_GpuAllocator executable); + +MTRT_CAPI_EXPORTED MTRT_Status +mtrtGpuAllocatorCreate(MTRT_GpuAllocator *allocator); + +//===----------------------------------------------------------------------===// +// MTRT_GpuAllocator +//===----------------------------------------------------------------------===// + +MTRT_CAPI_EXPORTED MTRT_Status mtrtGpuAllocatorAllocate( + MTRT_GpuAllocator gpuAllocator, uint64_t size, uint64_t alignment, + uint32_t flags, MTRT_Stream stream, void **memory); + +MTRT_CAPI_EXPORTED MTRT_Status +mtrtGpuAllocatorDeallocate(MTRT_GpuAllocator gpuAllocator, void *memory, + MTRT_Stream stream, bool *result); + //===----------------------------------------------------------------------===// // MTRT_RuntimeSessionOptions //===----------------------------------------------------------------------===// @@ -359,7 +391,7 @@ typedef struct MTRT_RuntimeSession { /// that the session only has a read-only view in to the Executable for code and /// constant data. Therefore the Executable must outlive the RuntimeSession. MLIR_CAPI_EXPORTED MTRT_Status mtrtRuntimeSessionCreate( - MTRT_RuntimeSessionOptions options, MTRT_Executable executable, + MTRT_RuntimeSessionOptions options, MTRT_Executable executable, MTRT_GpuAllocator allocator, MTRT_RuntimeSession *result); /// Destory the session. This does not destroy the associated Executable, which diff --git a/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h b/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h index d3672c149..b35d0b777 100644 --- a/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h +++ b/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h @@ -867,7 +867,8 @@ class RuntimeSession { sol::state state, std::unique_ptr pinnedMemoryAllocator, std::unique_ptr allocTracker, - std::unique_ptr resourceTracker); + std::unique_ptr resourceTracker, + GpuAllocator* gpuAllocator); ExecutableView getExecutable() const { return executable; } @@ -881,6 +882,8 @@ class RuntimeSession { ResourceTracker &getResourceTracker() { return *resourceTracker; } + GpuAllocator* getGpuAllocator() { return gpuAllocator; } + private: RuntimeSessionOptions options; ExecutableView executable; @@ -888,7 +891,7 @@ class RuntimeSession { std::unique_ptr pinnedMemoryAllocator; std::unique_ptr allocTracker; std::unique_ptr resourceTracker; - + GpuAllocator* gpuAllocator; sol::state state; }; diff --git a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRegistration.h b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRegistration.h index b5fed9c3d..922e964d4 100644 --- a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRegistration.h +++ b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRegistration.h @@ -37,6 +37,6 @@ void registerLuaRuntimeMethods(lua_State *state, const RuntimeSessionOptions &options, PinnedMemoryAllocator *pinnedMemoryAllocator, AllocTracker *allocTracker, - ResourceTracker *resourceTracker); + ResourceTracker *resourceTracker, GpuAllocator* allocator); } // namespace mlirtrt::runtime diff --git a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRuntime.h b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRuntime.h index f39eabd7b..0635e8cb4 100644 --- a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRuntime.h +++ 
b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRuntime.h @@ -36,7 +36,7 @@ namespace mlirtrt::runtime { /// `main` function. It is assumed that `main` takes no arguments and returns an /// integer result (which is returned if the execution is successful). /// TODO: this should take a handle to a function for streaming output/errors. -StatusOr runExecutorLuaScript(std::string_view luaScript); +StatusOr runExecutorLuaScript(std::string_view luaScript, GpuAllocator* allocator); /// Synchronously run a serialized executor Executable one time. An `Executable` /// is essentially a Lua script packaged with metadata and serialized constants @@ -48,12 +48,12 @@ StatusOr runExecutorLuaScript(std::string_view luaScript); /// execution is successful). /// TODO: this should take a handle to a function for /// streaming output/errors. -StatusOr runExecutorExecutable(std::unique_ptr executable); +StatusOr runExecutorExecutable(std::unique_ptr executable, GpuAllocator* allocator); /// Create an execution state. This will setup a Lua environment and invoke /// global initialization. StatusOr> -createRuntimeSessionWithLuaBackend(ExecutableView executable, +createRuntimeSessionWithLuaBackend(ExecutableView executable, GpuAllocator* allocator, const RuntimeSessionOptions &options); /// Set the primary stream for the loaded executable to use. diff --git a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.h b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.h index 37d8de629..1ceb91367 100644 --- a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.h +++ b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.h @@ -37,7 +37,7 @@ class ResourceTracker; /// Lua state. void registerExecutorTensorRTModuleLuaRuntimeMethods( lua_State *luaState, PinnedMemoryAllocator *pinnedMemoryAllocator, - AllocTracker *allocTracker, ResourceTracker *resourceTracker); + AllocTracker *allocTracker, ResourceTracker *resourceTracker, GpuAllocator* allocator); } // namespace mlirtrt::runtime diff --git a/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h b/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h index 180dbf09e..66f54fb46 100644 --- a/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h +++ b/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h @@ -32,6 +32,79 @@ namespace mlirtrt { struct EventPool; +// Abstract allocator to be implemented by consumers. 
+using AllocatorFlags = uint32_t; + +class GpuAllocator { +public: + GpuAllocator() = default; + virtual ~GpuAllocator() = default; + + virtual StatusOr reallocate(void *baseAddr, uint64_t alignment, + uint64_t newSize, + std::optional stream) = 0; + + virtual StatusOr allocate(uint64_t const size, + uint64_t const alignment, + AllocatorFlags const flags, + std::optional stream) = 0; + + virtual StatusOr deallocate(void *const memory, + std::optional stream) = 0; + +protected: + GpuAllocator(GpuAllocator const &) = delete; + GpuAllocator(GpuAllocator &&) = delete; + GpuAllocator &operator=(GpuAllocator const &) & = delete; + GpuAllocator &operator=(GpuAllocator &&) & = delete; +}; + +class StubAllocator : public GpuAllocator { +public: + StubAllocator() = default; + ~StubAllocator() = default; + + StatusOr reallocate(void *baseAddr, uint64_t alignment, + uint64_t newSize, + std::optional stream) override { + return getStatusWithMsg( + StatusCode::InternalError, + "[StubAllocator][reallocate]: Must be overriden in Python"); + } + + StatusOr allocate(uint64_t const size, uint64_t const alignment, + AllocatorFlags const flags, + std::optional stream) override { + return getStatusWithMsg( + StatusCode::InternalError, + "[StubAllocator][allocate]: Must be overriden in Python"); + } + + StatusOr deallocate(void *const memory, + std::optional stream) override { + return getStatusWithMsg( + StatusCode::InternalError, + "[StubAllocator][deallocate]: Must be overriden in Python"); + } +}; + +class CustomTensorRTAllocator : public GpuAllocator { +public: + CustomTensorRTAllocator() = default; + ~CustomTensorRTAllocator() = default; + + StatusOr reallocate(void *baseAddr, uint64_t alignment, + uint64_t newSize, + std::optional stream) override; + + StatusOr allocate(uint64_t const size, uint64_t const alignment, + AllocatorFlags const flags, + std::optional stream) override; + + StatusOr deallocate(void *const memory, + std::optional stream) override; +}; + //===----------------------------------------------------------------------===// // PoolTrackedCudaEvent //===----------------------------------------------------------------------===// diff --git a/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp b/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp index 41b4db1b2..7c412a290 100644 --- a/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp +++ b/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp @@ -27,6 +27,7 @@ #include "mlir-executor/Runtime/API/API.h" #include "mlir-executor/Runtime/API/ExecutableFlatbuffer.h" #include "mlir-executor/Runtime/Backend/Lua/LuaRuntime.h" +#include "mlir-executor/Support/Allocators.h" #include "mlir-executor/Support/Status.h" #include "mlir/Support/FileUtilities.h" #include "llvm/ADT/SmallVectorExtras.h" @@ -48,6 +49,8 @@ DEFINE_C_API_PTR_METHODS(MTRT_RuntimeSession, ::mlirtrt::runtime::RuntimeSession) DEFINE_C_API_PTR_METHODS(MTRT_RuntimeSessionOptions, ::mlirtrt::runtime::RuntimeSessionOptions) +DEFINE_C_API_PTR_METHODS(MTRT_GpuAllocator, + ::mlirtrt::GpuAllocator) DEFINE_C_API_PTR_METHODS(MTRT_Executable, ::mlirtrt::runtime::Executable) DEFINE_C_API_PTR_METHODS(MTRT_Stream, MTRT_StreamImpl) DEFINE_C_API_PTR_METHODS(MTRT_RuntimeValue, ::mlirtrt::runtime::RuntimeValue) @@ -600,6 +603,55 @@ MTRT_ScalarValue mtrtRuntimeValueDynCastToScalar(MTRT_RuntimeValue v) { return wrap(static_cast(x)); } +//===----------------------------------------------------------------------===// +// MTRT_GpuAllocator 
+//===----------------------------------------------------------------------===// + +bool mtrtGpuAllocatorIsNull(MTRT_GpuAllocator gpuAllocator) { + return !gpuAllocator.ptr; +} + +MTRT_GpuAllocator mtrtGpuAllocatorGetNull() { return MTRT_GpuAllocator{nullptr}; } + +MTRT_Status mtrtGpuAllocatorDestroy(MTRT_GpuAllocator executable) { + delete unwrap(executable); + return mtrtStatusGetOk(); +} + +MTRT_Status mtrtGpuAllocatorCreate(MTRT_GpuAllocator *allocator) { + *allocator = MTRT_GpuAllocator{/*ptr=*/new StubAllocator()}; + return mtrtStatusGetOk(); +} + +MTRT_Status mtrtGpuAllocatorAllocate(MTRT_GpuAllocator gpuAllocator, + uint64_t size, uint64_t alignment, + uint32_t flags, MTRT_Stream stream, + void **memory) { + GpuAllocator *cppGpuAllocator = unwrap(gpuAllocator); + StatusOr status = cppGpuAllocator->allocate( + size, alignment, flags, + !mtrtStreamIsNull(stream) ? std::optional(unwrap(stream)->getRawStream()) + : std::nullopt); + if (status.isOk()) { + *memory = *status; + } + return mtrtStatusGetOk(); +} + +MTRT_Status mtrtGpuAllocatorDeallocate(MTRT_GpuAllocator gpuAllocator, + void *memory, MTRT_Stream stream, + bool *result) { + GpuAllocator *cppGpuAllocator = unwrap(gpuAllocator); + StatusOr status = cppGpuAllocator->deallocate( + memory, !mtrtStreamIsNull(stream) + ? std::optional(unwrap(stream)->getRawStream()) + : std::nullopt); + if (status.isOk()) { + *result = *status; + } + return mtrtStatusGetOk(); +} + //===----------------------------------------------------------------------===// // MTRT_RuntimeSessionOptions //===----------------------------------------------------------------------===// @@ -627,12 +679,14 @@ mtrtRuntimeSessionOptionsDestroy(MTRT_RuntimeSessionOptions options) { MTRT_Status mtrtRuntimeSessionCreate(MTRT_RuntimeSessionOptions options, MTRT_Executable executable, + MTRT_GpuAllocator gpuAllocator, MTRT_RuntimeSession *result) { RuntimeSessionOptions *cppOptions = unwrap(options); Executable *cppExecutable = unwrap(executable); + GpuAllocator *cppGpuAllocator = unwrap(gpuAllocator); StatusOr> session = - createRuntimeSessionWithLuaBackend(cppExecutable->getView(), *cppOptions); + createRuntimeSessionWithLuaBackend(cppExecutable->getView(), cppGpuAllocator, *cppOptions); if (session.isError()) return wrap(session.getStatus()); diff --git a/mlir-tensorrt/executor/lib/Runtime/API/API.cpp b/mlir-tensorrt/executor/lib/Runtime/API/API.cpp index 6c10d1f99..b53ab2096 100644 --- a/mlir-tensorrt/executor/lib/Runtime/API/API.cpp +++ b/mlir-tensorrt/executor/lib/Runtime/API/API.cpp @@ -353,16 +353,17 @@ RuntimeSessionOptions::createUsingSingleHostMpi() { //===----------------------------------------------------------------------===// // RuntimeSession //===----------------------------------------------------------------------===// - RuntimeSession::RuntimeSession( RuntimeSessionOptions options, ExecutableView exe, sol::state state, std::unique_ptr pinnedMemoryAllocator, std::unique_ptr allocTracker, - std::unique_ptr resourceTracker) + std::unique_ptr resourceTracker, + GpuAllocator *gpuAllocator) : options(std::move(options)), executable(exe), pinnedMemoryAllocator(std::move(pinnedMemoryAllocator)), allocTracker(std::move(allocTracker)), - resourceTracker(std::move(resourceTracker)), state(std::move(state)) {} + resourceTracker(std::move(resourceTracker)), gpuAllocator(gpuAllocator), + state(std::move(state)) {} //===----------------------------------------------------------------------===// // AllocTracker diff --git 
a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp index 7596c9da7..b19541977 100644 --- a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp +++ b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp @@ -72,7 +72,7 @@ static void registerDefaultDeviceDependentMethods(lua_State *state, static void registerLuaRuntimeMethodsCommon( lua_State *state, PinnedMemoryAllocator *pinnedMemoryAllocator, - AllocTracker *allocTracker, ResourceTracker *resourceTracker) { + AllocTracker *allocTracker, ResourceTracker *resourceTracker, GpuAllocator* allocator) { registerExecutorCoreModuleLuaRuntimeMethods(state, pinnedMemoryAllocator, allocTracker); registerExecutorCUDAModuleLuaRuntimeMethods( @@ -84,15 +84,15 @@ static void registerLuaRuntimeMethodsCommon( #endif registerExecutorTensorRTModuleLuaRuntimeMethods( - state, pinnedMemoryAllocator, allocTracker, resourceTracker); + state, pinnedMemoryAllocator, allocTracker, resourceTracker, allocator); } void mlirtrt::runtime::registerLuaRuntimeMethods( lua_State *state, const RuntimeSessionOptions &options, PinnedMemoryAllocator *pinnedMemoryAllocator, AllocTracker *allocTracker, - ResourceTracker *resourceTracker) { + ResourceTracker *resourceTracker, GpuAllocator* allocator) { registerLuaRuntimeMethodsCommon(state, pinnedMemoryAllocator, allocTracker, - resourceTracker); + resourceTracker, allocator); #ifdef MLIR_EXECUTOR_ENABLE_NCCL registerExecutorNCCLModuleLuaRuntimeMethods(state, resourceTracker); registerDeviceDependentNCCLMethods(state, options.getNumDevices(), @@ -108,7 +108,7 @@ void mlirtrt::runtime::registerLuaRuntimeMethods( } StatusOr -mlirtrt::runtime::runExecutorLuaScript(std::string_view luaScript) { +mlirtrt::runtime::runExecutorLuaScript(std::string_view luaScript, GpuAllocator* allocator) { ADD_RUNTIME_MODULE_RANGE("runtime_runExecutorLuaScript"); StatusOr> client = RuntimeClient::create(); @@ -120,7 +120,7 @@ mlirtrt::runtime::runExecutorLuaScript(std::string_view luaScript) { registerLuaRuntimeMethods(lua.lua_state(), RuntimeSessionOptions(), &(*client)->getPinnedMemorAllocator(), &(*client)->getAllocTracker(), - &(*client)->getResourceTracker()); + &(*client)->getResourceTracker(), allocator); sol::protected_function_result result = lua.script(luaScript); if (!result.valid()) { @@ -171,7 +171,7 @@ static Status maybeCheckForValidNcclUuid(const RuntimeSessionOptions &options) { /// global initialization. StatusOr> mlirtrt::runtime::createRuntimeSessionWithLuaBackend( - ExecutableView executable, const RuntimeSessionOptions &options) { + ExecutableView executable, GpuAllocator* allocator, const RuntimeSessionOptions &options) { ADD_RUNTIME_MODULE_RANGE("runtime_loadExecutable"); MTRT_RETURN_IF_ERROR(maybeCheckForValidNcclUuid(options)); @@ -184,7 +184,7 @@ mlirtrt::runtime::createRuntimeSessionWithLuaBackend( lua.open_libraries(sol::lib::base, sol::lib::string); registerLuaRuntimeMethods(lua.lua_state(), options, pinnedMemoryAllocator.get(), allocTracker.get(), - resourceTracker.get()); + resourceTracker.get(), allocator); // Load globals into the context. // TODO: eliminate this copy, we already own the executable. 
@@ -225,11 +225,11 @@ mlirtrt::runtime::createRuntimeSessionWithLuaBackend( } return std::make_unique( options, executable, std::move(lua), std::move(pinnedMemoryAllocator), - std::move(allocTracker), std::move(resourceTracker)); + std::move(allocTracker), std::move(resourceTracker), allocator); } StatusOr mlirtrt::runtime::runExecutorExecutable( - std::unique_ptr executable) { + std::unique_ptr executable, GpuAllocator* allocator) { StatusOr> client = RuntimeClient::create(); if (!client.isOk()) @@ -245,7 +245,7 @@ StatusOr mlirtrt::runtime::runExecutorExecutable( return options.getStatus(); StatusOr> session = - createRuntimeSessionWithLuaBackend(executable->getView(), *options); + createRuntimeSessionWithLuaBackend(executable->getView(), allocator, *options); if (!session.isOk()) return session.getStatus(); diff --git a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp index 1b96eac44..bac6942a1 100644 --- a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp +++ b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp @@ -64,6 +64,43 @@ class StdioLogger : public nvinfer1::ILogger { bool verbose; }; +//===----------------------------------------------------------------------===// +// TensorRTCallBackAllocator +//===----------------------------------------------------------------------===// + +class TensorRTCallBackAllocator final : public nvinfer1::IGpuAllocator { +public: + TensorRTCallBackAllocator(GpuAllocator *gpuAllocator) + : nvinfer1::IGpuAllocator(), mGpuAllocatorCallBack(gpuAllocator) {} + + void *allocate(uint64_t size, uint64_t alignment, + nvinfer1::AllocatorFlags flags) noexcept final { + return allocateAsync(size, alignment, flags, nullptr); + } + + bool deallocate(void *memory) noexcept final { + return deallocateAsync(memory, nullptr); + } + + void *allocateAsync(uint64_t const size, uint64_t const alignment, + uint32_t flags, cudaStream_t stream) noexcept final { + StatusOr status = + mGpuAllocatorCallBack->allocate(size, alignment, flags, stream); + assert(status.isOk()); + return *status; + } + + bool deallocateAsync(void *const memory, + cudaStream_t stream) noexcept override { + StatusOr status = mGpuAllocatorCallBack->deallocate(memory, stream); + assert(status.isOk()); + return *status; + } + +private: + GpuAllocator *mGpuAllocatorCallBack; +}; + } // namespace static StdioLogger logger(/*verbose=*/false); @@ -88,13 +125,38 @@ struct Signature { } }; +class NvInferRuntimeWrapper { +public: + explicit NvInferRuntimeWrapper(GpuAllocator* gpuAllocator) { + runtime = std::shared_ptr( + nvinfer1::createInferRuntime(logger), [](nvinfer1::IRuntime *runtime) { + MTRT_DBGF("freeing tensorrt runtime at %lu", + reinterpret_cast(runtime)); + delete runtime; + }); + // GpuAllocator is optional. 
+ if (gpuAllocator) { + callbackAllocator = std::shared_ptr( + new TensorRTCallBackAllocator(gpuAllocator)); + runtime->setGpuAllocator(callbackAllocator.get()); + } + } + + nvinfer1::IRuntime *operator*() { return runtime.get(); } + nvinfer1::IRuntime *operator->() { return runtime.get(); } + + std::shared_ptr runtime; + std::shared_ptr callbackAllocator; +}; + class NvInferEngineWrapper { public: - explicit NvInferEngineWrapper(std::shared_ptr &runtime, + explicit NvInferEngineWrapper(std::shared_ptr runtime, uintptr_t pointer, size_t size) : runtime(runtime) { engine = std::shared_ptr( - runtime->deserializeCudaEngine(reinterpret_cast(pointer), size), + runtime->runtime->deserializeCudaEngine( + reinterpret_cast(pointer), size), [](nvinfer1::ICudaEngine *engine) { MTRT_DBGF("freeing cuda engine at %lu", reinterpret_cast(engine)); @@ -105,7 +167,7 @@ class NvInferEngineWrapper { nvinfer1::ICudaEngine *operator*() { return engine.get(); } nvinfer1::ICudaEngine *operator->() { return engine.get(); } - std::shared_ptr runtime; + std::shared_ptr runtime; std::shared_ptr engine; }; @@ -375,19 +437,20 @@ static Status enqueueV3Wrapper(AllocTracker &tracker, //===----------------------------------------------------------------------===// void mlirtrt::runtime::registerExecutorTensorRTModuleLuaRuntimeMethods( lua_State *luaState, PinnedMemoryAllocator *pinnedMemoryAllocator, - AllocTracker *allocTracker, ResourceTracker *resourceTracker) { + AllocTracker *allocTracker, ResourceTracker *resourceTracker, GpuAllocator* allocator) { sol::state_view lua(luaState); - lua["_trtrt_create_runtime"] = [](sol::this_state state) { + lua["_trtrt_create_runtime"] = + [allocator](sol::this_state state) -> std::shared_ptr { ADD_TENSORRT_MODULE_RANGE("trtrt_create_runtime"); MTRT_DBGF("%s", "creating nvinfer runtime"); - return std::shared_ptr( - nvinfer1::createInferRuntime(logger)); + return std::make_shared(allocator); }; lua["_trtrt_load"] = [allocTracker]( - sol::this_state state, std::shared_ptr &runtime, + sol::this_state state, + std::shared_ptr &runtime, uintptr_t pointer) -> std::shared_ptr { ADD_TENSORRT_MODULE_RANGE("trtrt_load"); const AllocTracker &tracker = *allocTracker; diff --git a/mlir-tensorrt/executor/lib/Support/Allocators.cpp b/mlir-tensorrt/executor/lib/Support/Allocators.cpp index 2eadd2cca..2ee7c28f4 100644 --- a/mlir-tensorrt/executor/lib/Support/Allocators.cpp +++ b/mlir-tensorrt/executor/lib/Support/Allocators.cpp @@ -23,11 +23,14 @@ //===----------------------------------------------------------------------===// #include "mlir-executor/Support/Allocators.h" #include "mlir-executor/Support/Status.h" +#include "mlir-executor/Runtime/Support/Support.h" +#include "cuda_runtime_api.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" +#include #include #include #include @@ -39,6 +42,57 @@ using namespace mlirtrt; DEBUG_WITH_TYPE("allocators", fprintf(stderr, "%s:%d " fmt "\n", __FILE__, \ __LINE__, __VA_ARGS__)) +//===----------------------------------------------------------------------===// +// CustomTensorRTAllocator +//===----------------------------------------------------------------------===// + +StatusOr +CustomTensorRTAllocator::allocate(uint64_t const size, uint64_t const alignment, + uint32_t /*flags*/, + std::optional stream) { + uint8_t *memory; + assert(alignment > 0 && (alignment & (alignment - 1)) == 0 && + "Memory alignment has to be power of 2"); + if 
(stream && *stream != nullptr) { + RETURN_ERROR_IF_CUDART_ERROR( + cudaMallocAsync(reinterpret_cast(&memory), size, *stream)); + MTRT_DBGF("[CustomTensorRTAllocator][allocate]: Asynchronously allocated %lx bytes at 0x%lx on stream %lx", size, + reinterpret_cast(memory), + reinterpret_cast(*stream)); + } else { + RETURN_ERROR_IF_CUDART_ERROR( + cudaMalloc(reinterpret_cast(&memory), size)); + MTRT_DBGF("[CustomTensorRTAllocator][allocate]: Synchronously allocated %lx bytes at 0x%lx", size, + reinterpret_cast(memory)); + } + assert(reinterpret_cast(memory) % alignment == 0); + return memory; +} + +StatusOr +CustomTensorRTAllocator::deallocate(void *const memory, + std::optional stream) { + if (stream && *stream != nullptr) { + MTRT_DBGF("[CustomTensorRTAllocator][deallocate]: Asynchronously freeing CUDA device memory 0x%lx on stream %lx", + reinterpret_cast(memory), + reinterpret_cast(*stream)); + RETURN_ERROR_IF_CUDART_ERROR(cudaFreeAsync(memory, *stream)); + } else { + MTRT_DBGF("[CustomTensorRTAllocator][deallocate]: Synchronously freeing CUDA device/pinned host memory 0x%lx ptr " + "on stream %lx", + reinterpret_cast(memory), + reinterpret_cast(*stream)); + RETURN_ERROR_IF_CUDART_ERROR(cudaFree(memory)); + } + return true; +} + +StatusOr CustomTensorRTAllocator::reallocate( + void * /* baseAddr */, uint64_t /* alignment */, uint64_t /* newSize */, + std::optional /* stream */) { + return nullptr; +} + //===----------------------------------------------------------------------===// // PoolTrackedCudaEvent //===----------------------------------------------------------------------===// diff --git a/mlir-tensorrt/executor/lib/Tools/ExecutorRunnerMain.cpp b/mlir-tensorrt/executor/lib/Tools/ExecutorRunnerMain.cpp index 3241de0da..c8ab9269b 100644 --- a/mlir-tensorrt/executor/lib/Tools/ExecutorRunnerMain.cpp +++ b/mlir-tensorrt/executor/lib/Tools/ExecutorRunnerMain.cpp @@ -98,7 +98,12 @@ struct Options { cl::values(clEnumValN(Lua, "lua", "interpret the input as Lua code")), cl::values(clEnumValN(ExecutorRuntimeExecutable, "rtexe", "load the input file as an Executor executable"))}; + + cl::opt useCustomAllocator{"use-custom-allocator", + cl::desc("Use custom allocator"), + cl::init(false)}; }; + } // namespace LogicalResult @@ -168,13 +173,19 @@ executor::ExecutorRunnerMain(int argc, char **argv, if (result != cudaSuccess) return emitError(loc) << "cudaFree failed: " << cudaGetErrorString(result); + std::unique_ptr allocator{nullptr}; + if (options.useCustomAllocator) { + // Create an optional runtime GPU allocator + allocator.reset(new CustomTensorRTAllocator()); + } + // Read the buffer as a Lua script and execute. 
if (options.inputType == Lua) { assert(!options.dumpFunctionSignature && "Can not dump function signature for Lua input type."); mlirtrt::StatusOr result = - mlirtrt::runtime::runExecutorLuaScript(input->getBuffer()); + mlirtrt::runtime::runExecutorLuaScript(input->getBuffer(), allocator.get()); if (!result.isOk()) return emitError(UnknownLoc::get(&context)) << result.getString(); return success(*result == 0); @@ -202,7 +213,7 @@ executor::ExecutorRunnerMain(int argc, char **argv, } mlirtrt::StatusOr executionResult = - mlirtrt::runtime::runExecutorExecutable(std::move(*executable)); + mlirtrt::runtime::runExecutorExecutable(std::move(*executable), allocator.get()); if (!executionResult.isOk()) return emitError(UnknownLoc::get(&context)) << "failed to load and run executable: " diff --git a/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp b/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp index fef5ad868..e6e6bdda5 100644 --- a/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp +++ b/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp @@ -184,6 +184,16 @@ class PyRuntimeValue : public PyMTRTWrapper { mtrtPythonCapsuleToRuntimeValue, mtrtPythonRuntimeValueToCapsule}; }; +/// Python object type wrapper for `MTRT_GpuAllocator`. +class PyGpuAllocator : public PyMTRTWrapper { +public: + using Base::Base; + DECLARE_WRAPPER_CONSTRUCTORS(PyGpuAllocator); + + static constexpr auto kMethodTable = CAPITable{ + mtrtGpuAllocatorIsNull, mtrtGpuAllocatorDestroy}; +}; + /// Python object type wrapper for `MTRT_StableHLOToExecutableOptions`. class PyRuntimeSessionOptions : public PyMTRTWrapper(m, "GpuAllocator", py::module_local()) + .def(py::init<>([]() -> PyGpuAllocator * { + MTRT_GpuAllocator allocator; + MTRT_Status s = mtrtGpuAllocatorCreate(&allocator); + THROW_IF_MTRT_ERROR(s); + return new PyGpuAllocator(allocator); + })) + .def( + "allocate", + [](PyGpuAllocator &self, uint64_t size, uint64_t alignment, + std::optional flags, std::optional stream) { + void *memory{nullptr}; + MTRT_Status s = mtrtGpuAllocatorAllocate( + self, size, alignment, flags ? *flags : 0, + stream ? *stream : mtrtStreamGetNull(), &memory); + THROW_IF_MTRT_ERROR(s); + // Add changes to ensure memory is not released prematurely. + return memory; + }, + py::arg("size"), py::arg("alignment"), py::arg("flags") = py::none(), + py::arg("stream") = py::none()) + .def( + "deallocate", + [](PyGpuAllocator &self, void *memory, + std::optional stream) { + bool result; + MTRT_Status s = mtrtGpuAllocatorDeallocate( + self, memory, stream ? *stream : mtrtStreamGetNull(), &result); + THROW_IF_MTRT_ERROR(s); + // Add changes to ensure memory is not released prematurely. + return result; + }, + py::arg("memory"), py::arg("stream") = py::none()); + py::class_(m, "RuntimeSession", py::module_local()) - .def(py::init<>([](PyRuntimeSessionOptions &options, PyExecutable &exe) { + .def(py::init<>([](PyRuntimeSessionOptions &options, PyExecutable &exe, + std::optional allocator) { MTRT_RuntimeSession session; - MTRT_Status s = mtrtRuntimeSessionCreate(options, exe, &session); + MTRT_Status s = mtrtRuntimeSessionCreate( + options, exe, + allocator ? 
*allocator : mtrtGpuAllocatorGetNull(), &session); THROW_IF_MTRT_ERROR(s); return new PyRuntimeSession(session); }), - py::arg("options"), py::arg("executable")) + py::arg("options"), py::arg("executable"), + py::arg("gpu_allocator") = py::none()) .def( "execute_function", [](PyRuntimeSession &self, std::string name, From 3f3768ecb0ce52272b912578c17a5e37bb13bdac Mon Sep 17 00:00:00 2001 From: Jhalak Patel Date: Mon, 19 Aug 2024 14:45:55 -0700 Subject: [PATCH 2/6] Add a trampoline class to enable method overriding --- .../include/mlir-executor-c/Runtime/Runtime.h | 32 ++--- .../include/mlir-executor/Runtime/API/API.h | 6 +- .../Runtime/Backend/Lua/LuaRuntime.h | 4 +- .../mlir-executor/Support/Allocators.h | 65 +-------- .../executor/lib/CAPI/Runtime/Runtime.cpp | 84 +++++------ .../executor/lib/Runtime/API/API.cpp | 4 +- .../lib/Runtime/Backend/Lua/LuaRuntime.cpp | 10 +- .../Lua/Modules/TensorRT/TensorRTModule.cpp | 17 +-- .../executor/lib/Support/Allocators.cpp | 44 +----- .../executor/lib/Tools/ExecutorRunnerMain.cpp | 2 +- .../python/bindings/Runtime/RuntimePyBind.cpp | 130 ++++++++++++------ .../IntegrationTests/test_stablehlo_add.py | 30 +++- 12 files changed, 197 insertions(+), 231 deletions(-) diff --git a/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h b/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h index 65377da35..dada8ea67 100644 --- a/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h +++ b/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h @@ -323,33 +323,23 @@ mtrtScalarValueGetType(MTRT_ScalarValue scalar, MTRT_ScalarTypeCode *code); // MTRT_GpuAllocator //===----------------------------------------------------------------------===// + +// Function pointer types for the allocate and deallocate callbacks +typedef void* (*AllocateFunc)(void* self, uint64_t size); +typedef bool (*DeallocateFunc)(void* self, void* memory); + +// The MTRT_GpuAllocator struct typedef struct MTRT_GpuAllocator { - void *ptr; + void* ptr; // Pointer to the implementation (PyGpuAllocatorTrampoline in our case) + AllocateFunc allocate; // Function pointer for allocation + DeallocateFunc deallocate; // Function pointer for deallocation } MTRT_GpuAllocator; /// Checks nullity of `GpuAllocator`. -MTRT_CAPI_EXPORTED bool mtrtGpuAllocatorIsNull(MTRT_GpuAllocator gpuAllocator); - -/// Returns null `GpuAllocator`. 
-MTRT_CAPI_EXPORTED MTRT_GpuAllocator mtrtGpuAllocatorGetNull(); - -MTRT_CAPI_EXPORTED MTRT_Status -mtrtGpuAllocatorDestroy(MTRT_GpuAllocator executable); - -MTRT_CAPI_EXPORTED MTRT_Status -mtrtGpuAllocatorCreate(MTRT_GpuAllocator *allocator); - -//===----------------------------------------------------------------------===// -// MTRT_GpuAllocator -//===----------------------------------------------------------------------===// - -MTRT_CAPI_EXPORTED MTRT_Status mtrtGpuAllocatorAllocate( - MTRT_GpuAllocator gpuAllocator, uint64_t size, uint64_t alignment, - uint32_t flags, MTRT_Stream stream, void **memory); +MTRT_CAPI_EXPORTED bool GpuAllocatorIsNull(MTRT_GpuAllocator gpuAllocator); MTRT_CAPI_EXPORTED MTRT_Status -mtrtGpuAllocatorDeallocate(MTRT_GpuAllocator gpuAllocator, void *memory, - MTRT_Stream stream, bool *result); +GpuAllocatorDestroy(MTRT_GpuAllocator executable); //===----------------------------------------------------------------------===// // MTRT_RuntimeSessionOptions diff --git a/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h b/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h index b35d0b777..70384c60d 100644 --- a/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h +++ b/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h @@ -868,7 +868,7 @@ class RuntimeSession { std::unique_ptr pinnedMemoryAllocator, std::unique_ptr allocTracker, std::unique_ptr resourceTracker, - GpuAllocator* gpuAllocator); + std::unique_ptr gpuAllocator); ExecutableView getExecutable() const { return executable; } @@ -882,7 +882,7 @@ class RuntimeSession { ResourceTracker &getResourceTracker() { return *resourceTracker; } - GpuAllocator* getGpuAllocator() { return gpuAllocator; } + GpuAllocator &getGpuAllocator() { return *gpuAllocator; } private: RuntimeSessionOptions options; @@ -891,7 +891,7 @@ class RuntimeSession { std::unique_ptr pinnedMemoryAllocator; std::unique_ptr allocTracker; std::unique_ptr resourceTracker; - GpuAllocator* gpuAllocator; + std::unique_ptr gpuAllocator; sol::state state; }; diff --git a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRuntime.h b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRuntime.h index 0635e8cb4..d4f07f13a 100644 --- a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRuntime.h +++ b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRuntime.h @@ -48,12 +48,12 @@ StatusOr runExecutorLuaScript(std::string_view luaScript, GpuAllocator* /// execution is successful). /// TODO: this should take a handle to a function for /// streaming output/errors. -StatusOr runExecutorExecutable(std::unique_ptr executable, GpuAllocator* allocator); +StatusOr runExecutorExecutable(std::unique_ptr executable, std::unique_ptr allocator); /// Create an execution state. This will setup a Lua environment and invoke /// global initialization. StatusOr> -createRuntimeSessionWithLuaBackend(ExecutableView executable, GpuAllocator* allocator, +createRuntimeSessionWithLuaBackend(ExecutableView executable, std::unique_ptr allocator, const RuntimeSessionOptions &options); /// Set the primary stream for the loaded executable to use. 
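A minimal, illustrative sketch (not part of the patch) of how a C/C++ embedder could populate the callback struct declared above and hand it to a session. The helper names `exampleAllocate`, `exampleDeallocate`, and `createSessionWithExampleAllocator` are hypothetical; `MTRT_GpuAllocator`, `mtrtRuntimeSessionCreate`, and the CUDA runtime calls come from the surrounding diffs, and the include path follows the directory layout shown there. Note that `ptr` must be non-null, since the session only installs the callback wrapper when `ptr` is set.

    // Illustrative sketch only; assumes the headers from this patch are installed.
    #include "mlir-executor-c/Runtime/Runtime.h" // MTRT_GpuAllocator, mtrtRuntimeSessionCreate
    #include <cuda_runtime_api.h>

    // Hypothetical callbacks matching the AllocateFunc/DeallocateFunc typedefs above.
    static void *exampleAllocate(void *self, uint64_t size) {
      (void)self; // no per-allocator state is used in this sketch
      void *memory = nullptr;
      cudaMalloc(&memory, size); // assumes the CUDA runtime is already initialized
      return memory;
    }

    static bool exampleDeallocate(void *self, void *memory) {
      (void)self;
      return cudaFree(memory) == cudaSuccess;
    }

    // `options` and `exe` are obtained elsewhere through the C API.
    static MTRT_Status createSessionWithExampleAllocator(
        MTRT_RuntimeSessionOptions options, MTRT_Executable exe,
        MTRT_RuntimeSession *session) {
      static int token; // any non-null value: the wrapper is only created when ptr != nullptr
      MTRT_GpuAllocator allocator{&token, exampleAllocate, exampleDeallocate};
      return mtrtRuntimeSessionCreate(options, exe, allocator, session);
    }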
diff --git a/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h b/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h index 66f54fb46..393a5a091 100644 --- a/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h +++ b/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h @@ -32,77 +32,20 @@ namespace mlirtrt { struct EventPool; -// Abstract allocator to be implemented by consumers. -using AllocatorFlags = uint32_t; - class GpuAllocator { public: GpuAllocator() = default; virtual ~GpuAllocator() = default; - - virtual StatusOr reallocate(void *baseAddr, uint64_t alignment, - uint64_t newSize, - std::optional stream) = 0; - - virtual StatusOr allocate(uint64_t const size, - uint64_t const alignment, - AllocatorFlags const flags, - std::optional stream) = 0; - - virtual StatusOr deallocate(void *const memory, - std::optional stream) = 0; - -protected: - GpuAllocator(GpuAllocator const &) = delete; - GpuAllocator(GpuAllocator &&) = delete; - GpuAllocator &operator=(GpuAllocator const &) & = delete; - GpuAllocator &operator=(GpuAllocator &&) & = delete; -}; - -class StubAllocator : public GpuAllocator { -public: - StubAllocator() = default; - ~StubAllocator() = default; - - StatusOr reallocate(void *baseAddr, uint64_t alignment, - uint64_t newSize, - std::optional stream) override { - return getStatusWithMsg( - StatusCode::InternalError, - "[StubAllocator][reallocate]: Must be overriden in Python"); - } - - StatusOr allocate(uint64_t const size, uint64_t const alignment, - AllocatorFlags const flags, - std::optional stream) override { - return getStatusWithMsg( - StatusCode::InternalError, - "[StubAllocator][allocate]: Must be overriden in Python"); - } - - StatusOr deallocate(void *const memory, - std::optional stream) override { - return getStatusWithMsg( - StatusCode::InternalError, - "[StubAllocator][deallocate]: Must be overriden in Python"); - } + virtual void* allocate(uint64_t const size) { return nullptr; } + virtual bool deallocate(void *const memory) { return false; } }; class CustomTensorRTAllocator : public GpuAllocator { public: CustomTensorRTAllocator() = default; ~CustomTensorRTAllocator() = default; - - StatusOr reallocate(void *baseAddr, uint64_t alignment, - uint64_t newSize, - std::optional stream) override; - - StatusOr allocate(uint64_t const size, uint64_t const alignment, - AllocatorFlags const flags, - std::optional stream) override; - - StatusOr deallocate(void *const memory, - std::optional stream) override; + void* allocate(uint64_t const size) override; + bool deallocate(void *const memory) override; }; //===----------------------------------------------------------------------===// diff --git a/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp b/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp index 7c412a290..74eac79c1 100644 --- a/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp +++ b/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp @@ -49,8 +49,6 @@ DEFINE_C_API_PTR_METHODS(MTRT_RuntimeSession, ::mlirtrt::runtime::RuntimeSession) DEFINE_C_API_PTR_METHODS(MTRT_RuntimeSessionOptions, ::mlirtrt::runtime::RuntimeSessionOptions) -DEFINE_C_API_PTR_METHODS(MTRT_GpuAllocator, - ::mlirtrt::GpuAllocator) DEFINE_C_API_PTR_METHODS(MTRT_Executable, ::mlirtrt::runtime::Executable) DEFINE_C_API_PTR_METHODS(MTRT_Stream, MTRT_StreamImpl) DEFINE_C_API_PTR_METHODS(MTRT_RuntimeValue, ::mlirtrt::runtime::RuntimeValue) @@ -607,50 +605,24 @@ MTRT_ScalarValue mtrtRuntimeValueDynCastToScalar(MTRT_RuntimeValue v) { // 
MTRT_GpuAllocator //===----------------------------------------------------------------------===// -bool mtrtGpuAllocatorIsNull(MTRT_GpuAllocator gpuAllocator) { +bool GpuAllocatorIsNull(MTRT_GpuAllocator gpuAllocator) { return !gpuAllocator.ptr; } -MTRT_GpuAllocator mtrtGpuAllocatorGetNull() { return MTRT_GpuAllocator{nullptr}; } - -MTRT_Status mtrtGpuAllocatorDestroy(MTRT_GpuAllocator executable) { - delete unwrap(executable); - return mtrtStatusGetOk(); -} - -MTRT_Status mtrtGpuAllocatorCreate(MTRT_GpuAllocator *allocator) { - *allocator = MTRT_GpuAllocator{/*ptr=*/new StubAllocator()}; - return mtrtStatusGetOk(); -} - -MTRT_Status mtrtGpuAllocatorAllocate(MTRT_GpuAllocator gpuAllocator, - uint64_t size, uint64_t alignment, - uint32_t flags, MTRT_Stream stream, - void **memory) { - GpuAllocator *cppGpuAllocator = unwrap(gpuAllocator); - StatusOr status = cppGpuAllocator->allocate( - size, alignment, flags, - !mtrtStreamIsNull(stream) ? std::optional(unwrap(stream)->getRawStream()) - : std::nullopt); - if (status.isOk()) { - *memory = *status; - } +MTRT_Status GpuAllocatorDestroy(MTRT_GpuAllocator gpuAllocator) { + // delete unwrap(gpuAllocator); return mtrtStatusGetOk(); } -MTRT_Status mtrtGpuAllocatorDeallocate(MTRT_GpuAllocator gpuAllocator, - void *memory, MTRT_Stream stream, - bool *result) { - GpuAllocator *cppGpuAllocator = unwrap(gpuAllocator); - StatusOr status = cppGpuAllocator->deallocate( - memory, !mtrtStreamIsNull(stream) - ? std::optional(unwrap(stream)->getRawStream()) - : std::nullopt); - if (status.isOk()) { - *result = *status; - } - return mtrtStatusGetOk(); -} +// TODO: Implement destroy method to release resources. +// void mtrtGpuAllocatorDestroy(MTRT_GpuAllocator* allocator) { +// if (allocator && allocator->ptr) { +// delete static_cast(allocator->ptr); +// allocator->ptr = nullptr; +// allocator->allocate = nullptr; +// allocator->deallocate = nullptr; +// } +// } //===----------------------------------------------------------------------===// // MTRT_RuntimeSessionOptions @@ -677,16 +649,44 @@ mtrtRuntimeSessionOptionsDestroy(MTRT_RuntimeSessionOptions options) { // MTRT_RuntimeSession //===----------------------------------------------------------------------===// +// A wrapper class for MTRT_GpuAllocator implementing the GpuAllocator +// interface. It encapsulates GPU memory allocation and deallocation operations, +// ensuring correct routing of callbacks from C++ to Python. +class GpuAllocatorWrapper : public GpuAllocator { +private: + MTRT_GpuAllocator mPyGpuAllocator; + +public: + GpuAllocatorWrapper(MTRT_GpuAllocator gpuAllocator) + : mPyGpuAllocator(gpuAllocator) {} + + void *allocate(uint64_t size) override { + return mPyGpuAllocator.allocate(mPyGpuAllocator.ptr, size); + } + + bool deallocate(void *ptr) override { + return mPyGpuAllocator.deallocate(mPyGpuAllocator.ptr, ptr); + } + + // Static method to create a GpuAllocator from MTRT_GpuAllocator + static std::unique_ptr create(MTRT_GpuAllocator gpuAllocator) { + return std::make_unique(gpuAllocator); + } +}; + MTRT_Status mtrtRuntimeSessionCreate(MTRT_RuntimeSessionOptions options, MTRT_Executable executable, MTRT_GpuAllocator gpuAllocator, MTRT_RuntimeSession *result) { RuntimeSessionOptions *cppOptions = unwrap(options); Executable *cppExecutable = unwrap(executable); - GpuAllocator *cppGpuAllocator = unwrap(gpuAllocator); + + std::unique_ptr allocator = + gpuAllocator.ptr ? 
GpuAllocatorWrapper::create(gpuAllocator) : nullptr; StatusOr> session = - createRuntimeSessionWithLuaBackend(cppExecutable->getView(), cppGpuAllocator, *cppOptions); + createRuntimeSessionWithLuaBackend(cppExecutable->getView(), + std::move(allocator), *cppOptions); if (session.isError()) return wrap(session.getStatus()); diff --git a/mlir-tensorrt/executor/lib/Runtime/API/API.cpp b/mlir-tensorrt/executor/lib/Runtime/API/API.cpp index b53ab2096..52b02f72a 100644 --- a/mlir-tensorrt/executor/lib/Runtime/API/API.cpp +++ b/mlir-tensorrt/executor/lib/Runtime/API/API.cpp @@ -358,11 +358,11 @@ RuntimeSession::RuntimeSession( std::unique_ptr pinnedMemoryAllocator, std::unique_ptr allocTracker, std::unique_ptr resourceTracker, - GpuAllocator *gpuAllocator) + std::unique_ptr gpuAllocator) : options(std::move(options)), executable(exe), pinnedMemoryAllocator(std::move(pinnedMemoryAllocator)), allocTracker(std::move(allocTracker)), - resourceTracker(std::move(resourceTracker)), gpuAllocator(gpuAllocator), + resourceTracker(std::move(resourceTracker)), gpuAllocator(std::move(gpuAllocator)), state(std::move(state)) {} //===----------------------------------------------------------------------===// diff --git a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp index b19541977..17af64a91 100644 --- a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp +++ b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp @@ -171,7 +171,7 @@ static Status maybeCheckForValidNcclUuid(const RuntimeSessionOptions &options) { /// global initialization. StatusOr> mlirtrt::runtime::createRuntimeSessionWithLuaBackend( - ExecutableView executable, GpuAllocator* allocator, const RuntimeSessionOptions &options) { + ExecutableView executable, std::unique_ptr allocator, const RuntimeSessionOptions &options) { ADD_RUNTIME_MODULE_RANGE("runtime_loadExecutable"); MTRT_RETURN_IF_ERROR(maybeCheckForValidNcclUuid(options)); @@ -184,7 +184,7 @@ mlirtrt::runtime::createRuntimeSessionWithLuaBackend( lua.open_libraries(sol::lib::base, sol::lib::string); registerLuaRuntimeMethods(lua.lua_state(), options, pinnedMemoryAllocator.get(), allocTracker.get(), - resourceTracker.get(), allocator); + resourceTracker.get(), allocator.get()); // Load globals into the context. // TODO: eliminate this copy, we already own the executable. 
@@ -225,11 +225,11 @@ mlirtrt::runtime::createRuntimeSessionWithLuaBackend( } return std::make_unique( options, executable, std::move(lua), std::move(pinnedMemoryAllocator), - std::move(allocTracker), std::move(resourceTracker), allocator); + std::move(allocTracker), std::move(resourceTracker), std::move(allocator)); } StatusOr mlirtrt::runtime::runExecutorExecutable( - std::unique_ptr executable, GpuAllocator* allocator) { + std::unique_ptr executable, std::unique_ptr allocator) { StatusOr> client = RuntimeClient::create(); if (!client.isOk()) @@ -245,7 +245,7 @@ StatusOr mlirtrt::runtime::runExecutorExecutable( return options.getStatus(); StatusOr> session = - createRuntimeSessionWithLuaBackend(executable->getView(), allocator, *options); + createRuntimeSessionWithLuaBackend(executable->getView(), std::move(allocator), *options); if (!session.isOk()) return session.getStatus(); diff --git a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp index bac6942a1..fed3a573e 100644 --- a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp +++ b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp @@ -82,19 +82,16 @@ class TensorRTCallBackAllocator final : public nvinfer1::IGpuAllocator { return deallocateAsync(memory, nullptr); } - void *allocateAsync(uint64_t const size, uint64_t const alignment, - uint32_t flags, cudaStream_t stream) noexcept final { - StatusOr status = - mGpuAllocatorCallBack->allocate(size, alignment, flags, stream); - assert(status.isOk()); - return *status; + void *allocateAsync(uint64_t const size, uint64_t const /*alignment*/, + uint32_t /*flags*/, cudaStream_t /*stream*/) noexcept final { + void* result = mGpuAllocatorCallBack->allocate(size); + return result; } bool deallocateAsync(void *const memory, - cudaStream_t stream) noexcept override { - StatusOr status = mGpuAllocatorCallBack->deallocate(memory, stream); - assert(status.isOk()); - return *status; + cudaStream_t /*stream*/) noexcept override { + bool result = mGpuAllocatorCallBack->deallocate(memory); + return result; } private: diff --git a/mlir-tensorrt/executor/lib/Support/Allocators.cpp b/mlir-tensorrt/executor/lib/Support/Allocators.cpp index 2ee7c28f4..70f21b9f7 100644 --- a/mlir-tensorrt/executor/lib/Support/Allocators.cpp +++ b/mlir-tensorrt/executor/lib/Support/Allocators.cpp @@ -46,53 +46,17 @@ using namespace mlirtrt; // CustomTensorRTAllocator //===----------------------------------------------------------------------===// -StatusOr -CustomTensorRTAllocator::allocate(uint64_t const size, uint64_t const alignment, - uint32_t /*flags*/, - std::optional stream) { +void *CustomTensorRTAllocator::allocate(uint64_t const size) { uint8_t *memory; - assert(alignment > 0 && (alignment & (alignment - 1)) == 0 && - "Memory alignment has to be power of 2"); - if (stream && *stream != nullptr) { - RETURN_ERROR_IF_CUDART_ERROR( - cudaMallocAsync(reinterpret_cast(&memory), size, *stream)); - MTRT_DBGF("[CustomTensorRTAllocator][allocate]: Asynchronously allocated %lx bytes at 0x%lx on stream %lx", size, - reinterpret_cast(memory), - reinterpret_cast(*stream)); - } else { - RETURN_ERROR_IF_CUDART_ERROR( - cudaMalloc(reinterpret_cast(&memory), size)); - MTRT_DBGF("[CustomTensorRTAllocator][allocate]: Synchronously allocated %lx bytes at 0x%lx", size, - reinterpret_cast(memory)); - } - assert(reinterpret_cast(memory) % alignment == 0); + 
cudaMalloc(reinterpret_cast(&memory), size); return memory; } -StatusOr -CustomTensorRTAllocator::deallocate(void *const memory, - std::optional stream) { - if (stream && *stream != nullptr) { - MTRT_DBGF("[CustomTensorRTAllocator][deallocate]: Asynchronously freeing CUDA device memory 0x%lx on stream %lx", - reinterpret_cast(memory), - reinterpret_cast(*stream)); - RETURN_ERROR_IF_CUDART_ERROR(cudaFreeAsync(memory, *stream)); - } else { - MTRT_DBGF("[CustomTensorRTAllocator][deallocate]: Synchronously freeing CUDA device/pinned host memory 0x%lx ptr " - "on stream %lx", - reinterpret_cast(memory), - reinterpret_cast(*stream)); - RETURN_ERROR_IF_CUDART_ERROR(cudaFree(memory)); - } +bool CustomTensorRTAllocator::deallocate(void *const memory) { + cudaFree(memory); return true; } -StatusOr CustomTensorRTAllocator::reallocate( - void * /* baseAddr */, uint64_t /* alignment */, uint64_t /* newSize */, - std::optional /* stream */) { - return nullptr; -} - //===----------------------------------------------------------------------===// // PoolTrackedCudaEvent //===----------------------------------------------------------------------===// diff --git a/mlir-tensorrt/executor/lib/Tools/ExecutorRunnerMain.cpp b/mlir-tensorrt/executor/lib/Tools/ExecutorRunnerMain.cpp index c8ab9269b..dc14db16f 100644 --- a/mlir-tensorrt/executor/lib/Tools/ExecutorRunnerMain.cpp +++ b/mlir-tensorrt/executor/lib/Tools/ExecutorRunnerMain.cpp @@ -213,7 +213,7 @@ executor::ExecutorRunnerMain(int argc, char **argv, } mlirtrt::StatusOr executionResult = - mlirtrt::runtime::runExecutorExecutable(std::move(*executable), allocator.get()); + mlirtrt::runtime::runExecutorExecutable(std::move(*executable), std::move(allocator)); if (!executionResult.isOk()) return emitError(UnknownLoc::get(&context)) << "failed to load and run executable: " diff --git a/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp b/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp index e6e6bdda5..5a4d10096 100644 --- a/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp +++ b/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp @@ -31,6 +31,8 @@ #include #include +#include "cuda_runtime.h" + namespace py = pybind11; using namespace mlirtrt; @@ -184,14 +186,69 @@ class PyRuntimeValue : public PyMTRTWrapper { mtrtPythonCapsuleToRuntimeValue, mtrtPythonRuntimeValueToCapsule}; }; -/// Python object type wrapper for `MTRT_GpuAllocator`. -class PyGpuAllocator : public PyMTRTWrapper { +// Abstract base class for Python-implemented GPU allocators. +// Provides a C++ interface for Python classes and handles C-style callback +// routing. +class PyGpuAllocator { public: - using Base::Base; - DECLARE_WRAPPER_CONSTRUCTORS(PyGpuAllocator); + virtual ~PyGpuAllocator() = default; + virtual std::uintptr_t allocate(uint64_t size) = 0; + virtual bool deallocate(std::uintptr_t ptr) = 0; + + // Creates a C-compatible struct for interfacing with lower-level APIs. + MTRT_GpuAllocator getCApiObject() { return createWithPythonCallbacks(this); } + +private: + // Trampoline function: Routes C-style allocation calls to C++ virtual method. + static void *pyGpuAllocatorAllocate(void *self, uint64_t size) { + auto *allocator = static_cast(self); + std::uintptr_t ptr = allocator->allocate(size); + return reinterpret_cast(ptr); + } + + // Trampoline function: Routes C-style deallocation calls to C++ virtual + // method. 
+ static bool pyGpuAllocatorDeallocate(void *self, void *memory) { + auto *allocator = static_cast(self); + return allocator->deallocate(reinterpret_cast(memory)); + } + + // Constructs MTRT_GpuAllocator with this instance's methods as callbacks. + static MTRT_GpuAllocator + createWithPythonCallbacks(PyGpuAllocator *allocator) { + MTRT_GpuAllocator capi_allocator; + capi_allocator.ptr = allocator; + capi_allocator.allocate = pyGpuAllocatorAllocate; + capi_allocator.deallocate = pyGpuAllocatorDeallocate; + return capi_allocator; + } +}; + +// Pybind11 trampoline class for PyGpuAllocator. +// Enables Python subclasses to override virtual methods of PyGpuAllocator. +class PyGpuAllocatorTrampoline : public PyGpuAllocator { +public: + using PyGpuAllocator::PyGpuAllocator; // Inherit constructors + + // Trampoline for allocate: Dispatches call to Python implementation if + // overridden. + uintptr_t allocate(uint64_t size) override { + PYBIND11_OVERRIDE_PURE(uintptr_t, // Return type + PyGpuAllocator, // Parent class + allocate, // Name of function in C++ + size // Arguments + ); + } - static constexpr auto kMethodTable = CAPITable{ - mtrtGpuAllocatorIsNull, mtrtGpuAllocatorDestroy}; + // Trampoline for deallocate: Dispatches call to Python implementation if + // overridden. + bool deallocate(uintptr_t ptr) override { + PYBIND11_OVERRIDE_PURE(bool, // Return type + PyGpuAllocator, // Parent class + deallocate, // Name of function in C++ + ptr // Arguments + ); + } }; /// Python object type wrapper for `MTRT_StableHLOToExecutableOptions`. @@ -911,47 +968,34 @@ PYBIND11_MODULE(_api, m) { py::arg("num_devices") = 1, py::arg("device_id") = 0, py::arg("nccl_uuid") = py::str("")); - py::class_(m, "GpuAllocator", py::module_local()) - .def(py::init<>([]() -> PyGpuAllocator * { - MTRT_GpuAllocator allocator; - MTRT_Status s = mtrtGpuAllocatorCreate(&allocator); - THROW_IF_MTRT_ERROR(s); - return new PyGpuAllocator(allocator); - })) - .def( - "allocate", - [](PyGpuAllocator &self, uint64_t size, uint64_t alignment, - std::optional flags, std::optional stream) { - void *memory{nullptr}; - MTRT_Status s = mtrtGpuAllocatorAllocate( - self, size, alignment, flags ? *flags : 0, - stream ? *stream : mtrtStreamGetNull(), &memory); - THROW_IF_MTRT_ERROR(s); - // Add changes to ensure memory is not released prematurely. - return memory; - }, - py::arg("size"), py::arg("alignment"), py::arg("flags") = py::none(), - py::arg("stream") = py::none()) - .def( - "deallocate", - [](PyGpuAllocator &self, void *memory, - std::optional stream) { - bool result; - MTRT_Status s = mtrtGpuAllocatorDeallocate( - self, memory, stream ? *stream : mtrtStreamGetNull(), &result); - THROW_IF_MTRT_ERROR(s); - // Add changes to ensure memory is not released prematurely. - return result; - }, - py::arg("memory"), py::arg("stream") = py::none()); + py::class_(m, "GpuAllocator") + .def(py::init<>()) + .def("allocate", &PyGpuAllocator::allocate) + .def("deallocate", &PyGpuAllocator::deallocate) + .def("get_capi_object", &PyGpuAllocator::getCApiObject); py::class_(m, "RuntimeSession", py::module_local()) .def(py::init<>([](PyRuntimeSessionOptions &options, PyExecutable &exe, - std::optional allocator) { + py::object gpu_allocator = py::none()) { MTRT_RuntimeSession session; - MTRT_Status s = mtrtRuntimeSessionCreate( - options, exe, - allocator ? 
*allocator : mtrtGpuAllocatorGetNull(), &session); + MTRT_Status s; + + if (gpu_allocator.is_none()) { + // Create session without custom allocator + s = mtrtRuntimeSessionCreate( + options, exe, MTRT_GpuAllocator{nullptr}, &session); + } else { + try { + PyGpuAllocator &allocator = + gpu_allocator.cast(); + MTRT_GpuAllocator capi_allocator = allocator.getCApiObject(); + s = mtrtRuntimeSessionCreate(options, exe, capi_allocator, + &session); + } catch (const py::cast_error &) { + throw py::type_error( + "gpu_allocator must be a GpuAllocator object or None"); + } + } THROW_IF_MTRT_ERROR(s); return new PyRuntimeSession(session); }), diff --git a/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py b/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py index 2c95a3081..a10957162 100644 --- a/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py +++ b/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py @@ -5,6 +5,7 @@ import mlir_tensorrt.compiler.ir as ir import mlir_tensorrt.runtime.api as runtime import numpy as np +import cupy as cp ASM = """ func.func @main(%arg0: tensor<2x3x4xf32>) -> tensor<2x3x4xf32> { @@ -14,6 +15,31 @@ """ +class AppGpuAllocator(runtime.GpuAllocator): + def __init__(self): + # Initialize the base class + super().__init__() + + def allocate(self, size, alignment, flags=None, stream=None): + # Implement memory allocation using CuPy + # Allocate memory on the GPU + import pdb + + pdb.set_trace() + memory = cp.empty(size, dtype=cp.uint8) + print( + f"Allocated memory: {memory.data.ptr}" + ) # Print pointer to allocated GPU memory + return memory # Return the CuPy array + + def deallocate(self, memory, stream=None): + # Deallocate memory in CuPy + # CuPy handles memory automatically, so explicit deallocation is not necessary + print(f"Deallocating memory: {memory}") + del memory # Explicitly delete the CuPy array if needed + return True + + def stablehlo_add(): # Build/parse the main function. 
with ir.Context() as context: @@ -36,8 +62,10 @@ def stablehlo_add(): if len(devices) == 0: return + allocator = AppGpuAllocator() + session_options = runtime.RuntimeSessionOptions(num_devices=1, device_id=0) - session = runtime.RuntimeSession(session_options, exe) + session = runtime.RuntimeSession(session_options, exe, gpu_allocator=allocator) arg0 = client.create_memref( np.arange(0.0, 24.0, dtype=np.float32).reshape(2, 3, 4).data, From d75c66a449c166aad69f156a976f7353f4fc2ff4 Mon Sep 17 00:00:00 2001 From: Jhalak Patel Date: Tue, 20 Aug 2024 11:13:47 -0700 Subject: [PATCH 3/6] Update tests to use custom allocator --- .../end-to-end-binary.mlir | 2 +- .../end-to-end-unary.mlir | 2 +- .../Lua/IntegrationTests/buffer-ops-bf16.mlir | 2 +- .../IntegrationTests/buffer-ops-dynamic.mlir | 2 +- .../Lua/IntegrationTests/buffer-ops-f16.mlir | 2 +- .../Lua/IntegrationTests/buffer-ops-f32.mlir | 2 +- .../IntegrationTests/buffer-ops-f8E4M3FN.mlir | 2 +- .../Lua/IntegrationTests/buffer-ops-i1.mlir | 2 +- .../Lua/IntegrationTests/buffer-ops-i4.mlir | 2 +- .../Lua/IntegrationTests/memcpy-strided.mlir | 2 +- .../Target/Lua/IntegrationTests/memcpy.mlir | 2 +- .../TRT10/test_stablehlo_add.py | 2 +- .../IntegrationTests/test_call_validation.py | 2 +- .../IntegrationTests/test_stablehlo_add.py | 38 +++++++++---------- .../test_stablehlo_dynamic.py | 2 +- 15 files changed, 32 insertions(+), 34 deletions(-) diff --git a/mlir-tensorrt/test/Target/Lua/IntegrationTests/ClusteringDynamicShape/end-to-end-binary.mlir b/mlir-tensorrt/test/Target/Lua/IntegrationTests/ClusteringDynamicShape/end-to-end-binary.mlir index 84d8e714a..0d3688a17 100644 --- a/mlir-tensorrt/test/Target/Lua/IntegrationTests/ClusteringDynamicShape/end-to-end-binary.mlir +++ b/mlir-tensorrt/test/Target/Lua/IntegrationTests/ClusteringDynamicShape/end-to-end-binary.mlir @@ -3,7 +3,7 @@ // RUN: stablehlo-clustering-pipeline, \ // RUN: post-clustering-pipeline, \ // RUN: executor-lowering-pipeline)" \ -// RUN: | mlir-tensorrt-translate -mlir-to-runtime-executable -allow-unregistered-dialect | mlir-tensorrt-runner -input-type=rtexe +// RUN: | mlir-tensorrt-translate -mlir-to-runtime-executable -allow-unregistered-dialect | mlir-tensorrt-runner -input-type=rtexe --use-custom-allocator #profile = #tensorrt.shape_profile #profile1 = #tensorrt.shape_profile diff --git a/mlir-tensorrt/test/Target/Lua/IntegrationTests/ClusteringDynamicShape/end-to-end-unary.mlir b/mlir-tensorrt/test/Target/Lua/IntegrationTests/ClusteringDynamicShape/end-to-end-unary.mlir index 28718291a..949ce1b7d 100644 --- a/mlir-tensorrt/test/Target/Lua/IntegrationTests/ClusteringDynamicShape/end-to-end-unary.mlir +++ b/mlir-tensorrt/test/Target/Lua/IntegrationTests/ClusteringDynamicShape/end-to-end-unary.mlir @@ -3,7 +3,7 @@ // RUN: stablehlo-clustering-pipeline, \ // RUN: post-clustering-pipeline, \ // RUN: executor-lowering-pipeline)" \ -// RUN: | mlir-tensorrt-translate -mlir-to-runtime-executable -allow-unregistered-dialect | mlir-tensorrt-runner -input-type=rtexe +// RUN: | mlir-tensorrt-translate -mlir-to-runtime-executable -allow-unregistered-dialect | mlir-tensorrt-runner -input-type=rtexe --use-custom-allocator #profile0 = #tensorrt.shape_profile #profile1 = #tensorrt.shape_profile diff --git a/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-bf16.mlir b/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-bf16.mlir index 15f652aac..f825f236d 100644 --- a/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-bf16.mlir +++ 
b/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-bf16.mlir @@ -1,7 +1,7 @@ // REQUIRES: host-has-at-least-1-gpus // RUN: mlir-tensorrt-opt %s -convert-memref-to-cuda -convert-plan-to-executor -convert-cuda-to-executor -executor-lowering-pipeline \ // RUN: | mlir-tensorrt-translate -mlir-to-runtime-executable \ -// RUN: | mlir-tensorrt-runner -input-type=rtexe | FileCheck %s +// RUN: | mlir-tensorrt-runner -input-type=rtexe --use-custom-allocator | FileCheck %s !descriptor1D = !executor.table, !executor.ptr, index, index, index> !hostMemRef = memref<4xbf16, #plan.memory_space> diff --git a/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-dynamic.mlir b/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-dynamic.mlir index 73c1cd690..0b0230b83 100644 --- a/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-dynamic.mlir +++ b/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-dynamic.mlir @@ -1,7 +1,7 @@ // REQUIRES: host-has-at-least-1-gpus // RUN: mlir-tensorrt-opt %s -convert-memref-to-cuda -convert-cuda-to-executor -executor-lowering-pipeline \ // RUN: | mlir-tensorrt-translate -mlir-to-runtime-executable \ -// RUN: | mlir-tensorrt-runner -input-type=rtexe | FileCheck %s +// RUN: | mlir-tensorrt-runner -input-type=rtexe --use-custom-allocator | FileCheck %s func.func @run_with_shape_2d(%arg0: memref, %arg1: memref<2xindex>) { %c0 = arith.constant 0 : index diff --git a/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-f16.mlir b/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-f16.mlir index 448b88c6f..526c8162f 100644 --- a/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-f16.mlir +++ b/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-f16.mlir @@ -1,7 +1,7 @@ // REQUIRES: host-has-at-least-1-gpus // RUN: mlir-tensorrt-opt %s -convert-memref-to-cuda -convert-plan-to-executor -convert-cuda-to-executor -executor-lowering-pipeline \ // RUN: | mlir-tensorrt-translate -mlir-to-runtime-executable \ -// RUN: | mlir-tensorrt-runner -input-type=rtexe | FileCheck %s +// RUN: | mlir-tensorrt-runner -input-type=rtexe --use-custom-allocator | FileCheck %s !descriptor1D = !executor.table, !executor.ptr, index, index, index> !hostMemRef = memref<4xf16, #plan.memory_space> diff --git a/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-f32.mlir b/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-f32.mlir index 0d16f189a..6196e5317 100644 --- a/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-f32.mlir +++ b/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-f32.mlir @@ -1,7 +1,7 @@ // REQUIRES: host-has-at-least-1-gpus // RUN: mlir-tensorrt-opt %s -convert-memref-to-cuda -convert-plan-to-executor -convert-cuda-to-executor -executor-lowering-pipeline \ // RUN: | mlir-tensorrt-translate -mlir-to-runtime-executable \ -// RUN: | mlir-tensorrt-runner -input-type=rtexe | FileCheck %s +// RUN: | mlir-tensorrt-runner -input-type=rtexe --use-custom-allocator | FileCheck %s !descriptor1D = !executor.table, !executor.ptr, index, index, index> !hostMemRef = memref<4xf32, #plan.memory_space> diff --git a/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-f8E4M3FN.mlir b/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-f8E4M3FN.mlir index 7b3ae4765..6e93ac265 100644 --- a/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-f8E4M3FN.mlir +++ b/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-f8E4M3FN.mlir @@ -2,7 +2,7 @@ // REQUIRES: all-gpus-support-fp8 // RUN: mlir-tensorrt-opt %s 
-convert-memref-to-cuda -convert-plan-to-executor -convert-cuda-to-executor -executor-lowering-pipeline \ // RUN: | mlir-tensorrt-translate -mlir-to-runtime-executable \ -// RUN: | mlir-tensorrt-runner -input-type=rtexe | FileCheck %s +// RUN: | mlir-tensorrt-runner -input-type=rtexe --use-custom-allocator | FileCheck %s !descriptor1D = !executor.table, !executor.ptr, index, index, index> !hostMemRef = memref<4xf8E4M3FN, #plan.memory_space> diff --git a/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-i1.mlir b/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-i1.mlir index f44da93c5..61a74dfdf 100644 --- a/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-i1.mlir +++ b/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-i1.mlir @@ -1,7 +1,7 @@ // REQUIRES: host-has-at-least-1-gpus // RUN: mlir-tensorrt-opt %s -convert-memref-to-cuda -convert-plan-to-executor -convert-cuda-to-executor -executor-lowering-pipeline \ // RUN: | mlir-tensorrt-translate -mlir-to-runtime-executable \ -// RUN: | mlir-tensorrt-runner -input-type=rtexe | FileCheck %s +// RUN: | mlir-tensorrt-runner -input-type=rtexe --use-custom-allocator | FileCheck %s !descriptor1D = !executor.table, !executor.ptr, index, index, index> !hostMemRef = memref<4xi1, #plan.memory_space> diff --git a/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-i4.mlir b/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-i4.mlir index 766bec84f..5d917af19 100644 --- a/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-i4.mlir +++ b/mlir-tensorrt/test/Target/Lua/IntegrationTests/buffer-ops-i4.mlir @@ -1,7 +1,7 @@ // REQUIRES: host-has-at-least-1-gpus // RUN: mlir-tensorrt-opt %s -convert-memref-to-cuda -convert-plan-to-executor -convert-cuda-to-executor -executor-lowering-pipeline \ // RUN: | mlir-tensorrt-translate -mlir-to-runtime-executable \ -// RUN: | mlir-tensorrt-runner -input-type=rtexe | FileCheck %s +// RUN: | mlir-tensorrt-runner -input-type=rtexe --use-custom-allocator | FileCheck %s !descriptor1D = !executor.table, !executor.ptr, index, index, index> !hostMemRef = memref<4xi4, #plan.memory_space> diff --git a/mlir-tensorrt/test/Target/Lua/IntegrationTests/memcpy-strided.mlir b/mlir-tensorrt/test/Target/Lua/IntegrationTests/memcpy-strided.mlir index 0abcfec01..8cab67749 100644 --- a/mlir-tensorrt/test/Target/Lua/IntegrationTests/memcpy-strided.mlir +++ b/mlir-tensorrt/test/Target/Lua/IntegrationTests/memcpy-strided.mlir @@ -1,7 +1,7 @@ // REQUIRES: host-has-at-least-1-gpus // RUN: mlir-tensorrt-opt %s -convert-memref-to-cuda -convert-plan-to-executor -convert-cuda-to-executor -executor-lowering-pipeline \ // RUN: | mlir-tensorrt-translate -mlir-to-runtime-executable \ -// RUN: | mlir-tensorrt-runner -input-type=rtexe | FileCheck %s +// RUN: | mlir-tensorrt-runner -input-type=rtexe --use-custom-allocator | FileCheck %s func.func @main() -> index { %c0 = arith.constant 0 : index diff --git a/mlir-tensorrt/test/Target/Lua/IntegrationTests/memcpy.mlir b/mlir-tensorrt/test/Target/Lua/IntegrationTests/memcpy.mlir index f750810c8..dd336d8ce 100644 --- a/mlir-tensorrt/test/Target/Lua/IntegrationTests/memcpy.mlir +++ b/mlir-tensorrt/test/Target/Lua/IntegrationTests/memcpy.mlir @@ -1,7 +1,7 @@ // REQUIRES: host-has-at-least-1-gpus // RUN: mlir-tensorrt-opt %s -convert-memref-to-cuda -convert-plan-to-executor -convert-cuda-to-executor -executor-lowering-pipeline \ // RUN: | mlir-tensorrt-translate -mlir-to-runtime-executable \ -// RUN: | mlir-tensorrt-runner -input-type=rtexe | FileCheck %s +// 
RUN: | mlir-tensorrt-runner -input-type=rtexe --use-custom-allocator | FileCheck %s func.func @main() -> i32 { %c0_i32 = arith.constant 0 : i32 diff --git a/mlir-tensorrt/test/python/IntegrationTests/TRT10/test_stablehlo_add.py b/mlir-tensorrt/test/python/IntegrationTests/TRT10/test_stablehlo_add.py index 480ce74d4..fce25bac3 100644 --- a/mlir-tensorrt/test/python/IntegrationTests/TRT10/test_stablehlo_add.py +++ b/mlir-tensorrt/test/python/IntegrationTests/TRT10/test_stablehlo_add.py @@ -33,7 +33,7 @@ def test_stablehlo_add( exe = compiler.compiler_stablehlo_to_executable(client, m.operation, opts) session_options = runtime.RuntimeSessionOptions(num_devices=1, device_id=0) - session = runtime.RuntimeSession(session_options, exe) + session = runtime.RuntimeSession(session_options, exe, None) session.execute_function( "main", in_args=test.in_args, out_args=test.out_args, stream=stream diff --git a/mlir-tensorrt/test/python/IntegrationTests/test_call_validation.py b/mlir-tensorrt/test/python/IntegrationTests/test_call_validation.py index 1687a8f1b..8ebc2d3c4 100644 --- a/mlir-tensorrt/test/python/IntegrationTests/test_call_validation.py +++ b/mlir-tensorrt/test/python/IntegrationTests/test_call_validation.py @@ -62,7 +62,7 @@ def create_scalar(self, value): return self.client.create_scalar(value, runtime.ScalarTypeCode.i64) def execute(self, arg: runtime.RuntimeValue): - session = runtime.RuntimeSession(self.session_options, self.exe) + session = runtime.RuntimeSession(self.session_options, self.exe, None) try: session.execute_function( "main", in_args=[arg], out_args=[arg], stream=self.stream diff --git a/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py b/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py index a10957162..721f6960c 100644 --- a/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py +++ b/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py @@ -15,29 +15,26 @@ """ -class AppGpuAllocator(runtime.GpuAllocator): +class CupyGPUAllocator(runtime.GpuAllocator): def __init__(self): - # Initialize the base class super().__init__() + self.allocations = {} # Keep track of allocations - def allocate(self, size, alignment, flags=None, stream=None): - # Implement memory allocation using CuPy - # Allocate memory on the GPU - import pdb + def allocate(self, size): + # Allocate memory on the GPU using CuPy + mem = cp.cuda.alloc(size) + ptr = int(mem.ptr) # Convert to integer + # Store the CuPy memory object + self.allocations[ptr] = mem + return ptr - pdb.set_trace() - memory = cp.empty(size, dtype=cp.uint8) - print( - f"Allocated memory: {memory.data.ptr}" - ) # Print pointer to allocated GPU memory - return memory # Return the CuPy array - - def deallocate(self, memory, stream=None): - # Deallocate memory in CuPy - # CuPy handles memory automatically, so explicit deallocation is not necessary - print(f"Deallocating memory: {memory}") - del memory # Explicitly delete the CuPy array if needed - return True + def deallocate(self, ptr): + if ptr in self.allocations: + # Remove the reference to the CuPy memory object + # This will trigger deallocation if there are no other references + del self.allocations[ptr] + return True + return False def stablehlo_add(): @@ -62,7 +59,8 @@ def stablehlo_add(): if len(devices) == 0: return - allocator = AppGpuAllocator() + # Create an instance of the custom allocator + allocator = CupyGPUAllocator() session_options = runtime.RuntimeSessionOptions(num_devices=1, device_id=0) session = 
runtime.RuntimeSession(session_options, exe, gpu_allocator=allocator) diff --git a/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_dynamic.py b/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_dynamic.py index 35515e054..82c31a05a 100644 --- a/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_dynamic.py +++ b/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_dynamic.py @@ -115,7 +115,7 @@ def test_program(program: str, input_shape: Iterable[int], debug: bool = True): return session_options = runtime.RuntimeSessionOptions(num_devices=1, device_id=0) - session = runtime.RuntimeSession(session_options, exe) + session = runtime.RuntimeSession(session_options, exe, gpu_allocator=None) arg0 = client.create_memref( np.ones(input_shape, dtype=np.float32).data, From 2185f9e0f03062a4c9f91e66b1d1f2e925de4253 Mon Sep 17 00:00:00 2001 From: Jhalak Patel Date: Tue, 20 Aug 2024 11:26:48 -0700 Subject: [PATCH 4/6] Fix segmentaiton fault --- mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp | 11 +++++++++-- .../python/bindings/Runtime/RuntimePyBind.cpp | 11 +++++++++-- .../python/IntegrationTests/test_stablehlo_add.py | 2 +- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp b/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp index 74eac79c1..253817934 100644 --- a/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp +++ b/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp @@ -670,6 +670,11 @@ class GpuAllocatorWrapper : public GpuAllocator { // Static method to create a GpuAllocator from MTRT_GpuAllocator static std::unique_ptr create(MTRT_GpuAllocator gpuAllocator) { + if (!gpuAllocator.ptr || !gpuAllocator.allocate || + !gpuAllocator.deallocate) { + llvm::errs() << "Invalid MTRT_GpuAllocator passed to create()"; + return nullptr; + } return std::make_unique(gpuAllocator); } }; @@ -681,8 +686,10 @@ MTRT_Status mtrtRuntimeSessionCreate(MTRT_RuntimeSessionOptions options, RuntimeSessionOptions *cppOptions = unwrap(options); Executable *cppExecutable = unwrap(executable); - std::unique_ptr allocator = - gpuAllocator.ptr ? GpuAllocatorWrapper::create(gpuAllocator) : nullptr; + std::unique_ptr allocator; + if (gpuAllocator.ptr) { + allocator.reset(GpuAllocatorWrapper::create(gpuAllocator).release()); + } StatusOr> session = createRuntimeSessionWithLuaBackend(cppExecutable->getView(), diff --git a/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp b/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp index 5a4d10096..94ebfb2d4 100644 --- a/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp +++ b/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp @@ -191,6 +191,9 @@ class PyRuntimeValue : public PyMTRTWrapper { // routing. class PyGpuAllocator { public: + py::object pySelf; + PyGpuAllocator(py::object self) : pySelf(self) {} + virtual ~PyGpuAllocator() = default; virtual std::uintptr_t allocate(uint64_t size) = 0; virtual bool deallocate(std::uintptr_t ptr) = 0; @@ -201,6 +204,7 @@ class PyGpuAllocator { private: // Trampoline function: Routes C-style allocation calls to C++ virtual method. static void *pyGpuAllocatorAllocate(void *self, uint64_t size) { + py::gil_scoped_acquire acquire; auto *allocator = static_cast(self); std::uintptr_t ptr = allocator->allocate(size); return reinterpret_cast(ptr); @@ -209,6 +213,7 @@ class PyGpuAllocator { // Trampoline function: Routes C-style deallocation calls to C++ virtual // method. 
static bool pyGpuAllocatorDeallocate(void *self, void *memory) { + py::gil_scoped_acquire acquire; auto *allocator = static_cast(self); return allocator->deallocate(reinterpret_cast(memory)); } @@ -969,7 +974,8 @@ PYBIND11_MODULE(_api, m) { py::arg("nccl_uuid") = py::str("")); py::class_(m, "GpuAllocator") - .def(py::init<>()) + .def(py::init<>( + [](py::object self) { return new PyGpuAllocatorTrampoline(self); })) .def("allocate", &PyGpuAllocator::allocate) .def("deallocate", &PyGpuAllocator::deallocate) .def("get_capi_object", &PyGpuAllocator::getCApiObject); @@ -983,7 +989,8 @@ PYBIND11_MODULE(_api, m) { if (gpu_allocator.is_none()) { // Create session without custom allocator s = mtrtRuntimeSessionCreate( - options, exe, MTRT_GpuAllocator{nullptr}, &session); + options, exe, MTRT_GpuAllocator{nullptr, nullptr, nullptr}, + &session); } else { try { PyGpuAllocator &allocator = diff --git a/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py b/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py index 721f6960c..ebf31bde7 100644 --- a/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py +++ b/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py @@ -17,7 +17,7 @@ class CupyGPUAllocator(runtime.GpuAllocator): def __init__(self): - super().__init__() + super().__init__(self) self.allocations = {} # Keep track of allocations def allocate(self, size): From 4d377a1b6a3693b2e20ebbcdd50e5e594ad6a5d3 Mon Sep 17 00:00:00 2001 From: Jhalak Patel Date: Tue, 20 Aug 2024 16:53:44 -0700 Subject: [PATCH 5/6] Update allocate/deallocate interface --- .../include/mlir-executor-c/Runtime/Runtime.h | 23 +++++------ .../mlir-executor/Support/Allocators.h | 16 ++++++-- .../executor/lib/CAPI/Runtime/Runtime.cpp | 31 ++------------ .../Lua/Modules/TensorRT/TensorRTModule.cpp | 24 ++++------- .../executor/lib/Support/Allocators.cpp | 40 +++++++++++++++++-- .../python/bindings/Runtime/RuntimePyBind.cpp | 22 +++++----- .../IntegrationTests/test_stablehlo_add.py | 2 +- 7 files changed, 83 insertions(+), 75 deletions(-) diff --git a/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h b/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h index dada8ea67..e2a3fa88c 100644 --- a/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h +++ b/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h @@ -32,6 +32,8 @@ #include #include +#include "cuda_runtime.h" + #ifdef __cplusplus extern "C" { #endif @@ -323,24 +325,17 @@ mtrtScalarValueGetType(MTRT_ScalarValue scalar, MTRT_ScalarTypeCode *code); // MTRT_GpuAllocator //===----------------------------------------------------------------------===// +// Function pointer types for the allocate and deallocate callbacks. +typedef void *(*AllocateFunc)(void *self, uint64_t size, uint64_t alignment, uint32_t flags, cudaStream_t* stream); +typedef bool (*DeallocateFunc)(void *self, void *memory, cudaStream_t* stream); -// Function pointer types for the allocate and deallocate callbacks -typedef void* (*AllocateFunc)(void* self, uint64_t size); -typedef bool (*DeallocateFunc)(void* self, void* memory); - -// The MTRT_GpuAllocator struct typedef struct MTRT_GpuAllocator { - void* ptr; // Pointer to the implementation (PyGpuAllocatorTrampoline in our case) - AllocateFunc allocate; // Function pointer for allocation - DeallocateFunc deallocate; // Function pointer for deallocation + void *ptr; // Pointer to the implementation (PyGpuAllocatorTrampoline in our + // case.) 
+ AllocateFunc allocate; // Function pointer for allocation + DeallocateFunc deallocate; // Function pointer for deallocation } MTRT_GpuAllocator; -/// Checks nullity of `GpuAllocator`. -MTRT_CAPI_EXPORTED bool GpuAllocatorIsNull(MTRT_GpuAllocator gpuAllocator); - -MTRT_CAPI_EXPORTED MTRT_Status -GpuAllocatorDestroy(MTRT_GpuAllocator executable); - //===----------------------------------------------------------------------===// // MTRT_RuntimeSessionOptions //===----------------------------------------------------------------------===// diff --git a/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h b/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h index 393a5a091..536619ba7 100644 --- a/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h +++ b/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h @@ -36,16 +36,24 @@ class GpuAllocator { public: GpuAllocator() = default; virtual ~GpuAllocator() = default; - virtual void* allocate(uint64_t const size) { return nullptr; } - virtual bool deallocate(void *const memory) { return false; } + virtual void *allocate(uint64_t const size, uint64_t const alignment, + uint32_t flags, cudaStream_t* stream) { + return nullptr; + } + virtual bool deallocate(void *const memory, + cudaStream_t* stream) { + return false; + } }; class CustomTensorRTAllocator : public GpuAllocator { public: CustomTensorRTAllocator() = default; ~CustomTensorRTAllocator() = default; - void* allocate(uint64_t const size) override; - bool deallocate(void *const memory) override; + void *allocate(uint64_t const size, uint64_t const alignment, uint32_t flags, + cudaStream_t* stream) override; + bool deallocate(void *const memory, + cudaStream_t* stream) override; }; //===----------------------------------------------------------------------===// diff --git a/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp b/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp index 253817934..7e7eb73be 100644 --- a/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp +++ b/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp @@ -601,29 +601,6 @@ MTRT_ScalarValue mtrtRuntimeValueDynCastToScalar(MTRT_RuntimeValue v) { return wrap(static_cast(x)); } -//===----------------------------------------------------------------------===// -// MTRT_GpuAllocator -//===----------------------------------------------------------------------===// - -bool GpuAllocatorIsNull(MTRT_GpuAllocator gpuAllocator) { - return !gpuAllocator.ptr; -} - -MTRT_Status GpuAllocatorDestroy(MTRT_GpuAllocator gpuAllocator) { - // delete unwrap(gpuAllocator); - return mtrtStatusGetOk(); -} - -// TODO: Implement destroy method to release resources. 
-// void mtrtGpuAllocatorDestroy(MTRT_GpuAllocator* allocator) { -// if (allocator && allocator->ptr) { -// delete static_cast(allocator->ptr); -// allocator->ptr = nullptr; -// allocator->allocate = nullptr; -// allocator->deallocate = nullptr; -// } -// } - //===----------------------------------------------------------------------===// // MTRT_RuntimeSessionOptions //===----------------------------------------------------------------------===// @@ -660,12 +637,12 @@ class GpuAllocatorWrapper : public GpuAllocator { GpuAllocatorWrapper(MTRT_GpuAllocator gpuAllocator) : mPyGpuAllocator(gpuAllocator) {} - void *allocate(uint64_t size) override { - return mPyGpuAllocator.allocate(mPyGpuAllocator.ptr, size); + void *allocate(uint64_t size, uint64_t alignment, uint32_t flags, cudaStream_t* stream) override { + return mPyGpuAllocator.allocate(mPyGpuAllocator.ptr, size, alignment, flags, stream); } - bool deallocate(void *ptr) override { - return mPyGpuAllocator.deallocate(mPyGpuAllocator.ptr, ptr); + bool deallocate(void *ptr, cudaStream_t* stream) override { + return mPyGpuAllocator.deallocate(mPyGpuAllocator.ptr, ptr, stream); } // Static method to create a GpuAllocator from MTRT_GpuAllocator diff --git a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp index fed3a573e..1ae3b4440 100644 --- a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp +++ b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp @@ -68,29 +68,21 @@ class StdioLogger : public nvinfer1::ILogger { // TensorRTCallBackAllocator //===----------------------------------------------------------------------===// -class TensorRTCallBackAllocator final : public nvinfer1::IGpuAllocator { +class TensorRTCallBackAllocator final : public nvinfer1::IGpuAsyncAllocator { public: TensorRTCallBackAllocator(GpuAllocator *gpuAllocator) - : nvinfer1::IGpuAllocator(), mGpuAllocatorCallBack(gpuAllocator) {} + : nvinfer1::IGpuAsyncAllocator(), mGpuAllocatorCallBack(gpuAllocator) {} - void *allocate(uint64_t size, uint64_t alignment, - nvinfer1::AllocatorFlags flags) noexcept final { - return allocateAsync(size, alignment, flags, nullptr); - } - - bool deallocate(void *memory) noexcept final { - return deallocateAsync(memory, nullptr); - } - - void *allocateAsync(uint64_t const size, uint64_t const /*alignment*/, - uint32_t /*flags*/, cudaStream_t /*stream*/) noexcept final { - void* result = mGpuAllocatorCallBack->allocate(size); + void *allocateAsync(uint64_t const size, uint64_t const alignment, + uint32_t flags, cudaStream_t stream) noexcept final { + void *result = + mGpuAllocatorCallBack->allocate(size, alignment, flags, &stream); return result; } bool deallocateAsync(void *const memory, - cudaStream_t /*stream*/) noexcept override { - bool result = mGpuAllocatorCallBack->deallocate(memory); + cudaStream_t stream) noexcept override { + bool result = mGpuAllocatorCallBack->deallocate(memory, &stream); return result; } diff --git a/mlir-tensorrt/executor/lib/Support/Allocators.cpp b/mlir-tensorrt/executor/lib/Support/Allocators.cpp index 70f21b9f7..100cb0361 100644 --- a/mlir-tensorrt/executor/lib/Support/Allocators.cpp +++ b/mlir-tensorrt/executor/lib/Support/Allocators.cpp @@ -46,14 +46,46 @@ using namespace mlirtrt; // CustomTensorRTAllocator //===----------------------------------------------------------------------===// -void 
*CustomTensorRTAllocator::allocate(uint64_t const size) { + +void* +CustomTensorRTAllocator::allocate(uint64_t const size, uint64_t const alignment, + uint32_t /*flags*/, + cudaStream_t* stream) { uint8_t *memory; - cudaMalloc(reinterpret_cast(&memory), size); + assert(alignment > 0 && (alignment & (alignment - 1)) == 0 && + "Memory alignment has to be power of 2"); + if (stream && *stream != nullptr) { + auto status = cudaMallocAsync(reinterpret_cast(&memory), size, *stream); + assert(status == cudaSuccess); + MTRT_DBGF("[CustomTensorRTAllocator][allocate]: Asynchronously allocated %lx bytes at 0x%lx on stream %lx", size, + reinterpret_cast(memory), + reinterpret_cast(*stream)); + } else { + auto status = cudaMalloc(reinterpret_cast(&memory), size); + assert(status == cudaSuccess); + MTRT_DBGF("[CustomTensorRTAllocator][allocate]: Synchronously allocated %lx bytes at 0x%lx", size, + reinterpret_cast(memory)); + } + assert(reinterpret_cast(memory) % alignment == 0); return memory; } -bool CustomTensorRTAllocator::deallocate(void *const memory) { - cudaFree(memory); +bool CustomTensorRTAllocator::deallocate(void *const memory, + cudaStream_t* stream) { + if (stream && *stream != nullptr) { + MTRT_DBGF("[CustomTensorRTAllocator][deallocate]: Asynchronously freeing CUDA device memory 0x%lx on stream %lx", + reinterpret_cast(memory), + reinterpret_cast(*stream)); + cudaError_t status = cudaFreeAsync(memory, *stream); + assert(status == cudaSuccess); + } else { + MTRT_DBGF("[CustomTensorRTAllocator][deallocate]: Synchronously freeing CUDA device/pinned host memory 0x%lx ptr " + "on stream %lx", + reinterpret_cast(memory), + reinterpret_cast(*stream)); + cudaError_t status = cudaFree(memory); + assert(status == cudaSuccess); + } return true; } diff --git a/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp b/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp index 94ebfb2d4..3f45138c8 100644 --- a/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp +++ b/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp @@ -128,6 +128,7 @@ class PyStream : public PyMTRTWrapper { public: using Base::Base; DECLARE_WRAPPER_CONSTRUCTORS(PyStream); + static constexpr auto kMethodTable = CAPITable{ mtrtStreamIsNull, mtrtStreamDestroy, mtrtPythonCapsuleToStream, mtrtPythonStreamToCapsule}; @@ -195,7 +196,8 @@ class PyGpuAllocator { PyGpuAllocator(py::object self) : pySelf(self) {} virtual ~PyGpuAllocator() = default; - virtual std::uintptr_t allocate(uint64_t size) = 0; + virtual std::uintptr_t allocate(uint64_t size, uint64_t alignment, + uint32_t flags) = 0; virtual bool deallocate(std::uintptr_t ptr) = 0; // Creates a C-compatible struct for interfacing with lower-level APIs. @@ -203,16 +205,19 @@ class PyGpuAllocator { private: // Trampoline function: Routes C-style allocation calls to C++ virtual method. - static void *pyGpuAllocatorAllocate(void *self, uint64_t size) { + static void *pyGpuAllocatorAllocate(void *self, uint64_t size, + uint64_t alignment, uint32_t flags, + cudaStream_t* /*stream*/) { py::gil_scoped_acquire acquire; auto *allocator = static_cast(self); - std::uintptr_t ptr = allocator->allocate(size); + std::uintptr_t ptr = allocator->allocate(size, alignment, flags); return reinterpret_cast(ptr); } // Trampoline function: Routes C-style deallocation calls to C++ virtual // method. 
- static bool pyGpuAllocatorDeallocate(void *self, void *memory) { + static bool pyGpuAllocatorDeallocate(void *self, void *memory, + cudaStream_t* /*stream*/) { py::gil_scoped_acquire acquire; auto *allocator = static_cast(self); return allocator->deallocate(reinterpret_cast(memory)); @@ -237,12 +242,12 @@ class PyGpuAllocatorTrampoline : public PyGpuAllocator { // Trampoline for allocate: Dispatches call to Python implementation if // overridden. - uintptr_t allocate(uint64_t size) override { + uintptr_t allocate(uint64_t size, uint64_t alignment, uint32_t flags) override { PYBIND11_OVERRIDE_PURE(uintptr_t, // Return type PyGpuAllocator, // Parent class allocate, // Name of function in C++ - size // Arguments - ); + size, // Arguments + alignment, flags); } // Trampoline for deallocate: Dispatches call to Python implementation if @@ -251,8 +256,7 @@ class PyGpuAllocatorTrampoline : public PyGpuAllocator { PYBIND11_OVERRIDE_PURE(bool, // Return type PyGpuAllocator, // Parent class deallocate, // Name of function in C++ - ptr // Arguments - ); + ptr); // Arguments } }; diff --git a/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py b/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py index ebf31bde7..1fa932fec 100644 --- a/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py +++ b/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py @@ -20,7 +20,7 @@ def __init__(self): super().__init__(self) self.allocations = {} # Keep track of allocations - def allocate(self, size): + def allocate(self, size, alignment, flags): # Allocate memory on the GPU using CuPy mem = cp.cuda.alloc(size) ptr = int(mem.ptr) # Convert to integer From fc7ca69f826ce512a04ad76cb49340221cd071f0 Mon Sep 17 00:00:00 2001 From: Jhalak Patel Date: Wed, 21 Aug 2024 00:22:47 -0700 Subject: [PATCH 6/6] Add IOutputAllocator --- .../include/mlir-executor-c/Runtime/Runtime.h | 70 +++++-- .../include/mlir-executor/Runtime/API/API.h | 22 +++ .../Runtime/Backend/Lua/LuaRegistration.h | 4 +- .../Runtime/Backend/Lua/LuaRuntime.h | 10 +- .../Lua/Modules/TensorRT/TensorRTModule.h | 3 +- .../mlir-executor/Support/Allocators.h | 109 +++++++++++ .../executor/lib/CAPI/Runtime/Runtime.cpp | 89 ++++++++- .../executor/lib/Runtime/API/API.cpp | 6 +- .../lib/Runtime/Backend/Lua/LuaRuntime.cpp | 49 +++-- .../Lua/Modules/TensorRT/TensorRTModule.cpp | 183 ++++++++++++++++-- .../executor/lib/Support/Allocators.cpp | 67 +++++++ .../executor/lib/Tools/ExecutorRunnerMain.cpp | 5 +- .../python/bindings/Runtime/RuntimePyBind.cpp | 144 +++++++++++++- .../IntegrationTests/test_stablehlo_add.py | 37 +++- .../test_stablehlo_dynamic.py | 38 ++++ 15 files changed, 768 insertions(+), 68 deletions(-) diff --git a/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h b/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h index e2a3fa88c..22c87c56e 100644 --- a/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h +++ b/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h @@ -95,6 +95,50 @@ static inline bool mtrtDeviceIsNull(MTRT_Device device) { return !device.ptr; } /// arguments are optional in functions below. static inline MTRT_Device mtrtDeviceGetNull() { return MTRT_Device{nullptr}; } +//===----------------------------------------------------------------------===// +// MTRT_GpuAllocator +//===----------------------------------------------------------------------===// + +// Function pointer types for the allocate and deallocate callbacks. 
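+// Note: `stream` may be null or point to a null stream; in that case allocation
+// and deallocation are expected to happen synchronously (CustomTensorRTAllocator,
+// for example, falls back to cudaMalloc/cudaFree when no stream is provided).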
+typedef void *(*AllocateFunc)(void *self, uint64_t size, uint64_t alignment, uint32_t flags, cudaStream_t* stream); +typedef bool (*DeallocateFunc)(void *self, void *memory, cudaStream_t* stream); + +typedef struct MTRT_GpuAllocator { + void *ptr; // Pointer to the implementation (PyGpuAllocatorTrampoline in our + // case.) + // Function pointers to methods. + AllocateFunc allocate; + DeallocateFunc deallocate; +} MTRT_GpuAllocator; + +//===----------------------------------------------------------------------===// +// MTRT_OutputAllocator +//===----------------------------------------------------------------------===// + +// Function pointer types for the allocate and deallocate callbacks. +typedef void (*SetGpuAllocator)(void *self, MTRT_GpuAllocator gpuAllocator); +typedef void (*SetTensorName)(void *self, const char *tensorName); +typedef void (*SetCurrentMemory)(void *self, void *currentMemory); +typedef void (*SetOutputSize)(void *self, const int64_t outputSize); +typedef void *(*ReallocateOutputAsync)(void *self, char const *tensorName, + void *currentMemory, uint64_t size, + uint64_t alignment, + cudaStream_t *stream); +typedef void (*NotifyShape)(void *self, char const *tensorName, const int64_t *dims, + int64_t nbDims); + +typedef struct MTRT_OutputAllocator { + void *ptr; // Pointer to the implementation (PyOutputAllocatorTrampoline in + // our case.) + // Function pointers to methods. + SetGpuAllocator setGpuAllocator; + SetTensorName setTensorName; + SetCurrentMemory setCurrentMemory; + SetOutputSize setOutputSize; + ReallocateOutputAsync reallocateOutputAsync; + NotifyShape notifyShape; +} MTRT_OutputAllocator; + //===----------------------------------------------------------------------===// // MTRT_MemRefValue //===----------------------------------------------------------------------===// @@ -172,6 +216,9 @@ typedef struct MTRT_MemRefValueInfo { MLIR_CAPI_EXPORTED MTRT_Status mtrtMemRefValueGetInfo(MTRT_MemRefValue memref, MTRT_MemRefValueInfo *info); +MLIR_CAPI_EXPORTED MTRT_Status mtrtMemRefValueSetOutputAllocator( + MTRT_MemRefValue memrefValue, MTRT_OutputAllocator pyOutputAllocator); + /// Create DL Managed tensor from MemRefValue. MLIR_CAPI_EXPORTED MTRT_Status mtrtMemRefValueGetDLPackManagedTensor( MTRT_MemRefValue memrefValue, MTRT_DLPackManagedTensor *outTensor); @@ -321,21 +368,6 @@ mtrtScalarValueCastToRuntimeValue(MTRT_ScalarValue v); MLIR_CAPI_EXPORTED MTRT_Status mtrtScalarValueGetType(MTRT_ScalarValue scalar, MTRT_ScalarTypeCode *code); -//===----------------------------------------------------------------------===// -// MTRT_GpuAllocator -//===----------------------------------------------------------------------===// - -// Function pointer types for the allocate and deallocate callbacks. -typedef void *(*AllocateFunc)(void *self, uint64_t size, uint64_t alignment, uint32_t flags, cudaStream_t* stream); -typedef bool (*DeallocateFunc)(void *self, void *memory, cudaStream_t* stream); - -typedef struct MTRT_GpuAllocator { - void *ptr; // Pointer to the implementation (PyGpuAllocatorTrampoline in our - // case.) 
- AllocateFunc allocate; // Function pointer for allocation - DeallocateFunc deallocate; // Function pointer for deallocation -} MTRT_GpuAllocator; - //===----------------------------------------------------------------------===// // MTRT_RuntimeSessionOptions //===----------------------------------------------------------------------===// @@ -376,8 +408,8 @@ typedef struct MTRT_RuntimeSession { /// that the session only has a read-only view in to the Executable for code and /// constant data. Therefore the Executable must outlive the RuntimeSession. MLIR_CAPI_EXPORTED MTRT_Status mtrtRuntimeSessionCreate( - MTRT_RuntimeSessionOptions options, MTRT_Executable executable, MTRT_GpuAllocator allocator, - MTRT_RuntimeSession *result); + MTRT_RuntimeSessionOptions options, MTRT_Executable executable, + MTRT_GpuAllocator allocator, MTRT_RuntimeSession *result); /// Destory the session. This does not destroy the associated Executable, which /// may be shared among many sessions. @@ -389,6 +421,10 @@ static inline bool mtrtRuntimeSessionIsNull(MTRT_RuntimeSession session) { return !session.ptr; } +MLIR_CAPI_EXPORTED MTRT_Status mtrtAddMemRefOutputAllocatorSessionRegistry( + MTRT_MemRefValue memrefValue, + MTRT_OutputAllocator pyOutputAllocator); + /// Using `session`, execute the pubic function with the specified name. /// The `inArgs` and `outArgs` are arrays for input arguments and destination /// arguments, respectively. Input arguments may be MemRefs or scalars, but diff --git a/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h b/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h index 70384c60d..d48b80d51 100644 --- a/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h +++ b/mlir-tensorrt/executor/include/mlir-executor/Runtime/API/API.h @@ -667,6 +667,12 @@ class MemRefValue : public RuntimeValue { return v->getKind() == Kind::MemRef; } + void setOutputAllocator(OutputAllocator* _outputAllocator) { + outputAllocator = _outputAllocator; + } + + OutputAllocator *getOutputAllocator() { return outputAllocator; } + const std::optional &getScalarType() const { return scalarType; } RuntimeClient *getClient() { return client; } @@ -691,6 +697,7 @@ class MemRefValue : public RuntimeValue { /// address. 
std::optional device; std::optional scalarType{}; + OutputAllocator *outputAllocator{nullptr}; }; //===----------------------------------------------------------------------===// @@ -868,6 +875,7 @@ class RuntimeSession { std::unique_ptr pinnedMemoryAllocator, std::unique_ptr allocTracker, std::unique_ptr resourceTracker, + std::unique_ptr outputAllocatorTracker, std::unique_ptr gpuAllocator); ExecutableView getExecutable() const { return executable; } @@ -882,6 +890,10 @@ class RuntimeSession { ResourceTracker &getResourceTracker() { return *resourceTracker; } + OutputAllocatorTracker &getOutputAllocatorTracker() { + return *outputAllocatorTracker; + } + GpuAllocator &getGpuAllocator() { return *gpuAllocator; } private: @@ -891,6 +903,7 @@ class RuntimeSession { std::unique_ptr pinnedMemoryAllocator; std::unique_ptr allocTracker; std::unique_ptr resourceTracker; + std::unique_ptr outputAllocatorTracker; std::unique_ptr gpuAllocator; sol::state state; }; @@ -973,6 +986,14 @@ class RuntimeClient { return pinnedMemoryAllocator; } + void addOutputAllocator(std::unique_ptr outputAllocator) { + outputAllocators.emplace_back(std::move(outputAllocator)); + } + + OutputAllocator* getLastOutputAllocator() { + return outputAllocators.back().get(); + } + private: RuntimeClient(llvm::SmallVector> devices) : devices(std::move(devices)) {} @@ -981,6 +1002,7 @@ class RuntimeClient { PinnedMemoryAllocator pinnedMemoryAllocator; AllocTracker allocTracker; ResourceTracker resourceTracker; + std::vector> outputAllocators; }; //===----------------------------------------------------------------------===// diff --git a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRegistration.h b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRegistration.h index 922e964d4..9dd689de8 100644 --- a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRegistration.h +++ b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRegistration.h @@ -37,6 +37,8 @@ void registerLuaRuntimeMethods(lua_State *state, const RuntimeSessionOptions &options, PinnedMemoryAllocator *pinnedMemoryAllocator, AllocTracker *allocTracker, - ResourceTracker *resourceTracker, GpuAllocator* allocator); + ResourceTracker *resourceTracker, + OutputAllocatorTracker *outputAllocatorTracker, + GpuAllocator *allocator); } // namespace mlirtrt::runtime diff --git a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRuntime.h b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRuntime.h index d4f07f13a..e7251580f 100644 --- a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRuntime.h +++ b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/LuaRuntime.h @@ -36,7 +36,8 @@ namespace mlirtrt::runtime { /// `main` function. It is assumed that `main` takes no arguments and returns an /// integer result (which is returned if the execution is successful). /// TODO: this should take a handle to a function for streaming output/errors. -StatusOr runExecutorLuaScript(std::string_view luaScript, GpuAllocator* allocator); +StatusOr runExecutorLuaScript(std::string_view luaScript, + GpuAllocator *allocator); /// Synchronously run a serialized executor Executable one time. An `Executable` /// is essentially a Lua script packaged with metadata and serialized constants @@ -48,12 +49,15 @@ StatusOr runExecutorLuaScript(std::string_view luaScript, GpuAllocator* /// execution is successful). 
/// TODO: this should take a handle to a function for /// streaming output/errors. -StatusOr runExecutorExecutable(std::unique_ptr executable, std::unique_ptr allocator); +StatusOr +runExecutorExecutable(std::unique_ptr executable, + std::unique_ptr allocator); /// Create an execution state. This will setup a Lua environment and invoke /// global initialization. StatusOr> -createRuntimeSessionWithLuaBackend(ExecutableView executable, std::unique_ptr allocator, +createRuntimeSessionWithLuaBackend(ExecutableView executable, + std::unique_ptr allocator, const RuntimeSessionOptions &options); /// Set the primary stream for the loaded executable to use. diff --git a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.h b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.h index 1ceb91367..54655ddf7 100644 --- a/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.h +++ b/mlir-tensorrt/executor/include/mlir-executor/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.h @@ -37,7 +37,8 @@ class ResourceTracker; /// Lua state. void registerExecutorTensorRTModuleLuaRuntimeMethods( lua_State *luaState, PinnedMemoryAllocator *pinnedMemoryAllocator, - AllocTracker *allocTracker, ResourceTracker *resourceTracker, GpuAllocator* allocator); + AllocTracker *allocTracker, ResourceTracker *resourceTracker, + OutputAllocatorTracker *outputAllocatorTracker, GpuAllocator *allocator); } // namespace mlirtrt::runtime diff --git a/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h b/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h index 536619ba7..054bbcf04 100644 --- a/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h +++ b/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h @@ -32,6 +32,10 @@ namespace mlirtrt { struct EventPool; +//===----------------------------------------------------------------------===// +// GpuAllocator and CustomTensorRTAllocator +//===----------------------------------------------------------------------===// + class GpuAllocator { public: GpuAllocator() = default; @@ -56,6 +60,111 @@ class CustomTensorRTAllocator : public GpuAllocator { cudaStream_t* stream) override; }; +//===----------------------------------------------------------------------===// +// OutputAllocator and CustomTensorRTOuputAllocator +//===----------------------------------------------------------------------===// + +//! +//! Class to allocate memory for outputs with data-dependent shapes. The sizes +//! of those are unknown so pre-allocation is not possible. +//! 
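+//! Implementations back TensorRT's nvinfer1::IOutputAllocator: the runtime
+//! forwards its reallocateOutput and notifyShape callbacks here once the real
+//! output size and shape become known at enqueue time.
+//!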
+class OutputAllocator { +public: + virtual ~OutputAllocator() = default; + virtual void setGpuAllocator(GpuAllocator* gpuAllocator) = 0; + virtual void setTensorName(const char *tensorName) = 0; + virtual void setCurrentMemory(void *currentMemory) = 0; + virtual void setOutputSize(const int64_t outputSize) = 0; + virtual void *reallocateOutputAsync(char const *tensorName, + void *currentMemory, uint64_t size, + uint64_t alignment, + cudaStream_t * /*stream*/) = 0; + virtual void notifyShape(char const *tensorName, const int64_t *dims, + int64_t nbDims) = 0; +}; + +class CustomTensorRTOuputAllocator : public OutputAllocator { +public: + CustomTensorRTOuputAllocator() = default; + ~CustomTensorRTOuputAllocator() { + if (mOutputPtr != nullptr) { + cudaFree(mOutputPtr); + } + } + + void setGpuAllocator(GpuAllocator* gpuAllocator) override { + mGpuAllocator = gpuAllocator; + } + + //! Methods are called just after construction. TODO: can they be called + //! during construction? + void setTensorName(const char *tensorName) override { + mTensorName = tensorName; + } + + void setCurrentMemory(void *currentMemory) override { + mCurrentMemory = currentMemory; + } + + void setOutputSize(int64_t outputSize) override { mOutputSize = outputSize; } + + void *reallocateOutputAsync(char const *tensorName, void *currentMemory, + uint64_t size, uint64_t alignment, + cudaStream_t * /*stream*/) override; + + void notifyShape(char const *tensorName, const int64_t *dims, + int64_t nbDims) override; + + //! nullptr if memory could not be allocated + void *mOutputPtr{nullptr}; + + //! Size of allocation pointed to by output. + uint64_t mOutputSize{0}; + + bool mReallocateOutputCalled{false}; + + bool mNotifyShapeCalled{false}; + + //! Dimensions of tensor. + std::vector mOutputDims; + +private: + GpuAllocator* mGpuAllocator; + const char *mTensorName; + void *mCurrentMemory; +}; + +class OutputAllocatorTracker { +public: + OutputAllocatorTracker() = default; + ~OutputAllocatorTracker() = default; + + OutputAllocatorTracker(const OutputAllocatorTracker &) = delete; + OutputAllocatorTracker &operator=(const OutputAllocatorTracker &) = delete; + OutputAllocatorTracker(OutputAllocatorTracker &&) = default; + OutputAllocatorTracker &operator=(OutputAllocatorTracker &&) = default; + + // Add a new OutputAllocator + void addAllocator(void *ptr, OutputAllocator *allocator) { + mOutputAllocatorRegistry.emplace_back(std::make_pair(ptr, allocator)); + } + + // Get a reference to an OutputAllocator + OutputAllocator *getAllocator(void *ptr) { + auto it = std::find_if( + mOutputAllocatorRegistry.begin(), mOutputAllocatorRegistry.end(), + [ptr](const auto &pair) { return pair.first == ptr; }); + + if (it != mOutputAllocatorRegistry.end()) { + return it->second; + } + return nullptr; + } + +private: + std::vector> mOutputAllocatorRegistry; +}; + //===----------------------------------------------------------------------===// // PoolTrackedCudaEvent //===----------------------------------------------------------------------===// diff --git a/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp b/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp index 7e7eb73be..abd3b06c7 100644 --- a/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp +++ b/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp @@ -415,7 +415,64 @@ static void dlpackManagedTensorDeleter(DLManagedTensor *tensor) { } } -MLIR_CAPI_EXPORTED MTRT_Status mtrtMemRefValueGetDLPackManagedTensor( + +class OutputAllocatorWrapper : public OutputAllocator { +private: + 
MTRT_OutputAllocator mPyOutputAllocator; + +public: + OutputAllocatorWrapper(MTRT_OutputAllocator outputAllocator) + : mPyOutputAllocator(outputAllocator) {} + + void setGpuAllocator(GpuAllocator *gpuAllocator) override { + return mPyOutputAllocator.setGpuAllocator( + mPyOutputAllocator.ptr, + MTRT_GpuAllocator{gpuAllocator, nullptr, nullptr}); + } + + void setTensorName(const char *tensorName) override { + return mPyOutputAllocator.setTensorName(mPyOutputAllocator.ptr, tensorName); + } + + void setCurrentMemory(void *currentMemory) override { + return mPyOutputAllocator.setCurrentMemory(mPyOutputAllocator.ptr, + currentMemory); + } + + void setOutputSize(const int64_t outputSize) override { + return mPyOutputAllocator.setOutputSize(mPyOutputAllocator.ptr, outputSize); + } + + void *reallocateOutputAsync(char const *tensorName, void *currentMemory, + uint64_t size, uint64_t alignment, + cudaStream_t *stream) override { + return mPyOutputAllocator.reallocateOutputAsync(mPyOutputAllocator.ptr, + tensorName, currentMemory, + size, alignment, stream); + } + + void notifyShape(char const *tensorName, const int64_t *dims, + int64_t nbDims) override { + return mPyOutputAllocator.notifyShape(mPyOutputAllocator.ptr, tensorName, + dims, nbDims); + } + + // Static method to create a OutputAllocator from MTRT_OutputAllocator + static std::unique_ptr + create(MTRT_OutputAllocator outputAllocator) { + if (!outputAllocator.ptr || !outputAllocator.setGpuAllocator || + !outputAllocator.setTensorName || !outputAllocator.setCurrentMemory || + !outputAllocator.setOutputSize || + !outputAllocator.reallocateOutputAsync || + !outputAllocator.notifyShape) { + llvm::errs() << "Invalid MTRT_OutputAllocator passed to create()"; + return nullptr; + } + return std::make_unique(outputAllocator); + } +}; + +MTRT_Status mtrtMemRefValueGetDLPackManagedTensor( MTRT_MemRefValue memrefValue, MTRT_DLPackManagedTensor *outTensor) { MemRefValue memref = *unwrap(memrefValue); @@ -462,7 +519,7 @@ MLIR_CAPI_EXPORTED MTRT_Status mtrtMemRefValueGetDLPackManagedTensor( return mtrtStatusGetOk(); } -MLIR_CAPI_EXPORTED MTRT_Status mtrtMemRefValueGetDLPackDevice( +MTRT_Status mtrtMemRefValueGetDLPackDevice( MTRT_MemRefValue memrefValue, int32_t *device_type, int32_t *device_id) { MemRefValue memref = *unwrap(memrefValue); int device = memref.getDevice().has_value() @@ -626,6 +683,25 @@ mtrtRuntimeSessionOptionsDestroy(MTRT_RuntimeSessionOptions options) { // MTRT_RuntimeSession //===----------------------------------------------------------------------===// +MTRT_Status mtrtAddMemRefOutputAllocatorSessionRegistry( + MTRT_MemRefValue memrefValue, MTRT_OutputAllocator pyOutputAllocator) { + auto memref = unwrap(memrefValue); + + std::unique_ptr outputAllocator; + if (pyOutputAllocator.ptr) { + outputAllocator.reset( + OutputAllocatorWrapper::create(pyOutputAllocator).release()); + } + + // Client should own the output allocator. + memref->getClient()->addOutputAllocator(std::move(outputAllocator)); + + // Store the output allocator reference. + memref->setOutputAllocator(memref->getClient()->getLastOutputAllocator()); + + return mtrtStatusGetOk(); +} + // A wrapper class for MTRT_GpuAllocator implementing the GpuAllocator // interface. It encapsulates GPU memory allocation and deallocation operations, // ensuring correct routing of callbacks from C++ to Python. 
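For context, and not part of the patch itself: a C-API client has to populate every field of MTRT_OutputAllocator before handing it to the runtime, since the wrapper above rejects structs with null callbacks. The following is a minimal sketch of how that could look; `MyOutputState`, the growth policy, and the include paths are illustrative assumptions rather than code from this change.

#include <cstdint>
#include <vector>
#include "cuda_runtime.h"
#include "mlir-executor-c/Runtime/Runtime.h" // assumed include path

struct MyOutputState {            // hypothetical caller-owned per-tensor state
  void *mem = nullptr;
  uint64_t size = 0;
  std::vector<int64_t> shape;
};

static void mySetGpuAllocator(void *, MTRT_GpuAllocator) {}
static void mySetTensorName(void *, const char *) {}
static void mySetCurrentMemory(void *self, void *mem) {
  static_cast<MyOutputState *>(self)->mem = mem;
}
static void mySetOutputSize(void *self, const int64_t size) {
  static_cast<MyOutputState *>(self)->size = size;
}

static void *myReallocateOutputAsync(void *self, const char *, void *currentMemory,
                                     uint64_t size, uint64_t /*alignment*/,
                                     cudaStream_t *stream) {
  auto *state = static_cast<MyOutputState *>(self);
  if (size <= state->size)            // existing buffer is already large enough
    return state->mem ? state->mem : currentMemory;
  void *newMem = nullptr;
  cudaError_t err = (stream && *stream) ? cudaMallocAsync(&newMem, size, *stream)
                                        : cudaMalloc(&newMem, size);
  if (err != cudaSuccess)
    return nullptr;                   // null signals allocation failure
  // (a real implementation would also free the previous buffer here)
  state->mem = newMem;
  state->size = size;
  return newMem;
}

static void myNotifyShape(void *self, const char *, const int64_t *dims,
                          int64_t nbDims) {
  static_cast<MyOutputState *>(self)->shape.assign(dims, dims + nbDims);
}

MTRT_OutputAllocator makeOutputAllocator(MyOutputState *state) {
  MTRT_OutputAllocator cb;
  cb.ptr = state;                     // opaque `self` passed back to each callback
  cb.setGpuAllocator = mySetGpuAllocator;
  cb.setTensorName = mySetTensorName;
  cb.setCurrentMemory = mySetCurrentMemory;
  cb.setOutputSize = mySetOutputSize;
  cb.reallocateOutputAsync = myReallocateOutputAsync;
  cb.notifyShape = myNotifyShape;
  return cb;
}

A memref intended to receive a data-dependent-shape result would then be registered via mtrtAddMemRefOutputAllocatorSessionRegistry(memref, makeOutputAllocator(&state)) before the function is executed.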
@@ -656,10 +732,11 @@ class GpuAllocatorWrapper : public GpuAllocator { } }; -MTRT_Status mtrtRuntimeSessionCreate(MTRT_RuntimeSessionOptions options, - MTRT_Executable executable, - MTRT_GpuAllocator gpuAllocator, - MTRT_RuntimeSession *result) { +MTRT_Status +mtrtRuntimeSessionCreate(MTRT_RuntimeSessionOptions options, + MTRT_Executable executable, + MTRT_GpuAllocator gpuAllocator, + MTRT_RuntimeSession *result) { RuntimeSessionOptions *cppOptions = unwrap(options); Executable *cppExecutable = unwrap(executable); diff --git a/mlir-tensorrt/executor/lib/Runtime/API/API.cpp b/mlir-tensorrt/executor/lib/Runtime/API/API.cpp index 52b02f72a..583dc344b 100644 --- a/mlir-tensorrt/executor/lib/Runtime/API/API.cpp +++ b/mlir-tensorrt/executor/lib/Runtime/API/API.cpp @@ -358,12 +358,14 @@ RuntimeSession::RuntimeSession( std::unique_ptr pinnedMemoryAllocator, std::unique_ptr allocTracker, std::unique_ptr resourceTracker, + std::unique_ptr outputAllocatorTracker, std::unique_ptr gpuAllocator) : options(std::move(options)), executable(exe), pinnedMemoryAllocator(std::move(pinnedMemoryAllocator)), allocTracker(std::move(allocTracker)), - resourceTracker(std::move(resourceTracker)), gpuAllocator(std::move(gpuAllocator)), - state(std::move(state)) {} + resourceTracker(std::move(resourceTracker)), + outputAllocatorTracker(std::move(outputAllocatorTracker)), + gpuAllocator(std::move(gpuAllocator)), state(std::move(state)) {} //===----------------------------------------------------------------------===// // AllocTracker diff --git a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp index 17af64a91..81897d8c2 100644 --- a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp +++ b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/LuaRuntime.cpp @@ -72,7 +72,8 @@ static void registerDefaultDeviceDependentMethods(lua_State *state, static void registerLuaRuntimeMethodsCommon( lua_State *state, PinnedMemoryAllocator *pinnedMemoryAllocator, - AllocTracker *allocTracker, ResourceTracker *resourceTracker, GpuAllocator* allocator) { + AllocTracker *allocTracker, ResourceTracker *resourceTracker, + GpuAllocator *allocator, OutputAllocatorTracker *outputAllocatorTracker) { registerExecutorCoreModuleLuaRuntimeMethods(state, pinnedMemoryAllocator, allocTracker); registerExecutorCUDAModuleLuaRuntimeMethods( @@ -84,15 +85,15 @@ static void registerLuaRuntimeMethodsCommon( #endif registerExecutorTensorRTModuleLuaRuntimeMethods( - state, pinnedMemoryAllocator, allocTracker, resourceTracker, allocator); + state, pinnedMemoryAllocator, allocTracker, resourceTracker, outputAllocatorTracker, allocator); } void mlirtrt::runtime::registerLuaRuntimeMethods( lua_State *state, const RuntimeSessionOptions &options, PinnedMemoryAllocator *pinnedMemoryAllocator, AllocTracker *allocTracker, - ResourceTracker *resourceTracker, GpuAllocator* allocator) { + ResourceTracker *resourceTracker, OutputAllocatorTracker* outputAllocatorTracker, GpuAllocator* allocator) { registerLuaRuntimeMethodsCommon(state, pinnedMemoryAllocator, allocTracker, - resourceTracker, allocator); + resourceTracker, allocator, outputAllocatorTracker); #ifdef MLIR_EXECUTOR_ENABLE_NCCL registerExecutorNCCLModuleLuaRuntimeMethods(state, resourceTracker); registerDeviceDependentNCCLMethods(state, options.getNumDevices(), @@ -107,8 +108,8 @@ void mlirtrt::runtime::registerLuaRuntimeMethods( #endif } -StatusOr -mlirtrt::runtime::runExecutorLuaScript(std::string_view luaScript, GpuAllocator* 
allocator) { +StatusOr mlirtrt::runtime::runExecutorLuaScript( + std::string_view luaScript, GpuAllocator *allocator) { ADD_RUNTIME_MODULE_RANGE("runtime_runExecutorLuaScript"); StatusOr> client = RuntimeClient::create(); @@ -117,10 +118,11 @@ mlirtrt::runtime::runExecutorLuaScript(std::string_view luaScript, GpuAllocator* sol::state lua; lua.open_libraries(sol::lib::base, sol::lib::string); - registerLuaRuntimeMethods(lua.lua_state(), RuntimeSessionOptions(), - &(*client)->getPinnedMemorAllocator(), - &(*client)->getAllocTracker(), - &(*client)->getResourceTracker(), allocator); + registerLuaRuntimeMethods( + lua.lua_state(), RuntimeSessionOptions(), + &(*client)->getPinnedMemorAllocator(), &(*client)->getAllocTracker(), + &(*client)->getResourceTracker(), nullptr /* Output allocator */, + allocator /* can this be nullptr as well */); sol::protected_function_result result = lua.script(luaScript); if (!result.valid()) { @@ -171,7 +173,8 @@ static Status maybeCheckForValidNcclUuid(const RuntimeSessionOptions &options) { /// global initialization. StatusOr> mlirtrt::runtime::createRuntimeSessionWithLuaBackend( - ExecutableView executable, std::unique_ptr allocator, const RuntimeSessionOptions &options) { + ExecutableView executable, std::unique_ptr allocator, + const RuntimeSessionOptions &options) { ADD_RUNTIME_MODULE_RANGE("runtime_loadExecutable"); MTRT_RETURN_IF_ERROR(maybeCheckForValidNcclUuid(options)); @@ -179,12 +182,13 @@ mlirtrt::runtime::createRuntimeSessionWithLuaBackend( auto pinnedMemoryAllocator = std::make_unique(); auto allocTracker = std::make_unique(); auto resourceTracker = std::make_unique(); + auto outputAllocatorTracker = std::make_unique(); sol::state lua; lua.open_libraries(sol::lib::base, sol::lib::string); - registerLuaRuntimeMethods(lua.lua_state(), options, - pinnedMemoryAllocator.get(), allocTracker.get(), - resourceTracker.get(), allocator.get()); + registerLuaRuntimeMethods( + lua.lua_state(), options, pinnedMemoryAllocator.get(), allocTracker.get(), + resourceTracker.get(), outputAllocatorTracker.get(), allocator.get()); // Load globals into the context. // TODO: eliminate this copy, we already own the executable. @@ -225,11 +229,13 @@ mlirtrt::runtime::createRuntimeSessionWithLuaBackend( } return std::make_unique( options, executable, std::move(lua), std::move(pinnedMemoryAllocator), - std::move(allocTracker), std::move(resourceTracker), std::move(allocator)); + std::move(allocTracker), std::move(resourceTracker), + std::move(outputAllocatorTracker), std::move(allocator)); } StatusOr mlirtrt::runtime::runExecutorExecutable( - std::unique_ptr executable, std::unique_ptr allocator) { + std::unique_ptr executable, + std::unique_ptr allocator) { StatusOr> client = RuntimeClient::create(); if (!client.isOk()) @@ -245,7 +251,8 @@ StatusOr mlirtrt::runtime::runExecutorExecutable( return options.getStatus(); StatusOr> session = - createRuntimeSessionWithLuaBackend(executable->getView(), std::move(allocator), *options); + createRuntimeSessionWithLuaBackend(executable->getView(), + std::move(allocator), *options); if (!session.isOk()) return session.getStatus(); @@ -465,6 +472,8 @@ runtime::executeFunctionWithLuaBackend( // Call the main function, if present. 
sol::state_view lua(session.getLuaState()); AllocTracker &tracker = session.getAllocTracker(); + OutputAllocatorTracker &outputAllocatorTracker = session.getOutputAllocatorTracker(); + sol::protected_function funcObj = lua[name]; if (funcObj.get_type() != sol::type::function) return getStatusWithMsg(StatusCode::InternalError, "no function named \"", @@ -523,6 +532,12 @@ runtime::executeFunctionWithLuaBackend( for (auto [idx, rv] : llvm::enumerate(outputArgs)) { if (MemRefValue *memref = llvm::dyn_cast(rv)) { MTRT_RETURN_IF_ERROR(pushMemRefTableArg(lua, tracker, args, *memref)); + + // Creating a mapping from memref pointer to output allocator tracker. + if (memref->getOutputAllocator()) { + outputAllocatorTracker.addAllocator(memref->getVoidPtr(), memref->getOutputAllocator()); + } + continue; } return getInvalidArgStatus("output (destination) argument #{0} to function " diff --git a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp index 1ae3b4440..1aed00592 100644 --- a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp +++ b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp @@ -64,6 +64,105 @@ class StdioLogger : public nvinfer1::ILogger { bool verbose; }; +//===----------------------------------------------------------------------===// +// TensorRTCallBackOutputAllocator +//===----------------------------------------------------------------------===// + +static bool isSubByte(nvinfer1::DataType t) { + return t == nvinfer1::DataType::kINT4; +} + +static int32_t elementSizeInBits(nvinfer1::DataType t) { + switch (t) { + case nvinfer1::DataType::kINT64: + return 64; + case nvinfer1::DataType::kINT32: + return 32; + case nvinfer1::DataType::kFLOAT: + return 32; + case nvinfer1::DataType::kHALF: + return 16; + case nvinfer1::DataType::kBF16: + return 16; + case nvinfer1::DataType::kINT8: + return 8; + case nvinfer1::DataType::kBOOL: + return 8; + case nvinfer1::DataType::kUINT8: + return 8; + case nvinfer1::DataType::kFP8: + return 8; + case nvinfer1::DataType::kINT4: + return 4; + } + return 0; +} + +static int32_t elementeSizeInBytes(nvinfer1::DataType dtype) { + if (!isSubByte(dtype)) { + auto bits = elementSizeInBits(dtype); + assert(bits % 8 == 0); + return bits / 8; + } + if (dtype == nvinfer1::DataType::kINT4) { + return 1; + } + return -1; +} + +static int64_t volume(nvinfer1::Dims64 const& d) +{ + int64_t v = 1; + for (int64_t i = 0; i < d.nbDims; i++) + { + v *= d.d[i]; + } + return v; +} + +class TensorRTCallBackOutputAllocator final + : public nvinfer1::IOutputAllocator { +public: + TensorRTCallBackOutputAllocator(GpuAllocator* gpuAllocator, OutputAllocator *outputAllocator, + const char *tensorName, void *currentMemory, + nvinfer1::Dims64 dims, + nvinfer1::DataType dtype) + : nvinfer1::IOutputAllocator(), + mOutputAllocatorCallBack(outputAllocator) { + mOutputAllocatorCallBack->setGpuAllocator(gpuAllocator); + mOutputAllocatorCallBack->setTensorName(tensorName); + mOutputAllocatorCallBack->setCurrentMemory(currentMemory); + mOutputAllocatorCallBack->setOutputSize(volume(dims) * + elementeSizeInBytes(dtype)); + } + + void *reallocateOutput(char const *tensorName, void *currentMemory, + uint64_t size, uint64_t alignment) noexcept override { + return mOutputAllocatorCallBack->reallocateOutputAsync( + tensorName, currentMemory, size, alignment, nullptr); + } + + //! 
IMirroredBuffer does not implement Async allocation, hence this is just a + //! wrap around + void *reallocateOutputAsync(char const *tensorName, void *currentMemory, + uint64_t size, uint64_t alignment, + cudaStream_t stream) noexcept override { + + return mOutputAllocatorCallBack->reallocateOutputAsync( + tensorName, currentMemory, size, alignment, &stream); + } + + void notifyShape(char const *tensorName, + nvinfer1::Dims const &dims) noexcept override { + return mOutputAllocatorCallBack->notifyShape(tensorName, &dims.d[0], dims.nbDims); + } + + ~TensorRTCallBackOutputAllocator() override {} + +private: + OutputAllocator *mOutputAllocatorCallBack; +}; + //===----------------------------------------------------------------------===// // TensorRTCallBackAllocator //===----------------------------------------------------------------------===// @@ -125,9 +224,11 @@ class NvInferRuntimeWrapper { }); // GpuAllocator is optional. if (gpuAllocator) { - callbackAllocator = std::shared_ptr( - new TensorRTCallBackAllocator(gpuAllocator)); - runtime->setGpuAllocator(callbackAllocator.get()); + callbackAllocatorPair = + std::make_pair(std::shared_ptr( + new TensorRTCallBackAllocator(gpuAllocator)), + gpuAllocator); + runtime->setGpuAllocator(callbackAllocatorPair.first.get()); } } @@ -135,7 +236,7 @@ class NvInferRuntimeWrapper { nvinfer1::IRuntime *operator->() { return runtime.get(); } std::shared_ptr runtime; - std::shared_ptr callbackAllocator; + std::pair, GpuAllocator*> callbackAllocatorPair; }; class NvInferEngineWrapper { @@ -234,6 +335,22 @@ class NvInferExecContextWrapper { /// Returned the pre-allocated host staging buffers. std::vector &getHostIOBuffers() { return hostIOBuffers; } + /// Add a call back output allocator. + void addCallBackAllocators( + std::unique_ptr allocator) { + outputAllocators.emplace_back(std::move(allocator)); + } + + /// Return the last call back output allocator pointer. + TensorRTCallBackOutputAllocator *getLastCallBackAllocatorPtr() { + return outputAllocators.back().get(); + } + + /// Return registered callback gpu allocator. + GpuAllocator *getGpuAllocator() { + return engine->runtime->callbackAllocatorPair.second; + } + private: // We keep a reference to the cuda engine to keep it from going out of scope. // The standard TensorRTRuntime-to-Executor lowering only creates globals for @@ -247,13 +364,14 @@ class NvInferExecContextWrapper { /// A set of pinned host buffers one per input host buffer (shape tensor) to /// the TRT network. 
 std::vector hostIOBuffers;
+  std::vector<std::unique_ptr<TensorRTCallBackOutputAllocator>> outputAllocators;
 };
 } // namespace
 
-static Status setTensorAddressesOrReport(
+static Status setTensorAddressesAndOutputAllocatorsOrReport(
     NvInferExecContextWrapper &context,
     const std::vector>
-        &buffers) {
+        &buffers, OutputAllocatorTracker &outputAllocatorTracker) {
   ADD_TENSORRT_MODULE_RANGE("set_tensor_addresses");
   unsigned idx = 0;
   for (auto &[name, ptr, dims] : buffers) {
@@ -266,9 +384,10 @@
     bool result =
         context->setTensorAddress(name.c_str(), reinterpret_cast<void *>(ptr));
 
+    const nvinfer1::ICudaEngine &engine = context->getEngine();
+
     if (!result) {
       std::stringstream ss;
-      const nvinfer1::ICudaEngine &engine = context->getEngine();
       ss << "Failed to set tensor address for IO tensor: " << name
          << " at position " << idx << "; the IO tensors are:\n";
       for (int64_t i = 0; i < engine.getNbIOTensors(); i++) {
@@ -289,6 +408,37 @@
       return getInternalErrorStatus("failed to set input shape");
     }
 
+    // Set output allocators for device-resident output tensors.
+    if (engine.getTensorIOMode(name.c_str()) ==
+            nvinfer1::TensorIOMode::kOUTPUT &&
+        engine.getTensorLocation(name.c_str()) ==
+            nvinfer1::TensorLocation::kDEVICE) {
+
+      // Setting an output allocator is optional; only do so if one is registered for this pointer.
+      if (outputAllocatorTracker.getAllocator(reinterpret_cast<void *>(ptr)) !=
+          nullptr) {
+        context.addCallBackAllocators(
+            std::make_unique<TensorRTCallBackOutputAllocator>(
+                context.getGpuAllocator(),
+                outputAllocatorTracker.getAllocator(
+                    reinterpret_cast<void *>(ptr)),
+                name.c_str(), reinterpret_cast<void *>(ptr), dims,
+                engine.getTensorDataType(name.c_str())));
+        context->setOutputAllocator(name.c_str(),
+                                    static_cast<nvinfer1::IOutputAllocator *>(
+                                        context.getLastCallBackAllocatorPtr()));
+      } else {
+        // A previous call with the same output name but a different memref
+        // pointer may have set an output allocator. Because of the naming
+        // scheme, outputs are always named "result0", "result1", and so on.
+        // If no tracker entry is found for the given pointer, unset the
+        // output allocator.
+        if (context->getOutputAllocator(name.c_str())) {
+          context->setOutputAllocator(name.c_str(), nullptr);
+        }
+      }
+    }
+
     MTRT_DBGF("Set tensor address [%d] = %lu", idx, ptr);
     idx++;
   }
@@ -390,6 +540,7 @@ prepareBuffers(const AllocTracker &allocTracker,
 
 static Status enqueueV3Wrapper(AllocTracker &tracker,
                                ResourceTracker &resourceTracker,
+                               OutputAllocatorTracker &outputAllocatorTracker,
                                NvInferExecContextWrapper &context,
                                CudaStreamPtr stream, sol::table &va) {
   StatusOr>>
@@ -398,8 +549,8 @@ static Status enqueueV3Wrapper(AllocTracker &tracker,
     return getStatusWithMsg(StatusCode::InternalError,
                             "failed to prepare buffers: ", buffers.getString());
 
-  MTRT_RETURN_IF_ERROR(setTensorAddressesOrReport(context, *buffers));
+  MTRT_RETURN_IF_ERROR(setTensorAddressesAndOutputAllocatorsOrReport(context, *buffers, outputAllocatorTracker));
 
   // Create an event that we can wait on for releasing any host-pinned staging
   // allocations we made.
 MTRT_ASSIGN_OR_RETURN(CudaEventPtr inputConsumedEvent,
@@ -426,7 +577,8 @@ static Status enqueueV3Wrapper(AllocTracker &tracker,
 //===----------------------------------------------------------------------===//
 void mlirtrt::runtime::registerExecutorTensorRTModuleLuaRuntimeMethods(
     lua_State *luaState, PinnedMemoryAllocator *pinnedMemoryAllocator,
-    AllocTracker *allocTracker, ResourceTracker *resourceTracker, GpuAllocator* allocator) {
+    AllocTracker *allocTracker, ResourceTracker *resourceTracker,
+    OutputAllocatorTracker *outputAllocatorTracker, GpuAllocator *allocator) {
   sol::state_view lua(luaState);
 
   lua["_trtrt_create_runtime"] =
@@ -463,16 +615,17 @@ void mlirtrt::runtime::registerExecutorTensorRTModuleLuaRuntimeMethods(
       };
 
   lua["_trtrt_enqueue"] =
-      [allocTracker,
-       resourceTracker](sol::this_state state,
-                        std::shared_ptr<NvInferExecContextWrapper> context,
-                        CudaStreamPtr stream, sol::table va) {
+      [allocTracker, resourceTracker, outputAllocatorTracker](
+          sol::this_state state,
+          std::shared_ptr<NvInferExecContextWrapper> context,
+          CudaStreamPtr stream, sol::table va) {
        ADD_TENSORRT_MODULE_RANGE("trtrt_enqueue");
        sol::state_view luaState(state);
        assert(context != nullptr);
        assert(stream != nullptr && "expected valid stream");
-        Status result = enqueueV3Wrapper(*allocTracker, *resourceTracker,
-                                         *context, stream, va);
+        Status result =
+            enqueueV3Wrapper(*allocTracker, *resourceTracker,
+                             *outputAllocatorTracker, *context, stream, va);
         SET_LUA_ERROR_IF_ERROR(result, state);
       };
 }
diff --git a/mlir-tensorrt/executor/lib/Support/Allocators.cpp b/mlir-tensorrt/executor/lib/Support/Allocators.cpp
index 100cb0361..8def6eb90 100644
--- a/mlir-tensorrt/executor/lib/Support/Allocators.cpp
+++ b/mlir-tensorrt/executor/lib/Support/Allocators.cpp
@@ -42,6 +42,73 @@ using namespace mlirtrt;
   DEBUG_WITH_TYPE("allocators", fprintf(stderr, "%s:%d " fmt "\n", __FILE__, \
                                         __LINE__, __VA_ARGS__))
 
+//===----------------------------------------------------------------------===//
+// CustomTensorRTOutputAllocator
+//===----------------------------------------------------------------------===//
+
+inline uint64_t roundUp(uint64_t m, uint64_t n) {
+  return ((m + n - 1) / n) * n;
+}
+
+void *CustomTensorRTOuputAllocator::reallocateOutputAsync(
+    char const *tensorName, void *currentMemory, uint64_t size,
+    uint64_t alignment, cudaStream_t *stream) {
+
+  assert(currentMemory == mCurrentMemory && "output buffer mismatch");
+  assert(strcmp(tensorName, mTensorName) == 0 && "tensor name mismatch");
+  assert(!mReallocateOutputCalled && "duplicate call to reallocateOutput");
+  mReallocateOutputCalled = true;
+  // Some memory allocators return nullptr when allocating zero bytes, but
+  // TensorRT requires a non-null ptr even for empty tensors, so allocate a
+  // dummy byte.
+  size = std::max(size, static_cast<uint64_t>(1));
+
+  // Check if reallocation is required.
+  if (size > mOutputSize) {
+    size = roundUp(size, alignment);
+
+    if (mOutputPtr) {
+      if (mGpuAllocator) {
+        // Use the registered callback GPU allocator for output deallocations.
+        mGpuAllocator->deallocate(mOutputPtr, stream);
+      } else {
+        // Fall back to local memory management.
+        cudaFree(mOutputPtr);
+      }
+    }
+
+    mOutputPtr = nullptr;
+    mOutputSize = 0;
+
+    void *memory;
+    if (mGpuAllocator) {
+      // Use the registered callback GPU allocator for output allocations.
+      memory = mGpuAllocator->allocate(size, alignment, 0 /* flags */, stream);
+    } else {
+      // Fall back to local memory management.
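+      // Note: cudaMalloc/cudaFree are not stream-ordered, so this fallback
+      // may synchronize the device, unlike the stream-aware GpuAllocator
+      // path above.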
+      cudaMalloc(&memory, size);
+    }
+    mOutputPtr = memory;
+    if (mOutputPtr != nullptr) {
+      mOutputSize = size;
+    }
+    return mOutputPtr;
+  }
+  return mCurrentMemory;
+}
+
+void CustomTensorRTOuputAllocator::notifyShape(char const *tensorName,
+                                               const int64_t *dims,
+                                               int64_t nbDims) {
+  assert(mReallocateOutputCalled &&
+         "TensorRT must invoke reallocateOutput first");
+  assert(!mNotifyShapeCalled && "duplicate call to notifyShape");
+  assert(strcmp(tensorName, mTensorName) == 0 && "tensor name mismatch");
+
+  mNotifyShapeCalled = true;
+  mOutputDims.resize(nbDims);
+  std::copy_n(dims, nbDims, mOutputDims.begin());
+}
+
 //===----------------------------------------------------------------------===//
 // CustomTensorRTAllocator
 //===----------------------------------------------------------------------===//
diff --git a/mlir-tensorrt/executor/lib/Tools/ExecutorRunnerMain.cpp b/mlir-tensorrt/executor/lib/Tools/ExecutorRunnerMain.cpp
index dc14db16f..d06b59618 100644
--- a/mlir-tensorrt/executor/lib/Tools/ExecutorRunnerMain.cpp
+++ b/mlir-tensorrt/executor/lib/Tools/ExecutorRunnerMain.cpp
@@ -179,8 +179,6 @@ executor::ExecutorRunnerMain(int argc, char **argv,
     allocator.reset(new CustomTensorRTAllocator());
   }
 
-  // Read the buffer as a Lua script and execute.
-
   if (options.inputType == Lua) {
     assert(!options.dumpFunctionSignature &&
            "Can not dump function signature for Lua input type.");
@@ -213,7 +211,8 @@ executor::ExecutorRunnerMain(int argc, char **argv,
   }
 
   mlirtrt::StatusOr executionResult =
-      mlirtrt::runtime::runExecutorExecutable(std::move(*executable), std::move(allocator));
+      mlirtrt::runtime::runExecutorExecutable(std::move(*executable),
+                                              std::move(allocator));
   if (!executionResult.isOk())
     return emitError(UnknownLoc::get(&context))
            << "failed to load and run executable: "
diff --git a/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp b/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp
index 3f45138c8..efe2bf7ec 100644
--- a/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp
+++ b/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp
@@ -193,6 +193,7 @@ class PyRuntimeValue : public PyMTRTWrapper {
 class PyGpuAllocator {
 public:
   py::object pySelf;
+  // This ensures the PyGpuAllocator is not deallocated while the corresponding Python object is still alive.
   PyGpuAllocator(py::object self) : pySelf(self) {}
 
   virtual ~PyGpuAllocator() = default;
@@ -260,6 +261,127 @@ class PyGpuAllocatorTrampoline : public PyGpuAllocator {
   }
 };
 
+class PyOutputAllocator {
+public:
+  py::object pySelf;
+  // This ensures the PyOutputAllocator is not deallocated while the
+  // corresponding Python object is still alive.
+  PyOutputAllocator(py::object self) : pySelf(self) {}
+
+  virtual ~PyOutputAllocator() = default;
+  virtual void setTensorName(const char *tensorName) = 0;
+  virtual void setCurrentMemory(uintptr_t currentMemory) = 0;
+  virtual void setOutputSize(const int64_t outputSize) = 0;
+  virtual uintptr_t reallocateOutputAsync(char const *tensorName,
+                                          uintptr_t currentMemory, uint64_t size,
+                                          uint64_t alignment) = 0;
+  virtual void notifyShape(char const *tensorName, const int64_t *dims,
+                           int64_t nbDims) = 0;
+  // Creates a C-compatible struct for interfacing with lower-level APIs.
+  MTRT_OutputAllocator getCApiObject() { return createWithPythonCallbacks(this); }
+
+private:
+  static void PySetGpuAllocator(void *self, MTRT_GpuAllocator gpuAllocator) {
+    // For now, let the user rely on the default GPU allocator.
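+    // Intentionally a no-op: the MTRT_GpuAllocator handle passed in is
+    // ignored, so output reallocation falls back to the default GPU
+    // allocator on the C++ side.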
+  }
+
+  static void PySetTensorName(void *self, const char *tensorName) {
+    py::gil_scoped_acquire acquire;
+    auto *allocator = static_cast<PyOutputAllocator *>(self);
+    return allocator->setTensorName(tensorName);
+  }
+
+  static void PySetCurrentMemory(void *self, void *currentMemory) {
+    py::gil_scoped_acquire acquire;
+    auto *allocator = static_cast<PyOutputAllocator *>(self);
+    return allocator->setCurrentMemory(
+        reinterpret_cast<uintptr_t>(currentMemory));
+  }
+
+  static void PySetOutputSize(void *self, const int64_t outputSize) {
+    py::gil_scoped_acquire acquire;
+    auto *allocator = static_cast<PyOutputAllocator *>(self);
+    return allocator->setOutputSize(outputSize);
+  }
+
+  static void *PyReallocateOutputAsync(void *self, char const *tensorName,
+                                       void *currentMemory, uint64_t size,
+                                       uint64_t alignment,
+                                       cudaStream_t * /*stream*/) {
+    py::gil_scoped_acquire acquire;
+    auto *allocator = static_cast<PyOutputAllocator *>(self);
+    return reinterpret_cast<void *>(allocator->reallocateOutputAsync(
+        tensorName, reinterpret_cast<uintptr_t>(currentMemory), size,
+        alignment));
+  }
+
+  static void PyNotifyShape(void *self, char const *tensorName,
+                            const int64_t *dims, int64_t nbDims) {
+    py::gil_scoped_acquire acquire;
+    auto *allocator = static_cast<PyOutputAllocator *>(self);
+    return allocator->notifyShape(tensorName, dims, nbDims);
+  }
+
+  // Constructs an MTRT_OutputAllocator with this instance's methods as
+  // callbacks.
+  static MTRT_OutputAllocator
+  createWithPythonCallbacks(PyOutputAllocator *allocator) {
+    MTRT_OutputAllocator capi_allocator;
+    capi_allocator.ptr = allocator;
+    capi_allocator.setGpuAllocator = PySetGpuAllocator;
+    capi_allocator.setTensorName = PySetTensorName;
+    capi_allocator.setCurrentMemory = PySetCurrentMemory;
+    capi_allocator.setOutputSize = PySetOutputSize;
+    capi_allocator.reallocateOutputAsync = PyReallocateOutputAsync;
+    capi_allocator.notifyShape = PyNotifyShape;
+    return capi_allocator;
+  }
+};
+
+// Pybind11 trampoline class for PyOutputAllocator.
+// Enables Python subclasses to override virtual methods of PyOutputAllocator.
+class PyOutputAllocatorTrampoline : public PyOutputAllocator {
+public:
+  using PyOutputAllocator::PyOutputAllocator; // Inherit constructors
+
+  // Trampoline for setTensorName: dispatches the call to the Python
+  // implementation if overridden.
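+  // The Python-side override is looked up by its snake_case name
+  // (set_tensor_name, set_current_memory, set_output_size,
+  // reallocate_output, notify_shape).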
+  void setTensorName(const char *tensorName) override {
+    PYBIND11_OVERRIDE_PURE(void,              // Return type
+                           PyOutputAllocator, // Parent class
+                           set_tensor_name,   // Name of function in Python
+                           tensorName);       // Arguments
+  }
+  void setCurrentMemory(uintptr_t currentMemory) override {
+    PYBIND11_OVERRIDE_PURE(void,               // Return type
+                           PyOutputAllocator,  // Parent class
+                           set_current_memory, // Name of function in Python
+                           currentMemory);     // Arguments
+  }
+  void setOutputSize(const int64_t outputSize) override {
+    PYBIND11_OVERRIDE_PURE(void,              // Return type
+                           PyOutputAllocator, // Parent class
+                           set_output_size,   // Name of function in Python
+                           outputSize);       // Arguments
+  }
+  uintptr_t reallocateOutputAsync(char const *tensorName,
+                                  uintptr_t currentMemory, uint64_t size,
+                                  uint64_t alignment) override {
+    PYBIND11_OVERRIDE_PURE(uintptr_t,         // Return type
+                           PyOutputAllocator, // Parent class
+                           reallocate_output, // Name of function in Python
+                           tensorName,        // Arguments
+                           currentMemory, size, alignment);
+  }
+  void notifyShape(char const *tensorName, const int64_t *dims,
+                   int64_t nbDims) override {
+    PYBIND11_OVERRIDE_PURE(void,              // Return type
+                           PyOutputAllocator, // Parent class
+                           notify_shape,      // Name of function in Python
+                           tensorName,        // Arguments
+                           dims, nbDims);
+  }
+};
+
 /// Python object type wrapper for `MTRT_StableHLOToExecutableOptions`.
 class PyRuntimeSessionOptions
     : public PyMTRTWrapper
+  py::class_<PyOutputAllocator, PyOutputAllocatorTrampoline>(m,
+                                                             "OutputAllocator")
+      .def(py::init<>(
+          [](py::object self) { return new PyOutputAllocatorTrampoline(self); }))
+      .def("set_tensor_name", &PyOutputAllocator::setTensorName)
+      .def("set_current_memory", &PyOutputAllocator::setCurrentMemory)
+      .def("set_output_size", &PyOutputAllocator::setOutputSize)
+      .def("reallocate_output_async", &PyOutputAllocator::reallocateOutputAsync)
+      .def("notify_shape", &PyOutputAllocator::notifyShape)
+      .def("get_capi_object", &PyOutputAllocator::getCApiObject);
+
   py::class_(m, "RuntimeSession", py::module_local())
       .def(py::init<>([](PyRuntimeSessionOptions &options, PyExecutable &exe,
                          py::object gpu_allocator = py::none()) {
diff --git a/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py b/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py
index 1fa932fec..1e7289b71 100644
--- a/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py
+++ b/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py
@@ -37,6 +37,35 @@ def deallocate(self, ptr):
         return False
 
 
+class CupyOutputAllocator(runtime.OutputAllocator):
+    def __init__(self):
+        super().__init__(self)
+
+    def set_tensor_name(self, tensor_name):
+        self.tensor_name = tensor_name
+
+    def set_current_memory(self, memory):
+        self.memory = memory
+
+    def set_output_size(self, size):
+        self.size = size
+
+    def reallocate_output(self, tensor_name, memory, size, alignment):
+        assert self.tensor_name == tensor_name
+        assert self.memory == memory
+
+        if size > self.size:
+            # For now, just fail if reallocation is required.
+            assert 0
+
+        return self.memory
+
+    def notify_shape(self, tensor_name, dims, nb_dims):
+        assert self.tensor_name == tensor_name
+        self.dims = dims
+        self.nb_dims = nb_dims
+
+
 def stablehlo_add():
     # Build/parse the main function.
with ir.Context() as context: @@ -75,6 +104,10 @@ def stablehlo_add(): device=devices[0], stream=stream, ) + + output_allocator = CupyOutputAllocator() + arg1.set_output_allocator(output_allocator) + session.execute_function("main", in_args=[arg0], out_args=[arg1], stream=stream) data = np.asarray(client.copy_to_host(arg1, stream=stream)) @@ -88,12 +121,12 @@ def stablehlo_add(): start_time = time.time() for _ in range(0, num_iter): session.execute_function("main", in_args=[arg0], out_args=[arg0], stream=stream) - data = np.asarray(client.copy_to_host(arg1, stream=stream)) + data = np.asarray(client.copy_to_host(arg0, stream=stream)) stream.sync() end_time = time.time() elapsed = end_time - start_time - print(np.asarray(client.copy_to_host(arg0))) + print(np.asarray(data)) print(f"1000 iterations avg { (elapsed/num_iter)/1000.0} msec per iteration") diff --git a/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_dynamic.py b/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_dynamic.py index 82c31a05a..b8c56a6df 100644 --- a/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_dynamic.py +++ b/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_dynamic.py @@ -86,6 +86,35 @@ def infer_output_shape(client, session, exe, input_shape): return output_shape +class CupyOutputAllocator(runtime.OutputAllocator): + def __init__(self): + super().__init__(self) + + def set_tensor_name(self, tensor_name): + self.tensor_name = tensor_name + + def set_current_memory(self, memory): + self.memory = memory + + def set_output_size(self, size): + self.size = size + + def reallocate_output(self, tensor_name, memory, size, alignment): + assert self.tensor_name == tensor_name + assert self.memory == memory + + if size > self.size: + # For now just fail if reallocation is required. + assert 0 + + return self.memory + + def notify_shape(self, tensor_name, dims, nb_dims): + assert self.tensor_name == tensor_name + self.dims = dims + self.nb_dims = nb_dims + + def test_program(program: str, input_shape: Iterable[int], debug: bool = True): # Build/parse the main function. with ir.Context() as context: @@ -134,6 +163,15 @@ def test_program(program: str, input_shape: Iterable[int], debug: bool = True): stream=stream, ) + # # Preallocate dummy memory for 1 element. + # arg2 = client.create_memref( + # np.zeros((1, 1, 1), dtype=np.float32).data, + # device=devices[0], + # stream=stream, + # ) + # output_allocator = CupyOutputAllocator() + # arg1.set_output_allocator(output_allocator) + session.execute_function( "main", in_args=[arg0, arg1], out_args=[arg2], stream=stream )
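The CupyOutputAllocator used in these tests deliberately fails (assert 0) when TensorRT requests a larger output buffer than was preallocated. For reference, a variant that grows the buffer on demand might look like the sketch below. It is illustrative only: the class name and the use of cupy's cp.cuda.alloc are assumptions, not part of this patch, and it presumes the bindings dispatch to the Python override registered as reallocate_output, as in the trampoline above.

import cupy as cp


class GrowingCupyOutputAllocator(runtime.OutputAllocator):
    """Illustrative only: grows the output buffer instead of asserting."""

    def __init__(self):
        super().__init__(self)
        self._owned = None  # Keep the cupy allocation alive.

    def set_tensor_name(self, tensor_name):
        self.tensor_name = tensor_name

    def set_current_memory(self, memory):
        self.memory = memory

    def set_output_size(self, size):
        self.size = size

    def reallocate_output(self, tensor_name, memory, size, alignment):
        assert self.tensor_name == tensor_name
        if size <= self.size:
            return self.memory
        # Allocate a larger device buffer and hand its address back to
        # TensorRT; the returned pointer is assumed to satisfy `alignment`.
        self._owned = cp.cuda.alloc(size)
        self.memory = self._owned.ptr
        self.size = size
        return self.memory

    def notify_shape(self, tensor_name, dims, nb_dims):
        assert self.tensor_name == tensor_name
        self.dims = dims
        self.nb_dims = nb_dims

Such an allocator would be attached the same way as in the tests, e.g. arg1.set_output_allocator(GrowingCupyOutputAllocator()).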