From af6a470e71c5e325a8798b4484f02507058df537 Mon Sep 17 00:00:00 2001 From: Giorgi Gvalia Date: Thu, 26 Jun 2025 12:53:36 -0700 Subject: [PATCH 1/2] [Offload] Allow CUDA Kernels to use arbitrarily large shared memory Previously, the user was not able to use more than 48 KB of shared memory on NVIDIA GPUs. In order to do so, setting the function attribute `CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES` is required, which was not present in the code base. With this commit, we add the ability to set this attribute, allowing the user to utilize the full power of their GPU. In order to not have to reset the function attribute for each launch of the same kernel, we keep track of the maximum memory limit (as the variable `MaxDynCGroupMemLimit`) and only set the attribute if our desired amount exceeds the limit. By default, this limit is set to 48 KB. Feedback is greatly appreciated, especially around setting the new variable as mutable. I did this because the `launchImpl` method is const and I am not able to modify my variable otherwise. 
--- offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp | 1 + offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h | 2 ++ offload/plugins-nextgen/cuda/src/rtl.cpp | 14 ++++++++++++++ 3 files changed, 17 insertions(+) diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp index e5332686fcffb..361a781e8f9b6 100644 --- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp +++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp @@ -31,6 +31,7 @@ DLWRAP(cuDeviceGet, 2) DLWRAP(cuDeviceGetAttribute, 3) DLWRAP(cuDeviceGetCount, 1) DLWRAP(cuFuncGetAttribute, 3) +DLWRAP(cuFuncSetAttribute, 3) // Device info DLWRAP(cuDeviceGetName, 3) diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h index 1c5b421768894..b6c022c8e7e8b 100644 --- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h +++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h @@ -258,6 +258,7 @@ typedef enum CUdevice_attribute_enum { typedef enum CUfunction_attribute_enum { CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0, + CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8, } CUfunction_attribute; typedef enum CUctx_flags_enum { @@ -295,6 +296,7 @@ CUresult cuDeviceGet(CUdevice *, int); CUresult cuDeviceGetAttribute(int *, CUdevice_attribute, CUdevice); CUresult cuDeviceGetCount(int *); CUresult cuFuncGetAttribute(int *, CUfunction_attribute, CUfunction); +CUresult cuFuncSetAttribute(CUfunction, CUfunction_attribute, int); // Device info CUresult cuDeviceGetName(char *, int, CUdevice); diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index 0e662b038c363..fd9528061b55e 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -160,6 +160,9 @@ struct CUDAKernelTy : public GenericKernelTy { private: /// The CUDA kernel function to execute. 
CUfunction Func; + /// The maximum amount of dynamic shared memory per thread group. By default, + /// this is set to 48 KB. + mutable uint32_t MaxDynCGroupMemLimit = 49152; }; /// Class wrapping a CUDA stream reference. These are the objects handled by the @@ -1300,6 +1303,17 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice, if (GenericDevice.getRPCServer()) GenericDevice.Plugin.getRPCServer().Thread->notify(); + // In case we require more memory than the current limit. + if (MaxDynCGroupMem >= MaxDynCGroupMemLimit) { + CUresult AttrResult = cuFuncSetAttribute( + Func, + CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, + MaxDynCGroupMem); + Plugin::check(AttrResult, + "Error in cuLaunchKernel while setting the memory limits: %s"); + MaxDynCGroupMemLimit = MaxDynCGroupMem; + } + CUresult Res = cuLaunchKernel(Func, NumBlocks[0], NumBlocks[1], NumBlocks[2], NumThreads[0], NumThreads[1], NumThreads[2], MaxDynCGroupMem, Stream, nullptr, Config); From a8066949f7139533c71a54ffe55df5e26ee36077 Mon Sep 17 00:00:00 2001 From: Giorgi Gvalia Date: Mon, 30 Jun 2025 10:19:56 -0700 Subject: [PATCH 2/2] [Offload] Fix formatting for shared memory additions in rtl.cpp --- offload/plugins-nextgen/cuda/src/rtl.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index dec1d5b83dc6e..9a9237a74b0ef 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -1308,10 +1308,9 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice, // In case we require more memory than the current limit. 
if (MaxDynCGroupMem >= MaxDynCGroupMemLimit) { CUresult AttrResult = cuFuncSetAttribute( - Func, - CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, - MaxDynCGroupMem); - Plugin::check(AttrResult, + Func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, MaxDynCGroupMem); + Plugin::check( + AttrResult, "Error in cuLaunchKernel while setting the memory limits: %s"); MaxDynCGroupMemLimit = MaxDynCGroupMem; }