From acd5b5647f274b8f8dd89bc327302c2f61441dcf Mon Sep 17 00:00:00 2001
From: Julian Lenz <j.lenz@hzdr.de>
Date: Wed, 22 Jan 2025 07:48:13 +0100
Subject: [PATCH 1/4] First attempt copying over Gallatin integration
 (untested)

---
 CMakeLists.txt                                |   6 +
 cmake/package-lock.cmake                      |   8 +
 .../creationPolicies/GallatinCuda.hpp         | 161 ++++++++++++++++++
 include/mallocMC/mallocMC.hpp                 |   2 +
 include/mallocMC/reservePoolPolicies/Noop.hpp |  60 +++++++
 5 files changed, 237 insertions(+)
 create mode 100644 include/mallocMC/creationPolicies/GallatinCuda.hpp
 create mode 100644 include/mallocMC/reservePoolPolicies/Noop.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8360d5bd..fba581ab 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -29,12 +29,18 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/add_controlled.cmake)
 add_controlled("PackageProject.cmake" REQUIRED)
 add_controlled("alpaka" REQUIRED)
 
+if(alpaka_ACC_GPU_CUDA_ENABLE)
+  add_controlled("Gallatin")
+endif()
+
 # ---- Create library ----
 
 # Note: for header-only libraries change all PUBLIC flags to INTERFACE and create an interface
 add_library(${PROJECT_NAME} INTERFACE)
 set_target_properties(${PROJECT_NAME} PROPERTIES CXX_STANDARD 20)
 
+target_link_libraries(${PROJECT_NAME} PRIVATE gallatin)
+
 # being a cross-platform target, we enforce standards conformance on MSVC
 target_compile_options(${PROJECT_NAME} INTERFACE "$<$<COMPILE_LANG_AND_ID:CXX,MSVC>:/permissive->")
 
diff --git a/cmake/package-lock.cmake b/cmake/package-lock.cmake
index 4e2727a4..267ec6a1 100644
--- a/cmake/package-lock.cmake
+++ b/cmake/package-lock.cmake
@@ -35,3 +35,11 @@ CPMDeclarePackage(Catch2
   SYSTEM YES
   EXCLUDE_FROM_ALL YES
 )
+# Gallatin
+CPMDeclarePackage(Catch2
+  # There's no release available yet.
+  GIT_TAG 1aa70ade136c3c2042e2a9c2f25565aa56168a0f
+  GITHUB_REPOSITORY saltsystemslab/Gallatin
+  SYSTEM YES
+  EXCLUDE_FROM_ALL YES
+)
diff --git a/include/mallocMC/creationPolicies/GallatinCuda.hpp b/include/mallocMC/creationPolicies/GallatinCuda.hpp
new file mode 100644
index 00000000..049a85e8
--- /dev/null
+++ b/include/mallocMC/creationPolicies/GallatinCuda.hpp
@@ -0,0 +1,161 @@
+/*
+  mallocMC: Memory Allocator for Many Core Architectures.
+
+  Copyright 2014-2024 Institute of Radiation Physics,
+                 Helmholtz-Zentrum Dresden - Rossendorf
+
+  Author(s):  Carlchristian Eckert - c.eckert ( at ) hzdr.de
+              Julian Lenz - j.lenz ( at ) hzdr.de
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <alpaka/alpaka.hpp>
+
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+#    include <gallatin/allocators/gallatin.cuh>
+#else
+
+// Stand-in for the real Gallatin type when CUDA is disabled: it keeps this header compiling, so it is not in
+// the way when unused, and GallatinCudaImpl::initHeap's static_assert gives a nice error message when it is used.
+namespace gallatin::allocators
+{
+    template<size_t...>
+    struct Gallatin
+    {
+        static auto generate_on_device(auto...)
+        {
+            return nullptr;
+        }
+    };
+} // namespace gallatin::allocators
+
+#endif
+
+namespace mallocMC
+{
+    namespace CreationPolicies
+    {
+        /**
+         * @brief Prototype integration of Gallatin (https://dl.acm.org/doi/10.1145/3627535.3638499)
+         *
+         * This CreationPolicy integrates the CUDA code for the Gallatin prototype into mallocMC
+         * as a thin wrapper. It's intended for proof-of-principle tests and benchmarks only and
+         * obviously only works on CUDA devices.
+         *
+         * It also only works with ReservePoolPolicies::Noop because it does what CudaSetLimits
+         * does internally on its own.
+         *
+         * If we should ever see the need for it, we'd re-implement it in alpaka for a fully-fledged
+         * and well-maintained version of this.
+         * Experience has been mixed so far: While we could reproduce good performance in some cases,
+         * fragmentation was found to be unusably high (to the point of single-digit percent utilisation
+         * of available memory) in PIConGPU. That's why there's currently no plan to lift the prototype
+         * status in the near future.
+         */
+        template<
+            typename T_AlignmentPolicy,
+            size_t bytes_per_segment = 16ULL * 1024 * 1024,
+            size_t smallest_slice = 16,
+            size_t largest_slice = 4096>
+        class GallatinCudaImpl
+        {
+            using Gallatin = gallatin::allocators::Gallatin<bytes_per_segment, smallest_slice, largest_slice>;
+
+        public:
+            template<typename T_AlignmentPolicyLocal>
+            using AlignmentAwarePolicy
+                = GallatinCudaImpl<T_AlignmentPolicyLocal, bytes_per_segment, smallest_slice, largest_slice>;
+            Gallatin* heap{nullptr}; // written on the device by initHeap; not set if initHeap got memsize == 0
+
+            static constexpr auto providesAvailableSlots = false;
+
+            template<typename AlpakaAcc>
+            ALPAKA_FN_ACC auto create(AlpakaAcc const& /*acc*/, uint32_t bytes) const -> void*
+            {
+                return heap->malloc(static_cast<size_t>(bytes)); // forwards to the Gallatin heap
+            }
+
+            template<typename AlpakaAcc>
+            ALPAKA_FN_ACC void destroy(AlpakaAcc const& /*acc*/, void* mem) const
+            {
+                heap->free(mem);
+            }
+
+            ALPAKA_FN_ACC auto isOOM(void* p, size_t s) const -> bool
+            {
+                return s != 0 && (p == nullptr); // a null result only counts as OOM for non-empty requests
+            }
+
+            template<typename AlpakaAcc, typename AlpakaDevice, typename AlpakaQueue, typename T_DeviceAllocator>
+            static void initHeap(
+                AlpakaDevice& /*dev*/,
+                AlpakaQueue& queue,
+                T_DeviceAllocator* devAllocator,
+                void*,
+                size_t memsize)
+            {
+                static_assert(
+                    std::is_same_v<alpaka::AccToTag<AlpakaAcc>, alpaka::TagGpuCudaRt>,
+                    "The GallatinCuda creation policy is only available on CUDA architectures. Please choose a "
+                    "different one.");
+
+                // This is an extremely hot fix:
+                // PIConGPU initialises its allocator with 0 bytes to be able to distribute the pointer.
+                // Only afterwards it can find out its actual memory requirements and uses destructiveResize to set
+                // the correct heap size. Gallatin runs into issues with this approach.
+                // Instead, we simply don't believe the request if it's 0.
+                if(memsize == 0)
+                    return;
+
+                using Dim = typename alpaka::trait::DimType<AlpakaAcc>::type;
+                using Idx = typename alpaka::trait::IdxType<AlpakaAcc>::type;
+                using VecType = alpaka::Vec<Dim, Idx>;
+
+                // NOTE(review): the arguments are presumably (bytes, seed, print_info) -- confirm against Gallatin.
+                auto tmp = Gallatin::generate_on_device(memsize, 42, true);
+                auto workDivSingleThread
+                    = alpaka::WorkDivMembers<Dim, Idx>{VecType::ones(), VecType::ones(), VecType::ones()};
+                alpaka::exec<AlpakaAcc>(
+                    queue,
+                    workDivSingleThread,
+                    [tmp, devAllocator] ALPAKA_FN_ACC(AlpakaAcc const&) { devAllocator->heap = tmp; });
+            }
+
+            static auto classname() -> std::string
+            {
+                return "GallatinCuda";
+            }
+        };
+
+        template<
+            size_t bytes_per_segment = 16ULL * 1024 * 1024,
+            size_t smallest_slice = 16,
+            size_t largest_slice = 4096>
+        struct GallatinCuda // carries only the size parameters; AlignmentAwarePolicy adds the alignment policy
+        {
+            template<typename T_AlignmentPolicy>
+            using AlignmentAwarePolicy
+                = GallatinCudaImpl<T_AlignmentPolicy, bytes_per_segment, smallest_slice, largest_slice>;
+        };
+
+    } // namespace CreationPolicies
+} // namespace mallocMC
diff --git a/include/mallocMC/mallocMC.hpp b/include/mallocMC/mallocMC.hpp
index de96b7e0..41bf4882 100644
--- a/include/mallocMC/mallocMC.hpp
+++ b/include/mallocMC/mallocMC.hpp
@@ -47,6 +47,7 @@
 #include "alignmentPolicies/Noop.hpp"
 #include "alignmentPolicies/Shrink.hpp"
 #include "creationPolicies/FlatterScatter.hpp"
+#include "creationPolicies/GallatinCuda.hpp"
 #include "creationPolicies/OldMalloc.hpp"
 #include "creationPolicies/Scatter.hpp"
 #include "distributionPolicies/Noop.hpp"
@@ -55,3 +56,4 @@
 #include "oOMPolicies/ReturnNull.hpp"
 #include "reservePoolPolicies/AlpakaBuf.hpp"
 #include "reservePoolPolicies/CudaSetLimits.hpp"
+#include "reservePoolPolicies/Noop.hpp"
diff --git a/include/mallocMC/reservePoolPolicies/Noop.hpp b/include/mallocMC/reservePoolPolicies/Noop.hpp
new file mode 100644
index 00000000..c6b82f18
--- /dev/null
+++ b/include/mallocMC/reservePoolPolicies/Noop.hpp
@@ -0,0 +1,60 @@
+/*
+  mallocMC: Memory Allocator for Many Core Architectures.
+
+  Copyright 2014-2024 Institute of Radiation Physics,
+                 Helmholtz-Zentrum Dresden - Rossendorf
+
+  Author(s):  Carlchristian Eckert - c.eckert ( at ) hzdr.de
+              Julian Lenz - j.lenz ( at ) hzdr.de
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+*/
+
+#pragma once
+
+namespace mallocMC
+{
+    namespace ReservePoolPolicies
+    {
+        /**
+         * @brief Reserve-pool policy that reserves nothing: setMemPool always returns nullptr.
+         *
+         * This is intended for use with prototypes that were originally designed
+         * to handle these aspects on their own. Currently needed for GallatinCuda.
+         */
+        struct Noop
+        {
+            template<typename AlpakaDev>
+            auto setMemPool(AlpakaDev const& dev, size_t memsize) -> void*
+            {
+                return nullptr;
+            }
+
+            static void resetMemPool()
+            {
+            }
+
+            static auto classname() -> std::string
+            {
+                return "Noop";
+            }
+        };
+
+    } // namespace ReservePoolPolicies
+} // namespace mallocMC

From edbdcdf5f79ab51a0fc817bedf3ab4ad436eefc8 Mon Sep 17 00:00:00 2001
From: Julian Lenz <j.lenz@hzdr.de>
Date: Wed, 22 Jan 2025 11:43:48 +0100
Subject: [PATCH 2/4] Add examples and make them run

---
 CMakeLists.txt                             |  8 +++---
 cmake/package-lock.cmake                   |  7 +++---
 examples/getAvailableSlots/source/main.cpp | 26 ++++++++++++++-----
 examples/vectorAdd/source/main.cpp         | 29 +++++++++++++++-------
 4 files changed, 48 insertions(+), 22 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index fba581ab..2cd30213 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -29,9 +29,6 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/add_controlled.cmake)
 add_controlled("PackageProject.cmake" REQUIRED)
 add_controlled("alpaka" REQUIRED)
 
-if(alpaka_ACC_GPU_CUDA_ENABLE)
-  add_controlled("Gallatin")
-endif()
 
 # ---- Create library ----
 
@@ -39,7 +36,10 @@ endif()
 add_library(${PROJECT_NAME} INTERFACE)
 set_target_properties(${PROJECT_NAME} PROPERTIES CXX_STANDARD 20)
 
-target_link_libraries(${PROJECT_NAME} PRIVATE gallatin)
+if(alpaka_ACC_GPU_CUDA_ENABLE)
+  add_controlled("Gallatin")
+  target_link_libraries(${PROJECT_NAME} INTERFACE gallatin)
+endif()
 
 # being a cross-platform target, we enforce standards conformance on MSVC
 target_compile_options(${PROJECT_NAME} INTERFACE "$<$<COMPILE_LANG_AND_ID:CXX,MSVC>:/permissive->")
diff --git a/cmake/package-lock.cmake b/cmake/package-lock.cmake
index 267ec6a1..9f1a9f1c 100644
--- a/cmake/package-lock.cmake
+++ b/cmake/package-lock.cmake
@@ -36,10 +36,11 @@ CPMDeclarePackage(Catch2
   EXCLUDE_FROM_ALL YES
 )
 # Gallatin
-CPMDeclarePackage(Catch2
+CPMDeclarePackage(Gallatin
   # There's no release available yet.
-  GIT_TAG 1aa70ade136c3c2042e2a9c2f25565aa56168a0f
-  GITHUB_REPOSITORY saltsystemslab/Gallatin
+  GIT_TAG ac0cb8e380ffcb74156bafb8805fb60412817c5f
+  # Use our own fork for some patches
+  GITHUB_REPOSITORY chillenzer/Gallatin
   SYSTEM YES
   EXCLUDE_FROM_ALL YES
 )
diff --git a/examples/getAvailableSlots/source/main.cpp b/examples/getAvailableSlots/source/main.cpp
index 8e183fde..4d134715 100644
--- a/examples/getAvailableSlots/source/main.cpp
+++ b/examples/getAvailableSlots/source/main.cpp
@@ -99,7 +99,10 @@ struct ExampleKernel
     }
 };
 
-template<typename T_CreationPolicy>
+template<
+    typename T_CreationPolicy,
+    typename T_ReservePoolPolicy,
+    typename T_AlignmentPolicy = mallocMC::AlignmentPolicies::Shrink<AlignmentConfig>>
 auto example03() -> int
 {
     using Allocator = mallocMC::Allocator<
@@ -107,8 +110,8 @@ auto example03() -> int
         T_CreationPolicy,
         mallocMC::DistributionPolicies::Noop,
         mallocMC::OOMPolicies::ReturnNull,
-        mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>,
-        mallocMC::AlignmentPolicies::Shrink<AlignmentConfig>>;
+        T_ReservePoolPolicy,
+        T_AlignmentPolicy>;
 
     auto const platform = alpaka::Platform<Acc>{};
     auto const dev = alpaka::getDevByIdx(platform, 0);
@@ -130,8 +133,19 @@ auto example03() -> int
 
 auto main(int /*argc*/, char* /*argv*/[]) -> int
 {
-    example03<FlatterScatter<FlatterScatterHeapConfig>>();
-    example03<Scatter<FlatterScatterHeapConfig>>();
-    example03<OldMalloc>();
+    example03<FlatterScatter<FlatterScatterHeapConfig>, mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>>();
+    example03<Scatter<FlatterScatterHeapConfig>, mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>>();
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+    example03<
+        mallocMC::CreationPolicies::GallatinCuda<>,
+        mallocMC::ReservePoolPolicies::Noop,
+        mallocMC::AlignmentPolicies::Noop>();
+    // GallatinCuda already uses cudaSetLimits and we're not allowed to call it a second time.
+    example03<OldMalloc, mallocMC::ReservePoolPolicies::Noop>();
+    // This should normally be:
+    //    example03<OldMalloc, mallocMC::ReservePoolPolicies::CudaSetLimits>();
+#else
+    example03<OldMalloc, mallocMC::ReservePoolPolicies::Noop>();
+#endif
     return 0;
 }
diff --git a/examples/vectorAdd/source/main.cpp b/examples/vectorAdd/source/main.cpp
index 002af965..2c5833c1 100644
--- a/examples/vectorAdd/source/main.cpp
+++ b/examples/vectorAdd/source/main.cpp
@@ -27,9 +27,6 @@
   THE SOFTWARE.
 */
 
-#include "mallocMC/creationPolicies/FlatterScatter.hpp"
-#include "mallocMC/creationPolicies/OldMalloc.hpp"
-
 #include <alpaka/alpaka.hpp>
 #include <alpaka/example/ExampleDefaultAcc.hpp>
 
@@ -80,7 +77,10 @@ ALPAKA_STATIC_ACC_MEM_GLOBAL int** arA;
 ALPAKA_STATIC_ACC_MEM_GLOBAL int** arB;
 ALPAKA_STATIC_ACC_MEM_GLOBAL int** arC;
 
-template<typename T_CreationPolicy>
+template<
+    typename T_CreationPolicy,
+    typename T_ReservePoolPolicy,
+    typename T_AlignmentPolicy = mallocMC::AlignmentPolicies::Shrink<ShrinkConfig>>
 auto example01() -> int
 {
     using Allocator = mallocMC::Allocator<
@@ -88,8 +88,8 @@ auto example01() -> int
         T_CreationPolicy,
         mallocMC::DistributionPolicies::Noop,
         mallocMC::OOMPolicies::ReturnNull,
-        mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>,
-        mallocMC::AlignmentPolicies::Shrink<ShrinkConfig>>;
+        T_ReservePoolPolicy,
+        T_AlignmentPolicy>;
 
     constexpr auto length = 100;
 
@@ -227,8 +227,19 @@ auto example01() -> int
 
 auto main(int /*argc*/, char* /*argv*/[]) -> int
 {
-    example01<FlatterScatter<FlatterScatterHeapConfig>>();
-    example01<Scatter<FlatterScatterHeapConfig>>();
-    example01<OldMalloc>();
+    example01<FlatterScatter<FlatterScatterHeapConfig>, mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>>();
+    example01<Scatter<FlatterScatterHeapConfig>, mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>>();
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+    example01<
+        mallocMC::CreationPolicies::GallatinCuda<>,
+        mallocMC::ReservePoolPolicies::Noop,
+        mallocMC::AlignmentPolicies::Noop>();
+    // GallatinCuda already uses cudaSetLimits and we're not allowed to call it a second time.
+    example01<OldMalloc, mallocMC::ReservePoolPolicies::Noop>();
+    // This should normally be:
+    //    example01<OldMalloc, mallocMC::ReservePoolPolicies::CudaSetLimits>();
+#else
+    example01<OldMalloc, mallocMC::ReservePoolPolicies::Noop>();
+#endif
     return 0;
 }

From 1944ff4f7928bff8b5a994382b60c11d38f6e17c Mon Sep 17 00:00:00 2001
From: Julian Lenz <j.lenz@hzdr.de>
Date: Wed, 22 Jan 2025 11:48:50 +0100
Subject: [PATCH 3/4] Fix warnings

---
 include/mallocMC/reservePoolPolicies/Noop.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/mallocMC/reservePoolPolicies/Noop.hpp b/include/mallocMC/reservePoolPolicies/Noop.hpp
index c6b82f18..57bf8261 100644
--- a/include/mallocMC/reservePoolPolicies/Noop.hpp
+++ b/include/mallocMC/reservePoolPolicies/Noop.hpp
@@ -41,7 +41,7 @@ namespace mallocMC
         struct Noop
         {
             template<typename AlpakaDev>
-            auto setMemPool(AlpakaDev const& dev, size_t memsize) -> void*
+            auto setMemPool(AlpakaDev const& /*dev*/, size_t /*memsize*/) -> void*
             {
                 return nullptr;
             }

From 1a0530e98d197c6d6a4ae75e09beb245d15d4c46 Mon Sep 17 00:00:00 2001
From: Julian Lenz <j.lenz@hzdr.de>
Date: Wed, 22 Jan 2025 16:48:42 +0100
Subject: [PATCH 4/4] Add a warning about compute capabilities

---
 CMakeLists.txt | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2cd30213..75f88423 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -38,6 +38,26 @@ set_target_properties(${PROJECT_NAME} PROPERTIES CXX_STANDARD 20)
 
 if(alpaka_ACC_GPU_CUDA_ENABLE)
   add_controlled("Gallatin")
+
+  # Gallatin needs some fairly recent compute capability from CUDA. The default of
+  # https://cmake.org/cmake/help/latest/variable/CMAKE_CUDA_ARCHITECTURES.html can be too old, which fails with
+  #  error: no instance of overloaded function "atomicCAS" matches the argument list
+  #         argument types are: (unsigned short *, unsigned short, unsigned short)
+  # because this overload was only added later (apparently?).
+  # The variable is a list whose entries may carry suffixes ("70-real") or be keywords ("native"),
+  # so each entry is reduced to its leading number; entries without one are skipped.
+
+  foreach(mallocmc_cuda_arch IN LISTS CMAKE_CUDA_ARCHITECTURES)
+    string(REGEX MATCH "^[0-9]+" mallocmc_cuda_arch_number "${mallocmc_cuda_arch}")
+    if(NOT "${mallocmc_cuda_arch_number}" STREQUAL "" AND "${mallocmc_cuda_arch_number}" LESS 70)
+      message(WARNING
+        "CUDA architecture detected is too old: ${CMAKE_CUDA_ARCHITECTURES}. "
+        "If the architecture set is too old, this can lead to compilation errors with Gallatin. "
+        "If Gallatin is needed, please set CMAKE_CUDA_ARCHITECTURES to the correct value >= 70.")
+      break() # one warning is enough, even if several listed architectures are too old
+    endif()
+  endforeach()
+
   target_link_libraries(${PROJECT_NAME} INTERFACE gallatin)
 endif()