diff --git a/CMakeLists.txt b/CMakeLists.txt
index da2a9ab9..ab140ce8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -17,6 +17,17 @@ if(PROJECT_SOURCE_DIR STREQUAL PROJECT_BINARY_DIR)
   )
 endif()
 
+# ---- Options ----
+
+option(mallocMC_BUILD_TESTING "Turn on/off building the tests" OFF)
+option(mallocMC_BUILD_EXAMPLES "Turn on/off building the examples" OFF)
+if (mallocMC_BUILD_TESTING OR mallocMC_BUILD_EXAMPLES)
+    enable_testing()
+endif()
+if (mallocMC_BUILD_TESTING)
+    set(alpaka_ACC_CPU_B_SEQ_T_SEQ_ENABLE ON CACHE BOOL "" FORCE)
+endif()
+
 # ---- Add dependencies via CPM ----
 # see https://github.com/TheLartians/CPM.cmake for more info
 
@@ -39,6 +50,12 @@ set_target_properties(${PROJECT_NAME} PROPERTIES CXX_STANDARD 20)
 
 if(alpaka_ACC_GPU_CUDA_ENABLE)
     add_controlled("Gallatin")
+    if (TARGET gallatin::gallatin)
+        set(mallocMC_HAS_Gallatin_AVAILABLE YES)
+    else()
+        set(mallocMC_HAS_Gallatin_AVAILABLE NO)
+    endif()
+
     # Gallatin needs some fairly recent compute capability from CUDA.
     # CMake defaults to taking the oldest supported by the device
     # (https://cmake.org/cmake/help/latest/variable/CMAKE_CUDA_ARCHITECTURES.html)
@@ -56,9 +73,13 @@ if(alpaka_ACC_GPU_CUDA_ENABLE)
             "If the architecture set is too old, this can lead to compilation errors with Gallatin. "
             "If Gallatin is needed, please set CMAKE_CUDA_ARCHITECTURES to the correct value >= 70."
         )
+        set(mallocMC_HAS_Gallatin_AVAILABLE NO)
     endif()
 
-    target_link_libraries(${PROJECT_NAME} INTERFACE gallatin)
+    if (mallocMC_HAS_Gallatin_AVAILABLE)
+        target_link_libraries(${PROJECT_NAME} INTERFACE gallatin)
+        target_compile_definitions(${PROJECT_NAME} INTERFACE mallocMC_HAS_Gallatin_AVAILABLE)
+    endif()
 endif()
 
 # being a cross-platform target, we enforce standards conformance on MSVC
@@ -68,15 +89,14 @@ target_include_directories(
   ${PROJECT_NAME} INTERFACE $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
                             $<INSTALL_INTERFACE:include/${PROJECT_NAME}-${PROJECT_VERSION}>
 )
+target_link_libraries(${PROJECT_NAME} INTERFACE alpaka::alpaka)
+
-option(mallocMC_BUILD_TESTING "Turn on/off building the tests" OFF)
 if(mallocMC_BUILD_TESTING)
     include(${CMAKE_CURRENT_LIST_DIR}/cmake/tools.cmake)
-    enable_testing()
     add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/test ${CMAKE_BINARY_DIR}/test)
 endif()
 
-option(mallocMC_BUILD_EXAMPLES "Turn on/off building the examples" OFF)
 if(mallocMC_BUILD_EXAMPLES)
     include(${CMAKE_CURRENT_LIST_DIR}/cmake/tools.cmake)
     add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/examples ${CMAKE_BINARY_DIR}/examples)
diff --git a/cmake/package-lock.cmake b/cmake/package-lock.cmake
index 9f1a9f1c..cb68bb05 100644
--- a/cmake/package-lock.cmake
+++ b/cmake/package-lock.cmake
@@ -11,10 +11,11 @@ CPMDeclarePackage(PackageProject.cmake
 # alpaka
 CPMDeclarePackage(alpaka
   NAME alpaka
-  GIT_TAG 1.2.0
+  # This is a development version slightly after 1.2.0 because we needed a patch
+  GIT_TAG 95c0bf2397255a89467bb5c151a96367ad1d1f93
   GITHUB_REPOSITORY alpaka-group/alpaka
   OPTIONS
-    "alpaka_CXX_STANDARD 20"
+    "alpaka_CXX_STANDARD 20;alpaka_INSTALL ON"
   # It is recommended to let CPM cache dependencies in order to reduce redundant downloads.
   # However, we might in the foreseeable future turn to unstable references like the `dev` branch here.
   # Setting the following option tells CPM to not use the cache.
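The net effect of the gating above is that `mallocMC_HAS_Gallatin_AVAILABLE` is only defined for consumers when the `gallatin::gallatin` target actually materialized and the CUDA architecture is recent enough. A minimal sketch of how downstream device code can key off that definition (illustrative only; the policy names are the ones used elsewhere in this PR):

```cpp
#include <mallocMC/mallocMC.hpp>

// The same source compiles whether or not Gallatin was found: the macro is
// injected by CMake via target_compile_definitions only when it is usable.
#ifdef mallocMC_HAS_Gallatin_AVAILABLE
using Creator = mallocMC::CreationPolicies::GallatinCuda<>;
#else
using Creator = mallocMC::CreationPolicies::FlatterScatter<>;
#endif
```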
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 1f5672b6..560c8558 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -12,8 +12,22 @@ add_subdirectory(
     ${CMAKE_BINARY_DIR}/examples/getAvailableSlots
 )
 
-add_custom_target(
+check_language(CUDA)
+if (CMAKE_CUDA_COMPILER AND alpaka_ACC_GPU_CUDA_ENABLE)
+    add_subdirectory(
+        ${CMAKE_CURRENT_LIST_DIR}/native-cuda
+        ${CMAKE_BINARY_DIR}/examples/native-cuda
+    )
+
+    add_custom_target(
+        mallocMCExamples
+        DEPENDS mallocMCExampleVectorAdd mallocMCExampleGetAvailableSlots mallocMCExampleNativeCuda
+        COMMENT "Shortcut for building all examples."
+    )
+else()
+    add_custom_target(
     mallocMCExamples
     DEPENDS mallocMCExampleVectorAdd mallocMCExampleGetAvailableSlots
     COMMENT "Shortcut for building all examples."
-)
+    )
+endif()
diff --git a/examples/getAvailableSlots/CMakeLists.txt b/examples/getAvailableSlots/CMakeLists.txt
index 0ec6e4b5..aa2a74c0 100644
--- a/examples/getAvailableSlots/CMakeLists.txt
+++ b/examples/getAvailableSlots/CMakeLists.txt
@@ -32,3 +32,4 @@ set_target_properties(${PROJECT_NAME}
   )
 
 target_link_libraries(${PROJECT_NAME} mallocMC::mallocMC alpaka::alpaka)
+add_test(NAME ${PROJECT_NAME} COMMAND ${PROJECT_NAME})
diff --git a/examples/getAvailableSlots/source/main.cpp b/examples/getAvailableSlots/source/main.cpp
index 2ff0ba32..cc5e2531 100644
--- a/examples/getAvailableSlots/source/main.cpp
+++ b/examples/getAvailableSlots/source/main.cpp
@@ -136,14 +136,17 @@ auto main(int /*argc*/, char* /*argv*/[]) -> int
     example03<mallocMC::CreationPolicies::FlatterScatter<>, mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>>();
     example03<mallocMC::CreationPolicies::Scatter<>, mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>>();
 #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+#    ifdef mallocMC_HAS_Gallatin_AVAILABLE
     example03<
         mallocMC::CreationPolicies::GallatinCuda<>,
         mallocMC::ReservePoolPolicies::Noop,
         mallocMC::AlignmentPolicies::Noop>();
     // GallatinCuda already uses cudaSetLimits and we're not allowed to call it a second time.
     example03<mallocMC::CreationPolicies::OldMalloc, mallocMC::ReservePoolPolicies::Noop>();
+#    else
     // This should normally be:
-    // example01<mallocMC::CreationPolicies::OldMalloc, mallocMC::ReservePoolPolicies::CudaSetLimits>();
+    example03<mallocMC::CreationPolicies::OldMalloc, mallocMC::ReservePoolPolicies::CudaSetLimits>();
+#    endif
 #else
     example03<mallocMC::CreationPolicies::OldMalloc, mallocMC::ReservePoolPolicies::Noop>();
 #endif
diff --git a/examples/native-cuda/CMakeLists.txt b/examples/native-cuda/CMakeLists.txt
new file mode 100644
index 00000000..f7acefe6
--- /dev/null
+++ b/examples/native-cuda/CMakeLists.txt
@@ -0,0 +1,31 @@
+cmake_minimum_required(VERSION 3.14...3.22)
+
+project(mallocMCExampleNativeCuda LANGUAGES CXX CUDA)
+
+# --- Import tools ----
+
+include(${CMAKE_CURRENT_LIST_DIR}/../../cmake/tools.cmake)
+
+# ---- Dependencies ----
+
+include(${CMAKE_CURRENT_LIST_DIR}/../../cmake/CPM_0.40.2.cmake)
+CPMUsePackageLock(${CMAKE_CURRENT_LIST_DIR}/../../cmake/package-lock.cmake)
+
+if(NOT TARGET mallocMC)
+    CPMAddPackage(NAME mallocMC SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/../..)
+endif()
+
+# ---- Create standalone executable ----
+
+add_executable(${PROJECT_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/source/main.cu)
+
+set_target_properties(${PROJECT_NAME}
+    PROPERTIES
+        CXX_STANDARD 20
+        OUTPUT_NAME ${PROJECT_NAME}
+        CXX_STANDARD_REQUIRED ON
+        CXX_EXTENSIONS OFF
+    )
+
+target_link_libraries(${PROJECT_NAME} mallocMC::mallocMC ${CUDA_LIBRARIES})
+add_test(NAME ${PROJECT_NAME} COMMAND ${PROJECT_NAME})
diff --git a/examples/native-cuda/source/main.cu b/examples/native-cuda/source/main.cu
new file mode 100644
index 00000000..00c429a9
--- /dev/null
+++ b/examples/native-cuda/source/main.cu
@@ -0,0 +1,104 @@
+/*
+  mallocMC: Memory Allocator for Many Core Architectures.
+  https://www.hzdr.de/crp
+
+  Copyright 2025 Institute of Radiation Physics,
+                 Helmholtz-Zentrum Dresden - Rossendorf
+
+  Author(s): Julian Lenz - j.lenz ( at ) hzdr.de
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+*/
+
+#include <mallocMC/mallocMC.cuh>
+
+#include <cstdint>
+#include <iostream>
+#include <numeric>
+#include <span>
+
+/**
+ * @brief Computes the sum of squares of the first `n` natural numbers.
+ *
+ * This function calculates the sum of squares of the first `n` natural numbers using the formula:
+ * \[
+ * \text{sumOfSquares}(n) = \frac{n \times (n + 1) \times (2n + 1)}{6}
+ * \]
+ * It's used to check the computed value in the kernel.
+ *
+ * @param n The number of natural numbers to consider.
+ * @return The sum of squares of the first `n` natural numbers.
+ */
+__device__ auto sumOfSquares(auto const n)
+{
+    return (n * (n + 1) * (2 * n + 1)) / 6;
+}
+
+/**
+ * @brief Computes the dot product of two vectors for each thread.
+ *
+ * This kernel computes the dot product of two vectors, `a` and `b`, for each thread.
+ * Each thread allocates memory for its own vectors, initializes them with consecutive values,
+ * computes the dot product, and checks if the result matches the expected value.
+ * If the result does not match, the thread prints an error message and halts execution.
+ *
+ * @param memoryManager A CUDA memory manager object used for memory allocation and deallocation.
+ * @param numValues The number of elements in each vector.
+ *
+ * @note This kernel is, of course, not very realistic as a workload but it fulfills its purpose of showcasing a
+ * native CUDA application.
+ */
+__global__ void oneDotProductPerThread(mallocMC::CudaMemoryManager<> memoryManager, uint64_t numValues)
+{
+    uint64_t tid = threadIdx.x + blockIdx.x * blockDim.x;
+
+    // Not very realistic, all threads are doing this on their own:
+    auto a = std::span(
+        reinterpret_cast<uint64_t*>(memoryManager.malloc(numValues * sizeof(uint64_t))),
+        numValues);
+    auto b = std::span(
+        reinterpret_cast<uint64_t*>(memoryManager.malloc(numValues * sizeof(uint64_t))),
+        numValues);
+
+    std::iota(std::begin(a), std::end(a), tid);
+    std::iota(std::begin(b), std::end(b), tid);
+
+    uint64_t result = std::transform_reduce(std::cbegin(a), std::cend(a), std::cbegin(b), 0U);
+
+    auto expected = sumOfSquares(numValues + tid - 1) - (tid > 0 ? sumOfSquares(tid - 1) : 0);
+    if(result != expected)
+    {
+        printf("Thread %lu: Result %lu != Expected %lu.\n", tid, result, expected);
+        __trap();
+    }
+
+    memoryManager.free(a.data());
+    memoryManager.free(b.data());
+}
+
+int main()
+{
+    size_t const heapSize = 1024U * 1024U * 1024U;
+    uint64_t const numValues = 32U;
+    mallocMC::CudaHostInfrastructure<> hostInfrastructure{heapSize};
+    auto memoryManager = mallocMC::CudaMemoryManager{hostInfrastructure};
+
+    std::cout << "Running native CUDA kernel." << std::endl;
+    oneDotProductPerThread<<<8, 256>>>(memoryManager, numValues);
+}
diff --git a/examples/vectorAdd/CMakeLists.txt b/examples/vectorAdd/CMakeLists.txt
index 3f475723..11421048 100644
--- a/examples/vectorAdd/CMakeLists.txt
+++ b/examples/vectorAdd/CMakeLists.txt
@@ -32,3 +32,5 @@ set_target_properties(${PROJECT_NAME}
   )
 
 target_link_libraries(${PROJECT_NAME} mallocMC::mallocMC alpaka::alpaka)
+
+add_test(NAME ${PROJECT_NAME} COMMAND ${PROJECT_NAME})
diff --git a/examples/vectorAdd/source/main.cpp b/examples/vectorAdd/source/main.cpp
index 04c10377..461cef97 100644
--- a/examples/vectorAdd/source/main.cpp
+++ b/examples/vectorAdd/source/main.cpp
@@ -229,15 +229,19 @@ auto main(int /*argc*/, char* /*argv*/[]) -> int
 {
     example01<mallocMC::CreationPolicies::FlatterScatter<>, mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>>();
     example01<mallocMC::CreationPolicies::Scatter<>, mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>>();
+
 #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+#    ifdef mallocMC_HAS_Gallatin_AVAILABLE
     example01<
         mallocMC::CreationPolicies::GallatinCuda<>,
         mallocMC::ReservePoolPolicies::Noop,
         mallocMC::AlignmentPolicies::Noop>();
     // GallatinCuda already uses cudaSetLimits and we're not allowed to call it a second time.
     example01<mallocMC::CreationPolicies::OldMalloc, mallocMC::ReservePoolPolicies::Noop>();
+#    else
     // This should normally be:
-    // example01<mallocMC::CreationPolicies::OldMalloc, mallocMC::ReservePoolPolicies::CudaSetLimits>();
+    example01<mallocMC::CreationPolicies::OldMalloc, mallocMC::ReservePoolPolicies::CudaSetLimits>();
+#    endif
 #else
     example01<mallocMC::CreationPolicies::OldMalloc, mallocMC::ReservePoolPolicies::Noop>();
 #endif
diff --git a/include/mallocMC/allocator.hpp b/include/mallocMC/allocator.hpp
index d59411c7..447b381d 100644
--- a/include/mallocMC/allocator.hpp
+++ b/include/mallocMC/allocator.hpp
@@ -198,7 +198,7 @@ namespace mallocMC
         }
 
         ALPAKA_FN_HOST
-        auto getAllocatorHandle() -> AllocatorHandle
+        auto getAllocatorHandle() const -> AllocatorHandle
         {
             return AllocatorHandle{alpaka::getPtrNative(*devAllocatorBuffer)};
         }
diff --git a/include/mallocMC/creationPolicies/GallatinCuda.hpp b/include/mallocMC/creationPolicies/GallatinCuda.hpp
index 049a85e8..0fabc969 100644
--- a/include/mallocMC/creationPolicies/GallatinCuda.hpp
+++ b/include/mallocMC/creationPolicies/GallatinCuda.hpp
@@ -30,7 +30,7 @@
 
 #include <alpaka/alpaka.hpp>
 
-#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+#ifdef mallocMC_HAS_Gallatin_AVAILABLE
 
 #    include <gallatin/allocators/gallatin.cuh>
 #else
@@ -45,6 +45,23 @@ namespace gallatin::allocators
         {
             return nullptr;
         }
+
+        template<typename... T>
+        auto malloc(T... /*unused*/) -> void*
+        {
+            // This always triggers but it depends on the template parameter, so it's only instantiated if we actually
+            // use it.
+            static_assert(sizeof...(T) < 0, "Attempt to use malloc of unavailable gallatin prototype.");
+            return nullptr;
+        }
+
+        template<typename... T>
+        auto free(T... /*unused*/)
+        {
+            // This always triggers but it depends on the template parameter, so it's only instantiated if we actually
+            // use it.
+            static_assert(sizeof...(T) < 0, "Attempt to use free of unavailable gallatin prototype.");
+        }
     };
 } // namespace gallatin::allocators
@@ -89,7 +106,7 @@ namespace mallocMC
         static constexpr auto providesAvailableSlots = false;
 
         template<typename AlpakaAcc>
-        ALPAKA_FN_ACC auto create(AlpakaAcc const& acc, uint32_t bytes) const -> void*
+        ALPAKA_FN_ACC auto create(AlpakaAcc const& /*acc*/, uint32_t bytes) const -> void*
         {
             return heap->malloc(static_cast<size_t>(bytes));
         }
@@ -107,7 +124,7 @@ namespace mallocMC
 
         template<typename AlpakaDevice, typename AlpakaQueue, typename T_DeviceAllocator>
         static void initHeap(
-            AlpakaDevice& dev,
+            AlpakaDevice& /*dev*/,
             AlpakaQueue& queue,
             T_DeviceAllocator* devAllocator,
             void*,
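The fallback `malloc`/`free` stubs above rely on the deferred `static_assert` idiom: because the condition names the parameter pack, the compiler may only evaluate it when the function is actually instantiated, so merely including the header stays legal while any real call produces a readable compile error. A self-contained sketch of the same idiom, with hypothetical names:

```cpp
#include <cstddef>

namespace fallback
{
    struct Unavailable
    {
        // Declaring and even constructing the struct is fine; only a call to
        // malloc() instantiates the template and fires the assertion.
        template<typename... T>
        auto malloc(T... /*unused*/) -> void*
        {
            // sizeof...(T) < 0 can never hold, but it depends on T, so
            // evaluation is deferred until the point of instantiation.
            static_assert(sizeof...(T) < 0, "Attempt to use an unavailable prototype.");
            return nullptr;
        }
    };
} // namespace fallback

int main()
{
    fallback::Unavailable heap{}; // OK, nothing instantiated yet
    // heap.malloc(std::size_t{16}); // uncommenting this triggers the static_assert
}
```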
diff --git a/include/mallocMC/mallocMC.cuh b/include/mallocMC/mallocMC.cuh
new file mode 100644
index 00000000..eec79ac6
--- /dev/null
+++ b/include/mallocMC/mallocMC.cuh
@@ -0,0 +1,184 @@
+/*
+  mallocMC: Memory Allocator for Many Core Architectures.
+  https://www.hzdr.de/crp
+
+  Copyright 2025 Institute of Radiation Physics,
+                 Helmholtz-Zentrum Dresden - Rossendorf
+
+  Author(s): Julian Lenz - j.lenz ( at ) hzdr.de
+
+  Permission is hereby granted, free of charge, to any person obtaining a copy
+  of this software and associated documentation files (the "Software"), to deal
+  in the Software without restriction, including without limitation the rights
+  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+  copies of the Software, and to permit persons to whom the Software is
+  furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice shall be included in
+  all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+  THE SOFTWARE.
+*/
+
+#include "mallocMC/alignmentPolicies/Shrink.hpp"
+#include "mallocMC/creationPolicies/FlatterScatter.hpp"
+#include "mallocMC/reservePoolPolicies/AlpakaBuf.hpp"
+
+#include <alpaka/alpaka.hpp>
+#include <array>
+
+#include <cstdint>
+
+namespace mallocMC
+{
+    // This namespace implements an alpaka-agnostic interface by choosing some reasonable defaults working fine for
+    // CUDA devices. Further below, we export the necessary names to the global mallocMC:: namespace. See below if
+    // you're only interested in usage. Look inside if you want to understand what we've done here or want to port this
+    // to other architectures.
+    namespace detail
+    {
+        using Dim = alpaka::DimInt<1>;
+        using Idx = std::uint32_t;
+        using Acc = alpaka::AccGpuCudaRt<Dim, Idx>;
+
+        // Hide the alpaka-specific Acc argument of `ReservePoolPolicies::AlpakaBuf`.
+        using CudaAlpakaBuf = ReservePoolPolicies::AlpakaBuf<Acc>;
+
+        /**
+         * @brief Allocator template with hidden alpaka-specifics.
+         */
+        template<
+            typename T_CreationPolicy = CreationPolicies::FlatterScatter<>,
+            typename T_DistributionPolicy = DistributionPolicies::Noop,
+            typename T_OOMPolicy = OOMPolicies::ReturnNull,
+            typename T_ReservePoolPolicy = CudaAlpakaBuf,
+            typename T_AlignmentPolicy = AlignmentPolicies::Shrink<>>
+        using CudaAllocator = Allocator<
+            alpaka::AccToTag<Acc>,
+            T_CreationPolicy,
+            T_DistributionPolicy,
+            T_OOMPolicy,
+            T_ReservePoolPolicy,
+            T_AlignmentPolicy>;
+
+        /**
+         * @brief Host-side infrastructure needed for setting up everything.
+         *
+         * You need to create an instance of this on the host. It provides the alpaka infrastructure and sets up
+         * everything on the device side, so you can get started allocating stuff.
+         */
+        template<
+            typename T_CreationPolicy = CreationPolicies::FlatterScatter<>,
+            typename T_DistributionPolicy = DistributionPolicies::Noop,
+            typename T_OOMPolicy = OOMPolicies::ReturnNull,
+            typename T_ReservePoolPolicy = ReservePoolPolicies::AlpakaBuf<Acc>,
+            typename T_AlignmentPolicy = AlignmentPolicies::Shrink<>>
+        struct CudaHostInfrastructure
+        {
+            using MyAllocatorType = CudaAllocator<
+                T_CreationPolicy,
+                T_DistributionPolicy,
+                T_OOMPolicy,
+                T_ReservePoolPolicy,
+                T_AlignmentPolicy>;
+
+            // Keep this first, so compiler-generated constructors can be called as just
+            // CudaHostInfrastructure<>{heapSize};
+            size_t heapSize{};
+
+            // All of this is necessary alpaka infrastructure.
+            alpaka::Platform<Acc> const platform{};
+            std::remove_cv_t<decltype(alpaka::getDevByIdx(platform, 0))> const dev{alpaka::getDevByIdx(platform, 0)};
+            alpaka::Queue<Acc, alpaka::Blocking> queue{dev};
+
+            // This is our actual host-side instance of the allocator. It sets up everything on the device and provides
+            // the handle that we can pass to kernels.
+            MyAllocatorType hostInstance{dev, queue, heapSize};
+        };
+
+        /**
+         * @brief Memory manager to pass to kernels.
+         *
+         * Create this on the host and pass it to your kernels. It's a lightweight object barely more than a pointer,
+         * so you can just copy it around as needed. Its main purpose is to provide an alpaka-agnostic interface by
+         * adding an accelerator internally before forwarding malloc/free calls to mallocMC.
+         */
+        template<
+            typename T_CreationPolicy = CreationPolicies::FlatterScatter<>,
+            typename T_DistributionPolicy = DistributionPolicies::Noop,
+            typename T_OOMPolicy = OOMPolicies::ReturnNull,
+            typename T_ReservePoolPolicy = ReservePoolPolicies::AlpakaBuf<Acc>,
+            typename T_AlignmentPolicy = AlignmentPolicies::Shrink<>>
+        struct CudaMemoryManager
+        {
+            using MyHostInfrastructure = CudaHostInfrastructure<
+                T_CreationPolicy,
+                T_DistributionPolicy,
+                T_OOMPolicy,
+                T_ReservePoolPolicy,
+                T_AlignmentPolicy>;
+
+            /**
+             * @brief Construct the memory manager from the host infrastructure.
+             *
+             * @param hostInfrastructure Reference to the host infrastructure.
+             */
+            explicit CudaMemoryManager(MyHostInfrastructure const& hostInfrastructure)
+                : deviceHandle(hostInfrastructure.hostInstance.getAllocatorHandle())
+            {
+            }
+
+            /**
+             * @brief Allocates memory on the device.
+             *
+             * @param size Size of the memory to allocate.
+             * @return Pointer to the allocated memory.
+             */
+            __device__ __forceinline__ void* malloc(size_t size)
+            {
+                // This is cheating a tiny little bit. The accelerator could, in general, be a stateful object but
+                // concretely for CUDA and HIP it just forwards to the corresponding API calls, so it doesn't actually
+                // carry any information by itself. We're rather using it as a tag here.
+                std::array<std::byte, sizeof(Acc)> fakeAccMemory{};
+                return deviceHandle.malloc(*reinterpret_cast<Acc*>(fakeAccMemory.data()), size);
+            }
+
+            /**
+             * @brief Frees memory on the device.
+             *
+             * @param ptr Pointer to the memory to free.
+             */
+            __device__ __forceinline__ void free(void* ptr)
+            {
+                std::array<std::byte, sizeof(Acc)> fakeAccMemory{};
+                deviceHandle.free(*reinterpret_cast<Acc*>(fakeAccMemory.data()), ptr);
+            }
+
+            /**
+             * @brief Handle to the device allocator.
+             *
+             * This is what actually does the work in mallocMC. We forward all our calls to this.
+             */
+            MyHostInfrastructure::MyAllocatorType::AllocatorHandle deviceHandle;
+        };
+    } // namespace detail
+
+    // Use the following in your native CUDA code and you are good to go! All alpaka-specific interfaces are patched
+    // away.
+    using detail::CudaAllocator;
+    using detail::CudaHostInfrastructure;
+    using detail::CudaMemoryManager;
+
+    namespace ReservePoolPolicies
+    {
+        // This is provided because the original ReservePoolPolicies::AlpakaBuf takes an alpaka::Acc tag as template
+        // argument. In contrast, this is alpaka-agnostic.
+        using detail::CudaAlpakaBuf;
+    } // namespace ReservePoolPolicies
+} // namespace mallocMC
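For orientation, here is the complete round trip through the new header, condensed from `examples/native-cuda/source/main.cu` above. The trailing `cudaDeviceSynchronize()` is an addition in this sketch to make sure the kernel finishes before the heap is torn down; everything else mirrors the shipped example:

```cuda
#include <mallocMC/mallocMC.cuh>

__global__ void kernel(mallocMC::CudaMemoryManager<> memoryManager)
{
    // Reads like native device-side malloc/free; no alpaka types appear.
    auto* value = reinterpret_cast<int*>(memoryManager.malloc(sizeof(int)));
    *value = 42;
    memoryManager.free(value);
}

int main()
{
    size_t const heapSize = 1024U * 1024U * 1024U; // 1 GiB device heap
    mallocMC::CudaHostInfrastructure<> hostInfrastructure{heapSize};
    auto memoryManager = mallocMC::CudaMemoryManager{hostInfrastructure};

    kernel<<<1, 1>>>(memoryManager);
    cudaDeviceSynchronize(); // wait for the kernel before teardown
}
```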
diff --git a/include/mallocMC/reservePoolPolicies/CudaSetLimits.hpp b/include/mallocMC/reservePoolPolicies/CudaSetLimits.hpp
index 99bf4b86..b94daed8 100644
--- a/include/mallocMC/reservePoolPolicies/CudaSetLimits.hpp
+++ b/include/mallocMC/reservePoolPolicies/CudaSetLimits.hpp
@@ -59,7 +59,7 @@ namespace mallocMC
     struct CudaSetLimits
     {
         template<typename AlpakaDev>
-        auto setMemPool(AlpakaDev const& dev, size_t memsize) -> void*
+        auto setMemPool(AlpakaDev const& /*dev*/, size_t memsize) -> void*
         {
             cudaDeviceSetLimit(cudaLimitMallocHeapSize, memsize);
             return nullptr;
diff --git a/test/unit/source/Allocator.cpp b/test/unit/source/Allocator.cpp
index fcfc22f8..59aba169 100644
--- a/test/unit/source/Allocator.cpp
+++ b/test/unit/source/Allocator.cpp
@@ -51,7 +51,7 @@ TEST_CASE("Allocator")
     auto queue = alpaka::Queue<Acc, alpaka::Blocking>{dev};
 
     mallocMC::Allocator<
-        Acc,
+        alpaka::AccToTag<Acc>,
         mallocMC::CreationPolicies::FlatterScatter<>,
         mallocMC::DistributionPolicies::Noop,
        mallocMC::OOMPolicies::ReturnNull,
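The test change above is the migration pattern for every downstream instantiation: the first template argument of `Allocator` is now an alpaka tag, obtained via `alpaka::AccToTag`, rather than the accelerator type itself. A sketch of a full instantiation (the trailing policy list follows the order used by `CudaAllocator` in mallocMC.cuh; the concrete policy choices are illustrative):

```cpp
#include <mallocMC/mallocMC.hpp>

#include <alpaka/alpaka.hpp>

#include <cstdint>

using Acc = alpaka::AccGpuCudaRt<alpaka::DimInt<1>, std::uint32_t>;

// Before: mallocMC::Allocator<Acc, ...>; after: the tag stands in for the accelerator.
using MyAllocator = mallocMC::Allocator<
    alpaka::AccToTag<Acc>,
    mallocMC::CreationPolicies::FlatterScatter<>,
    mallocMC::DistributionPolicies::Noop,
    mallocMC::OOMPolicies::ReturnNull,
    mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>,
    mallocMC::AlignmentPolicies::Shrink<>>;
```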