From acd5b5647f274b8f8dd89bc327302c2f61441dcf Mon Sep 17 00:00:00 2001 From: Julian Lenz Date: Wed, 22 Jan 2025 07:48:13 +0100 Subject: [PATCH 1/4] First attempt copying over Gallatin integration (untested) --- CMakeLists.txt | 6 + cmake/package-lock.cmake | 8 + .../creationPolicies/GallatinCuda.hpp | 161 ++++++++++++++++++ include/mallocMC/mallocMC.hpp | 2 + include/mallocMC/reservePoolPolicies/Noop.hpp | 60 +++++++ 5 files changed, 237 insertions(+) create mode 100644 include/mallocMC/creationPolicies/GallatinCuda.hpp create mode 100644 include/mallocMC/reservePoolPolicies/Noop.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 8360d5bd..fba581ab 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,12 +29,18 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/add_controlled.cmake) add_controlled("PackageProject.cmake" REQUIRED) add_controlled("alpaka" REQUIRED) +if(alpaka_ACC_GPU_CUDA_ENABLE) + add_controlled("Gallatin") +endif() + # ---- Create library ---- # Note: for header-only libraries change all PUBLIC flags to INTERFACE and create an interface add_library(${PROJECT_NAME} INTERFACE) set_target_properties(${PROJECT_NAME} PROPERTIES CXX_STANDARD 20) +target_link_libraries(${PROJECT_NAME} PRIVATE gallatin) + # being a cross-platform target, we enforce standards conformance on MSVC target_compile_options(${PROJECT_NAME} INTERFACE "$<$:/permissive->") diff --git a/cmake/package-lock.cmake b/cmake/package-lock.cmake index 4e2727a4..267ec6a1 100644 --- a/cmake/package-lock.cmake +++ b/cmake/package-lock.cmake @@ -35,3 +35,11 @@ CPMDeclarePackage(Catch2 SYSTEM YES EXCLUDE_FROM_ALL YES ) +# Gallatin +CPMDeclarePackage(Catch2 + # There's no release available yet. 
+ GIT_TAG 1aa70ade136c3c2042e2a9c2f25565aa56168a0f + GITHUB_REPOSITORY saltsystemslab/Gallatin + SYSTEM YES + EXCLUDE_FROM_ALL YES +) diff --git a/include/mallocMC/creationPolicies/GallatinCuda.hpp b/include/mallocMC/creationPolicies/GallatinCuda.hpp new file mode 100644 index 00000000..049a85e8 --- /dev/null +++ b/include/mallocMC/creationPolicies/GallatinCuda.hpp @@ -0,0 +1,161 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. + + Copyright 2014-2024 Institute of Radiation Physics, + Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Carlchristian Eckert - c.eckert ( at ) hzdr.de + Julian Lenz - j.lenz ( at ) hzdr.de + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#pragma once + +#include + +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED +# include +#else + +// Construct a fake, so we get a nice error message when we try to use it +// and it's not in the way when we don't. 
+namespace gallatin::allocators +{ + /** Stand-in for the real allocator: generate_on_device() accepts any arguments and returns nullptr. */ + template + struct Gallatin + { + static auto generate_on_device(auto...) + { + return nullptr; + } + }; +} // namespace gallatin::allocators + +#endif + +namespace mallocMC +{ + namespace CreationPolicies + { + /** + * @brief Prototype integration of Gallatin (https://dl.acm.org/doi/10.1145/3627535.3638499) + * + * This CreationPolicy integrates the CUDA code for the Gallatin prototype into mallocMC + * as a thin wrapper. It's intended for proof-of-principle tests and benchmarks only and + * obviously only works on CUDA devices. + * + * It also only works with the ReservePoolPolicies::Noop because it does what CudaSetLimits + * does internally on its own. + * + * If we should ever see the need for it, we'd re-implement it in alpaka for a fully-fledged + * and well-maintained version of this. + * Experience has been mixed so far: While we could reproduce good performance in some cases, + * fragmentation was found to be unusably high (to the point of single-digit utilisation of + * available memory) in PIConGPU. That's why there's currently no plan to lift the prototype + * status in the near future. 
+ */ + template< + typename T_AlignmentPolicy, + size_t bytes_per_segment = 16ULL * 1024 * 1024, + size_t smallest_slice = 16, + size_t largest_slice = 4096> + class GallatinCudaImpl + { + using Gallatin = gallatin::allocators::Gallatin; + + public: + template + using AlignmentAwarePolicy + = GallatinCudaImpl; + /** Gallatin instance every request is forwarded to; nullptr until initHeap() ran with memsize != 0. */ + Gallatin* heap{nullptr}; + + static constexpr auto providesAvailableSlots = false; + + /** + * @brief Request memory from the Gallatin heap. + * @param bytes Size of the request, handed to heap->malloc(). + * @return The pointer returned by heap->malloc(). + * + * The accelerator argument is not needed here, so it is left unnamed (as in destroy()) + * to avoid an unused-parameter warning. + */ + template + ALPAKA_FN_ACC auto create(AlpakaAcc const& /*acc*/, uint32_t bytes) const -> void* + { + return heap->malloc(static_cast(bytes)); + } + + /** @brief Hand mem back to the Gallatin heap via heap->free(). */ + template + ALPAKA_FN_ACC void destroy(AlpakaAcc const& /*acc*/, void* mem) const + { + heap->free(mem); + } + + /** @brief A request is out-of-memory if its size s is non-zero and its result p is nullptr. */ + ALPAKA_FN_ACC auto isOOM(void* p, size_t s) const -> bool + { + return s != 0 && (p == nullptr); + } + + /** + * @brief Create the Gallatin heap and publish it to the device-side allocator. + * @param queue Queue on which the kernel assigning devAllocator->heap is enqueued. + * @param devAllocator Allocator object whose heap member receives the new Gallatin pointer. + * @param memsize Size handed to Gallatin::generate_on_device(); a value of 0 makes this a no-op. + * + * A static_assert restricts this to CUDA (it compares against alpaka::TagGpuCudaRt). + */ + template + static void initHeap( + AlpakaDevice& dev, + AlpakaQueue& queue, + T_DeviceAllocator* devAllocator, + void*, + size_t memsize) + { + static_assert( + std::is_same_v, alpaka::TagGpuCudaRt>, + "The GallatinCuda creation policy is only available on CUDA architectures. Please choose a " + "different one."); + + // This is an extremely hot fix: + // PIConGPU initialises its allocator with 0 bytes to be able to distribute the pointer. + // Only afterwards it can find out its actual memory requirements and uses destructiveResize to set + // the correct heap size. Gallatin runs into issues with this approach. + // Instead, we simply don't believe the request if it's 0. + // NOTE(review): heap then stays nullptr, and create()/destroy() dereference it without a check. 
+ if(memsize == 0) + return; + + /* NOTE(review): devHost is not referenced again in the code visible here - confirm and remove. */ + auto devHost = alpaka::getDevByIdx(alpaka::PlatformCpu{}, 0); + using Dim = typename alpaka::trait::DimType::type; + using Idx = typename alpaka::trait::IdxType::type; + using VecType = alpaka::Vec; + + /* NOTE(review): 42 and true are defined by Gallatin's generate_on_device(); presumably seed and verbosity - confirm. */ + auto tmp = Gallatin::generate_on_device(memsize, 42, true); + auto workDivSingleThread + = alpaka::WorkDivMembers{VecType::ones(), VecType::ones(), VecType::ones()}; + alpaka::exec( + queue, + workDivSingleThread, + [tmp, devAllocator] ALPAKA_FN_ACC(AlpakaAcc const&) { devAllocator->heap = tmp; }); + } + + static auto classname() -> std::string + { + return "GallatinCuda"; + } + }; + + /** + * @brief Policy handle that fixes the Gallatin size parameters; the implementation is the + * GallatinCudaImpl named by AlignmentAwarePolicy. + */ + template< + size_t bytes_per_segment = 16ULL * 1024 * 1024, + size_t smallest_slice = 16, + size_t largest_slice = 4096> + struct GallatinCuda + { + template + using AlignmentAwarePolicy + = GallatinCudaImpl; + }; + + } // namespace CreationPolicies +} // namespace mallocMC diff --git a/include/mallocMC/mallocMC.hpp b/include/mallocMC/mallocMC.hpp index de96b7e0..41bf4882 100644 --- a/include/mallocMC/mallocMC.hpp +++ b/include/mallocMC/mallocMC.hpp @@ -47,6 +47,7 @@ #include "alignmentPolicies/Noop.hpp" #include "alignmentPolicies/Shrink.hpp" #include "creationPolicies/FlatterScatter.hpp" +#include "creationPolicies/GallatinCuda.hpp" #include "creationPolicies/OldMalloc.hpp" #include "creationPolicies/Scatter.hpp" #include "distributionPolicies/Noop.hpp" @@ -55,3 +56,4 @@ #include "oOMPolicies/ReturnNull.hpp" #include "reservePoolPolicies/AlpakaBuf.hpp" #include "reservePoolPolicies/CudaSetLimits.hpp" +#include "reservePoolPolicies/Noop.hpp" diff --git a/include/mallocMC/reservePoolPolicies/Noop.hpp b/include/mallocMC/reservePoolPolicies/Noop.hpp new file mode 100644 index 00000000..c6b82f18 --- /dev/null +++ b/include/mallocMC/reservePoolPolicies/Noop.hpp @@ -0,0 +1,60 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. 
+ + Copyright 2014-2024 Institute of Radiation Physics, + Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Carlchristian Eckert - c.eckert ( at ) hzdr.de + Julian Lenz - j.lenz ( at ) hzdr.de + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#pragma once + +namespace mallocMC +{ + namespace ReservePoolPolicies + { + /** + * @brief Does exactly nothing. + * + * This is intended for use with prototypes that were originally designed + * to handle these aspects on their own. Currently needed for GallatinCuda. + * + * setMemPool() ignores its arguments and returns nullptr; resetMemPool() has an empty body. + * + * NOTE(review): this header uses std::string and size_t but contains no #include, so it + * relies on the including file to provide them. 
+ */ + struct Noop + { + template + auto setMemPool(AlpakaDev const& dev, size_t memsize) -> void* + { + return nullptr; + } + + static void resetMemPool() + { + } + + static auto classname() -> std::string + { + return "Noop"; + } + }; + + } // namespace ReservePoolPolicies +} // namespace mallocMC From edbdcdf5f79ab51a0fc817bedf3ab4ad436eefc8 Mon Sep 17 00:00:00 2001 From: Julian Lenz Date: Wed, 22 Jan 2025 11:43:48 +0100 Subject: [PATCH 2/4] Add examples and make them run --- CMakeLists.txt | 8 +++--- cmake/package-lock.cmake | 7 +++--- examples/getAvailableSlots/source/main.cpp | 26 ++++++++++++++----- examples/vectorAdd/source/main.cpp | 29 +++++++++++++++------- 4 files changed, 48 insertions(+), 22 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fba581ab..2cd30213 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,9 +29,6 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/add_controlled.cmake) add_controlled("PackageProject.cmake" REQUIRED) add_controlled("alpaka" REQUIRED) -if(alpaka_ACC_GPU_CUDA_ENABLE) - add_controlled("Gallatin") -endif() # ---- Create library ---- @@ -39,7 +36,10 @@ endif() add_library(${PROJECT_NAME} INTERFACE) set_target_properties(${PROJECT_NAME} PROPERTIES CXX_STANDARD 20) -target_link_libraries(${PROJECT_NAME} PRIVATE gallatin) +if(alpaka_ACC_GPU_CUDA_ENABLE) + add_controlled("Gallatin") + target_link_libraries(${PROJECT_NAME} INTERFACE gallatin) +endif() # being a cross-platform target, we enforce standards conformance on MSVC target_compile_options(${PROJECT_NAME} INTERFACE "$<$:/permissive->") diff --git a/cmake/package-lock.cmake b/cmake/package-lock.cmake index 267ec6a1..9f1a9f1c 100644 --- a/cmake/package-lock.cmake +++ b/cmake/package-lock.cmake @@ -36,10 +36,11 @@ CPMDeclarePackage(Catch2 EXCLUDE_FROM_ALL YES ) # Gallatin -CPMDeclarePackage(Catch2 +CPMDeclarePackage(Gallatin # There's no release available yet. 
- GIT_TAG 1aa70ade136c3c2042e2a9c2f25565aa56168a0f - GITHUB_REPOSITORY saltsystemslab/Gallatin + GIT_TAG ac0cb8e380ffcb74156bafb8805fb60412817c5f + # Use our own fork for some patches + GITHUB_REPOSITORY chillenzer/Gallatin SYSTEM YES EXCLUDE_FROM_ALL YES ) diff --git a/examples/getAvailableSlots/source/main.cpp b/examples/getAvailableSlots/source/main.cpp index 8e183fde..4d134715 100644 --- a/examples/getAvailableSlots/source/main.cpp +++ b/examples/getAvailableSlots/source/main.cpp @@ -99,7 +99,10 @@ struct ExampleKernel } }; -template +template< + typename T_CreationPolicy, + typename T_ReservePoolPolicy, + typename T_AlignmentPolicy = mallocMC::AlignmentPolicies::Shrink> auto example03() -> int { using Allocator = mallocMC::Allocator< @@ -107,8 +110,8 @@ auto example03() -> int T_CreationPolicy, mallocMC::DistributionPolicies::Noop, mallocMC::OOMPolicies::ReturnNull, - mallocMC::ReservePoolPolicies::AlpakaBuf, - mallocMC::AlignmentPolicies::Shrink>; + T_ReservePoolPolicy, + T_AlignmentPolicy>; auto const platform = alpaka::Platform{}; auto const dev = alpaka::getDevByIdx(platform, 0); @@ -130,8 +133,19 @@ auto example03() -> int auto main(int /*argc*/, char* /*argv*/[]) -> int { - example03>(); - example03>(); - example03(); + example03, mallocMC::ReservePoolPolicies::AlpakaBuf>(); + example03, mallocMC::ReservePoolPolicies::AlpakaBuf>(); +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + example03< + mallocMC::CreationPolicies::GallatinCuda<>, + mallocMC::ReservePoolPolicies::Noop, + mallocMC::AlignmentPolicies::Noop>(); + // GallatinCuda already uses cudaSetLimits and we're not allowed to call it a second time. + example03(); + // This should normally be: + // example01(); +#else + example03(); +#endif return 0; } diff --git a/examples/vectorAdd/source/main.cpp b/examples/vectorAdd/source/main.cpp index 002af965..2c5833c1 100644 --- a/examples/vectorAdd/source/main.cpp +++ b/examples/vectorAdd/source/main.cpp @@ -27,9 +27,6 @@ THE SOFTWARE. 
*/ -#include "mallocMC/creationPolicies/FlatterScatter.hpp" -#include "mallocMC/creationPolicies/OldMalloc.hpp" - #include #include @@ -80,7 +77,10 @@ ALPAKA_STATIC_ACC_MEM_GLOBAL int** arA; ALPAKA_STATIC_ACC_MEM_GLOBAL int** arB; ALPAKA_STATIC_ACC_MEM_GLOBAL int** arC; -template +template< + typename T_CreationPolicy, + typename T_ReservePoolPolicy, + typename T_AlignmentPolicy = mallocMC::AlignmentPolicies::Shrink> auto example01() -> int { using Allocator = mallocMC::Allocator< @@ -88,8 +88,8 @@ auto example01() -> int T_CreationPolicy, mallocMC::DistributionPolicies::Noop, mallocMC::OOMPolicies::ReturnNull, - mallocMC::ReservePoolPolicies::AlpakaBuf, - mallocMC::AlignmentPolicies::Shrink>; + T_ReservePoolPolicy, + T_AlignmentPolicy>; constexpr auto length = 100; @@ -227,8 +227,19 @@ auto example01() -> int auto main(int /*argc*/, char* /*argv*/[]) -> int { - example01>(); - example01>(); - example01(); + example01, mallocMC::ReservePoolPolicies::AlpakaBuf>(); + example01, mallocMC::ReservePoolPolicies::AlpakaBuf>(); +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED + example01< + mallocMC::CreationPolicies::GallatinCuda<>, + mallocMC::ReservePoolPolicies::Noop, + mallocMC::AlignmentPolicies::Noop>(); + // GallatinCuda already uses cudaSetLimits and we're not allowed to call it a second time. 
+ example01(); + // This should normally be: + // example01(); +#else + example01(); +#endif return 0; } From 1944ff4f7928bff8b5a994382b60c11d38f6e17c Mon Sep 17 00:00:00 2001 From: Julian Lenz Date: Wed, 22 Jan 2025 11:48:50 +0100 Subject: [PATCH 3/4] Fix warnings --- include/mallocMC/reservePoolPolicies/Noop.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/mallocMC/reservePoolPolicies/Noop.hpp b/include/mallocMC/reservePoolPolicies/Noop.hpp index c6b82f18..57bf8261 100644 --- a/include/mallocMC/reservePoolPolicies/Noop.hpp +++ b/include/mallocMC/reservePoolPolicies/Noop.hpp @@ -41,7 +41,7 @@ namespace mallocMC struct Noop { template - auto setMemPool(AlpakaDev const& dev, size_t memsize) -> void* + auto setMemPool(AlpakaDev const& /*dev*/, size_t /*memsize*/) -> void* { return nullptr; } From 1a0530e98d197c6d6a4ae75e09beb245d15d4c46 Mon Sep 17 00:00:00 2001 From: Julian Lenz Date: Wed, 22 Jan 2025 16:48:42 +0100 Subject: [PATCH 4/4] Add a warning about compute capabilities --- CMakeLists.txt | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2cd30213..75f88423 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -38,6 +38,26 @@ set_target_properties(${PROJECT_NAME} PROPERTIES CXX_STANDARD 20) if(alpaka_ACC_GPU_CUDA_ENABLE) add_controlled("Gallatin") + + # Gallatin needs some fairly recent compute capability from CUDA. + # CMake defaults to taking the oldest supported by the device + # (https://cmake.org/cmake/help/latest/variable/CMAKE_CUDA_ARCHITECTURES.html) + # which can be too old. This leads to compilation errors along the lines of + # + # error: no instance of overloaded function "atomicCAS" matches the argument list + # argument types are: (unsigned short *, unsigned short, unsigned short) + # + # because this overload was only added later (apparently?). 
+ + if ("${CMAKE_CUDA_ARCHITECTURES}" LESS 70) + message( + WARNING + "CUDA architecture detected is too old: ${CMAKE_CUDA_ARCHITECTURES}. " + "If the architecture set is too old, this can lead to compilation errors with Gallatin. " + "If Gallatin is needed, please set CMAKE_CUDA_ARCHITECTURES to the correct value >= 70." + ) + endif() + target_link_libraries(${PROJECT_NAME} INTERFACE gallatin) endif()