diff --git a/CMakeLists.txt b/CMakeLists.txt index da2a9ab9..7facb5fb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -68,6 +68,8 @@ target_include_directories( ${PROJECT_NAME} INTERFACE $ $ ) +target_link_libraries(${PROJECT_NAME} INTERFACE alpaka::alpaka) + option(mallocMC_BUILD_TESTING "Turn on/off building the tests" OFF) if(mallocMC_BUILD_TESTING) diff --git a/cmake/package-lock.cmake b/cmake/package-lock.cmake index 9f1a9f1c..07b9c332 100644 --- a/cmake/package-lock.cmake +++ b/cmake/package-lock.cmake @@ -11,10 +11,11 @@ CPMDeclarePackage(PackageProject.cmake # alpaka CPMDeclarePackage(alpaka NAME alpaka - GIT_TAG 1.2.0 - GITHUB_REPOSITORY alpaka-group/alpaka + # temporary solution until this is merged into alpaka + GIT_TAG add-option-for-installation + GITHUB_REPOSITORY chillenzer/alpaka OPTIONS - "alpaka_CXX_STANDARD 20" + "alpaka_CXX_STANDARD 20;alpaka_INSTALL ON" # It is recommended to let CPM cache dependencies in order to reduce redundant downloads. # However, we might in the foreseeable future turn to unstable references like the `dev` branch here. # Setting the following option tells CPM to not use the cache. diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 1f5672b6..382d4995 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -12,8 +12,13 @@ add_subdirectory( ${CMAKE_BINARY_DIR}/examples/getAvailableSlots ) +add_subdirectory( + ${CMAKE_CURRENT_LIST_DIR}/native-cuda + ${CMAKE_BINARY_DIR}/examples/native-cuda +) + add_custom_target( mallocMCExamples - DEPENDS mallocMCExampleVectorAdd mallocMCExampleGetAvailableSlots + DEPENDS mallocMCExampleVectorAdd mallocMCExampleGetAvailableSlots mallocMCExampleNativeCuda COMMENT "Shortcut for building all examples." 
) diff --git a/examples/native-cuda/CMakeLists.txt b/examples/native-cuda/CMakeLists.txt new file mode 100644 index 00000000..6f954816 --- /dev/null +++ b/examples/native-cuda/CMakeLists.txt @@ -0,0 +1,33 @@ +cmake_minimum_required(VERSION 3.14...3.22) + +check_language(CUDA) +if (CMAKE_CUDA_COMPILER) + project(mallocMCExampleNativeCuda LANGUAGES CXX CUDA) + + # --- Import tools ---- + + include(${CMAKE_CURRENT_LIST_DIR}/../../cmake/tools.cmake) + + # ---- Dependencies ---- + + include(${CMAKE_CURRENT_LIST_DIR}/../../cmake/CPM_0.40.2.cmake) + CPMUsePackageLock(${CMAKE_CURRENT_LIST_DIR}/../../cmake/package-lock.cmake) + + if(NOT TARGET mallocMC) + CPMAddPackage(NAME mallocMC SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/../..) + endif() + + # ---- Create standalone executable ---- + + add_executable(${PROJECT_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/source/main.cu) + + set_target_properties(${PROJECT_NAME} + PROPERTIES + CXX_STANDARD 20 + OUTPUT_NAME ${PROJECT_NAME} + CXX_STANDARD_REQUIRED ON + CXX_EXTENSIONS OFF + ) + + target_link_libraries(${PROJECT_NAME} mallocMC::mallocMC ${CUDA_LIBRARIES}) +endif() diff --git a/examples/native-cuda/source/main.cu b/examples/native-cuda/source/main.cu new file mode 100644 index 00000000..00c429a9 --- /dev/null +++ b/examples/native-cuda/source/main.cu @@ -0,0 +1,104 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. 
+ https://www.hzdr.de/crp + + Copyright 2025 Institute of Radiation Physics, + Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Julian Lenz - j.lenz ( at ) hzdr.de + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#include + +#include +#include +#include +#include + +/** + * @brief Computes the sum of squares of the first `n` natural numbers. + * + * This function calculates the sum of squares of the first `n` natural numbers using the formula: + * \[ + * \text{sumOfSquares}(n) = \frac{n \times (n + 1) \times (2n + 1)}{6} + * \] + * It's used to check the computed value in the kernel. + * + * @param n The number of natural numbers to consider. + * @return The sum of squares of the first `n` natural numbers. + */ +__device__ auto sumOfSquares(auto const n) +{ + return (n * (n + 1) * (2 * n + 1)) / 6; +} + +/** + * @brief Computes the dot product of two vectors for each thread. + * + * This kernel computes the dot product of two vectors, `a` and `b`, for each thread. 
+ * Each thread allocates memory for its own vectors, initializes them with consecutive values, + * computes the dot product, and checks if the result matches the expected value. + * If the result does not match, the thread prints an error message and halts execution. + * + * @param memoryManager A CUDA memory manager object used for memory allocation and deallocation. + * @param numValues The number of elements in each vector. + * + * @note This kernel is, of course, not very realistic as a workload but it fulfills its purpose of showcasing a + * native CUDA application. + */ +__global__ void oneDotProductPerThread(mallocMC::CudaMemoryManager<> memoryManager, uint64_t numValues) +{ + uint64_t tid = threadIdx.x + blockIdx.x * blockDim.x; + + // Not very realistic, all threads are doing this on their own: + auto a = std::span( + reinterpret_cast(memoryManager.malloc(numValues * sizeof(uint64_t))), + numValues); + auto b = std::span( + reinterpret_cast(memoryManager.malloc(numValues * sizeof(uint64_t))), + numValues); + + std::iota(std::begin(a), std::end(a), tid); + std::iota(std::begin(b), std::end(b), tid); + + uint64_t result = std::transform_reduce(std::cbegin(a), std::cend(a), std::cbegin(b), 0U); + + auto expected = sumOfSquares(numValues + tid - 1) - (tid > 0 ? sumOfSquares(tid - 1) : 0); + if(result != expected) + { + printf("Thread %lu: Result %lu != Expected %lu. \n", tid, result, expected); + __trap(); + } + + memoryManager.free(a.data()); + memoryManager.free(b.data()); +} + +int main() +{ + size_t const heapSize = 1024U * 1024U * 1024U; + uint64_t const numValues = 32U; + mallocMC::CudaHostInfrastructure<> hostInfrastructure{heapSize}; + auto memoryManager = mallocMC::CudaMemoryManager{hostInfrastructure}; + + std::cout << "Running native CUDA kernel." 
<< std::endl; + oneDotProductPerThread<<<8, 256>>>(memoryManager, numValues); +} diff --git a/include/mallocMC/allocator.hpp b/include/mallocMC/allocator.hpp index d59411c7..447b381d 100644 --- a/include/mallocMC/allocator.hpp +++ b/include/mallocMC/allocator.hpp @@ -198,7 +198,7 @@ namespace mallocMC } ALPAKA_FN_HOST - auto getAllocatorHandle() -> AllocatorHandle + auto getAllocatorHandle() const -> AllocatorHandle { return AllocatorHandle{alpaka::getPtrNative(*devAllocatorBuffer)}; } diff --git a/include/mallocMC/mallocMC.cuh b/include/mallocMC/mallocMC.cuh new file mode 100644 index 00000000..64b893ed --- /dev/null +++ b/include/mallocMC/mallocMC.cuh @@ -0,0 +1,184 @@ +/* + mallocMC: Memory Allocator for Many Core Architectures. + https://www.hzdr.de/crp + + Copyright 2025 Institute of Radiation Physics, + Helmholtz-Zentrum Dresden - Rossendorf + + Author(s): Julian Lenz - j.lenz ( at ) hzdr.de + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
+*/ + +#include "mallocMC/alignmentPolicies/Shrink.hpp" +#include "mallocMC/creationPolicies/FlatterScatter.hpp" +#include "mallocMC/reservePoolPolicies/AlpakaBuf.hpp" + +#include +#include + +#include + +namespace mallocMC +{ + // This namespace implements an alpaka-agnostic interface by choosing some reasonable defaults working fine for + // CUDA devices. Further below, we export the necessary names to the global mallocMC:: namespace. See below if + // you're only interested in usage. Look inside if you want to understand what we've done here or want to port this + // to other architectures. + namespace detail + { + using Dim = alpaka::DimInt<1>; + using Idx = std::uint32_t; + using Acc = alpaka::AccGpuCudaRt; + + // Hide the alpaka-specific Acc argument of `ReservePoolPolicies::AlpakaBuf`. + using CudaAlpakaBuf = ReservePoolPolicies::AlpakaBuf; + + /** + * @brief Allocator template with hidden alpaka-specifics. + */ + template< + typename T_CreationPolicy = CreationPolicies::FlatterScatter<>, + typename T_DistributionPolicy = DistributionPolicies::Noop, + typename T_OOMPolicy = OOMPolicies::ReturnNull, + typename T_ReservePoolPolicy = CudaAlpakaBuf, + typename T_AlignmentPolicy = AlignmentPolicies::Shrink<>> + using CudaAllocator = Allocator< + Acc, + T_CreationPolicy, + T_DistributionPolicy, + T_OOMPolicy, + T_ReservePoolPolicy, + T_AlignmentPolicy>; + + /** + * @brief Host-side infrastructure needed for setting up everything. + * + * You need to create an instance of this on the host. It provides the alpaka infrastructure and sets up + * everything on the device side, so you can get started allocating stuff. 
+ */ + template< + typename T_CreationPolicy = CreationPolicies::FlatterScatter<>, + typename T_DistributionPolicy = DistributionPolicies::Noop, + typename T_OOMPolicy = OOMPolicies::ReturnNull, + typename T_ReservePoolPolicy = ReservePoolPolicies::AlpakaBuf, + typename T_AlignmentPolicy = AlignmentPolicies::Shrink<>> + struct CudaHostInfrastructure + { + using MyAllocatorType = CudaAllocator< + T_CreationPolicy, + T_DistributionPolicy, + T_OOMPolicy, + T_ReservePoolPolicy, + T_AlignmentPolicy>; + + // Keep this first, so compiler-generated constructors can be called as just + // CudaHostInfrastructure<>{heapSize}; + size_t heapSize{}; + + // All of this is necessary alpaka infrastructure. + alpaka::Platform const platform{}; + std::remove_cv_t const dev{alpaka::getDevByIdx(platform, 0)}; + alpaka::Queue queue{dev}; + + // This is our actual host-side instance of the allocator. It sets up everything on the device and provides + // the handle that we can pass to kernels. + MyAllocatorType hostInstance{dev, queue, heapSize}; + }; + + /** + * @brief Memory manager to pass to kernels. + * + * Create this on the host and pass it to your kernels. It's a lightweight object barely more than a pointer, + * so you can just copy it around as needed. Its main purpose is to provide an alpaka-agnostic interface by + * adding an accelerator internally before forwarding malloc/free calls to mallocMC. + */ + template< + typename T_CreationPolicy = CreationPolicies::FlatterScatter<>, + typename T_DistributionPolicy = DistributionPolicies::Noop, + typename T_OOMPolicy = OOMPolicies::ReturnNull, + typename T_ReservePoolPolicy = ReservePoolPolicies::AlpakaBuf, + typename T_AlignmentPolicy = AlignmentPolicies::Shrink<>> + struct CudaMemoryManager + { + using MyHostInfrastructure = CudaHostInfrastructure< + T_CreationPolicy, + T_DistributionPolicy, + T_OOMPolicy, + T_ReservePoolPolicy, + T_AlignmentPolicy>; + + /** + * @brief Construct the memory manager from the host infrastructure. 
+ * + * @param hostInfrastructure Reference to the host infrastructure. + */ + explicit CudaMemoryManager(MyHostInfrastructure const& hostInfrastructure) + : deviceHandle(hostInfrastructure.hostInstance.getAllocatorHandle()) + { + } + + /** + * @brief Allocates memory on the device. + * + * @param size Size of the memory to allocate. + * @return Pointer to the allocated memory. + */ + __device__ __forceinline__ void* malloc(size_t size) + { + // This is cheating a tiny little bit. The accelerator could, in general, be a stateful object but + // concretely for CUDA and HIP it just forwards to the corresponding API calls, so it doesn't actually + // carry any information by itself. We're rather using it as a tag here. + std::array fakeAccMemory{}; + return deviceHandle.malloc(*reinterpret_cast(fakeAccMemory.data()), size); + } + + /** + * @brief Frees memory on the device. + * + * @param ptr Pointer to the memory to free. + */ + __device__ __forceinline__ void free(void* ptr) + { + std::array fakeAccMemory{}; + deviceHandle.free(*reinterpret_cast(fakeAccMemory.data()), ptr); + } + + /** + * @brief Handle to the device allocator. + * + * This is what actually does the work in mallocMC. We forward all our calls to this. + */ + MyHostInfrastructure::MyAllocatorType::AllocatorHandle deviceHandle; + }; + } // namespace detail + + // Use the following in your native CUDA code and you are good to go! All alpaka-specific interfaces are patched + // away. + using detail::CudaAllocator; + using detail::CudaHostInfrastructure; + using detail::CudaMemoryManager; + + namespace ReservePoolPolicies + { + // This is provided because the original ReservePoolPolicies::AlpakaBuf takes an alpaka::Acc tag as template + // argument. In contrast, this is alpaka-agnostic. + using detail::CudaAlpakaBuf; + } // namespace ReservePoolPolicies +} // namespace mallocMC