Commit

Merge pull request #288 from chillenzer/native-cuda-example
Add infrastructure and example for native CUDA
psychocoderHPC authored Feb 12, 2025
2 parents 510520d + 9b6f17c commit 65c15d7
Showing 14 changed files with 397 additions and 16 deletions.
28 changes: 24 additions & 4 deletions CMakeLists.txt
@@ -17,6 +17,17 @@ if(PROJECT_SOURCE_DIR STREQUAL PROJECT_BINARY_DIR)
)
endif()

# ---- Options ----

option(mallocMC_BUILD_TESTING "Turn on/off building the tests" OFF)
option(mallocMC_BUILD_EXAMPLES "Turn on/off building the examples" OFF)
if (mallocMC_BUILD_TESTING OR mallocMC_BUILD_EXAMPLES)
enable_testing()
endif()
if (mallocMC_BUILD_TESTING)
set(alpaka_ACC_CPU_B_SEQ_T_SEQ_ENABLE ON CACHE BOOL "" FORCE)
endif()

# ---- Add dependencies via CPM ----
# see https://github.com/TheLartians/CPM.cmake for more info

@@ -39,6 +50,12 @@ set_target_properties(${PROJECT_NAME} PROPERTIES CXX_STANDARD 20)
if(alpaka_ACC_GPU_CUDA_ENABLE)
add_controlled("Gallatin")

if (TARGET gallatin::gallatin)
set(mallocMC_HAS_Gallatin_AVAILABLE YES)
else()
set(mallocMC_HAS_Gallatin_AVAILABLE NO)
endif()

# Gallatin needs some fairly recent compute capability from CUDA.
# CMake defaults to taking the oldest supported by the device
# (https://cmake.org/cmake/help/latest/variable/CMAKE_CUDA_ARCHITECTURES.html)
@@ -56,9 +73,13 @@ if(alpaka_ACC_GPU_CUDA_ENABLE)
"If the architecture set is too old, this can lead to compilation errors with Gallatin. "
"If Gallatin is needed, please set CMAKE_CUDA_ARCHITECTURES to the correct value >= 70."
)
set(mallocMC_HAS_Gallatin_AVAILABLE NO)
endif()

target_link_libraries(${PROJECT_NAME} INTERFACE gallatin)
if (mallocMC_HAS_Gallatin_AVAILABLE)
target_link_libraries(${PROJECT_NAME} INTERFACE gallatin)
target_compile_definitions(${PROJECT_NAME} INTERFACE mallocMC_HAS_Gallatin_AVAILABLE)
endif()
endif()

# being a cross-platform target, we enforce standards conformance on MSVC
Expand All @@ -68,15 +89,14 @@ target_include_directories(
${PROJECT_NAME} INTERFACE $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
$<INSTALL_INTERFACE:include/${PROJECT_NAME}-${PROJECT_VERSION}>
)
target_link_libraries(${PROJECT_NAME} INTERFACE alpaka::alpaka)


option(mallocMC_BUILD_TESTING "Turn on/off building the tests" OFF)
if(mallocMC_BUILD_TESTING)
include(${CMAKE_CURRENT_LIST_DIR}/cmake/tools.cmake)
enable_testing()
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/test ${CMAKE_BINARY_DIR}/test)
endif()

option(mallocMC_BUILD_EXAMPLES "Turn on/off building the examples" OFF)
if(mallocMC_BUILD_EXAMPLES)
include(${CMAKE_CURRENT_LIST_DIR}/cmake/tools.cmake)
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/examples ${CMAKE_BINARY_DIR}/examples)
5 changes: 3 additions & 2 deletions cmake/package-lock.cmake
@@ -11,10 +11,11 @@ CPMDeclarePackage(PackageProject.cmake
# alpaka
CPMDeclarePackage(alpaka
NAME alpaka
GIT_TAG 1.2.0
# This is a development version slightly after 1.2.0 because we needed a patch
GIT_TAG 95c0bf2397255a89467bb5c151a96367ad1d1f93
GITHUB_REPOSITORY alpaka-group/alpaka
OPTIONS
"alpaka_CXX_STANDARD 20"
"alpaka_CXX_STANDARD 20;alpaka_INSTALL ON"
# It is recommended to let CPM cache dependencies in order to reduce redundant downloads.
# However, we might in the foreseeable future turn to unstable references like the `dev` branch here.
# Setting the following option tells CPM to not use the cache.
18 changes: 16 additions & 2 deletions examples/CMakeLists.txt
@@ -12,8 +12,22 @@ add_subdirectory(
${CMAKE_BINARY_DIR}/examples/getAvailableSlots
)

add_custom_target(
check_language(CUDA)
if (CMAKE_CUDA_COMPILER AND alpaka_ACC_GPU_CUDA_ENABLE)
add_subdirectory(
${CMAKE_CURRENT_LIST_DIR}/native-cuda
${CMAKE_BINARY_DIR}/examples/native-cuda
)

add_custom_target(
mallocMCExamples
DEPENDS mallocMCExampleVectorAdd mallocMCExampleGetAvailableSlots mallocMCExampleNativeCuda
COMMENT "Shortcut for building all examples."
)
else()
add_custom_target(
mallocMCExamples
DEPENDS mallocMCExampleVectorAdd mallocMCExampleGetAvailableSlots
COMMENT "Shortcut for building all examples."
)
)
endif()
1 change: 1 addition & 0 deletions examples/getAvailableSlots/CMakeLists.txt
@@ -32,3 +32,4 @@ set_target_properties(${PROJECT_NAME}
)

target_link_libraries(${PROJECT_NAME} mallocMC::mallocMC alpaka::alpaka)
add_test(NAME ${PROJECT_NAME} COMMAND ${PROJECT_NAME})
5 changes: 4 additions & 1 deletion examples/getAvailableSlots/source/main.cpp
@@ -136,14 +136,17 @@ auto main(int /*argc*/, char* /*argv*/[]) -> int
example03<FlatterScatter<FlatterScatterHeapConfig>, mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>>();
example03<Scatter<FlatterScatterHeapConfig>, mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>>();
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
# ifdef mallocMC_HAS_Gallatin_AVAILABLE
example03<
mallocMC::CreationPolicies::GallatinCuda<>,
mallocMC::ReservePoolPolicies::Noop,
mallocMC::AlignmentPolicies::Noop>();
// GallatinCuda already uses cudaSetLimits and we're not allowed to call it a second time.
example03<OldMalloc, mallocMC::ReservePoolPolicies::Noop>();
# else
// This should normally be:
// example01<OldMalloc, mallocMC::ReservePoolPolicies::CudaSetLimits>();
example03<OldMalloc, mallocMC::ReservePoolPolicies::CudaSetLimits>();
# endif
#else
example03<OldMalloc, mallocMC::ReservePoolPolicies::Noop>();
#endif
31 changes: 31 additions & 0 deletions examples/native-cuda/CMakeLists.txt
@@ -0,0 +1,31 @@
cmake_minimum_required(VERSION 3.14...3.22)

project(mallocMCExampleNativeCuda LANGUAGES CXX CUDA)

# --- Import tools ----

include(${CMAKE_CURRENT_LIST_DIR}/../../cmake/tools.cmake)

# ---- Dependencies ----

include(${CMAKE_CURRENT_LIST_DIR}/../../cmake/CPM_0.40.2.cmake)
CPMUsePackageLock(${CMAKE_CURRENT_LIST_DIR}/../../cmake/package-lock.cmake)

if(NOT TARGET mallocMC)
CPMAddPackage(NAME mallocMC SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/../..)
endif()

# ---- Create standalone executable ----

add_executable(${PROJECT_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/source/main.cu)

set_target_properties(${PROJECT_NAME}
PROPERTIES
CXX_STANDARD 20
OUTPUT_NAME ${PROJECT_NAME}
CXX_STANDARD_REQUIRED ON
CXX_EXTENSIONS OFF
)

target_link_libraries(${PROJECT_NAME} mallocMC::mallocMC ${CUDA_LIBRARIES})
add_test(NAME ${PROJECT_NAME} COMMAND ${PROJECT_NAME})
104 changes: 104 additions & 0 deletions examples/native-cuda/source/main.cu
@@ -0,0 +1,104 @@
/*
mallocMC: Memory Allocator for Many Core Architectures.
https://www.hzdr.de/crp
Copyright 2025 Institute of Radiation Physics,
Helmholtz-Zentrum Dresden - Rossendorf
Author(s): Julian Lenz - j.lenz ( at ) hzdr.de
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

#include <mallocMC/mallocMC.cuh>

#include <cstdint>
#include <cstdlib>
#include <functional>
#include <iostream> // std::cout
#include <numeric> // std::iota, std::transform_reduce
#include <span>

/**
* @brief Computes the sum of squares of the first `n` natural numbers.
*
* This function calculates the sum of squares of the first `n` natural numbers using the formula:
* \[
* \text{sumOfSquares}(n) = \frac{n \times (n + 1) \times (2n + 1)}{6}
* \]
* It's used to check the computed value in the kernel.
*
* @param n The number of natural numbers to consider.
* @return The sum of squares of the first `n` natural numbers.
*/
__device__ auto sumOfSquares(auto const n)
{
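// Example (illustrative, not in the original source): sumOfSquares(3) = (3 * 4 * 7) / 6 = 14 = 1*1 + 2*2 + 3*3.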
return (n * (n + 1) * (2 * n + 1)) / 6;
}

/**
* @brief Computes the dot product of two vectors for each thread.
*
* This kernel computes the dot product of two vectors, `a` and `b`, for each thread.
* Each thread allocates memory for its own vectors, initializes them with consecutive values,
* computes the dot product, and checks if the result matches the expected value.
* If the result does not match, the thread prints an error message and halts execution.
*
* @param memoryManager A CUDA memory manager object used for memory allocation and deallocation.
* @param numValues The number of elements in each vector.
*
* @note This kernel is, of course, not very realistic as a workload, but it fulfills its purpose of showcasing a
* native CUDA application.
*/
__global__ void oneDotProductPerThread(mallocMC::CudaMemoryManager<> memoryManager, uint64_t numValues)
{
uint64_t tid = threadIdx.x + blockIdx.x * blockDim.x;

// Not very realistic, all threads are doing this on their own:
auto a = std::span<uint64_t>(
reinterpret_cast<uint64_t*>(memoryManager.malloc(numValues * sizeof(uint64_t))),
numValues);
auto b = std::span<uint64_t>(
reinterpret_cast<uint64_t*>(memoryManager.malloc(numValues * sizeof(uint64_t))),
numValues);

std::iota(std::begin(a), std::end(a), tid);
std::iota(std::begin(b), std::end(b), tid);

// The init value's type drives the accumulation type of std::transform_reduce, so pass a uint64_t.
uint64_t result = std::transform_reduce(std::cbegin(a), std::cend(a), std::cbegin(b), uint64_t{0});

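// The two spans hold the values tid, tid + 1, ..., tid + numValues - 1, so their dot product is the sum
// of i*i over that range, i.e. sumOfSquares(tid + numValues - 1) - sumOfSquares(tid - 1). The ternary
// below guards the tid == 0 case, where tid - 1 would wrap around for the unsigned type.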
auto expected = sumOfSquares(numValues + tid - 1) - (tid > 0 ? sumOfSquares(tid - 1) : 0);
if(result != expected)
{
printf("Thread %lu: Result %lu != Expected %lu. \n", tid, result, expected);
__trap();
}

memoryManager.free(a.data());
memoryManager.free(b.data());
}

int main()
{
size_t const heapSize = 1024U * 1024U * 1024U;
uint64_t const numValues = 32U;
mallocMC::CudaHostInfrastructure<> hostInfrastructure{heapSize};
auto memoryManager = mallocMC::CudaMemoryManager{hostInfrastructure};

std::cout << "Running native CUDA kernel." << std::endl;
oneDotProductPerThread<<<8, 256>>>(memoryManager, numValues);
}
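
The example returns right after the asynchronous kernel launch. If one wanted the kernel's printf output flushed and failures reported through the exit code, the launch could be followed by an explicit synchronization; a minimal sketch using standard CUDA runtime calls (not part of this commit):

    // Hypothetical follow-up to the launch in main() above (sketch, not committed code):
    if(cudaError_t const err = cudaDeviceSynchronize(); err != cudaSuccess)
    {
        std::cerr << "Kernel failed: " << cudaGetErrorString(err) << "\n";
        return EXIT_FAILURE;
    }
    return EXIT_SUCCESS;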
2 changes: 2 additions & 0 deletions examples/vectorAdd/CMakeLists.txt
@@ -32,3 +32,5 @@ set_target_properties(${PROJECT_NAME}
)

target_link_libraries(${PROJECT_NAME} mallocMC::mallocMC alpaka::alpaka)

add_test(NAME ${PROJECT_NAME} COMMAND ${PROJECT_NAME})
6 changes: 5 additions & 1 deletion examples/vectorAdd/source/main.cpp
@@ -229,15 +229,19 @@ auto main(int /*argc*/, char* /*argv*/[]) -> int
{
example01<FlatterScatter<FlatterScatterHeapConfig>, mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>>();
example01<Scatter<FlatterScatterHeapConfig>, mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>>();

#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
# ifdef mallocMC_HAS_Gallatin_AVAILABLE
example01<
mallocMC::CreationPolicies::GallatinCuda<>,
mallocMC::ReservePoolPolicies::Noop,
mallocMC::AlignmentPolicies::Noop>();
// GallatinCuda already uses cudaSetLimits and we're not allowed to call it a second time.
example01<OldMalloc, mallocMC::ReservePoolPolicies::Noop>();
# else
// This should normally be:
// example01<OldMalloc, mallocMC::ReservePoolPolicies::CudaSetLimits>();
example01<OldMalloc, mallocMC::ReservePoolPolicies::CudaSetLimits>();
# endif
#else
example01<OldMalloc, mallocMC::ReservePoolPolicies::Noop>();
#endif
2 changes: 1 addition & 1 deletion include/mallocMC/allocator.hpp
@@ -198,7 +198,7 @@ namespace mallocMC
}

ALPAKA_FN_HOST
auto getAllocatorHandle() -> AllocatorHandle
auto getAllocatorHandle() const -> AllocatorHandle
{
return AllocatorHandle{alpaka::getPtrNative(*devAllocatorBuffer)};
}
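
The only change in allocator.hpp is that getAllocatorHandle() is now const-qualified, so a handle can be obtained through a const reference to the allocator. A minimal illustration (hypothetical helper; TAllocator stands in for a concrete mallocMC allocator type, not code from the repository):

    // Hypothetical helper: taking the allocator by const reference is now sufficient.
    template<typename TAllocator>
    auto handleFor(TAllocator const& allocator)
    {
        // Compiles only because getAllocatorHandle() is const-qualified after this change.
        return allocator.getAllocatorHandle();
    }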
23 changes: 20 additions & 3 deletions include/mallocMC/creationPolicies/GallatinCuda.hpp
@@ -30,7 +30,7 @@

#include <alpaka/alpaka.hpp>

#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
#ifdef mallocMC_HAS_Gallatin_AVAILABLE
# include <gallatin/allocators/gallatin.cuh>
#else

@@ -45,6 +45,23 @@ namespace gallatin::allocators
{
return nullptr;
}

template<typename... T>
auto malloc(T... /*unused*/) -> void*
{
// This always triggers but it depends on the template parameter, so it's only instantiated if we actually
// use it.
static_assert(sizeof...(T) < 0, "Attempt to use malloc of unavailable gallatin prototype.");
return nullptr;
}

template<typename... T>
auto free(T... /*unused*/)
{
// This always triggers but it depends on the template parameter, so it's only instantiated if we actually
// use it.
static_assert(sizeof...(T) < 0, "Attempt to use free of unavailable gallatin prototype.");
}
};
} // namespace gallatin::allocators

@@ -89,7 +106,7 @@ namespace mallocMC
static constexpr auto providesAvailableSlots = false;

template<typename AlpakaAcc>
ALPAKA_FN_ACC auto create(AlpakaAcc const& acc, uint32_t bytes) const -> void*
ALPAKA_FN_ACC auto create(AlpakaAcc const& /*acc*/, uint32_t bytes) const -> void*
{
return heap->malloc(static_cast<size_t>(bytes));
}
@@ -107,7 +124,7 @@

template<typename AlpakaAcc, typename AlpakaDevice, typename AlpakaQueue, typename T_DeviceAllocator>
static void initHeap(
AlpakaDevice& dev,
AlpakaDevice& /*dev*/,
AlpakaQueue& queue,
T_DeviceAllocator* devAllocator,
void*,
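
The fallback malloc/free stubs added above rely on a dependent static_assert, so the error only fires when the stub is actually instantiated, i.e. when Gallatin is requested without being available. The same pattern in isolation (a generic sketch, not code from this repository):

    // "Fail only on use" stub: the condition depends on the template parameters, so the
    // static_assert is evaluated only when the function template is instantiated.
    template<typename... T>
    auto unavailableMalloc(T... /*unused*/) -> void*
    {
        // sizeof...(T) is unsigned and never negative, so the condition is always false.
        static_assert(sizeof...(T) < 0, "Attempt to use malloc of an unavailable backend.");
        return nullptr;
    }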
