Skip to content

Commit

Permalink
Merge pull request #277 from chillenzer/add-GallatinCUDAafterReorgani…
Browse files Browse the repository at this point in the history
…sation

Add gallatin cuda after reorganisation
  • Loading branch information
psychocoderHPC authored Feb 10, 2025
2 parents a288377 + 1a0530e commit bde4b6f
Show file tree
Hide file tree
Showing 7 changed files with 298 additions and 15 deletions.
26 changes: 26 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,38 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/add_controlled.cmake)
add_controlled("PackageProject.cmake" REQUIRED)
add_controlled("alpaka" REQUIRED)


# ---- Create library ----

# Note: for header-only libraries change all PUBLIC flags to INTERFACE and create an interface
add_library(${PROJECT_NAME} INTERFACE)
set_target_properties(${PROJECT_NAME} PROPERTIES CXX_STANDARD 20)

if(alpaka_ACC_GPU_CUDA_ENABLE)
    add_controlled("Gallatin")

    # Gallatin needs some fairly recent compute capability from CUDA.
    # CMake defaults to taking the oldest supported by the device
    # (https://cmake.org/cmake/help/latest/variable/CMAKE_CUDA_ARCHITECTURES.html)
    # which can be too old. This leads to compilation errors along the lines of
    #
    #   error: no instance of overloaded function "atomicCAS" matches the argument list
    #   argument types are: (unsigned short *, unsigned short, unsigned short)
    #
    # because this overload was only added later (apparently?).
    #
    # CMAKE_CUDA_ARCHITECTURES may be a semicolon-separated list and each entry may
    # carry a "-real"/"-virtual" suffix (or be a non-numeric value like "native"),
    # so a plain `if(<var> LESS 70)` would silently evaluate to false for such
    # values. Check every numeric entry individually instead.
    foreach(_mallocMC_cuda_arch IN LISTS CMAKE_CUDA_ARCHITECTURES)
        # Strip "-real"/"-virtual" so the numeric comparison below works.
        string(REGEX REPLACE "-(real|virtual)$" "" _mallocMC_cuda_arch "${_mallocMC_cuda_arch}")
        if(_mallocMC_cuda_arch MATCHES "^[0-9]+$" AND _mallocMC_cuda_arch LESS 70)
            message(
                WARNING
                "CUDA architecture detected is too old: ${CMAKE_CUDA_ARCHITECTURES}. "
                "If the architecture set is too old, this can lead to compilation errors with Gallatin. "
                "If Gallatin is needed, please set CMAKE_CUDA_ARCHITECTURES to the correct value >= 70."
            )
        endif()
    endforeach()

    target_link_libraries(${PROJECT_NAME} INTERFACE gallatin)
endif()

# being a cross-platform target, we enforce standards conformance on MSVC
target_compile_options(${PROJECT_NAME} INTERFACE "$<$<COMPILE_LANG_AND_ID:CXX,MSVC>:/permissive->")

Expand Down
9 changes: 9 additions & 0 deletions cmake/package-lock.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,12 @@ CPMDeclarePackage(Catch2
SYSTEM YES
EXCLUDE_FROM_ALL YES
)
# Gallatin (prototype CUDA allocator backing the GallatinCuda creation policy)
CPMDeclarePackage(Gallatin
# There's no release available yet, so we pin an explicit commit.
GIT_TAG ac0cb8e380ffcb74156bafb8805fb60412817c5f
# Use our own fork for some patches
GITHUB_REPOSITORY chillenzer/Gallatin
SYSTEM YES
EXCLUDE_FROM_ALL YES
)
26 changes: 20 additions & 6 deletions examples/getAvailableSlots/source/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,16 +99,19 @@ struct ExampleKernel
}
};

template<typename T_CreationPolicy>
template<
typename T_CreationPolicy,
typename T_ReservePoolPolicy,
typename T_AlignmentPolicy = mallocMC::AlignmentPolicies::Shrink<AlignmentConfig>>
auto example03() -> int
{
using Allocator = mallocMC::Allocator<
Acc,
T_CreationPolicy,
mallocMC::DistributionPolicies::Noop,
mallocMC::OOMPolicies::ReturnNull,
mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>,
mallocMC::AlignmentPolicies::Shrink<AlignmentConfig>>;
T_ReservePoolPolicy,
T_AlignmentPolicy>;

auto const platform = alpaka::Platform<Acc>{};
auto const dev = alpaka::getDevByIdx(platform, 0);
Expand All @@ -130,8 +133,19 @@ auto example03() -> int

auto main(int /*argc*/, char* /*argv*/[]) -> int
{
    // Exercise each creation policy once. FlatterScatter and Scatter keep their
    // heap inside an alpaka buffer, hence the AlpakaBuf reserve-pool policy.
    example03<FlatterScatter<FlatterScatterHeapConfig>, mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>>();
    example03<Scatter<FlatterScatterHeapConfig>, mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>>();
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
    // GallatinCuda manages its own pool and alignment, so both companion
    // policies are Noop here.
    example03<
        mallocMC::CreationPolicies::GallatinCuda<>,
        mallocMC::ReservePoolPolicies::Noop,
        mallocMC::AlignmentPolicies::Noop>();
    // GallatinCuda already uses cudaSetLimits and we're not allowed to call it a second time.
    example03<OldMalloc, mallocMC::ReservePoolPolicies::Noop>();
    // This should normally be:
    // example03<OldMalloc, mallocMC::ReservePoolPolicies::CudaSetLimits>();
#else
    example03<OldMalloc, mallocMC::ReservePoolPolicies::Noop>();
#endif
    return 0;
}
29 changes: 20 additions & 9 deletions examples/vectorAdd/source/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,6 @@
THE SOFTWARE.
*/

#include "mallocMC/creationPolicies/FlatterScatter.hpp"
#include "mallocMC/creationPolicies/OldMalloc.hpp"

#include <alpaka/alpaka.hpp>
#include <alpaka/example/ExampleDefaultAcc.hpp>

Expand Down Expand Up @@ -80,16 +77,19 @@ ALPAKA_STATIC_ACC_MEM_GLOBAL int** arA;
ALPAKA_STATIC_ACC_MEM_GLOBAL int** arB;
ALPAKA_STATIC_ACC_MEM_GLOBAL int** arC;

template<typename T_CreationPolicy>
template<
typename T_CreationPolicy,
typename T_ReservePoolPolicy,
typename T_AlignmentPolicy = mallocMC::AlignmentPolicies::Shrink<ShrinkConfig>>
auto example01() -> int
{
using Allocator = mallocMC::Allocator<
Acc,
T_CreationPolicy,
mallocMC::DistributionPolicies::Noop,
mallocMC::OOMPolicies::ReturnNull,
mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>,
mallocMC::AlignmentPolicies::Shrink<ShrinkConfig>>;
T_ReservePoolPolicy,
T_AlignmentPolicy>;

constexpr auto length = 100;

Expand Down Expand Up @@ -227,8 +227,19 @@ auto example01() -> int

auto main(int /*argc*/, char* /*argv*/[]) -> int
{
    // Exercise each creation policy once. FlatterScatter and Scatter keep their
    // heap inside an alpaka buffer, hence the AlpakaBuf reserve-pool policy.
    example01<FlatterScatter<FlatterScatterHeapConfig>, mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>>();
    example01<Scatter<FlatterScatterHeapConfig>, mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>>();
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
    // GallatinCuda manages its own pool and alignment, so both companion
    // policies are Noop here.
    example01<
        mallocMC::CreationPolicies::GallatinCuda<>,
        mallocMC::ReservePoolPolicies::Noop,
        mallocMC::AlignmentPolicies::Noop>();
    // GallatinCuda already uses cudaSetLimits and we're not allowed to call it a second time.
    example01<OldMalloc, mallocMC::ReservePoolPolicies::Noop>();
    // This should normally be:
    // example01<OldMalloc, mallocMC::ReservePoolPolicies::CudaSetLimits>();
#else
    example01<OldMalloc, mallocMC::ReservePoolPolicies::Noop>();
#endif
    return 0;
}
161 changes: 161 additions & 0 deletions include/mallocMC/creationPolicies/GallatinCuda.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
/*
mallocMC: Memory Allocator for Many Core Architectures.
Copyright 2014-2024 Institute of Radiation Physics,
Helmholtz-Zentrum Dresden - Rossendorf
Author(s): Carlchristian Eckert - c.eckert ( at ) hzdr.de
Julian Lenz - j.lenz ( at ) hzdr.de
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

#pragma once

#include <alpaka/alpaka.hpp>

#include <cstddef>
#include <cstdint>
#include <string>
#include <type_traits>

#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
# include <gallatin/allocators/gallatin.cuh>
#else

// Construct a fake, so we get a nice error message when we try to use it,
// and it's not in the way when we don't.
namespace gallatin::allocators
{
    // Fallback stand-in for the real Gallatin allocator class.
    template<size_t...>
    struct Gallatin
    {
        // Same name as the real device-side factory. It yields nullptr so the
        // surrounding declarations (e.g. the GallatinCudaImpl::heap member)
        // still compile; since this fake has no malloc/free members, any
        // attempt to actually allocate through it fails to compile.
        static auto generate_on_device(auto...)
        {
            return nullptr;
        }
    };
} // namespace gallatin::allocators

#endif

namespace mallocMC
{
namespace CreationPolicies
{
/**
 * @brief Prototype integration of Gallatin (https://dl.acm.org/doi/10.1145/3627535.3638499)
 *
 * This CreationPolicy integrates the CUDA code for the Gallatin prototype into mallocMC
 * as a thin wrapper. It is intended for proof-of-principle tests and benchmarks only and
 * obviously only works on CUDA devices.
 *
 * It also only works with ReservePoolPolicies::Noop because it performs on its own what
 * CudaSetLimits would otherwise do.
 *
 * If we should ever see the need for it, we'd re-implement it in alpaka for a fully-fledged
 * and well-maintained version of this.
 * Experience has been mixed so far: While we could reproduce good performance in some cases,
 * fragmentation was found to be unusably high (to the point of single-digit utilisation of
 * available memory) in PIConGPU. That's why there's currently no plan to lift the prototype
 * status in the near future.
 */
template<
    typename T_AlignmentPolicy,
    size_t bytes_per_segment = 16ULL * 1024 * 1024,
    size_t smallest_slice = 16,
    size_t largest_slice = 4096>
class GallatinCudaImpl
{
    using Gallatin = gallatin::allocators::Gallatin<bytes_per_segment, smallest_slice, largest_slice>;

public:
    template<typename T_AlignmentPolicyLocal>
    using AlignmentAwarePolicy
        = GallatinCudaImpl<T_AlignmentPolicyLocal, bytes_per_segment, smallest_slice, largest_slice>;

    //! Device-side pointer to the Gallatin allocator instance; published by initHeap.
    Gallatin* heap{nullptr};

    //! Gallatin does not expose the information needed for getAvailableSlots.
    static constexpr auto providesAvailableSlots = false;

    /**
     * @brief Allocate `bytes` bytes from the device-side Gallatin heap.
     *
     * @return Pointer to the allocation, or nullptr if Gallatin could not serve it.
     */
    template<typename AlpakaAcc>
    ALPAKA_FN_ACC auto create(AlpakaAcc const& /*acc*/, uint32_t bytes) const -> void*
    {
        return heap->malloc(static_cast<size_t>(bytes));
    }

    //! Hand a pointer previously obtained from create back to the device heap.
    template<typename AlpakaAcc>
    ALPAKA_FN_ACC void destroy(AlpakaAcc const& /*acc*/, void* mem) const
    {
        heap->free(mem);
    }

    //! A nullptr result for a non-zero request size signals out-of-memory.
    ALPAKA_FN_ACC auto isOOM(void* p, size_t s) const -> bool
    {
        return s != 0 && (p == nullptr);
    }

    /**
     * @brief Create the Gallatin allocator on the device and store its pointer in
     *        devAllocator->heap.
     *
     * Only compiles for CUDA accelerators (enforced by the static_assert below).
     *
     * @param queue Queue used to launch the single-thread kernel that publishes
     *              the heap pointer.
     * @param devAllocator Device-side allocator object whose heap member is set.
     * @param memsize Requested heap size in bytes; a request of 0 is ignored (see below).
     */
    template<typename AlpakaAcc, typename AlpakaDevice, typename AlpakaQueue, typename T_DeviceAllocator>
    static void initHeap(
        AlpakaDevice& /*dev*/,
        AlpakaQueue& queue,
        T_DeviceAllocator* devAllocator,
        void*,
        size_t memsize)
    {
        static_assert(
            std::is_same_v<alpaka::AccToTag<AlpakaAcc>, alpaka::TagGpuCudaRt>,
            "The GallatinCuda creation policy is only available on CUDA architectures. Please choose a "
            "different one.");

        // This is an extremely hot fix:
        // PIConGPU initialises its allocator with 0 bytes to be able to distribute the pointer.
        // Only afterwards it can find out its actual memory requirements and uses destructiveResize to set
        // the correct heap size. Gallatin runs into issues with this approach.
        // Instead, we simply don't believe the request if it's 0.
        if(memsize == 0)
            return;

        using Dim = typename alpaka::trait::DimType<AlpakaAcc>::type;
        using Idx = typename alpaka::trait::IdxType<AlpakaAcc>::type;
        using VecType = alpaka::Vec<Dim, Idx>;

        // NOTE(review): 42/true look like Gallatin's RNG seed and a verbosity
        // flag — confirm against Gallatin's generate_on_device API.
        auto tmp = Gallatin::generate_on_device(memsize, 42, true);

        // Launch a single device thread that copies the freshly created heap
        // pointer into the device-side allocator object.
        auto workDivSingleThread
            = alpaka::WorkDivMembers<Dim, Idx>{VecType::ones(), VecType::ones(), VecType::ones()};
        alpaka::exec<AlpakaAcc>(
            queue,
            workDivSingleThread,
            [tmp, devAllocator] ALPAKA_FN_ACC(AlpakaAcc const&) { devAllocator->heap = tmp; });
    }

    //! Human-readable policy name used for diagnostics.
    static auto classname() -> std::string
    {
        return "GallatinCuda";
    }
};

/**
 * @brief User-facing handle for the GallatinCuda creation policy.
 *
 * Carries only the compile-time heap configuration; the surrounding Allocator
 * supplies the alignment policy via AlignmentAwarePolicy to obtain the actual
 * implementation (GallatinCudaImpl).
 */
template<
    size_t bytes_per_segment = 16ULL * 1024 * 1024,
    size_t smallest_slice = 16,
    size_t largest_slice = 4096>
struct GallatinCuda
{
    //! Inject the alignment policy chosen by the Allocator into the implementation.
    template<typename T_AlignmentPolicy>
    using AlignmentAwarePolicy
        = GallatinCudaImpl<T_AlignmentPolicy, bytes_per_segment, smallest_slice, largest_slice>;
};

} // namespace CreationPolicies
} // namespace mallocMC
2 changes: 2 additions & 0 deletions include/mallocMC/mallocMC.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
#include "alignmentPolicies/Noop.hpp"
#include "alignmentPolicies/Shrink.hpp"
#include "creationPolicies/FlatterScatter.hpp"
#include "creationPolicies/GallatinCuda.hpp"
#include "creationPolicies/OldMalloc.hpp"
#include "creationPolicies/Scatter.hpp"
#include "distributionPolicies/Noop.hpp"
Expand All @@ -55,3 +56,4 @@
#include "oOMPolicies/ReturnNull.hpp"
#include "reservePoolPolicies/AlpakaBuf.hpp"
#include "reservePoolPolicies/CudaSetLimits.hpp"
#include "reservePoolPolicies/Noop.hpp"
60 changes: 60 additions & 0 deletions include/mallocMC/reservePoolPolicies/Noop.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
/*
mallocMC: Memory Allocator for Many Core Architectures.
Copyright 2014-2024 Institute of Radiation Physics,
Helmholtz-Zentrum Dresden - Rossendorf
Author(s): Carlchristian Eckert - c.eckert ( at ) hzdr.de
Julian Lenz - j.lenz ( at ) hzdr.de
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

#pragma once

#include <cstddef>
#include <string>

namespace mallocMC::ReservePoolPolicies
{
    /**
     * @brief Reserve-pool policy that performs no pool management at all.
     *
     * Intended for creation policies that were originally designed to reserve
     * and configure their memory pool entirely on their own. Currently needed
     * for GallatinCuda.
     */
    struct Noop
    {
        //! Identifier of this policy for diagnostics/output.
        static auto classname() -> std::string
        {
            return "Noop";
        }

        //! No pool is reserved, so there is nothing to hand out.
        template<typename AlpakaDev>
        auto setMemPool(AlpakaDev const& /*dev*/, size_t /*memsize*/) -> void*
        {
            return nullptr;
        }

        //! Nothing was reserved, hence nothing has to be torn down.
        static void resetMemPool()
        {
        }
    };
} // namespace mallocMC::ReservePoolPolicies

0 comments on commit bde4b6f

Please sign in to comment.