Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -482,7 +482,7 @@ rmm::mr::polymorphic_allocator<int> stream_alloc;
// Constructs an adaptor that forwards all (de)allocations to `stream_alloc` on `stream`.
auto adapted = rmm::mr::stream_allocator_adaptor(stream_alloc, stream);

// Allocates 100 bytes using `stream_alloc` on `stream`
// Allocates storage for 100 ints using `stream_alloc` on `stream`
auto p = adapted.allocate(100);
...
// Deallocates using `stream_alloc` on `stream`
Expand Down
7 changes: 7 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ option(BUILD_BENCHMARKS "Configure CMake to build (google) benchmarks" OFF)
# This is mostly so that dependent libraries are configured in shared mode for downstream dependents
# of RMM that get their common dependencies transitively.
option(BUILD_SHARED_LIBS "Build RMM shared libraries" ON)
option(RMM_ENABLE_LEGACY_MR_INTERFACE "Enable legacy memory resource interface" ON)
set(RMM_LOGGING_LEVEL
"INFO"
CACHE STRING "Choose the logging level.")
Expand All @@ -54,6 +55,7 @@ message(VERBOSE "RMM: Build with NVTX support: ${RMM_NVTX}")
# Set logging level. Must go before including gtests and benchmarks. Set the possible values of
# build type for cmake-gui.
message(STATUS "RMM: RMM_LOGGING_LEVEL = '${RMM_LOGGING_LEVEL}'")
message(STATUS "RMM: Legacy MR interface enabled: ${RMM_ENABLE_LEGACY_MR_INTERFACE}")

# cudart can be linked statically or dynamically
option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF)
Expand Down Expand Up @@ -132,6 +134,11 @@ if(RMM_NVTX)
target_compile_definitions(rmm PUBLIC RMM_NVTX)
endif()

# Control legacy MR interface visibility
if(RMM_ENABLE_LEGACY_MR_INTERFACE)
target_compile_definitions(rmm PUBLIC RMM_ENABLE_LEGACY_MR_INTERFACE)
endif()

# ##################################################################################################
# * tests and benchmarks ---------------------------------------------------------------------------

Expand Down
4 changes: 2 additions & 2 deletions cpp/benchmarks/async_priming/async_priming_bench.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ void BM_AsyncPrimingImpact(benchmark::State& state, MRFactoryFunc factory)

// Deallocate all
for (auto* ptr : allocations) {
mr->deallocate(ptr, allocation_size);
mr->deallocate_sync(ptr, allocation_size);
}
allocations.clear();

Expand Down Expand Up @@ -118,7 +118,7 @@ void BM_AsyncPrimingImpact(benchmark::State& state, MRFactoryFunc factory)

// Clean up for next iteration
for (auto* ptr : allocations) {
mr->deallocate(ptr, allocation_size);
mr->deallocate_sync(ptr, allocation_size);
}
allocations.clear();
}
Expand Down
6 changes: 3 additions & 3 deletions cpp/benchmarks/random_allocations/random_allocations.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
* Copyright (c) 2019-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -94,7 +94,7 @@ void random_allocation_free(rmm::mr::device_memory_resource& mr,
void* ptr = nullptr;
if (do_alloc) { // try to allocate
try {
ptr = mr.allocate(size, stream);
ptr = mr.allocate(stream, size);
} catch (rmm::bad_alloc const&) {
do_alloc = false;
#if VERBOSE
Expand All @@ -118,7 +118,7 @@ void random_allocation_free(rmm::mr::device_memory_resource& mr,
std::size_t index = index_distribution(generator) % active_allocations;
active_allocations--;
allocation to_free = remove_at(allocations, index);
mr.deallocate(to_free.ptr, to_free.size, stream);
mr.deallocate(stream, to_free.ptr, to_free.size);
allocation_size -= to_free.size;

#if VERBOSE
Expand Down
1 change: 0 additions & 1 deletion cpp/include/rmm/aligned.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@

#include <cassert>
#include <cstddef>
#include <cstdint>

namespace RMM_EXPORT rmm {

Expand Down
68 changes: 4 additions & 64 deletions cpp/include/rmm/detail/cccl_adaptors.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ class cccl_resource_ref : public ResourceType {

cccl_resource_ref(base&& other) : base(std::move(other)) {}

#ifdef RMM_ENABLE_LEGACY_MR_INTERFACE
void* allocate(std::size_t bytes) { return this->allocate_sync(bytes); }

void* allocate(std::size_t bytes, std::size_t alignment)
Expand All @@ -50,8 +51,8 @@ class cccl_resource_ref : public ResourceType {
{
return this->deallocate_sync(ptr, bytes, alignment);
}
#endif // RMM_ENABLE_LEGACY_MR_INTERFACE

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK so the idea is that the cccl_resource_ref models our old setup where allocate -> synchronous and allocate_async -> stream-ordered.

Whereas cccl_async_resource_ref is allocate_sync -> synchronous and allocate -> stream-ordered.

Since currently we assume allocate is synchronous we need to adapt everyone to that first. And the way to do that is to migrate everyone using allocate to use allocate_sync. Then we can move them onto the cccl_async_resource_ref concept, and then we can move sync allocations that could be async back to allocate?

Copy link
Contributor Author

@bdice bdice Oct 16, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Generally that's the right direction.

The migration here is actually a little easier than stated above, because the signature for allocate changed too (in addition to changing sync-to-async). Disabling the "legacy" interface will cause a compile error anywhere the old interface was being used, allowing us to migrate to the new API names and new parameter order in each RAPIDS repository. I am starting that migration now. :)

Once that migration is complete, I will deprecate the "legacy" interface (at which point RAPIDS should not be using the legacy interface at all), then remove it in the subsequent release.

#if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1)
void* allocate_sync(std::size_t bytes) { return base::allocate_sync(bytes); }

void* allocate_sync(std::size_t bytes, std::size_t alignment)
Expand All @@ -68,24 +69,6 @@ class cccl_resource_ref : public ResourceType {
{
return base::deallocate_sync(ptr, bytes, alignment);
}
#else
void* allocate_sync(std::size_t bytes) { return base::allocate(bytes); }

void* allocate_sync(std::size_t bytes, std::size_t alignment)
{
return base::allocate(bytes, alignment);
}

void deallocate_sync(void* ptr, std::size_t bytes) noexcept
{
return base::deallocate(ptr, bytes);
}

void deallocate_sync(void* ptr, std::size_t bytes, std::size_t alignment) noexcept
{
return base::deallocate(ptr, bytes, alignment);
}
#endif
};

template <typename ResourceType>
Expand All @@ -98,8 +81,7 @@ class cccl_async_resource_ref : public ResourceType {
cccl_async_resource_ref(base const& other) : base(other) {}
cccl_async_resource_ref(base&& other) : base(std::move(other)) {}

// BEGINNING OF LEGACY MR METHODS

#ifdef RMM_ENABLE_LEGACY_MR_INTERFACE
void* allocate(std::size_t bytes) { return this->allocate_sync(bytes); }

void* allocate(std::size_t bytes, std::size_t alignment)
Expand Down Expand Up @@ -140,9 +122,8 @@ class cccl_async_resource_ref : public ResourceType {
return this->deallocate(stream, ptr, bytes, alignment);
}

// END OF LEGACY MR METHODS
#endif // RMM_ENABLE_LEGACY_MR_INTERFACE

#if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1)
void* allocate_sync(std::size_t bytes) { return base::allocate_sync(bytes); }

void* allocate_sync(std::size_t bytes, std::size_t alignment)
Expand Down Expand Up @@ -182,47 +163,6 @@ class cccl_async_resource_ref : public ResourceType {
{
return base::deallocate(stream, ptr, bytes, alignment);
}
#else
void* allocate_sync(std::size_t bytes) { return base::allocate(bytes); }

void* allocate_sync(std::size_t bytes, std::size_t alignment)
{
return base::allocate(bytes, alignment);
}

void deallocate_sync(void* ptr, std::size_t bytes) noexcept
{
return base::deallocate(ptr, bytes);
}

void deallocate_sync(void* ptr, std::size_t bytes, std::size_t alignment) noexcept
{
return base::deallocate(ptr, bytes, alignment);
}

void* allocate(cuda_stream_view stream, std::size_t bytes)
{
return base::allocate_async(bytes, stream);
}

void* allocate(cuda_stream_view stream, std::size_t bytes, std::size_t alignment)
{
return base::allocate_async(bytes, alignment, stream);
}

void deallocate(cuda_stream_view stream, void* ptr, std::size_t bytes) noexcept
{
return base::deallocate_async(ptr, bytes, stream);
}

void deallocate(cuda_stream_view stream,
void* ptr,
std::size_t bytes,
std::size_t alignment) noexcept
{
return base::deallocate_async(ptr, bytes, alignment, stream);
}
#endif
};

} // namespace detail
Expand Down
13 changes: 1 addition & 12 deletions cpp/include/rmm/detail/cuda_memory_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#ifndef LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE
#error \
"RMM requires LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE to be defined. Please add -DLIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE to the compiler flags (this is done automatically when using RMM via CMake)."
#endif
#endif // LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE

#include <rmm/detail/export.hpp>

Expand All @@ -28,7 +28,6 @@ namespace RMM_NAMESPACE {
namespace detail {
namespace polyfill {

#if CCCL_MAJOR_VERSION > 3 || (CCCL_MAJOR_VERSION == 3 && CCCL_MINOR_VERSION >= 1)
template <class Resource>
inline constexpr bool resource = cuda::mr::synchronous_resource<Resource>;
template <class Resource, class... Properties>
Expand All @@ -37,16 +36,6 @@ template <class Resource>
inline constexpr bool async_resource = cuda::mr::resource<Resource>;
template <class Resource, class... Properties>
inline constexpr bool async_resource_with = cuda::mr::resource_with<Resource, Properties...>;
#else // ^^^ CCCL >= 3.1 ^^^ / vvv CCCL < 3.1 vvv
template <class Resource>
inline constexpr bool resource = cuda::mr::resource<Resource>;
template <class Resource, class... Properties>
inline constexpr bool resource_with = cuda::mr::resource_with<Resource, Properties...>;
template <class Resource>
inline constexpr bool async_resource = cuda::mr::async_resource<Resource>;
template <class Resource, class... Properties>
inline constexpr bool async_resource_with = cuda::mr::async_resource_with<Resource, Properties...>;
#endif // CCCL < 3.1

} // namespace polyfill
} // namespace detail
Expand Down
8 changes: 4 additions & 4 deletions cpp/include/rmm/mr/device/aligned_resource_adaptor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,10 +143,10 @@ class aligned_resource_adaptor final : public device_memory_resource {
void* do_allocate(std::size_t bytes, cuda_stream_view stream) override
{
if (alignment_ == rmm::CUDA_ALLOCATION_ALIGNMENT || bytes < alignment_threshold_) {
return get_upstream_resource().allocate_async(bytes, 1, stream);
return get_upstream_resource().allocate(stream, bytes, 1);
}
auto const size = upstream_allocation_size(bytes);
void* pointer = get_upstream_resource().allocate_async(size, 1, stream);
void* pointer = get_upstream_resource().allocate(stream, size, 1);
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
auto const address = reinterpret_cast<std::size_t>(pointer);
auto const aligned_address = rmm::align_up(address, alignment_);
Expand All @@ -169,7 +169,7 @@ class aligned_resource_adaptor final : public device_memory_resource {
void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) noexcept override
{
if (alignment_ == rmm::CUDA_ALLOCATION_ALIGNMENT || bytes < alignment_threshold_) {
get_upstream_resource().deallocate_async(ptr, bytes, 1, stream);
get_upstream_resource().deallocate(stream, ptr, bytes, 1);
} else {
{
lock_guard lock(mtx_);
Expand All @@ -179,7 +179,7 @@ class aligned_resource_adaptor final : public device_memory_resource {
pointers_.erase(iter);
}
}
get_upstream_resource().deallocate_async(ptr, upstream_allocation_size(bytes), 1, stream);
get_upstream_resource().deallocate(stream, ptr, upstream_allocation_size(bytes), 1);
}
}

Expand Down
22 changes: 11 additions & 11 deletions cpp/include/rmm/mr/device/arena_memory_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,14 +156,14 @@ class arena_memory_resource final : public device_memory_resource {

{
std::shared_lock lock(mtx_);
void* pointer = arena.allocate(bytes);
void* pointer = arena.allocate_sync(bytes);
if (pointer != nullptr) { return pointer; }
}

{
std::unique_lock lock(mtx_);
defragment();
void* pointer = arena.allocate(bytes);
void* pointer = arena.allocate_sync(bytes);
if (pointer == nullptr) {
if (dump_log_on_failure_) { dump_memory_log(bytes); }
auto const msg = std::string("Maximum pool size exceeded (failed to allocate ") +
Expand Down Expand Up @@ -209,7 +209,7 @@ class arena_memory_resource final : public device_memory_resource {
{
std::shared_lock lock(mtx_);
// If the memory being freed does not belong to the arena, the following will return false.
if (arena.deallocate(ptr, bytes, stream)) { return; }
if (arena.deallocate(stream, ptr, bytes)) { return; }
}

{
Expand All @@ -218,31 +218,31 @@ class arena_memory_resource final : public device_memory_resource {
stream.synchronize_no_throw();

std::unique_lock lock(mtx_);
deallocate_from_other_arena(ptr, bytes, stream);
deallocate_from_other_arena(stream, ptr, bytes);
}
}

/**
* @brief Deallocate memory pointed to by `ptr` that was allocated in a different arena.
*
* @param stream Stream on which to perform deallocation.
* @param ptr Pointer to be deallocated.
* @param bytes The size in bytes of the allocation. This must be equal to the
* value of `bytes` that was passed to the `allocate` call that returned `ptr`.
* @param stream Stream on which to perform deallocation.
*/
void deallocate_from_other_arena(void* ptr, std::size_t bytes, cuda_stream_view stream)
void deallocate_from_other_arena(cuda_stream_view stream, void* ptr, std::size_t bytes)
{
if (use_per_thread_arena(stream)) {
for (auto const& thread_arena : thread_arenas_) {
if (thread_arena.second->deallocate(ptr, bytes)) { return; }
if (thread_arena.second->deallocate_sync(ptr, bytes)) { return; }
}
} else {
for (auto& stream_arena : stream_arenas_) {
if (stream_arena.second.deallocate(ptr, bytes)) { return; }
if (stream_arena.second.deallocate_sync(ptr, bytes)) { return; }
}
}

if (!global_arena_.deallocate(ptr, bytes)) {
if (!global_arena_.deallocate_sync(ptr, bytes)) {
// It's possible to use per thread default streams along with another pool of streams.
// This means that it's possible for an allocation to move from a thread or stream arena
// back into the global arena during a defragmentation and then move down into another arena
Expand All @@ -253,11 +253,11 @@ class arena_memory_resource final : public device_memory_resource {
// arenas all the time.
if (use_per_thread_arena(stream)) {
for (auto& stream_arena : stream_arenas_) {
if (stream_arena.second.deallocate(ptr, bytes)) { return; }
if (stream_arena.second.deallocate_sync(ptr, bytes)) { return; }
}
} else {
for (auto const& thread_arena : thread_arenas_) {
if (thread_arena.second->deallocate(ptr, bytes)) { return; }
if (thread_arena.second->deallocate_sync(ptr, bytes)) { return; }
}
}
RMM_FAIL("allocation not found");
Expand Down
4 changes: 2 additions & 2 deletions cpp/include/rmm/mr/device/binning_memory_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ class binning_memory_resource final : public device_memory_resource {
void* do_allocate(std::size_t bytes, cuda_stream_view stream) override
{
if (bytes <= 0) { return nullptr; }
return get_resource_ref(bytes).allocate_async(bytes, stream);
return get_resource_ref(bytes).allocate(stream, bytes);
}

/**
Expand All @@ -209,7 +209,7 @@ class binning_memory_resource final : public device_memory_resource {
*/
void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) noexcept override
{
get_resource_ref(bytes).deallocate_async(ptr, bytes, stream);
get_resource_ref(bytes).deallocate(stream, ptr, bytes);
}

device_async_resource_ref
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ class cuda_async_managed_memory_resource final : public device_memory_resource {
*/
void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override
{
return pool_.allocate(bytes, stream);
return pool_.allocate(stream, bytes);
}

/**
Expand All @@ -112,7 +112,7 @@ class cuda_async_managed_memory_resource final : public device_memory_resource {
*/
void do_deallocate(void* ptr, std::size_t bytes, rmm::cuda_stream_view stream) noexcept override
{
pool_.deallocate(ptr, bytes, stream);
pool_.deallocate(stream, ptr, bytes);
}

/**
Expand Down
4 changes: 2 additions & 2 deletions cpp/include/rmm/mr/device/cuda_async_memory_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ class cuda_async_memory_resource final : public device_memory_resource {
void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override
{
void* ptr{nullptr};
ptr = pool_.allocate(bytes, stream);
ptr = pool_.allocate(stream, bytes);
return ptr;
}

Expand All @@ -194,7 +194,7 @@ class cuda_async_memory_resource final : public device_memory_resource {
*/
void do_deallocate(void* ptr, std::size_t bytes, rmm::cuda_stream_view stream) noexcept override
{
pool_.deallocate(ptr, bytes, stream);
pool_.deallocate(stream, ptr, bytes);
}

/**
Expand Down
Loading
Loading