Skip to content
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 80 additions & 0 deletions cpp/include/rmm/device_buffer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

#include <cassert>
#include <cstddef>
#include <optional>

namespace RMM_NAMESPACE {
/**
Expand Down Expand Up @@ -80,6 +81,17 @@ namespace RMM_NAMESPACE {
*/
class device_buffer {
public:
/**
* @brief A struct to configure memory resources for a `device_buffer`.
*
* This struct allows for specifying a device memory resource for the buffer's storage and
* an optional host memory resource for a bounce buffer to optimize host-device transfers.
*/
struct memory_resource_args {
device_async_resource_ref device_mr{mr::get_current_device_resource_ref()};
std::optional<host_resource_ref> bounce_buffer_host_mr{std::nullopt};
};

// The copy constructor and copy assignment operator without a stream are deleted because they
// provide no way to specify an explicit stream
device_buffer(device_buffer const& other) = delete;
Expand Down Expand Up @@ -107,6 +119,21 @@ class device_buffer {
cuda_stream_view stream,
device_async_resource_ref mr = mr::get_current_device_resource_ref());

/**
* @brief Constructs a new device buffer of `size` uninitialized bytes
* with optional host bounce buffer
*
* If `mr_args.bounce_buffer_host_mr` is set and `size > 0`, a host bounce buffer of `size` bytes
* is allocated from that host resource in addition to the device allocation.
*
* @throws rmm::bad_alloc If allocation fails.
*
* @param size Size in bytes to allocate in device memory.
* @param stream CUDA stream on which memory may be allocated if the memory
* resource supports streams.
* @param mr_args Arguments to configure memory resources for a `device_buffer`:
* `device_mr` provides the device storage and `bounce_buffer_host_mr`, if set, provides the
* host bounce buffer.
*/
explicit device_buffer(std::size_t size,
cuda_stream_view stream,
memory_resource_args const& mr_args);

/**
* @brief Construct a new device buffer by copying from a raw pointer to an existing host or
* device memory allocation.
Expand All @@ -131,6 +158,30 @@ class device_buffer {
cuda_stream_view stream,
device_async_resource_ref mr = mr::get_current_device_resource_ref());

/**
* @brief Construct a new device buffer by copying from a raw pointer to an existing host or
* device memory allocation with optional host bounce buffer.
*
* @note This function does not synchronize `stream`. `source_data` is copied on `stream`, so the
* caller is responsible for correct synchronization to ensure that `source_data` is valid when
* the copy occurs. This includes destroying `source_data` in stream order after this function is
* called, or synchronizing or waiting on `stream` after this function returns as necessary.
*
* @note When `mr_args.bounce_buffer_host_mr` is set and `cudaPointerGetAttributes` reports
* `source_data` as `cudaMemoryTypeHost`, the data is first copied into the host bounce buffer and
* then from the bounce buffer to device memory, both on `stream`. In every other case a single
* direct copy from `source_data` is issued.
* NOTE(review): presumably the bounce path is meant for pageable host sources; verify which
* `cudaMemoryType` the CUDA runtime reports for unregistered host pointers.
*
* @throws rmm::bad_alloc If creating the new allocation fails.
* @throws rmm::logic_error If `source_data` is null, and `size != 0`.
* @throws rmm::cuda_error if copying from `source_data` fails.
*
* @param source_data Pointer to the host or device memory to copy from.
* @param size Size in bytes to copy.
* @param stream CUDA stream on which memory may be allocated if the memory
* resource supports streams.
* @param mr_args Arguments to configure memory resources for a `device_buffer`:
* `device_mr` provides the device storage and `bounce_buffer_host_mr`, if set, provides the
* host bounce buffer.
*/
device_buffer(void const* source_data,
std::size_t size,
cuda_stream_view stream,
memory_resource_args const& mr_args);

/**
* @brief Construct a new `device_buffer` by deep copying the contents of
* another `device_buffer`, optionally using the specified stream and memory
Expand All @@ -156,6 +207,30 @@ class device_buffer {
cuda_stream_view stream,
device_async_resource_ref mr = mr::get_current_device_resource_ref());

/**
* @brief Construct a new `device_buffer` by deep copying the contents of
* another `device_buffer` with optional host bounce buffer.
*
* Equivalent to constructing from the raw pointer `other.data()` with size `other.size()` using
* the same `stream` and `mr_args`.
*
* @note Only copies `other.size()` bytes from `other`, i.e., if
* `other.size() != other.capacity()`, then the size and capacity of the newly
* constructed `device_buffer` will be equal to `other.size()`.
*
* @note This function does not synchronize `stream`. `other` is copied on `stream`, so the
* caller is responsible for correct synchronization to ensure that `other` is valid when
* the copy occurs. This includes destroying `other` in stream order after this function is
* called, or synchronizing or waiting on `stream` after this function returns as necessary.
*
* @throws rmm::bad_alloc If creating the new allocation fails.
* @throws rmm::cuda_error if copying from `other` fails.
*
* @param other The `device_buffer` whose contents will be copied
* @param stream The stream to use for the allocation and copy
* @param mr_args Arguments to configure memory resources for a `device_buffer`:
* `device_mr` provides the device storage and `bounce_buffer_host_mr`, if set, provides the
* host bounce buffer.
*/
device_buffer(device_buffer const& other,
cuda_stream_view stream,
memory_resource_args const& mr_args);

/**
* @brief Constructs a new `device_buffer` by moving the contents of another
* `device_buffer` into the newly constructed one.
Expand Down Expand Up @@ -332,6 +407,11 @@ class device_buffer {
///< allocate/deallocate device memory
cuda_device_id _device{get_current_cuda_device()};

std::optional<host_resource_ref> _host_mr{
std::nullopt}; ///< Optional host memory resource for bounce buffers
std::optional<void*> _host_bounce_buffer{
std::nullopt}; ///< Optional bounce buffer for host-device transfers

/**
* @brief Allocates the specified amount of memory and updates the size/capacity accordingly.
*
Expand Down
119 changes: 105 additions & 14 deletions cpp/src/device_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@

#include <cuda_runtime_api.h>

#include <cstring>

namespace rmm {

device_buffer::device_buffer() : _mr{rmm::mr::get_current_device_resource_ref()} {}
Expand All @@ -32,6 +34,15 @@ device_buffer::device_buffer(std::size_t size,
allocate_async(size);
}

// Size-only constructor taking `memory_resource_args`: stores the device resource and the
// optional host resource, then allocates `size` bytes with the buffer's device made current.
device_buffer::device_buffer(std::size_t size,
                             cuda_stream_view stream,
                             memory_resource_args const& mr_args)
  : _stream{stream},
    _mr{mr_args.device_mr},
    _host_mr{mr_args.bounce_buffer_host_mr}
{
  // Keep `_device` current for the duration of the allocation.
  cuda_set_device_raii device_guard{_device};
  allocate_async(size);
}

device_buffer::device_buffer(void const* source_data,
std::size_t size,
cuda_stream_view stream,
Expand All @@ -43,26 +54,48 @@ device_buffer::device_buffer(void const* source_data,
copy_async(source_data, size);
}

// Copying constructor taking `memory_resource_args`: allocates `size` bytes (plus the host
// bounce buffer when a host resource is supplied) and copies `size` bytes from `source_data`
// on `stream`.
//
// Throws whatever allocate_async() or copy_async() throw. If the copy throws, the storage
// allocated here is released before the exception propagates.
device_buffer::device_buffer(void const* source_data,
                             std::size_t size,
                             cuda_stream_view stream,
                             memory_resource_args const& mr_args)
  : _stream{stream}, _mr{mr_args.device_mr}, _host_mr{mr_args.bounce_buffer_host_mr}
{
  cuda_set_device_raii dev{_device};
  allocate_async(size);
  try {
    copy_async(source_data, size);
  } catch (...) {
    // The destructor does not run for an object whose constructor throws, so the device
    // allocation (and bounce buffer, if any) must be released here to avoid leaking them.
    deallocate_async();
    throw;
  }
}

device_buffer::device_buffer(device_buffer const& other,
cuda_stream_view stream,
device_async_resource_ref mr)
: device_buffer{other.data(), other.size(), stream, mr}
{
}

// Deep-copy constructor taking `memory_resource_args`: forwards `other`'s data pointer and size
// to the raw-pointer constructor, so only `other.size()` bytes are copied.
device_buffer::device_buffer(device_buffer const& other,
                             cuda_stream_view stream,
                             memory_resource_args const& mr_args)
  : device_buffer{other.data(), other.size(), stream, mr_args}
{}

device_buffer::device_buffer(device_buffer&& other) noexcept
: _data{other._data},
_size{other._size},
_capacity{other._capacity},
_stream{other.stream()},
_mr{other._mr},
_device{other._device}
_device{other._device},
_host_mr{other._host_mr},
_host_bounce_buffer{other._host_bounce_buffer}
Copy link
Contributor Author

@JigaoLuo JigaoLuo Jul 23, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To Line 90: I’ll start with a simple discussion point: I believe the bounce buffer is stateless — it doesn’t store anything and serves only as a temporary transfer buffer. If you agree, I’d prefer not to copy or move it, to reflect its transient nature.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It doesn't store anything, but it is an allocation that's a part of the object's state.
When we copy an object, I assume we want the new one to have the equivalent bounce buffer. So, we don't need to copy, but still need to allocate a new one.
When we move, reusing the old allocation should be the cheapest option, even though we don't care about the content.

{
other._data = nullptr;
other._size = 0;
other._capacity = 0;
other.set_stream(cuda_stream_view{});
other._device = cuda_device_id{-1};
other._device = cuda_device_id{-1};
other._host_mr = std::nullopt;
other._host_bounce_buffer = std::nullopt;
}

device_buffer& device_buffer::operator=(device_buffer&& other) noexcept
Expand All @@ -75,14 +108,18 @@ device_buffer& device_buffer::operator=(device_buffer&& other) noexcept
_size = other._size;
_capacity = other._capacity;
set_stream(other.stream());
_mr = other._mr;
_device = other._device;
_mr = other._mr;
_device = other._device;
_host_mr = other._host_mr;
_host_bounce_buffer = other._host_bounce_buffer;
Copy link
Contributor Author

@JigaoLuo JigaoLuo Jul 23, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To Line 114: Same reasoning as my earlier point about statelessness.


other._data = nullptr;
other._size = 0;
other._capacity = 0;
other.set_stream(cuda_stream_view{});
other._device = cuda_device_id{-1};
other._device = cuda_device_id{-1};
other._host_mr = std::nullopt;
other._host_bounce_buffer = std::nullopt;
}
return *this;
}
Expand All @@ -96,14 +133,30 @@ device_buffer::~device_buffer() noexcept

void device_buffer::allocate_async(std::size_t bytes)
{
_size = bytes;
_capacity = bytes;
_data = (bytes > 0) ? _mr.allocate_async(bytes, stream()) : nullptr;
auto const old_capacity = _capacity;
_size = bytes;
_capacity = bytes;
_data = (bytes > 0) ? _mr.allocate_async(bytes, stream()) : nullptr;

// Resize host bounce buffer if needed
if (_host_mr.has_value() && bytes > 0) {
if (_host_bounce_buffer.has_value()) {
_host_mr->deallocate(_host_bounce_buffer.value(), old_capacity);
}
_host_bounce_buffer = _host_mr->allocate(bytes);
}
}

void device_buffer::deallocate_async() noexcept
{
if (capacity() > 0) { _mr.deallocate_async(data(), capacity(), stream()); }

// Deallocate host bounce buffer if it exists
if (_host_bounce_buffer.has_value() && _host_mr.has_value()) {
_host_mr->deallocate(_host_bounce_buffer.value(), capacity());
_host_bounce_buffer = std::nullopt;
}

_size = 0;
_capacity = 0;
_data = nullptr;
Expand All @@ -115,6 +168,20 @@ void device_buffer::copy_async(void const* source, std::size_t bytes)
RMM_EXPECTS(nullptr != source, "Invalid copy from nullptr.");
RMM_EXPECTS(nullptr != _data, "Invalid copy to nullptr.");

if (_host_bounce_buffer.has_value() && _host_mr.has_value()) {
// If source is host memory, use bounce buffer for optimized transfer
cudaPointerAttributes attributes;
cudaError_t result = cudaPointerGetAttributes(&attributes, source);
if (result == cudaSuccess && attributes.type == cudaMemoryTypeHost) {
RMM_CUDA_TRY(cudaMemcpyAsync(
_host_bounce_buffer.value(), source, bytes, cudaMemcpyHostToHost, stream().value()));
RMM_CUDA_TRY(cudaMemcpyAsync(
_data, _host_bounce_buffer.value(), bytes, cudaMemcpyHostToDevice, stream().value()));
return;
Comment on lines +175 to +180
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the user's source pointer is in page-locked host memory this is a pessimisation, I think.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This branch only executes when the user passes a host mr, with the expectation that this would be some kind of pinned memory allocator. When the host mr is not set (the default case), the copy is done the same way it was before this PR.

}
}

// Fallback to direct copy (device-to-device or host-to-device without bounce buffer)
RMM_CUDA_TRY(cudaMemcpyAsync(_data, source, bytes, cudaMemcpyDefault, stream().value()));
}
}
Expand All @@ -124,9 +191,18 @@ void device_buffer::reserve(std::size_t new_capacity, cuda_stream_view stream)
set_stream(stream);
if (new_capacity > capacity()) {
cuda_set_device_raii dev{_device};
auto tmp = device_buffer{new_capacity, stream, _mr};

device_buffer tmp;
if (_host_mr.has_value()) {
memory_resource_args args{_mr, _host_mr};
tmp = device_buffer{new_capacity, stream, args};
} else {
tmp = device_buffer{new_capacity, stream, _mr};
}

auto const old_size = size();
RMM_CUDA_TRY(cudaMemcpyAsync(tmp.data(), data(), size(), cudaMemcpyDefault, stream.value()));
RMM_CUDA_TRY(cudaMemcpyAsync(
tmp.data(), data(), size(), cudaMemcpyDefault, stream.value())); // device-to-device copy
*this = std::move(tmp);
_size = old_size;
}
Expand All @@ -141,8 +217,17 @@ void device_buffer::resize(std::size_t new_size, cuda_stream_view stream)
_size = new_size;
} else {
cuda_set_device_raii dev{_device};
auto tmp = device_buffer{new_size, stream, _mr};
RMM_CUDA_TRY(cudaMemcpyAsync(tmp.data(), data(), size(), cudaMemcpyDefault, stream.value()));

device_buffer tmp;
if (_host_mr.has_value()) {
memory_resource_args args{_mr, _host_mr};
tmp = device_buffer{new_size, stream, args};
} else {
tmp = device_buffer{new_size, stream, _mr};
}

RMM_CUDA_TRY(cudaMemcpyAsync(
tmp.data(), data(), size(), cudaMemcpyDefault, stream.value())); // device-to-device copy
*this = std::move(tmp);
}
}
Expand All @@ -155,8 +240,14 @@ void device_buffer::shrink_to_fit(cuda_stream_view stream)
// Invoke copy ctor on self which only copies `[0, size())` and swap it
// with self. The temporary `device_buffer` will hold the old contents
// which will then be destroyed
auto tmp = device_buffer{*this, stream, _mr};
std::swap(tmp, *this);
device_buffer tmp;
if (_host_mr.has_value()) {
memory_resource_args args{_mr, _host_mr};
tmp = device_buffer{*this, stream, args};
} else {
tmp = device_buffer{*this, stream, _mr};
}
*this = std::move(tmp);
}
}

Expand Down