rapidsai · JigaoLuo · Jul 23, 2025 · Jul 23, 2025 · JigaoLuo · Jul 23, 2025
@@ -26,6 +26,7 @@
 
 #include <cassert>
 #include <cstddef>
+#include <optional>
 
 namespace RMM_NAMESPACE {
 /**
@@ -80,6 +81,17 @@ namespace RMM_NAMESPACE {
  */
 class device_buffer {
  public:
+  /**
+   * @brief A struct to configure memory resources for a `device_buffer`.
+   *
+   * This struct allows for specifying a device memory resource for the buffer's storage and
+   * an optional host memory resource for a bounce buffer to optimize host-device transfers.
+   */
+  struct memory_resource_args {
+    device_async_resource_ref device_mr{mr::get_current_device_resource_ref()};
+    std::optional<host_resource_ref> bounce_buffer_host_mr{std::nullopt};
+  };
+
   // The copy constructor and copy assignment operator without a stream are deleted because they
   // provide no way to specify an explicit stream
   device_buffer(device_buffer const& other)            = delete;
@@ -107,6 +119,21 @@ class device_buffer {
                          cuda_stream_view stream,
                          device_async_resource_ref mr = mr::get_current_device_resource_ref());
 
+  /**
+   * @brief Constructs a new device buffer of `size` uninitialized bytes
+   *        with optional host bounce buffer
+   *
+   * @throws rmm::bad_alloc If allocation fails.
+   *
+   * @param size Size in bytes to allocate in device memory.
+   * @param stream CUDA stream on which memory may be allocated if the memory
+   * resource supports streams.
+   * @param mr_args Arguments to configure memory resources for a `device_buffer`.
+   */
+  explicit device_buffer(std::size_t size,
+                         cuda_stream_view stream,
+                         memory_resource_args const& mr_args);
+
   /**
    * @brief Construct a new device buffer by copying from a raw pointer to an existing host or
    * device memory allocation.
@@ -131,6 +158,30 @@ class device_buffer {
                 cuda_stream_view stream,
                 device_async_resource_ref mr = mr::get_current_device_resource_ref());
 
+  /**
+   * @brief Construct a new device buffer by copying from a raw pointer to an existing host or
+   * device memory allocation with optional host bounce buffer.
+   *
+   * @note This function does not synchronize `stream`. `source_data` is copied on `stream`, so the
+   * caller is responsible for correct synchronization to ensure that `source_data` is valid when
+   * the copy occurs. This includes destroying `source_data` in stream order after this function is
+   * called, or synchronizing or waiting on `stream` after this function returns as necessary.
+   *
+   * @throws rmm::bad_alloc If creating the new allocation fails.
+   * @throws rmm::logic_error If `source_data` is null, and `size != 0`.
+   * @throws rmm::cuda_error if copying from the device memory fails.
+   *
+   * @param source_data Pointer to the host or device memory to copy from.
+   * @param size Size in bytes to copy.
+   * @param stream CUDA stream on which memory may be allocated if the memory
+   * resource supports streams.
+   * @param mr_args Arguments to configure memory resources for a `device_buffer`.
+   */
+  device_buffer(void const* source_data,
+                std::size_t size,
+                cuda_stream_view stream,
+                memory_resource_args const& mr_args);
+
   /**
    * @brief Construct a new `device_buffer` by deep copying the contents of
    * another `device_buffer`, optionally using the specified stream and memory
@@ -156,6 +207,30 @@ class device_buffer {
                 cuda_stream_view stream,
                 device_async_resource_ref mr = mr::get_current_device_resource_ref());
 
+  /**
+   * @brief Construct a new `device_buffer` by deep copying the contents of
+   * another `device_buffer` with optional host bounce buffer.
+   *
+   * @note Only copies `other.size()` bytes from `other`, i.e., if
+   *`other.size() != other.capacity()`, then the size and capacity of the newly
+   * constructed `device_buffer` will be equal to `other.size()`.
+   *
+   * @note This function does not synchronize `stream`. `other` is copied on `stream`, so the
+   * caller is responsible for correct synchronization to ensure that `other` is valid when
+   * the copy occurs. This includes destroying `other` in stream order after this function is
+   * called, or synchronizing or waiting on `stream` after this function returns as necessary.
+   *
+   * @throws rmm::bad_alloc If creating the new allocation fails.
+   * @throws rmm::cuda_error if copying from `other` fails.
+   *
+   * @param other The `device_buffer` whose contents will be copied
+   * @param stream The stream to use for the allocation and copy
+   * @param mr_args Arguments to configure memory resources for a `device_buffer`.
+   */
+  device_buffer(device_buffer const& other,
+                cuda_stream_view stream,
+                memory_resource_args const& mr_args);
+
   /**
    * @brief Constructs a new `device_buffer` by moving the contents of another
    * `device_buffer` into the newly constructed one.
@@ -332,6 +407,11 @@ class device_buffer {
                                                   ///< allocate/deallocate device memory
   cuda_device_id _device{get_current_cuda_device()};
 
+  std::optional<host_resource_ref> _host_mr{
+    std::nullopt};  ///< Optional host memory resource for bounce buffers
+  std::optional<void*> _host_bounce_buffer{
+    std::nullopt};  ///< Optional bounce buffer for host-device transfers
+
   /**
    * @brief Allocates the specified amount of memory and updates the size/capacity accordingly.
    *

diff --git a/cpp/src/device_buffer.cpp b/cpp/src/device_buffer.cpp
@@ -19,6 +19,8 @@
 
 #include <cuda_runtime_api.h>
 
+#include <cstring>
+
 namespace rmm {
 
 device_buffer::device_buffer() : _mr{rmm::mr::get_current_device_resource_ref()} {}
@@ -32,6 +34,15 @@ device_buffer::device_buffer(std::size_t size,
   allocate_async(size);
 }
 
+device_buffer::device_buffer(std::size_t size,
+                             cuda_stream_view stream,
+                             memory_resource_args const& mr_args)
+  : _stream{stream}, _mr{mr_args.device_mr}, _host_mr{mr_args.bounce_buffer_host_mr}
+{
+  cuda_set_device_raii dev{_device};
+  allocate_async(size);
+}
+
 device_buffer::device_buffer(void const* source_data,
                              std::size_t size,
                              cuda_stream_view stream,
@@ -43,26 +54,48 @@ device_buffer::device_buffer(void const* source_data,
   copy_async(source_data, size);
 }
 
+device_buffer::device_buffer(void const* source_data,
+                             std::size_t size,
+                             cuda_stream_view stream,
+                             memory_resource_args const& mr_args)
+  : _stream{stream}, _mr{mr_args.device_mr}, _host_mr{mr_args.bounce_buffer_host_mr}
+{
+  cuda_set_device_raii dev{_device};
+  allocate_async(size);
+  copy_async(source_data, size);
+}
+
 device_buffer::device_buffer(device_buffer const& other,
                              cuda_stream_view stream,
                              device_async_resource_ref mr)
   : device_buffer{other.data(), other.size(), stream, mr}
 {
 }
 
+device_buffer::device_buffer(device_buffer const& other,
+                             cuda_stream_view stream,
+                             memory_resource_args const& mr_args)
+  : device_buffer{other.data(), other.size(), stream, mr_args}
+{
+}
+
 device_buffer::device_buffer(device_buffer&& other) noexcept
   : _data{other._data},
     _size{other._size},
     _capacity{other._capacity},
     _stream{other.stream()},
     _mr{other._mr},
-    _device{other._device}
+    _device{other._device},
+    _host_mr{other._host_mr},
+    _host_bounce_buffer{other._host_bounce_buffer}
 {
   other._data     = nullptr;
   other._size     = 0;
   other._capacity = 0;
   other.set_stream(cuda_stream_view{});
-  other._device = cuda_device_id{-1};
+  other._device             = cuda_device_id{-1};
+  other._host_mr            = std::nullopt;
+  other._host_bounce_buffer = std::nullopt;
 }
 
 device_buffer& device_buffer::operator=(device_buffer&& other) noexcept
@@ -75,14 +108,18 @@ device_buffer& device_buffer::operator=(device_buffer&& other) noexcept
     _size     = other._size;
     _capacity = other._capacity;
     set_stream(other.stream());
-    _mr     = other._mr;
-    _device = other._device;
+    _mr                 = other._mr;
+    _device             = other._device;
+    _host_mr            = other._host_mr;
+    _host_bounce_buffer = other._host_bounce_buffer;
 
     other._data     = nullptr;
     other._size     = 0;
     other._capacity = 0;
     other.set_stream(cuda_stream_view{});
-    other._device = cuda_device_id{-1};
+    other._device             = cuda_device_id{-1};
+    other._host_mr            = std::nullopt;
+    other._host_bounce_buffer = std::nullopt;
   }
   return *this;
 }
@@ -96,14 +133,30 @@ device_buffer::~device_buffer() noexcept
 
 void device_buffer::allocate_async(std::size_t bytes)
 {
-  _size     = bytes;
-  _capacity = bytes;
-  _data     = (bytes > 0) ? _mr.allocate_async(bytes, stream()) : nullptr;
+  auto const old_capacity = _capacity;
+  _size                   = bytes;
+  _capacity               = bytes;
+  _data                   = (bytes > 0) ? _mr.allocate_async(bytes, stream()) : nullptr;
+
+  // Resize host bounce buffer if needed
+  if (_host_mr.has_value() && bytes > 0) {
+    if (_host_bounce_buffer.has_value()) {
+      _host_mr->deallocate(_host_bounce_buffer.value(), old_capacity);
+    }
+    _host_bounce_buffer = _host_mr->allocate(bytes);
+  }
 }
 
 void device_buffer::deallocate_async() noexcept
 {
   if (capacity() > 0) { _mr.deallocate_async(data(), capacity(), stream()); }
+
+  // Deallocate host bounce buffer if it exists
+  if (_host_bounce_buffer.has_value() && _host_mr.has_value()) {
+    _host_mr->deallocate(_host_bounce_buffer.value(), capacity());
+    _host_bounce_buffer = std::nullopt;
+  }
+
   _size     = 0;
   _capacity = 0;
   _data     = nullptr;
@@ -115,6 +168,20 @@ void device_buffer::copy_async(void const* source, std::size_t bytes)
     RMM_EXPECTS(nullptr != source, "Invalid copy from nullptr.");
     RMM_EXPECTS(nullptr != _data, "Invalid copy to nullptr.");
 
+    if (_host_bounce_buffer.has_value() && _host_mr.has_value()) {
+      // If source is host memory, use bounce buffer for optimized transfer
+      cudaPointerAttributes attributes;
+      cudaError_t result = cudaPointerGetAttributes(&attributes, source);
+      if (result == cudaSuccess && attributes.type == cudaMemoryTypeHost) {
+        RMM_CUDA_TRY(cudaMemcpyAsync(
+          _host_bounce_buffer.value(), source, bytes, cudaMemcpyHostToHost, stream().value()));
+        RMM_CUDA_TRY(cudaMemcpyAsync(
+          _data, _host_bounce_buffer.value(), bytes, cudaMemcpyHostToDevice, stream().value()));
+        return;
+      }
+    }
+
+    // Fallback to direct copy (device-to-device or host-to-device without bounce buffer)
     RMM_CUDA_TRY(cudaMemcpyAsync(_data, source, bytes, cudaMemcpyDefault, stream().value()));
   }
 }
@@ -124,9 +191,18 @@ void device_buffer::reserve(std::size_t new_capacity, cuda_stream_view stream)
   set_stream(stream);
   if (new_capacity > capacity()) {
     cuda_set_device_raii dev{_device};
-    auto tmp            = device_buffer{new_capacity, stream, _mr};
+
+    device_buffer tmp;
+    if (_host_mr.has_value()) {
+      memory_resource_args args{_mr, _host_mr};
+      tmp = device_buffer{new_capacity, stream, args};
+    } else {
+      tmp = device_buffer{new_capacity, stream, _mr};
+    }
+
     auto const old_size = size();
-    RMM_CUDA_TRY(cudaMemcpyAsync(tmp.data(), data(), size(), cudaMemcpyDefault, stream.value()));
+    RMM_CUDA_TRY(cudaMemcpyAsync(
+      tmp.data(), data(), size(), cudaMemcpyDefault, stream.value()));  // device-to-device copy
     *this = std::move(tmp);
     _size = old_size;
   }
@@ -141,8 +217,17 @@ void device_buffer::resize(std::size_t new_size, cuda_stream_view stream)
     _size = new_size;
   } else {
     cuda_set_device_raii dev{_device};
-    auto tmp = device_buffer{new_size, stream, _mr};
-    RMM_CUDA_TRY(cudaMemcpyAsync(tmp.data(), data(), size(), cudaMemcpyDefault, stream.value()));
+
+    device_buffer tmp;
+    if (_host_mr.has_value()) {
+      memory_resource_args args{_mr, _host_mr};
+      tmp = device_buffer{new_size, stream, args};
+    } else {
+      tmp = device_buffer{new_size, stream, _mr};
+    }
+
+    RMM_CUDA_TRY(cudaMemcpyAsync(
+      tmp.data(), data(), size(), cudaMemcpyDefault, stream.value()));  // device-to-device copy
     *this = std::move(tmp);
   }
 }
@@ -155,8 +240,14 @@ void device_buffer::shrink_to_fit(cuda_stream_view stream)
     // Invoke copy ctor on self which only copies `[0, size())` and swap it
     // with self. The temporary `device_buffer` will hold the old contents
     // which will then be destroyed
-    auto tmp = device_buffer{*this, stream, _mr};
-    std::swap(tmp, *this);
+    device_buffer tmp;
+    if (_host_mr.has_value()) {
+      memory_resource_args args{_mr, _host_mr};
+      tmp = device_buffer{*this, stream, args};
+    } else {
+      tmp = device_buffer{*this, stream, _mr};
+    }
+    *this = std::move(tmp);
   }
 }