-
Notifications
You must be signed in to change notification settings - Fork 231
[ 🚧 Draft] : Adding host-mr for pinned bounce buffer to rmm::device_buffer
#1996
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: branch-25.08
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,6 +19,8 @@ | |
|
|
||
| #include <cuda_runtime_api.h> | ||
|
|
||
| #include <cstring> | ||
|
|
||
| namespace rmm { | ||
|
|
||
| device_buffer::device_buffer() : _mr{rmm::mr::get_current_device_resource_ref()} {} | ||
|
|
@@ -32,6 +34,15 @@ device_buffer::device_buffer(std::size_t size, | |
| allocate_async(size); | ||
| } | ||
|
|
||
| device_buffer::device_buffer(std::size_t size, | ||
| cuda_stream_view stream, | ||
| memory_resource_args const& mr_args) | ||
| : _stream{stream}, _mr{mr_args.device_mr}, _host_mr{mr_args.bounce_buffer_host_mr} | ||
| { | ||
| cuda_set_device_raii dev{_device}; | ||
| allocate_async(size); | ||
| } | ||
|
|
||
| device_buffer::device_buffer(void const* source_data, | ||
| std::size_t size, | ||
| cuda_stream_view stream, | ||
|
|
@@ -43,26 +54,48 @@ device_buffer::device_buffer(void const* source_data, | |
| copy_async(source_data, size); | ||
| } | ||
|
|
||
| device_buffer::device_buffer(void const* source_data, | ||
| std::size_t size, | ||
| cuda_stream_view stream, | ||
| memory_resource_args const& mr_args) | ||
| : _stream{stream}, _mr{mr_args.device_mr}, _host_mr{mr_args.bounce_buffer_host_mr} | ||
| { | ||
| cuda_set_device_raii dev{_device}; | ||
| allocate_async(size); | ||
| copy_async(source_data, size); | ||
| } | ||
|
|
||
| device_buffer::device_buffer(device_buffer const& other, | ||
| cuda_stream_view stream, | ||
| device_async_resource_ref mr) | ||
| : device_buffer{other.data(), other.size(), stream, mr} | ||
| { | ||
| } | ||
|
|
||
| device_buffer::device_buffer(device_buffer const& other, | ||
| cuda_stream_view stream, | ||
| memory_resource_args const& mr_args) | ||
| : device_buffer{other.data(), other.size(), stream, mr_args} | ||
| { | ||
| } | ||
|
|
||
| device_buffer::device_buffer(device_buffer&& other) noexcept | ||
| : _data{other._data}, | ||
| _size{other._size}, | ||
| _capacity{other._capacity}, | ||
| _stream{other.stream()}, | ||
| _mr{other._mr}, | ||
| _device{other._device} | ||
| _device{other._device}, | ||
| _host_mr{other._host_mr}, | ||
| _host_bounce_buffer{other._host_bounce_buffer} | ||
| { | ||
| other._data = nullptr; | ||
| other._size = 0; | ||
| other._capacity = 0; | ||
| other.set_stream(cuda_stream_view{}); | ||
| other._device = cuda_device_id{-1}; | ||
| other._device = cuda_device_id{-1}; | ||
| other._host_mr = std::nullopt; | ||
| other._host_bounce_buffer = std::nullopt; | ||
| } | ||
|
|
||
| device_buffer& device_buffer::operator=(device_buffer&& other) noexcept | ||
|
|
@@ -75,14 +108,18 @@ device_buffer& device_buffer::operator=(device_buffer&& other) noexcept | |
| _size = other._size; | ||
| _capacity = other._capacity; | ||
| set_stream(other.stream()); | ||
| _mr = other._mr; | ||
| _device = other._device; | ||
| _mr = other._mr; | ||
| _device = other._device; | ||
| _host_mr = other._host_mr; | ||
| _host_bounce_buffer = other._host_bounce_buffer; | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To Line 114: Same reasoning as my earlier point about statelessness. |
||
|
|
||
| other._data = nullptr; | ||
| other._size = 0; | ||
| other._capacity = 0; | ||
| other.set_stream(cuda_stream_view{}); | ||
| other._device = cuda_device_id{-1}; | ||
| other._device = cuda_device_id{-1}; | ||
| other._host_mr = std::nullopt; | ||
| other._host_bounce_buffer = std::nullopt; | ||
| } | ||
| return *this; | ||
| } | ||
|
|
@@ -96,14 +133,30 @@ device_buffer::~device_buffer() noexcept | |
|
|
||
| void device_buffer::allocate_async(std::size_t bytes) | ||
| { | ||
| _size = bytes; | ||
| _capacity = bytes; | ||
| _data = (bytes > 0) ? _mr.allocate_async(bytes, stream()) : nullptr; | ||
| auto const old_capacity = _capacity; | ||
| _size = bytes; | ||
| _capacity = bytes; | ||
| _data = (bytes > 0) ? _mr.allocate_async(bytes, stream()) : nullptr; | ||
|
|
||
| // Resize host bounce buffer if needed | ||
| if (_host_mr.has_value() && bytes > 0) { | ||
| if (_host_bounce_buffer.has_value()) { | ||
| _host_mr->deallocate(_host_bounce_buffer.value(), old_capacity); | ||
| } | ||
| _host_bounce_buffer = _host_mr->allocate(bytes); | ||
| } | ||
| } | ||
|
|
||
| void device_buffer::deallocate_async() noexcept | ||
| { | ||
| if (capacity() > 0) { _mr.deallocate_async(data(), capacity(), stream()); } | ||
|
|
||
| // Deallocate host bounce buffer if it exists | ||
| if (_host_bounce_buffer.has_value() && _host_mr.has_value()) { | ||
| _host_mr->deallocate(_host_bounce_buffer.value(), capacity()); | ||
| _host_bounce_buffer = std::nullopt; | ||
| } | ||
|
|
||
| _size = 0; | ||
| _capacity = 0; | ||
| _data = nullptr; | ||
|
|
@@ -115,6 +168,20 @@ void device_buffer::copy_async(void const* source, std::size_t bytes) | |
| RMM_EXPECTS(nullptr != source, "Invalid copy from nullptr."); | ||
| RMM_EXPECTS(nullptr != _data, "Invalid copy to nullptr."); | ||
|
|
||
| if (_host_bounce_buffer.has_value() && _host_mr.has_value()) { | ||
| // If source is host memory, use bounce buffer for optimized transfer | ||
| cudaPointerAttributes attributes; | ||
| cudaError_t result = cudaPointerGetAttributes(&attributes, source); | ||
| if (result == cudaSuccess && attributes.type == cudaMemoryTypeHost) { | ||
| RMM_CUDA_TRY(cudaMemcpyAsync( | ||
| _host_bounce_buffer.value(), source, bytes, cudaMemcpyHostToHost, stream().value())); | ||
| RMM_CUDA_TRY(cudaMemcpyAsync( | ||
| _data, _host_bounce_buffer.value(), bytes, cudaMemcpyHostToDevice, stream().value())); | ||
| return; | ||
|
Comment on lines
+175
to
+180
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If the user's source pointer is in page-locked host memory this is a pessimisation, I think. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this branch is only executes when user passes a host mr, with the expectation that this would be some kind of pinned memory allocator. When the host mr is not set (the default case), copy is done the same way it was before this PR. |
||
| } | ||
| } | ||
|
|
||
| // Fallback to direct copy (device-to-device or host-to-device without bounce buffer) | ||
| RMM_CUDA_TRY(cudaMemcpyAsync(_data, source, bytes, cudaMemcpyDefault, stream().value())); | ||
| } | ||
| } | ||
|
|
@@ -124,9 +191,18 @@ void device_buffer::reserve(std::size_t new_capacity, cuda_stream_view stream) | |
| set_stream(stream); | ||
| if (new_capacity > capacity()) { | ||
| cuda_set_device_raii dev{_device}; | ||
| auto tmp = device_buffer{new_capacity, stream, _mr}; | ||
|
|
||
| device_buffer tmp; | ||
| if (_host_mr.has_value()) { | ||
| memory_resource_args args{_mr, _host_mr}; | ||
| tmp = device_buffer{new_capacity, stream, args}; | ||
| } else { | ||
| tmp = device_buffer{new_capacity, stream, _mr}; | ||
| } | ||
|
|
||
| auto const old_size = size(); | ||
| RMM_CUDA_TRY(cudaMemcpyAsync(tmp.data(), data(), size(), cudaMemcpyDefault, stream.value())); | ||
| RMM_CUDA_TRY(cudaMemcpyAsync( | ||
| tmp.data(), data(), size(), cudaMemcpyDefault, stream.value())); // device-to-device copy | ||
| *this = std::move(tmp); | ||
| _size = old_size; | ||
| } | ||
|
|
@@ -141,8 +217,17 @@ void device_buffer::resize(std::size_t new_size, cuda_stream_view stream) | |
| _size = new_size; | ||
| } else { | ||
| cuda_set_device_raii dev{_device}; | ||
| auto tmp = device_buffer{new_size, stream, _mr}; | ||
| RMM_CUDA_TRY(cudaMemcpyAsync(tmp.data(), data(), size(), cudaMemcpyDefault, stream.value())); | ||
|
|
||
| device_buffer tmp; | ||
| if (_host_mr.has_value()) { | ||
| memory_resource_args args{_mr, _host_mr}; | ||
| tmp = device_buffer{new_size, stream, args}; | ||
| } else { | ||
| tmp = device_buffer{new_size, stream, _mr}; | ||
| } | ||
|
|
||
| RMM_CUDA_TRY(cudaMemcpyAsync( | ||
| tmp.data(), data(), size(), cudaMemcpyDefault, stream.value())); // device-to-device copy | ||
| *this = std::move(tmp); | ||
| } | ||
| } | ||
|
|
@@ -155,8 +240,14 @@ void device_buffer::shrink_to_fit(cuda_stream_view stream) | |
| // Invoke copy ctor on self which only copies `[0, size())` and swap it | ||
| // with self. The temporary `device_buffer` will hold the old contents | ||
| // which will then be destroyed | ||
| auto tmp = device_buffer{*this, stream, _mr}; | ||
| std::swap(tmp, *this); | ||
| device_buffer tmp; | ||
| if (_host_mr.has_value()) { | ||
| memory_resource_args args{_mr, _host_mr}; | ||
| tmp = device_buffer{*this, stream, args}; | ||
| } else { | ||
| tmp = device_buffer{*this, stream, _mr}; | ||
| } | ||
| *this = std::move(tmp); | ||
| } | ||
| } | ||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
To Line 90: I’ll also start with a simple discussion point: I believe the bounce buffer is stateless—it doesn’t store anything and serves only as a temporary transfer buffer. If you agree, I’d prefer to leave it uncopied or moved, to reflect its transient nature.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It doesn't store anything, but it is an allocation that's a part of the object's state.
When we copy an object, I assume we want the new one to have the equivalent bounce buffer. So, we don't need to copy, but still need to allocate a new one.
When we move, reusing the old allocation should be the cheapest option, even though we don't care about the content.