Skip to content

Commit 6a7134c

Browse files
authored
Replace rmm::device_scalar with cudf::detail::device_scalar due to unnecessary synchronization (Part 3 of miss-sync) (#19119)
For issue #18967, this PR is one part of merging the PR Draft #18968. In this PR, almost all `rmm::device_scalar` calls in libcudf are replaced with `cudf::detail::device_scalar` due to its internal host-pinned bounce buffer. This is also a call to action to use host-pinned memory globally in libcudf, with arguments stated in #18967 and #18968. Authors: - Jigao Luo (https://github.com/JigaoLuo) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) URL: #19119
1 parent 23b59a9 commit 6a7134c

File tree

7 files changed

+31
-17
lines changed

7 files changed

+31
-17
lines changed

cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -655,6 +655,21 @@ kernel<<<...>>>(int_scalar.data(),...);
655655
int host_value = int_scalar.value();
656656
```
657657
658+
##### cudf::detail::device_scalar<T>
659+
Acts as a drop-in replacement for `rmm::device_scalar<T>`, with the key difference
660+
being the use of pinned host memory as a bounce buffer for data transfers.
661+
It is recommended for internal use to avoid the implicit synchronization overhead caused by
662+
memcpy operations on pageable host memory.
663+
664+
```c++
665+
// Same as the case with rmm::device_scalar<T> above
666+
cudf::detail::device_scalar<int> int_scalar{42, stream, mr};
667+
kernel<<<...>>>(int_scalar.data(),...);
668+
669+
// Note: This device-to-host transfer uses host-pinned bounce buffer for efficient memcpy
670+
int host_value = int_scalar.value();
671+
```
672+
658673
#### rmm::device_vector<T>
659674

660675
Allocates a specified number of elements of the specified type. If no initialization value is

cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,12 @@
1717
#pragma once
1818

1919
#include <cudf/column/column_factories.hpp>
20+
#include <cudf/detail/device_scalar.hpp>
2021
#include <cudf/detail/iterator.cuh>
2122
#include <cudf/types.hpp>
2223
#include <cudf/utilities/memory_resource.hpp>
2324

2425
#include <rmm/cuda_stream_view.hpp>
25-
#include <rmm/device_scalar.hpp>
2626
#include <rmm/exec_policy.hpp>
2727

2828
#include <cuda/functional>
@@ -203,7 +203,7 @@ struct sizes_to_offsets_iterator {
203203
* auto begin = // begin input iterator
204204
* auto end = // end input iterator
205205
* auto result = rmm::device_uvector(std::distance(begin,end), stream);
206-
* auto last = rmm::device_scalar<int64_t>(0, stream);
206+
* auto last = cudf::detail::device_scalar<int64_t>(0, stream);
207207
* auto itr = make_sizes_to_offsets_iterator(result.begin(),
208208
* result.end(),
209209
* last.data());
@@ -270,7 +270,7 @@ auto sizes_to_offsets(SizesIterator begin,
270270
"Only numeric types are supported by sizes_to_offsets");
271271

272272
using LastType = std::conditional_t<std::is_signed_v<SizeType>, int64_t, uint64_t>;
273-
auto last_element = rmm::device_scalar<LastType>(0, stream);
273+
auto last_element = cudf::detail::device_scalar<LastType>(0, stream);
274274
auto output_itr =
275275
make_sizes_to_offsets_iterator(result, result + std::distance(begin, end), last_element.data());
276276
// This function uses the type of the initialization parameter as the accumulator type

cpp/include/cudf/reduction/detail/reduction.cuh

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,13 @@
1919
#include "reduction_operators.cuh"
2020

2121
#include <cudf/column/column_factories.hpp>
22+
#include <cudf/detail/device_scalar.hpp>
2223
#include <cudf/detail/utilities/cast_functor.cuh>
2324
#include <cudf/utilities/memory_resource.hpp>
2425
#include <cudf/utilities/type_dispatcher.hpp>
2526

2627
#include <rmm/cuda_stream_view.hpp>
2728
#include <rmm/device_buffer.hpp>
28-
#include <rmm/device_scalar.hpp>
2929
#include <rmm/exec_policy.hpp>
3030

3131
#include <cub/device/device_reduce.cuh>
@@ -123,7 +123,7 @@ std::unique_ptr<scalar> reduce(InputIterator d_in,
123123
{
124124
auto const binary_op = cudf::detail::cast_functor<OutputType>(op.get_binary_op());
125125
auto const initial_value = init.value_or(op.template get_identity<OutputType>());
126-
auto dev_result = rmm::device_scalar<OutputType>{initial_value, stream};
126+
auto dev_result = cudf::detail::device_scalar<OutputType>{initial_value, stream};
127127

128128
// Allocate temporary storage
129129
rmm::device_buffer d_temp_storage;
@@ -167,7 +167,6 @@ std::unique_ptr<scalar> reduce(InputIterator d_in,
167167
* @param op the reduction operator
168168
* @param valid_count Number of valid items
169169
* @param ddof Delta degrees of freedom used for standard deviation and variance
170-
* @param init Optional initial value of the reduction
171170
* @param stream CUDA stream used for device memory operations and kernel launches
172171
* @param mr Device memory resource used to allocate the returned scalar's device memory
173172
* @returns Output scalar in device memory
@@ -187,7 +186,7 @@ std::unique_ptr<scalar> reduce(InputIterator d_in,
187186
auto const binary_op = cudf::detail::cast_functor<IntermediateType>(op.get_binary_op());
188187
auto const initial_value = op.template get_identity<IntermediateType>();
189188

190-
rmm::device_scalar<IntermediateType> intermediate_result{initial_value, stream};
189+
cudf::detail::device_scalar<IntermediateType> intermediate_result{initial_value, stream};
191190

192191
// Allocate temporary storage
193192
rmm::device_buffer d_temp_storage;

cpp/include/cudf_test/nanoarrow_utils.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@ std::enable_if_t<std::is_same_v<T, cudf::string_view>, void> populate_from_col(
160160
ArrowArrayBuffer(arr, 2)->size_bytes = sview.chars_size(cudf::get_default_stream());
161161
ArrowArrayBuffer(arr, 2)->data = const_cast<uint8_t*>(view.data<uint8_t>());
162162
} else {
163-
auto zero = rmm::device_scalar<int32_t>(0, cudf::get_default_stream());
163+
auto zero = cudf::detail::device_scalar<int32_t>(0, cudf::get_default_stream());
164164
uint8_t const* ptr = reinterpret_cast<uint8_t*>(zero.data());
165165
nanoarrow::BufferInitWrapped(ArrowArrayBuffer(arr, 1), std::move(zero), ptr, 4);
166166
}

cpp/src/join/sort_merge_join.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -174,8 +174,8 @@ merge<LargerIterator, SmallerIterator>::matches_per_row(rmm::cuda_stream_view st
174174

175175
// naive: iterate through larger table and binary search on smaller table
176176
auto const larger_numrows = larger.num_rows();
177-
rmm::device_scalar<bound_type> d_lb_type(bound_type::LOWER, stream, temp_mr);
178-
rmm::device_scalar<bound_type> d_ub_type(bound_type::UPPER, stream, temp_mr);
177+
cudf::detail::device_scalar<bound_type> d_lb_type(bound_type::LOWER, stream, temp_mr);
178+
cudf::detail::device_scalar<bound_type> d_ub_type(bound_type::UPPER, stream, temp_mr);
179179

180180
auto match_counts =
181181
cudf::detail::make_zeroed_device_uvector_async<size_type>(larger_numrows + 1, stream, temp_mr);

cpp/tests/iterator/sizes_to_offsets_iterator_test.cu

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
2+
* Copyright (c) 2022-2025, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -17,10 +17,10 @@
1717
#include <cudf_test/column_wrapper.hpp>
1818
#include <cudf_test/type_lists.hpp>
1919

20+
#include <cudf/detail/device_scalar.hpp>
2021
#include <cudf/detail/sizes_to_offsets_iterator.cuh>
2122
#include <cudf/utilities/default_stream.hpp>
2223

23-
#include <rmm/device_scalar.hpp>
2424
#include <rmm/device_uvector.hpp>
2525
#include <rmm/exec_policy.hpp>
2626

@@ -48,7 +48,7 @@ TYPED_TEST(SizesToOffsetsIteratorTestTyped, ExclusiveScan)
4848
auto d_col = cudf::test::fixed_width_column_wrapper<T>(sizes.begin(), sizes.end());
4949
auto d_view = cudf::column_view(d_col);
5050

51-
auto last = rmm::device_scalar<LastType>(0, stream);
51+
auto last = cudf::detail::device_scalar<LastType>(0, stream);
5252
auto result = rmm::device_uvector<T>(d_view.size(), stream);
5353
auto output_itr =
5454
cudf::detail::make_sizes_to_offsets_iterator(result.begin(), result.end(), last.data());
@@ -80,7 +80,7 @@ TEST_F(SizesToOffsetsIteratorTest, ScanWithOverflow)
8080
auto d_col = cudf::test::fixed_width_column_wrapper<int32_t>(values.begin(), values.end());
8181
auto d_view = cudf::column_view(d_col);
8282

83-
auto last = rmm::device_scalar<int64_t>(0, stream);
83+
auto last = cudf::detail::device_scalar<int64_t>(0, stream);
8484
auto result = rmm::device_uvector<int32_t>(d_view.size(), stream);
8585
auto output_itr =
8686
cudf::detail::make_sizes_to_offsets_iterator(result.begin(), result.end(), last.data());

cpp/tests/scalar/scalar_device_view_test.cu

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ TYPED_TEST(TypedScalarDeviceViewTest, Value)
5959

6060
auto scalar_device_view = cudf::get_scalar_device_view(s);
6161
auto scalar_device_view1 = cudf::get_scalar_device_view(s1);
62-
rmm::device_scalar<bool> result{cudf::get_default_stream()};
62+
cudf::detail::device_scalar<bool> result{cudf::get_default_stream()};
6363

6464
test_set_value<<<1, 1, 0, cudf::get_default_stream().value()>>>(scalar_device_view,
6565
scalar_device_view1);
@@ -86,7 +86,7 @@ TYPED_TEST(TypedScalarDeviceViewTest, ConstructNull)
8686
TypeParam value = cudf::test::make_type_param_scalar<TypeParam>(5);
8787
cudf::scalar_type_t<TypeParam> s(value, false);
8888
auto scalar_device_view = cudf::get_scalar_device_view(s);
89-
rmm::device_scalar<bool> result{cudf::get_default_stream()};
89+
cudf::detail::device_scalar<bool> result{cudf::get_default_stream()};
9090

9191
test_null<<<1, 1, 0, cudf::get_default_stream().value()>>>(scalar_device_view, result.data());
9292
CUDF_CHECK_CUDA(0);
@@ -130,7 +130,7 @@ TEST_F(StringScalarDeviceViewTest, Value)
130130
cudf::string_scalar s(value);
131131

132132
auto scalar_device_view = cudf::get_scalar_device_view(s);
133-
rmm::device_scalar<bool> result{cudf::get_default_stream()};
133+
cudf::detail::device_scalar<bool> result{cudf::get_default_stream()};
134134
auto value_v = cudf::detail::make_device_uvector(
135135
value, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
136136

0 commit comments

Comments
 (0)