Replace rmm::device_scalar with cudf::detail::device_scalar due to unnecessary synchronization (Part 3 of miss-sync) (#19119)

JigaoLuo · web-flow · commit 6a7134c9a261 · 2025-08-12T22:43:11.000Z
For issue #18967, this PR is one part of merging the draft PR #18968. In this PR, almost all uses of `rmm::device_scalar` in libcudf are replaced with `cudf::detail::device_scalar`, which uses an internal host-pinned bounce buffer for data transfers. This is also a call to action to use host-pinned memory globally in libcudf; the arguments are stated in #18967 and #18968. Authors: - Jigao Luo (https://github.com/JigaoLuo) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) URL: #19119
diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
@@ -655,6 +655,21 @@ kernel<<<...>>>(int_scalar.data(),...);
 int host_value = int_scalar.value();
 ```
 
+##### cudf::detail::device_scalar<T>
+Acts as a drop-in replacement for `rmm::device_scalar<T>`, with the key difference
+being the use of pinned host memory as a bounce buffer for data transfers.
+It is recommended for internal use to avoid the implicit synchronization overhead caused by
+memcpy operations on pageable host memory.
+
+```c++
+// Same as the case with rmm::device_scalar<T> above
+cudf::detail::device_scalar<int> int_scalar{42, stream, mr};
+kernel<<<...>>>(int_scalar.data(),...);
+
+// Note: This device-to-host transfer uses a host-pinned bounce buffer for efficient memcpy
+int host_value = int_scalar.value();
+```
+
 #### rmm::device_vector<T>
 
 Allocates a specified number of elements of the specified type. If no initialization value is
diff --git a/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh b/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh
@@ -17,12 +17,12 @@
 #pragma once
 
 #include <cudf/column/column_factories.hpp>
+#include <cudf/detail/device_scalar.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/types.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_scalar.hpp>
 #include <rmm/exec_policy.hpp>
 
 #include <cuda/functional>
@@ -203,7 +203,7 @@ struct sizes_to_offsets_iterator {
  *  auto begin = // begin input iterator
  *  auto end = // end input iterator
  *  auto result = rmm::device_uvector(std::distance(begin,end), stream);
- *  auto last = rmm::device_scalar<int64_t>(0, stream);
+ *  auto last = cudf::detail::device_scalar<int64_t>(0, stream);
  *  auto itr = make_sizes_to_offsets_iterator(result.begin(),
  *                                            result.end(),
  *                                            last.data());
@@ -270,7 +270,7 @@ auto sizes_to_offsets(SizesIterator begin,
                 "Only numeric types are supported by sizes_to_offsets");
 
   using LastType    = std::conditional_t<std::is_signed_v<SizeType>, int64_t, uint64_t>;
-  auto last_element = rmm::device_scalar<LastType>(0, stream);
+  auto last_element = cudf::detail::device_scalar<LastType>(0, stream);
   auto output_itr =
     make_sizes_to_offsets_iterator(result, result + std::distance(begin, end), last_element.data());
   // This function uses the type of the initialization parameter as the accumulator type
diff --git a/cpp/include/cudf/reduction/detail/reduction.cuh b/cpp/include/cudf/reduction/detail/reduction.cuh
@@ -19,13 +19,13 @@
 #include "reduction_operators.cuh"
 
 #include <cudf/column/column_factories.hpp>
+#include <cudf/detail/device_scalar.hpp>
 #include <cudf/detail/utilities/cast_functor.cuh>
 #include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
-#include <rmm/device_scalar.hpp>
 #include <rmm/exec_policy.hpp>
 
 #include <cub/device/device_reduce.cuh>
@@ -123,7 +123,7 @@ std::unique_ptr<scalar> reduce(InputIterator d_in,
 {
   auto const binary_op     = cudf::detail::cast_functor<OutputType>(op.get_binary_op());
   auto const initial_value = init.value_or(op.template get_identity<OutputType>());
-  auto dev_result          = rmm::device_scalar<OutputType>{initial_value, stream};
+  auto dev_result          = cudf::detail::device_scalar<OutputType>{initial_value, stream};
 
   // Allocate temporary storage
   rmm::device_buffer d_temp_storage;
@@ -167,7 +167,6 @@ std::unique_ptr<scalar> reduce(InputIterator d_in,
  * @param op          the reduction operator
  * @param valid_count Number of valid items
  * @param ddof        Delta degrees of freedom used for standard deviation and variance
- * @param init        Optional initial value of the reduction
  * @param stream      CUDA stream used for device memory operations and kernel launches
  * @param mr          Device memory resource used to allocate the returned scalar's device memory
  * @returns Output scalar in device memory
@@ -187,7 +186,7 @@ std::unique_ptr<scalar> reduce(InputIterator d_in,
   auto const binary_op     = cudf::detail::cast_functor<IntermediateType>(op.get_binary_op());
   auto const initial_value = op.template get_identity<IntermediateType>();
 
-  rmm::device_scalar<IntermediateType> intermediate_result{initial_value, stream};
+  cudf::detail::device_scalar<IntermediateType> intermediate_result{initial_value, stream};
 
   // Allocate temporary storage
   rmm::device_buffer d_temp_storage;
diff --git a/cpp/include/cudf_test/nanoarrow_utils.hpp b/cpp/include/cudf_test/nanoarrow_utils.hpp
@@ -160,7 +160,7 @@ std::enable_if_t<std::is_same_v<T, cudf::string_view>, void> populate_from_col(
     ArrowArrayBuffer(arr, 2)->size_bytes = sview.chars_size(cudf::get_default_stream());
     ArrowArrayBuffer(arr, 2)->data       = const_cast<uint8_t*>(view.data<uint8_t>());
   } else {
-    auto zero          = rmm::device_scalar<int32_t>(0, cudf::get_default_stream());
+    auto zero          = cudf::detail::device_scalar<int32_t>(0, cudf::get_default_stream());
     uint8_t const* ptr = reinterpret_cast<uint8_t*>(zero.data());
     nanoarrow::BufferInitWrapped(ArrowArrayBuffer(arr, 1), std::move(zero), ptr, 4);
   }
diff --git a/cpp/src/join/sort_merge_join.cu b/cpp/src/join/sort_merge_join.cu
@@ -174,8 +174,8 @@ merge<LargerIterator, SmallerIterator>::matches_per_row(rmm::cuda_stream_view st
 
   // naive: iterate through larger table and binary search on smaller table
   auto const larger_numrows = larger.num_rows();
-  rmm::device_scalar<bound_type> d_lb_type(bound_type::LOWER, stream, temp_mr);
-  rmm::device_scalar<bound_type> d_ub_type(bound_type::UPPER, stream, temp_mr);
+  cudf::detail::device_scalar<bound_type> d_lb_type(bound_type::LOWER, stream, temp_mr);
+  cudf::detail::device_scalar<bound_type> d_ub_type(bound_type::UPPER, stream, temp_mr);
 
   auto match_counts =
     cudf::detail::make_zeroed_device_uvector_async<size_type>(larger_numrows + 1, stream, temp_mr);
diff --git a/cpp/tests/iterator/sizes_to_offsets_iterator_test.cu b/cpp/tests/iterator/sizes_to_offsets_iterator_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,10 +17,10 @@
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/type_lists.hpp>
 
+#include <cudf/detail/device_scalar.hpp>
 #include <cudf/detail/sizes_to_offsets_iterator.cuh>
 #include <cudf/utilities/default_stream.hpp>
 
-#include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 
@@ -48,7 +48,7 @@ TYPED_TEST(SizesToOffsetsIteratorTestTyped, ExclusiveScan)
   auto d_col  = cudf::test::fixed_width_column_wrapper<T>(sizes.begin(), sizes.end());
   auto d_view = cudf::column_view(d_col);
 
-  auto last   = rmm::device_scalar<LastType>(0, stream);
+  auto last   = cudf::detail::device_scalar<LastType>(0, stream);
   auto result = rmm::device_uvector<T>(d_view.size(), stream);
   auto output_itr =
     cudf::detail::make_sizes_to_offsets_iterator(result.begin(), result.end(), last.data());
@@ -80,7 +80,7 @@ TEST_F(SizesToOffsetsIteratorTest, ScanWithOverflow)
   auto d_col  = cudf::test::fixed_width_column_wrapper<int32_t>(values.begin(), values.end());
   auto d_view = cudf::column_view(d_col);
 
-  auto last   = rmm::device_scalar<int64_t>(0, stream);
+  auto last   = cudf::detail::device_scalar<int64_t>(0, stream);
   auto result = rmm::device_uvector<int32_t>(d_view.size(), stream);
   auto output_itr =
     cudf::detail::make_sizes_to_offsets_iterator(result.begin(), result.end(), last.data());
diff --git a/cpp/tests/scalar/scalar_device_view_test.cu b/cpp/tests/scalar/scalar_device_view_test.cu
@@ -59,7 +59,7 @@ TYPED_TEST(TypedScalarDeviceViewTest, Value)
 
   auto scalar_device_view  = cudf::get_scalar_device_view(s);
   auto scalar_device_view1 = cudf::get_scalar_device_view(s1);
-  rmm::device_scalar<bool> result{cudf::get_default_stream()};
+  cudf::detail::device_scalar<bool> result{cudf::get_default_stream()};
 
   test_set_value<<<1, 1, 0, cudf::get_default_stream().value()>>>(scalar_device_view,
                                                                   scalar_device_view1);
@@ -86,7 +86,7 @@ TYPED_TEST(TypedScalarDeviceViewTest, ConstructNull)
   TypeParam value = cudf::test::make_type_param_scalar<TypeParam>(5);
   cudf::scalar_type_t<TypeParam> s(value, false);
   auto scalar_device_view = cudf::get_scalar_device_view(s);
-  rmm::device_scalar<bool> result{cudf::get_default_stream()};
+  cudf::detail::device_scalar<bool> result{cudf::get_default_stream()};
 
   test_null<<<1, 1, 0, cudf::get_default_stream().value()>>>(scalar_device_view, result.data());
   CUDF_CHECK_CUDA(0);
@@ -130,7 +130,7 @@ TEST_F(StringScalarDeviceViewTest, Value)
   cudf::string_scalar s(value);
 
   auto scalar_device_view = cudf::get_scalar_device_view(s);
-  rmm::device_scalar<bool> result{cudf::get_default_stream()};
+  cudf::detail::device_scalar<bool> result{cudf::get_default_stream()};
   auto value_v = cudf::detail::make_device_uvector(
     value, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 

Original file line number	Diff line number	Diff line change
`@@ -160,7 +160,7 @@ std::enable_if_t<std::is_same_v<T, cudf::string_view>, void> populate_from_col(`
`160`	`160`	`ArrowArrayBuffer(arr, 2)->size_bytes = sview.chars_size(cudf::get_default_stream());`
`161`	`161`	`ArrowArrayBuffer(arr, 2)->data = const_cast<uint8_t*>(view.data<uint8_t>());`
`162`	`162`	`} else {`
`163`		`- auto zero = rmm::device_scalar<int32_t>(0, cudf::get_default_stream());`
	`163`	`+ auto zero = cudf::detail::device_scalar<int32_t>(0, cudf::get_default_stream());`
`164`	`164`	`uint8_t const* ptr = reinterpret_cast<uint8_t*>(zero.data());`
`165`	`165`	`nanoarrow::BufferInitWrapped(ArrowArrayBuffer(arr, 1), std::move(zero), ptr, 4);`
`166`	`166`	`}`