Skip to content
Merged
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
7f0785a
[No Miss-sync] use cudf::detail::device_scalar in reduction.cuh (and …
JigaoLuo Jun 8, 2025
3fab90c
[No Miss-sync] use cudf::detail::device_scalar in sizes_to_offsets_it…
JigaoLuo Jun 8, 2025
e0431a3
[No Miss-sync] use cudf::detail::device_scalar in tests
JigaoLuo Jun 8, 2025
b214963
[No Miss-sync] update include
JigaoLuo Jun 8, 2025
9866c40
[No Miss-sync] use cudf::detail::device_scalar in sort_merge_join.cu
JigaoLuo Jun 8, 2025
f4eb927
[No Miss-sync] use cudf::detail::device_scalar in nanoarrow_utils.hpp
JigaoLuo Jun 8, 2025
06ae1d2
[No Miss-sync] Update DEVELOPER_GUIDE.md
JigaoLuo Jun 8, 2025
04e5415
Merge branch 'branch-25.08' into no-miss-sync-cudf-devicescalar
vuule Jun 9, 2025
cc6baa9
Revert "[No Miss-sync] Update DEVELOPER_GUIDE.md"
JigaoLuo Jun 10, 2025
5c54e60
Revert "[No Miss-sync] use cudf::detail::device_scalar in reduction.c…
JigaoLuo Jun 10, 2025
027961b
Overloading chrono_scalar with cudf::detail::device_scalar to remove …
JigaoLuo Jun 16, 2025
90d0a7c
[No Miss-sync] use cudf::detail::device_scalar in reduction.cuh
JigaoLuo Jun 16, 2025
a28c3f3
Merge branch 'branch-25.08' into no-miss-sync-cudf-devicescalar
JigaoLuo Jun 16, 2025
b06cc24
Merge branch 'branch-25.08' into no-miss-sync-cudf-devicescalar
JigaoLuo Jun 17, 2025
25903b2
Merge branch 'branch-25.08' into no-miss-sync-cudf-devicescalar
vuule Aug 1, 2025
fbf6aeb
revert chrono_scalar changes
vuule Aug 2, 2025
1d467c1
Merge branch 'branch-25.10' into no-miss-sync-cudf-devicescalar
vuule Aug 2, 2025
b8c9424
revert one of the replacements
vuule Aug 7, 2025
9e7c8db
Merge branch 'branch-25.10' into no-miss-sync-cudf-devicescalar
vuule Aug 7, 2025
be5c1f4
Reapply "[No Miss-sync] Update DEVELOPER_GUIDE.md"
JigaoLuo Aug 9, 2025
a6162f8
Merge remote-tracking branch 'upstream/branch-25.10' into no-miss-syn…
JigaoLuo Aug 11, 2025
3691539
Revert "Reapply "[No Miss-sync] Update DEVELOPER_GUIDE.md""
JigaoLuo Aug 12, 2025
ccae710
add cudf device_scalar in DEVELOPER_GUIDE.md
JigaoLuo Aug 12, 2025
2a274e2
tiny change wording
JigaoLuo Aug 12, 2025
6708c96
Merge branch 'branch-25.10' into no-miss-sync-cudf-devicescalar
vuule Aug 12, 2025
5fc3411
remove trailing whitespace
vuule Aug 12, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -655,6 +655,21 @@ kernel<<<...>>>(int_scalar.data(),...);
int host_value = int_scalar.value();
```

##### cudf::detail::device_scalar<T>
Acts as a drop-in replacement for `rmm::device_scalar<T>`; the key difference is that it
uses pinned host memory as a bounce buffer for data transfers.
It is recommended for internal use because it avoids the implicit synchronization overhead
caused by memcpy operations on pageable host memory.

```c++
// Same as the case with rmm::device_scalar<T> above
cudf::detail::device_scalar<int> int_scalar{42, stream, mr};
kernel<<<...>>>(int_scalar.data(),...);

// Note: This device-to-host transfer uses a host-pinned bounce buffer for efficient memcpy
int host_value = int_scalar.value();
```

#### rmm::device_vector<T>

Allocates a specified number of elements of the specified type. If no initialization value is
Expand Down
6 changes: 3 additions & 3 deletions cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,12 @@
#pragma once

#include <cudf/column/column_factories.hpp>
#include <cudf/detail/device_scalar.hpp>
#include <cudf/detail/iterator.cuh>
#include <cudf/types.hpp>
#include <cudf/utilities/memory_resource.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_scalar.hpp>
#include <rmm/exec_policy.hpp>

#include <cuda/functional>
Expand Down Expand Up @@ -203,7 +203,7 @@ struct sizes_to_offsets_iterator {
* auto begin = // begin input iterator
* auto end = // end input iterator
* auto result = rmm::device_uvector(std::distance(begin,end), stream);
* auto last = rmm::device_scalar<int64_t>(0, stream);
* auto last = cudf::detail::device_scalar<int64_t>(0, stream);
* auto itr = make_sizes_to_offsets_iterator(result.begin(),
* result.end(),
* last.data());
Expand Down Expand Up @@ -270,7 +270,7 @@ auto sizes_to_offsets(SizesIterator begin,
"Only numeric types are supported by sizes_to_offsets");

using LastType = std::conditional_t<std::is_signed_v<SizeType>, int64_t, uint64_t>;
auto last_element = rmm::device_scalar<LastType>(0, stream);
auto last_element = cudf::detail::device_scalar<LastType>(0, stream);
auto output_itr =
make_sizes_to_offsets_iterator(result, result + std::distance(begin, end), last_element.data());
// This function uses the type of the initialization parameter as the accumulator type
Expand Down
7 changes: 3 additions & 4 deletions cpp/include/cudf/reduction/detail/reduction.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,13 @@
#include "reduction_operators.cuh"

#include <cudf/column/column_factories.hpp>
#include <cudf/detail/device_scalar.hpp>
#include <cudf/detail/utilities/cast_functor.cuh>
#include <cudf/utilities/memory_resource.hpp>
#include <cudf/utilities/type_dispatcher.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/device_scalar.hpp>
#include <rmm/exec_policy.hpp>

#include <cub/device/device_reduce.cuh>
Expand Down Expand Up @@ -123,7 +123,7 @@ std::unique_ptr<scalar> reduce(InputIterator d_in,
{
auto const binary_op = cudf::detail::cast_functor<OutputType>(op.get_binary_op());
auto const initial_value = init.value_or(op.template get_identity<OutputType>());
auto dev_result = rmm::device_scalar<OutputType>{initial_value, stream};
auto dev_result = cudf::detail::device_scalar<OutputType>{initial_value, stream};

// Allocate temporary storage
rmm::device_buffer d_temp_storage;
Expand Down Expand Up @@ -167,7 +167,6 @@ std::unique_ptr<scalar> reduce(InputIterator d_in,
* @param op the reduction operator
* @param valid_count Number of valid items
* @param ddof Delta degrees of freedom used for standard deviation and variance
* @param init Optional initial value of the reduction
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned scalar's device memory
* @returns Output scalar in device memory
Expand All @@ -187,7 +186,7 @@ std::unique_ptr<scalar> reduce(InputIterator d_in,
auto const binary_op = cudf::detail::cast_functor<IntermediateType>(op.get_binary_op());
auto const initial_value = op.template get_identity<IntermediateType>();

rmm::device_scalar<IntermediateType> intermediate_result{initial_value, stream};
cudf::detail::device_scalar<IntermediateType> intermediate_result{initial_value, stream};

// Allocate temporary storage
rmm::device_buffer d_temp_storage;
Expand Down
2 changes: 1 addition & 1 deletion cpp/include/cudf_test/nanoarrow_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ std::enable_if_t<std::is_same_v<T, cudf::string_view>, void> populate_from_col(
ArrowArrayBuffer(arr, 2)->size_bytes = sview.chars_size(cudf::get_default_stream());
ArrowArrayBuffer(arr, 2)->data = const_cast<uint8_t*>(view.data<uint8_t>());
} else {
auto zero = rmm::device_scalar<int32_t>(0, cudf::get_default_stream());
auto zero = cudf::detail::device_scalar<int32_t>(0, cudf::get_default_stream());
uint8_t const* ptr = reinterpret_cast<uint8_t*>(zero.data());
nanoarrow::BufferInitWrapped(ArrowArrayBuffer(arr, 1), std::move(zero), ptr, 4);
}
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/join/sort_merge_join.cu
Original file line number Diff line number Diff line change
Expand Up @@ -174,8 +174,8 @@ merge<LargerIterator, SmallerIterator>::matches_per_row(rmm::cuda_stream_view st

// naive: iterate through larger table and binary search on smaller table
auto const larger_numrows = larger.num_rows();
rmm::device_scalar<bound_type> d_lb_type(bound_type::LOWER, stream, temp_mr);
rmm::device_scalar<bound_type> d_ub_type(bound_type::UPPER, stream, temp_mr);
cudf::detail::device_scalar<bound_type> d_lb_type(bound_type::LOWER, stream, temp_mr);
cudf::detail::device_scalar<bound_type> d_ub_type(bound_type::UPPER, stream, temp_mr);

auto match_counts =
cudf::detail::make_zeroed_device_uvector_async<size_type>(larger_numrows + 1, stream, temp_mr);
Expand Down
8 changes: 4 additions & 4 deletions cpp/tests/iterator/sizes_to_offsets_iterator_test.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
* Copyright (c) 2022-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -17,10 +17,10 @@
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/type_lists.hpp>

#include <cudf/detail/device_scalar.hpp>
#include <cudf/detail/sizes_to_offsets_iterator.cuh>
#include <cudf/utilities/default_stream.hpp>

#include <rmm/device_scalar.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

Expand Down Expand Up @@ -48,7 +48,7 @@ TYPED_TEST(SizesToOffsetsIteratorTestTyped, ExclusiveScan)
auto d_col = cudf::test::fixed_width_column_wrapper<T>(sizes.begin(), sizes.end());
auto d_view = cudf::column_view(d_col);

auto last = rmm::device_scalar<LastType>(0, stream);
auto last = cudf::detail::device_scalar<LastType>(0, stream);
auto result = rmm::device_uvector<T>(d_view.size(), stream);
auto output_itr =
cudf::detail::make_sizes_to_offsets_iterator(result.begin(), result.end(), last.data());
Expand Down Expand Up @@ -80,7 +80,7 @@ TEST_F(SizesToOffsetsIteratorTest, ScanWithOverflow)
auto d_col = cudf::test::fixed_width_column_wrapper<int32_t>(values.begin(), values.end());
auto d_view = cudf::column_view(d_col);

auto last = rmm::device_scalar<int64_t>(0, stream);
auto last = cudf::detail::device_scalar<int64_t>(0, stream);
auto result = rmm::device_uvector<int32_t>(d_view.size(), stream);
auto output_itr =
cudf::detail::make_sizes_to_offsets_iterator(result.begin(), result.end(), last.data());
Expand Down
6 changes: 3 additions & 3 deletions cpp/tests/scalar/scalar_device_view_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ TYPED_TEST(TypedScalarDeviceViewTest, Value)

auto scalar_device_view = cudf::get_scalar_device_view(s);
auto scalar_device_view1 = cudf::get_scalar_device_view(s1);
rmm::device_scalar<bool> result{cudf::get_default_stream()};
cudf::detail::device_scalar<bool> result{cudf::get_default_stream()};

test_set_value<<<1, 1, 0, cudf::get_default_stream().value()>>>(scalar_device_view,
scalar_device_view1);
Expand All @@ -86,7 +86,7 @@ TYPED_TEST(TypedScalarDeviceViewTest, ConstructNull)
TypeParam value = cudf::test::make_type_param_scalar<TypeParam>(5);
cudf::scalar_type_t<TypeParam> s(value, false);
auto scalar_device_view = cudf::get_scalar_device_view(s);
rmm::device_scalar<bool> result{cudf::get_default_stream()};
cudf::detail::device_scalar<bool> result{cudf::get_default_stream()};

test_null<<<1, 1, 0, cudf::get_default_stream().value()>>>(scalar_device_view, result.data());
CUDF_CHECK_CUDA(0);
Expand Down Expand Up @@ -130,7 +130,7 @@ TEST_F(StringScalarDeviceViewTest, Value)
cudf::string_scalar s(value);

auto scalar_device_view = cudf::get_scalar_device_view(s);
rmm::device_scalar<bool> result{cudf::get_default_stream()};
cudf::detail::device_scalar<bool> result{cudf::get_default_stream()};
auto value_v = cudf::detail::make_device_uvector(
value, cudf::get_default_stream(), cudf::get_current_device_resource_ref());

Expand Down
Loading