Skip to content

Commit 5faed19

Browse files
authored
Implement parallel cuda::std::is_sorted (#8064)
This implements the `is_sorted` algorithms for the cuda backend. * `std::is_sorted` see https://en.cppreference.com/w/cpp/algorithm/is_sorted.html * `std::is_sorted_until` see https://en.cppreference.com/w/cpp/algorithm/is_sorted_until.html It provides tests and benchmarks similar to Thrust and some boilerplate for libcu++. The functionality is not publicly available yet and is implemented in a private internal header. Fixes #7762
1 parent 77e72d5 commit 5faed19

File tree

11 files changed

+921
-0
lines changed

11 files changed

+921
-0
lines changed
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of libcu++, the C++ Standard Library for your entire system,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#include <thrust/device_vector.h>
12+
#include <thrust/sequence.h>
13+
#include <thrust/sort.h>
14+
15+
#include <cuda/functional>
16+
#include <cuda/std/__pstl_algorithm>
17+
#include <cuda/stream>
18+
19+
#include "nvbench_helper.cuh"
20+
21+
// Benchmarks cuda::std::is_sorted (default comparison) on a sequence that is
// ascending except for a single out-of-order element whose position is set by
// the "MismatchAt" axis (a fraction of the total element count).
template <typename T>
static void basic(nvbench::state& state, nvbench::type_list<T>)
{
  // Read the axis values and derive the index of the deliberately broken element.
  const auto num_elements   = static_cast<std::size_t>(state.get_int64("Elements"));
  const auto mismatch_ratio = state.get_float64("MismatchAt");
  const auto break_index    = ::cuda::std::clamp<std::size_t>(num_elements * mismatch_ratio, 0ull, num_elements - 1);

  // Build an ascending sequence, then break the ordering at break_index.
  thrust::device_vector<T> input(num_elements, thrust::no_init);
  thrust::sequence(input.begin(), input.end(), T{0});
  input[break_index] = T{-1};

  // The algorithm may stop at the first out-of-order pair, so only the sorted
  // prefix (plus the breaking element) is expected to be read.
  state.add_global_memory_reads<T>(break_index + 1);
  state.add_global_memory_writes<size_t>(1);

  caching_allocator_t alloc{};

  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
             [&](nvbench::launch& launch) {
               do_not_optimize(cuda::std::is_sorted(cuda_policy(alloc, launch), input.begin(), input.end()));
             });
}
43+
44+
// Register the default-comparison is_sorted benchmark over all fundamental
// types, sweeping power-of-two sizes and three mismatch positions (end of
// range, middle, and near the front).
NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
  .set_name("base")
  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4))
  .add_float64_axis("MismatchAt", std::vector{1.0, 0.5, 0.01});
48+
49+
// Benchmarks cuda::std::is_sorted with an explicitly supplied comparator
// (cuda::std::less<>) on a sequence that is ascending except for a single
// out-of-order element positioned by the "MismatchAt" axis.
template <typename T>
static void with_predicate(nvbench::state& state, nvbench::type_list<T>)
{
  // Read the axis values and derive the index of the deliberately broken element.
  const auto num_elements   = static_cast<std::size_t>(state.get_int64("Elements"));
  const auto mismatch_ratio = state.get_float64("MismatchAt");
  const auto break_index    = ::cuda::std::clamp<std::size_t>(num_elements * mismatch_ratio, 0ull, num_elements - 1);

  // Build an ascending sequence, then break the ordering at break_index.
  thrust::device_vector<T> input(num_elements, thrust::no_init);
  thrust::sequence(input.begin(), input.end(), T{0});
  input[break_index] = T{-1};

  // Only the sorted prefix (plus the breaking element) is expected to be read.
  state.add_global_memory_reads<T>(break_index + 1);
  state.add_global_memory_writes<size_t>(1);

  caching_allocator_t alloc{};

  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
             [&](nvbench::launch& launch) {
               do_not_optimize(
                 cuda::std::is_sorted(cuda_policy(alloc, launch), input.begin(), input.end(), cuda::std::less<>{}));
             });
}
72+
73+
// Register the explicit-predicate is_sorted benchmark.
// Fix: the original registered `basic` a second time under the duplicate name
// "base", leaving `with_predicate` defined but never run; register
// `with_predicate` under its own name instead.
NVBENCH_BENCH_TYPES(with_predicate, NVBENCH_TYPE_AXES(fundamental_types))
  .set_name("base_predicate")
  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4))
  .add_float64_axis("MismatchAt", std::vector{1.0, 0.5, 0.01});
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of libcu++, the C++ Standard Library for your entire system,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#include <thrust/device_vector.h>
12+
#include <thrust/sequence.h>
13+
#include <thrust/sort.h>
14+
15+
#include <cuda/functional>
16+
#include <cuda/std/__pstl_algorithm>
17+
#include <cuda/stream>
18+
19+
#include "nvbench_helper.cuh"
20+
21+
// Benchmarks cuda::std::is_sorted_until (default comparison) on a sequence
// that is ascending except for a single out-of-order element whose position
// is set by the "MismatchAt" axis (a fraction of the total element count).
template <typename T>
static void basic(nvbench::state& state, nvbench::type_list<T>)
{
  // Read the axis values and derive the index of the deliberately broken element.
  const auto num_elements   = static_cast<std::size_t>(state.get_int64("Elements"));
  const auto mismatch_ratio = state.get_float64("MismatchAt");
  const auto break_index    = ::cuda::std::clamp<std::size_t>(num_elements * mismatch_ratio, 0ull, num_elements - 1);

  // Build an ascending sequence, then break the ordering at break_index.
  thrust::device_vector<T> input(num_elements, thrust::no_init);
  thrust::sequence(input.begin(), input.end(), T{0});
  input[break_index] = T{-1};

  // The algorithm may stop at the first out-of-order pair, so only the sorted
  // prefix (plus the breaking element) is expected to be read.
  state.add_global_memory_reads<T>(break_index + 1);
  state.add_global_memory_writes<size_t>(1);

  caching_allocator_t alloc{};

  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
             [&](nvbench::launch& launch) {
               do_not_optimize(cuda::std::is_sorted_until(cuda_policy(alloc, launch), input.begin(), input.end()));
             });
}
43+
44+
// Register the default-comparison is_sorted_until benchmark over all
// fundamental types, sweeping power-of-two sizes and three mismatch positions
// (end of range, middle, and near the front).
NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
  .set_name("base")
  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4))
  .add_float64_axis("MismatchAt", std::vector{1.0, 0.5, 0.01});
48+
49+
// Benchmarks cuda::std::is_sorted_until with an explicitly supplied comparator
// (cuda::std::less<>) on a sequence that is ascending except for a single
// out-of-order element positioned by the "MismatchAt" axis.
template <typename T>
static void with_predicate(nvbench::state& state, nvbench::type_list<T>)
{
  // Read the axis values and derive the index of the deliberately broken element.
  const auto num_elements   = static_cast<std::size_t>(state.get_int64("Elements"));
  const auto mismatch_ratio = state.get_float64("MismatchAt");
  const auto break_index    = ::cuda::std::clamp<std::size_t>(num_elements * mismatch_ratio, 0ull, num_elements - 1);

  // Build an ascending sequence, then break the ordering at break_index.
  thrust::device_vector<T> input(num_elements, thrust::no_init);
  thrust::sequence(input.begin(), input.end(), T{0});
  input[break_index] = T{-1};

  // Only the sorted prefix (plus the breaking element) is expected to be read.
  state.add_global_memory_reads<T>(break_index + 1);
  state.add_global_memory_writes<size_t>(1);

  caching_allocator_t alloc{};

  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
             [&](nvbench::launch& launch) {
               do_not_optimize(cuda::std::is_sorted_until(
                 cuda_policy(alloc, launch), input.begin(), input.end(), cuda::std::less<>{}));
             });
}
72+
73+
// Register the explicit-predicate is_sorted_until benchmark.
// Fix: the original registered `basic` a second time under the duplicate name
// "base", leaving `with_predicate` defined but never run; register
// `with_predicate` under its own name instead.
NVBENCH_BENCH_TYPES(with_predicate, NVBENCH_TYPE_AXES(fundamental_types))
  .set_name("base_predicate")
  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4))
  .add_float64_axis("MismatchAt", std::vector{1.0, 0.5, 0.01});
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of libcu++, the C++ Standard Library for your entire system,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#ifndef _CUDA_STD___PSTL_IS_SORTED_H
12+
#define _CUDA_STD___PSTL_IS_SORTED_H
13+
14+
#include <cuda/std/detail/__config>
15+
16+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17+
# pragma GCC system_header
18+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
19+
# pragma clang system_header
20+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
21+
# pragma system_header
22+
#endif // no system header
23+
24+
#if !_CCCL_COMPILER(NVRTC)
25+
26+
# include <cuda/__iterator/zip_function.h>
27+
# include <cuda/__iterator/zip_iterator.h>
28+
# include <cuda/__nvtx/nvtx.h>
29+
# include <cuda/std/__algorithm/is_sorted.h>
30+
# include <cuda/std/__concepts/concept_macros.h>
31+
# include <cuda/std/__execution/policy.h>
32+
# include <cuda/std/__functional/operations.h>
33+
# include <cuda/std/__iterator/concepts.h>
34+
# include <cuda/std/__iterator/iterator_traits.h>
35+
# include <cuda/std/__pstl/dispatch.h>
36+
# include <cuda/std/__type_traits/always_false.h>
37+
# include <cuda/std/__type_traits/is_execution_policy.h>
38+
# include <cuda/std/__utility/move.h>
39+
40+
# if _CCCL_HAS_BACKEND_CUDA()
41+
# include <cuda/std/__pstl/cuda/find_if.h>
42+
# endif // _CCCL_HAS_BACKEND_CUDA()
43+
44+
# include <cuda/std/__cccl/prologue.h>
45+
46+
_CCCL_BEGIN_NAMESPACE_CUDA_STD
47+
48+
_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT
49+
50+
// Parallel overload of cuda::std::is_sorted: returns true iff no adjacent pair
// in [__first, __last) is out of order with respect to __pred (defaults to
// less<>, i.e. non-descending order).
//
// Implementation strategy: reuse the parallel find_if backend. The range is
// zipped with itself shifted by one, and we search for the first pair where
// __pred(element[i + 1], element[i]) holds — i.e. the first out-of-order pair.
_CCCL_TEMPLATE(class _Policy, class _InputIterator, class _BinaryPredicate = less<>)
_CCCL_REQUIRES(__has_forward_traversal<_InputIterator> _CCCL_AND is_execution_policy_v<_Policy>)
[[nodiscard]] _CCCL_HOST_API bool is_sorted(
  [[maybe_unused]] const _Policy& __policy, _InputIterator __first, _InputIterator __last, _BinaryPredicate __pred = {})
{
  static_assert(indirect_binary_predicate<_BinaryPredicate, _InputIterator, _InputIterator>,
                "cuda::std::is_sorted: BinaryPredicate must satisfy "
                "indirect_binary_predicate<BinaryPredicate, InputIterator, InputIterator>");

  // Select the find_if backend implementation for the given execution policy.
  [[maybe_unused]] auto __dispatch =
    ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__find_if, _Policy>();
  if constexpr (::cuda::std::execution::__pstl_can_dispatch<decltype(__dispatch)>)
  {
    _CCCL_NVTX_RANGE_SCOPE("cuda::std::is_sorted");

    // An empty range is trivially sorted.
    if (__first == __last)
    {
      return true;
    }

    // Note we compare __first + 1 and __first, so that we do not need to negate the predicate
    // NOTE(review): `__first + 1` uses iterator arithmetic although the constraint
    // above only requires forward traversal — confirm the selected backends demand
    // random access (or that `+ 1` is valid for all accepted iterators).
    auto __result = __dispatch(
      __policy,
      ::cuda::zip_iterator{__first + 1, __first},
      ::cuda::zip_iterator{__last, __last},
      ::cuda::zip_function{::cuda::std::move(__pred)});
    // Sorted iff the search found no out-of-order pair, i.e. ran off the end.
    return ::cuda::std::get<0>(__result.__iterators()) == __last;
  }
  else
  {
    // No parallel backend was selected: fail loudly at compile time. The serial
    // fallback call below merely keeps the function body well-formed.
    static_assert(__always_false_v<_Policy>, "Parallel cuda::std::is_sorted requires at least one selected backend");
    return ::cuda::std::is_sorted(::cuda::std::move(__first), ::cuda::std::move(__last), ::cuda::std::move(__pred));
  }
}
84+
85+
_CCCL_END_NAMESPACE_ARCH_DEPENDENT
86+
87+
_CCCL_END_NAMESPACE_CUDA_STD
88+
89+
# include <cuda/std/__cccl/epilogue.h>
90+
91+
#endif // !_CCCL_COMPILER(NVRTC)
92+
93+
#endif // _CUDA_STD___PSTL_IS_SORTED_H
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of libcu++, the C++ Standard Library for your entire system,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#ifndef _CUDA_STD___PSTL_IS_SORTED_UNTIL_H
12+
#define _CUDA_STD___PSTL_IS_SORTED_UNTIL_H
13+
14+
#include <cuda/std/detail/__config>
15+
16+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17+
# pragma GCC system_header
18+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
19+
# pragma clang system_header
20+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
21+
# pragma system_header
22+
#endif // no system header
23+
24+
#if !_CCCL_COMPILER(NVRTC)
25+
26+
# include <cuda/__iterator/zip_function.h>
27+
# include <cuda/__iterator/zip_iterator.h>
28+
# include <cuda/__nvtx/nvtx.h>
29+
# include <cuda/std/__algorithm/is_sorted_until.h>
30+
# include <cuda/std/__concepts/concept_macros.h>
31+
# include <cuda/std/__execution/policy.h>
32+
# include <cuda/std/__functional/operations.h>
33+
# include <cuda/std/__iterator/concepts.h>
34+
# include <cuda/std/__iterator/iterator_traits.h>
35+
# include <cuda/std/__pstl/dispatch.h>
36+
# include <cuda/std/__type_traits/always_false.h>
37+
# include <cuda/std/__type_traits/is_execution_policy.h>
38+
# include <cuda/std/__utility/move.h>
39+
40+
# if _CCCL_HAS_BACKEND_CUDA()
41+
# include <cuda/std/__pstl/cuda/find_if.h>
42+
# endif // _CCCL_HAS_BACKEND_CUDA()
43+
44+
# include <cuda/std/__cccl/prologue.h>
45+
46+
_CCCL_BEGIN_NAMESPACE_CUDA_STD
47+
48+
_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT
49+
50+
// Parallel overload of cuda::std::is_sorted_until: returns the iterator to the
// first element that breaks the ordering imposed by __pred (defaults to
// less<>), i.e. one past the longest sorted prefix, or __last when the whole
// range is sorted.
//
// Implementation strategy: reuse the parallel find_if backend. The range is
// zipped with itself shifted by one, and we search for the first pair where
// __pred(element[i + 1], element[i]) holds — i.e. the first out-of-order pair.
_CCCL_TEMPLATE(class _Policy, class _InputIterator, class _BinaryPredicate = less<>)
_CCCL_REQUIRES(__has_forward_traversal<_InputIterator> _CCCL_AND is_execution_policy_v<_Policy>)
[[nodiscard]] _CCCL_HOST_API _InputIterator is_sorted_until(
  [[maybe_unused]] const _Policy& __policy, _InputIterator __first, _InputIterator __last, _BinaryPredicate __pred = {})
{
  static_assert(indirect_binary_predicate<_BinaryPredicate, _InputIterator, _InputIterator>,
                "cuda::std::is_sorted_until: BinaryPredicate must satisfy "
                "indirect_binary_predicate<BinaryPredicate, InputIterator, InputIterator>");

  // Select the find_if backend implementation for the given execution policy.
  [[maybe_unused]] auto __dispatch =
    ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__find_if, _Policy>();
  if constexpr (::cuda::std::execution::__pstl_can_dispatch<decltype(__dispatch)>)
  {
    _CCCL_NVTX_RANGE_SCOPE("cuda::std::is_sorted_until");

    // An empty range is trivially sorted; return its begin (== end).
    if (__first == __last)
    {
      return __first;
    }

    // Note we compare __first + 1 and __first, so that we do not need to negate the predicate
    // NOTE(review): `__first + 1` uses iterator arithmetic although the constraint
    // above only requires forward traversal — confirm the selected backends demand
    // random access (or that `+ 1` is valid for all accepted iterators).
    auto __result = __dispatch(
      __policy,
      ::cuda::zip_iterator{__first + 1, __first},
      ::cuda::zip_iterator{__last, __last},
      ::cuda::zip_function{::cuda::std::move(__pred)});
    // The first zipped component points at the order-breaking element (or __last
    // when none was found), which is exactly is_sorted_until's return value.
    return ::cuda::std::get<0>(__result.__iterators());
  }
  else
  {
    // No parallel backend was selected: fail loudly at compile time. The serial
    // fallback call below merely keeps the function body well-formed.
    static_assert(__always_false_v<_Policy>,
                  "Parallel cuda::std::is_sorted_until requires at least one selected backend");
    return ::cuda::std::is_sorted_until(
      ::cuda::std::move(__first), ::cuda::std::move(__last), ::cuda::std::move(__pred));
  }
}
86+
87+
_CCCL_END_NAMESPACE_ARCH_DEPENDENT
88+
89+
_CCCL_END_NAMESPACE_CUDA_STD
90+
91+
# include <cuda/std/__cccl/epilogue.h>
92+
93+
#endif // !_CCCL_COMPILER(NVRTC)
94+
95+
#endif // _CUDA_STD___PSTL_IS_SORTED_UNTIL_H

libcudacxx/include/cuda/std/__pstl_algorithm

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@
4242
#include <cuda/std/__pstl/generate.h>
4343
#include <cuda/std/__pstl/generate_n.h>
4444
#include <cuda/std/__pstl/inclusive_scan.h>
45+
#include <cuda/std/__pstl/is_sorted.h>
46+
#include <cuda/std/__pstl/is_sorted_until.h>
4547
#include <cuda/std/__pstl/merge.h>
4648
#include <cuda/std/__pstl/mismatch.h>
4749
#include <cuda/std/__pstl/none_of.h>

0 commit comments

Comments
 (0)