Skip to content

Commit 5faed19

Browse files
authored
Implement parallel cuda::std::is_sorted (#8064)
This implements the `is_sorted` algorithms for the cuda backend. * `std::is_sorted` see https://en.cppreference.com/w/cpp/algorithm/is_sorted.html * `std::is_sorted_until` see https://en.cppreference.com/w/cpp/algorithm/is_sorted_until.html It provides tests and benchmarks similar to Thrust and some boilerplate for libcu++. The functionality is not publicly available yet and is implemented in a private internal header. Fixes #7762
1 parent 77e72d5 commit 5faed19

File tree

11 files changed

+921
-0
lines changed

11 files changed

+921
-0
lines changed
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of libcu++, the C++ Standard Library for your entire system,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#include <thrust/device_vector.h>
12+
#include <thrust/sequence.h>
13+
#include <thrust/sort.h>
14+
15+
#include <cuda/functional>
16+
#include <cuda/std/__pstl_algorithm>
17+
#include <cuda/stream>
18+
19+
#include "nvbench_helper.cuh"
20+
21+
// Benchmarks cuda::std::is_sorted (default comparison) on a sequence that is
// ascending except for a single out-of-order element whose position is set by
// the "MismatchAt" axis (a fraction of the total element count).
template <typename T>
static void basic(nvbench::state& state, nvbench::type_list<T>)
{
  // Read the axis values and derive the index of the deliberately broken element.
  const auto num_elements   = static_cast<std::size_t>(state.get_int64("Elements"));
  const auto mismatch_ratio = state.get_float64("MismatchAt");
  const auto break_index    = ::cuda::std::clamp<std::size_t>(num_elements * mismatch_ratio, 0ull, num_elements - 1);

  // Build an ascending sequence, then break the ordering at break_index.
  thrust::device_vector<T> input(num_elements, thrust::no_init);
  thrust::sequence(input.begin(), input.end(), T{0});
  input[break_index] = T{-1};

  // The algorithm may stop at the first out-of-order pair, so only the sorted
  // prefix (plus the breaking element) is expected to be read.
  state.add_global_memory_reads<T>(break_index + 1);
  state.add_global_memory_writes<size_t>(1);

  caching_allocator_t alloc{};

  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
             [&](nvbench::launch& launch) {
               do_not_optimize(cuda::std::is_sorted(cuda_policy(alloc, launch), input.begin(), input.end()));
             });
}
43+
44+
// Register the default-comparison is_sorted benchmark over all fundamental
// types, sweeping power-of-two sizes and three mismatch positions (end of
// range, middle, and near the front).
NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
  .set_name("base")
  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4))
  .add_float64_axis("MismatchAt", std::vector{1.0, 0.5, 0.01});
48+
49+
// Benchmarks cuda::std::is_sorted with an explicitly supplied comparator
// (cuda::std::less<>) on a sequence that is ascending except for a single
// out-of-order element positioned by the "MismatchAt" axis.
template <typename T>
static void with_predicate(nvbench::state& state, nvbench::type_list<T>)
{
  // Read the axis values and derive the index of the deliberately broken element.
  const auto num_elements   = static_cast<std::size_t>(state.get_int64("Elements"));
  const auto mismatch_ratio = state.get_float64("MismatchAt");
  const auto break_index    = ::cuda::std::clamp<std::size_t>(num_elements * mismatch_ratio, 0ull, num_elements - 1);

  // Build an ascending sequence, then break the ordering at break_index.
  thrust::device_vector<T> input(num_elements, thrust::no_init);
  thrust::sequence(input.begin(), input.end(), T{0});
  input[break_index] = T{-1};

  // Only the sorted prefix (plus the breaking element) is expected to be read.
  state.add_global_memory_reads<T>(break_index + 1);
  state.add_global_memory_writes<size_t>(1);

  caching_allocator_t alloc{};

  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
             [&](nvbench::launch& launch) {
               do_not_optimize(
                 cuda::std::is_sorted(cuda_policy(alloc, launch), input.begin(), input.end(), cuda::std::less<>{}));
             });
}
72+
73+
// Register the explicit-predicate is_sorted benchmark.
// Fix: the original registered `basic` a second time under the duplicate name
// "base", leaving `with_predicate` defined but never run; register
// `with_predicate` under its own name instead.
NVBENCH_BENCH_TYPES(with_predicate, NVBENCH_TYPE_AXES(fundamental_types))
  .set_name("base_predicate")
  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4))
  .add_float64_axis("MismatchAt", std::vector{1.0, 0.5, 0.01});
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of libcu++, the C++ Standard Library for your entire system,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#include <thrust/device_vector.h>
12+
#include <thrust/sequence.h>
13+
#include <thrust/sort.h>
14+
15+
#include <cuda/functional>
16+
#include <cuda/std/__pstl_algorithm>
17+
#include <cuda/stream>
18+
19+
#include "nvbench_helper.cuh"
20+
21+
// Benchmarks cuda::std::is_sorted_until (default comparison) on a sequence
// that is ascending except for a single out-of-order element whose position
// is set by the "MismatchAt" axis (a fraction of the total element count).
template <typename T>
static void basic(nvbench::state& state, nvbench::type_list<T>)
{
  // Read the axis values and derive the index of the deliberately broken element.
  const auto num_elements   = static_cast<std::size_t>(state.get_int64("Elements"));
  const auto mismatch_ratio = state.get_float64("MismatchAt");
  const auto break_index    = ::cuda::std::clamp<std::size_t>(num_elements * mismatch_ratio, 0ull, num_elements - 1);

  // Build an ascending sequence, then break the ordering at break_index.
  thrust::device_vector<T> input(num_elements, thrust::no_init);
  thrust::sequence(input.begin(), input.end(), T{0});
  input[break_index] = T{-1};

  // The algorithm may stop at the first out-of-order pair, so only the sorted
  // prefix (plus the breaking element) is expected to be read.
  state.add_global_memory_reads<T>(break_index + 1);
  state.add_global_memory_writes<size_t>(1);

  caching_allocator_t alloc{};

  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
             [&](nvbench::launch& launch) {
               do_not_optimize(cuda::std::is_sorted_until(cuda_policy(alloc, launch), input.begin(), input.end()));
             });
}
43+
44+
// Register the default-comparison is_sorted_until benchmark over all
// fundamental types, sweeping power-of-two sizes and three mismatch positions
// (end of range, middle, and near the front).
NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
  .set_name("base")
  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4))
  .add_float64_axis("MismatchAt", std::vector{1.0, 0.5, 0.01});
48+
49+
// Benchmarks cuda::std::is_sorted_until with an explicitly supplied comparator
// (cuda::std::less<>) on a sequence that is ascending except for a single
// out-of-order element positioned by the "MismatchAt" axis.
template <typename T>
static void with_predicate(nvbench::state& state, nvbench::type_list<T>)
{
  // Read the axis values and derive the index of the deliberately broken element.
  const auto num_elements   = static_cast<std::size_t>(state.get_int64("Elements"));
  const auto mismatch_ratio = state.get_float64("MismatchAt");
  const auto break_index    = ::cuda::std::clamp<std::size_t>(num_elements * mismatch_ratio, 0ull, num_elements - 1);

  // Build an ascending sequence, then break the ordering at break_index.
  thrust::device_vector<T> input(num_elements, thrust::no_init);
  thrust::sequence(input.begin(), input.end(), T{0});
  input[break_index] = T{-1};

  // Only the sorted prefix (plus the breaking element) is expected to be read.
  state.add_global_memory_reads<T>(break_index + 1);
  state.add_global_memory_writes<size_t>(1);

  caching_allocator_t alloc{};

  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
             [&](nvbench::launch& launch) {
               do_not_optimize(cuda::std::is_sorted_until(
                 cuda_policy(alloc, launch), input.begin(), input.end(), cuda::std::less<>{}));
             });
}
72+
73+
// Register the explicit-predicate is_sorted_until benchmark.
// Fix: the original registered `basic` a second time under the duplicate name
// "base", leaving `with_predicate` defined but never run; register
// `with_predicate` under its own name instead.
NVBENCH_BENCH_TYPES(with_predicate, NVBENCH_TYPE_AXES(fundamental_types))
  .set_name("base_predicate")
  .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4))
  .add_float64_axis("MismatchAt", std::vector{1.0, 0.5, 0.01});
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of libcu++, the C++ Standard Library for your entire system,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#ifndef _CUDA_STD___PSTL_IS_SORTED_H
12+
#define _CUDA_STD___PSTL_IS_SORTED_H
13+
14+
#include <cuda/std/detail/__config>
15+
16+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17+
# pragma GCC system_header
18+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
19+
# pragma clang system_header
20+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
21+
# pragma system_header
22+
#endif // no system header
23+
24+
#if !_CCCL_COMPILER(NVRTC)
25+
26+
# include <cuda/__iterator/zip_function.h>
27+
# include <cuda/__iterator/zip_iterator.h>
28+
# include <cuda/__nvtx/nvtx.h>
29+
# include <cuda/std/__algorithm/is_sorted.h>
30+
# include <cuda/std/__concepts/concept_macros.h>
31+
# include <cuda/std/__execution/policy.h>
32+
# include <cuda/std/__functional/operations.h>
33+
# include <cuda/std/__iterator/concepts.h>
34+
# include <cuda/std/__iterator/iterator_traits.h>
35+
# include <cuda/std/__pstl/dispatch.h>
36+
# include <cuda/std/__type_traits/always_false.h>
37+
# include <cuda/std/__type_traits/is_execution_policy.h>
38+
# include <cuda/std/__utility/move.h>
39+
40+
# if _CCCL_HAS_BACKEND_CUDA()
41+
# include <cuda/std/__pstl/cuda/find_if.h>
42+
# endif // _CCCL_HAS_BACKEND_CUDA()
43+
44+
# include <cuda/std/__cccl/prologue.h>
45+
46+
_CCCL_BEGIN_NAMESPACE_CUDA_STD
47+
48+
_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT
49+
50+
// Parallel overload of cuda::std::is_sorted: returns true iff no adjacent pair
// in [__first, __last) is out of order with respect to __pred (defaults to
// less<>, i.e. non-descending order).
//
// Implementation strategy: reuse the parallel find_if backend. The range is
// zipped with itself shifted by one, and we search for the first pair where
// __pred(element[i + 1], element[i]) holds — i.e. the first out-of-order pair.
_CCCL_TEMPLATE(class _Policy, class _InputIterator, class _BinaryPredicate = less<>)
_CCCL_REQUIRES(__has_forward_traversal<_InputIterator> _CCCL_AND is_execution_policy_v<_Policy>)
[[nodiscard]] _CCCL_HOST_API bool is_sorted(
  [[maybe_unused]] const _Policy& __policy, _InputIterator __first, _InputIterator __last, _BinaryPredicate __pred = {})
{
  static_assert(indirect_binary_predicate<_BinaryPredicate, _InputIterator, _InputIterator>,
                "cuda::std::is_sorted: BinaryPredicate must satisfy "
                "indirect_binary_predicate<BinaryPredicate, InputIterator, InputIterator>");

  // Select the find_if backend implementation for the given execution policy.
  [[maybe_unused]] auto __dispatch =
    ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__find_if, _Policy>();
  if constexpr (::cuda::std::execution::__pstl_can_dispatch<decltype(__dispatch)>)
  {
    _CCCL_NVTX_RANGE_SCOPE("cuda::std::is_sorted");

    // An empty range is trivially sorted.
    if (__first == __last)
    {
      return true;
    }

    // Note we compare __first + 1 and __first, so that we do not need to negate the predicate
    // NOTE(review): `__first + 1` uses iterator arithmetic although the constraint
    // above only requires forward traversal — confirm the selected backends demand
    // random access (or that `+ 1` is valid for all accepted iterators).
    auto __result = __dispatch(
      __policy,
      ::cuda::zip_iterator{__first + 1, __first},
      ::cuda::zip_iterator{__last, __last},
      ::cuda::zip_function{::cuda::std::move(__pred)});
    // Sorted iff the search found no out-of-order pair, i.e. ran off the end.
    return ::cuda::std::get<0>(__result.__iterators()) == __last;
  }
  else
  {
    // No parallel backend was selected: fail loudly at compile time. The serial
    // fallback call below merely keeps the function body well-formed.
    static_assert(__always_false_v<_Policy>, "Parallel cuda::std::is_sorted requires at least one selected backend");
    return ::cuda::std::is_sorted(::cuda::std::move(__first), ::cuda::std::move(__last), ::cuda::std::move(__pred));
  }
}
84+
85+
_CCCL_END_NAMESPACE_ARCH_DEPENDENT
86+
87+
_CCCL_END_NAMESPACE_CUDA_STD
88+
89+
# include <cuda/std/__cccl/epilogue.h>
90+
91+
#endif // !_CCCL_COMPILER(NVRTC)
92+
93+
#endif // _CUDA_STD___PSTL_IS_SORTED_H
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of libcu++, the C++ Standard Library for your entire system,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#ifndef _CUDA_STD___PSTL_IS_SORTED_UNTIL_H
12+
#define _CUDA_STD___PSTL_IS_SORTED_UNTIL_H
13+
14+
#include <cuda/std/detail/__config>
15+
16+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17+
# pragma GCC system_header
18+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
19+
# pragma clang system_header
20+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
21+
# pragma system_header
22+
#endif // no system header
23+
24+
#if !_CCCL_COMPILER(NVRTC)
25+
26+
# include <cuda/__iterator/zip_function.h>
27+
# include <cuda/__iterator/zip_iterator.h>
28+
# include <cuda/__nvtx/nvtx.h>
29+
# include <cuda/std/__algorithm/is_sorted_until.h>
30+
# include <cuda/std/__concepts/concept_macros.h>
31+
# include <cuda/std/__execution/policy.h>
32+
# include <cuda/std/__functional/operations.h>
33+
# include <cuda/std/__iterator/concepts.h>
34+
# include <cuda/std/__iterator/iterator_traits.h>
35+
# include <cuda/std/__pstl/dispatch.h>
36+
# include <cuda/std/__type_traits/always_false.h>
37+
# include <cuda/std/__type_traits/is_execution_policy.h>
38+
# include <cuda/std/__utility/move.h>
39+
40+
# if _CCCL_HAS_BACKEND_CUDA()
41+
# include <cuda/std/__pstl/cuda/find_if.h>
42+
# endif // _CCCL_HAS_BACKEND_CUDA()
43+
44+
# include <cuda/std/__cccl/prologue.h>
45+
46+
_CCCL_BEGIN_NAMESPACE_CUDA_STD
47+
48+
_CCCL_BEGIN_NAMESPACE_ARCH_DEPENDENT
49+
50+
// Parallel overload of cuda::std::is_sorted_until: returns the iterator to the
// first element that breaks the ordering imposed by __pred (defaults to
// less<>), i.e. one past the longest sorted prefix, or __last when the whole
// range is sorted.
//
// Implementation strategy: reuse the parallel find_if backend. The range is
// zipped with itself shifted by one, and we search for the first pair where
// __pred(element[i + 1], element[i]) holds — i.e. the first out-of-order pair.
_CCCL_TEMPLATE(class _Policy, class _InputIterator, class _BinaryPredicate = less<>)
_CCCL_REQUIRES(__has_forward_traversal<_InputIterator> _CCCL_AND is_execution_policy_v<_Policy>)
[[nodiscard]] _CCCL_HOST_API _InputIterator is_sorted_until(
  [[maybe_unused]] const _Policy& __policy, _InputIterator __first, _InputIterator __last, _BinaryPredicate __pred = {})
{
  static_assert(indirect_binary_predicate<_BinaryPredicate, _InputIterator, _InputIterator>,
                "cuda::std::is_sorted_until: BinaryPredicate must satisfy "
                "indirect_binary_predicate<BinaryPredicate, InputIterator, InputIterator>");

  // Select the find_if backend implementation for the given execution policy.
  [[maybe_unused]] auto __dispatch =
    ::cuda::std::execution::__pstl_select_dispatch<::cuda::std::execution::__pstl_algorithm::__find_if, _Policy>();
  if constexpr (::cuda::std::execution::__pstl_can_dispatch<decltype(__dispatch)>)
  {
    _CCCL_NVTX_RANGE_SCOPE("cuda::std::is_sorted_until");

    // An empty range is trivially sorted; return its begin (== end).
    if (__first == __last)
    {
      return __first;
    }

    // Note we compare __first + 1 and __first, so that we do not need to negate the predicate
    // NOTE(review): `__first + 1` uses iterator arithmetic although the constraint
    // above only requires forward traversal — confirm the selected backends demand
    // random access (or that `+ 1` is valid for all accepted iterators).
    auto __result = __dispatch(
      __policy,
      ::cuda::zip_iterator{__first + 1, __first},
      ::cuda::zip_iterator{__last, __last},
      ::cuda::zip_function{::cuda::std::move(__pred)});
    // The first zipped component points at the order-breaking element (or __last
    // when none was found), which is exactly is_sorted_until's return value.
    return ::cuda::std::get<0>(__result.__iterators());
  }
  else
  {
    // No parallel backend was selected: fail loudly at compile time. The serial
    // fallback call below merely keeps the function body well-formed.
    static_assert(__always_false_v<_Policy>,
                  "Parallel cuda::std::is_sorted_until requires at least one selected backend");
    return ::cuda::std::is_sorted_until(
      ::cuda::std::move(__first), ::cuda::std::move(__last), ::cuda::std::move(__pred));
  }
}
86+
87+
_CCCL_END_NAMESPACE_ARCH_DEPENDENT
88+
89+
_CCCL_END_NAMESPACE_CUDA_STD
90+
91+
# include <cuda/std/__cccl/epilogue.h>
92+
93+
#endif // !_CCCL_COMPILER(NVRTC)
94+
95+
#endif // _CUDA_STD___PSTL_IS_SORTED_UNTIL_H

libcudacxx/include/cuda/std/__pstl_algorithm

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@
4242
#include <cuda/std/__pstl/generate.h>
4343
#include <cuda/std/__pstl/generate_n.h>
4444
#include <cuda/std/__pstl/inclusive_scan.h>
45+
#include <cuda/std/__pstl/is_sorted.h>
46+
#include <cuda/std/__pstl/is_sorted_until.h>
4547
#include <cuda/std/__pstl/merge.h>
4648
#include <cuda/std/__pstl/mismatch.h>
4749
#include <cuda/std/__pstl/none_of.h>

0 commit comments

Comments
 (0)