Skip to content

Commit 161f870

Browse files
authored
Add env overloads for Select::* in place algorithms (#8018)
* Add env overloads for Select::* in place algorithms * add missing doc details and fix version numbers
1 parent b3d0557 commit 161f870

File tree

3 files changed

+540
-2
lines changed

3 files changed

+540
-2
lines changed

cub/cub/device/device_select.cuh

Lines changed: 288 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -334,7 +334,7 @@ public:
334334
//! The total number of items selected is written to ``d_num_selected_out``.
335335
//!
336336
//! .. versionadded:: 3.4.0
337-
//! First appears in CUDA Toolkit 12.4.
337+
//! First appears in CUDA Toolkit 13.4.
338338
//!
339339
//! This is an environment-based API that allows customization of:
340340
//!
@@ -436,12 +436,104 @@ public:
436436
});
437437
}
438438

439+
//! @rst
440+
//! Uses the ``d_flags`` sequence to selectively compact items in ``d_data``.
441+
//! The total number of items selected is written to ``d_num_selected_out``.
442+
//!
443+
//! .. versionadded:: 3.4.0
444+
//! First appears in CUDA Toolkit 13.4.
445+
//!
446+
//! This is an environment-based API that allows customization of:
447+
//!
448+
//! - Stream: Query via ``cuda::get_stream``
449+
//! - Memory resource: Query via ``cuda::mr::get_memory_resource``
450+
//!
451+
//! - The value type of ``d_flags`` must be castable to ``bool`` (e.g., ``bool``, ``char``, ``int``, etc.).
452+
//! - Copies of the selected items are compacted in-place and maintain their original relative ordering.
453+
//! - | The ``d_data`` may equal ``d_flags``. The range ``[d_data, d_data + num_items)`` shall not overlap
454+
//! | ``[d_flags, d_flags + num_items)`` in any other way.
455+
//!
456+
//!
457+
//! Snippet
458+
//! +++++++++++++++++++++++++++++++++++++++++++++
459+
//!
460+
//! The code snippet below illustrates the in-place compaction of items selected from an ``int`` device vector
461+
//! using environment-based API:
462+
//!
463+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_select_env_api.cu
464+
//! :language: c++
465+
//! :dedent:
466+
//! :start-after: example-begin select-flagged-inplace-env
467+
//! :end-before: example-end select-flagged-inplace-env
468+
//!
469+
//! @endrst
470+
//!
471+
//! @tparam IteratorT
472+
//! **[inferred]** Random-access iterator type for reading and writing selected items @iterator
473+
//!
474+
//! @tparam FlagIterator
475+
//! **[inferred]** Random-access input iterator type for reading selection flags @iterator
476+
//!
477+
//! @tparam NumSelectedIteratorT
478+
//! **[inferred]** Output iterator type for recording the number of items selected @iterator
479+
//!
480+
//! @tparam NumItemsT
481+
//! **[inferred]** Type of num_items
482+
//!
483+
//! @tparam EnvT
484+
//! **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`)
485+
//!
486+
//! @param[in,out] d_data
487+
//! Pointer to the sequence of data items
488+
//!
489+
//! @param[in] d_flags
490+
//! Pointer to the input sequence of selection flags
491+
//!
492+
//! @param[out] d_num_selected_out
493+
//! Pointer to the output total number of items selected
494+
//!
495+
//! @param[in] num_items
496+
//! Total number of input items (i.e., length of `d_data`)
497+
//!
498+
//! @param[in] env
499+
//! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
500+
template <typename IteratorT,
501+
typename FlagIterator,
502+
typename NumSelectedIteratorT,
503+
typename NumItemsT,
504+
typename EnvT = ::cuda::std::execution::env<>,
505+
::cuda::std::enable_if_t<::cuda::std::is_integral_v<NumItemsT> && !::cuda::std::is_same_v<IteratorT, void*>
506+
&& !::cuda::std::is_same_v<FlagIterator, size_t&>,
507+
int> = 0>
508+
[[nodiscard]] CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Flagged(
509+
IteratorT d_data, FlagIterator d_flags, NumSelectedIteratorT d_num_selected_out, NumItemsT num_items, EnvT env = {})
510+
{
511+
_CCCL_NVTX_RANGE_SCOPE("cub::DeviceSelect::Flagged");
512+
513+
using offset_t = detail::choose_offset_t<NumItemsT>;
514+
515+
return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) {
516+
using tuning_t = decltype(tuning);
517+
return select_impl<tuning_t, SelectImpl::SelectPotentiallyInPlace>(
518+
storage,
519+
bytes,
520+
d_data,
521+
d_flags,
522+
d_data,
523+
d_num_selected_out,
524+
static_cast<offset_t>(num_items),
525+
NullType{},
526+
NullType{},
527+
stream);
528+
});
529+
}
530+
439531
//! @rst
440532
//! Uses the ``select_op`` functor to selectively copy items from ``d_in`` into ``d_out``.
441533
//! The total number of items selected is written to ``d_num_selected_out``.
442534
//!
443535
//! .. versionadded:: 3.4.0
444-
//! First appears in CUDA Toolkit 12.4.
536+
//! First appears in CUDA Toolkit 13.4.
445537
//!
446538
//! This is an environment-based API that allows customization of:
447539
//!
@@ -542,6 +634,95 @@ public:
542634
});
543635
}
544636

637+
//! @rst
638+
//! Uses the ``select_op`` functor to selectively compact items in ``d_data``.
639+
//! The total number of items selected is written to ``d_num_selected_out``.
640+
//!
641+
//! .. versionadded:: 3.4.0
642+
//! First appears in CUDA Toolkit 13.4.
643+
//!
644+
//! This is an environment-based API that allows customization of:
645+
//!
646+
//! - Stream: Query via ``cuda::get_stream``
647+
//! - Memory resource: Query via ``cuda::mr::get_memory_resource``
648+
//!
649+
//! - Copies of the selected items are compacted in ``d_data`` and maintain
650+
//! their original relative ordering.
651+
//!
652+
//!
653+
//! Snippet
654+
//! +++++++++++++++++++++++++++++++++++++++++++++
655+
//!
656+
//! The code snippet below illustrates the in-place compaction of items selected from an ``int`` device vector
657+
//! using environment-based API:
658+
//!
659+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_select_env_api.cu
660+
//! :language: c++
661+
//! :dedent:
662+
//! :start-after: example-begin select-if-inplace-env
663+
//! :end-before: example-end select-if-inplace-env
664+
//!
665+
//! @endrst
666+
//!
667+
//! @tparam IteratorT
668+
//! **[inferred]** Random-access iterator type for reading and writing items @iterator
669+
//!
670+
//! @tparam NumSelectedIteratorT
671+
//! **[inferred]** Output iterator type for recording the number of items selected @iterator
672+
//!
673+
//! @tparam SelectOp
674+
//! **[inferred]** Selection operator type having member `bool operator()(const T &a)`
675+
//!
676+
//! @tparam NumItemsT
677+
//! **[inferred]** Type of num_items
678+
//!
679+
//! @tparam EnvT
680+
//! **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`)
681+
//!
682+
//! @param[in,out] d_data
683+
//! Pointer to the sequence of data items
684+
//!
685+
//! @param[out] d_num_selected_out
686+
//! Pointer to the output total number of items selected
687+
//!
688+
//! @param[in] num_items
689+
//! Total number of input items (i.e., length of `d_data`)
690+
//!
691+
//! @param[in] select_op
692+
//! Unary selection operator
693+
//!
694+
//! @param[in] env
695+
//! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
696+
template <typename IteratorT,
697+
typename NumSelectedIteratorT,
698+
typename SelectOp,
699+
typename NumItemsT,
700+
typename EnvT = ::cuda::std::execution::env<>,
701+
::cuda::std::enable_if_t<::cuda::std::is_integral_v<NumItemsT> && !::cuda::std::is_same_v<IteratorT, void*>,
702+
int> = 0>
703+
[[nodiscard]] CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t
704+
If(IteratorT d_data, NumSelectedIteratorT d_num_selected_out, NumItemsT num_items, SelectOp select_op, EnvT env = {})
705+
{
706+
_CCCL_NVTX_RANGE_SCOPE("cub::DeviceSelect::If");
707+
708+
using offset_t = detail::choose_offset_t<NumItemsT>;
709+
710+
return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) {
711+
using tuning_t = decltype(tuning);
712+
return select_impl<tuning_t, SelectImpl::SelectPotentiallyInPlace>(
713+
storage,
714+
bytes,
715+
d_data,
716+
static_cast<NullType*>(nullptr),
717+
d_data,
718+
d_num_selected_out,
719+
static_cast<offset_t>(num_items),
720+
select_op,
721+
NullType{},
722+
stream);
723+
});
724+
}
725+
545726
//! @rst
546727
//! Uses the ``d_flags`` sequence to selectively compact the items in `d_data``.
547728
//! The total number of items selected is written to ``d_num_selected_out``.
@@ -1280,6 +1461,111 @@ public:
12801461
});
12811462
}
12821463

1464+
//! @rst
1465+
//! Uses the ``select_op`` functor applied to ``d_flags`` to selectively compact
1466+
//! items in ``d_data``.
1467+
//! The total number of items selected is written to ``d_num_selected_out``.
1468+
//!
1469+
//! .. versionadded:: 3.4.0
1470+
//! First appears in CUDA Toolkit 13.4.
1471+
//!
1472+
//! This is an environment-based API that allows customization of:
1473+
//!
1474+
//! - Stream: Query via ``cuda::get_stream``
1475+
//! - Memory resource: Query via ``cuda::mr::get_memory_resource``
1476+
//!
1477+
//! - The expression ``select_op(flag)`` must be convertible to ``bool``,
1478+
//! where the type of ``flag`` corresponds to the value type of ``FlagIterator``.
1479+
//! - Copies of the selected items are compacted in-place and maintain their original relative ordering.
1480+
//! - | The ``d_data`` may equal ``d_flags``. The range ``[d_data, d_data + num_items)`` shall not overlap
1481+
//! | ``[d_flags, d_flags + num_items)`` in any other way.
1482+
//!
1483+
//!
1484+
//! Snippet
1485+
//! +++++++++++++++++++++++++++++++++++++++++++++
1486+
//!
1487+
//! The code snippet below illustrates the in-place compaction of items selected from an ``int`` device vector
1488+
//! using environment-based API:
1489+
//!
1490+
//! .. literalinclude:: ../../../cub/test/catch2_test_device_select_env_api.cu
1491+
//! :language: c++
1492+
//! :dedent:
1493+
//! :start-after: example-begin select-flaggedif-inplace-env
1494+
//! :end-before: example-end select-flaggedif-inplace-env
1495+
//!
1496+
//! @endrst
1497+
//!
1498+
//! @tparam IteratorT
1499+
//! **[inferred]** Random-access iterator type for reading and writing selected items @iterator
1500+
//!
1501+
//! @tparam FlagIterator
1502+
//! **[inferred]** Random-access input iterator type for reading selection flags @iterator
1503+
//!
1504+
//! @tparam NumSelectedIteratorT
1505+
//! **[inferred]** Output iterator type for recording the number of items selected @iterator
1506+
//!
1507+
//! @tparam SelectOp
1508+
//! **[inferred]** Selection operator type having member `bool operator()(const T &a)`
1509+
//!
1510+
//! @tparam NumItemsT
1511+
//! **[inferred]** Type of num_items
1512+
//!
1513+
//! @tparam EnvT
1514+
//! **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`)
1515+
//!
1516+
//! @param[in,out] d_data
1517+
//! Pointer to the sequence of data items
1518+
//!
1519+
//! @param[in] d_flags
1520+
//! Pointer to the input sequence of selection flags
1521+
//!
1522+
//! @param[out] d_num_selected_out
1523+
//! Pointer to the output total number of items selected
1524+
//!
1525+
//! @param[in] num_items
1526+
//! Total number of input items (i.e., length of `d_data`)
1527+
//!
1528+
//! @param[in] select_op
1529+
//! Unary selection operator
1530+
//!
1531+
//! @param[in] env
1532+
//! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
1533+
template <
1534+
typename IteratorT,
1535+
typename FlagIterator,
1536+
typename NumSelectedIteratorT,
1537+
typename SelectOp,
1538+
typename NumItemsT,
1539+
typename EnvT = ::cuda::std::execution::env<>,
1540+
::cuda::std::enable_if_t<::cuda::std::is_integral_v<NumItemsT> && !::cuda::std::is_same_v<IteratorT, void*>, int> = 0>
1541+
[[nodiscard]] CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t FlaggedIf(
1542+
IteratorT d_data,
1543+
FlagIterator d_flags,
1544+
NumSelectedIteratorT d_num_selected_out,
1545+
NumItemsT num_items,
1546+
SelectOp select_op,
1547+
EnvT env = {})
1548+
{
1549+
_CCCL_NVTX_RANGE_SCOPE("cub::DeviceSelect::FlaggedIf");
1550+
1551+
using offset_t = detail::choose_offset_t<NumItemsT>;
1552+
1553+
return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) {
1554+
using tuning_t = decltype(tuning);
1555+
return select_impl<tuning_t, SelectImpl::SelectPotentiallyInPlace>(
1556+
storage,
1557+
bytes,
1558+
d_data,
1559+
d_flags,
1560+
d_data,
1561+
d_num_selected_out,
1562+
static_cast<offset_t>(num_items),
1563+
select_op,
1564+
NullType{},
1565+
stream);
1566+
});
1567+
}
1568+
12831569
//! @rst
12841570
//! Given an input sequence ``d_in`` having runs of consecutive equal-valued keys,
12851571
//! only the first key from each run is selectively copied to ``d_out``.

0 commit comments

Comments
 (0)