@@ -334,7 +334,7 @@ public:
334334 // ! The total number of items selected is written to ``d_num_selected_out``.
335335 // !
336336 // ! .. versionadded:: 3.4.0
337- // ! First appears in CUDA Toolkit 12 .4.
337+ // ! First appears in CUDA Toolkit 13 .4.
338338 // !
339339 // ! This is an environment-based API that allows customization of:
340340 // !
@@ -436,12 +436,104 @@ public:
436436 });
437437 }
438438
439+ // ! @rst
440+ // ! Uses the ``d_flags`` sequence to selectively compact items in ``d_data``.
441+ // ! The total number of items selected is written to ``d_num_selected_out``.
442+ // !
443+ // ! .. versionadded:: 3.4.0
444+ // ! First appears in CUDA Toolkit 13.4.
445+ // !
446+ // ! This is an environment-based API that allows customization of:
447+ // !
448+ // ! - Stream: Query via ``cuda::get_stream``
449+ // ! - Memory resource: Query via ``cuda::mr::get_memory_resource``
450+ // !
451+ // ! - The value type of ``d_flags`` must be castable to ``bool`` (e.g., ``bool``, ``char``, ``int``, etc.).
452+ // ! - Copies of the selected items are compacted in-place and maintain their original relative ordering.
453+ // ! - | The ``d_data`` may equal ``d_flags``. The range ``[d_data, d_data + num_items)`` shall not overlap
454+ // ! | ``[d_flags, d_flags + num_items)`` in any other way.
455+ // !
456+ // !
457+ // ! Snippet
458+ // ! +++++++++++++++++++++++++++++++++++++++++++++
459+ // !
460+ // ! The code snippet below illustrates the in-place compaction of items selected from an ``int`` device vector
461+ // ! using environment-based API:
462+ // !
463+ // ! .. literalinclude:: ../../../cub/test/catch2_test_device_select_env_api.cu
464+ // ! :language: c++
465+ // ! :dedent:
466+ // ! :start-after: example-begin select-flagged-inplace-env
467+ // ! :end-before: example-end select-flagged-inplace-env
468+ // !
469+ // ! @endrst
470+ // !
471+ // ! @tparam IteratorT
472+ // ! **[inferred]** Random-access iterator type for reading and writing selected items @iterator
473+ // !
474+ // ! @tparam FlagIterator
475+ // ! **[inferred]** Random-access input iterator type for reading selection flags @iterator
476+ // !
477+ // ! @tparam NumSelectedIteratorT
478+ // ! **[inferred]** Output iterator type for recording the number of items selected @iterator
479+ // !
480+ // ! @tparam NumItemsT
481+ // ! **[inferred]** Type of num_items
482+ // !
483+ // ! @tparam EnvT
484+ // ! **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`)
485+ // !
486+ // ! @param[in,out] d_data
487+ // ! Pointer to the sequence of data items
488+ // !
489+ // ! @param[in] d_flags
490+ // ! Pointer to the input sequence of selection flags
491+ // !
492+ // ! @param[out] d_num_selected_out
493+ // ! Pointer to the output total number of items selected
494+ // !
495+ // ! @param[in] num_items
496+ // ! Total number of input items (i.e., length of `d_data`)
497+ // !
498+ // ! @param[in] env
499+ // ! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
500+ template <typename IteratorT,
501+ typename FlagIterator,
502+ typename NumSelectedIteratorT,
503+ typename NumItemsT,
504+ typename EnvT = ::cuda::std::execution::env<>,
505+ ::cuda::std::enable_if_t <::cuda::std::is_integral_v<NumItemsT> && !::cuda::std::is_same_v<IteratorT, void *>
506+ && !::cuda::std::is_same_v<FlagIterator, size_t &>,
507+ int > = 0 >
508+ [[nodiscard]] CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Flagged (
509+ IteratorT d_data, FlagIterator d_flags, NumSelectedIteratorT d_num_selected_out, NumItemsT num_items, EnvT env = {})
510+ {
511+ _CCCL_NVTX_RANGE_SCOPE (" cub::DeviceSelect::Flagged" );
512+
513+ using offset_t = detail::choose_offset_t <NumItemsT>;
514+
515+ return detail::dispatch_with_env (env, [&]([[maybe_unused]] auto tuning, void * storage, size_t & bytes, auto stream) {
516+ using tuning_t = decltype (tuning);
517+ return select_impl<tuning_t , SelectImpl::SelectPotentiallyInPlace>(
518+ storage,
519+ bytes,
520+ d_data,
521+ d_flags,
522+ d_data,
523+ d_num_selected_out,
524+ static_cast <offset_t >(num_items),
525+ NullType{},
526+ NullType{},
527+ stream);
528+ });
529+ }
530+
439531 // ! @rst
440532 // ! Uses the ``select_op`` functor to selectively copy items from ``d_in`` into ``d_out``.
441533 // ! The total number of items selected is written to ``d_num_selected_out``.
442534 // !
443535 // ! .. versionadded:: 3.4.0
444- // ! First appears in CUDA Toolkit 12 .4.
536+ // ! First appears in CUDA Toolkit 13 .4.
445537 // !
446538 // ! This is an environment-based API that allows customization of:
447539 // !
@@ -542,6 +634,95 @@ public:
542634 });
543635 }
544636
637+ // ! @rst
638+ // ! Uses the ``select_op`` functor to selectively compact items in ``d_data``.
639+ // ! The total number of items selected is written to ``d_num_selected_out``.
640+ // !
641+ // ! .. versionadded:: 3.4.0
642+ // ! First appears in CUDA Toolkit 13.4.
643+ // !
644+ // ! This is an environment-based API that allows customization of:
645+ // !
646+ // ! - Stream: Query via ``cuda::get_stream``
647+ // ! - Memory resource: Query via ``cuda::mr::get_memory_resource``
648+ // !
649+ // ! - Copies of the selected items are compacted in ``d_data`` and maintain
650+ // ! their original relative ordering.
651+ // !
652+ // !
653+ // ! Snippet
654+ // ! +++++++++++++++++++++++++++++++++++++++++++++
655+ // !
656+ // ! The code snippet below illustrates the in-place compaction of items selected from an ``int`` device vector
657+ // ! using environment-based API:
658+ // !
659+ // ! .. literalinclude:: ../../../cub/test/catch2_test_device_select_env_api.cu
660+ // ! :language: c++
661+ // ! :dedent:
662+ // ! :start-after: example-begin select-if-inplace-env
663+ // ! :end-before: example-end select-if-inplace-env
664+ // !
665+ // ! @endrst
666+ // !
667+ // ! @tparam IteratorT
668+ // ! **[inferred]** Random-access iterator type for reading and writing items @iterator
669+ // !
670+ // ! @tparam NumSelectedIteratorT
671+ // ! **[inferred]** Output iterator type for recording the number of items selected @iterator
672+ // !
673+ // ! @tparam SelectOp
674+ // ! **[inferred]** Selection operator type having member `bool operator()(const T &a)`
675+ // !
676+ // ! @tparam NumItemsT
677+ // ! **[inferred]** Type of num_items
678+ // !
679+ // ! @tparam EnvT
680+ // ! **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`)
681+ // !
682+ // ! @param[in,out] d_data
683+ // ! Pointer to the sequence of data items
684+ // !
685+ // ! @param[out] d_num_selected_out
686+ // ! Pointer to the output total number of items selected
687+ // !
688+ // ! @param[in] num_items
689+ // ! Total number of input items (i.e., length of `d_data`)
690+ // !
691+ // ! @param[in] select_op
692+ // ! Unary selection operator
693+ // !
694+ // ! @param[in] env
695+ // ! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
696+ template <typename IteratorT,
697+ typename NumSelectedIteratorT,
698+ typename SelectOp,
699+ typename NumItemsT,
700+ typename EnvT = ::cuda::std::execution::env<>,
701+ ::cuda::std::enable_if_t <::cuda::std::is_integral_v<NumItemsT> && !::cuda::std::is_same_v<IteratorT, void *>,
702+ int > = 0 >
703+ [[nodiscard]] CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t
704+ If (IteratorT d_data, NumSelectedIteratorT d_num_selected_out, NumItemsT num_items, SelectOp select_op, EnvT env = {})
705+ {
706+ _CCCL_NVTX_RANGE_SCOPE (" cub::DeviceSelect::If" );
707+
708+ using offset_t = detail::choose_offset_t <NumItemsT>;
709+
710+ return detail::dispatch_with_env (env, [&]([[maybe_unused]] auto tuning, void * storage, size_t & bytes, auto stream) {
711+ using tuning_t = decltype (tuning);
712+ return select_impl<tuning_t , SelectImpl::SelectPotentiallyInPlace>(
713+ storage,
714+ bytes,
715+ d_data,
716+ static_cast <NullType*>(nullptr ),
717+ d_data,
718+ d_num_selected_out,
719+ static_cast <offset_t >(num_items),
720+ select_op,
721+ NullType{},
722+ stream);
723+ });
724+ }
725+
545726 // ! @rst
546727 // ! Uses the ``d_flags`` sequence to selectively compact the items in `d_data``.
547728 // ! The total number of items selected is written to ``d_num_selected_out``.
@@ -1280,6 +1461,111 @@ public:
12801461 });
12811462 }
12821463
1464+ // ! @rst
1465+ // ! Uses the ``select_op`` functor applied to ``d_flags`` to selectively compact
1466+ // ! items in ``d_data``.
1467+ // ! The total number of items selected is written to ``d_num_selected_out``.
1468+ // !
1469+ // ! .. versionadded:: 3.4.0
1470+ // ! First appears in CUDA Toolkit 13.4.
1471+ // !
1472+ // ! This is an environment-based API that allows customization of:
1473+ // !
1474+ // ! - Stream: Query via ``cuda::get_stream``
1475+ // ! - Memory resource: Query via ``cuda::mr::get_memory_resource``
1476+ // !
1477+ // ! - The expression ``select_op(flag)`` must be convertible to ``bool``,
1478+ // ! where the type of ``flag`` corresponds to the value type of ``FlagIterator``.
1479+ // ! - Copies of the selected items are compacted in-place and maintain their original relative ordering.
1480+ // ! - | The ``d_data`` may equal ``d_flags``. The range ``[d_data, d_data + num_items)`` shall not overlap
1481+ // ! | ``[d_flags, d_flags + num_items)`` in any other way.
1482+ // !
1483+ // !
1484+ // ! Snippet
1485+ // ! +++++++++++++++++++++++++++++++++++++++++++++
1486+ // !
1487+ // ! The code snippet below illustrates the in-place compaction of items selected from an ``int`` device vector
1488+ // ! using environment-based API:
1489+ // !
1490+ // ! .. literalinclude:: ../../../cub/test/catch2_test_device_select_env_api.cu
1491+ // ! :language: c++
1492+ // ! :dedent:
1493+ // ! :start-after: example-begin select-flaggedif-inplace-env
1494+ // ! :end-before: example-end select-flaggedif-inplace-env
1495+ // !
1496+ // ! @endrst
1497+ // !
1498+ // ! @tparam IteratorT
1499+ // ! **[inferred]** Random-access iterator type for reading and writing selected items @iterator
1500+ // !
1501+ // ! @tparam FlagIterator
1502+ // ! **[inferred]** Random-access input iterator type for reading selection flags @iterator
1503+ // !
1504+ // ! @tparam NumSelectedIteratorT
1505+ // ! **[inferred]** Output iterator type for recording the number of items selected @iterator
1506+ // !
1507+ // ! @tparam SelectOp
1508+ // ! **[inferred]** Selection operator type having member `bool operator()(const T &a)`
1509+ // !
1510+ // ! @tparam NumItemsT
1511+ // ! **[inferred]** Type of num_items
1512+ // !
1513+ // ! @tparam EnvT
1514+ // ! **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`)
1515+ // !
1516+ // ! @param[in,out] d_data
1517+ // ! Pointer to the sequence of data items
1518+ // !
1519+ // ! @param[in] d_flags
1520+ // ! Pointer to the input sequence of selection flags
1521+ // !
1522+ // ! @param[out] d_num_selected_out
1523+ // ! Pointer to the output total number of items selected
1524+ // !
1525+ // ! @param[in] num_items
1526+ // ! Total number of input items (i.e., length of `d_data`)
1527+ // !
1528+ // ! @param[in] select_op
1529+ // ! Unary selection operator
1530+ // !
1531+ // ! @param[in] env
1532+ // ! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
1533+ template <
1534+ typename IteratorT,
1535+ typename FlagIterator,
1536+ typename NumSelectedIteratorT,
1537+ typename SelectOp,
1538+ typename NumItemsT,
1539+ typename EnvT = ::cuda::std::execution::env<>,
1540+ ::cuda::std::enable_if_t <::cuda::std::is_integral_v<NumItemsT> && !::cuda::std::is_same_v<IteratorT, void *>, int > = 0 >
1541+ [[nodiscard]] CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t FlaggedIf (
1542+ IteratorT d_data,
1543+ FlagIterator d_flags,
1544+ NumSelectedIteratorT d_num_selected_out,
1545+ NumItemsT num_items,
1546+ SelectOp select_op,
1547+ EnvT env = {})
1548+ {
1549+ _CCCL_NVTX_RANGE_SCOPE (" cub::DeviceSelect::FlaggedIf" );
1550+
1551+ using offset_t = detail::choose_offset_t <NumItemsT>;
1552+
1553+ return detail::dispatch_with_env (env, [&]([[maybe_unused]] auto tuning, void * storage, size_t & bytes, auto stream) {
1554+ using tuning_t = decltype (tuning);
1555+ return select_impl<tuning_t , SelectImpl::SelectPotentiallyInPlace>(
1556+ storage,
1557+ bytes,
1558+ d_data,
1559+ d_flags,
1560+ d_data,
1561+ d_num_selected_out,
1562+ static_cast <offset_t >(num_items),
1563+ select_op,
1564+ NullType{},
1565+ stream);
1566+ });
1567+ }
1568+
12831569 // ! @rst
12841570 // ! Given an input sequence ``d_in`` having runs of consecutive equal-valued keys,
12851571 // ! only the first key from each run is selectively copied to ``d_out``.
0 commit comments