@@ -572,11 +572,8 @@ inline _CUDA_VSTD::uint64_t * barrier_native_handle(barrier<thread_scope_block>
 
 #if defined(_CCCL_CUDA_COMPILER)
 
-// Hide arrive_tx when CUDA architecture is insufficient. Note the
-// (!defined(__CUDA_MINIMUM_ARCH__)). This is required to make sure the function
-// does not get removed by cudafe, which does not define __CUDA_MINIMUM_ARCH__.
-#if (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__))
-
+#if __cccl_ptx_isa >= 800
+extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_barrier_arrive_tx_is_not_supported_before_SM_90__();
 _LIBCUDACXX_NODISCARD_ATTRIBUTE _LIBCUDACXX_DEVICE inline
 barrier<thread_scope_block>::arrival_token barrier_arrive_tx(
     barrier<thread_scope_block> & __b,
@@ -591,7 +588,7 @@ barrier<thread_scope_block>::arrival_token barrier_arrive_tx(
   _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1.");
 
   barrier<thread_scope_block>::arrival_token __token = {};
-  NV_IF_TARGET(
+  NV_IF_ELSE_TARGET(
     // On architectures pre-sm90, arrive_tx is not supported.
     NV_PROVIDES_SM_90, (
       // We do not check for the statespace of the barrier here. This is
@@ -619,11 +616,47 @@ barrier<thread_scope_block>::arrival_token barrier_arrive_tx(
           _CUDA_VPTX::sem_release, _CUDA_VPTX::scope_cta, _CUDA_VPTX::space_shared, __native_handle, __arrive_count_update
         );
       }
+    ),(
+      __cuda_ptx_barrier_arrive_tx_is_not_supported_before_SM_90__();
     )
   );
   return __token;
 }
 
+extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_barrier_expect_tx_is_not_supported_before_SM_90__();
+_LIBCUDACXX_DEVICE inline
+void barrier_expect_tx(
+    barrier<thread_scope_block> & __b,
+    _CUDA_VSTD::ptrdiff_t __transaction_count_update) {
+
+  _LIBCUDACXX_DEBUG_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory.");
+  _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update >= 0, "Transaction count update must be non-negative.");
+  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#contents-of-the-mbarrier-object
+  _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1.");
+
+  // We do not check for the statespace of the barrier here. This is
+  // on purpose. This allows debugging tools like memcheck/racecheck
+  // to detect that we are passing a pointer with the wrong state
+  // space to mbarrier.arrive. If we checked for the state space here,
+  // and __trap() if wrong, then those tools would not be able to help
+  // us in release builds. In debug builds, the error would be caught
+  // by the asserts at the top of this function.
+  NV_IF_ELSE_TARGET(
+    // On architectures pre-sm90, arrive_tx is not supported.
+    NV_PROVIDES_SM_90, (
+      auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b));
+      asm (
+        "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;"
+        :
+        : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)),
+          "r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update))
+        : "memory");
+    ),(
+      __cuda_ptx_barrier_expect_tx_is_not_supported_before_SM_90__();
+    ));
+}
+
+extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_memcpy_async_tx_is_not_supported_before_SM_90__();
 template <typename _Tp, _CUDA_VSTD::size_t _Alignment>
 _LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx(
     _Tp* __dest,
@@ -643,6 +676,7 @@ _LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx(
   _LIBCUDACXX_DEBUG_ASSERT(__isShared(__dest), "dest must point to shared memory.");
   _LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__src), "src must point to global memory.");
 
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
   auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b));
   if (__isShared(__dest) && __isGlobal(__src)) {
     asm volatile(
@@ -660,36 +694,13 @@ _LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx(
     // is not yet implemented. So we trap in this case as well.
     _LIBCUDACXX_UNREACHABLE();
   }
+  ),(
+    __cuda_ptx_memcpy_async_tx_is_not_supported_before_SM_90__();
+  ));
 
   return async_contract_fulfillment::async;
 }
-
-_LIBCUDACXX_DEVICE inline
-void barrier_expect_tx(
-    barrier<thread_scope_block> & __b,
-    _CUDA_VSTD::ptrdiff_t __transaction_count_update) {
-
-  _LIBCUDACXX_DEBUG_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory.");
-  _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update >= 0, "Transaction count update must be non-negative.");
-  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#contents-of-the-mbarrier-object
-  _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1.");
-
-  // We do not check for the statespace of the barrier here. This is
-  // on purpose. This allows debugging tools like memcheck/racecheck
-  // to detect that we are passing a pointer with the wrong state
-  // space to mbarrier.arrive. If we checked for the state space here,
-  // and __trap() if wrong, then those tools would not be able to help
-  // us in release builds. In debug builds, the error would be caught
-  // by the asserts at the top of this function.
-  auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b));
-  asm (
-    "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;"
-    :
-    : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)),
-      "r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update))
-    : "memory");
-}
-#endif // __CUDA_MINIMUM_ARCH__
+#endif // __cccl_ptx_isa >= 800
 #endif // _CCCL_CUDA_COMPILER
 
 _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE
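
For context (not part of the diff): a rough usage sketch of the cuda::device arrive-tx/expect-tx/memcpy-tx entry points that the functions above implement, loosely following the libcu++ documentation. The kernel and the names example_kernel, smem_x, and gmem_x are illustrative assumptions, not code from this change; it assumes an sm_90 target and <cuda/barrier>.

#include <cuda/barrier>
#include <cuda/std/utility>  // cuda::std::move

__device__ alignas(16) int gmem_x[1024];

__global__ void example_kernel() {
  using barrier_t = cuda::barrier<cuda::thread_scope_block>;
  __shared__ alignas(16) int smem_x[1024];
  __shared__ barrier_t bar;
  if (threadIdx.x == 0) {
    init(&bar, blockDim.x);  // one arrival per thread
  }
  __syncthreads();

  barrier_t::arrival_token token;
  if (threadIdx.x == 0) {
    // The leader issues the async copy and arrives with the expected byte count.
    cuda::device::memcpy_async_tx(smem_x, gmem_x, cuda::aligned_size_t<16>(sizeof(smem_x)), bar);
    token = cuda::device::barrier_arrive_tx(bar, 1, sizeof(smem_x));
  } else {
    // The other threads arrive without updating the transaction count.
    token = bar.arrive(1);
  }
  bar.wait(cuda::std::move(token));  // returns once all arrivals and all copied bytes are in

  smem_x[threadIdx.x] += 1;  // smem_x now holds the contents of gmem_x
}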
@@ -796,13 +807,15 @@ struct __memcpy_completion_impl {
       // bulk group to be used with shared memory barriers.
       _LIBCUDACXX_UNREACHABLE();
     case __completion_mechanism::__mbarrier_complete_tx:
+#if __cccl_ptx_isa >= 800
       // Pre-sm90, the mbarrier_complete_tx completion mechanism is not available.
       NV_IF_TARGET(NV_PROVIDES_SM_90, (
         // Only perform the expect_tx operation with the leader thread
         if (__group.thread_rank() == 0) {
           ::cuda::device::barrier_expect_tx(__barrier, __size);
         }
       ));
+#endif // __cccl_ptx_isa >= 800
       return async_contract_fulfillment::async;
     case __completion_mechanism::__sync:
       // sync: In this case, we do not need to do anything. The user will have
@@ -929,11 +942,13 @@ struct __memcpy_completion_impl {
  * 5. normal synchronous copy (fallback)
  ***********************************************************************/
 
-#if (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__))
+#if __cccl_ptx_isa >= 800
+extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_cp_async_bulk_shared_global_is_not_supported_before_SM_90__();
 template <typename _Group>
 inline __device__
 void __cp_async_bulk_shared_global(const _Group &__g, char * __dest, const char * __src, size_t __size, uint64_t *__bar_handle) {
   // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
   if (__g.thread_rank() == 0) {
     asm volatile(
       "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];\n"
@@ -944,10 +959,13 @@ void __cp_async_bulk_shared_global(const _Group &__g, char * __dest, const char
       "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__bar_handle)))
       : "memory");
   }
+  ),(
+    __cuda_ptx_cp_async_bulk_shared_global_is_not_supported_before_SM_90__();
+  ));
 }
-#endif // __CUDA_MINIMUM_ARCH__
+#endif // __cccl_ptx_isa >= 800
 
-#if (defined(__CUDA_MINIMUM_ARCH__) && 800 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__))
+extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__();
 template <size_t _Copy_size>
 inline __device__
 void __cp_async_shared_global(char * __dest, const char * __src) {
@@ -959,27 +977,35 @@ void __cp_async_shared_global(char * __dest, const char * __src) {
   static_assert(_Copy_size == 4 || _Copy_size == 8 || _Copy_size == 16, "cp.async.shared.global requires a copy size of 4, 8, or 16.");
 #endif // _LIBCUDACXX_STD_VER >= 17
 
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,(
   asm volatile(
     "cp.async.ca.shared.global [%0], [%1], %2, %2;"
     :
     : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__dest))),
       "l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))),
       "n"(_Copy_size)
     : "memory");
+  ),(
+    __cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__();
+  ));
 }
 
 template <>
 inline __device__
 void __cp_async_shared_global<16>(char * __dest, const char * __src) {
   // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async
   // When copying 16 bytes, it is possible to skip L1 cache (.cg).
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,(
   asm volatile(
     "cp.async.cg.shared.global [%0], [%1], %2, %2;"
     :
     : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__dest))),
       "l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))),
       "n"(16)
     : "memory");
+  ),(
+    __cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__();
+  ));
 }
 
 template <size_t _Alignment, typename _Group>
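
Aside (not part of the diff): the recurring pattern in this change is an extern "C" device function that is declared but never defined, called from the else branch of NV_IF_ELSE_TARGET. If a call on an unsupported architecture survives into device code, the device link step fails and the symbol name spells out which feature and minimum SM is missing, instead of the guard silently compiling the code away. A minimal standalone sketch of that pattern, with hypothetical names (my_feature, __my_feature_is_not_supported_before_SM_80__) that are not part of this change:

#include <nv/target>

// Declared, never defined: a surviving call turns into a link-time error
// whose message names the missing feature and the required architecture.
extern "C" __device__ void __my_feature_is_not_supported_before_SM_80__();

__device__ void my_feature() {
  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80, (
    // Real implementation, only emitted for sm_80 and newer.
    asm volatile("membar.cta;" ::: "memory");
  ), (
    // Pre-sm_80: emit a call to the undefined symbol instead of __trap().
    __my_feature_is_not_supported_before_SM_80__();
  ));
}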
@@ -1002,7 +1028,6 @@ void __cp_async_shared_global_mechanism(_Group __g, char * __dest, const char *
     __cp_async_shared_global<__copy_size>(__dest + __offset, __src + __offset);
   }
 }
-#endif // __CUDA_MINIMUM_ARCH__
 
 template <size_t _Copy_size>
 struct __copy_chunk {
@@ -1083,6 +1108,7 @@ __completion_mechanism __dispatch_memcpy_async_any_to_any(_Group const & __group
 template <_CUDA_VSTD::size_t _Align, typename _Group>
 _LIBCUDACXX_NODISCARD_ATTRIBUTE _LIBCUDACXX_DEVICE inline
 __completion_mechanism __dispatch_memcpy_async_global_to_shared(_Group const & __group, char * __dest_char, char const * __src_char, _CUDA_VSTD::size_t __size, uint32_t __allowed_completions, uint64_t * __bar_handle) {
+#if __cccl_ptx_isa >= 800
   NV_IF_TARGET(NV_PROVIDES_SM_90, (
     const bool __can_use_complete_tx = __allowed_completions & uint32_t(__completion_mechanism::__mbarrier_complete_tx);
     _LIBCUDACXX_DEBUG_ASSERT(__can_use_complete_tx == (nullptr != __bar_handle), "Pass non-null bar_handle if and only if can_use_complete_tx.");
@@ -1094,6 +1120,7 @@ __completion_mechanism __dispatch_memcpy_async_global_to_shared(_Group const & _
     }
     // Fallthrough to SM 80..
   ));
+#endif // __cccl_ptx_isa >= 800
 
   NV_IF_TARGET(NV_PROVIDES_SM_80, (
     if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (_Align >= 4) {
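
Aside (not part of the diff): the __dispatch_* helpers above sit underneath the public cuda::memcpy_async overloads; which completion mechanism and copy path is selected depends on the target SM, the alignment hint, and whether a barrier handle is available. A rough caller-side sketch, assuming a 16-byte alignment hint and a size that is a multiple of 16; the kernel name, shared-memory layout, and sizes are illustrative, not taken from this change:

#include <cooperative_groups.h>
#include <cuda/barrier>

__global__ void stage_tile(const int* global_in, size_t n_ints) {
  namespace cg = cooperative_groups;
  auto block = cg::this_thread_block();
  extern __shared__ int smem[];  // launched with n_ints * sizeof(int) bytes of dynamic shared memory

  __shared__ cuda::barrier<cuda::thread_scope_block> bar;
  if (block.thread_rank() == 0) {
    init(&bar, block.size());
  }
  block.sync();

  // The aligned_size_t<16> hint is what lets the dispatch choose the bulk (sm_90)
  // or cp.async (sm_80) paths; without it, a plain synchronous copy is the fallback.
  cuda::memcpy_async(block, smem, global_in, cuda::aligned_size_t<16>(n_ints * sizeof(int)), bar);

  bar.arrive_and_wait();  // the data in smem is ready once this returns
  // ... use smem ...
}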