Commit df93a01

[BACKPORT]: Fix ptx usage to account for PTX ISA availability (#1359) (#1421)
* Fix ptx usage to account for PTX ISA availability (#1359)

  Currently we only guard those instructions based on the available architecture.
  However, it is also valid to compile with an old toolkit for a new machine.
  Consequently, we need to strengthen our checks against the available PTX ISA.

* Do not use VLAs in `cp_async_bulk_tensor_*` tests

  VLAs are a compiler extension and are correctly errored out by some compilers.
  As we always know the exact size of the array anyway, just switch to a
  `cuda::std::array`.

  Fixes nvbug4476664

* Use proper shared memory size

  Authored-by: Allard Hendriksen <[email protected]>

* Fix incorrect linker issue

* Ensure runfail tests do not fail without execution

* Ensure that __cccl_ptx_isa properly guards feature flags
1 parent 5487cac commit df93a01

29 files changed (+240, -168 lines)
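The first bullet of the commit message is the heart of the change: a guard that only checks __CUDA_MINIMUM_ARCH__ accepts the combination "old toolkit, new GPU", and compilation then breaks because the toolkit's PTX ISA cannot express the guarded instructions. Below is a minimal sketch of the before/after guard shape; the function is hypothetical, while the macros and version pairings come from the diffs that follow.

// BEFORE: architecture-only guard. Building for sm_90 with, say, CUDA 11.4
// (PTX ISA 7.4 per the ladder in ptx_isa.h below) passes this check even
// though that toolkit cannot emit the sm_90 instructions used inside.
#if (!defined(__CUDA_MINIMUM_ARCH__)) || (900 <= __CUDA_MINIMUM_ARCH__)
__device__ void uses_sm90_ptx(); // hypothetical; body would hold sm_90 inline PTX
#endif

// AFTER: additionally require a toolkit that speaks PTX ISA 8.0.
#if __cccl_ptx_isa >= 800
#  if (!defined(__CUDA_MINIMUM_ARCH__)) || (900 <= __CUDA_MINIMUM_ARCH__)
__device__ void uses_sm90_ptx(); // hypothetical; body would hold sm_90 inline PTX
#  endif
#endif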

libcudacxx/include/cuda/barrier

Lines changed: 2 additions & 2 deletions
@@ -40,7 +40,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL
 // capability 9.0 and above. The check for (!defined(__CUDA_MINIMUM_ARCH__)) is
 // necessary to prevent cudafe from ripping out the device functions before
 // device compilation begins.
-#if (!defined(__CUDA_MINIMUM_ARCH__)) || (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__)
+#ifdef __cccl_lib_experimental_ctk12_cp_async_exposure

 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
 inline _LIBCUDACXX_DEVICE
@@ -278,7 +278,7 @@ void cp_async_bulk_wait_group_read()
     : "memory");
 }

-#endif // __CUDA_MINIMUM_ARCH__
+#endif // __cccl_lib_experimental_ctk12_cp_async_exposure

 _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL

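With the feature-test macro in place, downstream code can probe for the experimental cp.async exposure instead of duplicating the toolkit/architecture logic. A hedged usage sketch (the kernel and its fallback are hypothetical):

#include <cuda/barrier>

__global__ void copy_kernel(char* dst, const char* src, int n)
{
#ifdef __cccl_lib_experimental_ctk12_cp_async_exposure
  // The experimental cp_async_bulk facilities guarded above are usable here.
#else
  // Fallback: plain element-wise copy.
  for (int i = threadIdx.x; i < n; i += blockDim.x) {
    dst[i] = src[i];
  }
#endif
}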
libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h
(renamed from libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/ptx_isa_target_macros.h)

@@ -1,19 +1,15 @@
-// -*- C++ -*-
 //===----------------------------------------------------------------------===//
 //
 // Part of libcu++, the C++ Standard Library for your entire system,
 // under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
 //
 //===----------------------------------------------------------------------===//

-
-#ifndef _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_
-#define _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_
-
-#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends
+#ifndef __CCCL_PTX_ISA_H_
+#define __CCCL_PTX_ISA_H_

 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
 # pragma GCC system_header
@@ -23,6 +19,8 @@
 # pragma system_header
 #endif // no system header

+#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends
+
 /*
  * Targeting macros
  *
@@ -31,47 +29,77 @@
 */

 // PTX ISA 8.3 is available from CUDA 12.3, driver r545
-#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 3)) || (!defined(__CUDACC_VER_MAJOR__))
+// The first define is for future major versions of CUDACC.
+// We make sure that these get the highest known PTX ISA version.
+#if (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ > 12)) || (!defined(__CUDACC_VER_MAJOR__))
+# define __cccl_ptx_isa 830ULL
+#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 3)) \
+  || (!defined(__CUDACC_VER_MAJOR__))
 # define __cccl_ptx_isa 830ULL
 // PTX ISA 8.2 is available from CUDA 12.2, driver r535
-#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 2)) || (!defined(__CUDACC_VER_MAJOR__))
+#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 2)) \
+  || (!defined(__CUDACC_VER_MAJOR__))
 # define __cccl_ptx_isa 820ULL
 // PTX ISA 8.1 is available from CUDA 12.1, driver r530
-#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 1)) || (!defined(__CUDACC_VER_MAJOR__))
+#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 1)) \
+  || (!defined(__CUDACC_VER_MAJOR__))
 # define __cccl_ptx_isa 810ULL
 // PTX ISA 8.0 is available from CUDA 12.0, driver r525
-#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__))
+#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 0)) \
+  || (!defined(__CUDACC_VER_MAJOR__))
 # define __cccl_ptx_isa 800ULL
 // PTX ISA 7.8 is available from CUDA 11.8, driver r520
-#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 8)) || (!defined(__CUDACC_VER_MAJOR__))
+#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 8)) \
+  || (!defined(__CUDACC_VER_MAJOR__))
 # define __cccl_ptx_isa 780ULL
 // PTX ISA 7.7 is available from CUDA 11.7, driver r515
-#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 7)) || (!defined(__CUDACC_VER_MAJOR__))
+#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 7)) \
+  || (!defined(__CUDACC_VER_MAJOR__))
 # define __cccl_ptx_isa 770ULL
 // PTX ISA 7.6 is available from CUDA 11.6, driver r510
-#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 6)) || (!defined(__CUDACC_VER_MAJOR__))
+#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 6)) \
+  || (!defined(__CUDACC_VER_MAJOR__))
 # define __cccl_ptx_isa 760ULL
 // PTX ISA 7.5 is available from CUDA 11.5, driver r495
-#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 5)) || (!defined(__CUDACC_VER_MAJOR__))
+#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 5)) \
+  || (!defined(__CUDACC_VER_MAJOR__))
 # define __cccl_ptx_isa 750ULL
 // PTX ISA 7.4 is available from CUDA 11.4, driver r470
-#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 4)) || (!defined(__CUDACC_VER_MAJOR__))
+#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 4)) \
+  || (!defined(__CUDACC_VER_MAJOR__))
 # define __cccl_ptx_isa 740ULL
 // PTX ISA 7.3 is available from CUDA 11.3, driver r465
-#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 3)) || (!defined(__CUDACC_VER_MAJOR__))
+#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 3)) \
+  || (!defined(__CUDACC_VER_MAJOR__))
 # define __cccl_ptx_isa 730ULL
 // PTX ISA 7.2 is available from CUDA 11.2, driver r460
-#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 2)) || (!defined(__CUDACC_VER_MAJOR__))
+#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 2)) \
+  || (!defined(__CUDACC_VER_MAJOR__))
 # define __cccl_ptx_isa 720ULL
 // PTX ISA 7.1 is available from CUDA 11.1, driver r455
-#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 1)) || (!defined(__CUDACC_VER_MAJOR__))
+#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 1)) \
+  || (!defined(__CUDACC_VER_MAJOR__))
 # define __cccl_ptx_isa 710ULL
 // PTX ISA 7.0 is available from CUDA 11.0, driver r445
-#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 0)) || (!defined(__CUDACC_VER_MAJOR__))
+#elif (defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 0)) \
+  || (!defined(__CUDACC_VER_MAJOR__))
 # define __cccl_ptx_isa 700ULL
 // Fallback case. Define the ISA version to be zero. This ensures that the macro is always defined.
 #else
 # define __cccl_ptx_isa 0ULL
 #endif

-#endif // _CUDA_PTX_PTX_ISA_TARGET_MACROS_H_
+// We define certain feature test macros depending on availability. When
+// __CUDA_MINIMUM_ARCH__ is not available, we define the following features
+// depending on PTX ISA. This permits checking for the feature in host code.
+// When __CUDA_MINIMUM_ARCH__ is available, we only enable the feature when the
+// hardware supports it.
+#if __cccl_ptx_isa >= 800
+#if (!defined(__CUDA_MINIMUM_ARCH__)) \
+  || (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__)
+# define __cccl_lib_local_barrier_arrive_tx
+# define __cccl_lib_experimental_ctk12_cp_async_exposure
+#endif
+#endif // __cccl_ptx_isa >= 800
+
+#endif // __CCCL_PTX_ISA_H_
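Because the fallback branch defines __cccl_ptx_isa to 0ULL, the macro is always defined and plain >= comparisons are safe everywhere, host code included. A hedged consumption sketch (the wrapper is hypothetical; suppose it wraps an instruction that needs PTX ISA 8.1, which per the ladder above ships with CUDA 12.1):

#include <cuda/barrier> // any libcu++ header reaches __cccl_config, which includes this file (next diff)

#if __cccl_ptx_isa >= 810
__device__ inline void wrapper_needing_isa_81()
{
  // inline PTX that first appeared in PTX ISA 8.1 would go here
}
#else
// Toolkits older than CUDA 12.1: omit the symbol or provide a fallback.
#endif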

libcudacxx/include/cuda/std/detail/libcxx/include/__cccl_config

Lines changed: 2 additions & 0 deletions
@@ -187,6 +187,8 @@
 # define _CCCL_NV_DIAG_DEFAULT(_WARNING)
 #endif // other compilers

+#include "__cccl/ptx_isa.h"
+#include "__cccl/version.h"
 #include "__cccl/visibility.h"

 #endif // __CCCL_CONFIG

libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h

Lines changed: 64 additions & 37 deletions
@@ -572,11 +572,8 @@ inline _CUDA_VSTD::uint64_t * barrier_native_handle(barrier<thread_scope_block>

 #if defined(_CCCL_CUDA_COMPILER)

-// Hide arrive_tx when CUDA architecture is insufficient. Note the
-// (!defined(__CUDA_MINIMUM_ARCH__)). This is required to make sure the function
-// does not get removed by cudafe, which does not define __CUDA_MINIMUM_ARCH__.
-#if (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__))
-
+#if __cccl_ptx_isa >= 800
+extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_barrier_arrive_tx_is_not_supported_before_SM_90__();
 _LIBCUDACXX_NODISCARD_ATTRIBUTE _LIBCUDACXX_DEVICE inline
 barrier<thread_scope_block>::arrival_token barrier_arrive_tx(
   barrier<thread_scope_block> & __b,
@@ -591,7 +588,7 @@ barrier<thread_scope_block>::arrival_token barrier_arrive_tx(
   _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1.");

   barrier<thread_scope_block>::arrival_token __token = {};
-  NV_IF_TARGET(
+  NV_IF_ELSE_TARGET(
     // On architectures pre-sm90, arrive_tx is not supported.
     NV_PROVIDES_SM_90, (
       // We do not check for the statespace of the barrier here. This is
@@ -619,11 +616,47 @@ barrier<thread_scope_block>::arrival_token barrier_arrive_tx(
         _CUDA_VPTX::sem_release, _CUDA_VPTX::scope_cta, _CUDA_VPTX::space_shared, __native_handle, __arrive_count_update
       );
     }
+    ),(
+      __cuda_ptx_barrier_arrive_tx_is_not_supported_before_SM_90__();
     )
   );
   return __token;
 }

+extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_barrier_expect_tx_is_not_supported_before_SM_90__();
+_LIBCUDACXX_DEVICE inline
+void barrier_expect_tx(
+  barrier<thread_scope_block> & __b,
+  _CUDA_VSTD::ptrdiff_t __transaction_count_update) {
+
+  _LIBCUDACXX_DEBUG_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory.");
+  _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update >= 0, "Transaction count update must be non-negative.");
+  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#contents-of-the-mbarrier-object
+  _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1.");
+
+  // We do not check for the statespace of the barrier here. This is
+  // on purpose. This allows debugging tools like memcheck/racecheck
+  // to detect that we are passing a pointer with the wrong state
+  // space to mbarrier.arrive. If we checked for the state space here,
+  // and __trap() if wrong, then those tools would not be able to help
+  // us in release builds. In debug builds, the error would be caught
+  // by the asserts at the top of this function.
+  NV_IF_ELSE_TARGET(
+    // On architectures pre-sm90, arrive_tx is not supported.
+    NV_PROVIDES_SM_90, (
+      auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b));
+      asm (
+        "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;"
+        :
+        : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)),
+          "r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update))
+        : "memory");
+    ),(
+      __cuda_ptx_barrier_expect_tx_is_not_supported_before_SM_90__();
+    ));
+}
+
+extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_memcpy_async_tx_is_not_supported_before_SM_90__();
 template <typename _Tp, _CUDA_VSTD::size_t _Alignment>
 _LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx(
   _Tp* __dest,
@@ -643,6 +676,7 @@ _LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx(
   _LIBCUDACXX_DEBUG_ASSERT(__isShared(__dest), "dest must point to shared memory.");
   _LIBCUDACXX_DEBUG_ASSERT(__isGlobal(__src), "src must point to global memory.");

+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
   auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b));
   if (__isShared(__dest) && __isGlobal(__src)) {
     asm volatile(
@@ -660,36 +694,13 @@ _LIBCUDACXX_DEVICE inline async_contract_fulfillment memcpy_async_tx(
     // is not yet implemented. So we trap in this case as well.
     _LIBCUDACXX_UNREACHABLE();
   }
+  ),(
+    __cuda_ptx_memcpy_async_tx_is_not_supported_before_SM_90__();
+  ));

   return async_contract_fulfillment::async;
 }
-
-_LIBCUDACXX_DEVICE inline
-void barrier_expect_tx(
-  barrier<thread_scope_block> & __b,
-  _CUDA_VSTD::ptrdiff_t __transaction_count_update) {
-
-  _LIBCUDACXX_DEBUG_ASSERT(__isShared(barrier_native_handle(__b)), "Barrier must be located in local shared memory.");
-  _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update >= 0, "Transaction count update must be non-negative.");
-  // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#contents-of-the-mbarrier-object
-  _LIBCUDACXX_DEBUG_ASSERT(__transaction_count_update <= (1 << 20) - 1, "Transaction count update cannot exceed 2^20 - 1.");
-
-  // We do not check for the statespace of the barrier here. This is
-  // on purpose. This allows debugging tools like memcheck/racecheck
-  // to detect that we are passing a pointer with the wrong state
-  // space to mbarrier.arrive. If we checked for the state space here,
-  // and __trap() if wrong, then those tools would not be able to help
-  // us in release builds. In debug builds, the error would be caught
-  // by the asserts at the top of this function.
-  auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b));
-  asm (
-    "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;"
-    :
-    : "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)),
-      "r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update))
-    : "memory");
-}
-#endif // __CUDA_MINIMUM_ARCH__
+#endif // __cccl_ptx_isa >= 800
 #endif // _CCCL_CUDA_COMPILER

 _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE
@@ -796,13 +807,15 @@ struct __memcpy_completion_impl {
       // bulk group to be used with shared memory barriers.
       _LIBCUDACXX_UNREACHABLE();
     case __completion_mechanism::__mbarrier_complete_tx:
+#if __cccl_ptx_isa >= 800
       // Pre-sm90, the mbarrier_complete_tx completion mechanism is not available.
       NV_IF_TARGET(NV_PROVIDES_SM_90, (
         // Only perform the expect_tx operation with the leader thread
         if (__group.thread_rank() == 0) {
           ::cuda::device::barrier_expect_tx(__barrier, __size);
         }
       ));
+#endif // __cccl_ptx_isa >= 800
       return async_contract_fulfillment::async;
     case __completion_mechanism::__sync:
       // sync: In this case, we do not need to do anything. The user will have
@@ -929,11 +942,13 @@ struct __memcpy_completion_impl {
  * 5. normal synchronous copy (fallback)
  ***********************************************************************/

-#if (defined(__CUDA_MINIMUM_ARCH__) && 900 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__))
+#if __cccl_ptx_isa >= 800
+extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_cp_async_bulk_shared_global_is_not_supported_before_SM_90__();
 template <typename _Group>
 inline __device__
 void __cp_async_bulk_shared_global(const _Group &__g, char * __dest, const char * __src, size_t __size, uint64_t *__bar_handle) {
   // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
   if (__g.thread_rank() == 0) {
     asm volatile(
       "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];\n"
@@ -944,10 +959,13 @@ void __cp_async_bulk_shared_global(const _Group &__g, char * __dest, const char
         "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__bar_handle)))
       : "memory");
   }
+  ),(
+    __cuda_ptx_cp_async_bulk_shared_global_is_not_supported_before_SM_90__();
+  ));
 }
-#endif // __CUDA_MINIMUM_ARCH__
+#endif // __cccl_ptx_isa >= 800

-#if (defined(__CUDA_MINIMUM_ARCH__) && 800 <= __CUDA_MINIMUM_ARCH__) || (!defined(__CUDA_MINIMUM_ARCH__))
+extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__();
 template <size_t _Copy_size>
 inline __device__
 void __cp_async_shared_global(char * __dest, const char * __src) {
@@ -959,27 +977,35 @@ void __cp_async_shared_global(char * __dest, const char * __src) {
   static_assert(_Copy_size == 4 || _Copy_size == 8 || _Copy_size == 16, "cp.async.shared.global requires a copy size of 4, 8, or 16.");
 #endif // _LIBCUDACXX_STD_VER >= 17

+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,(
   asm volatile(
     "cp.async.ca.shared.global [%0], [%1], %2, %2;"
     :
     : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__dest))),
       "l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))),
       "n"(_Copy_size)
     : "memory");
+  ),(
+    __cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__();
+  ));
 }

 template <>
 inline __device__
 void __cp_async_shared_global<16>(char * __dest, const char * __src) {
   // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async
   // When copying 16 bytes, it is possible to skip L1 cache (.cg).
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,(
   asm volatile(
     "cp.async.cg.shared.global [%0], [%1], %2, %2;"
     :
     : "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(__dest))),
       "l"(static_cast<_CUDA_VSTD::uint64_t>(__cvta_generic_to_global(__src))),
       "n"(16)
     : "memory");
+  ),(
+    __cuda_ptx_cp_async_shared_global_is_not_supported_before_SM_80__();
+  ));
 }

 template <size_t _Alignment, typename _Group>
@@ -1002,7 +1028,6 @@ void __cp_async_shared_global_mechanism(_Group __g, char * __dest, const char *
     __cp_async_shared_global<__copy_size>(__dest + __offset, __src + __offset);
   }
 }
-#endif // __CUDA_MINIMUM_ARCH__

 template <size_t _Copy_size>
 struct __copy_chunk {
@@ -1083,6 +1108,7 @@ __completion_mechanism __dispatch_memcpy_async_any_to_any(_Group const & __group
 template<_CUDA_VSTD::size_t _Align, typename _Group>
 _LIBCUDACXX_NODISCARD_ATTRIBUTE _LIBCUDACXX_DEVICE inline
 __completion_mechanism __dispatch_memcpy_async_global_to_shared(_Group const & __group, char * __dest_char, char const * __src_char, _CUDA_VSTD::size_t __size, uint32_t __allowed_completions, uint64_t* __bar_handle) {
+#if __cccl_ptx_isa >= 800
   NV_IF_TARGET(NV_PROVIDES_SM_90, (
     const bool __can_use_complete_tx = __allowed_completions & uint32_t(__completion_mechanism::__mbarrier_complete_tx);
     _LIBCUDACXX_DEBUG_ASSERT(__can_use_complete_tx == (nullptr != __bar_handle), "Pass non-null bar_handle if and only if can_use_complete_tx.");
@@ -1094,6 +1120,7 @@ __completion_mechanism __dispatch_memcpy_async_global_to_shared(_Group const & _
   }
   // Fallthrough to SM 80..
   ));
+#endif // __cccl_ptx_isa >= 800

   NV_IF_TARGET(NV_PROVIDES_SM_80, (
     if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (_Align >= 4) {
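A pattern worth noting throughout this file: each __cuda_ptx_*_is_not_supported_before_SM_NN__ function is declared extern "C" but never defined, and the unsupported NV_IF_ELSE_TARGET branch calls it. Device code that would actually reach that branch therefore fails at link time with a self-describing symbol name instead of compiling to a silent no-op. A distilled sketch of the idiom, with illustrative names:

#include <nv/target>

// Declared, never defined anywhere.
extern "C" __device__ void __my_feature_is_not_supported_before_SM_90__();

__device__ void use_my_feature()
{
  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90, (
    // real sm_90 implementation goes here
  ), (
    // On pre-sm90 targets this call survives into the final link, and the
    // unresolved symbol spells out what went wrong.
    __my_feature_is_not_supported_before_SM_90__();
  ));
}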

libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h

Lines changed: 0 additions & 1 deletion
@@ -26,7 +26,6 @@

 #include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends

-#include "../__cuda/ptx/ptx_isa_target_macros.h"
 #include "../__cuda/ptx/ptx_dot_variants.h"
 #include "../__cuda/ptx/ptx_helper_functions.h"
 #include "../__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h"
