|
1 | 1 | /**
|
2 | 2 | * MIT License
|
3 | 3 | *
|
4 |
| - * Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. All rights reserved. |
| 4 | + * Copyright (c) 2019 - 2025 Advanced Micro Devices, Inc. All rights reserved. |
5 | 5 | *
|
6 | 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy
|
7 | 7 | * of this software and associated documentation files (the "Software"), to deal
|
|
130 | 130 | #define __BF16_DEVICE_STATIC__ __BF16_DEVICE__ static inline
|
131 | 131 | #define __BF16_HOST_DEVICE_STATIC__ __BF16_HOST_DEVICE__ static inline
|
132 | 132 |
|
// Define MAYBE_UNDEF for the remainder of this header. Whatever definition the
// including code already had is saved here and is restored by the matching
// `#pragma pop_macro("MAYBE_UNDEF")` further down in the file.
#pragma push_macro("MAYBE_UNDEF")
// Drop the saved definition so the #define below never triggers a
// macro-redefinition diagnostic.
#undef MAYBE_UNDEF
// The __has_attribute probe is nested inside the defined() check on purpose:
// a preprocessor that lacks __has_attribute cannot parse
// `__has_attribute(maybe_undef)` even when it sits behind `defined(...) &&`.
#if defined(__has_attribute)
#if __has_attribute(maybe_undef)
#define MAYBE_UNDEF __attribute__((maybe_undef))
#endif
#endif
// Fallback: expand to nothing when the attribute is unavailable.
#ifndef MAYBE_UNDEF
#define MAYBE_UNDEF
#endif
| 139 | + |
133 | 140 | #define HIPRT_ONE_BF16 __ushort_as_bfloat16((unsigned short)0x3F80U)
|
134 | 141 | #define HIPRT_ZERO_BF16 __ushort_as_bfloat16((unsigned short)0x0000U)
|
135 | 142 | #define HIPRT_INF_BF16 __ushort_as_bfloat16((unsigned short)0x7F80U)
|
@@ -592,6 +599,52 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __ushort_as_bfloat16(const unsigned s
|
592 | 599 | return u.bf16;
|
593 | 600 | }
|
594 | 601 |
|
| 602 | +/** |
| 603 | + * \ingroup HIP_INTRINSIC_BFLOAT16_SHFL |
| 604 | + * \brief shfl warp intrinsic for bfloat16 |
| 605 | + */ |
| 606 | +__BF16_DEVICE_STATIC__ |
| 607 | +__hip_bfloat16 __shfl(MAYBE_UNDEF __hip_bfloat16 var, int src_lane, int width = warpSize) { |
| 608 | + union { int i; __hip_bfloat16 f; } tmp; tmp.f = var; |
| 609 | + tmp.i = __shfl(tmp.i, src_lane, width); |
| 610 | + return tmp.f; |
| 611 | +} |
| 612 | + |
| 613 | +/** |
| 614 | + * \ingroup HIP_INTRINSIC_BFLOAT16_SHFL |
| 615 | + * \brief shfl up warp intrinsic for bfloat16 |
| 616 | + */ |
| 617 | +__BF16_DEVICE_STATIC__ |
| 618 | +__hip_bfloat16 __shfl_up(MAYBE_UNDEF __hip_bfloat16 var, |
| 619 | + unsigned int lane_delta, int width = warpSize) { |
| 620 | + union { int i; __hip_bfloat16 f; } tmp; tmp.f = var; |
| 621 | + tmp.i = __shfl_up(tmp.i, lane_delta, width); |
| 622 | + return tmp.f; |
| 623 | +} |
| 624 | + |
| 625 | +/** |
| 626 | + * \ingroup HIP_INTRINSIC_BFLOAT16_SHFL |
| 627 | + * \brief shfl down warp intrinsic for bfloat16 |
| 628 | + */ |
| 629 | +__BF16_DEVICE_STATIC__ |
| 630 | +__hip_bfloat16 __shfl_down(MAYBE_UNDEF __hip_bfloat16 var, |
| 631 | + unsigned int lane_delta, int width = warpSize) { |
| 632 | + union { int i; __hip_bfloat16 f; } tmp; tmp.f = var; |
| 633 | + tmp.i = __shfl_down(tmp.i, lane_delta, width); |
| 634 | + return tmp.f; |
| 635 | +} |
| 636 | + |
| 637 | +/** |
| 638 | + * \ingroup HIP_INTRINSIC_BFLOAT16_SHFL |
| 639 | + * \brief shfl xor warp intrinsic for bfloat16 |
| 640 | + */ |
| 641 | +__BF16_DEVICE_STATIC__ |
| 642 | +__hip_bfloat16 __shfl_xor(MAYBE_UNDEF __hip_bfloat16 var, int lane_mask, int width = warpSize) { |
| 643 | + union { int i; __hip_bfloat16 f; } tmp; tmp.f = var; |
| 644 | + tmp.i = __shfl_xor(tmp.i, lane_mask, width); |
| 645 | + return tmp.f; |
| 646 | +} |
| 647 | + |
595 | 648 | #ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
|
596 | 649 | /**
|
597 | 650 | * \ingroup HIP_INTRINSIC_BFLOAT16_MOVE
|
@@ -1787,4 +1840,5 @@ __BF16_DEVICE_STATIC__ __hip_bfloat16 unsafeAtomicAdd(__hip_bfloat16 *address,
|
1787 | 1840 | return __high2bfloat16(out);
|
1788 | 1841 | }
|
1789 | 1842 | #endif // defined(__clang__) && defined(__HIP__)
|
| 1843 | +#pragma pop_macro("MAYBE_UNDEF") |
1790 | 1844 | #endif
|
0 commit comments