SWDEV-516595 - Add __shfl functions with __hip_bfloat16 datatype (#42)

satyanveshd · web-flow · commit 376f23b86a12 · 2025-03-25T15:38:01.000+05:30
Also removes the static_asserts from the cooperative groups shfl functions, since
__hip_bfloat16 overloads of the __shfl functions are now available.

Change-Id: I57578b6e68dccc10c2ddcd194e9cc18bc7732ce1
diff --git a/hipamd/include/hip/amd_detail/amd_hip_bf16.h b/hipamd/include/hip/amd_detail/amd_hip_bf16.h
@@ -1,7 +1,7 @@
 /**
  * MIT License
  *
- * Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2019 - 2025 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -130,6 +130,24 @@
 #define __BF16_DEVICE_STATIC__ __BF16_DEVICE__ static inline
 #define __BF16_HOST_DEVICE_STATIC__ __BF16_HOST_DEVICE__ static inline
 
+// MAYBE_UNDEF expands to the maybe_undef parameter attribute when the compiler
+// reports support for it, and to nothing otherwise. Any definition the includer
+// already had is saved here and restored by the matching pop_macro at the end
+// of this header.
+#pragma push_macro("MAYBE_UNDEF")
+#undef MAYBE_UNDEF
+// __has_attribute is probed in a nested #if: on a preprocessor that lacks it,
+// "defined(__has_attribute) && __has_attribute(x)" is still parsed in full and
+// is rejected as a malformed expression.
+#if defined(__has_attribute)
+#if __has_attribute(maybe_undef)
+#define MAYBE_UNDEF __attribute__((maybe_undef))
+#endif
+#endif
+#ifndef MAYBE_UNDEF
+#define MAYBE_UNDEF
+#endif
+
 #define HIPRT_ONE_BF16 __ushort_as_bfloat16((unsigned short)0x3F80U)
 #define HIPRT_ZERO_BF16 __ushort_as_bfloat16((unsigned short)0x0000U)
 #define HIPRT_INF_BF16 __ushort_as_bfloat16((unsigned short)0x7F80U)
@@ -592,6 +599,52 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __ushort_as_bfloat16(const unsigned s
   return u.bf16;
 }
 
+/**
+ * \ingroup HIP_INTRINSIC_BFLOAT16_SHFL
+ * \brief shfl warp intrinsic for bfloat16; shuffles \p var as an int through a union.
+ */
+__BF16_DEVICE_STATIC__
+__hip_bfloat16 __shfl(MAYBE_UNDEF __hip_bfloat16 var, int src_lane, int width = warpSize) {
+    union { int i; __hip_bfloat16 f; } tmp; tmp.f = var;  // NOTE(review): bytes of i not covered by f stay unset - confirm OK
+    tmp.i = __shfl(tmp.i, src_lane, width);  // forwards to the __shfl overload chosen for an int argument
+    return tmp.f;  // result is read back through the bfloat16 member
+}
+
+/**
+ * \ingroup HIP_INTRINSIC_BFLOAT16_SHFL
+ * \brief shfl up warp intrinsic for bfloat16; shuffles \p var as an int through a union.
+ */
+__BF16_DEVICE_STATIC__
+__hip_bfloat16 __shfl_up(MAYBE_UNDEF __hip_bfloat16 var,
+                         unsigned int lane_delta, int width = warpSize) {
+    union { int i; __hip_bfloat16 f; } tmp; tmp.f = var;  // NOTE(review): bytes of i not covered by f stay unset - confirm OK
+    tmp.i = __shfl_up(tmp.i, lane_delta, width);  // forwards to the __shfl_up overload chosen for an int argument
+    return tmp.f;  // result is read back through the bfloat16 member
+}
+
+/**
+ * \ingroup HIP_INTRINSIC_BFLOAT16_SHFL
+ * \brief shfl down warp intrinsic for bfloat16; shuffles \p var as an int through a union.
+ */
+__BF16_DEVICE_STATIC__
+__hip_bfloat16 __shfl_down(MAYBE_UNDEF __hip_bfloat16 var,
+                           unsigned int lane_delta, int width = warpSize) {
+    union { int i; __hip_bfloat16 f; } tmp; tmp.f = var;  // NOTE(review): bytes of i not covered by f stay unset - confirm OK
+    tmp.i = __shfl_down(tmp.i, lane_delta, width);  // forwards to the __shfl_down overload chosen for an int argument
+    return tmp.f;  // result is read back through the bfloat16 member
+}
+
+/**
+ * \ingroup HIP_INTRINSIC_BFLOAT16_SHFL
+ * \brief shfl xor warp intrinsic for bfloat16; shuffles \p var as an int through a union.
+ */
+__BF16_DEVICE_STATIC__
+__hip_bfloat16 __shfl_xor(MAYBE_UNDEF __hip_bfloat16 var, int lane_mask, int width = warpSize) {
+    union { int i; __hip_bfloat16 f; } tmp; tmp.f = var;  // NOTE(review): bytes of i not covered by f stay unset - confirm OK
+    tmp.i = __shfl_xor(tmp.i, lane_mask, width);  // forwards to the __shfl_xor overload chosen for an int argument
+    return tmp.f;  // result is read back through the bfloat16 member
+}
+
 #ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
 /**
  * \ingroup HIP_INTRINSIC_BFLOAT16_MOVE
@@ -1787,4 +1840,5 @@ __BF16_DEVICE_STATIC__ __hip_bfloat16 unsafeAtomicAdd(__hip_bfloat16 *address,
   return __high2bfloat16(out);
 }
 #endif  // defined(__clang__) && defined(__HIP__)
+#pragma pop_macro("MAYBE_UNDEF")
 #endif
diff --git a/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h b/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h
@@ -462,7 +462,6 @@ class coalesced_group : public thread_group {
    */
   template <class T>
   __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
-    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
 
     srcRank = srcRank % static_cast<int>(size());
 
@@ -489,7 +488,6 @@ class coalesced_group : public thread_group {
    */
   template <class T>
   __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
-    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
 
     // Note: The cuda implementation appears to use the remainder of lane_delta
     // and WARP_SIZE as the shift value rather than lane_delta itself.
@@ -530,7 +528,6 @@ class coalesced_group : public thread_group {
    */
   template <class T>
   __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
-    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
 
     // Note: The cuda implementation appears to use the remainder of lane_delta
     // and WARP_SIZE as the shift value rather than lane_delta itself.
@@ -838,22 +835,18 @@ template <unsigned int size> class thread_block_tile_base : public tile_base<siz
   }
 
   template <class T> __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
-    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
     return (__shfl(var, srcRank, numThreads));
   }
 
   template <class T> __CG_QUALIFIER__ T shfl_down(T var, unsigned int lane_delta) const {
-    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
     return (__shfl_down(var, lane_delta, numThreads));
   }
 
   template <class T> __CG_QUALIFIER__ T shfl_up(T var, unsigned int lane_delta) const {
-    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
     return (__shfl_up(var, lane_delta, numThreads));
   }
 
   template <class T> __CG_QUALIFIER__ T shfl_xor(T var, unsigned int laneMask) const {
-    static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
     return (__shfl_xor(var, laneMask, numThreads));
   }