From 271a33af95a9862d3aaf7f71a8ded1bfbee4ac4a Mon Sep 17 00:00:00 2001
From: rraminen <rraminen@amd.com>
Date: Wed, 12 Feb 2025 22:47:42 +0000
Subject: [PATCH 1/3] __forceinline__ needs inline and always_inline on ROCm

---
 qtorch/quant/quant_cuda/bit_helper.cu         | 4 ++++
 qtorch/quant/quant_cuda/fixed_point_kernel.cu | 3 +++
 2 files changed, 7 insertions(+)
diff --git a/qtorch/quant/quant_cuda/bit_helper.cu b/qtorch/quant/quant_cuda/bit_helper.cu
index 794255f..3b2edf8 100644
--- a/qtorch/quant/quant_cuda/bit_helper.cu
+++ b/qtorch/quant/quant_cuda/bit_helper.cu
@@ -1,6 +1,10 @@
 #define FLOAT_TO_BITS(x) (*reinterpret_cast<unsigned int*>(x))
 #define BITS_TO_FLOAT(x) (*reinterpret_cast<float*>(x))
 
+#ifdef __HIP_PLATFORM_AMD__
+#define __forceinline__ inline __attribute__((always_inline))
+#endif
+
 __device__ __forceinline__ unsigned int extract_exponent(float *a) {
   unsigned int temp = *(reinterpret_cast<unsigned int*>(a));
   temp = (temp << 1 >> 24); // single preciision, 1 sign bit, 23 mantissa bits
diff --git a/qtorch/quant/quant_cuda/fixed_point_kernel.cu b/qtorch/quant/quant_cuda/fixed_point_kernel.cu
index 99b7727..d9d3bf2 100644
--- a/qtorch/quant/quant_cuda/fixed_point_kernel.cu
+++ b/qtorch/quant/quant_cuda/fixed_point_kernel.cu
@@ -1,6 +1,9 @@
 #include "quant_kernel.h"
 #include "sim_helper.cu"
 
+#ifdef __HIP_PLATFORM_AMD__
+#define __forceinline__ inline __attribute__((always_inline))
+#endif
 
 template <typename T>
 __device__ __forceinline__ T clamp_helper(T a, T min, T max) {

From 3799e5d442f2ec7fcfe5925661f6bb527116e237 Mon Sep 17 00:00:00 2001
From: rraminen <rraminen@amd.com>
Date: Fri, 14 Feb 2025 22:12:25 +0000
Subject: [PATCH 2/3] ifdef __HIP__

---
 qtorch/quant/quant_cuda/bit_helper.cu         | 4 +++-
 qtorch/quant/quant_cuda/fixed_point_kernel.cu | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/qtorch/quant/quant_cuda/bit_helper.cu b/qtorch/quant/quant_cuda/bit_helper.cu
index 3b2edf8..1df46a2 100644
--- a/qtorch/quant/quant_cuda/bit_helper.cu
+++ b/qtorch/quant/quant_cuda/bit_helper.cu
@@ -1,9 +1,11 @@
 #define FLOAT_TO_BITS(x) (*reinterpret_cast<unsigned int*>(x))
 #define BITS_TO_FLOAT(x) (*reinterpret_cast<float*>(x))
 
-#ifdef __HIP_PLATFORM_AMD__
+#ifdef __HIP__
+#ifndef __forceinline__
 #define __forceinline__ inline __attribute__((always_inline))
 #endif
+#endif
 
 __device__ __forceinline__ unsigned int extract_exponent(float *a) {
   unsigned int temp = *(reinterpret_cast<unsigned int*>(a));
diff --git a/qtorch/quant/quant_cuda/fixed_point_kernel.cu b/qtorch/quant/quant_cuda/fixed_point_kernel.cu
index d9d3bf2..6158eda 100644
--- a/qtorch/quant/quant_cuda/fixed_point_kernel.cu
+++ b/qtorch/quant/quant_cuda/fixed_point_kernel.cu
@@ -1,9 +1,11 @@
 #include "quant_kernel.h"
 #include "sim_helper.cu"
 
-#ifdef __HIP_PLATFORM_AMD__
+#ifdef __HIP__
+#ifndef __forceinline__
 #define __forceinline__ inline __attribute__((always_inline))
 #endif
+#endif
 
 template <typename T>
 __device__ __forceinline__ T clamp_helper(T a, T min, T max) {

From c542f5108dbdeda382a092ef133733f76b0e6c04 Mon Sep 17 00:00:00 2001
From: rraminen <rraminen@amd.com>
Date: Fri, 14 Feb 2025 22:25:56 +0000
Subject: [PATCH 3/3] extra_include_paths is required for the hipification of
 quant_cuda header files.

---
 qtorch/quant/quant_function.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/qtorch/quant/quant_function.py b/qtorch/quant/quant_function.py
index 061676c..6555bc6 100644
--- a/qtorch/quant/quant_function.py
+++ b/qtorch/quant/quant_function.py
@@ -28,6 +28,7 @@
             os.path.join(current_path, "quant_cuda/fixed_point_kernel.cu"),
             os.path.join(current_path, "quant_cuda/quant.cu"),
         ],
+        extra_include_paths=[os.path.join(current_path, "quant_cuda")],
     )
 else:
     quant_cuda = quant_cpu