From 271a33af95a9862d3aaf7f71a8ded1bfbee4ac4a Mon Sep 17 00:00:00 2001 From: rraminen Date: Wed, 12 Feb 2025 22:47:42 +0000 Subject: [PATCH 1/3] __forceinline__ needs inline and always_inline on ROCm --- qtorch/quant/quant_cuda/bit_helper.cu | 4 ++++ qtorch/quant/quant_cuda/fixed_point_kernel.cu | 3 +++ 2 files changed, 7 insertions(+) diff --git a/qtorch/quant/quant_cuda/bit_helper.cu b/qtorch/quant/quant_cuda/bit_helper.cu index 794255f..3b2edf8 100644 --- a/qtorch/quant/quant_cuda/bit_helper.cu +++ b/qtorch/quant/quant_cuda/bit_helper.cu @@ -1,6 +1,10 @@ #define FLOAT_TO_BITS(x) (*reinterpret_cast(x)) #define BITS_TO_FLOAT(x) (*reinterpret_cast(x)) +#ifdef __HIP_PLATFORM_AMD__ +#define __forceinline__ inline __attribute__((always_inline)) +#endif + __device__ __forceinline__ unsigned int extract_exponent(float *a) { unsigned int temp = *(reinterpret_cast(a)); temp = (temp << 1 >> 24); // single preciision, 1 sign bit, 23 mantissa bits diff --git a/qtorch/quant/quant_cuda/fixed_point_kernel.cu b/qtorch/quant/quant_cuda/fixed_point_kernel.cu index 99b7727..d9d3bf2 100644 --- a/qtorch/quant/quant_cuda/fixed_point_kernel.cu +++ b/qtorch/quant/quant_cuda/fixed_point_kernel.cu @@ -1,6 +1,9 @@ #include "quant_kernel.h" #include "sim_helper.cu" +#ifdef __HIP_PLATFORM_AMD__ +#define __forceinline__ inline __attribute__((always_inline)) +#endif template __device__ __forceinline__ T clamp_helper(T a, T min, T max) { From 3799e5d442f2ec7fcfe5925661f6bb527116e237 Mon Sep 17 00:00:00 2001 From: rraminen Date: Fri, 14 Feb 2025 22:12:25 +0000 Subject: [PATCH 2/3] ifdef __HIP__ --- qtorch/quant/quant_cuda/bit_helper.cu | 4 +++- qtorch/quant/quant_cuda/fixed_point_kernel.cu | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/qtorch/quant/quant_cuda/bit_helper.cu b/qtorch/quant/quant_cuda/bit_helper.cu index 3b2edf8..1df46a2 100644 --- a/qtorch/quant/quant_cuda/bit_helper.cu +++ b/qtorch/quant/quant_cuda/bit_helper.cu @@ -1,9 +1,11 @@ #define FLOAT_TO_BITS(x) (*reinterpret_cast(x)) #define BITS_TO_FLOAT(x) (*reinterpret_cast(x)) -#ifdef __HIP_PLATFORM_AMD__ +#ifdef __HIP__ +#ifndef __forceinline__ #define __forceinline__ inline __attribute__((always_inline)) #endif +#endif __device__ __forceinline__ unsigned int extract_exponent(float *a) { unsigned int temp = *(reinterpret_cast(a)); diff --git a/qtorch/quant/quant_cuda/fixed_point_kernel.cu b/qtorch/quant/quant_cuda/fixed_point_kernel.cu index d9d3bf2..6158eda 100644 --- a/qtorch/quant/quant_cuda/fixed_point_kernel.cu +++ b/qtorch/quant/quant_cuda/fixed_point_kernel.cu @@ -1,9 +1,11 @@ #include "quant_kernel.h" #include "sim_helper.cu" -#ifdef __HIP_PLATFORM_AMD__ +#ifdef __HIP__ +#ifndef __forceinline__ #define __forceinline__ inline __attribute__((always_inline)) #endif +#endif template __device__ __forceinline__ T clamp_helper(T a, T min, T max) { From c542f5108dbdeda382a092ef133733f76b0e6c04 Mon Sep 17 00:00:00 2001 From: rraminen Date: Fri, 14 Feb 2025 22:25:56 +0000 Subject: [PATCH 3/3] extra_include_paths is required for the hipification of quant_cuda header files. --- qtorch/quant/quant_function.py | 1 + 1 file changed, 1 insertion(+) diff --git a/qtorch/quant/quant_function.py b/qtorch/quant/quant_function.py index 061676c..6555bc6 100644 --- a/qtorch/quant/quant_function.py +++ b/qtorch/quant/quant_function.py @@ -28,6 +28,7 @@ os.path.join(current_path, "quant_cuda/fixed_point_kernel.cu"), os.path.join(current_path, "quant_cuda/quant.cu"), ], + extra_include_paths=[os.path.join(current_path, "quant_cuda")], ) else: quant_cuda = quant_cpu