From 663b06be078a4580e5c7e4ed52de55cbc119e3ea Mon Sep 17 00:00:00 2001 From: Niyas Sait Date: Wed, 13 Oct 2021 07:04:32 +0100 Subject: [PATCH 1/2] Enable compiling arm/neon with MSVC for windows on arm64 --- include/xsimd/arch/xsimd_neon.hpp | 498 +++++++++---------- include/xsimd/arch/xsimd_neon64.hpp | 40 +- include/xsimd/arch/xsimd_neon_dispatcher.hpp | 119 +++++ include/xsimd/config/xsimd_config.hpp | 3 + include/xsimd/types/xsimd_batch.hpp | 8 +- include/xsimd/types/xsimd_neon_register.hpp | 31 ++ 6 files changed, 417 insertions(+), 282 deletions(-) create mode 100644 include/xsimd/arch/xsimd_neon_dispatcher.hpp diff --git a/include/xsimd/arch/xsimd_neon.hpp b/include/xsimd/arch/xsimd_neon.hpp index e3b02018a..5e56c414a 100644 --- a/include/xsimd/arch/xsimd_neon.hpp +++ b/include/xsimd/arch/xsimd_neon.hpp @@ -19,52 +19,53 @@ #include "../types/xsimd_neon_register.hpp" #include "../types/xsimd_utils.hpp" +#include "xsimd_neon_dispatcher.hpp" // Wrap intrinsics so we can pass them as function pointers // - OP: intrinsics name prefix, e.g., vorrq // - RT: type traits to deduce intrinsics return types -#define WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ - namespace wrap { \ - inline RT OP##_u8 (uint8x16_t a, uint8x16_t b) { return ::OP##_u8 (a, b); } \ - inline RT OP##_s8 (int8x16_t a, int8x16_t b) { return ::OP##_s8 (a, b); } \ - inline RT OP##_u16(uint16x8_t a, uint16x8_t b) { return ::OP##_u16(a, b); } \ - inline RT OP##_s16(int16x8_t a, int16x8_t b) { return ::OP##_s16(a, b); } \ - inline RT OP##_u32(uint32x4_t a, uint32x4_t b) { return ::OP##_u32(a, b); } \ - inline RT OP##_s32(int32x4_t a, int32x4_t b) { return ::OP##_s32(a, b); } \ +#define WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ + namespace wrap { \ + inline RT _##OP##_u8 (uint8x16_t a, uint8x16_t b) { return ::OP##_u8 (a, b); } \ + inline RT _##OP##_s8 (int8x16_t a, int8x16_t b) { return ::OP##_s8 (a, b); } \ + inline RT _##OP##_u16(uint16x8_t a, uint16x8_t b) { return ::OP##_u16(a, b); } \ + inline RT 
_##OP##_s16(int16x8_t a, int16x8_t b) { return ::OP##_s16(a, b); } \ + inline RT _##OP##_u32(uint32x4_t a, uint32x4_t b) { return ::OP##_u32(a, b); } \ + inline RT _##OP##_s32(int32x4_t a, int32x4_t b) { return ::OP##_s32(a, b); } \ } -#define WRAP_BINARY_INT(OP, RT) \ - WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ - namespace wrap { \ - inline RT OP##_u64(uint64x2_t a, uint64x2_t b) { return ::OP##_u64(a, b); } \ - inline RT OP##_s64(int64x2_t a, int64x2_t b) { return ::OP##_s64(a, b); } \ +#define WRAP_BINARY_INT(OP, RT) \ + WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ + namespace wrap { \ + inline RT _##OP##_u64(uint64x2_t a, uint64x2_t b) { return ::OP##_u64(a, b); } \ + inline RT _##OP##_s64(int64x2_t a, int64x2_t b) { return ::OP##_s64(a, b); } \ } -#define WRAP_BINARY_FLOAT(OP, RT) \ - namespace wrap { \ - inline RT OP##_f32(float32x4_t a, float32x4_t b) { return ::OP##_f32(a, b); } \ +#define WRAP_BINARY_FLOAT(OP, RT) \ + namespace wrap { \ + inline RT _##OP##_f32(float32x4_t a, float32x4_t b) { return ::OP##_f32(a, b); } \ } -#define WRAP_UNARY_INT_EXCLUDING_64(OP) \ - namespace wrap { \ - inline uint8x16_t OP##_u8 (uint8x16_t a) { return ::OP##_u8 (a); } \ - inline int8x16_t OP##_s8 (int8x16_t a) { return ::OP##_s8 (a); } \ - inline uint16x8_t OP##_u16(uint16x8_t a) { return ::OP##_u16(a); } \ - inline int16x8_t OP##_s16(int16x8_t a) { return ::OP##_s16(a); } \ - inline uint32x4_t OP##_u32(uint32x4_t a) { return ::OP##_u32(a); } \ - inline int32x4_t OP##_s32(int32x4_t a) { return ::OP##_s32(a); } \ +#define WRAP_UNARY_INT_EXCLUDING_64(OP) \ + namespace wrap { \ + inline uint8x16_t _##OP##_u8 (uint8x16_t a) { return ::OP##_u8 (a); } \ + inline int8x16_t _##OP##_s8 (int8x16_t a) { return ::OP##_s8 (a); } \ + inline uint16x8_t _##OP##_u16(uint16x8_t a) { return ::OP##_u16(a); } \ + inline int16x8_t _##OP##_s16(int16x8_t a) { return ::OP##_s16(a); } \ + inline uint32x4_t _##OP##_u32(uint32x4_t a) { return ::OP##_u32(a); } \ + inline int32x4_t _##OP##_s32(int32x4_t a) { 
return ::OP##_s32(a); } \ } -#define WRAP_UNARY_INT(OP) \ - WRAP_UNARY_INT_EXCLUDING_64(OP) \ - namespace wrap { \ - inline uint64x2_t OP##_u64(uint64x2_t a) { return ::OP##_u64(a); } \ - inline int64x2_t OP##_s64(int64x2_t a) { return ::OP##_s64(a); } \ +#define WRAP_UNARY_INT(OP) \ + WRAP_UNARY_INT_EXCLUDING_64(OP) \ + namespace wrap { \ + inline uint64x2_t _##OP##_u64(uint64x2_t a) { return ::OP##_u64(a); } \ + inline int64x2_t _##OP##_s64(int64x2_t a) { return ::OP##_s64(a); } \ } -#define WRAP_UNARY_FLOAT(OP) \ - namespace wrap { \ - inline float32x4_t OP##_f32(float32x4_t a) { return ::OP##_f32(a); } \ +#define WRAP_UNARY_FLOAT(OP) \ + namespace wrap { \ + inline float32x4_t _##OP##_f32(float32x4_t a) { return ::OP##_f32(a); } \ } // Dummy identity caster to ease coding @@ -155,6 +156,8 @@ namespace xsimd using type = uint8x16_t; }; +// MSVC uses same underlying type for all vector variants which would cause C++ function overload ambiguity +#if !defined(_WIN32) || (defined(__clang__)) template <> struct comp_return_type_impl { @@ -202,6 +205,7 @@ namespace xsimd { using type = uint32x4_t; }; +#endif template using comp_return_type = typename comp_return_type_impl::type; @@ -314,7 +318,7 @@ namespace xsimd template = 0> batch set(batch const&, requires_arch, Args... args) { - return xsimd::types::detail::neon_vector_type{args...}; + return INITIALIZER_LIST_TO_NEON_VECTOR(xsimd::types::detail::neon_vector_type, {args...}); } template = 0> @@ -322,7 +326,7 @@ namespace xsimd { using register_type = typename batch_bool::register_type; using unsigned_type = as_unsigned_integer_t; - return register_type{static_cast(args ? -1LL : 0LL)...}; + return INITIALIZER_LIST_TO_NEON_VECTOR(register_type , {static_cast(args ? -1LL : 0LL)...}); } template @@ -336,7 +340,7 @@ namespace xsimd { using register_type = typename batch_bool::register_type; using unsigned_type = as_unsigned_integer_t; - return register_type{static_cast(args ? 
-1LL : 0LL)...}; + return INITIALIZER_LIST_TO_NEON_VECTOR(register_type, {static_cast(args ? -1LL : 0LL)...}); } /************* @@ -346,13 +350,13 @@ namespace xsimd template = 0> batch from_bool(batch_bool const& arg, requires_arch) { - return vandq_u8(arg, vdupq_n_u8(1)); + return vandq_u8(arg.data, vdupq_n_u8(1)); } template = 0> batch from_bool(batch_bool const& arg, requires_arch) { - return vandq_s8(reinterpret_cast(arg.data), vdupq_n_s8(1)); + return vandq_s8(REINTERPRET_CAST(int8x16_t, arg.data), vdupq_n_s8(1)); } template = 0> @@ -364,7 +368,7 @@ namespace xsimd template = 0> batch from_bool(batch_bool const& arg, requires_arch) { - return vandq_s16(reinterpret_cast(arg.data), vdupq_n_s16(1)); + return vandq_s16(REINTERPRET_CAST(int16x8_t, arg.data), vdupq_n_s16(1)); } template = 0> @@ -376,7 +380,7 @@ namespace xsimd template = 0> batch from_bool(batch_bool const& arg, requires_arch) { - return vandq_s32(reinterpret_cast(arg.data), vdupq_n_s32(1)); + return vandq_s32(REINTERPRET_CAST(int32x4_t, arg.data), vdupq_n_s32(1)); } template = 0> @@ -388,7 +392,7 @@ namespace xsimd template = 0> batch from_bool(batch_bool const& arg, requires_arch) { - return vandq_s64(reinterpret_cast(arg.data), vdupq_n_s64(1)); + return vandq_s64(REINTERPRET_CAST(int64x2_t, arg.data), vdupq_n_s64(1)); } template @@ -630,13 +634,11 @@ namespace xsimd batch add(batch const& lhs, batch const& rhs, requires_arch) { using register_type = typename batch::register_type; - const detail::neon_dispatcher::binary dispatcher = - { - std::make_tuple(wrap::vaddq_u8, wrap::vaddq_s8, wrap::vaddq_u16, wrap::vaddq_s16, - wrap::vaddq_u32, wrap::vaddq_s32, wrap::vaddq_u64, wrap::vaddq_s64, - wrap::vaddq_f32) - }; - return dispatcher.apply(register_type(lhs), register_type(rhs)); + register_type result; + NEON_DISPATCHER_BINARY(wrap::_vaddq_u8, wrap::_vaddq_s8, wrap::_vaddq_u16, wrap::_vaddq_s16, + wrap::_vaddq_u32, wrap::_vaddq_s32, wrap::_vaddq_u64, wrap::_vaddq_s64, + wrap::_vaddq_f32, T, 
register_type(lhs), register_type(rhs), result); + return result; } /******** @@ -649,13 +651,11 @@ namespace xsimd batch sadd(batch const& lhs, batch const& rhs, requires_arch) { using register_type = typename batch::register_type; - const detail::neon_dispatcher::binary dispatcher = - { - std::make_tuple(wrap::vqaddq_u8, wrap::vqaddq_s8, wrap::vqaddq_u16, wrap::vqaddq_s16, - wrap::vqaddq_u32, wrap::vqaddq_s32, wrap::vqaddq_u64, wrap::vqaddq_s64, - wrap::vaddq_f32) - }; - return dispatcher.apply(register_type(lhs), register_type(rhs)); + register_type result; + NEON_DISPATCHER_BINARY(wrap::_vqaddq_u8, wrap::_vqaddq_s8, wrap::_vqaddq_u16, wrap::_vqaddq_s16, + wrap::_vqaddq_u32, wrap::_vqaddq_s32, wrap::_vqaddq_u64, wrap::_vqaddq_s64, + wrap::_vaddq_f32, T, register_type(lhs), register_type(rhs), result); + return result; } /******* @@ -669,13 +669,11 @@ namespace xsimd batch sub(batch const& lhs, batch const& rhs, requires_arch) { using register_type = typename batch::register_type; - const detail::neon_dispatcher::binary dispatcher = - { - std::make_tuple(wrap::vsubq_u8, wrap::vsubq_s8, wrap::vsubq_u16, wrap::vsubq_s16, - wrap::vsubq_u32, wrap::vsubq_s32, wrap::vsubq_u64, wrap::vsubq_s64, - wrap::vsubq_f32) - }; - return dispatcher.apply(register_type(lhs), register_type(rhs)); + register_type result; + NEON_DISPATCHER_BINARY(wrap::_vsubq_u8, wrap::_vsubq_s8, wrap::_vsubq_u16, wrap::_vsubq_s16, + wrap::_vsubq_u32, wrap::_vsubq_s32, wrap::_vsubq_u64, wrap::_vsubq_s64, + wrap::_vsubq_f32, T, register_type(lhs), register_type(rhs), result); + return result; } /******** @@ -688,13 +686,11 @@ namespace xsimd batch ssub(batch const& lhs, batch const& rhs, requires_arch) { using register_type = typename batch::register_type; - const detail::neon_dispatcher::binary dispatcher = - { - std::make_tuple(wrap::vqsubq_u8, wrap::vqsubq_s8, wrap::vqsubq_u16, wrap::vqsubq_s16, - wrap::vqsubq_u32, wrap::vqsubq_s32, wrap::vqsubq_u64, wrap::vqsubq_s64, - wrap::vsubq_f32) - }; - 
return dispatcher.apply(register_type(lhs), register_type(rhs)); + register_type result; + NEON_DISPATCHER_BINARY(wrap::_vqsubq_u8, wrap::_vqsubq_s8, wrap::_vqsubq_u16, wrap::_vqsubq_s16, + wrap::_vqsubq_u32, wrap::_vqsubq_s32, wrap::_vqsubq_u64, wrap::_vqsubq_s64, + wrap::_vsubq_f32, T, register_type(lhs), register_type(rhs), result); + return result; } @@ -709,12 +705,11 @@ namespace xsimd batch mul(batch const& lhs, batch const& rhs, requires_arch) { using register_type = typename batch::register_type; - const detail::excluding_int64_dispatcher::binary dispatcher = - { - std::make_tuple(wrap::vmulq_u8, wrap::vmulq_s8, wrap::vmulq_u16, wrap::vmulq_s16, - wrap::vmulq_u32, wrap::vmulq_s32, wrap::vmulq_f32) - }; - return dispatcher.apply(register_type(lhs), register_type(rhs)); + register_type result; + NEON_DISPATCHER_BINARY_EXCLUDE_64(wrap::_vmulq_u8, wrap::_vmulq_s8, wrap::_vmulq_u16, wrap::_vmulq_s16, + wrap::_vmulq_u32, wrap::_vmulq_s32, wrap::_vmulq_f32, T, + register_type(lhs), register_type(rhs), result); + return result; } /******* @@ -763,24 +758,27 @@ namespace xsimd batch_bool eq(batch const& lhs, batch const& rhs, requires_arch) { using register_type = typename batch::register_type; - const detail::excluding_int64_comp_dispatcher::binary dispatcher = - { - std::make_tuple(wrap::vceqq_u8, wrap::vceqq_s8, wrap::vceqq_u16, wrap::vceqq_s16, - wrap::vceqq_u32, wrap::vceqq_s32, wrap::vceqq_f32) - }; - return dispatcher.apply(register_type(lhs), register_type(rhs)); + register_type result; + NEON_DISPATCHER_BINARY_EXCLUDE_64(wrap::_vceqq_u8, wrap::_vceqq_s8, wrap::_vceqq_u16, wrap::_vceqq_s16, + wrap::_vceqq_u32, wrap::_vceqq_s32, wrap::_vceqq_f32, T, + register_type(lhs), register_type(rhs), result); + return result; } template = 0> batch_bool eq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) { using register_type = typename batch_bool::register_type; - using dispatcher_type = detail::neon_comp_dispatcher_impl::binary; - const dispatcher_type 
dispatcher = - { - std::make_tuple(wrap::vceqq_u8, wrap::vceqq_u16, wrap::vceqq_u32) - }; - return dispatcher.apply(register_type(lhs), register_type(rhs)); + switch(sizeof(T)){ + case 1: + return wrap::_vceqq_u8(register_type(lhs), register_type(rhs)); + case 2: + return wrap::_vceqq_u16(register_type(lhs), register_type(rhs)); + case 4: + return wrap::_vceqq_u32(register_type(lhs), register_type(rhs)); + default: + assert(false && "invalid size"); return {}; + } } template = 0> @@ -806,12 +804,11 @@ namespace xsimd batch_bool lt(batch const& lhs, batch const& rhs, requires_arch) { using register_type = typename batch::register_type; - const detail::excluding_int64_comp_dispatcher::binary dispatcher = - { - std::make_tuple(wrap::vcltq_u8, wrap::vcltq_s8, wrap::vcltq_u16, wrap::vcltq_s16, - wrap::vcltq_u32, wrap::vcltq_s32, wrap::vcltq_f32) - }; - return dispatcher.apply(register_type(lhs), register_type(rhs)); + register_type result; + NEON_DISPATCHER_BINARY_EXCLUDE_64(wrap::_vcltq_u8, wrap::_vcltq_s8, wrap::_vcltq_u16, wrap::_vcltq_s16, + wrap::_vcltq_u32, wrap::_vcltq_s32, wrap::_vcltq_f32, + T, register_type(lhs), register_type(rhs), result); + return result; } template = 0> @@ -831,12 +828,11 @@ namespace xsimd batch_bool le(batch const& lhs, batch const& rhs, requires_arch) { using register_type = typename batch::register_type; - const detail::excluding_int64_comp_dispatcher::binary dispatcher = - { - std::make_tuple(wrap::vcleq_u8, wrap::vcleq_s8, wrap::vcleq_u16, wrap::vcleq_s16, - wrap::vcleq_u32, wrap::vcleq_s32, wrap::vcleq_f32) - }; - return dispatcher.apply(register_type(lhs), register_type(rhs)); + register_type result; + NEON_DISPATCHER_BINARY_EXCLUDE_64(wrap::_vcleq_u8, wrap::_vcleq_s8, wrap::_vcleq_u16, wrap::_vcleq_s16, + wrap::_vcleq_u32, wrap::_vcleq_s32, wrap::_vcleq_f32, + T, register_type(lhs), register_type(rhs), result); + return result; } template = 0> @@ -856,12 +852,11 @@ namespace xsimd batch_bool gt(batch const& lhs, batch const& rhs, 
requires_arch) { using register_type = typename batch::register_type; - const detail::excluding_int64_comp_dispatcher::binary dispatcher = - { - std::make_tuple(wrap::vcgtq_u8, wrap::vcgtq_s8, wrap::vcgtq_u16, wrap::vcgtq_s16, - wrap::vcgtq_u32, wrap::vcgtq_s32, wrap::vcgtq_f32) - }; - return dispatcher.apply(register_type(lhs), register_type(rhs)); + register_type result; + NEON_DISPATCHER_BINARY_EXCLUDE_64(wrap::_vcgtq_u8, wrap::_vcgtq_s8, wrap::_vcgtq_u16, wrap::_vcgtq_s16, + wrap::_vcgtq_u32, wrap::_vcgtq_s32, wrap::_vcgtq_f32, + T, register_type(lhs), register_type(rhs), result); + return result; } template = 0> @@ -878,15 +873,14 @@ namespace xsimd WRAP_BINARY_FLOAT(vcgeq, detail::comp_return_type) template = 0> - batch_bool ge(batch const& lhs, batch const& rhs, requires_arch) + batch_bool ge(batch const& lhs, batch const& rhs, requires_arch) { using register_type = typename batch::register_type; - const detail::excluding_int64_comp_dispatcher::binary dispatcher = - { - std::make_tuple(wrap::vcgeq_u8, wrap::vcgeq_s8, wrap::vcgeq_u16, wrap::vcgeq_s16, - wrap::vcgeq_u32, wrap::vcgeq_s32, wrap::vcgeq_f32) - }; - return dispatcher.apply(register_type(lhs), register_type(rhs)); + register_type result; + NEON_DISPATCHER_BINARY_EXCLUDE_64(wrap::_vcgeq_u8, wrap::_vcgeq_s8, wrap::_vcgeq_u16, wrap::_vcgeq_s16, + wrap::_vcgeq_u32, wrap::_vcgeq_s32, wrap::_vcgeq_f32, T, + register_type(lhs), register_type(rhs), result); + return result; } template = 0> @@ -908,32 +902,28 @@ namespace xsimd return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(lhs), vreinterpretq_u32_f32(rhs))); } - - template - V bitwise_and_neon(V const& lhs, V const& rhs) - { - const neon_dispatcher::binary dispatcher = - { - std::make_tuple(wrap::vandq_u8, wrap::vandq_s8, wrap::vandq_u16, wrap::vandq_s16, - wrap::vandq_u32, wrap::vandq_s32, wrap::vandq_u64, wrap::vandq_s64, - bitwise_and_f32) - }; - return dispatcher.apply(lhs, rhs); - } } template = 0> batch bitwise_and(batch const& lhs, 
batch const& rhs, requires_arch) { using register_type = typename batch::register_type; - return detail::bitwise_and_neon(register_type(lhs), register_type(rhs)); + register_type result; + NEON_DISPATCHER_BINARY(wrap::_vandq_u8, wrap::_vandq_s8, wrap::_vandq_u16, wrap::_vandq_s16, + wrap::_vandq_u32, wrap::_vandq_s32, wrap::_vandq_u64, wrap::_vandq_s64, + detail::bitwise_and_f32, T, register_type(lhs), register_type(rhs), result); + return result; } template = 0> batch_bool bitwise_and(batch_bool const& lhs, batch_bool const& rhs, requires_arch) { using register_type = typename batch_bool::register_type; - return detail::bitwise_and_neon(register_type(lhs), register_type(rhs)); + register_type result; + NEON_DISPATCHER_BINARY(wrap::_vandq_u8, wrap::_vandq_s8, wrap::_vandq_u16, wrap::_vandq_s16, + wrap::_vandq_u32, wrap::_vandq_s32, wrap::_vandq_u64, wrap::_vandq_s64, + detail::bitwise_and_f32, T, register_type(lhs), register_type(rhs), result); + return result; } /************** @@ -949,32 +939,28 @@ namespace xsimd return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(lhs), vreinterpretq_u32_f32(rhs))); } - - template - V bitwise_or_neon(V const& lhs, V const& rhs) - { - const neon_dispatcher::binary dispatcher = - { - std::make_tuple(wrap::vorrq_u8, wrap::vorrq_s8, wrap::vorrq_u16, wrap::vorrq_s16, - wrap::vorrq_u32, wrap::vorrq_s32, wrap::vorrq_u64, wrap::vorrq_s64, - bitwise_or_f32) - }; - return dispatcher.apply(lhs, rhs); - } } template = 0> batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) { using register_type = typename batch::register_type; - return detail::bitwise_or_neon(register_type(lhs), register_type(rhs)); + register_type result; + NEON_DISPATCHER_BINARY(wrap::_vorrq_u8, wrap::_vorrq_s8, wrap::_vorrq_u16, wrap::_vorrq_s16, + wrap::_vorrq_u32, wrap::_vorrq_s32, wrap::_vorrq_u64, wrap::_vorrq_s64, + detail::bitwise_or_f32, T, register_type(lhs), register_type(rhs), result); + return result; } template = 0> batch_bool 
bitwise_or(batch_bool const& lhs, batch_bool const& rhs, requires_arch) { using register_type = typename batch_bool::register_type; - return detail::bitwise_or_neon(register_type(lhs), register_type(rhs)); + register_type result; + NEON_DISPATCHER_BINARY(wrap::_vorrq_u8, wrap::_vorrq_s8, wrap::_vorrq_u16, wrap::_vorrq_s16, + wrap::_vorrq_u32, wrap::_vorrq_s32, wrap::_vorrq_u64, wrap::_vorrq_s64, + detail::bitwise_or_f32, T, register_type(lhs), register_type(rhs), result); + return result; } /*************** @@ -991,31 +977,38 @@ namespace xsimd vreinterpretq_u32_f32(rhs))); } - template - V bitwise_xor_neon(V const& lhs, V const& rhs) - { - const neon_dispatcher::binary dispatcher = - { - std::make_tuple(wrap::veorq_u8, wrap::veorq_s8, wrap::veorq_u16, wrap::veorq_s16, - wrap::veorq_u32, wrap::veorq_s32, wrap::veorq_u64, wrap::veorq_s64, - bitwise_xor_f32) - }; - return dispatcher.apply(lhs, rhs); + template + batch bitwise_xor_neon(batch const& lhs, batch const& rhs) + { + using register_type = typename batch::register_type; + register_type result; + NEON_DISPATCHER_BINARY(wrap::_veorq_u8, wrap::_veorq_s8, wrap::_veorq_u16, wrap::_veorq_s16, + wrap::_veorq_u32, wrap::_veorq_s32, wrap::_veorq_u64, wrap::_veorq_s64, + detail::bitwise_xor_f32, T, register_type(lhs), register_type(rhs), result); + return result; } } template = 0> batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) { - using register_type = typename batch::register_type; - return detail::bitwise_xor_neon(register_type(lhs), register_type(rhs)); + using register_type = typename batch::register_type; + register_type result; + NEON_DISPATCHER_BINARY(wrap::_veorq_u8, wrap::_veorq_s8, wrap::_veorq_u16, wrap::_veorq_s16, + wrap::_veorq_u32, wrap::_veorq_s32, wrap::_veorq_u64, wrap::_veorq_s64, + detail::bitwise_xor_f32, T, register_type(lhs), register_type(rhs), result); + return result; } template = 0> batch_bool bitwise_xor(batch_bool const& lhs, batch_bool const& rhs, 
requires_arch) { using register_type = typename batch_bool::register_type; - return detail::bitwise_xor_neon(register_type(lhs), register_type(rhs)); + register_type result; + NEON_DISPATCHER_BINARY(wrap::_veorq_u8, wrap::_veorq_s8, wrap::_veorq_u16, wrap::_veorq_s16, + wrap::_veorq_u32, wrap::_veorq_s32, wrap::_veorq_u64, wrap::_veorq_s64, + detail::bitwise_xor_f32, T, register_type(lhs), register_type(rhs), result); + return result; } /******* @@ -1051,17 +1044,16 @@ namespace xsimd return vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(arg))); } - template - V bitwise_not_neon(V const& arg) - { - const neon_dispatcher::unary dispatcher = - { - std::make_tuple(wrap::vmvnq_u8, wrap::vmvnq_s8, wrap::vmvnq_u16, wrap::vmvnq_s16, - wrap::vmvnq_u32, wrap::vmvnq_s32, - bitwise_not_u64, bitwise_not_s64, - bitwise_not_f32) - }; - return dispatcher.apply(arg); + template + batch bitwise_not_neon(batch const& arg) + { + using register_type = typename batch::register_type; + register_type result; + NEON_DISPATCHER_UNARY(wrap::_vmvnq_u8, wrap::_vmvnq_s8, wrap::_vmvnq_u16, wrap::_vmvnq_s16, + wrap::_vmvnq_u32, wrap::_vmvnq_s32, + bitwise_not_u64, bitwise_not_s64, + bitwise_not_f32, T, register_type(arg), result); + return result; } } @@ -1069,14 +1061,24 @@ namespace xsimd batch bitwise_not(batch const& arg, requires_arch) { using register_type = typename batch::register_type; - return detail::bitwise_not_neon(register_type(arg)); + register_type result; + NEON_DISPATCHER_UNARY(wrap::_vmvnq_u8, wrap::_vmvnq_s8, wrap::_vmvnq_u16, wrap::_vmvnq_s16, + wrap::_vmvnq_u32, wrap::_vmvnq_s32, + detail::bitwise_not_u64, detail::bitwise_not_s64, + detail::bitwise_not_f32, T, register_type(arg), result); + return result; } template = 0> batch_bool bitwise_not(batch_bool const& arg, requires_arch) { using register_type = typename batch_bool::register_type; - return detail::bitwise_not_neon(register_type(arg)); + register_type result; + NEON_DISPATCHER_UNARY(wrap::_vmvnq_u8, 
wrap::_vmvnq_s8, wrap::_vmvnq_u16, wrap::_vmvnq_s16, + wrap::_vmvnq_u32, wrap::_vmvnq_s32, + detail::bitwise_not_u64, detail::bitwise_not_s64, + detail::bitwise_not_f32, T, register_type(arg), result); + return result; } /****************** @@ -1091,32 +1093,28 @@ namespace xsimd { return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(lhs), vreinterpretq_u32_f32(rhs))); } - - template - V bitwise_andnot_neon(V const& lhs, V const& rhs) - { - const detail::neon_dispatcher::binary dispatcher = - { - std::make_tuple(wrap::vbicq_u8, wrap::vbicq_s8, wrap::vbicq_u16, wrap::vbicq_s16, - wrap::vbicq_u32, wrap::vbicq_s32, wrap::vbicq_u64, wrap::vbicq_s64, - bitwise_andnot_f32) - }; - return dispatcher.apply(lhs, rhs); - } } template = 0> batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) { using register_type = typename batch::register_type; - return detail::bitwise_andnot_neon(register_type(lhs), register_type(rhs)); + register_type result; + NEON_DISPATCHER_BINARY(wrap::_vbicq_u8, wrap::_vbicq_s8, wrap::_vbicq_u16, wrap::_vbicq_s16, + wrap::_vbicq_u32, wrap::_vbicq_s32, wrap::_vbicq_u64, wrap::_vbicq_s64, + detail::bitwise_andnot_f32, T, register_type(lhs), register_type(rhs), result); + return result; } template = 0> batch_bool bitwise_andnot(batch_bool const& lhs, batch_bool const& rhs, requires_arch) { using register_type = typename batch_bool::register_type; - return detail::bitwise_andnot_neon(register_type(lhs), register_type(rhs)); + register_type result; + NEON_DISPATCHER_BINARY(wrap::_vbicq_u8, wrap::_vbicq_s8, wrap::_vbicq_u16, wrap::_vbicq_s16, + wrap::_vbicq_u32, wrap::_vbicq_s32, wrap::_vbicq_u64, wrap::_vbicq_s64, + detail::bitwise_andnot_f32, T, register_type(lhs), register_type(rhs), result); + return result; } /******* @@ -1130,12 +1128,11 @@ namespace xsimd batch min(batch const& lhs, batch const& rhs, requires_arch) { using register_type = typename batch::register_type; - const detail::excluding_int64_dispatcher::binary 
dispatcher = - { - std::make_tuple(wrap::vminq_u8, wrap::vminq_s8, wrap::vminq_u16, wrap::vminq_s16, - wrap::vminq_u32, wrap::vminq_s32, wrap::vminq_f32) - }; - return dispatcher.apply(register_type(lhs), register_type(rhs)); + register_type result; + NEON_DISPATCHER_BINARY_EXCLUDE_64(wrap::_vminq_u8, wrap::_vminq_s8, wrap::_vminq_u16, wrap::_vminq_s16, + wrap::_vminq_u32, wrap::_vminq_s32, wrap::_vminq_f32, T, + register_type(lhs), register_type(rhs), result); + return result; } template = 0> @@ -1155,12 +1152,11 @@ namespace xsimd batch max(batch const& lhs, batch const& rhs, requires_arch) { using register_type = typename batch::register_type; - const detail::excluding_int64_dispatcher::binary dispatcher = - { - std::make_tuple(wrap::vmaxq_u8, wrap::vmaxq_s8, wrap::vmaxq_u16, wrap::vmaxq_s16, - wrap::vmaxq_u32, wrap::vmaxq_s32, wrap::vmaxq_f32) - }; - return dispatcher.apply(register_type(lhs), register_type(rhs)); + register_type result; + NEON_DISPATCHER_BINARY_EXCLUDE_64(wrap::_vmaxq_u8, wrap::_vmaxq_s8, wrap::_vmaxq_u16, wrap::_vmaxq_s16, + wrap::_vmaxq_u32, wrap::_vmaxq_s32, wrap::_vmaxq_f32, T, + register_type(lhs), register_type(rhs), result); + return result; } template = 0> @@ -1174,9 +1170,9 @@ namespace xsimd *******/ namespace wrap { - inline int8x16_t vabsq_s8 (int8x16_t a) { return ::vabsq_s8 (a); } - inline int16x8_t vabsq_s16(int16x8_t a) { return ::vabsq_s16(a); } - inline int32x4_t vabsq_s32(int32x4_t a) { return ::vabsq_s32(a); } + inline int8x16_t _vabsq_s8 (int8x16_t a) { return vabsq_s8 (a); } + inline int16x8_t _vabsq_s16(int16x8_t a) { return vabsq_s16(a); } + inline int32x4_t _vabsq_s32(int32x4_t a) { return vabsq_s32(a); } } WRAP_UNARY_FLOAT(vabsq) @@ -1202,12 +1198,11 @@ namespace xsimd batch abs(batch const& arg, requires_arch) { using register_type = typename batch::register_type; - const detail::excluding_int64_dispatcher::unary dispatcher = - { - std::make_tuple(detail::abs_u8, wrap::vabsq_s8, detail::abs_u16, wrap::vabsq_s16, - 
detail::abs_u32, wrap::vabsq_s32, wrap::vabsq_f32) - }; - return dispatcher.apply(register_type(arg)); + register_type result; + NEON_DISPATCHER_UNARY_EXCLUDE_64(detail::abs_u8, wrap::_vabsq_s8, detail::abs_u16, wrap::_vabsq_s16, + detail::abs_u32, wrap::_vabsq_s32, wrap::_vabsq_f32, T, register_type(arg), + result); + return result; } /******** @@ -1349,15 +1344,15 @@ namespace xsimd **********/ namespace wrap { - inline uint8x16_t vbslq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c) { return ::vbslq_u8 (a, b, c); } - inline int8x16_t vbslq_s8 (uint8x16_t a, int8x16_t b, int8x16_t c) { return ::vbslq_s8 (a, b, c); } - inline uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) { return ::vbslq_u16(a, b, c); } - inline int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c) { return ::vbslq_s16(a, b, c); } - inline uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) { return ::vbslq_u32(a, b, c); } - inline int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c) { return ::vbslq_s32(a, b, c); } - inline uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c) { return ::vbslq_u64(a, b, c); } - inline int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c) { return ::vbslq_s64(a, b, c); } - inline float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) { return ::vbslq_f32(a, b, c); } + inline uint8x16_t _vbslq_u8 (uint8x16_t a, uint8x16_t b, uint8x16_t c) { return ::vbslq_u8 (a, b, c); } + inline int8x16_t _vbslq_s8 (uint8x16_t a, int8x16_t b, int8x16_t c) { return ::vbslq_s8 (a, b, c); } + inline uint16x8_t _vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) { return ::vbslq_u16(a, b, c); } + inline int16x8_t _vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c) { return ::vbslq_s16(a, b, c); } + inline uint32x4_t _vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) { return ::vbslq_u32(a, b, c); } + inline int32x4_t _vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c) { return ::vbslq_s32(a, b, c); } + inline uint64x2_t 
_vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c) { return ::vbslq_u64(a, b, c); } + inline int64x2_t _vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c) { return ::vbslq_s64(a, b, c); } + inline float32x4_t _vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) { return ::vbslq_f32(a, b, c); } } namespace detail @@ -1389,13 +1384,12 @@ namespace xsimd { using bool_register_type = typename batch_bool::register_type; using register_type = typename batch::register_type; - const detail::neon_select_dispatcher dispatcher = - { - std::make_tuple(wrap::vbslq_u8, wrap::vbslq_s8, wrap::vbslq_u16, wrap::vbslq_s16, - wrap::vbslq_u32, wrap::vbslq_s32, wrap::vbslq_u64, wrap::vbslq_s64, - wrap::vbslq_f32) - }; - return dispatcher.apply(bool_register_type(cond), register_type(a), register_type(b)); + register_type result; + NEON_DISPATCHER_SELECT(wrap::_vbslq_u8, wrap::_vbslq_s8, wrap::_vbslq_u16, wrap::_vbslq_s16, + wrap::_vbslq_u32, wrap::_vbslq_s32, wrap::_vbslq_u64, wrap::_vbslq_s64, + wrap::_vbslq_f32, T, bool_register_type(cond), register_type(a), + register_type(b), result); + return result; } template = 0> @@ -2130,15 +2124,15 @@ namespace xsimd #define WRAP_CAST(SUFFIX, TYPE) \ namespace wrap { \ - inline TYPE vreinterpretq_##SUFFIX##_u8(uint8x16_t a) { return ::vreinterpretq_##SUFFIX##_u8 (a); } \ - inline TYPE vreinterpretq_##SUFFIX##_s8(int8x16_t a) { return ::vreinterpretq_##SUFFIX##_s8 (a); } \ - inline TYPE vreinterpretq_##SUFFIX##_u16(uint16x8_t a) { return ::vreinterpretq_##SUFFIX##_u16(a); } \ - inline TYPE vreinterpretq_##SUFFIX##_s16(int16x8_t a) { return ::vreinterpretq_##SUFFIX##_s16(a); } \ - inline TYPE vreinterpretq_##SUFFIX##_u32(uint32x4_t a) { return ::vreinterpretq_##SUFFIX##_u32(a); } \ - inline TYPE vreinterpretq_##SUFFIX##_s32(int32x4_t a) { return ::vreinterpretq_##SUFFIX##_s32(a); } \ - inline TYPE vreinterpretq_##SUFFIX##_u64(uint64x2_t a) { return ::vreinterpretq_##SUFFIX##_u64(a); } \ - inline TYPE vreinterpretq_##SUFFIX##_s64(int64x2_t a) 
{ return ::vreinterpretq_##SUFFIX##_s64(a); } \ - inline TYPE vreinterpretq_##SUFFIX##_f32(float32x4_t a) { return ::vreinterpretq_##SUFFIX##_f32(a); } \ + inline TYPE _vreinterpretq_##SUFFIX##_u8(uint8x16_t a) { return vreinterpretq_##SUFFIX##_u8 (a); } \ + inline TYPE _vreinterpretq_##SUFFIX##_s8(int8x16_t a) { return vreinterpretq_##SUFFIX##_s8 (a); } \ + inline TYPE _vreinterpretq_##SUFFIX##_u16(uint16x8_t a) { return vreinterpretq_##SUFFIX##_u16(a); } \ + inline TYPE _vreinterpretq_##SUFFIX##_s16(int16x8_t a) { return vreinterpretq_##SUFFIX##_s16(a); } \ + inline TYPE _vreinterpretq_##SUFFIX##_u32(uint32x4_t a) { return vreinterpretq_##SUFFIX##_u32(a); } \ + inline TYPE _vreinterpretq_##SUFFIX##_s32(int32x4_t a) { return vreinterpretq_##SUFFIX##_s32(a); } \ + inline TYPE _vreinterpretq_##SUFFIX##_u64(uint64x2_t a) { return vreinterpretq_##SUFFIX##_u64(a); } \ + inline TYPE _vreinterpretq_##SUFFIX##_s64(int64x2_t a) { return vreinterpretq_##SUFFIX##_s64(a); } \ + inline TYPE _vreinterpretq_##SUFFIX##_f32(float32x4_t a) { return vreinterpretq_##SUFFIX##_f32(a); } \ } WRAP_CAST(u8, uint8x16_t) @@ -2212,33 +2206,33 @@ namespace xsimd { const detail::neon_bitwise_caster caster = { std::make_tuple( - detail::make_bitwise_caster_impl(wrap::vreinterpretq_u8_u8, wrap::vreinterpretq_u8_s8, wrap::vreinterpretq_u8_u16, wrap::vreinterpretq_u8_s16, - wrap::vreinterpretq_u8_u32, wrap::vreinterpretq_u8_s32, wrap::vreinterpretq_u8_u64, wrap::vreinterpretq_u8_s64, - wrap::vreinterpretq_u8_f32), - detail::make_bitwise_caster_impl(wrap::vreinterpretq_s8_u8, wrap::vreinterpretq_s8_s8, wrap::vreinterpretq_s8_u16, wrap::vreinterpretq_s8_s16, - wrap::vreinterpretq_s8_u32, wrap::vreinterpretq_s8_s32, wrap::vreinterpretq_s8_u64, wrap::vreinterpretq_s8_s64, - wrap::vreinterpretq_s8_f32), - detail::make_bitwise_caster_impl(wrap::vreinterpretq_u16_u8, wrap::vreinterpretq_u16_s8, wrap::vreinterpretq_u16_u16, wrap::vreinterpretq_u16_s16, - wrap::vreinterpretq_u16_u32, 
wrap::vreinterpretq_u16_s32, wrap::vreinterpretq_u16_u64, wrap::vreinterpretq_u16_s64, - wrap::vreinterpretq_u16_f32), - detail::make_bitwise_caster_impl(wrap::vreinterpretq_s16_u8, wrap::vreinterpretq_s16_s8, wrap::vreinterpretq_s16_u16, wrap::vreinterpretq_s16_s16, - wrap::vreinterpretq_s16_u32, wrap::vreinterpretq_s16_s32, wrap::vreinterpretq_s16_u64, wrap::vreinterpretq_s16_s64, - wrap::vreinterpretq_s16_f32), - detail::make_bitwise_caster_impl(wrap::vreinterpretq_u32_u8, wrap::vreinterpretq_u32_s8, wrap::vreinterpretq_u32_u16, wrap::vreinterpretq_u32_s16, - wrap::vreinterpretq_u32_u32, wrap::vreinterpretq_u32_s32, wrap::vreinterpretq_u32_u64, wrap::vreinterpretq_u32_s64, - wrap::vreinterpretq_u32_f32), - detail::make_bitwise_caster_impl(wrap::vreinterpretq_s32_u8, wrap::vreinterpretq_s32_s8, wrap::vreinterpretq_s32_u16, wrap::vreinterpretq_s32_s16, - wrap::vreinterpretq_s32_u32, wrap::vreinterpretq_s32_s32, wrap::vreinterpretq_s32_u64, wrap::vreinterpretq_s32_s64, - wrap::vreinterpretq_s32_f32), - detail::make_bitwise_caster_impl(wrap::vreinterpretq_u64_u8, wrap::vreinterpretq_u64_s8, wrap::vreinterpretq_u64_u16, wrap::vreinterpretq_u64_s16, - wrap::vreinterpretq_u64_u32, wrap::vreinterpretq_u64_s32, wrap::vreinterpretq_u64_u64, wrap::vreinterpretq_u64_s64, - wrap::vreinterpretq_u64_f32), - detail::make_bitwise_caster_impl(wrap::vreinterpretq_s64_u8, wrap::vreinterpretq_s64_s8, wrap::vreinterpretq_s64_u16, wrap::vreinterpretq_s64_s16, - wrap::vreinterpretq_s64_u32, wrap::vreinterpretq_s64_s32, wrap::vreinterpretq_s64_u64, wrap::vreinterpretq_s64_s64, - wrap::vreinterpretq_s64_f32), - detail::make_bitwise_caster_impl(wrap::vreinterpretq_f32_u8, wrap::vreinterpretq_f32_s8, wrap::vreinterpretq_f32_u16, wrap::vreinterpretq_f32_s16, - wrap::vreinterpretq_f32_u32, wrap::vreinterpretq_f32_s32, wrap::vreinterpretq_f32_u64, wrap::vreinterpretq_f32_s64, - wrap::vreinterpretq_f32_f32)) + detail::make_bitwise_caster_impl(wrap::_vreinterpretq_u8_u8, 
wrap::_vreinterpretq_u8_s8, wrap::_vreinterpretq_u8_u16, wrap::_vreinterpretq_u8_s16, + wrap::_vreinterpretq_u8_u32, wrap::_vreinterpretq_u8_s32, wrap::_vreinterpretq_u8_u64, wrap::_vreinterpretq_u8_s64, + wrap::_vreinterpretq_u8_f32), + detail::make_bitwise_caster_impl(wrap::_vreinterpretq_s8_u8, wrap::_vreinterpretq_s8_s8, wrap::_vreinterpretq_s8_u16, wrap::_vreinterpretq_s8_s16, + wrap::_vreinterpretq_s8_u32, wrap::_vreinterpretq_s8_s32, wrap::_vreinterpretq_s8_u64, wrap::_vreinterpretq_s8_s64, + wrap::_vreinterpretq_s8_f32), + detail::make_bitwise_caster_impl(wrap::_vreinterpretq_u16_u8, wrap::_vreinterpretq_u16_s8, wrap::_vreinterpretq_u16_u16, wrap::_vreinterpretq_u16_s16, + wrap::_vreinterpretq_u16_u32, wrap::_vreinterpretq_u16_s32, wrap::_vreinterpretq_u16_u64, wrap::_vreinterpretq_u16_s64, + wrap::_vreinterpretq_u16_f32), + detail::make_bitwise_caster_impl(wrap::_vreinterpretq_s16_u8, wrap::_vreinterpretq_s16_s8, wrap::_vreinterpretq_s16_u16, wrap::_vreinterpretq_s16_s16, + wrap::_vreinterpretq_s16_u32, wrap::_vreinterpretq_s16_s32, wrap::_vreinterpretq_s16_u64, wrap::_vreinterpretq_s16_s64, + wrap::_vreinterpretq_s16_f32), + detail::make_bitwise_caster_impl(wrap::_vreinterpretq_u32_u8, wrap::_vreinterpretq_u32_s8, wrap::_vreinterpretq_u32_u16, wrap::_vreinterpretq_u32_s16, + wrap::_vreinterpretq_u32_u32, wrap::_vreinterpretq_u32_s32, wrap::_vreinterpretq_u32_u64, wrap::_vreinterpretq_u32_s64, + wrap::_vreinterpretq_u32_f32), + detail::make_bitwise_caster_impl(wrap::_vreinterpretq_s32_u8, wrap::_vreinterpretq_s32_s8, wrap::_vreinterpretq_s32_u16, wrap::_vreinterpretq_s32_s16, + wrap::_vreinterpretq_s32_u32, wrap::_vreinterpretq_s32_s32, wrap::_vreinterpretq_s32_u64, wrap::_vreinterpretq_s32_s64, + wrap::_vreinterpretq_s32_f32), + detail::make_bitwise_caster_impl(wrap::_vreinterpretq_u64_u8, wrap::_vreinterpretq_u64_s8, wrap::_vreinterpretq_u64_u16, wrap::_vreinterpretq_u64_s16, + wrap::_vreinterpretq_u64_u32, wrap::_vreinterpretq_u64_s32, 
wrap::_vreinterpretq_u64_u64, wrap::_vreinterpretq_u64_s64, + wrap::_vreinterpretq_u64_f32), + detail::make_bitwise_caster_impl(wrap::_vreinterpretq_s64_u8, wrap::_vreinterpretq_s64_s8, wrap::_vreinterpretq_s64_u16, wrap::_vreinterpretq_s64_s16, + wrap::_vreinterpretq_s64_u32, wrap::_vreinterpretq_s64_s32, wrap::_vreinterpretq_s64_u64, wrap::_vreinterpretq_s64_s64, + wrap::_vreinterpretq_s64_f32), + detail::make_bitwise_caster_impl(wrap::_vreinterpretq_f32_u8, wrap::_vreinterpretq_f32_s8, wrap::_vreinterpretq_f32_u16, wrap::_vreinterpretq_f32_s16, + wrap::_vreinterpretq_f32_u32, wrap::_vreinterpretq_f32_s32, wrap::_vreinterpretq_f32_u64, wrap::_vreinterpretq_f32_s64, + wrap::_vreinterpretq_f32_f32)) }; using src_register_type = typename batch::register_type; using dst_register_type = typename batch::register_type; diff --git a/include/xsimd/arch/xsimd_neon64.hpp b/include/xsimd/arch/xsimd_neon64.hpp index f63254adf..a958c9c29 100644 --- a/include/xsimd/arch/xsimd_neon64.hpp +++ b/include/xsimd/arch/xsimd_neon64.hpp @@ -18,6 +18,7 @@ #include "../types/xsimd_neon64_register.hpp" #include "../types/xsimd_utils.hpp" +#include "xsimd_neon_dispatcher.hpp" namespace xsimd { @@ -779,8 +780,8 @@ namespace xsimd #define WRAP_CAST(SUFFIX, TYPE) \ namespace wrap { \ - inline float64x2_t vreinterpretq_f64_##SUFFIX(TYPE a) { return ::vreinterpretq_f64_##SUFFIX(a); } \ - inline TYPE vreinterpretq_##SUFFIX##_f64(float64x2_t a) { return ::vreinterpretq_##SUFFIX##_f64(a); } \ + inline float64x2_t _vreinterpretq_f64_##SUFFIX(TYPE a) { return vreinterpretq_f64_##SUFFIX(a); } \ + inline TYPE _vreinterpretq_##SUFFIX##_f64(float64x2_t a) { return vreinterpretq_##SUFFIX##_f64(a); } \ } WRAP_CAST(u8, uint8x16_t) @@ -798,19 +799,13 @@ namespace xsimd template batch bitwise_cast(batch const& arg, batch const&, requires_arch) { - using caster_type = detail::bitwise_caster_impl; - const caster_type caster = { - std::make_tuple(wrap::vreinterpretq_f64_u8, wrap::vreinterpretq_f64_s8, 
wrap::vreinterpretq_f64_u16, wrap::vreinterpretq_f64_s16, - wrap::vreinterpretq_f64_u32, wrap::vreinterpretq_f64_s32, wrap::vreinterpretq_f64_u64, wrap::vreinterpretq_f64_s64, - wrap::vreinterpretq_f64_f32) - }; using register_type = typename batch::register_type; - return caster.apply(register_type(arg)); + register_type result; + NEON_DISPATCHER_UNARY(wrap::_vreinterpretq_f64_u8, wrap::_vreinterpretq_f64_s8, wrap::_vreinterpretq_f64_u16, + wrap::_vreinterpretq_f64_s16, wrap::_vreinterpretq_f64_u32, wrap::_vreinterpretq_f64_s32, + wrap::_vreinterpretq_f64_u64, wrap::_vreinterpretq_f64_s64, wrap::_vreinterpretq_f64_f32, + T, register_type(arg), result); + return result; } namespace detail @@ -834,20 +829,13 @@ namespace xsimd template batch bitwise_cast(batch const& arg, batch const&, requires_arch) { - using caster_type = detail::bitwise_caster_neon64; - const caster_type caster = { - std::make_tuple(wrap::vreinterpretq_u8_f64, wrap::vreinterpretq_s8_f64, wrap::vreinterpretq_u16_f64, wrap::vreinterpretq_s16_f64, - wrap::vreinterpretq_u32_f64, wrap::vreinterpretq_s32_f64, wrap::vreinterpretq_u64_f64, wrap::vreinterpretq_s64_f64, - wrap::vreinterpretq_f32_f64) - }; using src_register_type = typename batch::register_type; using dst_register_type = typename batch::register_type; - return caster.apply(src_register_type(arg)); + src_register_type result; + NEON_DISPATCHER_UNARY(wrap::_vreinterpretq_u8_f64, wrap::_vreinterpretq_s8_f64, wrap::_vreinterpretq_u16_f64, wrap::_vreinterpretq_s16_f64, + wrap::_vreinterpretq_u32_f64, wrap::_vreinterpretq_s32_f64, wrap::_vreinterpretq_u64_f64, wrap::_vreinterpretq_s64_f64, + wrap::_vreinterpretq_f32_f64, R, src_register_type(arg), result); + return dst_register_type(result); } template diff --git a/include/xsimd/arch/xsimd_neon_dispatcher.hpp b/include/xsimd/arch/xsimd_neon_dispatcher.hpp new file mode 100644 index 000000000..cd9cf4e55 --- /dev/null +++ b/include/xsimd/arch/xsimd_neon_dispatcher.hpp @@ -0,0 +1,119 @@ 
+/*************************************************************************** +* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * +* Martin Renou * +* Copyright (c) QuantStack * +* Copyright (c) Serge Guelton * +* * +* Distributed under the terms of the BSD 3-Clause License. * +* * +* The full license is in the file LICENSE, distributed with this software. * +****************************************************************************/ + +// Few macros to select neon intrinsic function based on the scalar type +#define NEON_DISPATCHER_BINARY(U8, S8, U16, S16, U32, S32, U64, S64, F32, type, arg1, arg2, result)\ + if (std::is_same::value) {\ + result = U8(arg1, arg2);\ + } else if(std::is_same::value) {\ + result = S8(arg1, arg2);\ + } else if(std::is_same::value) {\ + result = U16(arg1, arg2);\ + } else if(std::is_same::value) {\ + result = S16(arg1, arg2);\ + } else if(std::is_same::value) {\ + result = U32(arg1, arg2);\ + } else if(std::is_same::value) {\ + result = S32(arg1, arg2);\ + } else if(std::is_same::value) {\ + result = U64(arg1, arg2);\ + } else if(std::is_same::value) {\ + result = S64(arg1, arg2);\ + } else if(std::is_same::value) {\ + result = F32(arg1, arg2);\ + } else {\ + assert(false && "unsupported type");\ + } + +#define NEON_DISPATCHER_BINARY_EXCLUDE_64(U8, S8, U16, S16, U32, S32, F32, type, arg1, arg2, result)\ + if (std::is_same::value) {\ + result = U8(arg1, arg2);\ + } else if(std::is_same::value) {\ + result = S8(arg1, arg2);\ + } else if(std::is_same::value) {\ + result = U16(arg1, arg2);\ + } else if(std::is_same::value) {\ + result = S16(arg1, arg2);\ + } else if(std::is_same::value) {\ + result = U32(arg1, arg2);\ + } else if(std::is_same::value) {\ + result = S32(arg1, arg2);\ + } else if(std::is_same::value) {\ + result = F32(arg1, arg2);\ + } else {\ + assert(false && "unsupported type");\ + } + +#define NEON_DISPATCHER_UNARY(U8, S8, U16, S16, U32, S32, U64, S64, F32, type, arg, result)\ + if (std::is_same::value) 
{\ + result = U8(arg);\ + } else if(std::is_same::value) {\ + result = S8(arg);\ + } else if(std::is_same::value) {\ + result = U16(arg);\ + } else if(std::is_same::value) {\ + result = S16(arg);\ + } else if(std::is_same::value) {\ + result = U32(arg);\ + } else if(std::is_same::value) {\ + result = S32(arg);\ + } else if(std::is_same::value) {\ + result = U64(arg);\ + } else if(std::is_same::value) {\ + result = S64(arg);\ + } else if(std::is_same::value) {\ + result = F32(arg);\ + } else {\ + assert(false && "unsupported type");\ + } + +#define NEON_DISPATCHER_UNARY_EXCLUDE_64(U8, S8, U16, S16, U32, S32, F32, type, arg, result)\ + if (std::is_same::value) {\ + result = U8(arg);\ + } else if(std::is_same::value) {\ + result = S8(arg);\ + } else if(std::is_same::value) {\ + result = U16(arg);\ + } else if(std::is_same::value) {\ + result = S16(arg);\ + } else if(std::is_same::value) {\ + result = U32(arg);\ + } else if(std::is_same::value) {\ + result = S32(arg);\ + } else if(std::is_same::value) {\ + result = F32(arg);\ + } else {\ + assert(false && "unsupported type");\ + } + +#define NEON_DISPATCHER_SELECT(U8, S8, U16, S16, U32, S32, U64, S64, F32, type, cond, arg1, arg2, result)\ + if (std::is_same::value) {\ + result = U8(cond, arg1, arg2);\ + } else if(std::is_same::value) {\ + result = S8(cond, arg1, arg2);\ + } else if(std::is_same::value) {\ + result = U16(cond, arg1, arg2);\ + } else if(std::is_same::value) {\ + result = S16(cond, arg1, arg2);\ + } else if(std::is_same::value) {\ + result = U32(cond, arg1, arg2);\ + } else if(std::is_same::value) {\ + result = S32(cond, arg1, arg2);\ + } else if(std::is_same::value) {\ + result = U64(cond, arg1, arg2);\ + } else if(std::is_same::value) {\ + result = S64(cond, arg1, arg2);\ + } else if(std::is_same::value) {\ + result = F32(cond, arg1, arg2);\ + } else {\ + assert(false && "unsupported type");\ + } + diff --git a/include/xsimd/config/xsimd_config.hpp b/include/xsimd/config/xsimd_config.hpp index 
a65726365..fac02d068 100644 --- a/include/xsimd/config/xsimd_config.hpp +++ b/include/xsimd/config/xsimd_config.hpp @@ -213,6 +213,9 @@ #else #define XSIMD_WITH_NEON64 0 #endif +#elif defined(_MSC_VER) && defined(_M_ARM64) + #define XSIMD_WITH_NEON 1 + #define XSIMD_WITH_NEON64 1 #else #define XSIMD_WITH_NEON 0 #define XSIMD_WITH_NEON64 0 diff --git a/include/xsimd/types/xsimd_batch.hpp b/include/xsimd/types/xsimd_batch.hpp index f0481de87..7366100c2 100644 --- a/include/xsimd/types/xsimd_batch.hpp +++ b/include/xsimd/types/xsimd_batch.hpp @@ -183,7 +183,7 @@ namespace xsimd private: template - batch(T const* data, detail::index_sequence); + batch(T const* data, xsimd::detail::index_sequence); batch logical_and(batch const& other) const; batch logical_or(batch const& other) const; @@ -242,13 +242,13 @@ namespace xsimd private: template - batch_bool(bool const* data, detail::index_sequence); + batch_bool(bool const* data, xsimd::detail::index_sequence); template - static register_type make_register(detail::index_sequence, U u, V... v); + static register_type make_register(xsimd::detail::index_sequence, U u, V... v); template - static register_type make_register(detail::index_sequence<>, V... v); + static register_type make_register(xsimd::detail::index_sequence<>, V... v); }; template diff --git a/include/xsimd/types/xsimd_neon_register.hpp b/include/xsimd/types/xsimd_neon_register.hpp index 43a7db442..6e522f5b4 100644 --- a/include/xsimd/types/xsimd_neon_register.hpp +++ b/include/xsimd/types/xsimd_neon_register.hpp @@ -150,6 +150,37 @@ namespace xsimd : detail::neon_bool_simd_register { }; + + // Few macros and function to support MSVC + #if defined(_MSC_VER) && !defined(__clang__) + #define INITIALIZER_LIST_TO_NEON_VECTOR(T, args) (neon_vector_initializer_constructor(args)) + // Convert an initialiser list to neon vector type + // Note: MSVC does not provide a initialiser_list constructor for neon vector type. 
+ template + S neon_vector_initializer_constructor(std::initializer_list data){ + S target; + if (std::is_signed::value) { + switch(data.size()) { + case 16: std::copy(data.begin(), data.end(), target.n128_i8); break; + case 8: std::copy(data.begin(), data.end(), target.n128_i16); break; + case 4: std::copy(data.begin(), data.end(), target.n128_i32); break; + case 2: std::copy(data.begin(), data.end(), target.n128_i64); break; + } + } else { + switch(data.size()) { + case 16: std::copy(data.begin(), data.end(), target.n128_u8); break; + case 8: std::copy(data.begin(), data.end(), target.n128_u16); break; + case 4: std::copy(data.begin(), data.end(), target.n128_u32); break; + case 2: std::copy(data.begin(), data.end(), target.n128_u64); break; + } + } + return target; + } + #define REINTERPRET_CAST(T, R) (R) + #else + #define INITIALIZER_LIST_TO_NEON_VECTOR(T, args) (T args) + #define REINTERPRET_CAST(T, R) reinterpret_cast(R) + #endif } #endif From a82ed953c90711dad9de34b3fe182b136f82e160 Mon Sep 17 00:00:00 2001 From: Niyas Sait Date: Tue, 19 Oct 2021 16:02:46 +0100 Subject: [PATCH 2/2] Remove unused neon dispatcher --- include/xsimd/arch/xsimd_neon.hpp | 235 +++++------------------------- 1 file changed, 39 insertions(+), 196 deletions(-) diff --git a/include/xsimd/arch/xsimd_neon.hpp b/include/xsimd/arch/xsimd_neon.hpp index 5e56c414a..eff01e60d 100644 --- a/include/xsimd/arch/xsimd_neon.hpp +++ b/include/xsimd/arch/xsimd_neon.hpp @@ -24,26 +24,26 @@ // Wrap intrinsics so we can pass them as function pointers // - OP: intrinsics name prefix, e.g., vorrq // - RT: type traits to deduce intrinsics return types -#define WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ +#define WRAP_BINARY_INT_EXCLUDING_64(OP) \ namespace wrap { \ - inline RT _##OP##_u8 (uint8x16_t a, uint8x16_t b) { return ::OP##_u8 (a, b); } \ - inline RT _##OP##_s8 (int8x16_t a, int8x16_t b) { return ::OP##_s8 (a, b); } \ - inline RT _##OP##_u16(uint16x8_t a, uint16x8_t b) { return ::OP##_u16(a, b); } \ - 
inline RT _##OP##_s16(int16x8_t a, int16x8_t b) { return ::OP##_s16(a, b); } \ - inline RT _##OP##_u32(uint32x4_t a, uint32x4_t b) { return ::OP##_u32(a, b); } \ - inline RT _##OP##_s32(int32x4_t a, int32x4_t b) { return ::OP##_s32(a, b); } \ + inline uint8x16_t _##OP##_u8 (uint8x16_t a, uint8x16_t b) { return ::OP##_u8 (a, b); } \ + inline int8x16_t _##OP##_s8 (int8x16_t a, int8x16_t b) { return ::OP##_s8 (a, b); } \ + inline uint16x8_t _##OP##_u16(uint16x8_t a, uint16x8_t b) { return ::OP##_u16(a, b); } \ + inline int16x8_t _##OP##_s16(int16x8_t a, int16x8_t b) { return ::OP##_s16(a, b); } \ + inline uint32x4_t _##OP##_u32(uint32x4_t a, uint32x4_t b) { return ::OP##_u32(a, b); } \ + inline int32x4_t _##OP##_s32(int32x4_t a, int32x4_t b) { return ::OP##_s32(a, b); } \ } -#define WRAP_BINARY_INT(OP, RT) \ - WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ +#define WRAP_BINARY_INT(OP) \ + WRAP_BINARY_INT_EXCLUDING_64(OP) \ namespace wrap { \ - inline RT _##OP##_u64(uint64x2_t a, uint64x2_t b) { return ::OP##_u64(a, b); } \ - inline RT _##OP##_s64(int64x2_t a, int64x2_t b) { return ::OP##_s64(a, b); } \ + inline uint64x2_t _##OP##_u64(uint64x2_t a, uint64x2_t b) { return ::OP##_u64(a, b); } \ + inline int64x2_t _##OP##_s64(int64x2_t a, int64x2_t b) { return ::OP##_s64(a, b); } \ } -#define WRAP_BINARY_FLOAT(OP, RT) \ +#define WRAP_BINARY_FLOAT(OP) \ namespace wrap { \ - inline RT _##OP##_f32(float32x4_t a, float32x4_t b) { return ::OP##_f32(a, b); } \ + inline float32x4_t _##OP##_f32(float32x4_t a, float32x4_t b) { return ::OP##_f32(a, b); } \ } #define WRAP_UNARY_INT_EXCLUDING_64(OP) \ @@ -87,139 +87,6 @@ namespace xsimd namespace detail { - template