From 17a49bde03f8a1cbc388be52747a2b6025359fdb Mon Sep 17 00:00:00 2001 From: Kim Walisch Date: Mon, 15 Apr 2024 19:20:15 +0200 Subject: [PATCH] Improve Windows multiarch support --- ChangeLog | 3 +- cmake/multiarch_avx512_vbmi2.cmake | 36 +++++--- include/primesieve/CPUID.hpp | 45 +--------- include/primesieve/PrimeGenerator.hpp | 48 ++++++++-- .../primesieve/cpu_supports_avx512_vbmi2.hpp | 85 ++++++++++++++++++ include/primesieve/cpu_supports_popcnt.hpp | 58 ++++++++++++ include/primesieve/intrinsics.hpp | 11 ++- src/CpuInfo.cpp | 88 ++----------------- src/PrimeGenerator.cpp | 26 ++---- test/CPUID.cpp | 6 +- 10 files changed, 233 insertions(+), 173 deletions(-) create mode 100644 include/primesieve/cpu_supports_avx512_vbmi2.hpp create mode 100644 include/primesieve/cpu_supports_popcnt.hpp diff --git a/ChangeLog b/ChangeLog index 18b3458ae..4a07fa953 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,7 @@ -Changes in version 12.3, 04/04/2024 +Changes in version 12.3, 15/04/2024 =================================== +* Improve Windows multiarch support (now works with MinGW64). * Add runtime POPCNT detection using CPUID for x86 CPUs. * Improve GCC/Clang multiarch preprocessor logic. * CMakeLists.txt: Remove POPCNT/BMI check for x86 CPUs. diff --git a/cmake/multiarch_avx512_vbmi2.cmake b/cmake/multiarch_avx512_vbmi2.cmake index 2f52c4799..f56d5e933 100644 --- a/cmake/multiarch_avx512_vbmi2.cmake +++ b/cmake/multiarch_avx512_vbmi2.cmake @@ -1,14 +1,15 @@ # We use GCC/Clang's function multi-versioning for AVX512 # support. This code will automatically dispatch to the -# AVX512 algorithm if the CPU supports AVX512 and use the -# default (portable) algorithm otherwise. +# AVX512 VBMI2 algorithm if the CPU supports it and use +# the default (portable) algorithm otherwise. include(CheckCXXSourceCompiles) +include(CMakePushCheckState) -check_cxx_source_compiles(" - #include - #include +cmake_push_check_state() +set(CMAKE_REQUIRED_INCLUDES "${PROJECT_SOURCE_DIR}/include") +check_cxx_source_compiles(" // GCC/Clang function multiversioning for AVX512 is not needed if // the user compiles with -mavx512f -mavx512vbmi -mavx512vbmi2. // GCC/Clang function multiversioning generally causes a minor @@ -19,22 +20,31 @@ check_cxx_source_compiles(" Error: AVX512VBMI2 multiarch not needed! #endif + #include + #include + #include + class PrimeGenerator { - public: - __attribute__ ((target (\"default\"))) - void fillNextPrimes(uint64_t* primes64); + public: __attribute__ ((target (\"avx512f,avx512vbmi,avx512vbmi2\"))) - void fillNextPrimes(uint64_t* primes64); + void fillNextPrimes_avx512(uint64_t* primes64); + void fillNextPrimes_default(uint64_t* primes64); + void fillNextPrimes(uint64_t* primes64) + { + if (cpu_supports_avx512_vbmi2) + fillNextPrimes_avx512(primes64); + else + fillNextPrimes_default(primes64); + } }; - __attribute__ ((target (\"default\"))) - void PrimeGenerator::fillNextPrimes(uint64_t* primes64) + void PrimeGenerator::fillNextPrimes_default(uint64_t* primes64) { primes64[0] = 2; } __attribute__ ((target (\"avx512f,avx512vbmi,avx512vbmi2\"))) - void PrimeGenerator::fillNextPrimes(uint64_t* primes64) + void PrimeGenerator::fillNextPrimes_avx512(uint64_t* primes64) { __m512i bytes_0_to_7 = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7); __m512i base = _mm512_set1_epi64(123); @@ -56,3 +66,5 @@ check_cxx_source_compiles(" if(multiarch_avx512_vbmi2) set(ENABLE_MULTIARCH_AVX512 "ENABLE_MULTIARCH_AVX512") endif() + +cmake_pop_check_state() diff --git a/include/primesieve/CPUID.hpp b/include/primesieve/CPUID.hpp index a162a1748..92b81002a 100644 --- a/include/primesieve/CPUID.hpp +++ b/include/primesieve/CPUID.hpp @@ -1,6 +1,6 @@ /// -/// @file CPUID.hpp -/// @brief POPCNT detection fo x86 and x86-64 CPUs. +/// @file cpuid.hpp +/// @brief CPUID for x86 and x86-64 CPUs. /// /// Copyright (C) 2024 Kim Walisch, /// @@ -11,32 +11,13 @@ #ifndef CPUID_HPP #define CPUID_HPP -// Enable on x86 and x86-64 CPUs -#if defined(__x86_64__) || \ - defined(__i386__) || \ - defined(_M_X64) || \ - defined(_M_IX86) - -// Both GCC and Clang (even Clang on Windows) define the __POPCNT__ -// macro if the user compiles with -mpopcnt. The __POPCNT__ -// macro is even defined if the user compiles with other flags -// such as -mavx or -march=native. -#if defined(__POPCNT__) - #define HAS_POPCNT -// The MSVC compiler does not support a POPCNT macro, but if the user -// compiles with e.g. /arch:AVX or /arch:AVX512 then MSVC defines -// the __AVX__ macro and POPCNT is also supported. -#elif defined(_MSC_VER) && defined(__AVX__) - #define HAS_POPCNT -#endif - #if defined(_MSC_VER) #include #endif namespace { -inline void run_CPUID(int eax, int ecx, int* abcd) +inline void run_cpuid(int eax, int ecx, int* abcd) { #if defined(_MSC_VER) __cpuidex(abcd, eax, ecx); @@ -69,26 +50,6 @@ inline void run_CPUID(int eax, int ecx, int* abcd) #endif } -#if !defined(HAS_POPCNT) -#define ENABLE_CPUID_POPCNT - -inline bool run_CPUID_POPCNT() -{ - // %ecx POPCNT bit flag - int bit_POPCNT = 1 << 23; - int abcd[4]; - - run_CPUID(1, 0, abcd); - return (abcd[2] & bit_POPCNT) == bit_POPCNT; -} - -/// Initialized at startup -const bool HAS_CPUID_POPCNT = run_CPUID_POPCNT(); - -#endif // ENABLE_CPUID_POPCNT - } // namespace -#endif // x86 CPU - #endif diff --git a/include/primesieve/PrimeGenerator.hpp b/include/primesieve/PrimeGenerator.hpp index d2a92fa77..59f7567d9 100644 --- a/include/primesieve/PrimeGenerator.hpp +++ b/include/primesieve/PrimeGenerator.hpp @@ -23,6 +23,20 @@ #include #include +#if defined(__AVX512F__) && \ + defined(__AVX512VBMI__) && \ + defined(__AVX512VBMI2__) && \ + __has_include() + #define ENABLE_AVX512 + +#elif defined(ENABLE_MULTIARCH_AVX512) && \ + __has_include() + #include "cpu_supports_avx512_vbmi2.hpp" + #define ENABLE_DEFAULT +#else + #define ENABLE_DEFAULT +#endif + namespace primesieve { class PreSieve; @@ -34,18 +48,36 @@ class PrimeGenerator : public Erat void fillPrevPrimes(Vector& primes, std::size_t* size); static uint64_t maxCachedPrime(); -#if defined(ENABLE_MULTIARCH_AVX512) - #define ENABLE_MULTIARCH_DEFAULT - __attribute__ ((target ("avx512f,avx512vbmi,avx512vbmi2"))) - void fillNextPrimes(Vector& primes, std::size_t* size); + ALWAYS_INLINE void fillNextPrimes(Vector& primes, std::size_t* size) + { + #if defined(ENABLE_AVX512) + fillNextPrimes_avx512(primes, size); + #elif defined(ENABLE_MULTIARCH_AVX512) + if (cpu_supports_avx512_vbmi2) + fillNextPrimes_avx512(primes, size); + else + fillNextPrimes_default(primes, size); + #else + fillNextPrimes_default(primes, size); + #endif + } + +private: + +#if defined(ENABLE_DEFAULT) + void fillNextPrimes_default(Vector& primes, std::size_t* size); #endif -#if defined(ENABLE_MULTIARCH_DEFAULT) - __attribute__ ((target ("default"))) +#if defined(ENABLE_AVX512) || \ + defined(ENABLE_MULTIARCH_AVX512) + + #if defined(ENABLE_MULTIARCH_AVX512) + __attribute__ ((target ("avx512f,avx512vbmi,avx512vbmi2"))) + #endif + void fillNextPrimes_avx512(Vector& primes, std::size_t* size); + #endif - void fillNextPrimes(Vector& primes, std::size_t* size); -private: bool isInit_ = false; uint64_t low_ = 0; uint64_t prime_ = 0; diff --git a/include/primesieve/cpu_supports_avx512_vbmi2.hpp b/include/primesieve/cpu_supports_avx512_vbmi2.hpp new file mode 100644 index 000000000..5ab575eb0 --- /dev/null +++ b/include/primesieve/cpu_supports_avx512_vbmi2.hpp @@ -0,0 +1,85 @@ +/// +/// @file cpu_supports_avx512_vbmi2.hpp +/// @brief Detect if the x86 CPU supports AVX512 VBMI2. +/// +/// Copyright (C) 2024 Kim Walisch, +/// +/// This file is distributed under the BSD License. See the COPYING +/// file in the top level directory. +/// + +#ifndef CPU_SUPPORTS_AVX512_VBMI2_HPP +#define CPU_SUPPORTS_AVX512_VBMI2_HPP + +#include "cpuid.hpp" + +#if defined(_MSC_VER) + #include +#endif + +// %ebx bit flags +#define bit_AVX512F (1 << 16) + +// %ecx bit flags +#define bit_AVX512VBMI (1 << 1) +#define bit_AVX512VBMI2 (1 << 6) + +// xgetbv bit flags +#define XSTATE_SSE (1 << 1) +#define XSTATE_YMM (1 << 2) +#define XSTATE_ZMM (7 << 5) + +namespace { + +// Get Value of Extended Control Register +inline int get_xcr0() +{ + int xcr0; + +#if defined(_MSC_VER) + xcr0 = (int) _xgetbv(0); +#else + __asm__ ("xgetbv" : "=a" (xcr0) : "c" (0) : "%edx" ); +#endif + + return xcr0; +} + +inline bool run_cpuid_avx512_vbmi2() +{ + int abcd[4]; + + run_cpuid(1, 0, abcd); + + int osxsave_mask = (1 << 27); + + // Ensure OS supports extended processor state management + if ((abcd[2] & osxsave_mask) != osxsave_mask) + return false; + + int ymm_mask = XSTATE_SSE | XSTATE_YMM; + int zmm_mask = XSTATE_SSE | XSTATE_YMM | XSTATE_ZMM; + + int xcr0 = get_xcr0(); + + // Check AVX OS support + if ((xcr0 & ymm_mask) != ymm_mask) + return false; + + // Check AVX512 OS support + if ((xcr0 & zmm_mask) != zmm_mask) + return false; + + run_cpuid(7, 0, abcd); + + // PrimeGenerator::fillNextPrimes() requires AVX512F, AVX512VBMI & AVX512VBMI2 + return ((abcd[1] & bit_AVX512F) == bit_AVX512F && + (abcd[2] & (bit_AVX512VBMI | bit_AVX512VBMI2)) == (bit_AVX512VBMI | bit_AVX512VBMI2)); +} + +/// Initialized at startup +bool cpu_supports_avx512_vbmi2 = run_cpuid_avx512_vbmi2(); + +} // namespace + +#endif diff --git a/include/primesieve/cpu_supports_popcnt.hpp b/include/primesieve/cpu_supports_popcnt.hpp new file mode 100644 index 000000000..212c0f3b2 --- /dev/null +++ b/include/primesieve/cpu_supports_popcnt.hpp @@ -0,0 +1,58 @@ +/// +/// @file cpu_supports_popcnt.hpp +/// @brief POPCNT detection fo x86 and x86-64 CPUs. +/// +/// Copyright (C) 2024 Kim Walisch, +/// +/// This file is distributed under the BSD License. See the COPYING +/// file in the top level directory. +/// + +#ifndef CPU_SUPPORTS_POPCNT_HPP +#define CPU_SUPPORTS_POPCNT_HPP + +// Enable CPUID on x86 and x86-64 CPUs +#if defined(__x86_64__) || \ + defined(__i386__) || \ + defined(_M_X64) || \ + defined(_M_IX86) + +// Both GCC and Clang (even Clang on Windows) define the __POPCNT__ +// macro if the user compiles with -mpopcnt. The __POPCNT__ +// macro is even defined if the user compiles with other flags +// such as -mavx or -march=native. +#if defined(__POPCNT__) + #define HAS_POPCNT +// The MSVC compiler does not support a POPCNT macro, but if the user +// compiles with e.g. /arch:AVX or /arch:AVX512 then MSVC defines +// the __AVX__ macro and POPCNT is also supported. +#elif defined(_MSC_VER) && defined(__AVX__) + #define HAS_POPCNT +#endif + +#if !defined(HAS_POPCNT) + +#include "cpuid.hpp" +#define ENABLE_CPUID_POPCNT + +namespace { + +inline bool run_cpuid_supports_popcnt() +{ + int abcd[4]; + run_cpuid(1, 0, abcd); + + // %ecx POPCNT bit flag + int bit_POPCNT = 1 << 23; + return (abcd[2] & bit_POPCNT) == bit_POPCNT; +} + +/// Initialized at startup +bool cpu_supports_popcnt = run_cpuid_supports_popcnt(); + +} // namespace + +#endif // !defined(HAS_POPCNT) +#endif // CPUID + +#endif diff --git a/include/primesieve/intrinsics.hpp b/include/primesieve/intrinsics.hpp index 3a8c5aa8c..3ce0725d1 100644 --- a/include/primesieve/intrinsics.hpp +++ b/include/primesieve/intrinsics.hpp @@ -11,7 +11,7 @@ #ifndef INTRINSICS_HPP #define INTRINSICS_HPP -#include "CPUID.hpp" +#include "cpu_supports_popcnt.hpp" #include "macros.hpp" #include @@ -46,7 +46,6 @@ inline uint64_t popcnt64_bitwise(uint64_t x) // CPUID is only enabled on x86 and x86-64 CPUs // if the user compiles without -mpopcnt. #if defined(ENABLE_CPUID_POPCNT) - #if defined(__x86_64__) namespace { @@ -55,7 +54,7 @@ inline uint64_t popcnt64(uint64_t x) { // On my AMD EPYC 7642 CPU using GCC 12 this runtime // check incurs an overall overhead of about 1%. - if_likely(HAS_CPUID_POPCNT) + if_likely(cpu_supports_popcnt) { __asm__("popcnt %1, %0" : "=r"(x) : "r"(x)); return x; @@ -78,7 +77,7 @@ namespace { inline uint64_t popcnt64(uint64_t x) { - if_likely(HAS_CPUID_POPCNT) + if_likely(cpu_supports_popcnt) { uint32_t x0 = uint32_t(x); uint32_t x1 = uint32_t(x >> 32); @@ -135,7 +134,7 @@ inline uint64_t popcnt64(uint64_t x) #if defined(HAS_POPCNT) return __popcnt64(x); #elif defined(ENABLE_CPUID_POPCNT) - if_likely(HAS_CPUID_POPCNT) + if_likely(cpu_supports_popcnt) return __popcnt64(x); else return popcnt64_bitwise(x); @@ -160,7 +159,7 @@ inline uint64_t popcnt64(uint64_t x) return __popcnt(uint32_t(x)) + __popcnt(uint32_t(x >> 32)); #elif defined(ENABLE_CPUID_POPCNT) - if_likely(HAS_CPUID_POPCNT) + if_likely(cpu_supports_popcnt) return __popcnt(uint32_t(x)) + __popcnt(uint32_t(x >> 32)); else diff --git a/src/CpuInfo.cpp b/src/CpuInfo.cpp index 4d5f07a23..365a24396 100644 --- a/src/CpuInfo.cpp +++ b/src/CpuInfo.cpp @@ -25,7 +25,6 @@ /// #include -#include #include #include @@ -44,76 +43,9 @@ defined(__x86_64__) || \ defined(_M_IX86) || \ defined(_M_X64) - -#if defined(_MSC_VER) - #include - #include -#endif - -#define HAS_CPUID - -/* %ebx bit flags */ -#define bit_AVX512F (1 << 16) - -/* %ecx bit flags */ -#define bit_AVX512VBMI (1 << 1) -#define bit_AVX512VBMI2 (1 << 6) - -/* xgetbv bit flags */ -#define XSTATE_SSE (1 << 1) -#define XSTATE_YMM (1 << 2) -#define XSTATE_ZMM (7 << 5) - -namespace { - -// Get Value of Extended Control Register -int get_xcr0() -{ - int xcr0; - -#if defined(_MSC_VER) - xcr0 = (int) _xgetbv(0); -#else - __asm__ ("xgetbv" : "=a" (xcr0) : "c" (0) : "%edx" ); -#endif - - return xcr0; -} - -bool has_AVX512() -{ - int abcd[4]; - - run_CPUID(1, 0, abcd); - - int osxsave_mask = (1 << 27); - - // Ensure OS supports extended processor state management - if ((abcd[2] & osxsave_mask) != osxsave_mask) - return false; - - int ymm_mask = XSTATE_SSE | XSTATE_YMM; - int zmm_mask = XSTATE_SSE | XSTATE_YMM | XSTATE_ZMM; - - int xcr0 = get_xcr0(); - - // Check AVX OS support - if ((xcr0 & ymm_mask) != ymm_mask) - return false; - - // Check AVX512 OS support - if ((xcr0 & zmm_mask) != zmm_mask) - return false; - - run_CPUID(7, 0, abcd); - - // PrimeGenerator::fillNextPrimes() requires AVX512F, AVX512VBMI & AVX512VBMI2 - return ((abcd[1] & bit_AVX512F) == bit_AVX512F && - (abcd[2] & (bit_AVX512VBMI | bit_AVX512VBMI2)) == (bit_AVX512VBMI | bit_AVX512VBMI2)); -} - -} // namespace - + #include + #include + #define HAS_CPUID #endif #if defined(_WIN32) @@ -136,19 +68,19 @@ std::string getCpuName() // https://en.wikipedia.org/wiki/CPUID int cpuInfo[4] = { 0, 0, 0, 0 }; - run_CPUID(0x80000000, 0, cpuInfo); + run_cpuid(0x80000000, 0, cpuInfo); std::vector vect; // check if CPU name is supported if ((unsigned) cpuInfo[0] >= 0x80000004u) { - run_CPUID(0x80000002, 0, cpuInfo); + run_cpuid(0x80000002, 0, cpuInfo); std::copy_n(cpuInfo, 4, std::back_inserter(vect)); - run_CPUID(0x80000003, 0, cpuInfo); + run_cpuid(0x80000003, 0, cpuInfo); std::copy_n(cpuInfo, 4, std::back_inserter(vect)); - run_CPUID(0x80000004, 0, cpuInfo); + run_cpuid(0x80000004, 0, cpuInfo); std::copy_n(cpuInfo, 4, std::back_inserter(vect)); vect.push_back(0); @@ -823,14 +755,10 @@ std::string CpuInfo::cpuName() const } } -/// This method is only used by the primesieve command-line app -/// with the --cpu-info option. Therefore we currently don't -/// cache the result of has_AVX512(). -/// bool CpuInfo::hasAVX512() const { #if defined(HAS_CPUID) - return has_AVX512(); + return cpu_supports_avx512_vbmi2; #else return false; #endif diff --git a/src/PrimeGenerator.cpp b/src/PrimeGenerator.cpp index 758f98cef..3be1e0a1b 100644 --- a/src/PrimeGenerator.cpp +++ b/src/PrimeGenerator.cpp @@ -35,19 +35,9 @@ #include #include -#if defined(ENABLE_MULTIARCH_AVX512) && \ - __has_include() - #include - -#elif defined(__AVX512F__) && \ - defined(__AVX512VBMI__) && \ - defined(__AVX512VBMI2__) && \ - __has_include() +#if defined(ENABLE_AVX512) || \ + defined(ENABLE_MULTIARCH_AVX512) #include - #define ENABLE_AVX512 - -#else - #define ENABLE_DEFAULT #endif namespace { @@ -404,8 +394,7 @@ void PrimeGenerator::fillPrevPrimes(Vector& primes, } } -#if defined(ENABLE_DEFAULT) || \ - defined(ENABLE_MULTIARCH_DEFAULT) +#if defined(ENABLE_DEFAULT) /// This method is used by iterator::next_prime(). /// This method stores only the next few primes (~ 1000) in the @@ -414,11 +403,7 @@ void PrimeGenerator::fillPrevPrimes(Vector& primes, /// this reason iterator::next_prime() runs up to 2x faster /// than iterator::prev_prime(). /// -#if defined(ENABLE_MULTIARCH_DEFAULT) - __attribute__ ((target ("default"))) -#endif -void PrimeGenerator::fillNextPrimes(Vector& primes, - std::size_t* size) +void PrimeGenerator::fillNextPrimes_default(Vector& primes, std::size_t* size) { *size = 0; @@ -492,8 +477,7 @@ void PrimeGenerator::fillNextPrimes(Vector& primes, #if defined(ENABLE_MULTIARCH_AVX512) __attribute__ ((target ("avx512f,avx512vbmi,avx512vbmi2"))) #endif -void PrimeGenerator::fillNextPrimes(Vector& primes, - std::size_t* size) +void PrimeGenerator::fillNextPrimes_avx512(Vector& primes, std::size_t* size) { *size = 0; diff --git a/test/CPUID.cpp b/test/CPUID.cpp index 38a0261ba..2b6a94d0f 100644 --- a/test/CPUID.cpp +++ b/test/CPUID.cpp @@ -1,5 +1,5 @@ /// -/// @file CPUID.cpp +/// @file cpuid.cpp /// @brief Test CPUID code on x86 and x64 CPUs. /// /// Copyright (C) 2024 Kim Walisch, @@ -8,7 +8,7 @@ /// file in the top level directory. /// -#include +#include #include int main() @@ -64,7 +64,7 @@ int main() #endif #if defined(ENABLE_CPUID_POPCNT) - std::cout << "CPU supports POPCNT: " << (HAS_CPUID_POPCNT ? "yes" : "no") << std::endl; + std::cout << "CPU supports POPCNT: " << (cpu_supports_popcnt ? "yes" : "no") << std::endl; #endif #endif