diff --git a/CMakeLists.txt b/CMakeLists.txt index 508102a6..da9af758 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -86,9 +86,10 @@ set(LIB_SRC src/api-c.cpp if(WITH_MULTIARCH) include("${PROJECT_SOURCE_DIR}/cmake/multiarch_x86_popcnt.cmake") + include("${PROJECT_SOURCE_DIR}/cmake/multiarch_avx512_bw.cmake") include("${PROJECT_SOURCE_DIR}/cmake/multiarch_avx512_vbmi2.cmake") - if(multiarch_x86_popcnt OR multiarch_avx512_vbmi2) + if(multiarch_x86_popcnt OR multiarch_avx512_bw OR multiarch_avx512_vbmi2) set(LIB_SRC ${LIB_SRC} src/x86/cpuid.cpp) endif() endif() diff --git a/cmake/multiarch_avx512_bw.cmake b/cmake/multiarch_avx512_bw.cmake new file mode 100644 index 00000000..e1007884 --- /dev/null +++ b/cmake/multiarch_avx512_bw.cmake @@ -0,0 +1,80 @@ +# We use GCC/Clang's function multi-versioning for AVX512 +# support. This code will automatically dispatch to the +# AVX512 BW algorithm if the CPU supports it and use the +# default (portable) algorithm otherwise. + +include(CheckCXXSourceCompiles) +include(CMakePushCheckState) + +cmake_push_check_state() +set(CMAKE_REQUIRED_INCLUDES "${PROJECT_SOURCE_DIR}") + +check_cxx_source_compiles(" + // GCC/Clang function multiversioning for AVX512 is not needed if + // the user compiles with -mavx512f -mavx512bw. + // GCC/Clang function multiversioning generally causes a minor + // overhead, hence we disable it if it is not needed. + #if defined(__AVX512F__) && \ + defined(__AVX512BW__) + Error: AVX512BW multiarch not needed! 
+  #endif
+
+  #include <src/x86/cpuid.cpp>
+  #include <immintrin.h>
+  #include <stdint.h>
+  #include <cstddef>
+
+  __attribute__ ((target (\"avx512f,avx512bw\")))
+  void AND_PreSieveTables_avx512(const uint8_t* __restrict preSieve0,
+                                 const uint8_t* __restrict preSieve1,
+                                 uint8_t* __restrict sieve,
+                                 std::size_t bytes)
+  {
+    std::size_t i = 0;
+
+    for (; i + 64 <= bytes; i += sizeof(__m512i))
+    {
+      _mm512_storeu_epi8((__m512i*) &sieve[i],
+        _mm512_and_si512(_mm512_loadu_epi8((const __m512i*) &preSieve0[i]),
+                         _mm512_loadu_epi8((const __m512i*) &preSieve1[i])));
+    }
+
+    if (i < bytes)
+    {
+      __mmask64 mask = 0xffffffffffffffffull >> (i + 64 - bytes);
+
+      _mm512_mask_storeu_epi8((__m512i*) &sieve[i], mask,
+        _mm512_and_si512(_mm512_maskz_loadu_epi8(mask, (const __m512i*) &preSieve0[i]),
+                         _mm512_maskz_loadu_epi8(mask, (const __m512i*) &preSieve1[i])));
+    }
+  }
+
+  void AND_PreSieveTables_default(const uint8_t* __restrict preSieved0,
+                                  const uint8_t* __restrict preSieved1,
+                                  uint8_t* __restrict sieve,
+                                  std::size_t bytes)
+  {
+    for (std::size_t i = 0; i < bytes; i++)
+      sieve[i] = preSieved0[i] & preSieved1[i];
+  }
+
+  int main()
+  {
+    uint8_t sieve[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
+    uint8_t PreSieveTable1[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
+    uint8_t PreSieveTable2[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
+
+    if (primesieve::has_cpuid_avx512_bw())
+      AND_PreSieveTables_avx512(&PreSieveTable1[0], &PreSieveTable2[0], &sieve[0], 10);
+    else
+      AND_PreSieveTables_default(&PreSieveTable1[0], &PreSieveTable2[0], &sieve[0], 10);
+
+    return (sieve[0] == 0) ?
0 : 1;
+  }
+" multiarch_avx512_bw)
+
+if(multiarch_avx512_bw)
+  list(APPEND PRIMESIEVE_COMPILE_DEFINITIONS "ENABLE_MULTIARCH_BW")
+endif()
+
+cmake_pop_check_state()
diff --git a/include/primesieve/cpu_supports_avx512_bw.hpp b/include/primesieve/cpu_supports_avx512_bw.hpp
new file mode 100644
index 00000000..e0a866e1
--- /dev/null
+++ b/include/primesieve/cpu_supports_avx512_bw.hpp
@@ -0,0 +1,27 @@
+///
+/// @file cpu_supports_avx512_bw.hpp
+/// @brief Detect if the x86 CPU supports AVX512 BW.
+///
+/// Copyright (C) 2024 Kim Walisch, <kim.walisch@gmail.com>
+///
+/// This file is distributed under the BSD License. See the COPYING
+/// file in the top level directory.
+///
+
+#ifndef CPU_SUPPORTS_AVX512_BW_HPP
+#define CPU_SUPPORTS_AVX512_BW_HPP
+
+namespace primesieve {
+
+bool has_cpuid_avx512_bw();
+
+} // namespace
+
+namespace {
+
+/// Initialized at startup
+const bool cpu_supports_avx512_bw = primesieve::has_cpuid_avx512_bw();
+
+} // namespace
+
+#endif
diff --git a/src/PreSieve.cpp b/src/PreSieve.cpp
index e84cbe39..8d987f68 100644
--- a/src/PreSieve.cpp
+++ b/src/PreSieve.cpp
@@ -59,8 +59,94 @@
   #define HAS_ARM_NEON
 #endif
 
+#if defined(__AVX512F__) && \
+    defined(__AVX512BW__) && \
+    __has_include(<immintrin.h>)
+  #include <immintrin.h>
+  #define ENABLE_AVX512_BW
+
+#elif defined(ENABLE_MULTIARCH_BW) && \
+      __has_include(<immintrin.h>)
+  #include <immintrin.h>
+  #include <primesieve/cpu_supports_avx512_bw.hpp>
+#endif
+
+#if !defined(ENABLE_AVX512_BW)
+  #define ENABLE_DEFAULT
+#endif
+
 namespace {
 
+#if defined(ENABLE_AVX512_BW) || \
+    defined(ENABLE_MULTIARCH_BW)
+
+#if defined(ENABLE_MULTIARCH_BW)
+  __attribute__ ((target ("avx512f,avx512bw")))
+#endif
+void AND_PreSieveTables_avx512(const uint8_t* __restrict preSieved0,
+                               const uint8_t* __restrict preSieved1,
+                               const uint8_t* __restrict preSieved2,
+                               const uint8_t* __restrict preSieved3,
+                               uint8_t* __restrict sieve,
+                               std::size_t bytes)
+{
+  std::size_t i = 0;
+
+  for (; i + 64 <= bytes; i += sizeof(__m512i))
+  {
+    _mm512_storeu_epi8((__m512i*) &sieve[i],
+      _mm512_and_si512(
+        _mm512_and_si512(_mm512_loadu_epi8((const
__m512i*) &preSieved0[i]), _mm512_loadu_epi8((const __m512i*) &preSieved1[i])), + _mm512_and_si512(_mm512_loadu_epi8((const __m512i*) &preSieved2[i]), _mm512_loadu_epi8((const __m512i*) &preSieved3[i])))); + } + + if (i < bytes) + { + __mmask64 mask = 0xffffffffffffffffull >> (i + 64 - bytes); + + _mm512_mask_storeu_epi8((__m512i*) &sieve[i], mask, + _mm512_and_si512( + _mm512_and_si512(_mm512_maskz_loadu_epi8(mask, (const __m512i*) &preSieved0[i]), _mm512_maskz_loadu_epi8(mask, (const __m512i*) &preSieved1[i])), + _mm512_and_si512(_mm512_maskz_loadu_epi8(mask, (const __m512i*) &preSieved2[i]), _mm512_maskz_loadu_epi8(mask, (const __m512i*) &preSieved3[i])))); + } +} + +#if defined(ENABLE_MULTIARCH_BW) + __attribute__ ((target ("avx512f,avx512bw"))) +#endif +void AND_PreSieveTables_Sieve_avx512(const uint8_t* __restrict preSieved0, + const uint8_t* __restrict preSieved1, + const uint8_t* __restrict preSieved2, + const uint8_t* __restrict preSieved3, + uint8_t* __restrict sieve, + std::size_t bytes) +{ + std::size_t i = 0; + + for (; i + 64 <= bytes; i += sizeof(__m512i)) + { + _mm512_storeu_epi8((__m512i*) &sieve[i], + _mm512_and_si512(_mm512_loadu_epi8((const __m512i*) &sieve[i]), _mm512_and_si512( + _mm512_and_si512(_mm512_loadu_epi8((const __m512i*) &preSieved0[i]), _mm512_loadu_epi8((const __m512i*) &preSieved1[i])), + _mm512_and_si512(_mm512_loadu_epi8((const __m512i*) &preSieved2[i]), _mm512_loadu_epi8((const __m512i*) &preSieved3[i]))))); + } + + if (i < bytes) + { + __mmask64 mask = 0xffffffffffffffffull >> (i + 64 - bytes); + + _mm512_mask_storeu_epi8((__m512i*) &sieve[i], mask, + _mm512_and_si512(_mm512_maskz_loadu_epi8(mask, (const __m512i*) &sieve[i]), _mm512_and_si512( + _mm512_and_si512(_mm512_maskz_loadu_epi8(mask, (const __m512i*) &preSieved0[i]), _mm512_maskz_loadu_epi8(mask, (const __m512i*) &preSieved1[i])), + _mm512_and_si512(_mm512_maskz_loadu_epi8(mask, (const __m512i*) &preSieved2[i]), _mm512_maskz_loadu_epi8(mask, (const __m512i*) 
&preSieved3[i]))))); + } +} + +#endif + +/// This section contains portable SIMD algorithms that don't need +/// any runtime CPU support checks. +#if defined(ENABLE_DEFAULT) #if defined(HAS_SSE2) /// Since compiler auto-vectorization is not 100% reliable, we have @@ -68,12 +154,12 @@ namespace { /// This algorithm is portable since all x64 CPUs support the SSE2 /// instruction set. /// -void AND_PreSieveTables(const uint8_t* __restrict preSieved0, - const uint8_t* __restrict preSieved1, - const uint8_t* __restrict preSieved2, - const uint8_t* __restrict preSieved3, - uint8_t* __restrict sieve, - std::size_t bytes) +void AND_PreSieveTables_default(const uint8_t* __restrict preSieved0, + const uint8_t* __restrict preSieved1, + const uint8_t* __restrict preSieved2, + const uint8_t* __restrict preSieved3, + uint8_t* __restrict sieve, + std::size_t bytes) { std::size_t i = 0; std::size_t limit = bytes - bytes % sizeof(__m128i); @@ -97,12 +183,12 @@ void AND_PreSieveTables(const uint8_t* __restrict preSieved0, sieve[i] = preSieved0[i] & preSieved1[i] & preSieved2[i] & preSieved3[i]; } -void AND_PreSieveTables_Sieve(const uint8_t* __restrict preSieved0, - const uint8_t* __restrict preSieved1, - const uint8_t* __restrict preSieved2, - const uint8_t* __restrict preSieved3, - uint8_t* __restrict sieve, - std::size_t bytes) +void AND_PreSieveTables_Sieve_default(const uint8_t* __restrict preSieved0, + const uint8_t* __restrict preSieved1, + const uint8_t* __restrict preSieved2, + const uint8_t* __restrict preSieved3, + uint8_t* __restrict sieve, + std::size_t bytes) { std::size_t i = 0; std::size_t limit = bytes - bytes % sizeof(__m128i); @@ -128,12 +214,12 @@ void AND_PreSieveTables_Sieve(const uint8_t* __restrict preSieved0, /// workaround for this Homebrew issue we have manually vectorized /// the Bitwise AND loop using ARM NEON. 
/// -void AND_PreSieveTables(const uint8_t* __restrict preSieved0, - const uint8_t* __restrict preSieved1, - const uint8_t* __restrict preSieved2, - const uint8_t* __restrict preSieved3, - uint8_t* __restrict sieve, - std::size_t bytes) +void AND_PreSieveTables_default(const uint8_t* __restrict preSieved0, + const uint8_t* __restrict preSieved1, + const uint8_t* __restrict preSieved2, + const uint8_t* __restrict preSieved3, + uint8_t* __restrict sieve, + std::size_t bytes) { std::size_t i = 0; std::size_t limit = bytes - bytes % sizeof(uint8x16_t); @@ -150,12 +236,12 @@ void AND_PreSieveTables(const uint8_t* __restrict preSieved0, sieve[i] = preSieved0[i] & preSieved1[i] & preSieved2[i] & preSieved3[i]; } -void AND_PreSieveTables_Sieve(const uint8_t* __restrict preSieved0, - const uint8_t* __restrict preSieved1, - const uint8_t* __restrict preSieved2, - const uint8_t* __restrict preSieved3, - uint8_t* __restrict sieve, - std::size_t bytes) +void AND_PreSieveTables_Sieve_default(const uint8_t* __restrict preSieved0, + const uint8_t* __restrict preSieved1, + const uint8_t* __restrict preSieved2, + const uint8_t* __restrict preSieved3, + uint8_t* __restrict sieve, + std::size_t bytes) { std::size_t i = 0; std::size_t limit = bytes - bytes % sizeof(uint8x16_t); @@ -174,12 +260,12 @@ void AND_PreSieveTables_Sieve(const uint8_t* __restrict preSieved0, #else -void AND_PreSieveTables(const uint8_t* __restrict preSieved0, - const uint8_t* __restrict preSieved1, - const uint8_t* __restrict preSieved2, - const uint8_t* __restrict preSieved3, - uint8_t* __restrict sieve, - std::size_t bytes) +void AND_PreSieveTables_default(const uint8_t* __restrict preSieved0, + const uint8_t* __restrict preSieved1, + const uint8_t* __restrict preSieved2, + const uint8_t* __restrict preSieved3, + uint8_t* __restrict sieve, + std::size_t bytes) { // This loop will get auto-vectorized if compiled with GCC/Clang // using -O3. 
Using GCC -O2 does not auto-vectorize this loop @@ -190,6 +276,39 @@ void AND_PreSieveTables(const uint8_t* __restrict preSieved0, sieve[i] = preSieved0[i] & preSieved1[i] & preSieved2[i] & preSieved3[i]; } +void AND_PreSieveTables_Sieve_default(const uint8_t* __restrict preSieved0, + const uint8_t* __restrict preSieved1, + const uint8_t* __restrict preSieved2, + const uint8_t* __restrict preSieved3, + uint8_t* __restrict sieve, + std::size_t bytes) +{ + for (std::size_t i = 0; i < bytes; i++) + sieve[i] &= preSieved0[i] & preSieved1[i] & preSieved2[i] & preSieved3[i]; +} + +#endif +#endif + +void AND_PreSieveTables(const uint8_t* __restrict preSieved0, + const uint8_t* __restrict preSieved1, + const uint8_t* __restrict preSieved2, + const uint8_t* __restrict preSieved3, + uint8_t* __restrict sieve, + std::size_t bytes) +{ +#if defined(ENABLE_AVX512_BW) + AND_PreSieveTables_avx512(preSieved0, preSieved1, preSieved2, preSieved3, sieve, bytes); +#elif defined(ENABLE_MULTIARCH_BW) + if (cpu_supports_avx512_bw) + AND_PreSieveTables_avx512(preSieved0, preSieved1, preSieved2, preSieved3, sieve, bytes); + else + AND_PreSieveTables_default(preSieved0, preSieved1, preSieved2, preSieved3, sieve, bytes); +#else + AND_PreSieveTables_default(preSieved0, preSieved1, preSieved2, preSieved3, sieve, bytes); +#endif +} + void AND_PreSieveTables_Sieve(const uint8_t* __restrict preSieved0, const uint8_t* __restrict preSieved1, const uint8_t* __restrict preSieved2, @@ -197,11 +316,17 @@ void AND_PreSieveTables_Sieve(const uint8_t* __restrict preSieved0, uint8_t* __restrict sieve, std::size_t bytes) { - for (std::size_t i = 0; i < bytes; i++) - sieve[i] &= preSieved0[i] & preSieved1[i] & preSieved2[i] & preSieved3[i]; -} - +#if defined(ENABLE_AVX512_BW) + AND_PreSieveTables_Sieve_avx512(preSieved0, preSieved1, preSieved2, preSieved3, sieve, bytes); +#elif defined(ENABLE_MULTIARCH_BW) + if (cpu_supports_avx512_bw) + AND_PreSieveTables_Sieve_avx512(preSieved0, preSieved1, preSieved2, 
preSieved3, sieve, bytes);
+  else
+    AND_PreSieveTables_Sieve_default(preSieved0, preSieved1, preSieved2, preSieved3, sieve, bytes);
+#else
+  AND_PreSieveTables_Sieve_default(preSieved0, preSieved1, preSieved2, preSieved3, sieve, bytes);
 #endif
+}
 
 } // namespace
diff --git a/src/x86/cpuid.cpp b/src/x86/cpuid.cpp
index d51e7908..9a95cb5d 100644
--- a/src/x86/cpuid.cpp
+++ b/src/x86/cpuid.cpp
@@ -19,7 +19,8 @@
 // https://en.wikipedia.org/wiki/CPUID
 
 // %ebx bit flags
-#define bit_AVX512F (1 << 16)
+#define bit_AVX512F  (1 << 16)
+#define bit_AVX512BW (1 << 30)
 
 // %ecx bit flags
 #define bit_AVX512VBMI (1 << 1)
@@ -91,6 +92,37 @@ bool has_cpuid_popcnt()
   return (abcd[2] & bit_POPCNT) == bit_POPCNT;
 }
 
+bool has_cpuid_avx512_bw()
+{
+  int abcd[4];
+
+  run_cpuid(1, 0, abcd);
+
+  int osxsave_mask = (1 << 27);
+
+  // Ensure OS supports extended processor state management
+  if ((abcd[2] & osxsave_mask) != osxsave_mask)
+    return false;
+
+  uint64_t ymm_mask = XSTATE_SSE | XSTATE_YMM;
+  uint64_t zmm_mask = XSTATE_SSE | XSTATE_YMM | XSTATE_ZMM;
+  uint64_t xcr0 = get_xcr0();
+
+  // Check AVX OS support
+  if ((xcr0 & ymm_mask) != ymm_mask)
+    return false;
+
+  // Check AVX512 OS support
+  if ((xcr0 & zmm_mask) != zmm_mask)
+    return false;
+
+  run_cpuid(7, 0, abcd);
+
+  // AND_PreSieveTables_avx512 requires AVX512F, AVX512BW
+  return ((abcd[1] & bit_AVX512F) == bit_AVX512F &&
+          (abcd[1] & bit_AVX512BW) == bit_AVX512BW);
+}
+
 bool has_cpuid_avx512_vbmi2()
 {
   int abcd[4];