Rename the MULTIARCH_AVX512 compile definition to ENABLE_MULTIARCH_AVX512
and move the "AVX512 multiarch not needed" preprocessor check out of
PrimeGenerator.hpp and into the CMake compile test, so the definition is
only set when function multiversioning is actually required.

NOTE(review): this patch was recovered from a copy in which all line
breaks and indentation had been collapsed. The line structure below was
rebuilt so that every hunk matches the line counts in its @@ header;
indentation is a best guess. Presumably it needs
`git apply --ignore-whitespace` to apply - TODO confirm against upstream.

NOTE(review): several `#include` directives and `__has_include()` calls
below have no header argument, and `Vector&` has no template argument.
These look like `<...>` tokens stripped during extraction (presumably
`<immintrin.h>` next to the AVX512 code) - restore them before applying.

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ae552499a..38d83a4d2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -112,7 +112,7 @@ endif()
 if(WITH_MULTIARCH)
     include("${PROJECT_SOURCE_DIR}/cmake/multiarch_avx512_vbmi2.cmake")
     if(multiarch_avx512_vbmi2)
-        set(MULTIARCH_AVX512 "MULTIARCH_AVX512")
+        set(ENABLE_MULTIARCH_AVX512 "ENABLE_MULTIARCH_AVX512")
     endif()
 endif()
 
@@ -129,7 +129,7 @@ if(BUILD_SHARED_LIBS)
     set_target_properties(libprimesieve PROPERTIES SOVERSION ${PRIMESIEVE_SOVERSION_MAJOR})
     set_target_properties(libprimesieve PROPERTIES VERSION ${PRIMESIEVE_SOVERSION})
     target_compile_options(libprimesieve PRIVATE ${FTREE_VECTORIZE_FLAG} ${FVECT_COST_MODEL_FLAG})
-    target_compile_definitions(libprimesieve PRIVATE "${ENABLE_ASSERT}" "${MULTIARCH_AVX512}")
+    target_compile_definitions(libprimesieve PRIVATE "${ENABLE_ASSERT}" "${ENABLE_MULTIARCH_AVX512}")
 
     if(WIN32_MSVC_COMPATIBLE)
         # On Windows the shared library will be named primesieve.dll
@@ -167,7 +167,7 @@ if(BUILD_STATIC_LIBS)
     set_target_properties(libprimesieve-static PROPERTIES OUTPUT_NAME primesieve)
     target_link_libraries(libprimesieve-static PRIVATE Threads::Threads ${LIBATOMIC})
     target_compile_options(libprimesieve-static PRIVATE ${FTREE_VECTORIZE_FLAG} ${FVECT_COST_MODEL_FLAG})
-    target_compile_definitions(libprimesieve-static PRIVATE "${ENABLE_ASSERT}" "${MULTIARCH_AVX512}")
+    target_compile_definitions(libprimesieve-static PRIVATE "${ENABLE_ASSERT}" "${ENABLE_MULTIARCH_AVX512}")
 
     if(WITH_MSVC_CRT_STATIC)
         set_target_properties(libprimesieve-static PROPERTIES MSVC_RUNTIME_LIBRARY "MultiThreaded")
diff --git a/cmake/multiarch_avx512_vbmi2.cmake b/cmake/multiarch_avx512_vbmi2.cmake
index f786ed270..38a345c50 100644
--- a/cmake/multiarch_avx512_vbmi2.cmake
+++ b/cmake/multiarch_avx512_vbmi2.cmake
@@ -1,12 +1,24 @@
-include(CheckCXXSourceCompiles)
-
 # We use GCC/Clang's function multi-versioning for AVX512
 # support. This code will automatically dispatch to the
 # AVX512 algorithm if the CPU supports AVX512 and use the
 # default (portable) algorithm otherwise.
+
+include(CheckCXXSourceCompiles)
+
 check_cxx_source_compiles("
     #include
     #include
+
+    // GCC/Clang function multiversioning for AVX512 is not needed if
+    // the user compiles with -mavx512f -mavx512vbmi -mavx512vbmi2.
+    // GCC/Clang function multiversioning generally causes a minor
+    // overhead, hence we disable it if it is not needed.
+    #if defined(__AVX512F__) && \
+        defined(__AVX512VBMI__) && \
+        defined(__AVX512VBMI2__)
+      Error: AVX512VBMI2 multiarch not needed!
+    #endif
+
     class PrimeGenerator {
     public:
         __attribute__ ((target (\"default\")))
@@ -14,11 +26,13 @@ check_cxx_source_compiles("
         __attribute__ ((target (\"avx512f,avx512vbmi,avx512vbmi2\")))
         void fillNextPrimes(uint64_t* primes64);
     };
+
     __attribute__ ((target (\"default\")))
     void PrimeGenerator::fillNextPrimes(uint64_t* primes64)
     {
         primes64[0] = 2;
     }
+
     __attribute__ ((target (\"avx512f,avx512vbmi,avx512vbmi2\")))
     void PrimeGenerator::fillNextPrimes(uint64_t* primes64)
     {
@@ -29,6 +43,7 @@ check_cxx_source_compiles("
         vprimes = _mm512_add_epi64(base, vprimes);
         _mm512_storeu_si512(primes64, vprimes);
     }
+
     int main()
     {
         uint64_t primes[8];
diff --git a/include/primesieve/PrimeGenerator.hpp b/include/primesieve/PrimeGenerator.hpp
index 0a46a6c3f..d2a92fa77 100644
--- a/include/primesieve/PrimeGenerator.hpp
+++ b/include/primesieve/PrimeGenerator.hpp
@@ -23,21 +23,6 @@
 #include
 #include
 
-#if defined(MULTIARCH_AVX512)
-  // GCC/Clang function multiversioning for AVX512 is not needed if
-  // the user compiles with -mavx512f -mavx512vbmi -mavx512vbmi2.
-  // GCC/Clang function multiversioning generally causes a minor
-  // overhead, hence we disable it if it is not needed.
-  #if defined(__AVX512__) || (defined(__AVX512F__) && \
-                              defined(__AVX512VBMI__) && \
-                              defined(__AVX512VBMI2__))
-    #undef MULTIARCH_AVX512
-  #else
-    #define MULTIARCH_TARGET_DEFAULT
-    #define MULTIARCH_TARGET_AVX512
-  #endif
-#endif
-
 namespace primesieve {
 
 class PreSieve;
@@ -49,16 +34,17 @@ class PrimeGenerator : public Erat
   void fillPrevPrimes(Vector& primes, std::size_t* size);
   static uint64_t maxCachedPrime();
 
-#if defined(MULTIARCH_TARGET_DEFAULT)
-  __attribute__ ((target ("default")))
-#endif
-  void fillNextPrimes(Vector& primes, std::size_t* size);
-
-#if defined(MULTIARCH_TARGET_AVX512)
+#if defined(ENABLE_MULTIARCH_AVX512)
+  #define ENABLE_MULTIARCH_DEFAULT
   __attribute__ ((target ("avx512f,avx512vbmi,avx512vbmi2")))
   void fillNextPrimes(Vector& primes, std::size_t* size);
 #endif
 
+#if defined(ENABLE_MULTIARCH_DEFAULT)
+  __attribute__ ((target ("default")))
+#endif
+  void fillNextPrimes(Vector& primes, std::size_t* size);
+
 private:
   bool isInit_ = false;
   uint64_t low_ = 0;
diff --git a/src/PrimeGenerator.cpp b/src/PrimeGenerator.cpp
index 4f73d3fca..fefdfd9f8 100644
--- a/src/PrimeGenerator.cpp
+++ b/src/PrimeGenerator.cpp
@@ -35,21 +35,25 @@
 #include
 #include
 
-// x86-64 AVX512
-#if __has_include() && \
-   (defined(__AVX512__) || (defined(__AVX512F__) && \
-                            defined(__AVX512VBMI__) && \
-                            defined(__AVX512VBMI2__)))
+#if defined(ENABLE_MULTIARCH_AVX512) && \
+    __has_include()
   #include
-  #define HAS_AVX512_VBMI2
 
-// GCC/Clang function multiversioning
-#elif defined(MULTIARCH_TARGET_AVX512) && \
-      __has_include()
+#elif defined(__AVX512F__) && \
+      defined(__AVX512VBMI__) && \
+      defined(__AVX512VBMI2__) && \
+      __has_include()
+  #include
+  #define ENABLE_AVX512
+
+#elif defined(_MSC_VER) && \
+      defined(__AVX512__) && \
+      __has_include()
   #include
+  #define ENABLE_AVX512
 
-#else // Default portable algorithm
-  #define DEFAULT_CPU_ARCH
+#else
+  #define ENABLE_DEFAULT
 #endif
 
 namespace {
@@ -406,8 +410,8 @@ void PrimeGenerator::fillPrevPrimes(Vector& primes,
   }
 }
 
-#if defined(DEFAULT_CPU_ARCH) || \
-    defined(MULTIARCH_TARGET_DEFAULT)
+#if defined(ENABLE_DEFAULT) || \
+    defined(ENABLE_MULTIARCH_DEFAULT)
 
 /// This method is used by iterator::next_prime().
 /// This method stores only the next few primes (~ 1000) in the
@@ -416,7 +420,7 @@ void PrimeGenerator::fillPrevPrimes(Vector& primes,
 /// this reason iterator::next_prime() runs up to 2x faster
 /// than iterator::prev_prime().
 ///
-#if defined(MULTIARCH_TARGET_DEFAULT)
+#if defined(ENABLE_MULTIARCH_DEFAULT)
   __attribute__ ((target ("default")))
 #endif
 void PrimeGenerator::fillNextPrimes(Vector& primes,
@@ -475,8 +479,8 @@ void PrimeGenerator::fillNextPrimes(Vector& primes,
 
 #endif
 
-#if defined(HAS_AVX512_VBMI2) || \
-    defined(MULTIARCH_TARGET_AVX512)
+#if defined(ENABLE_AVX512) || \
+    defined(ENABLE_MULTIARCH_AVX512)
 
 /// This algorithm converts 1 bits from the sieve array into primes
 /// using AVX512. The algorithm is a modified version of the AVX512
@@ -491,7 +495,7 @@ void PrimeGenerator::fillNextPrimes(Vector& primes,
 /// benchmarks this algorithm ran about 10% faster than the default
 /// fillNextPrimes() algorithm which uses __builtin_ctzll().
 ///
-#if defined(MULTIARCH_TARGET_AVX512)
+#if defined(ENABLE_MULTIARCH_AVX512)
   __attribute__ ((target ("avx512f,avx512vbmi,avx512vbmi2")))
 #endif
 void PrimeGenerator::fillNextPrimes(Vector& primes,
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 53885a4eb..190ed9e7e 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -5,6 +5,6 @@ foreach(file ${files})
     get_filename_component(binary_name ${file} NAME_WE)
     add_executable(${binary_name} ${file})
     target_link_libraries(${binary_name} primesieve::primesieve)
-    target_compile_definitions(${binary_name} PRIVATE "${ENABLE_ASSERT}" "${MULTIARCH_AVX512}")
+    target_compile_definitions(${binary_name} PRIVATE "${ENABLE_ASSERT}" "${ENABLE_MULTIARCH_AVX512}")
     add_test(NAME ${binary_name} COMMAND ${binary_name})
 endforeach()