Skip to content

Commit

Permalink
Improve Windows multiarch support
Browse files Browse the repository at this point in the history
kimwalisch committed Apr 15, 2024
1 parent 34179cc commit 17a49bd
Showing 10 changed files with 233 additions and 173 deletions.
3 changes: 2 additions & 1 deletion ChangeLog
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
Changes in version 12.3, 04/04/2024
Changes in version 12.3, 15/04/2024
===================================

* Improve Windows multiarch support (now works with MinGW64).
* Add runtime POPCNT detection using CPUID for x86 CPUs.
* Improve GCC/Clang multiarch preprocessor logic.
* CMakeLists.txt: Remove POPCNT/BMI check for x86 CPUs.
36 changes: 24 additions & 12 deletions cmake/multiarch_avx512_vbmi2.cmake
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
# We use GCC/Clang's function multi-versioning for AVX512
# support. This code will automatically dispatch to the
# AVX512 algorithm if the CPU supports AVX512 and use the
# default (portable) algorithm otherwise.
# AVX512 VBMI2 algorithm if the CPU supports it and use
# the default (portable) algorithm otherwise.

include(CheckCXXSourceCompiles)
include(CMakePushCheckState)

check_cxx_source_compiles("
#include <immintrin.h>
#include <stdint.h>
cmake_push_check_state()
set(CMAKE_REQUIRED_INCLUDES "${PROJECT_SOURCE_DIR}/include")

check_cxx_source_compiles("
// GCC/Clang function multiversioning for AVX512 is not needed if
// the user compiles with -mavx512f -mavx512vbmi -mavx512vbmi2.
// GCC/Clang function multiversioning generally causes a minor
@@ -19,22 +20,31 @@ check_cxx_source_compiles("
Error: AVX512VBMI2 multiarch not needed!
#endif
#include <primesieve/cpu_supports_avx512_vbmi2.hpp>
#include <immintrin.h>
#include <stdint.h>
class PrimeGenerator {
public:
__attribute__ ((target (\"default\")))
void fillNextPrimes(uint64_t* primes64);
public:
__attribute__ ((target (\"avx512f,avx512vbmi,avx512vbmi2\")))
void fillNextPrimes(uint64_t* primes64);
void fillNextPrimes_avx512(uint64_t* primes64);
void fillNextPrimes_default(uint64_t* primes64);
void fillNextPrimes(uint64_t* primes64)
{
if (cpu_supports_avx512_vbmi2)
fillNextPrimes_avx512(primes64);
else
fillNextPrimes_default(primes64);
}
};
__attribute__ ((target (\"default\")))
void PrimeGenerator::fillNextPrimes(uint64_t* primes64)
void PrimeGenerator::fillNextPrimes_default(uint64_t* primes64)
{
primes64[0] = 2;
}
__attribute__ ((target (\"avx512f,avx512vbmi,avx512vbmi2\")))
void PrimeGenerator::fillNextPrimes(uint64_t* primes64)
void PrimeGenerator::fillNextPrimes_avx512(uint64_t* primes64)
{
__m512i bytes_0_to_7 = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
__m512i base = _mm512_set1_epi64(123);
@@ -56,3 +66,5 @@ check_cxx_source_compiles("
if(multiarch_avx512_vbmi2)
set(ENABLE_MULTIARCH_AVX512 "ENABLE_MULTIARCH_AVX512")
endif()

cmake_pop_check_state()
45 changes: 3 additions & 42 deletions include/primesieve/CPUID.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
///
/// @file CPUID.hpp
/// @brief POPCNT detection fo x86 and x86-64 CPUs.
/// @file cpuid.hpp
/// @brief CPUID for x86 and x86-64 CPUs.
///
/// Copyright (C) 2024 Kim Walisch, <kim.walisch@gmail.com>
///
@@ -11,32 +11,13 @@
#ifndef CPUID_HPP
#define CPUID_HPP

// Enable on x86 and x86-64 CPUs
#if defined(__x86_64__) || \
defined(__i386__) || \
defined(_M_X64) || \
defined(_M_IX86)

// Both GCC and Clang (even Clang on Windows) define the __POPCNT__
// macro if the user compiles with -mpopcnt. The __POPCNT__
// macro is even defined if the user compiles with other flags
// such as -mavx or -march=native.
#if defined(__POPCNT__)
#define HAS_POPCNT
// The MSVC compiler does not support a POPCNT macro, but if the user
// compiles with e.g. /arch:AVX or /arch:AVX512 then MSVC defines
// the __AVX__ macro and POPCNT is also supported.
#elif defined(_MSC_VER) && defined(__AVX__)
#define HAS_POPCNT
#endif

#if defined(_MSC_VER)
#include <intrin.h>
#endif

namespace {

inline void run_CPUID(int eax, int ecx, int* abcd)
inline void run_cpuid(int eax, int ecx, int* abcd)
{
#if defined(_MSC_VER)
__cpuidex(abcd, eax, ecx);
@@ -69,26 +50,6 @@ inline void run_CPUID(int eax, int ecx, int* abcd)
#endif
}

#if !defined(HAS_POPCNT)
#define ENABLE_CPUID_POPCNT

inline bool run_CPUID_POPCNT()
{
// %ecx POPCNT bit flag
int bit_POPCNT = 1 << 23;
int abcd[4];

run_CPUID(1, 0, abcd);
return (abcd[2] & bit_POPCNT) == bit_POPCNT;
}

/// Initialized at startup
const bool HAS_CPUID_POPCNT = run_CPUID_POPCNT();

#endif // ENABLE_CPUID_POPCNT

} // namespace

#endif // x86 CPU

#endif
48 changes: 40 additions & 8 deletions include/primesieve/PrimeGenerator.hpp
Original file line number Diff line number Diff line change
@@ -23,6 +23,20 @@
#include <stdint.h>
#include <cstddef>

#if defined(__AVX512F__) && \
defined(__AVX512VBMI__) && \
defined(__AVX512VBMI2__) && \
__has_include(<immintrin.h>)
#define ENABLE_AVX512

#elif defined(ENABLE_MULTIARCH_AVX512) && \
__has_include(<immintrin.h>)
#include "cpu_supports_avx512_vbmi2.hpp"
#define ENABLE_DEFAULT
#else
#define ENABLE_DEFAULT
#endif

namespace primesieve {

class PreSieve;
@@ -34,18 +48,36 @@ class PrimeGenerator : public Erat
void fillPrevPrimes(Vector<uint64_t>& primes, std::size_t* size);
static uint64_t maxCachedPrime();

#if defined(ENABLE_MULTIARCH_AVX512)
#define ENABLE_MULTIARCH_DEFAULT
__attribute__ ((target ("avx512f,avx512vbmi,avx512vbmi2")))
void fillNextPrimes(Vector<uint64_t>& primes, std::size_t* size);
ALWAYS_INLINE void fillNextPrimes(Vector<uint64_t>& primes, std::size_t* size)
{
#if defined(ENABLE_AVX512)
fillNextPrimes_avx512(primes, size);
#elif defined(ENABLE_MULTIARCH_AVX512)
if (cpu_supports_avx512_vbmi2)
fillNextPrimes_avx512(primes, size);
else
fillNextPrimes_default(primes, size);
#else
fillNextPrimes_default(primes, size);
#endif
}

private:

#if defined(ENABLE_DEFAULT)
void fillNextPrimes_default(Vector<uint64_t>& primes, std::size_t* size);
#endif

#if defined(ENABLE_MULTIARCH_DEFAULT)
__attribute__ ((target ("default")))
#if defined(ENABLE_AVX512) || \
defined(ENABLE_MULTIARCH_AVX512)

#if defined(ENABLE_MULTIARCH_AVX512)
__attribute__ ((target ("avx512f,avx512vbmi,avx512vbmi2")))
#endif
void fillNextPrimes_avx512(Vector<uint64_t>& primes, std::size_t* size);

#endif
void fillNextPrimes(Vector<uint64_t>& primes, std::size_t* size);

private:
bool isInit_ = false;
uint64_t low_ = 0;
uint64_t prime_ = 0;
85 changes: 85 additions & 0 deletions include/primesieve/cpu_supports_avx512_vbmi2.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
///
/// @file cpu_supports_avx512_vbmi2.hpp
/// @brief Detect if the x86 CPU supports AVX512 VBMI2.
///
/// Copyright (C) 2024 Kim Walisch, <kim.walisch@gmail.com>
///
/// This file is distributed under the BSD License. See the COPYING
/// file in the top level directory.
///

#ifndef CPU_SUPPORTS_AVX512_VBMI2_HPP
#define CPU_SUPPORTS_AVX512_VBMI2_HPP

#include "cpuid.hpp"

#if defined(_MSC_VER)
#include <immintrin.h>
#endif

// %ebx bit flags
#define bit_AVX512F (1 << 16)

// %ecx bit flags
#define bit_AVX512VBMI (1 << 1)
#define bit_AVX512VBMI2 (1 << 6)

// xgetbv bit flags
#define XSTATE_SSE (1 << 1)
#define XSTATE_YMM (1 << 2)
#define XSTATE_ZMM (7 << 5)

namespace {

// Get Value of Extended Control Register
inline int get_xcr0()
{
int xcr0;

#if defined(_MSC_VER)
xcr0 = (int) _xgetbv(0);
#else
__asm__ ("xgetbv" : "=a" (xcr0) : "c" (0) : "%edx" );
#endif

return xcr0;
}

inline bool run_cpuid_avx512_vbmi2()
{
int abcd[4];

run_cpuid(1, 0, abcd);

int osxsave_mask = (1 << 27);

// Ensure OS supports extended processor state management
if ((abcd[2] & osxsave_mask) != osxsave_mask)
return false;

int ymm_mask = XSTATE_SSE | XSTATE_YMM;
int zmm_mask = XSTATE_SSE | XSTATE_YMM | XSTATE_ZMM;

int xcr0 = get_xcr0();

// Check AVX OS support
if ((xcr0 & ymm_mask) != ymm_mask)
return false;

// Check AVX512 OS support
if ((xcr0 & zmm_mask) != zmm_mask)
return false;

run_cpuid(7, 0, abcd);

// PrimeGenerator::fillNextPrimes() requires AVX512F, AVX512VBMI & AVX512VBMI2
return ((abcd[1] & bit_AVX512F) == bit_AVX512F &&
(abcd[2] & (bit_AVX512VBMI | bit_AVX512VBMI2)) == (bit_AVX512VBMI | bit_AVX512VBMI2));
}

/// Initialized at startup
bool cpu_supports_avx512_vbmi2 = run_cpuid_avx512_vbmi2();

} // namespace

#endif
58 changes: 58 additions & 0 deletions include/primesieve/cpu_supports_popcnt.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
///
/// @file cpu_supports_popcnt.hpp
/// @brief POPCNT detection fo x86 and x86-64 CPUs.
///
/// Copyright (C) 2024 Kim Walisch, <kim.walisch@gmail.com>
///
/// This file is distributed under the BSD License. See the COPYING
/// file in the top level directory.
///

#ifndef CPU_SUPPORTS_POPCNT_HPP
#define CPU_SUPPORTS_POPCNT_HPP

// Enable CPUID on x86 and x86-64 CPUs
#if defined(__x86_64__) || \
defined(__i386__) || \
defined(_M_X64) || \
defined(_M_IX86)

// Both GCC and Clang (even Clang on Windows) define the __POPCNT__
// macro if the user compiles with -mpopcnt. The __POPCNT__
// macro is even defined if the user compiles with other flags
// such as -mavx or -march=native.
#if defined(__POPCNT__)
#define HAS_POPCNT
// The MSVC compiler does not support a POPCNT macro, but if the user
// compiles with e.g. /arch:AVX or /arch:AVX512 then MSVC defines
// the __AVX__ macro and POPCNT is also supported.
#elif defined(_MSC_VER) && defined(__AVX__)
#define HAS_POPCNT
#endif

#if !defined(HAS_POPCNT)

#include "cpuid.hpp"
#define ENABLE_CPUID_POPCNT

namespace {

inline bool run_cpuid_supports_popcnt()
{
int abcd[4];
run_cpuid(1, 0, abcd);

// %ecx POPCNT bit flag
int bit_POPCNT = 1 << 23;
return (abcd[2] & bit_POPCNT) == bit_POPCNT;
}

/// Initialized at startup
bool cpu_supports_popcnt = run_cpuid_supports_popcnt();

} // namespace

#endif // !defined(HAS_POPCNT)
#endif // CPUID

#endif
11 changes: 5 additions & 6 deletions include/primesieve/intrinsics.hpp
Original file line number Diff line number Diff line change
@@ -11,7 +11,7 @@
#ifndef INTRINSICS_HPP
#define INTRINSICS_HPP

#include "CPUID.hpp"
#include "cpu_supports_popcnt.hpp"
#include "macros.hpp"

#include <stdint.h>
@@ -46,7 +46,6 @@ inline uint64_t popcnt64_bitwise(uint64_t x)
// CPUID is only enabled on x86 and x86-64 CPUs
// if the user compiles without -mpopcnt.
#if defined(ENABLE_CPUID_POPCNT)

#if defined(__x86_64__)

namespace {
@@ -55,7 +54,7 @@ inline uint64_t popcnt64(uint64_t x)
{
// On my AMD EPYC 7642 CPU using GCC 12 this runtime
// check incurs an overall overhead of about 1%.
if_likely(HAS_CPUID_POPCNT)
if_likely(cpu_supports_popcnt)
{
__asm__("popcnt %1, %0" : "=r"(x) : "r"(x));
return x;
@@ -78,7 +77,7 @@ namespace {

inline uint64_t popcnt64(uint64_t x)
{
if_likely(HAS_CPUID_POPCNT)
if_likely(cpu_supports_popcnt)
{
uint32_t x0 = uint32_t(x);
uint32_t x1 = uint32_t(x >> 32);
@@ -135,7 +134,7 @@ inline uint64_t popcnt64(uint64_t x)
#if defined(HAS_POPCNT)
return __popcnt64(x);
#elif defined(ENABLE_CPUID_POPCNT)
if_likely(HAS_CPUID_POPCNT)
if_likely(cpu_supports_popcnt)
return __popcnt64(x);
else
return popcnt64_bitwise(x);
@@ -160,7 +159,7 @@ inline uint64_t popcnt64(uint64_t x)
return __popcnt(uint32_t(x)) +
__popcnt(uint32_t(x >> 32));
#elif defined(ENABLE_CPUID_POPCNT)
if_likely(HAS_CPUID_POPCNT)
if_likely(cpu_supports_popcnt)
return __popcnt(uint32_t(x)) +
__popcnt(uint32_t(x >> 32));
else
88 changes: 8 additions & 80 deletions src/CpuInfo.cpp
Original file line number Diff line number Diff line change
@@ -25,7 +25,6 @@
///

#include <primesieve/CpuInfo.hpp>
#include <primesieve/CPUID.hpp>
#include <primesieve/macros.hpp>

#include <algorithm>
@@ -44,76 +43,9 @@
defined(__x86_64__) || \
defined(_M_IX86) || \
defined(_M_X64)

#if defined(_MSC_VER)
#include <intrin.h>
#include <immintrin.h>
#endif

#define HAS_CPUID

/* %ebx bit flags */
#define bit_AVX512F (1 << 16)

/* %ecx bit flags */
#define bit_AVX512VBMI (1 << 1)
#define bit_AVX512VBMI2 (1 << 6)

/* xgetbv bit flags */
#define XSTATE_SSE (1 << 1)
#define XSTATE_YMM (1 << 2)
#define XSTATE_ZMM (7 << 5)

namespace {

// Get Value of Extended Control Register
int get_xcr0()
{
int xcr0;

#if defined(_MSC_VER)
xcr0 = (int) _xgetbv(0);
#else
__asm__ ("xgetbv" : "=a" (xcr0) : "c" (0) : "%edx" );
#endif

return xcr0;
}

bool has_AVX512()
{
int abcd[4];

run_CPUID(1, 0, abcd);

int osxsave_mask = (1 << 27);

// Ensure OS supports extended processor state management
if ((abcd[2] & osxsave_mask) != osxsave_mask)
return false;

int ymm_mask = XSTATE_SSE | XSTATE_YMM;
int zmm_mask = XSTATE_SSE | XSTATE_YMM | XSTATE_ZMM;

int xcr0 = get_xcr0();

// Check AVX OS support
if ((xcr0 & ymm_mask) != ymm_mask)
return false;

// Check AVX512 OS support
if ((xcr0 & zmm_mask) != zmm_mask)
return false;

run_CPUID(7, 0, abcd);

// PrimeGenerator::fillNextPrimes() requires AVX512F, AVX512VBMI & AVX512VBMI2
return ((abcd[1] & bit_AVX512F) == bit_AVX512F &&
(abcd[2] & (bit_AVX512VBMI | bit_AVX512VBMI2)) == (bit_AVX512VBMI | bit_AVX512VBMI2));
}

} // namespace

#include <primesieve/cpuid.hpp>
#include <primesieve/cpu_supports_avx512_vbmi2.hpp>
#define HAS_CPUID
#endif

#if defined(_WIN32)
@@ -136,19 +68,19 @@ std::string getCpuName()
// https://en.wikipedia.org/wiki/CPUID

int cpuInfo[4] = { 0, 0, 0, 0 };
run_CPUID(0x80000000, 0, cpuInfo);
run_cpuid(0x80000000, 0, cpuInfo);
std::vector<int> vect;

// check if CPU name is supported
if ((unsigned) cpuInfo[0] >= 0x80000004u)
{
run_CPUID(0x80000002, 0, cpuInfo);
run_cpuid(0x80000002, 0, cpuInfo);
std::copy_n(cpuInfo, 4, std::back_inserter(vect));

run_CPUID(0x80000003, 0, cpuInfo);
run_cpuid(0x80000003, 0, cpuInfo);
std::copy_n(cpuInfo, 4, std::back_inserter(vect));

run_CPUID(0x80000004, 0, cpuInfo);
run_cpuid(0x80000004, 0, cpuInfo);
std::copy_n(cpuInfo, 4, std::back_inserter(vect));

vect.push_back(0);
@@ -823,14 +755,10 @@ std::string CpuInfo::cpuName() const
}
}

/// This method is only used by the primesieve command-line app
/// with the --cpu-info option. Therefore we currently don't
/// cache the result of has_AVX512().
///
bool CpuInfo::hasAVX512() const
{
#if defined(HAS_CPUID)
return has_AVX512();
return cpu_supports_avx512_vbmi2;
#else
return false;
#endif
26 changes: 5 additions & 21 deletions src/PrimeGenerator.cpp
Original file line number Diff line number Diff line change
@@ -35,19 +35,9 @@
#include <algorithm>
#include <limits>

#if defined(ENABLE_MULTIARCH_AVX512) && \
__has_include(<immintrin.h>)
#include <immintrin.h>

#elif defined(__AVX512F__) && \
defined(__AVX512VBMI__) && \
defined(__AVX512VBMI2__) && \
__has_include(<immintrin.h>)
#if defined(ENABLE_AVX512) || \
defined(ENABLE_MULTIARCH_AVX512)
#include <immintrin.h>
#define ENABLE_AVX512

#else
#define ENABLE_DEFAULT
#endif

namespace {
@@ -404,8 +394,7 @@ void PrimeGenerator::fillPrevPrimes(Vector<uint64_t>& primes,
}
}

#if defined(ENABLE_DEFAULT) || \
defined(ENABLE_MULTIARCH_DEFAULT)
#if defined(ENABLE_DEFAULT)

/// This method is used by iterator::next_prime().
/// This method stores only the next few primes (~ 1000) in the
@@ -414,11 +403,7 @@ void PrimeGenerator::fillPrevPrimes(Vector<uint64_t>& primes,
/// this reason iterator::next_prime() runs up to 2x faster
/// than iterator::prev_prime().
///
#if defined(ENABLE_MULTIARCH_DEFAULT)
__attribute__ ((target ("default")))
#endif
void PrimeGenerator::fillNextPrimes(Vector<uint64_t>& primes,
std::size_t* size)
void PrimeGenerator::fillNextPrimes_default(Vector<uint64_t>& primes, std::size_t* size)
{
*size = 0;

@@ -492,8 +477,7 @@ void PrimeGenerator::fillNextPrimes(Vector<uint64_t>& primes,
#if defined(ENABLE_MULTIARCH_AVX512)
__attribute__ ((target ("avx512f,avx512vbmi,avx512vbmi2")))
#endif
void PrimeGenerator::fillNextPrimes(Vector<uint64_t>& primes,
std::size_t* size)
void PrimeGenerator::fillNextPrimes_avx512(Vector<uint64_t>& primes, std::size_t* size)
{
*size = 0;

6 changes: 3 additions & 3 deletions test/CPUID.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
///
/// @file CPUID.cpp
/// @file cpuid.cpp
/// @brief Test CPUID code on x86 and x64 CPUs.
///
/// Copyright (C) 2024 Kim Walisch, <kim.walisch@gmail.com>
@@ -8,7 +8,7 @@
/// file in the top level directory.
///

#include <primesieve/CPUID.hpp>
#include <primesieve/cpu_supports_avx512_vbmi2.hpp>
#include <iostream>

int main()
@@ -64,7 +64,7 @@ int main()
#endif

#if defined(ENABLE_CPUID_POPCNT)
std::cout << "CPU supports POPCNT: " << (HAS_CPUID_POPCNT ? "yes" : "no") << std::endl;
std::cout << "CPU supports POPCNT: " << (cpu_supports_popcnt ? "yes" : "no") << std::endl;
#endif

#endif

0 comments on commit 17a49bd

Please sign in to comment.