Skip to content

Commit

Permalink
Add AVX512 pre-sieving
Browse files Browse the repository at this point in the history
  • Loading branch information
kimwalisch committed Nov 11, 2024
1 parent cdfd3e1 commit d59a2be
Show file tree
Hide file tree
Showing 5 changed files with 301 additions and 36 deletions.
3 changes: 2 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -86,9 +86,10 @@ set(LIB_SRC src/api-c.cpp

if(WITH_MULTIARCH)
include("${PROJECT_SOURCE_DIR}/cmake/multiarch_x86_popcnt.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/multiarch_avx512_bw.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/multiarch_avx512_vbmi2.cmake")

if(multiarch_x86_popcnt OR multiarch_avx512_vbmi2)
if(multiarch_x86_popcnt OR multiarch_avx512_bw OR multiarch_avx512_vbmi2)
set(LIB_SRC ${LIB_SRC} src/x86/cpuid.cpp)
endif()
endif()
Expand Down
80 changes: 80 additions & 0 deletions cmake/multiarch_avx512_bw.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# We use GCC/Clang's function multi-versioning for AVX512
# support. This code will automatically dispatch to the
# AVX512 BW algorithm if the CPU supports it and use the
# default (portable) algorithm otherwise.

include(CheckCXXSourceCompiles)
include(CMakePushCheckState)

cmake_push_check_state()
set(CMAKE_REQUIRED_INCLUDES "${PROJECT_SOURCE_DIR}")

check_cxx_source_compiles("
// GCC/Clang function multiversioning for AVX512 is not needed if
// the user compiles with -mavx512f -mavx512bw.
// GCC/Clang function multiversioning generally causes a minor
// overhead, hence we disable it if it is not needed.
#if defined(__AVX512F__) && \
defined(__AVX512BW__)
Error: AVX512BW multiarch not needed!
#endif
#include <src/x86/cpuid.cpp>
#include <immintrin.h>
#include <stdint.h>
#include <cstddef>
__attribute__ ((target (\"avx512f,avx512bw\")))
void AND_PreSieveTables_avx512(const uint8_t* __restrict preSieve0,
const uint8_t* __restrict preSieve1,
uint8_t* __restrict sieve,
std::size_t bytes)
{
std::size_t i = 0;
for (; i + 64 <= bytes; i += sizeof(__m512i))
{
_mm512_storeu_epi8((__m512i*) &sieve[i],
_mm512_and_si512(_mm512_loadu_epi8((const __m512i*) &preSieve0[i]),
_mm512_loadu_epi8((const __m512i*) &preSieve1[i])));
}
if (i < bytes)
{
__mmask64 mask = 0xffffffffffffffffull >> (i + 64 - bytes);
_mm512_mask_storeu_epi8((__m512i*) &sieve[i], mask,
_mm512_and_si512(_mm512_maskz_loadu_epi8(mask, (const __m512i*) &preSieve0[i]),
_mm512_maskz_loadu_epi8(mask, (const __m512i*) &preSieve1[i])));
}
}
void AND_PreSieveTables_default(const uint8_t* __restrict preSieved0,
const uint8_t* __restrict preSieved1,
uint8_t* __restrict sieve,
std::size_t bytes)
{
for (std::size_t i = 0; i < bytes; i++)
sieve[i] = preSieved0[i] & preSieved1[i];
}
int main()
{
uint8_t sieve[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
uint8_t PreSieveTable1[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
uint8_t PreSieveTable2[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
if (primesieve::has_cpuid_avx512_bw())
AND_PreSieveTables_avx512(&PreSieveTable1[0], &PreSieveTable1[1], &sieve[0], 10);
else
AND_PreSieveTables_default(&PreSieveTable1[0], &PreSieveTable1[1], &sieve[0], 10);
return (sieve[0] == 0) ? 0 : 1;
}
" multiarch_avx512_bw)

if(multiarch_avx512_bw)
list(APPEND PRIMESIEVE_COMPILE_DEFINITIONS "ENABLE_MULTIARCH_BW")
endif()

cmake_pop_check_state()
27 changes: 27 additions & 0 deletions include/primesieve/cpu_supports_avx512_bw.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
///
/// @file cpu_supports_avx512_bw.hpp
/// @brief Detect if the x86 CPU supports AVX512 BW.
///
/// Copyright (C) 2024 Kim Walisch, <[email protected]>
///
/// This file is distributed under the BSD License. See the COPYING
/// file in the top level directory.
///

#ifndef CPU_SUPPORTS_AVX512_BW_HPP
#define CPU_SUPPORTS_AVX512_BW_HPP

namespace primesieve {

bool has_cpuid_avx512_bw();

} // namespace

namespace {

/// Initialized at startup
const bool cpu_supports_avx512_bw = primesieve::has_cpuid_avx512_bw();

} // namespace

#endif
193 changes: 159 additions & 34 deletions src/PreSieve.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,21 +59,107 @@
#define HAS_ARM_NEON
#endif

#if defined(__AVX512F__) && \
defined(__AVX512BW__) && \
__has_include(<immintrin.h>)
#include <immintrin.h>
#define ENABLE_AVX512_BW

#elif defined(ENABLE_MULTIARCH_BW) && \
__has_include(<immintrin.h>)
#include <primesieve/cpu_supports_avx512_bw.hpp>
#include <immintrin.h>
#endif

#if !defined(ENABLE_AVX512_BW)
#define ENABLE_DEFAULT
#endif

namespace {

#if defined(ENABLE_AVX512_BW) || \
defined(ENABLE_MULTIARCH_BW)

#if defined(ENABLE_MULTIARCH_BW)
__attribute__ ((target ("avx512f,avx512bw")))
#endif
void AND_PreSieveTables_avx512(const uint8_t* __restrict preSieved0,
const uint8_t* __restrict preSieved1,
const uint8_t* __restrict preSieved2,
const uint8_t* __restrict preSieved3,
uint8_t* __restrict sieve,
std::size_t bytes)
{
std::size_t i = 0;

for (; i + 64 <= bytes; i += sizeof(__m512i))
{
_mm512_storeu_epi8((__m512i*) &sieve[i],
_mm512_and_si512(
_mm512_and_si512(_mm512_loadu_epi8((const __m512i*) &preSieved0[i]), _mm512_loadu_epi8((const __m512i*) &preSieved1[i])),
_mm512_and_si512(_mm512_loadu_epi8((const __m512i*) &preSieved2[i]), _mm512_loadu_epi8((const __m512i*) &preSieved3[i]))));
}

if (i < bytes)
{
__mmask64 mask = 0xffffffffffffffffull >> (i + 64 - bytes);

_mm512_mask_storeu_epi8((__m512i*) &sieve[i], mask,
_mm512_and_si512(
_mm512_and_si512(_mm512_maskz_loadu_epi8(mask, (const __m512i*) &preSieved0[i]), _mm512_maskz_loadu_epi8(mask, (const __m512i*) &preSieved1[i])),
_mm512_and_si512(_mm512_maskz_loadu_epi8(mask, (const __m512i*) &preSieved2[i]), _mm512_maskz_loadu_epi8(mask, (const __m512i*) &preSieved3[i]))));
}
}

#if defined(ENABLE_MULTIARCH_BW)
__attribute__ ((target ("avx512f,avx512bw")))
#endif
void AND_PreSieveTables_Sieve_avx512(const uint8_t* __restrict preSieved0,
const uint8_t* __restrict preSieved1,
const uint8_t* __restrict preSieved2,
const uint8_t* __restrict preSieved3,
uint8_t* __restrict sieve,
std::size_t bytes)
{
std::size_t i = 0;

for (; i + 64 <= bytes; i += sizeof(__m512i))
{
_mm512_storeu_epi8((__m512i*) &sieve[i],
_mm512_and_si512(_mm512_loadu_epi8((const __m512i*) &sieve[i]), _mm512_and_si512(
_mm512_and_si512(_mm512_loadu_epi8((const __m512i*) &preSieved0[i]), _mm512_loadu_epi8((const __m512i*) &preSieved1[i])),
_mm512_and_si512(_mm512_loadu_epi8((const __m512i*) &preSieved2[i]), _mm512_loadu_epi8((const __m512i*) &preSieved3[i])))));
}

if (i < bytes)
{
__mmask64 mask = 0xffffffffffffffffull >> (i + 64 - bytes);

_mm512_mask_storeu_epi8((__m512i*) &sieve[i], mask,
_mm512_and_si512(_mm512_maskz_loadu_epi8(mask, (const __m512i*) &sieve[i]), _mm512_and_si512(
_mm512_and_si512(_mm512_maskz_loadu_epi8(mask, (const __m512i*) &preSieved0[i]), _mm512_maskz_loadu_epi8(mask, (const __m512i*) &preSieved1[i])),
_mm512_and_si512(_mm512_maskz_loadu_epi8(mask, (const __m512i*) &preSieved2[i]), _mm512_maskz_loadu_epi8(mask, (const __m512i*) &preSieved3[i])))));
}
}

#endif

/// This section contains portable SIMD algorithms that don't need
/// any runtime CPU support checks.
#if defined(ENABLE_DEFAULT)
#if defined(HAS_SSE2)

/// Since compiler auto-vectorization is not 100% reliable, we have
/// manually vectorized the AND_PreSieveTables() function for x64 CPUs.
/// This algorithm is portable since all x64 CPUs support the SSE2
/// instruction set.
///
void AND_PreSieveTables(const uint8_t* __restrict preSieved0,
const uint8_t* __restrict preSieved1,
const uint8_t* __restrict preSieved2,
const uint8_t* __restrict preSieved3,
uint8_t* __restrict sieve,
std::size_t bytes)
void AND_PreSieveTables_default(const uint8_t* __restrict preSieved0,
const uint8_t* __restrict preSieved1,
const uint8_t* __restrict preSieved2,
const uint8_t* __restrict preSieved3,
uint8_t* __restrict sieve,
std::size_t bytes)
{
std::size_t i = 0;
std::size_t limit = bytes - bytes % sizeof(__m128i);
Expand All @@ -97,12 +183,12 @@ void AND_PreSieveTables(const uint8_t* __restrict preSieved0,
sieve[i] = preSieved0[i] & preSieved1[i] & preSieved2[i] & preSieved3[i];
}

void AND_PreSieveTables_Sieve(const uint8_t* __restrict preSieved0,
const uint8_t* __restrict preSieved1,
const uint8_t* __restrict preSieved2,
const uint8_t* __restrict preSieved3,
uint8_t* __restrict sieve,
std::size_t bytes)
void AND_PreSieveTables_Sieve_default(const uint8_t* __restrict preSieved0,
const uint8_t* __restrict preSieved1,
const uint8_t* __restrict preSieved2,
const uint8_t* __restrict preSieved3,
uint8_t* __restrict sieve,
std::size_t bytes)
{
std::size_t i = 0;
std::size_t limit = bytes - bytes % sizeof(__m128i);
Expand All @@ -128,12 +214,12 @@ void AND_PreSieveTables_Sieve(const uint8_t* __restrict preSieved0,
/// workaround for this Homebrew issue we have manually vectorized
/// the Bitwise AND loop using ARM NEON.
///
void AND_PreSieveTables(const uint8_t* __restrict preSieved0,
const uint8_t* __restrict preSieved1,
const uint8_t* __restrict preSieved2,
const uint8_t* __restrict preSieved3,
uint8_t* __restrict sieve,
std::size_t bytes)
void AND_PreSieveTables_default(const uint8_t* __restrict preSieved0,
const uint8_t* __restrict preSieved1,
const uint8_t* __restrict preSieved2,
const uint8_t* __restrict preSieved3,
uint8_t* __restrict sieve,
std::size_t bytes)
{
std::size_t i = 0;
std::size_t limit = bytes - bytes % sizeof(uint8x16_t);
Expand All @@ -150,12 +236,12 @@ void AND_PreSieveTables(const uint8_t* __restrict preSieved0,
sieve[i] = preSieved0[i] & preSieved1[i] & preSieved2[i] & preSieved3[i];
}

void AND_PreSieveTables_Sieve(const uint8_t* __restrict preSieved0,
const uint8_t* __restrict preSieved1,
const uint8_t* __restrict preSieved2,
const uint8_t* __restrict preSieved3,
uint8_t* __restrict sieve,
std::size_t bytes)
void AND_PreSieveTables_Sieve_default(const uint8_t* __restrict preSieved0,
const uint8_t* __restrict preSieved1,
const uint8_t* __restrict preSieved2,
const uint8_t* __restrict preSieved3,
uint8_t* __restrict sieve,
std::size_t bytes)
{
std::size_t i = 0;
std::size_t limit = bytes - bytes % sizeof(uint8x16_t);
Expand All @@ -174,12 +260,12 @@ void AND_PreSieveTables_Sieve(const uint8_t* __restrict preSieved0,

#else

void AND_PreSieveTables(const uint8_t* __restrict preSieved0,
const uint8_t* __restrict preSieved1,
const uint8_t* __restrict preSieved2,
const uint8_t* __restrict preSieved3,
uint8_t* __restrict sieve,
std::size_t bytes)
void AND_PreSieveTables_default(const uint8_t* __restrict preSieved0,
const uint8_t* __restrict preSieved1,
const uint8_t* __restrict preSieved2,
const uint8_t* __restrict preSieved3,
uint8_t* __restrict sieve,
std::size_t bytes)
{
// This loop will get auto-vectorized if compiled with GCC/Clang
// using -O3. Using GCC -O2 does not auto-vectorize this loop
Expand All @@ -190,18 +276,57 @@ void AND_PreSieveTables(const uint8_t* __restrict preSieved0,
sieve[i] = preSieved0[i] & preSieved1[i] & preSieved2[i] & preSieved3[i];
}

void AND_PreSieveTables_Sieve_default(const uint8_t* __restrict preSieved0,
const uint8_t* __restrict preSieved1,
const uint8_t* __restrict preSieved2,
const uint8_t* __restrict preSieved3,
uint8_t* __restrict sieve,
std::size_t bytes)
{
for (std::size_t i = 0; i < bytes; i++)
sieve[i] &= preSieved0[i] & preSieved1[i] & preSieved2[i] & preSieved3[i];
}

#endif
#endif

void AND_PreSieveTables(const uint8_t* __restrict preSieved0,
const uint8_t* __restrict preSieved1,
const uint8_t* __restrict preSieved2,
const uint8_t* __restrict preSieved3,
uint8_t* __restrict sieve,
std::size_t bytes)
{
#if defined(ENABLE_AVX512_BW)
AND_PreSieveTables_avx512(preSieved0, preSieved1, preSieved2, preSieved3, sieve, bytes);
#elif defined(ENABLE_MULTIARCH_BW)
if (cpu_supports_avx512_bw)
AND_PreSieveTables_avx512(preSieved0, preSieved1, preSieved2, preSieved3, sieve, bytes);
else
AND_PreSieveTables_default(preSieved0, preSieved1, preSieved2, preSieved3, sieve, bytes);
#else
AND_PreSieveTables_default(preSieved0, preSieved1, preSieved2, preSieved3, sieve, bytes);
#endif
}

void AND_PreSieveTables_Sieve(const uint8_t* __restrict preSieved0,
const uint8_t* __restrict preSieved1,
const uint8_t* __restrict preSieved2,
const uint8_t* __restrict preSieved3,
uint8_t* __restrict sieve,
std::size_t bytes)
{
for (std::size_t i = 0; i < bytes; i++)
sieve[i] &= preSieved0[i] & preSieved1[i] & preSieved2[i] & preSieved3[i];
}

#if defined(ENABLE_AVX512_BW)
AND_PreSieveTables_Sieve_avx512(preSieved0, preSieved1, preSieved2, preSieved3, sieve, bytes);
#elif defined(ENABLE_MULTIARCH_BW)
if (cpu_supports_avx512_bw)
AND_PreSieveTables_Sieve_avx512(preSieved0, preSieved1, preSieved2, preSieved3, sieve, bytes);
else
AND_PreSieveTables_Sieve_default(preSieved0, preSieved1, preSieved2, preSieved3, sieve, bytes);
#else
AND_PreSieveTables_Sieve_default(preSieved0, preSieved1, preSieved2, preSieved3, sieve, bytes);
#endif
}

} // namespace

Expand Down
Loading

0 comments on commit d59a2be

Please sign in to comment.