diff --git a/include/primesieve/popcnt.hpp b/include/primesieve/popcnt.hpp index a112b2cc..455c63b1 100644 --- a/include/primesieve/popcnt.hpp +++ b/include/primesieve/popcnt.hpp @@ -19,6 +19,15 @@ #include "cpu_supports_popcnt.hpp" #endif +// GCC & Clang +#if defined(__GNUC__) || \ + __has_builtin(__builtin_popcountl) + +// CPUID is only enabled on x86 and x86-64 CPUs +// if the user compiles without -mpopcnt. +#if defined(ENABLE_MULTIARCH_x86_POPCNT) +#if defined(__x86_64__) + namespace { /// This uses fewer arithmetic operations than any other known @@ -40,19 +49,6 @@ NOINLINE uint64_t popcnt64_bitwise(uint64_t x) return (x * h01) >> 56; } -} // namespace - -// GCC & Clang -#if defined(__GNUC__) || \ - __has_builtin(__builtin_popcountl) - -// CPUID is only enabled on x86 and x86-64 CPUs -// if the user compiles without -mpopcnt. -#if defined(ENABLE_MULTIARCH_x86_POPCNT) -#if defined(__x86_64__) - -namespace { - ALWAYS_INLINE uint64_t popcnt64(uint64_t x) { // On my AMD EPYC 7642 CPU using GCC 12 this runtime @@ -72,6 +68,25 @@ ALWAYS_INLINE uint64_t popcnt64(uint64_t x) namespace { +/// This uses fewer arithmetic operations than any other known +/// implementation on machines with fast multiplication. +/// It uses 12 arithmetic operations, one of which is a multiply. +/// http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation +/// +NOINLINE uint64_t popcnt64_bitwise(uint64_t x) +{ + uint64_t m1 = 0x5555555555555555ull; + uint64_t m2 = 0x3333333333333333ull; + uint64_t m4 = 0x0F0F0F0F0F0F0F0Full; + uint64_t h01 = 0x0101010101010101ull; + + x -= (x >> 1) & m1; + x = (x & m2) + ((x >> 2) & m2); + x = (x + (x >> 4)) & m4; + + return (x * h01) >> 56; +} + ALWAYS_INLINE uint64_t popcnt64(uint64_t x) { if_likely(cpu_supports_popcnt) @@ -120,23 +135,66 @@ ALWAYS_INLINE uint64_t popcnt64(uint64_t x) namespace { -ALWAYS_INLINE uint64_t popcnt64(uint64_t x) -{ #if defined(__POPCNT__) || \ defined(__AVX__) + +ALWAYS_INLINE uint64_t popcnt64(uint64_t x) +{ return __popcnt64(x); +} #elif defined(ENABLE_MULTIARCH_x86_POPCNT) + +/// This uses fewer arithmetic operations than any other known +/// implementation on machines with fast multiplication. +/// It uses 12 arithmetic operations, one of which is a multiply. +/// http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation +/// +NOINLINE uint64_t popcnt64_bitwise(uint64_t x) +{ + uint64_t m1 = 0x5555555555555555ull; + uint64_t m2 = 0x3333333333333333ull; + uint64_t m4 = 0x0F0F0F0F0F0F0F0Full; + uint64_t h01 = 0x0101010101010101ull; + + x -= (x >> 1) & m1; + x = (x & m2) + ((x >> 2) & m2); + x = (x + (x >> 4)) & m4; + + return (x * h01) >> 56; +} + +ALWAYS_INLINE uint64_t popcnt64(uint64_t x) +{ if_likely(cpu_supports_popcnt) return __popcnt64(x); else return popcnt64_bitwise(x); +} #else - return popcnt64_bitwise(x); -#endif + +/// This uses fewer arithmetic operations than any other known +/// implementation on machines with fast multiplication. +/// It uses 12 arithmetic operations, one of which is a multiply. +/// http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation +/// +ALWAYS_INLINE uint64_t popcnt64(uint64_t x) +{ + uint64_t m1 = 0x5555555555555555ull; + uint64_t m2 = 0x3333333333333333ull; + uint64_t m4 = 0x0F0F0F0F0F0F0F0Full; + uint64_t h01 = 0x0101010101010101ull; + + x -= (x >> 1) & m1; + x = (x & m2) + ((x >> 2) & m2); + x = (x + (x >> 4)) & m4; + + return (x * h01) >> 56; } +#endif + } // namespace #elif defined(_MSC_VER) && \ @@ -147,25 +205,68 @@ ALWAYS_INLINE uint64_t popcnt64(uint64_t x) namespace { -ALWAYS_INLINE uint64_t popcnt64(uint64_t x) -{ #if defined(__POPCNT__) || \ defined(__AVX__) + +ALWAYS_INLINE uint64_t popcnt64(uint64_t x) +{ return __popcnt(uint32_t(x)) + __popcnt(uint32_t(x >> 32)); +} #elif defined(ENABLE_MULTIARCH_x86_POPCNT) + +/// This uses fewer arithmetic operations than any other known +/// implementation on machines with fast multiplication. +/// It uses 12 arithmetic operations, one of which is a multiply. +/// http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation +/// +NOINLINE uint64_t popcnt64_bitwise(uint64_t x) +{ + uint64_t m1 = 0x5555555555555555ull; + uint64_t m2 = 0x3333333333333333ull; + uint64_t m4 = 0x0F0F0F0F0F0F0F0Full; + uint64_t h01 = 0x0101010101010101ull; + + x -= (x >> 1) & m1; + x = (x & m2) + ((x >> 2) & m2); + x = (x + (x >> 4)) & m4; + + return (x * h01) >> 56; +} + +ALWAYS_INLINE uint64_t popcnt64(uint64_t x) +{ if_likely(cpu_supports_popcnt) return __popcnt(uint32_t(x)) + __popcnt(uint32_t(x >> 32)); else return popcnt64_bitwise(x); +} #else - return popcnt64_bitwise(x); -#endif + +/// This uses fewer arithmetic operations than any other known +/// implementation on machines with fast multiplication. +/// It uses 12 arithmetic operations, one of which is a multiply. +/// http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation +/// +ALWAYS_INLINE uint64_t popcnt64(uint64_t x) +{ + uint64_t m1 = 0x5555555555555555ull; + uint64_t m2 = 0x3333333333333333ull; + uint64_t m4 = 0x0F0F0F0F0F0F0F0Full; + uint64_t h01 = 0x0101010101010101ull; + + x -= (x >> 1) & m1; + x = (x & m2) + ((x >> 2) & m2); + x = (x + (x >> 4)) & m4; + + return (x * h01) >> 56; } +#endif + } // namespace #elif __cplusplus >= 202002L && \ @@ -189,10 +290,23 @@ ALWAYS_INLINE uint64_t popcnt64(uint64_t x) namespace { -/// Portable (but slow) popcount algorithm +/// This uses fewer arithmetic operations than any other known +/// implementation on machines with fast multiplication. +/// It uses 12 arithmetic operations, one of which is a multiply. +/// http://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation +/// ALWAYS_INLINE uint64_t popcnt64(uint64_t x) { - return popcnt64_bitwise(x); + uint64_t m1 = 0x5555555555555555ull; + uint64_t m2 = 0x3333333333333333ull; + uint64_t m4 = 0x0F0F0F0F0F0F0F0Full; + uint64_t h01 = 0x0101010101010101ull; + + x -= (x >> 1) & m1; + x = (x & m2) + ((x >> 2) & m2); + x = (x + (x >> 4)) & m4; + + return (x * h01) >> 56; } } // namespace