From 04a7f6ea005c6594e3689646454e863d3fa8af60 Mon Sep 17 00:00:00 2001 From: Walter Date: Sat, 16 Nov 2024 09:06:26 +1100 Subject: [PATCH 1/2] 64-bit div & mod for MSVC Added 64-bit fastdiv & mod support for MSVC via multi-word arithmetic and VS specific intrinsics. This produces almost identical asm output to GCC/Clang's __uint128_t. I also had to slightly modify the tests to make it compatible for MSVC: - Removed `#ifndef _MSC_VER` directives - `__uint128_t` => `auto` - Literals `-2147483648` & `-0x80000000` => `INT32_MIN` (to prevent overflow) - Added VS compatible `doNotOptimizeAway` --- include/fastmod.h | 128 +++++++++++++++++++++++++++++++++++-- tests/cppincludetest1.cpp | 6 +- tests/cppincludetest2.cpp | 10 +-- tests/moddivnbenchmark.cpp | 20 +++++- tests/modnbenchmark.cpp | 20 +++++- tests/unit.c | 11 ++-- 6 files changed, 173 insertions(+), 22 deletions(-) diff --git a/include/fastmod.h b/include/fastmod.h index 226648f..adbeebb 100644 --- a/include/fastmod.h +++ b/include/fastmod.h @@ -49,6 +49,71 @@ FASTMOD_API uint64_t mul128_s32(uint64_t lowbits, int32_t d) { return mul128_u32(lowbits, d); } +// Need _udiv128 to calculate the magic number (maps to x86 64-bit div) +#if defined(_M_AMD64) && ( _MSC_VER >= 1923) +// This is for the 64-bit functions. +// Visual Studio lacks support for 128-bit integers so they simulated are using +// multiword arithmatic and VS specific intrinsics. + + FASTMOD_API uint64_t add128_u64( + uint64_t M_hi, uint64_t M_lo, uint64_t addend, uint64_t* sum_hi + ) { + uint64_t sum_lo; + + bool carry = _addcarry_u64(0, M_lo, addend, &sum_lo); + _addcarry_u64(carry, M_hi, 0, sum_hi); // Encourages 'adc' + + return sum_lo; + } + + FASTMOD_API uint64_t div128_u64( + uint64_t dividend_hi, uint64_t dividend_lo, uint64_t divisor, uint64_t* quotient_hi + ) { + *quotient_hi = dividend_hi / divisor; + uint64_t remainder_hi = dividend_hi % divisor; + + // When long div starts to consider the low dividend, + // the high part would have became its remainder. + // Prevents an arithmetic exception when _udiv128 calculates a >64-bit quotient + uint64_t remainder; // Discard + return _udiv128(remainder_hi, dividend_lo, divisor, &remainder); + } + + // Multiplies the 128-bit integer by d and returns the lower 128-bits of the product + FASTMOD_API uint64_t mul128_u64_lo( + uint64_t M_hi, uint64_t M_lo, uint64_t d, uint64_t* product_hi + ) { + uint64_t lowbits_hi; + uint64_t lowbits_lo = _umul128(M_lo, d, &lowbits_hi); + + *product_hi = lowbits_hi + (M_hi * d); + + return lowbits_lo; + } + + // Multiplies the 128-bit integer by d and returns the highest 64-bits of the product + FASTMOD_API uint64_t mul128_u64_hi(uint64_t lowbits_hi, uint64_t lowbits_lo, uint64_t d) { + uint64_t bottomHalf_hi = __umulh(lowbits_lo, d); + + uint64_t topHalf_hi; + uint64_t topHalf_lo = _umul128(lowbits_hi, d, &topHalf_hi); + + uint64_t bothHalves_hi; + add128_u64(topHalf_hi, topHalf_lo, bottomHalf_hi, &bothHalves_hi); + + return bothHalves_hi; + } + + FASTMOD_API bool isgreater_u128(uint64_t a_hi, uint64_t a_low, uint64_t b_hi, uint64_t b_low) { + // Only when low is greater, high equality should return true + uint64_t discard; + bool borrowWhenEql = _subborrow_u64(0, b_low, a_low, &discard); + + // borrow(b - (a + C_in)) = C_in? (a >= b) : (a > b) + return _subborrow_u64(borrowWhenEql, b_hi, a_hi, &discard); + } + +#endif // End MSVC 64-bit support #else // _MSC_VER NOT defined FASTMOD_API uint64_t mul128_u32(uint64_t lowbits, uint32_t d) { @@ -143,13 +208,11 @@ FASTMOD_API int32_t fastdiv_s32(int32_t a, uint64_t M, int32_t d) { return (int32_t)(highbits); } -#ifndef _MSC_VER - // What follows is the 64-bit functions. -// They are currently not supported on Visual Studio -// due to the lack of a mul128_u64 function. -// They may not be faster than what the compiler -// can produce. +// They may not be faster than what the compiler can produce. + +#ifndef _MSC_VER +// No __uint128_t in VS, so they have to use a diffrent method. FASTMOD_API __uint128_t computeM_u64(uint64_t d) { // what follows is just ((__uint128_t)0 - 1) / d) + 1 spelled out @@ -170,6 +233,59 @@ FASTMOD_API uint64_t fastdiv_u64(uint64_t a, __uint128_t M) { return mul128_u64(M, a); } +// given precomputed M, is_divisible checks whether n % d == 0 +FASTMOD_API bool is_divisible_u64(uint64_t n, __uint128_t M) { return n * M <= M - 1; } + +#elif defined(_MSC_VER) && defined(_M_AMD64) && (_MSC_VER >= 1923) +// Visual Studio lacks support for 128-bit integers +// so they simulated are using multiword arithmatic +// and VS specific intrinsics. + +// Using a struct in the multiword arithmetic functions produces +// worse asm output but isn't that bad for public functions + +typedef struct { uint64_t low; uint64_t hi; } fastmod_u128_t; + +FASTMOD_API fastmod_u128_t computeM_u64(uint64_t d) { + // UINT128MAX / d + uint64_t magic_quotient_hi; + uint64_t magic_quotient_lo = div128_u64( + ~UINT64_C(0), ~UINT64_C(0), d, &magic_quotient_hi + ); + + // quotient_u128 + 1 + fastmod_u128_t M; + M.low = add128_u64(magic_quotient_hi, magic_quotient_lo, 1, &M.hi); + return M; +} + +// computes (a % d) given precomputed M +FASTMOD_API uint64_t fastmod_u64(uint64_t a, fastmod_u128_t M, uint64_t d) { + uint64_t lowbits_hi; + uint64_t lowbits_lo = mul128_u64_lo(M.hi, M.low, a, &lowbits_hi); + + return mul128_u64_hi(lowbits_hi, lowbits_lo, d); +} + +// computes (a / d) given precomputed M for d>1 +FASTMOD_API uint64_t fastdiv_u64(uint64_t a, fastmod_u128_t M) { + return mul128_u64_hi(M.hi, M.low, a); +} + +// given precomputed M, is_divisible checks whether n % d == 0 +FASTMOD_API bool is_divisible_u64(uint64_t n, fastmod_u128_t M) { + uint64_t lowBits_hi; + uint64_t lowBits_low = mul128_u64_lo(M.hi, M.low, n, &lowBits_hi); + + uint64_t Mdec_low, Mdec_hi; + bool borrow_hi = _subborrow_u64(0, M.low, 1, &Mdec_low); + _subborrow_u64(borrow_hi, M.hi, 0, &Mdec_hi); + + // n * M <= M - 1 + return !isgreater_u128(lowBits_hi, lowBits_low, Mdec_hi, Mdec_low); +} + + // End of the 64-bit functions #endif // #ifndef _MSC_VER diff --git a/tests/cppincludetest1.cpp b/tests/cppincludetest1.cpp index 4dab58f..359b6a3 100644 --- a/tests/cppincludetest1.cpp +++ b/tests/cppincludetest1.cpp @@ -15,8 +15,8 @@ bool testunsigned64(uint64_t min, uint64_t max, bool verbose) { printf("skipping d = 0\n"); continue; } -#ifndef _MSC_VER - __uint128_t M64 = computeM_u64(d); + + auto M64 = computeM_u64(d); if (verbose) printf("d = %" PRIu64 " (unsigned 64-bit) ", d); else @@ -37,7 +37,7 @@ bool testunsigned64(uint64_t min, uint64_t max, bool verbose) { return false; } } -#endif + if (verbose) printf("ok!\n"); } diff --git a/tests/cppincludetest2.cpp b/tests/cppincludetest2.cpp index fe53425..bf36b30 100644 --- a/tests/cppincludetest2.cpp +++ b/tests/cppincludetest2.cpp @@ -119,7 +119,7 @@ bool testsigned(int32_t min, int32_t max, bool verbose) { printf("skipping d = 0 as it cannot be supported\n"); continue; } - if (d == -2147483648) { + if (d == INT32_MIN) { printf("skipping d = -2147483648 as it is unsupported\n"); continue; } @@ -166,7 +166,7 @@ bool testdivsigned(int32_t min, int32_t max, bool verbose) { printf("\n==Testing divsigned with min = %d and max = %d\n", min, max); assert(min != max + 1); // infinite loop! size_t count = 0; - static_assert(int32_t(-2147483648) < int32_t(2147483647)); + static_assert(int32_t(INT32_MIN) < int32_t(2147483647)); for (int32_t d = min; (d <= max) && (d >= min); d++) { if (d == 0) { printf("skipping d = 0\n"); @@ -205,7 +205,7 @@ bool testdivsigned(int32_t min, int32_t max, bool verbose) { d, a); printf("expected %d div %d = %d \n", a, d, computedDiv); printf("got %d div %d = %d \n", a, d, computedFastDiv); - if (d == -2147483648) { + if (d == INT32_MIN) { printf("Note: d = -2147483648 is unsupported\n"); } else { return false; @@ -235,7 +235,7 @@ int main(int argc, char *argv[]) { } isok = isok && testdivsigned(0x7ffffff8, 0x7fffffff, verbose); - isok = isok && testdivsigned(-0x80000000, -0x7ffffff8, verbose); + isok = isok && testdivsigned(INT32_MIN, -0x7ffffff8, verbose); isok = isok && testdivsigned(2, 10, verbose); isok = isok && testdivsigned(-10, -2, verbose); isok = isok && testunsigned64(1, 0x10, verbose); @@ -251,7 +251,7 @@ int main(int argc, char *argv[]) { isok = isok && testsigned(-8, -1, verbose); isok = isok && testsigned(1, 8, verbose); isok = isok && testsigned(0x7ffffff8, 0x7fffffff, verbose); - isok = isok && testsigned(-0x80000000, -0x7ffffff8, verbose); + isok = isok && testsigned(INT32_MIN, -0x7ffffff8, verbose); for (int k = 0; k < 100; k++) { int32_t x = rand(); isok = isok && testsigned(x, x, verbose); diff --git a/tests/moddivnbenchmark.cpp b/tests/moddivnbenchmark.cpp index 5ebb08a..92fe628 100644 --- a/tests/moddivnbenchmark.cpp +++ b/tests/moddivnbenchmark.cpp @@ -3,12 +3,28 @@ #include #include +#ifdef _MSC_VER + +// Taken from Facebook's folly +// https://github.com/facebook/folly/blob/0f6bc7a3f0133bd49226b50026de60e708900577/folly/Benchmark.h#L270-L284 +#pragma optimize("", off) +inline void doNotOptimizeDependencySink(const void*) {} + +#pragma optimize("", on) +template +void doNotOptimizeAway(const T& datum) { + doNotOptimizeDependencySink(&datum); +} +#else + template inline void doNotOptimizeAway(T &&datum) { // Taken from Facebook's folly // https://github.com/facebook/folly/blob/0f6bc7a3f0133bd49226b50026de60e708900577/folly/Benchmark.h#L318-L326 asm volatile("" ::"m"(datum) : "memory"); } +#endif + using namespace fastmod; template uint64_t time(const F &x, const std::vector &zomg) { @@ -29,7 +45,7 @@ int main() { std::vector zomg(10000000); for (auto &e : zomg) e = mt(); -#ifndef _MSC_VER + const auto M = computeM_u64(mod); auto fm = time( [M, mod](uint64_t v) { @@ -39,5 +55,5 @@ int main() { auto sm = time([mod](uint64_t x) { return (x % mod) + (x / mod); }, zomg); std::fprintf(stderr, "fastmod + fastdiv is %lf as fast as x86 mod + div: \n", (double)sm / fm); -#endif + } diff --git a/tests/modnbenchmark.cpp b/tests/modnbenchmark.cpp index 503cc87..f7539c6 100644 --- a/tests/modnbenchmark.cpp +++ b/tests/modnbenchmark.cpp @@ -4,12 +4,28 @@ #include #include +#ifdef _MSC_VER + +// Taken from Facebook's folly +// https://github.com/facebook/folly/blob/0f6bc7a3f0133bd49226b50026de60e708900577/folly/Benchmark.h#L270-L284 +#pragma optimize("", off) +inline void doNotOptimizeDependencySink(const void*) {} + +#pragma optimize("", on) +template +void doNotOptimizeAway(const T& datum) { + doNotOptimizeDependencySink(&datum); +} +#else + template inline void doNotOptimizeAway(T &&datum) { // Taken from Facebook's folly // https://github.com/facebook/folly/blob/0f6bc7a3f0133bd49226b50026de60e708900577/folly/Benchmark.h#L318-L326 asm volatile("" ::"m"(datum) : "memory"); } +#endif + using namespace fastmod; template uint64_t time(const F &x, std::vector &zomg) { auto start = std::chrono::high_resolution_clock::now(); @@ -30,7 +46,7 @@ int main() { std::vector zomg(100000000); for (auto &e : zomg) e = mt(); -#ifndef _MSC_VER + const auto M = computeM_u64(mod); std::cout << "timing fastmod_u64 " << std::endl; auto fmtime = @@ -59,5 +75,5 @@ int main() { std::fprintf(stderr, "masking is %lf as fast as fastmod and %lf as fast as modding\n", (double)fmtime / masktime, (double)modtime / masktime); -#endif + } diff --git a/tests/unit.c b/tests/unit.c index 23023ed..c1e0a25 100644 --- a/tests/unit.c +++ b/tests/unit.c @@ -9,6 +9,9 @@ #ifdef __cplusplus using namespace fastmod; #endif +#ifdef _MSC_VER +typedef fastmod_u128_t __uint128_t; +#endif bool testunsigned64(uint64_t min, uint64_t max, bool verbose) { for (uint64_t d = min; (d <= max) && (d >= min); d++) { @@ -130,7 +133,7 @@ bool testsigned(int32_t min, int32_t max, bool verbose) { printf("skipping d = 0 as it cannot be supported\n"); continue; } - if (d == -2147483648) { + if (d == INT32_MIN) { printf("skipping d = -2147483648 as it is unsupported\n"); continue; } @@ -181,7 +184,7 @@ bool testdivsigned(int32_t min, int32_t max, bool verbose) { printf("skipping d = -1 as it is not supported\n"); continue; } - if (d == -2147483648) { + if (d == INT32_MIN) { printf("skipping d = -2147483648 as it is unsupported\n"); continue; } @@ -228,7 +231,7 @@ int main(int argc, char *argv[]) { isok = isok && testunsigned64(UINT64_C(0xffffffffff00000), UINT64_C(0xffffffffff00000) + 0x100, verbose); isok = isok && testunsigned(1, 8, verbose); isok = isok && testunsigned(0xfffffff8, 0xffffffff, verbose); - isok = isok && testdivsigned(-0x80000000, -0x7ffffff8, verbose); + isok = isok && testdivsigned(INT32_MIN, -0x7ffffff8, verbose); isok = isok && testdivsigned(2, 10, verbose); isok = isok && testdivsigned(0x7ffffff8, 0x7fffffff, verbose); isok = isok && testdivsigned(-10, -2, verbose); @@ -239,7 +242,7 @@ int main(int argc, char *argv[]) { isok = isok && testsigned(-8, -1, verbose); isok = isok && testsigned(1, 8, verbose); isok = isok && testsigned(0x7ffffff8, 0x7fffffff, verbose); - isok = isok && testsigned(-0x80000000, -0x7ffffff8, verbose); + isok = isok && testsigned(INT32_MIN, -0x7ffffff8, verbose); for (int k = 0; k < 100; k++) { int32_t x = rand(); isok = isok && testsigned(x, x, verbose); From bb35ec909ba39997b8b4f589a2d364789bdddb3b Mon Sep 17 00:00:00 2001 From: Walter Date: Sat, 16 Nov 2024 09:24:46 +1100 Subject: [PATCH 2/2] Style: Removed indentation There was a bunch of superfluous indentation in the multi-word arithmetic functions. --- include/fastmod.h | 114 +++++++++++++++++++++++----------------------- 1 file changed, 57 insertions(+), 57 deletions(-) diff --git a/include/fastmod.h b/include/fastmod.h index adbeebb..7da3a55 100644 --- a/include/fastmod.h +++ b/include/fastmod.h @@ -55,63 +55,63 @@ FASTMOD_API uint64_t mul128_s32(uint64_t lowbits, int32_t d) { // Visual Studio lacks support for 128-bit integers so they simulated are using // multiword arithmatic and VS specific intrinsics. - FASTMOD_API uint64_t add128_u64( - uint64_t M_hi, uint64_t M_lo, uint64_t addend, uint64_t* sum_hi - ) { - uint64_t sum_lo; - - bool carry = _addcarry_u64(0, M_lo, addend, &sum_lo); - _addcarry_u64(carry, M_hi, 0, sum_hi); // Encourages 'adc' - - return sum_lo; - } - - FASTMOD_API uint64_t div128_u64( - uint64_t dividend_hi, uint64_t dividend_lo, uint64_t divisor, uint64_t* quotient_hi - ) { - *quotient_hi = dividend_hi / divisor; - uint64_t remainder_hi = dividend_hi % divisor; - - // When long div starts to consider the low dividend, - // the high part would have became its remainder. - // Prevents an arithmetic exception when _udiv128 calculates a >64-bit quotient - uint64_t remainder; // Discard - return _udiv128(remainder_hi, dividend_lo, divisor, &remainder); - } - - // Multiplies the 128-bit integer by d and returns the lower 128-bits of the product - FASTMOD_API uint64_t mul128_u64_lo( - uint64_t M_hi, uint64_t M_lo, uint64_t d, uint64_t* product_hi - ) { - uint64_t lowbits_hi; - uint64_t lowbits_lo = _umul128(M_lo, d, &lowbits_hi); - - *product_hi = lowbits_hi + (M_hi * d); - - return lowbits_lo; - } - - // Multiplies the 128-bit integer by d and returns the highest 64-bits of the product - FASTMOD_API uint64_t mul128_u64_hi(uint64_t lowbits_hi, uint64_t lowbits_lo, uint64_t d) { - uint64_t bottomHalf_hi = __umulh(lowbits_lo, d); - - uint64_t topHalf_hi; - uint64_t topHalf_lo = _umul128(lowbits_hi, d, &topHalf_hi); - - uint64_t bothHalves_hi; - add128_u64(topHalf_hi, topHalf_lo, bottomHalf_hi, &bothHalves_hi); - - return bothHalves_hi; - } - - FASTMOD_API bool isgreater_u128(uint64_t a_hi, uint64_t a_low, uint64_t b_hi, uint64_t b_low) { - // Only when low is greater, high equality should return true - uint64_t discard; - bool borrowWhenEql = _subborrow_u64(0, b_low, a_low, &discard); - - // borrow(b - (a + C_in)) = C_in? (a >= b) : (a > b) - return _subborrow_u64(borrowWhenEql, b_hi, a_hi, &discard); - } +FASTMOD_API uint64_t add128_u64( + uint64_t M_hi, uint64_t M_lo, uint64_t addend, uint64_t* sum_hi + ) { + uint64_t sum_lo; + + bool carry = _addcarry_u64(0, M_lo, addend, &sum_lo); + _addcarry_u64(carry, M_hi, 0, sum_hi); // Encourages 'adc' + + return sum_lo; +} + +FASTMOD_API uint64_t div128_u64( + uint64_t dividend_hi, uint64_t dividend_lo, uint64_t divisor, uint64_t* quotient_hi + ) { + *quotient_hi = dividend_hi / divisor; + uint64_t remainder_hi = dividend_hi % divisor; + + // When long div starts to consider the low dividend, + // the high part would have became its remainder. + // Prevents an arithmetic exception when _udiv128 calculates a >64-bit quotient + uint64_t remainder; // Discard + return _udiv128(remainder_hi, dividend_lo, divisor, &remainder); +} + +// Multiplies the 128-bit integer by d and returns the lower 128-bits of the product +FASTMOD_API uint64_t mul128_u64_lo( + uint64_t M_hi, uint64_t M_lo, uint64_t d, uint64_t* product_hi + ) { + uint64_t lowbits_hi; + uint64_t lowbits_lo = _umul128(M_lo, d, &lowbits_hi); + + *product_hi = lowbits_hi + (M_hi * d); + + return lowbits_lo; +} + +// Multiplies the 128-bit integer by d and returns the highest 64-bits of the product +FASTMOD_API uint64_t mul128_u64_hi(uint64_t lowbits_hi, uint64_t lowbits_lo, uint64_t d) { + uint64_t bottomHalf_hi = __umulh(lowbits_lo, d); + + uint64_t topHalf_hi; + uint64_t topHalf_lo = _umul128(lowbits_hi, d, &topHalf_hi); + + uint64_t bothHalves_hi; + add128_u64(topHalf_hi, topHalf_lo, bottomHalf_hi, &bothHalves_hi); + + return bothHalves_hi; +} + +FASTMOD_API bool isgreater_u128(uint64_t a_hi, uint64_t a_low, uint64_t b_hi, uint64_t b_low) { + // Only when low is greater, high equality should return true + uint64_t discard; + bool borrowWhenEql = _subborrow_u64(0, b_low, a_low, &discard); + + // borrow(b - (a + C_in)) = C_in? (a >= b) : (a > b) + return _subborrow_u64(borrowWhenEql, b_hi, a_hi, &discard); +} #endif // End MSVC 64-bit support #else // _MSC_VER NOT defined