Skip to content

Commit

Permalink
Merge pull request #16 from WalterKruger/master
Browse files Browse the repository at this point in the history
64-bit fastdiv & mod for MSVC
  • Loading branch information
lemire authored Nov 15, 2024
2 parents fa0c0a1 + bb35ec9 commit 5589d93
Show file tree
Hide file tree
Showing 6 changed files with 173 additions and 22 deletions.
128 changes: 122 additions & 6 deletions include/fastmod.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,71 @@ FASTMOD_API uint64_t mul128_s32(uint64_t lowbits, int32_t d) {
return mul128_u32(lowbits, d);
}

// Need _udiv128 to calculate the magic number (maps to x86 64-bit div)
#if defined(_M_AMD64) && ( _MSC_VER >= 1923)
// This is for the 64-bit functions.
// Visual Studio lacks support for 128-bit integers so they simulated are using
// multiword arithmatic and VS specific intrinsics.

FASTMOD_API uint64_t add128_u64(
uint64_t M_hi, uint64_t M_lo, uint64_t addend, uint64_t* sum_hi
) {
uint64_t sum_lo;

bool carry = _addcarry_u64(0, M_lo, addend, &sum_lo);
_addcarry_u64(carry, M_hi, 0, sum_hi); // Encourages 'adc'

return sum_lo;
}

FASTMOD_API uint64_t div128_u64(
uint64_t dividend_hi, uint64_t dividend_lo, uint64_t divisor, uint64_t* quotient_hi
) {
*quotient_hi = dividend_hi / divisor;
uint64_t remainder_hi = dividend_hi % divisor;

// When long div starts to consider the low dividend,
// the high part would have became its remainder.
// Prevents an arithmetic exception when _udiv128 calculates a >64-bit quotient
uint64_t remainder; // Discard
return _udiv128(remainder_hi, dividend_lo, divisor, &remainder);
}

// Multiplies the 128-bit integer by d and returns the lower 128-bits of the product
FASTMOD_API uint64_t mul128_u64_lo(
uint64_t M_hi, uint64_t M_lo, uint64_t d, uint64_t* product_hi
) {
uint64_t lowbits_hi;
uint64_t lowbits_lo = _umul128(M_lo, d, &lowbits_hi);

*product_hi = lowbits_hi + (M_hi * d);

return lowbits_lo;
}

// Multiplies the 128-bit integer by d and returns the highest 64-bits of the product
FASTMOD_API uint64_t mul128_u64_hi(uint64_t lowbits_hi, uint64_t lowbits_lo, uint64_t d) {
uint64_t bottomHalf_hi = __umulh(lowbits_lo, d);

uint64_t topHalf_hi;
uint64_t topHalf_lo = _umul128(lowbits_hi, d, &topHalf_hi);

uint64_t bothHalves_hi;
add128_u64(topHalf_hi, topHalf_lo, bottomHalf_hi, &bothHalves_hi);

return bothHalves_hi;
}

FASTMOD_API bool isgreater_u128(uint64_t a_hi, uint64_t a_low, uint64_t b_hi, uint64_t b_low) {
// Only when low is greater, high equality should return true
uint64_t discard;
bool borrowWhenEql = _subborrow_u64(0, b_low, a_low, &discard);

// borrow(b - (a + C_in)) = C_in? (a >= b) : (a > b)
return _subborrow_u64(borrowWhenEql, b_hi, a_hi, &discard);
}

#endif // End MSVC 64-bit support
#else // _MSC_VER NOT defined

FASTMOD_API uint64_t mul128_u32(uint64_t lowbits, uint32_t d) {
Expand Down Expand Up @@ -143,13 +208,11 @@ FASTMOD_API int32_t fastdiv_s32(int32_t a, uint64_t M, int32_t d) {
return (int32_t)(highbits);
}

#ifndef _MSC_VER

// What follows is the 64-bit functions.
// They are currently not supported on Visual Studio
// due to the lack of a mul128_u64 function.
// They may not be faster than what the compiler
// can produce.
// They may not be faster than what the compiler can produce.

#ifndef _MSC_VER
// No __uint128_t in VS, so they have to use a diffrent method.

FASTMOD_API __uint128_t computeM_u64(uint64_t d) {
// what follows is just ((__uint128_t)0 - 1) / d) + 1 spelled out
Expand All @@ -170,6 +233,59 @@ FASTMOD_API uint64_t fastdiv_u64(uint64_t a, __uint128_t M) {
return mul128_u64(M, a);
}

// given precomputed M, is_divisible checks whether n % d == 0
FASTMOD_API bool is_divisible_u64(uint64_t n, __uint128_t M) { return n * M <= M - 1; }

#elif defined(_MSC_VER) && defined(_M_AMD64) && (_MSC_VER >= 1923)
// Visual Studio lacks support for 128-bit integers
// so they simulated are using multiword arithmatic
// and VS specific intrinsics.

// Using a struct in the multiword arithmetic functions produces
// worse asm output but isn't that bad for public functions

typedef struct { uint64_t low; uint64_t hi; } fastmod_u128_t;

FASTMOD_API fastmod_u128_t computeM_u64(uint64_t d) {
// UINT128MAX / d
uint64_t magic_quotient_hi;
uint64_t magic_quotient_lo = div128_u64(
~UINT64_C(0), ~UINT64_C(0), d, &magic_quotient_hi
);

// quotient_u128 + 1
fastmod_u128_t M;
M.low = add128_u64(magic_quotient_hi, magic_quotient_lo, 1, &M.hi);
return M;
}

// computes (a % d) given precomputed M
FASTMOD_API uint64_t fastmod_u64(uint64_t a, fastmod_u128_t M, uint64_t d) {
uint64_t lowbits_hi;
uint64_t lowbits_lo = mul128_u64_lo(M.hi, M.low, a, &lowbits_hi);

return mul128_u64_hi(lowbits_hi, lowbits_lo, d);
}

// computes (a / d) given precomputed M for d>1
FASTMOD_API uint64_t fastdiv_u64(uint64_t a, fastmod_u128_t M) {
return mul128_u64_hi(M.hi, M.low, a);
}

// given precomputed M, is_divisible checks whether n % d == 0
FASTMOD_API bool is_divisible_u64(uint64_t n, fastmod_u128_t M) {
uint64_t lowBits_hi;
uint64_t lowBits_low = mul128_u64_lo(M.hi, M.low, n, &lowBits_hi);

uint64_t Mdec_low, Mdec_hi;
bool borrow_hi = _subborrow_u64(0, M.low, 1, &Mdec_low);
_subborrow_u64(borrow_hi, M.hi, 0, &Mdec_hi);

// n * M <= M - 1
return !isgreater_u128(lowBits_hi, lowBits_low, Mdec_hi, Mdec_low);
}


// End of the 64-bit functions

#endif // #ifndef _MSC_VER
Expand Down
6 changes: 3 additions & 3 deletions tests/cppincludetest1.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ bool testunsigned64(uint64_t min, uint64_t max, bool verbose) {
printf("skipping d = 0\n");
continue;
}
#ifndef _MSC_VER
__uint128_t M64 = computeM_u64(d);

auto M64 = computeM_u64(d);
if (verbose)
printf("d = %" PRIu64 " (unsigned 64-bit) ", d);
else
Expand All @@ -37,7 +37,7 @@ bool testunsigned64(uint64_t min, uint64_t max, bool verbose) {
return false;
}
}
#endif

if (verbose)
printf("ok!\n");
}
Expand Down
10 changes: 5 additions & 5 deletions tests/cppincludetest2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ bool testsigned(int32_t min, int32_t max, bool verbose) {
printf("skipping d = 0 as it cannot be supported\n");
continue;
}
if (d == -2147483648) {
if (d == INT32_MIN) {
printf("skipping d = -2147483648 as it is unsupported\n");
continue;
}
Expand Down Expand Up @@ -166,7 +166,7 @@ bool testdivsigned(int32_t min, int32_t max, bool verbose) {
printf("\n==Testing divsigned with min = %d and max = %d\n", min, max);
assert(min != max + 1); // infinite loop!
size_t count = 0;
static_assert(int32_t(-2147483648) < int32_t(2147483647));
static_assert(int32_t(INT32_MIN) < int32_t(2147483647));
for (int32_t d = min; (d <= max) && (d >= min); d++) {
if (d == 0) {
printf("skipping d = 0\n");
Expand Down Expand Up @@ -205,7 +205,7 @@ bool testdivsigned(int32_t min, int32_t max, bool verbose) {
d, a);
printf("expected %d div %d = %d \n", a, d, computedDiv);
printf("got %d div %d = %d \n", a, d, computedFastDiv);
if (d == -2147483648) {
if (d == INT32_MIN) {
printf("Note: d = -2147483648 is unsupported\n");
} else {
return false;
Expand Down Expand Up @@ -235,7 +235,7 @@ int main(int argc, char *argv[]) {
}

isok = isok && testdivsigned(0x7ffffff8, 0x7fffffff, verbose);
isok = isok && testdivsigned(-0x80000000, -0x7ffffff8, verbose);
isok = isok && testdivsigned(INT32_MIN, -0x7ffffff8, verbose);
isok = isok && testdivsigned(2, 10, verbose);
isok = isok && testdivsigned(-10, -2, verbose);
isok = isok && testunsigned64(1, 0x10, verbose);
Expand All @@ -251,7 +251,7 @@ int main(int argc, char *argv[]) {
isok = isok && testsigned(-8, -1, verbose);
isok = isok && testsigned(1, 8, verbose);
isok = isok && testsigned(0x7ffffff8, 0x7fffffff, verbose);
isok = isok && testsigned(-0x80000000, -0x7ffffff8, verbose);
isok = isok && testsigned(INT32_MIN, -0x7ffffff8, verbose);
for (int k = 0; k < 100; k++) {
int32_t x = rand();
isok = isok && testsigned(x, x, verbose);
Expand Down
20 changes: 18 additions & 2 deletions tests/moddivnbenchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,28 @@
#include <cstdio>
#include <random>

#ifdef _MSC_VER

// Taken from Facebook's folly
// https://github.com/facebook/folly/blob/0f6bc7a3f0133bd49226b50026de60e708900577/folly/Benchmark.h#L270-L284
#pragma optimize("", off)
inline void doNotOptimizeDependencySink(const void*) {}

#pragma optimize("", on)
template <class T>
void doNotOptimizeAway(const T& datum) {
doNotOptimizeDependencySink(&datum);
}
#else

template <typename T> inline void doNotOptimizeAway(T &&datum) {
// Taken from Facebook's folly
// https://github.com/facebook/folly/blob/0f6bc7a3f0133bd49226b50026de60e708900577/folly/Benchmark.h#L318-L326
asm volatile("" ::"m"(datum) : "memory");
}

#endif

using namespace fastmod;
template <typename F>
uint64_t time(const F &x, const std::vector<uint64_t> &zomg) {
Expand All @@ -29,7 +45,7 @@ int main() {
std::vector<uint64_t> zomg(10000000);
for (auto &e : zomg)
e = mt();
#ifndef _MSC_VER

const auto M = computeM_u64(mod);
auto fm = time(
[M, mod](uint64_t v) {
Expand All @@ -39,5 +55,5 @@ int main() {
auto sm = time([mod](uint64_t x) { return (x % mod) + (x / mod); }, zomg);
std::fprintf(stderr, "fastmod + fastdiv is %lf as fast as x86 mod + div: \n",
(double)sm / fm);
#endif

}
20 changes: 18 additions & 2 deletions tests/modnbenchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,28 @@
#include <iostream>
#include <random>

#ifdef _MSC_VER

// Taken from Facebook's folly
// https://github.com/facebook/folly/blob/0f6bc7a3f0133bd49226b50026de60e708900577/folly/Benchmark.h#L270-L284
#pragma optimize("", off)
inline void doNotOptimizeDependencySink(const void*) {}

#pragma optimize("", on)
template <class T>
void doNotOptimizeAway(const T& datum) {
doNotOptimizeDependencySink(&datum);
}
#else

template <typename T> inline void doNotOptimizeAway(T &&datum) {
// Taken from Facebook's folly
// https://github.com/facebook/folly/blob/0f6bc7a3f0133bd49226b50026de60e708900577/folly/Benchmark.h#L318-L326
asm volatile("" ::"m"(datum) : "memory");
}

#endif

using namespace fastmod;
template <typename F> uint64_t time(const F &x, std::vector<uint64_t> &zomg) {
auto start = std::chrono::high_resolution_clock::now();
Expand All @@ -30,7 +46,7 @@ int main() {
std::vector<uint64_t> zomg(100000000);
for (auto &e : zomg)
e = mt();
#ifndef _MSC_VER

const auto M = computeM_u64(mod);
std::cout << "timing fastmod_u64 " << std::endl;
auto fmtime =
Expand Down Expand Up @@ -59,5 +75,5 @@ int main() {
std::fprintf(stderr,
"masking is %lf as fast as fastmod and %lf as fast as modding\n",
(double)fmtime / masktime, (double)modtime / masktime);
#endif

}
11 changes: 7 additions & 4 deletions tests/unit.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
#ifdef __cplusplus
using namespace fastmod;
#endif
#ifdef _MSC_VER
typedef fastmod_u128_t __uint128_t;
#endif

bool testunsigned64(uint64_t min, uint64_t max, bool verbose) {
for (uint64_t d = min; (d <= max) && (d >= min); d++) {
Expand Down Expand Up @@ -130,7 +133,7 @@ bool testsigned(int32_t min, int32_t max, bool verbose) {
printf("skipping d = 0 as it cannot be supported\n");
continue;
}
if (d == -2147483648) {
if (d == INT32_MIN) {
printf("skipping d = -2147483648 as it is unsupported\n");
continue;
}
Expand Down Expand Up @@ -181,7 +184,7 @@ bool testdivsigned(int32_t min, int32_t max, bool verbose) {
printf("skipping d = -1 as it is not supported\n");
continue;
}
if (d == -2147483648) {
if (d == INT32_MIN) {
printf("skipping d = -2147483648 as it is unsupported\n");
continue;
}
Expand Down Expand Up @@ -228,7 +231,7 @@ int main(int argc, char *argv[]) {
isok = isok && testunsigned64(UINT64_C(0xffffffffff00000), UINT64_C(0xffffffffff00000) + 0x100, verbose);
isok = isok && testunsigned(1, 8, verbose);
isok = isok && testunsigned(0xfffffff8, 0xffffffff, verbose);
isok = isok && testdivsigned(-0x80000000, -0x7ffffff8, verbose);
isok = isok && testdivsigned(INT32_MIN, -0x7ffffff8, verbose);
isok = isok && testdivsigned(2, 10, verbose);
isok = isok && testdivsigned(0x7ffffff8, 0x7fffffff, verbose);
isok = isok && testdivsigned(-10, -2, verbose);
Expand All @@ -239,7 +242,7 @@ int main(int argc, char *argv[]) {
isok = isok && testsigned(-8, -1, verbose);
isok = isok && testsigned(1, 8, verbose);
isok = isok && testsigned(0x7ffffff8, 0x7fffffff, verbose);
isok = isok && testsigned(-0x80000000, -0x7ffffff8, verbose);
isok = isok && testsigned(INT32_MIN, -0x7ffffff8, verbose);
for (int k = 0; k < 100; k++) {
int32_t x = rand();
isok = isok && testsigned(x, x, verbose);
Expand Down

0 comments on commit 5589d93

Please sign in to comment.