From 04a7f6ea005c6594e3689646454e863d3fa8af60 Mon Sep 17 00:00:00 2001
From: Walter <walter.kruger@hotmail.com>
Date: Sat, 16 Nov 2024 09:06:26 +1100
Subject: [PATCH 1/2] 64-bit div & mod for MSVC

Added 64-bit fastdiv & mod support for MSVC via multi-word arithmetic and VS specific intrinsics. This produces almost identical asm output to GCC/Clang's __uint128_t.

I also had to slightly modify the tests to make it compatible for MSVC:
- Removed `#ifndef _MSC_VER` directives
- `__uint128_t` => `auto`
- Literals `-2147483648` & `-0x80000000` => `INT32_MIN` (to prevent overflow)
- Added VS compatible `doNotOptimizeAway`
---
 include/fastmod.h          | 128 +++++++++++++++++++++++++++++++++++--
 tests/cppincludetest1.cpp  |   6 +-
 tests/cppincludetest2.cpp  |  10 +--
 tests/moddivnbenchmark.cpp |  20 +++++-
 tests/modnbenchmark.cpp    |  20 +++++-
 tests/unit.c               |  11 ++--
 6 files changed, 173 insertions(+), 22 deletions(-)

diff --git a/include/fastmod.h b/include/fastmod.h
index 226648f..adbeebb 100644
--- a/include/fastmod.h
+++ b/include/fastmod.h
@@ -49,6 +49,71 @@ FASTMOD_API uint64_t mul128_s32(uint64_t lowbits, int32_t d) {
   return mul128_u32(lowbits, d);
 }
 
+// Need _udiv128 to calculate the magic number (maps to x86 64-bit div)
+#if defined(_M_AMD64) && ( _MSC_VER >= 1923)
+// This is for the 64-bit functions.
+// Visual Studio lacks support for 128-bit integers so they simulated are using
+// multiword arithmatic and VS specific intrinsics.
+
+    FASTMOD_API uint64_t add128_u64(
+        uint64_t M_hi, uint64_t M_lo, uint64_t addend, uint64_t* sum_hi
+    ) {
+        uint64_t sum_lo;
+
+        bool carry = _addcarry_u64(0, M_lo, addend, &sum_lo);
+        _addcarry_u64(carry, M_hi, 0, sum_hi); // Encourages 'adc'
+
+        return sum_lo;
+    }
+
+    FASTMOD_API uint64_t div128_u64(
+        uint64_t dividend_hi, uint64_t dividend_lo, uint64_t divisor, uint64_t* quotient_hi
+    ) {
+        *quotient_hi = dividend_hi / divisor;
+        uint64_t remainder_hi = dividend_hi % divisor;
+
+        // When long div starts to consider the low dividend,
+        // the high part would have became its remainder.
+        // Prevents an arithmetic exception when _udiv128 calculates a >64-bit quotient
+        uint64_t remainder; // Discard
+        return _udiv128(remainder_hi, dividend_lo, divisor, &remainder);
+    }
+
+    // Multiplies the 128-bit integer by d and returns the lower 128-bits of the product
+    FASTMOD_API uint64_t mul128_u64_lo(
+        uint64_t M_hi, uint64_t M_lo, uint64_t d, uint64_t* product_hi
+    ) {
+        uint64_t lowbits_hi;
+        uint64_t lowbits_lo = _umul128(M_lo, d, &lowbits_hi);
+
+        *product_hi = lowbits_hi + (M_hi * d);
+
+        return lowbits_lo;
+    }
+
+    // Multiplies the 128-bit integer by d and returns the highest 64-bits of the product
+    FASTMOD_API uint64_t mul128_u64_hi(uint64_t lowbits_hi, uint64_t lowbits_lo, uint64_t d) {
+        uint64_t bottomHalf_hi = __umulh(lowbits_lo, d);
+
+        uint64_t topHalf_hi;
+        uint64_t topHalf_lo = _umul128(lowbits_hi, d, &topHalf_hi);
+
+        uint64_t bothHalves_hi;
+        add128_u64(topHalf_hi, topHalf_lo, bottomHalf_hi, &bothHalves_hi);
+
+        return bothHalves_hi;
+    }
+
+    FASTMOD_API bool isgreater_u128(uint64_t a_hi, uint64_t a_low, uint64_t b_hi, uint64_t b_low) {
+        // Only when low is greater, high equality should return true
+        uint64_t discard;
+        bool borrowWhenEql = _subborrow_u64(0, b_low, a_low, &discard);
+
+        // borrow(b - (a + C_in)) = C_in? (a >= b) : (a > b)
+        return _subborrow_u64(borrowWhenEql, b_hi, a_hi, &discard);
+    }
+
+#endif // End MSVC 64-bit support
 #else // _MSC_VER NOT defined
 
 FASTMOD_API uint64_t mul128_u32(uint64_t lowbits, uint32_t d) {
@@ -143,13 +208,11 @@ FASTMOD_API int32_t fastdiv_s32(int32_t a, uint64_t M, int32_t d) {
   return (int32_t)(highbits);
 }
 
-#ifndef _MSC_VER
-
 // What follows is the 64-bit functions.
-// They are currently not supported on Visual Studio
-// due to the lack of a mul128_u64 function.
-// They may not be faster than what the compiler
-// can produce.
+// They may not be faster than what the compiler can produce.
+
+#ifndef _MSC_VER
+// No __uint128_t in VS, so they have to use a diffrent method.
 
 FASTMOD_API __uint128_t computeM_u64(uint64_t d) {
   // what follows is just ((__uint128_t)0 - 1) / d) + 1 spelled out
@@ -170,6 +233,59 @@ FASTMOD_API uint64_t fastdiv_u64(uint64_t a, __uint128_t M) {
   return mul128_u64(M, a);
 }
 
+// given precomputed M, is_divisible checks whether n % d == 0
+FASTMOD_API bool is_divisible_u64(uint64_t n, __uint128_t M) { return n * M <= M - 1; }
+
+#elif defined(_MSC_VER) && defined(_M_AMD64) && (_MSC_VER >= 1923)
+// Visual Studio lacks support for 128-bit integers
+// so they simulated are using multiword arithmatic
+// and VS specific intrinsics.
+
+// Using a struct in the multiword arithmetic functions produces
+// worse asm output but isn't that bad for public functions
+
+typedef struct { uint64_t low; uint64_t hi; } fastmod_u128_t;
+
+FASTMOD_API fastmod_u128_t computeM_u64(uint64_t d) {
+  // UINT128MAX / d
+  uint64_t magic_quotient_hi;
+  uint64_t magic_quotient_lo = div128_u64(
+    ~UINT64_C(0), ~UINT64_C(0), d, &magic_quotient_hi
+  );
+
+  // quotient_u128 + 1
+  fastmod_u128_t M;
+  M.low = add128_u64(magic_quotient_hi, magic_quotient_lo, 1, &M.hi);
+  return M;
+}
+
+// computes (a % d) given precomputed M
+FASTMOD_API uint64_t fastmod_u64(uint64_t a, fastmod_u128_t M, uint64_t d) {
+  uint64_t lowbits_hi;
+  uint64_t lowbits_lo = mul128_u64_lo(M.hi, M.low, a, &lowbits_hi);
+
+  return mul128_u64_hi(lowbits_hi, lowbits_lo, d);
+}
+
+// computes (a / d) given precomputed M for d>1
+FASTMOD_API uint64_t fastdiv_u64(uint64_t a, fastmod_u128_t M) {
+  return mul128_u64_hi(M.hi, M.low, a);
+}
+
+// given precomputed M, is_divisible checks whether n % d == 0
+FASTMOD_API bool is_divisible_u64(uint64_t n, fastmod_u128_t M) {
+  uint64_t lowBits_hi;
+  uint64_t lowBits_low = mul128_u64_lo(M.hi, M.low, n, &lowBits_hi);
+
+  uint64_t Mdec_low, Mdec_hi;
+  bool borrow_hi = _subborrow_u64(0, M.low, 1, &Mdec_low);
+  _subborrow_u64(borrow_hi, M.hi, 0, &Mdec_hi);
+
+  // n * M <= M - 1
+  return !isgreater_u128(lowBits_hi, lowBits_low, Mdec_hi, Mdec_low);
+}
+
+
 // End of the 64-bit functions
 
 #endif // #ifndef _MSC_VER
diff --git a/tests/cppincludetest1.cpp b/tests/cppincludetest1.cpp
index 4dab58f..359b6a3 100644
--- a/tests/cppincludetest1.cpp
+++ b/tests/cppincludetest1.cpp
@@ -15,8 +15,8 @@ bool testunsigned64(uint64_t min, uint64_t max, bool verbose) {
       printf("skipping d = 0\n");
       continue;
     }
-#ifndef _MSC_VER
-    __uint128_t M64 = computeM_u64(d);
+
+    auto M64 = computeM_u64(d);
     if (verbose)
       printf("d = %" PRIu64 " (unsigned 64-bit) ", d);
     else
@@ -37,7 +37,7 @@ bool testunsigned64(uint64_t min, uint64_t max, bool verbose) {
         return false;
       }
     }
-#endif
+
     if (verbose)
       printf("ok!\n");
   }
diff --git a/tests/cppincludetest2.cpp b/tests/cppincludetest2.cpp
index fe53425..bf36b30 100644
--- a/tests/cppincludetest2.cpp
+++ b/tests/cppincludetest2.cpp
@@ -119,7 +119,7 @@ bool testsigned(int32_t min, int32_t max, bool verbose) {
       printf("skipping d = 0 as it cannot be supported\n");
       continue;
     }
-    if (d == -2147483648) {
+    if (d == INT32_MIN) {
       printf("skipping d = -2147483648 as it is unsupported\n");
       continue;
     }
@@ -166,7 +166,7 @@ bool testdivsigned(int32_t min, int32_t max, bool verbose) {
   printf("\n==Testing divsigned with min = %d and max = %d\n", min, max);
   assert(min != max + 1); // infinite loop!
   size_t count = 0;
-  static_assert(int32_t(-2147483648) < int32_t(2147483647));
+  static_assert(int32_t(INT32_MIN) < int32_t(2147483647));
   for (int32_t d = min; (d <= max) && (d >= min); d++) {
     if (d == 0) {
       printf("skipping d = 0\n");
@@ -205,7 +205,7 @@ bool testdivsigned(int32_t min, int32_t max, bool verbose) {
             d, a);
         printf("expected %d div %d = %d \n", a, d, computedDiv);
         printf("got %d div %d = %d \n", a, d, computedFastDiv);
-        if (d == -2147483648) {
+        if (d == INT32_MIN) {
           printf("Note: d = -2147483648 is unsupported\n");
         } else {
           return false;
@@ -235,7 +235,7 @@ int main(int argc, char *argv[]) {
   }
 
   isok = isok && testdivsigned(0x7ffffff8, 0x7fffffff, verbose);
-  isok = isok && testdivsigned(-0x80000000, -0x7ffffff8, verbose);
+  isok = isok && testdivsigned(INT32_MIN, -0x7ffffff8, verbose);
   isok = isok && testdivsigned(2, 10, verbose);
   isok = isok && testdivsigned(-10, -2, verbose);
   isok = isok && testunsigned64(1, 0x10, verbose);
@@ -251,7 +251,7 @@ int main(int argc, char *argv[]) {
   isok = isok && testsigned(-8, -1, verbose);
   isok = isok && testsigned(1, 8, verbose);
   isok = isok && testsigned(0x7ffffff8, 0x7fffffff, verbose);
-  isok = isok && testsigned(-0x80000000, -0x7ffffff8, verbose);
+  isok = isok && testsigned(INT32_MIN, -0x7ffffff8, verbose);
   for (int k = 0; k < 100; k++) {
     int32_t x = rand();
     isok = isok && testsigned(x, x, verbose);
diff --git a/tests/moddivnbenchmark.cpp b/tests/moddivnbenchmark.cpp
index 5ebb08a..92fe628 100644
--- a/tests/moddivnbenchmark.cpp
+++ b/tests/moddivnbenchmark.cpp
@@ -3,12 +3,28 @@
 #include <cstdio>
 #include <random>
 
+#ifdef _MSC_VER
+
+// Taken from Facebook's folly
+// https://github.com/facebook/folly/blob/0f6bc7a3f0133bd49226b50026de60e708900577/folly/Benchmark.h#L270-L284
+#pragma optimize("", off)
+inline void doNotOptimizeDependencySink(const void*) {}
+
+#pragma optimize("", on)
+template <class T>
+void doNotOptimizeAway(const T& datum) {
+    doNotOptimizeDependencySink(&datum);
+}
+#else
+
 template <typename T> inline void doNotOptimizeAway(T &&datum) {
   // Taken from Facebook's folly
   // https://github.com/facebook/folly/blob/0f6bc7a3f0133bd49226b50026de60e708900577/folly/Benchmark.h#L318-L326
   asm volatile("" ::"m"(datum) : "memory");
 }
 
+#endif
+
 using namespace fastmod;
 template <typename F>
 uint64_t time(const F &x, const std::vector<uint64_t> &zomg) {
@@ -29,7 +45,7 @@ int main() {
   std::vector<uint64_t> zomg(10000000);
   for (auto &e : zomg)
     e = mt();
-#ifndef _MSC_VER
+
   const auto M = computeM_u64(mod);
   auto fm = time(
       [M, mod](uint64_t v) {
@@ -39,5 +55,5 @@ int main() {
   auto sm = time([mod](uint64_t x) { return (x % mod) + (x / mod); }, zomg);
   std::fprintf(stderr, "fastmod + fastdiv is %lf as fast as x86 mod + div: \n",
                (double)sm / fm);
-#endif
+
 }
diff --git a/tests/modnbenchmark.cpp b/tests/modnbenchmark.cpp
index 503cc87..f7539c6 100644
--- a/tests/modnbenchmark.cpp
+++ b/tests/modnbenchmark.cpp
@@ -4,12 +4,28 @@
 #include <iostream>
 #include <random>
 
+#ifdef _MSC_VER
+
+// Taken from Facebook's folly
+// https://github.com/facebook/folly/blob/0f6bc7a3f0133bd49226b50026de60e708900577/folly/Benchmark.h#L270-L284
+#pragma optimize("", off)
+inline void doNotOptimizeDependencySink(const void*) {}
+
+#pragma optimize("", on)
+template <class T>
+void doNotOptimizeAway(const T& datum) {
+    doNotOptimizeDependencySink(&datum);
+}
+#else
+
 template <typename T> inline void doNotOptimizeAway(T &&datum) {
   // Taken from Facebook's folly
   // https://github.com/facebook/folly/blob/0f6bc7a3f0133bd49226b50026de60e708900577/folly/Benchmark.h#L318-L326
   asm volatile("" ::"m"(datum) : "memory");
 }
 
+#endif
+
 using namespace fastmod;
 template <typename F> uint64_t time(const F &x, std::vector<uint64_t> &zomg) {
   auto start = std::chrono::high_resolution_clock::now();
@@ -30,7 +46,7 @@ int main() {
   std::vector<uint64_t> zomg(100000000);
   for (auto &e : zomg)
     e = mt();
-#ifndef _MSC_VER
+
   const auto M = computeM_u64(mod);
   std::cout << "timing fastmod_u64 " << std::endl;
   auto fmtime =
@@ -59,5 +75,5 @@ int main() {
   std::fprintf(stderr,
                "masking is %lf as fast as fastmod and %lf as fast as modding\n",
                (double)fmtime / masktime, (double)modtime / masktime);
-#endif
+
 }
diff --git a/tests/unit.c b/tests/unit.c
index 23023ed..c1e0a25 100644
--- a/tests/unit.c
+++ b/tests/unit.c
@@ -9,6 +9,9 @@
 #ifdef __cplusplus
 using namespace fastmod;
 #endif
+#ifdef _MSC_VER
+typedef fastmod_u128_t __uint128_t;
+#endif
 
 bool testunsigned64(uint64_t min, uint64_t max, bool verbose) {
   for (uint64_t d = min; (d <= max) && (d >= min); d++) {
@@ -130,7 +133,7 @@ bool testsigned(int32_t min, int32_t max, bool verbose) {
       printf("skipping d = 0 as it cannot be supported\n");
       continue;
     }
-    if (d == -2147483648) {
+    if (d == INT32_MIN) {
       printf("skipping d = -2147483648 as it is unsupported\n");
       continue;
     }
@@ -181,7 +184,7 @@ bool testdivsigned(int32_t min, int32_t max, bool verbose) {
       printf("skipping d = -1 as it is not supported\n");
       continue;
     }
-    if (d == -2147483648) {
+    if (d == INT32_MIN) {
       printf("skipping d = -2147483648 as it is unsupported\n");
       continue;
     }
@@ -228,7 +231,7 @@ int main(int argc, char *argv[]) {
   isok = isok && testunsigned64(UINT64_C(0xffffffffff00000), UINT64_C(0xffffffffff00000) + 0x100, verbose);
   isok = isok && testunsigned(1, 8, verbose);
   isok = isok && testunsigned(0xfffffff8, 0xffffffff, verbose);
-  isok = isok && testdivsigned(-0x80000000, -0x7ffffff8, verbose);
+  isok = isok && testdivsigned(INT32_MIN, -0x7ffffff8, verbose);
   isok = isok && testdivsigned(2, 10, verbose);
   isok = isok && testdivsigned(0x7ffffff8, 0x7fffffff, verbose);
   isok = isok && testdivsigned(-10, -2, verbose);
@@ -239,7 +242,7 @@ int main(int argc, char *argv[]) {
   isok = isok && testsigned(-8, -1, verbose);
   isok = isok && testsigned(1, 8, verbose);
   isok = isok && testsigned(0x7ffffff8, 0x7fffffff, verbose);
-  isok = isok && testsigned(-0x80000000, -0x7ffffff8, verbose);
+  isok = isok && testsigned(INT32_MIN, -0x7ffffff8, verbose);
   for (int k = 0; k < 100; k++) {
     int32_t x = rand();
     isok = isok && testsigned(x, x, verbose);

From bb35ec909ba39997b8b4f589a2d364789bdddb3b Mon Sep 17 00:00:00 2001
From: Walter <walter.kruger@hotmail.com>
Date: Sat, 16 Nov 2024 09:24:46 +1100
Subject: [PATCH 2/2] Style: Removed indentation

There was a bunch of superfluous indentation in the multi-word arithmetic functions.
---
 include/fastmod.h | 114 +++++++++++++++++++++++-----------------------
 1 file changed, 57 insertions(+), 57 deletions(-)

diff --git a/include/fastmod.h b/include/fastmod.h
index adbeebb..7da3a55 100644
--- a/include/fastmod.h
+++ b/include/fastmod.h
@@ -55,63 +55,63 @@ FASTMOD_API uint64_t mul128_s32(uint64_t lowbits, int32_t d) {
 // Visual Studio lacks support for 128-bit integers so they simulated are using
 // multiword arithmatic and VS specific intrinsics.
 
-    FASTMOD_API uint64_t add128_u64(
-        uint64_t M_hi, uint64_t M_lo, uint64_t addend, uint64_t* sum_hi
-    ) {
-        uint64_t sum_lo;
-
-        bool carry = _addcarry_u64(0, M_lo, addend, &sum_lo);
-        _addcarry_u64(carry, M_hi, 0, sum_hi); // Encourages 'adc'
-
-        return sum_lo;
-    }
-
-    FASTMOD_API uint64_t div128_u64(
-        uint64_t dividend_hi, uint64_t dividend_lo, uint64_t divisor, uint64_t* quotient_hi
-    ) {
-        *quotient_hi = dividend_hi / divisor;
-        uint64_t remainder_hi = dividend_hi % divisor;
-
-        // When long div starts to consider the low dividend,
-        // the high part would have became its remainder.
-        // Prevents an arithmetic exception when _udiv128 calculates a >64-bit quotient
-        uint64_t remainder; // Discard
-        return _udiv128(remainder_hi, dividend_lo, divisor, &remainder);
-    }
-
-    // Multiplies the 128-bit integer by d and returns the lower 128-bits of the product
-    FASTMOD_API uint64_t mul128_u64_lo(
-        uint64_t M_hi, uint64_t M_lo, uint64_t d, uint64_t* product_hi
-    ) {
-        uint64_t lowbits_hi;
-        uint64_t lowbits_lo = _umul128(M_lo, d, &lowbits_hi);
-
-        *product_hi = lowbits_hi + (M_hi * d);
-
-        return lowbits_lo;
-    }
-
-    // Multiplies the 128-bit integer by d and returns the highest 64-bits of the product
-    FASTMOD_API uint64_t mul128_u64_hi(uint64_t lowbits_hi, uint64_t lowbits_lo, uint64_t d) {
-        uint64_t bottomHalf_hi = __umulh(lowbits_lo, d);
-
-        uint64_t topHalf_hi;
-        uint64_t topHalf_lo = _umul128(lowbits_hi, d, &topHalf_hi);
-
-        uint64_t bothHalves_hi;
-        add128_u64(topHalf_hi, topHalf_lo, bottomHalf_hi, &bothHalves_hi);
-
-        return bothHalves_hi;
-    }
-
-    FASTMOD_API bool isgreater_u128(uint64_t a_hi, uint64_t a_low, uint64_t b_hi, uint64_t b_low) {
-        // Only when low is greater, high equality should return true
-        uint64_t discard;
-        bool borrowWhenEql = _subborrow_u64(0, b_low, a_low, &discard);
-
-        // borrow(b - (a + C_in)) = C_in? (a >= b) : (a > b)
-        return _subborrow_u64(borrowWhenEql, b_hi, a_hi, &discard);
-    }
+FASTMOD_API uint64_t add128_u64(
+    uint64_t M_hi, uint64_t M_lo, uint64_t addend, uint64_t* sum_hi
+  ) {
+  uint64_t sum_lo;
+
+  bool carry = _addcarry_u64(0, M_lo, addend, &sum_lo);
+  _addcarry_u64(carry, M_hi, 0, sum_hi); // Encourages 'adc'
+
+  return sum_lo;
+}
+
+FASTMOD_API uint64_t div128_u64(
+    uint64_t dividend_hi, uint64_t dividend_lo, uint64_t divisor, uint64_t* quotient_hi
+  ) {
+  *quotient_hi = dividend_hi / divisor;
+  uint64_t remainder_hi = dividend_hi % divisor;
+
+  // When long div starts to consider the low dividend,
+  // the high part would have became its remainder.
+  // Prevents an arithmetic exception when _udiv128 calculates a >64-bit quotient
+  uint64_t remainder; // Discard
+  return _udiv128(remainder_hi, dividend_lo, divisor, &remainder);
+}
+
+// Multiplies the 128-bit integer by d and returns the lower 128-bits of the product
+FASTMOD_API uint64_t mul128_u64_lo(
+    uint64_t M_hi, uint64_t M_lo, uint64_t d, uint64_t* product_hi
+  ) {
+  uint64_t lowbits_hi;
+  uint64_t lowbits_lo = _umul128(M_lo, d, &lowbits_hi);
+
+  *product_hi = lowbits_hi + (M_hi * d);
+
+  return lowbits_lo;
+}
+
+// Multiplies the 128-bit integer by d and returns the highest 64-bits of the product
+FASTMOD_API uint64_t mul128_u64_hi(uint64_t lowbits_hi, uint64_t lowbits_lo, uint64_t d) {
+  uint64_t bottomHalf_hi = __umulh(lowbits_lo, d);
+
+  uint64_t topHalf_hi;
+  uint64_t topHalf_lo = _umul128(lowbits_hi, d, &topHalf_hi);
+
+  uint64_t bothHalves_hi;
+  add128_u64(topHalf_hi, topHalf_lo, bottomHalf_hi, &bothHalves_hi);
+
+  return bothHalves_hi;
+}
+
+FASTMOD_API bool isgreater_u128(uint64_t a_hi, uint64_t a_low, uint64_t b_hi, uint64_t b_low) {
+  // Only when low is greater, high equality should return true
+  uint64_t discard;
+  bool borrowWhenEql = _subborrow_u64(0, b_low, a_low, &discard);
+
+  // borrow(b - (a + C_in)) = C_in? (a >= b) : (a > b)
+  return _subborrow_u64(borrowWhenEql, b_hi, a_hi, &discard);
+}
 
 #endif // End MSVC 64-bit support
 #else // _MSC_VER NOT defined