Skip to content

Commit c811592

Browse files
committed
Optimize performance of fmod with Barrett multiplication
1 parent bf85502 commit c811592

File tree

2 files changed

+98
-8
lines changed

2 files changed

+98
-8
lines changed

libm/src/math/generic/fmod.rs

+93
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
/* SPDX-License-Identifier: MIT OR Apache-2.0 */
22
use super::super::{CastFrom, Float, Int, MinInt};
3+
use crate::support::{DInt, HInt, Reducer};
34

45
#[inline]
56
pub fn fmod<F: Float>(x: F, y: F) -> F {
@@ -59,10 +60,102 @@ fn into_sig_exp<F: Float>(mut bits: F::Int) -> (F::Int, u32) {
5960

6061
/// Compute the remainder `(x * 2.pow(e)) % y` without overflow.
6162
fn reduction<I: Int>(mut x: I, e: u32, y: I) -> I {
63+
// FIXME: This is a temporary hack to get around the lack of `u256 / u256`.
64+
// Actually, the algorithm only needs the operation `(x << I::BITS) / y`
65+
// where `x < y`. That is, a division `u256 / u128` where the quotient must
66+
// not overflow `u128` would be sufficient for `f128`.
67+
unsafe {
68+
use core::mem::transmute_copy;
69+
if I::BITS == 64 {
70+
let x = transmute_copy::<I, u64>(&x);
71+
let y = transmute_copy::<I, u64>(&y);
72+
let r = fast_reduction::<f64, u64>(x, e, y);
73+
return transmute_copy::<u64, I>(&r);
74+
}
75+
if I::BITS == 32 {
76+
let x = transmute_copy::<I, u32>(&x);
77+
let y = transmute_copy::<I, u32>(&y);
78+
let r = fast_reduction::<f32, u32>(x, e, y);
79+
return transmute_copy::<u32, I>(&r);
80+
}
81+
#[cfg(f16_enabled)]
82+
if I::BITS == 16 {
83+
let x = transmute_copy::<I, u16>(&x);
84+
let y = transmute_copy::<I, u16>(&y);
85+
let r = fast_reduction::<f16, u16>(x, e, y);
86+
return transmute_copy::<u16, I>(&r);
87+
}
88+
}
89+
6290
x %= y;
6391
for _ in 0..e {
6492
x <<= 1;
6593
x = x.checked_sub(y).unwrap_or(x);
6694
}
6795
x
6896
}
97+
98+
trait SafeShift: Float {
99+
// How many guaranteed leading zeros do the values have?
100+
// A normalized floating point mantissa has `EXP_BITS` guaranteed leading
101+
// zeros (exludes the implicit bit, but includes the now-zeroed sign bit)
102+
// `-1` because we want to shift by either `BASE_SHIFT` or `BASE_SHIFT + 1`
103+
const BASE_SHIFT: u32 = Self::EXP_BITS - 1;
104+
}
105+
impl<F: Float> SafeShift for F {}
106+
107+
/// Compute `(x << e) % y` by modular exponentiation of 2.
///
/// Walks the bits of `e` from most- to least-significant, maintaining
/// `(1 << p) % y` in a Barrett [`Reducer`] and squaring it at each step,
/// then folds `x` in with a final Barrett multiply. Far fewer iterations
/// than the bit-at-a-time fallback for large `e`.
fn fast_reduction<F, I>(x: I, e: u32, y: I) -> I
where
    F: Float<Int = I>,
    I: Int + HInt,
    I::D: Int + DInt<H = I>,
{
    let _0 = I::ZERO;
    let _1 = I::ONE;

    // Everything is 0 mod 1.
    if y == _1 {
        return _0;
    }

    // Small exponent: `x << e` cannot overflow (see `SafeShift::BASE_SHIFT`),
    // so a single shift-and-remainder suffices.
    if e <= F::BASE_SHIFT {
        return (x << e) % y;
    }

    // Find least depth s.t. `(e >> depth) < I::BITS`
    let depth = (I::BITS - 1)
        .leading_zeros()
        .saturating_sub(e.leading_zeros());

    // Starting exponent for the bit-walk; `>= 1` because `e > BASE_SHIFT`.
    let initial = (e >> depth) - F::BASE_SHIFT;

    // Seed `pow2 = (1 << initial) % y` without overflow: shift directly up to
    // `ilog2(y - 1)` bits, then shift-and-conditionally-subtract the rest.
    let max_rem = y.wrapping_sub(_1);
    let max_ilog2 = max_rem.ilog2();
    let mut pow2 = _1 << max_ilog2.min(initial);
    for _ in max_ilog2..initial {
        pow2 <<= 1;
        pow2 = pow2.checked_sub(y).unwrap_or(pow2);
    }

    // At each step `k in [depth, ..., 0]`,
    // `p` is `(e >> k) - BASE_SHIFT`
    // `m` is `(1 << p) % y`
    let mut k = depth;
    let mut p = initial;
    let mut m = Reducer::new(pow2, y);

    while k > 0 {
        k -= 1;
        // Squaring doubles the exponent; `squared_with_shift` additionally
        // multiplies by `1 << shift`, restoring the `BASE_SHIFT` offset
        // (plus the next bit of `e` when it is set).
        p = p + p + F::BASE_SHIFT;
        if e & (1 << k) != 0 {
            m = m.squared_with_shift(F::BASE_SHIFT + 1);
            p += 1;
        } else {
            m = m.squared_with_shift(F::BASE_SHIFT);
        };

        // Invariant check: `p` tracks the exponent represented by `m`.
        debug_assert!(p == (e >> k) - F::BASE_SHIFT);
    }

    // (x << BASE_SHIFT) * (1 << p) == x << e
    m.mul_into_div_rem(x << F::BASE_SHIFT).1
}

libm/src/math/support/int_traits/mod_mul.rs

+5-8
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,11 @@ use super::{DInt, HInt, Int};
22

33
/// Barrett reduction using the constant `R == (1 << K) == (1 << U::BITS)`
44
///
5-
/// More specifically, implements single-word [Barrett multiplication]
6-
/// (https://en.wikipedia.org/wiki/Barrett_reduction#Single-word_Barrett_multiplication)
7-
/// and [division]
8-
/// (https://en.wikipedia.org/wiki/Barrett_reduction#Barrett_Division)
9-
/// for unsigned integers.
5+
/// For a more detailed description, see
6+
/// <https://en.wikipedia.org/wiki/Barrett_reduction>.
107
///
118
/// After constructing as `Reducer::new(b, n)`,
12-
/// provides operations to efficiently compute
9+
/// has operations to efficiently compute
1310
/// - `(a * b) / n` and `(a * b) % n`
1411
/// - `Reducer::new((a * b * b) % n, n)`, as long as `a * (n - 1) < R`
1512
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
@@ -103,7 +100,7 @@ where
103100
let ar_ns = a.widen_mul(self.rem) + _s.widen_mul(self.div);
104101
assert!(ab_tn.hi().is_zero());
105102
assert!(ar_ns.lo().is_zero());
106-
assert_eq!(ab_tn.lo(), ar_ns.hi());
103+
assert!(ab_tn.lo() == ar_ns.hi());
107104
}
108105
// Since `s < R` and `r < n`,
109106
// ```
@@ -124,7 +121,7 @@ where
124121
/// Requires `r * ab == ra * b`, where `r = bR % n`.
125122
#[inline(always)]
126123
fn with_scaled_num_rem(&self, ab: U, ra: U) -> Self {
127-
debug_assert_eq!(ab.widen_mul(self.rem), ra.widen_mul(self.num));
124+
debug_assert!(ab.widen_mul(self.rem) == ra.widen_mul(self.num));
128125
// The new factor `v = abb mod n`:
129126
let (_, v) = self.mul_into_div_rem(ab);
130127

0 commit comments

Comments
 (0)