Skip to content

Commit c811592

Browse files
committed
Optimize performance of fmod with Barrett multiplication
1 parent bf85502 commit c811592

File tree

2 files changed

+98
-8
lines changed

2 files changed

+98
-8
lines changed

libm/src/math/generic/fmod.rs

+93
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
/* SPDX-License-Identifier: MIT OR Apache-2.0 */
22
use super::super::{CastFrom, Float, Int, MinInt};
3+
use crate::support::{DInt, HInt, Reducer};
34

45
#[inline]
56
pub fn fmod<F: Float>(x: F, y: F) -> F {
@@ -59,10 +60,102 @@ fn into_sig_exp<F: Float>(mut bits: F::Int) -> (F::Int, u32) {
5960

6061
/// Compute the remainder `(x * 2.pow(e)) % y` without overflow.
6162
fn reduction<I: Int>(mut x: I, e: u32, y: I) -> I {
63+
// FIXME: This is a temporary hack to get around the lack of `u256 / u256`.
64+
// Actually, the algorithm only needs the operation `(x << I::BITS) / y`
65+
// where `x < y`. That is, a division `u256 / u128` where the quotient must
66+
// not overflow `u128` would be sufficient for `f128`.
67+
unsafe {
68+
use core::mem::transmute_copy;
69+
if I::BITS == 64 {
70+
let x = transmute_copy::<I, u64>(&x);
71+
let y = transmute_copy::<I, u64>(&y);
72+
let r = fast_reduction::<f64, u64>(x, e, y);
73+
return transmute_copy::<u64, I>(&r);
74+
}
75+
if I::BITS == 32 {
76+
let x = transmute_copy::<I, u32>(&x);
77+
let y = transmute_copy::<I, u32>(&y);
78+
let r = fast_reduction::<f32, u32>(x, e, y);
79+
return transmute_copy::<u32, I>(&r);
80+
}
81+
#[cfg(f16_enabled)]
82+
if I::BITS == 16 {
83+
let x = transmute_copy::<I, u16>(&x);
84+
let y = transmute_copy::<I, u16>(&y);
85+
let r = fast_reduction::<f16, u16>(x, e, y);
86+
return transmute_copy::<u16, I>(&r);
87+
}
88+
}
89+
6290
x %= y;
6391
for _ in 0..e {
6492
x <<= 1;
6593
x = x.checked_sub(y).unwrap_or(x);
6694
}
6795
x
6896
}
97+
98+
trait SafeShift: Float {
99+
// How many guaranteed leading zeros do the values have?
100+
// A normalized floating point mantissa has `EXP_BITS` guaranteed leading
101+
// zeros (exludes the implicit bit, but includes the now-zeroed sign bit)
102+
// `-1` because we want to shift by either `BASE_SHIFT` or `BASE_SHIFT + 1`
103+
const BASE_SHIFT: u32 = Self::EXP_BITS - 1;
104+
}
105+
impl<F: Float> SafeShift for F {}
106+
107+
/// Compute `(x << e) % y` by modular exponentiation of 2.
///
/// Walks the bits of `e` from most- to least-significant, maintaining
/// `(1 << p) % y` in a Barrett [`Reducer`] and squaring it at each step,
/// then folds `x` in with a final Barrett multiply. Far fewer iterations
/// than the bit-at-a-time fallback for large `e`.
fn fast_reduction<F, I>(x: I, e: u32, y: I) -> I
where
    F: Float<Int = I>,
    I: Int + HInt,
    I::D: Int + DInt<H = I>,
{
    let _0 = I::ZERO;
    let _1 = I::ONE;

    // Everything is 0 mod 1.
    if y == _1 {
        return _0;
    }

    // Small exponent: `x << e` cannot overflow (see `SafeShift::BASE_SHIFT`),
    // so a single shift-and-remainder suffices.
    if e <= F::BASE_SHIFT {
        return (x << e) % y;
    }

    // Find least depth s.t. `(e >> depth) < I::BITS`
    let depth = (I::BITS - 1)
        .leading_zeros()
        .saturating_sub(e.leading_zeros());

    // Starting exponent for the bit-walk; `>= 1` because `e > BASE_SHIFT`.
    let initial = (e >> depth) - F::BASE_SHIFT;

    // Seed `pow2 = (1 << initial) % y` without overflow: shift directly up to
    // `ilog2(y - 1)` bits, then shift-and-conditionally-subtract the rest.
    let max_rem = y.wrapping_sub(_1);
    let max_ilog2 = max_rem.ilog2();
    let mut pow2 = _1 << max_ilog2.min(initial);
    for _ in max_ilog2..initial {
        pow2 <<= 1;
        pow2 = pow2.checked_sub(y).unwrap_or(pow2);
    }

    // At each step `k in [depth, ..., 0]`,
    // `p` is `(e >> k) - BASE_SHIFT`
    // `m` is `(1 << p) % y`
    let mut k = depth;
    let mut p = initial;
    let mut m = Reducer::new(pow2, y);

    while k > 0 {
        k -= 1;
        // Squaring doubles the exponent; `squared_with_shift` additionally
        // multiplies by `1 << shift`, restoring the `BASE_SHIFT` offset
        // (plus the next bit of `e` when it is set).
        p = p + p + F::BASE_SHIFT;
        if e & (1 << k) != 0 {
            m = m.squared_with_shift(F::BASE_SHIFT + 1);
            p += 1;
        } else {
            m = m.squared_with_shift(F::BASE_SHIFT);
        };

        // Invariant check: `p` tracks the exponent represented by `m`.
        debug_assert!(p == (e >> k) - F::BASE_SHIFT);
    }

    // (x << BASE_SHIFT) * (1 << p) == x << e
    m.mul_into_div_rem(x << F::BASE_SHIFT).1
}

libm/src/math/support/int_traits/mod_mul.rs

+5-8
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,11 @@ use super::{DInt, HInt, Int};
22

33
/// Barrett reduction using the constant `R == (1 << K) == (1 << U::BITS)`
44
///
5-
/// More specifically, implements single-word [Barrett multiplication]
6-
/// (https://en.wikipedia.org/wiki/Barrett_reduction#Single-word_Barrett_multiplication)
7-
/// and [division]
8-
/// (https://en.wikipedia.org/wiki/Barrett_reduction#Barrett_Division)
9-
/// for unsigned integers.
5+
/// For a more detailed description, see
6+
/// <https://en.wikipedia.org/wiki/Barrett_reduction>.
107
///
118
/// After constructing as `Reducer::new(b, n)`,
12-
/// provides operations to efficiently compute
9+
/// has operations to efficiently compute
1310
/// - `(a * b) / n` and `(a * b) % n`
1411
/// - `Reducer::new((a * b * b) % n, n)`, as long as `a * (n - 1) < R`
1512
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
@@ -103,7 +100,7 @@ where
103100
let ar_ns = a.widen_mul(self.rem) + _s.widen_mul(self.div);
104101
assert!(ab_tn.hi().is_zero());
105102
assert!(ar_ns.lo().is_zero());
106-
assert_eq!(ab_tn.lo(), ar_ns.hi());
103+
assert!(ab_tn.lo() == ar_ns.hi());
107104
}
108105
// Since `s < R` and `r < n`,
109106
// ```
@@ -124,7 +121,7 @@ where
124121
/// Requires `r * ab == ra * b`, where `r = bR % n`.
125122
#[inline(always)]
126123
fn with_scaled_num_rem(&self, ab: U, ra: U) -> Self {
127-
debug_assert_eq!(ab.widen_mul(self.rem), ra.widen_mul(self.num));
124+
debug_assert!(ab.widen_mul(self.rem) == ra.widen_mul(self.num));
128125
// The new factor `v = abb mod n`:
129126
let (_, v) = self.mul_into_div_rem(ab);
130127

0 commit comments

Comments
 (0)