-use core::f64;
+/*
+Copyright (c) 2022 Alexei Sibidanov.
+
+This file is part of the CORE-MATH project
+(https://core-math.gitlabpages.inria.fr/).
+*/
 
 use super::sqrt;
 
-const SPLIT: f64 = 134217728. + 1.; // 0x1p27 + 1 === (2 ^ 27) + 1
+#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
+pub fn hypot(x: f64, y: f64) -> f64 {
+    cr_hypot(x, y)
+}
+
+fn cr_hypot(mut x: f64, mut y: f64) -> f64 {
+    let flag = get_flags();
+
+    let xi = x.to_bits();
+    let yi = y.to_bits();
+
+    let emsk: u64 = 0x7ffu64 << 52;
+    let mut ex: u64 = xi & emsk;
+    let mut ey: u64 = yi & emsk;
+    /* emsk corresponds to the upper bits of NaN and Inf (apart from the sign bit) */
+    x = __builtin_fabs(x);
+    y = __builtin_fabs(y);
+    if __builtin_expect(ex == emsk || ey == emsk, false) {
+        /* Either x or y is NaN or Inf */
+        let wx: u64 = xi << 1;
+        let wy: u64 = yi << 1;
+        let wm: u64 = emsk << 1;
+        let ninf: i32 = ((wx == wm) ^ (wy == wm)) as i32;
+        let nqnn: i32 = (((wx >> 52) == 0xfff) ^ ((wy >> 52) == 0xfff)) as i32;
+        /* ninf is 1 when only one of x and y is +/-Inf
+           nqnn is 1 when only one of x and y is qNaN
+           IEEE 754 says that hypot(+/-Inf,qNaN)=hypot(qNaN,+/-Inf)=+Inf. */
+        if ninf != 0 && nqnn != 0 {
+            return f64::INFINITY;
+        }
+        return x + y; /* inf, nan */
+    }
+
+    let u: f64 = x.max(y);
+    let v: f64 = x.min(y);
+    let mut xd: u64 = u.to_bits();
+    let mut yd: u64 = v.to_bits();
+    ey = yd;
+
+    if __builtin_expect(ey >> 52 == 0, false) {
+        if yd == 0 {
+            return f64::from_bits(xd);
+        }
+
+        ex = xd;
+
+        if __builtin_expect(ex >> 52 == 0, false) {
+            if ex == 0 {
+                return 0.0;
+            }
+
+            return as_hypot_denorm(ex, ey);
+        }
+
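+        /* Editor's note: v is subnormal here; shift its significand into
+           the normal position and compensate in the exponent field so the
+           main path below can treat it as a normal number. */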
+        let nz: u32 = ey.leading_zeros();
+        ey <<= nz - 11;
+        ey &= u64::MAX >> 12;
+        ey = ey.wrapping_sub(((nz as i64 - 12i64) << 52) as u64);
+        yd = ey; /* the C original round-trips through a union here; a direct assignment suffices */
+    }
 
-fn sq(x: f64) -> (f64, f64) {
-    let xh: f64;
-    let xl: f64;
-    let xc: f64;
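+    /* Editor's note: when the exponents of u and v differ by more than 27,
+       v contributes less than half an ulp of u, so the result rounds like
+       u itself; the fma folds in a tiny positive multiple of v so that
+       directed rounding modes and the inexact flag still behave correctly. */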
+    let de: u64 = xd.wrapping_sub(yd);
+    if __builtin_expect(de > (27_u64 << 52), false) {
+        return __builtin_fma(hf64!("0x1p-27"), v, u);
+    }
 
-    xc = x * SPLIT;
-    xh = x - xc + xc;
-    xl = x - xh;
-    let hi = x * x;
-    let lo = xh * xh - hi + 2. * xh * xl + xl * xl;
-    (hi, lo)
+    let off: i64 = (0x3ff_i64 << 52) - (xd & emsk) as i64;
+    xd = xd.wrapping_add(off as u64);
+    yd = yd.wrapping_add(off as u64);
+    x = f64::from_bits(xd);
+    y = f64::from_bits(yd);
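+    /* Editor's note: fma(x, x, -x2) recovers the exact rounding error of
+       x * x, so x2 + dx2 == x^2 exactly (likewise for y); dr2 gathers the
+       rounding error of the sum r2 = x2 + y2, yielding a double-double
+       representation of x^2 + y^2. */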
+    let x2: f64 = x * x;
+    let dx2: f64 = __builtin_fma(x, x, -x2);
+    let y2: f64 = y * y;
+    let dy2: f64 = __builtin_fma(y, y, -y2);
+    let r2: f64 = x2 + y2;
+    let ir2: f64 = 0.5 / r2;
+    let dr2: f64 = ((x2 - r2) + y2) + (dx2 + dy2);
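+    /* Editor's note: one Newton step refines the square root: with residual
+       dz = (x^2 + y^2) - th^2, the correction is tl = dz / (2 * th),
+       computed here as rsqrt * dz; fasttwosum then renormalizes th + tl. */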
+    let mut th: f64 = sqrt(r2);
+    let rsqrt: f64 = th * ir2;
+    let dz: f64 = dr2 - __builtin_fma(th, th, -r2);
+    let mut tl: f64 = rsqrt * dz;
+    th = fasttwosum(th, tl, &mut tl);
+    let mut thd: u64 = th.to_bits();
+    let tld = __builtin_fabs(tl).to_bits();
+    ex = thd;
+    ey = tld;
+    ex &= 0x7ff_u64 << 52;
+    /* wrapping ops: this arithmetic relies on two's-complement wraparound,
+       as in the original C, and must not panic under debug assertions */
+    let aidr: u64 = ey.wrapping_add(0x3fe_u64 << 52).wrapping_sub(ex);
+    let mid: u64 = (aidr.wrapping_sub(0x3c90000000000000).wrapping_add(16)) >> 5;
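+    /* Editor's note: aidr measures |tl| against the ulp of th; if tl lies
+       too close to a rounding boundary (mid == 0 or aidr outside the safe
+       range), the fast path cannot decide the correctly rounded result and
+       the exact integer path in as_hypot_hard is taken instead. */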
+    if __builtin_expect(
+        mid == 0 || aidr < 0x39b0000000000000_u64 || aidr > 0x3c9fffffffffff80_u64,
+        false,
+    ) {
+        thd = as_hypot_hard(x, y, flag).to_bits();
+    }
+    thd = thd.wrapping_sub(off as u64);
+    if __builtin_expect(thd >= (0x7ff_u64 << 52), false) {
+        return as_hypot_overflow();
+    }
+
+    f64::from_bits(thd)
 }
 
-#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
-pub fn hypot(mut x: f64, mut y: f64) -> f64 {
-    let x1p700 = f64::from_bits(0x6bb0000000000000); // 0x1p700 === 2 ^ 700
-    let x1p_700 = f64::from_bits(0x1430000000000000); // 0x1p-700 === 2 ^ -700
-
-    let mut uxi = x.to_bits();
-    let mut uyi = y.to_bits();
-    let uti;
-    let ex: i64;
-    let ey: i64;
-    let mut z: f64;
-
-    /* arrange |x| >= |y| */
-    uxi &= -1i64 as u64 >> 1;
-    uyi &= -1i64 as u64 >> 1;
-    if uxi < uyi {
-        uti = uxi;
-        uxi = uyi;
-        uyi = uti;
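+/* Fast2Sum: for |x| >= |y|, s = x + y and e hold the rounded sum and its
+   exact rounding error, so s + e == x + y exactly (editor's note). */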
+fn fasttwosum(x: f64, y: f64, e: &mut f64) -> f64 {
+    let s: f64 = x + y;
+    let z: f64 = s - x;
+    *e = y - z;
+    s
+}
+
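+/* Editor's note: z + z overflows to +Inf, raising the overflow and inexact
+   flags; the f > z branch is where the C original sets errno = ERANGE. */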
+fn as_hypot_overflow() -> f64 {
+    let z: f64 = hf64!("0x1.fffffffffffffp1023");
+    let f = z + z;
+    if f > z {
+        // errno = ERANGE
+    }
+    f
+}
+
+/* Here the square root is refined by Newton iterations: x^2+y^2 is exact
+and fits in a 128-bit integer, so the approximation is squared (which
+also fits in a 128-bit integer), compared and adjusted if necessary using
+the exact value of x^2+y^2. */
+fn as_hypot_hard(x: f64, y: f64, flag: FExcept) -> f64 {
+    let op: f64 = 1.0 + hf64!("0x1p-54");
+    let om: f64 = 1.0 - hf64!("0x1p-54");
+    let mut xi: u64 = x.to_bits();
+    let yi: u64 = y.to_bits();
+    let mut bm: u64 = (xi & (u64::MAX >> 12)) | 1u64 << 52;
+    let mut lm: u64 = (yi & (u64::MAX >> 12)) | 1u64 << 52;
+    let be: i32 = (xi >> 52) as i32;
+    let le: i32 = (yi >> 52) as i32;
+    let ri: u64 = sqrt(x * x + y * y).to_bits();
+    let bs: i32 = 2;
+    let mut rm: u64 = ri & (u64::MAX >> 12);
+    let mut re: i32 = (ri >> 52) as i32 - 0x3ff;
+    rm |= 1u64 << 52;
+
+    for _ in 0..3 {
+        if __builtin_expect(rm == 1u64 << 52, true) {
+            rm = u64::MAX >> 11;
+            re -= 1;
+        } else {
+            rm -= 1;
+        }
+    }
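+    /* Editor's note: back the initial sqrt estimate off by three ulps,
+       borrowing from the exponent when the significand underflows, so the
+       correction loop below approaches the exact root from below. */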
+
+    bm <<= bs;
+    /* m2 accumulates the low 64 bits of the exact x^2 + y^2; wraparound is
+       intended, so use wrapping arithmetic (panics under debug otherwise) */
+    let mut m2: u64 = bm.wrapping_mul(bm);
+    let de: i32 = be - le;
+    let mut ls: i32 = bs - de;
+
+    if __builtin_expect(ls >= 0, true) {
+        lm <<= ls;
+        m2 = m2.wrapping_add(lm.wrapping_mul(lm));
+    } else {
+        let lm2: u128 = (lm as u128) * (lm as u128);
+        ls *= 2;
+        m2 = m2.wrapping_add((lm2 >> -ls) as u64);
+        m2 |= ((lm2 << (128 + ls)) != 0) as u64;
     }
 
-    /* special cases */
-    ex = (uxi >> 52) as i64;
-    ey = (uyi >> 52) as i64;
-    x = f64::from_bits(uxi);
-    y = f64::from_bits(uyi);
-    /* note: hypot(inf,nan) == inf */
-    if ey == 0x7ff {
-        return y;
+    let k: i32 = bs + re;
+    let mut d: i64;
+
+    loop {
+        rm += 1 + (rm >= (1u64 << 53)) as u64;
+        let tm: u64 = rm << k;
+        let rm2: u64 = tm.wrapping_mul(tm);
+        d = (m2 as i64).wrapping_sub(rm2 as i64);
+
+        if d <= 0 {
+            break;
+        }
     }
 
-    if ex == 0x7ff || uyi == 0 {
-        return x;
+
+    if d == 0 {
+        set_flags(flag);
+    } else {
+        if __builtin_expect(op == om, true) {
+            let tm: u64 = (rm << k) - (1 << (k - (rm <= (1u64 << 53)) as i32));
+            d = (m2 as i64).wrapping_sub(tm.wrapping_mul(tm) as i64);
+
+            if __builtin_expect(d != 0, true) {
+                rm += d as u64 >> 63;
+            } else {
+                rm -= rm & 1;
+            }
+        } else {
+            rm -= ((op == 1.0) as u64) << (rm > (1u64 << 53)) as u32;
+        }
     }
 
-    /* note: hypot(x,y) ~= x + y*y/x/2 with inexact for small y/x */
-    /* 64 difference is enough for ld80 double_t */
-    if ex - ey > 64 {
-        return x + y;
+
+    if rm >= (1u64 << 53) {
+        rm >>= 1;
+        re += 1;
     }
 
-    /* precise sqrt argument in nearest rounding mode without overflow */
-    /* xh*xh must not overflow and xl*xl must not underflow in sq */
-    z = 1.;
-    if ex > 0x3ff + 510 {
-        z = x1p700;
-        x *= x1p_700;
-        y *= x1p_700;
-    } else if ey < 0x3ff - 450 {
-        z = x1p_700;
-        x *= x1p700;
-        y *= x1p700;
+    let e: u64 = (be - 1 + re) as u64;
+    xi = (e << 52) + rm;
+
+    f64::from_bits(xi)
+}
+
+fn as_hypot_denorm(mut a: u64, mut b: u64) -> f64 {
+    let op: f64 = 1.0 + hf64!("0x1p-54");
+    let om: f64 = 1.0 - hf64!("0x1p-54");
+    let af: f64 = a as i64 as f64;
+    let bf: f64 = b as i64 as f64;
+    a <<= 1;
+    b <<= 1;
+    /* the truncating cast matches the implicit double -> uint64_t
+       conversion in the C original */
+    let mut rm: u64 = sqrt(af * af + bf * bf) as u64;
+    let tm: u64 = rm << 1;
+    let mut d: i64 = (a.wrapping_mul(a) as i64)
+        .wrapping_sub(tm.wrapping_mul(tm) as i64)
+        .wrapping_add(b.wrapping_mul(b) as i64);
+    let sd: i64 = d >> 63;
+    let um: i64 = ((rm as i64) ^ sd) - sd;
+    let mut drm: i64 = sd + 1;
+    let mut dd: i64 = (um << 3) + 4;
+    let mut p_d: i64;
+    rm -= drm as u64;
+    drm += sd;
+    loop {
+        p_d = d;
+        rm = rm.wrapping_add(drm as u64);
+        d = d.wrapping_sub(dd);
+        dd = d.wrapping_add(8);
+        if !__builtin_expect((d ^ p_d) > 0, false) {
+            break;
+        }
     }
 
-    let (hx, lx) = sq(x);
-    let (hy, ly) = sq(y);
-    z * sqrt(ly + lx + hy + hx)
+    p_d = (sd & d) + (!sd & p_d);
+    if __builtin_expect(p_d != 0, true) {
+        if __builtin_expect(op == om, true) {
+            let sum: i64 = p_d.wrapping_sub(4_i64.wrapping_mul(rm as i64)).wrapping_sub(1);
+
+            if __builtin_expect(sum != 0, true) {
+                rm = rm.wrapping_add((sum as u64 >> 63) + 1);
+            } else {
+                rm += rm & 1;
+            }
+        } else {
+            rm += (op > 1.0) as u64;
+        }
+    }
+    let xi: u64 = rm;
+    f64::from_bits(xi)
+}
+
+fn __builtin_expect<T>(v: T, _exp: T) -> T {
+    v
+}
+
+fn __builtin_fabs(x: f64) -> f64 {
+    x.abs()
+}
+
+fn __builtin_copysign(x: f64, y: f64) -> f64 {
+    x.copysign(y)
+}
+
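+// Editor's note: the error-compensation steps above require a fused
+// multiply-add with a single rounding; core::intrinsics::fmaf64 is a
+// nightly-only intrinsic.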
+fn __builtin_fma(x: f64, y: f64, z: f64) -> f64 {
+    unsafe { core::intrinsics::fmaf64(x, y, z) }
+}
+
+type FExcept = u32;
+
+fn get_rounding_mode(_flag: &mut FExcept) -> i32 {
+    // Always nearest
+    0
+}
+
+fn set_flags(_flag: FExcept) {}
+
+fn get_flags() -> FExcept {
+    0
 }
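+
+// Editor's sketch, not part of the original commit: a minimal sanity test,
+// assuming the crate's usual test setup (std available under #[cfg(test)]).
+#[cfg(test)]
+mod tests {
+    use super::hypot;
+
+    #[test]
+    fn hypot_sanity() {
+        // The 3-4-5 triple is exactly representable, so the result is exact.
+        assert_eq!(hypot(3.0, 4.0), 5.0);
+        // IEEE 754: hypot(+/-Inf, qNaN) = +Inf (see the comment in cr_hypot).
+        assert!(hypot(f64::INFINITY, f64::NAN).is_infinite());
+        // hypot is symmetric in its arguments.
+        assert_eq!(hypot(-3.0, 4.0), hypot(4.0, 3.0));
+    }
+}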