|
/*
Copyright (c) 2022 Alexei Sibidanov.

This file is part of the CORE-MATH project
(https://core-math.gitlabpages.inria.fr/).
*/
1 | 8 | use core::f64;
|
2 | 9 |
|
3 | 10 | use super::sqrt;
|
4 | 11 |
|
/// Veltkamp splitting constant 2^27 + 1: multiplying by SPLIT (then
/// subtracting) lets `sq` break a f64 into high/low halves of at most
/// 27 significant bits each, so their products are exact.
const SPLIT: f64 = 134217728. + 1.; // 0x1p27 + 1 === (2 ^ 27) + 1
|
6 | 13 |
|
7 |
| -fn sq(x: f64) -> (f64, f64) { |
8 |
| - let xh: f64; |
9 |
| - let xl: f64; |
10 |
| - let xc: f64; |
11 |
| - |
12 |
| - xc = x * SPLIT; |
13 |
| - xh = x - xc + xc; |
14 |
| - xl = x - xh; |
15 |
| - let hi = x * x; |
16 |
| - let lo = xh * xh - hi + 2. * xh * xl + xl * xl; |
17 |
| - (hi, lo) |
18 |
| -} |
19 |
| - |
20 | 14 | #[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
|
21 | 15 | pub fn hypot(mut x: f64, mut y: f64) -> f64 {
|
| 16 | + if true { |
| 17 | + return cr_hypot(x, y); |
| 18 | + } |
| 19 | + |
22 | 20 | let x1p700 = f64::from_bits(0x6bb0000000000000); // 0x1p700 === 2 ^ 700
|
23 | 21 | let x1p_700 = f64::from_bits(0x1430000000000000); // 0x1p-700 === 2 ^ -700
|
24 | 22 |
|
@@ -72,3 +70,281 @@ pub fn hypot(mut x: f64, mut y: f64) -> f64 {
|
72 | 70 | let (hy, ly) = sq(y);
|
73 | 71 | z * sqrt(ly + lx + hy + hx)
|
74 | 72 | }
|
| 73 | + |
/// Error-free square (Dekker): returns `(hi, lo)` such that
/// `hi + lo == x * x` exactly, where `hi` is the rounded product and
/// `lo` its rounding error. Assumes no overflow/underflow in `x * x`.
fn sq(x: f64) -> (f64, f64) {
    // Veltkamp split of x into parts of <= 27 significant bits each.
    let split = 134217728.0 + 1.0; // 2^27 + 1
    let scaled = x * split;
    let hi_part = x - scaled + scaled;
    let lo_part = x - hi_part;

    // Rounded square, then the exact error reconstructed from the
    // expansion (hi_part + lo_part)^2; every term below is exact.
    let prod = x * x;
    let err = hi_part * hi_part - prod + 2. * hi_part * lo_part + lo_part * lo_part;
    (prod, err)
}
| 86 | + |
| 87 | +fn cr_hypot(mut x: f64, mut y: f64) -> f64 { |
| 88 | + let flag = get_flags(); |
| 89 | + |
| 90 | + let xi = x.to_bits(); |
| 91 | + let yi = y.to_bits(); |
| 92 | + |
| 93 | + let emsk: u64 = 0x7ffu64 << 52; |
| 94 | + let mut ex: u64 = xi & emsk; |
| 95 | + let mut ey: u64 = yi & emsk; |
| 96 | + /* emsk corresponds to the upper bits of NaN and Inf (apart the sign bit) */ |
| 97 | + x = __builtin_fabs(x); |
| 98 | + y = __builtin_fabs(y); |
| 99 | + if __builtin_expect(ex == emsk || ey == emsk, false) { |
| 100 | + /* Either x or y is NaN or Inf */ |
| 101 | + let wx: u64 = xi << 1; |
| 102 | + let wy: u64 = yi << 1; |
| 103 | + let wm: u64 = emsk << 1; |
| 104 | + let ninf: i32 = ((wx == wm) ^ (wy == wm)) as i32; |
| 105 | + let nqnn: i32 = (((wx >> 52) == 0xfff) ^ ((wy >> 52) == 0xfff)) as i32; |
| 106 | + /* ninf is 1 when only one of x and y is +/-Inf |
| 107 | + nqnn is 1 when only one of x and y is qNaN |
| 108 | + IEEE 754 says that hypot(+/-Inf,qNaN)=hypot(qNaN,+/-Inf)=+Inf. */ |
| 109 | + if ninf > 0 && nqnn > 0 { |
| 110 | + return f64::INFINITY; |
| 111 | + } |
| 112 | + return x + y; /* inf, nan */ |
| 113 | + } |
| 114 | + |
| 115 | + let u: f64 = x.max(y); |
| 116 | + let v: f64 = x.min(y); |
| 117 | + let mut xd: u64 = u.to_bits(); |
| 118 | + let mut yd: u64 = v.to_bits(); |
| 119 | + ey = yd; |
| 120 | + if __builtin_expect(ey >> 52 == 0, false) { |
| 121 | + if yd == 0 { |
| 122 | + return f64::from_bits(xd); |
| 123 | + } |
| 124 | + ex = xd; |
| 125 | + if __builtin_expect(ex >> 52 == 0, false) { |
| 126 | + if ex == 0 { |
| 127 | + return 0.0; |
| 128 | + } |
| 129 | + return as_hypot_denorm(ex, ey); |
| 130 | + } |
| 131 | + |
| 132 | + let nz: u32 = ey.leading_zeros(); |
| 133 | + ey <<= nz - 11; |
| 134 | + ey &= u64::MAX >> 12; |
| 135 | + ey -= ((nz as i64 - 12i64) << 52) as u64; |
| 136 | + let t = ey; // why did they do this? |
| 137 | + yd = t; |
| 138 | + } |
| 139 | + |
| 140 | + let de: u64 = xd - yd; |
| 141 | + if __builtin_expect(de > (27_u64 << 52), false) { |
| 142 | + return __builtin_fma(hf64!("0x1p-27"), v, u); |
| 143 | + } |
| 144 | + |
| 145 | + let off: i64 = (0x3ff_i64 << 52) - (xd & emsk) as i64; |
| 146 | + xd = xd.wrapping_mul(off as u64); |
| 147 | + yd = yd.wrapping_mul(off as u64); |
| 148 | + x = f64::from_bits(xd); |
| 149 | + y = f64::from_bits(yd); |
| 150 | + let x2: f64 = x * x; |
| 151 | + let dx2: f64 = __builtin_fma(x, x, -x2); |
| 152 | + let y2: f64 = y * y; |
| 153 | + let dy2: f64 = __builtin_fma(y, y, -y2); |
| 154 | + let r2: f64 = x2 + y2; |
| 155 | + let ir2: f64 = 0.5 / r2; |
| 156 | + let dr2: f64 = ((x2 - r2) + y2) + (dx2 + dy2); |
| 157 | + let mut th: f64 = sqrt(r2); |
| 158 | + let rsqrt: f64 = th * ir2; |
| 159 | + let dz: f64 = dr2 - __builtin_fma(th, th, -r2); |
| 160 | + let mut tl: f64 = rsqrt * dz; |
| 161 | + th = fasttwosum(th, tl, &mut tl); |
| 162 | + let mut thd: u64 = th.to_bits(); |
| 163 | + let tld = __builtin_fabs(tl).to_bits(); |
| 164 | + ex = thd; |
| 165 | + ey = tld; |
| 166 | + ex &= 0x7ff_u64 << 52; |
| 167 | + let aidr: u64 = ey + (0x3fe_u64 << 52) - ex; |
| 168 | + let mid: u64 = (aidr.wrapping_sub(0x3c90000000000000) + 16) >> 5; |
| 169 | + if __builtin_expect( |
| 170 | + mid == 0 || aidr < 0x39b0000000000000_u64 || aidr > 0x3c9fffffffffff80_u64, |
| 171 | + false, |
| 172 | + ) { |
| 173 | + thd = as_hypot_hard(x, y, flag).to_bits(); |
| 174 | + } |
| 175 | + thd -= off as u64; |
| 176 | + if __builtin_expect(thd >= (0x7ff_u64 << 52), false) { |
| 177 | + return as_hypot_overflow(); |
| 178 | + } |
| 179 | + |
| 180 | + f64::from_bits(thd) |
| 181 | +} |
| 182 | + |
/// Fast two-sum (Dekker): returns the rounded sum `x + y` and stores the
/// exact rounding error in `e`. Valid when `|x| >= |y|` (or `x == 0`).
fn fasttwosum(x: f64, y: f64, e: &mut f64) -> f64 {
    let sum = x + y;
    // The portion of `y` that was actually absorbed into `sum`.
    let absorbed = sum - x;
    *e = y - absorbed;
    sum
}
| 189 | + |
/// Overflow result for hypot: returns +Inf, producing it by actually
/// overflowing a computation so the IEEE overflow/inexact flags are raised.
fn as_hypot_overflow() -> f64 {
    // f64::MAX == 0x1.fffffffffffffp1023, the value the hf64! literal spelled.
    let huge = f64::MAX;
    let result = huge + huge; // overflows to +Inf
    if result > huge {
        // The C original sets errno = ERANGE here; no errno in this port.
    }
    result
}
| 198 | + |
| 199 | +/* Here the square root is refined by Newton iterations: x^2+y^2 is exact |
| 200 | +and fits in a 128-bit integer, so the approximation is squared (which |
| 201 | +also fits in a 128-bit integer), compared and adjusted if necessary using |
| 202 | +the exact value of x^2+y^2. */ |
| 203 | +fn as_hypot_hard(x: f64, y: f64, flag: FExcept) -> f64 { |
| 204 | + let op: f64 = 1.0 + hf64!("0x1p-54"); |
| 205 | + let om: f64 = 1.0 - hf64!("0x1p-54"); |
| 206 | + let mut xi: u64 = x.to_bits(); |
| 207 | + let yi: u64 = y.to_bits(); |
| 208 | + let mut bm: u64 = (xi & (u64::MAX >> 12)) | 1u64 << 52; |
| 209 | + let mut lm: u64 = (yi & (u64::MAX >> 12)) | 1u64 << 52; |
| 210 | + let be: i32 = (xi >> 52) as i32; |
| 211 | + let le: i32 = (yi >> 52) as i32; |
| 212 | + let ri: u64 = sqrt(x * x + y * y).to_bits(); |
| 213 | + let bs: i32 = 2; |
| 214 | + let mut rm: u64 = ri & (u64::MAX >> 12); |
| 215 | + let mut re: i32 = ((ri >> 52) - 0x3ff) as i32; |
| 216 | + rm |= 1u64 << 52; |
| 217 | + |
| 218 | + for _ in 0..3 { |
| 219 | + if __builtin_expect(rm == 1u64 << 52, true) { |
| 220 | + rm = u64::MAX >> 11; |
| 221 | + re -= 1; |
| 222 | + } else { |
| 223 | + rm -= 1; |
| 224 | + } |
| 225 | + } |
| 226 | + bm <<= bs; |
| 227 | + let mut m2: u64 = bm * bm; |
| 228 | + let de: i32 = be - le; |
| 229 | + let mut ls: i32 = bs - de; |
| 230 | + if __builtin_expect(ls >= 0, true) { |
| 231 | + lm <<= ls; |
| 232 | + m2 += lm * lm; |
| 233 | + } else { |
| 234 | + let lm2: u128 = (lm as u128) * (lm as u128); |
| 235 | + ls *= 2; |
| 236 | + m2 += (lm2 >> -ls) as u64; |
| 237 | + m2 |= ((lm2 << (128 + ls)) != 0) as u64; |
| 238 | + } |
| 239 | + let k: i32 = bs + re; |
| 240 | + let mut d: i64; |
| 241 | + loop { |
| 242 | + rm += 1 + (rm >= (1u64 << 53)) as u64; |
| 243 | + let tm: u64 = rm << k; |
| 244 | + let rm2: u64 = tm * tm; |
| 245 | + d = m2 as i64 - rm2 as i64; |
| 246 | + |
| 247 | + if d <= 0 { |
| 248 | + break; |
| 249 | + } |
| 250 | + } |
| 251 | + |
| 252 | + if d == 0 { |
| 253 | + set_flags(flag); |
| 254 | + } else { |
| 255 | + if __builtin_expect(op == om, true) { |
| 256 | + let tm: u64 = (rm << k) - (1 << (k - (rm <= (1u64 << 53)) as i32)); |
| 257 | + d = m2 as i64 - (tm * tm) as i64; |
| 258 | + |
| 259 | + if __builtin_expect(d != 0, true) { |
| 260 | + rm += d as u64 >> 63; |
| 261 | + } else { |
| 262 | + rm -= rm & 1; |
| 263 | + } |
| 264 | + } else { |
| 265 | + rm -= ((op == 1.0) as u64) << (rm > (1u64 << 53)) as u32; |
| 266 | + } |
| 267 | + } |
| 268 | + if rm >= (1u64 << 53) { |
| 269 | + rm >>= 1; |
| 270 | + re += 1; |
| 271 | + } |
| 272 | + |
| 273 | + let e: u64 = (be - 1 + re) as u64; |
| 274 | + xi = (e << 52) + rm; |
| 275 | + |
| 276 | + f64::from_bits(xi) |
| 277 | +} |
| 278 | + |
| 279 | +fn as_hypot_denorm(mut a: u64, mut b: u64) -> f64 { |
| 280 | + let op: f64 = 1.0 + hf64!("0x1p-54"); |
| 281 | + let om: f64 = 1.0 - hf64!("0x1p-54"); |
| 282 | + let af: f64 = a as f64; |
| 283 | + let bf: f64 = b as f64; |
| 284 | + a <<= 1; |
| 285 | + b <<= 1; |
| 286 | + // Is this casting right? |
| 287 | + let mut rm: u64 = sqrt(af * af + bf * bf) as u64; |
| 288 | + let tm: u64 = rm << 1; |
| 289 | + let mut d: i64 = ((a * a) as i64) - ((tm * tm) as i64) + ((b * b) as i64); |
| 290 | + let s_d: i64 = d >> 63; |
| 291 | + let um: i64 = ((rm as i64) ^ s_d) - s_d; |
| 292 | + let mut drm: i64 = s_d + 1; |
| 293 | + let mut dd: i64 = (um << 3) + 4; |
| 294 | + let mut p_d: i64; |
| 295 | + rm -= drm as u64; |
| 296 | + drm += s_d; |
| 297 | + loop { |
| 298 | + p_d = d; |
| 299 | + rm += drm as u64; |
| 300 | + d -= dd; |
| 301 | + dd += 8; |
| 302 | + if !__builtin_expect((d ^ p_d) > 0, false) { |
| 303 | + break; |
| 304 | + } |
| 305 | + } |
| 306 | + p_d = (s_d & d) + (!s_d & p_d); |
| 307 | + if __builtin_expect(p_d != 0, true) { |
| 308 | + if __builtin_expect(op == om, true) { |
| 309 | + let sum: i64 = p_d - 4 * (rm as i64) - 1; |
| 310 | + if __builtin_expect(sum != 0, true) { |
| 311 | + rm += (sum as u64 >> 63) + 1; |
| 312 | + } else { |
| 313 | + rm += rm & 1; |
| 314 | + } |
| 315 | + } else { |
| 316 | + rm += (op > 1.0) as u64; |
| 317 | + } |
| 318 | + } |
| 319 | + let xi: u64 = rm; |
| 320 | + f64::from_bits(xi) |
| 321 | +} |
| 322 | + |
/// Stand-in for GCC's `__builtin_expect`: the branch-prediction hint is
/// ignored and the value is returned unchanged.
fn __builtin_expect<T>(value: T, _expected: T) -> T {
    value
}
| 326 | + |
/// Absolute value of `x`: clears the sign bit directly.
///
/// Implemented with safe bit manipulation instead of the unstable,
/// nightly-only `core::intrinsics::fabsf64`; behaves identically
/// (preserves NaN payloads, maps -0.0 to +0.0).
fn __builtin_fabs(x: f64) -> f64 {
    f64::from_bits(x.to_bits() & !(1u64 << 63))
}
| 330 | + |
/// Returns a value with the magnitude of `x` and the sign of `y`.
///
/// Implemented with safe bit manipulation instead of the unstable,
/// nightly-only `core::intrinsics::copysignf64`; behaves identically
/// (works for zeros, infinities and NaNs).
fn __builtin_copysign(x: f64, y: f64) -> f64 {
    const SIGN: u64 = 1u64 << 63;
    f64::from_bits((x.to_bits() & !SIGN) | (y.to_bits() & SIGN))
}
| 334 | + |
/// Fused multiply-add: computes `x * y + z` with a single rounding,
/// mirroring C's `__builtin_fma`.
///
/// NOTE(review): `core::intrinsics::fmaf64` is an unstable, nightly-only
/// intrinsic — confirm the enclosing crate enables the `core_intrinsics`
/// feature, or route this through the crate's own `fma` implementation.
fn __builtin_fma(x: f64, y: f64, z: f64) -> f64 {
    // SAFETY: fmaf64 is a pure arithmetic intrinsic on three f64 values
    // with no preconditions.
    unsafe { core::intrinsics::fmaf64(x, y, z) }
}
| 338 | + |
/// Placeholder for the C `fexcept_t` floating-point status-flag type.
type FExcept = u32;

/// Rounding-mode query stub: this port always reports round-to-nearest.
fn get_rounding_mode(_flag: &mut FExcept) -> i32 {
    0 // nearest
}

/// Status-flag restore stub: the Rust port does not touch the FP
/// environment, so this is a no-op.
fn set_flags(_flag: FExcept) {}

/// Status-flag read stub: always reports a clear flag state.
fn get_flags() -> FExcept {
    0
}
0 commit comments