Skip to content

Commit 8e3dbcf

Browse files
author
andy-thomason
committed
Fully auto-generated trig functions.
1 parent 8db3885 commit 8e3dbcf

File tree

3 files changed

+277
-288
lines changed

3 files changed

+277
-288
lines changed

crates/std_float/src/lib.rs

Lines changed: 6 additions & 288 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,10 @@ use core_simd::simd;
1111

1212
use simd::{LaneCount, Simd, SupportedLaneCount};
1313

14+
mod libm32;
15+
#[cfg(test)]
16+
mod test_libm32;
17+
1418
#[cfg(feature = "as_crate")]
1519
mod experimental {
1620
pub trait Sealed {}
@@ -115,7 +119,9 @@ pub trait StdFloat: Sealed + Sized {
115119
/// Returns the floating point's fractional value, with its integer part removed.
116120
#[must_use = "method returns a new vector and does not mutate the original value"]
117121
fn fract(self) -> Self;
122+
}
118123

124+
pub trait StdLibm : StdFloat {
119125
fn sin(self) -> Self;
120126

121127
fn cos(self) -> Self;
@@ -143,112 +149,6 @@ where
143149
fn fract(self) -> Self {
144150
self - self.trunc()
145151
}
146-
147-
/// Calculate the sine of the angle
148-
/// Note: this is hand-edited from generated scalar code.
149-
/// In an ideal world, we would generate this directly by code transformation.
150-
#[inline]
151-
fn sin(self) -> Self {
152-
#[allow(non_snake_case)]
153-
let RECIP_2PI = Self::splat(0.15915494);
154-
155-
let scaled = self * RECIP_2PI;
156-
let x = scaled - scaled.round();
157-
Self::splat(-12.26885994095919635608)
158-
.mul_add(x * x, Self::splat(41.21624105096574396575))
159-
.mul_add(x * x, Self::splat(-76.58672703333290836700))
160-
.mul_add(x * x, Self::splat(81.59746095374827019356))
161-
.mul_add(x * x, Self::splat(-41.34151143437582891705))
162-
.mul_add(x * x, Self::splat(6.28318452581127506328))
163-
* x
164-
}
165-
166-
fn cos(self) -> Self {
167-
#[allow(non_snake_case)]
168-
let RECIP_2PI = Self::splat(0.15915494);
169-
170-
let scaled = self * RECIP_2PI;
171-
let x = scaled - scaled.round();
172-
Self::splat(6.52865816174499269880)
173-
.mul_add(x * x, Self::splat(-25.97327546890330396608))
174-
.mul_add(x * x, Self::splat(60.17118230812820383560))
175-
.mul_add(x * x, Self::splat(-85.45091743827674607508))
176-
.mul_add(x * x, Self::splat(64.93918704099473042873))
177-
.mul_add(x * x, Self::splat(-19.73920667935656472596))
178-
.mul_add(x * x, Self::splat(1.00000000000000000000))
179-
}
180-
181-
fn tan(self) -> Self {
182-
use core::f32::consts::PI;
183-
let scaled: Self = self * Self::splat(1.0 / PI);
184-
let x: Self = scaled - scaled.round();
185-
let recip: Self = (x * x - Self::splat(0.25)).recip();
186-
let y: Self = Self::splat(0.01439730036301634345)
187-
.mul_add(x * x, Self::splat(0.02101734538976238579))
188-
.mul_add(x * x, Self::splat(0.05285888255895108345))
189-
.mul_add(x * x, Self::splat(0.13475448281475060771))
190-
.mul_add(x * x, Self::splat(0.55773663386075044866))
191-
.mul_add(x * x, Self::splat(-0.78539816491781455948))
192-
* x;
193-
y * recip
194-
}
195-
196-
fn asin(self) -> Self {
197-
use core::f32::consts::PI;
198-
let lim: Self = Self::splat(0.9);
199-
let c: Self = self.lanes_lt(Self::splat(0.0)).select(Self::splat(-PI / 2.0), Self::splat(PI / 2.0));
200-
let s: Self = self.lanes_lt(Self::splat(0.0)).select(Self::splat(-1.0), Self::splat(1.0));
201-
let x: Self = (self * self).lanes_lt(lim * lim).select(self, (Self::splat(1.0) - self * self).sqrt());
202-
let y: Self = Self::splat(4374.97702992533695457424)
203-
.mul_add(x * x, Self::splat(-13781.55764426881951685974))
204-
.mul_add(x * x, Self::splat(17105.69475701115952774357))
205-
.mul_add(x * x, Self::splat(-10486.64894150265898388567))
206-
.mul_add(x * x, Self::splat(3231.76028705607279348342))
207-
.mul_add(x * x, Self::splat(-447.56480696327035255708))
208-
.mul_add(x * x, Self::splat(21.78206149264184872939))
209-
.mul_add(x * x, Self::splat(0.84158415752395745675))
210-
* x;
211-
(self * self).lanes_lt(lim * lim).select(y, c - y * s)
212-
}
213-
214-
fn acos(self) -> Self {
215-
use core::f32::consts::PI;
216-
let lim: Self = Self::splat(0.9);
217-
let c: Self = self.lanes_lt(Self::splat(0.0)).select(Self::splat(PI), Self::splat(0.0));
218-
let s: Self = self.lanes_lt(Self::splat(0.0)).select(Self::splat(1.0), Self::splat(-1.0));
219-
let x: Self = (self * self).lanes_lt(lim * lim).select(self, (Self::splat(1.0) - self * self).sqrt());
220-
// let c: Self = select(self < 0.0, PI, 0.0);
221-
// let s: Self = select(self < 0.0, 1.0, -1.0);
222-
// let x: Self = select(self * self < lim * lim, self, (1.0 - self * self).sqrt());
223-
let y: Self = Self::splat(4374.97702992533695457424)
224-
.mul_add(x * x, Self::splat(-13781.55764426881951685974))
225-
.mul_add(x * x, Self::splat(17105.69475701115952774357))
226-
.mul_add(x * x, Self::splat(-10486.64894150265898388567))
227-
.mul_add(x * x, Self::splat(3231.76028705607279348342))
228-
.mul_add(x * x, Self::splat(-447.56480696327035255708))
229-
.mul_add(x * x, Self::splat(21.78206149264184872939))
230-
.mul_add(x * x, Self::splat(0.84158415752395745675))
231-
* x;
232-
(self * self).lanes_lt(lim * lim).select(y, c - y * s)
233-
}
234-
235-
fn atan(self) -> Self {
236-
use core::f32::consts::PI;
237-
let lim: Self = Self::splat(1.0);
238-
let c: Self = self.lanes_lt(Self::splat(0.0)).select(Self::splat(-PI / 2.0), Self::splat(PI / 2.0));
239-
let small = self.abs().lanes_lt(lim);
240-
let x: Self = small.select(self, self.recip());
241-
let y: Self = Self::splat(95.70126383842530559360)
242-
.mul_add(x * x, Self::splat(424.99907022806059540464))
243-
.mul_add(x * x, Self::splat(-767.48259680040570156003))
244-
.mul_add(x * x, Self::splat(714.51953012224223415829))
245-
.mul_add(x * x, Self::splat(-354.32654395426962592865))
246-
.mul_add(x * x, Self::splat(83.96179897148539189638))
247-
.mul_add(x * x, Self::splat(-6.23958170715441509270))
248-
.mul_add(x * x, Self::splat(1.05498514186427524914))
249-
* x;
250-
small.select(y, c - y)
251-
}
252152
}
253153

254154
impl<const N: usize> StdFloat for Simd<f64, N>
@@ -261,36 +161,6 @@ where
261161
fn fract(self) -> Self {
262162
self - self.trunc()
263163
}
264-
265-
#[inline]
266-
fn sin(self) -> Self {
267-
self
268-
}
269-
270-
#[inline]
271-
fn cos(self) -> Self {
272-
self
273-
}
274-
275-
#[inline]
276-
fn tan(self) -> Self {
277-
self
278-
}
279-
280-
#[inline]
281-
fn asin(self) -> Self {
282-
self
283-
}
284-
285-
#[inline]
286-
fn acos(self) -> Self {
287-
self
288-
}
289-
290-
#[inline]
291-
fn atan(self) -> Self {
292-
self
293-
}
294164
}
295165

296166
#[cfg(test)]
@@ -311,156 +181,4 @@ mod tests {
311181
let _ = x2.abs() * x2;
312182
let _ = x.sin();
313183
}
314-
315-
const NUM_ITER: usize = 0x10000;
316-
317-
macro_rules! test_range {
318-
(
319-
min: $min: expr,
320-
max: $max: expr,
321-
limit: $limit: expr,
322-
scalar_fn: $scalar_fn: expr,
323-
vector_fn: $vector_fn: expr,
324-
scalar_type: $scalar_type: ty,
325-
vector_type: $vector_type: ty,
326-
) => {{
327-
let limit = <$vector_type>::splat($limit);
328-
let b = (($max) - ($min)) * (1.0 / NUM_ITER as $scalar_type);
329-
let a = $min;
330-
let sf = $scalar_fn;
331-
let vf = $vector_fn;
332-
for i in (0..NUM_ITER / 4) {
333-
let fi = (i * 4) as $scalar_type;
334-
let x = <$vector_type>::from_array([
335-
(fi + 0.0) * b + a,
336-
(fi + 1.0) * b + a,
337-
(fi + 2.0) * b + a,
338-
(fi + 3.0) * b + a,
339-
]);
340-
let yref = <$vector_type>::from_array([sf(x[0]), sf(x[1]), sf(x[2]), sf(x[3])]);
341-
let y = vf(x);
342-
let e = (y - yref);
343-
if !(e.abs().lanes_le(limit)).all() {
344-
panic!("\nx ={:20.16?}\ne ={:20.16?}\nlimit ={:20.16?}\nvector={:20.16?}\nscalar={:20.16?}\nvector_fn={}", x, e, limit, y, yref, stringify!($vector_fn));
345-
}
346-
}
347-
}};
348-
}
349-
350-
#[test]
351-
fn sin_f32() {
352-
use core::f32::consts::PI;
353-
let one_ulp = (2.0_f32).powi(-23);
354-
355-
test_range!(
356-
min: -PI/4.0,
357-
max: PI/4.0,
358-
limit: one_ulp * 1.0,
359-
scalar_fn: |x : f32| x.sin(),
360-
vector_fn: |x : f32x4| x.sin(),
361-
scalar_type: f32,
362-
vector_type: f32x4,
363-
);
364-
365-
test_range!(
366-
min: -PI/2.0,
367-
max: PI/2.0,
368-
limit: one_ulp * 2.0,
369-
scalar_fn: |x : f32| x.sin(),
370-
vector_fn: |x : f32x4| x.sin(),
371-
scalar_type: f32,
372-
vector_type: f32x4,
373-
);
374-
375-
test_range!(
376-
min: -PI,
377-
max: PI,
378-
limit: one_ulp * 8.0,
379-
scalar_fn: |x : f32| x.sin(),
380-
vector_fn: |x : f32x4| x.sin(),
381-
scalar_type: f32,
382-
vector_type: f32x4,
383-
);
384-
}
385-
386-
#[test]
387-
fn cos_f32() {
388-
use core::f32::consts::PI;
389-
let one_ulp = (2.0_f32).powi(-23);
390-
391-
// In the range +/- pi/4 the input has 1 ulp of error.
392-
test_range!(
393-
min: -PI/4.0,
394-
max: PI/4.0,
395-
limit: one_ulp * 1.0,
396-
scalar_fn: |x : f32| x.cos(),
397-
vector_fn: |x : f32x4| x.cos(),
398-
scalar_type: f32,
399-
vector_type: f32x4,
400-
);
401-
402-
// In the range +/- pi/2 the input and output has 2 ulp of error.
403-
test_range!(
404-
min: -PI/2.0,
405-
max: PI/2.0,
406-
limit: one_ulp * 2.0,
407-
scalar_fn: |x : f32| x.cos(),
408-
vector_fn: |x : f32x4| x.cos(),
409-
scalar_type: f32,
410-
vector_type: f32x4,
411-
);
412-
413-
// In the range +/- pi the input has 4 ulp of error and the output has 5.
414-
// Note that the scalar cos also has this error but the implementation
415-
// is different.
416-
test_range!(
417-
min: -PI,
418-
max: PI,
419-
limit: one_ulp * 8.0,
420-
scalar_fn: |x : f32| x.cos(),
421-
vector_fn: |x : f32x4| x.cos(),
422-
scalar_type: f32,
423-
vector_type: f32x4,
424-
);
425-
}
426-
427-
#[test]
428-
fn tan_f32() {
429-
use core::f32::consts::PI;
430-
let one_ulp = (2.0_f32).powi(-23);
431-
432-
// For the outsides, reciprocal accuracy is important.
433-
// Note that the vector function correctly gets -inf for -PI/2
434-
// but the scalar function does not.
435-
test_range!(
436-
min: -PI/2.0 + 0.00001,
437-
max: -PI/4.0,
438-
limit: one_ulp * 3.0,
439-
scalar_fn: |x : f32| x.tan().recip(),
440-
vector_fn: |x : f32x4| x.tan().recip(),
441-
scalar_type: f32,
442-
vector_type: f32x4,
443-
);
444-
445-
// For the insides, absolute accuracy is important.
446-
test_range!(
447-
min: -PI/4.0,
448-
max: PI/4.0,
449-
limit: one_ulp * 2.0,
450-
scalar_fn: |x : f32| x.tan(),
451-
vector_fn: |x : f32x4| x.tan(),
452-
scalar_type: f32,
453-
vector_type: f32x4,
454-
);
455-
456-
test_range!(
457-
min: PI/4.0,
458-
max: PI/2.0 - 0.00001,
459-
limit: one_ulp * 3.0,
460-
scalar_fn: |x : f32| x.tan().recip(),
461-
vector_fn: |x : f32x4| x.tan().recip(),
462-
scalar_type: f32,
463-
vector_type: f32x4,
464-
);
465-
}
466184
}

0 commit comments

Comments
 (0)