@@ -7,10 +7,38 @@ use core::f32;
77use fearless_simd:: { f32x16, f32x4, f32x8, f64x4, f64x8, Simd , SimdBase , SimdFloat , SimdFrom } ;
88use num_traits:: Float ;
99
10+ /// DIT butterfly for chunk_size == 2 (f64)
11+ #[ inline( never) ] // otherwise every kernel gets inlined into the parent and ARM perf drops due to register pressure
12+ pub fn fft_dit_chunk_2_f64 < S : Simd > ( simd : S , reals : & mut [ f64 ] , imags : & mut [ f64 ] ) {
13+ simd. vectorize (
14+ #[ inline( always) ]
15+ || fft_dit_chunk_2_simd_f64 ( simd, reals, imags) ,
16+ )
17+ }
18+
19+ /// DIT butterfly for chunk_size == 2 (f32)
20+ #[ inline( never) ] // otherwise every kernel gets inlined into the parent and ARM perf drops due to register pressure
21+ pub fn fft_dit_chunk_2_f32 < S : Simd > ( simd : S , reals : & mut [ f32 ] , imags : & mut [ f32 ] ) {
22+ simd. vectorize (
23+ #[ inline( always) ]
24+ || fft_dit_chunk_2_simd_f32 ( simd, reals, imags) ,
25+ )
26+ }
27+
28+ #[ inline( always) ] // required by fearless_simd
29+ pub fn fft_dit_chunk_2_simd_f32 < S : Simd > ( simd : S , reals : & mut [ f32 ] , imags : & mut [ f32 ] ) {
30+ fft_dit_chunk_2 ( simd, reals, imags)
31+ }
32+
33+ #[ inline( always) ] // required by fearless_simd
34+ pub fn fft_dit_chunk_2_simd_f64 < S : Simd > ( simd : S , reals : & mut [ f64 ] , imags : & mut [ f64 ] ) {
35+ fft_dit_chunk_2 ( simd, reals, imags)
36+ }
37+
1038/// DIT butterfly for chunk_size == 2
1139/// Identical to DIF version (no twiddles at size 2)
1240#[ inline( always) ] // required by fearless_simd
13- pub fn fft_dit_chunk_2 < S : Simd , T : Float > ( _simd : S , reals : & mut [ T ] , imags : & mut [ T ] ) {
41+ fn fft_dit_chunk_2 < S : Simd , T : Float > ( _simd : S , reals : & mut [ T ] , imags : & mut [ T ] ) {
1442 reals
1543 . chunks_exact_mut ( 2 )
1644 . zip ( imags. chunks_exact_mut ( 2 ) )
0 commit comments