Skip to content

Commit c4450a9

Browse files
committed
refactor: restructure code into modules
1 parent 8e25087 commit c4450a9

File tree

11 files changed

+1767
-1472
lines changed

11 files changed

+1767
-1472
lines changed

src/algorithms/dif.rs

Lines changed: 249 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,249 @@
1+
//! Decimation-in-Frequency (DIF) FFT Implementation
2+
//!
3+
//! The DIF algorithm applies butterflies that progressively increase from size 2^{0} up to size
4+
//! `2^{log(N)}`, where `N` is the size of the input, and `log(N)` is the # of stages required to
5+
//! process the input.
6+
//!
7+
//! Input is processed in natural order and output can be bit-reversed.
8+
//!
9+
//! ## Algorithm Overview
10+
//!
11+
//! 1. Start with butterflies with strides of 2^{log(N)}, where N is the size of the input.
12+
//! 2. Works up to the last stage, with log(N) stages in total.
13+
//! 3. Optionally apply bit-reversal at the end
14+
//!
15+
use crate::cobra::cobra_apply;
16+
use crate::kernels::common::{fft_chunk_2, fft_chunk_4};
17+
use crate::kernels::dif::{fft_32_chunk_n_simd, fft_64_chunk_n_simd, fft_chunk_n};
18+
use crate::options::Options;
19+
use crate::planner::{Direction, Planner32, Planner64};
20+
use crate::twiddles::filter_twiddles;
21+
22+
/// DIF FFT for f64 with options and pre-computed planner
23+
///
24+
/// This is the core DIF implementation that processes data from large butterflies
25+
/// to small, optionally applying bit-reversal at the end.
26+
///
27+
/// # Arguments
28+
///
29+
/// * `reals` - Real components of the signal (modified in-place)
30+
/// * `imags` - Imaginary components of the signal (modified in-place)
31+
/// * `opts` - Options controlling optimization strategies
32+
/// * `planner` - Pre-computed planner with twiddle factors
33+
///
34+
/// # Panics
35+
///
36+
/// Panics if input length is not a power of 2 or if real and imaginary arrays have different lengths
37+
#[multiversion::multiversion(
38+
targets("x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl", // x86_64-v4
39+
"x86_64+avx2+fma", // x86_64-v3
40+
"x86_64+sse4.2", // x86_64-v2
41+
"x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl",
42+
"x86+avx2+fma",
43+
"x86+sse4.2",
44+
"x86+sse2",
45+
"aarch64+neon", // ARM64 with NEON (Apple Silicon M1/M2)
46+
))]
47+
pub fn fft_64_with_opts_and_plan(
48+
reals: &mut [f64],
49+
imags: &mut [f64],
50+
opts: &Options,
51+
planner: &Planner64,
52+
) {
53+
assert!(reals.len() == imags.len() && reals.len().is_power_of_two());
54+
let n: usize = reals.len().ilog2() as usize;
55+
56+
let twiddles_re = &planner.twiddles_re;
57+
let twiddles_im = &planner.twiddles_im;
58+
59+
assert!(twiddles_re.len() == reals.len() / 2 && twiddles_im.len() == imags.len() / 2);
60+
61+
// Handle inverse transform
62+
if let Direction::Reverse = planner.direction {
63+
for z_im in imags.iter_mut() {
64+
*z_im = -*z_im;
65+
}
66+
};
67+
68+
// First stage - no twiddle filtering needed
69+
let dist = 1 << (n - 1);
70+
let chunk_size = dist << 1;
71+
72+
if chunk_size > 4 {
73+
if chunk_size >= 8 * 2 {
74+
fft_64_chunk_n_simd(reals, imags, twiddles_re, twiddles_im, dist);
75+
} else {
76+
fft_chunk_n(reals, imags, twiddles_re, twiddles_im, dist);
77+
}
78+
} else if chunk_size == 4 {
79+
fft_chunk_4(reals, imags);
80+
} else if chunk_size == 2 {
81+
fft_chunk_2(reals, imags);
82+
}
83+
84+
let (mut filtered_twiddles_re, mut filtered_twiddles_im) =
85+
filter_twiddles(twiddles_re, twiddles_im);
86+
87+
// Subsequent stages with filtered twiddles
88+
for t in (0..n - 1).rev() {
89+
let dist = 1 << t;
90+
let chunk_size = dist << 1;
91+
92+
if chunk_size > 4 {
93+
if chunk_size >= 8 * 2 {
94+
fft_64_chunk_n_simd(
95+
reals,
96+
imags,
97+
&filtered_twiddles_re,
98+
&filtered_twiddles_im,
99+
dist,
100+
);
101+
} else {
102+
fft_chunk_n(
103+
reals,
104+
imags,
105+
&filtered_twiddles_re,
106+
&filtered_twiddles_im,
107+
dist,
108+
);
109+
}
110+
} else if chunk_size == 4 {
111+
fft_chunk_4(reals, imags);
112+
} else if chunk_size == 2 {
113+
fft_chunk_2(reals, imags);
114+
}
115+
116+
(filtered_twiddles_re, filtered_twiddles_im) =
117+
filter_twiddles(&filtered_twiddles_re, &filtered_twiddles_im);
118+
}
119+
120+
// Optional bit reversal (controlled by options)
121+
if opts.dif_perform_bit_reversal {
122+
if opts.multithreaded_bit_reversal {
123+
std::thread::scope(|s| {
124+
s.spawn(|| cobra_apply(reals, n));
125+
s.spawn(|| cobra_apply(imags, n));
126+
});
127+
} else {
128+
cobra_apply(reals, n);
129+
cobra_apply(imags, n);
130+
}
131+
}
132+
133+
// Scaling for inverse transform
134+
if let Direction::Reverse = planner.direction {
135+
let scaling_factor = (reals.len() as f64).recip();
136+
for (z_re, z_im) in reals.iter_mut().zip(imags.iter_mut()) {
137+
*z_re *= scaling_factor;
138+
*z_im *= -scaling_factor;
139+
}
140+
}
141+
}
142+
143+
/// DIF FFT for f32 with options and pre-computed planner
144+
///
145+
/// This is the core DIF implementation for single-precision floating point.
146+
#[multiversion::multiversion(targets(
147+
"x86_64+avx512f+avx512bw+avx512cd+avx512dq+avx512vl",
148+
"x86_64+avx2+fma",
149+
"x86_64+sse4.2",
150+
"x86+avx512f+avx512bw+avx512cd+avx512dq+avx512vl",
151+
"x86+avx2+fma",
152+
"x86+sse4.2",
153+
"x86+sse2",
154+
"aarch64+neon",
155+
))]
156+
pub fn fft_32_with_opts_and_plan(
157+
reals: &mut [f32],
158+
imags: &mut [f32],
159+
opts: &Options,
160+
planner: &Planner32,
161+
) {
162+
assert!(reals.len() == imags.len() && reals.len().is_power_of_two());
163+
let n: usize = reals.len().ilog2() as usize;
164+
165+
let twiddles_re = &planner.twiddles_re;
166+
let twiddles_im = &planner.twiddles_im;
167+
168+
assert!(twiddles_re.len() == reals.len() / 2 && twiddles_im.len() == imags.len() / 2);
169+
170+
// Handle inverse transform
171+
if let Direction::Reverse = planner.direction {
172+
for z_im in imags.iter_mut() {
173+
*z_im = -*z_im;
174+
}
175+
}
176+
177+
// First stage - no twiddle filtering needed
178+
let dist = 1 << (n - 1);
179+
let chunk_size = dist << 1;
180+
181+
if chunk_size > 4 {
182+
if chunk_size >= 16 * 2 {
183+
fft_32_chunk_n_simd(reals, imags, twiddles_re, twiddles_im, dist);
184+
} else {
185+
fft_chunk_n(reals, imags, twiddles_re, twiddles_im, dist);
186+
}
187+
} else if chunk_size == 4 {
188+
fft_chunk_4(reals, imags);
189+
} else if chunk_size == 2 {
190+
fft_chunk_2(reals, imags);
191+
}
192+
193+
let (mut filtered_twiddles_re, mut filtered_twiddles_im) =
194+
filter_twiddles(twiddles_re, twiddles_im);
195+
196+
// Subsequent stages with filtered twiddles
197+
for t in (0..n - 1).rev() {
198+
let dist = 1 << t;
199+
let chunk_size = dist << 1;
200+
201+
if chunk_size > 4 {
202+
if chunk_size >= 16 * 2 {
203+
fft_32_chunk_n_simd(
204+
reals,
205+
imags,
206+
&filtered_twiddles_re,
207+
&filtered_twiddles_im,
208+
dist,
209+
);
210+
} else {
211+
fft_chunk_n(
212+
reals,
213+
imags,
214+
&filtered_twiddles_re,
215+
&filtered_twiddles_im,
216+
dist,
217+
);
218+
}
219+
} else if chunk_size == 4 {
220+
fft_chunk_4(reals, imags);
221+
} else if chunk_size == 2 {
222+
fft_chunk_2(reals, imags);
223+
}
224+
225+
(filtered_twiddles_re, filtered_twiddles_im) =
226+
filter_twiddles(&filtered_twiddles_re, &filtered_twiddles_im);
227+
}
228+
229+
if opts.dif_perform_bit_reversal {
230+
if opts.multithreaded_bit_reversal {
231+
std::thread::scope(|s| {
232+
s.spawn(|| cobra_apply(reals, n));
233+
s.spawn(|| cobra_apply(imags, n));
234+
});
235+
} else {
236+
cobra_apply(reals, n);
237+
cobra_apply(imags, n);
238+
}
239+
}
240+
241+
// Scaling for inverse transform
242+
if let Direction::Reverse = planner.direction {
243+
let scaling_factor = (reals.len() as f32).recip();
244+
for (z_re, z_im) in reals.iter_mut().zip(imags.iter_mut()) {
245+
*z_re *= scaling_factor;
246+
*z_im *= -scaling_factor;
247+
}
248+
};
249+
}

0 commit comments

Comments
 (0)