diff --git a/doc/classes/SignalSmith.xml b/doc/classes/SignalSmith.xml new file mode 100644 index 0000000000..352f02cfe0 --- /dev/null +++ b/doc/classes/SignalSmith.xml @@ -0,0 +1,60 @@ + + + + Performs time-stretching and pitch-shifting on raw audio buffers using Signalsmith. + + + SignalSmith is a low-level audio processing utility which wraps the Signalsmith time-stretching library. It operates on raw interleaved floating-point PCM audio buffers and allows independent control of playback tempo and pitch. + + + + + + + + + Processes a block of interleaved audio samples and returns a new buffer containing the time-stretched and pitch-shifted result. + The effective playback speed is determined by the ratio of input samples to output samples, as influenced by the current tempo setting. The input buffer must contain 32-bit floating-point PCM data and be interleaved according to the configured channel count. + + + + + + Resets the internal processing state. + This should be called when restarting playback or discontinuously changing input streams. + + + + + + + Sets the number of audio channels. + Input and output buffers are expected to be interleaved according to this channel count. + + + + + + + Sets the pitch transpose factor. + A value of `1.0` leaves pitch unchanged. Values greater than `1.0` raise pitch, while values less than `1.0` lower pitch. + + + + + + + Sets the sample rate, in Hz, used by the internal processing engine. + Changing the sample rate resets the internal state. + + + + + + + Sets the tempo multiplier used during processing. + This value influences the ratio between input and output buffer sizes during time-stretching. A value of `1.0` preserves the original tempo, values greater than `1.0` speed up playback, and values less than `1.0` slow it down. + + + + diff --git a/modules/signalsmith/SCsub b/modules/signalsmith/SCsub new file mode 100644 index 0000000000..3de9dedca8 --- /dev/null +++ b/modules/signalsmith/SCsub @@ -0,0 +1,33 @@ +#!/usr/bin/env python +from misc.utility.scons_hints import * + +Import("env") + +signalsmith_env = env.Clone() + +signalsmith_env.Append(CPPPATH=["#thirdparty"]) + +cxx = env.subst("$CXX") +is_clang = "clang" in cxx +is_gcc = ("gcc" in cxx or "g++" in cxx) and not is_clang +is_msvc = ("cl" in cxx) and not is_clang + +if is_gcc: + signalsmith_env.Append(CCFLAGS=["-Wno-class-memaccess"]) + signalsmith_env.Append(CCFLAGS=["-Wno-shadow"]) + +# silence "hides previous declaration/member" warnings +if is_msvc: + signalsmith_env.Append( + CCFLAGS=[ + "/wd4456", # hides previous local declaration + "/wd4458", # hides class member + ] + ) + +module_sources = [ + "register_types.cpp", + "signalsmith_module.cpp", +] + +signalsmith_env.add_source_files(env.modules_sources, module_sources) diff --git a/modules/signalsmith/config.py b/modules/signalsmith/config.py new file mode 100644 index 0000000000..d22f9454ed --- /dev/null +++ b/modules/signalsmith/config.py @@ -0,0 +1,6 @@ +def can_build(env, platform): + return True + + +def configure(env): + pass diff --git a/modules/signalsmith/register_types.cpp b/modules/signalsmith/register_types.cpp new file mode 100644 index 0000000000..a28b031257 --- /dev/null +++ b/modules/signalsmith/register_types.cpp @@ -0,0 +1,49 @@ +/**************************************************************************/ +/* register_types.cpp */ +/**************************************************************************/ +/* This file is part of: */ +/* REDOT ENGINE */ +/* https://redotengine.org */ +/**************************************************************************/ +/* Copyright (c) 2024-present Redot Engine contributors */ +/* (see REDOT_AUTHORS.md) */ +/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */ +/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/**************************************************************************/ + +#include "register_types.h" +#include "core/object/class_db.h" +#include "signalsmith_module.h" + +void initialize_signalsmith_module(ModuleInitializationLevel p_level) { + if (p_level != MODULE_INITIALIZATION_LEVEL_SCENE) { + return; + } + + ClassDB::register_class(); +} + +void uninitialize_signalsmith_module(ModuleInitializationLevel p_level) { + if (p_level != MODULE_INITIALIZATION_LEVEL_SCENE) { + return; + } +} diff --git a/modules/signalsmith/register_types.h b/modules/signalsmith/register_types.h new file mode 100644 index 0000000000..5ead1a2a91 --- /dev/null +++ b/modules/signalsmith/register_types.h @@ -0,0 +1,38 @@ +/**************************************************************************/ +/* register_types.h */ +/**************************************************************************/ +/* This file is part of: */ +/* REDOT ENGINE */ +/* https://redotengine.org */ +/**************************************************************************/ +/* Copyright (c) 2024-present Redot Engine contributors */ +/* (see REDOT_AUTHORS.md) */ +/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */ +/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/**************************************************************************/ + +#pragma once + +#include "modules/register_module_types.h" + +void initialize_signalsmith_module(ModuleInitializationLevel p_level); +void uninitialize_signalsmith_module(ModuleInitializationLevel p_level); diff --git a/modules/signalsmith/signalsmith_module.cpp b/modules/signalsmith/signalsmith_module.cpp new file mode 100644 index 0000000000..3ead1acbe6 --- /dev/null +++ b/modules/signalsmith/signalsmith_module.cpp @@ -0,0 +1,173 @@ +/**************************************************************************/ +/* signalsmith_module.cpp */ +/**************************************************************************/ +/* This file is part of: */ +/* REDOT ENGINE */ +/* https://redotengine.org */ +/**************************************************************************/ +/* Copyright (c) 2024-present Redot Engine contributors */ +/* (see REDOT_AUTHORS.md) */ +/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */ +/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/**************************************************************************/ + +#include "signalsmith_module.h" + +#include "core/os/memory.h" + +#include +#include + +void SignalSmith::_bind_methods() { + ClassDB::bind_method(D_METHOD("set_sample_rate", "rate"), &SignalSmith::set_sample_rate); + ClassDB::bind_method(D_METHOD("set_channels", "channels"), &SignalSmith::set_channels); + ClassDB::bind_method(D_METHOD("set_pitch", "pitch"), &SignalSmith::set_pitch); + ClassDB::bind_method(D_METHOD("set_tempo", "tempo"), &SignalSmith::set_tempo); + ClassDB::bind_method(D_METHOD("reset"), &SignalSmith::reset); + ClassDB::bind_method(D_METHOD("process", "input"), &SignalSmith::process); +} + +SignalSmith::SignalSmith() { + stretch.presetDefault(channels, sample_rate); +} + +SignalSmith::~SignalSmith() {} + +void SignalSmith::set_sample_rate(int p_rate) { + if (p_rate < 1) { + return; + } + + sample_rate = p_rate; + stretch.presetDefault(channels, sample_rate); +} + +void SignalSmith::set_channels(int p_channels) { + if (p_channels < 1) { + return; + } + + channels = p_channels; + stretch.presetDefault(channels, sample_rate); +} + +void SignalSmith::set_pitch(float p_pitch) { + if (!(p_pitch > 0.0f)) { + return; + } + + stretch.setTransposeFactor(p_pitch); +} + +void SignalSmith::set_tempo(float p_tempo) { + if (!(p_tempo > 0.0f)) { + return; + } + + tempo = p_tempo; +} + +void SignalSmith::reset() { + stretch.reset(); +} + +PackedFloat32Array SignalSmith::process(const PackedFloat32Array &input) { + PackedFloat32Array output; + + if (channels < 1) { + return output; + } + + const int total_samples = input.size(); + + if (total_samples <= 0) { + return output; + } + + if (total_samples % channels != 0) { + ERR_FAIL_V_MSG(output, "Input array size must be a multiple of channel count."); + } + + const int input_frames = total_samples / channels; + + if (input_frames <= 0) { + return output; + } + + const float tf = (tempo > 0.0f) ? tempo : 1.0f; + int output_frames = (int)std::lround((double)input_frames / (double)tf); + + if (output_frames < 0) { + output_frames = 0; + } + + // Deinterleave + std::vector> in_ch; + in_ch.resize((size_t)channels); + + for (int c = 0; c < channels; c++) { + in_ch[(size_t)c].resize((size_t)input_frames); + } + + const float *src = input.ptr(); + + for (int i = 0; i < input_frames; i++) { + const int base = i * channels; + + for (int c = 0; c < channels; c++) { + in_ch[(size_t)c][(size_t)i] = src[base + c]; + } + } + + // Output buffers + std::vector> out_ch; + out_ch.resize((size_t)channels); + + for (int c = 0; c < channels; c++) { + out_ch[(size_t)c].assign((size_t)output_frames, 0.0f); + } + + std::vector in_ptrs((size_t)channels, nullptr); + std::vector out_ptrs((size_t)channels, nullptr); + + for (int c = 0; c < channels; c++) { + in_ptrs[(size_t)c] = in_ch[(size_t)c].data(); + out_ptrs[(size_t)c] = out_ch[(size_t)c].data(); + } + + // Process: (inputs, inputSamples, outputs, outputSamples) + stretch.process(in_ptrs.data(), input_frames, out_ptrs.data(), output_frames); + + // Interleave + output.resize(output_frames * channels); + float *dst = output.ptrw(); + + for (int i = 0; i < output_frames; i++) { + const int base = i * channels; + + for (int c = 0; c < channels; c++) { + dst[base + c] = out_ch[(size_t)c][(size_t)i]; + } + } + + return output; +} diff --git a/modules/signalsmith/signalsmith_module.h b/modules/signalsmith/signalsmith_module.h new file mode 100644 index 0000000000..e60453dee8 --- /dev/null +++ b/modules/signalsmith/signalsmith_module.h @@ -0,0 +1,63 @@ +/**************************************************************************/ +/* signalsmith_module.h */ +/**************************************************************************/ +/* This file is part of: */ +/* REDOT ENGINE */ +/* https://redotengine.org */ +/**************************************************************************/ +/* Copyright (c) 2024-present Redot Engine contributors */ +/* (see REDOT_AUTHORS.md) */ +/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */ +/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/**************************************************************************/ + +#pragma once + +#include "core/object/class_db.h" +#include "core/object/ref_counted.h" +#include "signalsmith-stretch/signalsmith-stretch.h" +#include + +class SignalSmith : public RefCounted { + GDCLASS(SignalSmith, RefCounted); + +private: + signalsmith::stretch::SignalsmithStretch stretch; + int sample_rate = 44100; + int channels = 2; + float tempo = 1.0f; + +protected: + static void _bind_methods(); + +public: + SignalSmith(); + ~SignalSmith(); + + void set_sample_rate(int p_rate); + void set_channels(int p_channels); + void set_pitch(float p_pitch); + void set_tempo(float p_tempo); + void reset(); + + PackedFloat32Array process(const PackedFloat32Array &input); +}; diff --git a/thirdparty/signalsmith-linear/.gitignore b/thirdparty/signalsmith-linear/.gitignore new file mode 100644 index 0000000000..ee8cad017e --- /dev/null +++ b/thirdparty/signalsmith-linear/.gitignore @@ -0,0 +1,3 @@ +.DS_Store +tests/others/dsp +out/ diff --git a/thirdparty/signalsmith-linear/LICENSE.txt b/thirdparty/signalsmith-linear/LICENSE.txt new file mode 100644 index 0000000000..c2569820cd --- /dev/null +++ b/thirdparty/signalsmith-linear/LICENSE.txt @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2025 Signalsmith Audio + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/thirdparty/signalsmith-linear/README.md b/thirdparty/signalsmith-linear/README.md new file mode 100644 index 0000000000..177b7c0227 --- /dev/null +++ b/thirdparty/signalsmith-linear/README.md @@ -0,0 +1,99 @@ +# Signalsmith Linear + +Header-only C++11 wrappers for common things needed by our audio libraries. It's designed for internal use, so [caveat developor](https://en.wikipedia.org/wiki/Caveat_emptor). The goal is to wrap around Accelerate/IPP when available, but still work without it. + +Everything is in `signalsmith::linear::` namespace. + +## FFTs + +```cpp +#include "signalsmith-linear/fft.h" +``` + +This provides real and complex FFTs. They all have `.resize(size_t)`, and `.fft()`/`.ifft()` methods which can take either `std::complex<>` pointers, or a real/imaginary pair ("split-complex" form) for each complex argument. + +The `Pow2FFT<>` and `Pow2RealFFT<>` templates wrap around fast implementations where available. + +The main `FFT<>`, `RealFFT<>` and `ModifiedRealFFT<>` templates wrap around the `Pow2<>` implementations, to add support for multiples of 3 and 5. They provide a static `.fastSizeAbove()` to find the next biggest size. + +### Chunked computation + +The main FFT classes also provide `.steps()` method, and an optional `size_t` first argument to the `.fft()`/`.ifft()` methods, so that computation can be divided up into chunks. + +The computation time for the chunks is not exactly equal, but when you're doing large FFTs periodically (instead of smaller ones regularly) it can help distribute the computation out, without using threads. + +## STFTs + +```cpp +#include "signalsmith-linear/stft.h" +``` + +This provides `DynamicSTFT<>` template, which is configured for any block length (zero padding to a fast size), and a (default!) interval between blocks. It can have a different number of input/output channels, using `.spectrum(c)` to access both. + +### Input and output + +The `.synthesise()` method moves the output time forward, and adds in output block(s) generated from the spectrum. The result can then be queried using `.readOutput()`. + +Input is passed in using `.writeInput()`, but you must move the input time forward using `.moveInput()` before calling `.analyse()`. By default, this will analyse the most recent block of input, but if you passed in a non-zero `extraInputHistory` when configuring, you can analyse input from some time in the past. + +### Window shape and block interval + +It has separate analysis/synthesis windows, which can be changed on-the-fly. Both windows have an "offset" marking the centre of the window, to support asymmetrical setups (for reduced latency). + +The "synthesis interval" (optionally passed to `.synthesise()`) can also change arbitrarily. The output is normalised according to these gaps (and the analysis/synthesis windows) such that regardless of spacing or window shape, it would perfectly reconstruct the input. + +The analysis should remain constant between analysis and synthesis, since it's used for normalising the output properly. If you're synthesising signals from scratch (which have no inherent input windowing), then you should reflect that by setting the analysis window to all 1s. + +### Chunked computation + +Similar to the FFTs' `.steps()` methods, `DynamicSTFT` has `.analyseSteps()`/`.synthesiseSteps()` methods, and `.analyseStep()`/`.synthesiseStep()` to help you spread the computation out over time. + +The window shapes/offsets, input/output and synthesis interval must stay the same until the analysis/synthesis is finished. + +## Expressions + +```cpp +#include "signalsmith-linear/linear.h" +``` + +The main `Linear` class provides expression templates, which wrap around three types of pointer: real, complex, and split-complex. You can wrap these pointers (or `std::vector`s) into `Expression`s using the `()` operator: + +```cpp +Linear linear; + +float *a, *b; +std::complex *c; +size_t size; + +// Once the above variables are set up: +linear(a, size) = linear(b) + linear(c).abs(); + +// Pass in two real pointers to make a split-complex expression (which is often a bit faster). +linear(c, size) = linear(a, b).conj(); +``` + +Implementations may use temporary internal storage. This means it's not thread-safe, it should be a member of your processing class, and you should also call `.reserve???()` during configuration, with the longest vector length you expect to use. + +## Building + +### CMake + +If you're using CMake, include this directory. It will add a `signalsmith-linear` target which doesn't build anything, but linking to this "library" will add the include path. + +By default, it will link to Accelerate on Mac. If you don't want this, set the CMake option `SIGNALSMITH_USE_ACCELERATE` to `OFF`. + +The similar option `SIGNALSMITH_USE_IPP` if `OFF` by default. When enabled, it will link to IPP and set the `SIGNALSMITH_USE_IPP` preprocessor definition. + +`SIGNALSMITH_USE_PFFFT` and `SIGNALSMITH_USE_PFFFT_DOUBLE` don't link to anything, because there are multiple versions. + +### Other + +To use Accelerate on Mac, link the framework and define `SIGNALSMITH_USE_ACCELERATE`: + +``` +g++ -framework Accelerate -DSIGNALSMITH_USE_ACCELERATE +``` + +Similarly, define `SIGNALSMITH_USE_IPP` (and link to `IPP::ippcore` and `IPP::ipps`) for IPP. + +Not all PFFFT versions support double-precision, so there are separate `SIGNALSMITH_USE_PFFFT`/`SIGNALSMITH_USE_PFFFT_DOUBLE` flags. diff --git a/thirdparty/signalsmith-linear/SUPPORT.txt b/thirdparty/signalsmith-linear/SUPPORT.txt new file mode 100644 index 0000000000..96e02c2956 --- /dev/null +++ b/thirdparty/signalsmith-linear/SUPPORT.txt @@ -0,0 +1,10 @@ +# https://github.com/geraintluff/SUPPORT.txt +# These people have committed to maintain the project until at least these dates + +2027-01-01 Geraint Luff + +FFTs and STFTs: +2027-01-01 Geraint Luff + +Linear: +This is too experimental, you're on your own. diff --git a/thirdparty/signalsmith-linear/fft.h b/thirdparty/signalsmith-linear/fft.h new file mode 100644 index 0000000000..80b559bab8 --- /dev/null +++ b/thirdparty/signalsmith-linear/fft.h @@ -0,0 +1,1380 @@ +#ifndef SIGNALSMITH_AUDIO_LINEAR_FFT_H +#define SIGNALSMITH_AUDIO_LINEAR_FFT_H + +#include +#include +#include + +#if defined(__FAST_MATH__) && (__apple_build_version__ >= 16000000) && (__apple_build_version__ <= 16000099) && !defined(SIGNALSMITH_IGNORE_BROKEN_APPLECLANG) +# error Apple Clang 16.0.0 generates incorrect SIMD for ARM. If you HAVE to use this version of Clang, turn off -ffast-math. +#endif + +#ifndef M_PI +# define M_PI 3.14159265358979323846 +#endif + +namespace signalsmith { namespace linear { + +namespace _impl { + // Helpers for complex arithmetic, ignoring the NaN/Inf edge-cases you get without `-ffast-math` + template + void complexMul(std::complex *a, const std::complex *b, const std::complex *c, size_t size) { + for (size_t i = 0; i < size; ++i) { + auto bi = b[i], ci = c[i]; + a[i] = {bi.real()*ci.real() - bi.imag()*ci.imag(), bi.imag()*ci.real() + bi.real()*ci.imag()}; + } + } + template + void complexMulConj(std::complex *a, const std::complex *b, const std::complex *c, size_t size) { + for (size_t i = 0; i < size; ++i) { + auto bi = b[i], ci = c[i]; + a[i] = {bi.real()*ci.real() + bi.imag()*ci.imag(), bi.imag()*ci.real() - bi.real()*ci.imag()}; + } + } + template + void complexMul(V *ar, V *ai, const V *br, const V *bi, const V *cr, const V *ci, size_t size) { + for (size_t i = 0; i < size; ++i) { + V rr = br[i]*cr[i] - bi[i]*ci[i]; + V ri = br[i]*ci[i] + bi[i]*cr[i]; + ar[i] = rr; + ai[i] = ri; + } + } + template + void complexMulConj(V *ar, V *ai, const V *br, const V *bi, const V *cr, const V *ci, size_t size) { + for (size_t i = 0; i < size; ++i) { + V rr = cr[i]*br[i] + ci[i]*bi[i]; + V ri = cr[i]*bi[i] - ci[i]*br[i]; + ar[i] = rr; + ai[i] = ri; + } + } + + // Input: aStride elements next to each other -> output with bStride + template + void interleaveCopy(const V *a, V *b, size_t bStride) { + for (size_t bi = 0; bi < bStride; ++bi) { + const V *offsetA = a + bi*aStride; + V *offsetB = b + bi; + for (size_t ai = 0; ai < aStride; ++ai) { + offsetB[ai*bStride] = offsetA[ai]; + } + } + } + template + void interleaveCopy(const V *a, V *b, size_t aStride, size_t bStride) { + for (size_t bi = 0; bi < bStride; ++bi) { + const V *offsetA = a + bi*aStride; + V *offsetB = b + bi; + for (size_t ai = 0; ai < aStride; ++ai) { + offsetB[ai*bStride] = offsetA[ai]; + } + } + } + template + void interleaveCopy(const V *aReal, const V *aImag, V *bReal, V *bImag, size_t bStride) { + for (size_t bi = 0; bi < bStride; ++bi) { + const V *offsetAr = aReal + bi*aStride; + const V *offsetAi = aImag + bi*aStride; + V *offsetBr = bReal + bi; + V *offsetBi = bImag + bi; + for (size_t ai = 0; ai < aStride; ++ai) { + offsetBr[ai*bStride] = offsetAr[ai]; + offsetBi[ai*bStride] = offsetAi[ai]; + } + } + } + template + void interleaveCopy(const V *aReal, const V *aImag, V *bReal, V *bImag, size_t aStride, size_t bStride) { + for (size_t bi = 0; bi < bStride; ++bi) { + const V *offsetAr = aReal + bi*aStride; + const V *offsetAi = aImag + bi*aStride; + V *offsetBr = bReal + bi; + V *offsetBi = bImag + bi; + for (size_t ai = 0; ai < aStride; ++ai) { + offsetBr[ai*bStride] = offsetAr[ai]; + offsetBi[ai*bStride] = offsetAi[ai]; + } + } + } +} + +/// Fairly simple and very portable power-of-2 FFT +template +struct SimpleFFT { + using Complex = std::complex; + + SimpleFFT(size_t size=0) { + resize(size); + } + + void resize(size_t size) { + twiddles.resize(size*3/4); + for (size_t i = 0; i < size*3/4; ++i) { + Sample twiddlePhase = -2*M_PI*i/size; + twiddles[i] = std::polar(Sample(1), twiddlePhase); + } + working.resize(size); + } + + void fft(const Complex *time, Complex *freq) { + size_t size = working.size(); + if (size <= 1) { + *freq = *time; + return; + } + fftPass(size, 1, time, freq, working.data()); + } + + void ifft(const Complex *freq, Complex *time) { + size_t size = working.size(); + if (size <= 1) { + *time = *freq; + return; + } + fftPass(size, 1, freq, time, working.data()); + } + + void fft(const Sample *inR, const Sample *inI, Sample *outR, Sample *outI) { + size_t size = working.size(); + if (size <= 1) { + *outR = *inR; + *outI = *inI; + return; + } + Sample *workingR = (Sample *)working.data(), *workingI = workingR + size; + fftPass(size, 1, inR, inI, outR, outI, workingR, workingI); + } + void ifft(const Sample *inR, const Sample *inI, Sample *outR, Sample *outI) { + size_t size = working.size(); + if (size <= 1) { + *outR = *inR; + *outI = *inI; + return; + } + Sample *workingR = (Sample *)working.data(), *workingI = workingR + size; + fftPass(size, 1, inR, inI, outR, outI, workingR, workingI); + } +private: + std::vector twiddles; + std::vector working; + + template + static Complex mul(const Complex &a, const Complex &b) { + return conjB ? Complex{ + a.real()*b.real() + a.imag()*b.imag(), + a.imag()*b.real() - a.real()*b.imag() + } : Complex{ + a.real()*b.real() - a.imag()*b.imag(), + a.imag()*b.real() + a.real()*b.imag() + }; + } + + // Calculate a [size]-point FFT, where each element is a block of [stride] values + template + void fftPass(size_t size, size_t stride, const Complex *input, Complex *output, Complex *working) { + if (size/4 > 1) { + // Calculate four quarter-size FFTs + fftPass(size/4, stride*4, input, working, output); + combine4(size, stride, working, output); + } else if (size == 4) { + combine4(4, stride, input, output); + } else { + // 2-point FFT + for (size_t s = 0; s < stride; ++s) { + Complex a = input[s]; + Complex b = input[s + stride]; + output[s] = a + b; + output[s + stride] = a - b; + } + } + } + + // Combine interleaved results into a single spectrum + template + void combine4(size_t size, size_t stride, const Complex *input, Complex *output) const { + auto twiddleStep = working.size()/size; + for (size_t i = 0; i < size/4; ++i) { + Complex twiddleB = twiddles[i*twiddleStep]; + Complex twiddleC = twiddles[i*2*twiddleStep]; + Complex twiddleD = twiddles[i*3*twiddleStep]; + + const Complex *inputA = input + 4*i*stride; + const Complex *inputB = input + (4*i + 1)*stride; + const Complex *inputC = input + (4*i + 2)*stride; + const Complex *inputD = input + (4*i + 3)*stride; + Complex *outputA = output + i*stride; + Complex *outputB = output + (i + size/4)*stride; + Complex *outputC = output + (i + size/4*2)*stride; + Complex *outputD = output + (i + size/4*3)*stride; + for (size_t s = 0; s < stride; ++s) { + Complex a = inputA[s]; + Complex b = mul(inputB[s], twiddleB); + Complex c = mul(inputC[s], twiddleC); + Complex d = mul(inputD[s], twiddleD); + Complex ac0 = a + c, ac1 = a - c; + Complex bd0 = b + d, bd1 = inverse ? (b - d) : (d - b); + Complex bd1i = {-bd1.imag(), bd1.real()}; + outputA[s] = ac0 + bd0; + outputB[s] = ac1 + bd1i; + outputC[s] = ac0 - bd0; + outputD[s] = ac1 - bd1i; + } + } + } + + // The same thing, but translated for split-complex input/output + template + void fftPass(size_t size, size_t stride, const Sample *inputR, const Sample *inputI, Sample *outputR, Sample *outputI, Sample *workingR, Sample *workingI) const { + if (size/4 > 1) { + // Calculate four quarter-size FFTs + fftPass(size/4, stride*4, inputR, inputI, workingR, workingI, outputR, outputI); + combine4(size, stride, workingR, workingI, outputR, outputI); + } else if (size == 4) { + combine4(4, stride, inputR, inputI, outputR, outputI); + } else { + // 2-point FFT + for (size_t s = 0; s < stride; ++s) { + Sample ar = inputR[s], ai = inputI[s]; + Sample br = inputR[s + stride], bi = inputI[s + stride]; + outputR[s] = ar + br; + outputI[s] = ai + bi; + outputR[s + stride] = ar - br; + outputI[s + stride] = ai - bi; + } + } + } + + // Combine interleaved results into a single spectrum + template + void combine4(size_t size, size_t stride, const Sample *inputR, const Sample *inputI, Sample *outputR, Sample *outputI) const { + auto twiddleStep = working.size()/size; + for (size_t i = 0; i < size/4; ++i) { + Complex twiddleB = twiddles[i*twiddleStep]; + Complex twiddleC = twiddles[i*2*twiddleStep]; + Complex twiddleD = twiddles[i*3*twiddleStep]; + + const Sample *inputAr = inputR + 4*i*stride, *inputAi = inputI + 4*i*stride; + const Sample *inputBr = inputR + (4*i + 1)*stride, *inputBi = inputI + (4*i + 1)*stride; + const Sample *inputCr = inputR + (4*i + 2)*stride, *inputCi = inputI + (4*i + 2)*stride; + const Sample *inputDr = inputR + (4*i + 3)*stride, *inputDi = inputI + (4*i + 3)*stride; + Sample *outputAr = outputR + i*stride, *outputAi = outputI + i*stride; + Sample *outputBr = outputR + (i + size/4)*stride, *outputBi = outputI + (i + size/4)*stride; + Sample *outputCr = outputR + (i + size/4*2)*stride, *outputCi = outputI + (i + size/4*2)*stride; + Sample *outputDr = outputR + (i + size/4*3)*stride, *outputDi = outputI + (i + size/4*3)*stride; + for (size_t s = 0; s < stride; ++s) { + Complex a = {inputAr[s], inputAi[s]}; + Complex b = mul({inputBr[s], inputBi[s]}, twiddleB); + Complex c = mul({inputCr[s], inputCi[s]}, twiddleC); + Complex d = mul({inputDr[s], inputDi[s]}, twiddleD); + Complex ac0 = a + c, ac1 = a - c; + Complex bd0 = b + d, bd1 = inverse ? (b - d) : (d - b); + Complex bd1i = {-bd1.imag(), bd1.real()}; + outputAr[s] = ac0.real() + bd0.real(); + outputAi[s] = ac0.imag() + bd0.imag(); + outputBr[s] = ac1.real() + bd1i.real(); + outputBi[s] = ac1.imag() + bd1i.imag(); + outputCr[s] = ac0.real() - bd0.real(); + outputCi[s] = ac0.imag() - bd0.imag(); + outputDr[s] = ac1.real() - bd1i.real(); + outputDi[s] = ac1.imag() - bd1i.imag(); + } + } + } +}; + +/// A power-of-2 only FFT, specialised with platform-specific fast implementations where available +template +struct Pow2FFT { + static constexpr bool prefersSplit = true; // whether this FFT implementation is faster when given split-complex inputs + using Complex = std::complex; + + Pow2FFT(size_t size=0) { + resize(size); + } + // Allow move, but not copy + Pow2FFT(const Pow2FFT &other) = delete; + Pow2FFT(Pow2FFT &&other) : tmp(std::move(other.tmp)), simpleFFT(std::move(other.simpleFFT)) {} + + void resize(size_t size) { + simpleFFT.resize(size); + tmp.resize(size); + } + + void fft(const Complex *time, Complex *freq) { + simpleFFT.fft(time, freq); + } + void fft(const Sample *inR, const Sample *inI, Sample *outR, Sample *outI) { + simpleFFT.fft(inR, inI, outR, outI); + } + + void ifft(const Complex *freq, Complex *time) { + simpleFFT.ifft(freq, time); + } + void ifft(const Sample *inR, const Sample *inI, Sample *outR, Sample *outI) { + simpleFFT.ifft(inR, inI, outR, outI); + } + +private: + std::vector tmp; + SimpleFFT simpleFFT; +}; + +/// An FFT which can handle multiples of 3 and 5, and can be computed in chunks +template +struct SplitFFT { + using Complex = std::complex; + static constexpr bool prefersSplit = Pow2FFT::prefersSplit; + + static constexpr size_t maxSplit = splitComputation ? 4 : 1; + static constexpr size_t minInnerSize = 32; + + static size_t fastSizeAbove(size_t size) { + size_t pow2 = 1; + while (pow2 < 16 && pow2 < size) pow2 *= 2; + while (pow2*8 < size) pow2 *= 2; + size_t multiple = (size + pow2 - 1)/pow2; // will be 1-8 + if (multiple == 7) ++multiple; + return multiple*pow2; + } + + SplitFFT(size_t size=0) { + resize(size); + } + + void resize(size_t size) { + innerSize = 1; + outerSize = size; + + dftTmp.resize(0); + dftTwists.resize(0); + plan.resize(0); + if (!size) return; + + // Inner size = largest power of 2 such that either the inner size >= minInnerSize, or we have the target number of splits + while (!(outerSize&1) && (outerSize > maxSplit || innerSize < minInnerSize)) { + innerSize *= 2; + outerSize /= 2; + } + tmpFreq.resize(size); + innerFFT.resize(innerSize); + + outerTwiddles.resize(innerSize*(outerSize - 1)); + outerTwiddlesR.resize(innerSize*(outerSize - 1)); + outerTwiddlesI.resize(innerSize*(outerSize - 1)); + for (size_t i = 0; i < innerSize; ++i) { + for (size_t s = 1; s < outerSize; ++s) { + Sample twiddlePhase = Sample(-2*M_PI*i/innerSize*s/outerSize); + outerTwiddles[i + (s - 1)*innerSize] = std::polar(Sample(1), twiddlePhase); + } + } + for (size_t i = 0; i < outerTwiddles.size(); ++i) { + outerTwiddlesR[i] = outerTwiddles[i].real(); + outerTwiddlesI[i] = outerTwiddles[i].imag(); + } + + StepType interleaveStep = StepType::interleaveOrderN; + StepType finalStep = StepType::finalOrderN; + if (outerSize == 2) { + interleaveStep = StepType::interleaveOrder2; + finalStep = StepType::finalOrder2; + } + if (outerSize == 3) { + interleaveStep = StepType::interleaveOrder3; + finalStep = StepType::finalOrder3; + } + if (outerSize == 4) { + interleaveStep = StepType::interleaveOrder4; + finalStep = StepType::finalOrder4; + } + if (outerSize == 5) { + interleaveStep = StepType::interleaveOrder5; + finalStep = StepType::finalOrder5; + } + + if (outerSize <= 1) { + if (size > 0) plan.push_back(Step{StepType::passthrough, 0}); + } else { + plan.push_back({interleaveStep, 0}); + plan.push_back({StepType::firstFFT, 0}); + for (size_t s = 1; s < outerSize; ++s) { + plan.push_back({StepType::middleFFT, s*innerSize}); + } + plan.push_back({StepType::twiddles, 0}); + plan.push_back({finalStep, 0}); + + if (finalStep == StepType::finalOrderN) { + dftTmp.resize(outerSize); + dftTwists.resize(outerSize); + for (size_t s = 0; s < outerSize; ++s) { + Sample dftPhase = Sample(-2*M_PI*s/outerSize); + dftTwists[s] = std::polar(Sample(1), dftPhase); + } + } + } + } + + size_t size() const { + return innerSize*outerSize; + } + size_t steps() const { + return plan.size(); + } + + void fft(const Complex *time, Complex *freq) { + for (auto &step : plan) { + fftStep(step, time, freq); + } + } + void fft(size_t step, const Complex *time, Complex *freq) { + fftStep(plan[step], time, freq); + } + void fft(const Sample *inR, const Sample *inI, Sample *outR, Sample *outI) { + for (auto &step : plan) { + fftStep(step, inR, inI, outR, outI); + } + } + void fft(size_t step, const Sample *inR, const Sample *inI, Sample *outR, Sample *outI) { + fftStep(plan[step], inR, inI, outR, outI); + } + + void ifft(const Complex *freq, Complex *time) { + for (auto &step : plan) { + fftStep(step, freq, time); + } + } + void ifft(size_t step, const Complex *freq, Complex *time) { + fftStep(plan[step], freq, time); + } + void ifft(const Sample *inR, const Sample *inI, Sample *outR, Sample *outI) { + for (auto &step : plan) { + fftStep(step, inR, inI, outR, outI); + } + } + void ifft(size_t step, const Sample *inR, const Sample *inI, Sample *outR, Sample *outI) { + fftStep(plan[step], inR, inI, outR, outI); + } +private: + using InnerFFT = Pow2FFT; + InnerFFT innerFFT; + + size_t innerSize, outerSize; + std::vector tmpFreq; + std::vector outerTwiddles; + std::vector outerTwiddlesR, outerTwiddlesI; + std::vector dftTwists, dftTmp; + + enum class StepType { + passthrough, + interleaveOrder2, interleaveOrder3, interleaveOrder4, interleaveOrder5, interleaveOrderN, + firstFFT, middleFFT, + twiddles, + finalOrder2, finalOrder3, finalOrder4, finalOrder5, finalOrderN + }; + struct Step { + StepType type; + size_t offset; + }; + std::vector plan; + + template + void fftStep(Step step, const Complex *time, Complex *freq) { + switch (step.type) { + case (StepType::passthrough): { + if (inverse) { + innerFFT.ifft(time, freq); + } else { + innerFFT.fft(time, freq); + } + break; + } + case (StepType::interleaveOrder2): { + _impl::interleaveCopy<2>(time, tmpFreq.data(), innerSize); + break; + } + case (StepType::interleaveOrder3): { + _impl::interleaveCopy<3>(time, tmpFreq.data(), innerSize); + break; + } + case (StepType::interleaveOrder4): { + _impl::interleaveCopy<4>(time, tmpFreq.data(), innerSize); + break; + } + case (StepType::interleaveOrder5): { + _impl::interleaveCopy<5>(time, tmpFreq.data(), innerSize); + break; + } + case (StepType::interleaveOrderN): { + _impl::interleaveCopy(time, tmpFreq.data(), outerSize, innerSize); + break; + } + case (StepType::firstFFT): { + if (inverse) { + innerFFT.ifft(tmpFreq.data(), freq); + } else { + innerFFT.fft(tmpFreq.data(), freq); + } + break; + } + case (StepType::middleFFT): { + Complex *offsetOut = freq + step.offset; + if (inverse) { + innerFFT.ifft(tmpFreq.data() + step.offset, offsetOut); + } else { + innerFFT.fft(tmpFreq.data() + step.offset, offsetOut); + } + break; + } + case (StepType::twiddles): { + if (inverse) { + _impl::complexMulConj(freq + innerSize, freq + innerSize, outerTwiddles.data(), innerSize*(outerSize - 1)); + } else { + _impl::complexMul(freq + innerSize, freq + innerSize, outerTwiddles.data(), innerSize*(outerSize - 1)); + } + break; + } + case StepType::finalOrder2: + finalPass2(freq); + break; + case StepType::finalOrder3: + finalPass3(freq); + break; + case StepType::finalOrder4: + finalPass4(freq); + break; + case StepType::finalOrder5: + finalPass5(freq); + break; + case StepType::finalOrderN: + finalPassN(freq); + break; + } + } + template + void fftStep(Step step, const Sample *inR, const Sample *inI, Sample *outR, Sample *outI) { + Sample *tmpR = (Sample *)tmpFreq.data(), *tmpI = tmpR + tmpFreq.size(); + switch (step.type) { + case (StepType::passthrough): { + if (inverse) { + innerFFT.ifft(inR, inI, outR, outI); + } else { + innerFFT.fft(inR, inI, outR, outI); + } + break; + } + case (StepType::interleaveOrder2): { + _impl::interleaveCopy<2>(inR, tmpR, innerSize); + _impl::interleaveCopy<2>(inI, tmpI, innerSize); + break; + } + case (StepType::interleaveOrder3): { + _impl::interleaveCopy<3>(inR, tmpR, innerSize); + _impl::interleaveCopy<3>(inI, tmpI, innerSize); + break; + } + case (StepType::interleaveOrder4): { + _impl::interleaveCopy<4>(inR, tmpR, innerSize); + _impl::interleaveCopy<4>(inI, tmpI, innerSize); + break; + } + case (StepType::interleaveOrder5): { + _impl::interleaveCopy<5>(inR, tmpR, innerSize); + _impl::interleaveCopy<5>(inI, tmpI, innerSize); + break; + } + case (StepType::interleaveOrderN): { + _impl::interleaveCopy(inR, inI, tmpR, tmpI, outerSize, innerSize); + break; + } + case (StepType::firstFFT): { + if (inverse) { + innerFFT.ifft(tmpR, tmpI, outR, outI); + } else { + innerFFT.fft(tmpR, tmpI, outR, outI); + } + break; + } + case (StepType::middleFFT): { + size_t offset = step.offset; + Sample *offsetOutR = outR + offset; + Sample *offsetOutI = outI + offset; + if (inverse) { + innerFFT.ifft(tmpR + offset, tmpI + offset, offsetOutR, offsetOutI); + } else { + innerFFT.fft(tmpR + offset, tmpI + offset, offsetOutR, offsetOutI); + } + break; + } + case(StepType::twiddles): { + auto *twiddlesR = outerTwiddlesR.data(); + auto *twiddlesI = outerTwiddlesI.data(); + if (inverse) { + _impl::complexMulConj(outR + innerSize, outI + innerSize, outR + innerSize, outI + innerSize, twiddlesR, twiddlesI, innerSize*(outerSize - 1)); + } else { + _impl::complexMul(outR + innerSize, outI + innerSize, outR + innerSize, outI + innerSize, twiddlesR, twiddlesI, innerSize*(outerSize - 1)); + } + break; + } + case StepType::finalOrder2: + finalPass2(outR, outI); + break; + case StepType::finalOrder3: + finalPass3(outR, outI); + break; + case StepType::finalOrder4: + finalPass4(outR, outI); + break; + case StepType::finalOrder5: + finalPass5(outR, outI); + break; + case StepType::finalOrderN: + finalPassN(outR, outI); + break; + } + } + + void finalPass2(Complex *f0) { + auto *f1 = f0 + innerSize; + for (size_t i = 0; i < innerSize; ++i) { + Complex a = f0[i], b = f1[i]; + f0[i] = a + b; + f1[i] = a - b; + } + } + void finalPass2(Sample *f0r, Sample *f0i) { + auto *f1r = f0r + innerSize; + auto *f1i = f0i + innerSize; + for (size_t i = 0; i < innerSize; ++i) { + Sample ar = f0r[i], ai = f0i[i]; + Sample br = f1r[i], bi = f1i[i]; + f0r[i] = ar + br; + f0i[i] = ai + bi; + f1r[i] = ar - br; + f1i[i] = ai - bi; + } + } + template + void finalPass3(Complex *f0) { + auto *f1 = f0 + innerSize; + auto *f2 = f0 + innerSize*2; + const Complex tw1{Sample(-0.5), Sample(-std::sqrt(0.75)*(inverse ? -1 : 1))}; + for (size_t i = 0; i < innerSize; ++i) { + Complex a = f0[i], b = f1[i], c = f2[i]; + Complex bc0 = b + c, bc1 = b - c; + f0[i] = a + bc0; + f1[i] = { + a.real() + bc0.real()*tw1.real() - bc1.imag()*tw1.imag(), + a.imag() + bc0.imag()*tw1.real() + bc1.real()*tw1.imag() + }; + f2[i] = { + a.real() + bc0.real()*tw1.real() + bc1.imag()*tw1.imag(), + a.imag() + bc0.imag()*tw1.real() - bc1.real()*tw1.imag() + }; + } + } + template + void finalPass3(Sample *f0r, Sample *f0i) { + auto *f1r = f0r + innerSize; + auto *f1i = f0i + innerSize; + auto *f2r = f0r + innerSize*2; + auto *f2i = f0i + innerSize*2; + const Sample tw1r = -0.5, tw1i = -std::sqrt(0.75)*(inverse ? -1 : 1); + + for (size_t i = 0; i < innerSize; ++i) { + Sample ar = f0r[i], ai = f0i[i], br = f1r[i], bi = f1i[i], cr = f2r[i], ci = f2i[i]; + + f0r[i] = ar + br + cr; + f0i[i] = ai + bi + ci; + f1r[i] = ar + br*tw1r - bi*tw1i + cr*tw1r + ci*tw1i; + f1i[i] = ai + bi*tw1r + br*tw1i - cr*tw1i + ci*tw1r; + f2r[i] = ar + br*tw1r + bi*tw1i + cr*tw1r - ci*tw1i; + f2i[i] = ai + bi*tw1r - br*tw1i + cr*tw1i + ci*tw1r; + } + } + template + void finalPass4(Complex *f0) { + auto *f1 = f0 + innerSize; + auto *f2 = f0 + innerSize*2; + auto *f3 = f0 + innerSize*3; + for (size_t i = 0; i < innerSize; ++i) { + Complex a = f0[i], b = f1[i], c = f2[i], d = f3[i]; + + Complex ac0 = a + c, ac1 = a - c; + Complex bd0 = b + d, bd1 = inverse ? (b - d) : (d - b); + Complex bd1i = {-bd1.imag(), bd1.real()}; + f0[i] = ac0 + bd0; + f1[i] = ac1 + bd1i; + f2[i] = ac0 - bd0; + f3[i] = ac1 - bd1i; + } + } + template + void finalPass4(Sample *f0r, Sample *f0i) { + auto *f1r = f0r + innerSize; + auto *f1i = f0i + innerSize; + auto *f2r = f0r + innerSize*2; + auto *f2i = f0i + innerSize*2; + auto *f3r = f0r + innerSize*3; + auto *f3i = f0i + innerSize*3; + for (size_t i = 0; i < innerSize; ++i) { + Sample ar = f0r[i], ai = f0i[i], br = f1r[i], bi = f1i[i], cr = f2r[i], ci = f2i[i], dr = f3r[i], di = f3i[i]; + + Sample ac0r = ar + cr, ac0i = ai + ci; + Sample ac1r = ar - cr, ac1i = ai - ci; + Sample bd0r = br + dr, bd0i = bi + di; + Sample bd1r = br - dr, bd1i = bi - di; + + f0r[i] = ac0r + bd0r; + f0i[i] = ac0i + bd0i; + f1r[i] = inverse ? (ac1r - bd1i) : (ac1r + bd1i); + f1i[i] = inverse ? (ac1i + bd1r) : (ac1i - bd1r); + f2r[i] = ac0r - bd0r; + f2i[i] = ac0i - bd0i; + f3r[i] = inverse ? (ac1r + bd1i) : (ac1r - bd1i); + f3i[i] = inverse ? (ac1i - bd1r) : (ac1i + bd1r); + } + } + template + void finalPass5(Complex *f0) { + auto *f1 = f0 + innerSize; + auto *f2 = f0 + innerSize*2; + auto *f3 = f0 + innerSize*3; + auto *f4 = f0 + innerSize*4; + const Sample tw1r = 0.30901699437494745; + const Sample tw1i = -0.9510565162951535*(inverse ? -1 : 1); + const Sample tw2r = -0.8090169943749473; + const Sample tw2i = -0.5877852522924732*(inverse ? -1 : 1); + for (size_t i = 0; i < innerSize; ++i) { + Complex a = f0[i], b = f1[i], c = f2[i], d = f3[i], e = f4[i]; + + Complex be0 = b + e, be1 = {e.imag() - b.imag(), b.real() - e.real()}; // (b - e)*i + Complex cd0 = c + d, cd1 = {d.imag() - c.imag(), c.real() - d.real()}; + + Complex bcde01 = be0*tw1r + cd0*tw2r; + Complex bcde02 = be0*tw2r + cd0*tw1r; + Complex bcde11 = be1*tw1i + cd1*tw2i; + Complex bcde12 = be1*tw2i - cd1*tw1i; + + f0[i] = a + be0 + cd0; + f1[i] = a + bcde01 + bcde11; + f2[i] = a + bcde02 + bcde12; + f3[i] = a + bcde02 - bcde12; + f4[i] = a + bcde01 - bcde11; + } + } + template + void finalPass5(Sample *f0r, Sample *f0i) { + auto *f1r = f0r + innerSize; + auto *f1i = f0i + innerSize; + auto *f2r = f0r + innerSize*2; + auto *f2i = f0i + innerSize*2; + auto *f3r = f0r + innerSize*3; + auto *f3i = f0i + innerSize*3; + auto *f4r = f0r + innerSize*4; + auto *f4i = f0i + innerSize*4; + + const Sample tw1r = 0.30901699437494745; + const Sample tw1i = -0.9510565162951535*(inverse ? -1 : 1); + const Sample tw2r = -0.8090169943749473; + const Sample tw2i = -0.5877852522924732*(inverse ? -1 : 1); + for (size_t i = 0; i < innerSize; ++i) { + Sample ar = f0r[i], ai = f0i[i], br = f1r[i], bi = f1i[i], cr = f2r[i], ci = f2i[i], dr = f3r[i], di = f3i[i], er = f4r[i], ei = f4i[i]; + + Sample be0r = br + er, be0i = bi + ei; + Sample be1r = ei - bi, be1i = br - er; + Sample cd0r = cr + dr, cd0i = ci + di; + Sample cd1r = di - ci, cd1i = cr - dr; + + Sample bcde01r = be0r*tw1r + cd0r*tw2r, bcde01i = be0i*tw1r + cd0i*tw2r; + Sample bcde02r = be0r*tw2r + cd0r*tw1r, bcde02i = be0i*tw2r + cd0i*tw1r; + Sample bcde11r = be1r*tw1i + cd1r*tw2i, bcde11i = be1i*tw1i + cd1i*tw2i; + Sample bcde12r = be1r*tw2i - cd1r*tw1i, bcde12i = be1i*tw2i - cd1i*tw1i; + + f0r[i] = ar + be0r + cd0r; + f0i[i] = ai + be0i + cd0i; + f1r[i] = ar + bcde01r + bcde11r; + f1i[i] = ai + bcde01i + bcde11i; + f2r[i] = ar + bcde02r + bcde12r; + f2i[i] = ai + bcde02i + bcde12i; + f3r[i] = ar + bcde02r - bcde12r; + f3i[i] = ai + bcde02i - bcde12i; + f4r[i] = ar + bcde01r - bcde11r; + f4i[i] = ai + bcde01i - bcde11i; + } + } + + template + void finalPassN(Complex *f0) { + for (size_t i = 0; i < innerSize; ++i) { + Complex *offsetFreq = f0 + i; + Complex sum = 0; + for (size_t i2 = 0; i2 < outerSize; ++i2) { + sum += (dftTmp[i2] = offsetFreq[i2*innerSize]); + } + offsetFreq[0] = sum; + + for (size_t f = 1; f < outerSize; ++f) { + Complex sum = dftTmp[0]; + + for (size_t i2 = 1; i2 < outerSize; ++i2) { + size_t twistIndex = (i2*f)%outerSize; + Complex twist = inverse ? std::conj(dftTwists[twistIndex]) : dftTwists[twistIndex]; + sum += Complex{ + dftTmp[i2].real()*twist.real() - dftTmp[i2].imag()*twist.imag(), + dftTmp[i2].imag()*twist.real() + dftTmp[i2].real()*twist.imag() + }; + } + + offsetFreq[f*innerSize] = sum; + } + } + } + template + void finalPassN(Sample *f0r, Sample *f0i) { + Sample *tmpR = (Sample *)dftTmp.data(), *tmpI = tmpR + outerSize; + + for (size_t i = 0; i < innerSize; ++i) { + Sample *offsetR = f0r + i; + Sample *offsetI = f0i + i; + Sample sumR = 0, sumI = 0; + for (size_t i2 = 0; i2 < outerSize; ++i2) { + sumR += (tmpR[i2] = offsetR[i2*innerSize]); + sumI += (tmpI[i2] = offsetI[i2*innerSize]); + } + offsetR[0] = sumR; + offsetI[0] = sumI; + + for (size_t f = 1; f < outerSize; ++f) { + Sample sumR = *tmpR, sumI = *tmpI; + + for (size_t i2 = 1; i2 < outerSize; ++i2) { + size_t twistIndex = (i2*f)%outerSize; + Complex twist = inverse ? std::conj(dftTwists[twistIndex]) : dftTwists[twistIndex]; + sumR += tmpR[i2]*twist.real() - tmpI[i2]*twist.imag(); + sumI += tmpI[i2]*twist.real() + tmpR[i2]*twist.imag(); + } + + offsetR[f*innerSize] = sumR; + offsetI[f*innerSize] = sumI; + } + } + } +}; + +template +using FFT = SplitFFT; + +// Wraps a complex FFT into a real one +template> +struct SimpleRealFFT { + using Complex = std::complex; + + static constexpr bool prefersSplit = ComplexFFT::prefersSplit; + + SimpleRealFFT(size_t size=0) { + resize(size); + } + + void resize(size_t size) { + complexFft.resize(size); + tmpTime.resize(size); + tmpFreq.resize(size); + } + + void fft(const Sample *time, Complex *freq) { + for (size_t i = 0; i < tmpTime.size(); ++i) { + tmpTime[i] = time[i]; + } + complexFft.fft(tmpTime.data(), tmpFreq.data()); + for (size_t i = 0; i < tmpFreq.size()/2; ++i) { + freq[i] = tmpFreq[i]; + } + freq[0] = { + tmpFreq[0].real(), + tmpFreq[tmpFreq.size()/2].real() + }; + } + void fft(const Sample *inR, Sample *outR, Sample *outI) { + Sample *tmpFreqR = (Sample *)tmpFreq.data(), *tmpFreqI = tmpFreqR + tmpFreq.size(); + for (size_t i = 0; i < tmpTime.size()/2; ++i) { + tmpTime[i] = 0; + } + complexFft.fft(inR, (const Sample *)tmpTime.data(), tmpFreqR, tmpFreqI); + for (size_t i = 0; i < tmpTime.size()/2; ++i) { + outR[i] = tmpFreqR[i]; + outI[i] = tmpFreqI[i]; + } + outI[0] = tmpFreqR[tmpFreq.size()/2]; + } + + void ifft(const Complex *freq, Sample *time) { + tmpFreq[0] = freq[0].real(); + tmpFreq[tmpFreq.size()/2] = freq[0].imag(); + for (size_t i = 1; i < tmpFreq.size()/2; ++i) { + tmpFreq[i] = freq[i]; + tmpFreq[tmpFreq.size() - i] = std::conj(freq[i]); + } + complexFft.ifft(tmpFreq.data(), tmpTime.data()); + for (size_t i = 0; i < tmpTime.size(); ++i) { + time[i] = tmpTime[i].real(); + } + } + void ifft(const Sample *inR, const Sample *inI, Sample *outR) { + Sample *tmpFreqR = (Sample *)tmpFreq.data(), *tmpFreqI = tmpFreqR + tmpFreq.size(); + tmpFreqR[0] = inR[0]; + tmpFreqR[tmpFreq.size()/2] = inI[0]; + tmpFreqI[0] = 0; + tmpFreqI[tmpFreq.size()/2] = 0; + for (size_t i = 1; i < tmpFreq.size()/2; ++i) { + tmpFreqR[i] = inR[i]; + tmpFreqI[i] = inI[i]; + tmpFreqR[tmpFreq.size() - i] = inR[i]; + tmpFreqI[tmpFreq.size() - i] = -inI[i]; + } + complexFft.ifft(tmpFreqR, tmpFreqI, outR, (Sample *)tmpTime.data()); + } + +private: + ComplexFFT complexFft; + std::vector tmpTime, tmpFreq; +}; + +/// A default power-of-2 FFT, specialised with platform-specific fast implementations where available +template +struct Pow2RealFFT : public SimpleRealFFT { + static constexpr bool prefersSplit = SimpleRealFFT::prefersSplit; + + using SimpleRealFFT::SimpleRealFFT; + + // Prevent copying, since it might be a problem for specialisations + Pow2RealFFT(const Pow2RealFFT &other) = delete; + // Pass move-constructor through, just to be explicit about it + Pow2RealFFT(Pow2RealFFT &&other) : SimpleRealFFT(std::move(other)) {} +}; + +/// A Real FFT which can handle multiples of 3 and 5, and can be computed in chunks +template +struct RealFFT { + using Complex = std::complex; + static constexpr bool prefersSplit = SplitFFT::prefersSplit; + + static size_t fastSizeAbove(size_t size) { + return ComplexFFT::fastSizeAbove((size + 1)/2)*2; + } + + RealFFT(size_t size=0) { + resize(size); + } + + void resize(size_t size) { + size_t hSize = size/2; + complexFft.resize(hSize); + tmpFreq.resize(hSize); + tmpTime.resize(hSize); + + twiddles.resize(hSize/2 + 1); + + if (!halfBinShift) { + for (size_t i = 0; i < twiddles.size(); ++i) { + Sample rotPhase = i*(-2*M_PI/size) - M_PI/2; // bake rotation by (-i) into twiddles + twiddles[i] = std::polar(Sample(1), rotPhase); + } + } else { + for (size_t i = 0; i < twiddles.size(); ++i) { + Sample rotPhase = (i + 0.5)*(-2*M_PI/size) - M_PI/2; + twiddles[i] = std::polar(Sample(1), rotPhase); + } + + halfBinTwists.resize(hSize); + for (size_t i = 0; i < hSize; ++i) { + Sample twistPhase = -2*M_PI*i/size; + halfBinTwists[i] = std::polar(Sample(1), twistPhase); + } + } + } + + size_t size() const { + return complexFft.size()*2; + } + size_t steps() const { + return complexFft.steps() + (splitComputation ? 3 : 2); + } + + void fft(const Sample *time, Complex *freq) { + for (size_t s = 0; s < steps(); ++s) { + fft(s, time, freq); + } + } + void fft(size_t step, const Sample *time, Complex *freq) { + if (complexPrefersSplit) { + size_t hSize = complexFft.size(); + Sample *tmpTimeR = (Sample *)tmpTime.data(), *tmpTimeI = tmpTimeR + hSize; + Sample *tmpFreqR = (Sample *)tmpFreq.data(), *tmpFreqI = tmpFreqR + hSize; + if (step-- == 0) { + size_t hSize = complexFft.size(); + if (halfBinShift) { + for (size_t i = 0; i < hSize; ++i) { + Sample tr = time[2*i], ti = time[2*i + 1]; + Complex twist = halfBinTwists[i]; + tmpTimeR[i] = tr*twist.real() - ti*twist.imag(); + tmpTimeI[i] = ti*twist.real() + tr*twist.imag(); + } + } else { + for (size_t i = 0; i < hSize; ++i) { + tmpTimeR[i] = time[2*i]; + tmpTimeI[i] = time[2*i + 1]; + } + } + } else if (step < complexFft.steps()) { + complexFft.fft(step, tmpTimeR, tmpTimeI, tmpFreqR, tmpFreqI); + } else { + if (!halfBinShift) { + Sample bin0r = tmpFreqR[0], bin0i = tmpFreqI[0]; + freq[0] = {bin0r + bin0i, bin0r - bin0i}; + } + + size_t startI = halfBinShift ? 0 : 1; + size_t endI = hSize/2 + 1; + if (splitComputation) { // Do this last twiddle in two halves + if (step == complexFft.steps()) { + endI = (startI + endI)/2; + } else { + startI = (startI + endI)/2; + } + } + for (size_t i = startI; i < endI; ++i) { + size_t conjI = halfBinShift ? (hSize - 1 - i) : (hSize - i); + Complex twiddle = twiddles[i]; + + Sample oddR = (tmpFreqR[i] + tmpFreqR[conjI])*Sample(0.5); + Sample oddI = (tmpFreqI[i] - tmpFreqI[conjI])*Sample(0.5); + Sample evenIR = (tmpFreqR[i] - tmpFreqR[conjI])*Sample(0.5); + Sample evenII = (tmpFreqI[i] + tmpFreqI[conjI])*Sample(0.5); + Sample evenRotMinusIR = evenIR*twiddle.real() - evenII*twiddle.imag(); + Sample evenRotMinusII = evenII*twiddle.real() + evenIR*twiddle.imag(); + + freq[i] = {oddR + evenRotMinusIR, oddI + evenRotMinusII}; + freq[conjI] = {oddR - evenRotMinusIR, evenRotMinusII - oddI}; + } + } + } else { + bool canUseTime = !halfBinShift && !(size_t(time)%alignof(Complex)); + if (step-- == 0) { + size_t hSize = complexFft.size(); + if (halfBinShift) { + for (size_t i = 0; i < hSize; ++i) { + Sample tr = time[2*i], ti = time[2*i + 1]; + Complex twist = halfBinTwists[i]; + tmpTime[i] = { + tr*twist.real() - ti*twist.imag(), + ti*twist.real() + tr*twist.imag() + }; + } + } else if (!canUseTime) { + std::memcpy(tmpTime.data(), time, sizeof(Complex)*hSize); + } + } else if (step < complexFft.steps()) { + complexFft.fft(step, canUseTime ? (const Complex *)time : tmpTime.data(), tmpFreq.data()); + } else { + if (!halfBinShift) { + Complex bin0 = tmpFreq[0]; + freq[0] = { // pack DC & Nyquist together + bin0.real() + bin0.imag(), + bin0.real() - bin0.imag() + }; + } + + size_t hSize = complexFft.size(); + size_t startI = halfBinShift ? 0 : 1; + size_t endI = hSize/2 + 1; + if (splitComputation) { // Do this last twiddle in two halves + if (step == complexFft.steps()) { + endI = (startI + endI)/2; + } else { + startI = (startI + endI)/2; + } + } + for (size_t i = startI; i < endI; ++i) { + size_t conjI = halfBinShift ? (hSize - 1 - i) : (hSize - i); + Complex twiddle = twiddles[i]; + + Complex odd = (tmpFreq[i] + std::conj(tmpFreq[conjI]))*Sample(0.5); + Complex evenI = (tmpFreq[i] - std::conj(tmpFreq[conjI]))*Sample(0.5); + Complex evenRotMinusI = { // twiddle includes a factor of -i + evenI.real()*twiddle.real() - evenI.imag()*twiddle.imag(), + evenI.imag()*twiddle.real() + evenI.real()*twiddle.imag() + }; + + freq[i] = odd + evenRotMinusI; + freq[conjI] = {odd.real() - evenRotMinusI.real(), evenRotMinusI.imag() - odd.imag()}; + } + } + } + } + void fft(const Sample *inR, Sample *outR, Sample *outI) { + for (size_t s = 0; s < steps(); ++s) { + fft(s, inR, outR, outI); + } + } + void fft(size_t step, const Sample *inR, Sample *outR, Sample *outI) { + size_t hSize = complexFft.size(); + Sample *tmpTimeR = (Sample *)tmpTime.data(), *tmpTimeI = tmpTimeR + hSize; + Sample *tmpFreqR = (Sample *)tmpFreq.data(), *tmpFreqI = tmpFreqR + hSize; + if (step-- == 0) { + size_t hSize = complexFft.size(); + if (halfBinShift) { + for (size_t i = 0; i < hSize; ++i) { + Sample tr = inR[2*i], ti = inR[2*i + 1]; + Complex twist = halfBinTwists[i]; + tmpTimeR[i] = tr*twist.real() - ti*twist.imag(); + tmpTimeI[i] = ti*twist.real() + tr*twist.imag(); + } + } else { + for (size_t i = 0; i < hSize; ++i) { + tmpTimeR[i] = inR[2*i]; + tmpTimeI[i] = inR[2*i + 1]; + } + } + } else if (step < complexFft.steps()) { + complexFft.fft(step, tmpTimeR, tmpTimeI, tmpFreqR, tmpFreqI); + } else { + if (!halfBinShift) { + Sample bin0r = tmpFreqR[0], bin0i = tmpFreqI[0]; + outR[0] = bin0r + bin0i; + outI[0] = bin0r - bin0i; + } + + size_t startI = halfBinShift ? 0 : 1; + size_t endI = hSize/2 + 1; + if (splitComputation) { // Do this last twiddle in two halves + if (step == complexFft.steps()) { + endI = (startI + endI)/2; + } else { + startI = (startI + endI)/2; + } + } + for (size_t i = startI; i < endI; ++i) { + size_t conjI = halfBinShift ? (hSize - 1 - i) : (hSize - i); + Complex twiddle = twiddles[i]; + + Sample oddR = (tmpFreqR[i] + tmpFreqR[conjI])*Sample(0.5); + Sample oddI = (tmpFreqI[i] - tmpFreqI[conjI])*Sample(0.5); + Sample evenIR = (tmpFreqR[i] - tmpFreqR[conjI])*Sample(0.5); + Sample evenII = (tmpFreqI[i] + tmpFreqI[conjI])*Sample(0.5); + Sample evenRotMinusIR = evenIR*twiddle.real() - evenII*twiddle.imag(); + Sample evenRotMinusII = evenII*twiddle.real() + evenIR*twiddle.imag(); + + outR[i] = oddR + evenRotMinusIR; + outI[i] = oddI + evenRotMinusII; + outR[conjI] = oddR - evenRotMinusIR; + outI[conjI] = evenRotMinusII - oddI; + } + } + } + + void ifft(const Complex *freq, Sample *time) { + for (size_t s = 0; s < steps(); ++s) { + ifft(s, freq, time); + } + } + void ifft(size_t step, const Complex *freq, Sample *time) { + if (complexPrefersSplit) { + size_t hSize = complexFft.size(); + Sample *tmpTimeR = (Sample *)tmpTime.data(), *tmpTimeI = tmpTimeR + hSize; + Sample *tmpFreqR = (Sample *)tmpFreq.data(), *tmpFreqI = tmpFreqR + hSize; + + bool splitFirst = splitComputation && (step-- == 0); + if (splitFirst || step-- == 0) { + Complex bin0 = freq[0]; + if (!halfBinShift) { + tmpFreqR[0] = bin0.real() + bin0.imag(); + tmpFreqI[0] = bin0.real() - bin0.imag(); + } + size_t startI = halfBinShift ? 0 : 1; + size_t endI = hSize/2 + 1; + if (splitComputation) { // Do this first twiddle in two halves + if (splitFirst) { + endI = (startI + endI)/2; + } else { + startI = (startI + endI)/2; + } + } + for (size_t i = startI; i < endI; ++i) { + size_t conjI = halfBinShift ? (hSize - 1 - i) : (hSize - i); + Complex twiddle = twiddles[i]; + + Complex odd = freq[i] + std::conj(freq[conjI]); + Complex evenRotMinusI = freq[i] - std::conj(freq[conjI]); + Complex evenI = { // Conjugate twiddle + evenRotMinusI.real()*twiddle.real() + evenRotMinusI.imag()*twiddle.imag(), + evenRotMinusI.imag()*twiddle.real() - evenRotMinusI.real()*twiddle.imag() + }; + + tmpFreqR[i] = odd.real() + evenI.real(); + tmpFreqI[i] = odd.imag() + evenI.imag(); + tmpFreqR[conjI] = odd.real() - evenI.real(); + tmpFreqI[conjI] = evenI.imag() - odd.imag(); + } + } else if (step < complexFft.steps()) { + complexFft.ifft(step, tmpFreqR, tmpFreqI, tmpTimeR, tmpTimeI); + } else { + size_t hSize = complexFft.size(); + if (halfBinShift) { + for (size_t i = 0; i < hSize; ++i) { + Sample tr = tmpTimeR[i], ti = tmpTimeI[i]; + Complex twist = halfBinTwists[i]; + time[2*i] = tr*twist.real() + ti*twist.imag(); + time[2*i + 1] = ti*twist.real() - tr*twist.imag(); + } + } else { + for (size_t i = 0; i < hSize; ++i) { + time[2*i] = tmpTimeR[i]; + time[2*i + 1] = tmpTimeI[i]; + } + } + } + } else { + bool canUseTime = !halfBinShift && !(size_t(time)%alignof(Complex)); + bool splitFirst = splitComputation && (step-- == 0); + if (splitFirst || step-- == 0) { + Complex bin0 = freq[0]; + if (!halfBinShift) { + tmpFreq[0] = { + bin0.real() + bin0.imag(), + bin0.real() - bin0.imag() + }; + } + size_t hSize = complexFft.size(); + size_t startI = halfBinShift ? 0 : 1; + size_t endI = hSize/2 + 1; + if (splitComputation) { // Do this first twiddle in two halves + if (splitFirst) { + endI = (startI + endI)/2; + } else { + startI = (startI + endI)/2; + } + } + for (size_t i = startI; i < endI; ++i) { + size_t conjI = halfBinShift ? (hSize - 1 - i) : (hSize - i); + Complex twiddle = twiddles[i]; + + Complex odd = freq[i] + std::conj(freq[conjI]); + Complex evenRotMinusI = freq[i] - std::conj(freq[conjI]); + Complex evenI = { // Conjugate twiddle + evenRotMinusI.real()*twiddle.real() + evenRotMinusI.imag()*twiddle.imag(), + evenRotMinusI.imag()*twiddle.real() - evenRotMinusI.real()*twiddle.imag() + }; + + tmpFreq[i] = odd + evenI; + tmpFreq[conjI] = {odd.real() - evenI.real(), evenI.imag() - odd.imag()}; + } + } else if (step < complexFft.steps()) { + // Can't just use time as (Complex *), since it might not be aligned properly + complexFft.ifft(step, tmpFreq.data(), canUseTime ? (Complex *)time : tmpTime.data()); + } else { + size_t hSize = complexFft.size(); + if (halfBinShift) { + for (size_t i = 0; i < hSize; ++i) { + Complex t = tmpTime[i]; + Complex twist = halfBinTwists[i]; + time[2*i] = t.real()*twist.real() + t.imag()*twist.imag(); + time[2*i + 1] = t.imag()*twist.real() - t.real()*twist.imag(); + } + } else if (!canUseTime) { + std::memcpy(time, tmpTime.data(), sizeof(Complex)*hSize); + } + } + } + } + void ifft(const Sample *inR, const Sample *inI, Sample *outR) { + for (size_t s = 0; s < steps(); ++s) { + ifft(s, inR, inI, outR); + } + } + void ifft(size_t step, const Sample *inR, const Sample *inI, Sample *outR) { + size_t hSize = complexFft.size(); + Sample *tmpTimeR = (Sample *)tmpTime.data(), *tmpTimeI = tmpTimeR + hSize; + Sample *tmpFreqR = (Sample *)tmpFreq.data(), *tmpFreqI = tmpFreqR + hSize; + + bool splitFirst = splitComputation && (step-- == 0); + if (splitFirst || step-- == 0) { + Sample bin0r = inR[0], bin0i = inI[0]; + if (!halfBinShift) { + tmpFreqR[0] = bin0r + bin0i; + tmpFreqI[0] = bin0r - bin0i; + } + size_t startI = halfBinShift ? 0 : 1; + size_t endI = hSize/2 + 1; + if (splitComputation) { // Do this first twiddle in two halves + if (splitFirst) { + endI = (startI + endI)/2; + } else { + startI = (startI + endI)/2; + } + } + for (size_t i = startI; i < endI; ++i) { + size_t conjI = halfBinShift ? (hSize - 1 - i) : (hSize - i); + Complex twiddle = twiddles[i]; + Sample fir = inR[i], fii = inI[i]; + Sample fcir = inR[conjI], fcii = inI[conjI]; + + Complex odd = {fir + fcir, fii - fcii}; + Complex evenRotMinusI = {fir - fcir, fii + fcii}; + Complex evenI = { // Conjugate twiddle + evenRotMinusI.real()*twiddle.real() + evenRotMinusI.imag()*twiddle.imag(), + evenRotMinusI.imag()*twiddle.real() - evenRotMinusI.real()*twiddle.imag() + }; + + tmpFreqR[i] = odd.real() + evenI.real(); + tmpFreqI[i] = odd.imag() + evenI.imag(); + tmpFreqR[conjI] = odd.real() - evenI.real(); + tmpFreqI[conjI] = evenI.imag() - odd.imag(); + } + } else if (step < complexFft.steps()) { + // Can't just use time as (Complex *), since it might not be aligned properly + complexFft.ifft(step, tmpFreqR, tmpFreqI, tmpTimeR, tmpTimeI); + } else { + if (halfBinShift) { + for (size_t i = 0; i < hSize; ++i) { + Sample tr = tmpTimeR[i], ti = tmpTimeI[i]; + Complex twist = halfBinTwists[i]; + outR[2*i] = tr*twist.real() + ti*twist.imag(); + outR[2*i + 1] = ti*twist.real() - tr*twist.imag(); + } + } else { + for (size_t i = 0; i < hSize; ++i) { + outR[2*i] = tmpTimeR[i]; + outR[2*i + 1] = tmpTimeI[i]; + } + } + } + } +private: + using ComplexFFT = SplitFFT; + ComplexFFT complexFft; + + static constexpr bool complexPrefersSplit = ComplexFFT::prefersSplit; + std::vector tmpFreq, tmpTime; + std::vector twiddles, halfBinTwists; +}; + +template +using ModifiedRealFFT = RealFFT; + +}} // namespace + +// Override `Pow2FFT` / `Pow2RealFFT` templates with faster implementations +#if defined(SIGNALSMITH_USE_PFFFT) || defined(SIGNALSMITH_USE_PFFFT_DOUBLE) +# if defined(SIGNALSMITH_USE_PFFFT) +# include "./platform/fft-pffft.h" +# endif +# if defined(SIGNALSMITH_USE_PFFFT_DOUBLE) +# include "./platform/fft-pffft-double.h" +# endif +#elif defined(SIGNALSMITH_USE_ACCELERATE) +# include "./platform/fft-accelerate.h" +#elif defined(SIGNALSMITH_USE_IPP) +# include "./platform/fft-ipp.h" +#endif + +#endif // include guard diff --git a/thirdparty/signalsmith-linear/include/signalsmith-linear/fft.h b/thirdparty/signalsmith-linear/include/signalsmith-linear/fft.h new file mode 100644 index 0000000000..661709d42b --- /dev/null +++ b/thirdparty/signalsmith-linear/include/signalsmith-linear/fft.h @@ -0,0 +1 @@ +#include "../../fft.h" diff --git a/thirdparty/signalsmith-linear/include/signalsmith-linear/linear.h b/thirdparty/signalsmith-linear/include/signalsmith-linear/linear.h new file mode 100644 index 0000000000..2b276f5cc9 --- /dev/null +++ b/thirdparty/signalsmith-linear/include/signalsmith-linear/linear.h @@ -0,0 +1 @@ +#include "../../linear.h" diff --git a/thirdparty/signalsmith-linear/include/signalsmith-linear/stft.h b/thirdparty/signalsmith-linear/include/signalsmith-linear/stft.h new file mode 100644 index 0000000000..9665aee3ab --- /dev/null +++ b/thirdparty/signalsmith-linear/include/signalsmith-linear/stft.h @@ -0,0 +1 @@ +#include "../../stft.h" diff --git a/thirdparty/signalsmith-linear/linear.h b/thirdparty/signalsmith-linear/linear.h new file mode 100644 index 0000000000..9097ac7458 --- /dev/null +++ b/thirdparty/signalsmith-linear/linear.h @@ -0,0 +1,1075 @@ +#ifndef SIGNALSMITH_AUDIO_LINEAR_H +#define SIGNALSMITH_AUDIO_LINEAR_H + +#if defined(__FAST_MATH__) && (__apple_build_version__ >= 16000000) && (__apple_build_version__ <= 16000099) && !defined(SIGNALSMITH_IGNORE_BROKEN_APPLECLANG) +# error Apple Clang 16.0.0 generates incorrect SIMD for ARM. If you HAVE to use this version of Clang, turn off -ffast-math. +#endif + +#ifndef M_PI +# define M_PI 3.14159265358979323846 +#endif + +#include +#include +#include +#include +#include +#include +#include + +namespace signalsmith { namespace linear { + +template +using ConstRealPointer = const V *; +template +using RealPointer = V *; + +template +using ConstComplexPointer = const std::complex *; +template +using ComplexPointer = std::complex *; + +template +struct ConstSplitPointer { + ConstRealPointer real, imag; + ConstSplitPointer(ConstRealPointer real, ConstRealPointer imag) : real(real), imag(imag) {} + + // Array-like access for convenience + const std::complex operator[](std::ptrdiff_t i) const { + return {real[i], imag[i]}; + } +}; + +template +struct SplitPointer { + using Complex = std::complex; + + RealPointer real, imag; + SplitPointer(RealPointer real, RealPointer imag) : real(real), imag(imag) {} + operator ConstSplitPointer() { + return {real, imag}; + } + + // Array-like access for convenience + const Complex operator[](std::ptrdiff_t i) const { + return {real[i], imag[i]}; + } + + // Assignable (if not const) and converts to `std::complex` + struct Value : public Complex { + Value(V &_real, V &_imag) : Complex(_real, _imag), _real(_real), _imag(_imag) {} + + V real() const { + return _real; + } + void real(V v) { + _real = v; + Complex::real(v); + } + V imag() const { + return _imag; + } + void imag(V v) { + _imag = v; + Complex::imag(v); + } + +#define LINEAR_SPLIT_POINTER_ASSIGNMENT_OP(OP) \ + template \ + Value & operator OP(Other &&v) { \ + std::complex::operator OP(std::forward(v)); \ + _real = v.real(); \ + _imag = v.imag(); \ + return *this; \ + } + LINEAR_SPLIT_POINTER_ASSIGNMENT_OP(=); + LINEAR_SPLIT_POINTER_ASSIGNMENT_OP(+=); + LINEAR_SPLIT_POINTER_ASSIGNMENT_OP(-=); + LINEAR_SPLIT_POINTER_ASSIGNMENT_OP(*=); + LINEAR_SPLIT_POINTER_ASSIGNMENT_OP(/=); +#undef LINEAR_SPLIT_POINTER_ASSIGNMENT_OP + + private: + V &_real; + V &_imag; + }; + + Value operator[](std::ptrdiff_t i) { + return {real[i], imag[i]}; + } +}; + +template +struct LinearImpl; +using Linear = LinearImpl; + +// Everything we deal with is actually one of these +template +struct Expression; +template +struct WritableExpression; + +#define EXPRESSION_NAME(Class, nameExpr) \ + static std::string name() {\ + return nameExpr; \ + } + +//#undef EXPRESSION_NAME +//#include +//#define EXPRESSION_NAME(Class, nameExpr) \ +// static std::string name() { \ +// return typeid(Class).name(); \ +// } + +// Expression templates, which always hold const pointers +namespace expression { + // All base Exprs inherit from this, so we can SFINAE-test for them + struct Base {}; + inline void mustBeExpr(const Base &) {} + + template + struct ConstantExpr : public Base { + EXPRESSION_NAME(Constant, "V"); + V value; + + static_assert(std::is_trivially_copyable::value, "ConstantExpr values must be trivially copyable"); + + ConstantExpr(V value) : value(value) {} + + const V get(std::ptrdiff_t) const { + return value; + } + }; + + template + using Arithmetic = decltype(std::declval() + std::declval()); + + // If the constant is one of our assignable complex proxies, use the basic one instead + template<> + struct ConstantExpr::Value> : public ConstantExpr> { + using ConstantExpr>::ConstantExpr; + }; + template<> + struct ConstantExpr::Value> : public ConstantExpr> { + using ConstantExpr>::ConstantExpr; + }; + + template + struct ExprTest { + using Constant = ConstantExpr>; + + static_assert(std::is_trivially_copyable::value, "ConstantExpr must be trivially copyable"); + + static Constant wrap(const V &v) { + return {v}; + } + }; + template + struct ExprTest()))> { + static Expr wrap(const Expr &expr) { + return expr; + } + }; + // Constant class, only defined for non-Expr types + template + using Constant = typename ExprTest::Constant; + + template + auto ensureExpr(const Expr &expr) -> decltype(ExprTest::wrap(expr)) { + return ExprTest::wrap(expr); + }; + + // Remove `Expression<>` or `WritableExpression<>` layers + template + E unwrapped(E e) { + return e; + } + template + E unwrapped(Expression e) { + return e; + } + template + E unwrapped(WritableExpression e) { + return e; + } + template + using Unwrapped = decltype(unwrapped(std::declval())); + + // Expressions that just read from a pointer + template + struct ReadableReal : public Base { + EXPRESSION_NAME(ReadableReal, "const V*"); + ConstRealPointer pointer; + + ReadableReal(ConstRealPointer pointer) : pointer(pointer) {} + + V get(std::ptrdiff_t i) const { + return pointer[i]; + } + }; + template + struct ReadableComplex : public Base { + EXPRESSION_NAME(ReadableComplex, "const VC*"); + ConstComplexPointer pointer; + + ReadableComplex(ConstComplexPointer pointer) : pointer(pointer) {} + + std::complex get(std::ptrdiff_t i) const { + return pointer[i]; + } + }; + template + struct ReadableSplit : public Base { + EXPRESSION_NAME(ReadableSplit, "const VS*"); + ConstSplitPointer pointer; + + ReadableSplit(ConstSplitPointer pointer) : pointer(pointer) {} + + std::complex get(std::ptrdiff_t i) const { + return {pointer.real[i], pointer.imag[i]}; + } + }; +} + +// + - * / % ^ & | ~ ! = < > += -= *= /= %= ^= &= |= << >> >>= <<= == != <= >= <=>(since C++20) && || ++ -- , ->* -> ( ) [ ] + +#define SIGNALSMITH_AUDIO_LINEAR_UNARY_PREFIX(Name, OP) \ +namespace expression { \ + template \ + struct Name : public Base { \ + EXPRESSION_NAME(Name, (#Name "<") + A::name() + ">"); \ + A a; \ + Name(const A &a) : a(a) {} \ + auto get(std::ptrdiff_t i) const -> decltype(OP a.get(i)) const { \ + return OP a.get(i); \ + } \ + }; \ + template \ + Name> make##Name(A a) { \ + return {a}; \ + } \ +} \ +template \ +Expression> operator OP(const Expression &a) { \ + return {a}; \ +} +SIGNALSMITH_AUDIO_LINEAR_UNARY_PREFIX(Neg, -) +// Two negatives cancel out +template +Expression operator-(const Expression> &expr) { \ + return expr.a; +} +#undef SIGNALSMITH_AUDIO_LINEAR_UNARY_PREFIX + +#define SIGNALSMITH_AUDIO_LINEAR_BINARY_INFIX(Name, OP) \ +namespace expression { \ + template \ + struct Name : public Base { \ + EXPRESSION_NAME(Name, (#Name "<") + A::name() + "," + B::name() + ">"); \ + A a; \ + B b; \ + Name(const A &a, const B &b) : a(a), b(b) {} \ + auto get(std::ptrdiff_t i) const -> decltype(a.get(i) OP b.get(i)) const { \ + return a.get(i) OP b.get(i); \ + } \ + }; \ + template \ + Name, Unwrapped> make##Name(A a, B b) { \ + return {a, b}; \ + } \ +} \ +template \ +const Expression> operator OP(const Expression &a, const Expression &b) { \ + return {a, b}; \ +} \ +template \ +const Expression>> operator OP(const Expression &a, const B &b) { \ + return {a, b}; \ +} \ +template \ +const Expression, B>> operator OP(const A &a, const Expression &b) { \ + return {a, b}; \ +} +SIGNALSMITH_AUDIO_LINEAR_BINARY_INFIX(Add, +) +SIGNALSMITH_AUDIO_LINEAR_BINARY_INFIX(Mul, *) +SIGNALSMITH_AUDIO_LINEAR_BINARY_INFIX(Sub, -) +SIGNALSMITH_AUDIO_LINEAR_BINARY_INFIX(Div, /) +#undef SIGNALSMITH_AUDIO_LINEAR_BINARY_INFIX + +namespace expression { +#define SIGNALSMITH_AUDIO_LINEAR_FUNC1(Name, func) \ + template \ + struct Name; \ + template \ + Name> make##Name(A a) { \ + return {a}; \ + } \ + template \ + struct Name : public Base { \ + EXPRESSION_NAME(Name, (#Name "<") + A::name() + ">"); \ + A a; \ + Name(const A &a) : a(a) {} \ + auto get(std::ptrdiff_t i) const -> decltype(func(a.get(i))) { \ + return func(a.get(i)); \ + } \ + }; + + template + A fastAbs(const A &a) { + return std::abs(a); + } + template + A fastAbs(const std::complex &a) { + return std::hypot(a.real(), a.imag()); + } + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Abs, fastAbs) + + template + A fastNorm(const A &a) { + return a*a; + } + template + A fastNorm(const std::complex &a) { + A real = a.real(), imag = a.imag(); + return real*real + imag*imag; + } + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Norm, fastNorm) + + // Single-argument functions + // Abs, Norm, Exp, Exp2, Log, Log2, Log10, Sqrt, Cbrt, Ceil, Floor, Trunc, Round, Conj, Real, Imag, Arg, Proj, Sin, Cos, Tan, Asin, Acos, Atan, Sinh, Cosh, Tanh, Asinh, Acosh, Atanh, Erf, Erfc, Tgamma, Lgamma + + // .abs and .norm are handled above + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Exp, std::exp) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Exp2, std::exp2) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Log, std::log) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Log2, std::log2) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Log10, std::log10) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Sqrt, std::sqrt) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Cbrt, std::cbrt) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Ceil, std::ceil) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Floor, std::floor) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Trunc, std::trunc) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Round, std::round) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Conj, std::conj) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Real, std::real) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Imag, std::imag) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Arg, std::arg) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Proj, std::proj) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Sin, std::sin) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Cos, std::cos) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Tan, std::tan) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Asin, std::asin) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Acos, std::acos) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Atan, std::atan) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Sinh, std::sinh) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Cosh, std::cosh) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Tanh, std::tanh) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Asinh, std::asinh) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Acosh, std::acosh) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Atanh, std::atanh) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Erf, std::erf) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Erfc, std::erfc) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Tgamma, std::tgamma) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Lgamma, std::lgamma) +#undef SIGNALSMITH_AUDIO_LINEAR_FUNC1 + +#define SIGNALSMITH_AUDIO_LINEAR_FUNC2(Name, func) \ + template \ + auto common##Name(VA a, VB b) -> decltype(func((decltype(a + b))a, (decltype(a + b))b)) { \ + return func((decltype(a + b))a, (decltype(a + b))b); \ + } \ + template \ + struct Name : public Base { \ + EXPRESSION_NAME(Name, (#Name "<") + A::name() + "," + B::name() + ">"); \ + A a; \ + B b; \ + Name(const A &a, const B &b) : a(a), b(b) {} \ + auto get(std::ptrdiff_t i) const -> decltype(common##Name(a.get(i), b.get(i))) const { \ + return common##Name(a.get(i), b.get(i)); \ + } \ + }; \ + template \ + Name, Unwrapped> make##Name(A a, B b) { \ + return {a, b}; \ + } + // Min, Max, Dim, Pow, Atan2, Hypot, Copysign, Polar + SIGNALSMITH_AUDIO_LINEAR_FUNC2(Max, std::fmax); + SIGNALSMITH_AUDIO_LINEAR_FUNC2(Min, std::fmin); + SIGNALSMITH_AUDIO_LINEAR_FUNC2(Dim, std::fdim); + SIGNALSMITH_AUDIO_LINEAR_FUNC2(Pow, std::pow); + SIGNALSMITH_AUDIO_LINEAR_FUNC2(Atan2, std::atan2); + SIGNALSMITH_AUDIO_LINEAR_FUNC2(Hypot, std::hypot); + SIGNALSMITH_AUDIO_LINEAR_FUNC2(Copysign, std::copysign); + SIGNALSMITH_AUDIO_LINEAR_FUNC2(Polar, std::polar); +#undef SIGNALSMITH_AUDIO_LINEAR_FUNC2 + +} // expression:: + +template +struct Expression : public BaseExpr { + template + Expression(Args &&...args) : BaseExpr(std::forward(args)...) { + static_assert(std::is_trivially_copyable::value, "BaseExpr must be trivially copyable"); + static_assert(std::is_trivially_copyable::value, "Expression<> must be trivially copyable"); + } + + auto operator[](std::ptrdiff_t i) -> decltype(BaseExpr::get(i)) const { + return BaseExpr::get(i); + } + +#define SIGNALSMITH_AUDIO_LINEAR_FUNC1(ExprName, methodName) \ + const Expression> methodName() const { \ + return {*this}; \ + } + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Abs, abs) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Norm, norm) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Exp, exp) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Exp2, exp2) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Log, log) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Log2, log2) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Log10, log10) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Sqrt, sqrt) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Cbrt, cbrt) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Ceil, ceil) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Floor, floor) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Trunc, trunc) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Round, round) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Conj, conj) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Real, real) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Imag, imag) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Arg, arg) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Proj, proj) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Sin, sin) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Cos, cos) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Tan, tan) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Asin, asin) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Acos, acos) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Atan, atan) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Sinh, sinh) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Cosh, cosh) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Tanh, tanh) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Asinh, asinh) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Acosh, acosh) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Atanh, atanh) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Erf, erf) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Erfc, erfc) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Tgamma, tgamma) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Lgamma, lgamma) +#undef SIGNALSMITH_AUDIO_LINEAR_FUNC1 +}; +template +struct WritableExpression : public Expression { + using Expression::Expression; + + WritableExpression(const WritableExpression &other) = default; + + template + WritableExpression & operator=(const Expr &expr) { + this->linear.fill(this->pointer, expression::ensureExpr(expr), this->size); + return *this; + } + + WritableExpression & operator=(const WritableExpression &expr) { + this->linear.fill(this->pointer, expr, this->size); + return *this; + } +#define SIGNALSMITH_AUDIO_ASSIGNMENT_OP(assignOp, binaryOp) \ + template\ + WritableExpression & operator assignOp(const E &expr) { \ + return *this = (*this) binaryOp expr; \ + } + SIGNALSMITH_AUDIO_ASSIGNMENT_OP(+=, +); + SIGNALSMITH_AUDIO_ASSIGNMENT_OP(-=, -); + SIGNALSMITH_AUDIO_ASSIGNMENT_OP(*=, *); + SIGNALSMITH_AUDIO_ASSIGNMENT_OP(/=, /); +#undef SIGNALSMITH_AUDIO_ASSIGNMENT_OP + + // Use the pointer's `operator[]` instead of the expression + auto operator[](std::ptrdiff_t i) -> decltype(this->pointer[i]) { + return this->pointer[i]; + } + + auto operator[](std::ptrdiff_t i) const -> decltype(this->pointer[i]) { + return this->pointer[i]; + } +}; + +/// Helper class for temporary storage +template +struct Temporary { + // This is called if we don't have enough reserved space and end up allocating + std::function allocationWarning; + + void reserve(size_t size) { + if (buffer) delete[] buffer; + buffer = new V[size]; + alignedBuffer = nextAligned(buffer); + if (alignedBuffer != buffer) { + delete[] buffer; + buffer = new V[size + extraAlignmentItems]; + alignedBuffer = nextAligned(buffer); + } + start = alignedBuffer; + end = alignedBuffer + size; + } + + void clear() { + start = alignedBuffer; + fallbacks.resize(0); + fallbacks.shrink_to_fit(); + } + + // You should have one of these every time you're using the temporary storage + struct Scoped { + Scoped(Temporary &temporary) : temporary(temporary), restoreStart(temporary.start), fallbackSize(temporary.fallbacks.size()) {} + + ~Scoped() { + temporary.start = restoreStart; + temporary.fallbacks.resize(fallbackSize); + } + + V * operator()(size_t size) { + return temporary.getChunk(size); + } + + private: + Temporary &temporary; + V *restoreStart; + size_t fallbackSize; + }; + +private: + static constexpr size_t extraAlignmentItems = alignBytes/sizeof(V); + static V * nextAligned(V *ptr) { + return (V*)((size_t(ptr) + (alignBytes - 1))&~(alignBytes - 1)); + } + + size_t depth = 0; + V *start = nullptr, *end = nullptr; + V *buffer = nullptr, *alignedBuffer = nullptr; + + std::vector> fallbacks; + + /// Valid until the next call to .clear() or .reserve() + V * getChunk(size_t size) { + V *result = start; + V *newStart = start + size; + if (newStart > end) { + // OK, actually we ran out of temporary space, so allocate + fallbacks.emplace_back(size + extraAlignmentItems); + result = nextAligned(fallbacks.back().data()); + // but we're not happy about it. >:( + if (allocationWarning) allocationWarning(newStart - buffer); + } + start = nextAligned(newStart); + return result; + } +}; + +template +struct CachedResults { + using TemporaryFloats = Temporary; + using TemporaryDoubles = Temporary; + + template + using WritableReal = typename Linear::template WritableReal; + template + using WritableComplex = typename Linear::template WritableComplex; + template + using WritableSplit = typename Linear::template WritableSplit; + + CachedResults(Linear &linear) : linear(linear) {} + + struct ScopedFloat : public TemporaryFloats::Scoped { + ScopedFloat(Linear &linear, TemporaryFloats &temporary) : TemporaryFloats::Scoped(temporary), linear(linear) {} + + template + ConstRealPointer real(Expr expr, size_t size) { + auto chunk = (*this)(size); + linear.fill(chunk, expr, size); + return chunk; + } + ConstRealPointer real(expression::ReadableReal expr, size_t) { + return expr.pointer; + } + ConstRealPointer real(WritableReal expr, size_t) { + return expr.pointer; + } + template + ConstRealPointer real(Expr expr, size_t size, RealPointer canUse) { + linear.fill(canUse, expr, size); + return canUse; + } + ConstRealPointer real(expression::ReadableReal expr, size_t, RealPointer) { + return expr.pointer; + } + ConstRealPointer real(WritableReal expr, size_t, RealPointer) { + return expr.pointer; + } + template + ConstComplexPointer complex(Expr expr, size_t size) { + auto chunk = (*this)(size*2); + linear.fill((ComplexPointer)chunk, expr, size); + return chunk; + } + ConstComplexPointer complex(expression::ReadableComplex expr, size_t) { + return expr.pointer; + } + ConstComplexPointer complex(WritableComplex expr, size_t) { + return expr.pointer; + } + template + ConstComplexPointer complex(Expr expr, size_t size, ComplexPointer canUse) { + linear.fill(canUse, expr, size); + return canUse; + } + ConstComplexPointer complex(expression::ReadableComplex expr, size_t, ComplexPointer) { + return expr.pointer; + } + ConstComplexPointer complex(WritableComplex expr, size_t, ComplexPointer) { + return expr.pointer; + } + template + ConstSplitPointer split(Expr expr, size_t size) { + SplitPointer chunk{(*this)(size), (*this)(size)}; + linear.fill(chunk, expr, size); + return chunk; + } + ConstSplitPointer split(expression::ReadableSplit expr, size_t) { + return expr.pointer; + } + ConstSplitPointer split(WritableSplit expr, size_t) { + return expr.pointer; + } + template + ConstSplitPointer split(Expr expr, size_t size, SplitPointer canUse) { + linear.fill(canUse, expr, size); + return canUse; + } + ConstSplitPointer split(expression::ReadableSplit expr, size_t, SplitPointer) { + return expr.pointer; + } + ConstSplitPointer split(WritableSplit expr, size_t, SplitPointer) { + return expr.pointer; + } + private: + Linear &linear; + }; + ScopedFloat floatScope() { + return {linear, floats}; + } + void reserveFloats(size_t size) { + floats.reserve(size); + } + template + void reserveFloats(size_t size, Fn &&allocationWarning) { + floats.reserve(size); + floats.allocationWarning = allocationWarning; + } + + struct ScopedDouble : public TemporaryDoubles::Scoped { + ScopedDouble(Linear &linear, TemporaryDoubles &temporary) : TemporaryDoubles::Scoped(temporary), linear(linear) {} + + template + ConstRealPointer real(Expr expr, size_t size) { + auto chunk = (*this)(size); + linear.fill(chunk, expr, size); + return chunk; + } + ConstRealPointer real(expression::ReadableReal expr, size_t) { + return expr.pointer; + } + ConstRealPointer real(WritableReal expr, size_t) { + return expr.pointer; + } + template + ConstRealPointer real(Expr expr, size_t size, RealPointer canUse) { + linear.fill(canUse, expr, size); + return canUse; + } + ConstRealPointer real(expression::ReadableReal expr, size_t, RealPointer) { + return expr.pointer; + } + ConstRealPointer real(WritableReal expr, size_t, RealPointer) { + return expr.pointer; + } + template + ConstComplexPointer complex(Expr expr, size_t size) { + auto chunk = (*this)(size*2); + linear.fill((ComplexPointer)chunk, expr, size); + return chunk; + } + ConstComplexPointer complex(expression::ReadableComplex expr, size_t) { + return expr.pointer; + } + ConstComplexPointer complex(WritableComplex expr, size_t) { + return expr.pointer; + } + template + ConstComplexPointer complex(Expr expr, size_t size, ComplexPointer canUse) { + linear.fill(canUse, expr, size); + return canUse; + } + ConstComplexPointer complex(expression::ReadableComplex expr, size_t, ComplexPointer) { + return expr.pointer; + } + ConstComplexPointer complex(WritableComplex expr, size_t, ComplexPointer) { + return expr.pointer; + } + template + ConstSplitPointer split(Expr expr, size_t size) { + SplitPointer chunk{(*this)(size), (*this)(size)}; + linear.fill(chunk, expr, size); + return chunk; + } + ConstSplitPointer split(expression::ReadableSplit expr, size_t) { + return expr.pointer; + } + ConstSplitPointer split(WritableSplit expr, size_t) { + return expr.pointer; + } + template + ConstSplitPointer split(Expr expr, size_t size, SplitPointer canUse) { + linear.fill(canUse, expr, size); + return canUse; + } + ConstSplitPointer split(expression::ReadableSplit expr, size_t, SplitPointer) { + return expr.pointer; + } + ConstSplitPointer split(WritableSplit expr, size_t, SplitPointer) { + return expr.pointer; + } + private: + Linear &linear; + }; + ScopedDouble doubleScope() { + return {linear, doubles}; + } + void reserveDoubles(size_t size) { + doubles.reserve(size); + } + template + void reserveDoubles(size_t size, Fn &&allocationWarning) { + doubles.reserve(size); + doubles.allocationWarning = allocationWarning; + } + +private: + TemporaryFloats floats; + TemporaryDoubles doubles; + Linear &linear; +}; + +template +struct LinearImplBase { + using Linear = LinearImpl; + + template + void reserve(size_t) {} + + template + struct WritableReal { + EXPRESSION_NAME(WritableReal, "V*"); + Linear &linear; + RealPointer pointer; + size_t size; + WritableReal(Linear &linear, RealPointer pointer, size_t size) : linear(linear), pointer(pointer), size(size) { + static_assert(std::is_trivially_copyable::value, "must be trivially copyable"); + } + + operator expression::ReadableReal() const { + return {pointer}; + } + + V get(std::ptrdiff_t i) const { + return pointer[i]; + } + + V & operator[](std::ptrdiff_t i) { + return pointer[i]; + } + const V & operator[](std::ptrdiff_t i) const { + return pointer[i]; + } + }; + template + struct WritableComplex { + EXPRESSION_NAME(WritableComplex, "VC*"); + Linear &linear; + ComplexPointer pointer; + size_t size; + WritableComplex(Linear &linear, ComplexPointer pointer, size_t size) : linear(linear), pointer(pointer), size(size) {} + + operator expression::ReadableComplex() const { + return {pointer}; + } + + std::complex get(std::ptrdiff_t i) const { + return pointer[i]; + } + + std::complex & operator[](std::ptrdiff_t i) { + return pointer[i]; + } + const std::complex & operator[](std::ptrdiff_t i) const { + return pointer[i]; + } + }; + template + struct WritableSplit { + EXPRESSION_NAME(WritableSplit, "VS*"); + Linear &linear; + SplitPointer pointer; + size_t size; + WritableSplit(Linear &linear, SplitPointer pointer, size_t size) : linear(linear), pointer(pointer), size(size) {} + + operator expression::ReadableSplit() const { + return {pointer}; + } + + std::complex get(std::ptrdiff_t i) const { + return {pointer.real[i], pointer.imag[i]}; + } + auto operator[](std::ptrdiff_t i) -> decltype(pointer[i]) { + return pointer[i]; + } + auto operator[](std::ptrdiff_t i) const -> decltype(pointer[i]) { + return pointer[i]; + } + }; + + // Arithmetic values get turned into constants + template::value, V>::type> + Expression> wrap(V v) { + return {v}; + } + + // Pass `Expression`s through + template + Expression wrap(Expression expr) { + return expr; + } + + // Wrap a pointer as an expression + template + Expression> wrap(ConstRealPointer pointer) { + return {pointer}; + } + template + Expression> wrap(ConstComplexPointer pointer) { + return {pointer}; + } + template + Expression> wrap(ConstSplitPointer pointer) { + return {pointer}; + } + template + Expression> wrap(ConstRealPointer real, ConstRealPointer imag) { + return {ConstSplitPointer{real, imag}}; + } + + // When a length is supplied, make it writable + template + WritableExpression> wrap(RealPointer pointer, size_t size) { + return {self(), pointer, size}; + } + template + WritableExpression> wrap(ComplexPointer pointer, size_t size) { + return {self(), pointer, size}; + } + template + WritableExpression> wrap(SplitPointer pointer, size_t size) { + return {self(), pointer, size}; + } + template + WritableExpression> wrap(RealPointer real, RealPointer imag, size_t size) { + return {self(), SplitPointer{real, imag}, size}; + } + + template + WritableExpression> wrap(std::vector &vector) { + return {self(), vector.data(), vector.size()}; + } + template + WritableExpression> wrap(std::vector> &vector) { + return {self(), vector.data(), vector.size()}; + } + template + WritableExpression> wrap(std::vector &real, std::vector &imag) { + SplitPointer pointer{real.data(), imag.data()}; + size_t size = std::min(real.size(), imag.size()); + return {self(), pointer, size}; + } + + template + Expression> wrap(const std::vector &vector) { + return {vector.data()}; + } + template + Expression> wrap(const std::vector> &vector) { + return {vector.data()}; + } + template + Expression> wrap(const std::vector &real, const std::vector &imag) { + ConstSplitPointer pointer{real.data(), imag.data()}; + return {pointer}; + } + + template + auto operator()(Args &&...args) -> decltype(wrap(std::forward(args)...)) { + return wrap(std::forward(args)...); + } + + template + void fill(Pointer pointer, Expr expr, size_t size) { + for (size_t i = 0; i < size; ++i) { + pointer[i] = expr.get(i); + } + } + template + void fill(SplitPointer pointer, Expr expr, size_t size) { + for (size_t i = 0; i < size; ++i) { + std::complex v = expr.get(i); + pointer.real[i] = v.real(); + pointer.imag[i] = v.imag(); + } + } + + // Remove the Expression<...> layer, so the simplification template-matching works + template + void fill(Pointer pointer, Expression expr, size_t size) { + return self().fill(pointer, (Expr &)expr, size); + }; + // Separate otherwise it's ambiguous between the two previous ones + template + void fill(SplitPointer pointer, Expression expr, size_t size) { + return self().fill(pointer, (Expr &)expr, size); + }; + + +#define SIGNALSMITH_AUDIO_LINEAR_FUNC1(ExprName, methodName) \ + template \ + auto methodName(A a) -> Expression { \ + return expression::make##ExprName(wrap(a)); \ + } + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Abs, abs) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Norm, norm) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Exp, exp) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Exp2, exp2) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Log, log) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Log2, log2) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Log10, log10) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Sqrt, sqrt) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Cbrt, cbrt) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Ceil, ceil) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Floor, floor) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Trunc, trunc) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Round, round) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Conj, conj) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Real, real) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Imag, imag) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Arg, arg) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Proj, proj) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Sin, sin) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Cos, cos) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Tan, tan) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Asin, asin) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Acos, acos) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Atan, atan) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Sinh, sinh) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Cosh, cosh) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Tanh, tanh) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Asinh, asinh) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Acosh, acosh) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Atanh, atanh) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Erf, erf) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Erfc, erfc) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Tgamma, tgamma) + SIGNALSMITH_AUDIO_LINEAR_FUNC1(Lgamma, lgamma) +#undef SIGNALSMITH_AUDIO_LINEAR_FUNC1 + +#define SIGNALSMITH_AUDIO_LINEAR_FUNC2(ExprName, methodName) \ + template \ + auto methodName(A a, B b) -> Expression { \ + return expression::make##ExprName(wrap(a), wrap(b)); \ + } + SIGNALSMITH_AUDIO_LINEAR_FUNC2(Max, max); + SIGNALSMITH_AUDIO_LINEAR_FUNC2(Min, min); + SIGNALSMITH_AUDIO_LINEAR_FUNC2(Dim, dim); + SIGNALSMITH_AUDIO_LINEAR_FUNC2(Pow, pow); + SIGNALSMITH_AUDIO_LINEAR_FUNC2(Atan2, atan2); + SIGNALSMITH_AUDIO_LINEAR_FUNC2(Hypot, hypot); + SIGNALSMITH_AUDIO_LINEAR_FUNC2(Copysign, copysign); + SIGNALSMITH_AUDIO_LINEAR_FUNC2(Polar, polar); +#undef SIGNALSMITH_AUDIO_LINEAR_FUNC2 + +protected: + LinearImplBase(Linear *linearThis) { + assert((LinearImplBase *)linearThis == this); + } + + Linear & self() { + return *(Linear *)this; + } +}; + +/* SFINAE template for checking that an expression naturally returns a particular item type + +e.g. + template + ItemType fill(...) +*/ +template +using ItemType = typename std::enable_if< + std::is_same< + typename std::decay().get(0))>::type, + Item + >::value, + OutputExpr +>::type; + +// Fallback implementation - this should be specialised (with useLinear=true) with faster methods where available +template +struct LinearImpl : public LinearImplBase { + LinearImpl() : LinearImplBase(this), cached(*this) {} + + using LinearImplBase::fill; + + // Override .fill() for specific pointer/expressions which you can do quickly. Calling `cached.realFloat()` etc. will call back to .fill() + template + void fill(RealPointer pointer, expression::Sqrt> expr, size_t size) { + auto floats = cached.floatScope(); + + auto normExpr = expr.a; + auto array = floats.real(normExpr, size); + + // The idea is to use an existing fast function, but this is an example + for (size_t i = 0; i < size; ++i) { + pointer[i] = std::sqrt(array[i]); + } + } + + template + void reserve(size_t) {} + // Makes sure we don't allocate + template<> + void reserve(size_t size) { + cached.reserveFloats(size); + } + template<> + void reserve(size_t size) { + cached.reserveDoubles(size); + } +private: + CachedResults cached; +}; + +}}; // namespace + +#if defined(SIGNALSMITH_USE_ACCELERATE) +# include "./platform/linear-accelerate.h" +#elif 0//defined(SIGNALSMITH_USE_IPP) +# include "./platform/linear-ipp.h" +#endif + +#endif // include guard diff --git a/thirdparty/signalsmith-linear/platform/fft-accelerate.h b/thirdparty/signalsmith-linear/platform/fft-accelerate.h new file mode 100644 index 0000000000..b421696a82 --- /dev/null +++ b/thirdparty/signalsmith-linear/platform/fft-accelerate.h @@ -0,0 +1,340 @@ +// If possible, only include vecLib, since JUCE has conflicts with vImage +#if defined(__has_include) && __has_include() +# include +#else +# include +#endif + +namespace signalsmith { namespace linear { + +namespace _impl { + template<> + inline void complexMul(std::complex *a, const std::complex *b, const std::complex *c, size_t size) { + DSPSplitComplex aSplit = {(float *)a, (float *)a + 1}; + DSPSplitComplex bSplit = {(float *)b, (float *)b + 1}; + DSPSplitComplex cSplit = {(float *)c, (float *)c + 1}; + vDSP_zvmul(&cSplit, 2, &bSplit, 2, &aSplit, 2, size, 1); + } + template<> + inline void complexMulConj(std::complex *a, const std::complex *b, const std::complex *c, size_t size) { + DSPSplitComplex aSplit = {(float *)a, (float *)a + 1}; + DSPSplitComplex bSplit = {(float *)b, (float *)b + 1}; + DSPSplitComplex cSplit = {(float *)c, (float *)c + 1}; + vDSP_zvmul(&cSplit, 2, &bSplit, 2, &aSplit, 2, size, -1); + } + template<> + inline void complexMul(float *ar, float *ai, const float *br, const float *bi, const float *cr, const float *ci, size_t size) { + DSPSplitComplex aSplit = {ar, ai}; + DSPSplitComplex bSplit = {(float *)br, (float *)bi}; + DSPSplitComplex cSplit = {(float *)cr, (float *)ci}; + vDSP_zvmul(&cSplit, 1, &bSplit, 1, &aSplit, 1, size, 1); + } + template<> + inline void complexMulConj(float *ar, float *ai, const float *br, const float *bi, const float *cr, const float *ci, size_t size) { + DSPSplitComplex aSplit = {ar, ai}; + DSPSplitComplex bSplit = {(float *)br, (float *)bi}; + DSPSplitComplex cSplit = {(float *)cr, (float *)ci}; + vDSP_zvmul(&cSplit, 1, &bSplit, 1, &aSplit, 1, size, -1); + } + + // doubles + template<> + inline void complexMul(std::complex *a, const std::complex *b, const std::complex *c, size_t size) { + DSPDoubleSplitComplex aSplit = {(double *)a, (double *)a + 1}; + DSPDoubleSplitComplex bSplit = {(double *)b, (double *)b + 1}; + DSPDoubleSplitComplex cSplit = {(double *)c, (double *)c + 1}; + vDSP_zvmulD(&cSplit, 2, &bSplit, 2, &aSplit, 2, size, 1); + } + template<> + inline void complexMulConj(std::complex *a, const std::complex *b, const std::complex *c, size_t size) { + DSPDoubleSplitComplex aSplit = {(double *)a, (double *)a + 1}; + DSPDoubleSplitComplex bSplit = {(double *)b, (double *)b + 1}; + DSPDoubleSplitComplex cSplit = {(double *)c, (double *)c + 1}; + vDSP_zvmulD(&cSplit, 2, &bSplit, 2, &aSplit, 2, size, -1); + } + template<> + inline void complexMul(double *ar, double *ai, const double *br, const double *bi, const double *cr, const double *ci, size_t size) { + DSPDoubleSplitComplex aSplit = {ar, ai}; + DSPDoubleSplitComplex bSplit = {(double *)br, (double *)bi}; + DSPDoubleSplitComplex cSplit = {(double *)cr, (double *)ci}; + vDSP_zvmulD(&cSplit, 1, &bSplit, 1, &aSplit, 1, size, 1); + } + template<> + inline void complexMulConj(double *ar, double *ai, const double *br, const double *bi, const double *cr, const double *ci, size_t size) { + DSPDoubleSplitComplex aSplit = {ar, ai}; + DSPDoubleSplitComplex bSplit = {(double *)br, (double *)bi}; + DSPDoubleSplitComplex cSplit = {(double *)cr, (double *)ci}; + vDSP_zvmulD(&cSplit, 1, &bSplit, 1, &aSplit, 1, size, -1); + } +} + +template<> +struct Pow2FFT { + static constexpr bool prefersSplit = true; + + using Complex = std::complex; + + Pow2FFT(size_t size = 0) { + resize(size); + } + ~Pow2FFT() { + if (hasSetup) vDSP_destroy_fftsetup(fftSetup); + } + // Allow move, but not copy + Pow2FFT(const Pow2FFT &other) = delete; + Pow2FFT(Pow2FFT &&other) : _size(other._size), hasSetup(other.hasSetup), fftSetup(other.fftSetup), log2(other.log2), splitReal(std::move(other.splitReal)), splitImag(std::move(other.splitImag)) { + // Avoid double-free + other.hasSetup = false; + } + + void resize(size_t size) { + _size = size; + if (hasSetup) vDSP_destroy_fftsetup(fftSetup); + if (!size) { + hasSetup = false; + return; + } + + splitReal.resize(size); + splitImag.resize(size); + log2 = std::round(std::log2(size)); + fftSetup = vDSP_create_fftsetup(log2, FFT_RADIX2); + hasSetup = fftSetup; + } + + void fft(const Complex* input, Complex* output) { + DSPSplitComplex splitComplex{ splitReal.data(), splitImag.data() }; + vDSP_ctoz((DSPComplex*)input, 2, &splitComplex, 1, _size); + vDSP_fft_zip(fftSetup, &splitComplex, 1, log2, kFFTDirection_Forward); + vDSP_ztoc(&splitComplex, 1, (DSPComplex*)output, 2, _size); + } + void fft(const float *inR, const float *inI, float *outR, float *outI) { + DSPSplitComplex inSplit{(float *)inR, (float *)inI}; + DSPSplitComplex outSplit{outR, outI}; + vDSP_fft_zop(fftSetup, &inSplit, 1, &outSplit, 1, log2, kFFTDirection_Forward); + } + + void ifft(const Complex* input, Complex* output) { + DSPSplitComplex splitComplex{ splitReal.data(), splitImag.data() }; + vDSP_ctoz((DSPComplex*)input, 2, &splitComplex, 1, _size); + vDSP_fft_zip(fftSetup, &splitComplex, 1, log2, kFFTDirection_Inverse); + vDSP_ztoc(&splitComplex, 1, (DSPComplex*)output, 2, _size); + } + void ifft(const float *inR, const float *inI, float *outR, float *outI) { + DSPSplitComplex inSplit{(float *)inR, (float *)inI}; + DSPSplitComplex outSplit{outR, outI}; + vDSP_fft_zop(fftSetup, &inSplit, 1, &outSplit, 1, log2, kFFTDirection_Inverse); + } + +private: + size_t _size = 0; + bool hasSetup = false; + FFTSetup fftSetup; + int log2 = 0; + std::vector splitReal, splitImag; +}; + +template<> +struct Pow2RealFFT { + static constexpr bool prefersSplit = true; + + using Complex = std::complex; + + Pow2RealFFT(size_t size = 0) { + resize(size); + } + ~Pow2RealFFT() { + if (hasSetup) vDSP_destroy_fftsetup(fftSetup); + } + // Allow move, but not copy + Pow2RealFFT(const Pow2RealFFT &other) = delete; + Pow2RealFFT(Pow2RealFFT &&other) : _size(other._size), hasSetup(other.hasSetup), fftSetup(other.fftSetup), log2(other.log2), splitReal(std::move(other.splitReal)), splitImag(std::move(other.splitImag)) { + // Avoid double-free + other.hasSetup = false; + } + + void resize(size_t size) { + _size = size; + if (hasSetup) vDSP_destroy_fftsetup(fftSetup); + if (!size) { + hasSetup = false; + return; + } + + splitReal.resize(size); + splitImag.resize(size); + log2 = std::log2(size); + fftSetup = vDSP_create_fftsetup(log2, FFT_RADIX2); + hasSetup = fftSetup; + } + + void fft(const float* input, Complex* output) { + float mul = 0.5f; + vDSP_vsmul(input, 2, &mul, splitReal.data(), 1, _size/2); + vDSP_vsmul(input + 1, 2, &mul, splitImag.data(), 1, _size/2); + DSPSplitComplex tmpSplit{splitReal.data(), splitImag.data()}; + vDSP_fft_zrip(fftSetup, &tmpSplit, 1, log2, kFFTDirection_Forward); + vDSP_ztoc(&tmpSplit, 1, (DSPComplex *)output, 2, _size/2); + } + void fft(const float *inR, float *outR, float *outI) { + DSPSplitComplex outputSplit{outR, outI}; + float mul = 0.5f; + vDSP_vsmul(inR, 2, &mul, outR, 1, _size/2); + vDSP_vsmul(inR + 1, 2, &mul, outI, 1, _size/2); + vDSP_fft_zrip(fftSetup, &outputSplit, 1, log2, kFFTDirection_Forward); + } + + void ifft(const Complex * input, float * output) { + DSPSplitComplex tmpSplit{splitReal.data(), splitImag.data()}; + vDSP_ctoz((DSPComplex*)input, 2, &tmpSplit, 1, _size/2); + vDSP_fft_zrip(fftSetup, &tmpSplit, 1, log2, kFFTDirection_Inverse); + DSPSplitComplex outputSplit{output, output + 1}; + vDSP_zvmov(&tmpSplit, 1, &outputSplit, 2, _size/2); + } + void ifft(const float *inR, const float *inI, float *outR) { + DSPSplitComplex inputSplit{(float *)inR, (float *)inI}; + DSPSplitComplex tmpSplit{splitReal.data(), splitImag.data()}; + vDSP_fft_zrop(fftSetup, &inputSplit, 1, &tmpSplit, 1, log2, kFFTDirection_Inverse); + DSPSplitComplex outputSplit{outR, outR + 1}; + // We can't use vDSP_ztoc without knowing the alignment + vDSP_zvmov(&tmpSplit, 1, &outputSplit, 2, _size/2); + } + +private: + size_t _size = 0; + bool hasSetup = false; + FFTSetup fftSetup; + int log2 = 0; + std::vector splitReal, splitImag; +}; + +template<> +struct Pow2FFT { + static constexpr bool prefersSplit = true; + + using Complex = std::complex; + + Pow2FFT(size_t size=0) { + resize(size); + } + ~Pow2FFT() { + if (hasSetup) vDSP_destroy_fftsetupD(fftSetup); + } + + void resize(size_t size) { + _size = size; + if (hasSetup) vDSP_destroy_fftsetupD(fftSetup); + if (!size) { + hasSetup = false; + return; + } + + log2 = std::round(std::log2(size)); + fftSetup = vDSP_create_fftsetupD(log2, FFT_RADIX2); + hasSetup = fftSetup; + + splitReal.resize(size); + splitImag.resize(size); + } + + void fft(const Complex* input, Complex* output) { + DSPDoubleSplitComplex splitComplex{ splitReal.data(), splitImag.data() }; + vDSP_ctozD((DSPDoubleComplex*)input, 2, &splitComplex, 1, _size); + vDSP_fft_zipD(fftSetup, &splitComplex, 1, log2, kFFTDirection_Forward); + vDSP_ztocD(&splitComplex, 1, (DSPDoubleComplex*)output, 2, _size); + } + void fft(const double *inR, const double *inI, double *outR, double *outI) { + DSPDoubleSplitComplex inSplit{(double *)inR, (double *)inI}; + DSPDoubleSplitComplex outSplit{outR, outI}; + vDSP_fft_zopD(fftSetup, &inSplit, 1, &outSplit, 1, log2, kFFTDirection_Forward); + } + + void ifft(const Complex* input, Complex* output) { + DSPDoubleSplitComplex splitComplex{ splitReal.data(), splitImag.data() }; + vDSP_ctozD((DSPDoubleComplex*)input, 2, &splitComplex, 1, _size); + vDSP_fft_zipD(fftSetup, &splitComplex, 1, log2, kFFTDirection_Inverse); + vDSP_ztocD(&splitComplex, 1, (DSPDoubleComplex*)output, 2, _size); + } + void ifft(const double *inR, const double *inI, double *outR, double *outI) { + DSPDoubleSplitComplex inSplit{(double *)inR, (double *)inI}; + DSPDoubleSplitComplex outSplit{outR, outI}; + vDSP_fft_zopD(fftSetup, &inSplit, 1, &outSplit, 1, log2, kFFTDirection_Inverse); + } + +private: + size_t _size = 0; + bool hasSetup = false; + FFTSetupD fftSetup; + int log2 = 0; + std::vector splitReal, splitImag; +}; + +template<> +struct Pow2RealFFT { + static constexpr bool prefersSplit = true; + + using Complex = std::complex; + + Pow2RealFFT(size_t size = 0) { + resize(size); + } + ~Pow2RealFFT() { + if (hasSetup) vDSP_destroy_fftsetupD(fftSetup); + } + + void resize(size_t size) { + _size = size; + if (hasSetup) vDSP_destroy_fftsetupD(fftSetup); + if (!size) { + hasSetup = false; + return; + } + + splitReal.resize(size); + splitImag.resize(size); + log2 = std::log2(size); + fftSetup = vDSP_create_fftsetupD(log2, FFT_RADIX2); + hasSetup = fftSetup; + } + + void fft(const double* input, Complex* output) { + double mul = 0.5f; + vDSP_vsmulD(input, 2, &mul, splitReal.data(), 1, _size/2); + vDSP_vsmulD(input + 1, 2, &mul, splitImag.data(), 1, _size/2); + DSPDoubleSplitComplex tmpSplit{splitReal.data(), splitImag.data()}; + vDSP_fft_zripD(fftSetup, &tmpSplit, 1, log2, kFFTDirection_Forward); + vDSP_ztocD(&tmpSplit, 1, (DSPDoubleComplex *)output, 2, _size/2); + } + void fft(const double *inR, double *outR, double *outI) { + DSPDoubleSplitComplex outputSplit{outR, outI}; + double mul = 0.5f; + vDSP_vsmulD(inR, 2, &mul, outR, 1, _size/2); + vDSP_vsmulD(inR + 1, 2, &mul, outI, 1, _size/2); + vDSP_fft_zripD(fftSetup, &outputSplit, 1, log2, kFFTDirection_Forward); + } + + void ifft(const Complex * input, double * output) { + DSPDoubleSplitComplex tmpSplit{splitReal.data(), splitImag.data()}; + vDSP_ctozD((DSPDoubleComplex*)input, 2, &tmpSplit, 1, _size/2); + vDSP_fft_zripD(fftSetup, &tmpSplit, 1, log2, kFFTDirection_Inverse); + DSPDoubleSplitComplex outputSplit{output, output + 1}; + vDSP_zvmovD(&tmpSplit, 1, &outputSplit, 2, _size/2); + } + void ifft(const double *inR, const double *inI, double *outR) { + DSPDoubleSplitComplex inputSplit{(double *)inR, (double *)inI}; + DSPDoubleSplitComplex tmpSplit{splitReal.data(), splitImag.data()}; + vDSP_fft_zropD(fftSetup, &inputSplit, 1, &tmpSplit, 1, log2, kFFTDirection_Inverse); + DSPDoubleSplitComplex outputSplit{outR, outR + 1}; + // We can't use vDSP_ztoc without knowing the alignment + vDSP_zvmovD(&tmpSplit, 1, &outputSplit, 2, _size/2); + } + +private: + size_t _size = 0; + bool hasSetup = false; + FFTSetupD fftSetup; + int log2 = 0; + std::vector splitReal, splitImag; +}; + +}} // namespace diff --git a/thirdparty/signalsmith-linear/platform/fft-ipp.h b/thirdparty/signalsmith-linear/platform/fft-ipp.h new file mode 100644 index 0000000000..7648d42bf8 --- /dev/null +++ b/thirdparty/signalsmith-linear/platform/fft-ipp.h @@ -0,0 +1,532 @@ +#ifndef FFT2_FFT2_IPP_H +#define FFT2_FFT2_IPP_H + +#include +#include + +#include +#include +#include +#include + +namespace signalsmith { namespace linear { + +namespace { + +void checkStatus(IppStatus status) { + assert(status == ippStsNoErr); +} + +template +struct State{}; + +template<> +struct State { +public: + explicit State(size_t size) { + IppStatus status; + const auto order = static_cast(std::log2(size)); + Ipp8u* initBuffer{ nullptr }; + + int maxWorkingSize = 0; + + { + int fftSpecSize, fftInitBuffSize, fftWorkBuffSize; + status = ippsFFTGetSize_C_32fc(order, IPP_FFT_NODIV_BY_ANY, ippAlgHintFast, &fftSpecSize, &fftInitBuffSize, &fftWorkBuffSize); + checkStatus(status); + maxWorkingSize = std::max(maxWorkingSize, fftWorkBuffSize); + specBuffer = ippsMalloc_8u(fftSpecSize); + spec = (IppsFFTSpec_C_32fc*)specBuffer; + if (fftInitBuffSize != 0) { + initBuffer = ippsMalloc_8u(fftInitBuffSize); + } + status = ippsFFTInit_C_32fc(&spec, order, IPP_FFT_NODIV_BY_ANY, ippAlgHintFast, specBuffer, initBuffer); + + checkStatus(status); + if (initBuffer) { + ippsFree(initBuffer); + } + } + + { + int fftSpecSize, fftInitBuffSize, fftWorkBuffSize; + status = ippsFFTGetSize_C_32f(order, IPP_FFT_NODIV_BY_ANY, ippAlgHintFast, &fftSpecSize, &fftInitBuffSize, &fftWorkBuffSize); + checkStatus(status); + maxWorkingSize = std::max(maxWorkingSize, fftWorkBuffSize); + specSplitBuffer = ippsMalloc_8u(fftSpecSize); + specSplit = (IppsFFTSpec_C_32f*)specSplitBuffer; + if (fftInitBuffSize != 0) { + initBuffer = ippsMalloc_8u(fftInitBuffSize); + } + status = ippsFFTInit_C_32f(&specSplit, order, IPP_FFT_NODIV_BY_ANY, ippAlgHintFast, specSplitBuffer, initBuffer); + + checkStatus(status); + if (initBuffer) { + ippsFree(initBuffer); + } + } + + if(maxWorkingSize != 0) { + workBuffer = ippsMalloc_8u(maxWorkingSize); + workSize = maxWorkingSize; + } + } + + ~State() noexcept { + if(specBuffer) { + ippsFree(specBuffer); + } + if(specSplitBuffer) { + ippsFree(specSplitBuffer); + } + if(workBuffer) { + ippsFree(workBuffer); + } + } + + IppsFFTSpec_C_32fc *spec{nullptr}; + IppsFFTSpec_C_32f *specSplit{nullptr}; + Ipp8u *specBuffer{nullptr}; + Ipp8u *specSplitBuffer{nullptr}; + Ipp8u *workBuffer{nullptr}; + size_t workSize = 0; +}; + +template<> +struct State { +public: + explicit State(size_t size) { + IppStatus status; + const auto order = static_cast(std::log2(size)); + Ipp8u* initBuffer{ nullptr }; + + int maxWorkingSize = 0; + + int fftSpecSize, fftInitBuffSize, fftWorkBuffSize; + status = ippsFFTGetSize_R_32f(order, IPP_FFT_NODIV_BY_ANY, ippAlgHintFast, &fftSpecSize, &fftInitBuffSize, &fftWorkBuffSize); + checkStatus(status); + maxWorkingSize = std::max(maxWorkingSize, fftWorkBuffSize); + specBuffer = ippsMalloc_8u(fftSpecSize); + spec = (IppsFFTSpec_R_32f*)specBuffer; + if (fftInitBuffSize != 0) { + initBuffer = ippsMalloc_8u(fftInitBuffSize); + } + status = ippsFFTInit_R_32f(&spec, order, IPP_FFT_NODIV_BY_ANY, ippAlgHintFast, specBuffer, initBuffer); + + checkStatus(status); + if (initBuffer) { + ippsFree(initBuffer); + } + + if(maxWorkingSize != 0) { + workBuffer = ippsMalloc_8u(maxWorkingSize); + workSize = maxWorkingSize; + } + } + + ~State() noexcept { + if(specBuffer) { + ippsFree(specBuffer); + } + if(workBuffer) { + ippsFree(workBuffer); + } + } + + IppsFFTSpec_R_32f *spec{nullptr}; + Ipp8u *specBuffer{nullptr}; + Ipp8u *workBuffer{nullptr}; + size_t workSize = 0; +}; + +template<> +struct State { + explicit State(size_t size) { + IppStatus status; + const auto order = static_cast(std::log2(size)); + Ipp8u* initBuffer{ nullptr }; + + int maxWorkingSize = 0; + + { + int fftSpecSize, fftInitBuffSize, fftWorkBuffSize; + status = ippsFFTGetSize_C_64fc(order, IPP_FFT_NODIV_BY_ANY, ippAlgHintFast, &fftSpecSize, &fftInitBuffSize, &fftWorkBuffSize); + checkStatus(status); + maxWorkingSize = std::max(maxWorkingSize, fftWorkBuffSize); + specBuffer = ippsMalloc_8u(fftSpecSize); + spec = (IppsFFTSpec_C_64fc*)specBuffer; + if (fftInitBuffSize != 0) { + initBuffer = ippsMalloc_8u(fftInitBuffSize); + } + status = ippsFFTInit_C_64fc(&spec, order, IPP_FFT_NODIV_BY_ANY, ippAlgHintFast, specBuffer, initBuffer); + + checkStatus(status); + if (initBuffer) { + ippsFree(initBuffer); + } + } + + { + int fftSpecSize, fftInitBuffSize, fftWorkBuffSize; + status = ippsFFTGetSize_C_64f(order, IPP_FFT_NODIV_BY_ANY, ippAlgHintFast, &fftSpecSize, &fftInitBuffSize, &fftWorkBuffSize); + checkStatus(status); + maxWorkingSize = std::max(maxWorkingSize, fftWorkBuffSize); + specSplitBuffer = ippsMalloc_8u(fftSpecSize); + specSplit = (IppsFFTSpec_C_64f*)specSplitBuffer; + if (fftInitBuffSize != 0) { + initBuffer = ippsMalloc_8u(fftInitBuffSize); + } + status = ippsFFTInit_C_64f(&specSplit, order, IPP_FFT_NODIV_BY_ANY, ippAlgHintFast, specSplitBuffer, initBuffer); + + checkStatus(status); + if (initBuffer) { + ippsFree(initBuffer); + } + } + + if(maxWorkingSize != 0) { + workBuffer = ippsMalloc_8u(maxWorkingSize); + workSize = maxWorkingSize; + } + } + + ~State() noexcept { + if(specBuffer) { + ippsFree(specBuffer); + } + if(specSplitBuffer) { + ippsFree(specSplitBuffer); + } + if(workBuffer) { + ippsFree(workBuffer); + } + } + + IppsFFTSpec_C_64fc *spec{nullptr}; + IppsFFTSpec_C_64f *specSplit{nullptr}; + Ipp8u *specBuffer{nullptr}; + Ipp8u *specSplitBuffer{nullptr}; + Ipp8u *workBuffer{nullptr}; + size_t workSize = 0; +}; + +template<> +struct State { + explicit State(size_t size) { + IppStatus status; + const auto order = static_cast(std::log2(size)); + Ipp8u* initBuffer{ nullptr }; + + int maxWorkingSize = 0; + + int fftSpecSize, fftInitBuffSize, fftWorkBuffSize; + status = ippsFFTGetSize_R_64f(order, IPP_FFT_NODIV_BY_ANY, ippAlgHintFast, &fftSpecSize, &fftInitBuffSize, &fftWorkBuffSize); + checkStatus(status); + maxWorkingSize = std::max(maxWorkingSize, fftWorkBuffSize); + specBuffer = ippsMalloc_8u(fftSpecSize); + spec = (IppsFFTSpec_R_64f*)specBuffer; + if (fftInitBuffSize != 0) { + initBuffer = ippsMalloc_8u(fftInitBuffSize); + } + status = ippsFFTInit_R_64f(&spec, order, IPP_FFT_NODIV_BY_ANY, ippAlgHintFast, specBuffer, initBuffer); + + checkStatus(status); + if (initBuffer) { + ippsFree(initBuffer); + } + + + if(maxWorkingSize != 0) { + workBuffer = ippsMalloc_8u(maxWorkingSize); + workSize = maxWorkingSize; + } + } + + ~State() noexcept { + if(specBuffer) { + ippsFree(specBuffer); + } + if(workBuffer) { + ippsFree(workBuffer); + } + } + + IppsFFTSpec_R_64f *spec{nullptr}; + Ipp8u *specBuffer{nullptr}; + Ipp8u *workBuffer{nullptr}; + size_t workSize = 0; + +}; +} // namespace + + +namespace _impl { + template<> + void complexMul(std::complex *a, const std::complex *b, const std::complex *c, size_t size) { + auto status = ippsMul_32fc((const Ipp32fc*)b, (const Ipp32fc*)c, (Ipp32fc*)a, int(size)); + checkStatus(status); + } + //template<> + //void complexMulConj(std::complex *a, const std::complex *b, const std::complex *c, size_t size) { + // DSPSplitComplex aSplit = {(float *)a, (float *)a + 1}; + // DSPSplitComplex bSplit = {(float *)b, (float *)b + 1}; + // DSPSplitComplex cSplit = {(float *)c, (float *)c + 1}; + // vDSP_zvmul(&cSplit, 2, &bSplit, 2, &aSplit, 2, size, -1); + //} + //template<> + //void complexMul(float *ar, float *ai, const float *br, const float *bi, const float *cr, const float *ci, size_t size) { + // DSPSplitComplex aSplit = {ar, ai}; + // DSPSplitComplex bSplit = {(float *)br, (float *)bi}; + // DSPSplitComplex cSplit = {(float *)cr, (float *)ci}; + // vDSP_zvmul(&cSplit, 1, &bSplit, 1, &aSplit, 1, size, 1); + //} + //template<> + //void complexMulConj(float *ar, float *ai, const float *br, const float *bi, const float *cr, const float *ci, size_t size) { + // DSPSplitComplex aSplit = {ar, ai}; + // DSPSplitComplex bSplit = {(float *)br, (float *)bi}; + // DSPSplitComplex cSplit = {(float *)cr, (float *)ci}; + // vDSP_zvmul(&cSplit, 1, &bSplit, 1, &aSplit, 1, size, -1); + //} + + // doubles + template<> + void complexMul(std::complex *a, const std::complex *b, const std::complex *c, size_t size) { + auto status = ippsMul_64fc((const Ipp64fc*)b, (const Ipp64fc*)c, (Ipp64fc*)a, int(size)); + checkStatus(status); + } + //template<> + //void complexMulConj(std::complex *a, const std::complex *b, const std::complex *c, size_t size) { + // DSPDoubleSplitComplex aSplit = {(double *)a, (double *)a + 1}; + // DSPDoubleSplitComplex bSplit = {(double *)b, (double *)b + 1}; + // DSPDoubleSplitComplex cSplit = {(double *)c, (double *)c + 1}; + // vDSP_zvmulD(&cSplit, 2, &bSplit, 2, &aSplit, 2, size, -1); + //} + //template<> + //void complexMul(double *ar, double *ai, const double *br, const double *bi, const double *cr, const double *ci, size_t size) { + // DSPDoubleSplitComplex aSplit = {ar, ai}; + // DSPDoubleSplitComplex bSplit = {(double *)br, (double *)bi}; + // DSPDoubleSplitComplex cSplit = {(double *)cr, (double *)ci}; + // vDSP_zvmulD(&cSplit, 1, &bSplit, 1, &aSplit, 1, size, 1); + //} + //template<> + //void complexMulConj(double *ar, double *ai, const double *br, const double *bi, const double *cr, const double *ci, size_t size) { + // DSPDoubleSplitComplex aSplit = {ar, ai}; + // DSPDoubleSplitComplex bSplit = {(double *)br, (double *)bi}; + // DSPDoubleSplitComplex cSplit = {(double *)cr, (double *)ci}; + // vDSP_zvmulD(&cSplit, 1, &bSplit, 1, &aSplit, 1, size, -1); + //} +} + +template<> +struct Pow2FFT { + static constexpr bool prefersSplit = true; // whether this FFT implementation is faster when given split-complex inputs + + Pow2FFT(size_t size = 0) { + if(size > 0) { + resize(size); + } + } + + void resize(size_t size) { + m_state = std::make_unique>(size); + } + + void fft(const std::complex* input, std::complex* output) { + auto* ippComplexIn = reinterpret_cast(input); + auto* ippComplexOut = reinterpret_cast(output); + const auto status = ippsFFTFwd_CToC_32fc(ippComplexIn, ippComplexOut, m_state->spec, m_state->workBuffer); + checkStatus(status); + } + + void fft(const float *inR, const float *inI, float *outR, float *outI) { + const auto status = ippsFFTFwd_CToC_32f((const Ipp32f*)inR, (const Ipp32f*)inI, (Ipp32f*)outR, (Ipp32f*)outI, m_state->specSplit, m_state->workBuffer); + checkStatus(status); + } + + void ifft(const std::complex* input, std::complex* output) { + auto* ippComplexIn = reinterpret_cast(input); + auto* ippComplexOut = reinterpret_cast(output); + const auto status = ippsFFTInv_CToC_32fc(ippComplexIn, ippComplexOut, m_state->spec, m_state->workBuffer); + checkStatus(status); + } + + void ifft(const float *inR, const float *inI, float *outR, float *outI) { + const auto status = ippsFFTInv_CToC_32f((const Ipp32f*)inR, (const Ipp32f*)inI, (Ipp32f*)outR, (Ipp32f*)outI, m_state->specSplit, m_state->workBuffer); + checkStatus(status); + } + +private: + std::unique_ptr> m_state{ nullptr }; +}; + +template<> +struct Pow2RealFFT { + static constexpr bool prefersSplit = false; // whether this FFT implementation is faster when given split-complex inputs + + Pow2RealFFT(size_t size = 0) { + if(size > 0) { + resize(size); + } + } + + void resize(size_t size) { + hSize = size/2; + size = hSize * 2; + m_state = std::make_unique>(size); + + // Either use the existing working buffer (if it's big enough), or allocate a new one using std::vector<> + size_t complexAlign = alignof(std::complex); + size_t work = size_t(m_state->workBuffer); + size_t aligned = (work + (complexAlign - 1)) & ~(complexAlign - 1); + + if (work + m_state->workSize - aligned >= size * sizeof(std::complex)) { + tmp = (std::complex *) aligned; + tmpVector.resize(0); + } else { + tmpVector.resize(size); + tmp = tmpVector.data(); + } + } + + void fft(const float *input, std::complex* output) { + const auto status = ippsFFTFwd_RToPerm_32f((const Ipp32f*)input, (Ipp32f*)output, m_state->spec, m_state->workBuffer); + checkStatus(status); + } + + void fft(const float *inR, float *outR, float *outI) { + const auto status = ippsFFTFwd_RToPerm_32f((const Ipp32f*)inR, (Ipp32f*)tmp, m_state->spec, m_state->workBuffer); + checkStatus(status); + for (size_t i = 0; i < hSize; ++i) { + outR[i] = tmp[i].real(); + outI[i] = tmp[i].imag(); + } + } + + void ifft(const std::complex* input, float *output) { + const auto status = ippsFFTInv_PermToR_32f((const Ipp32f*)input, (Ipp32f*)output, m_state->spec, m_state->workBuffer); + checkStatus(status); + } + + void ifft(const float *inR, const float *inI, float *outR) { + for (size_t i = 0; i < hSize; ++i) { + tmp[i] = { inR[i], inI[i] }; + } + const auto status = ippsFFTInv_PermToR_32f((const Ipp32f*)tmp, (Ipp32f*)outR, m_state->spec, m_state->workBuffer); + checkStatus(status); + } + +private: + std::unique_ptr> m_state{ nullptr }; + std::complex* tmp = nullptr; + std::vector> tmpVector; + size_t hSize; +}; + +template<> +struct Pow2FFT { + static constexpr bool prefersSplit = true; // whether this FFT implementation is faster when given split-complex inputs + + Pow2FFT(size_t size = 0) { + if(size > 0) { + resize(size); + } + } + + void resize(size_t size) { + m_state = std::make_unique>(size); + } + + void fft(const std::complex* input, std::complex* output) { + auto* ippComplexIn = reinterpret_cast(input); + auto* ippComplexOut = reinterpret_cast(output); + const auto status = ippsFFTFwd_CToC_64fc(ippComplexIn, ippComplexOut, m_state->spec, m_state->workBuffer); + checkStatus(status); + } + + void fft(const double *inR, const double *inI, double *outR, double *outI) { + const auto status = ippsFFTFwd_CToC_64f((const Ipp64f*)inR, (const Ipp64f*)inI, (Ipp64f*)outR, (Ipp64f*)outI, m_state->specSplit, m_state->workBuffer); + checkStatus(status); + } + + void ifft(const std::complex* input, std::complex* output) { + auto* ippComplexIn = reinterpret_cast(input); + auto* ippComplexOut = reinterpret_cast(output); + const auto status = ippsFFTInv_CToC_64fc(ippComplexIn, ippComplexOut, m_state->spec, m_state->workBuffer); + checkStatus(status); + } + + void ifft(const double *inR, const double *inI, double *outR, double *outI) { + const auto status = ippsFFTInv_CToC_64f((const Ipp64f*)inR, (const Ipp64f*)inI, (Ipp64f*)outR, (Ipp64f*)outI, m_state->specSplit, m_state->workBuffer); + checkStatus(status); + } + +private: + std::unique_ptr> m_state{ nullptr }; +}; + +template<> +struct Pow2RealFFT { + static constexpr bool prefersSplit = false; // whether this FFT implementation is faster when given split-complex inputs + + Pow2RealFFT(size_t size = 0) { + if(size > 0) { + resize(size); + } + } + + void resize(size_t size) { + hSize = size/2; + size = hSize * 2; + m_state = std::make_unique>(size); + + // Either use the existing working buffer (if it's big enough), or allocate a new one using std::vector<> + size_t complexAlign = alignof(std::complex); + size_t work = size_t(m_state->workBuffer); + size_t aligned = (work + (complexAlign - 1)) & ~(complexAlign - 1); + + if (work + m_state->workSize - aligned >= size * sizeof(std::complex)) { + tmp = (std::complex *) aligned; + tmpVector.resize(0); + } else { + tmpVector.resize(size); + tmp = tmpVector.data(); + } + } + + void fft(const double *input, std::complex* output) { + const auto status = ippsFFTFwd_RToPerm_64f((const Ipp64f*)input, (Ipp64f*)output, m_state->spec, m_state->workBuffer); + checkStatus(status); + } + + void fft(const double *inR, double *outR, double *outI) { + const auto status = ippsFFTFwd_RToPerm_64f((const Ipp64f*)inR, (Ipp64f*)tmp, m_state->spec, m_state->workBuffer); + checkStatus(status); + for (size_t i = 0; i < hSize; ++i) { + outR[i] = tmp[i].real(); + outI[i] = tmp[i].imag(); + } + } + + void ifft(const std::complex* input, double *output) { + const auto status = ippsFFTInv_PermToR_64f((const Ipp64f*)input, (Ipp64f*)output, m_state->spec, m_state->workBuffer); + checkStatus(status); + } + + void ifft(const double *inR, const double *inI, double *outR) { + for (size_t i = 0; i < hSize; ++i) { + tmp[i] = { inR[i], inI[i] }; + } + const auto status = ippsFFTInv_PermToR_64f((const Ipp64f*)tmp, (Ipp64f*)outR, m_state->spec, m_state->workBuffer); + checkStatus(status); + } + +private: + std::unique_ptr> m_state{ nullptr }; + std::complex* tmp = nullptr; + std::vector> tmpVector; + size_t hSize; +}; + +}} + +#endif // include guard diff --git a/thirdparty/signalsmith-linear/platform/fft-pffft-double.h b/thirdparty/signalsmith-linear/platform/fft-pffft-double.h new file mode 100644 index 0000000000..fe4be12407 --- /dev/null +++ b/thirdparty/signalsmith-linear/platform/fft-pffft-double.h @@ -0,0 +1,221 @@ +#ifndef SIGNALSMITH_LINEAR_PLATFORM_FFT_PFFFTDOUBLE_H +#define SIGNALSMITH_LINEAR_PLATFORM_FFT_PFFFTDOUBLE_H + +#if defined(__has_include) && !__has_include("pffft/pffft_double.h") +# include "pffft_double.h" +#else +# include "pffft/pffft_double.h" +#endif + +#include +#include +#include +#include + +namespace signalsmith { namespace linear { + +template<> +struct Pow2FFT { + static constexpr bool prefersSplit = false; + + using Complex = std::complex; + + Pow2FFT(size_t size=0) { + resize(size); + } + ~Pow2FFT() { + resize(0); // frees everything + } + // Allow move, but not copy + Pow2FFT(const Pow2FFT &other) = delete; + Pow2FFT(Pow2FFT &&other) : _size(other._size), hasSetup(other.hasSetup), fftSetup(other.fftSetup), fallback(std::move(other.fallback)), work(other.work), tmpAligned(other.tmpAligned) { + // Avoid double-free + other.hasSetup = false; + other.work = nullptr; + other.tmpAligned = nullptr; + } + + void resize(size_t size) { + _size = size; + fallback = nullptr; + if (hasSetup) pffftd_destroy_setup(fftSetup); + if (work) pffftd_aligned_free(work); + work = nullptr; + if (tmpAligned) pffftd_aligned_free(tmpAligned); + tmpAligned = nullptr; + + // We use this for split-real, even if there's no PFFFT setup + tmpAligned = (double *)pffftd_aligned_malloc(sizeof(double)*size*2); + + if (size < 16) { + // PFFFT doesn't support smaller sizes + hasSetup = false; + fallback = std::unique_ptr>{ + new SimpleFFT(size) + }; + return; + } + + work = (double *)pffftd_aligned_malloc(sizeof(double)*size*2); + fftSetup = pffftd_new_setup(int(size), PFFFT_COMPLEX); + hasSetup = fftSetup; + } + + void fft(const Complex* input, Complex* output) { + if (fallback) return fallback->fft(input, output); + fftInner(input, output, PFFFT_FORWARD); + } + void fft(const double *inR, const double *inI, double *outR, double *outI) { + if (fallback) return fallback->fft(inR, inI, outR, outI); + fftInner(inR, inI, outR, outI, PFFFT_FORWARD); + } + + void ifft(const Complex* input, Complex* output) { + if (fallback) return fallback->ifft(input, output); + fftInner(input, output, PFFFT_BACKWARD); + } + void ifft(const double *inR, const double *inI, double *outR, double *outI) { + if (fallback) return fallback->ifft(inR, inI, outR, outI); + fftInner(inR, inI, outR, outI, PFFFT_BACKWARD); + } + +private: + void fftInner(const Complex *input, Complex *output, pffft_direction_t direction) { + // 32-byte alignment + if (size_t(input)&0x1F) { + // `tmpAligned` is always aligned, so copy into that + std::memcpy(tmpAligned, input, sizeof(Complex)*_size); + input = (const Complex *)tmpAligned; + } + if (size_t(output)&0x1F) { + // Output to `tmpAligned` - might be in-place if input is unaligned, but that's fine + pffftd_transform_ordered(fftSetup, (const double *)input, tmpAligned, work, direction); + std::memcpy(output, tmpAligned, sizeof(Complex)*_size); + } else { + pffftd_transform_ordered(fftSetup, (const double *)input, (double *)output, work, direction); + } + } + void fftInner(const double *inR, const double *inI, double *outR, double *outI, pffft_direction_t direction) { + for (size_t i = 0; i < _size; ++i) { + tmpAligned[2*i] = inR[i]; + tmpAligned[2*i + 1] = inI[i]; + } + // PFFFT supports in-place transforms + fftInner((const Complex *)tmpAligned, (Complex *)tmpAligned, direction); + // Un-interleave + for (size_t i = 0; i < _size; ++i) { + outR[i] = tmpAligned[2*i]; + outI[i] = tmpAligned[2*i + 1]; + } + } + + size_t _size = 0; + bool hasSetup = false; + PFFFTD_Setup *fftSetup = nullptr; + std::unique_ptr> fallback; + double *work = nullptr, *tmpAligned = nullptr; +}; + +template<> +struct Pow2RealFFT { +private: + using FallbackFFT = SimpleRealFFT; // this wraps the complex one +public: + static constexpr bool prefersSplit = false; + + using Complex = std::complex; + + Pow2RealFFT(size_t size=0) { + resize(size); + } + ~Pow2RealFFT() { + resize(0); + } + // Allow move, but not copy + Pow2RealFFT(const Pow2RealFFT &other) = delete; + Pow2RealFFT(Pow2RealFFT &&other) : _size(other._size), hasSetup(other.hasSetup), fftSetup(other.fftSetup), fallback(std::move(other.fallback)), work(other.work), tmpAligned(other.tmpAligned) { + // Avoid double-free + other.hasSetup = false; + other.work = nullptr; + other.tmpAligned = nullptr; + } + + void resize(size_t size) { + _size = size; + fallback = nullptr; + if (hasSetup) pffftd_destroy_setup(fftSetup); + if (work) pffftd_aligned_free(work); + work = nullptr; + if (tmpAligned) pffftd_aligned_free(tmpAligned); + tmpAligned = nullptr; + + // We use this for split-real, even if there's no PFFFT setup + tmpAligned = (double *)pffftd_aligned_malloc(sizeof(double)*size*2); + + // TODO: just go for it, and check for success before allocating `work` + if (size < 32) { + // PFFFT doesn't support smaller sizes + hasSetup = false; + fallback = std::unique_ptr{ + new FallbackFFT(size) + }; + return; + } + + work = (double *)pffftd_aligned_malloc(sizeof(double)*size); + fftSetup = pffftd_new_setup(int(size), PFFFT_REAL); + hasSetup = fftSetup; + } + + void fft(const double *input, Complex *output) { + if (fallback) return fallback->fft(input, output); + fftInner(input, (double *)output, PFFFT_FORWARD); + } + void fft(const double *inR, double *outR, double *outI) { + if (fallback) return fallback->fft(inR, outR, outI); + fftInner(inR, tmpAligned, PFFFT_FORWARD); + for (size_t i = 0; i < _size/2; ++i) { + outR[i] = tmpAligned[2*i]; + outI[i] = tmpAligned[2*i + 1]; + } + } + + void ifft(const Complex *input, double *output) { + if (fallback) return fallback->ifft(input, output); + fftInner((const double *)input, output, PFFFT_BACKWARD); + } + void ifft(const double *inR, const double *inI, double *outR) { + if (fallback) return fallback->ifft(inR, inI, outR); + for (size_t i = 0; i < _size/2; ++i) { + tmpAligned[2*i] = inR[i]; + tmpAligned[2*i + 1] = inI[i]; + } + fftInner(tmpAligned, outR, PFFFT_BACKWARD); + } + +private: + void fftInner(const double *input, double *output, pffft_direction_t direction) { + // 32-byte alignment + if (size_t(input)&0x1F) { + // `tmpAligned` is always aligned, so copy into that + std::memcpy(tmpAligned, input, sizeof(double)*_size); + input = tmpAligned; + } + if (size_t(output)&0x1F) { + // Output to `tmpAligned` - might be in-place if input is unaligned, but that's fine + pffftd_transform_ordered(fftSetup, input, tmpAligned, work, direction); + std::memcpy(output, tmpAligned, sizeof(double)*_size); + } else { + pffftd_transform_ordered(fftSetup, input, output, work, direction); + } + } + + size_t _size = 0; + bool hasSetup = false; + PFFFTD_Setup *fftSetup = nullptr; + std::unique_ptr fallback; + double *work = nullptr, *tmpAligned = nullptr; +}; + +}} // namespace +#endif // include guard diff --git a/thirdparty/signalsmith-linear/platform/fft-pffft.h b/thirdparty/signalsmith-linear/platform/fft-pffft.h new file mode 100644 index 0000000000..dec42b224b --- /dev/null +++ b/thirdparty/signalsmith-linear/platform/fft-pffft.h @@ -0,0 +1,221 @@ +#ifndef SIGNALSMITH_LINEAR_PLATFORM_FFT_PFFFT_H +#define SIGNALSMITH_LINEAR_PLATFORM_FFT_PFFFT_H + +#if defined(__has_include) && !__has_include("pffft/pffft.h") +# include "pffft.h" +#else +# include "pffft/pffft.h" +#endif + +#include +#include +#include +#include + +namespace signalsmith { namespace linear { + +template<> +struct Pow2FFT { + static constexpr bool prefersSplit = false; + + using Complex = std::complex; + + Pow2FFT(size_t size=0) { + resize(size); + } + ~Pow2FFT() { + resize(0); // frees everything + } + // Allow move, but not copy + Pow2FFT(const Pow2FFT &other) = delete; + Pow2FFT(Pow2FFT &&other) : _size(other._size), hasSetup(other.hasSetup), fftSetup(other.fftSetup), fallback(std::move(other.fallback)), work(other.work), tmpAligned(other.tmpAligned) { + // Avoid double-free + other.hasSetup = false; + other.work = nullptr; + other.tmpAligned = nullptr; + } + + void resize(size_t size) { + _size = size; + fallback = nullptr; + if (hasSetup) pffft_destroy_setup(fftSetup); + if (work) pffft_aligned_free(work); + work = nullptr; + if (tmpAligned) pffft_aligned_free(tmpAligned); + tmpAligned = nullptr; + + // We use this for split-real, even if there's no PFFFT setup + tmpAligned = (float *)pffft_aligned_malloc(sizeof(float)*size*2); + + if (size < 16) { + // PFFFT doesn't support smaller sizes + hasSetup = false; + fallback = std::unique_ptr>{ + new SimpleFFT(size) + }; + return; + } + + work = (float *)pffft_aligned_malloc(sizeof(float)*size*2); + fftSetup = pffft_new_setup(int(size), PFFFT_COMPLEX); + hasSetup = fftSetup; + } + + void fft(const Complex* input, Complex* output) { + if (fallback) return fallback->fft(input, output); + fftInner(input, output, PFFFT_FORWARD); + } + void fft(const float *inR, const float *inI, float *outR, float *outI) { + if (fallback) return fallback->fft(inR, inI, outR, outI); + fftInner(inR, inI, outR, outI, PFFFT_FORWARD); + } + + void ifft(const Complex* input, Complex* output) { + if (fallback) return fallback->ifft(input, output); + fftInner(input, output, PFFFT_BACKWARD); + } + void ifft(const float *inR, const float *inI, float *outR, float *outI) { + if (fallback) return fallback->ifft(inR, inI, outR, outI); + fftInner(inR, inI, outR, outI, PFFFT_BACKWARD); + } + +private: + void fftInner(const Complex *input, Complex *output, pffft_direction_t direction) { + // 16-byte alignment + if (size_t(input)&0x0F) { + // `tmpAligned` is always aligned, so copy into that + std::memcpy(tmpAligned, input, sizeof(Complex)*_size); + input = (const Complex *)tmpAligned; + } + if (size_t(output)&0x0F) { + // Output to `tmpAligned` - might be in-place if input is unaligned, but that's fine + pffft_transform_ordered(fftSetup, (const float *)input, tmpAligned, work, direction); + std::memcpy(output, tmpAligned, sizeof(Complex)*_size); + } else { + pffft_transform_ordered(fftSetup, (const float *)input, (float *)output, work, direction); + } + } + void fftInner(const float *inR, const float *inI, float *outR, float *outI, pffft_direction_t direction) { + for (size_t i = 0; i < _size; ++i) { + tmpAligned[2*i] = inR[i]; + tmpAligned[2*i + 1] = inI[i]; + } + // PFFFT supports in-place transforms + fftInner((const Complex *)tmpAligned, (Complex *)tmpAligned, direction); + // Un-interleave + for (size_t i = 0; i < _size; ++i) { + outR[i] = tmpAligned[2*i]; + outI[i] = tmpAligned[2*i + 1]; + } + } + + size_t _size = 0; + bool hasSetup = false; + PFFFT_Setup *fftSetup = nullptr; + std::unique_ptr> fallback; + float *work = nullptr, *tmpAligned = nullptr; +}; + +template<> +struct Pow2RealFFT { +private: + using FallbackFFT = SimpleRealFFT; // this wraps the complex one +public: + static constexpr bool prefersSplit = false; + + using Complex = std::complex; + + Pow2RealFFT(size_t size=0) { + resize(size); + } + ~Pow2RealFFT() { + resize(0); + } + // Allow move, but not copy + Pow2RealFFT(const Pow2RealFFT &other) = delete; + Pow2RealFFT(Pow2RealFFT &&other) : _size(other._size), hasSetup(other.hasSetup), fftSetup(other.fftSetup), fallback(std::move(other.fallback)), work(other.work), tmpAligned(other.tmpAligned) { + // Avoid double-free + other.hasSetup = false; + other.work = nullptr; + other.tmpAligned = nullptr; + } + + void resize(size_t size) { + _size = size; + fallback = nullptr; + if (hasSetup) pffft_destroy_setup(fftSetup); + if (work) pffft_aligned_free(work); + work = nullptr; + if (tmpAligned) pffft_aligned_free(tmpAligned); + tmpAligned = nullptr; + + // We use this for split-real, even if there's no PFFFT setup + tmpAligned = (float *)pffft_aligned_malloc(sizeof(float)*size*2); + + // TODO: just go for it, and check for success before allocating `work` + if (size < 32) { + // PFFFT doesn't support smaller sizes + hasSetup = false; + fallback = std::unique_ptr{ + new FallbackFFT(size) + }; + return; + } + + work = (float *)pffft_aligned_malloc(sizeof(float)*size); + fftSetup = pffft_new_setup(int(size), PFFFT_REAL); + hasSetup = fftSetup; + } + + void fft(const float *input, Complex *output) { + if (fallback) return fallback->fft(input, output); + fftInner(input, (float *)output, PFFFT_FORWARD); + } + void fft(const float *inR, float *outR, float *outI) { + if (fallback) return fallback->fft(inR, outR, outI); + fftInner(inR, tmpAligned, PFFFT_FORWARD); + for (size_t i = 0; i < _size/2; ++i) { + outR[i] = tmpAligned[2*i]; + outI[i] = tmpAligned[2*i + 1]; + } + } + + void ifft(const Complex *input, float *output) { + if (fallback) return fallback->ifft(input, output); + fftInner((const float *)input, output, PFFFT_BACKWARD); + } + void ifft(const float *inR, const float *inI, float *outR) { + if (fallback) return fallback->ifft(inR, inI, outR); + for (size_t i = 0; i < _size/2; ++i) { + tmpAligned[2*i] = inR[i]; + tmpAligned[2*i + 1] = inI[i]; + } + fftInner(tmpAligned, outR, PFFFT_BACKWARD); + } + +private: + void fftInner(const float *input, float *output, pffft_direction_t direction) { + // 16-byte alignment + if (size_t(input)&0x0F) { + // `tmpAligned` is always aligned, so copy into that + std::memcpy(tmpAligned, input, sizeof(float)*_size); + input = tmpAligned; + } + if (size_t(output)&0x0F) { + // Output to `tmpAligned` - might be in-place if input is unaligned, but that's fine + pffft_transform_ordered(fftSetup, input, tmpAligned, work, direction); + std::memcpy(output, tmpAligned, sizeof(float)*_size); + } else { + pffft_transform_ordered(fftSetup, input, output, work, direction); + } + } + + size_t _size = 0; + bool hasSetup = false; + PFFFT_Setup *fftSetup = nullptr; + std::unique_ptr fallback; + float *work = nullptr, *tmpAligned = nullptr; +}; + +}} // namespace +#endif // include guard diff --git a/thirdparty/signalsmith-linear/platform/linear-accelerate.h b/thirdparty/signalsmith-linear/platform/linear-accelerate.h new file mode 100644 index 0000000000..5bd65b4d5c --- /dev/null +++ b/thirdparty/signalsmith-linear/platform/linear-accelerate.h @@ -0,0 +1,575 @@ +//#define ACCELERATE_NEW_LAPACK + +// If possible, only include vecLib, since JUCE has conflicts with vImage +#if defined(__has_include) && __has_include() +# include +#else +# include +#endif + +#include // std::memcpy +#include // We log to stderr when an expression is unoptimised + +namespace signalsmith { namespace linear { + +template +static std::string typeName() { + return typeid(T).name(); +} +template<> +std::string typeName() { + return "float"; +} +template<> +std::string typeName() { + return "double"; +} +template<> +std::string typeName>() { + return "complex"; +} +template<> +std::string typeName>() { + return "complex"; +} +template<> +std::string typeName>() { + return "const complex"; +} +template<> +std::string typeName>() { + return "const complex"; +} +static size_t _basicFillWarningCounter = 1; +static void basicFillWarningReset() { + // All warnings will print again + ++_basicFillWarningCounter; +} +template +static void basicFillWarning() { + static size_t counter = 0; + if (counter != _basicFillWarningCounter) { + counter = _basicFillWarningCounter; + std::cerr << "used basic .fill() for " << Expr::name() << " -> " << typeName().get(0))>() << "\n"; + } +} + +template<> +struct LinearImpl : public LinearImplBase { + using Base = LinearImplBase; + + LinearImpl() : Base(this), cached(*this) { + basicFillWarningReset(); + } + + template + void reserve(size_t) {} + + template<> + void reserve(size_t size) { + cached.reserveFloats(size*4); + } + template<> + void reserve(size_t size) { + cached.reserveDoubles(size*4); + } + + template + void fill(Pointer pointer, Expr expr, size_t size) { + fillExpr(pointer, expr, size); + } + + template + void fill(Pointer pointer, Expression expr, size_t size) { + return fillExpr(pointer, (Expr &)expr, size); + }; + template + void fill(Pointer pointer, WritableExpression expr, size_t size) { + return fillExpr(pointer, (Expr &)expr, size); + }; + +private: + // Most generic fill + template + void fillExpr(Pointer pointer, Expr expr, size_t size) { + fillBasic(pointer, expr, size); + } + + template + void fillBasic(Pointer pointer, Expr expr, size_t size) { + basicFillWarning(); + for (size_t i = 0; i < size; ++i) { + pointer[i] = expr.get(i); + } + } + template + void fillBasic(SplitPointer pointer, Expr expr, size_t size) { + basicFillWarning(); + using Complex = typename SplitPointer::Complex; + for (size_t i = 0; i < size; ++i) { + Complex c = expr.get(i); + pointer.real[i] = c.real(); + pointer.imag[i] = c.imag(); + } + } + + template + void clear(V *v, size_t size) { + for (size_t i = 0; i < size; ++i) v[i] = 0; + } + void clear(float *v, size_t size) { + vDSP_vclr(v, 1, size); + } + void clear(double *v, size_t size) { + vDSP_vclrD(v, 1, size); + } + // Filling a split-complex vector with real values won't hit the specialisations below, so we handle it here + template + ItemType fillBasic(SplitPointer pointer, Expr expr, size_t size) { + fillExpr(pointer.real, expr, size); + clear(pointer.imag, size); + } + template + ItemType fillBasic(SplitPointer pointer, Expr expr, size_t size) { + fillExpr(pointer.real, expr, size); + clear(pointer.imag, size); + } + + // Copying from existing pointer + void fillExpr(RealPointer pointer, expression::ReadableReal expr, size_t size) { + std::memcpy(pointer, expr.pointer, size*sizeof(float)); + } + void fillExpr(RealPointer pointer, WritableReal expr, size_t size) { + std::memcpy(pointer, expr.pointer, size*sizeof(float)); + } + void fillExpr(RealPointer pointer, expression::ReadableReal expr, size_t size) { + std::memcpy(pointer, expr.pointer, size*sizeof(double)); + } + void fillExpr(RealPointer pointer, WritableReal expr, size_t size) { + std::memcpy(pointer, expr.pointer, size*sizeof(double)); + } + void fillExpr(ComplexPointer pointer, expression::ReadableComplex expr, size_t size) { + std::memcpy(pointer, expr.pointer, size*sizeof(std::complex)); + } + void fillExpr(ComplexPointer pointer, WritableComplex expr, size_t size) { + std::memcpy(pointer, expr.pointer, size*sizeof(std::complex)); + } + void fillExpr(ComplexPointer pointer, expression::ReadableComplex expr, size_t size) { + std::memcpy(pointer, expr.pointer, size*sizeof(std::complex)); + } + void fillExpr(ComplexPointer pointer, WritableComplex expr, size_t size) { + std::memcpy(pointer, expr.pointer, size*sizeof(std::complex)); + } + + // Filling with a constant + template + void fillExpr(RealPointer pointer, expression::ConstantExpr expr, size_t size) { + float v = expr.value; + vDSP_vfill(&v, pointer, 1, size); + } + template + void fillExpr(RealPointer pointer, expression::ConstantExpr expr, size_t size) { + double v = expr.value; + vDSP_vfillD(&v, pointer, 1, size); + } + template + void fillExpr(ComplexPointer pointer, expression::ConstantExpr expr, size_t size) { + std::complex v = expr.value; + auto vSplit = dspSplit(&v); + auto pSplit = dspSplit(pointer); + vDSP_zvfill(&vSplit, &pSplit, 2, size); + } + template + void fillExpr(ComplexPointer pointer, expression::ConstantExpr expr, size_t size) { + std::complex v = expr.value; + auto vSplit = dspSplit(&v); + auto pSplit = dspSplit(pointer); + vDSP_zvfillD(&vSplit, &pSplit, 2, size); + } + template + void fillExpr(SplitPointer pointer, expression::ConstantExpr expr, size_t size) { + std::complex v = expr.value; + float vr = v.real(), vi = v.imag(); + auto vSplit = dspSplit({&vr, &vi}); + auto pSplit = dspSplit(pointer); + vDSP_zvfill(&vSplit, &pSplit, 1, size); + } + template + void fillExpr(SplitPointer pointer, expression::ConstantExpr expr, size_t size) { + std::complex v = expr.value; + double vr = v.real(), vi = v.imag(); + auto vSplit = dspSplit({&vr, &vi}); + auto pSplit = dspSplit(pointer); + vDSP_zvfillD(&vSplit, &pSplit, 1, size); + } + +// Forwards .fillExpr() to .fillName(), but doesn't define that +#define SIGNALSMITH_AUDIO_LINEAR_OP1_R(Name) \ + template \ + void fillExpr(RealPointer pointer, expression::Name expr, size_t size) { \ + fill##Name(pointer, expr, size); \ + } \ + template \ + void fillExpr(RealPointer pointer, expression::Name expr, size_t size) { \ + fill##Name(pointer, expr, size); \ + } \ + template \ + void fill##Name(RealPointer pointer, Expr expr, size_t size) { \ + fillBasic(pointer, expr, size); \ + } \ + template \ + void fill##Name(RealPointer pointer, Expr expr, size_t size) { \ + fillBasic(pointer, expr, size); \ + } +#define SIGNALSMITH_AUDIO_LINEAR_OP1_C(Name) \ + template \ + void fillExpr(ComplexPointer pointer, expression::Name expr, size_t size) { \ + fill##Name(pointer, expr, size); \ + } \ + template \ + void fillExpr(ComplexPointer pointer, expression::Name expr, size_t size) { \ + fill##Name(pointer, expr, size); \ + } \ + template \ + void fill##Name(ComplexPointer pointer, Expr expr, size_t size) { \ + fillBasic(pointer, expr, size); \ + } \ + template \ + void fill##Name(ComplexPointer pointer, Expr expr, size_t size) { \ + fillBasic(pointer, expr, size); \ + } \ + template \ + void fillExpr(SplitPointer pointer, expression::Name expr, size_t size) { \ + fill##Name(pointer, expr, size); \ + } \ + template \ + void fillExpr(SplitPointer pointer, expression::Name expr, size_t size) { \ + fill##Name(pointer, expr, size); \ + } \ + template \ + void fill##Name(SplitPointer pointer, Expr expr, size_t size) { \ + fillBasic(pointer, expr, size); \ + } \ + template \ + void fill##Name(SplitPointer pointer, Expr expr, size_t size) { \ + fillBasic(pointer, expr, size); \ + } +// R -> R operators +#define SIGNALSMITH_AUDIO_LINEAR_TREE1_RR(Name, vDSP_func) \ + template \ + ItemType fill##Name(RealPointer pointer, expression::Name expr, size_t size) { \ + auto floats = cached.floatScope(); \ + auto *a = floats.real(expr.a, size); \ + vDSP_func(a, 1, pointer, 1, size); \ + } \ + template \ + ItemType fill##Name(RealPointer pointer, expression::Name expr, size_t size) { \ + auto doubles = cached.doubleScope(); \ + auto *a = doubles.real(expr.a, size); \ + vDSP_func##D(a, 1, pointer, 1, size); \ + } +// C -> R operators +#define SIGNALSMITH_AUDIO_LINEAR_TREE1_RC(Name, vDSP_func) \ + template \ + ItemType, void> fill##Name(RealPointer pointer, expression::Name expr, size_t size) { \ + auto floats = cached.floatScope(); \ + auto a = dspSplit(floats.split(expr.a, size)); \ + vDSP_func(&a, 1, pointer, 1, size); \ + } \ + template \ + ItemType, void> fill##Name(RealPointer pointer, expression::Name expr, size_t size) { \ + auto doubles = cached.doubleScope(); \ + auto a = dspSplit(doubles.split(expr.a, size)); \ + vDSP_func##D(&a, 1, pointer, 1, size); \ + } +// C -> C operators +#define SIGNALSMITH_AUDIO_LINEAR_TREE1_CC(Name, vDSP_func) \ + template \ + ItemType, void> fill##Name(ComplexPointer pointer, expression::Name expr, size_t size) { \ + auto floats = cached.floatScope(); \ + auto a = dspSplit(floats.complex(expr.a, size)); \ + auto p = dspSplit(pointer); \ + vDSP_func(&a, 2, &p, 2, size); \ + } \ + template \ + ItemType, void> fill##Name(ComplexPointer pointer, expression::Name expr, size_t size) { \ + auto doubles = cached.doubleScope(); \ + auto a = dspSplit(doubles.complex(expr.a, size)); \ + auto p = dspSplit(pointer); \ + vDSP_func##D(&a, 2, &p, 2, size); \ + }\ + template \ + ItemType, void> fill##Name(SplitPointer pointer, expression::Name expr, size_t size) { \ + auto floats = cached.floatScope(); \ + auto a = dspSplit(floats.split(expr.a, size)); \ + auto p = dspSplit(pointer); \ + vDSP_func(&a, 1, &p, 1, size); \ + } \ + template \ + ItemType, void> fill##Name(SplitPointer pointer, expression::Name expr, size_t size) { \ + auto doubles = cached.doubleScope(); \ + auto a = dspSplit(doubles.split(expr.a, size)); \ + auto p = dspSplit(pointer); \ + vDSP_func##D(&a, 1, &p, 1, size); \ + } + SIGNALSMITH_AUDIO_LINEAR_OP1_R(Abs) + SIGNALSMITH_AUDIO_LINEAR_TREE1_RC(Abs, vDSP_zvabs); + SIGNALSMITH_AUDIO_LINEAR_OP1_R(Neg) + SIGNALSMITH_AUDIO_LINEAR_OP1_C(Neg) + SIGNALSMITH_AUDIO_LINEAR_TREE1_RR(Neg, vDSP_vneg); + SIGNALSMITH_AUDIO_LINEAR_TREE1_CC(Neg, vDSP_zvneg); +#undef SIGNALSMITH_AUDIO_LINEAR_TREE1_RR +#undef SIGNALSMITH_AUDIO_LINEAR_TREE1_RC +#undef SIGNALSMITH_AUDIO_LINEAR_TREE1_CC + +// vForce stuff +// R -> R operators +#define SIGNALSMITH_AUDIO_LINEAR_TREE1_RR(Name, vForce_float, vForce_double) \ + template \ + ItemType fill##Name(RealPointer pointer, expression::Name expr, size_t size) { \ + auto floats = cached.floatScope(); \ + auto *a = floats.real(expr.a, size, pointer); \ + int intSize = int(size); \ + vForce_float(pointer, a, &intSize); \ + } \ + template \ + ItemType fill##Name(RealPointer pointer, expression::Name expr, size_t size) { \ + auto doubles = cached.doubleScope(); \ + auto *a = doubles.real(expr.a, size, pointer); \ + int intSize = int(size); \ + vForce_double(pointer, a, &intSize); \ + } + SIGNALSMITH_AUDIO_LINEAR_TREE1_RR(Abs, vvfabsf, vvfabs); + SIGNALSMITH_AUDIO_LINEAR_OP1_R(Exp) + SIGNALSMITH_AUDIO_LINEAR_TREE1_RR(Exp, vvexpf, vvexp); + SIGNALSMITH_AUDIO_LINEAR_OP1_R(Exp2) + SIGNALSMITH_AUDIO_LINEAR_TREE1_RR(Exp2, vvexp2f, vvexp2); + SIGNALSMITH_AUDIO_LINEAR_OP1_R(Log) + SIGNALSMITH_AUDIO_LINEAR_TREE1_RR(Log, vvlogf, vvlog); + SIGNALSMITH_AUDIO_LINEAR_OP1_R(Log2) + SIGNALSMITH_AUDIO_LINEAR_TREE1_RR(Log2, vvlog2f, vvlog2); + SIGNALSMITH_AUDIO_LINEAR_OP1_R(Log10) + SIGNALSMITH_AUDIO_LINEAR_TREE1_RR(Log10, vvlog10f, vvlog10); + SIGNALSMITH_AUDIO_LINEAR_OP1_R(Sqrt) + SIGNALSMITH_AUDIO_LINEAR_TREE1_RR(Sqrt, vvsqrtf, vvsqrt); + SIGNALSMITH_AUDIO_LINEAR_OP1_R(Ceil) + SIGNALSMITH_AUDIO_LINEAR_TREE1_RR(Ceil, vvceilf, vvceil); + SIGNALSMITH_AUDIO_LINEAR_OP1_R(Floor) + SIGNALSMITH_AUDIO_LINEAR_TREE1_RR(Floor, vvfloorf, vvfloor); +#undef SIGNALSMITH_AUDIO_LINEAR_OP1_R + +// Forwards .fillExpr() to .fillName(), but doesn't define that +#define SIGNALSMITH_AUDIO_LINEAR_OP2_R(Name) \ +public: \ + template \ + void fillExpr(RealPointer pointer, expression::Name expr, size_t size) { \ + fill##Name(pointer, expr, size); \ + } \ + template \ + void fillExpr(RealPointer pointer, expression::Name expr, size_t size) { \ + fill##Name(pointer, expr, size); \ + } \ +private: +// R x R -> R operators where the vDSP function arguments are the other way around for some reason +#define SIGNALSMITH_AUDIO_LINEAR_TREE2FLIP_RRR(Name, vDSP_func) \ + template \ + void fill##Name(RealPointer pointer, expression::Name expr, size_t size) { \ + auto floats = cached.floatScope(); \ + auto *a = floats.real(expr.a, size, pointer); \ + auto *b = floats.real(expr.b, size); \ + vDSP_func(b, 1, a, 1, pointer, 1, size); \ + } \ + template \ + void fill##Name(RealPointer pointer, expression::Name expr, size_t size) { \ + auto doubles = cached.doubleScope(); \ + auto *a = doubles.real(expr.a, size, pointer); \ + auto *b = doubles.real(expr.b, size); \ + vDSP_func##D(b, 1, a, 1, pointer, 1, size); \ + } +#define SIGNALSMITH_AUDIO_LINEAR_TREE2COMM_RRkR(Name, vDSP_func) \ + template \ + void fill##Name(RealPointer pointer, expression::Name> expr, size_t size) { \ + auto floats = cached.floatScope(); \ + auto *a = floats.real(expr.a, size, pointer); \ + float b = expr.b.value; \ + vDSP_func(a, 1, &b, pointer, 1, size); \ + } \ + template \ + void fill##Name(RealPointer pointer, expression::Name> expr, size_t size) { \ + auto doubles = cached.doubleScope(); \ + auto *a = doubles.real(expr.a, size, pointer); \ + double b = expr.b.value; \ + vDSP_func##D(a, 1, &b, pointer, 1, size); \ + } \ + template \ + void fill##Name(RealPointer pointer, expression::Name, B> expr, size_t size) { \ + auto floats = cached.floatScope(); \ + float a = expr.a.value; \ + auto *b = floats.real(expr.b, size, pointer); \ + vDSP_func(b, 1, &a, pointer, 1, size); \ + } \ + template \ + void fill##Name(RealPointer pointer, expression::Name, B> expr, size_t size) { \ + auto doubles = cached.doubleScope(); \ + double a = expr.a.value; \ + auto *b = doubles.real(expr.b, size, pointer); \ + vDSP_func##D(b, 1, &a, pointer, 1, size); \ + } + SIGNALSMITH_AUDIO_LINEAR_OP2_R(Add) + SIGNALSMITH_AUDIO_LINEAR_TREE2FLIP_RRR(Add, vDSP_vadd); + SIGNALSMITH_AUDIO_LINEAR_TREE2COMM_RRkR(Add, vDSP_vsadd); + SIGNALSMITH_AUDIO_LINEAR_OP2_R(Sub) + SIGNALSMITH_AUDIO_LINEAR_TREE2FLIP_RRR(Sub, vDSP_vsub); + template + void fillSub(RealPointer pointer, expression::Sub> expr, size_t size) { + expression::ConstantExpr negB{-expr.b.value}; + fillAdd(pointer, expression::makeAdd(expr.a, negB), size); + } + template + void fillSub(RealPointer pointer, expression::Sub> expr, size_t size) { + expression::ConstantExpr negB{-expr.b.value}; + fillAdd(pointer, expression::makeAdd(expr.a, negB), size); + } + template + void fillSub(RealPointer pointer, expression::Sub, A> expr, size_t size) { + expression::ConstantExpr negA{-expr.a.value}; + fillNeg(pointer, expression::makeNeg(expression::makeAdd(expr.b, negA)), size); + } + template + void fillSub(RealPointer pointer, expression::Sub, A> expr, size_t size) { + expression::ConstantExpr negA{-expr.a.value}; + fillNeg(pointer, expression::makeNeg(expression::makeAdd(expr.b, negA)), size); + } + SIGNALSMITH_AUDIO_LINEAR_OP2_R(Mul) + SIGNALSMITH_AUDIO_LINEAR_TREE2FLIP_RRR(Mul, vDSP_vmul); + SIGNALSMITH_AUDIO_LINEAR_TREE2COMM_RRkR(Mul, vDSP_vsmul); + SIGNALSMITH_AUDIO_LINEAR_OP2_R(Div) + template + void fillDiv(RealPointer pointer, expression::Div> expr, size_t size) { + expression::ConstantExpr recipB{1.0f/expr.b.value}; + fillMul(pointer, expression::makeMul(expr.a, recipB), size); + } + template + void fillDiv(RealPointer pointer, expression::Sub> expr, size_t size) { + expression::ConstantExpr recipB{1.0/expr.b.value}; + fillMul(pointer, expression::makeMul(expr.a, recipB), size); + } +#undef SIGNALSMITH_AUDIO_LINEAR_TREE2FLIP_RRR +#undef SIGNALSMITH_AUDIO_LINEAR_TREE2COMM_RRkR + +// R x R -> R operators in vForce +#define SIGNALSMITH_AUDIO_LINEAR_TREE2_RRR(Name, vForce_float, vForce_double) \ + template \ + void fill##Name(RealPointer pointer, expression::Name expr, size_t size) { \ + auto floats = cached.floatScope(); \ + auto *a = floats.real(expr.a, size, pointer); \ + auto *b = floats.real(expr.b, size); \ + int intSize = int(size); \ + vForce_float(pointer, a, b, &intSize); \ + } \ + template \ + void fill##Name(RealPointer pointer, expression::Name expr, size_t size) { \ + auto doubles = cached.doubleScope(); \ + auto *a = doubles.real(expr.a, size, pointer); \ + auto *b = doubles.real(expr.b, size); \ + int intSize = int(size); \ + vForce_double(pointer, a, b, &intSize); \ + } + SIGNALSMITH_AUDIO_LINEAR_TREE2_RRR(Div, vvdivf, vvdiv); +#undef SIGNALSMITH_AUDIO_LINEAR_TREE2_RRR +#undef SIGNALSMITH_AUDIO_LINEAR_OP2_R + +// Forwards .fillName() to .fillNameName2(), but doesn't define that +#define SIGNALSMITH_AUDIO_LINEAR_Op3L_R(Name, NameL) \ + template \ + void fill##Name(RealPointer pointer, expression::Name, C> expr, size_t size) { \ + fill##Name##NameL(pointer, expr, size); \ + } \ + template \ + void fill##Name(RealPointer pointer, expression::Name, C> expr, size_t size) { \ + fill##Name##NameL(pointer, expr, size); \ + } +// (A $ B) $ C +#define SIGNALSMITH_AUDIO_LINEAR_TREE3L_RRR(Name, NameL, vDSP_func) \ + template \ + void fill##Name##NameL(RealPointer pointer, expression::Name, C> expr, size_t size) { \ + auto floats = cached.floatScope(); \ + auto *a = floats.real(expr.a.a, size); \ + auto *b = floats.real(expr.a.b, size); \ + auto *c = floats.real(expr.b, size, pointer); \ + vDSP_func(a, 1, b, 1, c, 1, pointer, 1, size); \ + } \ + template \ + void fill##Name##NameL(RealPointer pointer, expression::Name, C> expr, size_t size) { \ + auto doubles = cached.doubleScope(); \ + auto *a = doubles.real(expr.a.a, size); \ + auto *b = doubles.real(expr.a.b, size); \ + auto *c = doubles.real(expr.b, size, pointer); \ + vDSP_func##D(a, 1, b, 1, c, 1, pointer, 1, size); \ + } + SIGNALSMITH_AUDIO_LINEAR_Op3L_R(Mul, Add) + SIGNALSMITH_AUDIO_LINEAR_TREE3L_RRR(Mul, Add, vDSP_vam) + +#undef SIGNALSMITH_AUDIO_LINEAR_TREE3L_RRR +#undef SIGNALSMITH_AUDIO_LINEAR_Op3L_R + +// SIGNALSMITH_AUDIO_LINEAR_TREE3COMMUTATIVE_RRR(Add, Mul, vDSP_vam, 0) +// SIGNALSMITH_AUDIO_LINEAR_TREE3COMMUTATIVE_RRR(Mul, Add, vDSP_vma, 1) +// SIGNALSMITH_AUDIO_LINEAR_TREE3COMMUTATIVE_RRR(Sub, Mul, vDSP_vsbm, 2) +// SIGNALSMITH_AUDIO_LINEAR_TREE3L_RRR(Mul, Sub, vDSP_vmsb, 3) + +// Forwards .fillExpr() to .fillName(), but doesn't define that +#define SIGNALSMITH_AUDIO_LINEAR_OP2_C(Name) \ +public: \ + template \ + void fillExpr(ComplexPointer pointer, expression::Name expr, size_t size) { \ + fill##Name(pointer, expr, size); \ + } \ + template \ + void fillExpr(ComplexPointer pointer, expression::Name expr, size_t size) { \ + fill##Name(pointer, expr, size); \ + } \ + template \ + void fillExpr(SplitPointer pointer, expression::Name expr, size_t size) { \ + fill##Name(pointer, expr, size); \ + } \ + template \ + void fillExpr(SplitPointer pointer, expression::Name expr, size_t size) { \ + fill##Name(pointer, expr, size); \ + } \ +private: +// SIGNALSMITH_AUDIO_LINEAR_OP2_C(Add) +// SIGNALSMITH_AUDIO_LINEAR_OP2_C(Sub) +// SIGNALSMITH_AUDIO_LINEAR_OP2_C(Mul) +// SIGNALSMITH_AUDIO_LINEAR_OP2_C(Div) + +protected: + CachedResults cached; + + static DSPSplitComplex dspSplit(ConstSplitPointer x) { + DSPSplitComplex dsp; + dsp.realp = (float *)x.real; + dsp.imagp = (float *)x.imag; + return dsp; + } + static DSPSplitComplex dspSplit(ConstComplexPointer x) { + DSPSplitComplex dsp; + dsp.realp = (float *)x; + dsp.imagp = (float *)x + 1; + return dsp; + } + static DSPDoubleSplitComplex dspSplit(ConstSplitPointer x) { + DSPDoubleSplitComplex dsp; + dsp.realp = (double *)x.real; + dsp.imagp = (double *)x.imag; + return dsp; + } + static DSPDoubleSplitComplex dspSplit(ConstComplexPointer x) { + DSPDoubleSplitComplex dsp; + dsp.realp = (double *)x; + dsp.imagp = (double *)x + 1; + return dsp; + } +}; + +}}; // namespace diff --git a/thirdparty/signalsmith-linear/stft.h b/thirdparty/signalsmith-linear/stft.h new file mode 100644 index 0000000000..58e2d8eaf4 --- /dev/null +++ b/thirdparty/signalsmith-linear/stft.h @@ -0,0 +1,631 @@ +#ifndef SIGNALSMITH_AUDIO_LINEAR_STFT_H +#define SIGNALSMITH_AUDIO_LINEAR_STFT_H + +#include "./fft.h" + +namespace signalsmith { namespace linear { + +enum { + STFT_SPECTRUM_PACKED=0, + STFT_SPECTRUM_MODIFIED=1, + STFT_SPECTRUM_UNPACKED=2, +}; + +/// A self-normalising STFT, with variable position/window for output blocks +template +struct DynamicSTFT { + static constexpr bool modified = (spectrumType == STFT_SPECTRUM_MODIFIED); + static constexpr bool unpacked = (spectrumType == STFT_SPECTRUM_UNPACKED); + RealFFT fft; + + using Complex = std::complex; + + enum class WindowShape {ignore, acg, kaiser}; + static constexpr WindowShape acg = WindowShape::acg; + static constexpr WindowShape kaiser = WindowShape::kaiser; + + void configure(size_t inChannels, size_t outChannels, size_t blockSamples, size_t extraInputHistory=0, size_t intervalSamples=0, Sample asymmetry=0) { + _analysisChannels = inChannels; + _synthesisChannels = outChannels; + _blockSamples = blockSamples; + _fftSamples = fft.fastSizeAbove((blockSamples + 1)/2)*2; + fft.resize(_fftSamples); + _fftBins = _fftSamples/2 + (spectrumType == STFT_SPECTRUM_UNPACKED); + + _inputLengthSamples = _blockSamples + extraInputHistory; + input.buffer.resize(_inputLengthSamples*_analysisChannels); + + output.buffer.resize(_blockSamples*_synthesisChannels); + output.windowProducts.resize(_blockSamples); + spectrumBuffer.resize(_fftBins*std::max(_analysisChannels, _synthesisChannels)); + timeBuffer.resize(_fftSamples); + + _analysisWindow.resize(_blockSamples); + _synthesisWindow.resize(_blockSamples); + setInterval(intervalSamples ? intervalSamples : blockSamples/4, acg, asymmetry); + + reset(); + } + + size_t blockSamples() const { + return _blockSamples; + } + size_t fftSamples() const { + return _fftSamples; + } + size_t defaultInterval() const { + return _defaultInterval; + } + size_t bands() const { + return _fftBins; + } + size_t analysisLatency() const { + return _blockSamples - _analysisOffset; + } + size_t synthesisLatency() const { + return _synthesisOffset; + } + size_t latency() const { + return synthesisLatency() + analysisLatency(); + } + Sample binToFreq(Sample b) const { + return (modified ? b + Sample(0.5) : b)/_fftSamples; + } + Sample freqToBin(Sample f) const { + return modified ? f*_fftSamples - Sample(0.5) : f*_fftSamples; + } + + void reset(Sample productWeight=1) { + input.pos = _blockSamples; + output.pos = 0; + for (auto &v : input.buffer) v = 0; + for (auto &v : output.buffer) v = 0; + for (auto &v : spectrumBuffer) v = 0; + for (auto &v : output.windowProducts) v = 0; + addWindowProduct(); + for (int i = int(_blockSamples) - int(_defaultInterval) - 1; i >= 0; --i) { + output.windowProducts[i] += output.windowProducts[i + _defaultInterval]; + } + for (auto &v : output.windowProducts) v = v*productWeight + almostZero; + moveOutput(_defaultInterval); // ready for first block immediately + } + + void writeInput(size_t channel, size_t offset, size_t length, const Sample *inputArray) { + Sample *buffer = input.buffer.data() + channel*_inputLengthSamples; + + size_t offsetPos = (input.pos + offset)%_inputLengthSamples; + size_t inputWrapIndex = _inputLengthSamples - offsetPos; + size_t chunk1 = std::min(length, inputWrapIndex); + for (size_t i = 0; i < chunk1; ++i) { + size_t i2 = offsetPos + i; + buffer[i2] = inputArray[i]; + } + for (size_t i = chunk1; i < length; ++i) { + size_t i2 = i + offsetPos -_inputLengthSamples; + buffer[i2] = inputArray[i]; + } + } + void writeInput(size_t channel, size_t length, const Sample *inputArray) { + writeInput(channel, 0, length, inputArray); + } + void moveInput(size_t samples, bool clearInput=false) { + if (clearInput) { + size_t inputWrapIndex = _inputLengthSamples - input.pos; + size_t chunk1 = std::min(samples, inputWrapIndex); + for (size_t c = 0; c < _analysisChannels; ++c) { + Sample *buffer = input.buffer.data() + c*_inputLengthSamples; + for (size_t i = 0; i < chunk1; ++i) { + size_t i2 = input.pos + i; + buffer[i2] = 0; + } + for (size_t i = chunk1; i < samples; ++i) { + size_t i2 = i + input.pos - _inputLengthSamples; + buffer[i2] = 0; + } + } + } + + input.pos = (input.pos + samples)%_inputLengthSamples; + _samplesSinceAnalysis += samples; + } + size_t samplesSinceAnalysis() const { + return _samplesSinceAnalysis; + } + + /// When no more synthesis is expected, let output taper away to 0 based on windowing. Otherwise, the output will be scaled as if there's just a very long block interval, which can exaggerate artefacts and numerical errors. You still can't read more than `blockSamples()` into the future. + void finishOutput(Sample strength=1, size_t offset=0) { + Sample maxWindowProduct = 0; + + size_t chunk1 = std::max(offset, std::min(_blockSamples, _blockSamples - output.pos)); + + for (size_t i = offset; i < chunk1; ++i) { + size_t i2 = output.pos + i; + Sample &wp = output.windowProducts[i2]; + maxWindowProduct = std::max(wp, maxWindowProduct); + wp += (maxWindowProduct - wp)*strength; + } + for (size_t i = chunk1; i < _blockSamples; ++i) { + size_t i2 = i + output.pos - _blockSamples; + Sample &wp = output.windowProducts[i2]; + maxWindowProduct = std::max(wp, maxWindowProduct); + wp += (maxWindowProduct - wp)*strength; + } + } + + void readOutput(size_t channel, size_t offset, size_t length, Sample *outputArray) { + Sample *buffer = output.buffer.data() + channel*_blockSamples; + size_t offsetPos = (output.pos + offset)%_blockSamples; + size_t outputWrapIndex = _blockSamples - offsetPos; + size_t chunk1 = std::min(length, outputWrapIndex); + for (size_t i = 0; i < chunk1; ++i) { + size_t i2 = offsetPos + i; + outputArray[i] = buffer[i2]/output.windowProducts[i2]; + } + for (size_t i = chunk1; i < length; ++i) { + size_t i2 = i + offsetPos - _blockSamples; + outputArray[i] = buffer[i2]/output.windowProducts[i2]; + } + } + void readOutput(size_t channel, size_t length, Sample *outputArray) { + return readOutput(channel, 0, length, outputArray); + } + void addOutput(size_t channel, size_t offset, size_t length, const Sample *newOutputArray) { + length = std::min(_blockSamples, length); + Sample *buffer = output.buffer.data() + channel*_blockSamples; + size_t offsetPos = (output.pos + offset)%_blockSamples; + size_t outputWrapIndex = _blockSamples - offsetPos; + size_t chunk1 = std::min(length, outputWrapIndex); + for (size_t i = 0; i < chunk1; ++i) { + size_t i2 = offsetPos + i; + buffer[i2] += newOutputArray[i]*output.windowProducts[i2]; + } + for (size_t i = chunk1; i < length; ++i) { + size_t i2 = i + offsetPos - _blockSamples; + buffer[i2] += newOutputArray[i]*output.windowProducts[i2]; + } + } + void addOutput(size_t channel, size_t length, const Sample *newOutputArray) { + return addOutput(channel, 0, length, newOutputArray); + } + void replaceOutput(size_t channel, size_t offset, size_t length, const Sample *newOutputArray) { + length = std::min(_blockSamples, length); + Sample *buffer = output.buffer.data() + channel*_blockSamples; + size_t offsetPos = (output.pos + offset)%_blockSamples; + size_t outputWrapIndex = _blockSamples - offsetPos; + size_t chunk1 = std::min(length, outputWrapIndex); + for (size_t i = 0; i < chunk1; ++i) { + size_t i2 = offsetPos + i; + buffer[i2] = newOutputArray[i]*output.windowProducts[i2]; + } + for (size_t i = chunk1; i < length; ++i) { + size_t i2 = i + offsetPos - _blockSamples; + buffer[i2] = newOutputArray[i]*output.windowProducts[i2]; + } + } + void replaceOutput(size_t channel, size_t length, const Sample *newOutputArray) { + return replaceOutput(channel, 0, length, newOutputArray); + } + void moveOutput(size_t samples) { + if (samples == 1) { // avoid all the loops/chunks if we can + for (size_t c = 0; c < _synthesisChannels; ++c) { + output.buffer[output.pos + c*_blockSamples] = 0; + } + output.windowProducts[output.pos] = almostZero; + if (++output.pos >= _blockSamples) output.pos = 0; + return; + } + // Zero the output buffer as we cross it + size_t outputWrapIndex = _blockSamples - output.pos; + size_t chunk1 = std::min(samples, outputWrapIndex); + for (size_t c = 0; c < _synthesisChannels; ++c) { + Sample *buffer = output.buffer.data() + c*_blockSamples; + for (size_t i = 0; i < chunk1; ++i) { + size_t i2 = output.pos + i; + buffer[i2] = 0; + } + for (size_t i = chunk1; i < samples; ++i) { + size_t i2 = i + output.pos - _blockSamples; + buffer[i2] = 0; + } + } + for (size_t i = 0; i < chunk1; ++i) { + size_t i2 = output.pos + i; + output.windowProducts[i2] = almostZero; + } + for (size_t i = chunk1; i < samples; ++i) { + size_t i2 = i + output.pos - _blockSamples; + output.windowProducts[i2] = almostZero; + } + output.pos = (output.pos + samples)%_blockSamples; + _samplesSinceSynthesis += samples; + } + size_t samplesSinceSynthesis() const { + return _samplesSinceSynthesis; + } + + Complex * spectrum(size_t channel) { + return spectrumBuffer.data() + channel*_fftBins; + } + const Complex * spectrum(size_t channel) const { + return spectrumBuffer.data() + channel*_fftBins; + } + + Sample * analysisWindow() { + return _analysisWindow.data(); + } + const Sample * analysisWindow() const { + return _analysisWindow.data(); + } + // Sets the centre index of the window + void analysisOffset(size_t offset) { + _analysisOffset = offset; + } + size_t analysisOffset() const { + return _analysisOffset; + } + Sample * synthesisWindow() { + return _synthesisWindow.data(); + } + const Sample * synthesisWindow() const { + return _synthesisWindow.data(); + } + // Sets the centre index of the window + void synthesisOffset(size_t offset) { + _synthesisOffset = offset; + } + size_t synthesisOffset() const { + return _synthesisOffset; + } + + void setInterval(size_t defaultInterval, WindowShape windowShape=WindowShape::ignore, Sample asymmetry=0) { + _defaultInterval = defaultInterval; + if (windowShape == WindowShape::ignore) return; + + if (windowShape == acg) { + auto window = ApproximateConfinedGaussian::withBandwidth(double(_blockSamples)/defaultInterval); + window.fill(_synthesisWindow, _blockSamples, asymmetry, false); + } else if (windowShape == kaiser) { + auto window = Kaiser::withBandwidth(double(_blockSamples)/defaultInterval, true); + window.fill(_synthesisWindow, _blockSamples, asymmetry, true); + } + + _analysisOffset = _synthesisOffset = _blockSamples/2; + if (_analysisChannels == 0) { + for (auto &v : _analysisWindow) v = 1; + } else if (asymmetry == 0) { + forcePerfectReconstruction(_synthesisWindow, _blockSamples, _defaultInterval); + for (size_t i = 0; i < _blockSamples; ++i) { + _analysisWindow[i] = _synthesisWindow[i]; + } + } else { + for (size_t i = 0; i < _blockSamples; ++i) { + _analysisWindow[i] = _synthesisWindow[_blockSamples - 1 - i]; + } + } + // Set offsets to peak's index + for (size_t i = 0; i < _blockSamples; ++i) { + if (_analysisWindow[i] > _analysisWindow[_analysisOffset]) _analysisOffset = i; + if (_synthesisWindow[i] > _synthesisWindow[_synthesisOffset]) _synthesisOffset = i; + } + } + + void analyse(size_t samplesInPast=0) { + for (size_t s = 0; s < analyseSteps(); ++s) { + analyseStep(s, samplesInPast); + } + } + size_t analyseSteps() const { + return splitComputation ? _analysisChannels*(fft.steps() + 1) : _analysisChannels; + } + void analyseStep(size_t step, std::size_t samplesInPast=0) { + size_t fftSteps = splitComputation ? fft.steps() : 0; + size_t channel = step/(fftSteps + 1); + step -= channel*(fftSteps + 1); + + if (step-- == 0) { // extra step at start of each channel: copy windowed input into buffer + size_t offsetPos = (_inputLengthSamples*2 + input.pos - _blockSamples - samplesInPast)%_inputLengthSamples; + size_t inputWrapIndex = _inputLengthSamples - offsetPos; + size_t chunk1 = std::min(_analysisOffset, inputWrapIndex); + size_t chunk2 = std::max(_analysisOffset, std::min(_blockSamples, inputWrapIndex)); + + _samplesSinceAnalysis = samplesInPast; + Sample *buffer = input.buffer.data() + channel*_inputLengthSamples; + for (size_t i = 0; i < chunk1; ++i) { + Sample w = modified ? -_analysisWindow[i] : _analysisWindow[i]; + size_t ti = i + (_fftSamples - _analysisOffset); + size_t bi = offsetPos + i; + timeBuffer[ti] = buffer[bi]*w; + } + for (size_t i = chunk1; i < _analysisOffset; ++i) { + Sample w = modified ? -_analysisWindow[i] : _analysisWindow[i]; + size_t ti = i + (_fftSamples - _analysisOffset); + size_t bi = i + offsetPos - _inputLengthSamples; + timeBuffer[ti] = buffer[bi]*w; + } + for (size_t i = _analysisOffset; i < chunk2; ++i) { + Sample w = _analysisWindow[i]; + size_t ti = i - _analysisOffset; + size_t bi = offsetPos + i; + timeBuffer[ti] = buffer[bi]*w; + } + for (size_t i = chunk2; i < _blockSamples; ++i) { + Sample w = _analysisWindow[i]; + size_t ti = i - _analysisOffset; + size_t bi = i + offsetPos - _inputLengthSamples; + timeBuffer[ti] = buffer[bi]*w; + } + for (size_t i = _blockSamples - _analysisOffset; i < _fftSamples - _analysisOffset; ++i) { + timeBuffer[i] = 0; + } + if (splitComputation) return; + } + auto *spectrumPtr = spectrum(channel); + if (splitComputation) { + fft.fft(step, timeBuffer.data(), spectrumPtr); + if (unpacked && step == fft.steps() - 1) { + spectrumPtr[_fftBins - 1] = spectrumPtr[0].imag(); + spectrumPtr[0].imag(0); + } + } else { + fft.fft(timeBuffer.data(), spectrum(channel)); + if (unpacked) { + spectrumPtr[_fftBins - 1] = spectrumPtr[0].imag(); + spectrumPtr[0].imag(0); + } + } + } + + void synthesise() { + for (size_t s = 0; s < synthesiseSteps(); ++s) { + synthesiseStep(s); + } + } + size_t synthesiseSteps() const { + return splitComputation ? (_synthesisChannels*(fft.steps() + 1) + 1) : _synthesisChannels; + } + void synthesiseStep(size_t step) { + if (step == 0) { // Extra first step which adds in the effective gain for a pure analysis-synthesis cycle + addWindowProduct(); + if (splitComputation) return; + } + if (splitComputation) --step; + + size_t fftSteps = splitComputation ? fft.steps() : 0; + size_t channel = step/(fftSteps + 1); + step -= channel*(fftSteps + 1); + + auto *spectrumPtr = spectrum(channel); + if (unpacked && step == 0) { // re-pack + spectrumPtr[0].imag(spectrumPtr[_fftBins - 1].real()); + } + + if (splitComputation) { + if (step < fftSteps) { + fft.ifft(step, spectrumPtr, timeBuffer.data()); + return; + } + } else { + fft.ifft(spectrumPtr, timeBuffer.data()); + } + + // extra step after each channel's FFT + Sample *buffer = output.buffer.data() + channel*_blockSamples; + size_t outputWrapIndex = _blockSamples - output.pos; + size_t chunk1 = std::min(_synthesisOffset, outputWrapIndex); + size_t chunk2 = std::min(_blockSamples, std::max(_synthesisOffset, outputWrapIndex)); + + for (size_t i = 0; i < chunk1; ++i) { + Sample w = modified ? -_synthesisWindow[i] : _synthesisWindow[i]; + size_t ti = i + (_fftSamples - _synthesisOffset); + size_t bi = output.pos + i; + buffer[bi] += timeBuffer[ti]*w; + } + for (size_t i = chunk1; i < _synthesisOffset; ++i) { + Sample w = modified ? -_synthesisWindow[i] : _synthesisWindow[i]; + size_t ti = i + (_fftSamples - _synthesisOffset); + size_t bi = i + output.pos - _blockSamples; + buffer[bi] += timeBuffer[ti]*w; + } + for (size_t i = _synthesisOffset; i < chunk2; ++i) { + Sample w = _synthesisWindow[i]; + size_t ti = i - _synthesisOffset; + size_t bi = output.pos + i; + buffer[bi] += timeBuffer[ti]*w; + } + for (size_t i = chunk2; i < _blockSamples; ++i) { + Sample w = _synthesisWindow[i]; + size_t ti = i - _synthesisOffset; + size_t bi = i + output.pos - _blockSamples; + buffer[bi] += timeBuffer[ti]*w; + } + } + +#define COMPAT_SPELLING(name, alt) \ + template \ + void alt(Args &&...args) { \ + name(std::forward(args)...); \ + } + COMPAT_SPELLING(analyse, analyze); + COMPAT_SPELLING(analyseStep, analyseStep); + COMPAT_SPELLING(analyseSteps, analyzeSteps); + COMPAT_SPELLING(synthesise, synthesize); + COMPAT_SPELLING(synthesiseStep, synthesizeStep); + COMPAT_SPELLING(synthesiseSteps, synthesizeSteps); + + /// Input (only available so we can save/restore the input state) + struct Input { + void swap(Input &other) { + std::swap(pos, other.pos); + std::swap(buffer, other.buffer); + } + + Input & operator=(const Input &other) { + pos = other.pos; + buffer.assign(other.buffer.begin(), other.buffer.end()); + return *this; + } + private: + friend struct DynamicSTFT; + size_t pos = 0; + std::vector buffer; + }; + Input input; + + /// Output (only available so we can save/restore the output state) + struct Output { + void swap(Output &other) { + std::swap(pos, other.pos); + std::swap(buffer, other.buffer); + std::swap(windowProducts, other.windowProducts); + } + + Output & operator=(const Output &other) { + pos = other.pos; + buffer.assign(other.buffer.begin(), other.buffer.end()); + windowProducts.assign(other.windowProducts.begin(), other.windowProducts.end()); + return *this; + } + private: + friend struct DynamicSTFT; + size_t pos = 0; + std::vector buffer; + std::vector windowProducts; + }; + Output output; + +private: + static constexpr Sample almostZero = 1e-30; + + size_t _analysisChannels, _synthesisChannels, _inputLengthSamples, _blockSamples, _fftSamples, _fftBins; + size_t _defaultInterval = 0; + + std::vector _analysisWindow, _synthesisWindow; + size_t _analysisOffset = 0, _synthesisOffset = 0; + + std::vector spectrumBuffer; + std::vector timeBuffer; + + size_t _samplesSinceSynthesis = 0, _samplesSinceAnalysis = 0; + + void addWindowProduct() { + _samplesSinceSynthesis = 0; + + int windowShift = int(_synthesisOffset) - int(_analysisOffset); + size_t wMin = std::max(0, windowShift); + size_t wMax = std::min(_blockSamples, int(_blockSamples) + windowShift); + + Sample *windowProduct = output.windowProducts.data(); + size_t outputWrapIndex = _blockSamples - output.pos; + size_t chunk1 = std::min(wMax, std::max(wMin, outputWrapIndex)); + for (size_t i = wMin; i < chunk1; ++i) { + Sample wa = _analysisWindow[i - windowShift]; + Sample ws = _synthesisWindow[i]; + size_t bi = output.pos + i; + windowProduct[bi] += wa*ws*_fftSamples; + } + for (size_t i = chunk1; i < wMax; ++i) { + Sample wa = _analysisWindow[i - windowShift]; + Sample ws = _synthesisWindow[i]; + size_t bi = i + output.pos - _blockSamples; + windowProduct[bi] += wa*ws*_fftSamples; + } + } + + // Copied from DSP library `windows.h` + class Kaiser { + inline static double bessel0(double x) { + const double significanceLimit = 1e-4; + double result = 0; + double term = 1; + double m = 0; + while (term > significanceLimit) { + result += term; + ++m; + term *= (x*x)/(4*m*m); + } + + return result; + } + double beta; + double invB0; + + static double heuristicBandwidth(double bandwidth) { + return bandwidth + 8/((bandwidth + 3)*(bandwidth + 3)) + 0.25*std::max(3 - bandwidth, 0.0); + } + public: + Kaiser(double beta) : beta(beta), invB0(1/bessel0(beta)) {} + + static Kaiser withBandwidth(double bandwidth, bool heuristicOptimal=false) { + return Kaiser(bandwidthToBeta(bandwidth, heuristicOptimal)); + } + static double bandwidthToBeta(double bandwidth, bool heuristicOptimal=false) { + if (heuristicOptimal) { // Heuristic based on numerical search + bandwidth = heuristicBandwidth(bandwidth); + } + bandwidth = std::max(bandwidth, 2.0); + double alpha = std::sqrt(bandwidth*bandwidth*0.25 - 1); + return alpha*M_PI; + } + + template + void fill(Data &&data, size_t size, double warp, bool isForSynthesis) const { + double invSize = 1.0/size; + size_t offsetI = (size&1) ? 1 : (isForSynthesis ? 0 : 2); + for (size_t i = 0; i < size; ++i) { + double r = (2*i + offsetI)*invSize - 1; + r = (r + warp)/(1 + r*warp); + double arg = std::sqrt(1 - r*r); + data[i] = bessel0(beta*arg)*invB0; + } + } + }; + + class ApproximateConfinedGaussian { + double gaussianFactor; + + double gaussian(double x) const { + return std::exp(-x*x*gaussianFactor); + } + public: + static double bandwidthToSigma(double bandwidth) { + return 0.3/std::sqrt(bandwidth); + } + static ApproximateConfinedGaussian withBandwidth(double bandwidth) { + return ApproximateConfinedGaussian(bandwidthToSigma(bandwidth)); + } + + ApproximateConfinedGaussian(double sigma) : gaussianFactor(0.0625/(sigma*sigma)) {} + + /// Fills an arbitrary container + template + void fill(Data &&data, size_t size, double warp, bool isForSynthesis) const { + double invSize = 1.0/size; + double offsetScale = gaussian(1)/(gaussian(3) + gaussian(-1)); + double norm = 1/(gaussian(0) - 2*offsetScale*(gaussian(2))); + size_t offsetI = (size&1) ? 1 : (isForSynthesis ? 0 : 2); + for (size_t i = 0; i < size; ++i) { + double r = (2*i + offsetI)*invSize - 1; + r = (r + warp)/(1 + r*warp); + data[i] = norm*(gaussian(r) - offsetScale*(gaussian(r - 2) + gaussian(r + 2))); + } + } + }; + + template + void forcePerfectReconstruction(Data &&data, size_t windowLength, size_t interval) { + for (size_t i = 0; i < interval; ++i) { + double sum2 = 0; + for (size_t index = i; index < windowLength; index += interval) { + sum2 += data[index]*data[index]; + } + double factor = 1/std::sqrt(sum2); + for (size_t index = i; index < windowLength; index += interval) { + data[index] *= factor; + } + } + } +}; + +}} // namespace + +#endif // include guard diff --git a/thirdparty/signalsmith-linear/tests/CMakeLists.txt b/thirdparty/signalsmith-linear/tests/CMakeLists.txt new file mode 100644 index 0000000000..7260fafa42 --- /dev/null +++ b/thirdparty/signalsmith-linear/tests/CMakeLists.txt @@ -0,0 +1,36 @@ +cmake_minimum_required(VERSION 3.24) +project(linear-tests VERSION 1.0.0) +set(CMAKE_CXX_STANDARD 11) + +# MSVC needs an extra flag to handle this many templates +if (MSVC) + add_compile_options(/bigobj) + #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /bigobj") +endif() + +include(FetchContent) + +# DSP library +FetchContent_Declare( + signalsmith_dsp + GIT_REPOSITORY https://github.com/Signalsmith-Audio/dsp.git + GIT_TAG 5c7d1f3eb375b4862b682d310ccfbaafb6a4477e # main + GIT_SHALLOW OFF + SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/others/dsp" +) +FetchContent_MakeAvailable(signalsmith_dsp) + +# Plot +FetchContent_Declare( + signalsmith_plot + GIT_REPOSITORY https://github.com/Signalsmith-Audio/plot.git + GIT_TAG 1.0.0 + GIT_SHALLOW ON + SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/plot" +) +FetchContent_MakeAvailable(signalsmith_plot) + +add_subdirectory(".." ${CMAKE_CURRENT_BINARY_DIR}/signalsmith-linear) + +add_executable(tests ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp) +target_link_libraries(tests PRIVATE signalsmith-linear) diff --git a/thirdparty/signalsmith-linear/tests/Makefile b/thirdparty/signalsmith-linear/tests/Makefile new file mode 100644 index 0000000000..3a6e9b73f8 --- /dev/null +++ b/thirdparty/signalsmith-linear/tests/Makefile @@ -0,0 +1,40 @@ +main: out/tests + @cd out && ./tests + +dev: out-cmake/tests + cd out-cmake && ./tests benchmark 0.001 fft + +benchmark: out/tests + @cd out && ./tests benchmark + +benchmark-fast: out/tests + @cd out && ./tests benchmark 0.001 + +# MAC config +out/tests: *.cpp *.h ../*.h ../platform/*.h + mkdir -p out + g++ -std=c++11 \ + -O3 -ffast-math -DSIGNALSMITH_IGNORE_BROKEN_APPLECLANG \ + -Wall -Wextra -Wfatal-errors -Wpedantic -pedantic-errors \ + -framework Accelerate -DSIGNALSMITH_USE_ACCELERATE -DACCELERATE_NEW_LAPACK \ + main.cpp -o out/tests + +out-cmake/tests: CMakeLists.txt *.cpp *.h ../*.h ../platform/*.h + cmake -B build -G Xcode + cmake --build build --config Release + mkdir -p out-cmake + cp build/Release/tests out-cmake + +cmake: out-cmake/tests + cd out-cmake && ./tests + +cmake-benchmark: out-cmake/tests + cd out-cmake && ./tests benchmark + +cmake-benchmark-fast: out-cmake/tests + cd out-cmake && ./tests benchmark 0.001 + +clean: + rm -rf out + rm -rf out-cmake + rm -rf build diff --git a/thirdparty/signalsmith-linear/tests/main.cpp b/thirdparty/signalsmith-linear/tests/main.cpp new file mode 100644 index 0000000000..7e3c761e63 --- /dev/null +++ b/thirdparty/signalsmith-linear/tests/main.cpp @@ -0,0 +1,66 @@ +#include +#define LOG_EXPR(expr) std::cout << #expr << " = " << (expr) << std::endl; + +#include "./test-linear.h" +#include "./test-ffts.h" +#include "./test-stfts.h" + +#include + +#include "./stop-denormals.h" + +int main(int argc, char *argv[]) { + StopDenormals scoped; + +#ifdef SIGNALSMITH_USE_ACCELERATE + std::cout << u8"✅ SIGNALSMITH_USE_ACCELERATE\n"; +#else + std::cout << u8"❌ SIGNALSMITH_USE_ACCELERATE\n"; +#endif +#ifdef SIGNALSMITH_USE_IPP + std::cout << u8"✅ SIGNALSMITH_USE_IPP\n"; +#else + std::cout << u8"❌ SIGNALSMITH_USE_IPP\n"; +#endif +#ifdef SIGNALSMITH_USE_PFFFT + std::cout << u8"✅ SIGNALSMITH_USE_PFFFT\n"; +#else + std::cout << u8"❌ SIGNALSMITH_USE_PFFFT\n"; +#endif +#ifdef SIGNALSMITH_USE_PFFFT_DOUBLE + std::cout << u8"✅ SIGNALSMITH_USE_PFFFT_DOUBLE\n"; +#else + std::cout << u8"❌ SIGNALSMITH_USE_PFFFT_DOUBLE\n"; +#endif +#ifdef __FAST_MATH__ + std::cout << u8"✅ __FAST_MATH__\n"; +#else + std::cout << u8"❌ __FAST_MATH__\n"; +#endif + + int maxSize = 8192; + double benchmarkSeconds = 0; + if (argc > 1 && !std::strcmp(argv[1], "benchmark")) { + maxSize = 65536*8; + benchmarkSeconds = 0.05; + if (argc > 2) { + benchmarkSeconds = std::strtod(argv[2], nullptr); + } + + } + if (argc <= 3) { + testFfts(maxSize, benchmarkSeconds); + testStfts(maxSize, benchmarkSeconds); + testLinear(maxSize, benchmarkSeconds); + } else { + for (int i = 3; i < argc; ++i) { + if (!std::strcmp(argv[i], "fft")) { + testFfts(maxSize, benchmarkSeconds); + } else if (!std::strcmp(argv[i], "stft")) { + testStfts(maxSize, benchmarkSeconds); + } else if (!std::strcmp(argv[i], "linear")) { + testLinear(maxSize, benchmarkSeconds); + } + } + } +} diff --git a/thirdparty/signalsmith-linear/tests/others/signalsmith-fft.h b/thirdparty/signalsmith-linear/tests/others/signalsmith-fft.h new file mode 100644 index 0000000000..85a30da786 --- /dev/null +++ b/thirdparty/signalsmith-linear/tests/others/signalsmith-fft.h @@ -0,0 +1,512 @@ +#ifndef SIGNALSMITH_FFT_V5_prev +#define SIGNALSMITH_FFT_V5_prev +// So we can easily include multiple versions by redefining this +#ifndef SIGNALSMITH_FFT_NAMESPACE +#define SIGNALSMITH_FFT_NAMESPACE signalsmith +#endif + +#include +#include +#include +#include +#include + +#ifndef SIGNALSMITH_INLINE +#ifdef __GNUC__ +#define SIGNALSMITH_INLINE __attribute__((always_inline)) inline +#elif defined(__MSVC__) +#define SIGNALSMITH_INLINE __forceinline inline +#else +#define SIGNALSMITH_INLINE inline +#endif +#endif + +#ifndef M_PI +#define M_PI 3.14159265358979323846264338327950288 +#endif + +namespace SIGNALSMITH_FFT_NAMESPACE { + + namespace perf { + // Complex multiplication has edge-cases around Inf/NaN - handling those properly makes std::complex non-inlineable, so we use our own + template + SIGNALSMITH_INLINE std::complex complexMul(const std::complex &a, const std::complex &b) { + return conjugateSecond ? std::complex{ + b.real()*a.real() + b.imag()*a.imag(), + b.real()*a.imag() - b.imag()*a.real() + } : std::complex{ + a.real()*b.real() - a.imag()*b.imag(), + a.real()*b.imag() + a.imag()*b.real() + }; + } + + template + SIGNALSMITH_INLINE std::complex complexAddI(const std::complex &a, const std::complex &b) { + return flipped ? std::complex{ + a.real() + b.imag(), + a.imag() - b.real() + } : std::complex{ + a.real() - b.imag(), + a.imag() + b.real() + }; + } + } + + // Use SFINAE to get an iterator from std::begin(), if supported - otherwise assume the value itself is an iterator + template + struct GetIterator { + static T get(const T &t) { + return t; + } + }; + template + struct GetIterator()))> { + static auto get(const T &t) -> decltype(std::begin(t)) { + return std::begin(t); + } + }; + + template + class FFT { + using complex = std::complex; + size_t _size; + std::vector workingVector; + + enum class StepType { + generic, step2, step3, step4 + }; + struct Step { + StepType type; + size_t factor; + size_t startIndex; + size_t innerRepeats; + size_t outerRepeats; + size_t twiddleIndex; + }; + std::vector factors; + std::vector plan; + std::vector twiddleVector; + + struct PermutationPair {size_t from, to;}; + std::vector permutation; + + void addPlanSteps(size_t factorIndex, size_t start, size_t length, size_t repeats) { + if (factorIndex >= factors.size()) return; + + size_t factor = factors[factorIndex]; + if (factorIndex + 1 < factors.size()) { + if (factors[factorIndex] == 2 && factors[factorIndex + 1] == 2) { + ++factorIndex; + factor = 4; + } + } + + size_t subLength = length/factor; + Step mainStep{StepType::generic, factor, start, subLength, repeats, twiddleVector.size()}; + + if (factor == 2) mainStep.type = StepType::step2; + if (factor == 3) mainStep.type = StepType::step3; + if (factor == 4) mainStep.type = StepType::step4; + + // Twiddles + bool foundStep = false; + for (const Step &existingStep : plan) { + if (existingStep.factor == mainStep.factor && existingStep.innerRepeats == mainStep.innerRepeats) { + foundStep = true; + mainStep.twiddleIndex = existingStep.twiddleIndex; + break; + } + } + if (!foundStep) { + for (size_t i = 0; i < subLength; ++i) { + for (size_t f = 0; f < factor; ++f) { + V phase = 2*M_PI*i*f/length; + complex twiddle = {cos(phase), -sin(phase)}; + twiddleVector.push_back(twiddle); + } + } + } + + if (repeats == 1 && sizeof(complex)*subLength > 65536) { + for (size_t i = 0; i < factor; ++i) { + addPlanSteps(factorIndex + 1, start + i*subLength, subLength, 1); + } + } else { + addPlanSteps(factorIndex + 1, start, subLength, repeats*factor); + } + plan.push_back(mainStep); + } + void setPlan() { + factors.resize(0); + size_t size = _size, f = 2; + while (size > 1) { + if (size%f == 0) { + factors.push_back(f); + size /= f; + } else if (f > sqrt(size)) { + f = size; + } else { + ++f; + } + } + + plan.resize(0); + twiddleVector.resize(0); + addPlanSteps(0, 0, _size, 1); + + permutation.resize(0); + permutation.push_back(PermutationPair{0, 0}); + size_t indexLow = 0, indexHigh = factors.size(); + size_t inputStepLow = _size, outputStepLow = 1; + size_t inputStepHigh = 1, outputStepHigh = _size; + while (outputStepLow*inputStepHigh < _size) { + size_t f, inputStep, outputStep; + if (outputStepLow <= inputStepHigh) { + f = factors[indexLow++]; + inputStep = (inputStepLow /= f); + outputStep = outputStepLow; + outputStepLow *= f; + } else { + f = factors[--indexHigh]; + inputStep = inputStepHigh; + inputStepHigh *= f; + outputStep = (outputStepHigh /= f); + } + size_t oldSize = permutation.size(); + for (size_t i = 1; i < f; ++i) { + for (size_t j = 0; j < oldSize; ++j) { + PermutationPair pair = permutation[j]; + pair.from += i*inputStep; + pair.to += i*outputStep; + permutation.push_back(pair); + } + } + } + } + + template + void fftStepGeneric(RandomAccessIterator &&origData, const Step &step) { + complex *working = workingVector.data(); + const size_t stride = step.innerRepeats; + + for (size_t outerRepeat = 0; outerRepeat < step.outerRepeats; ++outerRepeat) { + RandomAccessIterator data = origData; + + const complex *twiddles = twiddleVector.data() + step.twiddleIndex; + const size_t factor = step.factor; + for (size_t repeat = 0; repeat < step.innerRepeats; ++repeat) { + for (size_t i = 0; i < step.factor; ++i) { + working[i] = perf::complexMul(data[i*stride], twiddles[i]); + } + for (size_t f = 0; f < factor; ++f) { + complex sum = working[0]; + for (size_t i = 1; i < factor; ++i) { + V phase = 2*M_PI*f*i/factor; + complex factor = {cos(phase), -sin(phase)}; + sum += perf::complexMul(working[i], factor); + } + data[f*stride] = sum; + } + ++data; + twiddles += factor; + } + origData += step.factor*step.innerRepeats; + } + } + + template + void fftStep2(RandomAccessIterator &&origData, const Step &step) { + const size_t stride = step.innerRepeats; + const complex *origTwiddles = twiddleVector.data() + step.twiddleIndex; + for (size_t outerRepeat = 0; outerRepeat < step.outerRepeats; ++outerRepeat) { + const complex* twiddles = origTwiddles; + for (RandomAccessIterator data = origData; data < origData + stride; ++data) { + complex A = data[0]; + complex B = perf::complexMul(data[stride], twiddles[1]); + + data[0] = A + B; + data[stride] = A - B; + twiddles += 2; + } + origData += 2*stride; + } + } + + template + void fftStep3(RandomAccessIterator &&origData, const Step &step) { + constexpr complex factor3 = {-0.5, inverse ? 0.8660254037844386 : -0.8660254037844386}; + const size_t stride = step.innerRepeats; + const complex *origTwiddles = twiddleVector.data() + step.twiddleIndex; + + for (size_t outerRepeat = 0; outerRepeat < step.outerRepeats; ++outerRepeat) { + const complex* twiddles = origTwiddles; + for (RandomAccessIterator data = origData; data < origData + stride; ++data) { + complex A = data[0]; + complex B = perf::complexMul(data[stride], twiddles[1]); + complex C = perf::complexMul(data[stride*2], twiddles[2]); + + complex realSum = A + (B + C)*factor3.real(); + complex imagSum = (B - C)*factor3.imag(); + + data[0] = A + B + C; + data[stride] = perf::complexAddI(realSum, imagSum); + data[stride*2] = perf::complexAddI(realSum, imagSum); + + twiddles += 3; + } + origData += 3*stride; + } + } + + template + void fftStep4(RandomAccessIterator &&origData, const Step &step) { + const size_t stride = step.innerRepeats; + const complex *origTwiddles = twiddleVector.data() + step.twiddleIndex; + + for (size_t outerRepeat = 0; outerRepeat < step.outerRepeats; ++outerRepeat) { + const complex* twiddles = origTwiddles; + for (RandomAccessIterator data = origData; data < origData + stride; ++data) { + complex A = data[0]; + complex C = perf::complexMul(data[stride], twiddles[2]); + complex B = perf::complexMul(data[stride*2], twiddles[1]); + complex D = perf::complexMul(data[stride*3], twiddles[3]); + + complex sumAC = A + C, sumBD = B + D; + complex diffAC = A - C, diffBD = B - D; + + data[0] = sumAC + sumBD; + data[stride] = perf::complexAddI(diffAC, diffBD); + data[stride*2] = sumAC - sumBD; + data[stride*3] = perf::complexAddI(diffAC, diffBD); + + twiddles += 4; + } + origData += 4*stride; + } + } + + template + void permute(InputIterator input, OutputIterator data) { + for (auto pair : permutation) { + data[pair.from] = input[pair.to]; + } + } + + template + void run(InputIterator &&input, OutputIterator &&data) { + permute(input, data); + + for (const Step &step : plan) { + switch (step.type) { + case StepType::generic: + fftStepGeneric(data + step.startIndex, step); + break; + case StepType::step2: + fftStep2(data + step.startIndex, step); + break; + case StepType::step3: + fftStep3(data + step.startIndex, step); + break; + case StepType::step4: + fftStep4(data + step.startIndex, step); + break; + } + } + } + + static bool validSize(size_t size) { + constexpr static bool filter[32] = { + 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, // 0-9 + 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, // 10-19 + 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, // 20-29 + 0, 0 + }; + return filter[size]; + } + public: + static size_t sizeMinimum(size_t size) { + size_t power2 = 1; + while (size >= 32) { + size = (size - 1)/2 + 1; + power2 *= 2; + } + while (size < 32 && !validSize(size)) { + ++size; + } + return power2*size; + } + static size_t sizeMaximum(size_t size) { + size_t power2 = 1; + while (size >= 32) { + size /= 2; + power2 *= 2; + } + while (size > 1 && !validSize(size)) { + --size; + } + return power2*size; + } + + FFT(size_t size, int fastDirection=0) : _size(0) { + if (fastDirection > 0) size = sizeMinimum(size); + if (fastDirection < 0) size = sizeMaximum(size); + this->setSize(size); + } + + size_t setSize(size_t size) { + if (size != _size) { + _size = size; + workingVector.resize(size); + setPlan(); + } + return _size; + } + size_t setSizeMinimum(size_t size) { + return setSize(sizeMinimum(size)); + } + size_t setSizeMaximum(size_t size) { + return setSize(sizeMaximum(size)); + } + const size_t & size() const { + return _size; + } + + template + void fft(InputIterator &&input, OutputIterator &&output) { + auto inputIter = GetIterator::get(input); + auto outputIter = GetIterator::get(output); + return run(inputIter, outputIter); + } + + template + void ifft(InputIterator &&input, OutputIterator &&output) { + auto inputIter = GetIterator::get(input); + auto outputIter = GetIterator::get(output); + return run(inputIter, outputIter); + } + }; + + struct FFTOptions { + static constexpr int halfFreqShift = 1; + }; + + template + class RealFFT { + static constexpr bool modified = (optionFlags&FFTOptions::halfFreqShift); + + using complex = std::complex; + std::vector complexBuffer1, complexBuffer2; + std::vector twiddlesMinusI; + std::vector modifiedRotations; + FFT complexFft; + public: + static size_t sizeMinimum(size_t size) { + return (FFT::sizeMinimum((size - 1)/2) + 1)*2; + } + static size_t sizeMaximum(size_t size) { + return FFT::sizeMinimum(size/2)*2; + } + + RealFFT(size_t size, int fastDirection=0) : complexFft(0) { + if (fastDirection > 0) size = sizeMinimum(size); + if (fastDirection < 0) size = sizeMaximum(size); + this->setSize(size); + } + + size_t setSize(size_t size) { + complexBuffer1.resize(size/2); + complexBuffer2.resize(size/2); + + size_t hhSize = size/4 + 1; + twiddlesMinusI.resize(hhSize); + for (size_t i = 0; i < hhSize; ++i) { + double rotPhase = -2*M_PI*(modified ? i + 0.5 : i)/size; + twiddlesMinusI[i] = {sin(rotPhase), -cos(rotPhase)}; + } + if (modified) { + modifiedRotations.resize(size/2); + for (size_t i = 0; i < size/2; ++i) { + double rotPhase = -2*M_PI*i/size; + modifiedRotations[i] = {cos(rotPhase), sin(rotPhase)}; + } + } + + return complexFft.setSize(size/2); + } + size_t setSizeMinimum(size_t size) { + return setSize(sizeMinimum(size)); + } + size_t setSizeMaximum(size_t size) { + return setSize(sizeMaximum(size)); + } + size_t size() const { + return complexFft.size()*2; + } + + template + void fft(InputIterator &&input, OutputIterator &&output) { + size_t hSize = complexFft.size(); + for (size_t i = 0; i < hSize; ++i) { + if (modified) { + complexBuffer1[i] = perf::complexMul({input[2*i], input[2*i + 1]}, modifiedRotations[i]); + } else { + complexBuffer1[i] = {input[2*i], input[2*i + 1]}; + } + } + + complexFft.fft(complexBuffer1.data(), complexBuffer2.data()); + + if (!modified) output[0] = { + complexBuffer2[0].real() + complexBuffer2[0].imag(), + complexBuffer2[0].real() - complexBuffer2[0].imag() + }; + for (size_t i = modified ? 0 : 1; i <= hSize/2; ++i) { + size_t conjI = modified ? (hSize - 1 - i) : (hSize - i); + + complex odd = (complexBuffer2[i] + conj(complexBuffer2[conjI]))*(V)0.5; + complex evenI = (complexBuffer2[i] - conj(complexBuffer2[conjI]))*(V)0.5; + complex evenRotMinusI = perf::complexMul(evenI, twiddlesMinusI[i]); + + output[i] = odd + evenRotMinusI; + output[conjI] = conj(odd - evenRotMinusI); + } + } + + template + void ifft(InputIterator &&input, OutputIterator &&output) { + size_t hSize = complexFft.size(); + if (!modified) complexBuffer1[0] = { + input[0].real() + input[0].imag(), + input[0].real() - input[0].imag() + }; + for (size_t i = modified ? 0 : 1; i <= hSize/2; ++i) { + size_t conjI = modified ? (hSize - 1 - i) : (hSize - i); + complex v = input[i], v2 = input[conjI]; + + complex odd = v + conj(v2); + complex evenRotMinusI = v - conj(v2); + complex evenI = perf::complexMul(evenRotMinusI, twiddlesMinusI[i]); + + complexBuffer1[i] = odd + evenI; + complexBuffer1[conjI] = conj(odd - evenI); + } + + complexFft.ifft(complexBuffer1.data(), complexBuffer2.data()); + + for (size_t i = 0; i < hSize; ++i) { + complex v = complexBuffer2[i]; + if (modified) v = perf::complexMul(v, modifiedRotations[i]); + output[2*i] = v.real(); + output[2*i + 1] = v.imag(); + } + } + }; + + template + struct ModifiedRealFFT : public RealFFT { + using RealFFT::RealFFT; + }; +} + +#undef SIGNALSMITH_FFT_NAMESPACE +#endif // SIGNALSMITH_FFT_V5 diff --git a/thirdparty/signalsmith-linear/tests/stop-denormals.h b/thirdparty/signalsmith-linear/tests/stop-denormals.h new file mode 100644 index 0000000000..c9a7bb5e05 --- /dev/null +++ b/thirdparty/signalsmith-linear/tests/stop-denormals.h @@ -0,0 +1,34 @@ +#pragma once + +#if defined(__SSE__) || defined(_M_X64) + class StopDenormals { + unsigned int controlStatusRegister; + public: + StopDenormals() : controlStatusRegister(_mm_getcsr()) { + _mm_setcsr(controlStatusRegister|0x8040); // Flush-to-Zero and Denormals-Are-Zero + } + ~StopDenormals() { + _mm_setcsr(controlStatusRegister); + } + }; +#elif (defined (__ARM_NEON) || defined (__ARM_NEON__)) + class StopDenormals { + uintptr_t status; + public: + StopDenormals() { + uintptr_t asmStatus; + asm volatile("mrs %0, fpcr" : "=r"(asmStatus)); + status = asmStatus = asmStatus|0x01000000U; // Flush to Zero + asm volatile("msr fpcr, %0" : : "ri"(asmStatus)); + } + ~StopDenormals() { + uintptr_t asmStatus = status; + asm volatile("msr fpcr, %0" : : "ri"(asmStatus)); + } + }; +#else +# if __cplusplus >= 202302L +# warning "The `StopDenormals` class doesn't do anything for this architecture" +# endif + class StopDenormals {}; // FIXME: add for other architectures +#endif diff --git a/thirdparty/signalsmith-linear/tests/stopwatch.h b/thirdparty/signalsmith-linear/tests/stopwatch.h new file mode 100644 index 0000000000..0fb2ac7b1a --- /dev/null +++ b/thirdparty/signalsmith-linear/tests/stopwatch.h @@ -0,0 +1,103 @@ +#ifndef SIGNALSMITH_STOPWATCH_UTIL_H +#define SIGNALSMITH_STOPWATCH_UTIL_H + +#include +#include +#include +#include + +#ifdef WINDOWS // completely untested! +# include +class Stopwatch { + using Time = __int64; + using Duration = Time; + inline Time now() { + LARGE_INTEGER result; + QueryPerformanceCounter(&result); + return result.QuadPart; + } + static double toSeconds(Duration t) { + LARGE_INTEGER freq; + QueryPerformanceFrequency(&freq); + return t/double(freq); + } +#else +# include +class Stopwatch { + using Clock = std::conditional::type; + using Time = Clock::time_point; + using Duration = std::chrono::duration; + + inline Time now() { + return Clock::now(); + } + static double toSeconds(Duration duration) { + return duration.count(); + } +#endif + + std::atomic