diff --git a/platformio.ini b/platformio.ini
index 9ca7757350..96c06b9e2c 100644
--- a/platformio.ini
+++ b/platformio.ini
@@ -1577,6 +1577,9 @@ build_flags = ${common.build_flags_esp8266} -D WLED_DISABLE_OTA
   -D WLED_DISABLE_HUESYNC
   -D WLED_DISABLE_ESPNOW   ;; exceeds flash size limits
   -D WLED_DISABLE_INFRARED ;; exceeds flash size limits
+  -D WLED_DISABLE_PARTICLESYSTEM1D ;; exceeds flash size limit
+  -D WLED_DISABLE_PARTICLESYSTEM2D ;; exceeds flash size limit
+
 lib_deps = ${esp8266.lib_deps}
 lib_ignore = IRremoteESP8266 ; use with WLED_DISABLE_INFRARED for faster compilation
 ; RAM:   [======    ]  60.6% (used 49616 bytes from 81920 bytes)
@@ -1607,6 +1610,7 @@ build_flags = ${esp32_4MB_V4_S_base.esp32_build_flags}
   ; -D SR_DEBUG
   ; -D MIC_LOGGER
   ${common_mm.HUB75_build_flags}
+  -D WLED_DISABLE_PARTICLESYSTEM1D ;; exceeds flash size limit
 lib_deps = ${esp32_4MB_V4_S_base.esp32_lib_deps}
     ${common_mm.HUB75_lib_deps}
 lib_ignore = IRremoteESP8266 ; use with WLED_DISABLE_INFRARED for faster compilation
@@ -2218,6 +2222,8 @@ build_flags = ${common.build_flags} ${esp32s3.build_flags} -Wno-misleading-inden
   ; -D WLED_DEBUG
   ; -D SR_DEBUG
   ; -D MIC_LOGGER
+  -D WLED_DISABLE_PARTICLESYSTEM1D ;; exceeds flash size limit
+  -D WLED_DISABLE_PARTICLESYSTEM2D ;; exceeds flash size limit
 lib_deps = ${esp32s3.lib_deps} ${common_mm.lib_deps_S}  ${common_mm.lib_deps_V4_M}
 lib_ignore =
   IRremoteESP8266 ; use with WLED_DISABLE_INFRARED for faster compilation
@@ -2426,7 +2432,8 @@ build_flags = ${common.build_flags} ${esp32c3.build_flags}
   ;-D HW_PIN_SDA=0 -D HW_PIN_SCL=1 ;; for I2C Qwiic connector
   -D SR_DMTYPE=1 -D I2S_SDPIN=5 -D I2S_WSPIN=6 -D I2S_CKPIN=4 -D MCLK_PIN=7
   ; -D WLED_USE_MY_CONFIG
-
+  -D WLED_DISABLE_PARTICLESYSTEM1D ;; exceeds flash size limit
+  -D WLED_DISABLE_PARTICLESYSTEM2D ;; exceeds flash size limit
 lib_deps = ${esp32c3.lib_deps} ${common_mm.lib_deps_S} ${common_mm.lib_deps_V4_M}
 lib_ignore = 
   ;IRremoteESP8266 ; use with WLED_DISABLE_INFRARED for faster compilation
@@ -2450,6 +2457,8 @@ build_flags = ${env:esp32c3dev_4MB_M.build_flags}
   -D WLED_DISABLE_BROWNOUT_DET    ;; the board only has a 500mA LDO, better to disable brownout detection
   -D WLED_DISABLE_ADALIGHT        ;; to disable serial protocols for boards with CDC USB (Serial RX will receive junk commands, unless its pulled down by resistor)
   -D HW_PIN_SDA=0 -D HW_PIN_SCL=1 ;; avoid pin conflicts
+  -D WLED_DISABLE_PARTICLESYSTEM1D ;; exceeds flash size limit
+  -D WLED_DISABLE_PARTICLESYSTEM2D ;; exceeds flash size limit
 ; RAM:   [==        ]  24.6% (used 80556 bytes from 327680 bytes)
 ; Flash: [==========]  97.3% (used 1530346 bytes from 1572864 bytes)
 
@@ -2476,7 +2485,8 @@ build_flags = ${env:esp32c3dev_4MB_M.build_flags}
   -D WLED_DISABLE_BROWNOUT_DET    ;; the board only has a 500mA LDO, better to disable brownout detection
   -D WLED_DISABLE_ADALIGHT        ;; to disable serial protocols for boards with CDC USB (Serial RX will receive junk commands, unless its pulled down by resistor)
   -D HW_PIN_SDA=0 -D HW_PIN_SCL=1 ;; avoid pin conflicts
-
+  -D WLED_DISABLE_PARTICLESYSTEM1D ;; exceeds flash size limit
+  -D WLED_DISABLE_PARTICLESYSTEM2D ;; exceeds flash size limit
 ; RAM:   [==        ]  24.0% (used 78676 bytes from 327680 bytes)
 ; Flash: [==========]  96.4% (used 1516068 bytes from 1572864 bytes)
 
diff --git a/usermods/audioreactive/audio_reactive.h b/usermods/audioreactive/audio_reactive.h
index 0be1bd7974..15a632cd53 100644
--- a/usermods/audioreactive/audio_reactive.h
+++ b/usermods/audioreactive/audio_reactive.h
@@ -38,9 +38,7 @@
 #define FFT_USE_SLIDING_WINDOW             // perform FFT with sliding window =  50% overlap
 #endif
 
-
-#define FFT_PREFER_EXACT_PEAKS  // use different FFT windowing -> results in "sharper" peaks and less "leaking" into other frequencies
-//#define SR_STATS
+#define SR_STATS
 
 #if !defined(FFTTASK_PRIORITY)
 #if defined(WLEDMM_FASTPATH) && !defined(CONFIG_IDF_TARGET_ESP32S2) && !defined(CONFIG_IDF_TARGET_ESP32C3) && defined(ARDUINO_ARCH_ESP32)
@@ -181,7 +179,7 @@ static uint16_t decayTime = 300;              // int: decay time in milliseconds
 
 // peak detection
 #ifdef ARDUINO_ARCH_ESP32
-static void detectSamplePeak(void);  // peak detection function (needs scaled FFT results in vReal[]) - no used for 8266 receive-only mode
+static void detectSamplePeak(void);  // peak detection function (needs scaled FFT results in valFFT[]) - no used for 8266 receive-only mode
 #endif
 static void autoResetPeak(void);     // peak auto-reset function
 static uint8_t maxVol = 31;          // (was 10) Reasonable value for constant volume for 'peak detector', as it won't always trigger  (deprecated)
@@ -189,6 +187,35 @@ static uint8_t binNum = 8;           // Used to select the bin for FFT based bea
 
 #ifdef ARDUINO_ARCH_ESP32
 
+#if !defined(UM_AUDIOREACTIVE_USE_ESPDSP_FFT) && (defined(CONFIG_IDF_TARGET_ESP32S3) || defined(CONFIG_IDF_TARGET_ESP32))
+#define UM_AUDIOREACTIVE_USE_ARDUINO_FFT // use ArduinoFFT library for FFT instead of ESP-IDF DSP library by default on ESP32 and S3
+#endif
+
+#if ESP_IDF_VERSION < ESP_IDF_VERSION_VAL(4, 4, 0)
+#define UM_AUDIOREACTIVE_USE_ARDUINO_FFT // DSP FFT library is not available in ESP-IDF < 4.4
+#endif
+
+#ifdef UM_AUDIOREACTIVE_USE_ARDUINO_FFT
+#define sqrt(x) sqrtf(x)                // little hack that reduces FFT time by 10-50% on ESP32 (as alternative to FFT_SQRT_APPROXIMATION)
+#define sqrt_internal sqrtf             // see https://github.com/kosme/arduinoFFT/pull/83
+#include <arduinoFFT.h>                 // ArduinoFFT library for FFT and window functions
+#undef UM_AUDIOREACTIVE_USE_INTEGER_FFT // arduinoFFT has not integer support
+#else
+#include "dsps_fft2r.h" // ESP-IDF DSP library for FFT and window functions
+#include "dsps_wind.h"  //available window functions: hann, nuttall, blackman_harris, flat_top, blackman_nuttall, blackman
+#if defined(CONFIG_IDF_TARGET_ESP32S2) || defined(CONFIG_IDF_TARGET_ESP32C3)
+#define UM_AUDIOREACTIVE_USE_INTEGER_FFT // always use integer FFT on ESP32-S2 and ESP32-C3
+#endif
+#endif
+
+#if !defined(UM_AUDIOREACTIVE_USE_INTEGER_FFT)
+using FFTsampleType = float;
+using FFTmathType = float;
+#else
+using FFTsampleType = int16_t;
+using FFTmathType = int32_t;
+#endif
+
 // use audio source class (ESP32 specific)
 #include "audio_source.h"
 constexpr int BLOCK_SIZE = 128;                  // I2S buffer size (samples)
@@ -277,7 +304,7 @@ static volatile float    micReal_max2 = 0.0f;             // MicIn data max afte
 static float mapf(float x, float in_min, float in_max, float out_min, float out_max); // map function for float
 static float fftAddAvg(int from, int to);   // average of several FFT result bins
 void FFTcode(void * parameter);             // audio processing task: read samples, run FFT, fill GEQ channels from FFT results
-static void runMicFilter(uint16_t numSamples, float *sampleBuffer);          // pre-filtering of raw samples (band-pass)
+static void runMicFilter(uint16_t numSamples, FFTsampleType *sampleBuffer);          // pre-filtering of raw samples (band-pass)
 static void postProcessFFTResults(bool noiseGateOpen, int numberOfChannels, bool i2sFastpath); // post-processing and post-amp of GEQ channels
 
 
@@ -344,9 +371,7 @@ static float filterTime = 0;        // avg time for filtering I2S samples
 #endif
 
 // FFT Task variables (filtering and post-processing)
-static float   lastFftCalc[NUM_GEQ_CHANNELS] = {0.0f};                // backup of last FFT channels (before postprocessing)
 
-#if !defined(CONFIG_IDF_TARGET_ESP32C3)
 // audio source parameters and constant
 constexpr SRate_t SAMPLE_RATE = 22050;        // Base sample rate in Hz - 22Khz is a standard rate. Physical sample time -> 23ms
 //constexpr SRate_t SAMPLE_RATE = 16000;        // 16kHz - use if FFTtask takes more than 20ms. Physical sample time -> 32ms
@@ -364,29 +389,33 @@ constexpr SRate_t SAMPLE_RATE = 22050;        // Base sample rate in Hz - 22Khz
 //#define FFT_MIN_CYCLE 30                      // Use with 16Khz sampling
 //#define FFT_MIN_CYCLE 23                      // minimum time before FFT task is repeated. Use with 20Khz sampling
 //#define FFT_MIN_CYCLE 46                      // minimum time before FFT task is repeated. Use with 10Khz sampling
-#else
-// slightly lower the sampling rate for -C3, to improve stability
-//constexpr SRate_t SAMPLE_RATE = 20480;        // 20Khz; Physical sample time -> 25ms
-//#define FFT_MIN_CYCLE 23                      // minimum time before FFT task is repeated.
-constexpr SRate_t SAMPLE_RATE = 18000;          // 18Khz; Physical sample time -> 28ms
-#define FFT_MIN_CYCLE 25                        // minimum time before FFT task is repeated.
-// try 16Khz in case your device still lags and responds too slowly.
-//constexpr SRate_t SAMPLE_RATE = 16000;        // 16Khz -> Physical sample time -> 32ms
-//#define FFT_MIN_CYCLE 30                      // minimum time before FFT task is repeated.
-#endif
+
 
 // FFT Constants
 constexpr uint16_t samplesFFT = 512;            // Samples in an FFT batch - This value MUST ALWAYS be a power of 2
 constexpr uint16_t samplesFFT_2 = 256;          // meaningful part of FFT results - only the "lower half" contains useful information.
 // the following are observed values, supported by a bit of "educated guessing"
-//#define FFT_DOWNSCALE 0.65f                             // 20kHz - downscaling factor for FFT results - "Flat-Top" window @20Khz, old freq channels 
-//#define FFT_DOWNSCALE 0.46f                             // downscaling factor for FFT results - for "Flat-Top" window @22Khz, new freq channels
-#define FFT_DOWNSCALE 0.40f                             // downscaling factor for FFT results, RMS averaging
 #define LOG_256  5.54517744f                            // log(256)
 
+#if !defined(UM_AUDIOREACTIVE_USE_INTEGER_FFT)
+#define FFTabs fabsf
+#define FFT_BIN_SCALE 16.0                              // scale factor for FFT results
+#define FFT_BIN_SCALE_RAW 16.0                          // scale factor for raw FFT results
+#else
+#define FFTabs abs
+#define FFT_BIN_SCALE 32                                // scale factor for FFT post processing to match float version (=512/16)
+#define FFT_BIN_SCALE_RAW 512                           // scale factor for raw FFT results (raw int FFT to raw float FFT)
+#undef FFT_MAJORPEAK_HUMAN_EAR // correction not (yet) supported for ingeter FFT
+#endif
+
 // These are the input and output vectors.  Input vectors receive computed results from FFT.
-static float* vReal = nullptr;       // FFT sample inputs / freq output -  these are our raw result bins
+static FFTsampleType* valFFT = nullptr;      // FFT sample inputs / freq output -  these are our raw result bins (real part for ArduinoFFT, real+imaginary for DSP FFT)
+#ifdef UM_AUDIOREACTIVE_USE_ARDUINO_FFT
 static float* vImag = nullptr;       // imaginary parts
+#else
+// buffer for pre-computed window function
+FFTsampleType* windowFFT;
+#endif
 
 #ifdef FFT_MAJORPEAK_HUMAN_EAR
 static float* pinkFactors = nullptr;                        // "pink noise" correction factors
@@ -402,9 +431,6 @@ constexpr float binWidth = SAMPLE_RATE / (float)samplesFFT; // frequency range o
 //#define FFT_SPEED_OVER_PRECISION     // enables use of reciprocals (1/x etc), and an a few other speedups - WLEDMM not faster on ESP32
 //#define FFT_SQRT_APPROXIMATION       // enables "quake3" style inverse sqrt                               - WLEDMM slower on ESP32
 #endif
-#define sqrt(x) sqrtf(x)             // little hack that reduces FFT time by 10-50% on ESP32 (as alternative to FFT_SQRT_APPROXIMATION)
-#define sqrt_internal sqrtf          // see https://github.com/kosme/arduinoFFT/pull/83
-#include <arduinoFFT.h>
 
 // Helper functions
 
@@ -416,43 +442,63 @@ static float mapf(float x, float in_min, float in_max, float out_min, float out_
 // compute average of several FFT result bins
 // linear average
 static float fftAddAvgLin(int from, int to) {
-  float result = 0.0f;
+  FFTmathType result = 0;
   for (int i = from; i <= to; i++) {
-    result += vReal[i];
+    result += valFFT[i];
   }
-  return result / float(to - from + 1);
+  result = result * FFT_BIN_SCALE; // divide by 16 to reduce magnitude. Want end result to be scaled linear and ~4096 max.
+  return (float)result / float(to - from + 1);
 }
 // RMS average
 static float fftAddAvgRMS(int from, int to) {
   double result = 0.0;
   for (int i = from; i <= to; i++) {
-    result += vReal[i] * vReal[i];
+    result += valFFT[i] * valFFT[i];
   }
-  return sqrtf(result / float(to - from + 1));
+#if !defined(UM_AUDIOREACTIVE_USE_INTEGER_FFT)
+  return sqrtf(result * (FFT_BIN_SCALE*FFT_BIN_SCALE) / float(to - from + 1));
+#else
+  return sqrt32_bw(result * (FFT_BIN_SCALE*FFT_BIN_SCALE) / (to - from + 1));
+#endif
 }
 
 static float fftAddAvg(int from, int to) {
-  if (from == to) return vReal[from];              // small optimization
-  if (averageByRMS) return fftAddAvgRMS(from, to); // use RMS
-  else return fftAddAvgLin(from, to);              // use linear average
+  if (from == to) return valFFT[from] * FFT_BIN_SCALE;   // small optimization
+  if (averageByRMS) return fftAddAvgRMS(from, to);       // use RMS
+  else return fftAddAvgLin(from, to);                    // use linear average
 }
 
-#if defined(CONFIG_IDF_TARGET_ESP32C3)
-constexpr bool skipSecondFFT = true;
-#else
-constexpr bool skipSecondFFT = false;
-#endif
-
 // allocate FFT sample buffers from heap
 static bool alocateFFTBuffers(void) {
   #ifdef SR_DEBUG
     USER_PRINT(F("\nFree heap ")); USER_PRINTLN(ESP.getFreeHeap());
   #endif
-
-  if (vReal) free(vReal); // should not happen
-  if (vImag) free(vImag); // should not happen
-  if ((vReal = (float*) calloc(sizeof(float), samplesFFT)) == nullptr) return false; // calloc or die
+#ifdef UM_AUDIOREACTIVE_USE_ARDUINO_FFT
+  if (valFFT) free(valFFT); // should not happen
+  if (vImag) free(vImag);   // should not happen
+  if ((valFFT = (float*) calloc(sizeof(float), samplesFFT)) == nullptr) return false; // calloc or die
   if ((vImag = (float*) calloc(sizeof(float), samplesFFT)) == nullptr) return false;
+#elif !defined(UM_AUDIOREACTIVE_USE_INTEGER_FFT)
+// note: free() is never used on these pointers. If it ever is implemented, this implementation can cause memory leaks (need to free raw pointers)
+  if (valFFT == nullptr) {
+    float* raw_buffer = (float*)heap_caps_malloc((2 * samplesFFT * sizeof(float)) + 16, MALLOC_CAP_8BIT);
+    if ((raw_buffer == nullptr)) return false; // something went wrong
+    valFFT = (float*)(((uintptr_t)raw_buffer + 15) & ~15);  // SIMD requires aligned memory to 16-byte boundary. note in IDF5 there is MALLOC_CAP_SIMD available
+  }
+  // create window
+  if (windowFFT == nullptr) {
+    float* raw_buffer = (float*)heap_caps_malloc((samplesFFT * sizeof(float)) + 16, MALLOC_CAP_8BIT);
+    if ((raw_buffer == nullptr)) return false; // something went wrong
+    windowFFT = (float*)(((uintptr_t)raw_buffer + 15) & ~15);  // SIMD requires aligned memory to 16-byte boundary
+  }
+#else
+  // allocate and initialize integer FFT buffers on first call
+  if (valFFT == nullptr) valFFT = (int16_t*) calloc(sizeof(int16_t), samplesFFT * 2);
+  if ((valFFT == nullptr)) return false; // something went wrong
+  // create window
+  if (windowFFT == nullptr) windowFFT = (int16_t*) calloc(sizeof(int16_t), samplesFFT);
+  if ((windowFFT == nullptr)) return false; // something went wrong
+#endif
 #ifdef FFT_MAJORPEAK_HUMAN_EAR
   if (pinkFactors) free(pinkFactors);
   if ((pinkFactors = (float*) calloc(sizeof(float), samplesFFT)) == nullptr) return false;
@@ -469,7 +515,8 @@ static bool alocateFFTBuffers(void) {
 
 // High-Pass "DC blocker" filter
 // see https://www.dsprelated.com/freebooks/filters/DC_Blocker.html
-static void runDCBlocker(uint_fast16_t numSamples, float *sampleBuffer) {
+static void runDCBlocker(uint_fast16_t numSamples, FFTsampleType *sampleBuffer) {
+ #if !defined(UM_AUDIOREACTIVE_USE_INTEGER_FFT)
   constexpr float filterR = 0.990f;      // around 40hz
   static float xm1 = 0.0f;
   static SR_HIRES_TYPE ym1 = 0.0f;
@@ -481,6 +528,21 @@ static void runDCBlocker(uint_fast16_t numSamples, float *sampleBuffer) {
     ym1 = filtered;    
     sampleBuffer[i] = filtered;
   }  
+  #else
+  constexpr int32_t FILTER_R_Q14 = 16187;  // 0.990 * 16384
+  static int32_t xm1 = 0;  // Previous input sample
+  static int32_t ym1 = 0;  // Previous output sample
+  for (unsigned i = 0; i < numSamples; i++) {
+    int32_t value = (int32_t)sampleBuffer[i];  // Current input as int32_t
+    int32_t filtered = (((value - xm1) << 14) + (FILTER_R_Q14 * ym1)) >> 14;  // Convert back from Q15
+    xm1 = value;
+    ym1 = filtered;
+    // Clamp output to int16_t range to prevent overflow note: this function has a AC gain of up to 2 at nyquist! TODO: is this intentional?
+    if (filtered > 32767) filtered = 32767;
+    else if (filtered < -32768) filtered = -32768;
+    sampleBuffer[i] = (int16_t)filtered;
+  }
+  #endif
 }
 
 //
@@ -500,33 +562,105 @@ void FFTcode(void * parameter)
   // see https://www.freertos.org/vtaskdelayuntil.html
   const TickType_t xFrequency = FFT_MIN_CYCLE * portTICK_PERIOD_MS;  
   const TickType_t xFrequencyDouble = FFT_MIN_CYCLE * portTICK_PERIOD_MS * 2;  
-  static bool isFirstRun = false;
 
 #ifdef FFT_USE_SLIDING_WINDOW
-  static float* oldSamples = nullptr; // previous 50% of samples
+  static FFTsampleType* oldSamples = nullptr; // previous 50% of samples
   static bool haveOldSamples = false; // for sliding window FFT
   bool usingOldSamples = false;
-  if (!oldSamples) oldSamples = (float*) calloc(sizeof(float), samplesFFT_2); // allocate on first run
+  if (!oldSamples) oldSamples = (FFTsampleType*) calloc(sizeof(FFTsampleType), samplesFFT_2); // allocate on first run
   if (!oldSamples) { disableSoundProcessing = true; return; }                 // no memory -> die
 #endif
 
+  float wc = 1.0; // FFT window correction factor, relative to Blackman_Harris  TODO: could make this global and apply it in fftAddAvg() OR: scale the window function and remove this variable
   bool success = true;
-  if ((vReal == nullptr) || (vImag == nullptr)) success = alocateFFTBuffers(); // allocate sample buffers on first run
+#ifdef UM_AUDIOREACTIVE_USE_ARDUINO_FFT
+  if ((valFFT == nullptr) || (vImag == nullptr)) success = alocateFFTBuffers(); // allocate sample buffers on first run
   if (success == false) { disableSoundProcessing = true; return; }             // no memory -> die
-
-  // create FFT object - we have to do if after allocating buffers
+    // create FFT object - we have to do if after allocating buffers
 #if defined(FFT_LIB_REV) && FFT_LIB_REV > 0x19
   // arduinoFFT 2.x has a slightly different API
-  static ArduinoFFT<float> FFT = ArduinoFFT<float>( vReal, vImag, samplesFFT, SAMPLE_RATE, true);
+  static ArduinoFFT<float> FFT = ArduinoFFT<float>( valFFT, vImag, samplesFFT, SAMPLE_RATE, true);
 #else
   // recommended version optimized by @softhack007 (API version 1.9)
   #if defined(WLED_ENABLE_HUB75MATRIX) && defined(CONFIG_IDF_TARGET_ESP32)
     static float* windowWeighingFactors = nullptr;
     if (!windowWeighingFactors) windowWeighingFactors = (float*) calloc(sizeof(float), samplesFFT); // cache for FFT windowing factors - use heap
+    if ((windowWeighingFactors == nullptr)) { disableSoundProcessing = true; return; }              // something went wrong
   #else
     static float windowWeighingFactors[samplesFFT] = {0.0f};                                        // cache for FFT windowing factors - use global RAM
   #endif
-  static ArduinoFFT<float> FFT = ArduinoFFT<float>( vReal, vImag, samplesFFT, SAMPLE_RATE, windowWeighingFactors);
+  static ArduinoFFT<float> FFT = ArduinoFFT<float>( valFFT, vImag, samplesFFT, SAMPLE_RATE, windowWeighingFactors);
+#endif
+#else
+  if ((valFFT == nullptr) && (windowFFT == nullptr)) success = alocateFFTBuffers(); // allocate sample buffers on first run
+  if (success == false) { disableSoundProcessing = true; return; }             // no memory -> die
+#if !defined(UM_AUDIOREACTIVE_USE_INTEGER_FFT)
+  if (dsps_fft2r_init_fc32(NULL, samplesFFT) != ESP_OK) return; // initialize FFT tables
+  // create window function for FFT
+    switch(fftWindow) {                             // apply FFT window. note: DSPS windows use ~2k of flash
+    case 1:
+      dsps_wind_hann_f32(windowFFT, samplesFFT);  // recommended for 50% overlap
+      wc = 0.66415918066;     // 1.8554726898 * 2.0
+    break;
+    case 2:
+      dsps_wind_nuttall_f32(windowFFT, samplesFFT);
+      wc = 0.9916873881f;     // 2.8163172034 * 2.0
+    break;
+    case 5:
+      dsps_wind_blackman_f32(windowFFT, samplesFFT); //1539460 - 1541582
+      wc = 0.84762867875f;     // 2.3673474360 * 2.0
+    break;
+    case 3:
+      dsps_wind_hann_f32(windowFFT, samplesFFT);  //Hamming window is not avilable in ESP-IDF, Hann is similar
+      wc = 0.66415918066;     // 1.8554726898 * 2.0
+    break;
+    case 4:
+        dsps_wind_flat_top_f32(windowFFT, samplesFFT);      // Weigh data using "Flat Top" function - better amplitude preservation, low frequency accuracy
+      wc = 1.276771793156f;   // 3.5659039231 * 2.0
+    break;
+    case 0: // falls through
+    default:
+      dsps_wind_blackman_harris_f32(windowFFT, samplesFFT); // Weigh data using "Blackman- Harris" window - sharp peaks due to excellent sideband rejection
+      wc = 1.0f;              // 2.7929062517 * 2.0
+  }
+#else
+  if (dsps_fft2r_init_sc16(NULL, samplesFFT) != ESP_OK) return; // initialize FFT tables
+  // create window function for FFT
+  float *windowFloat = (float*) calloc(sizeof(float), samplesFFT); // temporary buffer for window function
+  if ((windowFloat == nullptr)) return; // something went wrong
+    switch(fftWindow) {                             // apply FFT window. note: DSPS windows use ~2k of flash
+    case 1:
+      dsps_wind_hann_f32(windowFloat, samplesFFT);  // recommended for 50% overlap
+      wc = 0.66415918066;     // 1.8554726898 * 2.0
+    break;
+    case 2:
+      dsps_wind_nuttall_f32(windowFloat, samplesFFT);
+      wc = 0.9916873881f;     // 2.8163172034 * 2.0
+    break;
+    case 5:
+      dsps_wind_blackman_f32(windowFloat, samplesFFT); //1539460 - 1541582
+      wc = 0.84762867875f;     // 2.3673474360 * 2.0
+    break;
+    case 3:
+      dsps_wind_hann_f32(windowFloat, samplesFFT);  //Hamming window is not avilable in ESP-IDF, Hann is similar
+      wc = 0.66415918066;     // 1.8554726898 * 2.0
+    break;
+    case 4:
+        dsps_wind_flat_top_f32(windowFloat, samplesFFT);      // Weigh data using "Flat Top" function - better amplitude preservation, low frequency accuracy
+      wc = 1.276771793156f;   // 3.5659039231 * 2.0
+    break;
+    case 0: // falls through
+    default:
+      dsps_wind_blackman_harris_f32(windowFloat, samplesFFT); // Weigh data using "Blackman- Harris" window - sharp peaks due to excellent sideband rejection
+      wc = 1.0f;              // 2.7929062517 * 2.0
+  }
+
+  // convert float window to 16-bit int
+  for (int i = 0; i < samplesFFT; i++) {
+    windowFFT[i] = (int16_t)(windowFloat[i] * 32767.0f); // TODO: bake-in the window correction factor and save a few float multiplications
+  }
+  free(windowFloat); // free temporary buffer
+#endif
 #endif
 
   #ifdef FFT_MAJORPEAK_HUMAN_EAR
@@ -547,7 +681,6 @@ void FFTcode(void * parameter)
 
     // Don't run FFT computing code if we're in Receive mode or in realtime mode
     if (disableSoundProcessing || (audioSyncEnabled == AUDIOSYNC_REC)) {
-      isFirstRun = false;
       #ifdef FFT_USE_SLIDING_WINDOW
         haveOldSamples = false;
       #endif
@@ -570,11 +703,11 @@ void FFTcode(void * parameter)
 #endif
 
     // get a fresh batch of samples from I2S
-    memset(vReal, 0, sizeof(float) * samplesFFT); // start clean
+    memset(valFFT, 0, sizeof(FFTsampleType) * samplesFFT); // start clean
 #ifdef FFT_USE_SLIDING_WINDOW
     uint16_t readOffset;
     if (haveOldSamples && (doSlidingFFT > 0)) {
-      memcpy(vReal, oldSamples, sizeof(float) * samplesFFT_2);                     // copy first 50% from buffer
+      memcpy(valFFT, oldSamples, sizeof(FFTsampleType) * samplesFFT_2);                     // copy first 50% from buffer
       usingOldSamples = true;
       readOffset = samplesFFT_2;
     } else {
@@ -584,11 +717,11 @@ void FFTcode(void * parameter)
     // read fresh samples, in chunks of 50%
     do {
       // this looks a bit cumbersome, but it onlyworks this way - any second instance of the getSamples() call delivers junk data.
-      if (audioSource) audioSource->getSamples(vReal+readOffset, samplesFFT_2);
+      if (audioSource) audioSource->getSamples(valFFT+readOffset, samplesFFT_2);
       readOffset += samplesFFT_2;
     } while (readOffset < samplesFFT);
 #else
-    if (audioSource) audioSource->getSamples(vReal, samplesFFT);
+    if (audioSource) audioSource->getSamples(valFFT, samplesFFT);
 #endif
 
 #if defined(WLED_DEBUG) || defined(SR_DEBUG)|| defined(SR_STATS)
@@ -608,7 +741,6 @@ void FFTcode(void * parameter)
 #endif
 
     xLastWakeTime = xTaskGetTickCount();       // update "last unblocked time" for vTaskDelay
-    isFirstRun = !isFirstRun; //  toggle throttle
 
 #ifdef MIC_LOGGER
     float datMin = 0.0f;
@@ -616,12 +748,12 @@ void FFTcode(void * parameter)
     double datAvg = 0.0f;
     for (int i=0; i < samplesFFT; i++) {
       if (i==0) {
-        datMin = datMax = vReal[i];
+        datMin = datMax = valFFT[i];
       } else {
-        if (datMin >  vReal[i]) datMin =  vReal[i];
-        if (datMax <  vReal[i]) datMax =  vReal[i];
+        if (datMin >  valFFT[i]) datMin =  valFFT[i];
+        if (datMax <  valFFT[i]) datMax =  valFFT[i];
       }
-      datAvg +=  vReal[i];
+      datAvg +=  valFFT[i];
     }
 #endif
 
@@ -633,12 +765,12 @@ void FFTcode(void * parameter)
 #endif
 
     // normal mode: filter everything
-    float *samplesStart = vReal;
+    FFTsampleType *samplesStart = valFFT;
     uint16_t sampleCount = samplesFFT;
     #ifdef FFT_USE_SLIDING_WINDOW
     if (usingOldSamples) {
       // sliding window mode: only latest 50% need filtering
-      samplesStart = vReal + samplesFFT_2;
+      samplesStart = valFFT + samplesFFT_2;
       sampleCount = samplesFFT_2;
     }
     #endif
@@ -662,11 +794,8 @@ void FFTcode(void * parameter)
     start = esp_timer_get_time(); // start measuring FFT time
 #endif
 
-    // set imaginary parts to 0
-    memset(vImag, 0, sizeof(float) * samplesFFT);
-
     #ifdef FFT_USE_SLIDING_WINDOW
-    memcpy(oldSamples, vReal+samplesFFT_2, sizeof(float) * samplesFFT_2);  // copy last 50% to buffer (for sliding window FFT)
+    memcpy(oldSamples, valFFT+samplesFFT_2, sizeof(FFTsampleType) * samplesFFT_2);  // copy last 50% to buffer (for sliding window FFT)
     haveOldSamples = true;
     #endif
 
@@ -675,22 +804,27 @@ void FFTcode(void * parameter)
     uint_fast16_t newZeroCrossingCount = 0;
     for (int i=0; i < samplesFFT; i++) {
 	    // pick our  our current mic sample - we take the max value from all samples that go into FFT
-	    if ((vReal[i] <= (INT16_MAX - 1024)) && (vReal[i] >= (INT16_MIN + 1024))) { //skip extreme values - normally these are artefacts
+	    if ((valFFT[i] <= (INT16_MAX - 1024)) && (valFFT[i] >= (INT16_MIN + 1024))) { //skip extreme values - normally these are artefacts
       #ifdef FFT_USE_SLIDING_WINDOW
         if (usingOldSamples) {
-          if ((i >= samplesFFT_2) && (fabsf(vReal[i]) > maxSample)) maxSample = fabsf(vReal[i]);  // only look at newest 50%
+          if ((i >= samplesFFT_2) && (FFTabs(valFFT[i]) > maxSample)) maxSample = FFTabs(valFFT[i]);  // only look at newest 50%
         } else
       #endif
-        if (fabsf((float)vReal[i]) > maxSample) maxSample = fabsf((float)vReal[i]);
+        if (FFTabs((float)valFFT[i]) > maxSample) maxSample = FFTabs((float)valFFT[i]);
       }
       // WLED-MM/TroyHacks: Calculate zero crossings
       //
       if (i < (samplesFFT-1)) {
-        if (__builtin_signbit(vReal[i]) != __builtin_signbit(vReal[i+1]))  // test sign bit: sign changed -> zero crossing
-            newZeroCrossingCount++;
+#if !defined(UM_AUDIOREACTIVE_USE_INTEGER_FFT)
+        if (__builtin_signbit(valFFT[i]) != __builtin_signbit(valFFT[i+1]))  // test sign bit: sign changed -> zero crossing
+          newZeroCrossingCount++;
+#else
+        if ((valFFT[i] ^ valFFT[i+1]) < 0) // if sign bits differ, xor of sign bit will be 1 -> result will be negative
+          newZeroCrossingCount++;
+#endif
       }
     }
-    newZeroCrossingCount = (newZeroCrossingCount*2)/3; // reduce value so it typically stays below 256
+    newZeroCrossingCount = (newZeroCrossingCount*341)>>9; // multiply by 2/3 to reduce value so it typically stays below 256
     zeroCrossingCount = newZeroCrossingCount; // update only once, to avoid that effects pick up an intermediate value
 
     // release highest sample to volume reactive effects early - not strictly necessary here - could also be done at the end of the function
@@ -704,144 +838,198 @@ void FFTcode(void * parameter)
     // compute mix/max again after filering - usefull for filter debugging
     for (int i=0; i < samplesFFT; i++) {
       if (i==0) {
-        datMin = datMax = vReal[i];
+        datMin = datMax = valFFT[i];
       } else {
-        if (datMin >  vReal[i]) datMin =  vReal[i];
-        if (datMax <  vReal[i]) datMax =  vReal[i];
+        if (datMin >  valFFT[i]) datMin =  valFFT[i];
+        if (datMax <  valFFT[i]) datMax =  valFFT[i];
       }
     }
     micReal_min2 = datMin;
     micReal_max2 = datMax;
 #endif
-#endif
-
-    float wc = 1.0; // FFT window correction factor, relative to Blackman_Harris
+#endif 
 
     // run FFT (takes 3-5ms on ESP32)
     if (fabsf(volumeSmth) > 0.25f) { // noise gate open
-      if ((skipSecondFFT == false) || (isFirstRun == true)) {
-        // run FFT (takes 2-3ms on ESP32, ~12ms on ESP32-S2, ~30ms on -C3)
-        if (doDCRemoval) FFT.dcRemoval();                                            // remove DC offset
-        switch(fftWindow) {                                                          // apply FFT window
-          case 1:
-            FFT.windowing(FFTWindow::Hann, FFTDirection::Forward);  // recommended for 50% overlap
-            wc = 0.66415918066;     // 1.8554726898 * 2.0
-          break;
-          case 2:
-            FFT.windowing( FFTWindow::Nuttall, FFTDirection::Forward);
-            wc = 0.9916873881f;     // 2.8163172034 * 2.0
-          break;
-          case 5:
-            FFT.windowing( FFTWindow::Blackman, FFTDirection::Forward);
-            wc = 0.84762867875f;     // 2.3673474360 * 2.0
-          break;
-          case 3:
-            FFT.windowing( FFTWindow::Hamming, FFTDirection::Forward);
-            wc = 0.664159180663f;   // 1.8549343278 * 2.0
-          break;
-          case 4:
-            FFT.windowing( FFTWindow::Flat_top, FFTDirection::Forward);        // Weigh data using "Flat Top" function - better amplitude preservation, low frequency accuracy
-            wc = 1.276771793156f;   // 3.5659039231 * 2.0
-          break;
-          case 0: // falls through
-          default:
-            FFT.windowing(FFTWindow::Blackman_Harris, FFTDirection::Forward);  // Weigh data using "Blackman- Harris" window - sharp peaks due to excellent sideband rejection
-            wc = 1.0f;              // 2.7929062517 * 2.0
-        }
-        #ifdef FFT_USE_SLIDING_WINDOW
-        if (usingOldSamples) wc = wc * 1.10f; // compensate for loss caused by averaging
-        #endif
-
-        FFT.compute( FFTDirection::Forward );                       // Compute FFT
-        FFT.complexToMagnitude();                                   // Compute magnitudes
-        vReal[0] = 0;   // The remaining DC offset on the signal produces a strong spike on position 0 that should be eliminated to avoid issues.
-
-        float last_majorpeak = FFT_MajorPeak;
-        float last_magnitude = FFT_Magnitude;
-
-        #ifdef FFT_MAJORPEAK_HUMAN_EAR
-        // scale FFT results
-        for(uint_fast16_t binInd = 0; binInd < samplesFFT; binInd++)
-          vReal[binInd] *= pinkFactors[binInd];
-        #endif
+      float last_majorpeak = FFT_MajorPeak;
+      float last_magnitude = FFT_Magnitude;
+#ifdef UM_AUDIOREACTIVE_USE_ARDUINO_FFT
+      // run FFT (takes 2-3ms on ESP32, ~12ms on ESP32-S2, ~30ms on -C3)
+      if (doDCRemoval) FFT.dcRemoval();                                            // remove DC offset
+      switch(fftWindow) {                                                          // apply FFT window
+        case 1:
+          FFT.windowing(FFTWindow::Hann, FFTDirection::Forward);  // recommended for 50% overlap
+          wc = 0.66415918066;     // 1.8554726898 * 2.0
+        break;
+        case 2:
+          FFT.windowing( FFTWindow::Nuttall, FFTDirection::Forward);
+          wc = 0.9916873881f;     // 2.8163172034 * 2.0
+        break;
+        case 5:
+          FFT.windowing( FFTWindow::Blackman, FFTDirection::Forward);
+          wc = 0.84762867875f;     // 2.3673474360 * 2.0
+        break;
+        case 3:
+          FFT.windowing( FFTWindow::Hamming, FFTDirection::Forward);
+          wc = 0.664159180663f;   // 1.8549343278 * 2.0
+        break;
+        case 4:
+          FFT.windowing( FFTWindow::Flat_top, FFTDirection::Forward);        // Weigh data using "Flat Top" function - better amplitude preservation, low frequency accuracy
+          wc = 1.276771793156f;   // 3.5659039231 * 2.0
+        break;
+        case 0: // falls through
+        default:
+          FFT.windowing(FFTWindow::Blackman_Harris, FFTDirection::Forward);  // Weigh data using "Blackman- Harris" window - sharp peaks due to excellent sideband rejection
+          wc = 1.0f;              // 2.7929062517 * 2.0
+      }
+      #ifdef FFT_USE_SLIDING_WINDOW
+      if (usingOldSamples) wc = wc * 1.10f; // compensate for loss caused by averaging
+      #endif
+      // set imaginary parts to 0
+      memset(vImag, 0, sizeof(float) * samplesFFT);
+      FFT.compute( FFTDirection::Forward );                       // Compute FFT
+      FFT.complexToMagnitude();                                   // Compute magnitudes
+      valFFT[0] = 0;   // The remaining DC offset on the signal produces a strong spike on position 0 that should be eliminated to avoid issues.
+
+      #ifdef FFT_MAJORPEAK_HUMAN_EAR
+      // scale FFT results
+      for(uint_fast16_t binInd = 0; binInd < samplesFFT; binInd++)
+        valFFT[binInd] *= pinkFactors[binInd];
+      #endif
 
-        #if defined(FFT_LIB_REV) && FFT_LIB_REV > 0x19
-          // arduinoFFT 2.x has a slightly different API
-          FFT.majorPeak(&FFT_MajorPeak, &FFT_Magnitude);
-        #else
-          FFT.majorPeak(FFT_MajorPeak, FFT_Magnitude);                // let the effects know which freq was most dominant
-        #endif
-        FFT_Magnitude *= wc;  // apply correction factor
-
-        if (FFT_MajorPeak < (SAMPLE_RATE /  samplesFFT)) {FFT_MajorPeak = 1.0f; FFT_Magnitude = 0;}                  // too low - use zero
-        if (FFT_MajorPeak > (0.42f * SAMPLE_RATE)) {FFT_MajorPeak = last_majorpeak; FFT_Magnitude = last_magnitude;} // too high - keep last peak
-
-        #ifdef FFT_MAJORPEAK_HUMAN_EAR
-        // undo scaling - we want unmodified values for FFTResult[] computations
-        for(uint_fast16_t binInd = 0; binInd < samplesFFT; binInd++)
-          vReal[binInd] *= 1.0f/pinkFactors[binInd];
-        //fix peak magnitude
-        if ((FFT_MajorPeak > (binWidth/1.25f)) && (FFT_MajorPeak < (SAMPLE_RATE/2.2f)) && (FFT_Magnitude > 4.0f)) {
-          unsigned peakBin = constrain((int)((FFT_MajorPeak + binWidth/2.0f) / binWidth), 0, samplesFFT -1);
-          FFT_Magnitude *= fmaxf(1.0f/pinkFactors[peakBin], 1.0f);
+      #if defined(FFT_LIB_REV) && FFT_LIB_REV > 0x19
+        // arduinoFFT 2.x has a slightly different API
+        FFT.majorPeak(&FFT_MajorPeak, &FFT_Magnitude);
+      #else
+        FFT.majorPeak(FFT_MajorPeak, FFT_Magnitude);                // let the effects know which freq was most dominant
+      #endif
+#else
+      // run run float DSP FFT (takes ~x ms on ESP32, ~x ms on ESP32-S2, , ~x ms on ESP32-C3) TODO: test and fill in these values
+      // remove DC offset
+      if (doDCRemoval) {
+      FFTmathType sum = 0;
+      for (int i = 0; i < samplesFFT; i++) sum += valFFT[i];
+      FFTmathType mean = sum / (FFTmathType)samplesFFT;
+      for (int i = 0; i < samplesFFT; i++) valFFT[i] -= mean;
+      }
+#if !defined(UM_AUDIOREACTIVE_USE_INTEGER_FFT)
+      //apply window function to samples and fill buffer with interleaved complex values [Re,Im,Re,Im,...]
+      for (int i = samplesFFT - 1; i >= 0 ; i--) {
+        // fill the buffer back to front to avoid overwriting samples
+        float windowed_sample = valFFT[i] * windowFFT[i];
+        valFFT[i * 2] = windowed_sample;
+        valFFT[i * 2 + 1] = 0.0; // set imaginary part to zero
+      }
+#if defined(CONFIG_IDF_TARGET_ESP32S3)
+      dsps_fft2r_fc32_aes3(valFFT, samplesFFT); // ESP32 S3 optimized version of FFT
+#elif defined(CONFIG_IDF_TARGET_ESP32)
+      dsps_fft2r_fc32_ae32(valFFT, samplesFFT); // ESP32 optimized version of FFT
+#else
+      dsps_fft2r_fc32_ansi(valFFT, samplesFFT); // perform FFT using ANSI C implementation
+#endif
+      dsps_bit_rev_fc32(valFFT, samplesFFT);    // bit reverse
+      valFFT[0] = 0;  // set DC bin to 0, as it is not needed and can cause issues
+      // convert to magnitude & find FFT_MajorPeak and FFT_Magnitude
+      FFT_MajorPeak = 0;
+      FFT_Magnitude = 0;
+      for (int i = 1; i < samplesFFT_2; i++) {  // skip [0] as it is DC offset
+        float real_part = valFFT[i * 2];
+        float imag_part = valFFT[i * 2 + 1];
+        valFFT[i] = sqrtf(real_part * real_part + imag_part * imag_part);
+        if (valFFT[i] > FFT_Magnitude) {
+          FFT_Magnitude = valFFT[i];
+          FFT_MajorPeak = i*(SAMPLE_RATE/samplesFFT);
         }
-        #endif
-        FFT_MajorPeak = constrain(FFT_MajorPeak, 1.0f, 11025.0f);   // restrict value to range expected by effects
-        FFT_MajPeakSmth = FFT_MajPeakSmth + 0.42 * (FFT_MajorPeak - FFT_MajPeakSmth);   // I like this "swooping peak" look
+      }
+#else
+      // run integer DSP FFT (takes ~x ms on ESP32, ~x ms on ESP32-S2, , ~1.5 ms on ESP32-C3) TODO: test and fill in these values
+      //apply window function to samples and fill buffer with interleaved complex values [Re,Im,Re,Im,...]
+      for (int i = samplesFFT - 1; i >= 0 ; i--) {
+        // fill the buffer back to front to avoid overwriting samples
+        int16_t windowed_sample = ((int32_t)valFFT[i] * (int32_t)windowFFT[i]) >> 15; // both values are ±15bit
+        valFFT[i * 2] = windowed_sample;
+        valFFT[i * 2 + 1] = 0; // set imaginary part to zero
+      }
+      dsps_fft2r_sc16_ansi(valFFT, samplesFFT);   // perform FFT on complex value pairs (Re,Im)
+      dsps_bit_rev_sc16_ansi(valFFT, samplesFFT); // bit reverse i.e. "unshuffle" the results
+      valFFT[0] = 0; // set DC bin to 0, as it is not needed and can cause issues
+      // convert to magnitude, FFT returns interleaved complex values [Re,Im,Re,Im,...]
+      int FFT_MajorPeak_int = 0;
+      int FFT_Magnitude_int = 0;
+      for (int i = 1; i < samplesFFT_2; i++) { // skip [0], it is DC offset
+        int32_t real_part = valFFT[i * 2];
+        int32_t imag_part = valFFT[i * 2 + 1];
+        valFFT[i] = sqrt32_bw(real_part * real_part + imag_part * imag_part); // note: this should never overflow as Re and Im form a vector of maximum length 32767
+        if (valFFT[i] > FFT_Magnitude_int) {
+          FFT_Magnitude_int = valFFT[i] * FFT_BIN_SCALE_RAW; // scale to match raw float value
+          FFT_MajorPeak_int = ((i * SAMPLE_RATE)/samplesFFT);
+        }
+        // note: scaling is done in fftAddAvg(), so we don't neet to scale here
+      }
+      FFT_MajorPeak = FFT_MajorPeak_int;
+      FFT_Magnitude = FFT_Magnitude_int;
+#endif
+#endif
+      FFT_Magnitude *= wc;  // apply correction factor
+      if (FFT_MajorPeak < (SAMPLE_RATE /  samplesFFT)) {FFT_MajorPeak = 1.0f; FFT_Magnitude = 0;}                  // too low - use zero
+      if (FFT_MajorPeak > (0.42f * SAMPLE_RATE)) {FFT_MajorPeak = last_majorpeak; FFT_Magnitude = last_magnitude;} // too high - keep last peak
 
-      } else { // skip second run --> clear fft results, keep peaks
-        memset(vReal, 0, sizeof(float) * samplesFFT); 
+      #ifdef FFT_MAJORPEAK_HUMAN_EAR
+      // undo scaling - we want unmodified values for FFTResult[] computations
+      for(uint_fast16_t binInd = 0; binInd < samplesFFT; binInd++)
+        valFFT[binInd] *= 1.0f/pinkFactors[binInd];
+      //fix peak magnitude
+      if ((FFT_MajorPeak > (binWidth/1.25f)) && (FFT_MajorPeak < (SAMPLE_RATE/2.2f)) && (FFT_Magnitude > 4.0f)) {
+        unsigned peakBin = constrain((int)((FFT_MajorPeak + binWidth/2.0f) / binWidth), 0, samplesFFT -1);
+        FFT_Magnitude *= fmaxf(1.0f/pinkFactors[peakBin], 1.0f);
       }
+      #endif
+      FFT_MajorPeak = constrain(FFT_MajorPeak, 1.0f, 11025.0f);   // restrict value to range expected by effects
+      FFT_MajPeakSmth = FFT_MajPeakSmth + 0.42 * (FFT_MajorPeak - FFT_MajPeakSmth);   // I like this "swooping peak" look
 #if defined(WLED_DEBUG) || defined(SR_DEBUG) || defined(SR_STATS)
       haveDoneFFT = true;
 #endif
 
     } else { // noise gate closed - only clear results as FFT was skipped. MIC samples are still valid when we do this.
-      memset(vReal, 0, sizeof(float) * samplesFFT);
+      memset(valFFT, 0, sizeof(float) * samplesFFT);
       FFT_MajorPeak = 1;
       FFT_Magnitude = 0.001;
     }
 
-    if ((skipSecondFFT == false) || (isFirstRun == true)) {
-      for (int i = 0; i < samplesFFT; i++) {
-        float t = fabsf(vReal[i]);                      // just to be sure - values in fft bins should be positive any way
-        vReal[i] = t / 16.0f;                           // Reduce magnitude. Want end result to be scaled linear and ~4096 max.
-      } // for()
-
-      // mapping of FFT result bins to frequency channels
-      //if (fabsf(sampleAvg) > 0.25f) { // noise gate open
-      if (fabsf(volumeSmth) > 0.25f) { // noise gate open
-        //WLEDMM: different distributions
-        if (freqDist == 0) {
-          /* new mapping, optimized for 22050 Hz by softhack007 --- update: removed overlap */
-                                                         // bins frequency  range
-          if (useInputFilter==1) {
-            // skip frequencies below 100hz
-            fftCalc[ 0] = wc * 0.8f * fftAddAvg(3,3);
-            fftCalc[ 1] = wc * 0.9f * fftAddAvg(4,4);
-            fftCalc[ 2] = wc * fftAddAvg(5,5);
-            fftCalc[ 3] = wc * fftAddAvg(6,6);
-            // don't use the last bins from 206 to 255. 
-            fftCalc[15] = wc * fftAddAvg(165,205) * 0.75f;   // 40 7106 - 8828 high             -- with some damping
-          } else {
-            fftCalc[ 0] = wc * fftAddAvg(1,1);               // 1    43 - 86   sub-bass
-            fftCalc[ 1] = wc * fftAddAvg(2,2);               // 1    86 - 129  bass
-            fftCalc[ 2] = wc * fftAddAvg(3,4);               // 2   129 - 216  bass
-            fftCalc[ 3] = wc * fftAddAvg(5,6);               // 2   216 - 301  bass + midrange
-            // don't use the last bins from 216 to 255. They are usually contaminated by aliasing (aka noise) 
-            fftCalc[15] = wc * fftAddAvg(165,215) * 0.70f;   // 50 7106 - 9259 high             -- with some damping
-          }
-          fftCalc[ 4] = wc * fftAddAvg(7,9);                 // 3   301 - 430  midrange
-          fftCalc[ 5] = wc * fftAddAvg(10,12);               // 3   430 - 560  midrange
-          fftCalc[ 6] = wc * fftAddAvg(13,18);               // 5   560 - 818  midrange
-          fftCalc[ 7] = wc * fftAddAvg(19,25);               // 7   818 - 1120 midrange -- 1Khz should always be the center !
-          fftCalc[ 8] = wc * fftAddAvg(26,32);               // 7  1120 - 1421 midrange
-          fftCalc[ 9] = wc * fftAddAvg(33,43);               // 9  1421 - 1895 midrange
-          fftCalc[10] = wc * fftAddAvg(44,55);               // 12 1895 - 2412 midrange + high mid
-          fftCalc[11] = wc * fftAddAvg(56,69);               // 14 2412 - 3015 high mid
-          fftCalc[12] = wc * fftAddAvg(70,85);               // 16 3015 - 3704 high mid
-          fftCalc[13] = wc * fftAddAvg(86,103);              // 18 3704 - 4479 high mid
-          fftCalc[14] = wc * fftAddAvg(104,164) * 0.88f;     // 61 4479 - 7106 high mid + high  -- with slight damping
+    // mapping of FFT result bins to frequency channels
+    //if (fabsf(sampleAvg) > 0.25f) { // noise gate open
+    if (fabsf(volumeSmth) > 0.25f) { // noise gate open
+      //WLEDMM: different distributions
+      if (freqDist == 0) {
+        /* new mapping, optimized for 22050 Hz by softhack007 --- update: removed overlap */
+                                                        // bins frequency  range
+        if (useInputFilter==1) {
+          // skip frequencies below 100hz
+          fftCalc[ 0] = wc * 0.8f * fftAddAvg(3,3);
+          fftCalc[ 1] = wc * 0.9f * fftAddAvg(4,4);
+          fftCalc[ 2] = wc * fftAddAvg(5,5);
+          fftCalc[ 3] = wc * fftAddAvg(6,6);
+          // don't use the last bins from 206 to 255. 
+          fftCalc[15] = wc * fftAddAvg(165,205) * 0.75f;   // 40 7106 - 8828 high             -- with some damping
+        } else {
+          fftCalc[ 0] = wc * fftAddAvg(1,1);               // 1    43 - 86   sub-bass
+          fftCalc[ 1] = wc * fftAddAvg(2,2);               // 1    86 - 129  bass
+          fftCalc[ 2] = wc * fftAddAvg(3,4);               // 2   129 - 216  bass
+          fftCalc[ 3] = wc * fftAddAvg(5,6);               // 2   216 - 301  bass + midrange
+          // don't use the last bins from 216 to 255. They are usually contaminated by aliasing (aka noise) 
+          fftCalc[15] = wc * fftAddAvg(165,215) * 0.70f;   // 50 7106 - 9259 high             -- with some damping
+        }
+        fftCalc[ 4] = wc * fftAddAvg(7,9);                 // 3   301 - 430  midrange
+        fftCalc[ 5] = wc * fftAddAvg(10,12);               // 3   430 - 560  midrange
+        fftCalc[ 6] = wc * fftAddAvg(13,18);               // 5   560 - 818  midrange
+        fftCalc[ 7] = wc * fftAddAvg(19,25);               // 7   818 - 1120 midrange -- 1Khz should always be the center !
+        fftCalc[ 8] = wc * fftAddAvg(26,32);               // 7  1120 - 1421 midrange
+        fftCalc[ 9] = wc * fftAddAvg(33,43);               // 9  1421 - 1895 midrange
+        fftCalc[10] = wc * fftAddAvg(44,55);               // 12 1895 - 2412 midrange + high mid
+        fftCalc[11] = wc * fftAddAvg(56,69);               // 14 2412 - 3015 high mid
+        fftCalc[12] = wc * fftAddAvg(70,85);               // 16 3015 - 3704 high mid
+        fftCalc[13] = wc * fftAddAvg(86,103);              // 18 3704 - 4479 high mid
+        fftCalc[14] = wc * fftAddAvg(104,164) * 0.88f;     // 61 4479 - 7106 high mid + high  -- with slight damping
       } else if (freqDist == 1) { //WLEDMM: Rightshift: note ewowi: frequencies in comments are not correct
         if (useInputFilter==1) {
           // skip frequencies below 100hz
@@ -872,18 +1060,11 @@ void FFTcode(void * parameter)
         fftCalc[14] = wc * fftAddAvg(98,164) * 0.88f;      // 61 4479 - 7106 high mid + high  -- with slight damping
       }
     } else {  // noise gate closed - just decay old values
-      isFirstRun = false;
       for (int i=0; i < NUM_GEQ_CHANNELS; i++) {
         fftCalc[i] *= 0.85f;  // decay to zero
         if (fftCalc[i] < 4.0f) fftCalc[i] = 0.0f;
     } }
 
-      memcpy(lastFftCalc, fftCalc, sizeof(lastFftCalc)); // make a backup of last "good" channels
-
-    } else { // if second run skipped
-      memcpy(fftCalc, lastFftCalc, sizeof(fftCalc)); // restore last "good" channels
-    }
-
     // post-processing of frequency channels (pink noise adjustment, AGC, smoothing, scaling)
     if (pinkIndex > MAX_PINK) pinkIndex = MAX_PINK;
 
@@ -918,11 +1099,8 @@ void FFTcode(void * parameter)
         vTaskDelayUntil( &xLastWakeTime, xFrequencyDouble); // we need a double wait when no old data was used
       } else
   #endif
-      if ((skipSecondFFT == false) || (fabsf(volumeSmth) < 0.25f)) {
+      if (fabsf(volumeSmth) < 0.25f)
         vTaskDelayUntil( &xLastWakeTime, xFrequency);        // release CPU, and let I2S fill its buffers
-      } else if (isFirstRun == true) {
-        vTaskDelayUntil( &xLastWakeTime, xFrequencyDouble);  // release CPU after performing FFT in "skip second run" mode
-      }
     }
   } // for(;;)ever
 } // FFTcode() task end
@@ -932,11 +1110,14 @@ void FFTcode(void * parameter)
 // Pre / Postprocessing  //
 ///////////////////////////
 
-static void runMicFilter(uint16_t numSamples, float *sampleBuffer)          // pre-filtering of raw samples (band-pass)
+static void runMicFilter(uint16_t numSamples, FFTsampleType *sampleBuffer)          // pre-filtering of raw samples (band-pass)
 {
+#if !defined(UM_AUDIOREACTIVE_USE_INTEGER_FFT)
   // low frequency cutoff parameter - see https://dsp.stackexchange.com/questions/40462/exponential-moving-average-cut-off-frequency
   //constexpr float alpha = 0.04f;   // 150Hz
   //constexpr float alpha = 0.03f;   // 110Hz
+  //constexpr float alpha = 0.0285f; //100Hz
+  //constexpr float alpha = 0.0256f; //90Hz
   constexpr float alpha = 0.0225f; // 80hz
   //constexpr float alpha = 0.01693f;// 60hz
   // high frequency cutoff  parameter
@@ -961,6 +1142,38 @@ static void runMicFilter(uint16_t numSamples, float *sampleBuffer)          // p
         lowfilt += alpha * (sampleBuffer[i] - lowfilt);
         sampleBuffer[i] = sampleBuffer[i] - lowfilt;
   }
+#else
+  //constexpr int32_t ALPHA_FP = 1311;    // 0.04f * (1<<15) (150Hz)
+  //constexpr int32_t ALPHA_FP = 983;     // 0.03f * (1<<15) (110Hz)
+  //constexpr int32_t ALPHA_FP = 934;     // 0.0285f * (1<<15) (100Hz)
+  constexpr int32_t ALPHA_FP = 840;       // 0.0256f * (1<<15) (90Hz)
+  //constexpr int32_t ALPHA_FP = 737;     // 0.0225f * (1<<15) (80Hz)
+  //constexpr int32_t ALPHA_FP = 555;     // 0.01693f * (1<<15) (60Hz)
+
+  // high frequency cutoff parameters 16.16 fixed point format
+  //constexpr int32_t BETA1_FP = 49152;   // 0.75f * (1<<16) (11KHz)
+  //constexpr int32_t BETA1_FP = 53740;   // 0.82f * (1<<16) (15KHz)
+  //constexpr int32_t BETA1_FP = 54297;   // 0.8285f * (1<<16) (18KHz)
+  constexpr int32_t BETA1_FP = 55706;     // 0.85f * (1<<16) (20KHz)
+  constexpr int32_t BETA2_FP = (65536 - BETA1_FP) / 2;  // ((1.0f - beta1) / 2.0f) * (1<<16)
+
+  static int32_t last_vals[2] = { 0 };    // FIR high freq cutoff filter (scaled by sample range)
+  static int32_t lowfilt_fp = 0;          // IIR low frequency cutoff filter (16.16 fixed point)
+
+  for (int i = 0; i < numSamples; i++) {
+    // FIR lowpass filter to remove high frequency noise
+    int32_t highFilteredSample_fp;
+
+    if (i < (numSamples - 1))
+      highFilteredSample_fp = (BETA1_FP * (int32_t)sampleBuffer[i] + BETA2_FP * last_vals[0] + BETA2_FP * (int32_t)sampleBuffer[i + 1]) >> 16; // smooth out spikes
+    else
+      highFilteredSample_fp = (BETA1_FP * (int32_t)sampleBuffer[i] + BETA2_FP * last_vals[0] + BETA2_FP * last_vals[1]) >> 16; // special handling for last sample in array
+    last_vals[1] = last_vals[0];
+    last_vals[0] = (int32_t)sampleBuffer[i];
+    lowfilt_fp += ALPHA_FP * (highFilteredSample_fp - (lowfilt_fp >> 15)); // low pass filter in 17.15 fixed point format
+    sampleBuffer[i] = highFilteredSample_fp - (lowfilt_fp >> 15);
+  }
+#endif
 }
 
 static void postProcessFFTResults(bool noiseGateOpen, int numberOfChannels, bool i2sFastpath) // post-processing and post-amp of GEQ channels
@@ -970,7 +1183,6 @@ static void postProcessFFTResults(bool noiseGateOpen, int numberOfChannels, bool
       if (noiseGateOpen) { // noise gate open
         // Adjustment for frequency curves.
         fftCalc[i] *= fftResultPink[pinkIndex][i];
-        if (FFTScalingMode > 0) fftCalc[i] *= FFT_DOWNSCALE;  // adjustment related to FFT windowing function
         // Manual linear adjustment of gain using sampleGain adjustment for different input types.
         fftCalc[i] *= soundAgc ? multAgc : ((float)sampleGain/40.0f * (float)inputLevel/128.0f + 1.0f/16.0f); //apply gain, with inputLevel adjustment
         if(fftCalc[i] < 0) fftCalc[i] = 0;
@@ -1061,7 +1273,7 @@ static void postProcessFFTResults(bool noiseGateOpen, int numberOfChannels, bool
 // Peak detection //
 ////////////////////
 
-// peak detection is called from FFT task when vReal[] contains valid FFT results
+// peak detection is called from FFT task when valFFT[] contains valid (unscaled) FFT results
 static void detectSamplePeak(void) {
   bool havePeak = false;
 #if 1
@@ -1069,7 +1281,7 @@ static void detectSamplePeak(void) {
   // Poor man's beat detection by seeing if sample > Average + some value.
   // This goes through ALL of the 255 bins - but ignores stupid settings
   // Then we got a peak, else we don't. The peak has to time out on its own in order to support UDP sound sync.
-  if ((sampleAvg > 1) && (maxVol > 0) && (binNum > 4) && (vReal[binNum] > maxVol) && ((millis() - timeOfPeak) > 100)) {
+  if ((sampleAvg > 1) && (maxVol > 0) && (binNum > 4) && (valFFT[binNum]*FFT_BIN_SCALE_RAW > maxVol) && ((millis() - timeOfPeak) > 100)) {
     havePeak = true;
   }
 #endif
diff --git a/usermods/audioreactive/audio_source.h b/usermods/audioreactive/audio_source.h
index 5548ac5602..b1c54e91a3 100644
--- a/usermods/audioreactive/audio_source.h
+++ b/usermods/audioreactive/audio_source.h
@@ -158,7 +158,7 @@ class AudioSource {
        Read num_samples from the microphone, and store them in the provided
        buffer
     */
-    virtual void getSamples(float *buffer, uint16_t num_samples) = 0;
+    virtual void getSamples(FFTsampleType *buffer, uint16_t num_samples) = 0;
 
     /* check if the audio source driver was initialized successfully */
     virtual bool isInitialized(void) {return(_initialized);}
@@ -387,12 +387,12 @@ class I2SSource : public AudioSource {
       if (_mclkPin != I2S_PIN_NO_CHANGE) pinManager.deallocatePin(_mclkPin, PinOwner::UM_Audioreactive);
     }
 
-    virtual void getSamples(float *buffer, uint16_t num_samples) {
+    virtual void getSamples(FFTsampleType *buffer, uint16_t num_samples) {
       if (_initialized) {
         esp_err_t err;
         size_t bytes_read = 0;        /* Counter variable to check if we actually got enough data */
 
-        memset(buffer, 0, sizeof(float) * num_samples);  // clear output buffer
+        memset(buffer, 0, sizeof(FFTsampleType) * num_samples);  // clear output buffer
         I2S_datatype *newSamples = newSampleBuffer; // use global input buffer
         if (num_samples > I2S_SAMPLES_MAX) num_samples = I2S_SAMPLES_MAX; // protect the buffer from overflow
 
@@ -408,19 +408,35 @@ class I2SSource : public AudioSource {
           return;
         }
 
-        // Store samples in sample buffer and update DC offset
+        // Store samples in sample buffer
+#if defined(UM_AUDIOREACTIVE_USE_INTEGER_FFT)
+        //constexpr int32_t FIXEDSHIFT = 8; // shift by 8 bits for fixed point math (no loss at 24bit input sample resolution)
+        //int32_t intSampleScale = _sampleScale * (1<<FIXEDSHIFT); // _sampleScale <= 1.0f, shift for fixed point math
+#endif
         for (int i = 0; i < num_samples; i++) {
 
           newSamples[i] = postProcessSample(newSamples[i]);  // perform postprocessing (needed for ADC samples)
-          
-          float currSample = 0.0f;
-#ifdef I2S_SAMPLE_DOWNSCALE_TO_16BIT
-              currSample = (float) newSamples[i] / 65536.0f;      // 32bit input -> 16bit; keeping lower 16bits as decimal places
+#if !defined(UM_AUDIOREACTIVE_USE_INTEGER_FFT)
+  #ifdef I2S_SAMPLE_DOWNSCALE_TO_16BIT
+          float currSample = (float) newSamples[i] / 65536.0f;      // 32bit input -> 16bit; keeping lower 16bits as decimal places
+  #else
+          float currSample = (float) newSamples[i];                 // 16bit input -> use as-is
+  #endif
+          buffer[i] = currSample;
+          buffer[i] *= _sampleScale;                               // scale samples
 #else
-              currSample = (float) newSamples[i];                 // 16bit input -> use as-is
+  #ifdef I2S_SAMPLE_DOWNSCALE_TO_16BIT
+          // note on sample scaling: scaling is only used for inputs with master clock and those are better suited for ESP32 or S3
+          // execution speed is critical on single core MCUs
+          //int32_t currSample = newSamples[i] >> FIXEDSHIFT;   // shift to avoid overlow in multiplication
+          //currSample = (currSample * intSampleScale) >> 16;   // scale samples, shift down to 16bit
+          int16_t currSample = newSamples[i] >> 16;           // no sample scaling, just shift down to 16bit (not scaling saves ~0.4ms on C3)
+  #else
+          //int32_t currSample = (newSamples[i] * intSampleScale) >> FIXEDSHIFT;   // scale samples, shift back down to 16bit
+          int16_t currSample = newSamples[i];                 // 16bit input -> use as-is
+  #endif
+          buffer[i] = (int16_t)currSample;
 #endif
-          buffer[i] = currSample;
-          buffer[i] *= _sampleScale;                              // scale samples
         }
       }
     }
@@ -1088,7 +1104,7 @@ class I2SAdcSource : public I2SSource {
     }
 
 
-    void getSamples(float *buffer, uint16_t num_samples) {
+    void getSamples(FFTsampleType *buffer, uint16_t num_samples) {
       /* Enable ADC. This has to be enabled and disabled directly before and
        * after sampling, otherwise Wifi dies
        */
diff --git a/wled00/fcn_declare.h b/wled00/fcn_declare.h
index 1a9247a2d9..9da45dd7d7 100644
--- a/wled00/fcn_declare.h
+++ b/wled00/fcn_declare.h
@@ -470,6 +470,7 @@ uint8_t cos8_t(uint8_t theta);
 float sin_approx(float theta); // uses integer math (converted to float), accuracy +/-0.0015 (compared to sinf())
 float cos_approx(float theta);
 float tan_approx(float x);
+uint32_t sqrt32_bw(uint32_t x);
 //float atan2_t(float y, float x);
 //float acos_t(float x);
 //float asin_t(float x);
diff --git a/wled00/wled_math.cpp b/wled00/wled_math.cpp
index bd5423389a..f3fff3d988 100644
--- a/wled00/wled_math.cpp
+++ b/wled00/wled_math.cpp
@@ -110,6 +110,30 @@ float tan_approx(float x) {
   return res;
 }
 
+// bit-wise integer square root calculation (exact) by @dedehai
+uint32_t sqrt32_bw(uint32_t x) {
+  uint32_t res = 0;
+  uint32_t bit;
+  uint32_t num = x; // use 32bit for faster calculation
+
+  if(num < 1 << 10)  bit = 1 << 10; // speed optimization for small numbers < 32^2
+  else if (num < 1 << 20) bit = 1 << 20; // speed optimization for medium numbers < 1024^2
+  else bit = 1 << 30; // start with highest power of 4 <= 2^32
+
+  while (bit > num) bit >>= 2; // reduce iterations
+
+  while (bit != 0) {
+    if (num >= res + bit) {
+      num -= res + bit;
+      res = (res >> 1) + bit;
+    } else {
+      res >>= 1;
+    }
+    bit >>= 2;
+  }
+  return res;
+}
+
 #if 0  // WLEDMM we prefer libm functions that are accurate and fast.
 #define ATAN2_CONST_A 0.1963f
 #define ATAN2_CONST_B 0.9817f