add amplitude_floor in vad masking (#1883)

yujonglee · web-flow · commit ff6fc8ac91fa · 2025-11-25T20:59:48.000+09:00
diff --git a/crates/vad-ext/src/continuous2.rs b/crates/vad-ext/src/continuous2.rs
@@ -7,47 +7,34 @@ use futures_util::Stream;
 use hypr_audio_utils::f32_to_i16_samples;
 use hypr_vad3::earshot::{VoiceActivityDetector, VoiceActivityProfile};
 
-/// Wraps a `Stream<Item = Result<Vec<f32>, E>>` and zeroes out samples that are
-/// classified as non-speech by a 16 kHz VAD, with a configurable hangover.
-///
-/// - Expects 16 kHz mono PCM in `[-1.0, 1.0]`.
-/// - Never changes the length of any chunk.
-/// - Fails open: on VAD errors, audio is passed through as speech.
 pub struct ContinuousVadMaskStream<S> {
     inner: S,
     vad: VoiceActivityDetector,
-    /// Number of consecutive non-speech frames to keep as speech
-    /// after speech has ended (to avoid chopping word endings).
     hangover_frames: usize,
     trailing_non_speech: usize,
     in_speech: bool,
-    /// Scratch buffer for padding partial frames, reused across calls.
     scratch_frame: Vec<f32>,
+    amplitude_floor: f32,
 }
 
 impl<S> ContinuousVadMaskStream<S> {
-    /// Construct with the default QUALITY profile and conservative hangover.
     pub fn new(inner: S) -> Self {
         Self {
             inner,
             vad: VoiceActivityDetector::new(VoiceActivityProfile::QUALITY),
             hangover_frames: 3,
             trailing_non_speech: 0,
-            // Start in speech to be conservative: better to leak a bit of
-            // initial noise than to truncate the first spoken frames.
             in_speech: true,
             scratch_frame: Vec::new(),
+            amplitude_floor: 0.001,
         }
     }
 
-    /// Override the number of hangover frames (measured in VAD frames).
     pub fn with_hangover_frames(mut self, frames: usize) -> Self {
         self.hangover_frames = frames;
         self
     }
 
-    /// Optional: allow callers to change the VAD profile without changing behavior
-    /// for existing code.
     pub fn with_profile(mut self, profile: VoiceActivityProfile) -> Self {
         self.vad = VoiceActivityDetector::new(profile);
         self
@@ -61,28 +48,20 @@ impl<S> ContinuousVadMaskStream<S> {
         let frame_size = hypr_vad3::choose_optimal_frame_size(chunk.len());
         debug_assert!(frame_size > 0, "VAD frame size must be > 0");
 
-        // `chunks_mut` yields exactly one frame when `chunk.len() <= frame_size`,
-        // and multiple frames otherwise. No need for a special-case branch.
         for frame in chunk.chunks_mut(frame_size) {
             self.process_frame(frame, frame_size);
         }
     }
 
-    /// Internal state machine that applies hangover smoothing to the raw VAD
-    /// decision. Returns the final `is_speech` value.
     fn smooth_vad_decision(&mut self, raw_is_speech: bool) -> bool {
         if raw_is_speech {
-            // Fresh speech: reset hangover.
             self.in_speech = true;
             self.trailing_non_speech = 0;
             true
         } else if self.in_speech && self.trailing_non_speech < self.hangover_frames {
-            // Still treat as speech for a few frames after VAD says "no"
-            // to avoid chopping word endings or short pauses.
             self.trailing_non_speech += 1;
             true
         } else {
-            // Long enough non-speech: actually treat as silence.
             self.in_speech = false;
             self.trailing_non_speech = 0;
             false
@@ -94,13 +73,19 @@ impl<S> ContinuousVadMaskStream<S> {
             return;
         }
 
-        // Convert to i16, padding only when needed. We always feed `frame_size`
-        // samples into the VAD (either directly or via `scratch_frame`).
+        let rms = Self::calculate_rms(frame);
+        if rms < self.amplitude_floor {
+            let is_speech = self.smooth_vad_decision(false);
+            if !is_speech {
+                frame.fill(0.0);
+            }
+            return;
+        }
+
         let raw_is_speech = if frame.len() == frame_size {
             let i16_samples = f32_to_i16_samples(frame);
             self.vad.predict_16khz(&i16_samples).unwrap_or(true)
         } else {
-            // Partial frame at the end of a chunk: copy + pad with zeros.
             self.scratch_frame.clear();
             self.scratch_frame.extend_from_slice(frame);
             self.scratch_frame.resize(frame_size, 0.0);
@@ -111,11 +96,18 @@ impl<S> ContinuousVadMaskStream<S> {
 
         let is_speech = self.smooth_vad_decision(raw_is_speech);
 
-        // Mask non-speech frames by zeroing in-place.
         if !is_speech {
             frame.fill(0.0);
         }
     }
+
+    fn calculate_rms(samples: &[f32]) -> f32 {
+        if samples.is_empty() {
+            return 0.0;
+        }
+        let sum_sq: f32 = samples.iter().map(|&s| s * s).sum();
+        (sum_sq / samples.len() as f32).sqrt()
+    }
 }
 
 impl<S, E> Stream for ContinuousVadMaskStream<S>
@@ -138,8 +130,6 @@ where
 }
 
 pub trait VadMaskExt: Sized {
-    /// Wrap this stream with a VAD-based mask that zeros non-speech samples
-    /// but never drops or reorders audio.
     fn mask_with_vad(self) -> ContinuousVadMaskStream<Self>;
 }