Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Silero VAD Model #17

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
tflite-runtime-nightly
onnxruntime>=1.10.0,<2
wyoming==1.5.3
7 changes: 7 additions & 0 deletions wyoming_openwakeword/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,12 @@ async def main() -> None:
default=1,
help="Number of activations before detection (default: 1)",
)
parser.add_argument(
"--vad-threshold",
type=float,
default=0.0,
help="Voice Activation Detection threshold (0.0-1.0, default: 0.0)",
)
#
parser.add_argument("--output-dir", help="Path to save audio and detections")
#
Expand Down Expand Up @@ -99,6 +105,7 @@ async def main() -> None:
args.preload_model,
threshold=args.threshold,
trigger_level=args.trigger_level,
vad_threshold=args.vad_threshold,
)

# audio -> mels
Expand Down
3 changes: 3 additions & 0 deletions wyoming_openwakeword/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import numpy as np
from wyoming.server import AsyncEventHandler

from .vad import VAD

_AUTOFILL_SECONDS: Final = 3
_MAX_SECONDS: Final = 10

Expand Down Expand Up @@ -68,6 +70,7 @@ class ClientData:
)
wake_words: Dict[str, WakeWordData] = field(default_factory=dict)
wake_word_names: Optional[Set[str]] = None
vad: VAD = VAD()

def reset(self) -> None:
self.audio.fill(0)
Expand Down
6 changes: 5 additions & 1 deletion wyoming_openwakeword/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ async def handle_event(self, event: Event) -> bool:
detect.names,
threshold=self.cli_args.threshold,
trigger_level=self.cli_args.trigger_level,
vad_threshold=self.cli_args.vad_threshold,
)

# Only process audio with these wake word models
Expand Down Expand Up @@ -145,6 +146,8 @@ async def handle_event(self, event: Event) -> bool:
# Wait until no new embeddings still need to be processed
await asyncio.sleep(0.1)

self.data.vad.reset_states()

if not any(
ww_data.is_detected for ww_data in self.data.wake_words.values()
):
Expand Down Expand Up @@ -211,7 +214,7 @@ def _get_info(self) -> Info:
# -----------------------------------------------------------------------------


def ensure_loaded(state: State, names: List[str], threshold: float, trigger_level: int):
def ensure_loaded(state: State, names: List[str], threshold: float, trigger_level: int, vad_threshold: float):
"""Ensure wake words are loaded by name."""
with state.clients_lock, state.ww_threads_lock:
for model_name in names:
Expand Down Expand Up @@ -251,6 +254,7 @@ def ensure_loaded(state: State, names: List[str], threshold: float, trigger_leve
model_key,
model_path,
asyncio.get_running_loop(),
vad_threshold,
),
)
state.ww_threads[model_key].start()
Expand Down
Binary file added wyoming_openwakeword/models/silero_vad.onnx
Binary file not shown.
15 changes: 11 additions & 4 deletions wyoming_openwakeword/openwakeword.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,11 +71,13 @@ def mels_proc(state: State):
todo_timestamps: List[int] = []
for i, client_id in enumerate(todo_ids):
client = state.clients[client_id]
audio_tensor[i, :] = client.audio[
audio = client.audio[
-client.new_audio_samples : len(client.audio)
- client.new_audio_samples
+ MEL_SAMPLES
]
audio_tensor[i, :] = audio
client.vad(audio)
client.new_audio_samples = max(
0, client.new_audio_samples - SAMPLES_PER_CHUNK
)
Expand Down Expand Up @@ -241,6 +243,7 @@ def ww_proc(
ww_model_key: str,
ww_model_path: str,
loop: asyncio.AbstractEventLoop,
vad_threshold: float,
):
"""Transform embedding features to wake word probabilities (without batching)."""
try:
Expand All @@ -251,7 +254,6 @@ def ww_proc(
ww_windows = ww_input_shape[1]
ww_input_index = ww_input["index"]
ww_output_index = ww_model.get_output_details()[0]["index"]

# ww = [batch x window x features (96)] => [batch x probability]

client: Optional[ClientData] = None
Expand Down Expand Up @@ -322,12 +324,16 @@ def ww_proc(
# Client disconnected
continue

vad_frames = list(client.vad.prediction_buffer)[-7:-4]
vad_max_score = np.max(vad_frames) if len(vad_frames) > 0 else 0
voice_detected = (vad_threshold <= 0.0 or vad_max_score >= vad_threshold)
if state.debug_probability:
_LOGGER.debug(
"client=%s, wake_word=%s, probability=%s",
"client=%s, wake_word=%s, probability=%s, vad_probability=%s",
client_id,
ww_model_key,
probability.item(),
vad_max_score,
)

prob_file: Optional[TextIO] = None
Expand All @@ -346,7 +352,8 @@ def ww_proc(
)

client_data = client.wake_words[ww_model_key]
if probability.item() >= client_data.threshold:

if probability.item() >= client_data.threshold and voice_detected:
# Increase activation
client_data.activations += 1

Expand Down
129 changes: 129 additions & 0 deletions wyoming_openwakeword/vad.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
# Copyright 2022 David Scripka. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#######################
# Silero VAD License
#######################

# MIT License

# Copyright (c) 2020-present Silero Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

########################################

# This file contains the implementation of a class for voice activity detection (VAD),
# based on the pre-trained model from Silero (https://github.com/snakers4/silero-vad).
# It can be used with the openWakeWord library, or independently.

# Imports
import onnxruntime as ort
import numpy as np
import os
from collections import deque


class VAD():
    """Voice activity detection (VAD) wrapper around Silero's pre-trained model.

    https://github.com/snakers4/silero-vad
    """

    def __init__(
        self,
        model_path: str = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            "models",
            "silero_vad.onnx"
        ),
        n_threads: int = 1
    ):
        """Load the Silero VAD ONNX model and initialize its state.

        Args:
            model_path (str): Path to the Silero VAD ONNX model file.
            n_threads (int): Number of threads ONNX Runtime may use.
        """
        # Configure an ONNX Runtime session restricted to the CPU provider.
        opts = ort.SessionOptions()
        opts.inter_op_num_threads = n_threads
        opts.intra_op_num_threads = n_threads
        self.model = ort.InferenceSession(
            model_path,
            sess_options=opts,
            providers=["CPUExecutionProvider"],
        )

        # Rolling buffer of per-call VAD scores (125 entries; presumably
        # ~10 seconds of audio at the caller's chunk rate — TODO confirm).
        self.prediction_buffer: deque = deque(maxlen=125)

        # Silero expects the sample rate as an int64 scalar tensor.
        self.sample_rate = np.array(16000).astype(np.int64)

        # Begin with a clean recurrent state.
        self.reset_states()

    def reset_states(self, batch_size=1):
        """Zero the model's recurrent (h, c) state and cached metadata.

        Args:
            batch_size (int): Number of parallel audio streams the state
                should accommodate (default: 1).
        """
        self._h = np.zeros((2, batch_size, 64), dtype='float32')
        self._c = np.zeros((2, batch_size, 64), dtype='float32')
        self._last_sr = 0
        self._last_batch_size = 0

    def predict(self, x, frame_size=480):
        """Return the mean VAD score for the input audio.

        Args:
            x (np.ndarray): Input audio; must be 16 kHz, 16-bit PCM.
                Audio longer than one frame is split into chunks of
                `frame_size` samples and each chunk is scored; the input
                length should be an integer multiple of `frame_size`.
            frame_size (int): Frame size in samples. The recommended
                default is 480 samples (30 ms @ 16 kHz); smaller or
                larger values also work, though accuracy may decrease.

        Returns
            float: The average predicted score across all frames.
        """
        scores = []
        offset = 0
        total = x.shape[0]
        while offset < total:
            # Scale int16 PCM into [-1, 1] floats as the model expects.
            frame = (x[offset:offset + frame_size] / 32767).astype(np.float32)
            out, self._h, self._c = self.model.run(
                None,
                {
                    'input': frame[None, :],
                    'h': self._h,
                    'c': self._c,
                    'sr': self.sample_rate,
                },
            )
            scores.append(out[0][0])
            offset += frame_size

        return np.mean(scores)

    def __call__(self, x, frame_size=160*4):
        """Score one audio chunk and append the result to the prediction buffer."""
        self.prediction_buffer.append(self.predict(x, frame_size))