Add C++ and Python API for silero VAD. (#701)
Commit f46678e (1 parent: 43d38c2). Showing 33 changed files with 1,320 additions and 16 deletions.
.gitignore
@@ -25,3 +25,5 @@ sherpa-nemo-ctc*
 *.pt
 tokens.txt
 *.bin
+sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06
+sherpa-whisper-tiny.en
@@ -0,0 +1,129 @@
#!/usr/bin/env python3
#
# Copyright (c) 2025 Xiaomi Corporation

"""
Please download a SenseVoice model from
https://github.com/k2-fsa/sherpa/releases/tag/asr-models
E.g.,
wget https://github.com/k2-fsa/sherpa/releases/download/asr-models/sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06.tar.bz2
"""
import time
from typing import Tuple

import librosa
import numpy as np
import sherpa
import soundfile as sf


def load_audio(filename: str) -> Tuple[np.ndarray, int]:
    data, sample_rate = sf.read(
        filename,
        always_2d=True,
        dtype="float32",
    )
    data = data[:, 0]  # use only the first channel
    samples = np.ascontiguousarray(data)
    return samples, sample_rate


def create_recognizer():
    config = sherpa.OfflineRecognizerConfig(
        model=sherpa.OfflineModelConfig(
            sense_voice=sherpa.OfflineSenseVoiceModelConfig(
                model="./sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06/model.pt",
                use_itn=True,
                language="auto",
            ),
            debug=False,
        ),
        tokens="./sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06/tokens.txt",
        use_gpu=False,
    )

    # You have to call config.validate() to make it work!
    config.validate()
    return sherpa.OfflineRecognizer(config)


def test_decoding_single_file(recognizer):
    print("----------Test a single file----------")
    test_wave_file = "./sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06/test_wavs/zh.wav"

    samples, sample_rate = load_audio(test_wave_file)
    if sample_rate != 16000:
        samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=16000)
        sample_rate = 16000

    start = time.time()

    stream = recognizer.create_stream()
    stream.accept_waveform(samples)
    recognizer.decode_stream(stream)
    text = stream.result.text

    end = time.time()

    elapsed_seconds = end - start
    audio_duration = len(samples) / sample_rate
    real_time_factor = elapsed_seconds / audio_duration

    print(text)
    print(f"Elapsed seconds: {elapsed_seconds:.3f}")
    print(f"Audio duration in seconds: {audio_duration:.3f}")
    print(f"RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}")


def test_decoding_multiple_files(recognizer):
    print("----------Test decoding multiple files----------")
    test_wave_file1 = "./sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06/test_wavs/zh.wav"
    test_wave_file2 = "./sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06/test_wavs/en.wav"

    samples1, sample_rate1 = load_audio(test_wave_file1)
    if sample_rate1 != 16000:
        samples1 = librosa.resample(samples1, orig_sr=sample_rate1, target_sr=16000)
        sample_rate1 = 16000

    samples2, sample_rate2 = load_audio(test_wave_file2)
    if sample_rate2 != 16000:
        samples2 = librosa.resample(samples2, orig_sr=sample_rate2, target_sr=16000)
        sample_rate2 = 16000

    start = time.time()
    stream1 = recognizer.create_stream()
    stream1.accept_waveform(samples1)

    stream2 = recognizer.create_stream()
    stream2.accept_waveform(samples2)

    # Decode both streams in a single batch.
    recognizer.decode_streams([stream1, stream2])
    text1 = stream1.result.text
    text2 = stream2.result.text

    end = time.time()

    elapsed_seconds = end - start
    audio_duration = len(samples1) / sample_rate1 + len(samples2) / sample_rate2
    real_time_factor = elapsed_seconds / audio_duration

    print(f"{test_wave_file1}\n {text1}")
    print()
    print(f"{test_wave_file2}\n {text2}")

    print()

    print(f"Elapsed seconds: {elapsed_seconds:.3f}")
    print(f"Audio duration in seconds: {audio_duration:.3f}")
    print(f"RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}")


def main():
    recognizer = create_recognizer()
    test_decoding_single_file(recognizer)
    test_decoding_multiple_files(recognizer)


if __name__ == "__main__":
    main()
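The load-then-resample dance above is repeated in every test function in this commit. As a minimal sketch (not part of the commit; load_audio_16k is a hypothetical name), the pattern could be consolidated:

def load_audio_16k(filename: str) -> np.ndarray:
    # Hypothetical helper: read the first channel and resample to 16 kHz,
    # the rate all the examples in this commit assume.
    samples, sample_rate = load_audio(filename)
    if sample_rate != 16000:
        samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=16000)
    return samples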
@@ -0,0 +1,96 @@
#!/usr/bin/env python3

"""
Please download a SenseVoice model from
https://github.com/k2-fsa/sherpa/releases/tag/asr-models
E.g.,
wget https://github.com/k2-fsa/sherpa/releases/download/asr-models/sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06.tar.bz2
Please download VAD models from
https://github.com/k2-fsa/sherpa/releases/tag/vad-models
E.g.,
wget https://github.com/k2-fsa/sherpa/releases/download/vad-models/silero-vad-v4.pt
"""
from typing import Tuple

import librosa
import numpy as np
import sherpa
import soundfile as sf
import torch


def load_audio(filename: str) -> Tuple[np.ndarray, int]:
    data, sample_rate = sf.read(
        filename,
        always_2d=True,
        dtype="float32",
    )
    data = data[:, 0]  # use only the first channel
    samples = np.ascontiguousarray(data)
    return samples, sample_rate


def create_recognizer():
    config = sherpa.OfflineRecognizerConfig(
        model=sherpa.OfflineModelConfig(
            sense_voice=sherpa.OfflineSenseVoiceModelConfig(
                model="./sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06/model.pt",
                use_itn=True,
                language="auto",
            ),
            debug=False,
        ),
        tokens="./sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06/tokens.txt",
        use_gpu=False,
    )

    # You have to call config.validate() to make it work!
    config.validate()
    return sherpa.OfflineRecognizer(config)


def create_vad():
    config = sherpa.VoiceActivityDetectorConfig(
        segment_size=20,
        model=sherpa.VadModelConfig(
            silero_vad=sherpa.SileroVadModelConfig(
                model="./silero-vad-v4.pt",
                threshold=0.5,
                min_speech_duration=0.25,
                min_silence_duration=0.5,
            ),
            sample_rate=16000,
        ),
    )
    return sherpa.VoiceActivityDetector(config)


def main():
    vad = create_vad()
    recognizer = create_recognizer()

    test_wave_file = "./lei-jun-test.wav"

    samples, sample_rate = load_audio(test_wave_file)
    if sample_rate != 16000:
        samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=16000)
        sample_rate = 16000

    # Run VAD once, then decode each detected speech segment separately.
    segments = vad.process(torch.from_numpy(samples))
    for s in segments:
        start_sample = int(s.start * sample_rate)
        end_sample = int(s.end * sample_rate)
        stream = recognizer.create_stream()
        stream.accept_waveform(samples[start_sample:end_sample])
        recognizer.decode_stream(stream)
        text = stream.result.text

        print(f"{s.start:.3f} -- {s.end:.3f} {text}")


if __name__ == "__main__":
    main()
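The loop above issues one decode_stream() call per VAD segment. Since the recognizer also exposes decode_streams() (used by the batch examples in this commit), the segments could plausibly be decoded in a single batch instead. A sketch under that assumption; decode_segments_batched is a hypothetical helper, not part of the commit:

def decode_segments_batched(vad, recognizer, samples, sample_rate=16000):
    # Sketch: run VAD once, then decode all speech segments with one
    # decode_streams() call instead of one decode_stream() call each.
    segments = vad.process(torch.from_numpy(samples))
    streams = []
    for s in segments:
        start_sample = int(s.start * sample_rate)
        end_sample = int(s.end * sample_rate)
        stream = recognizer.create_stream()
        stream.accept_waveform(samples[start_sample:end_sample])
        streams.append(stream)
    if streams:
        recognizer.decode_streams(streams)
    return [(s.start, s.end, st.result.text) for s, st in zip(segments, streams)]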
@@ -0,0 +1,127 @@
#!/usr/bin/env python3
#
# Copyright (c) 2025 Xiaomi Corporation

"""
Please download a Whisper model from
https://github.com/k2-fsa/sherpa/releases/tag/asr-models
E.g.,
wget https://github.com/k2-fsa/sherpa/releases/download/asr-models/sherpa-whisper-tiny.en.tar.bz2
"""
import time
from typing import Tuple

import librosa
import numpy as np
import sherpa
import soundfile as sf


def load_audio(filename: str) -> Tuple[np.ndarray, int]:
    data, sample_rate = sf.read(
        filename,
        always_2d=True,
        dtype="float32",
    )
    data = data[:, 0]  # use only the first channel
    samples = np.ascontiguousarray(data)
    return samples, sample_rate


def create_recognizer():
    config = sherpa.OfflineRecognizerConfig(
        model=sherpa.OfflineModelConfig(
            whisper=sherpa.OfflineWhisperModelConfig(
                model="./sherpa-whisper-tiny.en/model.pt",
            ),
            debug=False,
        ),
        tokens="./sherpa-whisper-tiny.en/tokens.txt",
        use_gpu=False,
    )

    # You have to call config.validate() to make it work!
    config.validate()
    return sherpa.OfflineRecognizer(config)


def test_decoding_single_file(recognizer):
    print("----------Test a single file----------")
    test_wave_file = "./sherpa-whisper-tiny.en/test_wavs/0.wav"

    samples, sample_rate = load_audio(test_wave_file)
    if sample_rate != 16000:
        samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=16000)
        sample_rate = 16000

    start = time.time()

    stream = recognizer.create_stream()
    stream.accept_waveform(samples)
    recognizer.decode_stream(stream)
    text = stream.result.text

    end = time.time()

    elapsed_seconds = end - start
    audio_duration = len(samples) / sample_rate
    real_time_factor = elapsed_seconds / audio_duration

    print(text)
    print(f"Elapsed seconds: {elapsed_seconds:.3f}")
    print(f"Audio duration in seconds: {audio_duration:.3f}")
    print(f"RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}")


def test_decoding_multiple_files(recognizer):
    print("----------Test decoding multiple files----------")
    test_wave_file1 = "./sherpa-whisper-tiny.en/test_wavs/0.wav"
    test_wave_file2 = "./sherpa-whisper-tiny.en/test_wavs/1.wav"

    samples1, sample_rate1 = load_audio(test_wave_file1)
    if sample_rate1 != 16000:
        samples1 = librosa.resample(samples1, orig_sr=sample_rate1, target_sr=16000)
        sample_rate1 = 16000

    samples2, sample_rate2 = load_audio(test_wave_file2)
    if sample_rate2 != 16000:
        samples2 = librosa.resample(samples2, orig_sr=sample_rate2, target_sr=16000)
        sample_rate2 = 16000

    start = time.time()
    stream1 = recognizer.create_stream()
    stream1.accept_waveform(samples1)

    stream2 = recognizer.create_stream()
    stream2.accept_waveform(samples2)

    # Decode both streams in a single batch.
    recognizer.decode_streams([stream1, stream2])
    text1 = stream1.result.text
    text2 = stream2.result.text

    end = time.time()

    elapsed_seconds = end - start
    audio_duration = len(samples1) / sample_rate1 + len(samples2) / sample_rate2
    real_time_factor = elapsed_seconds / audio_duration

    print(f"{test_wave_file1}\n {text1}")
    print()
    print(f"{test_wave_file2}\n {text2}")

    print()

    print(f"Elapsed seconds: {elapsed_seconds:.3f}")
    print(f"Audio duration in seconds: {audio_duration:.3f}")
    print(f"RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}")


def main():
    recognizer = create_recognizer()
    test_decoding_single_file(recognizer)
    test_decoding_multiple_files(recognizer)


if __name__ == "__main__":
    main()
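The two-stream batch in test_decoding_multiple_files() generalizes naturally to any number of files. A hedged sketch built only from the calls shown above; decode_files is a hypothetical helper, not part of the commit:

def decode_files(recognizer, wave_files):
    # Hypothetical helper: batch-decode any number of files with one
    # decode_streams() call, returning (file, text) pairs.
    streams = []
    for f in wave_files:
        samples, sample_rate = load_audio(f)
        if sample_rate != 16000:
            samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=16000)
        stream = recognizer.create_stream()
        stream.accept_waveform(samples)
        streams.append(stream)
    recognizer.decode_streams(streams)
    return [(f, s.result.text) for f, s in zip(wave_files, streams)]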