Add C++ and Python API for silero VAD. (#701)
csukuangfj authored Jan 21, 2025
1 parent 43d38c2 commit f46678e
Showing 33 changed files with 1,320 additions and 16 deletions.
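For orientation, here is a minimal sketch of the new Python silero VAD API, condensed from the python-api-examples/vad-with-sense-voice.py example added by this commit; it runs voice activity detection only, without a recognizer. The input path ./some-16k-mono.wav is a placeholder, and the file is assumed to be mono and already sampled at 16 kHz.

import sherpa
import soundfile as sf
import torch

# Mirrors the create_vad() helper in python-api-examples/vad-with-sense-voice.py.
config = sherpa.VoiceActivityDetectorConfig(
    segment_size=20,
    model=sherpa.VadModelConfig(
        silero_vad=sherpa.SileroVadModelConfig(
            model="./silero-vad-v4.pt",
            threshold=0.5,
            min_speech_duration=0.25,
            min_silence_duration=0.5,
        ),
        sample_rate=16000,
    ),
)
vad = sherpa.VoiceActivityDetector(config)

# Placeholder input; assumed mono and 16 kHz, so no resampling is needed.
samples, _ = sf.read("./some-16k-mono.wav", dtype="float32")
segments = vad.process(torch.from_numpy(samples))
for s in segments:
    print(f"speech: {s.start:.3f}s -- {s.end:.3f}s")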
2 changes: 2 additions & 0 deletions .gitignore
@@ -25,3 +25,5 @@ sherpa-nemo-ctc*
*.pt
tokens.txt
*.bin
sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06
sherpa-whisper-tiny.en
129 changes: 129 additions & 0 deletions python-api-examples/sense-voice.py
@@ -0,0 +1,129 @@
#!/usr/bin/env python3
#
# Copyright (c) 2025 Xiaomi Corporation

"""
Please download a SenseVoice model from
https://github.com/k2-fsa/sherpa/releases/tag/asr-models
E.g.,
wget https://github.com/k2-fsa/sherpa/releases/download/asr-models/sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06.tar.bz2
"""
import time
from typing import Tuple

import librosa
import numpy as np
import sherpa
import soundfile as sf


def load_audio(filename: str) -> Tuple[np.ndarray, int]:
data, sample_rate = sf.read(
filename,
always_2d=True,
dtype="float32",
)
data = data[:, 0] # use only the first channel
samples = np.ascontiguousarray(data)
return samples, sample_rate


def create_recognizer():
config = sherpa.OfflineRecognizerConfig(
model=sherpa.OfflineModelConfig(
sense_voice=sherpa.OfflineSenseVoiceModelConfig(
model="./sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06/model.pt",
use_itn=True,
language="auto",
),
debug=False,
),
tokens="./sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06/tokens.txt",
use_gpu=False,
)

# You have to call config.validate() to make it work!
config.validate()
return sherpa.OfflineRecognizer(config)


def test_decoding_single_file(recognizer):
print("----------Test a single file----------")
test_wave_file = "./sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06/test_wavs/zh.wav"

samples, sample_rate = load_audio(test_wave_file)
if sample_rate != 16000:
samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=16000)
sample_rate = 16000

start = time.time()

stream = recognizer.create_stream()
stream.accept_waveform(samples)
recognizer.decode_stream(stream)
text = stream.result.text

end = time.time()

elapsed_seconds = end - start
audio_duration = len(samples) / sample_rate
real_time_factor = elapsed_seconds / audio_duration

print(text)
print(f"Elapsed seconds: {elapsed_seconds:.3f}")
print(f"Audio duration in seconds: {audio_duration:.3f}")
print(f"RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}")


def test_decoding_multiple_files(recognizer):
print("----------Test decoding multiple files----------")
test_wave_file1 = "./sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06/test_wavs/zh.wav"
test_wave_file2 = "./sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06/test_wavs/en.wav"

samples1, sample_rate1 = load_audio(test_wave_file1)
if sample_rate1 != 16000:
samples1 = librosa.resample(samples1, orig_sr=sample_rate1, target_sr=16000)
sample_rate1 = 16000

samples2, sample_rate2 = load_audio(test_wave_file2)
if sample_rate2 != 16000:
samples2 = librosa.resample(samples2, orig_sr=sample_rate2, target_sr=16000)
sample_rate2 = 16000

start = time.time()
stream1 = recognizer.create_stream()
stream1.accept_waveform(samples1)

stream2 = recognizer.create_stream()
stream2.accept_waveform(samples2)

recognizer.decode_streams([stream1, stream2])
text1 = stream1.result.text
text2 = stream2.result.text

end = time.time()

elapsed_seconds = end - start
audio_duration = len(samples1) / sample_rate1 + len(samples2) / sample_rate2
real_time_factor = elapsed_seconds / audio_duration

print(f"{test_wave_file1}\n {text1}")
print()
print(f"{test_wave_file2}\n {text2}")

print()

print(f"Elapsed seconds: {elapsed_seconds:.3f}")
print(f"Audio duration in seconds: {audio_duration:.3f}")
print(f"RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}")


def main():
recognizer = create_recognizer()
test_decoding_single_file(recognizer)
test_decoding_multiple_files(recognizer)


if __name__ == "__main__":
main()
96 changes: 96 additions & 0 deletions python-api-examples/vad-with-sense-voice.py
@@ -0,0 +1,96 @@
#!/usr/bin/env python3

"""
Please download a SenseVoice model from
https://github.com/k2-fsa/sherpa/releases/tag/asr-models
E.g.,
wget https://github.com/k2-fsa/sherpa/releases/download/asr-models/sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06.tar.bz2
Please download a VAD model from
https://github.com/k2-fsa/sherpa/releases/tag/vad-models
E.g.,
wget https://github.com/k2-fsa/sherpa/releases/download/vad-models/silero-vad-v4.pt
"""
from typing import Tuple

import librosa
import numpy as np
import sherpa
import soundfile as sf
import torch


def load_audio(filename: str) -> Tuple[np.ndarray, int]:
data, sample_rate = sf.read(
filename,
always_2d=True,
dtype="float32",
)
data = data[:, 0] # use only the first channel
samples = np.ascontiguousarray(data)
return samples, sample_rate


def create_recognizer():
config = sherpa.OfflineRecognizerConfig(
model=sherpa.OfflineModelConfig(
sense_voice=sherpa.OfflineSenseVoiceModelConfig(
model="./sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06/model.pt",
use_itn=True,
language="auto",
),
debug=False,
),
tokens="./sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06/tokens.txt",
use_gpu=False,
)

# You have to call config.validate() to make it work!
config.validate()
return sherpa.OfflineRecognizer(config)


def create_vad():
config = sherpa.VoiceActivityDetectorConfig(
segment_size=20,
model=sherpa.VadModelConfig(
silero_vad=sherpa.SileroVadModelConfig(
model="./silero-vad-v4.pt",
threshold=0.5,
min_speech_duration=0.25,
min_silence_duration=0.5,
),
sample_rate=16000,
),
)
return sherpa.VoiceActivityDetector(config)


def main():
vad = create_vad()
recognizer = create_recognizer()

test_wave_file = "./lei-jun-test.wav"

samples, sample_rate = load_audio(test_wave_file)
if sample_rate != 16000:
samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=16000)
sample_rate = 16000

segments = vad.process(torch.from_numpy(samples))
for s in segments:
start_sample = int(s.start * sample_rate)
end_sample = int(s.end * sample_rate)
stream = recognizer.create_stream()
stream.accept_waveform(samples[start_sample:end_sample])
recognizer.decode_stream(stream)
text = stream.result.text

print(f"{s.start:.3f} -- {s.end:.3f} {text}")


if __name__ == "__main__":
main()
127 changes: 127 additions & 0 deletions python-api-examples/whisper.py
@@ -0,0 +1,127 @@
#!/usr/bin/env python3
#
# Copyright (c) 2025 Xiaomi Corporation

"""
Please download a Whisper model from
https://github.com/k2-fsa/sherpa/releases/tag/asr-models
E.g.,
wget https://github.com/k2-fsa/sherpa/releases/download/asr-models/sherpa-whisper-tiny.en.tar.bz2
"""
import time
from typing import Tuple

import librosa
import numpy as np
import sherpa
import soundfile as sf


def load_audio(filename: str) -> Tuple[np.ndarray, int]:
data, sample_rate = sf.read(
filename,
always_2d=True,
dtype="float32",
)
data = data[:, 0] # use only the first channel
samples = np.ascontiguousarray(data)
return samples, sample_rate


def create_recognizer():
config = sherpa.OfflineRecognizerConfig(
model=sherpa.OfflineModelConfig(
whisper=sherpa.OfflineWhisperModelConfig(
model="./sherpa-whisper-tiny.en/model.pt",
),
debug=False,
),
tokens="./sherpa-whisper-tiny.en/tokens.txt",
use_gpu=False,
)

# You have to call config.validate() to make it work!
config.validate()
return sherpa.OfflineRecognizer(config)


def test_decoding_single_file(recognizer):
print("----------Test a single file----------")
test_wave_file = "./sherpa-whisper-tiny.en/test_wavs/0.wav"

samples, sample_rate = load_audio(test_wave_file)
if sample_rate != 16000:
samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=16000)
sample_rate = 16000

start = time.time()

stream = recognizer.create_stream()
stream.accept_waveform(samples)
recognizer.decode_stream(stream)
text = stream.result.text

end = time.time()

elapsed_seconds = end - start
audio_duration = len(samples) / sample_rate
real_time_factor = elapsed_seconds / audio_duration

print(text)
print(f"Elapsed seconds: {elapsed_seconds:.3f}")
print(f"Audio duration in seconds: {audio_duration:.3f}")
print(f"RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}")


def test_decoding_multiple_files(recognizer):
print("----------Test decoding multiple files----------")
test_wave_file1 = "./sherpa-whisper-tiny.en/test_wavs/0.wav"
test_wave_file2 = "./sherpa-whisper-tiny.en/test_wavs/1.wav"

samples1, sample_rate1 = load_audio(test_wave_file1)
if sample_rate1 != 16000:
samples1 = librosa.resample(samples1, orig_sr=sample_rate1, target_sr=16000)
sample_rate1 = 16000

samples2, sample_rate2 = load_audio(test_wave_file2)
if sample_rate2 != 16000:
samples2 = librosa.resample(samples2, orig_sr=sample_rate2, target_sr=16000)
sample_rate2 = 16000

start = time.time()
stream1 = recognizer.create_stream()
stream1.accept_waveform(samples1)

stream2 = recognizer.create_stream()
stream2.accept_waveform(samples2)

recognizer.decode_streams([stream1, stream2])
text1 = stream1.result.text
text2 = stream2.result.text

end = time.time()

elapsed_seconds = end - start
audio_duration = len(samples1) / sample_rate1 + len(samples2) / sample_rate2
real_time_factor = elapsed_seconds / audio_duration

print(f"{test_wave_file1}\n {text1}")
print()
print(f"{test_wave_file2}\n {text2}")

print()

print(f"Elapsed seconds: {elapsed_seconds:.3f}")
print(f"Audio duration in seconds: {audio_duration:.3f}")
print(f"RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}")


def main():
recognizer = create_recognizer()
test_decoding_single_file(recognizer)
test_decoding_multiple_files(recognizer)


if __name__ == "__main__":
main()
1 change: 1 addition & 0 deletions scripts/silero-vad/export-v4.py
@@ -7,6 +7,7 @@
def main():
m = torch.jit.load("./silero_vad_v4.jit")
meta_data = {
"model_type": "silero_vad",
"version": "4",
}
m.save("silero-vad-v4.pt", _extra_files=meta_data)