Add C++ and Python API for silero VAD. #701

Merged · 4 commits · Jan 21, 2025
Changes from all commits
2 changes: 2 additions & 0 deletions .gitignore
@@ -25,3 +25,5 @@ sherpa-nemo-ctc*
*.pt
tokens.txt
*.bin
sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06
sherpa-whisper-tiny.en
129 changes: 129 additions & 0 deletions python-api-examples/sense-voice.py
@@ -0,0 +1,129 @@
#!/usr/bin/env python3
#
# Copyright (c) 2025 Xiaomi Corporation

"""
Please download a SenseVoice model from
https://github.com/k2-fsa/sherpa/releases/tag/asr-models

E.g.,
wget https://github.com/k2-fsa/sherpa/releases/download/asr-models/sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06.tar.bz2
"""
import time
from typing import Tuple

import librosa
import numpy as np
import sherpa
import soundfile as sf


def load_audio(filename: str) -> Tuple[np.ndarray, int]:
    data, sample_rate = sf.read(
        filename,
        always_2d=True,
        dtype="float32",
    )
    data = data[:, 0]  # use only the first channel
    samples = np.ascontiguousarray(data)
    return samples, sample_rate


def create_recognizer():
    config = sherpa.OfflineRecognizerConfig(
        model=sherpa.OfflineModelConfig(
            sense_voice=sherpa.OfflineSenseVoiceModelConfig(
                model="./sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06/model.pt",
                use_itn=True,
                language="auto",
            ),
            debug=False,
        ),
        tokens="./sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06/tokens.txt",
        use_gpu=False,
    )

    # You have to call config.validate() to make it work!
    config.validate()
    return sherpa.OfflineRecognizer(config)


def test_decoding_single_file(recognizer):
    print("----------Test a single file----------")
    test_wave_file = "./sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06/test_wavs/zh.wav"

    samples, sample_rate = load_audio(test_wave_file)
    if sample_rate != 16000:
        samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=16000)
        sample_rate = 16000

    start = time.time()

    stream = recognizer.create_stream()
    stream.accept_waveform(samples)
    recognizer.decode_stream(stream)
    text = stream.result.text

    end = time.time()

    elapsed_seconds = end - start
    audio_duration = len(samples) / sample_rate
    real_time_factor = elapsed_seconds / audio_duration

    print(text)
    print(f"Elapsed seconds: {elapsed_seconds:.3f}")
    print(f"Audio duration in seconds: {audio_duration:.3f}")
    print(f"RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}")


def test_decoding_multiple_files(recognizer):
    print("----------Test decoding multiple files----------")
    test_wave_file1 = "./sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06/test_wavs/zh.wav"
    test_wave_file2 = "./sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06/test_wavs/en.wav"

    samples1, sample_rate1 = load_audio(test_wave_file1)
    if sample_rate1 != 16000:
        samples1 = librosa.resample(samples1, orig_sr=sample_rate1, target_sr=16000)
        sample_rate1 = 16000

    samples2, sample_rate2 = load_audio(test_wave_file2)
    if sample_rate2 != 16000:
        samples2 = librosa.resample(samples2, orig_sr=sample_rate2, target_sr=16000)
        sample_rate2 = 16000

    start = time.time()
    stream1 = recognizer.create_stream()
    stream1.accept_waveform(samples1)

    stream2 = recognizer.create_stream()
    stream2.accept_waveform(samples2)

    recognizer.decode_streams([stream1, stream2])
    text1 = stream1.result.text
    text2 = stream2.result.text

    end = time.time()

    elapsed_seconds = end - start
    audio_duration = len(samples1) / sample_rate1 + len(samples2) / sample_rate2
    real_time_factor = elapsed_seconds / audio_duration

    print(f"{test_wave_file1}\n {text1}")
    print()
    print(f"{test_wave_file2}\n {text2}")

    print()

    print(f"Elapsed seconds: {elapsed_seconds:.3f}")
    print(f"Audio duration in seconds: {audio_duration:.3f}")
    print(f"RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}")


def main():
    recognizer = create_recognizer()
    test_decoding_single_file(recognizer)
    test_decoding_multiple_files(recognizer)


if __name__ == "__main__":
    main()
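
The load-then-resample sequence above is repeated in every test function. As a small refactoring sketch (the helper name load_audio_16k is hypothetical, not part of this PR), the two steps could be folded into one function:

def load_audio_16k(filename: str) -> np.ndarray:
    # Load the first channel and resample to 16 kHz if needed (sketch).
    samples, sample_rate = load_audio(filename)
    if sample_rate != 16000:
        samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=16000)
    return samples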
96 changes: 96 additions & 0 deletions python-api-examples/vad-with-sense-voice.py
@@ -0,0 +1,96 @@
#!/usr/bin/env python3

"""
Please download a SenseVoice model from
https://github.com/k2-fsa/sherpa/releases/tag/asr-models

E.g.,
wget https://github.com/k2-fsa/sherpa/releases/download/asr-models/sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06.tar.bz2


Please download VAD models from
https://github.com/k2-fsa/sherpa/releases/tag/vad-models

E.g.,
wget https://github.com/k2-fsa/sherpa/releases/download/vad-models/silero-vad-v4.pt
"""
from typing import Tuple

import librosa
import numpy as np
import sherpa
import soundfile as sf
import torch


def load_audio(filename: str) -> Tuple[np.ndarray, int]:
    data, sample_rate = sf.read(
        filename,
        always_2d=True,
        dtype="float32",
    )
    data = data[:, 0]  # use only the first channel
    samples = np.ascontiguousarray(data)
    return samples, sample_rate


def create_recognizer():
    config = sherpa.OfflineRecognizerConfig(
        model=sherpa.OfflineModelConfig(
            sense_voice=sherpa.OfflineSenseVoiceModelConfig(
                model="./sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06/model.pt",
                use_itn=True,
                language="auto",
            ),
            debug=False,
        ),
        tokens="./sherpa-sense-voice-zh-en-ja-ko-yue-2025-01-06/tokens.txt",
        use_gpu=False,
    )

    # You have to call config.validate() to make it work!
    config.validate()
    return sherpa.OfflineRecognizer(config)


def create_vad():
    config = sherpa.VoiceActivityDetectorConfig(
        segment_size=20,  # meaning assumed: seconds of audio processed per VAD batch
        model=sherpa.VadModelConfig(
            silero_vad=sherpa.SileroVadModelConfig(
                model="./silero-vad-v4.pt",
                threshold=0.5,  # speech probability threshold; higher is stricter
                min_speech_duration=0.25,  # seconds; shorter segments are dropped
                min_silence_duration=0.5,  # seconds of silence needed to end a segment
            ),
            sample_rate=16000,
        ),
    )
    return sherpa.VoiceActivityDetector(config)


def main():
    vad = create_vad()
    recognizer = create_recognizer()

    test_wave_file = "./lei-jun-test.wav"

    samples, sample_rate = load_audio(test_wave_file)
    if sample_rate != 16000:
        samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=16000)
        sample_rate = 16000

    segments = vad.process(torch.from_numpy(samples))
    for s in segments:
        start_sample = int(s.start * sample_rate)
        end_sample = int(s.end * sample_rate)
        stream = recognizer.create_stream()
        stream.accept_waveform(samples[start_sample:end_sample])
        recognizer.decode_stream(stream)
        text = stream.result.text

        print(f"{s.start:.3f} -- {s.end:.3f} {text}")


if __name__ == "__main__":
    main()
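
Since OfflineRecognizer also exposes decode_streams (used in sense-voice.py above), the per-segment loop in main() could decode all VAD segments in one batched call. A hypothetical variant of that loop:

# Hypothetical batched variant of the loop in main().
streams = []
for s in segments:
    stream = recognizer.create_stream()
    stream.accept_waveform(samples[int(s.start * sample_rate) : int(s.end * sample_rate)])
    streams.append(stream)

recognizer.decode_streams(streams)  # batch decoding, as in sense-voice.py

for s, stream in zip(segments, streams):
    print(f"{s.start:.3f} -- {s.end:.3f} {stream.result.text}")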
127 changes: 127 additions & 0 deletions python-api-examples/whisper.py
@@ -0,0 +1,127 @@
#!/usr/bin/env python3
#
# Copyright (c) 2025 Xiaomi Corporation

"""
Please download a Whisper model from
https://github.com/k2-fsa/sherpa/releases/tag/asr-models

E.g.,
wget https://github.com/k2-fsa/sherpa/releases/download/asr-models/sherpa-whisper-tiny.en.tar.bz2
"""
import time
from typing import Tuple

import librosa
import numpy as np
import sherpa
import soundfile as sf


def load_audio(filename: str) -> Tuple[np.ndarray, int]:
    data, sample_rate = sf.read(
        filename,
        always_2d=True,
        dtype="float32",
    )
    data = data[:, 0]  # use only the first channel
    samples = np.ascontiguousarray(data)
    return samples, sample_rate


def create_recognizer():
    config = sherpa.OfflineRecognizerConfig(
        model=sherpa.OfflineModelConfig(
            whisper=sherpa.OfflineWhisperModelConfig(
                model="./sherpa-whisper-tiny.en/model.pt",
            ),
            debug=False,
        ),
        tokens="./sherpa-whisper-tiny.en/tokens.txt",
        use_gpu=False,
    )

    # You have to call config.validate() to make it work!
    config.validate()
    return sherpa.OfflineRecognizer(config)


def test_decoding_single_file(recognizer):
    print("----------Test a single file----------")
    test_wave_file = "./sherpa-whisper-tiny.en/test_wavs/0.wav"

    samples, sample_rate = load_audio(test_wave_file)
    if sample_rate != 16000:
        samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=16000)
        sample_rate = 16000

    start = time.time()

    stream = recognizer.create_stream()
    stream.accept_waveform(samples)
    recognizer.decode_stream(stream)
    text = stream.result.text

    end = time.time()

    elapsed_seconds = end - start
    audio_duration = len(samples) / sample_rate
    real_time_factor = elapsed_seconds / audio_duration

    print(text)
    print(f"Elapsed seconds: {elapsed_seconds:.3f}")
    print(f"Audio duration in seconds: {audio_duration:.3f}")
    print(f"RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}")


def test_decoding_multiple_files(recognizer):
    print("----------Test decoding multiple files----------")
    test_wave_file1 = "./sherpa-whisper-tiny.en/test_wavs/0.wav"
    test_wave_file2 = "./sherpa-whisper-tiny.en/test_wavs/1.wav"

    samples1, sample_rate1 = load_audio(test_wave_file1)
    if sample_rate1 != 16000:
        samples1 = librosa.resample(samples1, orig_sr=sample_rate1, target_sr=16000)
        sample_rate1 = 16000

    samples2, sample_rate2 = load_audio(test_wave_file2)
    if sample_rate2 != 16000:
        samples2 = librosa.resample(samples2, orig_sr=sample_rate2, target_sr=16000)
        sample_rate2 = 16000

    start = time.time()
    stream1 = recognizer.create_stream()
    stream1.accept_waveform(samples1)

    stream2 = recognizer.create_stream()
    stream2.accept_waveform(samples2)

    recognizer.decode_streams([stream1, stream2])
    text1 = stream1.result.text
    text2 = stream2.result.text

    end = time.time()

    elapsed_seconds = end - start
    audio_duration = len(samples1) / sample_rate1 + len(samples2) / sample_rate2
    real_time_factor = elapsed_seconds / audio_duration

    print(f"{test_wave_file1}\n {text1}")
    print()
    print(f"{test_wave_file2}\n {text2}")

    print()

    print(f"Elapsed seconds: {elapsed_seconds:.3f}")
    print(f"Audio duration in seconds: {audio_duration:.3f}")
    print(f"RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}")


def main():
    recognizer = create_recognizer()
    test_decoding_single_file(recognizer)
    test_decoding_multiple_files(recognizer)


if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions scripts/silero-vad/export-v4.py
@@ -7,6 +7,7 @@
def main():
    m = torch.jit.load("./silero_vad_v4.jit")
    meta_data = {
        "model_type": "silero_vad",
        "version": "4",
    }
    m.save("silero-vad-v4.pt", _extra_files=meta_data)
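
The new "model_type" entry can be read back when loading the exported model, via the _extra_files argument of torch.jit.load (standard PyTorch API). A minimal sketch:

import torch

# Keys select which extra files to read; values are filled in by torch.jit.load.
meta = {"model_type": "", "version": ""}
torch.jit.load("silero-vad-v4.pt", _extra_files=meta)
print(meta["model_type"], meta["version"])  # expected: silero_vad 4 (as bytes on recent PyTorch)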