-
Notifications
You must be signed in to change notification settings - Fork 23
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
3c4cb05
commit e5703f1
Showing
14 changed files
with
1,752 additions
and
0 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,182 @@ | ||
"""Contains the audio featurizer class.""" | ||
import numpy as np | ||
# https://github.com/jameslyons/python_speech_features | ||
from python_speech_features import mfcc | ||
from python_speech_features import delta | ||
|
||
|
||
class AudioFeaturizer(object):
    """Audio featurizer, for extracting features from audio contents of
    AudioSegment or SpeechSegment.

    Currently, it supports feature types of linear spectrogram and mfcc.

    :param specgram_type: Specgram feature type. Options: 'linear', 'mfcc'.
    :type specgram_type: str
    :param stride_ms: Striding size (in milliseconds) for generating frames.
    :type stride_ms: float
    :param window_ms: Window size (in milliseconds) for generating frames.
    :type window_ms: float
    :param max_freq: When specgram_type is 'linear', only FFT bins
                     corresponding to frequencies between [0, max_freq] are
                     returned; when specgram_type is 'mfcc', max_freq is the
                     highest band edge of mel filters.
    :type max_freq: None|float
    :param target_sample_rate: Audio are resampled (if upsampling or
                               downsampling is allowed) to this before
                               extracting spectrogram features.
    :type target_sample_rate: float
    :param use_dB_normalization: Whether to normalize the audio to a certain
                                 decibels before extracting the features.
    :type use_dB_normalization: bool
    :param target_dB: Target audio decibels for normalization.
    :type target_dB: float
    """

    def __init__(self,
                 specgram_type='linear',
                 stride_ms=10.0,
                 window_ms=20.0,
                 max_freq=None,
                 target_sample_rate=16000,
                 use_dB_normalization=True,
                 target_dB=-20):
        self._specgram_type = specgram_type
        self._stride_ms = stride_ms
        self._window_ms = window_ms
        self._max_freq = max_freq
        self._target_sample_rate = target_sample_rate
        self._use_dB_normalization = use_dB_normalization
        self._target_dB = target_dB

    def featurize(self,
                  audio_segment,
                  allow_downsampling=True,
                  allow_upsampling=True):
        """Extract audio features from AudioSegment or SpeechSegment.

        :param audio_segment: Audio/speech segment to extract features from.
        :type audio_segment: AudioSegment|SpeechSegment
        :param allow_downsampling: Whether to allow audio downsampling before
                                   featurizing.
        :type allow_downsampling: bool
        :param allow_upsampling: Whether to allow audio upsampling before
                                 featurizing.
        :type allow_upsampling: bool
        :return: Spectrogram audio feature in 2darray.
        :rtype: ndarray
        :raises ValueError: If audio sample rate is not supported.
        """
        # upsampling or downsampling to the target rate, when permitted
        if ((audio_segment.sample_rate > self._target_sample_rate and
             allow_downsampling) or
                (audio_segment.sample_rate < self._target_sample_rate and
                 allow_upsampling)):
            audio_segment.resample(self._target_sample_rate)
        if audio_segment.sample_rate != self._target_sample_rate:
            # Fix: message previously misspelled the flag as "allow up_sampling".
            raise ValueError("Audio sample rate is not supported. "
                             "Turn allow_downsampling or allow_upsampling on.")
        # decibel normalization
        if self._use_dB_normalization:
            audio_segment.normalize(target_db=self._target_dB)
        # extract spectrogram
        return self._compute_specgram(audio_segment.samples,
                                      audio_segment.sample_rate)

    def _compute_specgram(self, samples, sample_rate):
        """Dispatch to the feature extractor selected by self._specgram_type."""
        if self._specgram_type == 'linear':
            return self._compute_linear_specgram(
                samples, sample_rate, self._stride_ms, self._window_ms,
                self._max_freq)
        elif self._specgram_type == 'mfcc':
            return self._compute_mfcc(samples, sample_rate, self._stride_ms,
                                      self._window_ms, self._max_freq)
        else:
            # Fix: message previously listed only 'linear' although 'mfcc'
            # is also handled above.
            raise ValueError("Unknown specgram_type %s. "
                             "Supported values: linear, mfcc." %
                             self._specgram_type)

    def _compute_linear_specgram(self,
                                 samples,
                                 sample_rate,
                                 stride_ms=10.0,
                                 window_ms=20.0,
                                 max_freq=None,
                                 eps=1e-14):
        """Compute the linear spectrogram from FFT energy.

        :return: log power spectrogram, shape (num_freq_bins, num_frames).
        :raises ValueError: If max_freq exceeds Nyquist or stride > window.
        """
        if max_freq is None:
            max_freq = sample_rate / 2
        if max_freq > sample_rate / 2:
            raise ValueError("max_freq must not be greater than half of "
                             "sample rate.")
        if stride_ms > window_ms:
            raise ValueError("Stride size must not be greater than "
                             "window size.")
        stride_size = int(0.001 * sample_rate * stride_ms)
        window_size = int(0.001 * sample_rate * window_ms)
        specgram, freqs = self._specgram_real(
            samples,
            window_size=window_size,
            stride_size=stride_size,
            sample_rate=sample_rate)
        # keep only bins at or below max_freq; eps keeps log() finite
        ind = np.where(freqs <= max_freq)[0][-1] + 1
        return np.log(specgram[:ind, :] + eps)

    def _specgram_real(self, samples, window_size, stride_size, sample_rate):
        """Compute the spectrogram for samples from a real signal.

        :return: (power spectrogram of shape (window_size//2 + 1, num_frames),
                  frequency value of each FFT bin).
        """
        # extract strided windows; drop the tail that does not fill a stride
        truncate_size = (len(samples) - window_size) % stride_size
        samples = samples[:len(samples) - truncate_size]
        nshape = (window_size, (len(samples) - window_size) // stride_size + 1)
        nstrides = (samples.strides[0], samples.strides[0] * stride_size)
        windows = np.lib.stride_tricks.as_strided(
            samples, shape=nshape, strides=nstrides)
        # sanity check: the second column really is the second window
        assert np.all(
            windows[:, 1] == samples[stride_size:(stride_size + window_size)])
        # window weighting, squared Fast Fourier Transform (fft), scaling
        weighting = np.hanning(window_size)[:, None]
        fft = np.fft.rfft(windows * weighting, axis=0)
        fft = np.absolute(fft)
        fft = fft**2
        scale = np.sum(weighting**2) * sample_rate
        # interior bins get the factor 2 for the mirrored negative frequencies;
        # DC and Nyquist bins are not mirrored, so they are only rescaled
        fft[1:-1, :] *= (2.0 / scale)
        fft[(0, -1), :] /= scale
        # prepare fft frequency list
        freqs = float(sample_rate) / window_size * np.arange(fft.shape[0])
        return fft, freqs

    def _compute_mfcc(self,
                      samples,
                      sample_rate,
                      stride_ms=10.0,
                      window_ms=20.0,
                      max_freq=None):
        """Compute mfcc (plus deltas and delta-deltas) from samples.

        :return: features of shape (39, num_frames) — 13 mfcc, 13 delta,
                 13 delta-delta coefficients stacked along axis 0.
        :raises ValueError: If max_freq exceeds Nyquist or stride > window.
        """
        if max_freq is None:
            max_freq = sample_rate / 2
        if max_freq > sample_rate / 2:
            raise ValueError("max_freq must not be greater than half of "
                             "sample rate.")
        if stride_ms > window_ms:
            raise ValueError("Stride size must not be greater than "
                             "window size.")
        # compute the 13 cepstral coefficients, and the first one is replaced
        # by log(frame energy)
        mfcc_feat = mfcc(
            signal=samples,
            samplerate=sample_rate,
            winlen=0.001 * window_ms,
            winstep=0.001 * stride_ms,
            highfreq=max_freq)
        # Deltas
        d_mfcc_feat = delta(mfcc_feat, 2)
        # Deltas-Deltas
        dd_mfcc_feat = delta(d_mfcc_feat, 2)
        # transpose to (coefficients, frames)
        mfcc_feat = np.transpose(mfcc_feat)
        d_mfcc_feat = np.transpose(d_mfcc_feat)
        dd_mfcc_feat = np.transpose(dd_mfcc_feat)
        # concat above three features
        concat_mfcc_feat = np.concatenate(
            (mfcc_feat, d_mfcc_feat, dd_mfcc_feat))
        return concat_mfcc_feat
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
import pyaudio | ||
import wave | ||
|
||
class Recorder(object):
    '''Factory for WAV-file recordings captured from the default input device.

    Defaults to mono capture at 44.1 kHz; each call to open() yields an
    independent RecordingFile bound to these settings.
    '''

    def __init__(self, channels=1, rate=44100, frames_per_buffer=1024):
        self.channels, self.rate, self.frames_per_buffer = (
            channels, rate, frames_per_buffer)

    def open(self, fname, mode='wb'):
        '''Return a RecordingFile writing to *fname* with this recorder's settings.'''
        return RecordingFile(fname, mode, self.channels, self.rate,
                             self.frames_per_buffer)
|
||
class RecordingFile(object):
    '''A WAV file being filled from a PyAudio input stream.

    Supports blocking capture via record() and callback-driven capture via
    start_recording()/stop_recording(). Works as a context manager; close()
    releases the stream, the PyAudio handle, and the WAV file.
    '''

    def __init__(self, fname, mode, channels,
                 rate, frames_per_buffer):
        self.fname = fname
        self.mode = mode
        self.channels = channels
        self.rate = rate
        self.frames_per_buffer = frames_per_buffer
        self._pa = pyaudio.PyAudio()
        self.wavefile = self._prepare_file(self.fname, self.mode)
        self._stream = None

    def __enter__(self):
        return self

    def __exit__(self, exception, value, traceback):
        self.close()

    def record(self, duration):
        # Blocking mode: a callback-less stream is read synchronously for
        # roughly *duration* seconds, one buffer at a time.
        self._stream = self._pa.open(format=pyaudio.paInt16,
                                     channels=self.channels,
                                     rate=self.rate,
                                     input=True,
                                     frames_per_buffer=self.frames_per_buffer)
        num_reads = int(self.rate / self.frames_per_buffer * duration)
        for _ in range(num_reads):
            chunk = self._stream.read(self.frames_per_buffer)
            self.wavefile.writeframes(chunk)
        return None

    def start_recording(self):
        # Non-blocking mode: the stream pushes buffers into our callback.
        self._stream = self._pa.open(format=pyaudio.paInt16,
                                     channels=self.channels,
                                     rate=self.rate,
                                     input=True,
                                     frames_per_buffer=self.frames_per_buffer,
                                     stream_callback=self.get_callback())
        self._stream.start_stream()
        return self

    def stop_recording(self):
        self._stream.stop_stream()
        return self

    def get_callback(self):
        def callback(in_data, frame_count, time_info, status):
            # Append each incoming buffer straight to the WAV file and
            # keep the stream running.
            self.wavefile.writeframes(in_data)
            return in_data, pyaudio.paContinue
        return callback

    def close(self):
        self._stream.close()
        self._pa.terminate()
        self.wavefile.close()

    def _prepare_file(self, fname, mode='wb'):
        # Open the target WAV file and stamp its header parameters up front.
        handle = wave.open(fname, mode)
        handle.setnchannels(self.channels)
        handle.setsampwidth(self._pa.get_sample_size(pyaudio.paInt16))
        handle.setframerate(self.rate)
        return handle
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
import scipy.io.wavfile as wav | ||
import numpy as np | ||
import os | ||
import pydub | ||
import tempfile | ||
import scipy | ||
import random | ||
from python_speech_features import logfbank | ||
|
||
from lib.tools_math import * | ||
|
||
def changeRateTo16000(filepath):
    """Resample an audio file to 16 kHz.

    .wav files are overwritten in place; .m4a and .mp3 inputs are written
    out as a sibling .wav file. Anything else just prints a notice.
    """
    ext = filepath[-4:].lower()
    if ext == '.wav':
        resampled = pydub.AudioSegment.from_wav(filepath).set_frame_rate(16000)
        resampled.export(filepath, format="wav")
    elif ext == '.m4a':
        resampled = pydub.AudioSegment.from_file(filepath, "m4a").set_frame_rate(16000)
        resampled.export(filepath[:-3] + "wav", format="wav")
    elif ext == '.mp3':
        resampled = pydub.AudioSegment.from_mp3(filepath).set_frame_rate(16000)
        resampled.export(filepath[:-3] + "wav", format="wav")
    else:
        print("Unsupported Format.")
|
||
def read_wav(file_path):
    """Read a .wav file and return its (sample_rate, samples) pair."""
    assert file_path[-4:] == '.wav'
    sample_rate, samples = wav.read(file_path)
    return sample_rate, samples
|
||
def read_m4a(file_path):
    """Decode an .m4a file via a temporary WAV and return (rate, data).

    :param file_path: path ending in '.m4a'.
    :return: (sample_rate, samples) as produced by scipy.io.wavfile.read.
    """
    _, ext = os.path.splitext(file_path)
    assert ext == '.m4a'
    aac_version = pydub.AudioSegment.from_file(file_path, "m4a")
    fd, tmp_path = tempfile.mkstemp()
    # Fix: the original discarded the descriptor returned by mkstemp(),
    # leaking one open fd per call.
    os.close(fd)
    try:
        aac_version.export(tmp_path, format="wav")
        rate, data = scipy.io.wavfile.read(tmp_path)
    finally:
        # remove the temp file even if export/read raises
        os.remove(tmp_path)
    return rate, data
|
||
def read_mp3(file_path):
    """Decode an .mp3 file via a temporary WAV and return (rate, data).

    :param file_path: path ending in '.mp3'.
    :return: (sample_rate, samples) as produced by scipy.io.wavfile.read.
    """
    _, ext = os.path.splitext(file_path)
    assert ext == '.mp3'
    mp3 = pydub.AudioSegment.from_mp3(file_path)
    fd, tmp_path = tempfile.mkstemp()
    # Fix: the original discarded the descriptor returned by mkstemp(),
    # leaking one open fd per call.
    os.close(fd)
    try:
        mp3.export(tmp_path, format="wav")
        rate, data = scipy.io.wavfile.read(tmp_path)
    finally:
        # remove the temp file even if export/read raises
        os.remove(tmp_path)
    return rate, data
|
||
def mp3_to_wav(file_path, obj_path):
    """Convert an .mp3 file into a WAV file written at *obj_path*."""
    _, ext = os.path.splitext(file_path)
    assert ext == '.mp3'
    pydub.AudioSegment.from_mp3(file_path).export(obj_path, format="wav")
|
||
def mergeChannels(data):
    """Normalize *data* and downmix multi-channel audio to mono.

    1-D input is returned as-is (after normalization); 2-D input is averaged
    across channels. Any other rank is rejected.
    """
    data = normalize(data)
    rank = len(data.shape)
    if rank == 1:
        return data
    if rank == 2:
        return np.mean(data, axis = 1)
    raise ValueError("This is not what an audio file ought to be!")
|
||
def getDefaultSpectrogram(rate, data):
    """Compute a one-sided power-spectrum spectrogram with fixed defaults.

    :param rate: sampling rate of *data* in Hz.
    :param data: 1-D sample array.
    :return: Sxx, spectrogram of shape (nfft // 2 + 1, num_segments).
    """
    # Fix: the original referenced an unimported name `signal` and raised
    # NameError on every call; import the scipy submodule explicitly.
    from scipy import signal
    f, t, Sxx = signal.spectrogram(data, fs=rate, window='hamming',
                                   nperseg=400, noverlap=240, nfft=1024,
                                   scaling='spectrum', return_onesided=True)
    return Sxx
|
||
def frame_split(data, frame_width, frame_step):
    """Slice *data* into overlapping frames of *frame_width* samples,
    advancing *frame_step* samples per frame.

    :return: ndarray of shape (num_frames, frame_width).
    :raises ValueError: if data is shorter than one frame.
    """
    if len(data) < frame_width:
        raise ValueError("The length of data is shorter than the frame width.")
    num_frames = int(np.floor((len(data) - frame_width + frame_step) / float(frame_step)))
    frames = [data[offset:offset + frame_width]
              for offset in range(0, num_frames * frame_step, frame_step)]
    return np.array(frames)
|
||
def get_mel_db(wave_data, sr, winlen = 0.025, winstep = 0.01, nfft = 512, num_mel = 40, wav_process=False):
    """Compute a per-frame mean-normalized log mel filterbank.

    Input 10.015 sec of audio yields roughly (1000, 40).
    nfft must be >= the number of data points in one window (frame).

    :param wave_data: 1-D sample array.
    :param sr: sampling rate in Hz.
    :param wav_process: if True, trim leading/trailing silence first.
    :return: log filterbank energies, shape (num_frames, num_mel), with the
             per-frame mean (plus 1e-8) subtracted from each row.
    """
    if wav_process == True:
        # Fix: librosa was referenced but never imported at module level, so
        # wav_process=True always crashed with NameError. Importing lazily
        # here also means callers that leave wav_process=False do not need
        # librosa installed at all.
        import librosa
        frame_shift = int(sr * winstep)
        frame_size = int(sr * winlen)
        wave_data, index = librosa.effects.trim(wave_data, frame_length=frame_size, hop_length=frame_shift)
    mel_db = logfbank(wave_data, samplerate=sr, winlen=winlen, winstep=winstep,
                      nfilt=num_mel, nfft=nfft, lowfreq=0, highfreq=None, preemph=0.97)
    # subtract each frame's mean across the mel bands (axis=1), with a small
    # 1e-8 offset added to the mean
    mel_db -= (np.mean(mel_db,axis=1).reshape(-1,1)+1e-8)
    return mel_db
Oops, something went wrong.