Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
chenmingxiang110 authored Mar 11, 2019
1 parent 3c4cb05 commit e5703f1
Show file tree
Hide file tree
Showing 14 changed files with 1,752 additions and 0 deletions.
678 changes: 678 additions & 0 deletions lib/contrib/audio.py

Large diffs are not rendered by default.

182 changes: 182 additions & 0 deletions lib/contrib/audio_featurizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
"""Contains the audio featurizer class."""
import numpy as np
# https://github.com/jameslyons/python_speech_features
from python_speech_features import mfcc
from python_speech_features import delta


class AudioFeaturizer(object):
    """Audio featurizer, for extracting features from audio contents of
    AudioSegment or SpeechSegment.

    Currently, it supports feature types of linear spectrogram and mfcc.

    :param specgram_type: Specgram feature type. Options: 'linear', 'mfcc'.
    :type specgram_type: str
    :param stride_ms: Striding size (in milliseconds) for generating frames.
    :type stride_ms: float
    :param window_ms: Window size (in milliseconds) for generating frames.
    :type window_ms: float
    :param max_freq: When specgram_type is 'linear', only FFT bins
                     corresponding to frequencies between [0, max_freq] are
                     returned; when specgram_type is 'mfcc', max_freq is the
                     highest band edge of mel filters.
    :types max_freq: None|float
    :param target_sample_rate: Audio are resampled (if upsampling or
                               downsampling is allowed) to this before
                               extracting spectrogram features.
    :type target_sample_rate: float
    :param use_dB_normalization: Whether to normalize the audio to a certain
                                 decibels before extracting the features.
    :type use_dB_normalization: bool
    :param target_dB: Target audio decibels for normalization.
    :type target_dB: float
    """

    def __init__(self,
                 specgram_type='linear',
                 stride_ms=10.0,
                 window_ms=20.0,
                 max_freq=None,
                 target_sample_rate=16000,
                 use_dB_normalization=True,
                 target_dB=-20):
        self._specgram_type = specgram_type
        self._stride_ms = stride_ms
        self._window_ms = window_ms
        self._max_freq = max_freq
        self._target_sample_rate = target_sample_rate
        self._use_dB_normalization = use_dB_normalization
        self._target_dB = target_dB

    def featurize(self,
                  audio_segment,
                  allow_downsampling=True,
                  allow_upsampling=True):
        """Extract audio features from AudioSegment or SpeechSegment.

        :param audio_segment: Audio/speech segment to extract features from.
        :type audio_segment: AudioSegment|SpeechSegment
        :param allow_downsampling: Whether to allow audio downsampling before
                                   featurizing.
        :type allow_downsampling: bool
        :param allow_upsampling: Whether to allow audio upsampling before
                                 featurizing.
        :type allow_upsampling: bool
        :return: Spectrogram audio feature in 2darray.
        :rtype: ndarray
        :raises ValueError: If audio sample rate is not supported.
        """
        # Resample in place when the segment's rate differs from the target
        # and the corresponding direction is allowed.
        if ((audio_segment.sample_rate > self._target_sample_rate and
             allow_downsampling) or
            (audio_segment.sample_rate < self._target_sample_rate and
             allow_upsampling)):
            audio_segment.resample(self._target_sample_rate)
        if audio_segment.sample_rate != self._target_sample_rate:
            # Fixed message typo: the flag is named allow_upsampling.
            raise ValueError("Audio sample rate is not supported. "
                             "Turn allow_downsampling or allow_upsampling on.")
        # decibel normalization
        if self._use_dB_normalization:
            audio_segment.normalize(target_db=self._target_dB)
        # extract spectrogram
        return self._compute_specgram(audio_segment.samples,
                                      audio_segment.sample_rate)

    def _compute_specgram(self, samples, sample_rate):
        """Dispatch to the configured feature extractor.

        :raises ValueError: If self._specgram_type is not supported.
        """
        if self._specgram_type == 'linear':
            return self._compute_linear_specgram(
                samples, sample_rate, self._stride_ms, self._window_ms,
                self._max_freq)
        elif self._specgram_type == 'mfcc':
            return self._compute_mfcc(samples, sample_rate, self._stride_ms,
                                      self._window_ms, self._max_freq)
        else:
            # Fixed message: 'mfcc' is also a supported value (handled above).
            raise ValueError("Unknown specgram_type %s. "
                             "Supported values: linear, mfcc."
                             % self._specgram_type)

    def _compute_linear_specgram(self,
                                 samples,
                                 sample_rate,
                                 stride_ms=10.0,
                                 window_ms=20.0,
                                 max_freq=None,
                                 eps=1e-14):
        """Compute the linear spectrogram from FFT energy.

        :return: log power spectrogram of shape (num_freq_bins, num_frames),
                 keeping only bins with frequency <= max_freq.
        :raises ValueError: If max_freq exceeds the Nyquist frequency or the
                            stride is larger than the window.
        """
        if max_freq is None:
            max_freq = sample_rate / 2
        if max_freq > sample_rate / 2:
            raise ValueError("max_freq must not be greater than half of "
                             "sample rate.")
        if stride_ms > window_ms:
            raise ValueError("Stride size must not be greater than "
                             "window size.")
        stride_size = int(0.001 * sample_rate * stride_ms)
        window_size = int(0.001 * sample_rate * window_ms)
        specgram, freqs = self._specgram_real(
            samples,
            window_size=window_size,
            stride_size=stride_size,
            sample_rate=sample_rate)
        # Index of the first bin above max_freq; keep everything below it.
        ind = np.where(freqs <= max_freq)[0][-1] + 1
        # eps avoids log(0) for silent frames.
        return np.log(specgram[:ind, :] + eps)

    def _specgram_real(self, samples, window_size, stride_size, sample_rate):
        """Compute the spectrogram for samples from a real signal.

        :return: (power spectrogram of shape (window_size//2 + 1, num_frames),
                  array of bin center frequencies in Hz).
        """
        # Drop trailing samples that do not fill a full stride, then build a
        # (window_size, num_windows) strided view without copying.
        truncate_size = (len(samples) - window_size) % stride_size
        samples = samples[:len(samples) - truncate_size]
        nshape = (window_size, (len(samples) - window_size) // stride_size + 1)
        nstrides = (samples.strides[0], samples.strides[0] * stride_size)
        windows = np.lib.stride_tricks.as_strided(
            samples, shape=nshape, strides=nstrides)
        # Sanity check that the strided view lines up with direct slicing.
        assert np.all(
            windows[:, 1] == samples[stride_size:(stride_size + window_size)])
        # Hann window weighting, squared FFT magnitude, power scaling.
        weighting = np.hanning(window_size)[:, None]
        fft = np.fft.rfft(windows * weighting, axis=0)
        fft = np.absolute(fft)
        fft = fft**2
        scale = np.sum(weighting**2) * sample_rate
        # Interior bins are doubled to account for the discarded negative
        # frequencies; DC and Nyquist bins are not.
        fft[1:-1, :] *= (2.0 / scale)
        fft[(0, -1), :] /= scale
        # prepare fft frequency list
        freqs = float(sample_rate) / window_size * np.arange(fft.shape[0])
        return fft, freqs

    def _compute_mfcc(self,
                      samples,
                      sample_rate,
                      stride_ms=10.0,
                      window_ms=20.0,
                      max_freq=None):
        """Compute mfcc (plus delta and delta-delta) features from samples.

        :return: ndarray of shape (39, num_frames): 13 cepstral coefficients
                 stacked with their deltas and delta-deltas.
        :raises ValueError: If max_freq exceeds the Nyquist frequency or the
                            stride is larger than the window.
        """
        if max_freq is None:
            max_freq = sample_rate / 2
        if max_freq > sample_rate / 2:
            raise ValueError("max_freq must not be greater than half of "
                             "sample rate.")
        if stride_ms > window_ms:
            raise ValueError("Stride size must not be greater than "
                             "window size.")
        # compute the 13 cepstral coefficients, and the first one is replaced
        # by log(frame energy)
        mfcc_feat = mfcc(
            signal=samples,
            samplerate=sample_rate,
            winlen=0.001 * window_ms,
            winstep=0.001 * stride_ms,
            highfreq=max_freq)
        # Deltas
        d_mfcc_feat = delta(mfcc_feat, 2)
        # Deltas-Deltas
        dd_mfcc_feat = delta(d_mfcc_feat, 2)
        # transpose to (coefficients, frames)
        mfcc_feat = np.transpose(mfcc_feat)
        d_mfcc_feat = np.transpose(d_mfcc_feat)
        dd_mfcc_feat = np.transpose(dd_mfcc_feat)
        # concat above three features
        concat_mfcc_feat = np.concatenate(
            (mfcc_feat, d_mfcc_feat, dd_mfcc_feat))
        return concat_mfcc_feat
Binary file added lib/pinyinDictNoTone.pickle
Binary file not shown.
Binary file added lib/pinyinDictNoToneInv.pickle
Binary file not shown.
80 changes: 80 additions & 0 deletions lib/recorder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import pyaudio
import wave

class Recorder(object):
    """Factory for WAV-file recordings.

    Holds the capture parameters (mono, 44.1 kHz by default) and hands
    them to a RecordingFile when a target file is opened.
    """

    def __init__(self, channels=1, rate=44100, frames_per_buffer=1024):
        self.channels = channels
        self.rate = rate
        self.frames_per_buffer = frames_per_buffer

    def open(self, fname, mode='wb'):
        """Open *fname* for writing and return a RecordingFile handle."""
        return RecordingFile(fname, mode, self.channels, self.rate,
                             self.frames_per_buffer)

class RecordingFile(object):
    """A WAV file being recorded from a PyAudio input stream.

    Supports blocking capture via :meth:`record` or non-blocking capture
    via :meth:`start_recording`/:meth:`stop_recording`, and may be used as
    a context manager so the file is always closed.
    """

    def __init__(self, fname, mode, channels,
                 rate, frames_per_buffer):
        self.fname = fname
        self.mode = mode
        self.channels = channels
        self.rate = rate
        self.frames_per_buffer = frames_per_buffer
        self._pa = pyaudio.PyAudio()
        self.wavefile = self._prepare_file(self.fname, self.mode)
        # No stream until record()/start_recording() is called.
        self._stream = None

    def __enter__(self):
        return self

    def __exit__(self, exception, value, traceback):
        self.close()

    def record(self, duration):
        """Record *duration* seconds synchronously (blocking mode)."""
        # Use a stream with no callback function in blocking mode
        self._stream = self._pa.open(format=pyaudio.paInt16,
                                     channels=self.channels,
                                     rate=self.rate,
                                     input=True,
                                     frames_per_buffer=self.frames_per_buffer)
        for _ in range(int(self.rate / self.frames_per_buffer * duration)):
            audio = self._stream.read(self.frames_per_buffer)
            self.wavefile.writeframes(audio)
        return None

    def start_recording(self):
        """Start capturing in the background; returns self for chaining."""
        # Use a stream with a callback in non-blocking mode
        self._stream = self._pa.open(format=pyaudio.paInt16,
                                     channels=self.channels,
                                     rate=self.rate,
                                     input=True,
                                     frames_per_buffer=self.frames_per_buffer,
                                     stream_callback=self.get_callback())
        self._stream.start_stream()
        return self

    def stop_recording(self):
        """Pause the background capture; returns self for chaining."""
        self._stream.stop_stream()
        return self

    def get_callback(self):
        """Return a PyAudio callback that appends each buffer to the file."""
        def callback(in_data, frame_count, time_info, status):
            self.wavefile.writeframes(in_data)
            return in_data, pyaudio.paContinue
        return callback

    def close(self):
        """Release the stream (if any), PyAudio, and the WAV file.

        Bug fix: _stream is None until record()/start_recording() runs, so
        closing an unused recording (e.g. ``with recorder.open(f): pass``)
        previously raised AttributeError.
        """
        if self._stream is not None:
            self._stream.close()
        self._pa.terminate()
        self.wavefile.close()

    def _prepare_file(self, fname, mode='wb'):
        """Open *fname* as a WAV file configured with our capture params."""
        wavefile = wave.open(fname, mode)
        wavefile.setnchannels(self.channels)
        wavefile.setsampwidth(self._pa.get_sample_size(pyaudio.paInt16))
        wavefile.setframerate(self.rate)
        return wavefile
92 changes: 92 additions & 0 deletions lib/tools_audio.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import scipy.io.wavfile as wav
import numpy as np
import os
import pydub
import tempfile
import scipy
import random
from python_speech_features import logfbank

from lib.tools_math import *

def changeRateTo16000(filepath):
    """Resample an audio file to 16 kHz using pydub.

    .wav files are overwritten in place; .m4a and .mp3 inputs produce a
    sibling file with a .wav extension. Any other extension is rejected
    with a printed message.
    """
    ext = filepath[-4:].lower()
    if ext == '.wav':
        audio = pydub.AudioSegment.from_wav(filepath).set_frame_rate(16000)
        audio.export(filepath, format="wav")
    elif ext == '.m4a':
        audio = pydub.AudioSegment.from_file(filepath, "m4a").set_frame_rate(16000)
        audio.export(filepath[:-3] + "wav", format="wav")
    elif ext == '.mp3':
        audio = pydub.AudioSegment.from_mp3(filepath).set_frame_rate(16000)
        audio.export(filepath[:-3] + "wav", format="wav")
    else:
        print("Unsupported Format.")

def read_wav(file_path):
    """Read a .wav file and return (sample_rate, samples) via scipy."""
    assert file_path.endswith('.wav')
    return wav.read(file_path)

def read_m4a(file_path):
    """Decode an .m4a file and return (sample_rate, data).

    The audio is exported to a temporary WAV file via pydub, read back
    with scipy, and the temporary file is always removed.

    :raises AssertionError: if the path does not end in .m4a.
    """
    _, ext = os.path.splitext(file_path)
    assert ext == '.m4a'
    aac_version = pydub.AudioSegment.from_file(file_path, "m4a")
    # Bug fix: mkstemp returns an OPEN file descriptor. The original code
    # discarded it, leaking one fd per call (and making os.remove fail on
    # Windows). Close it before exporting, and clean up even on error.
    fd, path = tempfile.mkstemp()
    os.close(fd)
    try:
        aac_version.export(path, format="wav")
        rate, data = scipy.io.wavfile.read(path)
    finally:
        os.remove(path)
    return rate, data

def read_mp3(file_path):
    """Decode an .mp3 file and return (sample_rate, data).

    The audio is exported to a temporary WAV file via pydub, read back
    with scipy, and the temporary file is always removed.

    :raises AssertionError: if the path does not end in .mp3.
    """
    _, ext = os.path.splitext(file_path)
    assert ext == '.mp3'
    mp3 = pydub.AudioSegment.from_mp3(file_path)
    # Bug fix: mkstemp returns an OPEN file descriptor. The original code
    # discarded it, leaking one fd per call (and making os.remove fail on
    # Windows). Close it before exporting, and clean up even on error.
    fd, path = tempfile.mkstemp()
    os.close(fd)
    try:
        mp3.export(path, format="wav")
        rate, data = scipy.io.wavfile.read(path)
    finally:
        os.remove(path)
    return rate, data

def mp3_to_wav(file_path, obj_path):
    """Convert an .mp3 file to a WAV file at *obj_path* via pydub."""
    _, ext = os.path.splitext(file_path)
    assert ext == '.mp3'
    pydub.AudioSegment.from_mp3(file_path).export(obj_path, format="wav")

def mergeChannels(data):
    """Normalize audio and collapse it to a single channel.

    1-D input is returned directly after normalization; 2-D input is
    averaged across channels (axis 1). Any other rank is rejected.
    """
    data = normalize(data)  # normalize comes from lib.tools_math
    ndim = len(data.shape)
    if ndim == 1:
        return data
    if ndim == 2:
        return np.mean(data, axis=1)
    raise ValueError("This is not what an audio file ought to be!")

def getDefaultSpectrogram(rate, data):
    """Return the power spectrogram of *data* with fixed default settings.

    Uses a Hamming window, nperseg=400, noverlap=240, nfft=1024,
    one-sided 'spectrum' scaling; the frequency and time axes returned by
    scipy are discarded.
    """
    # Bug fix: this module never imported ``signal`` (only scipy and
    # scipy.io.wavfile), so the original raised NameError on every call.
    from scipy import signal
    f, t, Sxx = signal.spectrogram(data, fs=rate, window='hamming',
                                   nperseg=400, noverlap=240, nfft=1024,
                                   scaling='spectrum', return_onesided=True)
    return Sxx

def frame_split(data, frame_width, frame_step):
    """Slice *data* into (possibly overlapping) fixed-width frames.

    :param data: sequence of samples, at least frame_width long.
    :param frame_width: number of samples per frame.
    :param frame_step: hop size between consecutive frame starts.
    :return: 2-D ndarray of shape (n_frames, frame_width).
    :raises ValueError: if data is shorter than one frame.
    """
    n = len(data)
    if n < frame_width:
        raise ValueError("The length of data is shorter than the frame width.")
    n_frames = int(np.floor((n - frame_width + frame_step) / float(frame_step)))
    frames = [data[k * frame_step: k * frame_step + frame_width]
              for k in range(n_frames)]
    return np.array(frames)

def get_mel_db(wave_data, sr, winlen = 0.025, winstep = 0.01, nfft = 512, num_mel = 40, wav_process=False):
    """Compute a per-frame mean-subtracted log-mel filterbank matrix.

    :param wave_data: 1-D array of audio samples.
    :param sr: sample rate in Hz.
    :param winlen: analysis window length in seconds.
    :param winstep: hop between consecutive windows in seconds.
    :param nfft: FFT size; must be >= the number of samples per window.
    :param num_mel: number of mel filterbank channels.
    :param wav_process: if True, trim leading/trailing silence first.
    :return: ndarray of shape (num_frames, num_mel).
    """
    # Input 10.015 sec, output, (1000, 40)
    # nfft >= num data points in one window(frame)
    if wav_process == True:
        frame_shift = int(sr * winstep)
        frame_size = int(sr * winlen)
        # NOTE(review): ``librosa`` is never imported in this module, so this
        # branch raises NameError whenever wav_process=True — either add the
        # import or remove the option. Confirm before relying on it.
        wave_data, index = librosa.effects.trim(wave_data, frame_length=frame_size, hop_length=frame_shift)
    mel_db = logfbank(wave_data, samplerate=sr, winlen=winlen, winstep=winstep,
        nfilt=num_mel, nfft=nfft, lowfreq=0, highfreq=None, preemph=0.97)
    # Subtract each frame's mean across mel channels (CMN-style); the small
    # 1e-8 offset nudges values away from an exactly zero mean.
    mel_db -= (np.mean(mel_db,axis=1).reshape(-1,1)+1e-8)
    return mel_db
Loading

0 comments on commit e5703f1

Please sign in to comment.