diff --git a/lib/contrib/audio.py b/lib/contrib/audio.py new file mode 100644 index 0000000..97c9268 --- /dev/null +++ b/lib/contrib/audio.py @@ -0,0 +1,678 @@ +"""Contains the audio segment class.""" +import numpy as np +import io +import struct +import re +import soundfile +import resampy +from scipy import signal +import random +import copy + + +class AudioSegment(object): + """Monaural audio segment abstraction. + + :param samples: Audio samples [num_samples x num_channels]. + :type samples: ndarray.float32 + :param sample_rate: Audio sample rate. + :type sample_rate: int + :raises TypeError: If the sample data type is not float or int. + """ + + def __init__(self, samples, sample_rate): + """Create audio segment from samples. + + Samples are convert float32 internally, with int scaled to [-1, 1]. + """ + self._samples = self._convert_samples_to_float32(samples) + self._sample_rate = sample_rate + if self._samples.ndim >= 2: + self._samples = np.mean(self._samples, 1) + + def __eq__(self, other): + """Return whether two objects are equal.""" + if type(other) is not type(self): + return False + if self._sample_rate != other._sample_rate: + return False + if self._samples.shape != other._samples.shape: + return False + if np.any(self.samples != other._samples): + return False + return True + + def __ne__(self, other): + """Return whether two objects are unequal.""" + return not self.__eq__(other) + + def __str__(self): + """Return human-readable representation of segment.""" + return ("%s: num_samples=%d, sample_rate=%d, duration=%.2fsec, " + "rms=%.2fdB" % (type(self), self.num_samples, self.sample_rate, + self.duration, self.rms_db)) + + @classmethod + def from_file(cls, file): + """Create audio segment from audio file. + + :param filepath: Filepath or file object to audio file. + :type filepath: basestring|file + :return: Audio segment instance. + :rtype: AudioSegment + """ + samples, sample_rate = soundfile.read(file, dtype='float32') + return cls(samples, sample_rate) + + @classmethod + def slice_from_file(cls, file, start=None, end=None): + """Loads a small section of an audio without having to load + the entire file into the memory which can be incredibly wasteful. + + :param file: Input audio filepath or file object. + :type file: basestring|file + :param start: Start time in seconds. If start is negative, it wraps + around from the end. If not provided, this function + reads from the very beginning. + :type start: float + :param end: End time in seconds. If end is negative, it wraps around + from the end. If not provided, the default behvaior is + to read to the end of the file. + :type end: float + :return: AudioSegment instance of the specified slice of the input + audio file. + :rtype: AudioSegment + :raise ValueError: If start or end is incorrectly set, e.g. out of + bounds in time. + """ + sndfile = soundfile.SoundFile(file) + sample_rate = sndfile.samplerate + duration = float(len(sndfile)) / sample_rate + start = 0. if start is None else start + end = 0. if end is None else end + if start < 0.0: + start += duration + if end < 0.0: + end += duration + if start < 0.0: + raise ValueError("The slice start position (%f s) is out of " + "bounds." % start) + if end < 0.0: + raise ValueError("The slice end position (%f s) is out of bounds." % + end) + if start > end: + raise ValueError("The slice start position (%f s) is later than " + "the slice end position (%f s)." 
% (start, end)) + if end > duration: + raise ValueError("The slice end position (%f s) is out of bounds " + "(> %f s)" % (end, duration)) + start_frame = int(start * sample_rate) + end_frame = int(end * sample_rate) + sndfile.seek(start_frame) + data = sndfile.read(frames=end_frame - start_frame, dtype='float32') + return cls(data, sample_rate) + + @classmethod + def from_sequence_file(cls, filepath): + """Create audio segment from sequence file. Sequence file is a binary + file containing a collection of multiple audio files, with several + header bytes in the head indicating the offsets of each audio byte data + chunk. + + The format is: + + 4 bytes (int, version), + 4 bytes (int, num of utterance), + 4 bytes (int, bytes per header), + [bytes_per_header*(num_utterance+1)] bytes (offsets for each audio), + audio_bytes_data_of_1st_utterance, + audio_bytes_data_of_2nd_utterance, + ...... + + Sequence file name must end with ".seqbin". And the filename of the 5th + utterance's audio file in sequence file "xxx.seqbin" must be + "xxx.seqbin_5", with "5" indicating the utterance index within this + sequence file (starting from 1). + + :param filepath: Filepath of sequence file. + :type filepath: basestring + :return: Audio segment instance. + :rtype: AudioSegment + """ + # parse filepath + matches = re.match(r"(.+\.seqbin)_(\d+)", filepath) + if matches is None: + raise IOError("File type of %s is not supported" % filepath) + filename = matches.group(1) + fileno = int(matches.group(2)) + + # read headers + f = open(filename, 'rb') + version = f.read(4) + num_utterances = struct.unpack("i", f.read(4))[0] + bytes_per_header = struct.unpack("i", f.read(4))[0] + header_bytes = f.read(bytes_per_header * (num_utterances + 1)) + header = [ + struct.unpack("i", header_bytes[bytes_per_header * i: + bytes_per_header * (i + 1)])[0] + for i in range(num_utterances + 1) + ] + + # read audio bytes + f.seek(header[fileno - 1]) + audio_bytes = f.read(header[fileno] - header[fileno - 1]) + f.close() + + # create audio segment + try: + return cls.from_bytes(audio_bytes) + except Exception as e: + samples = np.frombuffer(audio_bytes, dtype='int16') + return cls(samples=samples, sample_rate=8000) + + @classmethod + def from_bytes(cls, bytes): + """Create audio segment from a byte string containing audio samples. + + :param bytes: Byte string containing audio samples. + :type bytes: str + :return: Audio segment instance. + :rtype: AudioSegment + """ + samples, sample_rate = soundfile.read( + io.BytesIO(bytes), dtype='float32') + return cls(samples, sample_rate) + + @classmethod + def concatenate(cls, *segments): + """Concatenate an arbitrary number of audio segments together. + + :param *segments: Input audio segments to be concatenated. + :type *segments: tuple of AudioSegment + :return: Audio segment instance as concatenating results. + :rtype: AudioSegment + :raises ValueError: If the number of segments is zero, or if the + sample_rate of any segments does not match. + :raises TypeError: If any segment is not AudioSegment instance. + """ + # Perform basic sanity-checks. 
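# --- Illustrative sketch (not part of the patch): writing a minimal
# ".seqbin" container matching the layout documented in
# from_sequence_file() above: version, utterance count, header width,
# an absolute offset table with num_utterance+1 entries, then the raw
# audio bytes. File name and payloads here are made-up stand-ins.
import struct

payloads = [b'\x00\x01' * 100, b'\x02\x03' * 50]   # stand-in audio bytes
bytes_per_header = 4   # must be 4: offsets are unpacked with struct "i"
first_offset = 12 + bytes_per_header * (len(payloads) + 1)
offsets = [first_offset]
for p in payloads:
    offsets.append(offsets[-1] + len(p))           # absolute offsets

with open('xxx.seqbin', 'wb') as f:
    f.write(struct.pack('i', 1))                   # version
    f.write(struct.pack('i', len(payloads)))       # num utterances
    f.write(struct.pack('i', bytes_per_header))    # header width
    for offset in offsets:                         # offset table
        f.write(struct.pack('i', offset))
    for p in payloads:
        f.write(p)

# AudioSegment.from_sequence_file('xxx.seqbin_1') would then locate the
# first payload via header[0]:header[1].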
+ if len(segments) == 0: + raise ValueError("No audio segments are given to concatenate.") + sample_rate = segments[0]._sample_rate + for seg in segments: + if sample_rate != seg._sample_rate: + raise ValueError("Can't concatenate segments with " + "different sample rates") + if type(seg) is not cls: + raise TypeError("Only audio segments of the same type " + "can be concatenated.") + samples = np.concatenate([seg.samples for seg in segments]) + return cls(samples, sample_rate) + + @classmethod + def make_silence(cls, duration, sample_rate): + """Creates a silent audio segment of the given duration and sample rate. + + :param duration: Length of silence in seconds. + :type duration: float + :param sample_rate: Sample rate. + :type sample_rate: float + :return: Silent AudioSegment instance of the given duration. + :rtype: AudioSegment + """ + samples = np.zeros(int(duration * sample_rate)) + return cls(samples, sample_rate) + + def to_wav_file(self, filepath, dtype='float32'): + """Save audio segment to disk as wav file. + + :param filepath: WAV filepath or file object to save the + audio segment. + :type filepath: basestring|file + :param dtype: Subtype for audio file. Options: 'int16', 'int32', + 'float32', 'float64'. Default is 'float32'. + :type dtype: str + :raises TypeError: If dtype is not supported. + """ + samples = self._convert_samples_from_float32(self._samples, dtype) + subtype_map = { + 'int16': 'PCM_16', + 'int32': 'PCM_32', + 'float32': 'FLOAT', + 'float64': 'DOUBLE' + } + soundfile.write( + filepath, + samples, + self._sample_rate, + format='WAV', + subtype=subtype_map[dtype]) + + def superimpose(self, other): + """Add samples from another segment to those of this segment + (sample-wise addition, not segment concatenation). + + Note that this is an in-place transformation. + + :param other: Segment containing samples to be added in. + :type other: AudioSegments + :raise TypeError: If type of two segments don't match. + :raise ValueError: If the sample rates of the two segments are not + equal, or if the lengths of segments don't match. + """ + if isinstance(other, type(self)): + raise TypeError("Cannot add segments of different types: %s " + "and %s." % (type(self), type(other))) + if self._sample_rate != other._sample_rate: + raise ValueError("Sample rates must match to add segments.") + if len(self._samples) != len(other._samples): + raise ValueError("Segment lengths must match to add segments.") + self._samples += other._samples + + def to_bytes(self, dtype='float32'): + """Create a byte string containing the audio content. + + :param dtype: Data type for export samples. Options: 'int16', 'int32', + 'float32', 'float64'. Default is 'float32'. + :type dtype: str + :return: Byte string containing audio content. + :rtype: str + """ + samples = self._convert_samples_from_float32(self._samples, dtype) + return samples.tostring() + + def gain_db(self, gain): + """Apply gain in decibels to samples. + + Note that this is an in-place transformation. + + :param gain: Gain in decibels to apply to samples. + :type gain: float|1darray + """ + self._samples *= 10.**(gain / 20.) + + def change_speed(self, speed_rate): + """Change the audio speed by linear interpolation. + + Note that this is an in-place transformation. + + :param speed_rate: Rate of speed change: + speed_rate > 1.0, speed up the audio; + speed_rate = 1.0, unchanged; + speed_rate < 1.0, slow down the audio; + speed_rate <= 0.0, not allowed, raise ValueError. 
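# --- Illustrative sketch (not part of the patch): the np.interp()
# resampling that change_speed() performs, applied to a toy ramp.
# Values are made up for the demo.
import numpy as np

x = np.arange(10, dtype='float32')
speed_rate = 2.0                      # 2x speed -> half as many samples
new_length = int(len(x) / speed_rate)
new_indices = np.linspace(start=0, stop=len(x), num=new_length)
y = np.interp(new_indices, np.arange(len(x)), x)
print(y.shape)  # (5,)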
+        :type speed_rate: float
+        :raises ValueError: If speed_rate <= 0.0.
+        """
+        if speed_rate <= 0:
+            raise ValueError("speed_rate should be greater than zero.")
+        old_length = self._samples.shape[0]
+        new_length = int(old_length / speed_rate)
+        old_indices = np.arange(old_length)
+        new_indices = np.linspace(start=0, stop=old_length, num=new_length)
+        self._samples = np.interp(new_indices, old_indices, self._samples)
+
+    def normalize(self, target_db=-20, max_gain_db=300.0):
+        """Normalize audio to be of the desired RMS value in decibels.
+
+        Note that this is an in-place transformation.
+
+        :param target_db: Target RMS value in decibels. This value should be
+                          less than 0.0 as 0.0 is full-scale audio.
+        :type target_db: float
+        :param max_gain_db: Max amount of gain in dB that can be applied for
+                            normalization. This is to prevent nans when
+                            attempting to normalize a signal consisting of
+                            all zeros.
+        :type max_gain_db: float
+        :raises ValueError: If the required gain to normalize the segment to
+                            the target_db value exceeds max_gain_db.
+        """
+        gain = target_db - self.rms_db
+        if gain > max_gain_db:
+            raise ValueError(
+                "Unable to normalize segment to %f dB because the required "
+                "gain (%f dB) exceeds max_gain_db (%f dB)" %
+                (target_db, gain, max_gain_db))
+        self.gain_db(min(max_gain_db, gain))
+
+    def normalize_online_bayesian(self,
+                                  target_db,
+                                  prior_db,
+                                  prior_samples,
+                                  startup_delay=0.0):
+        """Normalize audio using a production-compatible online/causal
+        algorithm. This uses an exponential likelihood and gamma prior to
+        make online estimates of the RMS even when there are very few samples.
+
+        Note that this is an in-place transformation.
+
+        :param target_db: Target RMS value in decibels.
+        :type target_db: float
+        :param prior_db: Prior RMS estimate in decibels.
+        :type prior_db: float
+        :param prior_samples: Prior strength in number of samples.
+        :type prior_samples: float
+        :param startup_delay: Default 0.0s. If provided, this function will
+                              accrue statistics for the first startup_delay
+                              seconds before applying online normalization.
+        :type startup_delay: float
+        """
+        # Estimate total RMS online.
+        startup_sample_idx = min(self.num_samples - 1,
+                                 int(self.sample_rate * startup_delay))
+        prior_mean_squared = 10.**(prior_db / 10.)
+        prior_sum_of_squares = prior_mean_squared * prior_samples
+        cumsum_of_squares = np.cumsum(self.samples**2)
+        sample_count = np.arange(self.num_samples) + 1
+        if startup_sample_idx > 0:
+            cumsum_of_squares[:startup_sample_idx] = \
+                cumsum_of_squares[startup_sample_idx]
+            sample_count[:startup_sample_idx] = \
+                sample_count[startup_sample_idx]
+        mean_squared_estimate = ((cumsum_of_squares + prior_sum_of_squares) /
+                                 (sample_count + prior_samples))
+        rms_estimate_db = 10 * np.log10(mean_squared_estimate)
+        # Compute required time-varying gain.
+        gain_db = target_db - rms_estimate_db
+        self.gain_db(gain_db)
+
+    def resample(self, target_sample_rate, filter='kaiser_best'):
+        """Resample the audio to a target sample rate.
+
+        Note that this is an in-place transformation.
+
+        :param target_sample_rate: Target sample rate.
+        :type target_sample_rate: int
+        :param filter: The resampling filter to use, one of {'kaiser_best',
+                       'kaiser_fast'}.
+        :type filter: str
+        """
+        self._samples = resampy.resample(
+            self.samples, self.sample_rate, target_sample_rate, filter=filter)
+        self._sample_rate = target_sample_rate
+
+    def pad_silence(self, duration, sides='both'):
+        """Pad this audio sample with a period of silence.
+
+        Note that this is an in-place transformation.
+
+        :param duration: Length of silence in seconds to pad.
+        :type duration: float
+        :param sides: Position for padding:
+                      'beginning' - adds silence in the beginning;
+                      'end' - adds silence in the end;
+                      'both' - adds silence in both the beginning and the end.
+        :type sides: str
+        :raises ValueError: If sides is not supported.
+        """
+        if duration == 0.0:
+            return
+        cls = type(self)
+        silence = self.make_silence(duration, self._sample_rate)
+        if sides == "beginning":
+            padded = cls.concatenate(silence, self)
+        elif sides == "end":
+            padded = cls.concatenate(self, silence)
+        elif sides == "both":
+            padded = cls.concatenate(silence, self, silence)
+        else:
+            raise ValueError("Unknown value for sides: %s" % sides)
+        self._samples = padded._samples
+
+    def shift(self, shift_ms):
+        """Shift the audio in time. If `shift_ms` is positive, shift with time
+        advance; if negative, shift with time delay. Silence is padded to
+        keep the duration unchanged.
+
+        Note that this is an in-place transformation.
+
+        :param shift_ms: Shift time in milliseconds. If positive, shift with
+                         time advance; if negative, shift with time delay.
+        :type shift_ms: float
+        :raises ValueError: If shift_ms is longer than audio duration.
+        """
+        if abs(shift_ms) / 1000.0 > self.duration:
+            raise ValueError("Absolute value of shift_ms should be smaller "
+                             "than audio duration.")
+        shift_samples = int(shift_ms * self._sample_rate / 1000)
+        if shift_samples > 0:
+            # time advance
+            self._samples[:-shift_samples] = self._samples[shift_samples:]
+            self._samples[-shift_samples:] = 0
+        elif shift_samples < 0:
+            # time delay
+            self._samples[-shift_samples:] = self._samples[:shift_samples]
+            self._samples[:-shift_samples] = 0
+
+    def subsegment(self, start_sec=None, end_sec=None):
+        """Cut the AudioSegment between given boundaries.
+
+        Note that this is an in-place transformation.
+
+        :param start_sec: Beginning of subsegment in seconds.
+        :type start_sec: float
+        :param end_sec: End of subsegment in seconds.
+        :type end_sec: float
+        :raises ValueError: If start_sec or end_sec is incorrectly set, e.g.
+                            out of bounds in time.
+        """
+        start_sec = 0.0 if start_sec is None else start_sec
+        end_sec = self.duration if end_sec is None else end_sec
+        if start_sec < 0.0:
+            start_sec = self.duration + start_sec
+        if end_sec < 0.0:
+            end_sec = self.duration + end_sec
+        if start_sec < 0.0:
+            raise ValueError("The slice start position (%f s) is out of "
+                             "bounds." % start_sec)
+        if end_sec < 0.0:
+            raise ValueError("The slice end position (%f s) is out of "
+                             "bounds." % end_sec)
+        if start_sec > end_sec:
+            raise ValueError("The slice start position (%f s) is later than "
+                             "the end position (%f s)." % (start_sec, end_sec))
+        if end_sec > self.duration:
+            raise ValueError("The slice end position (%f s) is out of bounds "
+                             "(> %f s)" % (end_sec, self.duration))
+        start_sample = int(round(start_sec * self._sample_rate))
+        end_sample = int(round(end_sec * self._sample_rate))
+        self._samples = self._samples[start_sample:end_sample]
+
+    def random_subsegment(self, subsegment_length, rng=None):
+        """Cut the specified length of the audio segment randomly.
+
+        Note that this is an in-place transformation.
+
+        :param subsegment_length: Subsegment length in seconds.
+        :type subsegment_length: float
+        :param rng: Random number generator state.
+        :type rng: random.Random
+        :raises ValueError: If the length of subsegment is greater than
+                            the original segment.
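# --- Illustrative sketch (not part of the patch): the in-place time
# shift performed by shift() above, on a plain array (k = 2 samples of
# time advance).
import numpy as np

x = np.arange(8, dtype='float32')
k = 2
x[:-k] = x[k:]   # move samples earlier in time
x[-k:] = 0       # pad the tail with silence
print(x)         # [2. 3. 4. 5. 6. 7. 0. 0.]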
+ """ + rng = random.Random() if rng is None else rng + if subsegment_length > self.duration: + raise ValueError("Length of subsegment must not be greater " + "than original segment.") + start_time = rng.uniform(0.0, self.duration - subsegment_length) + self.subsegment(start_time, start_time + subsegment_length) + + def convolve(self, impulse_segment, allow_resample=False): + """Convolve this audio segment with the given impulse segment. + + Note that this is an in-place transformation. + + :param impulse_segment: Impulse response segments. + :type impulse_segment: AudioSegment + :param allow_resample: Indicates whether resampling is allowed when + the impulse_segment has a different sample + rate from this signal. + :type allow_resample: bool + :raises ValueError: If the sample rate is not match between two + audio segments when resample is not allowed. + """ + if allow_resample and self.sample_rate != impulse_segment.sample_rate: + impulse_segment.resample(self.sample_rate) + if self.sample_rate != impulse_segment.sample_rate: + raise ValueError("Impulse segment's sample rate (%d Hz) is not " + "equal to base signal sample rate (%d Hz)." % + (impulse_segment.sample_rate, self.sample_rate)) + samples = signal.fftconvolve(self.samples, impulse_segment.samples, + "full") + self._samples = samples + + def convolve_and_normalize(self, impulse_segment, allow_resample=False): + """Convolve and normalize the resulting audio segment so that it + has the same average power as the input signal. + + Note that this is an in-place transformation. + + :param impulse_segment: Impulse response segments. + :type impulse_segment: AudioSegment + :param allow_resample: Indicates whether resampling is allowed when + the impulse_segment has a different sample + rate from this signal. + :type allow_resample: bool + """ + target_db = self.rms_db + self.convolve(impulse_segment, allow_resample=allow_resample) + self.normalize(target_db) + + def add_noise(self, + noise, + snr_dB, + allow_downsampling=False, + max_gain_db=300.0, + rng=None): + """Add the given noise segment at a specific signal-to-noise ratio. + If the noise segment is longer than this segment, a random subsegment + of matching length is sampled from it and used instead. + + Note that this is an in-place transformation. + + :param noise: Noise signal to add. + :type noise: AudioSegment + :param snr_dB: Signal-to-Noise Ratio, in decibels. + :type snr_dB: float + :param allow_downsampling: Whether to allow the noise signal to be + downsampled to match the base signal sample + rate. + :type allow_downsampling: bool + :param max_gain_db: Maximum amount of gain to apply to noise signal + before adding it in. This is to prevent attempting + to apply infinite gain to a zero signal. + :type max_gain_db: float + :param rng: Random number generator state. + :type rng: None|random.Random + :raises ValueError: If the sample rate does not match between the two + audio segments when downsampling is not allowed, or + if the duration of noise segments is shorter than + original audio segments. + """ + rng = random.Random() if rng is None else rng + if allow_downsampling and noise.sample_rate > self.sample_rate: + noise = noise.resample(self.sample_rate) + if noise.sample_rate != self.sample_rate: + raise ValueError("Noise sample rate (%d Hz) is not equal to base " + "signal sample rate (%d Hz)." 
% (noise.sample_rate, + self.sample_rate)) + if noise.duration < self.duration: + raise ValueError("Noise signal (%f sec) must be at least as long as" + " base signal (%f sec)." % + (noise.duration, self.duration)) + noise_gain_db = min(self.rms_db - noise.rms_db - snr_dB, max_gain_db) + noise_new = copy.deepcopy(noise) + noise_new.random_subsegment(self.duration, rng=rng) + noise_new.gain_db(noise_gain_db) + self.superimpose(noise_new) + + @property + def samples(self): + """Return audio samples. + + :return: Audio samples. + :rtype: ndarray + """ + return self._samples.copy() + + @property + def sample_rate(self): + """Return audio sample rate. + + :return: Audio sample rate. + :rtype: int + """ + return self._sample_rate + + @property + def num_samples(self): + """Return number of samples. + + :return: Number of samples. + :rtype: int + """ + return self._samples.shape[0] + + @property + def duration(self): + """Return audio duration. + + :return: Audio duration in seconds. + :rtype: float + """ + return self._samples.shape[0] / float(self._sample_rate) + + @property + def rms_db(self): + """Return root mean square energy of the audio in decibels. + + :return: Root mean square energy in decibels. + :rtype: float + """ + # square root => multiply by 10 instead of 20 for dBs + mean_square = np.mean(self._samples**2) + return 10 * np.log10(mean_square) + + def _convert_samples_to_float32(self, samples): + """Convert sample type to float32. + + Audio sample type is usually integer or float-point. + Integers will be scaled to [-1, 1] in float32. + """ + float32_samples = samples.astype('float32') + if samples.dtype in np.sctypes['int']: + bits = np.iinfo(samples.dtype).bits + float32_samples *= (1. / 2**(bits - 1)) + elif samples.dtype in np.sctypes['float']: + pass + else: + raise TypeError("Unsupported sample type: %s." % samples.dtype) + return float32_samples + + def _convert_samples_from_float32(self, samples, dtype): + """Convert sample type from float32 to dtype. + + Audio sample type is usually integer or float-point. For integer + type, float32 will be rescaled from [-1, 1] to the maximum range + supported by the integer type. + + This is for writing a audio file. + """ + dtype = np.dtype(dtype) + output_samples = samples.copy() + if dtype in np.sctypes['int']: + bits = np.iinfo(dtype).bits + output_samples *= (2**(bits - 1) / 1.) + min_val = np.iinfo(dtype).min + max_val = np.iinfo(dtype).max + output_samples[output_samples > max_val] = max_val + output_samples[output_samples < min_val] = min_val + elif samples.dtype in np.sctypes['float']: + min_val = np.finfo(dtype).min + max_val = np.finfo(dtype).max + output_samples[output_samples > max_val] = max_val + output_samples[output_samples < min_val] = min_val + else: + raise TypeError("Unsupported sample type: %s." % samples.dtype) + return output_samples.astype(dtype) diff --git a/lib/contrib/audio_featurizer.py b/lib/contrib/audio_featurizer.py new file mode 100644 index 0000000..0069b2e --- /dev/null +++ b/lib/contrib/audio_featurizer.py @@ -0,0 +1,182 @@ +"""Contains the audio featurizer class.""" +import numpy as np +# https://github.com/jameslyons/python_speech_features +from python_speech_features import mfcc +from python_speech_features import delta + + +class AudioFeaturizer(object): + """Audio featurizer, for extracting features from audio contents of + AudioSegment or SpeechSegment. + + Currently, it supports feature types of linear spectrogram and mfcc. + + :param specgram_type: Specgram feature type. Options: 'linear'. 
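# --- Illustrative sketch (not part of the patch): the gain rule used by
# add_noise() above. The noise is scaled so that the RMS gap between
# signal and noise equals the requested SNR; the synthetic signals are
# assumptions for the demo.
import numpy as np

def rms_db(x):
    return 10 * np.log10(np.mean(x**2))

sig = np.random.uniform(-0.5, 0.5, 16000)
noise = np.random.uniform(-0.9, 0.9, 16000)
snr_db = 10.0
noise_gain_db = rms_db(sig) - rms_db(noise) - snr_db
scaled = noise * 10.0**(noise_gain_db / 20.0)
print(rms_db(sig) - rms_db(scaled))  # == snr_db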
+ :type specgram_type: str + :param stride_ms: Striding size (in milliseconds) for generating frames. + :type stride_ms: float + :param window_ms: Window size (in milliseconds) for generating frames. + :type window_ms: float + :param max_freq: When specgram_type is 'linear', only FFT bins + corresponding to frequencies between [0, max_freq] are + returned; when specgram_type is 'mfcc', max_feq is the + highest band edge of mel filters. + :types max_freq: None|float + :param target_sample_rate: Audio are resampled (if upsampling or + downsampling is allowed) to this before + extracting spectrogram features. + :type target_sample_rate: float + :param use_dB_normalization: Whether to normalize the audio to a certain + decibels before extracting the features. + :type use_dB_normalization: bool + :param target_dB: Target audio decibels for normalization. + :type target_dB: float + """ + + def __init__(self, + specgram_type='linear', + stride_ms=10.0, + window_ms=20.0, + max_freq=None, + target_sample_rate=16000, + use_dB_normalization=True, + target_dB=-20): + self._specgram_type = specgram_type + self._stride_ms = stride_ms + self._window_ms = window_ms + self._max_freq = max_freq + self._target_sample_rate = target_sample_rate + self._use_dB_normalization = use_dB_normalization + self._target_dB = target_dB + + def featurize(self, + audio_segment, + allow_downsampling=True, + allow_upsampling=True): + """Extract audio features from AudioSegment or SpeechSegment. + + :param audio_segment: Audio/speech segment to extract features from. + :type audio_segment: AudioSegment|SpeechSegment + :param allow_downsampling: Whether to allow audio downsampling before + featurizing. + :type allow_downsampling: bool + :param allow_upsampling: Whether to allow audio upsampling before + featurizing. + :type allow_upsampling: bool + :return: Spectrogram audio feature in 2darray. + :rtype: ndarray + :raises ValueError: If audio sample rate is not supported. + """ + # upsampling or downsampling + if ((audio_segment.sample_rate > self._target_sample_rate and + allow_downsampling) or + (audio_segment.sample_rate < self._target_sample_rate and + allow_upsampling)): + audio_segment.resample(self._target_sample_rate) + if audio_segment.sample_rate != self._target_sample_rate: + raise ValueError("Audio sample rate is not supported. " + "Turn allow_downsampling or allow up_sampling on.") + # decibel normalization + if self._use_dB_normalization: + audio_segment.normalize(target_db=self._target_dB) + # extract spectrogram + return self._compute_specgram(audio_segment.samples, + audio_segment.sample_rate) + + def _compute_specgram(self, samples, sample_rate): + """Extract various audio features.""" + if self._specgram_type == 'linear': + return self._compute_linear_specgram( + samples, sample_rate, self._stride_ms, self._window_ms, + self._max_freq) + elif self._specgram_type == 'mfcc': + return self._compute_mfcc(samples, sample_rate, self._stride_ms, + self._window_ms, self._max_freq) + else: + raise ValueError("Unknown specgram_type %s. " + "Supported values: linear." 
% self._specgram_type) + + def _compute_linear_specgram(self, + samples, + sample_rate, + stride_ms=10.0, + window_ms=20.0, + max_freq=None, + eps=1e-14): + """Compute the linear spectrogram from FFT energy.""" + if max_freq is None: + max_freq = sample_rate / 2 + if max_freq > sample_rate / 2: + raise ValueError("max_freq must not be greater than half of " + "sample rate.") + if stride_ms > window_ms: + raise ValueError("Stride size must not be greater than " + "window size.") + stride_size = int(0.001 * sample_rate * stride_ms) + window_size = int(0.001 * sample_rate * window_ms) + specgram, freqs = self._specgram_real( + samples, + window_size=window_size, + stride_size=stride_size, + sample_rate=sample_rate) + ind = np.where(freqs <= max_freq)[0][-1] + 1 + return np.log(specgram[:ind, :] + eps) + + def _specgram_real(self, samples, window_size, stride_size, sample_rate): + """Compute the spectrogram for samples from a real signal.""" + # extract strided windows + truncate_size = (len(samples) - window_size) % stride_size + samples = samples[:len(samples) - truncate_size] + nshape = (window_size, (len(samples) - window_size) // stride_size + 1) + nstrides = (samples.strides[0], samples.strides[0] * stride_size) + windows = np.lib.stride_tricks.as_strided( + samples, shape=nshape, strides=nstrides) + assert np.all( + windows[:, 1] == samples[stride_size:(stride_size + window_size)]) + # window weighting, squared Fast Fourier Transform (fft), scaling + weighting = np.hanning(window_size)[:, None] + fft = np.fft.rfft(windows * weighting, axis=0) + fft = np.absolute(fft) + fft = fft**2 + scale = np.sum(weighting**2) * sample_rate + fft[1:-1, :] *= (2.0 / scale) + fft[(0, -1), :] /= scale + # prepare fft frequency list + freqs = float(sample_rate) / window_size * np.arange(fft.shape[0]) + return fft, freqs + + def _compute_mfcc(self, + samples, + sample_rate, + stride_ms=10.0, + window_ms=20.0, + max_freq=None): + """Compute mfcc from samples.""" + if max_freq is None: + max_freq = sample_rate / 2 + if max_freq > sample_rate / 2: + raise ValueError("max_freq must not be greater than half of " + "sample rate.") + if stride_ms > window_ms: + raise ValueError("Stride size must not be greater than " + "window size.") + # compute the 13 cepstral coefficients, and the first one is replaced + # by log(frame energy) + mfcc_feat = mfcc( + signal=samples, + samplerate=sample_rate, + winlen=0.001 * window_ms, + winstep=0.001 * stride_ms, + highfreq=max_freq) + # Deltas + d_mfcc_feat = delta(mfcc_feat, 2) + # Deltas-Deltas + dd_mfcc_feat = delta(d_mfcc_feat, 2) + # transpose + mfcc_feat = np.transpose(mfcc_feat) + d_mfcc_feat = np.transpose(d_mfcc_feat) + dd_mfcc_feat = np.transpose(dd_mfcc_feat) + # concat above three features + concat_mfcc_feat = np.concatenate( + (mfcc_feat, d_mfcc_feat, dd_mfcc_feat)) + return concat_mfcc_feat diff --git a/lib/pinyinDictNoTone.pickle b/lib/pinyinDictNoTone.pickle new file mode 100644 index 0000000..1b2fe00 Binary files /dev/null and b/lib/pinyinDictNoTone.pickle differ diff --git a/lib/pinyinDictNoToneInv.pickle b/lib/pinyinDictNoToneInv.pickle new file mode 100644 index 0000000..bd03a7a Binary files /dev/null and b/lib/pinyinDictNoToneInv.pickle differ diff --git a/lib/recorder.py b/lib/recorder.py new file mode 100644 index 0000000..856f0a5 --- /dev/null +++ b/lib/recorder.py @@ -0,0 +1,80 @@ +import pyaudio +import wave + +class Recorder(object): + '''A recorder class for recording audio to a WAV file. + Records in mono by default. 
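# --- Illustrative sketches (not part of the patch).
# (a) The strided-window FFT at the core of _specgram_real() above, on a
#     1 kHz tone; sizes mirror the defaults (20 ms window, 10 ms stride
#     at 16 kHz) but are illustrative.
import numpy as np

sr, window_size, stride_size = 16000, 320, 160
x = np.sin(2 * np.pi * 1000 * np.arange(sr) / sr).astype('float32')
trunc = (len(x) - window_size) % stride_size
x = x[:len(x) - trunc]
n_windows = (len(x) - window_size) // stride_size + 1
windows = np.lib.stride_tricks.as_strided(
    x, shape=(window_size, n_windows),
    strides=(x.strides[0], x.strides[0] * stride_size))
weighting = np.hanning(window_size)[:, None]
power = np.abs(np.fft.rfft(windows * weighting, axis=0))**2
freqs = float(sr) / window_size * np.arange(power.shape[0])
print(freqs[power.mean(axis=1).argmax()])  # ~1000.0 Hz

# (b) End-to-end usage of the two classes in this diff. The import
#     paths assume lib/contrib is importable as a package; adjust to
#     your layout.
from lib.contrib.audio import AudioSegment
from lib.contrib.audio_featurizer import AudioFeaturizer

tone = (0.1 * np.sin(2 * np.pi * 440 * np.arange(sr) / sr)).astype('float32')
seg = AudioSegment(tone, sr)
featurizer = AudioFeaturizer(specgram_type='linear')
feat = featurizer.featurize(seg)   # normalizes to -20 dB, then specgram
print(feat.shape)                  # (freq_bins, frames)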
+ ''' + + def __init__(self, channels=1, rate=44100, frames_per_buffer=1024): + self.channels = channels + self.rate = rate + self.frames_per_buffer = frames_per_buffer + + def open(self, fname, mode='wb'): + return RecordingFile(fname, mode, self.channels, self.rate, + self.frames_per_buffer) + +class RecordingFile(object): + def __init__(self, fname, mode, channels, + rate, frames_per_buffer): + self.fname = fname + self.mode = mode + self.channels = channels + self.rate = rate + self.frames_per_buffer = frames_per_buffer + self._pa = pyaudio.PyAudio() + self.wavefile = self._prepare_file(self.fname, self.mode) + self._stream = None + + def __enter__(self): + return self + + def __exit__(self, exception, value, traceback): + self.close() + + def record(self, duration): + # Use a stream with no callback function in blocking mode + self._stream = self._pa.open(format=pyaudio.paInt16, + channels=self.channels, + rate=self.rate, + input=True, + frames_per_buffer=self.frames_per_buffer) + for _ in range(int(self.rate / self.frames_per_buffer * duration)): + audio = self._stream.read(self.frames_per_buffer) + self.wavefile.writeframes(audio) + return None + + def start_recording(self): + # Use a stream with a callback in non-blocking mode + self._stream = self._pa.open(format=pyaudio.paInt16, + channels=self.channels, + rate=self.rate, + input=True, + frames_per_buffer=self.frames_per_buffer, + stream_callback=self.get_callback()) + self._stream.start_stream() + return self + + def stop_recording(self): + self._stream.stop_stream() + return self + + def get_callback(self): + def callback(in_data, frame_count, time_info, status): + self.wavefile.writeframes(in_data) + return in_data, pyaudio.paContinue + return callback + + + def close(self): + self._stream.close() + self._pa.terminate() + self.wavefile.close() + + def _prepare_file(self, fname, mode='wb'): + wavefile = wave.open(fname, mode) + wavefile.setnchannels(self.channels) + wavefile.setsampwidth(self._pa.get_sample_size(pyaudio.paInt16)) + wavefile.setframerate(self.rate) + return wavefile diff --git a/lib/tools_audio.py b/lib/tools_audio.py new file mode 100644 index 0000000..c25bc32 --- /dev/null +++ b/lib/tools_audio.py @@ -0,0 +1,92 @@ +import scipy.io.wavfile as wav +import numpy as np +import os +import pydub +import tempfile +import scipy +import random +from python_speech_features import logfbank + +from lib.tools_math import * + +def changeRateTo16000(filepath): + if filepath[-4:].lower() =='.wav': + sound = pydub.AudioSegment.from_wav(filepath) + sound = sound.set_frame_rate(16000) + sound.export(filepath, format="wav") + elif filepath[-4:].lower() =='.m4a': + sound = pydub.AudioSegment.from_file(filepath, "m4a") + sound = sound.set_frame_rate(16000) + sound.export(filepath[:-3]+"wav", format="wav") + elif filepath[-4:].lower() =='.mp3': + sound = pydub.AudioSegment.from_mp3(filepath) + sound = sound.set_frame_rate(16000) + sound.export(filepath[:-3]+"wav", format="wav") + else: + print("Unsupported Format.") + +def read_wav(file_path): + assert file_path[-4:]=='.wav' + rate, data = wav.read(file_path) + return rate, data + +def read_m4a(file_path): + path, ext = os.path.splitext(file_path) + assert ext=='.m4a' + aac_version = pydub.AudioSegment.from_file(file_path, "m4a") + _, path = tempfile.mkstemp() + aac_version.export(path, format="wav") + rate, data = scipy.io.wavfile.read(path) + os.remove(path) + return rate, data + +def read_mp3(file_path): + path, ext = os.path.splitext(file_path) + assert ext=='.mp3' + mp3 = 
pydub.AudioSegment.from_mp3(file_path) + _, path = tempfile.mkstemp() + mp3.export(path, format="wav") + rate, data = scipy.io.wavfile.read(path) + os.remove(path) + return rate, data + +def mp3_to_wav(file_path, obj_path): + path, ext = os.path.splitext(file_path) + assert ext=='.mp3' + mp3 = pydub.AudioSegment.from_mp3(file_path) + mp3.export(obj_path, format="wav") + +def mergeChannels(data): + data = normalize(data) + if len(data.shape)==1: + return data + if len(data.shape)==2: + return np.mean(data, axis = 1) + raise ValueError("This is not what an audio file ought to be!") + +def getDefaultSpectrogram(rate, data): + f, t, Sxx = signal.spectrogram(data, fs=rate, window='hamming', nperseg=400, noverlap=240, nfft=1024, scaling='spectrum', return_onesided=True) + return Sxx + +def frame_split(data, frame_width, frame_step): + if len(data)= num data points in one window(frame) + if wav_process == True: + frame_shift = int(sr * winstep) + frame_size = int(sr * winlen) + wave_data, index = librosa.effects.trim(wave_data, frame_length=frame_size, hop_length=frame_shift) + mel_db = logfbank(wave_data, samplerate=sr, winlen=winlen, winstep=winstep, + nfilt=num_mel, nfft=nfft, lowfreq=0, highfreq=None, preemph=0.97) + mel_db -= (np.mean(mel_db,axis=1).reshape(-1,1)+1e-8) + return mel_db diff --git a/lib/tools_augmentation.py b/lib/tools_augmentation.py new file mode 100644 index 0000000..23ce1db --- /dev/null +++ b/lib/tools_augmentation.py @@ -0,0 +1,368 @@ +import numpy as np +import pyfftw +import resampy +from scipy.ndimage import zoom + +def randomAugment(data, rate, num, obj_length = None, noiseSource = None, bgMaximum = 0.08, verbose = False): + """ + Perform random augmentations. Recommended bgMaximum: random noise:0.07, + office - 0.1, youtube human: 0.07, youtube backgrounds: 0.15. + + :param np.ndarray data: The audio's data point. One channel, which means the + length of the shape should be one. + :param int rate: The sampling rate of the audio. + :param int num: Number of augmentation. + :param int obj_length: Output audio lengths. Will not be padded if leave it + none + :param np.ndarray noiseSource: The source of noise. Will add white noise if + leave it none. + :param float bgMaximum: The maximum background sound. + :param boolean verbose: If true, print out the adjustment made during the + process. + + :return: A list of audio data points. + :raises ValueError: If num < 0 or obj_length <= 0 or 0.75*len(data). + """ + if obj_length is not None: + if (obj_length <= 0): + raise ValueError('Objective length must be above than 0.') + if (obj_length <= 0.75*len(data)): + raise ValueError('Objective length too short.') + if num < 0: + raise ValueError('Number of augmentation must be above than or equal to 0.') + if num == 0: + return [] + + result = [] + data = _normalize(data) + for _ in range(num): + # 1. shift the data a little bit. + if len(data)>16000: + shifty = min(int(len(data)/10), np.random.randint(4000)) + if np.random.random()>0.5: shifty *= -1 + transformed = shift(data, shifty) + else: + transformed = data + # 2. Adjust the speed. + ub = 1.25 + lb = 0.8 + if obj_length is not None: + zoomUpperBound = min(ub, obj_length/float(len(transformed))) + if zoomUpperBound= upper. + """ + yf = pyfftw.interfaces.numpy_fft.fft(data) + trans = np.copy(yf) + trans *= 0 + + # Clip the frequency. + if freq_range is not None: + # Determine the maximum and the minimum frequency. 
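# --- Illustrative sketch (not part of the patch): the log-Mel filterbank
# features computed by the (partially garbled) helper above via
# python_speech_features, including its per-frame mean subtraction.
# Parameter values are illustrative.
import numpy as np
from python_speech_features import logfbank

sr = 16000
x = np.random.uniform(-1, 1, sr)
mel_db = logfbank(x, samplerate=sr, winlen=0.025, winstep=0.01,
                  nfilt=40, nfft=512, lowfreq=0, highfreq=None,
                  preemph=0.97)
mel_db -= np.mean(mel_db, axis=1).reshape(-1, 1) + 1e-8
print(mel_db.shape)  # (frames, 40)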
+            minF, maxF = freq_range
+            assert maxF > minF
+            fBound = int(rate/2)
+            minF = max(0, minF)
+            maxF = min(fBound, maxF)
+            # Determine the maximum and the minimum point.
+            minP = int(len(yf)*minF/(2*fBound))
+            maxP = int(len(yf)*maxF/(2*fBound))
+            # Trim the fourier form.
+            trans[minP:maxP] = yf[minP:maxP]
+            trans[-maxP:-minP] = yf[-maxP:-minP]
+            yf = trans
+            trans = np.copy(yf)
+            trans *= 0
+
+    # Shift by the bias.
+    for i in range(int(len(yf)/2)):
+        obj_index = int(i-bias*len(yf)/rate)
+        if (obj_index <= (len(yf)/2)) and (obj_index >= 0):
+            trans[i] = yf[obj_index]
+            trans[-i] = yf[-obj_index]
+
+    s = _normalize(pyfftw.interfaces.numpy_fft.ifft(trans).real)
+    return s
+
+def dataTrim(data, trim_lower, trim_upper):
+    """
+    Trim the audio, which means the data points before trim_lower or after
+    trim_upper will be removed. trim_upper can be negative.
+
+    :param np.ndarray data: The audio's data points. One channel, which means
+                            the length of the shape should be one.
+    :param int trim_lower: The lower bound to trim.
+    :param int trim_upper: The upper bound to trim.
+
+    :return: Transformed audio data points.
+    :raises ValueError: If trim_lower >= trim_upper.
+    """
+    if (trim_lower < 0):
+        raise ValueError('Lower bound must be greater than or equal to zero.')
+    if (trim_upper == 0):
+        raise ValueError('Upper bound cannot be zero.')
+    if (trim_upper > 0) and (trim_lower >= trim_upper):
+        raise ValueError('Lower bound is larger than or equal to the upper bound.')
+    if (trim_upper < 0) and ((trim_lower-trim_upper) >= len(data)):
+        raise ValueError('Lower bound is larger than or equal to the upper bound.')
+
+    return data[trim_lower:trim_upper]
+
+def dataPadding(data, padding_lower, padding_upper):
+    """
+    Add zeros to the beginning or the end of the audio.
+
+    :param np.ndarray data: The audio's data points. One channel, which means
+                            the length of the shape should be one.
+    :param int padding_lower: Number of zeros to be added to the beginning.
+    :param int padding_upper: Number of zeros to be added to the end.
+
+    :return: Transformed audio data points.
+    :raises ValueError: If padding_lower or padding_upper <= 0.
+    """
+    if (padding_lower <= 0) or (padding_upper <= 0):
+        raise ValueError('Number of padding must be greater than zero.')
+    result = np.zeros(len(data)+padding_lower+padding_upper)
+    try:
+        result[padding_lower:-padding_upper] = data
+    except ValueError:
+        raise ValueError('Padding failed: padding_lower=%d, padding_upper=%d, '
+                         'len(data)=%d.' % (padding_lower, padding_upper,
+                                            len(data)))
+    return result
+
+def audioResize(data, zoomFactor):
+    """
+    Resize the audio. Not only the length, but the frequency will also be
+    changed. If dealing with speech data, the recommended zoomFactor bound is
+    [0.75, 1.35].
+
+    :param np.ndarray data: The audio's data points. One channel, which means
+                            the length of the shape should be one.
+    :param float zoomFactor: The objective zoom factor.
+
+    :return: Transformed audio data points.
+    :raises ValueError: If zoomFactor <= 0.
+    """
+    if (zoomFactor <= 0):
+        raise ValueError('The zoomFactor should be larger than zero.')
+    return zoom(data, zoomFactor)
+
+def audioVolume(data, maximum):
+    """
+    Set the maximum volume of the audio. Setting the maximum above 0.99 is
+    not recommended.
+
+    :param np.ndarray data: The audio's data points. One channel, which means
+                            the length of the shape should be one.
+    :param float maximum: The maximum volume.
+
+    :return: Transformed audio data points. Should be the same size as
+             'data'.
+    :raises ValueError: If maximum < 0 or > 1.
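# --- Illustrative sketch (not part of the patch): the FFT band-pass
# used by fourierTransform() above, written with numpy.fft (pyfftw
# exposes the same interface for this purpose). A 300 Hz + 3000 Hz
# mixture is filtered to the 1-4 kHz band.
import numpy as np

rate = 16000
t = np.arange(rate) / rate
x = np.sin(2 * np.pi * 300 * t) + np.sin(2 * np.pi * 3000 * t)
yf = np.fft.fft(x)
trans = np.zeros_like(yf)
f_bound = int(rate / 2)
min_p = int(len(yf) * 1000 / (2 * f_bound))
max_p = int(len(yf) * 4000 / (2 * f_bound))
trans[min_p:max_p] = yf[min_p:max_p]       # positive frequencies
trans[-max_p:-min_p] = yf[-max_p:-min_p]   # mirrored negative frequencies
y = np.fft.ifft(trans).real
print(np.abs(np.fft.rfft(y)).argmax() * rate / len(y))  # ~3000.0 Hz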
+ """ + if (maximum < 0) or (maximum > 1): + raise ValueError('The maximum should be between 0 and 1.') + return maximum * data / np.max(np.abs(data)) + +def audioVolumeLinear(data, maximum_start, maximum_end): + """ + Set the maximum volume of the audio linearly regarding the time. + + :param np.ndarray data: The audio's data point. One channel, which means the + length of the shape should be one. + :param float maximum_start: The maximum volume at the beginning. + :param float maximum_end: The maximum volume in the end. + + :return: Transformed audio data points. Should be the same size as the + 'data'. + :raises ValueError: If (maximum_start < 0 or >1) or (maximum_end < 0 or >1). + """ + if (maximum_start < 0) or (maximum_start > 1) or (maximum_end < 0) or (maximum_end > 1): + raise ValueError('The maximum should be between 0 and 1.') + maximum = np.linspace(maximum_start, maximum_end, num=len(data)) + return maximum * data / np.max(np.abs(data)) + +def addNoise(data, noise_factor): + """ + Add random noise to the audio. + + :param np.ndarray data: The audio's data point. One channel, which means the + length of the shape should be one. + :param float noise_factor: The ratio of noise in volume. + + :return: Transformed audio data points. Should be the same size as the + 'data'. + :raises ValueError: If noise_factor < 0 or >1. + """ + if (noise_factor < 0) or (noise_factor > 1): + raise ValueError('The noise factor should be between 0 and 1.') + noise = np.random.random(size=len(data))*2-1 + noise /= (np.max(np.abs(noise))/noise_factor) + audio = (1.0-noise_factor) * data / np.max(np.abs(data)) + return noise+audio + +def addNoiseFrom(data, noiseSource, noise_factor): + """ + Add random noise from source. If noise from people speaking, recommend below + 0.05, else, recommend below 0.3 + + :param np.ndarray data: The audio's data point. One channel, which means the + length of the shape should be one. + :param np.ndarray noiseSource: The noise data. Must be longer or equal to + the length of 'data'. + :param float noise_factor: The ratio of noise in volume. + + :return: Transformed audio data points. Should be the same size as the + 'data'. + :raises ValueError: If len(noiseSource)1. + """ + if (noise_factor < 0) or (noise_factor > 1): + raise ValueError('The noise factor should be between 0 and 1.') + if (len(noiseSource) len(audio): + raise ValueError("Absolute value of shift_ms should be smaller " + "than audio duration.") + if shifty > 0: + # time advance + audio[:-shifty] = audio[shifty:] + audio[-shifty:] = 0 + elif shifty < 0: + # time delay + audio[-shifty:] = audio[:shifty] + audio[:-shifty] = 0 + return audio + +def resample(audio, rate, target_sample_rate, filter='kaiser_best'): + """ + Resample the audio to a target sample rate. + + :param np.ndarray audio: Audio data points. + :param int target_sample_rate: Target sample rate. + :param str filter: The resampling filter to use one of {'kaiser_best', + 'kaiser_fast'}. + """ + audio = resampy.resample(audio, rate, target_sample_rate, filter=filter) + return audio + +def simple_echo(audio, factor, duration): + """ + Resample the audio to a target sample rate. + + :param np.ndarray audio: Audio data points. + :param int factor: Echo factor. + :param int duration: Echo duration. If duration == 0, then no echo. 
+ """ + if duration == 0: return audio + result = np.zeros(len(audio)+duration) + ratio = 1 + for i in range(duration+1): + result[i:len(audio)+i] = ratio*audio + ratio*=factor + return _normalize(result) + +def impulse_echo(audio, impulse_func): + """ + Resample the audio to a target sample rate. + + :param np.ndarray audio: Audio data points. + :param np.ndarray impulse_func: Echo factors array. + """ + result = np.zeros(len(audio)+len(impulse_func)) + result[:len(audio)] = audio + for i in range(duration): + result[i+1:len(audio)+i+1] = impulse_func[i]*audio + return _normalize(result) + +def _normalize(dat): + return 0.99 * dat / np.max(np.abs(dat)) + +def _zeroPad(dat, l): + """ + Pad by zeros. + + :param list dat: The data array. + :param int l: Objective length. + :raises ValueError: If l is shorter than the dat length. + """ + if lx_obj: + continue + + id = file_path.split('/')[-1][:-4] + + start = time.time() + if augmentation: + bg_lib = self.bg_libs[np.random.randint(len(self.bg_libs))] + if self.server: + ns = bg_lib[np.random.randint(len(bg_lib))] + else: + _, ns = read_wav(bg_lib[np.random.randint(len(bg_lib))]) + ns = mergeChannels(ns) + data = randomAugment(data, rate, 1, obj_length = x_obj, noiseSource = ns, bgMaximum = bgMaximum)[0] + else: + data = zero_padding_1d(data, x_obj) + time1 = time.time() + a_seg = AudioSegment(data, rate) + xs.append(self.af.featurize(a_seg)) + time2 = time.time() + aug_total += time1-start + mfb_total += time2-time1 + + if returnUnicode: + ys.append(self.unicodes[id]) + else: + ys.append(np.array(self.labels[id]).astype(int)) + + if verbose: + # How the fuck to use print? See this: + # print('a={first:4.2f}, b={second:03d}'.format(first=f(x,n),second=g(x,n))) + # print("a=%d,b=%d" % (f(x,n),g(x,n))) + print("Augmentation time = %f sec; Featurization time = %f sec" % (aug_total, mfb_total)) + xs = np.array(xs) + xs = np.transpose(xs, [0,2,1]) + if returnUnicode: + return xs, ys + else: + if isCTC: + ys = sparse_tuple_from(ys) + return xs, ys + else: + ys_lengths = [len(y)+1 for y in ys] + max_length = max(ys_lengths) + temp = [] + + # The first three tokens should be reserved for padding, start, and end tokens. + for y in ys: + if len(y)<(max_length-1): + # Add the end token. (Actually 2, but will be 2 after 3 is added.) 
+ y = np.concatenate([y, [-1]]) + temp.append(np.concatenate([y+3, np.zeros(max_length-len(y))])) + else: + y = np.concatenate([y, [-1]]) + temp.append(y+3) + ys = np.array(temp) + return xs, ys, ys_lengths diff --git a/lib/tools_math.py b/lib/tools_math.py new file mode 100644 index 0000000..819cd20 --- /dev/null +++ b/lib/tools_math.py @@ -0,0 +1,51 @@ +import numpy as np + +def sigmoid(x): + return 1.0/(1.0+np.exp(-x)) + +def normalize(dat): + return 0.99 * dat / np.max(np.abs(dat)) + +def get_topk_args(arr, k): + return arr.argsort()[::-1][:k] + +def get_distance(v1, v2): + if len(v2.shape) != 1: + raise ValueError("arg2 should be an 1d array.") + if len(v1.shape) == 1: + return np.sqrt(np.sum(np.square(v1-v2))) + elif len(v1.shape) == 2: + return np.sqrt(np.sum(np.square(v1-v2), axis = 1)) + else: + raise ValueError("arg1 should be rather 1d or 2d array.") + +def get_cos_sim(v1, v2): + if len(v2.shape) != 1: + raise ValueError("arg2 should be an 1d array.") + if len(v1.shape) == 1: + inner = np.sum((v1*v2)) + normv1 = np.sqrt(np.sum(np.square(v1))) + normv2 = np.sqrt(np.sum(np.square(v2))) + return inner/(normv1*normv2) + elif len(v1.shape) == 2: + inner = np.sum((v1*v2), axis = 1) + normv1 = np.sqrt(np.sum(np.square(v1), axis = 1)) + normv2 = np.sqrt(np.sum(np.square(v2))) + return inner/(normv1*normv2) + +def index2onehot(indices, label_range): + result = np.zeros([len(indices), label_range]) + result[np.arange(len(indices)), indices] = 1.0 + return result + +def randomExcept(n, end, start = 0): + r = range(start, n) + range(n+1, end) + return np.random.choice(r) + +def zero_padding_1d(vec, obj_length): + result = np.concatenate([vec, np.zeros(obj_length-len(vec))]) + return result + +def neg_padding_1d(vec, obj_length): + result = np.concatenate([vec, -np.ones(obj_length-len(vec))]) + return result diff --git a/lib/tools_pinyin.py b/lib/tools_pinyin.py new file mode 100644 index 0000000..ce7d3ce --- /dev/null +++ b/lib/tools_pinyin.py @@ -0,0 +1,62 @@ +import numpy as np +import pickle +from pypinyin import lazy_pinyin + +class pinyinParser: + + def __init__(self, path): + with open(path, 'rb') as handle: + self.pinyinDict = pickle.load(handle) + invPath = path[:-7]+"Inv.pickle" + with open(invPath, 'rb') as handle: + self.pinyinDict_inv = pickle.load(handle) + + def getDictSize(self): + return len(self.pinyinDict) + + def getPinYin(self, unicodeContent): + return " ".join([x for x in lazy_pinyin(unicodeContent)]) + + def _index2OneHot(self, index): + result = np.zeros(len(self.pinyinDict)) + result[index] = 1.0 + return result + + def _indices2OneHot(self, indices): + result = np.zeros([len(indices), len(self.pinyinDict)]) + result[np.arange(len(indices)), indices] = 1.0 + return result + + def getPinYinIndices(self, pinyin): + pinyinList = pinyin.strip().split() + indices = [] + for pinyin in pinyinList: + if pinyin in self.pinyinDict: + indices.append(self.pinyinDict[pinyin]) + else: + raise ValueError("Could not find "+pinyin+" in the dictionary.") + if len(indices)==0: + raise ValueError("Invalid input.") + return indices + + def getPinYinOneHot(self, pinyin): + pinyinList = pinyin.strip().split() + indices = [] + for pinyin in pinyinList: + if pinyin in self.pinyinDict: + indices.append(self.pinyinDict[pinyin]) + else: + raise ValueError("Could not find "+pinyin+" in the dictionary.") + if len(indices)==0: + raise ValueError("Invalid input.") + return self._indices2OneHot(indices) + + def decodeIndices(self, vec, useUnderline = True): + result = [] + for num in vec: + if num 
in self.pinyinDict_inv: + result.append(self.pinyinDict_inv[num]) + if useUnderline: + return '_'.join(result) + else: + return ''.join(result) diff --git a/lib/tools_player.py b/lib/tools_player.py new file mode 100644 index 0000000..46d5e15 --- /dev/null +++ b/lib/tools_player.py @@ -0,0 +1,23 @@ +import numpy as np +import sounddevice as sd +import matplotlib.pyplot as plt + +from lib.tools_audio import * + +def play(vec, Fs): + sd.play(vec, Fs, blocking=True) + +def normalize(dat): + return 0.99 * dat / np.max(np.abs(dat)) + +def load_data(file_path): + try: + _, data_temp = read_mp3(file_path) + except: + _, data_temp = read_wav(file_path) + return data_temp + +def plotSound(vec): + plt.plot(vec) + plt.ylabel('Amplitude') + plt.show() diff --git a/lib/tools_sparse.py b/lib/tools_sparse.py new file mode 100644 index 0000000..0c04b3f --- /dev/null +++ b/lib/tools_sparse.py @@ -0,0 +1,55 @@ +import numpy as np +from lib.tools_pinyin import * + +def get_maxLengthListinList(ls): + length = 0 + for l in ls: + if len(l)>length: length = len(l) + return length + +def sparse_tuple_from(sequences, dtype=np.int32): + """ + Create a sparse representention of x. + Args: + sequences: a list of lists of type dtype where each element is a sequence + Returns: + A tuple with (indices, values, shape) + """ + indices = [] + values = [] + + for n, seq in enumerate(sequences): + indices.extend(zip([n] * len(seq), range(len(seq)))) + values.extend(seq) + + indices = np.asarray(indices, dtype=np.int64) + values = np.asarray(values, dtype=dtype) + shape = np.asarray([len(sequences), np.asarray(indices).max(0)[1] + 1], dtype=np.int64) + + return indices, values, shape + +def sparseTuples2dense(sparseTensor): + pred_dense = -np.ones(sparseTensor[2]) + for i in range(len(sparseTensor[0])): + pred_dense[sparseTensor[0][i][0],sparseTensor[0][i][1]] = sparseTensor[1][i] + return pred_dense + +def report_accuracy(decoded_list, test_targets, pyParser): + original_list = sparseTuples2dense(test_targets) + detected_list = sparseTuples2dense(decoded_list) + print("-------------------") + for i in range(len(original_list)): + original_line = [] + detected_line = [] + for stuff in original_list[i]: + if stuff!=-1: + original_line.append(stuff) + for stuff in detected_list[i]: + if stuff!=-1: + detected_line.append(stuff) + print(i) + print(original_line) + print(detected_line) + print(pyParser.decodeIndices(original_line, useUnderline = True)) + print(pyParser.decodeIndices(detected_line, useUnderline = True)) + print("-------------------") diff --git a/logs/readme.md b/logs/readme.md new file mode 100644 index 0000000..39cdd0d --- /dev/null +++ b/logs/readme.md @@ -0,0 +1 @@ +- diff --git a/models/readme.md b/models/readme.md new file mode 100644 index 0000000..39cdd0d --- /dev/null +++ b/models/readme.md @@ -0,0 +1 @@ +-
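# --- Usage sketch (not part of the patch): round-tripping label
# sequences through the helpers in lib/tools_sparse.py. The import
# assumes pypinyin is installed (tools_sparse pulls in tools_pinyin).
import numpy as np
from lib.tools_sparse import sparse_tuple_from, sparseTuples2dense

seqs = [[3, 1, 2], [4, 5]]
indices, values, shape = sparse_tuple_from(seqs)
print(shape)  # [2 3]
dense = sparseTuples2dense((indices, values, shape))
print(dense)  # [[ 3.  1.  2.]
              #  [ 4.  5. -1.]]   (unfilled positions stay -1)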