diff --git a/lib/contrib/audio.py b/lib/contrib/audio.py new file mode 100644 index 0000000..97c9268 --- /dev/null +++ b/lib/contrib/audio.py @@ -0,0 +1,678 @@ +"""Contains the audio segment class.""" +import numpy as np +import io +import struct +import re +import soundfile +import resampy +from scipy import signal +import random +import copy + + +class AudioSegment(object): + """Monaural audio segment abstraction. + + :param samples: Audio samples [num_samples x num_channels]. + :type samples: ndarray.float32 + :param sample_rate: Audio sample rate. + :type sample_rate: int + :raises TypeError: If the sample data type is not float or int. + """ + + def __init__(self, samples, sample_rate): + """Create audio segment from samples. + + Samples are convert float32 internally, with int scaled to [-1, 1]. + """ + self._samples = self._convert_samples_to_float32(samples) + self._sample_rate = sample_rate + if self._samples.ndim >= 2: + self._samples = np.mean(self._samples, 1) + + def __eq__(self, other): + """Return whether two objects are equal.""" + if type(other) is not type(self): + return False + if self._sample_rate != other._sample_rate: + return False + if self._samples.shape != other._samples.shape: + return False + if np.any(self.samples != other._samples): + return False + return True + + def __ne__(self, other): + """Return whether two objects are unequal.""" + return not self.__eq__(other) + + def __str__(self): + """Return human-readable representation of segment.""" + return ("%s: num_samples=%d, sample_rate=%d, duration=%.2fsec, " + "rms=%.2fdB" % (type(self), self.num_samples, self.sample_rate, + self.duration, self.rms_db)) + + @classmethod + def from_file(cls, file): + """Create audio segment from audio file. + + :param filepath: Filepath or file object to audio file. + :type filepath: basestring|file + :return: Audio segment instance. + :rtype: AudioSegment + """ + samples, sample_rate = soundfile.read(file, dtype='float32') + return cls(samples, sample_rate) + + @classmethod + def slice_from_file(cls, file, start=None, end=None): + """Loads a small section of an audio without having to load + the entire file into the memory which can be incredibly wasteful. + + :param file: Input audio filepath or file object. + :type file: basestring|file + :param start: Start time in seconds. If start is negative, it wraps + around from the end. If not provided, this function + reads from the very beginning. + :type start: float + :param end: End time in seconds. If end is negative, it wraps around + from the end. If not provided, the default behvaior is + to read to the end of the file. + :type end: float + :return: AudioSegment instance of the specified slice of the input + audio file. + :rtype: AudioSegment + :raise ValueError: If start or end is incorrectly set, e.g. out of + bounds in time. + """ + sndfile = soundfile.SoundFile(file) + sample_rate = sndfile.samplerate + duration = float(len(sndfile)) / sample_rate + start = 0. if start is None else start + end = 0. if end is None else end + if start < 0.0: + start += duration + if end < 0.0: + end += duration + if start < 0.0: + raise ValueError("The slice start position (%f s) is out of " + "bounds." % start) + if end < 0.0: + raise ValueError("The slice end position (%f s) is out of bounds." % + end) + if start > end: + raise ValueError("The slice start position (%f s) is later than " + "the slice end position (%f s)." 
% (start, end)) + if end > duration: + raise ValueError("The slice end position (%f s) is out of bounds " + "(> %f s)" % (end, duration)) + start_frame = int(start * sample_rate) + end_frame = int(end * sample_rate) + sndfile.seek(start_frame) + data = sndfile.read(frames=end_frame - start_frame, dtype='float32') + return cls(data, sample_rate) + + @classmethod + def from_sequence_file(cls, filepath): + """Create audio segment from sequence file. Sequence file is a binary + file containing a collection of multiple audio files, with several + header bytes in the head indicating the offsets of each audio byte data + chunk. + + The format is: + + 4 bytes (int, version), + 4 bytes (int, num of utterance), + 4 bytes (int, bytes per header), + [bytes_per_header*(num_utterance+1)] bytes (offsets for each audio), + audio_bytes_data_of_1st_utterance, + audio_bytes_data_of_2nd_utterance, + ...... + + Sequence file name must end with ".seqbin". And the filename of the 5th + utterance's audio file in sequence file "xxx.seqbin" must be + "xxx.seqbin_5", with "5" indicating the utterance index within this + sequence file (starting from 1). + + :param filepath: Filepath of sequence file. + :type filepath: basestring + :return: Audio segment instance. + :rtype: AudioSegment + """ + # parse filepath + matches = re.match(r"(.+\.seqbin)_(\d+)", filepath) + if matches is None: + raise IOError("File type of %s is not supported" % filepath) + filename = matches.group(1) + fileno = int(matches.group(2)) + + # read headers + f = open(filename, 'rb') + version = f.read(4) + num_utterances = struct.unpack("i", f.read(4))[0] + bytes_per_header = struct.unpack("i", f.read(4))[0] + header_bytes = f.read(bytes_per_header * (num_utterances + 1)) + header = [ + struct.unpack("i", header_bytes[bytes_per_header * i: + bytes_per_header * (i + 1)])[0] + for i in range(num_utterances + 1) + ] + + # read audio bytes + f.seek(header[fileno - 1]) + audio_bytes = f.read(header[fileno] - header[fileno - 1]) + f.close() + + # create audio segment + try: + return cls.from_bytes(audio_bytes) + except Exception as e: + samples = np.frombuffer(audio_bytes, dtype='int16') + return cls(samples=samples, sample_rate=8000) + + @classmethod + def from_bytes(cls, bytes): + """Create audio segment from a byte string containing audio samples. + + :param bytes: Byte string containing audio samples. + :type bytes: str + :return: Audio segment instance. + :rtype: AudioSegment + """ + samples, sample_rate = soundfile.read( + io.BytesIO(bytes), dtype='float32') + return cls(samples, sample_rate) + + @classmethod + def concatenate(cls, *segments): + """Concatenate an arbitrary number of audio segments together. + + :param *segments: Input audio segments to be concatenated. + :type *segments: tuple of AudioSegment + :return: Audio segment instance as concatenating results. + :rtype: AudioSegment + :raises ValueError: If the number of segments is zero, or if the + sample_rate of any segments does not match. + :raises TypeError: If any segment is not AudioSegment instance. + """ + # Perform basic sanity-checks. 
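# --- Illustrative sketch (not part of the patch): writing a minimal
# ".seqbin" container matching the layout documented in
# from_sequence_file() above: version, utterance count, header width,
# an absolute offset table with num_utterance+1 entries, then the raw
# audio bytes. File name and payloads here are made-up stand-ins.
import struct

payloads = [b'\x00\x01' * 100, b'\x02\x03' * 50]   # stand-in audio bytes
bytes_per_header = 4   # must be 4: offsets are unpacked with struct "i"
first_offset = 12 + bytes_per_header * (len(payloads) + 1)
offsets = [first_offset]
for p in payloads:
    offsets.append(offsets[-1] + len(p))           # absolute offsets

with open('xxx.seqbin', 'wb') as f:
    f.write(struct.pack('i', 1))                   # version
    f.write(struct.pack('i', len(payloads)))       # num utterances
    f.write(struct.pack('i', bytes_per_header))    # header width
    for offset in offsets:                         # offset table
        f.write(struct.pack('i', offset))
    for p in payloads:
        f.write(p)

# AudioSegment.from_sequence_file('xxx.seqbin_1') would then locate the
# first payload via header[0]:header[1].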
+ if len(segments) == 0: + raise ValueError("No audio segments are given to concatenate.") + sample_rate = segments[0]._sample_rate + for seg in segments: + if sample_rate != seg._sample_rate: + raise ValueError("Can't concatenate segments with " + "different sample rates") + if type(seg) is not cls: + raise TypeError("Only audio segments of the same type " + "can be concatenated.") + samples = np.concatenate([seg.samples for seg in segments]) + return cls(samples, sample_rate) + + @classmethod + def make_silence(cls, duration, sample_rate): + """Creates a silent audio segment of the given duration and sample rate. + + :param duration: Length of silence in seconds. + :type duration: float + :param sample_rate: Sample rate. + :type sample_rate: float + :return: Silent AudioSegment instance of the given duration. + :rtype: AudioSegment + """ + samples = np.zeros(int(duration * sample_rate)) + return cls(samples, sample_rate) + + def to_wav_file(self, filepath, dtype='float32'): + """Save audio segment to disk as wav file. + + :param filepath: WAV filepath or file object to save the + audio segment. + :type filepath: basestring|file + :param dtype: Subtype for audio file. Options: 'int16', 'int32', + 'float32', 'float64'. Default is 'float32'. + :type dtype: str + :raises TypeError: If dtype is not supported. + """ + samples = self._convert_samples_from_float32(self._samples, dtype) + subtype_map = { + 'int16': 'PCM_16', + 'int32': 'PCM_32', + 'float32': 'FLOAT', + 'float64': 'DOUBLE' + } + soundfile.write( + filepath, + samples, + self._sample_rate, + format='WAV', + subtype=subtype_map[dtype]) + + def superimpose(self, other): + """Add samples from another segment to those of this segment + (sample-wise addition, not segment concatenation). + + Note that this is an in-place transformation. + + :param other: Segment containing samples to be added in. + :type other: AudioSegments + :raise TypeError: If type of two segments don't match. + :raise ValueError: If the sample rates of the two segments are not + equal, or if the lengths of segments don't match. + """ + if isinstance(other, type(self)): + raise TypeError("Cannot add segments of different types: %s " + "and %s." % (type(self), type(other))) + if self._sample_rate != other._sample_rate: + raise ValueError("Sample rates must match to add segments.") + if len(self._samples) != len(other._samples): + raise ValueError("Segment lengths must match to add segments.") + self._samples += other._samples + + def to_bytes(self, dtype='float32'): + """Create a byte string containing the audio content. + + :param dtype: Data type for export samples. Options: 'int16', 'int32', + 'float32', 'float64'. Default is 'float32'. + :type dtype: str + :return: Byte string containing audio content. + :rtype: str + """ + samples = self._convert_samples_from_float32(self._samples, dtype) + return samples.tostring() + + def gain_db(self, gain): + """Apply gain in decibels to samples. + + Note that this is an in-place transformation. + + :param gain: Gain in decibels to apply to samples. + :type gain: float|1darray + """ + self._samples *= 10.**(gain / 20.) + + def change_speed(self, speed_rate): + """Change the audio speed by linear interpolation. + + Note that this is an in-place transformation. + + :param speed_rate: Rate of speed change: + speed_rate > 1.0, speed up the audio; + speed_rate = 1.0, unchanged; + speed_rate < 1.0, slow down the audio; + speed_rate <= 0.0, not allowed, raise ValueError. 
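# --- Illustrative sketch (not part of the patch): the np.interp()
# resampling that change_speed() performs, applied to a toy ramp.
# Values are made up for the demo.
import numpy as np

x = np.arange(10, dtype='float32')
speed_rate = 2.0                      # 2x speed -> half as many samples
new_length = int(len(x) / speed_rate)
new_indices = np.linspace(start=0, stop=len(x), num=new_length)
y = np.interp(new_indices, np.arange(len(x)), x)
print(y.shape)  # (5,)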
+        :type speed_rate: float
+        :raises ValueError: If speed_rate <= 0.0.
+        """
+        if speed_rate <= 0:
+            raise ValueError("speed_rate should be greater than zero.")
+        old_length = self._samples.shape[0]
+        new_length = int(old_length / speed_rate)
+        old_indices = np.arange(old_length)
+        new_indices = np.linspace(start=0, stop=old_length, num=new_length)
+        self._samples = np.interp(new_indices, old_indices, self._samples)
+
+    def normalize(self, target_db=-20, max_gain_db=300.0):
+        """Normalize audio to be of the desired RMS value in decibels.
+
+        Note that this is an in-place transformation.
+
+        :param target_db: Target RMS value in decibels. This value should be
+                          less than 0.0 as 0.0 is full-scale audio.
+        :type target_db: float
+        :param max_gain_db: Max amount of gain in dB that can be applied for
+                            normalization. This is to prevent nans when
+                            attempting to normalize a signal consisting of
+                            all zeros.
+        :type max_gain_db: float
+        :raises ValueError: If the required gain to normalize the segment to
+                            the target_db value exceeds max_gain_db.
+        """
+        gain = target_db - self.rms_db
+        if gain > max_gain_db:
+            raise ValueError(
+                "Unable to normalize segment to %f dB because the required "
+                "gain (%f dB) exceeds max_gain_db (%f dB)" %
+                (target_db, gain, max_gain_db))
+        self.gain_db(min(max_gain_db, gain))
+
+    def normalize_online_bayesian(self,
+                                  target_db,
+                                  prior_db,
+                                  prior_samples,
+                                  startup_delay=0.0):
+        """Normalize audio using a production-compatible online/causal
+        algorithm. This uses an exponential likelihood and gamma prior to
+        make online estimates of the RMS even when there are very few samples.
+
+        Note that this is an in-place transformation.
+
+        :param target_db: Target RMS value in decibels.
+        :type target_db: float
+        :param prior_db: Prior RMS estimate in decibels.
+        :type prior_db: float
+        :param prior_samples: Prior strength in number of samples.
+        :type prior_samples: float
+        :param startup_delay: Default 0.0s. If provided, this function will
+                              accrue statistics for the first startup_delay
+                              seconds before applying online normalization.
+        :type startup_delay: float
+        """
+        # Estimate total RMS online.
+        startup_sample_idx = min(self.num_samples - 1,
+                                 int(self.sample_rate * startup_delay))
+        prior_mean_squared = 10.**(prior_db / 10.)
+        prior_sum_of_squares = prior_mean_squared * prior_samples
+        cumsum_of_squares = np.cumsum(self.samples**2)
+        sample_count = np.arange(self.num_samples) + 1
+        if startup_sample_idx > 0:
+            cumsum_of_squares[:startup_sample_idx] = \
+                cumsum_of_squares[startup_sample_idx]
+            sample_count[:startup_sample_idx] = \
+                sample_count[startup_sample_idx]
+        mean_squared_estimate = ((cumsum_of_squares + prior_sum_of_squares) /
+                                 (sample_count + prior_samples))
+        rms_estimate_db = 10 * np.log10(mean_squared_estimate)
+        # Compute required time-varying gain.
+        gain_db = target_db - rms_estimate_db
+        self.gain_db(gain_db)
+
+    def resample(self, target_sample_rate, filter='kaiser_best'):
+        """Resample the audio to a target sample rate.
+
+        Note that this is an in-place transformation.
+
+        :param target_sample_rate: Target sample rate.
+        :type target_sample_rate: int
+        :param filter: The resampling filter to use, one of {'kaiser_best',
+                       'kaiser_fast'}.
+        :type filter: str
+        """
+        self._samples = resampy.resample(
+            self.samples, self.sample_rate, target_sample_rate, filter=filter)
+        self._sample_rate = target_sample_rate
+
+    def pad_silence(self, duration, sides='both'):
+        """Pad this audio sample with a period of silence.
+
+        Note that this is an in-place transformation.
+
+        :param duration: Length of silence in seconds to pad.
+        :type duration: float
+        :param sides: Position for padding:
+                      'beginning' - adds silence in the beginning;
+                      'end' - adds silence in the end;
+                      'both' - adds silence in both the beginning and the end.
+        :type sides: str
+        :raises ValueError: If sides is not supported.
+        """
+        if duration == 0.0:
+            return
+        cls = type(self)
+        silence = self.make_silence(duration, self._sample_rate)
+        if sides == "beginning":
+            padded = cls.concatenate(silence, self)
+        elif sides == "end":
+            padded = cls.concatenate(self, silence)
+        elif sides == "both":
+            padded = cls.concatenate(silence, self, silence)
+        else:
+            raise ValueError("Unknown value for sides: %s" % sides)
+        self._samples = padded._samples
+
+    def shift(self, shift_ms):
+        """Shift the audio in time. If `shift_ms` is positive, shift with time
+        advance; if negative, shift with time delay. Silence is padded to
+        keep the duration unchanged.
+
+        Note that this is an in-place transformation.
+
+        :param shift_ms: Shift time in milliseconds. If positive, shift with
+                         time advance; if negative, shift with time delay.
+        :type shift_ms: float
+        :raises ValueError: If shift_ms is longer than audio duration.
+        """
+        if abs(shift_ms) / 1000.0 > self.duration:
+            raise ValueError("Absolute value of shift_ms should be smaller "
+                             "than audio duration.")
+        shift_samples = int(shift_ms * self._sample_rate / 1000)
+        if shift_samples > 0:
+            # time advance
+            self._samples[:-shift_samples] = self._samples[shift_samples:]
+            self._samples[-shift_samples:] = 0
+        elif shift_samples < 0:
+            # time delay
+            self._samples[-shift_samples:] = self._samples[:shift_samples]
+            self._samples[:-shift_samples] = 0
+
+    def subsegment(self, start_sec=None, end_sec=None):
+        """Cut the AudioSegment between given boundaries.
+
+        Note that this is an in-place transformation.
+
+        :param start_sec: Beginning of subsegment in seconds.
+        :type start_sec: float
+        :param end_sec: End of subsegment in seconds.
+        :type end_sec: float
+        :raises ValueError: If start_sec or end_sec is incorrectly set, e.g.
+                            out of bounds in time.
+        """
+        start_sec = 0.0 if start_sec is None else start_sec
+        end_sec = self.duration if end_sec is None else end_sec
+        if start_sec < 0.0:
+            start_sec = self.duration + start_sec
+        if end_sec < 0.0:
+            end_sec = self.duration + end_sec
+        if start_sec < 0.0:
+            raise ValueError("The slice start position (%f s) is out of "
+                             "bounds." % start_sec)
+        if end_sec < 0.0:
+            raise ValueError("The slice end position (%f s) is out of "
+                             "bounds." % end_sec)
+        if start_sec > end_sec:
+            raise ValueError("The slice start position (%f s) is later than "
+                             "the end position (%f s)." % (start_sec, end_sec))
+        if end_sec > self.duration:
+            raise ValueError("The slice end position (%f s) is out of bounds "
+                             "(> %f s)" % (end_sec, self.duration))
+        start_sample = int(round(start_sec * self._sample_rate))
+        end_sample = int(round(end_sec * self._sample_rate))
+        self._samples = self._samples[start_sample:end_sample]
+
+    def random_subsegment(self, subsegment_length, rng=None):
+        """Cut the specified length of the audio segment randomly.
+
+        Note that this is an in-place transformation.
+
+        :param subsegment_length: Subsegment length in seconds.
+        :type subsegment_length: float
+        :param rng: Random number generator state.
+        :type rng: random.Random
+        :raises ValueError: If the length of subsegment is greater than
+                            the original segment.
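# --- Illustrative sketch (not part of the patch): the in-place time
# shift performed by shift() above, on a plain array (k = 2 samples of
# time advance).
import numpy as np

x = np.arange(8, dtype='float32')
k = 2
x[:-k] = x[k:]   # move samples earlier in time
x[-k:] = 0       # pad the tail with silence
print(x)         # [2. 3. 4. 5. 6. 7. 0. 0.]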
+ """ + rng = random.Random() if rng is None else rng + if subsegment_length > self.duration: + raise ValueError("Length of subsegment must not be greater " + "than original segment.") + start_time = rng.uniform(0.0, self.duration - subsegment_length) + self.subsegment(start_time, start_time + subsegment_length) + + def convolve(self, impulse_segment, allow_resample=False): + """Convolve this audio segment with the given impulse segment. + + Note that this is an in-place transformation. + + :param impulse_segment: Impulse response segments. + :type impulse_segment: AudioSegment + :param allow_resample: Indicates whether resampling is allowed when + the impulse_segment has a different sample + rate from this signal. + :type allow_resample: bool + :raises ValueError: If the sample rate is not match between two + audio segments when resample is not allowed. + """ + if allow_resample and self.sample_rate != impulse_segment.sample_rate: + impulse_segment.resample(self.sample_rate) + if self.sample_rate != impulse_segment.sample_rate: + raise ValueError("Impulse segment's sample rate (%d Hz) is not " + "equal to base signal sample rate (%d Hz)." % + (impulse_segment.sample_rate, self.sample_rate)) + samples = signal.fftconvolve(self.samples, impulse_segment.samples, + "full") + self._samples = samples + + def convolve_and_normalize(self, impulse_segment, allow_resample=False): + """Convolve and normalize the resulting audio segment so that it + has the same average power as the input signal. + + Note that this is an in-place transformation. + + :param impulse_segment: Impulse response segments. + :type impulse_segment: AudioSegment + :param allow_resample: Indicates whether resampling is allowed when + the impulse_segment has a different sample + rate from this signal. + :type allow_resample: bool + """ + target_db = self.rms_db + self.convolve(impulse_segment, allow_resample=allow_resample) + self.normalize(target_db) + + def add_noise(self, + noise, + snr_dB, + allow_downsampling=False, + max_gain_db=300.0, + rng=None): + """Add the given noise segment at a specific signal-to-noise ratio. + If the noise segment is longer than this segment, a random subsegment + of matching length is sampled from it and used instead. + + Note that this is an in-place transformation. + + :param noise: Noise signal to add. + :type noise: AudioSegment + :param snr_dB: Signal-to-Noise Ratio, in decibels. + :type snr_dB: float + :param allow_downsampling: Whether to allow the noise signal to be + downsampled to match the base signal sample + rate. + :type allow_downsampling: bool + :param max_gain_db: Maximum amount of gain to apply to noise signal + before adding it in. This is to prevent attempting + to apply infinite gain to a zero signal. + :type max_gain_db: float + :param rng: Random number generator state. + :type rng: None|random.Random + :raises ValueError: If the sample rate does not match between the two + audio segments when downsampling is not allowed, or + if the duration of noise segments is shorter than + original audio segments. + """ + rng = random.Random() if rng is None else rng + if allow_downsampling and noise.sample_rate > self.sample_rate: + noise = noise.resample(self.sample_rate) + if noise.sample_rate != self.sample_rate: + raise ValueError("Noise sample rate (%d Hz) is not equal to base " + "signal sample rate (%d Hz)." 
% (noise.sample_rate, + self.sample_rate)) + if noise.duration < self.duration: + raise ValueError("Noise signal (%f sec) must be at least as long as" + " base signal (%f sec)." % + (noise.duration, self.duration)) + noise_gain_db = min(self.rms_db - noise.rms_db - snr_dB, max_gain_db) + noise_new = copy.deepcopy(noise) + noise_new.random_subsegment(self.duration, rng=rng) + noise_new.gain_db(noise_gain_db) + self.superimpose(noise_new) + + @property + def samples(self): + """Return audio samples. + + :return: Audio samples. + :rtype: ndarray + """ + return self._samples.copy() + + @property + def sample_rate(self): + """Return audio sample rate. + + :return: Audio sample rate. + :rtype: int + """ + return self._sample_rate + + @property + def num_samples(self): + """Return number of samples. + + :return: Number of samples. + :rtype: int + """ + return self._samples.shape[0] + + @property + def duration(self): + """Return audio duration. + + :return: Audio duration in seconds. + :rtype: float + """ + return self._samples.shape[0] / float(self._sample_rate) + + @property + def rms_db(self): + """Return root mean square energy of the audio in decibels. + + :return: Root mean square energy in decibels. + :rtype: float + """ + # square root => multiply by 10 instead of 20 for dBs + mean_square = np.mean(self._samples**2) + return 10 * np.log10(mean_square) + + def _convert_samples_to_float32(self, samples): + """Convert sample type to float32. + + Audio sample type is usually integer or float-point. + Integers will be scaled to [-1, 1] in float32. + """ + float32_samples = samples.astype('float32') + if samples.dtype in np.sctypes['int']: + bits = np.iinfo(samples.dtype).bits + float32_samples *= (1. / 2**(bits - 1)) + elif samples.dtype in np.sctypes['float']: + pass + else: + raise TypeError("Unsupported sample type: %s." % samples.dtype) + return float32_samples + + def _convert_samples_from_float32(self, samples, dtype): + """Convert sample type from float32 to dtype. + + Audio sample type is usually integer or float-point. For integer + type, float32 will be rescaled from [-1, 1] to the maximum range + supported by the integer type. + + This is for writing a audio file. + """ + dtype = np.dtype(dtype) + output_samples = samples.copy() + if dtype in np.sctypes['int']: + bits = np.iinfo(dtype).bits + output_samples *= (2**(bits - 1) / 1.) + min_val = np.iinfo(dtype).min + max_val = np.iinfo(dtype).max + output_samples[output_samples > max_val] = max_val + output_samples[output_samples < min_val] = min_val + elif samples.dtype in np.sctypes['float']: + min_val = np.finfo(dtype).min + max_val = np.finfo(dtype).max + output_samples[output_samples > max_val] = max_val + output_samples[output_samples < min_val] = min_val + else: + raise TypeError("Unsupported sample type: %s." % samples.dtype) + return output_samples.astype(dtype) diff --git a/lib/contrib/audio_featurizer.py b/lib/contrib/audio_featurizer.py new file mode 100644 index 0000000..0069b2e --- /dev/null +++ b/lib/contrib/audio_featurizer.py @@ -0,0 +1,182 @@ +"""Contains the audio featurizer class.""" +import numpy as np +# https://github.com/jameslyons/python_speech_features +from python_speech_features import mfcc +from python_speech_features import delta + + +class AudioFeaturizer(object): + """Audio featurizer, for extracting features from audio contents of + AudioSegment or SpeechSegment. + + Currently, it supports feature types of linear spectrogram and mfcc. + + :param specgram_type: Specgram feature type. Options: 'linear'. 
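# --- Illustrative sketch (not part of the patch): the gain rule used by
# add_noise() above. The noise is scaled so that the RMS gap between
# signal and noise equals the requested SNR; the synthetic signals are
# assumptions for the demo.
import numpy as np

def rms_db(x):
    return 10 * np.log10(np.mean(x**2))

sig = np.random.uniform(-0.5, 0.5, 16000)
noise = np.random.uniform(-0.9, 0.9, 16000)
snr_db = 10.0
noise_gain_db = rms_db(sig) - rms_db(noise) - snr_db
scaled = noise * 10.0**(noise_gain_db / 20.0)
print(rms_db(sig) - rms_db(scaled))  # == snr_db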
+ :type specgram_type: str + :param stride_ms: Striding size (in milliseconds) for generating frames. + :type stride_ms: float + :param window_ms: Window size (in milliseconds) for generating frames. + :type window_ms: float + :param max_freq: When specgram_type is 'linear', only FFT bins + corresponding to frequencies between [0, max_freq] are + returned; when specgram_type is 'mfcc', max_feq is the + highest band edge of mel filters. + :types max_freq: None|float + :param target_sample_rate: Audio are resampled (if upsampling or + downsampling is allowed) to this before + extracting spectrogram features. + :type target_sample_rate: float + :param use_dB_normalization: Whether to normalize the audio to a certain + decibels before extracting the features. + :type use_dB_normalization: bool + :param target_dB: Target audio decibels for normalization. + :type target_dB: float + """ + + def __init__(self, + specgram_type='linear', + stride_ms=10.0, + window_ms=20.0, + max_freq=None, + target_sample_rate=16000, + use_dB_normalization=True, + target_dB=-20): + self._specgram_type = specgram_type + self._stride_ms = stride_ms + self._window_ms = window_ms + self._max_freq = max_freq + self._target_sample_rate = target_sample_rate + self._use_dB_normalization = use_dB_normalization + self._target_dB = target_dB + + def featurize(self, + audio_segment, + allow_downsampling=True, + allow_upsampling=True): + """Extract audio features from AudioSegment or SpeechSegment. + + :param audio_segment: Audio/speech segment to extract features from. + :type audio_segment: AudioSegment|SpeechSegment + :param allow_downsampling: Whether to allow audio downsampling before + featurizing. + :type allow_downsampling: bool + :param allow_upsampling: Whether to allow audio upsampling before + featurizing. + :type allow_upsampling: bool + :return: Spectrogram audio feature in 2darray. + :rtype: ndarray + :raises ValueError: If audio sample rate is not supported. + """ + # upsampling or downsampling + if ((audio_segment.sample_rate > self._target_sample_rate and + allow_downsampling) or + (audio_segment.sample_rate < self._target_sample_rate and + allow_upsampling)): + audio_segment.resample(self._target_sample_rate) + if audio_segment.sample_rate != self._target_sample_rate: + raise ValueError("Audio sample rate is not supported. " + "Turn allow_downsampling or allow up_sampling on.") + # decibel normalization + if self._use_dB_normalization: + audio_segment.normalize(target_db=self._target_dB) + # extract spectrogram + return self._compute_specgram(audio_segment.samples, + audio_segment.sample_rate) + + def _compute_specgram(self, samples, sample_rate): + """Extract various audio features.""" + if self._specgram_type == 'linear': + return self._compute_linear_specgram( + samples, sample_rate, self._stride_ms, self._window_ms, + self._max_freq) + elif self._specgram_type == 'mfcc': + return self._compute_mfcc(samples, sample_rate, self._stride_ms, + self._window_ms, self._max_freq) + else: + raise ValueError("Unknown specgram_type %s. " + "Supported values: linear." 
% self._specgram_type) + + def _compute_linear_specgram(self, + samples, + sample_rate, + stride_ms=10.0, + window_ms=20.0, + max_freq=None, + eps=1e-14): + """Compute the linear spectrogram from FFT energy.""" + if max_freq is None: + max_freq = sample_rate / 2 + if max_freq > sample_rate / 2: + raise ValueError("max_freq must not be greater than half of " + "sample rate.") + if stride_ms > window_ms: + raise ValueError("Stride size must not be greater than " + "window size.") + stride_size = int(0.001 * sample_rate * stride_ms) + window_size = int(0.001 * sample_rate * window_ms) + specgram, freqs = self._specgram_real( + samples, + window_size=window_size, + stride_size=stride_size, + sample_rate=sample_rate) + ind = np.where(freqs <= max_freq)[0][-1] + 1 + return np.log(specgram[:ind, :] + eps) + + def _specgram_real(self, samples, window_size, stride_size, sample_rate): + """Compute the spectrogram for samples from a real signal.""" + # extract strided windows + truncate_size = (len(samples) - window_size) % stride_size + samples = samples[:len(samples) - truncate_size] + nshape = (window_size, (len(samples) - window_size) // stride_size + 1) + nstrides = (samples.strides[0], samples.strides[0] * stride_size) + windows = np.lib.stride_tricks.as_strided( + samples, shape=nshape, strides=nstrides) + assert np.all( + windows[:, 1] == samples[stride_size:(stride_size + window_size)]) + # window weighting, squared Fast Fourier Transform (fft), scaling + weighting = np.hanning(window_size)[:, None] + fft = np.fft.rfft(windows * weighting, axis=0) + fft = np.absolute(fft) + fft = fft**2 + scale = np.sum(weighting**2) * sample_rate + fft[1:-1, :] *= (2.0 / scale) + fft[(0, -1), :] /= scale + # prepare fft frequency list + freqs = float(sample_rate) / window_size * np.arange(fft.shape[0]) + return fft, freqs + + def _compute_mfcc(self, + samples, + sample_rate, + stride_ms=10.0, + window_ms=20.0, + max_freq=None): + """Compute mfcc from samples.""" + if max_freq is None: + max_freq = sample_rate / 2 + if max_freq > sample_rate / 2: + raise ValueError("max_freq must not be greater than half of " + "sample rate.") + if stride_ms > window_ms: + raise ValueError("Stride size must not be greater than " + "window size.") + # compute the 13 cepstral coefficients, and the first one is replaced + # by log(frame energy) + mfcc_feat = mfcc( + signal=samples, + samplerate=sample_rate, + winlen=0.001 * window_ms, + winstep=0.001 * stride_ms, + highfreq=max_freq) + # Deltas + d_mfcc_feat = delta(mfcc_feat, 2) + # Deltas-Deltas + dd_mfcc_feat = delta(d_mfcc_feat, 2) + # transpose + mfcc_feat = np.transpose(mfcc_feat) + d_mfcc_feat = np.transpose(d_mfcc_feat) + dd_mfcc_feat = np.transpose(dd_mfcc_feat) + # concat above three features + concat_mfcc_feat = np.concatenate( + (mfcc_feat, d_mfcc_feat, dd_mfcc_feat)) + return concat_mfcc_feat diff --git a/lib/pinyinDictNoTone.pickle b/lib/pinyinDictNoTone.pickle new file mode 100644 index 0000000..1b2fe00 Binary files /dev/null and b/lib/pinyinDictNoTone.pickle differ diff --git a/lib/pinyinDictNoToneInv.pickle b/lib/pinyinDictNoToneInv.pickle new file mode 100644 index 0000000..bd03a7a Binary files /dev/null and b/lib/pinyinDictNoToneInv.pickle differ diff --git a/lib/recorder.py b/lib/recorder.py new file mode 100644 index 0000000..856f0a5 --- /dev/null +++ b/lib/recorder.py @@ -0,0 +1,80 @@ +import pyaudio +import wave + +class Recorder(object): + '''A recorder class for recording audio to a WAV file. + Records in mono by default. 
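# --- Illustrative sketches (not part of the patch).
# (a) The strided-window FFT at the core of _specgram_real() above, on a
#     1 kHz tone; sizes mirror the defaults (20 ms window, 10 ms stride
#     at 16 kHz) but are illustrative.
import numpy as np

sr, window_size, stride_size = 16000, 320, 160
x = np.sin(2 * np.pi * 1000 * np.arange(sr) / sr).astype('float32')
trunc = (len(x) - window_size) % stride_size
x = x[:len(x) - trunc]
n_windows = (len(x) - window_size) // stride_size + 1
windows = np.lib.stride_tricks.as_strided(
    x, shape=(window_size, n_windows),
    strides=(x.strides[0], x.strides[0] * stride_size))
weighting = np.hanning(window_size)[:, None]
power = np.abs(np.fft.rfft(windows * weighting, axis=0))**2
freqs = float(sr) / window_size * np.arange(power.shape[0])
print(freqs[power.mean(axis=1).argmax()])  # ~1000.0 Hz

# (b) End-to-end usage of the two classes in this diff. The import
#     paths assume lib/contrib is importable as a package; adjust to
#     your layout.
from lib.contrib.audio import AudioSegment
from lib.contrib.audio_featurizer import AudioFeaturizer

tone = (0.1 * np.sin(2 * np.pi * 440 * np.arange(sr) / sr)).astype('float32')
seg = AudioSegment(tone, sr)
featurizer = AudioFeaturizer(specgram_type='linear')
feat = featurizer.featurize(seg)   # normalizes to -20 dB, then specgram
print(feat.shape)                  # (freq_bins, frames)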
+ ''' + + def __init__(self, channels=1, rate=44100, frames_per_buffer=1024): + self.channels = channels + self.rate = rate + self.frames_per_buffer = frames_per_buffer + + def open(self, fname, mode='wb'): + return RecordingFile(fname, mode, self.channels, self.rate, + self.frames_per_buffer) + +class RecordingFile(object): + def __init__(self, fname, mode, channels, + rate, frames_per_buffer): + self.fname = fname + self.mode = mode + self.channels = channels + self.rate = rate + self.frames_per_buffer = frames_per_buffer + self._pa = pyaudio.PyAudio() + self.wavefile = self._prepare_file(self.fname, self.mode) + self._stream = None + + def __enter__(self): + return self + + def __exit__(self, exception, value, traceback): + self.close() + + def record(self, duration): + # Use a stream with no callback function in blocking mode + self._stream = self._pa.open(format=pyaudio.paInt16, + channels=self.channels, + rate=self.rate, + input=True, + frames_per_buffer=self.frames_per_buffer) + for _ in range(int(self.rate / self.frames_per_buffer * duration)): + audio = self._stream.read(self.frames_per_buffer) + self.wavefile.writeframes(audio) + return None + + def start_recording(self): + # Use a stream with a callback in non-blocking mode + self._stream = self._pa.open(format=pyaudio.paInt16, + channels=self.channels, + rate=self.rate, + input=True, + frames_per_buffer=self.frames_per_buffer, + stream_callback=self.get_callback()) + self._stream.start_stream() + return self + + def stop_recording(self): + self._stream.stop_stream() + return self + + def get_callback(self): + def callback(in_data, frame_count, time_info, status): + self.wavefile.writeframes(in_data) + return in_data, pyaudio.paContinue + return callback + + + def close(self): + self._stream.close() + self._pa.terminate() + self.wavefile.close() + + def _prepare_file(self, fname, mode='wb'): + wavefile = wave.open(fname, mode) + wavefile.setnchannels(self.channels) + wavefile.setsampwidth(self._pa.get_sample_size(pyaudio.paInt16)) + wavefile.setframerate(self.rate) + return wavefile diff --git a/lib/tools_audio.py b/lib/tools_audio.py new file mode 100644 index 0000000..c25bc32 --- /dev/null +++ b/lib/tools_audio.py @@ -0,0 +1,92 @@ +import scipy.io.wavfile as wav +import numpy as np +import os +import pydub +import tempfile +import scipy +import random +from python_speech_features import logfbank + +from lib.tools_math import * + +def changeRateTo16000(filepath): + if filepath[-4:].lower() =='.wav': + sound = pydub.AudioSegment.from_wav(filepath) + sound = sound.set_frame_rate(16000) + sound.export(filepath, format="wav") + elif filepath[-4:].lower() =='.m4a': + sound = pydub.AudioSegment.from_file(filepath, "m4a") + sound = sound.set_frame_rate(16000) + sound.export(filepath[:-3]+"wav", format="wav") + elif filepath[-4:].lower() =='.mp3': + sound = pydub.AudioSegment.from_mp3(filepath) + sound = sound.set_frame_rate(16000) + sound.export(filepath[:-3]+"wav", format="wav") + else: + print("Unsupported Format.") + +def read_wav(file_path): + assert file_path[-4:]=='.wav' + rate, data = wav.read(file_path) + return rate, data + +def read_m4a(file_path): + path, ext = os.path.splitext(file_path) + assert ext=='.m4a' + aac_version = pydub.AudioSegment.from_file(file_path, "m4a") + _, path = tempfile.mkstemp() + aac_version.export(path, format="wav") + rate, data = scipy.io.wavfile.read(path) + os.remove(path) + return rate, data + +def read_mp3(file_path): + path, ext = os.path.splitext(file_path) + assert ext=='.mp3' + mp3 = 
pydub.AudioSegment.from_mp3(file_path) + _, path = tempfile.mkstemp() + mp3.export(path, format="wav") + rate, data = scipy.io.wavfile.read(path) + os.remove(path) + return rate, data + +def mp3_to_wav(file_path, obj_path): + path, ext = os.path.splitext(file_path) + assert ext=='.mp3' + mp3 = pydub.AudioSegment.from_mp3(file_path) + mp3.export(obj_path, format="wav") + +def mergeChannels(data): + data = normalize(data) + if len(data.shape)==1: + return data + if len(data.shape)==2: + return np.mean(data, axis = 1) + raise ValueError("This is not what an audio file ought to be!") + +def getDefaultSpectrogram(rate, data): + f, t, Sxx = signal.spectrogram(data, fs=rate, window='hamming', nperseg=400, noverlap=240, nfft=1024, scaling='spectrum', return_onesided=True) + return Sxx + +def frame_split(data, frame_width, frame_step): + if len(data)= num data points in one window(frame) + if wav_process == True: + frame_shift = int(sr * winstep) + frame_size = int(sr * winlen) + wave_data, index = librosa.effects.trim(wave_data, frame_length=frame_size, hop_length=frame_shift) + mel_db = logfbank(wave_data, samplerate=sr, winlen=winlen, winstep=winstep, + nfilt=num_mel, nfft=nfft, lowfreq=0, highfreq=None, preemph=0.97) + mel_db -= (np.mean(mel_db,axis=1).reshape(-1,1)+1e-8) + return mel_db diff --git a/lib/tools_augmentation.py b/lib/tools_augmentation.py new file mode 100644 index 0000000..23ce1db --- /dev/null +++ b/lib/tools_augmentation.py @@ -0,0 +1,368 @@ +import numpy as np +import pyfftw +import resampy +from scipy.ndimage import zoom + +def randomAugment(data, rate, num, obj_length = None, noiseSource = None, bgMaximum = 0.08, verbose = False): + """ + Perform random augmentations. Recommended bgMaximum: random noise:0.07, + office - 0.1, youtube human: 0.07, youtube backgrounds: 0.15. + + :param np.ndarray data: The audio's data point. One channel, which means the + length of the shape should be one. + :param int rate: The sampling rate of the audio. + :param int num: Number of augmentation. + :param int obj_length: Output audio lengths. Will not be padded if leave it + none + :param np.ndarray noiseSource: The source of noise. Will add white noise if + leave it none. + :param float bgMaximum: The maximum background sound. + :param boolean verbose: If true, print out the adjustment made during the + process. + + :return: A list of audio data points. + :raises ValueError: If num < 0 or obj_length <= 0 or 0.75*len(data). + """ + if obj_length is not None: + if (obj_length <= 0): + raise ValueError('Objective length must be above than 0.') + if (obj_length <= 0.75*len(data)): + raise ValueError('Objective length too short.') + if num < 0: + raise ValueError('Number of augmentation must be above than or equal to 0.') + if num == 0: + return [] + + result = [] + data = _normalize(data) + for _ in range(num): + # 1. shift the data a little bit. + if len(data)>16000: + shifty = min(int(len(data)/10), np.random.randint(4000)) + if np.random.random()>0.5: shifty *= -1 + transformed = shift(data, shifty) + else: + transformed = data + # 2. Adjust the speed. + ub = 1.25 + lb = 0.8 + if obj_length is not None: + zoomUpperBound = min(ub, obj_length/float(len(transformed))) + if zoomUpperBound= upper. + """ + yf = pyfftw.interfaces.numpy_fft.fft(data) + trans = np.copy(yf) + trans *= 0 + + # Clip the frequency. + if freq_range is not None: + # Determine the maximum and the minimum frequency. 
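# --- Illustrative sketch (not part of the patch): the log-Mel filterbank
# features computed by the (partially garbled) helper above via
# python_speech_features, including its per-frame mean subtraction.
# Parameter values are illustrative.
import numpy as np
from python_speech_features import logfbank

sr = 16000
x = np.random.uniform(-1, 1, sr)
mel_db = logfbank(x, samplerate=sr, winlen=0.025, winstep=0.01,
                  nfilt=40, nfft=512, lowfreq=0, highfreq=None,
                  preemph=0.97)
mel_db -= np.mean(mel_db, axis=1).reshape(-1, 1) + 1e-8
print(mel_db.shape)  # (frames, 40)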
+            minF, maxF = freq_range
+            assert maxF > minF
+            fBound = int(rate/2)
+            minF = max(0, minF)
+            maxF = min(fBound, maxF)
+            # Determine the maximum and the minimum point.
+            minP = int(len(yf)*minF/(2*fBound))
+            maxP = int(len(yf)*maxF/(2*fBound))
+            # Trim the fourier form.
+            trans[minP:maxP] = yf[minP:maxP]
+            trans[-maxP:-minP] = yf[-maxP:-minP]
+            yf = trans
+            trans = np.copy(yf)
+            trans *= 0
+
+    # Shift by the bias.
+    for i in range(int(len(yf)/2)):
+        obj_index = int(i-bias*len(yf)/rate)
+        if (obj_index <= (len(yf)/2)) and (obj_index >= 0):
+            trans[i] = yf[obj_index]
+            trans[-i] = yf[-obj_index]
+
+    s = _normalize(pyfftw.interfaces.numpy_fft.ifft(trans).real)
+    return s
+
+def dataTrim(data, trim_lower, trim_upper):
+    """
+    Trim the audio, which means the data points before trim_lower or after
+    trim_upper will be removed. trim_upper can be negative.
+
+    :param np.ndarray data: The audio's data points. One channel, which means
+                            the length of the shape should be one.
+    :param int trim_lower: The lower bound to trim.
+    :param int trim_upper: The upper bound to trim.
+
+    :return: Transformed audio data points.
+    :raises ValueError: If trim_lower >= trim_upper.
+    """
+    if (trim_lower < 0):
+        raise ValueError('Lower bound must be greater than or equal to zero.')
+    if (trim_upper == 0):
+        raise ValueError('Upper bound cannot be zero.')
+    if (trim_upper > 0) and (trim_lower >= trim_upper):
+        raise ValueError('Lower bound is larger than or equal to the upper bound.')
+    if (trim_upper < 0) and ((trim_lower-trim_upper) >= len(data)):
+        raise ValueError('Lower bound is larger than or equal to the upper bound.')
+
+    return data[trim_lower:trim_upper]
+
+def dataPadding(data, padding_lower, padding_upper):
+    """
+    Add zeros to the beginning or the end of the audio.
+
+    :param np.ndarray data: The audio's data points. One channel, which means
+                            the length of the shape should be one.
+    :param int padding_lower: Number of zeros to be added to the beginning.
+    :param int padding_upper: Number of zeros to be added to the end.
+
+    :return: Transformed audio data points.
+    :raises ValueError: If padding_lower or padding_upper <= 0.
+    """
+    if (padding_lower <= 0) or (padding_upper <= 0):
+        raise ValueError('Number of padding must be greater than zero.')
+    result = np.zeros(len(data)+padding_lower+padding_upper)
+    try:
+        result[padding_lower:-padding_upper] = data
+    except ValueError:
+        raise ValueError('Padding failed: padding_lower=%d, padding_upper=%d, '
+                         'len(data)=%d.' % (padding_lower, padding_upper,
+                                            len(data)))
+    return result
+
+def audioResize(data, zoomFactor):
+    """
+    Resize the audio. Not only the length, but the frequency will also be
+    changed. If dealing with speech data, the recommended zoomFactor bound is
+    [0.75, 1.35].
+
+    :param np.ndarray data: The audio's data points. One channel, which means
+                            the length of the shape should be one.
+    :param float zoomFactor: The objective zoom factor.
+
+    :return: Transformed audio data points.
+    :raises ValueError: If zoomFactor <= 0.
+    """
+    if (zoomFactor <= 0):
+        raise ValueError('The zoomFactor should be larger than zero.')
+    return zoom(data, zoomFactor)
+
+def audioVolume(data, maximum):
+    """
+    Set the maximum volume of the audio. Setting the maximum above 0.99 is
+    not recommended.
+
+    :param np.ndarray data: The audio's data points. One channel, which means
+                            the length of the shape should be one.
+    :param float maximum: The maximum volume.
+
+    :return: Transformed audio data points. Should be the same size as
+             'data'.
+    :raises ValueError: If maximum < 0 or > 1.
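# --- Illustrative sketch (not part of the patch): the FFT band-pass
# used by fourierTransform() above, written with numpy.fft (pyfftw
# exposes the same interface for this purpose). A 300 Hz + 3000 Hz
# mixture is filtered to the 1-4 kHz band.
import numpy as np

rate = 16000
t = np.arange(rate) / rate
x = np.sin(2 * np.pi * 300 * t) + np.sin(2 * np.pi * 3000 * t)
yf = np.fft.fft(x)
trans = np.zeros_like(yf)
f_bound = int(rate / 2)
min_p = int(len(yf) * 1000 / (2 * f_bound))
max_p = int(len(yf) * 4000 / (2 * f_bound))
trans[min_p:max_p] = yf[min_p:max_p]       # positive frequencies
trans[-max_p:-min_p] = yf[-max_p:-min_p]   # mirrored negative frequencies
y = np.fft.ifft(trans).real
print(np.abs(np.fft.rfft(y)).argmax() * rate / len(y))  # ~3000.0 Hz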
+ """ + if (maximum < 0) or (maximum > 1): + raise ValueError('The maximum should be between 0 and 1.') + return maximum * data / np.max(np.abs(data)) + +def audioVolumeLinear(data, maximum_start, maximum_end): + """ + Set the maximum volume of the audio linearly regarding the time. + + :param np.ndarray data: The audio's data point. One channel, which means the + length of the shape should be one. + :param float maximum_start: The maximum volume at the beginning. + :param float maximum_end: The maximum volume in the end. + + :return: Transformed audio data points. Should be the same size as the + 'data'. + :raises ValueError: If (maximum_start < 0 or >1) or (maximum_end < 0 or >1). + """ + if (maximum_start < 0) or (maximum_start > 1) or (maximum_end < 0) or (maximum_end > 1): + raise ValueError('The maximum should be between 0 and 1.') + maximum = np.linspace(maximum_start, maximum_end, num=len(data)) + return maximum * data / np.max(np.abs(data)) + +def addNoise(data, noise_factor): + """ + Add random noise to the audio. + + :param np.ndarray data: The audio's data point. One channel, which means the + length of the shape should be one. + :param float noise_factor: The ratio of noise in volume. + + :return: Transformed audio data points. Should be the same size as the + 'data'. + :raises ValueError: If noise_factor < 0 or >1. + """ + if (noise_factor < 0) or (noise_factor > 1): + raise ValueError('The noise factor should be between 0 and 1.') + noise = np.random.random(size=len(data))*2-1 + noise /= (np.max(np.abs(noise))/noise_factor) + audio = (1.0-noise_factor) * data / np.max(np.abs(data)) + return noise+audio + +def addNoiseFrom(data, noiseSource, noise_factor): + """ + Add random noise from source. If noise from people speaking, recommend below + 0.05, else, recommend below 0.3 + + :param np.ndarray data: The audio's data point. One channel, which means the + length of the shape should be one. + :param np.ndarray noiseSource: The noise data. Must be longer or equal to + the length of 'data'. + :param float noise_factor: The ratio of noise in volume. + + :return: Transformed audio data points. Should be the same size as the + 'data'. + :raises ValueError: If len(noiseSource)1. + """ + if (noise_factor < 0) or (noise_factor > 1): + raise ValueError('The noise factor should be between 0 and 1.') + if (len(noiseSource) len(audio): + raise ValueError("Absolute value of shift_ms should be smaller " + "than audio duration.") + if shifty > 0: + # time advance + audio[:-shifty] = audio[shifty:] + audio[-shifty:] = 0 + elif shifty < 0: + # time delay + audio[-shifty:] = audio[:shifty] + audio[:-shifty] = 0 + return audio + +def resample(audio, rate, target_sample_rate, filter='kaiser_best'): + """ + Resample the audio to a target sample rate. + + :param np.ndarray audio: Audio data points. + :param int target_sample_rate: Target sample rate. + :param str filter: The resampling filter to use one of {'kaiser_best', + 'kaiser_fast'}. + """ + audio = resampy.resample(audio, rate, target_sample_rate, filter=filter) + return audio + +def simple_echo(audio, factor, duration): + """ + Resample the audio to a target sample rate. + + :param np.ndarray audio: Audio data points. + :param int factor: Echo factor. + :param int duration: Echo duration. If duration == 0, then no echo. 
+ """ + if duration == 0: return audio + result = np.zeros(len(audio)+duration) + ratio = 1 + for i in range(duration+1): + result[i:len(audio)+i] = ratio*audio + ratio*=factor + return _normalize(result) + +def impulse_echo(audio, impulse_func): + """ + Resample the audio to a target sample rate. + + :param np.ndarray audio: Audio data points. + :param np.ndarray impulse_func: Echo factors array. + """ + result = np.zeros(len(audio)+len(impulse_func)) + result[:len(audio)] = audio + for i in range(duration): + result[i+1:len(audio)+i+1] = impulse_func[i]*audio + return _normalize(result) + +def _normalize(dat): + return 0.99 * dat / np.max(np.abs(dat)) + +def _zeroPad(dat, l): + """ + Pad by zeros. + + :param list dat: The data array. + :param int l: Objective length. + :raises ValueError: If l is shorter than the dat length. + """ + if lx_obj: + continue + + id = file_path.split('/')[-1][:-4] + + start = time.time() + if augmentation: + bg_lib = self.bg_libs[np.random.randint(len(self.bg_libs))] + if self.server: + ns = bg_lib[np.random.randint(len(bg_lib))] + else: + _, ns = read_wav(bg_lib[np.random.randint(len(bg_lib))]) + ns = mergeChannels(ns) + data = randomAugment(data, rate, 1, obj_length = x_obj, noiseSource = ns, bgMaximum = bgMaximum)[0] + else: + data = zero_padding_1d(data, x_obj) + time1 = time.time() + a_seg = AudioSegment(data, rate) + xs.append(self.af.featurize(a_seg)) + time2 = time.time() + aug_total += time1-start + mfb_total += time2-time1 + + if returnUnicode: + ys.append(self.unicodes[id]) + else: + ys.append(np.array(self.labels[id]).astype(int)) + + if verbose: + # How the fuck to use print? See this: + # print('a={first:4.2f}, b={second:03d}'.format(first=f(x,n),second=g(x,n))) + # print("a=%d,b=%d" % (f(x,n),g(x,n))) + print("Augmentation time = %f sec; Featurization time = %f sec" % (aug_total, mfb_total)) + xs = np.array(xs) + xs = np.transpose(xs, [0,2,1]) + if returnUnicode: + return xs, ys + else: + if isCTC: + ys = sparse_tuple_from(ys) + return xs, ys + else: + ys_lengths = [len(y)+1 for y in ys] + max_length = max(ys_lengths) + temp = [] + + # The first three tokens should be reserved for padding, start, and end tokens. + for y in ys: + if len(y)<(max_length-1): + # Add the end token. (Actually 2, but will be 2 after 3 is added.) 
+ y = np.concatenate([y, [-1]]) + temp.append(np.concatenate([y+3, np.zeros(max_length-len(y))])) + else: + y = np.concatenate([y, [-1]]) + temp.append(y+3) + ys = np.array(temp) + return xs, ys, ys_lengths diff --git a/lib/tools_math.py b/lib/tools_math.py new file mode 100644 index 0000000..819cd20 --- /dev/null +++ b/lib/tools_math.py @@ -0,0 +1,51 @@ +import numpy as np + +def sigmoid(x): + return 1.0/(1.0+np.exp(-x)) + +def normalize(dat): + return 0.99 * dat / np.max(np.abs(dat)) + +def get_topk_args(arr, k): + return arr.argsort()[::-1][:k] + +def get_distance(v1, v2): + if len(v2.shape) != 1: + raise ValueError("arg2 should be an 1d array.") + if len(v1.shape) == 1: + return np.sqrt(np.sum(np.square(v1-v2))) + elif len(v1.shape) == 2: + return np.sqrt(np.sum(np.square(v1-v2), axis = 1)) + else: + raise ValueError("arg1 should be rather 1d or 2d array.") + +def get_cos_sim(v1, v2): + if len(v2.shape) != 1: + raise ValueError("arg2 should be an 1d array.") + if len(v1.shape) == 1: + inner = np.sum((v1*v2)) + normv1 = np.sqrt(np.sum(np.square(v1))) + normv2 = np.sqrt(np.sum(np.square(v2))) + return inner/(normv1*normv2) + elif len(v1.shape) == 2: + inner = np.sum((v1*v2), axis = 1) + normv1 = np.sqrt(np.sum(np.square(v1), axis = 1)) + normv2 = np.sqrt(np.sum(np.square(v2))) + return inner/(normv1*normv2) + +def index2onehot(indices, label_range): + result = np.zeros([len(indices), label_range]) + result[np.arange(len(indices)), indices] = 1.0 + return result + +def randomExcept(n, end, start = 0): + r = range(start, n) + range(n+1, end) + return np.random.choice(r) + +def zero_padding_1d(vec, obj_length): + result = np.concatenate([vec, np.zeros(obj_length-len(vec))]) + return result + +def neg_padding_1d(vec, obj_length): + result = np.concatenate([vec, -np.ones(obj_length-len(vec))]) + return result diff --git a/lib/tools_pinyin.py b/lib/tools_pinyin.py new file mode 100644 index 0000000..ce7d3ce --- /dev/null +++ b/lib/tools_pinyin.py @@ -0,0 +1,62 @@ +import numpy as np +import pickle +from pypinyin import lazy_pinyin + +class pinyinParser: + + def __init__(self, path): + with open(path, 'rb') as handle: + self.pinyinDict = pickle.load(handle) + invPath = path[:-7]+"Inv.pickle" + with open(invPath, 'rb') as handle: + self.pinyinDict_inv = pickle.load(handle) + + def getDictSize(self): + return len(self.pinyinDict) + + def getPinYin(self, unicodeContent): + return " ".join([x for x in lazy_pinyin(unicodeContent)]) + + def _index2OneHot(self, index): + result = np.zeros(len(self.pinyinDict)) + result[index] = 1.0 + return result + + def _indices2OneHot(self, indices): + result = np.zeros([len(indices), len(self.pinyinDict)]) + result[np.arange(len(indices)), indices] = 1.0 + return result + + def getPinYinIndices(self, pinyin): + pinyinList = pinyin.strip().split() + indices = [] + for pinyin in pinyinList: + if pinyin in self.pinyinDict: + indices.append(self.pinyinDict[pinyin]) + else: + raise ValueError("Could not find "+pinyin+" in the dictionary.") + if len(indices)==0: + raise ValueError("Invalid input.") + return indices + + def getPinYinOneHot(self, pinyin): + pinyinList = pinyin.strip().split() + indices = [] + for pinyin in pinyinList: + if pinyin in self.pinyinDict: + indices.append(self.pinyinDict[pinyin]) + else: + raise ValueError("Could not find "+pinyin+" in the dictionary.") + if len(indices)==0: + raise ValueError("Invalid input.") + return self._indices2OneHot(indices) + + def decodeIndices(self, vec, useUnderline = True): + result = [] + for num in vec: + if num 
in self.pinyinDict_inv: + result.append(self.pinyinDict_inv[num]) + if useUnderline: + return '_'.join(result) + else: + return ''.join(result) diff --git a/lib/tools_player.py b/lib/tools_player.py new file mode 100644 index 0000000..46d5e15 --- /dev/null +++ b/lib/tools_player.py @@ -0,0 +1,23 @@ +import numpy as np +import sounddevice as sd +import matplotlib.pyplot as plt + +from lib.tools_audio import * + +def play(vec, Fs): + sd.play(vec, Fs, blocking=True) + +def normalize(dat): + return 0.99 * dat / np.max(np.abs(dat)) + +def load_data(file_path): + try: + _, data_temp = read_mp3(file_path) + except: + _, data_temp = read_wav(file_path) + return data_temp + +def plotSound(vec): + plt.plot(vec) + plt.ylabel('Amplitude') + plt.show() diff --git a/lib/tools_sparse.py b/lib/tools_sparse.py new file mode 100644 index 0000000..0c04b3f --- /dev/null +++ b/lib/tools_sparse.py @@ -0,0 +1,55 @@ +import numpy as np +from lib.tools_pinyin import * + +def get_maxLengthListinList(ls): + length = 0 + for l in ls: + if len(l)>length: length = len(l) + return length + +def sparse_tuple_from(sequences, dtype=np.int32): + """ + Create a sparse representention of x. + Args: + sequences: a list of lists of type dtype where each element is a sequence + Returns: + A tuple with (indices, values, shape) + """ + indices = [] + values = [] + + for n, seq in enumerate(sequences): + indices.extend(zip([n] * len(seq), range(len(seq)))) + values.extend(seq) + + indices = np.asarray(indices, dtype=np.int64) + values = np.asarray(values, dtype=dtype) + shape = np.asarray([len(sequences), np.asarray(indices).max(0)[1] + 1], dtype=np.int64) + + return indices, values, shape + +def sparseTuples2dense(sparseTensor): + pred_dense = -np.ones(sparseTensor[2]) + for i in range(len(sparseTensor[0])): + pred_dense[sparseTensor[0][i][0],sparseTensor[0][i][1]] = sparseTensor[1][i] + return pred_dense + +def report_accuracy(decoded_list, test_targets, pyParser): + original_list = sparseTuples2dense(test_targets) + detected_list = sparseTuples2dense(decoded_list) + print("-------------------") + for i in range(len(original_list)): + original_line = [] + detected_line = [] + for stuff in original_list[i]: + if stuff!=-1: + original_line.append(stuff) + for stuff in detected_list[i]: + if stuff!=-1: + detected_line.append(stuff) + print(i) + print(original_line) + print(detected_line) + print(pyParser.decodeIndices(original_line, useUnderline = True)) + print(pyParser.decodeIndices(detected_line, useUnderline = True)) + print("-------------------") diff --git a/logs/readme.md b/logs/readme.md new file mode 100644 index 0000000..39cdd0d --- /dev/null +++ b/logs/readme.md @@ -0,0 +1 @@ +- diff --git a/models/readme.md b/models/readme.md new file mode 100644 index 0000000..39cdd0d --- /dev/null +++ b/models/readme.md @@ -0,0 +1 @@ +-
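# --- Usage sketch (not part of the patch): round-tripping label
# sequences through the helpers in lib/tools_sparse.py. The import
# assumes pypinyin is installed (tools_sparse pulls in tools_pinyin).
import numpy as np
from lib.tools_sparse import sparse_tuple_from, sparseTuples2dense

seqs = [[3, 1, 2], [4, 5]]
indices, values, shape = sparse_tuple_from(seqs)
print(shape)  # [2 3]
dense = sparseTuples2dense((indices, values, shape))
print(dense)  # [[ 3.  1.  2.]
              #  [ 4.  5. -1.]]   (unfilled positions stay -1)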