378 changes: 378 additions & 0 deletions .ipynb_checkpoints/ASR Model-checkpoint.ipynb
@@ -0,0 +1,378 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Deepspeech Implementation\n",
"\n",
"*Implementation referenced from arXiv:1412.5567. All credits belong to original authors*"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"First we import the required libraries. For this implementation, we will mainly use functions from the Keras library, with the exception of a few functions in the CTC_loss method, where we will need to rely on the Tensorflow library."
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import keras\n",
"import sklearn\n",
"import tensorflow as tf\n",
"from keras import Sequential, Model\n",
"from keras import optimizers\n",
"from keras import layers"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Model Architecture\n",
"\n",
"We show two implementations of the model architecture. One using the Keras.Model() method and one using Keras.Sequential() method\n",
"\n",
"*TODO: ADD DESCRIPTIONS*"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Keras.Model Architecture**"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"# Default params for DeepSpeech model\n",
"def buildModel_test(input_dim, output_dim, context = 5, units = 1024, dropouts = (0.1,0.1,0)): \n",
" \n",
" # Create Input Layer and preprocessing for first FC layer\n",
" _input = layers.Input([None, input_dim])\n",
" \n",
" # Call Keras expand_dims to add extra channel dimension (axis = -1) to input required by convolution 2D layer\n",
" x = layers.Lambda(keras.backend.expand_dims, arguments = dict(axis=-1))(_input)\n",
" \n",
" # ** Layer 1 **\n",
" # A zero-padded convolutional layer applied on time dimension only.\n",
" # Thus, we will need to pad time dimension and specify kernel size for time dimension based on specified context.\n",
" x = layers.ZeroPadding2D(padding=(context,0))(x)\n",
" x = layers.Conv2D(filters = units, kernel_size=(context*2+1, input_dim))(x)\n",
" \n",
" # Reshaping after convolution\n",
" x = layers.Lambda(keras.backend.squeeze, arguments=dict(axis=2))(x)\n",
" \n",
" # Clipped Relu (max=20) and Dropout are then applied to convolutional output:\n",
" x = layers.ReLU(max_value=20)(x)\n",
" x = layers.Dropout(rate=dropouts[0])(x)\n",
" \n",
" # ** Layer 2 **\n",
" # Dense Layer, followed by clipped Relu and Dropout operating on \n",
" # independent data for each time-step via TimeDistributed Layer\n",
" x = layers.TimeDistributed(layers.Dense(units))(x)\n",
" x = layers.ReLU(max_value=20)(x)\n",
" x = layers.Dropout(rate=dropouts[1])(x)\n",
" \n",
" # ** Layer 3 **\n",
" # Similar to Layer 2\n",
" x = layers.TimeDistributed(layers.Dense(units))(x)\n",
" x = layers.ReLU(max_value=20)(x)\n",
" x = layers.Dropout(rate=dropouts[2])(x)\n",
" \n",
" # ** Layer 4 **\n",
" # Bidirectional RNN, with output being sum of both forward and backward units\n",
" x = layers.Bidirectional(layers.SimpleRNN(units, return_sequences=True), merge_mode='sum')(x)\n",
" \n",
" # ** Layer 5 **\n",
" # Final Dense Layer followed by Softmax to get predictions along characters for each timestep\n",
" x = layers.TimeDistributed(layers.Dense(output_dim))(x)\n",
" _output = layers.Softmax()(x)\n",
" \n",
" # Create model\n",
" model = keras.Model(_input, _output)\n",
" \n",
" # Print summary\n",
" model.summary()\n",
" \n",
" return model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Keras.Sequential Architecture**"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"def buildModel(input_dim, output_dim, context = 5, units = 1024, dropouts = (0.1,0.1,0)): \n",
" model2 = keras.Sequential()\n",
" # Create Input Layer and preprocessing for first FC layer\n",
" model2.add(layers.Input([None, input_dim]))\n",
" \n",
" # Call Keras expand_dims to add extra channel dimension (axis = -1) to input required by convolution 2D layer\n",
" model2.add(layers.Lambda(keras.backend.expand_dims, arguments = dict(axis=-1)))\n",
" \n",
" # ** Layer 1 **\n",
" # A zero-padded convolutional layer applied on time dimension only.\n",
" # Thus, we will need to pad time dimension and specify kernel size for time dimension based on specified context.\n",
" model2.add(layers.ZeroPadding2D(padding=(context,0)))\n",
" model2.add(layers.Conv2D(filters = units, kernel_size=(context*2+1, input_dim)))\n",
" \n",
" # Reshaping after convolution\n",
" model2.add(layers.Lambda(keras.backend.squeeze, arguments=dict(axis=2)))\n",
" \n",
" # Clipped Relu (max=20) and Dropout are then applied to convolutional output:\n",
" model2.add(layers.ReLU(max_value=20))\n",
" model2.add(layers.Dropout(rate=dropouts[0]))\n",
" \n",
" # ** Layer 2 **\n",
" # Dense Layer, followed by clipped Relu and Dropout operating on \n",
" # independent data for each time-step via TimeDistributed Layer\n",
" model2.add(layers.TimeDistributed(layers.Dense(units)))\n",
" model2.add(layers.ReLU(max_value=20))\n",
" model2.add(layers.Dropout(rate=dropouts[1]))\n",
" \n",
" # ** Layer 3 **\n",
" # Similar to Layer 2\n",
" model2.add(layers.TimeDistributed(layers.Dense(units)))\n",
" model2.add(layers.ReLU(max_value=20))\n",
" model2.add(layers.Dropout(rate=dropouts[2]))\n",
" \n",
" # ** Layer 4 **\n",
" # Bidirectional RNN, with output being sum of both forward and backward units\n",
" model2.add(layers.Bidirectional(layers.SimpleRNN(units, return_sequences=True), merge_mode='sum'))\n",
" model2.add(layers.TimeDistributed(layers.Dense(output_dim, activation='softmax')))\n",
" \n",
" model2.summary()\n",
" \n",
" return model2"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: \"sequential_9\"\n",
"_________________________________________________________________\n",
"Layer (type) Output Shape Param # \n",
"=================================================================\n",
"lambda_26 (Lambda) (None, None, 1000, 1) 0 \n",
"_________________________________________________________________\n",
"zero_padding2d_13 (ZeroPaddi (None, None, 1000, 1) 0 \n",
"_________________________________________________________________\n",
"conv2d_13 (Conv2D) (None, None, 1, 1024) 11265024 \n",
"_________________________________________________________________\n",
"lambda_27 (Lambda) (None, None, 1024) 0 \n",
"_________________________________________________________________\n",
"re_lu_39 (ReLU) (None, None, 1024) 0 \n",
"_________________________________________________________________\n",
"dropout_39 (Dropout) (None, None, 1024) 0 \n",
"_________________________________________________________________\n",
"time_distributed_39 (TimeDis (None, None, 1024) 1049600 \n",
"_________________________________________________________________\n",
"re_lu_40 (ReLU) (None, None, 1024) 0 \n",
"_________________________________________________________________\n",
"dropout_40 (Dropout) (None, None, 1024) 0 \n",
"_________________________________________________________________\n",
"time_distributed_40 (TimeDis (None, None, 1024) 1049600 \n",
"_________________________________________________________________\n",
"re_lu_41 (ReLU) (None, None, 1024) 0 \n",
"_________________________________________________________________\n",
"dropout_41 (Dropout) (None, None, 1024) 0 \n",
"_________________________________________________________________\n",
"bidirectional_13 (Bidirectio (None, None, 1024) 4196352 \n",
"_________________________________________________________________\n",
"time_distributed_41 (TimeDis (None, None, 25) 25625 \n",
"=================================================================\n",
"Total params: 17,586,201\n",
"Trainable params: 17,586,201\n",
"Non-trainable params: 0\n",
"_________________________________________________________________\n",
"Model: \"model_6\"\n",
"_________________________________________________________________\n",
"Layer (type) Output Shape Param # \n",
"=================================================================\n",
"input_18 (InputLayer) [(None, None, 1000)] 0 \n",
"_________________________________________________________________\n",
"lambda_28 (Lambda) (None, None, 1000, 1) 0 \n",
"_________________________________________________________________\n",
"zero_padding2d_14 (ZeroPaddi (None, None, 1000, 1) 0 \n",
"_________________________________________________________________\n",
"conv2d_14 (Conv2D) (None, None, 1, 1024) 11265024 \n",
"_________________________________________________________________\n",
"lambda_29 (Lambda) (None, None, 1024) 0 \n",
"_________________________________________________________________\n",
"re_lu_42 (ReLU) (None, None, 1024) 0 \n",
"_________________________________________________________________\n",
"dropout_42 (Dropout) (None, None, 1024) 0 \n",
"_________________________________________________________________\n",
"time_distributed_42 (TimeDis (None, None, 1024) 1049600 \n",
"_________________________________________________________________\n",
"re_lu_43 (ReLU) (None, None, 1024) 0 \n",
"_________________________________________________________________\n",
"dropout_43 (Dropout) (None, None, 1024) 0 \n",
"_________________________________________________________________\n",
"time_distributed_43 (TimeDis (None, None, 1024) 1049600 \n",
"_________________________________________________________________\n",
"re_lu_44 (ReLU) (None, None, 1024) 0 \n",
"_________________________________________________________________\n",
"dropout_44 (Dropout) (None, None, 1024) 0 \n",
"_________________________________________________________________\n",
"bidirectional_14 (Bidirectio (None, None, 1024) 4196352 \n",
"_________________________________________________________________\n",
"time_distributed_44 (TimeDis (None, None, 25) 25625 \n",
"_________________________________________________________________\n",
"softmax_6 (Softmax) (None, None, 25) 0 \n",
"=================================================================\n",
"Total params: 17,586,201\n",
"Trainable params: 17,586,201\n",
"Non-trainable params: 0\n",
"_________________________________________________________________\n"
]
}
],
"source": [
"# Test build and summary on random input/output size to make sure equivalent\n",
"model = buildModel(1000, 25)\n",
"model = buildModel_test(1000, 25)"
]
},
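{
"cell_type": "markdown",
"metadata": {},
"source": [
"As an additional sanity check, we can push a random batch through the last-built model and confirm the output shape is (batch, time, output_dim). This is only a sketch: the batch size, sequence length, and feature dimension below are arbitrary placeholder values, and it assumes an eager-execution (TensorFlow 2.x style) environment."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical sanity check on a random input batch of shape (batch=2, time=50, features=1000)\n",
"dummy_batch = np.random.rand(2, 50, 1000).astype(np.float32)\n",
"# Zero-padding by `context` exactly offsets the (2*context+1)-wide valid convolution,\n",
"# so we expect predictions of shape (2, 50, 25): a softmax over 25 classes per timestep\n",
"print(model.predict(dummy_batch).shape)"
]
},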
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Implementation of CTC Loss\n",
"\n",
"The original paper uses the tensorflow backend function *\"tf.nn.ctc_loss\"* to define CTC_loss. In our implementation, we attempt to use the *\"keras.backend.ctc_batch_cost\"* function. The Keras function is more streamlined in that we only need to provide 4 arguments: Y_true (Ground truth labels), Y_pred (softmax output from our model), pred_length (sequence length of each batch item in Y_pred), and true_length (sequence length of each batch item in Y_true).\n",
"\n",
"In order to calculate pred_length and true_length, we reference several tensorflow functions used in the orginal implementation:\n",
"\n",
"> *tf.ones_like()* and *tf.math.reduce_sum()*\n",
"\n",
"The first function creates a copy of any input tensor, where all values replaced with 1's. The second function allows us to perform a summation of values along a specified axis. If we were to apply this function on a tensor of shape (batch, sequence_length) where all values are 1, the result is an output vector of shape (batch, 1) that tells us the sequence length for each batch item. This allows us to obtain the true_length array from the Y_true tensor.\n",
"\n",
"> *tf.reduce_max()* \n",
"\n",
"However, to obtain the pred_length array from Y_pred, we need to perform one extra step. Our Y_pred has dimensions (batch, sequence_length, num_char_classes). Our softmax gives us a one-hot encoding of all possible classes. In order obtain the right dimensions to apply *tf.ones_like()* and *tf.math.reduce_sum()*, we need to choose a prediction class. We can do this by calling *tf.reduce_max()* on our output, which returns the index of the maximum value along a specific axis, thus effectively removing the chosen dimension. Here, we call reduce_max on axis 2, which represents the one-hot encodings of the different characters, to obtain a tensor of shape (batch, sequence_length) that we can then operate on similar to Y_true.\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"def ctc_loss(y_true, y_pred): \n",
" # Get length array of y_true\n",
" true_length = tf.math.reduce_sum(tf.ones_like(y_true), 1)\n",
" # Get length array of y_pred:\n",
" pred_length = tf.math.reduce_sum(tf.ones_like(tf.math.reduce_max(y_pred, 2)), 1)\n",
" return tf.keras.backend.ctc_batch_cost(y_true, y_pred, pred_length, true_length)"
]
},
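{
"cell_type": "markdown",
"metadata": {},
"source": [
"To make the shape manipulation above concrete, here is a toy trace of how a (batch, sequence_length, num_char_classes) prediction collapses into a (batch, 1) length tensor. The tensor sizes are arbitrary placeholders, and printing the values assumes an eager-execution environment."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Toy prediction tensor: batch of 2, 3 timesteps, 4 character classes\n",
"toy_pred = tf.random.uniform((2, 3, 4))\n",
"# reduce_max removes the class axis: shape (2, 3)\n",
"collapsed = tf.math.reduce_max(toy_pred, 2)\n",
"# ones_like + reduce_sum turns that into per-item lengths: shape (2, 1), values [[3.], [3.]]\n",
"lengths = tf.math.reduce_sum(tf.ones_like(collapsed), 1, keepdims=True)\n",
"print(lengths)"
]
},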
{
"cell_type": "markdown",
"metadata": {},
"source": [
"** Optimizer **\n",
"\n",
"We use the SGD optimizer with Nesterove Accelerated Gradient as per the paper. Momentum is set to 0.99"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"optimizer = optimizers.SGD(learning_rate=0.01, momentum=0.99, nesterov=True, name=\"SGD\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now that we have defined our CTC_loss and optimizer, as well as specified the model architecture, we will need to compile the model as shown below"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"model.compile(loss=ctc_loss, optimizer=optimizer, metrics=['accuracy'])"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"# We can now fit the model:\n",
"# TODO: Make sure fit can run once processed input data and truth labels available\n",
"# model.fit()"
]
},
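{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below is a hypothetical sketch of what the eventual fit call might look like on synthetic data. The array shapes, alphabet size, batch size, and epoch count are placeholder assumptions, not values from the paper; real training would use preprocessed spectrogram features and encoded transcripts."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical synthetic data: 8 utterances, 50 frames of 1000 features each\n",
"X_dummy = np.random.rand(8, 50, 1000).astype(np.float32)\n",
"# Integer label sequences padded to length 10, drawn from the 24 non-blank classes\n",
"y_dummy = np.random.randint(0, 24, size=(8, 10)).astype(np.float32)\n",
"# model.fit(X_dummy, y_dummy, batch_size=4, epochs=1)"
]
},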
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Resources:\n",
"\n",
"+ DeepSpeech paper: https://arxiv.org/pdf/1412.5567.pdf\n",
"+ The original implementation of DeepSpeech: https://github.com/rolczynski/Automatic-Speech-Recognition\n",
"+ Keras Library: https://keras.io/api/\n",
"+ Tensorflow Library: https://www.tensorflow.org/api_docs/python/\n",
"+ https://www.tensorflow.org/api_docs/python/tf/nn/ctc_loss\n",
"+ https://stackoverflow.com/questions/57292896/understanding-ctc-loss-for-speech-recognition-in-keras\n",
"+ https://chadrick-kwag.net/tf-keras-rnn-ctc-example/"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}