Commit cf7ded4 (1 parent: 097d054)
Showing 11 changed files with 4,985 additions and 0 deletions.
layerNormedGRU.py
@@ -0,0 +1,78 @@
import numpy as np
import tensorflow as tf


class layerNormedGRU(tf.contrib.rnn.RNNCell):
  """GRU cell whose gate and candidate pre-activations are layer normalized."""

  def __init__(
      self, size, activation=tf.tanh, reuse=None,
      normalizer=tf.contrib.layers.layer_norm,
      initializer=tf.contrib.layers.xavier_initializer()):
    super(layerNormedGRU, self).__init__(_reuse=reuse)
    self._size = size
    self._activation = activation
    self._normalizer = normalizer
    self._initializer = initializer

  @property
  def state_size(self):
    return self._size

  @property
  def output_size(self):
    return self._size

  def call(self, input_, state):
    update, reset = tf.split(self._forward(
        'update_reset', [state, input_], 2 * self._size, tf.nn.sigmoid,
        bias_initializer=tf.constant_initializer(-1.)), 2, 1)
    candidate = self._forward(
        'candidate', [reset * state, input_], self._size, self._activation)
    state = (1 - update) * state + update * candidate
    return state, state

  def _forward(self, name, inputs, size, activation, **kwargs):
    with tf.variable_scope(name):
      return _forward(
          inputs, size, activation, normalizer=self._normalizer,
          weight_initializer=self._initializer, **kwargs)


def _forward(
    inputs, size, activation, normalizer=tf.contrib.layers.layer_norm,
    weight_initializer=tf.contrib.layers.xavier_initializer(),
    bias_initializer=tf.zeros_initializer()):
  if not isinstance(inputs, (tuple, list)):
    inputs = (inputs,)
  shapes = []
  outputs = []
  # Project each input with its own weight matrix so its output can be
  # normalized individually before the projections are combined.
  for index, input_ in enumerate(inputs):
    shapes.append(input_.shape[1:-1].as_list())
    input_ = tf.contrib.layers.flatten(input_)
    weight = tf.get_variable(
        'weight_{}'.format(index + 1), (int(input_.shape[1]), size),
        tf.float32, weight_initializer)
    output = tf.matmul(input_, weight)
    if normalizer:
      output = normalizer(output)
    outputs.append(output)
  output = tf.reduce_mean(outputs, 0)
  # Add the bias after normalization so it is not cancelled by the normalizer.
  bias = tf.get_variable(
      'bias', (size,), tf.float32, bias_initializer)
  output += bias
  # Activation function.
  if activation:
    output = activation(output)
  # Restore shape dimensions that are consistent among inputs.
  min_dim = min(len(shape[1:]) for shape in shapes)
  dim_shapes = [[shape[dim] for shape in shapes] for dim in range(min_dim)]
  matching_dims = ''.join('NY'[len(set(x)) == 1] for x in dim_shapes) + 'N'
  agreement = matching_dims.index('N')
  remaining = sum(np.prod(shape[agreement:]) for shape in shapes)
  if agreement:
    batch_size = output.shape[0].value or -1
    # Keep the leading dimensions shared by all inputs (they agree up to
    # `agreement`) and collapse the remaining dimensions into one.
    shape = [batch_size] + shapes[0][:agreement] + [remaining]
    output = tf.reshape(output, shape)
  return output
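For reference, here is a minimal sketch of driving the cell on its own with tf.nn.dynamic_rnn, the same way the models below wrap it in tf.nn.bidirectional_dynamic_rnn. The batch shape, feature size, and unit count are illustrative assumptions, not values from this commit:

# Illustrative usage only: shapes and sizes below are assumptions.
import tensorflow as tf
from layerNormedGRU import layerNormedGRU

inputs = tf.placeholder(tf.float32, [None, 100, 40])  # [batch, time, features], assumed
cell = layerNormedGRU(128, activation=tf.nn.relu)     # 128 units, assumed
outputs, final_state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
# outputs: [batch, time, 128]; final_state: [batch, 128]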
@@ -0,0 +1,87 @@
import tensorflow as tf
import numpy as np
from layerNormedGRU import layerNormedGRU


class model:
    """CNN + 3 bidirectional layer-normalized GRU layers (256 units each) +
    dense projection, trained with CTC loss on 1000 x 161 spectrogram inputs."""

    def __init__(self, num_class, topk_paths=10):
        self.xs = tf.placeholder(tf.float32, [None, 1000, 161])
        self.ys = tf.sparse_placeholder(tf.int32)
        self.learning_rate = tf.placeholder(tf.float32)
        self.seq_len = tf.placeholder(tf.int32, [None])
        self.isTrain = tf.placeholder(tf.bool, name='phase')

        xs_input = tf.expand_dims(self.xs, 3)

        conv1 = self._nn_conv_bn_layer(xs_input, 'conv_1', [11, 41, 1, 32], [3, 2])
        conv2 = self._nn_conv_bn_layer(conv1, 'conv_2', [11, 21, 32, 32], [1, 2])
        conv_out = tf.reshape(conv2, [-1, 334, 41*32])
        biRNN1 = self._biRNN_bn_layer(conv_out, 'biRNN_1', 256)
        biRNN2 = self._biRNN_bn_layer(biRNN1, 'biRNN_2', 256)
        biRNN3 = self._biRNN_bn_layer(biRNN2, 'biRNN_3', 256)

        self.phonemes = tf.layers.dense(biRNN3, num_class)

        # Note: tf.nn.ctc_loss performs the softmax operation for you, so
        # inputs should be e.g. linear projections of outputs by an LSTM.
        self.loss = tf.reduce_mean(tf.nn.ctc_loss(
            labels=self.ys, inputs=self.phonemes, sequence_length=self.seq_len,
            ignore_longer_outputs_than_inputs=True, time_major=False))

        optimizer = tf.train.AdamOptimizer(self.learning_rate, beta1=0.6, beta2=0.8)
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            gvs = optimizer.compute_gradients(self.loss)
            capped_gvs = [(tf.clip_by_value(grad, -400., 400.), var)
                          for grad, var in gvs if grad is not None]
            self.train_op = optimizer.apply_gradients(capped_gvs)

        self.prediction, log_prob = tf.nn.ctc_beam_search_decoder(
            tf.transpose(self.phonemes, [1, 0, 2]), self.seq_len,
            top_paths=topk_paths, merge_repeated=False)

        self.loss_summary = tf.summary.scalar("loss", self.loss)
        self.merged = tf.summary.merge_all()

    def _nn_conv_bn_layer(self, inputs, scope, shape, strides):
        # Strided 2-D convolution followed by batch norm and ReLU6.
        with tf.variable_scope(scope):
            W_conv = tf.get_variable("W", shape=shape, initializer=tf.contrib.layers.xavier_initializer())
            h_conv = tf.nn.conv2d(inputs, W_conv, strides=[1, strides[0], strides[1], 1], padding='SAME', name="conv2d")
            b = tf.get_variable("bias", shape=[shape[3]], initializer=tf.contrib.layers.xavier_initializer())
            h_bn = tf.layers.batch_normalization(h_conv + b, training=self.isTrain)
            h_relu = tf.nn.relu6(h_bn, name="relu6")
            return h_relu

    def _biRNN_bn_layer(self, input, scope, hidden_units, cell="LayerNormedGRU"):
        # Bidirectional RNN layer; forward and backward outputs are concatenated.
        with tf.variable_scope(scope):
            if cell == 'GRU':
                fw_cell = tf.nn.rnn_cell.GRUCell(hidden_units, activation=tf.nn.relu, name='fw_cell')
                bw_cell = tf.nn.rnn_cell.GRUCell(hidden_units, activation=tf.nn.relu, name='bw_cell')
            elif cell == 'LSTM':
                fw_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_units, activation=tf.nn.relu, name='fw_cell')
                bw_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_units, activation=tf.nn.relu, name='bw_cell')
            elif cell == 'vanila':
                fw_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_units, activation=tf.nn.relu, name='fw_cell')
                bw_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_units, activation=tf.nn.relu, name='bw_cell')
            elif cell == 'LayerNormedGRU':
                with tf.variable_scope('fw_cell'):
                    fw_cell = layerNormedGRU(hidden_units, activation=tf.nn.relu)
                with tf.variable_scope('bw_cell'):
                    bw_cell = layerNormedGRU(hidden_units, activation=tf.nn.relu)
            else:
                raise ValueError("Invalid cell type: " + str(cell))

            (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn(
                fw_cell, bw_cell, input, dtype=tf.float32, scope="bi_dynamic_rnn")
            # output_fw_bn = tf.layers.batch_normalization(output_fw, training=self.isTrain, name='output_fw_bn')
            # output_bw_bn = tf.layers.batch_normalization(output_bw, training=self.isTrain, name='output_bw_bn')
            # bilstm_outputs_concat_1 = tf.concat([output_fw_bn, output_bw_bn], 2)
            bilstm_outputs_concat_1 = tf.concat([output_fw, output_bw], 2)
            return bilstm_outputs_concat_1

    def train(self, sess, learning_rate, xs, ys):
        _, loss, summary = sess.run(
            [self.train_op, self.loss, self.merged],
            feed_dict={self.isTrain: True, self.learning_rate: learning_rate,
                       self.seq_len: np.ones(xs.shape[0]) * 334,
                       self.xs: xs, self.ys: ys})
        return loss, summary

    def get_loss(self, sess, xs, ys):
        loss = sess.run(
            self.loss,
            feed_dict={self.isTrain: False, self.seq_len: np.ones(xs.shape[0]) * 334,
                       self.xs: xs, self.ys: ys})
        return loss

    def predict(self, sess, xs):
        prediction = sess.run(
            self.prediction,
            feed_dict={self.isTrain: False, self.seq_len: np.ones(xs.shape[0]) * 334,
                       self.xs: xs})
        return prediction
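Since self.ys is a tf.sparse_placeholder, label batches have to be fed as a tf.SparseTensorValue. Below is a minimal sketch of one way to build that feed and run a training step; the dense_to_sparse helper, the dummy spectrogram batch, and num_class=29 are assumptions for illustration, not part of this commit's input pipeline.

# Sketch only: the dense-to-sparse helper and dummy data are assumptions.
import numpy as np
import tensorflow as tf

def dense_to_sparse(labels):
    # labels: list of 1-D integer label sequences of varying length.
    indices, values = [], []
    for batch_idx, seq in enumerate(labels):
        for time_idx, label in enumerate(seq):
            indices.append([batch_idx, time_idx])
            values.append(label)
    dense_shape = [len(labels), max(len(seq) for seq in labels)]
    return tf.SparseTensorValue(np.array(indices, dtype=np.int64),
                                np.array(values, dtype=np.int32),
                                np.array(dense_shape, dtype=np.int64))

m = model(num_class=29)  # label inventory size is an assumption
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    xs = np.random.randn(8, 1000, 161).astype(np.float32)  # dummy spectrogram batch
    ys = dense_to_sparse([[1, 2, 3], [4, 5]] * 4)           # dummy label sequences
    loss, summary = m.train(sess, 1e-4, xs, ys)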
@@ -0,0 +1,89 @@
import tensorflow as tf
import numpy as np
from layerNormedGRU import layerNormedGRU


class model:
    """CNN + 5 bidirectional layer-normalized GRU layers (256 units each) +
    dense projection, trained with CTC loss on 1000 x 161 spectrogram inputs."""

    def __init__(self, num_class, topk_paths=10):
        self.xs = tf.placeholder(tf.float32, [None, 1000, 161])
        self.ys = tf.sparse_placeholder(tf.int32)
        self.learning_rate = tf.placeholder(tf.float32)
        self.seq_len = tf.placeholder(tf.int32, [None])
        self.isTrain = tf.placeholder(tf.bool, name='phase')

        xs_input = tf.expand_dims(self.xs, 3)

        conv1 = self._nn_conv_bn_layer(xs_input, 'conv_1', [11, 41, 1, 32], [3, 2])
        conv2 = self._nn_conv_bn_layer(conv1, 'conv_2', [11, 21, 32, 64], [1, 2])
        conv_out = tf.reshape(conv2, [-1, 334, 41*64])
        biRNN1 = self._biRNN_bn_layer(conv_out, 'biRNN_1', 256)
        biRNN2 = self._biRNN_bn_layer(biRNN1, 'biRNN_2', 256)
        biRNN3 = self._biRNN_bn_layer(biRNN2, 'biRNN_3', 256)
        biRNN4 = self._biRNN_bn_layer(biRNN3, 'biRNN_4', 256)
        biRNN5 = self._biRNN_bn_layer(biRNN4, 'biRNN_5', 256)

        self.phonemes = tf.layers.dense(biRNN5, num_class)

        # Note: tf.nn.ctc_loss performs the softmax operation for you, so
        # inputs should be e.g. linear projections of outputs by an LSTM.
        self.loss = tf.reduce_mean(tf.nn.ctc_loss(
            labels=self.ys, inputs=self.phonemes, sequence_length=self.seq_len,
            ignore_longer_outputs_than_inputs=True, time_major=False))

        optimizer = tf.train.AdamOptimizer(self.learning_rate, beta1=0.7, beta2=0.9)
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            gvs = optimizer.compute_gradients(self.loss)
            capped_gvs = [(tf.clip_by_value(grad, -400., 400.), var)
                          for grad, var in gvs if grad is not None]
            self.train_op = optimizer.apply_gradients(capped_gvs)

        self.prediction, log_prob = tf.nn.ctc_beam_search_decoder(
            tf.transpose(self.phonemes, [1, 0, 2]), self.seq_len,
            top_paths=topk_paths, merge_repeated=False)

        self.loss_summary = tf.summary.scalar("loss", self.loss)
        self.merged = tf.summary.merge_all()

    def _nn_conv_bn_layer(self, inputs, scope, shape, strides):
        # Strided 2-D convolution followed by batch norm and ReLU6.
        with tf.variable_scope(scope):
            W_conv = tf.get_variable("W", shape=shape, initializer=tf.contrib.layers.xavier_initializer())
            h_conv = tf.nn.conv2d(inputs, W_conv, strides=[1, strides[0], strides[1], 1], padding='SAME', name="conv2d")
            b = tf.get_variable("bias", shape=[shape[3]], initializer=tf.contrib.layers.xavier_initializer())
            h_bn = tf.layers.batch_normalization(h_conv + b, training=self.isTrain)
            h_relu = tf.nn.relu6(h_bn, name="relu6")
            return h_relu

    def _biRNN_bn_layer(self, input, scope, hidden_units, cell="LayerNormedGRU"):
        # Bidirectional RNN layer; forward and backward outputs are concatenated.
        with tf.variable_scope(scope):
            if cell == 'GRU':
                fw_cell = tf.nn.rnn_cell.GRUCell(hidden_units, activation=tf.nn.relu, name='fw_cell')
                bw_cell = tf.nn.rnn_cell.GRUCell(hidden_units, activation=tf.nn.relu, name='bw_cell')
            elif cell == 'LSTM':
                fw_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_units, activation=tf.nn.relu, name='fw_cell')
                bw_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_units, activation=tf.nn.relu, name='bw_cell')
            elif cell == 'vanila':
                fw_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_units, activation=tf.nn.relu, name='fw_cell')
                bw_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_units, activation=tf.nn.relu, name='bw_cell')
            elif cell == 'LayerNormedGRU':
                with tf.variable_scope('fw_cell'):
                    fw_cell = layerNormedGRU(hidden_units, activation=tf.nn.relu)
                with tf.variable_scope('bw_cell'):
                    bw_cell = layerNormedGRU(hidden_units, activation=tf.nn.relu)
            else:
                raise ValueError("Invalid cell type: " + str(cell))

            (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn(
                fw_cell, bw_cell, input, dtype=tf.float32, scope="bi_dynamic_rnn")
            # output_fw_bn = tf.layers.batch_normalization(output_fw, training=self.isTrain, name='output_fw_bn')
            # output_bw_bn = tf.layers.batch_normalization(output_bw, training=self.isTrain, name='output_bw_bn')
            # bilstm_outputs_concat_1 = tf.concat([output_fw_bn, output_bw_bn], 2)
            bilstm_outputs_concat_1 = tf.concat([output_fw, output_bw], 2)
            return bilstm_outputs_concat_1

    def train(self, sess, learning_rate, xs, ys):
        _, loss, summary = sess.run(
            [self.train_op, self.loss, self.merged],
            feed_dict={self.isTrain: True, self.learning_rate: learning_rate,
                       self.seq_len: np.ones(xs.shape[0]) * 334,
                       self.xs: xs, self.ys: ys})
        return loss, summary

    def get_loss(self, sess, xs, ys):
        loss = sess.run(
            self.loss,
            feed_dict={self.isTrain: False, self.seq_len: np.ones(xs.shape[0]) * 334,
                       self.xs: xs, self.ys: ys})
        return loss

    def predict(self, sess, xs):
        prediction = sess.run(
            self.prediction,
            feed_dict={self.isTrain: False, self.seq_len: np.ones(xs.shape[0]) * 334,
                       self.xs: xs})
        return prediction
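predict returns the beam-search output as a list of SparseTensorValue objects, one per decoded path (topk_paths of them, most likely first). A small sketch of turning the best path back into per-utterance label lists; the helper name is illustrative:

# Sketch only: converts the top beam-search path back to dense label sequences.
def sparse_to_lists(sparse_value):
    # sparse_value: a tf.SparseTensorValue returned by ctc_beam_search_decoder.
    batch_size = int(sparse_value.dense_shape[0])
    sequences = [[] for _ in range(batch_size)]
    for (batch_idx, _), label in zip(sparse_value.indices, sparse_value.values):
        sequences[int(batch_idx)].append(int(label))
    return sequences

# best_path = m.predict(sess, xs)[0]    # first entry is the most likely path
# decoded = sparse_to_lists(best_path)  # one label-id sequence per utterance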
@@ -0,0 +1,89 @@
import tensorflow as tf
import numpy as np
from layerNormedGRU import layerNormedGRU


class model:
    """CNN + 5 bidirectional layer-normalized GRU layers (1024 units each) +
    dense projection, trained with CTC loss on 1000 x 161 spectrogram inputs."""

    def __init__(self, num_class, topk_paths=10):
        self.xs = tf.placeholder(tf.float32, [None, 1000, 161])
        self.ys = tf.sparse_placeholder(tf.int32)
        self.learning_rate = tf.placeholder(tf.float32)
        self.seq_len = tf.placeholder(tf.int32, [None])
        self.isTrain = tf.placeholder(tf.bool, name='phase')

        xs_input = tf.expand_dims(self.xs, 3)

        conv1 = self._nn_conv_bn_layer(xs_input, 'conv_1', [11, 41, 1, 32], [3, 2])
        conv2 = self._nn_conv_bn_layer(conv1, 'conv_2', [11, 21, 32, 64], [1, 2])
        conv_out = tf.reshape(conv2, [-1, 334, 41*64])
        biRNN1 = self._biRNN_bn_layer(conv_out, 'biRNN_1', 1024)
        biRNN2 = self._biRNN_bn_layer(biRNN1, 'biRNN_2', 1024)
        biRNN3 = self._biRNN_bn_layer(biRNN2, 'biRNN_3', 1024)
        biRNN4 = self._biRNN_bn_layer(biRNN3, 'biRNN_4', 1024)
        biRNN5 = self._biRNN_bn_layer(biRNN4, 'biRNN_5', 1024)

        self.phonemes = tf.layers.dense(biRNN5, num_class)

        # Note: tf.nn.ctc_loss performs the softmax operation for you, so
        # inputs should be e.g. linear projections of outputs by an LSTM.
        self.loss = tf.reduce_mean(tf.nn.ctc_loss(
            labels=self.ys, inputs=self.phonemes, sequence_length=self.seq_len,
            ignore_longer_outputs_than_inputs=True, time_major=False))

        optimizer = tf.train.AdamOptimizer(self.learning_rate, beta1=0.7, beta2=0.9)
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            gvs = optimizer.compute_gradients(self.loss)
            capped_gvs = [(tf.clip_by_value(grad, -400., 400.), var)
                          for grad, var in gvs if grad is not None]
            self.train_op = optimizer.apply_gradients(capped_gvs)

        self.prediction, log_prob = tf.nn.ctc_beam_search_decoder(
            tf.transpose(self.phonemes, [1, 0, 2]), self.seq_len,
            top_paths=topk_paths, merge_repeated=False)

        self.loss_summary = tf.summary.scalar("loss", self.loss)
        self.merged = tf.summary.merge_all()

    def _nn_conv_bn_layer(self, inputs, scope, shape, strides):
        # Strided 2-D convolution followed by batch norm and ReLU6.
        with tf.variable_scope(scope):
            W_conv = tf.get_variable("W", shape=shape, initializer=tf.contrib.layers.xavier_initializer())
            h_conv = tf.nn.conv2d(inputs, W_conv, strides=[1, strides[0], strides[1], 1], padding='SAME', name="conv2d")
            b = tf.get_variable("bias", shape=[shape[3]], initializer=tf.contrib.layers.xavier_initializer())
            h_bn = tf.layers.batch_normalization(h_conv + b, training=self.isTrain)
            h_relu = tf.nn.relu6(h_bn, name="relu6")
            return h_relu

    def _biRNN_bn_layer(self, input, scope, hidden_units, cell="LayerNormedGRU"):
        # Bidirectional RNN layer; forward and backward outputs are concatenated.
        with tf.variable_scope(scope):
            if cell == 'GRU':
                fw_cell = tf.nn.rnn_cell.GRUCell(hidden_units, activation=tf.nn.relu, name='fw_cell')
                bw_cell = tf.nn.rnn_cell.GRUCell(hidden_units, activation=tf.nn.relu, name='bw_cell')
            elif cell == 'LSTM':
                fw_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_units, activation=tf.nn.relu, name='fw_cell')
                bw_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_units, activation=tf.nn.relu, name='bw_cell')
            elif cell == 'vanila':
                fw_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_units, activation=tf.nn.relu, name='fw_cell')
                bw_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_units, activation=tf.nn.relu, name='bw_cell')
            elif cell == 'LayerNormedGRU':
                with tf.variable_scope('fw_cell'):
                    fw_cell = layerNormedGRU(hidden_units, activation=tf.nn.relu)
                with tf.variable_scope('bw_cell'):
                    bw_cell = layerNormedGRU(hidden_units, activation=tf.nn.relu)
            else:
                raise ValueError("Invalid cell type: " + str(cell))

            (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn(
                fw_cell, bw_cell, input, dtype=tf.float32, scope="bi_dynamic_rnn")
            # output_fw_bn = tf.layers.batch_normalization(output_fw, training=self.isTrain, name='output_fw_bn')
            # output_bw_bn = tf.layers.batch_normalization(output_bw, training=self.isTrain, name='output_bw_bn')
            # bilstm_outputs_concat_1 = tf.concat([output_fw_bn, output_bw_bn], 2)
            bilstm_outputs_concat_1 = tf.concat([output_fw, output_bw], 2)
            return bilstm_outputs_concat_1

    def train(self, sess, learning_rate, xs, ys):
        _, loss, summary = sess.run(
            [self.train_op, self.loss, self.merged],
            feed_dict={self.isTrain: True, self.learning_rate: learning_rate,
                       self.seq_len: np.ones(xs.shape[0]) * 334,
                       self.xs: xs, self.ys: ys})
        return loss, summary

    def get_loss(self, sess, xs, ys):
        loss = sess.run(
            self.loss,
            feed_dict={self.isTrain: False, self.seq_len: np.ones(xs.shape[0]) * 334,
                       self.xs: xs, self.ys: ys})
        return loss

    def predict(self, sess, xs):
        prediction = sess.run(
            self.prediction,
            feed_dict={self.isTrain: False, self.seq_len: np.ones(xs.shape[0]) * 334,
                       self.xs: xs})
        return prediction
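All three variants hard-code seq_len to 334 frames because conv_1 uses a time stride of 3 with 'SAME' padding, so the 1000 input frames become ceil(1000/3) = 334 time steps, and conv_2 keeps the time axis (stride 1). The frequency axis shrinks from 161 to ceil(161/2) = 81 and then to ceil(81/2) = 41, which is where the 41 in the reshape comes from. A quick check of that arithmetic:

# Quick shape check for the 'SAME'-padded conv strides used above.
import math

time_steps = math.ceil(1000 / 3)                # conv_1 time stride 3 -> 334
freq_bins = math.ceil(math.ceil(161 / 2) / 2)   # two frequency strides of 2 -> 41
print(time_steps, freq_bins)                    # prints: 334 41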