Skip to content

Commit 9a51fe3

Browse files
committed
Added scripts for prior computation and model conversion to Kaldi's nnet3 format with limited functionality.
1 parent 45e498d commit 9a51fe3

10 files changed

+234
-15
lines changed

LICENSE

100644100755
File mode changed.

README.md

100644100755
+7-1
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ Learning models in Keras.
3030

3131
1. Python 3.4+
3232

33-
2. Keras with Theano/Tensorflow backend
33+
2. Keras with Tensorflow/Theano backend
3434

3535
3. Kaldi
3636

@@ -63,6 +63,12 @@ run run_kt_LSTM.sh.
6363

6464
6. align.sh is the alignment script.
6565

66+
7. compute_priors.py computes priors.
67+
68+
8. saveModelNnet3.sh and saveModelNnet3Raw.py convert the trained
69+
feedforward DNNs into Kaldi's nnet3 format. They currently have
70+
limited functionality.
71+
6672
## Training Schedule
6773

6874
The script uses stochastic gradient descent with 0.5 momentum. It

run_kt.sh

-7
Original file line numberDiff line numberDiff line change
@@ -44,17 +44,10 @@ done
4444
## Uncomment to train a Maxout network
4545
#[ -f $exp/dnn.nnet.h5 ] || python3 steps_kt/train_maxout.py ${train}_cv05 ${gmm}_ali_cv05 ${train}_tr95 ${gmm}_ali_tr95 $gmm $exp
4646

47-
## Get priors: Make a Python script to do this.
48-
ali-to-pdf $gmm/final.mdl ark:"gunzip -c ${gmm}_ali_???5/ali.*.gz |" ark,t:- | \
49-
cut -d" " -f2- | tr ' ' '\n' | sed -r '/^\s*$/d' | sort | uniq -c | sort -n -k2 | \
50-
awk '{a[$2]=$1; c+=$1; LI=$2} END{for(i=0;i<LI;i++) printf "%e,",a[i]/c; printf "%e",a[LI]/c}' \
51-
> $exp/dnn.priors.csv
52-
5347
## Make graph
5448
[ -f $gmm/graph/HCLG.fst ] || utils/mkgraph.sh ${lang}_test_bg $gmm $gmm/graph
5549

5650
## Decode
57-
cp $gmm/final.mdl $gmm/tree $exp/
5851
[ -f $exp/decode/wer_11 ] || bash steps_kt/decode.sh --nj $nj \
5952
--add-deltas "true" --norm-vars "true" --splice-opts "--left-context=5 --right-context=5" \
6053
$test $gmm/graph $exp $exp/decode

run_kt_LSTM.sh

-7
Original file line numberDiff line numberDiff line change
@@ -42,17 +42,10 @@ done
4242
## Train
4343
[ -f $exp/dnn.nnet.h5 ] || python3 steps_kt/train_LSTM.py ${train}_cv05 ${gmm}_ali_cv05 ${train}_tr95 ${gmm}_ali_tr95 $gmm $exp
4444

45-
## Get priors: Make a Python script to do this.
46-
ali-to-pdf $gmm/final.mdl ark:"gunzip -c ${gmm}_ali_???5/ali.*.gz |" ark,t:- | \
47-
cut -d" " -f2- | tr ' ' '\n' | sed -r '/^\s*$/d' | sort | uniq -c | sort -n -k2 | \
48-
awk '{a[$2]=$1; c+=$1; LI=$2} END{for(i=0;i<LI;i++) printf "%e,",a[i]/c; printf "%e",a[LI]/c}' \
49-
> $exp/dnn.priors.csv
50-
5145
## Make graph
5246
[ -f $gmm/graph/HCLG.fst ] || utils/mkgraph.sh ${lang}_test_bg $gmm $gmm/graph
5347

5448
## Decode
55-
cp $gmm/final.mdl $gmm/tree $exp/
5649
[ -f $exp/decode/wer_11 ] || bash steps_kt/decode_seq.sh --nj $nj \
5750
--add-deltas "true" --norm-vars "true" --splice-size "11" \
5851
$test $gmm/graph $exp $exp/decode

steps_kt/compute_priors.py

+64
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
#!/usr/bin/python3
2+
3+
## Copyright (C) 2016 D S Pavan Kumar
4+
## dspavankumar [at] gmail [dot] com
5+
##
6+
## This program is free software: you can redistribute it and/or modify
7+
## it under the terms of the GNU General Public License as published by
8+
## the Free Software Foundation, either version 3 of the License, or
9+
## (at your option) any later version.
10+
##
11+
## This program is distributed in the hope that it will be useful,
12+
## but WITHOUT ANY WARRANTY; without even the implied warranty of
13+
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14+
## GNU General Public License for more details.
15+
##
16+
## You should have received a copy of the GNU General Public License
17+
## along with this program. If not, see <http://www.gnu.org/licenses/>.
18+
19+
20+
import sys
21+
import numpy
22+
from subprocess import Popen, PIPE
23+
24+
## Read output feature dimension
def read_output_feat_dim (exp):
    """Return the number of output pdfs of the acoustic model exp/final.mdl.

    Runs Kaldi's am-info binary on the model and parses its
    'number of pdfs' line.

    Raises:
        RuntimeError: if am-info produced no 'number of pdfs' line.
            (The original code fell off the end and returned None,
            which made the caller fail later with an opaque
            numpy.zeros(None) error.)
    """
    p = Popen (['am-info', exp+'/final.mdl'], stdout=PIPE)
    ## am-info writes plain text; its stdout is bytes, hence b'...'
    for line in p.stdout:
        if b'number of pdfs' in line:
            return int(line.split()[-1])
    raise RuntimeError ('Could not find "number of pdfs" in am-info output for ' + exp + '/final.mdl')
30+
31+
## Compute priors
def compute_priors (exp, ali_tr, ali_cv=None):
    """Estimate pdf class priors from Kaldi alignments.

    Pipes the gzipped alignments in ali_tr (and optionally ali_cv)
    through ali-to-pdf, counts pdf occurrences, normalises them, and
    writes the result to exp/dnn.priors.csv as a single
    comma-separated line of '%e'-formatted values.
    """
    dim = read_output_feat_dim (exp)
    counts = numpy.zeros(dim)

    ## Build the Kaldi rspecifier: a shell pipe that gunzips the
    ## alignment archives (training first, then cross-validation).
    ali_str = 'ark:gunzip -c ' + ali_tr + '/ali.*.gz '
    if ali_cv:
        ali_str = ali_str + ali_cv + '/ali.*.gz '
    ali_str = ali_str + '|'

    p = Popen(['ali-to-pdf', exp+'/final.mdl', ali_str, 'ark,t:-'], stdout=PIPE)

    ## Accumulate pdf counts; the first token of every text-archive
    ## line is the utterance id, the rest are per-frame pdf indices.
    for line in p.stdout:
        for index in line.split()[1:]:
            counts[int(index)] += 1

    ## Normalise counts into priors
    priors = counts / numpy.sum(counts)

    ## Floor zero values so log-priors stay finite at decode time
    priors[priors==0] = 1e-5

    ## Write to file
    priors.tofile (exp+'/dnn.priors.csv', sep=',', format='%e')
58+
59+
if __name__ == '__main__':
    ## Usage: compute_priors.py <exp-dir> <tr-ali-dir> [<cv-ali-dir>]
    exp = sys.argv[1]
    ali_tr = sys.argv[2]
    ## The cross-validation alignment directory is optional
    ali_cv = None
    if len(sys.argv) == 4:
        ali_cv = sys.argv[3]

    compute_priors (exp, ali_tr, ali_cv)

steps_kt/saveModelNnet3.sh

+60
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
#!/bin/bash
2+
3+
## Copyright (C) 2016 D S Pavan Kumar
4+
## dspavankumar [at] gmail [dot] com
5+
##
6+
## This program is free software: you can redistribute it and/or modify
7+
## it under the terms of the GNU General Public License as published by
8+
## the Free Software Foundation, either version 3 of the License, or
9+
## (at your option) any later version.
10+
##
11+
## This program is distributed in the hope that it will be useful,
12+
## but WITHOUT ANY WARRANTY; without even the implied warranty of
13+
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14+
## GNU General Public License for more details.
15+
##
16+
## You should have received a copy of the GNU General Public License
17+
## along with this program. If not, see <http://www.gnu.org/licenses/>.
18+
19+
20+
## NOTE: This script converts feedforward DNNs in HDF5 format to the
21+
## standard Kaldi's nnet3 format. It has limited functionality.
22+
## It uses steps_kt/saveModelNnet3Raw.py, which is also limited
23+
## in functionality. The scripts do the job, but can be better.
24+
25+
exp=$1

. cmd.sh
. path.sh

## Check if argument exists (quoted so an empty $exp cannot break the test)
[ -z "$exp" ] && echo "Provide DNN directory as an argument" && exit 1

## Check if files required exist in the exp directory
for f in $exp/final.mdl $exp/dnn.nnet.h5 $exp/dnn.priors.csv ; do
    [ ! -f $f ] && echo "Expected $f to exist" && exit 1
done

## Copy the raw Nnet3.
## Fix: the converter lives in steps_kt/, not scripts_kt/ (the original
## path was wrong and the script could never find it). Abort on failure
## so the existing final.mdl is not clobbered below.
python3 steps_kt/saveModelNnet3Raw.py $exp/dnn.nnet.h5 $exp/dnn.nnet3.raw || exit 1

## Append context and priors to the raw Nnet3
printf "<LeftContext> 0 <RightContext> 0 <Priors> [ " >> $exp/dnn.nnet3.raw
awk '{gsub(","," ",$0); print $0 " ]"}' $exp/dnn.priors.csv >> $exp/dnn.nnet3.raw

## Copy the transition matrix; stop here if Kaldi tools are unavailable
copy-transition-model --binary=false $exp/final.mdl $exp/dnn.nnet3.trans || exit 1

## Back up the original model before it is replaced
mv $exp/final.mdl $exp/final.mdl.bak

## Prepare the final model: transition model followed by the raw nnet3
cat $exp/dnn.nnet3.trans $exp/dnn.nnet3.raw > $exp/final.mdl.txt

## Convert to binary format
nnet3-am-copy $exp/final.mdl.txt $exp/final.mdl || exit 1

## Clean up
rm -f $exp/dnn.nnet3.raw $exp/dnn.nnet3.trans $exp/final.mdl.txt

echo "Older final model backed up as: $exp/final.mdl.bak"
echo "Nnet3 model successfully stored as: $exp/final.mdl"

steps_kt/saveModelNnet3Raw.py

+79
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
#!/usr/bin/python3
2+
3+
## Copyright (C) 2016 D S Pavan Kumar
4+
## dspavankumar [at] gmail [dot] com
5+
##
6+
## This program is free software: you can redistribute it and/or modify
7+
## it under the terms of the GNU General Public License as published by
8+
## the Free Software Foundation, either version 3 of the License, or
9+
## (at your option) any later version.
10+
##
11+
## This program is distributed in the hope that it will be useful,
12+
## but WITHOUT ANY WARRANTY; without even the implied warranty of
13+
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14+
## GNU General Public License for more details.
15+
##
16+
## You should have received a copy of the GNU General Public License
17+
## along with this program. If not, see <http://www.gnu.org/licenses/>.
18+
19+
20+
## NOTE: This script has limited functionality. It currently converts
21+
## feedforward networks with relu and softmax layers in HDF5 format
22+
## to the standard Kaldi's nnet3 "raw" format. Call this script from
23+
## steps_kt/saveModelNnet3.sh to get a complete model.
24+
25+
import keras
26+
import numpy
27+
import sys
28+
29+
def saveModel (model, fileName):
    """Write a Keras feedforward model to Kaldi's nnet3 "raw" text format.

    Only Dense ('dense*') layers are supported, with 'relu', 'softmax'
    or 'linear' activations in the node-description section. Softmax is
    emitted as Kaldi's LogSoftmaxComponent, since Kaldi decodes with
    log-likelihoods.

    Args:
        model: a trained Keras Sequential/functional feedforward model.
        fileName: path of the nnet3 raw text file to write.

    Raises:
        TypeError: on a non-dense layer or an unhandled activation.
    """
    with open (fileName, 'w') as f:
        f.write ('<Nnet3> \n')

        ## Write the component descriptions.
        ## Fix: use the 'model' parameter here — the original read the
        ## global 'm' set in __main__, which broke any other caller.
        f.write ('input-node name=input dim=%d\n' % model.input_shape[-1])
        prevLayerName = 'input'
        num_components = 0
        for layer in model.layers:
            if layer.name.startswith ('dense'):
                f.write ('component-node name=%s.affine component=%s.affine input=%s\n' % (layer.name, layer.name, prevLayerName))
                num_components += 1
                activation_text = layer.get_config()['activation']
                if activation_text != 'linear':
                    f.write ('component-node name=%s.%s component=%s.%s input=%s.affine\n' % (layer.name, activation_text, layer.name, activation_text, layer.name))
                    num_components += 1
                ## NOTE(review): for a 'linear' activation this names a
                ## node '<layer>.linear' that no component defines, and
                ## the value-writing loop below rejects 'linear' anyway
                ## — confirm intended handling of linear layers.
                prevLayerName = layer.name + '.' + activation_text
        f.write('output-node name=output input=%s objective=linear\n' % prevLayerName)

        f.write('\n<NumComponents> %d\n' % num_components)

        ## Write the layer values
        for layer in model.layers:
            if not layer.name.startswith ('dense'):
                raise TypeError ('Unknown layer type: ' + layer.name)

            ## Kaldi stores the affine weights row-major per output unit,
            ## hence the transpose of the Keras (in, out) weight matrix.
            f.write ('<ComponentName> %s.affine <NaturalGradientAffineComponent> <MaxChange> 2.0 <LearningRate> 0.001 <LinearParams> [ \n ' % (layer.name))
            for row in layer.get_weights()[0].T:
                row.tofile (f, format="%e", sep=' ')
                f.write (' \n ')
            f.write ('] \n <BiasParams> [ ')
            layer.get_weights()[1].tofile (f, format="%e", sep=' ')
            f.write (' ] \n')
            f.write ('<RankIn> 20 <RankOut> 80 <UpdatePeriod> 4 <NumSamplesHistory> 2000 <Alpha> 4 <IsGradient> F </NaturalGradientAffineComponent>\n')

            ## Deal with the activation
            activation_text = layer.get_config()['activation']
            if activation_text == 'relu':
                f.write ('<ComponentName> %s.relu <RectifiedLinearComponent> <Dim> %d <ValueAvg> [ ] <DerivAvg> [ ] <Count> 0 <NumDimsSelfRepaired> 0 <NumDimsProcessed> 0 </RectifiedLinearComponent>\n' % (layer.name, layer.output_shape[-1]))
            elif activation_text == 'softmax':
                f.write ('<ComponentName> %s.softmax <LogSoftmaxComponent> <Dim> %d <ValueAvg> [ ] <DerivAvg> [ ] <Count> 0 <NumDimsSelfRepaired> 0 <NumDimsProcessed> 0 </LogSoftmaxComponent>\n' % (layer.name, layer.output_shape[-1]))
            else:
                raise TypeError ('Unknown/unhandled activation: ' + activation_text)
        f.write ('</Nnet3> \n')
73+
74+
## Save h5 model in nnet3 format.
## Usage: saveModelNnet3Raw.py <model.h5> <output-nnet3-raw-file>
if __name__ == '__main__':
    h5model = sys.argv[1]
    nnet3 = sys.argv[2]
    ## NOTE(review): saveModel's body reads the module-global 'm' (not
    ## its 'model' parameter) for the input dimension, so this variable
    ## must keep the name 'm' — verify before renaming.
    m = keras.models.load_model (h5model)
    saveModel(m, nnet3)

steps_kt/train.py

+8
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
import keras.backend as K
2222
from keras.optimizers import SGD
2323
from dataGenerator import dataGenerator
24+
from compute_priors import compute_priors
25+
from shutil import copy
2426
import numpy
2527
import sys
2628
import os
@@ -46,7 +48,13 @@
4648
'lrScaleCount' : 18,
4749
'minValError' : 0.002}
4850

51+
## Copy final model and tree from GMM directory
4952
os.makedirs (exp, exist_ok=True)
53+
copy (gmm + '/final.mdl', exp)
54+
copy (gmm + '/tree', exp)
55+
56+
## Compute priors
57+
compute_priors (exp, ali_tr, ali_cv)
5058

5159
trGen = dataGenerator (data_tr, ali_tr, gmm, learning['batchSize'])
5260
cvGen = dataGenerator (data_cv, ali_cv, gmm, learning['batchSize'])

steps_kt/train_LSTM.py

+8
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
import keras.backend as K
2222
from keras.optimizers import SGD
2323
from dataGenSequences import dataGenSequences
24+
from compute_priors import compute_priors
25+
from shutil import copy
2426
import numpy
2527
import sys
2628
import os
@@ -47,7 +49,13 @@
4749
'lrScaleCount' : 18,
4850
'minValError' : 0.002}
4951

52+
## Copy final model and tree from GMM directory
5053
os.makedirs (exp, exist_ok=True)
54+
copy (gmm + '/final.mdl', exp)
55+
copy (gmm + '/tree', exp)
56+
57+
## Compute priors
58+
compute_priors (exp, ali_tr, ali_cv)
5159

5260
trGen = dataGenSequences (data_tr, ali_tr, gmm, learning['batchSize'], learning['spliceSize'])
5361
cvGen = dataGenSequences (data_cv, ali_cv, gmm, learning['batchSize'], learning['spliceSize'])

steps_kt/train_maxout.py

+8
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
import keras.backend as K
2222
from keras.optimizers import SGD
2323
from dataGenerator import dataGenerator
24+
from compute_priors import compute_priors
25+
from shutil import copy
2426
import numpy
2527
import sys
2628
import os
@@ -46,7 +48,13 @@
4648
'lrScaleCount' : 18,
4749
'minValError' : 0.002}
4850

51+
## Copy final model and tree from GMM directory
4952
os.makedirs (exp, exist_ok=True)
53+
copy (gmm + '/final.mdl', exp)
54+
copy (gmm + '/tree', exp)
55+
56+
## Compute priors
57+
compute_priors (exp, ali_tr, ali_cv)
5058

5159
trGen = dataGenerator (data_tr, ali_tr, gmm, learning['batchSize'])
5260
cvGen = dataGenerator (data_cv, ali_cv, gmm, learning['batchSize'])

0 commit comments

Comments
 (0)