run-exp.py
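
# Train and evaluate an acoustic language-ID model (CNN or stacked LSTM) on
# MFCC features extracted from FLAC audio clips.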
import librosa
import os
import sys
import argparse
import numpy as np
import logging as log
import tensorflow as tf
from tensorflow import keras
import soundfile as sf
from tensorflow.keras import models, layers
# Removing all handlers from logging.root
for handler in log.root.handlers[:]:
log.root.removeHandler(handler)
log.basicConfig(level=log.INFO, format='%(asctime)s %(levelname)-8s %(message)s', datefmt='%a, %d %b %Y %H:%M:%S')
# FEAT_LEN = 431 # for 10 seconds of audio, we get 431 MFCC vectors
FEAT_DIM = 40 # each MFCC vector is of 40 dimensions
def load_data_from_file(filepath, timesteps, channel_last=False):
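    '''
    Extract an MFCC matrix of shape [1, timesteps, FEAT_DIM] from one audio
    file, truncating long clips and tiling short ones to fill the window.
    '''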
print("Loading {0} ... ".format(filepath))
data, samplerate = sf.read(filepath)
feat = librosa.feature.mfcc(data, sr=samplerate, n_mfcc=FEAT_DIM)
X = np.zeros([1, timesteps, FEAT_DIM])
    # Transpose to [frames, FEAT_DIM], then truncate or tile the frames so X
    # holds exactly `timesteps` of them
    feat = feat.T
feat_timesteps = feat.shape[0]
if feat_timesteps >= timesteps:
X[0, :timesteps, :] = feat[:timesteps, :]
else:
        # Clip is shorter than timesteps: tile the features to fill the window
remainder = timesteps
ridx = 0
while feat_timesteps <= remainder:
X[0, ridx * feat_timesteps:(ridx + 1) * feat_timesteps, :] = feat
ridx += 1
remainder -= feat_timesteps
X[0, ridx * feat_timesteps:, :] = feat[:remainder, :]
if channel_last:
        # Add a trailing channel axis for Conv2D input
X = np.expand_dims(X, axis=3)
return X
def load_data_from_dir(data_dir, locales, timesteps, limit_per_locale, channel_last=False):
    '''
    Load the training data from a directory (FLAC files). Filenames are
    expected to start with a two-letter locale code, e.g. "en...".
    '''
    num_audio = min(len(os.listdir(data_dir)), len(locales) * limit_per_locale)
    if channel_last:
        X = np.zeros([num_audio, timesteps, FEAT_DIM, 1])
    else:
        X = np.zeros([num_audio, timesteps, FEAT_DIM])
    Y_onehot = np.zeros([num_audio, len(locales)])
    Y = np.zeros(num_audio)
    locales_count = {x: 0 for x in locales}
    locales_to_idx = {locale: i for i, locale in enumerate(locales)}
    idx = 0
    for f in os.listdir(data_dir):
        f_locale = f[:2]
        # Skip files outside the known locales or past the per-locale limit
        if f_locale not in locales_count or locales_count[f_locale] >= limit_per_locale:
            continue
        fpath = os.path.join(data_dir, f)
        file_feats = load_data_from_file(fpath, timesteps, channel_last)
        X[idx] = file_feats
        Y_onehot[idx, locales_to_idx[f_locale]] = 1
        Y[idx] = locales_to_idx[f_locale]
        idx += 1
        locales_count[f_locale] += 1
    # Trim in case some locale had fewer files than limit_per_locale, so that
    # unfilled all-zero rows are not returned as extra class-0 samples
    return X[:idx], Y[:idx], Y_onehot[:idx], locales_to_idx
def create_rnn_model(input_layer_dim, hidden_layer_dims, output_layer_dim):
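    '''
    Stacked-LSTM classifier: input [batch, timesteps, FEAT_DIM], softmax
    output over the target languages.
    '''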
model = models.Sequential()
model.add(layers.LSTM(input_layer_dim, return_sequences=True, unroll=False))
    for idx, layer_dim in enumerate(hidden_layer_dims):
        # Only the last LSTM collapses the sequence into a single vector
        return_seq = idx != len(hidden_layer_dims) - 1
        model.add(layers.LSTM(layer_dim, return_sequences=return_seq))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(output_layer_dim, activation='softmax'))
return model
def create_cnn_model(output_layer_dim):
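    '''
    Small Conv2D stack that treats the [timesteps, FEAT_DIM, 1] MFCC matrix
    as a single-channel image.
    '''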
model = models.Sequential()
model.add(layers.Conv2D(16, kernel_size=3))
model.add(layers.MaxPooling2D())
model.add(layers.Conv2D(32, kernel_size=3))
model.add(layers.MaxPooling2D())
model.add(layers.Conv2D(64, kernel_size=3))
model.add(layers.MaxPooling2D())
model.add(layers.Flatten())
model.add(layers.Dense(100, activation='relu'))
model.add(layers.Dense(output_layer_dim, activation='softmax'))
return model
def compute_accuracy(predictions, truth):
    # Fraction of predictions that match the ground-truth labels
    return np.mean(predictions == truth)
def get_model_name(model_type, input_layer_dim, hidden_layer_dims, output_layer_dim, train_data, timesteps, epochs):
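    # Note: in TF 2.x, model.save() writes a SavedModel directory when the
    # path does not end in .h5/.keras, so the ".pb" suffix names a directory.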
return "i{0}_h{1}_o{2}_{3}_samples_{4}_epochs_{5}_timesteps_{6}.pb".format(input_layer_dim, '_'.join(
[str(x) for x in hidden_layer_dims]), output_layer_dim, train_data, epochs, timesteps, model_type)
def main(args):
# Training data: 73,080 in total, 24360 per language
# Testing data: 540 in total, 180 per language
train_dir = args.train_dir
test_dir = args.test_dir
save_model_dir = args.save_model_dir
load_model_file = args.load_model
test_audio = args.test_audio
batch_size = 32
epochs = 5
timesteps = 512
input_layer_dim = 100
hidden_layer_dims = [100]
output_layer_dim = 3
    num_samples = 24000  # per-locale cap for training clips
model_type = 'cnn'
print(keras.__version__)
print("Train dir: {0}".format(train_dir))
print("Test dir: {0}".format(test_dir))
locales = ['en', 'de', 'es']
if not train_dir and not test_dir and not load_model_file and not test_audio:
log.error("Must specify --train-dir or --test-dir or --load-model")
return 1
# train_X shape: [batch, timesteps, feature_dim]
    # Conv2D input needs an explicit trailing channel axis
    channel_last = model_type == 'cnn'
if test_audio:
if not load_model_file:
log.error("Must specify --load-model if --test-audio is specified")
return 1
model = keras.models.load_model(load_model_file)
file_feats = load_data_from_file(test_audio, timesteps, channel_last)
        # predict_classes was removed from tf.keras in 2.6; take the argmax instead
        predictions = np.argmax(model.predict(file_feats), axis=1)
print("Predicted language: {0}".format(locales[predictions[0]]))
elif load_model_file:
if not test_dir:
log.error("Must specify --test-dir if --load-model is specified")
return 1
model = keras.models.load_model(load_model_file)
test_X, test_Y, test_Y_onehot, _ = load_data_from_dir(test_dir, locales, timesteps, 180,
channel_last=channel_last)
        predictions = np.argmax(model.predict(test_X), axis=1)
print(predictions)
print(test_Y)
        accuracy = compute_accuracy(predictions, test_Y)
print("Accuracy: {0:.2f}%".format(100 * accuracy))
else:
train_X, train_Y, train_Y_onehot, locales_to_idx = load_data_from_dir(train_dir, locales, timesteps,
num_samples,
channel_last=channel_last)
print(train_X.shape)
print(train_Y_onehot.shape)
print(train_Y.shape)
        # To train the RNN instead, set model_type = 'rnn' above (so
        # channel_last becomes False) and swap in:
        # model = create_rnn_model(input_layer_dim, hidden_layer_dims, output_layer_dim)
        model = create_cnn_model(output_layer_dim)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
history = model.fit(train_X, train_Y_onehot, batch_size=batch_size, epochs=epochs)
        print(history.history)  # per-epoch loss and accuracy
if save_model_dir:
model_file = os.path.join(save_model_dir,
get_model_name(model_type, input_layer_dim, hidden_layer_dims, output_layer_dim,
num_samples, timesteps, epochs))
print("Saving model to: {0}".format(model_file))
            model.save(model_file)
    return 0
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Train and test the acoustic language-id algorithms.')
parser.add_argument('--train-dir', dest='train_dir', action='store', required=False, help='Training directory')
parser.add_argument('--test-dir', dest='test_dir', action='store', required=False, help='Testing directory')
parser.add_argument('--save-model-dir', dest='save_model_dir', action='store', required=False,
help='Directory to save model')
parser.add_argument('--load-model', dest='load_model', action='store', required=False,
help='Load the model instead of training it')
parser.add_argument('--test-audio', dest='test_audio', action='store', required=False,
help='Test one audio file with the model')
args = parser.parse_args()
sys.exit(main(args))
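# Example invocations (paths are hypothetical):
#   python run-exp.py --train-dir ./train --save-model-dir ./models
#   python run-exp.py --load-model ./models/MODEL.pb --test-dir ./test
#   python run-exp.py --load-model ./models/MODEL.pb --test-audio clip.flac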