# -*- coding: utf-8 -*-
from __future__ import print_function
import sys
if sys.version_info[0] == 2:
    # Python 2 only: force UTF-8 as the default string encoding
    reload(sys)  # noqa: F821 -- reload is a builtin on Python 2
    sys.setdefaultencoding("utf-8")
import random
random.seed(42) # For reproducibility
import tensorflow as tf
import os
from tensorflow.python import debug as tf_debug
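# TF_CPP_MIN_LOG_LEVEL '0' keeps all TensorFlow C++ log output (INFO and above) visible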
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0'
import numpy as np
import logging
import time
import getopt
from utils import (convert_int_sequence_to_text_sequence, configure_logger,
                   get_language_chars, check_language_code)
from data import DataGenerator
"""
train module
End to end multi language speech recognition using CTC Cost model
Copyright (c) 2019 Imdat Solak
Copyright 2015-2016, Baidu USA LLC.
This model is based on the paper of https://arxiv.org/pdf/1512.02595.pdf
(Deep Speech 2: End-to-End Speech Recognition in English and Mandarin) and its implementation on Github.
https://github.com/baidu-research/ba-dls-deepspeech
All rights reserved.
Implemented with Tensorflow
"""
"""
Module train
This modul trains the model with the selected language audio files in mini batches.
Notes:
File formats must be in raw format (.wav file). Audio must be sampled 16000 sample and just 1 channel data is accepted.
"""
logger = logging.getLogger(__name__)
def usage():
    print('Missing or wrong parameters.')
print('Usage: train.py ...')
print('Options:')
    print(' -e --epoch <num-epochs> Number of training epochs (int; 100)')
print(' -b --batch_size <batch-size> Mini batch size for each iteration of the train (int; 32)')
print(' -i --ilog <ilog_num> Log decoded speech-to-text sample every this many steps (int; 100)')
print(' -r --restore <checkpoint> Checkpoint to restore (string; none)')
print(' -d --data_dir <train-data-dir> Location of training data (string; ./data)')
    print(' -g <gru_layer> Number of GRU layers (int; 3)')
print(' -m <model_type> Model Type (int; 1)')
print(' -l <language> Language (str, 1 of [de_DE, en_US]; en_US)')
    print(' -c <checkpoint-freq> Save a checkpoint every this many steps (int; 1000)')
    print(' -G Enable multi-GPU training (Horovod)')
    print(' -M <model-dir> Root directory to save trained models to (str; ./trained-models/)')
print()
sys.exit(1)
try:
    # Long options that take an argument need a trailing '=' for getopt to parse them correctly
    options, arguments = getopt.getopt(sys.argv[1:], 'he:b:i:r:d:g:m:c:l:M:G',
                                       ['help', 'epoch=', 'batch_size=', 'ilog=', 'restore=', 'data_dir=',
                                        'gru_layer=', 'model_type=', 'ckpt_freq=', 'language=', 'model_root='])
except getopt.GetoptError:
    usage()
num_epochs = 100
mini_batch_size = 32
iterlog = 100
restore_ckpt = None
data_dir = 'data'
checkpoint_freq = 1000
model_type = 1
gru_layer = 3
language = None
model_root = './trained-models/'
multi_gpu = False
for opt, arg in options:
if opt in ('-e', '--epoch'):
num_epochs = int(arg)
elif opt in ('-b', '--batch_size'):
mini_batch_size = int(arg)
elif opt in ('-i','--ilog'):
iterlog = int(arg)
elif opt in ('-r','--restore'):
restore_ckpt = arg
elif opt in ('-d', '--data_dir'):
data_dir = arg
elif opt in ('-g', '--gru_layer'):
gru_layer = int(arg)
elif opt in ('-m', '--model_type'):
model_type = int(arg)
elif opt in ('-l', '--language'):
language = arg
elif opt in ('-c', '--ckpt_freq'):
checkpoint_freq = int(arg)
elif opt in ('-h', '--help'):
usage()
elif opt in ('-M', '--model_root'):
model_root = arg
elif opt == '-G':
multi_gpu = True
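        # Horovod is imported and initialized only when multi-GPU training is requested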
import horovod.tensorflow as hvd
hvd.init()
else:
assert False, "unhandled option"
def train_model(language, data_dir, model_type=1, gru=3, num_epochs=100, mini_batch_size=32, iterlog=20, cp_freq=1000, restore_path=None, model_root='./trained-models/', multi_gpu=False):
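    # NOTE: restore_path is currently unused; MonitoredTrainingSession below restores
    # the latest checkpoint from model_dir automatically when one exists.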
# Check language is supported
if not check_language_code(language):
raise ValueError("Invalid or not supported language code!")
    # Check the training data directory (with its description file) exists
    if not os.path.exists(data_dir):
        raise ValueError("Training data directory does not exist!")
# Check valid model is selected
if model_type == 1:
from models import model_conv1_gru as model
elif model_type == 2:
from models import model_conv2_gru as model
else:
raise ValueError("No valid model selected!")
# Create model directories
model_name = model.__name__ + str(gru)
model_dir = os.path.join(model_root, model_name)
if multi_gpu:
my_gpu_rank = hvd.local_rank()
else:
my_gpu_rank = 0
if not multi_gpu or my_gpu_rank == 0:
if not os.path.exists(model_root):
os.makedirs(model_root)
if not os.path.exists(model_dir):
os.makedirs(model_dir)
# Configure logging
configure_logger(logFileName=os.path.join(model_root, 'training.log'))
print('Loading data...')
    # Load char_map and index_map and get the number of classes for this language
char_map, index_map, nb_classes = get_language_chars(language)
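    # char_map maps characters to integer class IDs; index_map is the inverse, used to turn decoded IDs back into text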
# Prepare the data generator. Load the JSON file that contains the dataset
datagen = DataGenerator(char_map=char_map, multi_gpu=multi_gpu)
    # Load the data, skipping utterances longer than max_duration; returns the number of iterations per epoch
no_iter = datagen.load_data(data_dir, minibatch_size=mini_batch_size, max_duration=20.0)
print('Building Model...')
global_step = tf.Variable(0, name='global_step', trainable=False)
    # Create input placeholders for CTC cost computation and decoding
    with tf.name_scope('inputs'):
        # Audio inputs have shape [batch_size, max_stepsize, num_features]; batch_size and max_stepsize can vary from step to step
        # inputs = tf.placeholder(tf.float32, [None, None, 161], name='inputs')  # spectrogram version
        inputs = tf.placeholder(tf.float32, [None, None, 40], name='inputs')  # filterbank version; 40 is the number of filters
        # inputs = tf.placeholder(tf.float32, [None, None, 12], name='inputs')  # MFCC version; 12 is the number of cepstral coefficients
        # 1-D array of size [batch_size] holding the sequence length of each example
        seq_len = tf.placeholder(tf.int32, [None], name='seq_len')
        # Placeholder for the labels: sparse_placeholder generates the SparseTensor required by the ctc_loss op
        targets = tf.sparse_placeholder(tf.int32, name='targets')
# Create model layers
logits = model(inputs, nb_classes, gru)
logits = tf.transpose(logits, perm=[1, 0, 2])
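    # ctc_loss and the decoders consume time-major logits: [max_time, batch_size, num_classes]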
    # Compute the CTC loss with TensorFlow's ctc_loss, then average it across the batch
with tf.name_scope('loss'):
total_loss = tf.nn.ctc_loss(inputs=logits, labels=targets, sequence_length=seq_len, ignore_longer_outputs_than_inputs=True)
avg_loss = tf.reduce_mean(total_loss, name="Mean")
    # The Adam optimizer is preferred here for performance reasons
with tf.name_scope('train'):
optimizer = tf.train.AdamOptimizer(learning_rate=0.0001, beta1=0.9, beta2=0.999, epsilon=1e-8)
if multi_gpu:
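            # Horovod: wrap the optimizer so gradients are averaged across all workers before being applied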
optimizer = hvd.DistributedOptimizer(optimizer)
train_op = optimizer.minimize(avg_loss, global_step=global_step)
# optimizer = tf.train.MomentumOptimizer(learning_rate= 2e-4, momentum=0.99, use_nesterov=True).minimize(avg_loss)
# Beam search decodes the mini-batch
with tf.name_scope('decoder'):
decoded, log_prob = tf.nn.ctc_beam_search_decoder(logits, seq_len, beam_width=100, top_paths=1, merge_repeated=False)
        # Option 2: tf.nn.ctc_greedy_decoder (faster, but gives worse results)
dense_decoded = tf.sparse_tensor_to_dense(decoded[0], name="SparseToDense", default_value=-1)
    # The Levenshtein (edit) distance between each decoding and its transcription
    with tf.name_scope('distance'):
        distance = tf.edit_distance(tf.cast(decoded[0], tf.int32), targets, name="edit_distance")
        # The label error rate (LER): the edit distance averaged over the whole batch
        ler = tf.reduce_mean(distance, name="Mean")
config = tf.ConfigProto(allow_soft_placement=False, log_device_placement=False)
config.gpu_options.allow_growth = False
hooks = []
if multi_gpu:
config.gpu_options.visible_device_list = str(my_gpu_rank)
hooks = [
# Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
# from rank 0 to all other processes. This is necessary to ensure consistent
# initialization of all workers when training is started with random weights
# or restored from a checkpoint.
hvd.BroadcastGlobalVariablesHook(0),
]
# Horovod: adjust number of steps based on number of GPUs.
# NOTE: our "no_iter" is already adjusted to the data-size for this instance in MultiGPU-Training
# The data SIZE is automatically split in DataGenerator in num_gpu chunks
# hooks.append(tf.train.StopAtStepHook(last_step=no_iter * num_epochs))
hooks.append(tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': avg_loss}, every_n_iter=cp_freq))
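    # Horovod: only rank 0 writes checkpoints so workers don't clobber each other's files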
if multi_gpu:
checkpoint_dir = model_dir if my_gpu_rank == 0 else None
else:
checkpoint_dir = model_dir
checkpoint_load_dir = model_dir
print('Training...')
iterator = None
total_iter = no_iter
remaining_iter = total_iter
current_step = 1
best_cost = 1e10
with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir, hooks=hooks, config=config) as mon_sess:
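        # MonitoredTrainingSession restores the latest checkpoint from checkpoint_dir automatically (if one exists)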
ckpt = tf.train.get_checkpoint_state(checkpoint_load_dir)
if ckpt is not None and ckpt.model_checkpoint_path is not None:
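            # Recover the global step from the checkpoint filename, e.g. 'model.ckpt-12345' -> 12345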
current_step = int(ckpt.model_checkpoint_path.split('/')[-1].split('.')[1].split('-')[-1])
remaining_iter = total_iter - (current_step % total_iter)
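        # Finalize the graph so no new ops can be added accidentally inside the training loop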
tf.get_default_graph().finalize()
while not mon_sess.should_stop():
            # For each epoch, get a new iterator and start new threads. Only in the first epoch are the
            # audio files sorted by duration (the SortaGrad curriculum from the Deep Speech 2 paper)
if current_step % total_iter == 0 or iterator is None:
if current_step // total_iter == 0:
iterator = datagen.iterate_train(mini_batch_size=mini_batch_size, sort_by_duration=True, shuffle=False, max_iters=remaining_iter)
else:
iterator = datagen.iterate_train(mini_batch_size=mini_batch_size, sort_by_duration=False, shuffle=True, max_iters=remaining_iter)
batch_no = (current_step % total_iter) + 1
epoch_no = (current_step // total_iter) + 1
b_perc = int(float(batch_no) / float(total_iter) * 100.0)
print('Epoch: {}/{}, Batch: {:-6d}/{} {:d}%'.format(epoch_no, num_epochs, batch_no, total_iter, b_perc), end='\r')
sys.stdout.flush()
iter_start = time.time()
inputs, out_len, indices, values, shape, labels = next(iterator)
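            # Feed tensors by name; the sparse targets placeholder is fed via its indices/values/shape component tensors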
feed = {"inputs/inputs:0": inputs, "inputs/targets/shape:0": shape, "inputs/targets/indices:0": indices, "inputs/targets/values:0": values, "inputs/seq_len:0": out_len}
if current_step % iterlog == 0:
_, ctc_cost, cError, cDecoded = mon_sess.run([train_op, avg_loss, ler, dense_decoded], feed_dict=feed)
batch_error = cError * mini_batch_size
for i, seq in enumerate(cDecoded):
seq = [s for s in seq if s != -1]
if not multi_gpu or my_gpu_rank == 0:
logger.info("Iteration :{}-{}".format(current_step % total_iter, str(i + 1)))
sequence = convert_int_sequence_to_text_sequence(seq, index_map)
if not multi_gpu or my_gpu_rank == 0:
logger.info("Original Utterance ({0}): {1}".format(len(labels[i]), labels[i]))
logger.info("Decoded Text ({0}):\t{1}\n".format(len(sequence), sequence))
else:
ctc_cost, _ = mon_sess.run([avg_loss, train_op], feed_dict=feed)
if current_step // total_iter > 0:
best_loss_str = '{:.5f}'.format(best_cost)
else:
best_loss_str = 'N/A'
if not multi_gpu or my_gpu_rank == 0:
logger.info("Epoch: {:-4d}, Iter:{:-6d}, Loss:{:.5f}, BestLoss: {}, Time:{:.3f}".format((current_step // total_iter)+1, (current_step % total_iter)+1, ctc_cost, best_loss_str, time.time() - iter_start))
current_step += 1
# Ignore best_cost during Epoch 1 run...
if current_step // total_iter > 0 and ctc_cost < best_cost:
best_cost = ctc_cost
print('\n')
return True
if language is not None:
_ = train_model(language, data_dir, model_type, gru_layer, num_epochs, mini_batch_size, iterlog, checkpoint_freq, restore_ckpt, model_root, multi_gpu)
else:
usage()