-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathtacotron.py
494 lines (404 loc) · 21.8 KB
/
tacotron.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
import tensorflow as tf
from tacotron.utils.symbols import symbols
from infolog import log
from tacotron.models.helpers import TacoTrainingHelper, TacoTestHelper
from tacotron.models.modules import *
from tensorflow.contrib.seq2seq import dynamic_decode
from tacotron.models.Architecture_wrappers import TacotronEncoderCell, TacotronDecoderCell
from tacotron.models.custom_decoder import CustomDecoder
from tacotron.models.attention import LocationSensitiveAttention
import numpy as np
# Switch between the tf.py_func-based splitting path (True) and the pure-TF
# slicing path (False) inside Tacotron.initialize.
# Set to False before freezing the model: tf.py_func ops wrap arbitrary Python
# and cannot be serialized into a frozen graph.
tf_pyfunc = True  # True False
def split_func(x, split_pos):
    """Cut x along axis 1 into consecutive chunks of the widths in split_pos.

    Called through tf.py_func, so x and split_pos arrive as numpy arrays.
    Returns a list with one slice per entry of split_pos.
    """
    chunks = []
    offset = 0
    # Walk the requested widths, slicing each chunk right after the previous one.
    for idx in range(split_pos.shape[0]):
        width = split_pos[idx]
        chunks.append(x[:, offset:offset + width])
        offset += width
    return chunks
def split_func_tf_api(x_tensor, split_pos_tensor):
    """Graph-mode replacement for split_func, used when tf_pyfunc is False.

    Slices x_tensor along axis 1 into consecutive chunks whose widths come
    from split_pos_tensor, returning a list of tensors.

    NOTE: a Python loop cannot range over a dynamic tensor dimension, so the
    number of chunks is hard-coded to 1 and this path assumes
    hparams.tacotron_num_gpus == 1 (the dead `tf.shape(...)` computation the
    original version made here has been removed).
    """
    rst = []
    start = 0
    # Hard-coded single split: assumes hparams.tacotron_num_gpus == 1.
    for i in range(1):
        tmp = x_tensor[:, start:start + split_pos_tensor[i]]
        rst.append(tmp)
        start += split_pos_tensor[i]
    return rst
class Tacotron():
    """Tacotron-2 feature prediction model.

    Builds the text-to-mel (and optionally mel-to-linear) TensorFlow graph via
    initialize(), then the loss via add_loss() and the training op via
    add_optimizer().
    """
    def __init__(self, hparams):
        # Hyperparameter container; read throughout graph construction.
        self._hparams = hparams
    def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets=None, linear_targets=None, targets_lengths=None, gta=False,
            global_step=None, is_training=False, is_evaluating=False, split_infos=None):
        """
        Initializes the model for inference
        sets "mel_outputs" and "alignments" fields.
        Args:
            - inputs: int32 Tensor with shape [N, T_in] where N is batch size, T_in is number of
              steps in the input time series, and values are character IDs
            - input_lengths: int32 Tensor with shape [N] where N is batch size and values are the lengths
              of each sequence in inputs.
            - mel_targets: float32 Tensor with shape [N, T_out, M] where N is batch size, T_out is number
              of steps in the output time series, M is num_mels, and values are entries in the mel
              spectrogram. Only needed for training.
            - stop_token_targets: float32 Tensor [N, T_out] of <stop> labels; required with mel_targets
              (unless gta).
            - linear_targets: float32 Tensor [N, T_out, F] of linear spectrograms; required when
              predict_linear is on and training.
            - targets_lengths: int32 Tensor [N]; required when mask_decoder is on and training.
            - gta: build the graph in ground-truth-aligned synthesis mode (no linear postnet).
            - global_step: training step tensor; required for scheduled teacher forcing.
            - is_training / is_evaluating: mutually exclusive mode flags.
            - split_infos: per-tower split sizes; columns 0..3 are the widths for inputs,
              mel_targets, stop_token_targets and linear_targets respectively.
        """
        # Argument-combination sanity checks: fail fast on inconsistent modes.
        if mel_targets is None and stop_token_targets is not None:
            raise ValueError('no multi targets were provided but token_targets were given')
        if mel_targets is not None and stop_token_targets is None and not gta:
            raise ValueError('Mel targets are provided without corresponding token_targets')
        if not gta and self._hparams.predict_linear==True and linear_targets is None and is_training:
            raise ValueError('Model is set to use post processing to predict linear spectrograms in training but no linear targets given!')
        if gta and linear_targets is not None:
            raise ValueError('Linear spectrogram prediction is not supported in GTA mode!')
        if is_training and self._hparams.mask_decoder and targets_lengths is None:
            raise RuntimeError('Model set to mask paddings but no targets lengths provided for the mask!')
        if is_training and is_evaluating:
            raise RuntimeError('Model can not be in training and evaluation modes at the same time!')

        # Split batch data across towers on CPU when multi-GPU (or when forced by split_on_cpu).
        split_device = '/cpu:0' if self._hparams.tacotron_num_gpus > 1 or self._hparams.split_on_cpu else '/gpu:0'
        with tf.device(split_device):
            hp = self._hparams
            # Output dtype lists for tf.py_func: one entry per tower.
            lout_int = [tf.int32]*hp.tacotron_num_gpus
            lout_float = [tf.float32]*hp.tacotron_num_gpus

            tower_input_lengths = tf.split(input_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0)
            tower_targets_lengths = tf.split(targets_lengths, num_or_size_splits=hp.tacotron_num_gpus, axis=0) if targets_lengths is not None else targets_lengths

            if (tf_pyfunc):
                # py_func path: split_func runs as Python; not freezable (see tf_pyfunc flag).
                p_inputs = tf.py_func(split_func, [inputs, split_infos[:, 0]], lout_int)
                p_mel_targets = tf.py_func(split_func, [mel_targets, split_infos[:, 1]],
                    lout_float) if mel_targets is not None else mel_targets
                p_stop_token_targets = tf.py_func(split_func, [stop_token_targets, split_infos[:, 2]],
                    lout_float) if stop_token_targets is not None else stop_token_targets
                p_linear_targets = tf.py_func(split_func, [linear_targets, split_infos[:, 3]],
                    lout_float) if linear_targets is not None else linear_targets
            else:
                # Pure-TF path (freezable); split_func_tf_api assumes a single tower.
                p_inputs = split_func_tf_api(tf.cast(inputs, tf.int32), tf.cast(split_infos[:, 0], tf.int32))
                p_mel_targets = split_func_tf_api(tf.cast(mel_targets, tf.float32), tf.cast(split_infos[:, 1],
                    tf.float32)) if mel_targets is not None else mel_targets
                p_stop_token_targets = split_func_tf_api(tf.cast(stop_token_targets, tf.float32),
                    tf.cast(split_infos[:, 2],tf.float32)) if stop_token_targets is not None else stop_token_targets
                p_linear_targets = split_func_tf_api(tf.cast(linear_targets, tf.float32), tf.cast(split_infos[:, 3],
                    tf.float32), ) if linear_targets is not None else linear_targets

            tower_inputs = []
            tower_mel_targets = []
            tower_stop_token_targets = []
            tower_linear_targets = []

            batch_size = tf.shape(inputs)[0]
            mel_channels = hp.num_mels
            linear_channels = hp.num_freq
            # Re-impose static rank on the py_func outputs (py_func loses shape info).
            for i in range (0,hp.tacotron_num_gpus):
                tower_inputs.append(tf.reshape(p_inputs[i], [batch_size, -1]))
                if p_mel_targets is not None:
                    tower_mel_targets.append(tf.reshape(p_mel_targets[i], [batch_size, -1, mel_channels]))
                if p_stop_token_targets is not None:
                    tower_stop_token_targets.append(tf.reshape(p_stop_token_targets[i], [batch_size, -1]))
                if p_linear_targets is not None:
                    tower_linear_targets.append(tf.reshape(p_linear_targets[i], [batch_size, -1, linear_channels]))

        # Valid output value range, used below for optional clipping of decoder/mel/linear outputs.
        T2_output_range = (-hp.max_abs_value, hp.max_abs_value) if hp.symmetric_mels else (0, hp.max_abs_value)

        self.tower_decoder_output = []
        self.tower_alignments = []
        self.tower_stop_token_prediction = []
        self.tower_mel_outputs = []
        self.tower_linear_outputs = []

        tower_embedded_inputs = []
        tower_enc_conv_output_shape = []
        tower_encoder_outputs = []
        tower_residual = []
        tower_projected_residual = []

        # 1. Declare GPU Devices
        gpus = ["/gpu:{}".format(i) for i in range(0,hp.tacotron_num_gpus)]
        for i in range(0,hp.tacotron_num_gpus):
            with tf.device(tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0", worker_device=gpus[i])):
                with tf.variable_scope('inference') as scope:
                    assert hp.tacotron_teacher_forcing_mode in ('constant', 'scheduled')
                    if hp.tacotron_teacher_forcing_mode == 'scheduled' and is_training:
                        assert global_step is not None

                    #GTA is only used for predicting mels to train Wavenet vocoder, so we ommit post processing when doing GTA synthesis
                    post_condition = hp.predict_linear and not gta

                    # Embeddings ==> [batch_size, sequence_length, embedding_dim]
                    self.embedding_table = tf.get_variable(
                        'inputs_embedding', [len(symbols), hp.embedding_dim], dtype=tf.float32)
                    embedded_inputs = tf.nn.embedding_lookup(self.embedding_table, tower_inputs[i])

                    #Encoder Cell ==> [batch_size, encoder_steps, encoder_lstm_units]
                    encoder_cell = TacotronEncoderCell(
                        EncoderConvolutions(is_training, hparams=hp, scope='encoder_convolutions'),
                        EncoderRNN(is_training, size=hp.encoder_lstm_units,
                            zoneout=hp.tacotron_zoneout_rate, scope='encoder_LSTM'))

                    encoder_outputs = encoder_cell(embedded_inputs, tower_input_lengths[i])

                    #For shape visualization purpose
                    enc_conv_output_shape = encoder_cell.conv_output_shape

                    #Decoder Parts
                    #Attention Decoder Prenet
                    prenet = Prenet(is_training, layers_sizes=hp.prenet_layers, drop_rate=hp.tacotron_dropout_rate, scope='decoder_prenet')
                    #Attention Mechanism
                    attention_mechanism = LocationSensitiveAttention(hp.attention_dim, encoder_outputs, hparams=hp, is_training=is_training,
                        mask_encoder=hp.mask_encoder, memory_sequence_length=tf.reshape(tower_input_lengths[i], [-1]), smoothing=hp.smoothing,
                        cumulate_weights=hp.cumulative_weights)
                    #Decoder LSTM Cells
                    decoder_lstm = DecoderRNN(is_training, layers=hp.decoder_layers,
                        size=hp.decoder_lstm_units, zoneout=hp.tacotron_zoneout_rate, scope='decoder_LSTM')
                    #Frames Projection layer
                    frame_projection = FrameProjection(hp.num_mels * hp.outputs_per_step, scope='linear_transform_projection')
                    #<stop_token> projection layer
                    stop_projection = StopProjection(is_training or is_evaluating, shape=hp.outputs_per_step, scope='stop_token_projection')

                    #Decoder Cell ==> [batch_size, decoder_steps, num_mels * r] (after decoding)
                    decoder_cell = TacotronDecoderCell(
                        prenet,
                        attention_mechanism,
                        decoder_lstm,
                        frame_projection,
                        stop_projection)

                    #Define the helper for our decoder
                    if is_training or is_evaluating or gta:
                        self.helper = TacoTrainingHelper(batch_size, tower_mel_targets[i], hp, gta, is_evaluating, global_step)
                    else:
                        self.helper = TacoTestHelper(batch_size, hp)

                    #initial decoder state
                    decoder_init_state = decoder_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

                    #Only use max iterations at synthesis time
                    max_iters = hp.max_iters if not (is_training or is_evaluating) else None

                    #Decode
                    (frames_prediction, stop_token_prediction, _), final_decoder_state, _ = dynamic_decode(
                        CustomDecoder(decoder_cell, self.helper, decoder_init_state),
                        impute_finished=False,
                        maximum_iterations=max_iters,
                        swap_memory=hp.tacotron_swap_with_cpu)

                    # Reshape outputs to be one output per entry
                    #==> [batch_size, non_reduced_decoder_steps (decoder_steps * r), num_mels]
                    decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels])
                    stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1])

                    if hp.clip_outputs:
                        decoder_output = tf.minimum(tf.maximum(decoder_output, T2_output_range[0] - hp.lower_bound_decay), T2_output_range[1])

                    #Postnet
                    postnet = Postnet(is_training, hparams=hp, scope='postnet_convolutions')

                    #Compute residual using post-net ==> [batch_size, decoder_steps * r, postnet_channels]
                    residual = postnet(decoder_output)

                    #Project residual to same dimension as mel spectrogram
                    #==> [batch_size, decoder_steps * r, num_mels]
                    residual_projection = FrameProjection(hp.num_mels, scope='postnet_projection')
                    projected_residual = residual_projection(residual)

                    #Compute the mel spectrogram
                    mel_outputs = decoder_output + projected_residual

                    if hp.clip_outputs:
                        mel_outputs = tf.minimum(tf.maximum(mel_outputs, T2_output_range[0] - hp.lower_bound_decay), T2_output_range[1])

                    if post_condition:
                        # Add post-processing CBHG. This does a great job at extracting features from mels before projection to Linear specs.
                        post_cbhg = CBHG(hp.cbhg_kernels, hp.cbhg_conv_channels, hp.cbhg_pool_size, [hp.cbhg_projection, hp.num_mels],
                            hp.cbhg_projection_kernel_size, hp.cbhg_highwaynet_layers,
                            hp.cbhg_highway_units, hp.cbhg_rnn_units, hp.batch_norm_position, is_training, name='CBHG_postnet')

                        #[batch_size, decoder_steps(mel_frames), cbhg_channels]
                        post_outputs = post_cbhg(mel_outputs, None)

                        #Linear projection of extracted features to make linear spectrogram
                        linear_specs_projection = FrameProjection(hp.num_freq, scope='cbhg_linear_specs_projection')

                        #[batch_size, decoder_steps(linear_frames), num_freq]
                        linear_outputs = linear_specs_projection(post_outputs)

                        if hp.clip_outputs:
                            linear_outputs = tf.minimum(tf.maximum(linear_outputs, T2_output_range[0] - hp.lower_bound_decay), T2_output_range[1])

                    #Grab alignments from the final decoder state
                    alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0])

                    # Collect this tower's endpoints for loss computation and logging.
                    self.tower_decoder_output.append(decoder_output)
                    self.tower_alignments.append(alignments)
                    self.tower_stop_token_prediction.append(stop_token_prediction)
                    self.tower_mel_outputs.append(mel_outputs)
                    tower_embedded_inputs.append(embedded_inputs)
                    tower_enc_conv_output_shape.append(enc_conv_output_shape)
                    tower_encoder_outputs.append(encoder_outputs)
                    tower_residual.append(residual)
                    tower_projected_residual.append(projected_residual)

                    if post_condition:
                        self.tower_linear_outputs.append(linear_outputs)
            log('initialisation done {}'.format(gpus[i]))

        if is_training:
            # Teacher-forcing ratio exposed for summaries.
            self.ratio = self.helper._ratio
        self.tower_inputs = tower_inputs
        self.tower_input_lengths = tower_input_lengths
        self.tower_mel_targets = tower_mel_targets
        self.tower_linear_targets = tower_linear_targets
        self.tower_targets_lengths = tower_targets_lengths
        self.tower_stop_token_targets = tower_stop_token_targets

        self.all_vars = tf.trainable_variables()

        log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
        log(' Train mode: {}'.format(is_training))
        log(' Eval mode: {}'.format(is_evaluating))
        log(' GTA mode: {}'.format(gta))
        log(' Synthesis mode: {}'.format(not (is_training or is_evaluating)))
        log(' Input: {}'.format(inputs.shape))
        for i in range(0,hp.tacotron_num_gpus):
            log(' device: {}'.format(i))
            log(' embedding: {}'.format(tower_embedded_inputs[i].shape))
            log(' enc conv out: {}'.format(tower_enc_conv_output_shape[i]))
            log(' encoder out: {}'.format(tower_encoder_outputs[i].shape))
            log(' decoder out: {}'.format(self.tower_decoder_output[i].shape))
            log(' residual out: {}'.format(tower_residual[i].shape))
            log(' projected residual out: {}'.format(tower_projected_residual[i].shape))
            log(' mel out: {}'.format(self.tower_mel_outputs[i].shape))
            if post_condition:
                log(' linear out: {}'.format(self.tower_linear_outputs[i].shape))
            log(' <stop_token> out: {}'.format(self.tower_stop_token_prediction[i].shape))

        #1_000_000 is causing syntax problems for some people?! Python please :)
        log(' Tacotron Parameters {:.3f} Million.'.format(np.sum([np.prod(v.get_shape().as_list()) for v in self.all_vars]) / 1000000))
def add_loss(self):
'''Adds loss to the model. Sets "loss" field. initialize must have been called.'''
hp = self._hparams
self.tower_before_loss = []
self.tower_after_loss= []
self.tower_stop_token_loss = []
self.tower_regularization_loss = []
self.tower_linear_loss = []
self.tower_loss = []
total_before_loss = 0
total_after_loss= 0
total_stop_token_loss = 0
total_regularization_loss = 0
total_linear_loss = 0
total_loss = 0
gpus = ["/gpu:{}".format(i) for i in range(0,hp.tacotron_num_gpus)]
for i in range(0,hp.tacotron_num_gpus):
with tf.device(tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0", worker_device=gpus[i])):
with tf.variable_scope('loss') as scope:
if hp.mask_decoder:
# Compute loss of predictions before postnet
before = MaskedMSE(self.tower_mel_targets[i], self.tower_decoder_output[i], self.tower_targets_lengths[i],
hparams=self._hparams)
# Compute loss after postnet
after = MaskedMSE(self.tower_mel_targets[i], self.tower_mel_outputs[i], self.tower_targets_lengths[i],
hparams=self._hparams)
#Compute <stop_token> loss (for learning dynamic generation stop)
stop_token_loss = MaskedSigmoidCrossEntropy(self.tower_stop_token_targets[i],
self.tower_stop_token_prediction[i], self.tower_targets_lengths[i], hparams=self._hparams)
#Compute masked linear loss
if hp.predict_linear:
#Compute Linear L1 mask loss (priority to low frequencies)
linear_loss = MaskedLinearLoss(self.tower_linear_targets[i], self.tower_linear_outputs[i],
self.targets_lengths, hparams=self._hparams)
else:
linear_loss=0.
else:
# Compute loss of predictions before postnet
before = tf.losses.mean_squared_error(self.tower_mel_targets[i], self.tower_decoder_output[i])
# Compute loss after postnet
after = tf.losses.mean_squared_error(self.tower_mel_targets[i], self.tower_mel_outputs[i])
#Compute <stop_token> loss (for learning dynamic generation stop)
stop_token_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
labels=self.tower_stop_token_targets[i],
logits=self.tower_stop_token_prediction[i]))
if hp.predict_linear:
#Compute linear loss
#From https://github.com/keithito/tacotron/blob/tacotron2-work-in-progress/models/tacotron.py
#Prioritize loss for frequencies under 2000 Hz.
l1 = tf.abs(self.tower_linear_targets[i] - self.tower_linear_outputs[i])
n_priority_freq = int(2000 / (hp.sample_rate * 0.5) * hp.num_freq)
linear_loss = 0.5 * tf.reduce_mean(l1) + 0.5 * tf.reduce_mean(l1[:,:,0:n_priority_freq])
else:
linear_loss = 0.
# Compute the regularization weight
if hp.tacotron_scale_regularization:
reg_weight_scaler = 1. / (2 * hp.max_abs_value) if hp.symmetric_mels else 1. / (hp.max_abs_value)
reg_weight = hp.tacotron_reg_weight * reg_weight_scaler
else:
reg_weight = hp.tacotron_reg_weight
# Regularize variables
# Exclude all types of bias, RNN (Bengio et al. On the difficulty of training recurrent neural networks), embeddings and prediction projection layers.
# Note that we consider attention mechanism v_a weights as a prediction projection layer and we don't regularize it. (This gave better stability)
regularization = tf.add_n([tf.nn.l2_loss(v) for v in self.all_vars
if not('bias' in v.name or 'Bias' in v.name or '_projection' in v.name or 'inputs_embedding' in v.name
or 'RNN' in v.name or 'LSTM' in v.name)]) * reg_weight
# Compute final loss term
self.tower_before_loss.append(before)
self.tower_after_loss.append(after)
self.tower_stop_token_loss.append(stop_token_loss)
self.tower_regularization_loss.append(regularization)
self.tower_linear_loss.append(linear_loss)
tower_loss = before + after + stop_token_loss + regularization + linear_loss
self.tower_loss.append(tower_loss)
total_before_loss += before
total_after_loss += after
total_stop_token_loss += stop_token_loss
total_regularization_loss += regularization
total_linear_loss += linear_loss
total_loss += tower_loss
self.before_loss = total_before_loss / hp.tacotron_num_gpus
self.after_loss = total_after_loss / hp.tacotron_num_gpus
self.stop_token_loss = total_stop_token_loss / hp.tacotron_num_gpus
self.regularization_loss = total_regularization_loss / hp.tacotron_num_gpus
self.linear_loss = total_linear_loss / hp.tacotron_num_gpus
self.loss = total_loss / hp.tacotron_num_gpus
    def add_optimizer(self, global_step):
        '''Adds optimizer. Sets "gradients" and "optimize" fields. add_loss must have been called.

        Args:
            global_step: int32 scalar Tensor representing current global step in training
        '''
        hp = self._hparams
        tower_gradients = []

        # 1. Declare GPU Devices
        gpus = ["/gpu:{}".format(i) for i in range(0,hp.tacotron_num_gpus)]

        # Gradients are averaged on CPU when multi-GPU, otherwise on the single GPU.
        grad_device = '/cpu:0' if hp.tacotron_num_gpus > 1 else gpus[0]

        with tf.device(grad_device):
            with tf.variable_scope('optimizer') as scope:
                # Optional exponential learning-rate decay (see _learning_rate_decay,
                # which reads self.decay_steps / self.decay_rate set here).
                if hp.tacotron_decay_learning_rate:
                    self.decay_steps = hp.tacotron_decay_steps
                    self.decay_rate = hp.tacotron_decay_rate
                    self.learning_rate = self._learning_rate_decay(hp.tacotron_initial_learning_rate, global_step)
                else:
                    self.learning_rate = tf.convert_to_tensor(hp.tacotron_initial_learning_rate)

                optimizer = tf.train.AdamOptimizer(self.learning_rate, hp.tacotron_adam_beta1,
                    hp.tacotron_adam_beta2, hp.tacotron_adam_epsilon)

        # 2. Compute Gradient
        for i in range(0,hp.tacotron_num_gpus):
            # Device placement
            with tf.device(tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0", worker_device=gpus[i])):
                with tf.variable_scope('optimizer') as scope:
                    # Fine-tuning freezes embeddings and encoder variables by excluding
                    # them from the gradient computation.
                    update_vars = [v for v in self.all_vars if not ('inputs_embedding' in v.name or 'encoder_' in v.name)] if hp.tacotron_fine_tuning else None
                    gradients = optimizer.compute_gradients(self.tower_loss[i], var_list=update_vars)
                    tower_gradients.append(gradients)

        # 3. Average Gradient
        with tf.device(grad_device):
            avg_grads = []
            variables = []
            # zip(*tower_gradients) groups the same variable's (grad, var) pairs
            # across towers so they can be averaged per variable.
            for grad_and_vars in zip(*tower_gradients):
                # each_grads_vars = ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
                grads = []
                for g,_ in grad_and_vars:
                    expanded_g = tf.expand_dims(g, 0)
                    # Append on a 'tower' dimension which we will average over below.
                    grads.append(expanded_g)
                # Average over the 'tower' dimension.
                grad = tf.concat(axis=0, values=grads)
                grad = tf.reduce_mean(grad, 0)

                # Variables are shared across towers; take the first tower's handle.
                v = grad_and_vars[0][1]
                avg_grads.append(grad)
                variables.append(v)

            self.gradients = avg_grads
            #Just for caution
            #https://github.com/Rayhane-mamah/Tacotron-2/issues/11
            if hp.tacotron_clip_gradients:
                clipped_gradients, _ = tf.clip_by_global_norm(avg_grads, 1.) # __mark 0.5 refer
            else:
                clipped_gradients = avg_grads

            # Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. See:
            # https://github.com/tensorflow/tensorflow/issues/1122
            with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
                self.optimize = optimizer.apply_gradients(zip(clipped_gradients, variables),
                    global_step=global_step)
    def _learning_rate_decay(self, init_lr, global_step):
        '''Build the decayed learning-rate tensor.

        Requires self.decay_steps and self.decay_rate to have been set by
        add_optimizer before this is called.
        '''
        #################################################################
        # Narrow Exponential Decay:

        # Phase 1: lr = 1e-3
        # We only start learning rate decay after 50k steps

        # Phase 2: lr in ]1e-5, 1e-3[
        # decay reach minimal value at step 310k

        # Phase 3: lr = 1e-5
        # clip by minimal learning rate value (step > 310k)
        #################################################################
        hp = self._hparams

        #Compute natural exponential decay
        lr = tf.train.exponential_decay(init_lr,
            global_step - hp.tacotron_start_decay, #lr = 1e-3 at step 50k
            self.decay_steps,
            self.decay_rate, #lr = 1e-5 around step 310k
            name='lr_exponential_decay')

        #clip learning rate by max and min values (initial and final values)
        return tf.minimum(tf.maximum(lr, hp.tacotron_final_learning_rate), init_lr)