#!/bin/bash
# Copyright Tsinghua (Author:YiLiu earlier than 2018-10-01)
# xmuspeech (Author:Snowdar 2018-10-01)
set -e
stage=0
endstage=8
phonetic_train_stage=-10
xv_train_stage=0 # should always be >= 0 in this case
use_gpu=true
clean=true
remove_egs=true
cmn=true # apply sliding-window CMN when getting egs
sleep_time=3
model_limit=8
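force=false # stage 8 re-extracts xvectors when this is true; defined here so parse_options.sh can override it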
phonetic_min_len=20
phonetic_lr_factor=0.2 # if phonetic_lr_factor=0, the parameters of layers coming from the phonetic raw model are frozen (not updated) during training
xv_min_chunk=60
xv_max_chunk=80 # equal to xv_min_len: utterance length should always be >= xv_max_chunk
num_archives=150
xvTrainData=data/plp_20_5.0/baseTrain_concat_volume_sp
phoneticTrainData=data/plp_20_5.0/thchs30_train # the feature type and dim of the two training sets must be consistent
phoneticAliDir=exp/thchs30_train_dnn_ali # generate the alignments from an acoustic model yourself
outputname=base_phonetic_xv_plp_20_5.0_cmn # just an output name; the real output path is exp/$outputname
. subtools/path.sh
. subtools/kaldi/utils/parse_options.sh
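# Example invocation (a sketch; parse_options.sh maps an option like --phonetic-lr-factor
# to the variable $phonetic_lr_factor, so any variable defined above can be overridden;
# run from the directory that contains subtools/):
# bash runPhoneticXvector.sh --stage 0 --endstage 8 --use-gpu true --phonetic-lr-factor 0.2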
########## auto variables ################
phonetic_nnet_dir=exp/$outputname/phonetic
phonetic_egs_dir=exp/$outputname/phonetic/egs
xv_nnet_dir=exp/$outputname/xvector
xv_egs_dir=exp/$outputname/xvector/egs
mkdir -p $phonetic_nnet_dir
mkdir -p $xv_nnet_dir
echo -e "SleepTime=$sleep_time\nLimit=$model_limit" > $xv_nnet_dir/control.conf
xv_feat_dim=$(feat-to-dim scp:$xvTrainData/feats.scp -) || exit 1
phonetic_feat_dim=$(feat-to-dim scp:$phoneticTrainData/feats.scp -) || exit 1
[ "$xv_feat_dim" != "$phonetic_feat_dim" ] && echo "[exit] Feature dim of $xvTrainData is not equal to that of $phoneticTrainData" && exit 1
feat_dim=$xv_feat_dim
#### stage --> go #####
if [[ $stage -le 0 && 0 -le $endstage ]];then
echo "[stage 0] Prepare xvTrainData dir with no nonspeech frames"
rm -rf ${xvTrainData}_nosil
rm -rf exp/features/${xvTrainData}_nosil
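# prepare_feats_for_egs.sh applies sliding-window CMN and removes non-speech frames
# according to the VAD decisions, hence the _nosil suffix on the output dir.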
subtools/kaldi/sid/nnet3/xvector/prepare_feats_for_egs.sh --nj 20 --cmd "run.pl" \
$xvTrainData ${xvTrainData}_nosil exp/features/${xvTrainData}_nosil
fi
if [[ $stage -le 1 && 1 -le $endstage ]];then
echo "[stage 1] Remove utterances whose length is less than the lower limit"
subtools/removeUtt.sh ${phoneticTrainData} $phonetic_min_len
subtools/removeUtt.sh ${xvTrainData}_nosil $xv_max_chunk
fi
## phonetic ##
##############################################
phonetic_end_node=phonetic_tdnn5
if [[ $stage -le 2 && 2 -le $endstage ]];then
echo "[stage 2] Prepare phonetic network config"
phonetic_num_targets=$(tree-info $phoneticAliDir/tree | grep num-pdfs | awk '{print $2}') || exit 1
mkdir -p $phonetic_nnet_dir/configs
cat <<EOF > $phonetic_nnet_dir/configs/network.xconfig
input dim=$feat_dim name=input
relu-batchnorm-layer name=phonetic_tdnn1 dim=650 input=Append(-2,-1,0,1,2)
relu-batchnorm-layer name=phonetic_tdnn2 dim=650 input=Append(-1,0,1)
relu-batchnorm-layer name=phonetic_tdnn3 dim=650 input=Append(-1,0,1)
relu-batchnorm-layer name=phonetic_tdnn4 dim=650 input=Append(-3,0,3)
relu-batchnorm-layer name=$phonetic_end_node dim=128 input=Append(-6,-3,0)
output-layer name=output dim=$phonetic_num_targets max-change=1.5
EOF
subtools/kaldi/steps/nnet3/xconfig_to_configs.py \
--xconfig-file $phonetic_nnet_dir/configs/network.xconfig \
--config-dir $phonetic_nnet_dir/configs
cp $phonetic_nnet_dir/configs/final.config $phonetic_nnet_dir/nnet.config
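# For reference: the spliced inputs above give this network a total left context of
# 2+1+1+3+6 = 13 frames and a right context of 2+1+1+3+0 = 7 frames; stage 3 reads these
# back from configs/vars as model_left_context/model_right_context.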
fi
if [[ $stage -le 3 && 3 -le $endstage ]];then
echo "[stage 3] get egs for training phonetic nnet3 model"
left_context=$(grep 'model_left_context' $phonetic_nnet_dir/configs/vars | cut -d '=' -f 2)
right_context=$(grep 'model_right_context' $phonetic_nnet_dir/configs/vars | cut -d '=' -f 2)
frame_subsampling_factor=1
[ -f $phoneticAliDir/frame_subsampling_factor ] && frame_subsampling_factor=$(awk '{print $1}' $phoneticAliDir/frame_subsampling_factor)
subtools/kaldi/sid/nnet3/get_egs.sh --cmd "run.pl" \
--nj 10 \
--stage 0 \
--cmn $cmn \
--frame-subsampling-factor $frame_subsampling_factor \
--vad true \
--frames-per-eg 1 \
--left-context $left_context \
--right-context $right_context \
${phoneticTrainData} $phoneticAliDir $phonetic_egs_dir
# train_raw_dnn.py detects multitask egs by the presence of valid_diagnostic.scp,
# so delete it here to indicate training without multitask.
rm -f $phonetic_egs_dir/valid_diagnostic.scp
fi
if [[ $stage -le 4 && 4 -le $endstage ]];then
echo "[stage 4] train phonetic nnet3 raw model"
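# With frames-per-eg=1, the phonetic model is trained as a frame-level classifier
# over the pdf (senone) targets taken from $phoneticAliDir.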
subtools/kaldi/steps/nnet3/train_raw_dnn.py --stage=$phonetic_train_stage \
--cmd="run.pl" \
--trainer.optimization.num-jobs-initial=2 \
--trainer.optimization.num-jobs-final=8 \
--trainer.optimization.initial-effective-lrate=0.0015 \
--trainer.optimization.final-effective-lrate=0.00015 \
--trainer.optimization.minibatch-size=256,128 \
--trainer.srand=123 \
--trainer.max-param-change=2 \
--trainer.num-epochs=3 \
--egs.frames-per-eg=1 \
--egs.dir="$phonetic_egs_dir" \
--cleanup=true \
--cleanup.remove-egs=$remove_egs \
--cleanup.preserve-model-interval=10 \
--use-gpu=$use_gpu \
--dir=$phonetic_nnet_dir || exit 1;
fi
## xvector ##
##############################################
if [[ $stage -le 5 && 5 -le $endstage ]];then
echo "[stage 5] get egs for training xvector nnet3 model"
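# Each example is a chunk of $xv_min_chunk to $xv_max_chunk frames cut from a single
# utterance and labeled with its speaker; --num-repeats is (roughly) how many times
# each utterance is reused when filling the $num_archives archives.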
subtools/kaldi/sid/nnet3/xvector/get_egs.sh --cmd "run.pl" \
--nj 20 \
--stage 0 \
--num-train-archives $num_archives \
--frames-per-iter-diagnostic 100000 \
--min-frames-per-chunk $xv_min_chunk \
--max-frames-per-chunk $xv_max_chunk \
--num-diagnostic-archives 3 \
--num-repeats 6000 \
"${xvTrainData}_nosil" $xv_egs_dir
fi
if [[ $stage -le 6 && 6 -le $endstage ]];then
echo "[stage 6] prepare xvector network config based on the phonetic network config and init the joint nnet3 raw model"
xv_num_targets=$(wc -w $xv_egs_dir/pdf2num | awk '{print $1}')
max_chunk_size=10000
min_chunk_size=25
mkdir -p $xv_nnet_dir/configs
cat <<EOF > $xv_nnet_dir/configs/network.xconfig
# please note that it is important to have an input layer with the name=input
# The frame-level layers
input dim=${feat_dim} name=input
relu-batchnorm-layer name=tdnn1 input=Append(-2,-1,0,1,2) dim=512
relu-batchnorm-layer name=tdnn2 input=Append(-2,0,2) dim=512
relu-batchnorm-layer name=tdnn3 input=Append(-3,0,3) dim=512
relu-batchnorm-layer name=tdnn4 dim=512
relu-batchnorm-layer name=tdnn5 dim=1500 input=Append(tdnn4,${phonetic_end_node}.batchnorm)
# The stats pooling layer. Layers after this are segment-level.
# In the config below, the first and last argument (0, and ${max_chunk_size})
# means that we pool over an input segment starting at frame 0
# and ending at frame ${max_chunk_size} or earlier. The other arguments (1:1)
# mean that no subsampling is performed.
stats-layer name=stats config=mean+stddev(0:1:1:${max_chunk_size})
# This is where we usually extract the embedding (aka xvector) from.
relu-batchnorm-layer name=tdnn6 dim=512 input=stats
# This is another layer from which the embedding could be extracted,
# but usually the previous one works better.
relu-batchnorm-layer name=tdnn7 dim=512
output-layer name=output include-log-softmax=true dim=${xv_num_targets}
EOF
# The --existing-model option makes sure the config can be parsed successfully, because
# ${phonetic_end_node}.batchnorm does not exist in this network config by itself.
subtools/kaldi/steps/nnet3/xconfig_to_configs.py \
--existing-model $phonetic_nnet_dir/final.raw \
--xconfig-file $xv_nnet_dir/configs/network.xconfig \
--config-dir $xv_nnet_dir/configs
# If phonetic_lr_factor=0, the parameters of layers coming from the phonetic raw model are frozen during training.
# By nnet3-init (nnet3-copy could also do it), we get a new nnet3 network which joins the
# phonetic and xvector networks.
run.pl $xv_nnet_dir/log/generate_input_mdl.log \
nnet3-copy --edits="set-learning-rate-factor name=* learning-rate-factor=$phonetic_lr_factor" \
$phonetic_nnet_dir/final.raw - \| \
nnet3-init --srand=1 - $xv_nnet_dir/configs/final.config $xv_nnet_dir/input.raw || exit 1;
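# Optional sanity check (a sketch): list the joint network's components to confirm
# that both the phonetic_tdnn* layers and the new tdnn* layers are present:
# nnet3-info $xv_nnet_dir/input.raw | head -n 40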
# some configs for extracting xvectors
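# Each extract_*.config redirects the output node to the affine output of the chosen
# embedding layer, so extract_xvectors.sh emits that layer's activations instead of posteriors.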
echo "output-node name=output input=tdnn6.affine" > $xv_nnet_dir/extract_tdnn6.config
cp -f $xv_nnet_dir/extract_tdnn6.config $xv_nnet_dir/extract.config
echo "output-node name=output input=tdnn7.affine" > $xv_nnet_dir/extract_tdnn7.config
echo "$max_chunk_size" > $xv_nnet_dir/max_chunk_size
echo "$min_chunk_size" > $xv_nnet_dir/min_chunk_size
fi
if [[ $stage -le 7 && 7 -le $endstage ]]; then
echo "[stage 7] train xvector nnet3 model with some phonetic hidden layers"
dropout_schedule='0,0@0.20,0.1@0.50,0'
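# Schedule format: comma-separated value@train_fraction points with linear interpolation;
# dropout stays at 0 until 20% of training, ramps to 0.1 at 50%, then back to 0 at the end.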
srand=123
# Training iterates from 0.raw, which is a copy of input.raw; make sure xv_train_stage starts from 0 rather than -10.
cp -f $xv_nnet_dir/input.raw $xv_nnet_dir/0.raw
subtools/kaldi/steps/nnet3/train_raw_dnn.py --stage=$xv_train_stage \
--cmd="run.pl" \
--trainer.optimization.proportional-shrink 10 \
--trainer.optimization.momentum=0.5 \
--trainer.optimization.num-jobs-initial=2 \
--trainer.optimization.num-jobs-final=8 \
--trainer.optimization.initial-effective-lrate=0.001 \
--trainer.optimization.final-effective-lrate=0.0001 \
--trainer.optimization.minibatch-size=128 \
--trainer.srand=$srand \
--trainer.max-param-change=2 \
--trainer.num-epochs=3 \
--trainer.dropout-schedule="$dropout_schedule" \
--trainer.shuffle-buffer-size=1000 \
--egs.frames-per-eg=1 \
--egs.dir=$xv_egs_dir \
--cleanup=true \
--cleanup.remove-egs=$remove_egs \
--cleanup.preserve-model-interval=500 \
--use-gpu=$use_gpu \
--dir=$xv_nnet_dir || exit 1;
fi
if [[ -f $xv_nnet_dir/final.raw && "$clean" == "true" ]];then
rm -f $xv_egs_dir/egs*
rm -f $phonetic_egs_dir/egs*
rm -rf ${xvTrainData}_nosil
rm -rf exp/features/${xvTrainData}_nosil
fi
if [[ $stage -le 8 && 8 -le $endstage ]]; then
echo "[stage 8] extract phonetic-xvectors of several datasets"
prefix=plp_20_5.0
toEXdata="baseTrain_volume_sp test_1s_concat_sp"
layer="tdnn6"
nj=20
gpu=false
cache=1000
for x in $toEXdata ;do
for y in $layer ;do
num=0
[ -f $xv_nnet_dir/$y/$x/xvector.scp ] && num=$(grep ERROR $xv_nnet_dir/$y/$x/log/extract.*.log | wc -l)
[[ "$force" == "true" || ! -f $xv_nnet_dir/$y/$x/xvector.scp || $num -gt 0 ]] && \
subtools/kaldi/sid/nnet3/xvector/extract_xvectors.sh --cache-capacity $cache --extract-config extract_${y}.config \
--use-gpu $gpu --nj $nj $xv_nnet_dir data/${prefix}/$x $xv_nnet_dir/$y/$x
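# create an empty marker file named after the feature prefix (apparently used as a tag
# recording which features these xvectors were extracted from)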
> $xv_nnet_dir/$y/$x/$prefix
echo "$y layer embeddings of data/$prefix/$x extracted."
done
done
fi