From 6b0938a61d597ca248f5e7eab97995cfbbe20d85 Mon Sep 17 00:00:00 2001 From: Meixu Song Date: Fri, 19 Jun 2020 23:30:02 +0800 Subject: [PATCH] [scripts,egs] modify chain2 script (add ivector, diagnosis log, compute_wer.sh), and add wer results for mini_librispeech and wsj (#4001) --- .gitpod.Dockerfile | 22 + .gitpod.yml | 7 + README.md | 1 + egs/mini_librispeech/s5/cmd.sh | 1 + .../s5/local/chain2/compare_wer.sh | 137 ++++++ .../s5/local/chain2/data_prep_common.sh | 78 --- .../s5/local/chain2/tuning/run_tdnn_1a.sh | 101 ++-- egs/wsj/s5/local/chain2/compare_wer.sh | 143 ++++++ egs/wsj/s5/local/chain2/run_tdnn.sh | 1 + egs/wsj/s5/local/chain2/tuning/run_tdnn_1i.sh | 463 ++++++++++++++++++ egs/wsj/s5/run.sh | 2 +- egs/wsj/s5/steps/nnet3/chain2/train.sh | 47 +- 12 files changed, 892 insertions(+), 111 deletions(-) create mode 100644 .gitpod.Dockerfile create mode 100644 .gitpod.yml create mode 100755 egs/mini_librispeech/s5/local/chain2/compare_wer.sh delete mode 100755 egs/mini_librispeech/s5/local/chain2/data_prep_common.sh create mode 100755 egs/wsj/s5/local/chain2/compare_wer.sh create mode 120000 egs/wsj/s5/local/chain2/run_tdnn.sh create mode 100755 egs/wsj/s5/local/chain2/tuning/run_tdnn_1i.sh diff --git a/.gitpod.Dockerfile b/.gitpod.Dockerfile new file mode 100644 index 00000000000..53c8237d4e8 --- /dev/null +++ b/.gitpod.Dockerfile @@ -0,0 +1,22 @@ +FROM gitpod/workspace-full + +USER songmeixu + +# Install custom tools, runtime, etc. using apt-get +# For example, the command below would install "bastet" - a command line tetris clone: +# +# RUN sudo apt-get -q update && # sudo apt-get install -yq bastet && # sudo rm -rf /var/lib/apt/lists/* +# +# More information: https://www.gitpod.io/docs/config-docker/ + +FROM gitpod/workspace-full + +RUN sudo apt-get update \ + && sudo apt-get install -y \ + sox gfortran \ + && sudo rm -rf /var/lib/apt/lists/* + +RUN cd ~/GitHub/r-with-intel-mkl/ \ + && wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB | sudo apt-key add - \ + && sudo sh -c 'echo deb https://apt.repos.intel.com/mkl all main > /etc/apt/sources.list.d/intel-mkl.list' \ + && sudo apt-get update && sudo apt-get install intel-mkl-64bit diff --git a/.gitpod.yml b/.gitpod.yml new file mode 100644 index 00000000000..9e8cb2b79f2 --- /dev/null +++ b/.gitpod.yml @@ -0,0 +1,7 @@ +image: + file: .gitpod.dockerfile + +tasks: + - init: echo "Replace me with a build script for the project." + command: echo "Replace me with something that should run on every start, or just + remove me entirely." 
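If one wants to sanity-check the workspace image outside Gitpod, it can be built with plain Docker along these lines (a sketch; the tag name is arbitrary, and note that '.gitpod.yml' must reference the Dockerfile by its exact, case-sensitive filename):

    docker build -f .gitpod.Dockerfile -t kaldi-gitpod-test .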
diff --git a/README.md b/README.md index 963b82ed427..ae0ceeeebae 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ [![Build Status](https://travis-ci.com/kaldi-asr/kaldi.svg?branch=master)](https://travis-ci.com/kaldi-asr/kaldi) +[![Gitpod Ready-to-Code](https://img.shields.io/badge/Gitpod-Ready--to--Code-blue?logo=gitpod)](https://gitpod.io/#https://github.com/kaldi-asr/kaldi) Kaldi Speech Recognition Toolkit ================================ diff --git a/egs/mini_librispeech/s5/cmd.sh b/egs/mini_librispeech/s5/cmd.sh index 71dd849a93b..ed453ff8416 100644 --- a/egs/mini_librispeech/s5/cmd.sh +++ b/egs/mini_librispeech/s5/cmd.sh @@ -13,3 +13,4 @@ export train_cmd="queue.pl --mem 2G" export decode_cmd="queue.pl --mem 4G" export mkgraph_cmd="queue.pl --mem 8G" +export cuda_cmd="queue.pl --gpu 1" diff --git a/egs/mini_librispeech/s5/local/chain2/compare_wer.sh b/egs/mini_librispeech/s5/local/chain2/compare_wer.sh new file mode 100755 index 00000000000..c14d2031f8f --- /dev/null +++ b/egs/mini_librispeech/s5/local/chain2/compare_wer.sh @@ -0,0 +1,137 @@ +#!/usr/bin/env bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/tdnn_{c,d}_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/chain/compare_wer.sh exp/chain/tdnn_c_sp exp/chain/tdnn_c_sp_smbr:{1,2,3} + + +if [ $# == 0 ]; then + echo "Usage: $0: [--looped] [--online] [ ... ]" + echo "e.g.: $0 exp/chain/tdnn_{b,c}_sp" + echo "or (with epoch numbers for discriminative training):" + echo "$0 exp/chain/tdnn_b_sp_disc:{1,2,3}" + exit 1 +fi + +echo "# $0 $*" + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. 
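As a concrete illustration (directory names are hypothetical), the brace form in the usage message is expanded by the shell, so each colon-suffixed argument arrives as a separate system and is split by the set_names function defined just below into a directory name and an epoch:

    local/chain2/compare_wer.sh exp/chain2/tdnn1a_sp exp/chain2/tdnn1a_sp_smbr:{1,2}
    # which the shell expands to:
    #   local/chain2/compare_wer.sh exp/chain2/tdnn1a_sp exp/chain2/tdnn1a_sp_smbr:1 exp/chain2/tdnn1a_sp_smbr:2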
+# If called with a colon-free directory name, like: +# set_names exp/chain/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=( + "#WER dev_clean_2 (tgsmall) " + "#WER dev_clean_2 (tglarge) ") + +for n in 0 1; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(tgsmall_dev_clean_2 tglarge_dev_clean_2) + + wer=$(cat $dirname/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat $dirname/decode_looped_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi + if $include_online; then + echo -n "# [online:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat ${dirname}_online/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/diagnostic_train.final.log | grep -v xent | awk '{printf("%.4f", $10)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/diagnostic_heldout.final.log | grep -v xent | awk '{printf("%.4f", $10)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/diagnostic_train.final.log | grep -w xent | awk '{printf("%.4f", $10)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/diagnostic_heldout.final.log | grep -w xent | awk '{printf("%.4f", $10)}') + printf "% 10s" $prob +done +echo + +echo -n "# Num-params " +for x in $*; do + printf "% 10s" $(grep num-parameters $x/log/progress.1.log | awk '{print $2}') +done +echo \ No newline at end of file diff --git a/egs/mini_librispeech/s5/local/chain2/data_prep_common.sh b/egs/mini_librispeech/s5/local/chain2/data_prep_common.sh deleted file mode 100755 index 21b36cce421..00000000000 --- a/egs/mini_librispeech/s5/local/chain2/data_prep_common.sh +++ /dev/null @@ -1,78 +0,0 @@ -#!/bin/bash -# Copyright 2019 Daniel Povey -# 2019 Srikanth Madikeri (Idiap Research Institute) - -set -euo pipefail - -# This script is called from local/chain/tuning/run_tdnn_2a.sh and -# similar scripts. It contains the common feature preparation and -# lattice-alignment preparation parts of the chaina training. -# See those scripts for examples of usage. - -stage=0 -train_set=train_clean_5 -test_sets="dev_clean_2" -gmm=tri3b - -. ./cmd.sh -. ./path.sh -. 
utils/parse_options.sh - -gmm_dir=exp/${gmm} -ali_dir=exp/${gmm}_ali_${train_set}_sp - -for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do - if [ ! -f $f ]; then - echo "$0: expected file $f to exist" - exit 1 - fi -done - -# Our default data augmentation method is 3-way speed augmentation followed by -# volume perturbation. We are looking into better ways of doing this, -# e.g. involving noise and reverberation. - -if [ $stage -le 1 ]; then - # Although the nnet will be trained by high resolution data, we still have to - # perturb the normal data to get the alignment. _sp stands for speed-perturbed - echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)" - utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp - echo "$0: making MFCC features for low-resolution speed-perturbed data" - steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 data/${train_set}_sp || exit 1; - steps/compute_cmvn_stats.sh data/${train_set}_sp || exit 1; - utils/fix_data_dir.sh data/${train_set}_sp -fi - -if [ $stage -le 2 ]; then - echo "$0: aligning with the perturbed low-resolution data" - steps/align_fmllr.sh --nj 20 --cmd "$train_cmd" \ - data/${train_set}_sp data/lang $gmm_dir $ali_dir || exit 1 -fi - -if [ $stage -le 3 ]; then - # Create high-resolution MFCC features (with 40 cepstra instead of 13). - # this shows how you can split across multiple file-systems. - echo "$0: creating high-resolution MFCC features" - mfccdir=data/${train_set}_sp_hires/data - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then - utils/create_split_dir.pl /export/fs0{1,2}/$USER/kaldi-data/mfcc/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage - fi - - for datadir in ${train_set}_sp ${test_sets}; do - utils/copy_data_dir.sh data/$datadir data/${datadir}_hires - done - - # do volume-perturbation on the training data prior to extracting hires - # features; this helps make trained nnets more invariant to test data volume. - utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires || exit 1; - - for datadir in ${train_set}_sp ${test_sets}; do - steps/make_mfcc.sh --nj 10 --mfcc-config conf/mfcc_hires.conf \ - --cmd "$train_cmd" data/${datadir}_hires || exit 1; - steps/compute_cmvn_stats.sh data/${datadir}_hires || exit 1; - utils/fix_data_dir.sh data/${datadir}_hires || exit 1; - done -fi - - -exit 0 diff --git a/egs/mini_librispeech/s5/local/chain2/tuning/run_tdnn_1a.sh b/egs/mini_librispeech/s5/local/chain2/tuning/run_tdnn_1a.sh index 2311fc0699e..ee97f2d93f4 100755 --- a/egs/mini_librispeech/s5/local/chain2/tuning/run_tdnn_1a.sh +++ b/egs/mini_librispeech/s5/local/chain2/tuning/run_tdnn_1a.sh @@ -1,9 +1,25 @@ -#!/bin/bash +#!/usr/bin/env bash # Copyright 2019 Srikanth Madikeri (Idiap Research Institute) -# +# # This script is a modification of local/chain/run_tdnn.sh adapted to the chain2 recipes. +# This is a basic TDNN experiment. +# run_tdnn_1a.sh in local/chain2 but uses new kaldi recipe. 
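Like the other chain recipes, this script assumes the standard mini_librispeech run.sh has already been run far enough to produce the tri3b system; a typical invocation from egs/mini_librispeech/s5 would then look roughly like the following (option names are the defaults declared further down in this script):

    local/chain2/tuning/run_tdnn_1a.sh --stage 0 --train-set train_clean_5 --gmm tri3b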
+ +# steps/info/chain_dir_info.pl exp/chain2/tdnn1a_sp +# exp/chain2/tdnn1a_sp: num-iters=6 nj=2..5 combine=-0.038->-0.033 (over 3) + +# local/chain2/compare_wer.sh exp/chain2/tdnn1a_sp +# System tdnn1a_sp +#WER dev_clean_2 (tgsmall) 17.50 +#WER dev_clean_2 (tglarge) 12.67 +# Final train prob -0.0626 +# Final valid prob -0.0539 +# Final train prob (xent) -1.5220 +# Final valid prob (xent) -1.3991 +# Num-params 10005600 + # Set -e here so that we catch if any executable fails immediately set -euo pipefail @@ -14,20 +30,19 @@ decode_nj=10 train_set=train_clean_5 test_sets=dev_clean_2 gmm=tri3b -srand=0 nnet3_affix= # The rest are configs specific to this script. Most of the parameters # are just hardcoded at this level, in the commands below. -affix=2c # affix for the TDNN directory name +affix=1a # affix for the TDNN directory name tree_affix= train_stage=-10 get_egs_stage=-10 +decode_iter= - +# training options # training chunk-options -chunk_width=140 -dropout_schedule='0,0@0.20,0.3@0.50,0' +chunk_width=140,100,160 xent_regularize=0.1 bottom_subsampling_factor=1 # I'll set this to 3 later, 1 is for compatibility with a broken ru. frame_subsampling_factor=3 @@ -45,6 +60,14 @@ egs_extra_right_context=5 # to group multiple speaker together in some cases). chunks_per_group=4 +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + # End configuration section. echo "$0 $@" # Print the command line for logging @@ -53,32 +76,35 @@ echo "$0 $@" # Print the command line for logging . ./path.sh . ./utils/parse_options.sh -# if ! cuda-compiled; then -# cat < $dir/configs/default.xconfig - input name=input dim=40 + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + # the first splicing is moved before the lda layer, so no splicing here - fixed-affine-layer name=lda input=Append(-2,-1,0,1,2) affine-transform-file=$dir/configs/lda.mat - relu-renorm-layer name=tdnn1 dim=512 input=Append(-2,-1,0,1,2) + relu-renorm-layer name=tdnn1 dim=512 relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) relu-renorm-layer name=tdnn3 dim=512 input=Append(-1,0,1) relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) @@ -245,11 +277,13 @@ if [ $stage -le 17 ]; then # Dump raw egs. 
steps/chain2/get_raw_egs.sh --cmd "$train_cmd" \ --lang "default" \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ --left-context $egs_left_context \ --right-context $egs_right_context \ --frame-subsampling-factor $frame_subsampling_factor \ --alignment-subsampling-factor $frame_subsampling_factor \ - --frames-per-chunk 140,100,160 \ + --frames-per-chunk ${chunk_width} \ + --online-ivector-dir ${train_ivector_dir} \ ${train_data_dir} ${dir} ${lat_dir} ${dir}/raw_egs fi @@ -298,7 +332,8 @@ if [ $stage -le 22 ]; then --xent-regularize $xent_regularize --leaky-hmm-coefficient 0.1 \ --max-param-change 2.0 \ --num-jobs-initial 2 --num-jobs-final 5 \ - $dir/egs $dir + --groups-per-minibatch 256,128,64 \ + $dir/egs $dir || exit 1; fi if [ $stage -le 23 ]; then @@ -310,23 +345,29 @@ if [ $stage -le 23 ]; then fi if [ $stage -le 24 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) # Do the speaker-dependent decoding pass test_sets=dev_clean_2 for data in $test_sets; do + ( nspk=$(wc -l [ ... ]" + echo "e.g.: $0 exp/chain/tdnn_{b,c}_sp" + echo "or (with epoch numbers for discriminative training):" + echo "$0 exp/chain/tdnn_b_sp_disc:{1,2,3}" + exit 1 +fi + +echo "# $0 $*" + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. +# If called with a colon-free directory name, like: +# set_names exp/chain/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=( + "#WER dev93 (tgpr) " + "#WER dev93 (tg) " + "#WER dev93 (big-dict,tgpr) " + "#WER dev93 (big-dict,fg) " + "#WER eval92 (tgpr) " + "#WER eval92 (tg) " + "#WER eval92 (big-dict,tgpr)" + "#WER eval92 (big-dict,fg) ") + +for n in 0 1 2 3 4 5 6 7; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(tgpr_dev93 tg_dev93 bd_tgpr_dev93 bd_tgpr_dev93_fg tgpr_eval92 tg_eval92 bd_tgpr_eval92 bd_tgpr_eval92_fg) + + wer=$(< $dirname/decode_${decode_names[$n]}/scoring_kaldi/best_wer utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(< $dirname/decode_looped_${decode_names[$n]}/scoring_kaldi/best_wer utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi + if $include_online; then + echo -n "# [online:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(< ${dirname}_online/decode_${decode_names[$n]}/scoring_kaldi/best_wer utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained 
systems. +fi + + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/diagnostic_train.final.log | grep -v xent | awk '{printf("%.4f", $10)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/diagnostic_heldout.final.log | grep -v xent | awk '{printf("%.4f", $10)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/diagnostic_train.final.log | grep -w xent | awk '{printf("%.4f", $10)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/diagnostic_heldout.final.log | grep -w xent | awk '{printf("%.4f", $10)}') + printf "% 10s" $prob +done +echo + +echo -n "# Num-params " +for x in $*; do + printf "% 10s" $(grep num-parameters $x/log/progress.1.log | awk '{print $2}') +done +echo diff --git a/egs/wsj/s5/local/chain2/run_tdnn.sh b/egs/wsj/s5/local/chain2/run_tdnn.sh new file mode 120000 index 00000000000..deb68d515d2 --- /dev/null +++ b/egs/wsj/s5/local/chain2/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1i.sh \ No newline at end of file diff --git a/egs/wsj/s5/local/chain2/tuning/run_tdnn_1i.sh b/egs/wsj/s5/local/chain2/tuning/run_tdnn_1i.sh new file mode 100755 index 00000000000..4a997a137de --- /dev/null +++ b/egs/wsj/s5/local/chain2/tuning/run_tdnn_1i.sh @@ -0,0 +1,463 @@ +#!/usr/bin/env bash + +# 1i is like 1h, while it introduces 'apply-cmvn-online' that does +# cmn normalization both for i-extractor and TDNN input. +# run_tdnn_1i.sh in local/chain2 uses new kaldi recipe. + +# local/chain2/compare_wer.sh exp/chain2_online_cmn/tdnn1i_sp +# System tdnn1i_sp +#WER dev93 (tgpr) 6.83 +#WER dev93 (tg) 6.53 +#WER dev93 (big-dict,tgpr) 4.71 +#WER dev93 (big-dict,fg) 4.31 +#WER eval92 (tgpr) 4.86 +#WER eval92 (tg) 4.43 +#WER eval92 (big-dict,tgpr) 2.71 +#WER eval92 (big-dict,fg) 2.27 +# Final train prob -0.0397 +# Final valid prob -0.0346 +# Final train prob (xent) -0.7091 +# Final valid prob (xent) -0.6436 +# Num-params 9476352 + +# steps/info/chain_dir_info.pl exp/chain_online_cmn/tdnn1i_sp +# exp/chain_online_cmn/tdnn1i_sp: num-iters=108 nj=2..8 num-params=8.4M dim=40+100->2880 combine=-0.044->-0.044 (over 1) xent:train/valid[71,107,final]=(-0.873,-0.660,-0.672/-0.906,-0.714,-0.734) logprob:train/valid[71,107,final]=(-0.067,-0.044,-0.044/-0.068,-0.054,-0.055) + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +train_set=train_si284 +test_sets="test_dev93 test_eval92" +gmm=tri4b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. + +num_threads_ubm=8 + +nj_extractor=10 +# It runs a JOB with '-pe smp N', where N=$[threads*processes] +num_threads_extractor=4 +num_processes_extractor=2 + +nnet3_affix=_online_cmn # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=1i #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +reporting_email= + +# Setting 'online_cmvn' to true replaces 'apply-cmvn' by +# 'apply-cmvn-online' both for i-vector extraction and TDNN input. +# The i-vector extractor uses the config 'conf/online_cmvn.conf' for +# both the UBM and the i-extractor. 
The TDNN input is configured via +# '--feat.cmvn-opts' that is set to the same config, so we use the +# same cmvn for i-extractor and the TDNN input. +online_cmvn=true + +# LSTM/chain options +train_stage=-10 +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# training chunk-options +chunk_width=140,100,160 +bottom_subsampling_factor=1 # I'll set this to 3 later, 1 is for compatibility with a broken ru. +frame_subsampling_factor=3 +langs="default" # list of language names + +# The amount of extra left/right context we put in the egs. Note: this could +# easily be zero, since we're not using a recurrent topology, but we put in a +# little extra context so that we have more room to play with the configuration +# without re-dumping egs. +egs_extra_left_context=5 +egs_extra_right_context=5 + +# The number of chunks (of length: see $chunk_width above) that we group +# together for each "speaker" (actually: pseudo-speaker, since we may have +# to group multiple speaker together in some cases). +chunks_per_group=4 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $*" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor ${frame_subsampling_factor} \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 15 ]; then + # $dir/configs will contain xconfig and config files for the initial + # models. It's a scratch space used by this script but not by + # scripts called from here. 
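Before the xconfig below is generated, it can be useful to confirm how many leaves stage 14 actually produced, since that number becomes the dimension of the chain output layer; the output shown here is only illustrative (the chain_dir_info line quoted at the top of this script reports 2880 pdfs for this setup):

    tree-info $tree_dir/tree | grep num-pdfs
    # e.g. num-pdfs 2880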
+ mkdir -p $dir/configs/ + # $dir/init will contain the initial models + mkdir -p $dir/init/ + + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print(0.5/$xent_regularize)" | python) + tdnn_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.005" + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + idct-layer name=idct input=input dim=40 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat + delta-layer name=delta input=idct + no-op-component name=input2 input=Append(delta, Scale(1.0, ReplaceIndex(ivector, t, 0))) + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $tdnn_opts dim=1024 input=input2 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + output-layer name=output-default input=prefinal-chain include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts + output-layer name=output-default-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + if [ -f $dir/init/default_trans.mdl ]; then # checking this because it may have been copied in a previous run of the same script + copy-transition-model $tree_dir/final.mdl $dir/init/default_trans.mdl || exit 1 & + else + echo "Keeping the old $dir/init/default_trans.mdl as it already exists." + fi +fi + +init_info=$dir/init/info.txt +if [ $stage -le 16 ]; then + + if [ ! 
-f $dir/configs/ref.raw ]; then + echo "Expected $dir/configs/ref.raw to exist" + exit + fi + + nnet3-info $dir/configs/ref.raw > $dir/configs/temp.info + model_left_context=$(grep -F 'left-context' $dir/configs/temp.info | awk '{print $2}') + model_right_context=$(grep -F 'right-context' $dir/configs/temp.info | awk '{print $2}') + cat >$init_info </dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l
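After the decoding stages complete, the results can be summarized with the comparison script added in this same patch, exactly as quoted in the header of this script:

    local/chain2/compare_wer.sh exp/chain2_online_cmn/tdnn1i_sp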