From 6b0938a61d597ca248f5e7eab97995cfbbe20d85 Mon Sep 17 00:00:00 2001 From: Meixu Song Date: Fri, 19 Jun 2020 23:30:02 +0800 Subject: [PATCH] [scripts,egs] modify chain2 script (add ivector, diagnosis log, compute_wer.sh), and add wer results for mini_librispeech and wsj (#4001) --- .gitpod.Dockerfile | 22 + .gitpod.yml | 7 + README.md | 1 + egs/mini_librispeech/s5/cmd.sh | 1 + .../s5/local/chain2/compare_wer.sh | 137 ++++++ .../s5/local/chain2/data_prep_common.sh | 78 --- .../s5/local/chain2/tuning/run_tdnn_1a.sh | 101 ++-- egs/wsj/s5/local/chain2/compare_wer.sh | 143 ++++++ egs/wsj/s5/local/chain2/run_tdnn.sh | 1 + egs/wsj/s5/local/chain2/tuning/run_tdnn_1i.sh | 463 ++++++++++++++++++ egs/wsj/s5/run.sh | 2 +- egs/wsj/s5/steps/nnet3/chain2/train.sh | 47 +- 12 files changed, 892 insertions(+), 111 deletions(-) create mode 100644 .gitpod.Dockerfile create mode 100644 .gitpod.yml create mode 100755 egs/mini_librispeech/s5/local/chain2/compare_wer.sh delete mode 100755 egs/mini_librispeech/s5/local/chain2/data_prep_common.sh create mode 100755 egs/wsj/s5/local/chain2/compare_wer.sh create mode 120000 egs/wsj/s5/local/chain2/run_tdnn.sh create mode 100755 egs/wsj/s5/local/chain2/tuning/run_tdnn_1i.sh diff --git a/.gitpod.Dockerfile b/.gitpod.Dockerfile new file mode 100644 index 00000000000..53c8237d4e8 --- /dev/null +++ b/.gitpod.Dockerfile @@ -0,0 +1,22 @@ +FROM gitpod/workspace-full + +USER songmeixu + +# Install custom tools, runtime, etc. using apt-get +# For example, the command below would install "bastet" - a command line tetris clone: +# +# RUN sudo apt-get -q update && # sudo apt-get install -yq bastet && # sudo rm -rf /var/lib/apt/lists/* +# +# More information: https://www.gitpod.io/docs/config-docker/ + +FROM gitpod/workspace-full + +RUN sudo apt-get update \ + && sudo apt-get install -y \ + sox gfortran \ + && sudo rm -rf /var/lib/apt/lists/* + +RUN cd ~/GitHub/r-with-intel-mkl/ \ + && wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB | sudo apt-key add - \ + && sudo sh -c 'echo deb https://apt.repos.intel.com/mkl all main > /etc/apt/sources.list.d/intel-mkl.list' \ + && sudo apt-get update && sudo apt-get install intel-mkl-64bit diff --git a/.gitpod.yml b/.gitpod.yml new file mode 100644 index 00000000000..9e8cb2b79f2 --- /dev/null +++ b/.gitpod.yml @@ -0,0 +1,7 @@ +image: + file: .gitpod.dockerfile + +tasks: + - init: echo "Replace me with a build script for the project." + command: echo "Replace me with something that should run on every start, or just + remove me entirely." 
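If one wants to sanity-check the workspace image outside Gitpod, it can be built with plain Docker along these lines (a sketch; the tag name is arbitrary, and note that '.gitpod.yml' must reference the Dockerfile by its exact, case-sensitive filename):

    docker build -f .gitpod.Dockerfile -t kaldi-gitpod-test .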
diff --git a/README.md b/README.md index 963b82ed427..ae0ceeeebae 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ [![Build Status](https://travis-ci.com/kaldi-asr/kaldi.svg?branch=master)](https://travis-ci.com/kaldi-asr/kaldi) +[![Gitpod Ready-to-Code](https://img.shields.io/badge/Gitpod-Ready--to--Code-blue?logo=gitpod)](https://gitpod.io/#https://github.com/kaldi-asr/kaldi) Kaldi Speech Recognition Toolkit ================================ diff --git a/egs/mini_librispeech/s5/cmd.sh b/egs/mini_librispeech/s5/cmd.sh index 71dd849a93b..ed453ff8416 100644 --- a/egs/mini_librispeech/s5/cmd.sh +++ b/egs/mini_librispeech/s5/cmd.sh @@ -13,3 +13,4 @@ export train_cmd="queue.pl --mem 2G" export decode_cmd="queue.pl --mem 4G" export mkgraph_cmd="queue.pl --mem 8G" +export cuda_cmd="queue.pl --gpu 1" diff --git a/egs/mini_librispeech/s5/local/chain2/compare_wer.sh b/egs/mini_librispeech/s5/local/chain2/compare_wer.sh new file mode 100755 index 00000000000..c14d2031f8f --- /dev/null +++ b/egs/mini_librispeech/s5/local/chain2/compare_wer.sh @@ -0,0 +1,137 @@ +#!/usr/bin/env bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/tdnn_{c,d}_sp +# For use with discriminatively trained systems you specify the epochs after a colon: +# for instance, +# local/chain/compare_wer.sh exp/chain/tdnn_c_sp exp/chain/tdnn_c_sp_smbr:{1,2,3} + + +if [ $# == 0 ]; then + echo "Usage: $0: [--looped] [--online] [ ... ]" + echo "e.g.: $0 exp/chain/tdnn_{b,c}_sp" + echo "or (with epoch numbers for discriminative training):" + echo "$0 exp/chain/tdnn_b_sp_disc:{1,2,3}" + exit 1 +fi + +echo "# $0 $*" + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. 
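As a concrete illustration (directory names are hypothetical), the brace form in the usage message is expanded by the shell, so each colon-suffixed argument arrives as a separate system and is split by the set_names function defined just below into a directory name and an epoch:

    local/chain2/compare_wer.sh exp/chain2/tdnn1a_sp exp/chain2/tdnn1a_sp_smbr:{1,2}
    # which the shell expands to:
    #   local/chain2/compare_wer.sh exp/chain2/tdnn1a_sp exp/chain2/tdnn1a_sp_smbr:1 exp/chain2/tdnn1a_sp_smbr:2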
+# If called with a colon-free directory name, like: +# set_names exp/chain/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=( + "#WER dev_clean_2 (tgsmall) " + "#WER dev_clean_2 (tglarge) ") + +for n in 0 1; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(tgsmall_dev_clean_2 tglarge_dev_clean_2) + + wer=$(cat $dirname/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat $dirname/decode_looped_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi + if $include_online; then + echo -n "# [online:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(cat ${dirname}_online/decode_${decode_names[$n]}/wer_* | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/diagnostic_train.final.log | grep -v xent | awk '{printf("%.4f", $10)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/diagnostic_heldout.final.log | grep -v xent | awk '{printf("%.4f", $10)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/diagnostic_train.final.log | grep -w xent | awk '{printf("%.4f", $10)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/diagnostic_heldout.final.log | grep -w xent | awk '{printf("%.4f", $10)}') + printf "% 10s" $prob +done +echo + +echo -n "# Num-params " +for x in $*; do + printf "% 10s" $(grep num-parameters $x/log/progress.1.log | awk '{print $2}') +done +echo \ No newline at end of file diff --git a/egs/mini_librispeech/s5/local/chain2/data_prep_common.sh b/egs/mini_librispeech/s5/local/chain2/data_prep_common.sh deleted file mode 100755 index 21b36cce421..00000000000 --- a/egs/mini_librispeech/s5/local/chain2/data_prep_common.sh +++ /dev/null @@ -1,78 +0,0 @@ -#!/bin/bash -# Copyright 2019 Daniel Povey -# 2019 Srikanth Madikeri (Idiap Research Institute) - -set -euo pipefail - -# This script is called from local/chain/tuning/run_tdnn_2a.sh and -# similar scripts. It contains the common feature preparation and -# lattice-alignment preparation parts of the chaina training. -# See those scripts for examples of usage. - -stage=0 -train_set=train_clean_5 -test_sets="dev_clean_2" -gmm=tri3b - -. ./cmd.sh -. ./path.sh -. 
utils/parse_options.sh - -gmm_dir=exp/${gmm} -ali_dir=exp/${gmm}_ali_${train_set}_sp - -for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do - if [ ! -f $f ]; then - echo "$0: expected file $f to exist" - exit 1 - fi -done - -# Our default data augmentation method is 3-way speed augmentation followed by -# volume perturbation. We are looking into better ways of doing this, -# e.g. involving noise and reverberation. - -if [ $stage -le 1 ]; then - # Although the nnet will be trained by high resolution data, we still have to - # perturb the normal data to get the alignment. _sp stands for speed-perturbed - echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)" - utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp - echo "$0: making MFCC features for low-resolution speed-perturbed data" - steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 data/${train_set}_sp || exit 1; - steps/compute_cmvn_stats.sh data/${train_set}_sp || exit 1; - utils/fix_data_dir.sh data/${train_set}_sp -fi - -if [ $stage -le 2 ]; then - echo "$0: aligning with the perturbed low-resolution data" - steps/align_fmllr.sh --nj 20 --cmd "$train_cmd" \ - data/${train_set}_sp data/lang $gmm_dir $ali_dir || exit 1 -fi - -if [ $stage -le 3 ]; then - # Create high-resolution MFCC features (with 40 cepstra instead of 13). - # this shows how you can split across multiple file-systems. - echo "$0: creating high-resolution MFCC features" - mfccdir=data/${train_set}_sp_hires/data - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then - utils/create_split_dir.pl /export/fs0{1,2}/$USER/kaldi-data/mfcc/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage - fi - - for datadir in ${train_set}_sp ${test_sets}; do - utils/copy_data_dir.sh data/$datadir data/${datadir}_hires - done - - # do volume-perturbation on the training data prior to extracting hires - # features; this helps make trained nnets more invariant to test data volume. - utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires || exit 1; - - for datadir in ${train_set}_sp ${test_sets}; do - steps/make_mfcc.sh --nj 10 --mfcc-config conf/mfcc_hires.conf \ - --cmd "$train_cmd" data/${datadir}_hires || exit 1; - steps/compute_cmvn_stats.sh data/${datadir}_hires || exit 1; - utils/fix_data_dir.sh data/${datadir}_hires || exit 1; - done -fi - - -exit 0 diff --git a/egs/mini_librispeech/s5/local/chain2/tuning/run_tdnn_1a.sh b/egs/mini_librispeech/s5/local/chain2/tuning/run_tdnn_1a.sh index 2311fc0699e..ee97f2d93f4 100755 --- a/egs/mini_librispeech/s5/local/chain2/tuning/run_tdnn_1a.sh +++ b/egs/mini_librispeech/s5/local/chain2/tuning/run_tdnn_1a.sh @@ -1,9 +1,25 @@ -#!/bin/bash +#!/usr/bin/env bash # Copyright 2019 Srikanth Madikeri (Idiap Research Institute) -# +# # This script is a modification of local/chain/run_tdnn.sh adapted to the chain2 recipes. +# This is a basic TDNN experiment. +# run_tdnn_1a.sh in local/chain2 but uses new kaldi recipe. 
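Like the other chain recipes, this script assumes the standard mini_librispeech run.sh has already been run far enough to produce the tri3b system; a typical invocation from egs/mini_librispeech/s5 would then look roughly like the following (option names are the defaults declared further down in this script):

    local/chain2/tuning/run_tdnn_1a.sh --stage 0 --train-set train_clean_5 --gmm tri3b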
+ +# steps/info/chain_dir_info.pl exp/chain2/tdnn1a_sp +# exp/chain2/tdnn1a_sp: num-iters=6 nj=2..5 combine=-0.038->-0.033 (over 3) + +# local/chain2/compare_wer.sh exp/chain2/tdnn1a_sp +# System tdnn1a_sp +#WER dev_clean_2 (tgsmall) 17.50 +#WER dev_clean_2 (tglarge) 12.67 +# Final train prob -0.0626 +# Final valid prob -0.0539 +# Final train prob (xent) -1.5220 +# Final valid prob (xent) -1.3991 +# Num-params 10005600 + # Set -e here so that we catch if any executable fails immediately set -euo pipefail @@ -14,20 +30,19 @@ decode_nj=10 train_set=train_clean_5 test_sets=dev_clean_2 gmm=tri3b -srand=0 nnet3_affix= # The rest are configs specific to this script. Most of the parameters # are just hardcoded at this level, in the commands below. -affix=2c # affix for the TDNN directory name +affix=1a # affix for the TDNN directory name tree_affix= train_stage=-10 get_egs_stage=-10 +decode_iter= - +# training options # training chunk-options -chunk_width=140 -dropout_schedule='0,0@0.20,0.3@0.50,0' +chunk_width=140,100,160 xent_regularize=0.1 bottom_subsampling_factor=1 # I'll set this to 3 later, 1 is for compatibility with a broken ru. frame_subsampling_factor=3 @@ -45,6 +60,14 @@ egs_extra_right_context=5 # to group multiple speaker together in some cases). chunks_per_group=4 +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + # End configuration section. echo "$0 $@" # Print the command line for logging @@ -53,32 +76,35 @@ echo "$0 $@" # Print the command line for logging . ./path.sh . ./utils/parse_options.sh -# if ! cuda-compiled; then -# cat < $dir/configs/default.xconfig - input name=input dim=40 + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + # the first splicing is moved before the lda layer, so no splicing here - fixed-affine-layer name=lda input=Append(-2,-1,0,1,2) affine-transform-file=$dir/configs/lda.mat - relu-renorm-layer name=tdnn1 dim=512 input=Append(-2,-1,0,1,2) + relu-renorm-layer name=tdnn1 dim=512 relu-renorm-layer name=tdnn2 dim=512 input=Append(-1,0,1) relu-renorm-layer name=tdnn3 dim=512 input=Append(-1,0,1) relu-renorm-layer name=tdnn4 dim=512 input=Append(-3,0,3) @@ -245,11 +277,13 @@ if [ $stage -le 17 ]; then # Dump raw egs. 
steps/chain2/get_raw_egs.sh --cmd "$train_cmd" \ --lang "default" \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ --left-context $egs_left_context \ --right-context $egs_right_context \ --frame-subsampling-factor $frame_subsampling_factor \ --alignment-subsampling-factor $frame_subsampling_factor \ - --frames-per-chunk 140,100,160 \ + --frames-per-chunk ${chunk_width} \ + --online-ivector-dir ${train_ivector_dir} \ ${train_data_dir} ${dir} ${lat_dir} ${dir}/raw_egs fi @@ -298,7 +332,8 @@ if [ $stage -le 22 ]; then --xent-regularize $xent_regularize --leaky-hmm-coefficient 0.1 \ --max-param-change 2.0 \ --num-jobs-initial 2 --num-jobs-final 5 \ - $dir/egs $dir + --groups-per-minibatch 256,128,64 \ + $dir/egs $dir || exit 1; fi if [ $stage -le 23 ]; then @@ -310,23 +345,29 @@ if [ $stage -le 23 ]; then fi if [ $stage -le 24 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) # Do the speaker-dependent decoding pass test_sets=dev_clean_2 for data in $test_sets; do + ( nspk=$(wc -l [ ... ]" + echo "e.g.: $0 exp/chain/tdnn_{b,c}_sp" + echo "or (with epoch numbers for discriminative training):" + echo "$0 exp/chain/tdnn_b_sp_disc:{1,2,3}" + exit 1 +fi + +echo "# $0 $*" + +include_looped=false +if [ "$1" == "--looped" ]; then + include_looped=true + shift +fi +include_online=false +if [ "$1" == "--online" ]; then + include_online=true + shift +fi + + +used_epochs=false + +# this function set_names is used to separate the epoch-related parts of the name +# [for discriminative training] and the regular parts of the name. +# If called with a colon-free directory name, like: +# set_names exp/chain/tdnn_lstm1e_sp_bi_smbr +# it will set dir=exp/chain/tdnn_lstm1e_sp_bi_smbr and epoch_infix="" +# If called with something like: +# set_names exp/chain/tdnn_d_sp_smbr:3 +# it will set dir=exp/chain/tdnn_d_sp_smbr and epoch_infix="_epoch3" + + +set_names() { + if [ $# != 1 ]; then + echo "compare_wer_general.sh: internal error" + exit 1 # exit the program + fi + dirname=$(echo $1 | cut -d: -f1) + epoch=$(echo $1 | cut -s -d: -f2) + if [ -z $epoch ]; then + epoch_infix="" + else + used_epochs=true + epoch_infix=_epoch${epoch} + fi +} + + + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +strings=( + "#WER dev93 (tgpr) " + "#WER dev93 (tg) " + "#WER dev93 (big-dict,tgpr) " + "#WER dev93 (big-dict,fg) " + "#WER eval92 (tgpr) " + "#WER eval92 (tg) " + "#WER eval92 (big-dict,tgpr)" + "#WER eval92 (big-dict,fg) ") + +for n in 0 1 2 3 4 5 6 7; do + echo -n "${strings[$n]}" + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + decode_names=(tgpr_dev93 tg_dev93 bd_tgpr_dev93 bd_tgpr_dev93_fg tgpr_eval92 tg_eval92 bd_tgpr_eval92 bd_tgpr_eval92_fg) + + wer=$(< $dirname/decode_${decode_names[$n]}/scoring_kaldi/best_wer utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(< $dirname/decode_looped_${decode_names[$n]}/scoring_kaldi/best_wer utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi + if $include_online; then + echo -n "# [online:] " + for x in $*; do + set_names $x # sets $dirname and $epoch_infix + wer=$(< ${dirname}_online/decode_${decode_names[$n]}/scoring_kaldi/best_wer utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +done + + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained 
systems. +fi + + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/diagnostic_train.final.log | grep -v xent | awk '{printf("%.4f", $10)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/diagnostic_heldout.final.log | grep -v xent | awk '{printf("%.4f", $10)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/diagnostic_train.final.log | grep -w xent | awk '{printf("%.4f", $10)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent)" +for x in $*; do + prob=$(grep Overall $x/log/diagnostic_heldout.final.log | grep -w xent | awk '{printf("%.4f", $10)}') + printf "% 10s" $prob +done +echo + +echo -n "# Num-params " +for x in $*; do + printf "% 10s" $(grep num-parameters $x/log/progress.1.log | awk '{print $2}') +done +echo diff --git a/egs/wsj/s5/local/chain2/run_tdnn.sh b/egs/wsj/s5/local/chain2/run_tdnn.sh new file mode 120000 index 00000000000..deb68d515d2 --- /dev/null +++ b/egs/wsj/s5/local/chain2/run_tdnn.sh @@ -0,0 +1 @@ +tuning/run_tdnn_1i.sh \ No newline at end of file diff --git a/egs/wsj/s5/local/chain2/tuning/run_tdnn_1i.sh b/egs/wsj/s5/local/chain2/tuning/run_tdnn_1i.sh new file mode 100755 index 00000000000..4a997a137de --- /dev/null +++ b/egs/wsj/s5/local/chain2/tuning/run_tdnn_1i.sh @@ -0,0 +1,463 @@ +#!/usr/bin/env bash + +# 1i is like 1h, while it introduces 'apply-cmvn-online' that does +# cmn normalization both for i-extractor and TDNN input. +# run_tdnn_1i.sh in local/chain2 uses new kaldi recipe. + +# local/chain2/compare_wer.sh exp/chain2_online_cmn/tdnn1i_sp +# System tdnn1i_sp +#WER dev93 (tgpr) 6.83 +#WER dev93 (tg) 6.53 +#WER dev93 (big-dict,tgpr) 4.71 +#WER dev93 (big-dict,fg) 4.31 +#WER eval92 (tgpr) 4.86 +#WER eval92 (tg) 4.43 +#WER eval92 (big-dict,tgpr) 2.71 +#WER eval92 (big-dict,fg) 2.27 +# Final train prob -0.0397 +# Final valid prob -0.0346 +# Final train prob (xent) -0.7091 +# Final valid prob (xent) -0.6436 +# Num-params 9476352 + +# steps/info/chain_dir_info.pl exp/chain_online_cmn/tdnn1i_sp +# exp/chain_online_cmn/tdnn1i_sp: num-iters=108 nj=2..8 num-params=8.4M dim=40+100->2880 combine=-0.044->-0.044 (over 1) xent:train/valid[71,107,final]=(-0.873,-0.660,-0.672/-0.906,-0.714,-0.734) logprob:train/valid[71,107,final]=(-0.067,-0.044,-0.044/-0.068,-0.054,-0.055) + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +train_set=train_si284 +test_sets="test_dev93 test_eval92" +gmm=tri4b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. + +num_threads_ubm=8 + +nj_extractor=10 +# It runs a JOB with '-pe smp N', where N=$[threads*processes] +num_threads_extractor=4 +num_processes_extractor=2 + +nnet3_affix=_online_cmn # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=1i #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +reporting_email= + +# Setting 'online_cmvn' to true replaces 'apply-cmvn' by +# 'apply-cmvn-online' both for i-vector extraction and TDNN input. +# The i-vector extractor uses the config 'conf/online_cmvn.conf' for +# both the UBM and the i-extractor. 
The TDNN input is configured via +# '--feat.cmvn-opts' that is set to the same config, so we use the +# same cmvn for i-extractor and the TDNN input. +online_cmvn=true + +# LSTM/chain options +train_stage=-10 +xent_regularize=0.1 +dropout_schedule='0,0@0.20,0.5@0.50,0' + +# training chunk-options +chunk_width=140,100,160 +bottom_subsampling_factor=1 # I'll set this to 3 later, 1 is for compatibility with a broken ru. +frame_subsampling_factor=3 +langs="default" # list of language names + +# The amount of extra left/right context we put in the egs. Note: this could +# easily be zero, since we're not using a recurrent topology, but we put in a +# little extra context so that we have more room to play with the configuration +# without re-dumping egs. +egs_extra_left_context=5 +egs_extra_right_context=5 + +# The number of chunks (of length: see $chunk_width above) that we group +# together for each "speaker" (actually: pseudo-speaker, since we may have +# to group multiple speaker together in some cases). +chunks_per_group=4 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $*" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor ${frame_subsampling_factor} \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 15 ]; then + # $dir/configs will contain xconfig and config files for the initial + # models. It's a scratch space used by this script but not by + # scripts called from here. 
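Before the xconfig below is generated, it can be useful to confirm how many leaves stage 14 actually produced, since that number becomes the dimension of the chain output layer; the output shown here is only illustrative (the chain_dir_info line quoted at the top of this script reports 2880 pdfs for this setup):

    tree-info $tree_dir/tree | grep num-pdfs
    # e.g. num-pdfs 2880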
+ mkdir -p $dir/configs/ + # $dir/init will contain the initial models + mkdir -p $dir/init/ + + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print(0.5/$xent_regularize)" | python) + tdnn_opts="l2-regularize=0.01 dropout-proportion=0.0 dropout-per-dim-continuous=true" + tdnnf_opts="l2-regularize=0.01 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.01 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.005" + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + idct-layer name=idct input=input dim=40 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat + delta-layer name=delta input=idct + no-op-component name=input2 input=Append(delta, Scale(1.0, ReplaceIndex(ivector, t, 0))) + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $tdnn_opts dim=1024 input=input2 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=0 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=1024 bottleneck-dim=128 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + output-layer name=output-default input=prefinal-chain include-log-softmax=false dim=$num_targets $output_opts + + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts big-dim=1024 small-dim=192 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts + output-layer name=output-default-xent input=prefinal-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ + if [ -f $dir/init/default_trans.mdl ]; then # checking this because it may have been copied in a previous run of the same script + copy-transition-model $tree_dir/final.mdl $dir/init/default_trans.mdl || exit 1 & + else + echo "Keeping the old $dir/init/default_trans.mdl as it already exists." + fi +fi + +init_info=$dir/init/info.txt +if [ $stage -le 16 ]; then + + if [ ! 
-f $dir/configs/ref.raw ]; then + echo "Expected $dir/configs/ref.raw to exist" + exit + fi + + nnet3-info $dir/configs/ref.raw > $dir/configs/temp.info + model_left_context=$(grep -F 'left-context' $dir/configs/temp.info | awk '{print $2}') + model_right_context=$(grep -F 'right-context' $dir/configs/temp.info | awk '{print $2}') + cat >$init_info </dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l
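After the decoding stages complete, the results can be summarized with the comparison script added in this same patch, exactly as quoted in the header of this script:

    local/chain2/compare_wer.sh exp/chain2_online_cmn/tdnn1i_sp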