From aeb8986e358ad224d17bfc8671d825a18b0c7a4d Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Fri, 13 May 2022 07:39:14 +0800
Subject: [PATCH] Ignore padding frames during RNN-T decoding. (#358)

* Ignore padding frames during RNN-T decoding.

* Fix outdated decoding code.

* Minor fixes.
---
 ...-pruned-transducer-stateless-2022-03-12.sh |   9 +-
 ...pruned-transducer-stateless2-2022-04-29.sh |   7 +-
 ...pruned-transducer-stateless3-2022-04-29.sh |   7 +-
 ...speech-transducer-stateless2-2022-04-19.sh |   9 +-
 ...d-transducer-stateless-librispeech-100h.sh |  31 +-
 ...d-transducer-stateless-librispeech-960h.sh |  31 +-
 .../run-pre-trained-transducer-stateless.sh   |  40 +-
 .github/workflows/run-librispeech-2022-03-12.yml |  14 +-
 .github/workflows/run-librispeech-2022-04-29.yml |  47 ++-
 ...peech-transducer-stateless2-2022-04-19.yml |  11 +-
 ...-transducer-stateless-librispeech-100h.yml |  80 +++-
 ...r-stateless-librispeech-multi-datasets.yml |  80 +++-
 .../run-pretrained-transducer-stateless.yml   |  82 ++++-
 .../ASR/transducer_stateless/conformer.py     |   4 +-
 .../transducer_stateless_modified-2/decode.py | 305 +++++++++-------
 .../pretrained.py                             |  69 +++-
 .../transducer_stateless_modified/decode.py   | 305 +++++++++-------
 .../pretrained.py                             |  69 +++-
 .../beam_search.py                            | 341 +++++++++++++-----
 .../ASR/pruned_transducer_stateless/decode.py |  83 ++---
 .../pruned_transducer_stateless/pretrained.py |  73 +++-
 .../beam_search.py                            |  98 ++++-
 .../pruned_transducer_stateless2/decode.py    |  16 +-
 .../decode-giga.py                            |  18 +-
 .../pruned_transducer_stateless3/decode.py    |  16 +-
 .../pruned_transducer_stateless4/decode.py    |  28 +-
 .../ASR/transducer_stateless/beam_search.py   | 337 +++++++++++++++--
 .../ASR/transducer_stateless/decode.py        | 154 ++++++--
 .../ASR/transducer_stateless/decoder.py       |   1 +
 .../ASR/transducer_stateless/pretrained.py    |  95 +++--
 .../ASR/transducer_stateless2/decode.py       | 126 ++++++-
 .../ASR/transducer_stateless2/pretrained.py   |  95 +++--
 .../decode.py                                 | 135 ++++++-
 .../pretrained.py                             |  60 ++-
 .../ASR/pruned_transducer_stateless/decode.py |   6 +-
 .../pruned_transducer_stateless/pretrained.py |  87 +----
 36 files changed, 2205 insertions(+), 764 deletions(-)

diff --git a/.github/scripts/run-librispeech-pruned-transducer-stateless-2022-03-12.sh b/.github/scripts/run-librispeech-pruned-transducer-stateless-2022-03-12.sh
index 59e9edf41e..bd816c2d62 100755
--- a/.github/scripts/run-librispeech-pruned-transducer-stateless-2022-03-12.sh
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless-2022-03-12.sh
@@ -33,7 +33,7 @@ for sym in 1 2 3; do
     $repo/test_wavs/1221-135766-0002.wav
 done
 
-for method in modified_beam_search beam_search; do
+for method in fast_beam_search modified_beam_search beam_search; do
   log "$method"
 
   ./pruned_transducer_stateless/pretrained.py \
@@ -47,7 +47,8 @@ for method in modified_beam_search beam_search; do
 done
 
 echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
-if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" ]]; then
+echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
+if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then
   mkdir -p pruned_transducer_stateless/exp
   ln -s $PWD/$repo/exp/pretrained.pt pruned_transducer_stateless/exp/epoch-999.pt
   ln -s $PWD/$repo/data/lang_bpe_500 data/
@@ -58,9 +59,9 @@ if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" ]]; then
   log "Decoding test-clean and test-other"
 
   # use a small value for decoding with CPU
-  max_duration=50
+  max_duration=100
 
-  for method in greedy_search fast_beam_search; do
+  for method in greedy_search fast_beam_search modified_beam_search; do
     log "Decoding with $method"
 
     ./pruned_transducer_stateless/decode.py \
diff --git a/.github/scripts/run-librispeech-pruned-transducer-stateless2-2022-04-29.sh b/.github/scripts/run-librispeech-pruned-transducer-stateless2-2022-04-29.sh
index 1b62caab8c..6b5b51bd71 100755
--- a/.github/scripts/run-librispeech-pruned-transducer-stateless2-2022-04-29.sh
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless2-2022-04-29.sh
@@ -51,7 +51,8 @@ for method in modified_beam_search beam_search fast_beam_search; do
 done
 
 echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
-if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" ]]; then
+echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
+if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then
   mkdir -p pruned_transducer_stateless2/exp
   ln -s $PWD/$repo/exp/pretrained.pt pruned_transducer_stateless2/exp/epoch-999.pt
   ln -s $PWD/$repo/data/lang_bpe_500 data/
@@ -62,9 +63,9 @@ if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" ]]; then
   log "Decoding test-clean and test-other"
 
   # use a small value for decoding with CPU
-  max_duration=50
+  max_duration=100
 
-  for method in greedy_search fast_beam_search; do
+  for method in greedy_search fast_beam_search modified_beam_search; do
     log "Decoding with $method"
 
     ./pruned_transducer_stateless2/decode.py \
diff --git a/.github/scripts/run-librispeech-pruned-transducer-stateless3-2022-04-29.sh b/.github/scripts/run-librispeech-pruned-transducer-stateless3-2022-04-29.sh
index 1177e5a86e..62ea02c47e 100755
--- a/.github/scripts/run-librispeech-pruned-transducer-stateless3-2022-04-29.sh
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless3-2022-04-29.sh
@@ -51,7 +51,8 @@ for method in modified_beam_search beam_search fast_beam_search; do
 done
 
 echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
-if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" ]]; then
+echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
+if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then
   mkdir -p pruned_transducer_stateless3/exp
   ln -s $PWD/$repo/exp/pretrained.pt pruned_transducer_stateless3/exp/epoch-999.pt
   ln -s $PWD/$repo/data/lang_bpe_500 data/
@@ -62,9 +63,9 @@ if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" ]]; then
   log "Decoding test-clean and test-other"
 
   # use a small value for decoding with CPU
-  max_duration=50
+  max_duration=100
 
-  for method in greedy_search fast_beam_search; do
+  for method in greedy_search fast_beam_search modified_beam_search; do
     log "Decoding with $method"
 
     ./pruned_transducer_stateless3/decode.py \
diff --git a/.github/scripts/run-librispeech-transducer-stateless2-2022-04-19.sh b/.github/scripts/run-librispeech-transducer-stateless2-2022-04-19.sh
index d2a2d3c029..c22660d0a6 100755
--- a/.github/scripts/run-librispeech-transducer-stateless2-2022-04-19.sh
+++ b/.github/scripts/run-librispeech-transducer-stateless2-2022-04-19.sh
@@ -33,7 +33,7 @@ for sym in 1 2 3; do
     $repo/test_wavs/1221-135766-0002.wav
 done
 
-for method in modified_beam_search beam_search; do
+for method in fast_beam_search modified_beam_search beam_search; do
   log "$method"
 
   ./transducer_stateless2/pretrained.py \
@@ -47,7 +47,8 @@ for method in modified_beam_search beam_search; do
 done
 
 echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
-if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" ]]; then
+echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
+if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then
x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then mkdir -p transducer_stateless2/exp ln -s $PWD/$repo/exp/pretrained.pt transducer_stateless2/exp/epoch-999.pt ln -s $PWD/$repo/data/lang_bpe_500 data/ @@ -58,9 +59,9 @@ if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" ]]; then log "Decoding test-clean and test-other" # use a small value for decoding with CPU - max_duration=50 + max_duration=100 - for method in greedy_search modified_beam_search; do + for method in greedy_search fast_beam_search modified_beam_search; do log "Decoding with $method" ./transducer_stateless2/decode.py \ diff --git a/.github/scripts/run-pre-trained-transducer-stateless-librispeech-100h.sh b/.github/scripts/run-pre-trained-transducer-stateless-librispeech-100h.sh index f484bd49aa..dcc99d62e4 100755 --- a/.github/scripts/run-pre-trained-transducer-stateless-librispeech-100h.sh +++ b/.github/scripts/run-pre-trained-transducer-stateless-librispeech-100h.sh @@ -33,7 +33,7 @@ for sym in 1 2 3; do $repo/test_wavs/1221-135766-0002.wav done -for method in modified_beam_search beam_search; do +for method in modified_beam_search beam_search fast_beam_search; do log "$method" ./transducer_stateless_multi_datasets/pretrained.py \ @@ -45,3 +45,32 @@ for method in modified_beam_search beam_search; do $repo/test_wavs/1221-135766-0001.wav \ $repo/test_wavs/1221-135766-0002.wav done + +echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}" +echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}" +if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then + mkdir -p transducer_stateless_multi_datasets/exp + ln -s $PWD/$repo/exp/pretrained.pt transducer_stateless_multi_datasets/exp/epoch-999.pt + ln -s $PWD/$repo/data/lang_bpe_500 data/ + + ls -lh data + ls -lh transducer_stateless_multi_datasets/exp + + log "Decoding test-clean and test-other" + + # use a small value for decoding with CPU + max_duration=100 + + for method in greedy_search fast_beam_search modified_beam_search; do + log "Decoding with $method" + + ./transducer_stateless_multi_datasets/decode.py \ + --decoding-method $method \ + --epoch 999 \ + --avg 1 \ + --max-duration $max_duration \ + --exp-dir transducer_stateless_multi_datasets/exp + done + + rm transducer_stateless_multi_datasets/exp/*.pt +fi diff --git a/.github/scripts/run-pre-trained-transducer-stateless-librispeech-960h.sh b/.github/scripts/run-pre-trained-transducer-stateless-librispeech-960h.sh index 5501dcecdd..9622224c91 100755 --- a/.github/scripts/run-pre-trained-transducer-stateless-librispeech-960h.sh +++ b/.github/scripts/run-pre-trained-transducer-stateless-librispeech-960h.sh @@ -33,7 +33,7 @@ for sym in 1 2 3; do $repo/test_wavs/1221-135766-0002.wav done -for method in modified_beam_search beam_search; do +for method in modified_beam_search beam_search fast_beam_search; do log "$method" ./transducer_stateless_multi_datasets/pretrained.py \ @@ -45,3 +45,32 @@ for method in modified_beam_search beam_search; do $repo/test_wavs/1221-135766-0001.wav \ $repo/test_wavs/1221-135766-0002.wav done + +echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}" +echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}" +if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then + mkdir -p transducer_stateless_multi_datasets/exp + ln -s $PWD/$repo/exp/pretrained.pt transducer_stateless_multi_datasets/exp/epoch-999.pt + ln -s $PWD/$repo/data/lang_bpe_500 data/ + + ls -lh data + ls -lh transducer_stateless_multi_datasets/exp + + log 
"Decoding test-clean and test-other" + + # use a small value for decoding with CPU + max_duration=100 + + for method in greedy_search fast_beam_search modified_beam_search; do + log "Decoding with $method" + + ./transducer_stateless_multi_datasets/decode.py \ + --decoding-method $method \ + --epoch 999 \ + --avg 1 \ + --max-duration $max_duration \ + --exp-dir transducer_stateless_multi_datasets/exp + done + + rm transducer_stateless_multi_datasets/exp/*.pt +fi diff --git a/.github/scripts/run-pre-trained-transducer-stateless.sh b/.github/scripts/run-pre-trained-transducer-stateless.sh index cb57602e31..4a1dc1a7e2 100755 --- a/.github/scripts/run-pre-trained-transducer-stateless.sh +++ b/.github/scripts/run-pre-trained-transducer-stateless.sh @@ -33,7 +33,7 @@ for sym in 1 2 3; do $repo/test_wavs/1221-135766-0002.wav done -for method in modified_beam_search beam_search; do +for method in fast_beam_search modified_beam_search beam_search; do log "$method" ./transducer_stateless/pretrained.py \ @@ -46,15 +46,31 @@ for method in modified_beam_search beam_search; do $repo/test_wavs/1221-135766-0002.wav done -for method in modified_beam_search beam_search; do - log "$method" +echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}" +echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}" +if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then + mkdir -p transducer_stateless/exp + ln -s $PWD/$repo/exp/pretrained.pt transducer_stateless/exp/epoch-999.pt + ln -s $PWD/$repo/data/lang_bpe_500 data/ - ./transducer_stateless_multi_datasets/pretrained.py \ - --method $method \ - --beam-size 4 \ - --checkpoint $repo/exp/pretrained.pt \ - --bpe-model $repo/data/lang_bpe_500/bpe.model \ - $repo/test_wavs/1089-134686-0001.wav \ - $repo/test_wavs/1221-135766-0001.wav \ - $repo/test_wavs/1221-135766-0002.wav -done + ls -lh data + ls -lh transducer_stateless/exp + + log "Decoding test-clean and test-other" + + # use a small value for decoding with CPU + max_duration=100 + + for method in greedy_search fast_beam_search modified_beam_search; do + log "Decoding with $method" + + ./transducer_stateless/decode.py \ + --decoding-method $method \ + --epoch 999 \ + --avg 1 \ + --max-duration $max_duration \ + --exp-dir transducer_stateless/exp + done + + rm transducer_stateless/exp/*.pt +fi diff --git a/.github/workflows/run-librispeech-2022-03-12.yml b/.github/workflows/run-librispeech-2022-03-12.yml index 39c6fd24f7..b18b84378d 100644 --- a/.github/workflows/run-librispeech-2022-03-12.yml +++ b/.github/workflows/run-librispeech-2022-03-12.yml @@ -35,7 +35,7 @@ on: jobs: run_librispeech_2022_03_12: - if: github.event.label.name == 'ready' || github.event_name == 'push' || github.event_name == 'schedule' + if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule' runs-on: ${{ matrix.os }} strategy: matrix: @@ -107,11 +107,11 @@ jobs: run: | .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh - - name: Inference with pre-trained model shell: bash env: GITHUB_EVENT_NAME: ${{ github.event_name }} + GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }} run: | mkdir -p egs/librispeech/ASR/data ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank @@ -124,8 +124,8 @@ jobs: .github/scripts/run-librispeech-pruned-transducer-stateless-2022-03-12.sh - - name: Display decoding results - if: github.event_name == 'schedule' + - name: Display decoding results for 
+      if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
      shell: bash
      run: |
        cd egs/librispeech/ASR/
@@ -141,9 +141,13 @@ jobs:
        find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
        find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
 
+        echo "===modified beam search==="
+        find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+        find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
    - name: Upload decoding results for pruned_transducer_stateless
      uses: actions/upload-artifact@v2
-      if: github.event_name == 'schedule'
+      if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
      with:
        name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless-2022-03-12
        path: egs/librispeech/ASR/pruned_transducer_stateless/exp/
diff --git a/.github/workflows/run-librispeech-2022-04-29.yml b/.github/workflows/run-librispeech-2022-04-29.yml
index ffaee25f18..e3fe3b904f 100644
--- a/.github/workflows/run-librispeech-2022-04-29.yml
+++ b/.github/workflows/run-librispeech-2022-04-29.yml
@@ -35,7 +35,7 @@ on:
 
 jobs:
   run_librispeech_2022_04_29:
-    if: github.event.label.name == 'ready' || github.event_name == 'push' || github.event_name == 'schedule'
+    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
@@ -111,6 +111,7 @@ jobs:
      shell: bash
      env:
        GITHUB_EVENT_NAME: ${{ github.event_name }}
+        GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
      run: |
        mkdir -p egs/librispeech/ASR/data
        ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
@@ -125,44 +126,54 @@ jobs:
 
        .github/scripts/run-librispeech-pruned-transducer-stateless3-2022-04-29.sh
 
-    - name: Display decoding results
-      if: github.event_name == 'schedule'
+    - name: Display decoding results for pruned_transducer_stateless2
+      if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
      shell: bash
      run: |
        cd egs/librispeech/ASR
        tree pruned_transducer_stateless2/exp
-        cd pruned_transducer_stateless2
-        echo "results for pruned_transducer_stateless2"
+        cd pruned_transducer_stateless2/exp
        echo "===greedy search==="
-        find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
-        find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+        find greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+        find greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
 
        echo "===fast_beam_search==="
-        find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
-        find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+        find fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+        find fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
 
-        cd ../
+        echo "===modified beam search==="
+        find modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+        find modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+    - name: Display decoding results for pruned_transducer_stateless3
+      if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+      shell: bash
+      run: |
+        cd egs/librispeech/ASR
        tree pruned_transducer_stateless3/exp
-        cd pruned_transducer_stateless3
-        echo "results for pruned_transducer_stateless3"
+        cd pruned_transducer_stateless3/exp
        echo "===greedy search==="
-        find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
-        find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+        find greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+        find greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
 
        echo "===fast_beam_search==="
-        find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
-        find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+        find fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+        find fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+        echo "===modified beam search==="
+        find modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+        find modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
 
    - name: Upload decoding results for pruned_transducer_stateless2
      uses: actions/upload-artifact@v2
-      if: github.event_name == 'schedule'
+      if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
      with:
        name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless2-2022-04-29
        path: egs/librispeech/ASR/pruned_transducer_stateless2/exp/
 
    - name: Upload decoding results for pruned_transducer_stateless3
      uses: actions/upload-artifact@v2
-      if: github.event_name == 'schedule'
+      if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
      with:
        name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless3-2022-04-29
        path: egs/librispeech/ASR/pruned_transducer_stateless3/exp/
diff --git a/.github/workflows/run-librispeech-transducer-stateless2-2022-04-19.yml b/.github/workflows/run-librispeech-transducer-stateless2-2022-04-19.yml
index c52b543d8b..3864f4aa36 100644
--- a/.github/workflows/run-librispeech-transducer-stateless2-2022-04-19.yml
+++ b/.github/workflows/run-librispeech-transducer-stateless2-2022-04-19.yml
@@ -35,7 +35,7 @@ on:
 
 jobs:
   run_librispeech_2022_04_19:
-    if: github.event.label.name == 'ready' || github.event_name == 'push' || github.event_name == 'schedule'
+    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
@@ -111,6 +111,7 @@ jobs:
      shell: bash
      env:
        GITHUB_EVENT_NAME: ${{ github.event_name }}
+        GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
      run: |
        mkdir -p egs/librispeech/ASR/data
        ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
@@ -124,7 +125,7 @@ jobs:
 
        .github/scripts/run-librispeech-transducer-stateless2-2022-04-19.sh
 
    - name: Display decoding results
-      if: github.event_name == 'schedule'
+      if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
      shell: bash
      run: |
        cd egs/librispeech/ASR/
@@ -136,13 +137,17 @@ jobs:
        find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
        find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
 
+        echo "===fast_beam_search==="
+        find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+        find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
        echo "===modified_beam_search==="
        find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
        find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
 
    - name: Upload decoding results for transducer_stateless2
      uses: actions/upload-artifact@v2
-      if: github.event_name == 'schedule'
+      if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
      with:
        name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-transducer_stateless2-2022-04-19
        path: egs/librispeech/ASR/transducer_stateless2/exp/
diff --git a/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml b/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml
index 438f6e8827..f77d9e6584 100644
--- a/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml
+++ b/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml
@@ -23,9 +23,18 @@ on:
  pull_request:
    types: [labeled]
 
+  schedule:
+    # minute (0-59)
+    # hour (0-23)
+    # day of the month (1-31)
+    # month (1-12)
+    # day of the week (0-6)
+    # nightly build at 15:50 UTC time every day
+    - cron: "50 15 * * *"
+
jobs:
  run_pre_trained_transducer_stateless_multi_datasets_librispeech_100h:
-    if: github.event.label.name == 'ready' || github.event_name == 'push'
+    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
@@ -64,11 +73,80 @@ jobs:
      run: |
        .github/scripts/install-kaldifeat.sh
 
+    - name: Cache LibriSpeech test-clean and test-other datasets
+      id: libri-test-clean-and-test-other-data
+      uses: actions/cache@v2
+      with:
+        path: |
+          ~/tmp/download
+        key: cache-libri-test-clean-and-test-other
+
+    - name: Download LibriSpeech test-clean and test-other
+      if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
+      shell: bash
+      run: |
+        .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
+
+    - name: Prepare manifests for LibriSpeech test-clean and test-other
+      shell: bash
+      run: |
+        .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
+
+    - name: Cache LibriSpeech test-clean and test-other fbank features
+      id: libri-test-clean-and-test-other-fbank
+      uses: actions/cache@v2
+      with:
+        path: |
+          ~/tmp/fbank-libri
+        key: cache-libri-fbank-test-clean-and-test-other
+
+    - name: Compute fbank for LibriSpeech test-clean and test-other
+      if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
+      shell: bash
+      run: |
+        .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
+
    - name: Inference with pre-trained model
      shell: bash
+      env:
+        GITHUB_EVENT_NAME: ${{ github.event_name }}
+        GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
      run: |
+        mkdir -p egs/librispeech/ASR/data
+        ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
+        ls -lh egs/librispeech/ASR/data/*
+
        sudo apt-get -qq install git-lfs tree sox
        export PYTHONPATH=$PWD:$PYTHONPATH
        export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
        export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+
+        .github/scripts/run-pre-trained-transducer-stateless-librispeech-100h.sh
+
+    - name: Display decoding results for transducer_stateless_multi_datasets
+      if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+      shell: bash
+      run: |
+        cd egs/librispeech/ASR/
+        tree ./transducer_stateless_multi_datasets/exp
+
+        cd transducer_stateless_multi_datasets
+        echo "results for transducer_stateless_multi_datasets"
+        echo "===greedy search==="
+        find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+        find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+        echo "===fast_beam_search==="
+        find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+        find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+        echo "===modified beam search==="
+        find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+        find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+    - name: Upload decoding results for transducer_stateless_multi_datasets
+      uses: actions/upload-artifact@v2
+      if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+      with:
+        name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-transducer_stateless_multi_datasets-100h-2022-02-21
+        path: egs/librispeech/ASR/transducer_stateless_multi_datasets/exp/
diff --git a/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml b/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml
index f50ac2af78..ddfa620735 100644
--- a/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml
+++ b/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml
@@ -23,9 +23,18 @@ on:
  pull_request:
    types: [labeled]
 
+  schedule:
+    # minute (0-59)
+    # hour (0-23)
+    # day of the month (1-31)
+    # month (1-12)
+    # day of the week (0-6)
+    # nightly build at 15:50 UTC time every day
+    - cron: "50 15 * * *"
+
jobs:
  run_pre_trained_transducer_stateless_multi_datasets_librispeech_960h:
-    if: github.event.label.name == 'ready' || github.event_name == 'push'
+    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
@@ -64,11 +73,80 @@ jobs:
      run: |
        .github/scripts/install-kaldifeat.sh
 
+    - name: Cache LibriSpeech test-clean and test-other datasets
+      id: libri-test-clean-and-test-other-data
+      uses: actions/cache@v2
+      with:
+        path: |
+          ~/tmp/download
+        key: cache-libri-test-clean-and-test-other
+
+    - name: Download LibriSpeech test-clean and test-other
+      if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
+      shell: bash
+      run: |
+        .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
+
+    - name: Prepare manifests for LibriSpeech test-clean and test-other
+      shell: bash
+      run: |
+        .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
+
+    - name: Cache LibriSpeech test-clean and test-other fbank features
+      id: libri-test-clean-and-test-other-fbank
+      uses: actions/cache@v2
+      with:
+        path: |
+          ~/tmp/fbank-libri
+        key: cache-libri-fbank-test-clean-and-test-other
+
+    - name: Compute fbank for LibriSpeech test-clean and test-other
+      if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
+      shell: bash
+      run: |
+        .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
+
    - name: Inference with pre-trained model
      shell: bash
+      env:
+        GITHUB_EVENT_NAME: ${{ github.event_name }}
+        GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
      run: |
+        mkdir -p egs/librispeech/ASR/data
+        ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
+        ls -lh egs/librispeech/ASR/data/*
+
        sudo apt-get -qq install git-lfs tree sox
        export PYTHONPATH=$PWD:$PYTHONPATH
        export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
        export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+
+        .github/scripts/run-pre-trained-transducer-stateless-librispeech-960h.sh
+
+    - name: Display decoding results for transducer_stateless_multi_datasets
+      if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+      shell: bash
+      run: |
+        cd egs/librispeech/ASR/
+        tree ./transducer_stateless_multi_datasets/exp
+
+        cd transducer_stateless_multi_datasets
+        echo "results for transducer_stateless_multi_datasets"
+        echo "===greedy search==="
+        find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+        find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+        echo "===fast_beam_search==="
+        find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+        find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+        echo "===modified beam search==="
+        find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+        find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+    - name: Upload decoding results for transducer_stateless_multi_datasets
+      uses: actions/upload-artifact@v2
+      if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+      with:
+        name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-transducer_stateless_multi_datasets-100h-2022-03-01
+        path: egs/librispeech/ASR/transducer_stateless_multi_datasets/exp/
diff --git a/.github/workflows/run-pretrained-transducer-stateless.yml b/.github/workflows/run-pretrained-transducer-stateless.yml
index ca355e7783..cdea78a88f 100644
--- a/.github/workflows/run-pretrained-transducer-stateless.yml
+++ b/.github/workflows/run-pretrained-transducer-stateless.yml
@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-name: run-pre-trained-trandsucer-stateless
+name: run-pre-trained-transducer-stateless
 
 on:
  push:
@@ -23,9 +23,18 @@ on:
  pull_request:
    types: [labeled]
 
+  schedule:
+    # minute (0-59)
+    # hour (0-23)
+    # day of the month (1-31)
+    # month (1-12)
+    # day of the week (0-6)
+    # nightly build at 15:50 UTC time every day
+    - cron: "50 15 * * *"
+
jobs:
  run_pre_trained_transducer_stateless:
-    if: github.event.label.name == 'ready' || github.event_name == 'push'
+    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
@@ -64,11 +73,80 @@ jobs:
      run: |
        .github/scripts/install-kaldifeat.sh
 
+    - name: Cache LibriSpeech test-clean and test-other datasets
+      id: libri-test-clean-and-test-other-data
+      uses: actions/cache@v2
+      with:
+        path: |
+          ~/tmp/download
+        key: cache-libri-test-clean-and-test-other
+
+    - name: Download LibriSpeech test-clean and test-other
+      if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
+      shell: bash
+      run: |
+        .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
+
+    - name: Prepare manifests for LibriSpeech test-clean and test-other
+      shell: bash
+      run: |
+        .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
+
+    - name: Cache LibriSpeech test-clean and test-other fbank features
+      id: libri-test-clean-and-test-other-fbank
+      uses: actions/cache@v2
+      with:
+        path: |
+          ~/tmp/fbank-libri
+        key: cache-libri-fbank-test-clean-and-test-other
+
+    - name: Compute fbank for LibriSpeech test-clean and test-other
+      if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
+      shell: bash
+      run: |
+        .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
+
    - name: Inference with pre-trained model
      shell: bash
+      env:
+        GITHUB_EVENT_NAME: ${{ github.event_name }}
+        GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
      run: |
+        mkdir -p egs/librispeech/ASR/data
+        ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
+        ls -lh egs/librispeech/ASR/data/*
+
        sudo apt-get -qq install git-lfs tree sox
        export PYTHONPATH=$PWD:$PYTHONPATH
        export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
        export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+
+        .github/scripts/run-pre-trained-transducer-stateless.sh
+
+    - name: Display decoding results for transducer_stateless
+      if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+      shell: bash
+      run: |
+        cd egs/librispeech/ASR/
+        tree ./transducer_stateless/exp
+
+        cd transducer_stateless
+        echo "results for transducer_stateless"
+        echo "===greedy search==="
+        find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+        find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+        echo "===fast_beam_search==="
+        find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+        find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+        echo "===modified beam search==="
+        find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+        find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+    - name: Upload decoding results for transducer_stateless
+      uses: actions/upload-artifact@v2
+      if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+      with:
+        name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-transducer_stateless-2022-02-07
+        path: egs/librispeech/ASR/transducer_stateless/exp/
diff --git a/egs/aishell/ASR/transducer_stateless/conformer.py b/egs/aishell/ASR/transducer_stateless/conformer.py
index 81d7708f9f..149df92ab9 100644
--- a/egs/aishell/ASR/transducer_stateless/conformer.py
+++ b/egs/aishell/ASR/transducer_stateless/conformer.py
@@ -110,7 +110,9 @@ def forward(
         x = x.permute(1, 0, 2)  # (N, T, C) -> (T, N, C)
 
         # Caution: We assume the subsampling factor is 4!
-        lengths = ((x_lens - 1) // 2 - 1) // 2
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            lengths = ((x_lens - 1) // 2 - 1) // 2
         assert x.size(0) == lengths.max().item()
 
         mask = make_pad_mask(lengths)
diff --git a/egs/aishell/ASR/transducer_stateless_modified-2/decode.py b/egs/aishell/ASR/transducer_stateless_modified-2/decode.py
index 8b851bd17d..47265f846d 100755
--- a/egs/aishell/ASR/transducer_stateless_modified-2/decode.py
+++ b/egs/aishell/ASR/transducer_stateless_modified-2/decode.py
@@ -19,49 +19,62 @@
 Usage:
 (1) greedy search
 ./transducer_stateless_modified-2/decode.py \
-        --epoch 89 \
-        --avg 38 \
-        --exp-dir ./transducer_stateless_modified-2/exp \
-        --max-duration 100 \
-        --decoding-method greedy_search
-
-(2) beam search
-./transducer_stateless_modified/decode.py \
-        --epoch 89 \
-        --avg 38 \
-        --exp-dir ./transducer_stateless_modified-2/exp \
-        --max-duration 100 \
-        --decoding-method beam_search \
-        --beam-size 4
+  --epoch 89 \
+  --avg 38 \
+  --exp-dir ./transducer_stateless_modified-2/exp \
+  --max-duration 100 \
+  --decoding-method greedy_search
+
+(2) beam search (not recommended)
+./transducer_stateless_modified-2/decode.py \
+  --epoch 89 \
+  --avg 38 \
+  --exp-dir ./transducer_stateless_modified-2/exp \
+  --max-duration 100 \
+  --decoding-method beam_search \
+  --beam-size 4
 
 (3) modified beam search
 ./transducer_stateless_modified-2/decode.py \
-        --epoch 89 \
-        --avg 38 \
-        --exp-dir ./transducer_stateless_modified/exp \
-        --max-duration 100 \
-        --decoding-method modified_beam_search \
-        --beam-size 4
+  --epoch 89 \
+  --avg 38 \
+  --exp-dir ./transducer_stateless_modified-2/exp \
+  --max-duration 100 \
+  --decoding-method modified_beam_search \
+  --beam-size 4
+
+(4) fast beam search
+./transducer_stateless_modified-2/decode.py \
+  --epoch 89 \
+  --avg 38 \
+  --exp-dir ./transducer_stateless_modified-2/exp \
+  --max-duration 100 \
+  --decoding-method fast_beam_search \
+  --beam 4 \
+  --max-contexts 4 \
+  --max-states 8
 """
 
 import argparse
 import logging
 from collections import defaultdict
 from pathlib import Path
-from typing import Dict, List, Tuple
+from typing import Dict, List, Optional, Tuple
 
+import k2
 import torch
 import torch.nn as nn
 from aishell import AIShell
 from asr_datamodule import AsrDataModule
-from beam_search import beam_search, greedy_search, modified_beam_search
-from conformer import Conformer
-from decoder import Decoder
-from joiner import Joiner
-from model import Transducer
+from beam_search import (
+    beam_search,
+    fast_beam_search_one_best,
+    greedy_search,
+    greedy_search_batch,
+    modified_beam_search,
+)
+from train import get_params, get_transducer_model
 
 from icefall.checkpoint import average_checkpoints, load_checkpoint
-from icefall.env import get_env_info
 from icefall.lexicon import Lexicon
 from icefall.utils import (
     AttributeDict,
@@ -114,6 +127,7 @@ def get_parser():
           - greedy_search
           - beam_search
          - modified_beam_search
+          - fast_beam_search
        """,
    )
 
@@ -121,95 +135,62 @@ def get_parser():
        "--beam-size",
        type=int,
        default=4,
-        help="Used only when --decoding-method is beam_search "
-        "and modified_beam_search",
+        help="""An integer indicating how many candidates we will keep for each
+        frame. Used only when --decoding-method is beam_search or
+        modified_beam_search.""",
    )
 
    parser.add_argument(
-        "--context-size",
-        type=int,
-        default=2,
-        help="The context size in the decoder. 1 means bigram; "
-        "2 means tri-gram",
+        "--beam",
+        type=float,
+        default=4,
+        help="""A floating point value to calculate the cutoff score during beam
+        search (i.e., `cutoff = max-score - beam`), which is the same as the
+        `beam` in Kaldi.
+        Used only when --decoding-method is fast_beam_search""",
    )
+
    parser.add_argument(
-        "--max-sym-per-frame",
+        "--max-contexts",
        type=int,
-        default=3,
-        help="Maximum number of symbols per frame",
+        default=4,
+        help="""Used only when --decoding-method is
+        fast_beam_search""",
    )
-    return parser
-
-
-def get_params() -> AttributeDict:
-    params = AttributeDict(
-        {
-            # parameters for conformer
-            "feature_dim": 80,
-            "encoder_out_dim": 512,
-            "subsampling_factor": 4,
-            "attention_dim": 512,
-            "nhead": 8,
-            "dim_feedforward": 2048,
-            "num_encoder_layers": 12,
-            "vgg_frontend": False,
-            "env_info": get_env_info(),
-        }
-    )
-    return params
-
-
-def get_encoder_model(params: AttributeDict):
-    # TODO: We can add an option to switch between Conformer and Transformer
-    encoder = Conformer(
-        num_features=params.feature_dim,
-        output_dim=params.encoder_out_dim,
-        subsampling_factor=params.subsampling_factor,
-        d_model=params.attention_dim,
-        nhead=params.nhead,
-        dim_feedforward=params.dim_feedforward,
-        num_encoder_layers=params.num_encoder_layers,
-        vgg_frontend=params.vgg_frontend,
+    parser.add_argument(
+        "--max-states",
+        type=int,
+        default=8,
+        help="""Used only when --decoding-method is
+        fast_beam_search""",
    )
-    return encoder
-
-
-def get_decoder_model(params: AttributeDict):
-    decoder = Decoder(
-        vocab_size=params.vocab_size,
-        embedding_dim=params.encoder_out_dim,
-        blank_id=params.blank_id,
-        context_size=params.context_size,
+    parser.add_argument(
+        "--context-size",
+        type=int,
+        default=2,
+        help="The context size in the decoder. 1 means bigram; "
+        "2 means tri-gram",
    )
-    return decoder
-
-
-def get_joiner_model(params: AttributeDict):
-    joiner = Joiner(
-        input_dim=params.encoder_out_dim,
-        output_dim=params.vocab_size,
+    parser.add_argument(
+        "--max-sym-per-frame",
+        type=int,
+        default=1,
+        help="""Maximum number of symbols per frame.
+        Used only when --decoding-method is greedy_search""",
    )
-    return joiner
-
-
-def get_transducer_model(params: AttributeDict):
-    encoder = get_encoder_model(params)
-    decoder = get_decoder_model(params)
-    joiner = get_joiner_model(params)
-
-    model = Transducer(
-        encoder=encoder,
-        decoder=decoder,
-        joiner=joiner,
-    )
-    return model
+    return parser
 
 
 def decode_one_batch(
    params: AttributeDict,
    model: nn.Module,
-    lexicon: Lexicon,
+    token_table: k2.SymbolTable,
    batch: dict,
+    decoding_graph: Optional[k2.Fsa] = None,
 ) -> Dict[str, List[List[str]]]:
    """Decode one batch and return the result in a dict. The dict has the
    following format:
@@ -230,8 +211,8 @@ def decode_one_batch(
      It is the return value from iterating
      `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation
      for the format of the `batch`.
-    lexicon:
-      It contains the token symbol table and the word symbol table.
+    token_table:
+      It maps token ID to a string.
    Returns:
      Return the decoding result. See above description for the format of
      the returned dict.
@@ -249,44 +230,80 @@ def decode_one_batch(
    encoder_out, encoder_out_lens = model.encoder(
        x=feature, x_lens=feature_lens
    )
-    hyps = []
-    batch_size = encoder_out.size(0)
-
-    for i in range(batch_size):
-        # fmt: off
-        encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]]
-        # fmt: on
-        if params.decoding_method == "greedy_search":
-            hyp = greedy_search(
-                model=model,
-                encoder_out=encoder_out_i,
-                max_sym_per_frame=params.max_sym_per_frame,
-            )
-        elif params.decoding_method == "beam_search":
-            hyp = beam_search(
-                model=model, encoder_out=encoder_out_i, beam=params.beam_size
-            )
-        elif params.decoding_method == "modified_beam_search":
-            hyp = modified_beam_search(
-                model=model, encoder_out=encoder_out_i, beam=params.beam_size
-            )
-        else:
-            raise ValueError(
-                f"Unsupported decoding method: {params.decoding_method}"
-            )
-        hyps.append([lexicon.token_table[i] for i in hyp])
+
+    if params.decoding_method == "fast_beam_search":
+        hyp_tokens = fast_beam_search_one_best(
+            model=model,
+            decoding_graph=decoding_graph,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            beam=params.beam,
+            max_contexts=params.max_contexts,
+            max_states=params.max_states,
+        )
+    elif (
+        params.decoding_method == "greedy_search"
+        and params.max_sym_per_frame == 1
+    ):
+        hyp_tokens = greedy_search_batch(
+            model=model,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+        )
+    elif params.decoding_method == "modified_beam_search":
+        hyp_tokens = modified_beam_search(
+            model=model,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            beam=params.beam_size,
+        )
+    else:
+        hyp_tokens = []
+        batch_size = encoder_out.size(0)
+        for i in range(batch_size):
+            # fmt: off
+            encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]]
+            # fmt: on
+            if params.decoding_method == "greedy_search":
+                hyp = greedy_search(
+                    model=model,
+                    encoder_out=encoder_out_i,
+                    max_sym_per_frame=params.max_sym_per_frame,
+                )
+            elif params.decoding_method == "beam_search":
+                hyp = beam_search(
+                    model=model,
+                    encoder_out=encoder_out_i,
+                    beam=params.beam_size,
+                )
+            else:
+                raise ValueError(
+                    f"Unsupported decoding method: {params.decoding_method}"
+                )
+            hyp_tokens.append(hyp)
+
+    hyps = [[token_table[t] for t in tokens] for tokens in hyp_tokens]
 
    if params.decoding_method == "greedy_search":
        return {"greedy_search": hyps}
+    elif params.decoding_method == "fast_beam_search":
+        return {
+            (
+                f"beam_{params.beam}_"
+                f"max_contexts_{params.max_contexts}_"
+                f"max_states_{params.max_states}"
+            ): hyps
+        }
    else:
-        return {f"beam_{params.beam_size}": hyps}
+        return {f"beam_size_{params.beam_size}": hyps}
 
 
 def decode_dataset(
    dl: torch.utils.data.DataLoader,
    params: AttributeDict,
    model: nn.Module,
-    lexicon: Lexicon,
+    token_table: k2.SymbolTable,
+    decoding_graph: Optional[k2.Fsa] = None,
 ) -> Dict[str, List[Tuple[List[str], List[str]]]]:
    """Decode dataset.
 
@@ -297,6 +314,11 @@ def decode_dataset(
      It is returned by :func:`get_params`.
    model:
      The neural model.
+    token_table:
+      It maps a token ID to a string.
+    decoding_graph:
+      The decoding graph. Can be either a `k2.trivial_graph` or HLG.
+      Used only when --decoding-method is fast_beam_search.
    Returns:
      Return a dict, whose key may be "greedy_search" if greedy search
      is used, or it may be "beam_7" if beam size of 7 is used.
@@ -312,9 +334,9 @@ def decode_dataset(
        num_batches = "?"
 
    if params.decoding_method == "greedy_search":
-        log_interval = 100
+        log_interval = 50
    else:
-        log_interval = 2
+        log_interval = 10
 
    results = defaultdict(list)
    for batch_idx, batch in enumerate(dl):
@@ -323,7 +345,8 @@ def decode_dataset(
        hyps_dict = decode_one_batch(
            params=params,
            model=model,
-            lexicon=lexicon,
+            token_table=token_table,
+            decoding_graph=decoding_graph,
            batch=batch,
        )
 
@@ -358,6 +381,7 @@ def save_results(
        params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
    )
    store_transcripts(filename=recog_path, texts=results)
+    logging.info(f"The transcripts are stored in {recog_path}")
 
    # The following prints out WERs, per-word error statistics and aligned
    # ref/hyp pairs.
@@ -408,13 +432,21 @@ def main():
    assert params.decoding_method in (
        "greedy_search",
        "beam_search",
+        "fast_beam_search",
        "modified_beam_search",
    )
    params.res_dir = params.exp_dir / params.decoding_method
 
    params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
-    if "beam_search" in params.decoding_method:
-        params.suffix += f"-beam-{params.beam_size}"
+
+    if "fast_beam_search" in params.decoding_method:
+        params.suffix += f"-beam-{params.beam}"
+        params.suffix += f"-max-contexts-{params.max_contexts}"
+        params.suffix += f"-max-states-{params.max_states}"
+    elif "beam_search" in params.decoding_method:
+        params.suffix += (
+            f"-{params.decoding_method}-beam-size-{params.beam_size}"
+        )
    else:
        params.suffix += f"-context-{params.context_size}"
        params.suffix += f"-max-sym-per-frame-{params.max_sym_per_frame}"
@@ -456,6 +488,11 @@ def main():
    model.eval()
    model.device = device
 
+    if params.decoding_method == "fast_beam_search":
+        decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
+    else:
+        decoding_graph = None
+
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")
 
@@ -472,7 +509,8 @@ def main():
            dl=test_dl,
            params=params,
            model=model,
-            lexicon=lexicon,
+            token_table=lexicon.token_table,
+            decoding_graph=decoding_graph,
        )
 
        save_results(
@@ -484,8 +522,5 @@ def main():
    logging.info("Done!")
 
 
-torch.set_num_threads(1)
-torch.set_num_interop_threads(1)
-
 if __name__ == "__main__":
    main()
diff --git a/egs/aishell/ASR/transducer_stateless_modified-2/pretrained.py b/egs/aishell/ASR/transducer_stateless_modified-2/pretrained.py
index 9e6ed96b1f..a95a4bc526 100755
--- a/egs/aishell/ASR/transducer_stateless_modified-2/pretrained.py
+++ b/egs/aishell/ASR/transducer_stateless_modified-2/pretrained.py
@@ -19,7 +19,7 @@
 """
 Usage:
 
-# greedy search
+(1) greedy search
 ./transducer_stateless_modified-2/pretrained.py \
  --checkpoint /path/to/pretrained.pt \
  --lang-dir /path/to/lang_char \
@@ -27,7 +27,7 @@
  /path/to/foo.wav \
  /path/to/bar.wav
 
-# beam search
+(2) beam search
 ./transducer_stateless_modified-2/pretrained.py \
  --checkpoint /path/to/pretrained.pt \
  --lang-dir /path/to/lang_char \
@@ -36,7 +36,7 @@
  /path/to/foo.wav \
  /path/to/bar.wav
 
-# modified beam search
+(3) modified beam search
 ./transducer_stateless_modified-2/pretrained.py \
  --checkpoint /path/to/pretrained.pt \
  --lang-dir /path/to/lang_char \
@@ -45,6 +45,14 @@
  /path/to/foo.wav \
  /path/to/bar.wav
 
+(4) fast beam search
+./transducer_stateless_modified-2/pretrained.py \
+  --checkpoint /path/to/pretrained.pt \
+  --lang-dir /path/to/lang_char \
+  --method fast_beam_search \
+  --beam 4 \
+  /path/to/foo.wav \
+  /path/to/bar.wav
 """
 
 import argparse
@@ -53,11 +61,13 @@
 from pathlib import Path
 from typing import List
 
+import k2
 import kaldifeat
 import torch
 import torchaudio
 from beam_search import (
    beam_search,
+    fast_beam_search_one_best,
    greedy_search,
    greedy_search_batch,
    modified_beam_search,
 )
@@ -97,6 +107,7 @@ def get_parser():
          - greedy_search
          - beam_search
          - modified_beam_search
+          - fast_beam_search
        """,
    )
@@ -121,7 +132,33 @@ def get_parser():
        "--beam-size",
        type=int,
        default=4,
-        help="Used only when --method is beam_search and modified_beam_search",
+        help="""An integer indicating how many candidates we will keep for each
+        frame. Used only when --method is beam_search or
+        modified_beam_search.""",
+    )
+
+    parser.add_argument(
+        "--beam",
+        type=float,
+        default=4,
+        help="""A floating point value to calculate the cutoff score during beam
+        search (i.e., `cutoff = max-score - beam`), which is the same as the
+        `beam` in Kaldi.
+        Used only when --method is fast_beam_search""",
+    )
+
+    parser.add_argument(
+        "--max-contexts",
+        type=int,
+        default=4,
+        help="""Used only when --method is fast_beam_search""",
+    )
+
+    parser.add_argument(
+        "--max-states",
+        type=int,
+        default=8,
+        help="""Used only when --method is fast_beam_search""",
    )
 
    parser.add_argument(
@@ -134,11 +171,10 @@ def get_parser():
    parser.add_argument(
        "--max-sym-per-frame",
        type=int,
-        default=3,
+        default=1,
        help="Maximum number of symbols per frame. "
        "Use only when --method is greedy_search",
    )
-    return parser
 
    return parser
@@ -225,20 +261,37 @@ def main():
    encoder_out, encoder_out_lens = model.encoder(
        x=features, x_lens=feature_lens
    )
+
+    num_waves = encoder_out.size(0)
    hyp_list = []
-    if params.method == "greedy_search" and params.max_sym_per_frame == 1:
+    logging.info(f"Using {params.method}")
+
+    if params.method == "fast_beam_search":
+        decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
+        hyp_list = fast_beam_search_one_best(
+            model=model,
+            decoding_graph=decoding_graph,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            beam=params.beam,
+            max_contexts=params.max_contexts,
+            max_states=params.max_states,
+        )
+    elif params.method == "greedy_search" and params.max_sym_per_frame == 1:
        hyp_list = greedy_search_batch(
            model=model,
            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
        )
    elif params.method == "modified_beam_search":
        hyp_list = modified_beam_search(
            model=model,
            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
            beam=params.beam_size,
        )
    else:
-        for i in range(encoder_out.size(0)):
+        for i in range(num_waves):
            # fmt: off
            encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]]
            # fmt: on
diff --git a/egs/aishell/ASR/transducer_stateless_modified/decode.py b/egs/aishell/ASR/transducer_stateless_modified/decode.py
index 5b5fe6ffa8..4773ebc7d5 100755
--- a/egs/aishell/ASR/transducer_stateless_modified/decode.py
+++ b/egs/aishell/ASR/transducer_stateless_modified/decode.py
@@ -19,48 +19,63 @@
 Usage:
 (1) greedy search
 ./transducer_stateless_modified/decode.py \
-        --epoch 64 \
-        --avg 33 \
-        --exp-dir ./transducer_stateless_modified/exp \
-        --max-duration 100 \
-        --decoding-method greedy_search
+  --epoch 14 \
+  --avg 7 \
+  --exp-dir ./transducer_stateless_modified/exp \
+  --max-duration 600 \
+  --decoding-method greedy_search
 
-(2) beam search
+(2) beam search (not recommended)
 ./transducer_stateless_modified/decode.py \
-        --epoch 14 \
-        --avg 7 \
-        --exp-dir ./transducer_stateless_modified/exp \
-        --max-duration 100 \
-        --decoding-method beam_search \
-        --beam-size 4
+  --epoch 14 \
+  --avg 7 \
+  --exp-dir ./transducer_stateless_modified/exp \
+  --max-duration 600 \
+  --decoding-method beam_search \
+  --beam-size 4
 
 (3) modified beam search
 ./transducer_stateless_modified/decode.py \
-        --epoch 14 \
-        --avg 7 \
-        --exp-dir ./transducer_stateless_modified/exp \
-        --max-duration 100 \
-        --decoding-method modified_beam_search \
-        --beam-size 4
+  --epoch 14 \
+  --avg 7 \
+  --exp-dir ./transducer_stateless_modified/exp \
+  --max-duration 600 \
+  --decoding-method modified_beam_search \
+  --beam-size 4
+
+(4) fast beam search
+./transducer_stateless_modified/decode.py \
+  --epoch 14 \
+  --avg 7 \
+  --exp-dir ./transducer_stateless_modified/exp \
+  --max-duration 600 \
+  --decoding-method fast_beam_search \
+  --beam 4 \
+  --max-contexts 4 \
+  --max-states 8
 """
 
 import argparse
 import logging
 from collections import defaultdict
 from pathlib import Path
-from typing import Dict, List, Tuple
+from typing import Dict, List, Optional, Tuple
 
+import k2
 import torch
 import torch.nn as nn
 from asr_datamodule import AishellAsrDataModule
-from beam_search import beam_search, greedy_search, modified_beam_search
-from conformer import Conformer
-from decoder import Decoder
-from joiner import Joiner
-from model import Transducer
+from beam_search import (
+    beam_search,
+    fast_beam_search_one_best,
+    greedy_search,
+    greedy_search_batch,
+    modified_beam_search,
+)
+from train import get_params, get_transducer_model
 
 from icefall.checkpoint import average_checkpoints, load_checkpoint
-from icefall.env import get_env_info
 from icefall.lexicon import Lexicon
 from icefall.utils import (
    AttributeDict,
@@ -113,6 +128,7 @@ def get_parser():
          - greedy_search
          - beam_search
          - modified_beam_search
+          - fast_beam_search
        """,
    )
 
@@ -120,94 +136,62 @@ def get_parser():
        "--beam-size",
        type=int,
        default=4,
-        help="Used only when --decoding-method is beam_search",
+        help="""An integer indicating how many candidates we will keep for each
+        frame. Used only when --decoding-method is beam_search or
+        modified_beam_search.""",
    )
 
    parser.add_argument(
-        "--context-size",
-        type=int,
-        default=2,
-        help="The context size in the decoder. 1 means bigram; "
-        "2 means tri-gram",
+        "--beam",
+        type=float,
+        default=4,
+        help="""A floating point value to calculate the cutoff score during beam
+        search (i.e., `cutoff = max-score - beam`), which is the same as the
+        `beam` in Kaldi.
+        Used only when --decoding-method is fast_beam_search""",
    )
+
    parser.add_argument(
-        "--max-sym-per-frame",
+        "--max-contexts",
        type=int,
-        default=3,
-        help="Maximum number of symbols per frame",
+        default=4,
+        help="""Used only when --decoding-method is
+        fast_beam_search""",
    )
-    return parser
-
-
-def get_params() -> AttributeDict:
-    params = AttributeDict(
-        {
-            # parameters for conformer
-            "feature_dim": 80,
-            "encoder_out_dim": 512,
-            "subsampling_factor": 4,
-            "attention_dim": 512,
-            "nhead": 8,
-            "dim_feedforward": 2048,
-            "num_encoder_layers": 12,
-            "vgg_frontend": False,
-            "env_info": get_env_info(),
-        }
-    )
-    return params
-
-
-def get_encoder_model(params: AttributeDict):
-    # TODO: We can add an option to switch between Conformer and Transformer
-    encoder = Conformer(
-        num_features=params.feature_dim,
-        output_dim=params.encoder_out_dim,
-        subsampling_factor=params.subsampling_factor,
-        d_model=params.attention_dim,
-        nhead=params.nhead,
-        dim_feedforward=params.dim_feedforward,
-        num_encoder_layers=params.num_encoder_layers,
-        vgg_frontend=params.vgg_frontend,
+    parser.add_argument(
+        "--max-states",
+        type=int,
+        default=8,
+        help="""Used only when --decoding-method is
+        fast_beam_search""",
    )
-    return encoder
-
-
-def get_decoder_model(params: AttributeDict):
-    decoder = Decoder(
-        vocab_size=params.vocab_size,
-        embedding_dim=params.encoder_out_dim,
-        blank_id=params.blank_id,
-        context_size=params.context_size,
+    parser.add_argument(
+        "--context-size",
+        type=int,
+        default=2,
+        help="The context size in the decoder. 1 means bigram; "
+        "2 means tri-gram",
    )
-    return decoder
-
-
-def get_joiner_model(params: AttributeDict):
-    joiner = Joiner(
-        input_dim=params.encoder_out_dim,
-        output_dim=params.vocab_size,
+    parser.add_argument(
+        "--max-sym-per-frame",
+        type=int,
+        default=1,
+        help="""Maximum number of symbols per frame.
+        Used only when --decoding-method is greedy_search""",
    )
-    return joiner
-
-
-def get_transducer_model(params: AttributeDict):
-    encoder = get_encoder_model(params)
-    decoder = get_decoder_model(params)
-    joiner = get_joiner_model(params)
-
-    model = Transducer(
-        encoder=encoder,
-        decoder=decoder,
-        joiner=joiner,
-    )
-    return model
+    return parser
 
 
 def decode_one_batch(
    params: AttributeDict,
    model: nn.Module,
-    lexicon: Lexicon,
+    token_table: k2.SymbolTable,
    batch: dict,
+    decoding_graph: Optional[k2.Fsa] = None,
 ) -> Dict[str, List[List[str]]]:
    """Decode one batch and return the result in a dict. The dict has the
    following format:
@@ -228,8 +212,11 @@ def decode_one_batch(
      It is the return value from iterating
      `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation
      for the format of the `batch`.
-    lexicon:
-      It contains the token symbol table and the word symbol table.
+    token_table:
+      It maps token ID to a string.
+    decoding_graph:
+      The decoding graph. Can be either a `k2.trivial_graph` or HLG.
+      Used only when --decoding-method is fast_beam_search.
    Returns:
      Return the decoding result. See above description for the format
      of the returned dict.
@@ -247,44 +234,80 @@ def decode_one_batch( encoder_out, encoder_out_lens = model.encoder( x=feature, x_lens=feature_lens ) - hyps = [] - batch_size = encoder_out.size(0) - - for i in range(batch_size): - # fmt: off - encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]] - # fmt: on - if params.decoding_method == "greedy_search": - hyp = greedy_search( - model=model, - encoder_out=encoder_out_i, - max_sym_per_frame=params.max_sym_per_frame, - ) - elif params.decoding_method == "beam_search": - hyp = beam_search( - model=model, encoder_out=encoder_out_i, beam=params.beam_size - ) - elif params.decoding_method == "modified_beam_search": - hyp = modified_beam_search( - model=model, encoder_out=encoder_out_i, beam=params.beam_size - ) - else: - raise ValueError( - f"Unsupported decoding method: {params.decoding_method}" - ) - hyps.append([lexicon.token_table[i] for i in hyp]) + + if params.decoding_method == "fast_beam_search": + hyp_tokens = fast_beam_search_one_best( + model=model, + decoding_graph=decoding_graph, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + beam=params.beam, + max_contexts=params.max_contexts, + max_states=params.max_states, + ) + elif ( + params.decoding_method == "greedy_search" + and params.max_sym_per_frame == 1 + ): + hyp_tokens = greedy_search_batch( + model=model, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + ) + elif params.decoding_method == "modified_beam_search": + hyp_tokens = modified_beam_search( + model=model, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + beam=params.beam_size, + ) + else: + hyp_tokens = [] + batch_size = encoder_out.size(0) + for i in range(batch_size): + # fmt: off + encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]] + # fmt: on + if params.decoding_method == "greedy_search": + hyp = greedy_search( + model=model, + encoder_out=encoder_out_i, + max_sym_per_frame=params.max_sym_per_frame, + ) + elif params.decoding_method == "beam_search": + hyp = beam_search( + model=model, + encoder_out=encoder_out_i, + beam=params.beam_size, + ) + else: + raise ValueError( + f"Unsupported decoding method: {params.decoding_method}" + ) + hyp_tokens.append(hyp) + + hyps = [[token_table[t] for t in tokens] for tokens in hyp_tokens] if params.decoding_method == "greedy_search": return {"greedy_search": hyps} + elif params.decoding_method == "fast_beam_search": + return { + ( + f"beam_{params.beam}_" + f"max_contexts_{params.max_contexts}_" + f"max_states_{params.max_states}" + ): hyps + } else: - return {f"beam_{params.beam_size}": hyps} + return {f"beam_size_{params.beam_size}": hyps} def decode_dataset( dl: torch.utils.data.DataLoader, params: AttributeDict, model: nn.Module, - lexicon: Lexicon, + token_table: k2.SymbolTable, + decoding_graph: Optional[k2.Fsa] = None, ) -> Dict[str, List[Tuple[List[str], List[str]]]]: """Decode dataset. @@ -295,6 +318,11 @@ It is returned by :func:`get_params`. model: The neural model. + token_table: + It maps a token ID to a string. + decoding_graph: + The decoding graph. Can be either a `k2.trivial_graph` or HLG. Used + only when --decoding-method is fast_beam_search. Returns: Return a dict, whose key may be "greedy_search" if greedy search is used, or it may be "beam_7" if beam size of 7 is used. @@ -310,9 +338,9 @@ num_batches = "?"
if params.decoding_method == "greedy_search": - log_interval = 100 + log_interval = 50 else: - log_interval = 2 + log_interval = 10 results = defaultdict(list) for batch_idx, batch in enumerate(dl): @@ -321,7 +349,8 @@ hyps_dict = decode_one_batch( params=params, model=model, - lexicon=lexicon, + token_table=token_table, + decoding_graph=decoding_graph, batch=batch, ) @@ -356,6 +385,7 @@ def save_results( params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt" ) store_transcripts(filename=recog_path, texts=results) + logging.info(f"The transcripts are stored in {recog_path}") # The following prints out WERs, per-word error statistics and aligned # ref/hyp pairs. @@ -406,13 +436,21 @@ def main(): assert params.decoding_method in ( "greedy_search", "beam_search", + "fast_beam_search", "modified_beam_search", ) params.res_dir = params.exp_dir / params.decoding_method params.suffix = f"epoch-{params.epoch}-avg-{params.avg}" - if "beam_search" in params.decoding_method: - params.suffix += f"-beam-{params.beam_size}" + + if "fast_beam_search" in params.decoding_method: + params.suffix += f"-beam-{params.beam}" + params.suffix += f"-max-contexts-{params.max_contexts}" + params.suffix += f"-max-states-{params.max_states}" + elif "beam_search" in params.decoding_method: + params.suffix += ( + f"-{params.decoding_method}-beam-size-{params.beam_size}" + ) else: params.suffix += f"-context-{params.context_size}" params.suffix += f"-max-sym-per-frame-{params.max_sym_per_frame}" @@ -452,6 +490,11 @@ def main(): model.eval() model.device = device + if params.decoding_method == "fast_beam_search": + decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device) + else: + decoding_graph = None + num_param = sum([p.numel() for p in model.parameters()]) logging.info(f"Number of model parameters: {num_param}") @@ -467,7 +510,8 @@ dl=test_dl, params=params, model=model, - lexicon=lexicon, + token_table=lexicon.token_table, + decoding_graph=decoding_graph, ) save_results( @@ -479,8 +523,5 @@ logging.info("Done!") -torch.set_num_threads(1) -torch.set_num_interop_threads(1) - if __name__ == "__main__": main() diff --git a/egs/aishell/ASR/transducer_stateless_modified/pretrained.py b/egs/aishell/ASR/transducer_stateless_modified/pretrained.py index f7c5b24ba9..262e822c20 100755 --- a/egs/aishell/ASR/transducer_stateless_modified/pretrained.py +++ b/egs/aishell/ASR/transducer_stateless_modified/pretrained.py @@ -19,7 +19,7 @@ """ Usage: -# greedy search +(1) greedy search ./transducer_stateless_modified/pretrained.py \ --checkpoint /path/to/pretrained.pt \ --lang-dir /path/to/lang_char \ @@ -27,7 +27,7 @@ /path/to/foo.wav \ /path/to/bar.wav -# beam search +(2) beam search ./transducer_stateless_modified/pretrained.py \ --checkpoint /path/to/pretrained.pt \ --lang-dir /path/to/lang_char \ @@ -36,7 +36,7 @@ /path/to/foo.wav \ /path/to/bar.wav -# modified beam search +(3) modified beam search ./transducer_stateless_modified/pretrained.py \ --checkpoint /path/to/pretrained.pt \ --lang-dir /path/to/lang_char \ @@ -45,6 +45,14 @@ /path/to/foo.wav \ /path/to/bar.wav +(4) fast beam search +./transducer_stateless_modified/pretrained.py \ + --checkpoint /path/to/pretrained.pt \ + --lang-dir /path/to/lang_char \ + --method fast_beam_search \ + --beam 4 \ + /path/to/foo.wav \ + /path/to/bar.wav """ import argparse @@ -53,11 +61,13 @@ from pathlib import Path from typing import List +import k2 import kaldifeat import torch import torchaudio from beam_search
import ( beam_search, + fast_beam_search_one_best, greedy_search, greedy_search_batch, modified_beam_search, @@ -97,6 +107,7 @@ def get_parser(): - greedy_search - beam_search - modified_beam_search + - fast_beam_search """, ) @@ -121,7 +132,33 @@ def get_parser(): "--beam-size", type=int, default=4, - help="Used only when --method is beam_search and modified_beam_search", + help="""An integer indicating how many candidates we will keep for each + frame. Used only when --method is beam_search or + modified_beam_search.""", + ) + + parser.add_argument( + "--beam", + type=float, + default=4, + help="""A floating point value to calculate the cutoff score during beam + search (i.e., `cutoff = max-score - beam`), which is the same as the + `beam` in Kaldi. + Used only when --method is fast_beam_search""", + ) + + parser.add_argument( + "--max-contexts", + type=int, + default=4, + help="""Used only when --method is fast_beam_search""", + ) + + parser.add_argument( + "--max-states", + type=int, + default=8, + help="""Used only when --method is fast_beam_search""", ) parser.add_argument( @@ -134,11 +171,10 @@ parser.add_argument( "--max-sym-per-frame", type=int, - default=3, + default=1, help="Maximum number of symbols per frame. " "Use only when --method is greedy_search", ) - return parser return parser @@ -225,20 +261,37 @@ def main(): encoder_out, encoder_out_lens = model.encoder( x=features, x_lens=feature_lens ) + + num_waves = encoder_out.size(0) hyp_list = [] - if params.method == "greedy_search" and params.max_sym_per_frame == 1: + logging.info(f"Using {params.method}") + + if params.method == "fast_beam_search": + decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device) + hyp_list = fast_beam_search_one_best( + model=model, + decoding_graph=decoding_graph, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + beam=params.beam, + max_contexts=params.max_contexts, + max_states=params.max_states, + ) + elif params.method == "greedy_search" and params.max_sym_per_frame == 1: hyp_list = greedy_search_batch( model=model, encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, ) elif params.method == "modified_beam_search": hyp_list = modified_beam_search( model=model, encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, beam=params.beam_size, ) else: - for i in range(encoder_out.size(0)): + for i in range(num_waves): # fmt: off encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]] # fmt: on diff --git a/egs/librispeech/ASR/pruned_transducer_stateless/beam_search.py b/egs/librispeech/ASR/pruned_transducer_stateless/beam_search.py index 5d1e9b4716..db23fd993f 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless/beam_search.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless/beam_search.py @@ -27,7 +27,7 @@ from icefall.utils import get_texts -def fast_beam_search( +def fast_beam_search_one_best( model: Transducer, decoding_graph: k2.Fsa, encoder_out: torch.Tensor, @@ -35,10 +35,12 @@ beam: float, max_states: int, max_contexts: int, - use_max: bool = False, ) -> List[List[int]]: """It limits the maximum number of symbols per frame to 1. + A lattice is first obtained using fast beam search, and then + the shortest path within the lattice is used as the final output. + Args: model: An instance of `Transducer`. @@ -55,12 +57,151 @@ Max states per stream per frame. max_contexts: Max contexts pre stream per frame.
- use_max: - True to use max operation to select the hypothesis with the largest - log_prob when there are duplicate hypotheses; False to use log-add. Returns: Return the decoded result. """ + lattice = fast_beam_search( + model=model, + decoding_graph=decoding_graph, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + beam=beam, + max_states=max_states, + max_contexts=max_contexts, + ) + + best_path = one_best_decoding(lattice) + hyps = get_texts(best_path) + return hyps + + +def fast_beam_search_nbest_oracle( + model: Transducer, + decoding_graph: k2.Fsa, + encoder_out: torch.Tensor, + encoder_out_lens: torch.Tensor, + beam: float, + max_states: int, + max_contexts: int, + num_paths: int, + ref_texts: List[List[int]], + use_double_scores: bool = True, + nbest_scale: float = 0.5, +) -> List[List[int]]: + """It limits the maximum number of symbols per frame to 1. + + A lattice is first obtained using fast beam search, and then + we select `num_paths` linear paths from the lattice. The path + that has the minimum edit distance with the given reference transcript + is used as the output. + + This is the best result we can achieve for any nbest based rescoring + methods. + + Args: + model: + An instance of `Transducer`. + decoding_graph: + Decoding graph used for decoding, may be a TrivialGraph or a HLG. + encoder_out: + A tensor of shape (N, T, C) from the encoder. + encoder_out_lens: + A tensor of shape (N,) containing the number of frames in `encoder_out` + before padding. + beam: + Beam value, similar to the beam used in Kaldi. + max_states: + Max states per stream per frame. + max_contexts: + Max contexts per stream per frame. + num_paths: + Number of paths to extract from the decoded lattice. + ref_texts: + A list-of-list of integers containing the reference transcripts. + If the decoding_graph is a trivial_graph, the integer ID is the + BPE token ID. + use_double_scores: + True to use double precision for computation. False to use + single precision. + nbest_scale: + It's the scale applied to the lattice.scores. A smaller value + yields more unique paths. + + Returns: + Return the decoded result. + """ + lattice = fast_beam_search( + model=model, + decoding_graph=decoding_graph, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + beam=beam, + max_states=max_states, + max_contexts=max_contexts, + ) + + nbest = Nbest.from_lattice( + lattice=lattice, + num_paths=num_paths, + use_double_scores=use_double_scores, + nbest_scale=nbest_scale, + ) + + hyps = nbest.build_levenshtein_graphs() + refs = k2.levenshtein_graph(ref_texts, device=hyps.device) + + levenshtein_alignment = k2.levenshtein_alignment( + refs=refs, + hyps=hyps, + hyp_to_ref_map=nbest.shape.row_ids(1), + sorted_match_ref=True, + ) + + tot_scores = levenshtein_alignment.get_tot_scores( + use_double_scores=False, log_semiring=False + ) + ragged_tot_scores = k2.RaggedTensor(nbest.shape, tot_scores) + + max_indexes = ragged_tot_scores.argmax() + + best_path = k2.index_fsa(nbest.fsa, max_indexes) + + hyps = get_texts(best_path) + return hyps + + +def fast_beam_search( + model: Transducer, + decoding_graph: k2.Fsa, + encoder_out: torch.Tensor, + encoder_out_lens: torch.Tensor, + beam: float, + max_states: int, + max_contexts: int, +) -> k2.Fsa: + """It limits the maximum number of symbols per frame to 1. + + Args: + model: + An instance of `Transducer`. + decoding_graph: + Decoding graph used for decoding, may be a TrivialGraph or a HLG. + encoder_out: + A tensor of shape (N, T, C) from the encoder.
+ encoder_out_lens: + A tensor of shape (N,) containing the number of frames in `encoder_out` + before padding. + beam: + Beam value, similar to the beam used in Kaldi. + max_states: + Max states per stream per frame. + max_contexts: + Max contexts per stream per frame. + Returns: + Return an FsaVec with axes [utt][state][arc] containing the decoded + lattice. Note: When the input graph is a TrivialGraph, the returned + lattice is actually an acceptor. + """ assert encoder_out.ndim == 3 context_size = model.decoder.context_size @@ -92,7 +233,7 @@ # (shape.NumElements(), 1, encoder_out_dim) # fmt: off current_encoder_out = torch.index_select( - encoder_out[:, t:t + 1, :], 0, shape.row_ids(1).long() + encoder_out[:, t:t + 1, :], 0, shape.row_ids(1).to(torch.int64) # in some old versions of pytorch, the type of index requires # to be LongTensor. In the newest version of pytorch, the type # of index can be IntTensor or LongTensor. For supporting the @@ -109,67 +250,7 @@ decoding_streams.terminate_and_flush_to_streams() lattice = decoding_streams.format_output(encoder_out_lens.tolist()) - if use_max: - best_path = one_best_decoding(lattice) - hyps = get_texts(best_path) - return hyps - - else: - num_paths = 200 - use_double_scores = True - nbest_scale = 0.8 - - nbest = Nbest.from_lattice( - lattice=lattice, - num_paths=num_paths, - use_double_scores=use_double_scores, - nbest_scale=nbest_scale, - ) - # The following code is modified from nbest.intersect() - word_fsa = k2.invert(nbest.fsa) - if hasattr(lattice, "aux_labels"): - # delete token IDs as it is not needed - del word_fsa.aux_labels - word_fsa.scores.zero_() - - word_fsa_with_epsilon_loops = k2.linear_fsa_with_self_loops(word_fsa) - path_to_utt_map = nbest.shape.row_ids(1) - - if hasattr(lattice, "aux_labels"): - # lattice has token IDs as labels and word IDs as aux_labels.
- # inv_lattice has word IDs as labels and token IDs as aux_labels - inv_lattice = k2.invert(lattice) - inv_lattice = k2.arc_sort(inv_lattice) - else: - inv_lattice = k2.arc_sort(lattice) - - if inv_lattice.shape[0] == 1: - path_lattice = k2.intersect_device( - inv_lattice, - word_fsa_with_epsilon_loops, - b_to_a_map=torch.zeros_like(path_to_utt_map), - sorted_match_a=True, - ) - else: - path_lattice = k2.intersect_device( - inv_lattice, - word_fsa_with_epsilon_loops, - b_to_a_map=path_to_utt_map, - sorted_match_a=True, - ) - - # path_lattice has word IDs as labels and token IDs as aux_labels - path_lattice = k2.top_sort(k2.connect(path_lattice)) - - tot_scores = path_lattice.get_tot_scores( - use_double_scores=use_double_scores, log_semiring=True - ) - - ragged_tot_scores = k2.RaggedTensor(nbest.shape, tot_scores) - best_hyp_indexes = ragged_tot_scores.argmax() - - best_path = k2.index_fsa(nbest.fsa, best_hyp_indexes) - hyps = get_texts(best_path) - return hyps + return lattice def greedy_search( @@ -193,10 +274,10 @@ def greedy_search( assert encoder_out.size(0) == 1, encoder_out.size(0) blank_id = model.decoder.blank_id - unk_id = model.decoder.unk_id context_size = model.decoder.context_size + unk_id = getattr(model, "unk_id", blank_id) - device = model.device + device = next(model.parameters()).device decoder_input = torch.tensor( [blank_id] * context_size, device=device, dtype=torch.int64 @@ -230,7 +311,7 @@ def greedy_search( # logits is (1, 1, 1, vocab_size) y = logits.argmax().item() - if y != blank_id and y != unk_id: + if y not in (blank_id, unk_id): hyp.append(y) decoder_input = torch.tensor( [hyp[-context_size:]], device=device @@ -249,7 +330,9 @@ def greedy_search( def greedy_search_batch( - model: Transducer, encoder_out: torch.Tensor + model: Transducer, + encoder_out: torch.Tensor, + encoder_out_lens: torch.Tensor, ) -> List[List[int]]: """Greedy search in batch mode. It hardcodes --max-sym-per-frame=1. Args: @@ -257,6 +340,9 @@ def greedy_search_batch( The transducer model. encoder_out: Output from the encoder. Its shape is (N, T, C), where N >= 1. + encoder_out_lens: + A 1-D tensor of shape (N,), containing number of valid frames in + encoder_out before padding. Returns: Return a list-of-list of token IDs containing the decoded results. len(ans) equals to encoder_out.size(0). 
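The hunk below, like its counterparts in the other recipes, is where padding frames actually get dropped: the padded (N, T, C) encoder output is converted into a packed sequence so the joiner only ever sees valid frames. A self-contained sketch of the `pack_padded_sequence` bookkeeping it relies on, using toy tensors (3 utterances padded to 4 frames, 2 features):

import torch
from torch.nn.utils.rnn import pack_padded_sequence

encoder_out = torch.randn(3, 4, 2)          # (N, T, C), zero-padded on T
encoder_out_lens = torch.tensor([4, 2, 3])  # valid frames per utterance

packed = pack_padded_sequence(
    encoder_out, encoder_out_lens, batch_first=True, enforce_sorted=False
)
# packed.data keeps only the 4 + 2 + 3 = 9 valid frames, frame-major and
# sorted by decreasing utterance length; padding never reaches the joiner.
print(packed.data.shape)        # torch.Size([9, 2])
# batch_sizes[t] = number of utterances still alive at frame t.
print(packed.batch_sizes)       # tensor([3, 3, 2, 1])
# unsorted_indices maps length-sorted results back to the caller's order.
print(packed.unsorted_indices)  # tensor([0, 2, 1])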
@@ -264,28 +350,48 @@ def greedy_search_batch( assert encoder_out.ndim == 3 assert encoder_out.size(0) >= 1, encoder_out.size(0) - device = model.device + packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence( + input=encoder_out, + lengths=encoder_out_lens.cpu(), + batch_first=True, + enforce_sorted=False, + ) - batch_size = encoder_out.size(0) - T = encoder_out.size(1) + device = next(model.parameters()).device blank_id = model.decoder.blank_id - unk_id = model.decoder.unk_id + unk_id = getattr(model, "unk_id", blank_id) context_size = model.decoder.context_size - hyps = [[blank_id] * context_size for _ in range(batch_size)] + batch_size_list = packed_encoder_out.batch_sizes.tolist() + N = encoder_out.size(0) + assert torch.all(encoder_out_lens > 0), encoder_out_lens + assert N == batch_size_list[0], (N, batch_size_list) + + hyps = [[blank_id] * context_size for _ in range(N)] decoder_input = torch.tensor( hyps, device=device, dtype=torch.int64, - ) # (batch_size, context_size) + ) # (N, context_size) decoder_out = model.decoder(decoder_input, need_pad=False) - # decoder_out: (batch_size, 1, decoder_out_dim) - for t in range(T): - current_encoder_out = encoder_out[:, t : t + 1, :].unsqueeze(2) # noqa + # decoder_out: (N, 1, decoder_out_dim) + + encoder_out = packed_encoder_out.data + + offset = 0 + for batch_size in batch_size_list: + start = offset + end = offset + batch_size + current_encoder_out = encoder_out.data[start:end] + current_encoder_out = current_encoder_out.unsqueeze(1).unsqueeze(1) # current_encoder_out's shape: (batch_size, 1, 1, encoder_out_dim) + offset = end + + decoder_out = decoder_out[:batch_size] + logits = model.joiner(current_encoder_out, decoder_out.unsqueeze(1)) # logits'shape (batch_size, 1, 1, vocab_size) @@ -294,12 +400,12 @@ def greedy_search_batch( y = logits.argmax(dim=1).tolist() emitted = False for i, v in enumerate(y): - if v != blank_id and v != unk_id: + if v not in (blank_id, unk_id): hyps[i].append(v) emitted = True if emitted: # update decoder output - decoder_input = [h[-context_size:] for h in hyps] + decoder_input = [h[-context_size:] for h in hyps[:batch_size]] decoder_input = torch.tensor( decoder_input, device=device, @@ -307,7 +413,12 @@ def greedy_search_batch( ) decoder_out = model.decoder(decoder_input, need_pad=False) - ans = [h[context_size:] for h in hyps] + sorted_ans = [h[context_size:] for h in hyps] + ans = [] + unsorted_indices = packed_encoder_out.unsorted_indices.tolist() + for i in range(N): + ans.append(sorted_ans[unsorted_indices[i]]) + return ans @@ -472,6 +583,7 @@ def _get_hyps_shape(hyps: List[HypothesisList]) -> k2.RaggedShape: def modified_beam_search( model: Transducer, encoder_out: torch.Tensor, + encoder_out_lens: torch.Tensor, beam: int = 4, use_max: bool = False, ) -> List[List[int]]: @@ -482,6 +594,9 @@ def modified_beam_search( The transducer model. encoder_out: Output from the encoder. Its shape is (N, T, C). + encoder_out_lens: + A 1-D tensor of shape (N,), containing number of valid frames in + encoder_out before padding. beam: Number of active paths during the beam search. use_max: @@ -492,16 +607,27 @@ def modified_beam_search( for the i-th utterance. 
""" assert encoder_out.ndim == 3, encoder_out.shape + assert encoder_out.size(0) >= 1, encoder_out.size(0) - batch_size = encoder_out.size(0) - T = encoder_out.size(1) + packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence( + input=encoder_out, + lengths=encoder_out_lens.cpu(), + batch_first=True, + enforce_sorted=False, + ) blank_id = model.decoder.blank_id - unk_id = model.decoder.unk_id + unk_id = getattr(model, "unk_id", blank_id) context_size = model.decoder.context_size - device = model.device - B = [HypothesisList() for _ in range(batch_size)] - for i in range(batch_size): + device = next(model.parameters()).device + + batch_size_list = packed_encoder_out.batch_sizes.tolist() + N = encoder_out.size(0) + assert torch.all(encoder_out_lens > 0), encoder_out_lens + assert N == batch_size_list[0], (N, batch_size_list) + + B = [HypothesisList() for _ in range(N)] + for i in range(N): B[i].add( Hypothesis( ys=[blank_id] * context_size, @@ -510,9 +636,20 @@ def modified_beam_search( use_max=use_max, ) - for t in range(T): - current_encoder_out = encoder_out[:, t : t + 1, :].unsqueeze(2) # noqa + encoder_out = packed_encoder_out.data + + offset = 0 + finalized_B = [] + for batch_size in batch_size_list: + start = offset + end = offset + batch_size + current_encoder_out = encoder_out.data[start:end] + current_encoder_out = current_encoder_out.unsqueeze(1).unsqueeze(1) # current_encoder_out's shape is (batch_size, 1, 1, encoder_out_dim) + offset = end + + finalized_B = B[batch_size:] + finalized_B + B = B[:batch_size] hyps_shape = _get_hyps_shape(B).to(device) @@ -577,15 +714,21 @@ def modified_beam_search( new_ys = hyp.ys[:] new_token = topk_token_indexes[k] - if new_token != blank_id and new_token != unk_id: + if new_token not in (blank_id, unk_id): new_ys.append(new_token) new_log_prob = topk_log_probs[k] new_hyp = Hypothesis(ys=new_ys, log_prob=new_log_prob) B[i].add(new_hyp) + B = B + finalized_B best_hyps = [b.get_most_probable(length_norm=True) for b in B] - ans = [h.ys[context_size:] for h in best_hyps] + + sorted_ans = [h.ys[context_size:] for h in best_hyps] + ans = [] + unsorted_indices = packed_encoder_out.unsorted_indices.tolist() + for i in range(N): + ans.append(sorted_ans[unsorted_indices[i]]) return ans @@ -622,10 +765,10 @@ def _deprecated_modified_beam_search( # support only batch_size == 1 for now assert encoder_out.size(0) == 1, encoder_out.size(0) blank_id = model.decoder.blank_id - unk_id = model.decoder.unk_id + unk_id = getattr(model, "unk_id", blank_id) context_size = model.decoder.context_size - device = model.device + device = next(model.parameters()).device T = encoder_out.size(1) @@ -691,7 +834,7 @@ def _deprecated_modified_beam_search( hyp = A[topk_hyp_indexes[i]] new_ys = hyp.ys[:] new_token = topk_token_indexes[i] - if new_token != blank_id and new_token != unk_id: + if new_token not in (blank_id, unk_id): new_ys.append(new_token) new_log_prob = topk_log_probs[i] new_hyp = Hypothesis(ys=new_ys, log_prob=new_log_prob) @@ -732,10 +875,10 @@ def beam_search( # support only batch_size == 1 for now assert encoder_out.size(0) == 1, encoder_out.size(0) blank_id = model.decoder.blank_id - unk_id = model.decoder.unk_id + unk_id = getattr(model, "unk_id", blank_id) context_size = model.decoder.context_size - device = model.device + device = next(model.parameters()).device decoder_input = torch.tensor( [blank_id] * context_size, @@ -818,7 +961,7 @@ def beam_search( # Second, process other non-blank labels values, indices = log_prob.topk(beam + 1) for i, v in 
zip(indices.tolist(), values.tolist()): - if i == blank_id or i == unk_id: + if i in (blank_id, unk_id): continue new_ys = y_star.ys + [i] new_log_prob = y_star.log_prob + v diff --git a/egs/librispeech/ASR/pruned_transducer_stateless/decode.py b/egs/librispeech/ASR/pruned_transducer_stateless/decode.py index 349e4c281a..ea43836bd9 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless/decode.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless/decode.py @@ -19,53 +19,53 @@ Usage: (1) greedy search ./pruned_transducer_stateless/decode.py \ - --epoch 28 \ - --avg 15 \ - --exp-dir ./pruned_transducer_stateless/exp \ - --max-duration 100 \ - --decoding-method greedy_search + --epoch 28 \ + --avg 15 \ + --exp-dir ./pruned_transducer_stateless/exp \ + --max-duration 600 \ + --decoding-method greedy_search -(2) beam search +(2) beam search (not recommended) ./pruned_transducer_stateless/decode.py \ - --epoch 28 \ - --avg 15 \ - --exp-dir ./pruned_transducer_stateless/exp \ - --max-duration 100 \ - --decoding-method beam_search \ - --beam-size 4 + --epoch 28 \ + --avg 15 \ + --exp-dir ./pruned_transducer_stateless/exp \ + --max-duration 600 \ + --decoding-method beam_search \ + --beam-size 4 (3) modified beam search ./pruned_transducer_stateless/decode.py \ - --epoch 28 \ - --avg 15 \ - --exp-dir ./pruned_transducer_stateless/exp \ - --max-duration 100 \ - --decoding-method modified_beam_search \ - --beam-size 4 + --epoch 28 \ + --avg 15 \ + --exp-dir ./pruned_transducer_stateless/exp \ + --max-duration 600 \ + --decoding-method modified_beam_search \ + --beam-size 4 (4) fast beam search ./pruned_transducer_stateless/decode.py \ - --epoch 28 \ - --avg 15 \ - --exp-dir ./pruned_transducer_stateless/exp \ - --max-duration 1500 \ - --decoding-method fast_beam_search \ - --beam 4 \ - --max-contexts 4 \ - --max-states 8 + --epoch 28 \ + --avg 15 \ + --exp-dir ./pruned_transducer_stateless/exp \ + --max-duration 600 \ + --decoding-method fast_beam_search \ + --beam 4 \ + --max-contexts 4 \ + --max-states 8 (5) fast beam search using LG ./pruned_transducer_stateless/decode.py \ - --epoch 28 \ - --avg 15 \ - --exp-dir ./pruned_transducer_stateless/exp \ - --use-LG True \ - --use-max False \ - --max-duration 1500 \ - --decoding-method fast_beam_search \ - --beam 8 \ - --max-contexts 8 \ - --max-states 64 + --epoch 28 \ + --avg 15 \ + --exp-dir ./pruned_transducer_stateless/exp \ + --use-LG True \ + --use-max False \ + --max-duration 600 \ + --decoding-method fast_beam_search \ + --beam 8 \ + --max-contexts 8 \ + --max-states 64 """ @@ -82,7 +82,7 @@ from asr_datamodule import LibriSpeechAsrDataModule from beam_search import ( beam_search, - fast_beam_search, + fast_beam_search_one_best, greedy_search, greedy_search_batch, modified_beam_search, @@ -307,7 +307,7 @@ def decode_one_batch( hyps = [] if params.decoding_method == "fast_beam_search": - hyp_tokens = fast_beam_search( + hyp_tokens = fast_beam_search_one_best( model=model, decoding_graph=decoding_graph, encoder_out=encoder_out, @@ -315,7 +315,6 @@ def decode_one_batch( beam=params.beam, max_contexts=params.max_contexts, max_states=params.max_states, - use_max=params.use_max, ) if params.use_LG: for hyp in hyp_tokens: @@ -330,6 +329,7 @@ def decode_one_batch( hyp_tokens = greedy_search_batch( model=model, encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, ) for hyp in sp.decode(hyp_tokens): hyps.append(hyp.split()) @@ -337,6 +337,7 @@ def decode_one_batch( hyp_tokens = modified_beam_search( model=model, encoder_out=encoder_out, + 
encoder_out_lens=encoder_out_lens, beam=params.beam_size, use_max=params.use_max, ) @@ -421,9 +422,9 @@ num_batches = "?" if params.decoding_method == "greedy_search": - log_interval = 100 + log_interval = 50 else: - log_interval = 2 + log_interval = 10 results = defaultdict(list) for batch_idx, batch in enumerate(dl): diff --git a/egs/librispeech/ASR/pruned_transducer_stateless/pretrained.py b/egs/librispeech/ASR/pruned_transducer_stateless/pretrained.py index 3cc4729747..148bf7b028 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless/pretrained.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless/pretrained.py @@ -25,7 +25,7 @@ /path/to/foo.wav \ /path/to/bar.wav \ -(1) beam search +(2) beam search ./pruned_transducer_stateless/pretrained.py \ --checkpoint ./pruned_transducer_stateless/exp/pretrained.pt \ --bpe-model ./data/lang_bpe_500/bpe.model \ @@ -34,6 +34,24 @@ /path/to/foo.wav \ /path/to/bar.wav \ +(3) modified beam search +./pruned_transducer_stateless/pretrained.py \ + --checkpoint ./pruned_transducer_stateless/exp/pretrained.pt \ + --bpe-model ./data/lang_bpe_500/bpe.model \ + --method modified_beam_search \ + --beam-size 4 \ + /path/to/foo.wav \ + /path/to/bar.wav \ + +(4) fast beam search +./pruned_transducer_stateless/pretrained.py \ + --checkpoint ./pruned_transducer_stateless/exp/pretrained.pt \ + --bpe-model ./data/lang_bpe_500/bpe.model \ + --method fast_beam_search \ + --beam 4 \ + /path/to/foo.wav \ + /path/to/bar.wav \ + You can also use `./pruned_transducer_stateless/exp/epoch-xx.pt`. Note: ./pruned_transducer_stateless/exp/pretrained.pt is generated by @@ -46,12 +64,14 @@ import math from typing import List +import k2 import kaldifeat import sentencepiece as spm import torch import torchaudio from beam_search import ( beam_search, + fast_beam_search_one_best, greedy_search, greedy_search_batch, modified_beam_search, @@ -77,9 +97,7 @@ def get_parser(): parser.add_argument( "--bpe-model", type=str, - help="""Path to bpe.model. - Used only when method is ctc-decoding. - """, + help="""Path to bpe.model.""", ) parser.add_argument( @@ -90,6 +108,7 @@ - greedy_search - beam_search - modified_beam_search + - fast_beam_search """, ) @@ -114,7 +133,33 @@ "--beam-size", type=int, default=4, - help="Used only when --method is beam_search and modified_beam_search", + help="""An integer indicating how many candidates we will keep for each + frame. Used only when --method is beam_search or + modified_beam_search.""", + ) + + parser.add_argument( + "--beam", + type=float, + default=4, + help="""A floating point value to calculate the cutoff score during beam + search (i.e., `cutoff = max-score - beam`), which is the same as the + `beam` in Kaldi.
+ Used only when --method is fast_beam_search""", + ) + + parser.add_argument( + "--max-contexts", + type=int, + default=4, + help="""Used only when --method is fast_beam_search""", + ) + + parser.add_argument( + "--max-states", + type=int, + default=8, + help="""Used only when --method is fast_beam_search""", ) parser.add_argument( @@ -230,10 +275,25 @@ def main(): if params.method == "beam_search": msg += f" with beam size {params.beam_size}" logging.info(msg) - if params.method == "modified_beam_search": + + if params.method == "fast_beam_search": + decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device) + hyp_tokens = fast_beam_search_one_best( + model=model, + decoding_graph=decoding_graph, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + beam=params.beam, + max_contexts=params.max_contexts, + max_states=params.max_states, + ) + for hyp in sp.decode(hyp_tokens): + hyps.append(hyp.split()) + elif params.method == "modified_beam_search": hyp_tokens = modified_beam_search( model=model, encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, beam=params.beam_size, ) @@ -243,6 +303,7 @@ def main(): hyp_tokens = greedy_search_batch( model=model, encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, ) for hyp in sp.decode(hyp_tokens): hyps.append(hyp.split()) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py b/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py index fc1285dc72..ce8b04afdb 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py @@ -335,7 +335,9 @@ def greedy_search( def greedy_search_batch( - model: Transducer, encoder_out: torch.Tensor + model: Transducer, + encoder_out: torch.Tensor, + encoder_out_lens: torch.Tensor, ) -> List[List[int]]: """Greedy search in batch mode. It hardcodes --max-sym-per-frame=1. Args: @@ -343,6 +345,9 @@ def greedy_search_batch( The transducer model. encoder_out: Output from the encoder. Its shape is (N, T, C), where N >= 1. + encoder_out_lens: + A 1-D tensor of shape (N,), containing number of valid frames in + encoder_out before padding. Returns: Return a list-of-list of token IDs containing the decoded results. len(ans) equals to encoder_out.size(0). 
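The rewritten loop below no longer iterates `for t in range(T)` over padded frames; it walks `packed.batch_sizes` instead, so the effective batch shrinks as short utterances finish. A toy sketch of that pattern (pure PyTorch; `decoder_out` here is a stand-in for the per-utterance decoder state):

import torch
from torch.nn.utils.rnn import pack_padded_sequence

encoder_out = torch.randn(3, 4, 2)
encoder_out_lens = torch.tensor([4, 2, 3])
packed = pack_padded_sequence(
    encoder_out, encoder_out_lens, batch_first=True, enforce_sorted=False
)

decoder_out = torch.zeros(3, 1)  # one state per utterance, sorted order
offset = 0
for t, batch_size in enumerate(packed.batch_sizes.tolist()):
    # Frame t of every utterance that is still alive, in sorted order.
    current_encoder_out = packed.data[offset : offset + batch_size]
    offset += batch_size
    # Finished utterances sit at the tail of the length-sorted batch, so
    # the decoder state can simply be trimmed instead of masked.
    decoder_out = decoder_out[:batch_size]
    print(t, current_encoder_out.shape, decoder_out.shape)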
@@ -350,31 +355,49 @@ def greedy_search_batch( assert encoder_out.ndim == 3 assert encoder_out.size(0) >= 1, encoder_out.size(0) - device = next(model.parameters()).device + packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence( + input=encoder_out, + lengths=encoder_out_lens.cpu(), + batch_first=True, + enforce_sorted=False, + ) - batch_size = encoder_out.size(0) - T = encoder_out.size(1) + device = next(model.parameters()).device blank_id = model.decoder.blank_id unk_id = getattr(model, "unk_id", blank_id) context_size = model.decoder.context_size - hyps = [[blank_id] * context_size for _ in range(batch_size)] + batch_size_list = packed_encoder_out.batch_sizes.tolist() + N = encoder_out.size(0) + assert torch.all(encoder_out_lens > 0), encoder_out_lens + assert N == batch_size_list[0], (N, batch_size_list) + + hyps = [[blank_id] * context_size for _ in range(N)] decoder_input = torch.tensor( hyps, device=device, dtype=torch.int64, - ) # (batch_size, context_size) + ) # (N, context_size) decoder_out = model.decoder(decoder_input, need_pad=False) decoder_out = model.joiner.decoder_proj(decoder_out) - encoder_out = model.joiner.encoder_proj(encoder_out) + # decoder_out: (N, 1, decoder_out_dim) - # decoder_out: (batch_size, 1, decoder_out_dim) - for t in range(T): - current_encoder_out = encoder_out[:, t : t + 1, :].unsqueeze(2) # noqa + encoder_out = model.joiner.encoder_proj(packed_encoder_out.data) + + offset = 0 + for batch_size in batch_size_list: + start = offset + end = offset + batch_size + current_encoder_out = encoder_out.data[start:end] + current_encoder_out = current_encoder_out.unsqueeze(1).unsqueeze(1) # current_encoder_out's shape: (batch_size, 1, 1, encoder_out_dim) + offset = end + + decoder_out = decoder_out[:batch_size] + logits = model.joiner( current_encoder_out, decoder_out.unsqueeze(1), project_input=False ) @@ -390,7 +413,7 @@ def greedy_search_batch( emitted = True if emitted: # update decoder output - decoder_input = [h[-context_size:] for h in hyps] + decoder_input = [h[-context_size:] for h in hyps[:batch_size]] decoder_input = torch.tensor( decoder_input, device=device, @@ -399,7 +422,12 @@ def greedy_search_batch( decoder_out = model.decoder(decoder_input, need_pad=False) decoder_out = model.joiner.decoder_proj(decoder_out) - ans = [h[context_size:] for h in hyps] + sorted_ans = [h[context_size:] for h in hyps] + ans = [] + unsorted_indices = packed_encoder_out.unsorted_indices.tolist() + for i in range(N): + ans.append(sorted_ans[unsorted_indices[i]]) + return ans @@ -557,6 +585,7 @@ def _get_hyps_shape(hyps: List[HypothesisList]) -> k2.RaggedShape: def modified_beam_search( model: Transducer, encoder_out: torch.Tensor, + encoder_out_lens: torch.Tensor, beam: int = 4, ) -> List[List[int]]: """Beam search in batch mode with --max-sym-per-frame=1 being hardcoded. @@ -566,6 +595,9 @@ def modified_beam_search( The transducer model. encoder_out: Output from the encoder. Its shape is (N, T, C). + encoder_out_lens: + A 1-D tensor of shape (N,), containing number of valid frames in + encoder_out before padding. beam: Number of active paths during the beam search. Returns: @@ -573,16 +605,27 @@ def modified_beam_search( for the i-th utterance. 
""" assert encoder_out.ndim == 3, encoder_out.shape + assert encoder_out.size(0) >= 1, encoder_out.size(0) - batch_size = encoder_out.size(0) - T = encoder_out.size(1) + packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence( + input=encoder_out, + lengths=encoder_out_lens.cpu(), + batch_first=True, + enforce_sorted=False, + ) blank_id = model.decoder.blank_id unk_id = getattr(model, "unk_id", blank_id) context_size = model.decoder.context_size device = next(model.parameters()).device - B = [HypothesisList() for _ in range(batch_size)] - for i in range(batch_size): + + batch_size_list = packed_encoder_out.batch_sizes.tolist() + N = encoder_out.size(0) + assert torch.all(encoder_out_lens > 0), encoder_out_lens + assert N == batch_size_list[0], (N, batch_size_list) + + B = [HypothesisList() for _ in range(N)] + for i in range(N): B[i].add( Hypothesis( ys=[blank_id] * context_size, @@ -590,11 +633,20 @@ def modified_beam_search( ) ) - encoder_out = model.joiner.encoder_proj(encoder_out) + encoder_out = model.joiner.encoder_proj(packed_encoder_out.data) - for t in range(T): - current_encoder_out = encoder_out[:, t : t + 1, :].unsqueeze(2) # noqa + offset = 0 + finalized_B = [] + for batch_size in batch_size_list: + start = offset + end = offset + batch_size + current_encoder_out = encoder_out.data[start:end] + current_encoder_out = current_encoder_out.unsqueeze(1).unsqueeze(1) # current_encoder_out's shape is (batch_size, 1, 1, encoder_out_dim) + offset = end + + finalized_B = B[batch_size:] + finalized_B + B = B[:batch_size] hyps_shape = _get_hyps_shape(B).to(device) @@ -668,8 +720,14 @@ def modified_beam_search( new_hyp = Hypothesis(ys=new_ys, log_prob=new_log_prob) B[i].add(new_hyp) + B = B + finalized_B best_hyps = [b.get_most_probable(length_norm=True) for b in B] - ans = [h.ys[context_size:] for h in best_hyps] + + sorted_ans = [h.ys[context_size:] for h in best_hyps] + ans = [] + unsorted_indices = packed_encoder_out.unsorted_indices.tolist() + for i in range(N): + ans.append(sorted_ans[unsorted_indices[i]]) return ans diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/decode.py b/egs/librispeech/ASR/pruned_transducer_stateless2/decode.py index 5d946003a4..05a4cdca51 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless2/decode.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless2/decode.py @@ -22,15 +22,15 @@ --epoch 28 \ --avg 15 \ --exp-dir ./pruned_transducer_stateless2/exp \ - --max-duration 100 \ + --max-duration 600 \ --decoding-method greedy_search -(2) beam search +(2) beam search (not recommended) ./pruned_transducer_stateless2/decode.py \ --epoch 28 \ --avg 15 \ --exp-dir ./pruned_transducer_stateless2/exp \ - --max-duration 100 \ + --max-duration 600 \ --decoding-method beam_search \ --beam-size 4 @@ -39,7 +39,7 @@ --epoch 28 \ --avg 15 \ --exp-dir ./pruned_transducer_stateless2/exp \ - --max-duration 100 \ + --max-duration 600 \ --decoding-method modified_beam_search \ --beam-size 4 @@ -48,7 +48,7 @@ --epoch 28 \ --avg 15 \ --exp-dir ./pruned_transducer_stateless2/exp \ - --max-duration 1500 \ + --max-duration 600 \ --decoding-method fast_beam_search \ --beam 4 \ --max-contexts 4 \ @@ -270,6 +270,7 @@ def decode_one_batch( hyp_tokens = greedy_search_batch( model=model, encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, ) for hyp in sp.decode(hyp_tokens): hyps.append(hyp.split()) @@ -277,6 +278,7 @@ def decode_one_batch( hyp_tokens = modified_beam_search( model=model, encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, 
beam=params.beam_size, ) for hyp in sp.decode(hyp_tokens): @@ -356,9 +358,9 @@ def decode_dataset( num_batches = "?" if params.decoding_method == "greedy_search": - log_interval = 100 + log_interval = 50 else: - log_interval = 2 + log_interval = 10 results = defaultdict(list) for batch_idx, batch in enumerate(dl): diff --git a/egs/librispeech/ASR/pruned_transducer_stateless3/decode-giga.py b/egs/librispeech/ASR/pruned_transducer_stateless3/decode-giga.py index a715a2a5ca..8d6e33e9d9 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless3/decode-giga.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless3/decode-giga.py @@ -22,15 +22,15 @@ --epoch 28 \ --avg 15 \ --exp-dir ./pruned_transducer_stateless3/exp \ - --max-duration 100 \ + --max-duration 600 \ --decoding-method greedy_search -(2) beam search +(2) beam search (not recommended) ./pruned_transducer_stateless3/decode-giga.py \ --epoch 28 \ --avg 15 \ --exp-dir ./pruned_transducer_stateless3/exp \ - --max-duration 100 \ + --max-duration 600 \ --decoding-method beam_search \ --beam-size 4 @@ -39,7 +39,7 @@ --epoch 28 \ --avg 15 \ --exp-dir ./pruned_transducer_stateless3/exp \ - --max-duration 100 \ + --max-duration 600 \ --decoding-method modified_beam_search \ --beam-size 4 @@ -48,7 +48,7 @@ --epoch 28 \ --avg 15 \ --exp-dir ./pruned_transducer_stateless3/exp \ - --max-duration 1500 \ + --max-duration 600 \ --decoding-method fast_beam_search \ --beam 4 \ --max-contexts 4 \ @@ -224,8 +224,8 @@ def get_parser(): def post_processing( - results: List[Tuple[List[List[str]], List[List[str]]]], -) -> List[Tuple[List[List[str]], List[List[str]]]]: + results: List[Tuple[List[str], List[str]]], +) -> List[Tuple[List[str], List[str]]]: new_results = [] for ref, hyp in results: new_ref = asr_text_post_processing(" ".join(ref)).split() @@ -415,9 +415,9 @@ def decode_dataset( num_batches = "?" 
if params.decoding_method == "greedy_search": - log_interval = 100 + log_interval = 50 else: - log_interval = 2 + log_interval = 10 results = defaultdict(list) for batch_idx, batch in enumerate(dl): diff --git a/egs/librispeech/ASR/pruned_transducer_stateless3/decode.py b/egs/librispeech/ASR/pruned_transducer_stateless3/decode.py index 9a6b5a117b..5b3dce8535 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless3/decode.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless3/decode.py @@ -22,15 +22,15 @@ --epoch 28 \ --avg 15 \ --exp-dir ./pruned_transducer_stateless3/exp \ - --max-duration 100 \ + --max-duration 600 \ --decoding-method greedy_search -(2) beam search +(2) beam search (not recommended) ./pruned_transducer_stateless3/decode.py \ --epoch 28 \ --avg 15 \ --exp-dir ./pruned_transducer_stateless3/exp \ - --max-duration 100 \ + --max-duration 600 \ --decoding-method beam_search \ --beam-size 4 @@ -39,7 +39,7 @@ --epoch 28 \ --avg 15 \ --exp-dir ./pruned_transducer_stateless3/exp \ - --max-duration 100 \ + --max-duration 600 \ --decoding-method modified_beam_search \ --beam-size 4 @@ -48,7 +48,7 @@ --epoch 28 \ --avg 15 \ --exp-dir ./pruned_transducer_stateless3/exp \ - --max-duration 1500 \ + --max-duration 600 \ --decoding-method fast_beam_search \ --beam 4 \ --max-contexts 4 \ @@ -307,6 +307,7 @@ def decode_one_batch( hyp_tokens = greedy_search_batch( model=model, encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, ) for hyp in sp.decode(hyp_tokens): hyps.append(hyp.split()) @@ -314,6 +315,7 @@ def decode_one_batch( hyp_tokens = modified_beam_search( model=model, encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, beam=params.beam_size, ) for hyp in sp.decode(hyp_tokens): @@ -403,9 +405,9 @@ def decode_dataset( num_batches = "?" 
if params.decoding_method == "greedy_search": - log_interval = 100 + log_interval = 50 else: - log_interval = 2 + log_interval = 10 results = defaultdict(list) for batch_idx, batch in enumerate(dl): diff --git a/egs/librispeech/ASR/pruned_transducer_stateless4/decode.py b/egs/librispeech/ASR/pruned_transducer_stateless4/decode.py index 1f4a22213d..9982cc5306 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless4/decode.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless4/decode.py @@ -22,16 +22,16 @@ ./pruned_transducer_stateless4/decode.py \ --epoch 30 \ --avg 15 \ - --exp-dir ./pruned_transducer_stateless2/exp \ - --max-duration 100 \ + --exp-dir ./pruned_transducer_stateless4/exp \ + --max-duration 600 \ --decoding-method greedy_search -(2) beam search +(2) beam search (not recommended) ./pruned_transducer_stateless4/decode.py \ --epoch 30 \ --avg 15 \ - --exp-dir ./pruned_transducer_stateless2/exp \ - --max-duration 100 \ + --exp-dir ./pruned_transducer_stateless4/exp \ + --max-duration 600 \ --decoding-method beam_search \ --beam-size 4 @@ -39,8 +39,8 @@ ./pruned_transducer_stateless4/decode.py \ --epoch 30 \ --avg 15 \ - --exp-dir ./pruned_transducer_stateless2/exp \ - --max-duration 100 \ + --exp-dir ./pruned_transducer_stateless4/exp \ + --max-duration 600 \ --decoding-method modified_beam_search \ --beam-size 4 @@ -48,8 +48,8 @@ ./pruned_transducer_stateless4/decode.py \ --epoch 30 \ --avg 15 \ - --exp-dir ./pruned_transducer_stateless2/exp \ - --max-duration 1500 \ + --exp-dir ./pruned_transducer_stateless4/exp \ + --max-duration 600 \ --decoding-method fast_beam_search \ --beam 4 \ --max-contexts 4 \ @@ -70,7 +70,7 @@ from asr_datamodule import LibriSpeechAsrDataModule from beam_search import ( beam_search, - fast_beam_search, + fast_beam_search_one_best, greedy_search, greedy_search_batch, modified_beam_search, @@ -266,7 +266,7 @@ def decode_one_batch( hyps = [] if params.decoding_method == "fast_beam_search": - hyp_tokens = fast_beam_search( + hyp_tokens = fast_beam_search_one_best( model=model, decoding_graph=decoding_graph, encoder_out=encoder_out, @@ -284,6 +284,7 @@ def decode_one_batch( hyp_tokens = greedy_search_batch( model=model, encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, ) for hyp in sp.decode(hyp_tokens): hyps.append(hyp.split()) @@ -291,6 +292,7 @@ def decode_one_batch( hyp_tokens = modified_beam_search( model=model, encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, beam=params.beam_size, ) for hyp in sp.decode(hyp_tokens): @@ -370,9 +372,9 @@ def decode_dataset( num_batches = "?" if params.decoding_method == "greedy_search": - log_interval = 100 + log_interval = 50 else: - log_interval = 2 + log_interval = 10 results = defaultdict(list) for batch_idx, batch in enumerate(dl): diff --git a/egs/librispeech/ASR/transducer_stateless/beam_search.py b/egs/librispeech/ASR/transducer_stateless/beam_search.py index 388a8d67a8..ea985f30da 100644 --- a/egs/librispeech/ASR/transducer_stateless/beam_search.py +++ b/egs/librispeech/ASR/transducer_stateless/beam_search.py @@ -22,6 +22,235 @@ import torch from model import Transducer +from icefall.decode import Nbest, one_best_decoding +from icefall.utils import get_texts + + +def fast_beam_search_one_best( + model: Transducer, + decoding_graph: k2.Fsa, + encoder_out: torch.Tensor, + encoder_out_lens: torch.Tensor, + beam: float, + max_states: int, + max_contexts: int, +) -> List[List[int]]: + """It limits the maximum number of symbols per frame to 1. 
+ + A lattice is first obtained using fast beam search, and then + the shortest path within the lattice is used as the final output. + + Args: + model: + An instance of `Transducer`. + decoding_graph: + Decoding graph used for decoding, may be a TrivialGraph or a HLG. + encoder_out: + A tensor of shape (N, T, C) from the encoder. + encoder_out_lens: + A tensor of shape (N,) containing the number of frames in `encoder_out` + before padding. + beam: + Beam value, similar to the beam used in Kaldi. + max_states: + Max states per stream per frame. + max_contexts: + Max contexts per stream per frame. + Returns: + Return the decoded result. + """ + lattice = fast_beam_search( + model=model, + decoding_graph=decoding_graph, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + beam=beam, + max_states=max_states, + max_contexts=max_contexts, + ) + + best_path = one_best_decoding(lattice) + hyps = get_texts(best_path) + return hyps + + +def fast_beam_search_nbest_oracle( + model: Transducer, + decoding_graph: k2.Fsa, + encoder_out: torch.Tensor, + encoder_out_lens: torch.Tensor, + beam: float, + max_states: int, + max_contexts: int, + num_paths: int, + ref_texts: List[List[int]], + use_double_scores: bool = True, + nbest_scale: float = 0.5, +) -> List[List[int]]: + """It limits the maximum number of symbols per frame to 1. + + A lattice is first obtained using fast beam search, and then + we select `num_paths` linear paths from the lattice. The path + that has the minimum edit distance with the given reference transcript + is used as the output. + + This is the best result we can achieve for any nbest based rescoring + methods. + + Args: + model: + An instance of `Transducer`. + decoding_graph: + Decoding graph used for decoding, may be a TrivialGraph or a HLG. + encoder_out: + A tensor of shape (N, T, C) from the encoder. + encoder_out_lens: + A tensor of shape (N,) containing the number of frames in `encoder_out` + before padding. + beam: + Beam value, similar to the beam used in Kaldi. + max_states: + Max states per stream per frame. + max_contexts: + Max contexts per stream per frame. + num_paths: + Number of paths to extract from the decoded lattice. + ref_texts: + A list-of-list of integers containing the reference transcripts. + If the decoding_graph is a trivial_graph, the integer ID is the + BPE token ID. + use_double_scores: + True to use double precision for computation. False to use + single precision. + nbest_scale: + It's the scale applied to the lattice.scores. A smaller value + yields more unique paths. + + Returns: + Return the decoded result.
+ """ + lattice = fast_beam_search( + model=model, + decoding_graph=decoding_graph, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + beam=beam, + max_states=max_states, + max_contexts=max_contexts, + ) + + nbest = Nbest.from_lattice( + lattice=lattice, + num_paths=num_paths, + use_double_scores=use_double_scores, + nbest_scale=nbest_scale, + ) + + hyps = nbest.build_levenshtein_graphs() + refs = k2.levenshtein_graph(ref_texts, device=hyps.device) + + levenshtein_alignment = k2.levenshtein_alignment( + refs=refs, + hyps=hyps, + hyp_to_ref_map=nbest.shape.row_ids(1), + sorted_match_ref=True, + ) + + tot_scores = levenshtein_alignment.get_tot_scores( + use_double_scores=False, log_semiring=False + ) + ragged_tot_scores = k2.RaggedTensor(nbest.shape, tot_scores) + + max_indexes = ragged_tot_scores.argmax() + + best_path = k2.index_fsa(nbest.fsa, max_indexes) + + hyps = get_texts(best_path) + return hyps + + +def fast_beam_search( + model: Transducer, + decoding_graph: k2.Fsa, + encoder_out: torch.Tensor, + encoder_out_lens: torch.Tensor, + beam: float, + max_states: int, + max_contexts: int, +) -> k2.Fsa: + """It limits the maximum number of symbols per frame to 1. + + Args: + model: + An instance of `Transducer`. + decoding_graph: + Decoding graph used for decoding, may be a TrivialGraph or a HLG. + encoder_out: + A tensor of shape (N, T, C) from the encoder. + encoder_out_lens: + A tensor of shape (N,) containing the number of frames in `encoder_out` + before padding. + beam: + Beam value, similar to the beam used in Kaldi.. + max_states: + Max states per stream per frame. + max_contexts: + Max contexts pre stream per frame. + Returns: + Return an FsaVec with axes [utt][state][arc] containing the decoded + lattice. Note: When the input graph is a TrivialGraph, the returned + lattice is actually an acceptor. 
+ """ + assert encoder_out.ndim == 3 + + context_size = model.decoder.context_size + vocab_size = model.decoder.vocab_size + + B, T, C = encoder_out.shape + + config = k2.RnntDecodingConfig( + vocab_size=vocab_size, + decoder_history_len=context_size, + beam=beam, + max_contexts=max_contexts, + max_states=max_states, + ) + individual_streams = [] + for i in range(B): + individual_streams.append(k2.RnntDecodingStream(decoding_graph)) + decoding_streams = k2.RnntDecodingStreams(individual_streams, config) + + encoder_out_len = torch.ones(1, dtype=torch.int32) + decoder_out_len = torch.ones(1, dtype=torch.int32) + + for t in range(T): + # shape is a RaggedShape of shape (B, context) + # contexts is a Tensor of shape (shape.NumElements(), context_size) + shape, contexts = decoding_streams.get_contexts() + # `nn.Embedding()` in torch below v1.7.1 supports only torch.int64 + contexts = contexts.to(torch.int64) + # decoder_out is of shape (shape.NumElements(), 1, decoder_out_dim) + decoder_out = model.decoder(contexts, need_pad=False) + # current_encoder_out is of shape + # (shape.NumElements(), 1, joiner_dim) + # fmt: off + current_encoder_out = torch.index_select( + encoder_out[:, t:t + 1, :], 0, shape.row_ids(1).to(torch.int64) + ) + # fmt: on + logits = model.joiner( + current_encoder_out, + decoder_out, + encoder_out_len.expand(decoder_out.size(0)), + decoder_out_len.expand(decoder_out.size(0)), + ) # (N, vocab_size) + log_probs = logits.log_softmax(dim=-1) + decoding_streams.advance(log_probs) + decoding_streams.terminate_and_flush_to_streams() + lattice = decoding_streams.format_output(encoder_out_lens.tolist()) + + return lattice + def greedy_search( model: Transducer, encoder_out: torch.Tensor, max_sym_per_frame: int @@ -104,7 +333,9 @@ def greedy_search( def greedy_search_batch( - model: Transducer, encoder_out: torch.Tensor + model: Transducer, + encoder_out: torch.Tensor, + encoder_out_lens: torch.Tensor, ) -> List[List[int]]: """Greedy search in batch mode. It hardcodes --max-sym-per-frame=1. Args: @@ -112,6 +343,9 @@ def greedy_search_batch( The transducer model. encoder_out: Output from the encoder. Its shape is (N, T, C), where N >= 1. + encoder_out_lens: + A 1-D tensor of shape (N,), containing number of valid frames in + encoder_out before padding. Returns: Return a list-of-list of token IDs containing the decoded results. len(ans) equals to encoder_out.size(0). 
@@ -119,32 +353,54 @@ def greedy_search_batch( assert encoder_out.ndim == 3 assert encoder_out.size(0) >= 1, encoder_out.size(0) - device = model.device + packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence( + input=encoder_out, + lengths=encoder_out_lens.cpu(), + batch_first=True, + enforce_sorted=False, + ) - batch_size = encoder_out.size(0) - T = encoder_out.size(1) + device = next(model.parameters()).device blank_id = model.decoder.blank_id context_size = model.decoder.context_size - hyps = [[blank_id] * context_size for _ in range(batch_size)] + batch_size_list = packed_encoder_out.batch_sizes.tolist() + N = encoder_out.size(0) + assert torch.all(encoder_out_lens > 0), encoder_out_lens + assert N == batch_size_list[0], (N, batch_size_list) + + hyps = [[blank_id] * context_size for _ in range(N)] decoder_input = torch.tensor( hyps, device=device, dtype=torch.int64, - ) # (batch_size, context_size) + ) # (N, context_size) decoder_out = model.decoder(decoder_input, need_pad=False) - # decoder_out: (batch_size, 1, decoder_out_dim) + # decoder_out: (N, 1, decoder_out_dim) - encoder_out_len = torch.ones(batch_size, dtype=torch.int32) - decoder_out_len = torch.ones(batch_size, dtype=torch.int32) + encoder_out_len = torch.ones(1, dtype=torch.int32) + decoder_out_len = torch.ones(1, dtype=torch.int32) - for t in range(T): - current_encoder_out = encoder_out[:, t : t + 1, :] # noqa + encoder_out = packed_encoder_out.data + + offset = 0 + for batch_size in batch_size_list: + start = offset + end = offset + batch_size + current_encoder_out = encoder_out.data[start:end] + current_encoder_out = current_encoder_out.unsqueeze(1) # current_encoder_out's shape: (batch_size, 1, encoder_out_dim) + offset = end + + decoder_out = decoder_out[:batch_size] + logits = model.joiner( - current_encoder_out, decoder_out, encoder_out_len, decoder_out_len + current_encoder_out, + decoder_out, + encoder_out_len.expand(batch_size), + decoder_out_len.expand(batch_size), ) # (batch_size, vocab_size) assert logits.ndim == 2, logits.shape @@ -157,7 +413,7 @@ def greedy_search_batch( if emitted: # update decoder output - decoder_input = [h[-context_size:] for h in hyps] + decoder_input = [h[-context_size:] for h in hyps[:batch_size]] decoder_input = torch.tensor( decoder_input, device=device, @@ -168,7 +424,12 @@ def greedy_search_batch( need_pad=False, ) # (batch_size, 1, decoder_out_dim) - ans = [h[context_size:] for h in hyps] + sorted_ans = [h[context_size:] for h in hyps] + ans = [] + unsorted_indices = packed_encoder_out.unsorted_indices.tolist() + for i in range(N): + ans.append(sorted_ans[unsorted_indices[i]]) + return ans @@ -415,6 +676,7 @@ def _get_hyps_shape(hyps: List[HypothesisList]) -> k2.RaggedShape: def modified_beam_search( model: Transducer, encoder_out: torch.Tensor, + encoder_out_lens: torch.Tensor, beam: int = 4, ) -> List[List[int]]: """Beam search in batch mode with --max-sym-per-frame=1 being hardcodded. @@ -424,6 +686,9 @@ def modified_beam_search( The transducer model. encoder_out: Output from the encoder. Its shape is (N, T, C). + encoder_out_lens: + A 1-D tensor of shape (N,), containing number of valid frames in + encoder_out before padding. beam: Number of active paths during the beam search. Returns: @@ -431,15 +696,26 @@ def modified_beam_search( for the i-th utterance. 
""" assert encoder_out.ndim == 3, encoder_out.shape + assert encoder_out.size(0) >= 1, encoder_out.size(0) - batch_size = encoder_out.size(0) - T = encoder_out.size(1) + packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence( + input=encoder_out, + lengths=encoder_out_lens.cpu(), + batch_first=True, + enforce_sorted=False, + ) blank_id = model.decoder.blank_id context_size = model.decoder.context_size - device = model.device - B = [HypothesisList() for _ in range(batch_size)] - for i in range(batch_size): + device = next(model.parameters()).device + + batch_size_list = packed_encoder_out.batch_sizes.tolist() + N = encoder_out.size(0) + assert torch.all(encoder_out_lens > 0), encoder_out_lens + assert N == batch_size_list[0], (N, batch_size_list) + + B = [HypothesisList() for _ in range(N)] + for i in range(N): B[i].add( Hypothesis( ys=[blank_id] * context_size, @@ -449,9 +725,20 @@ def modified_beam_search( encoder_out_len = torch.tensor([1]) decoder_out_len = torch.tensor([1]) - for t in range(T): - current_encoder_out = encoder_out[:, t : t + 1, :] # noqa + + encoder_out = packed_encoder_out.data + offset = 0 + finalized_B = [] + for batch_size in batch_size_list: + start = offset + end = offset + batch_size + current_encoder_out = encoder_out.data[start:end] + current_encoder_out = current_encoder_out.unsqueeze(1) # current_encoder_out's shape is: (batch_size, 1, encoder_out_dim) + offset = end + + finalized_B = B[batch_size:] + finalized_B + B = B[:batch_size] hyps_shape = _get_hyps_shape(B).to(device) @@ -524,8 +811,14 @@ def modified_beam_search( new_hyp = Hypothesis(ys=new_ys, log_prob=new_log_prob) B[i].add(new_hyp) + B = B + finalized_B best_hyps = [b.get_most_probable(length_norm=True) for b in B] - ans = [h.ys[context_size:] for h in best_hyps] + + sorted_ans = [h.ys[context_size:] for h in best_hyps] + ans = [] + unsorted_indices = packed_encoder_out.unsorted_indices.tolist() + for i in range(N): + ans.append(sorted_ans[unsorted_indices[i]]) return ans diff --git a/egs/librispeech/ASR/transducer_stateless/decode.py b/egs/librispeech/ASR/transducer_stateless/decode.py index ac66c9b493..5ea17b1739 100755 --- a/egs/librispeech/ASR/transducer_stateless/decode.py +++ b/egs/librispeech/ASR/transducer_stateless/decode.py @@ -19,29 +19,40 @@ Usage: (1) greedy search ./transducer_stateless/decode.py \ - --epoch 14 \ - --avg 7 \ - --exp-dir ./transducer_stateless/exp \ - --max-duration 100 \ - --decoding-method greedy_search + --epoch 14 \ + --avg 7 \ + --exp-dir ./transducer_stateless/exp \ + --max-duration 600 \ + --decoding-method greedy_search -(2) beam search +(2) beam search (not recommended) ./transducer_stateless/decode.py \ - --epoch 14 \ - --avg 7 \ - --exp-dir ./transducer_stateless/exp \ - --max-duration 100 \ - --decoding-method beam_search \ - --beam-size 4 + --epoch 14 \ + --avg 7 \ + --exp-dir ./transducer_stateless/exp \ + --max-duration 600 \ + --decoding-method beam_search \ + --beam-size 4 (3) modified beam search ./transducer_stateless/decode.py \ - --epoch 14 \ - --avg 7 \ - --exp-dir ./transducer_stateless/exp \ - --max-duration 100 \ - --decoding-method modified_beam_search \ - --beam-size 4 + --epoch 14 \ + --avg 7 \ + --exp-dir ./transducer_stateless/exp \ + --max-duration 600 \ + --decoding-method modified_beam_search \ + --beam-size 4 + +(4) fast beam search +./transducer_stateless/decode.py \ + --epoch 14 \ + --avg 7 \ + --exp-dir ./transducer_stateless/exp \ + --max-duration 600 \ + --decoding-method fast_beam_search \ + --beam 4 \ + --max-contexts 4 \ 
+    --max-states 8
 """


@@ -49,14 +60,16 @@
 import logging
 from collections import defaultdict
 from pathlib import Path
-from typing import Dict, List, Tuple
+from typing import Dict, List, Optional, Tuple

+import k2
 import sentencepiece as spm
 import torch
 import torch.nn as nn
 from asr_datamodule import LibriSpeechAsrDataModule
 from beam_search import (
     beam_search,
+    fast_beam_search_one_best,
     greedy_search,
     greedy_search_batch,
     modified_beam_search,
@@ -115,6 +128,7 @@ def get_parser():
           - greedy_search
           - beam_search
           - modified_beam_search
+          - fast_beam_search
         """,
     )

@@ -122,8 +136,35 @@
         "--beam-size",
         type=int,
         default=4,
+        help="""An integer indicating how many candidates we will keep for each
+        frame. Used only when --decoding-method is beam_search or
+        modified_beam_search.""",
+    )
+
+    parser.add_argument(
+        "--beam",
+        type=float,
+        default=4,
+        help="""A floating point value to calculate the cutoff score during beam
+        search (i.e., `cutoff = max-score - beam`), which is the same as the
+        `beam` in Kaldi.
+        Used only when --decoding-method is fast_beam_search""",
+    )
+
+    parser.add_argument(
+        "--max-contexts",
+        type=int,
+        default=4,
         help="""Used only when --decoding-method is
-        beam_search or modified_beam_search""",
+        fast_beam_search""",
+    )
+
+    parser.add_argument(
+        "--max-states",
+        type=int,
+        default=8,
+        help="""Used only when --decoding-method is
+        fast_beam_search""",
     )

     parser.add_argument(
@@ -149,6 +190,7 @@ def decode_one_batch(
     model: nn.Module,
     sp: spm.SentencePieceProcessor,
     batch: dict,
+    decoding_graph: Optional[k2.Fsa] = None,
 ) -> Dict[str, List[List[str]]]:
     """Decode one batch and return the result in a dict. The dict has the
     following format:
@@ -171,6 +213,9 @@ def decode_one_batch(
         It is the return value from iterating
         `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation
         for the format of the `batch`.
+      decoding_graph:
+        The decoding graph. Can be either a `k2.trivial_graph` or HLG. Used
+        only when --decoding-method is fast_beam_search.
     Returns:
       Return the decoding result. See above description for the format of
       the returned dict.
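
The `--beam` value added above is a score margin rather than a path count. A rough sketch of the cutoff rule its help text names; the actual pruning happens inside k2's RnntDecodingStreams and additionally honors `--max-contexts` and `--max-states`:

    import torch

    # Hypothetical scores of the active decoding states at one frame.
    scores = torch.tensor([-1.2, -3.5, -0.4, -9.0])
    beam = 4.0

    cutoff = scores.max() - beam  # cutoff = max-score - beam, as in Kaldi
    kept = scores[scores >= cutoff]
    print(kept)                   # tensor([-1.2000, -3.5000, -0.4000])
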
@@ -188,24 +233,44 @@
     encoder_out, encoder_out_lens = model.encoder(
         x=feature, x_lens=feature_lens
     )
-    hyp_list: List[List[int]] = []
-    if (
+    hyps = []
+
+    if params.decoding_method == "fast_beam_search":
+        hyp_tokens = fast_beam_search_one_best(
+            model=model,
+            decoding_graph=decoding_graph,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            beam=params.beam,
+            max_contexts=params.max_contexts,
+            max_states=params.max_states,
+        )
+        for hyp in sp.decode(hyp_tokens):
+            hyps.append(hyp.split())
+    elif (
         params.decoding_method == "greedy_search"
         and params.max_sym_per_frame == 1
     ):
-        hyp_list = greedy_search_batch(
+        hyp_tokens = greedy_search_batch(
             model=model,
             encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
         )
+        for hyp in sp.decode(hyp_tokens):
+            hyps.append(hyp.split())
     elif params.decoding_method == "modified_beam_search":
-        hyp_list = modified_beam_search(
+        hyp_tokens = modified_beam_search(
             model=model,
             encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
             beam=params.beam_size,
         )
+        for hyp in sp.decode(hyp_tokens):
+            hyps.append(hyp.split())
     else:
         batch_size = encoder_out.size(0)
+
         for i in range(batch_size):
             # fmt: off
             encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]]
@@ -226,14 +291,20 @@
                 raise ValueError(
                     f"Unsupported decoding method: {params.decoding_method}"
                 )
-            hyp_list.append(hyp)
-
-    hyps = [sp.decode(hyp).split() for hyp in hyp_list]
+            hyps.append(sp.decode(hyp).split())

     if params.decoding_method == "greedy_search":
         return {"greedy_search": hyps}
+    elif params.decoding_method == "fast_beam_search":
+        return {
+            (
+                f"beam_{params.beam}_"
+                f"max_contexts_{params.max_contexts}_"
+                f"max_states_{params.max_states}"
+            ): hyps
+        }
     else:
-        return {f"beam_{params.beam_size}": hyps}
+        return {f"beam_size_{params.beam_size}": hyps}


 def decode_dataset(
@@ -241,6 +312,7 @@
     params: AttributeDict,
     model: nn.Module,
     sp: spm.SentencePieceProcessor,
+    decoding_graph: Optional[k2.Fsa] = None,
 ) -> Dict[str, List[Tuple[List[str], List[str]]]]:
     """Decode dataset.

@@ -253,6 +325,9 @@
         The neural model.
       sp:
         The BPE model.
+      decoding_graph:
+        The decoding graph. Can be either a `k2.trivial_graph` or HLG. Used
+        only when --decoding-method is fast_beam_search.
     Returns:
       Return a dict, whose key may be "greedy_search" if greedy search
       is used, or it may be "beam_7" if beam size of 7 is used.
@@ -268,9 +343,9 @@
         num_batches = "?"
     if params.decoding_method == "greedy_search":
-        log_interval = 100
+        log_interval = 50
     else:
-        log_interval = 2
+        log_interval = 10

     results = defaultdict(list)
     for batch_idx, batch in enumerate(dl):
@@ -280,6 +355,7 @@
             params=params,
             model=model,
             sp=sp,
+            decoding_graph=decoding_graph,
             batch=batch,
         )

@@ -360,13 +436,21 @@ def main():
     assert params.decoding_method in (
         "greedy_search",
         "beam_search",
+        "fast_beam_search",
         "modified_beam_search",
     )
     params.res_dir = params.exp_dir / params.decoding_method

     params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
-    if "beam_search" in params.decoding_method:
-        params.suffix += f"-beam-{params.beam_size}"
+
+    if "fast_beam_search" in params.decoding_method:
+        params.suffix += f"-beam-{params.beam}"
+        params.suffix += f"-max-contexts-{params.max_contexts}"
+        params.suffix += f"-max-states-{params.max_states}"
+    elif "beam_search" in params.decoding_method:
+        params.suffix += (
+            f"-{params.decoding_method}-beam-size-{params.beam_size}"
+        )
     else:
         params.suffix += f"-context-{params.context_size}"
         params.suffix += f"-max-sym-per-frame-{params.max_sym_per_frame}"
@@ -408,6 +492,11 @@ def main():
     model.eval()
     model.device = device

+    if params.decoding_method == "fast_beam_search":
+        decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
+    else:
+        decoding_graph = None
+
     num_param = sum([p.numel() for p in model.parameters()])
     logging.info(f"Number of model parameters: {num_param}")

@@ -428,6 +517,7 @@
             params=params,
             model=model,
             sp=sp,
+            decoding_graph=decoding_graph,
         )

         save_results(
diff --git a/egs/librispeech/ASR/transducer_stateless/decoder.py b/egs/librispeech/ASR/transducer_stateless/decoder.py
index b82fed37b7..fbc2373a9d 100644
--- a/egs/librispeech/ASR/transducer_stateless/decoder.py
+++ b/egs/librispeech/ASR/transducer_stateless/decoder.py
@@ -58,6 +58,7 @@ def __init__(
             padding_idx=blank_id,
         )
         self.blank_id = blank_id
+        self.vocab_size = vocab_size

         assert context_size >= 1, context_size
         self.context_size = context_size
diff --git a/egs/librispeech/ASR/transducer_stateless/pretrained.py b/egs/librispeech/ASR/transducer_stateless/pretrained.py
index 4fb5d92c5b..b645218015 100755
--- a/egs/librispeech/ASR/transducer_stateless/pretrained.py
+++ b/egs/librispeech/ASR/transducer_stateless/pretrained.py
@@ -19,30 +19,39 @@
 (1) greedy search
 ./transducer_stateless/pretrained.py \
-        --checkpoint ./transducer_stateless/exp/pretrained.pt \
-        --bpe-model ./data/lang_bpe_500/bpe.model \
-        --method greedy_search \
-        --max-sym-per-frame 1 \
-        /path/to/foo.wav \
-        /path/to/bar.wav \
+    --checkpoint ./transducer_stateless/exp/pretrained.pt \
+    --bpe-model ./data/lang_bpe_500/bpe.model \
+    --method greedy_search \
+    --max-sym-per-frame 1 \
+    /path/to/foo.wav \
+    /path/to/bar.wav

 (2) beam search
 ./transducer_stateless/pretrained.py \
-        --checkpoint ./transducer_stateless/exp/pretrained.pt \
-        --bpe-model ./data/lang_bpe_500/bpe.model \
-        --method beam_search \
-        --beam-size 4 \
-        /path/to/foo.wav \
-        /path/to/bar.wav \
+    --checkpoint ./transducer_stateless/exp/pretrained.pt \
+    --bpe-model ./data/lang_bpe_500/bpe.model \
+    --method beam_search \
+    --beam-size 4 \
+    /path/to/foo.wav \
+    /path/to/bar.wav

 (3) modified beam search
 ./transducer_stateless/pretrained.py \
-        --checkpoint ./transducer_stateless/exp/pretrained.pt \
-        --bpe-model ./data/lang_bpe_500/bpe.model \
-        --method modified_beam_search \
-        --beam-size 4 \
-        /path/to/foo.wav \
-        /path/to/bar.wav \
+    --checkpoint ./transducer_stateless/exp/pretrained.pt \
+    --bpe-model ./data/lang_bpe_500/bpe.model \
+    --method modified_beam_search \
+    --beam-size 4 \
+    /path/to/foo.wav \
+    /path/to/bar.wav
+
+(4) fast beam search
+./transducer_stateless/pretrained.py \
+    --checkpoint ./transducer_stateless/exp/pretrained.pt \
+    --bpe-model ./data/lang_bpe_500/bpe.model \
+    --method fast_beam_search \
+    --beam 4 \
+    /path/to/foo.wav \
+    /path/to/bar.wav

 You can also use `./transducer_stateless/exp/epoch-xx.pt`.

@@ -56,12 +65,14 @@
 import math
 from typing import List

+import k2
 import kaldifeat
 import sentencepiece as spm
 import torch
 import torchaudio
 from beam_search import (
     beam_search,
+    fast_beam_search_one_best,
     greedy_search,
     greedy_search_batch,
     modified_beam_search,
@@ -87,9 +98,7 @@ def get_parser():
     parser.add_argument(
         "--bpe-model",
         type=str,
-        help="""Path to bpe.model.
-        Used only when method is ctc-decoding.
-        """,
+        help="""Path to bpe.model.""",
     )

     parser.add_argument(
@@ -100,6 +109,7 @@ def get_parser():
           - greedy_search
           - beam_search
           - modified_beam_search
+          - fast_beam_search
         """,
     )

@@ -124,7 +134,33 @@ def get_parser():
         "--beam-size",
         type=int,
         default=4,
-        help="Used only when --method is beam_search and modified_beam_search ",
+        help="""An integer indicating how many candidates we will keep for each
+        frame. Used only when --method is beam_search or
+        modified_beam_search.""",
+    )
+
+    parser.add_argument(
+        "--beam",
+        type=float,
+        default=4,
+        help="""A floating point value to calculate the cutoff score during beam
+        search (i.e., `cutoff = max-score - beam`), which is the same as the
+        `beam` in Kaldi.
+        Used only when --method is fast_beam_search""",
+    )
+
+    parser.add_argument(
+        "--max-contexts",
+        type=int,
+        default=4,
+        help="""Used only when --method is fast_beam_search""",
+    )
+
+    parser.add_argument(
+        "--max-states",
+        type=int,
+        default=8,
+        help="""Used only when --method is fast_beam_search""",
     )

     parser.add_argument(
@@ -241,15 +277,28 @@ def main():
         msg += f" with beam size {params.beam_size}"
     logging.info(msg)

-    if params.method == "greedy_search" and params.max_sym_per_frame == 1:
+    if params.method == "fast_beam_search":
+        decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
+        hyp_list = fast_beam_search_one_best(
+            model=model,
+            decoding_graph=decoding_graph,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            beam=params.beam,
+            max_contexts=params.max_contexts,
+            max_states=params.max_states,
+        )
+    elif params.method == "greedy_search" and params.max_sym_per_frame == 1:
         hyp_list = greedy_search_batch(
             model=model,
             encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
         )
     elif params.method == "modified_beam_search":
         hyp_list = modified_beam_search(
             model=model,
             encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
             beam=params.beam_size,
         )
     else:
diff --git a/egs/librispeech/ASR/transducer_stateless2/decode.py b/egs/librispeech/ASR/transducer_stateless2/decode.py
index 08c61c2be3..4cf1e559c1 100755
--- a/egs/librispeech/ASR/transducer_stateless2/decode.py
+++ b/egs/librispeech/ASR/transducer_stateless2/decode.py
@@ -22,15 +22,15 @@
     --epoch 14 \
     --avg 7 \
     --exp-dir ./transducer_stateless2/exp \
-    --max-duration 100 \
+    --max-duration 600 \
     --decoding-method greedy_search

-(2) beam search
+(2) beam search (not recommended)
 ./transducer_stateless2/decode.py \
     --epoch 14 \
     --avg 7 \
     --exp-dir ./transducer_stateless2/exp \
-    --max-duration 100 \
+    --max-duration 600 \
     --decoding-method beam_search \
     --beam-size 4

@@ -39,9 +39,20 @@
     --epoch 14 \
     --avg 7 \
     --exp-dir ./transducer_stateless2/exp \
-    --max-duration 100 \
+    --max-duration 600 \
     --decoding-method modified_beam_search \
     --beam-size 4
+
+(4) fast beam search
+./transducer_stateless2/decode.py \
+    --epoch 14 \
+    --avg 7 \
+    --exp-dir ./transducer_stateless2/exp \
+    --max-duration 600 \
+    --decoding-method fast_beam_search \
+    --beam 4 \
+    --max-contexts 4 \
+    --max-states 8
 """


@@ -49,14 +60,16 @@
 import logging
 from collections import defaultdict
 from pathlib import Path
-from typing import Dict, List, Tuple
+from typing import Dict, List, Optional, Tuple

+import k2
 import sentencepiece as spm
 import torch
 import torch.nn as nn
 from asr_datamodule import LibriSpeechAsrDataModule
 from beam_search import (
     beam_search,
+    fast_beam_search_one_best,
     greedy_search,
     greedy_search_batch,
     modified_beam_search,
@@ -115,6 +128,7 @@ def get_parser():
           - greedy_search
           - beam_search
           - modified_beam_search
+          - fast_beam_search
         """,
     )

@@ -122,8 +136,35 @@ def get_parser():
         "--beam-size",
         type=int,
         default=4,
+        help="""An integer indicating how many candidates we will keep for each
+        frame. Used only when --decoding-method is beam_search or
+        modified_beam_search.""",
+    )
+
+    parser.add_argument(
+        "--beam",
+        type=float,
+        default=4,
+        help="""A floating point value to calculate the cutoff score during beam
+        search (i.e., `cutoff = max-score - beam`), which is the same as the
+        `beam` in Kaldi.
+        Used only when --decoding-method is fast_beam_search""",
+    )
+
+    parser.add_argument(
+        "--max-contexts",
+        type=int,
+        default=4,
         help="""Used only when --decoding-method is
-        beam_search or modified_beam_search""",
+        fast_beam_search""",
+    )
+
+    parser.add_argument(
+        "--max-states",
+        type=int,
+        default=8,
+        help="""Used only when --decoding-method is
+        fast_beam_search""",
     )

     parser.add_argument(
@@ -149,6 +190,7 @@ def decode_one_batch(
     model: nn.Module,
     sp: spm.SentencePieceProcessor,
     batch: dict,
+    decoding_graph: Optional[k2.Fsa] = None,
 ) -> Dict[str, List[List[str]]]:
     """Decode one batch and return the result in a dict. The dict has the
     following format:
@@ -171,6 +213,9 @@ def decode_one_batch(
         It is the return value from iterating
         `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation
         for the format of the `batch`.
+      decoding_graph:
+        The decoding graph. Can be either a `k2.trivial_graph` or HLG. Used
+        only when --decoding-method is fast_beam_search.
     Returns:
       Return the decoding result. See above description for the format of
       the returned dict.
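
When no LM or lexicon is supplied, the `decoding_graph` threaded through these functions is just the trivial token-level acceptor built in `main()`. A minimal sketch, with the vocabulary size assumed purely for illustration (the scripts read it from the BPE model):

    import k2
    import torch

    vocab_size = 500  # assumed; in decode.py it comes from the BPE model
    device = torch.device("cpu")

    # An acceptor that accepts any token sequence at zero cost; the largest
    # symbol id is vocab_size - 1 because ids start at 0 (the blank).
    decoding_graph = k2.trivial_graph(vocab_size - 1, device=device)

An HLG built from a lexicon and an n-gram LM could be passed instead, which is why the docstrings allow either.
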
@@ -188,24 +233,44 @@
     encoder_out, encoder_out_lens = model.encoder(
         x=feature, x_lens=feature_lens
     )
-    hyp_list: List[List[int]] = []
-    if (
+    hyps = []
+
+    if params.decoding_method == "fast_beam_search":
+        hyp_tokens = fast_beam_search_one_best(
+            model=model,
+            decoding_graph=decoding_graph,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            beam=params.beam,
+            max_contexts=params.max_contexts,
+            max_states=params.max_states,
+        )
+        for hyp in sp.decode(hyp_tokens):
+            hyps.append(hyp.split())
+    elif (
         params.decoding_method == "greedy_search"
         and params.max_sym_per_frame == 1
     ):
-        hyp_list = greedy_search_batch(
+        hyp_tokens = greedy_search_batch(
             model=model,
             encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
         )
+        for hyp in sp.decode(hyp_tokens):
+            hyps.append(hyp.split())
     elif params.decoding_method == "modified_beam_search":
-        hyp_list = modified_beam_search(
+        hyp_tokens = modified_beam_search(
             model=model,
             encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
             beam=params.beam_size,
         )
+        for hyp in sp.decode(hyp_tokens):
+            hyps.append(hyp.split())
     else:
         batch_size = encoder_out.size(0)
+
         for i in range(batch_size):
             # fmt: off
             encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]]
@@ -226,14 +291,20 @@
                 raise ValueError(
                     f"Unsupported decoding method: {params.decoding_method}"
                 )
-            hyp_list.append(hyp)
-
-    hyps = [sp.decode(hyp).split() for hyp in hyp_list]
+            hyps.append(sp.decode(hyp).split())

     if params.decoding_method == "greedy_search":
         return {"greedy_search": hyps}
+    elif params.decoding_method == "fast_beam_search":
+        return {
+            (
+                f"beam_{params.beam}_"
+                f"max_contexts_{params.max_contexts}_"
+                f"max_states_{params.max_states}"
+            ): hyps
+        }
     else:
-        return {f"beam_{params.beam_size}": hyps}
+        return {f"beam_size_{params.beam_size}": hyps}


 def decode_dataset(
@@ -241,6 +312,7 @@
     params: AttributeDict,
     model: nn.Module,
     sp: spm.SentencePieceProcessor,
+    decoding_graph: Optional[k2.Fsa] = None,
 ) -> Dict[str, List[Tuple[List[str], List[str]]]]:
     """Decode dataset.

@@ -253,6 +325,9 @@
         The neural model.
       sp:
         The BPE model.
+      decoding_graph:
+        The decoding graph. Can be either a `k2.trivial_graph` or HLG. Used
+        only when --decoding-method is fast_beam_search.
     Returns:
       Return a dict, whose key may be "greedy_search" if greedy search
       is used, or it may be "beam_7" if beam size of 7 is used.
@@ -268,9 +343,9 @@
         num_batches = "?"
     if params.decoding_method == "greedy_search":
-        log_interval = 100
+        log_interval = 50
     else:
-        log_interval = 2
+        log_interval = 10

     results = defaultdict(list)
     for batch_idx, batch in enumerate(dl):
@@ -280,6 +355,7 @@
             params=params,
             model=model,
             sp=sp,
+            decoding_graph=decoding_graph,
             batch=batch,
         )

@@ -360,13 +436,21 @@ def main():
     assert params.decoding_method in (
         "greedy_search",
         "beam_search",
+        "fast_beam_search",
         "modified_beam_search",
     )
     params.res_dir = params.exp_dir / params.decoding_method

     params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
-    if "beam_search" in params.decoding_method:
-        params.suffix += f"-beam-{params.beam_size}"
+
+    if "fast_beam_search" in params.decoding_method:
+        params.suffix += f"-beam-{params.beam}"
+        params.suffix += f"-max-contexts-{params.max_contexts}"
+        params.suffix += f"-max-states-{params.max_states}"
+    elif "beam_search" in params.decoding_method:
+        params.suffix += (
+            f"-{params.decoding_method}-beam-size-{params.beam_size}"
+        )
     else:
         params.suffix += f"-context-{params.context_size}"
         params.suffix += f"-max-sym-per-frame-{params.max_sym_per_frame}"
@@ -408,6 +492,11 @@ def main():
     model.eval()
     model.device = device

+    if params.decoding_method == "fast_beam_search":
+        decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
+    else:
+        decoding_graph = None
+
     num_param = sum([p.numel() for p in model.parameters()])
     logging.info(f"Number of model parameters: {num_param}")

@@ -428,6 +517,7 @@
             params=params,
             model=model,
             sp=sp,
+            decoding_graph=decoding_graph,
         )

         save_results(
diff --git a/egs/librispeech/ASR/transducer_stateless2/pretrained.py b/egs/librispeech/ASR/transducer_stateless2/pretrained.py
index 2f0604893c..292f77f036 100755
--- a/egs/librispeech/ASR/transducer_stateless2/pretrained.py
+++ b/egs/librispeech/ASR/transducer_stateless2/pretrained.py
@@ -19,30 +19,39 @@
 (1) greedy search
 ./transducer_stateless2/pretrained.py \
-        --checkpoint ./transducer_stateless2/exp/pretrained.pt \
-        --bpe-model ./data/lang_bpe_500/bpe.model \
-        --method greedy_search \
-        --max-sym-per-frame 1 \
-        /path/to/foo.wav \
-        /path/to/bar.wav \
+    --checkpoint ./transducer_stateless2/exp/pretrained.pt \
+    --bpe-model ./data/lang_bpe_500/bpe.model \
+    --method greedy_search \
+    --max-sym-per-frame 1 \
+    /path/to/foo.wav \
+    /path/to/bar.wav

 (2) beam search
 ./transducer_stateless2/pretrained.py \
-        --checkpoint ./transducer_stateless2/exp/pretrained.pt \
-        --bpe-model ./data/lang_bpe_500/bpe.model \
-        --method beam_search \
-        --beam-size 4 \
-        /path/to/foo.wav \
-        /path/to/bar.wav \
+    --checkpoint ./transducer_stateless2/exp/pretrained.pt \
+    --bpe-model ./data/lang_bpe_500/bpe.model \
+    --method beam_search \
+    --beam-size 4 \
+    /path/to/foo.wav \
+    /path/to/bar.wav

 (3) modified beam search
 ./transducer_stateless2/pretrained.py \
-        --checkpoint ./transducer_stateless2/exp/pretrained.pt \
-        --bpe-model ./data/lang_bpe_500/bpe.model \
-        --method modified_beam_search \
-        --beam-size 4 \
-        /path/to/foo.wav \
-        /path/to/bar.wav \
+    --checkpoint ./transducer_stateless2/exp/pretrained.pt \
+    --bpe-model ./data/lang_bpe_500/bpe.model \
+    --method modified_beam_search \
+    --beam-size 4 \
+    /path/to/foo.wav \
+    /path/to/bar.wav
+
+(4) fast beam search
+./transducer_stateless2/pretrained.py \
+    --checkpoint ./transducer_stateless2/exp/pretrained.pt \
+    --bpe-model ./data/lang_bpe_500/bpe.model \
+    --method fast_beam_search \
+    --beam 4 \
+    /path/to/foo.wav \
+    /path/to/bar.wav

 You can also use `./transducer_stateless2/exp/epoch-xx.pt`.

@@ -56,12 +65,14 @@
 import math
 from typing import List

+import k2
 import kaldifeat
 import sentencepiece as spm
 import torch
 import torchaudio
 from beam_search import (
     beam_search,
+    fast_beam_search_one_best,
     greedy_search,
     greedy_search_batch,
     modified_beam_search,
@@ -87,9 +98,7 @@ def get_parser():
     parser.add_argument(
         "--bpe-model",
         type=str,
-        help="""Path to bpe.model.
-        Used only when method is ctc-decoding.
-        """,
+        help="""Path to bpe.model.""",
     )

     parser.add_argument(
@@ -100,6 +109,7 @@ def get_parser():
           - greedy_search
           - beam_search
           - modified_beam_search
+          - fast_beam_search
         """,
     )

@@ -124,7 +134,33 @@ def get_parser():
         "--beam-size",
         type=int,
         default=4,
-        help="Used only when --method is beam_search and modified_beam_search ",
+        help="""An integer indicating how many candidates we will keep for each
+        frame. Used only when --method is beam_search or
+        modified_beam_search.""",
+    )
+
+    parser.add_argument(
+        "--beam",
+        type=float,
+        default=4,
+        help="""A floating point value to calculate the cutoff score during beam
+        search (i.e., `cutoff = max-score - beam`), which is the same as the
+        `beam` in Kaldi.
+        Used only when --method is fast_beam_search""",
+    )
+
+    parser.add_argument(
+        "--max-contexts",
+        type=int,
+        default=4,
+        help="""Used only when --method is fast_beam_search""",
+    )
+
+    parser.add_argument(
+        "--max-states",
+        type=int,
+        default=8,
+        help="""Used only when --method is fast_beam_search""",
     )

     parser.add_argument(
@@ -241,15 +277,28 @@ def main():
         msg += f" with beam size {params.beam_size}"
     logging.info(msg)

-    if params.method == "greedy_search" and params.max_sym_per_frame == 1:
+    if params.method == "fast_beam_search":
+        decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
+        hyp_list = fast_beam_search_one_best(
+            model=model,
+            decoding_graph=decoding_graph,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            beam=params.beam,
+            max_contexts=params.max_contexts,
+            max_states=params.max_states,
+        )
+    elif params.method == "greedy_search" and params.max_sym_per_frame == 1:
         hyp_list = greedy_search_batch(
             model=model,
             encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
         )
     elif params.method == "modified_beam_search":
         hyp_list = modified_beam_search(
             model=model,
             encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
             beam=params.beam_size,
         )
     else:
diff --git a/egs/librispeech/ASR/transducer_stateless_multi_datasets/decode.py b/egs/librispeech/ASR/transducer_stateless_multi_datasets/decode.py
index 22f137d364..955366970e 100755
--- a/egs/librispeech/ASR/transducer_stateless_multi_datasets/decode.py
+++ b/egs/librispeech/ASR/transducer_stateless_multi_datasets/decode.py
@@ -22,17 +22,37 @@
     --epoch 14 \
     --avg 7 \
     --exp-dir ./transducer_stateless_multi_datasets/exp \
-    --max-duration 100 \
+    --max-duration 600 \
     --decoding-method greedy_search

-(2) beam search
+(2) beam search (not recommended)
 ./transducer_stateless_multi_datasets/decode.py \
     --epoch 14 \
     --avg 7 \
     --exp-dir ./transducer_stateless_multi_datasets/exp \
-    --max-duration 100 \
+    --max-duration 600 \
     --decoding-method beam_search \
     --beam-size 4
+
+(3) modified beam search
+./transducer_stateless_multi_datasets/decode.py \
+    --epoch 14 \
+    --avg 7 \
+    --exp-dir ./transducer_stateless_multi_datasets/exp \
+    --max-duration 600 \
+    --decoding-method modified_beam_search \
+    --beam-size 4
+
+(4) fast beam search
+./transducer_stateless_multi_datasets/decode.py \
+    --epoch 14 \
+    --avg 7 \
+    --exp-dir ./transducer_stateless_multi_datasets/exp \
+    --max-duration 600 \
+    --decoding-method fast_beam_search \
+    --beam 4 \
+    --max-contexts 4 \
+    --max-states 8
 """


@@ -40,14 +60,16 @@
 import logging
 from collections import defaultdict
 from pathlib import Path
-from typing import Dict, List, Tuple
+from typing import Dict, List, Optional, Tuple

+import k2
 import sentencepiece as spm
 import torch
 import torch.nn as nn
 from asr_datamodule import AsrDataModule
 from beam_search import (
     beam_search,
+    fast_beam_search_one_best,
     greedy_search,
     greedy_search_batch,
     modified_beam_search,
@@ -107,6 +129,7 @@ def get_parser():
           - greedy_search
           - beam_search
           - modified_beam_search
+          - fast_beam_search
         """,
     )

@@ -114,8 +137,35 @@
         "--beam-size",
         type=int,
         default=4,
+        help="""An integer indicating how many candidates we will keep for each
+        frame. Used only when --decoding-method is beam_search or
+        modified_beam_search.""",
+    )
+
+    parser.add_argument(
+        "--beam",
+        type=float,
+        default=4,
+        help="""A floating point value to calculate the cutoff score during beam
+        search (i.e., `cutoff = max-score - beam`), which is the same as the
+        `beam` in Kaldi.
+        Used only when --decoding-method is fast_beam_search""",
+    )
+
+    parser.add_argument(
+        "--max-contexts",
+        type=int,
+        default=4,
         help="""Used only when --decoding-method is
-        beam_search or modified_beam_search""",
+        fast_beam_search""",
+    )
+
+    parser.add_argument(
+        "--max-states",
+        type=int,
+        default=8,
+        help="""Used only when --decoding-method is
+        fast_beam_search""",
     )

     parser.add_argument(
@@ -141,6 +191,7 @@ def decode_one_batch(
     model: nn.Module,
     sp: spm.SentencePieceProcessor,
     batch: dict,
+    decoding_graph: Optional[k2.Fsa] = None,
 ) -> Dict[str, List[List[str]]]:
     """Decode one batch and return the result in a dict. The dict has the
     following format:
@@ -163,6 +214,9 @@ def decode_one_batch(
         It is the return value from iterating
         `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation
         for the format of the `batch`.
+      decoding_graph:
+        The decoding graph. Can be either a `k2.trivial_graph` or HLG. Used
+        only when --decoding-method is fast_beam_search.
     Returns:
       Return the decoding result. See above description for the format of
       the returned dict.
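
All three batched searches in beam_search.py produce hypotheses in the length-sorted order imposed by `pack_padded_sequence` and undo that sort at the very end. A toy sketch of the restore step, with values assumed purely for illustration:

    # Hypothetical results in length-sorted order, as produced while
    # iterating over packed_encoder_out.batch_sizes.
    sorted_ans = [[5, 9, 2], [7, 1], [3]]

    # packed_encoder_out.unsorted_indices for input lengths like [3, 1, 2].
    unsorted_indices = [0, 2, 1]

    # Map each original utterance i to its row in the sorted results.
    ans = [sorted_ans[unsorted_indices[i]] for i in range(len(sorted_ans))]
    print(ans)  # [[5, 9, 2], [3], [7, 1]]
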
@@ -180,24 +234,44 @@
     encoder_out, encoder_out_lens = model.encoder(
         x=feature, x_lens=feature_lens
     )
-    hyp_list = []
-    batch_size = encoder_out.size(0)
-    if (
+    hyps = []
+
+    if params.decoding_method == "fast_beam_search":
+        hyp_tokens = fast_beam_search_one_best(
+            model=model,
+            decoding_graph=decoding_graph,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            beam=params.beam,
+            max_contexts=params.max_contexts,
+            max_states=params.max_states,
+        )
+        for hyp in sp.decode(hyp_tokens):
+            hyps.append(hyp.split())
+    elif (
         params.decoding_method == "greedy_search"
         and params.max_sym_per_frame == 1
     ):
-        hyp_list = greedy_search_batch(
+        hyp_tokens = greedy_search_batch(
             model=model,
             encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
         )
+        for hyp in sp.decode(hyp_tokens):
+            hyps.append(hyp.split())
     elif params.decoding_method == "modified_beam_search":
-        hyp_list = modified_beam_search(
+        hyp_tokens = modified_beam_search(
             model=model,
             encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
             beam=params.beam_size,
         )
+        for hyp in sp.decode(hyp_tokens):
+            hyps.append(hyp.split())
     else:
+        batch_size = encoder_out.size(0)
+
         for i in range(batch_size):
             # fmt: off
             encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]]
@@ -218,14 +292,20 @@
                 raise ValueError(
                     f"Unsupported decoding method: {params.decoding_method}"
                 )
-            hyp_list.append(sp.decode(hyp).split())
-
-    hyps = [sp.decode(hyp).split() for hyp in hyp_list]
+            hyps.append(sp.decode(hyp).split())

     if params.decoding_method == "greedy_search":
         return {"greedy_search": hyps}
+    elif params.decoding_method == "fast_beam_search":
+        return {
+            (
+                f"beam_{params.beam}_"
+                f"max_contexts_{params.max_contexts}_"
+                f"max_states_{params.max_states}"
+            ): hyps
+        }
     else:
-        return {f"beam_{params.beam_size}": hyps}
+        return {f"beam_size_{params.beam_size}": hyps}


 def decode_dataset(
@@ -233,6 +313,7 @@
     params: AttributeDict,
     model: nn.Module,
     sp: spm.SentencePieceProcessor,
+    decoding_graph: Optional[k2.Fsa] = None,
 ) -> Dict[str, List[Tuple[List[str], List[str]]]]:
     """Decode dataset.

@@ -245,6 +326,9 @@
         The neural model.
       sp:
         The BPE model.
+      decoding_graph:
+        The decoding graph. Can be either a `k2.trivial_graph` or HLG. Used
+        only when --decoding-method is fast_beam_search.
     Returns:
       Return a dict, whose key may be "greedy_search" if greedy search
       is used, or it may be "beam_7" if beam size of 7 is used.
@@ -260,9 +344,9 @@
         num_batches = "?"
     if params.decoding_method == "greedy_search":
-        log_interval = 100
+        log_interval = 50
     else:
-        log_interval = 2
+        log_interval = 10

     results = defaultdict(list)
     for batch_idx, batch in enumerate(dl):
@@ -272,6 +356,7 @@
             params=params,
             model=model,
             sp=sp,
+            decoding_graph=decoding_graph,
             batch=batch,
         )

@@ -352,13 +437,21 @@ def main():
     assert params.decoding_method in (
         "greedy_search",
         "beam_search",
+        "fast_beam_search",
         "modified_beam_search",
     )
     params.res_dir = params.exp_dir / params.decoding_method

     params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
-    if "beam_search" in params.decoding_method:
-        params.suffix += f"-beam-{params.beam_size}"
+
+    if "fast_beam_search" in params.decoding_method:
+        params.suffix += f"-beam-{params.beam}"
+        params.suffix += f"-max-contexts-{params.max_contexts}"
+        params.suffix += f"-max-states-{params.max_states}"
+    elif "beam_search" in params.decoding_method:
+        params.suffix += (
+            f"-{params.decoding_method}-beam-size-{params.beam_size}"
+        )
     else:
         params.suffix += f"-context-{params.context_size}"
         params.suffix += f"-max-sym-per-frame-{params.max_sym_per_frame}"
@@ -402,6 +495,11 @@ def main():
     model.eval()
     model.device = device

+    if params.decoding_method == "fast_beam_search":
+        decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
+    else:
+        decoding_graph = None
+
     num_param = sum([p.numel() for p in model.parameters()])
     logging.info(f"Number of model parameters: {num_param}")

@@ -423,6 +521,7 @@
             params=params,
             model=model,
             sp=sp,
+            decoding_graph=decoding_graph,
         )

         save_results(
diff --git a/egs/librispeech/ASR/transducer_stateless_multi_datasets/pretrained.py b/egs/librispeech/ASR/transducer_stateless_multi_datasets/pretrained.py
index df9c3186fb..f297fa2b2f 100755
--- a/egs/librispeech/ASR/transducer_stateless_multi_datasets/pretrained.py
+++ b/egs/librispeech/ASR/transducer_stateless_multi_datasets/pretrained.py
@@ -44,6 +44,15 @@
     /path/to/foo.wav \
     /path/to/bar.wav

+(4) fast beam search
+./transducer_stateless_multi_datasets/pretrained.py \
+    --checkpoint ./transducer_stateless_multi_datasets/exp/pretrained.pt \
+    --bpe-model ./data/lang_bpe_500/bpe.model \
+    --method fast_beam_search \
+    --beam 4 \
+    /path/to/foo.wav \
+    /path/to/bar.wav
+
 You can also use `./transducer_stateless_multi_datasets/exp/epoch-xx.pt`.

 Note: ./transducer_stateless_multi_datasets/exp/pretrained.pt is generated by
@@ -56,12 +65,14 @@
 import math
 from typing import List

+import k2
 import kaldifeat
 import sentencepiece as spm
 import torch
 import torchaudio
 from beam_search import (
     beam_search,
+    fast_beam_search_one_best,
     greedy_search,
     greedy_search_batch,
     modified_beam_search,
@@ -87,9 +98,7 @@ def get_parser():
     parser.add_argument(
         "--bpe-model",
         type=str,
-        help="""Path to bpe.model.
-        Used only when method is ctc-decoding.
-        """,
+        help="""Path to bpe.model.""",
     )

     parser.add_argument(
@@ -100,6 +109,7 @@ def get_parser():
           - greedy_search
           - beam_search
           - modified_beam_search
+          - fast_beam_search
         """,
     )

@@ -124,7 +134,33 @@
         "--beam-size",
         type=int,
         default=4,
-        help="Used only when --method is beam_search and modified_beam_search ",
+        help="""An integer indicating how many candidates we will keep for each
+        frame. Used only when --method is beam_search or
+        modified_beam_search.""",
+    )
+
+    parser.add_argument(
+        "--beam",
+        type=float,
+        default=4,
+        help="""A floating point value to calculate the cutoff score during beam
+        search (i.e., `cutoff = max-score - beam`), which is the same as the
+        `beam` in Kaldi.
+        Used only when --method is fast_beam_search""",
+    )
+
+    parser.add_argument(
+        "--max-contexts",
+        type=int,
+        default=4,
+        help="""Used only when --method is fast_beam_search""",
+    )
+
+    parser.add_argument(
+        "--max-states",
+        type=int,
+        default=8,
+        help="""Used only when --method is fast_beam_search""",
     )

     parser.add_argument(
@@ -241,18 +277,30 @@ def main():
         msg += f" with beam size {params.beam_size}"
     logging.info(msg)

-    if params.method == "greedy_search" and params.max_sym_per_frame == 1:
+    if params.method == "fast_beam_search":
+        decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
+        hyp_list = fast_beam_search_one_best(
+            model=model,
+            decoding_graph=decoding_graph,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            beam=params.beam,
+            max_contexts=params.max_contexts,
+            max_states=params.max_states,
+        )
+    elif params.method == "greedy_search" and params.max_sym_per_frame == 1:
         hyp_list = greedy_search_batch(
             model=model,
             encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
         )
     elif params.method == "modified_beam_search":
         hyp_list = modified_beam_search(
             model=model,
             encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
             beam=params.beam_size,
         )
-
     else:
         for i in range(num_waves):
             # fmt: off
diff --git a/egs/tedlium3/ASR/pruned_transducer_stateless/decode.py b/egs/tedlium3/ASR/pruned_transducer_stateless/decode.py
index fd8d2dd0e1..4d9d3c3cfb 100755
--- a/egs/tedlium3/ASR/pruned_transducer_stateless/decode.py
+++ b/egs/tedlium3/ASR/pruned_transducer_stateless/decode.py
@@ -69,7 +69,7 @@
 from asr_datamodule import TedLiumAsrDataModule
 from beam_search import (
     beam_search,
-    fast_beam_search,
+    fast_beam_search_one_best,
     greedy_search,
     greedy_search_batch,
     modified_beam_search,
@@ -237,7 +237,7 @@ def decode_one_batch(
     hyps = []

     if params.decoding_method == "fast_beam_search":
-        hyp_tokens = fast_beam_search(
+        hyp_tokens = fast_beam_search_one_best(
             model=model,
             decoding_graph=decoding_graph,
             encoder_out=encoder_out,
@@ -255,6 +255,7 @@
         hyp_tokens = greedy_search_batch(
             model=model,
             encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
         )
         for hyp in sp.decode(hyp_tokens):
             hyps.append(hyp.split())
@@ -262,6 +263,7 @@
         hyp_tokens = modified_beam_search(
             model=model,
             encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
             beam=params.beam_size,
         )
         for hyp in sp.decode(hyp_tokens):
diff --git a/egs/tedlium3/ASR/pruned_transducer_stateless/pretrained.py b/egs/tedlium3/ASR/pruned_transducer_stateless/pretrained.py
index 08e4962e2d..8480ac029c 100644
--- a/egs/tedlium3/ASR/pruned_transducer_stateless/pretrained.py
+++ b/egs/tedlium3/ASR/pruned_transducer_stateless/pretrained.py
@@ -72,23 +72,16 @@
 import kaldifeat
 import sentencepiece as spm
 import torch
-import torch.nn as nn
 import torchaudio
 from beam_search import (
     beam_search,
-    fast_beam_search,
+    fast_beam_search_one_best,
     greedy_search,
     greedy_search_batch,
     modified_beam_search,
 )
-from conformer import Conformer
-from decoder import Decoder
-from joiner import Joiner
-from model import Transducer
 from torch.nn.utils.rnn import pad_sequence
-
-from icefall.env import get_env_info
-from icefall.utils import AttributeDict
+from train import get_params, get_transducer_model


 def get_parser():
@@ -185,74 +178,14 @@
         """,
     )

-    return parser
-
-
-def get_params() -> AttributeDict:
-    params = AttributeDict(
-        {
-            "sample_rate": 16000,
-            # parameters for conformer
-            "feature_dim": 80,
-            "subsampling_factor": 4,
-            "attention_dim": 512,
-            "nhead": 8,
-            "dim_feedforward": 2048,
-            "num_encoder_layers": 12,
-            "vgg_frontend": False,
-            # parameters for decoder
-            "embedding_dim": 512,
-            "env_info": get_env_info(),
-        }
-    )
-    return params
-
-
-def get_encoder_model(params: AttributeDict) -> nn.Module:
-    encoder = Conformer(
-        num_features=params.feature_dim,
-        output_dim=params.vocab_size,
-        subsampling_factor=params.subsampling_factor,
-        d_model=params.attention_dim,
-        nhead=params.nhead,
-        dim_feedforward=params.dim_feedforward,
-        num_encoder_layers=params.num_encoder_layers,
-        vgg_frontend=params.vgg_frontend,
-    )
-    return encoder
-
-
-def get_decoder_model(params: AttributeDict) -> nn.Module:
-    decoder = Decoder(
-        vocab_size=params.vocab_size,
-        embedding_dim=params.embedding_dim,
-        blank_id=params.blank_id,
-        unk_id=params.unk_id,
-        context_size=params.context_size,
-    )
-    return decoder
-
-
-def get_joiner_model(params: AttributeDict) -> nn.Module:
-    joiner = Joiner(
-        input_dim=params.vocab_size,
-        inner_dim=params.embedding_dim,
-        output_dim=params.vocab_size,
+    parser.add_argument(
+        "--sample-rate",
+        type=int,
+        default=16000,
+        help="The sample rate of the input sound file",
     )
-    return joiner
-
-
-def get_transducer_model(params: AttributeDict) -> nn.Module:
-    encoder = get_encoder_model(params)
-    decoder = get_decoder_model(params)
-    joiner = get_joiner_model(params)
-    model = Transducer(
-        encoder=encoder,
-        decoder=decoder,
-        joiner=joiner,
-    )
-    return model
+    return parser


 def read_sound_files(
@@ -354,7 +287,7 @@ def main():
     logging.info(msg)

     if params.decoding_method == "fast_beam_search":
-        hyp_tokens = fast_beam_search(
+        hyp_tokens = fast_beam_search_one_best(
             model=model,
             decoding_graph=decoding_graph,
             encoder_out=encoder_out,
@@ -372,6 +305,7 @@
         hyp_tokens = greedy_search_batch(
             model=model,
             encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
         )
         for hyp in sp.decode(hyp_tokens):
             hyps.append(hyp.split())
@@ -379,6 +313,7 @@
         hyp_tokens = modified_beam_search(
             model=model,
             encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
             beam=params.beam_size,
         )
         for hyp in sp.decode(hyp_tokens):
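
With `get_params()` and `get_transducer_model()` now imported from `train.py` instead of being duplicated, loading a released checkpoint in `pretrained.py` boils down to a few lines. A rough sketch under assumed parameter values; in the real script `vocab_size`, `blank_id` and friends come from the BPE model and the CLI flags:

    import torch
    from train import get_params, get_transducer_model  # provided by the recipe

    params = get_params()
    params.vocab_size = 500   # assumed; read from the BPE model in practice
    params.blank_id = 0       # assumed
    params.unk_id = 2         # assumed
    params.context_size = 2   # assumed

    model = get_transducer_model(params)
    checkpoint = torch.load("pretrained.pt", map_location="cpu")  # path assumed
    model.load_state_dict(checkpoint["model"], strict=False)
    model.eval()

Keeping a single model-construction path means the pretrained script can never drift out of sync with the architecture used during training.
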