This repository has been archived by the owner on Oct 13, 2022. It is now read-only.

Use a pre-trained bigram P for LF-MMI training #222

Merged
merged 4 commits on Jul 2, 2021
Changes from all commits
2 changes: 2 additions & 0 deletions egs/librispeech/asr/simple_v1/.gitignore
@@ -1 +1,3 @@
*.txt
data
exp
110 changes: 110 additions & 0 deletions egs/librispeech/asr/simple_v1/local/add_silence_to_transcript.py
@@ -0,0 +1,110 @@
#!/usr/bin/env python3

# Copyright 2021 Xiaomi Corporation (Author: Fangjun Kuang)
'''
Add silence with a given probability after each word in the transcript.

If the input transcript contains:

hello world
foo bar koo
zoo

Then the output transcript **may** look like the following:

!SIL hello !SIL world !SIL
foo bar !SIL koo !SIL
!SIL zoo !SIL

(Assume !SIL represents silence.)
'''
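
# Example invocation (a sketch only; the file names below are hypothetical
# and not taken from the recipe):
#
#   ./local/add_silence_to_transcript.py \
#     --transcript data/local/transcript.txt \
#     --sil-word '!SIL' \
#     --sil-prob 0.5 > transcript_with_sil.txt
#
# The modified transcript is written to stdout.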

from pathlib import Path

import argparse
import random


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--transcript',
                        type=str,
                        help='The input transcript file. '
                        'We assume that the transcript file consists of '
                        'lines. Each line consists of space separated words.')
    parser.add_argument('--sil-word',
                        type=str,
                        default='!SIL',
                        help='The word that represents silence.')
    parser.add_argument('--sil-prob',
                        type=float,
                        default=0.5,
                        help='The probability for adding a '
                        'silence after each word.')
    parser.add_argument('--seed',
                        type=int,
                        default=None,
                        help='The seed for random number generators.')

    return parser.parse_args()


def need_silence(sil_prob: float) -> bool:
    '''
    Args:
      sil_prob:
        The probability to add a silence.
    Returns:
      Return True if a silence is needed.
      Return False otherwise.
    '''
    return random.uniform(0, 1) <= sil_prob


def process_line(line: str, sil_word: str, sil_prob: float) -> None:
    '''Process a single line from the transcript.

    Args:
      line:
        A str containing space separated words.
      sil_word:
        The symbol indicating silence.
      sil_prob:
        The probability for adding a silence after each word.
    Returns:
      Return None.
    '''
    words = line.strip().split()
    for i, word in enumerate(words):
        if i == 0:
            # beginning of the line
            if need_silence(sil_prob):
                print(sil_word, end=' ')

        print(word, end=' ')

        if need_silence(sil_prob):
            print(sil_word, end=' ')

        # end of the line, print a new line
        if i == len(words) - 1:
            print()


def main():
    args = get_args()
    random.seed(args.seed)

    assert Path(args.transcript).is_file()
    assert len(args.sil_word) > 0
    assert 0 < args.sil_prob < 1

    with open(args.transcript) as f:
        for line in f:
            process_line(line=line,
                         sil_word=args.sil_word,
                         sil_prob=args.sil_prob)


if __name__ == '__main__':
    main()
132 changes: 132 additions & 0 deletions egs/librispeech/asr/simple_v1/local/convert_transcript_to_corpus.py
@@ -0,0 +1,132 @@
#!/usr/bin/env python3

# Copyright 2021 Xiaomi Corporation (Author: Fangjun Kuang)
'''
Convert a transcript file to a corpus for LM training with
the help of a lexicon. If the lexicon contains phones, the resulting
LM will be a phone LM; if the lexicon contains word pieces,
the resulting LM will be a word piece LM.

If a word has multiple pronunciations, only the first one is used.

If the input transcript is:

hello zoo world hello
world zoo
foo zoo world hellO

and if the lexicon is

<UNK> SPN
hello h e l l o 2
hello h e l l o
world w o r l d
zoo z o o

Then the output is

h e l l o 2 z o o w o r l d h e l l o 2
w o r l d z o o
SPN z o o w o r l d SPN
'''
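
# Example invocation (a sketch only; the file names below are hypothetical
# and not taken from the recipe):
#
#   ./local/convert_transcript_to_corpus.py \
#     --transcript data/local/transcript.txt \
#     --lexicon data/local/dict/lexicon.txt \
#     --oov '<UNK>' > corpus.txt
#
# The resulting corpus (one pronunciation sequence per line) can then be used
# to train a phone or word-piece LM, e.g. the bigram P for LF-MMI training.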

from pathlib import Path
from typing import Dict

import argparse


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--transcript',
                        type=str,
                        help='The input transcript file. '
                        'We assume that the transcript file consists of '
                        'lines. Each line consists of space separated words.')
    parser.add_argument('--lexicon', type=str, help='The input lexicon file.')
    parser.add_argument('--oov',
                        type=str,
                        default='<UNK>',
                        help='The OOV word.')

    return parser.parse_args()


def read_lexicon(filename: str) -> Dict[str, str]:
    '''
    Args:
      filename:
        Filename to the lexicon. Each line in the lexicon
        has the following format:

            word p1 p2 p3

        where the first field is a word and the remaining fields
        are the pronunciations of the word. Fields are separated
        by spaces.
    Returns:
      Return a dict whose keys are words and values are the pronunciations.
    '''
    ans = dict()
    with open(filename) as f:
        for line in f:
            line = line.strip()

            if len(line) == 0:
                # skip empty lines
                continue

            fields = line.split()
            assert len(fields) >= 2

            word = fields[0]
            pron = ' '.join(fields[1:])

            if word not in ans:
                # In case a word has multiple pronunciations,
                # we only use the first one
                ans[word] = pron
    return ans
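
# For the lexicon shown in the module docstring above, read_lexicon() would
# return (only the first pronunciation of 'hello' is kept):
#
#   {'<UNK>': 'SPN', 'hello': 'h e l l o 2',
#    'world': 'w o r l d', 'zoo': 'z o o'}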


def process_line(lexicon: Dict[str, str], line: str, oov_pron: str) -> None:
    '''
    Args:
      lexicon:
        A dict containing pronunciations. Its keys are words and values
        are pronunciations.
      line:
        A line of transcript consisting of space separated words.
      oov_pron:
        The pronunciation of the OOV word, used when a word in the line
        is not present in the lexicon.
    Returns:
      Return None.
    '''
    words = line.strip().split()
    for i, w in enumerate(words):
        pron = lexicon.get(w, oov_pron)
        print(pron, end=' ')
        if i == len(words) - 1:
            # end of the line, print a new line
            print()


def main():
    args = get_args()
    assert Path(args.lexicon).is_file()
    assert Path(args.transcript).is_file()
    assert len(args.oov) > 0

    lexicon = read_lexicon(args.lexicon)
    assert args.oov in lexicon

    oov_pron = lexicon[args.oov]

    with open(args.transcript) as f:
        for line in f:
            process_line(lexicon=lexicon, line=line, oov_pron=oov_pron)


if __name__ == '__main__':
    main()