-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTaskfile.yml
172 lines (138 loc) · 6.54 KB
/
Taskfile.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
# End-to-end data preparation: converts the ELAN corpus to JSON, derives the
# Kaldi data files, wordlist and lexicon from it, then stages audio, conf files
# and helper scripts under OUTPUT_PATH/kaldi.
# Entries of the form ^name run the named task from this Taskfile
# (go-task v1 call syntax).
_build-default:
  desc: "Run through entire processing pipeline"
  cmds:
    # Refresh the working copy before building.
    - git pull
    # Start from an empty output tree with the folders later steps write to.
    - ^clean-output-folder
    - ^tmp-makedir
    - ^make-kaldi-subfolders
    # ELAN -> one JSON file -> Kaldi-format files in tmp/json_splitted.
    - task elan-to-json > {{ .OUTPUT_PATH }}/tmp/toy_elan.json
    - task json-to-kaldi < {{ .OUTPUT_PATH }}/tmp/toy_elan.json
    - cp {{ .OUTPUT_PATH }}/tmp/json_splitted/corpus.txt {{ .OUTPUT_PATH }}/kaldi/data/local/
    # Extract the unique wordlist for the corpus, then generate the
    # pronunciation dictionary and the phone lists.
    - task make-wordlist < {{ .OUTPUT_PATH }}/tmp/toy_elan.json > {{ .OUTPUT_PATH }}/tmp/corpus_wordlist.txt
    - task make-prn-dict < {{ .OUTPUT_PATH }}/tmp/corpus_wordlist.txt > {{ .OUTPUT_PATH }}/kaldi/data/local/dict/lexicon.txt
    - task make-nonsil-phones > {{ .OUTPUT_PATH }}/kaldi/data/local/dict/nonsilence_phones.txt
    - task copy-silence-phones
    # The same four data files are copied into both the test and train sets.
    - cp {{ .OUTPUT_PATH }}/tmp/json_splitted/segments {{ .OUTPUT_PATH }}/tmp/json_splitted/text {{ .OUTPUT_PATH }}/tmp/json_splitted/utt2spk {{ .OUTPUT_PATH }}/tmp/json_splitted/wav.scp {{ .OUTPUT_PATH }}/kaldi/data/test/
    - cp {{ .OUTPUT_PATH }}/tmp/json_splitted/segments {{ .OUTPUT_PATH }}/tmp/json_splitted/text {{ .OUTPUT_PATH }}/tmp/json_splitted/utt2spk {{ .OUTPUT_PATH }}/tmp/json_splitted/wav.scp {{ .OUTPUT_PATH }}/kaldi/data/train/
    # Disabled alternative that sorted each file into train/ and then copied
    # it to test/ (kept for reference):
    # - sort {{ .OUTPUT_PATH }}/tmp/json_splitted/segments > {{ .OUTPUT_PATH }}/kaldi/data/train/segments
    # - cp {{ .OUTPUT_PATH }}/kaldi/data/train/segments {{ .OUTPUT_PATH }}/kaldi/data/test/
    # - sort {{ .OUTPUT_PATH }}/tmp/json_splitted/wav.scp > {{ .OUTPUT_PATH }}/kaldi/data/train/wav.scp
    # - cp {{ .OUTPUT_PATH }}/kaldi/data/train/wav.scp {{ .OUTPUT_PATH }}/kaldi/data/test/
    # - sort {{ .OUTPUT_PATH }}/tmp/json_splitted/text > {{ .OUTPUT_PATH }}/kaldi/data/train/text
    # - cp {{ .OUTPUT_PATH }}/kaldi/data/train/text {{ .OUTPUT_PATH }}/kaldi/data/test/
    # - sort {{ .OUTPUT_PATH }}/tmp/json_splitted/utt2spk > {{ .OUTPUT_PATH }}/kaldi/data/train/utt2spk
    # - cp {{ .OUTPUT_PATH }}/kaldi/data/train/utt2spk {{ .OUTPUT_PATH }}/kaldi/data/test/
    # Bundle the audio, unpack it into the Kaldi tree, then add conf files
    # and helper scripts.
    - task gather-wavs extract-wavs
    - ^make-conf-files
    - ^copy-helper-scripts
    - echo "######################## Build task completed without errors"
# Launches run.sh from inside output/kaldi (the script that
# copy-helper-scripts places in the Kaldi output folder).
_run-training:
  desc: "Run Kaldi on prepared data"
  dir: output/kaldi
  cmds:
    - ./run.sh
# Tasks from the default /kaldi-helpers/Taskfile.yml
# Populates OUTPUT_PATH/kaldi with the shell scripts a training run needs:
# cmd.sh, run.sh and local/score.sh from the digits demo prep folder, path.sh
# from the current directory, and the steps/ and utils/ trees from KALDI_PATH.
copy-helper-scripts:
  desc: "Copy the necessary scripts from Kaldi"
  cmds:
    - cp /kaldi-helpers/pronunciation/kaldi-demo-prep/digits/cmd.sh {{ .OUTPUT_PATH }}/kaldi/
    # path.sh is resolved relative to the directory the task runs in.
    - cp path.sh {{ .OUTPUT_PATH }}/kaldi/
    - cp /kaldi-helpers/pronunciation/kaldi-demo-prep/digits/run.sh {{ .OUTPUT_PATH }}/kaldi/
    - cp /kaldi-helpers/pronunciation/kaldi-demo-prep/digits/local/score.sh {{ .OUTPUT_PATH }}/kaldi/local/score.sh
    # Reuse the steps and utils directories of the wsj/s5 recipe that ships
    # with Kaldi.
    - cp -R {{ .KALDI_PATH }}/egs/wsj/s5/steps {{ .OUTPUT_PATH }}/kaldi/steps
    - cp -R {{ .KALDI_PATH }}/egs/wsj/s5/utils {{ .OUTPUT_PATH }}/kaldi/utils
# Copies every file from the digits demo conf folder into OUTPUT_PATH/kaldi/conf.
make-conf-files:
  desc: "Make and copy necessary configuration files into output/kaldi/conf"
  cmds:
    # TODO: render these files from a Mustache template filled with the
    # relevant Taskvars.yml variables instead of copying them from the
    # pronunciation/... folder.
    - cp pronunciation/kaldi-demo-prep/digits/conf/* {{ .OUTPUT_PATH }}/kaldi/conf
##################### Test definitions
# Writes silence_phones.txt and optional_silence.txt into the Kaldi dict
# folder. Each file is built from a comma-separated template variable, with
# every comma turned into a newline so there is one phone per line.
copy-silence-phones:
  desc: "Copy or make relevant silence/optional silence phones config for Kaldi"
  cmds:
    # TODO: Let users define this as part of the CONFIG variables inside
    # Taskvars.yml
    - echo "{{ .SILENCE_PHONES }}" | tr , '\n' > {{ .OUTPUT_PATH }}/kaldi/data/local/dict/silence_phones.txt
    - echo "{{ .OPTIONAL_SILENCE_PHONES }}" | tr , '\n' > {{ .OUTPUT_PATH }}/kaldi/data/local/dict/optional_silence.txt
    # Disabled alternative: copy ready-made files from input/ instead.
    # - cp input/optional_silence.txt {{ .OUTPUT_PATH }}/kaldi/data/local/dict/
    # - cp input/silence_phones.txt {{ .OUTPUT_PATH }}/kaldi/data/local/dict/
# Archives every .wav file found below input/data into media.tar inside the
# output folder. Paths inside the archive are relative to input/data.
gather-wavs:
  desc: "Gather all wav files inside input/data into output/media.tar"
  dir: input/data
  cmds:
    # The file list is streamed to tar NUL-separated so that names containing
    # spaces or newlines stay intact, and -name '*.wav' only matches files
    # whose name ends in .wav (the previous grep matched ".wav" anywhere in
    # the path). Directories are skipped so tar does not recurse into them.
    # NOTE: with no matching files this creates an empty archive rather than
    # failing.
    - find . -name '*.wav' ! -type d -print0 | tar cf /kaldi-helpers/{{ .OUTPUT_PATH }}/media.tar --null -T -
# Unpacks media.tar (expected in output/) into output/kaldi and then removes
# the archive.
extract-wavs:
  desc: "Extract all wav files into kaldi folder"
  dir: output
  cmds:
    - tar xf media.tar -C kaldi
    - rm media.tar
# Empties the output folder. The path is the literal relative folder
# "output", not the OUTPUT_PATH variable used by most other tasks.
clean-output-folder:
  desc: "Delete all files and folders inside /kaldi-helpers/output"
  cmds:
    - rm -rf output/*
# Ensures OUTPUT_PATH/tmp exists.
tmp-makedir:
  desc: "Make the tmp directory, if it does not exist"
  cmds:
    # mkdir -p is a no-op when the directory already exists and also creates
    # a missing OUTPUT_PATH parent, which the plain mkdir did not.
    - mkdir -p {{ .OUTPUT_PATH }}/tmp
# Empties OUTPUT_PATH/tmp. The tmp-makedir dependency guarantees the
# directory exists before it is cleared.
tmp-delete:
  desc: "Delete all files in tmp directory"
  deps: [tmp-makedir]
  cmds:
    # Only the tmp folder is cleared; previously the glob targeted the whole
    # of OUTPUT_PATH, which also removed the kaldi output.
    # -f makes rm succeed when tmp is already empty, so no placeholder file
    # is needed.
    - rm -rf {{ .OUTPUT_PATH }}/tmp/*
# Creates the directory skeleton under OUTPUT_PATH/kaldi that the build
# steps copy files into. mkdir -p makes every call safe to repeat.
make-kaldi-subfolders:
  desc: "Makes subfolder structure which Kaldi expects"
  cmds:
    # Data folders: dictionary files plus the test and train sets.
    - mkdir -p {{ .OUTPUT_PATH }}/kaldi/data/local/dict
    - mkdir -p {{ .OUTPUT_PATH }}/kaldi/data/test
    - mkdir -p {{ .OUTPUT_PATH }}/kaldi/data/train
    # Disabled: per-project audio folders.
    # - mkdir -p {{ .OUTPUT_PATH }}/kaldi/{{ .KALDI_PROJECT_NAME }}_audio/test
    # - mkdir -p {{ .OUTPUT_PATH }}/kaldi/{{ .KALDI_PROJECT_NAME }}_audio/train
    # Top-level folders for configuration and local scripts.
    - mkdir -p {{ .OUTPUT_PATH }}/kaldi/conf
    - mkdir -p {{ .OUTPUT_PATH }}/kaldi/local
# Prints the non-silence phone list to stdout, derived from the
# LETTER_TO_SOUND file under SCRIPTS_PATH.
make-nonsil-phones:
  desc: "Generate non-silence phones file from LETTER_TO_SOUND file defined in Taskfile.yml"
  cmds:
    # One shell pipeline, folded into a single line by the >- scalar:
    # drop lines starting with '#', keep the second space-separated field,
    # drop empty results, then sort and de-duplicate.
    - >-
      grep -v '^#' < {{ .SCRIPTS_PATH }}/{{ .LETTER_TO_SOUND }}
      | cut -d' ' -f2
      | grep -v '^$'
      | sort -u
# Runs elan_to_json.py on CORPUS_PATH; the build pipeline redirects its
# stdout into a JSON file.
elan-to-json:
  desc: "Convert a folder of .eaf files to a single JSON file"
  env:
    # Force UTF-8 on the interpreter's stdin/stdout/stderr regardless of locale.
    PYTHONIOENCODING: "utf-8"
  cmds:
    # Uses the generic python3 launcher like every other Python task in this
    # file, instead of pinning the python3.6 binary.
    # NOTE(review): assumes python3 on the build image is at least 3.6 — confirm.
    - python3 {{ .SCRIPTS_PATH }}/audio_segmenter/elan_to_json.py {{ .CORPUS_PATH }}
# Filter task: jq slurps every JSON value on stdin into one array (-s) and
# combines its elements with `add`.
cat-all-json:
  desc: "Concatenate all .json files into one .json file"
  cmds:
    - jq -s '. | add'
# Runs the clean_text.py script from SCRIPTS_PATH/text_processing.
clean-json:
  desc: "Clean corpus of problematic characters before passing data to Kaldi"
  env:
    # Force UTF-8 on the interpreter's stdin/stdout/stderr regardless of locale.
    PYTHONIOENCODING: "utf-8"
  cmds:
    - python3 {{ .SCRIPTS_PATH }}/text_processing/clean_text.py
# Runs make_wordlist.py; the build pipeline feeds it the corpus JSON on stdin
# and redirects its stdout to the wordlist file.
make-wordlist:
  desc: "Make a list of unique words that occur in the corpus"
  env:
    # Force UTF-8 on the interpreter's stdin/stdout/stderr regardless of locale.
    PYTHONIOENCODING: "utf-8"
  cmds:
    - python3 {{ .SCRIPTS_PATH }}/text_processing/make_wordlist.py
# Runs pronunciation/convert.py with the LETTER_TO_SOUND file as its --config;
# the build pipeline feeds it the wordlist on stdin and redirects its stdout
# to lexicon.txt.
make-prn-dict:
  desc: "Make pronunciation dictionary"
  env:
    # Force UTF-8 on the interpreter's stdin/stdout/stderr regardless of locale.
    PYTHONIOENCODING: "utf-8"
  cmds:
    - python3 {{ .SCRIPTS_PATH }}/pronunciation/convert.py --config {{ .SCRIPTS_PATH }}/{{ .LETTER_TO_SOUND }}
# Runs json_to_kaldi.py, which writes its output files into
# OUTPUT_PATH/tmp/json_splitted. The build pipeline feeds it the corpus JSON
# on stdin.
json-to-kaldi:
  desc: "Generate files for the Kaldi format"
  env:
    # Same setting as the other Python tasks in this file: force UTF-8 on the
    # interpreter's stdin/stdout/stderr so non-ASCII corpus text does not
    # depend on the locale.
    # NOTE(review): presumably the script reads the JSON from stdin — confirm.
    PYTHONIOENCODING: "utf-8"
  cmds:
    - python3 {{ .SCRIPTS_PATH }}/audio_segmenter/json_to_kaldi.py --output-folder="{{ .OUTPUT_PATH }}/tmp/json_splitted"