add a data prep example for lhotse

freewym · Nov 6, 2020 · f21dd7c · f21dd7c
1 parent 1b0c26c
commit f21dd7c
Show file tree

Hide file tree

Showing 5 changed files with 216 additions and 0 deletions.
diff --git a/espresso/tools/.gitignore b/espresso/tools/.gitignore
@@ -1,3 +1,4 @@
 kaldi
 openfst*
 pychain
+lhotse
diff --git a/examples/mobvoihotwords/cmd.sh b/examples/mobvoihotwords/cmd.sh
@@ -0,0 +1,20 @@
+# you can change cmd.sh depending on what type of queue you are using.
+# If you have no queueing system and want to run on a local machine, you
+# can change all instances of 'queue.pl' to run.pl (but be careful and run
+# commands one by one: most recipes will exhaust the memory on your
+# machine).  queue.pl works with GridEngine (qsub).  slurm.pl works
+# with slurm.  Different queues are configured differently, with different
+# queue names and different ways of specifying things like memory;
+# to account for these differences you can create and edit the file
+# conf/queue.conf to match your queue's configuration.  Search for
+# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
+# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
+
+# Local-machine alternative (no queueing system): uncomment these three lines
+# and comment out the JHU setup below.
+#export train_cmd="run.pl --mem 4G"
+#export cuda_cmd="run.pl --mem 4G --gpu 1"
+#export decode_cmd="run.pl --mem 4G"
+
+# JHU setup (copy queue-freegpu.pl from ESPnet into utils/)
+# train_cmd and decode_cmd request 4G of memory through queue.pl; cuda_cmd
+# requests 8G and one GPU through queue-freegpu.pl, reading its queue options
+# from conf/gpu.conf.
+export train_cmd="queue.pl --mem 4G"
+export cuda_cmd="queue-freegpu.pl --mem 8G --gpu 1 --config conf/gpu.conf"
+export decode_cmd="queue.pl --mem 4G"
diff --git a/examples/mobvoihotwords/conf/gpu.conf b/examples/mobvoihotwords/conf/gpu.conf
@@ -0,0 +1,10 @@
+# Default configuration
+command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
+option mem=* -l mem_free=$0,ram_free=$0
+option mem=0          # Do not add anything to qsub_opts
+option num_threads=* -pe smp $0
+option num_threads=1  # Do not add anything to qsub_opts
+option max_jobs_run=* -tc $0
+default gpu=0
+option gpu=0
+option gpu=* -l 'hostname=c*,gpu=$0' -q g.q
diff --git a/examples/mobvoihotwords/local/data_prep.py b/examples/mobvoihotwords/local/data_prep.py
@@ -0,0 +1,169 @@
+#!/usr/bin/env python3
+# Copyright (c) Yiming Wang
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import logging
+import os
+import sys
+from concurrent.futures import ProcessPoolExecutor
+from pathlib import Path
+
+
+logging.basicConfig(
+    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    level=os.environ.get("LOGLEVEL", "INFO").upper(),
+    stream=sys.stdout,
+)
+logger = logging.getLogger(__name__)
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(
+        description="data preparation for the MobvoiHotwords corpus"
+    )
+    # fmt: off
+    # fmt: on
+
+    return parser
+
+
+def main(args):
+    try:
+        # TODO use pip install once it's available
+        from espresso.tools.lhotse import CutSet, Mfcc, MfccConfig, LilcomFilesWriter, WavAugmenter
+        from espresso.tools.lhotse.recipes.mobvoihotwords import download_and_untar, prepare_mobvoihotwords
+        from espresso.tools.lhotse.utils import fastcopy
+    except ImportError:
+        raise ImportError("Please install Lhotse by `make lhotse` after entering espresso/tools")
+
+    root_dir = Path("data")
+    corpus_dir = root_dir / "MobvoiHotwords"
+    output_dir = root_dir
+
+    # Download and extract the corpus
+    download_and_untar(root_dir)
+
+    # Prepare manifests
+    mobvoihotwords_manifests = prepare_mobvoihotwords(corpus_dir, output_dir)
+    logger.info(
+        "train/dev/test size: {}/{}/{}".format(
+            len(mobvoihotwords_manifests["train"]["recordings"]),
+            len(mobvoihotwords_manifests["dev"]["recordings"]),
+            len(mobvoihotwords_manifests["test"]["recordings"])
+        )
+    )
+
+    # Data augmentation
+    mfcc_hires_config = fastcopy(
+        MfccConfig(), num_mel_bins=40, num_ceps=40, low_freq=20, high_freq=-400
+    )
+    num_jobs = 1
+    for partition, manifests in mobvoihotwords_manifests.items():
+        cut_set = CutSet.from_manifests(
+            recordings=manifests["recordings"],
+            supervisions=manifests["supervisions"],
+        )
+        with ProcessPoolExecutor(num_jobs) as ex:
+            if "train" in partition:
+                with LilcomFilesWriter(f"{output_dir}/feats_{partition}_orig") as storage:
+                    cut_set_orig = cut_set.compute_and_store_features(
+                        extractor=Mfcc(config=mfcc_hires_config),
+                        storage=storage,
+                        augmenter=None,
+                        executor=ex,
+                    )
+                with  LilcomFilesWriter(f"{output_dir}/feats_{partition}_rev") as storage:
+                    cut_set_rev = cut_set.compute_and_store_features(
+                        extractor=Mfcc(config=mfcc_hires_config),
+                        storage=storage,
+                        augmenter=WavAugmenter(effect_chain=reverb()),
+                        excutor=ex,
+                    )
+                    cut_set_rev = CutSet(
+                        {
+                            ("rev-" + cut.id): cut.with_id("rev-" + cut.id)
+                            for cut in cut_set_rev.cuts
+                        }
+                    )
+                with  LilcomFilesWriter(f"{output_dir}/feats_{partition}_sp1.1") as storage:
+                    cut_set_sp1p1 = cut_set.compute_and_store_features(
+                        extractor=Mfcc(config=mfcc_hires_config),
+                        storage=storage,
+                        augmenter=WavAugmenter(
+                            effect_chain=speed_perturb(times=1.1, sampling_rate=16000)
+                        ),
+                        excutor=ex,
+                    )
+                    cut_set_sp1p1 = CutSet(
+                        {
+                            ("sp1.1-" + cut.id): cut.with_id("sp1.1-" + cut.id)
+                            for cut in cut_set_sp1p1.cuts
+                        }
+                    )
+                with  LilcomFilesWriter(f"{output_dir}/feats_{partition}_sp0.9") as storage:
+                    cut_set_sp0p9 = cut_set.compute_and_store_features(
+                        extractor=Mfcc(config=mfcc_hires_config),
+                        storage=storage,
+                        augmenter=WavAugmenter(
+                            effect_chain=speed_perturb(times=0.9, sampling_rate=16000)
+                        ),
+                        excutor=ex,
+                    )
+                    cut_set_sp0p9 = CutSet(
+                        {
+                            ("sp0.9-" + cut.id): cut.with_id("sp0.9-" + cut.id)
+                            for cut in cut_set_sp0p9.cuts
+                        }
+                    )
+                cut_set = CutSet.from_cuts(
+                    {
+                        **cut_set_orig.cuts, **cut_set_rev.cuts,
+                        **cut_set_sp1p1.cuts, **cut_set_sp0p9.cuts}
+
+                )
+            else:
+                with LilcomFilesWriter(f"{output_dir}/feats_{partition}") as storage:
+                    cut_set = cut_set.compute_and_store_features(
+                        extractor=Mfcc(config=mfcc_hires_config),
+                        storage=storage,
+                        augmenter=None,
+                        executor=ex,
+                    ).pad()
+            mobvoihotwords_manifests[partition]["cuts"] = cut_set
+            cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
+
+
+def reverb(*args, **kwargs):
+    """
+    Returns a reverb effect for wav augmentation.
+    """
+    import augment
+    effect_chain = augment.EffectChain()
+    # Reverb it makes the signal to have two channels,
+    # which we combine into 1 by running `channels` w/o parameters
+    effect_chain.reverb(50, 50, lambda: np.random.randint(1, 30)).channels()
+    return effect_chain
+
+
+def speed_perturb(times: float, sampling_rate: int):
+    """
+    Returns a <times> speed perturbation effect for wav augmentation.
+    :param times: resulting times of speed relative to the original speed
+    :param sampling_rate: a sampling rate value for which the effect will be created (resampling is needed for pitch).
+    """
+    import augment
+    effect_chain = augment.EffectChain()
+    # The pitch effect changes the sampling ratio; we have to compensate for that.
+    # Here, we specify 'quick' options on both pitch and rate effects, to speed up things
+    effect_chain.pitch("-q", lambda: (times - 1.0) * 100).rate("-q", sampling_rate)
+    return effect_chain
+
+
+if __name__ == "__main__":
+    parser = get_parser()
+    args = parser.parse_args()
+    main(args)
diff --git a/examples/mobvoihotwords/path.sh b/examples/mobvoihotwords/path.sh
@@ -0,0 +1,16 @@
+MAIN_ROOT=$PWD/../..
+export KALDI_ROOT=$MAIN_ROOT/espresso/tools/kaldi
+
+# BEGIN from kaldi path.sh
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sctk/bin:$PWD:$PATH
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
+. $KALDI_ROOT/tools/config/common_path.sh
+export LC_ALL=C
+# END
+
+export PATH=~/anaconda3/bin:$PATH
+export PATH=$MAIN_ROOT:$MAIN_ROOT/espresso:$MAIN_ROOT/espresso/tools:$PATH
+export LD_LIBRARY_PATH=$MAIN_ROOT/espresso/tools/openfst/lib:$LD_LIBRARY_PATH
+export PYTHONPATH=$MAIN_ROOT:$MAIN_ROOT/espresso:$MAIN_ROOT/espresso/tools:$MAIN_ROOT/espresso/tools/lhotse:$MAIN_ROOT/espresso/tools/pychain:$PYTHONPATH
+export PYTHONUNBUFFERED=1