Commit 2.2.2 (#571)
* Add better functionality for adding pronunciations of OOVs to dictionaries

* Fix duration parsing with sox
mmcauliffe authored Feb 16, 2023
1 parent 912f216 commit c2ef519
Showing 20 changed files with 496 additions and 72 deletions.
7 changes: 7 additions & 0 deletions docs/source/changelog/changelog_2.2.rst
@@ -5,6 +5,13 @@
2.2 Changelog
*************

2.2.2
=====

- Fixed a rounding issue in parsing sox output for sound file duration
- Added ``--dictionary_path`` option to :ref:`g2p_dictionary_generating` to allow for generating pronunciations for just those words that are missing in a dictionary
- Added ``add_words`` subcommand to :ref:`pretrained_models` to allow for easy adding of words and pronunciations from :ref:`g2p_dictionary_generating` to pronunciation dictionaries

2.2.1
=====

8 changes: 8 additions & 0 deletions docs/source/conf.py
@@ -86,6 +86,10 @@
"MFA acoustic models",
"https://mfa-models.readthedocs.io/en/latest/acoustic/index.html",
),
"pretrained_tokenizer_models": (
"MFA tokenizer models",
"https://mfa-models.readthedocs.io/en/latest/tokenizer/index.html",
),
"pretrained_dictionaries": (
"MFA dictionaries",
"https://mfa-models.readthedocs.io/en/latest/dictionary/index.html",
@@ -94,6 +98,10 @@
"MFA G2P models",
"https://mfa-models.readthedocs.io/en/latest/g2p/index.html",
),
"pretrained_ivector_extractor": (
"MFA G2P models",
"https://mfa-models.readthedocs.io/en/latest/ivector/index.html",
),
"pretrained_language_models": (
"MFA language models",
"https://mfa-models.readthedocs.io/en/latest/language_model/index.html",
1 change: 1 addition & 0 deletions docs/source/external_links.py
@@ -33,6 +33,7 @@
"lm": "language model",
"dictionary": "dictionary",
"ivector": "ivector extractor",
"tokenizer": "tokenizer model",
}


72 changes: 59 additions & 13 deletions docs/source/first_steps/index.rst
@@ -15,26 +15,31 @@ Use cases

There are several broad use cases that you might want to use MFA for. Take a look below and if any are close matches, you should be able to apply the linked instructions to your data.

#. **Use case 1:** You have a :ref:`speech corpus <corpus_structure>`, your language is in the list of :xref:`pretrained_acoustic_models` and the list of :xref:`pretrained_dictionaries`.
#. **Use case 1:** You have a :ref:`speech corpus <corpus_structure>`, the language has a :xref:`pretrained acoustic model <pretrained_acoustic_models>` and :xref:`pretrained dictionary <pretrained_dictionaries>`.

#. Follow :ref:`first_steps_align_pretrained` to generate aligned TextGrids
#. Follow :ref:`first_steps_align_pretrained` to generate aligned TextGrids

#. **Use case 2:** You have a :ref:`speech corpus <corpus_structure>`, the language involved is in the list of :xref:`pretrained_acoustic_models` and the list of :xref:`pretrained_g2p`, but not on the list of :xref:`pretrained_dictionaries`.
#. **Use case 2:** You have a :ref:`speech corpus <corpus_structure>`, the language has a :xref:`pretrained acoustic model <pretrained_acoustic_models>` and :xref:`pretrained dictionary <pretrained_dictionaries>`, but the dictionary's coverage of your corpus is not great and the language has a :xref:`pretrained G2P model <pretrained_g2p>`.

#. Follow :ref:`first_steps_g2p_pretrained` to generate a dictionary
#. Use the generated dictionary in :ref:`first_steps_align_pretrained` to generate aligned TextGrids
#. Follow :ref:`first_steps_g2p_oovs` to generate pronunciations for OOV words in the corpus
#. Use the generated dictionary in :ref:`first_steps_align_pretrained` to generate aligned TextGrids

#. **Use case 3:** You have a :ref:`speech corpus <corpus_structure>`, a :ref:`pronunciation dictionary <dictionary_format>`, but there is no :xref:`pretrained_acoustic_models` for the language (or none that have the same phones as the pronunciation dictionary)
#. **Use case 3:** You have a :ref:`speech corpus <corpus_structure>`, the language has a :xref:`pretrained acoustic model <pretrained_acoustic_models>` and :xref:`pretrained G2P model <pretrained_g2p>`, but it doesn't have a :xref:`pretrained dictionary <pretrained_dictionaries>`.

#. Follow :ref:`first_steps_align_train_acoustic_model` to generate aligned TextGrids
#. Follow :ref:`first_steps_g2p_pretrained` to generate a dictionary
#. Use the generated dictionary in :ref:`first_steps_align_pretrained` to generate aligned TextGrids

#. **Use case 4:** You have a :ref:`speech corpus <corpus_structure>`, a :ref:`pronunciation dictionary <dictionary_format>`, but it does not have great coverage of the words in the corpus.
#. **Use case 4:** You have a :ref:`speech corpus <corpus_structure>` and your own :ref:`pronunciation dictionary <dictionary_format>`, but there is no :xref:`pretrained acoustic model <pretrained_acoustic_models>` for the language (or none that have the same phones as the pronunciation dictionary).

#. Follow :ref:`first_steps_train_g2p` to train a G2P model
#. Use the trained G2P model in :ref:`first_steps_g2p_pretrained` to generate a pronunciation dictionary
#. Use the generated pronunciation dictionary in :ref:`first_steps_align_train_acoustic_model` to generate aligned TextGrids
#. Follow :ref:`first_steps_align_train_acoustic_model` to generate aligned TextGrids

#. **Use case 5:** You have a :ref:`speech corpus <corpus_structure>` and the language involved is in the list of :xref:`pretrained_acoustic_models`, but the language does not mark word boundaries in its orthography.
#. **Use case 5:** You have a :ref:`speech corpus <corpus_structure>` and your own :ref:`pronunciation dictionary <dictionary_format>`, but it does not have great coverage of the words in the corpus.

#. Follow :ref:`first_steps_train_g2p` to train a G2P model
#. Use the trained G2P model in :ref:`first_steps_g2p_pretrained` to generate a pronunciation dictionary
#. Use the generated pronunciation dictionary in :ref:`first_steps_align_train_acoustic_model` to generate aligned TextGrids

#. **Use case 6:** You have a :ref:`speech corpus <corpus_structure>` and the language has a :xref:`pretrained acoustic model <pretrained_acoustic_models>`, but the language does not mark word boundaries in its orthography (and the language has a :xref:`pretrained tokenizer model <pretrained_tokenizer_models>`).

#. Follow :ref:`first_steps_tokenize` to tokenize the corpus
#. Use the tokenized transcripts and follow :ref:`first_steps_align_pretrained`
@@ -98,7 +103,7 @@ Depending on your use case, you might have a list of words to run G2P over, or j
mfa g2p ~/mfa_data/my_corpus english_us_arpa ~/mfa_data/new_dictionary.txt # If using a corpus
mfa g2p ~/mfa_data/my_word_list.txt english_us_arpa ~/mfa_data/new_dictionary.txt # If using a word list
Running one of the above will output a text file pronunciation dictionary in the format that MFA uses (:ref:`dictionary_format`). I recommend looking over the pronunciations generated and make sure that they look sensible. For languages where the orthography is not transparent, it may be helpful to include :code:`--num_pronunciations 3` so that more pronunciations are generated than just the most likely one. For more details on running G2P, see :ref:`g2p_dictionary_generating`.
Running one of the above will output a text file pronunciation dictionary in the :ref:`MFA dictionary format <dictionary_format>`. I recommend looking over the generated pronunciations to make sure that they look sensible. For languages where the orthography is not transparent, it may be helpful to include :code:`--num_pronunciations 3` so that more pronunciations are generated than just the most likely one. For more details on running G2P, see :ref:`g2p_dictionary_generating`.
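
As an illustration of that flag (the paths are the same placeholders used above), asking for three candidate pronunciations per word looks like:

.. code-block::

   mfa g2p ~/mfa_data/my_corpus english_us_arpa ~/mfa_data/new_dictionary.txt --num_pronunciations 3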

From here you can use this dictionary file as input to any MFA command that uses dictionaries, i.e.

@@ -111,6 +116,47 @@ From here you can use this dictionary file as input to any MFA command that uses

Please see :ref:`dict_generating_example` for an example using toy data.


.. _first_steps_g2p_oovs:

Generating pronunciations for OOV items in a corpus
---------------------------------------------------

For the purposes of this example, we'll use the "english_us_arpa" model, but the instructions will be applicable to any pretrained G2P model. We'll also assume that you have done nothing else with MFA other than follow the :ref:`installation` instructions and you have the :code:`mfa` command working. Finally, we'll assume that your corpus is stored in the folder :code:`~/mfa_data/my_corpus`, so when working with your data, this will be the main thing to update.

First we'll need the pretrained G2P model. These are installed via the :code:`mfa model download` command:

.. code-block::

   mfa model download g2p english_us_arpa

You should be able to run :code:`mfa model inspect g2p english_us_arpa` and it will output information about the :code:`english_us_arpa` G2P model.

Depending on your use case, you might have a list of words to run G2P over, or just a corpus of sound and transcription files. The :code:`mfa g2p` command can process either:

.. code-block::

   mfa g2p ~/mfa_data/my_corpus english_us_arpa ~/mfa_data/g2pped_oovs.txt --dictionary_path english_us_arpa

Running the above will output a text file in the format that MFA uses (:ref:`dictionary_format`) with all the OOV words (ignoring bracketed words like :ipa_inline:`<cutoff>`). I recommend looking over the generated pronunciations to make sure that they look sensible. For languages where the orthography is not transparent, it may be helpful to include :code:`--num_pronunciations 3` so that more pronunciations are generated than just the most likely one. For more details on running G2P, see :ref:`g2p_dictionary_generating`.
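
As a rough sketch of the output (the words and ARPA pronunciations below are invented for illustration, not actual G2P output), each line pairs an OOV word with a generated pronunciation:

.. code-block::

   kombucha  K AH0 M B UW1 CH AH0
   zoomies  Z UW1 M IY0 Z
   doomscrolling  D UW1 M S K R OW2 L IH0 NG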

Once you have looked over the dictionary, you can save the new pronunciations via:

.. code-block::

   mfa model add_words english_us_arpa ~/mfa_data/g2pped_oovs.txt

The new pronunciations will be available when you use :code:`english_us_arpa` as the dictionary path in an MFA command, i.e. the modified command from :ref:`first_steps_align_pretrained`:

.. code-block::

   mfa align ~/mfa_data/my_corpus english_us_arpa english_us_arpa ~/mfa_data/my_corpus_aligned

.. warning::

   Please do look over the G2P results before adding them to the dictionary, at the very least to spot check. Especially for languages with non-transparent orthographies, words with unseen graphemes, homographs, etc., G2P can generate phonotactically illegal forms, so I do not recommend piping G2P output to alignment without human spot checking.

.. _first_steps_align_train_acoustic_model:

Training a new acoustic model on a corpus
36 changes: 30 additions & 6 deletions montreal_forced_aligner/command_line/g2p.py
@@ -10,10 +10,15 @@
check_databases,
cleanup_databases,
common_options,
validate_dictionary,
validate_g2p_model,
)
from montreal_forced_aligner.config import GLOBAL_CONFIG, MFA_PROFILE_VARIABLE
from montreal_forced_aligner.g2p.generator import PyniniCorpusGenerator, PyniniWordListGenerator
from montreal_forced_aligner.g2p.generator import (
PyniniCorpusGenerator,
PyniniDictionaryCorpusGenerator,
PyniniWordListGenerator,
)

__all__ = ["g2p_cli"]

@@ -38,6 +43,12 @@
help="Path to config file to use for G2P.",
type=click.Path(exists=True, file_okay=True, dir_okay=False, path_type=Path),
)
@click.option(
"--dictionary_path",
help="Path to existing pronunciation dictionary to use to find OOVs.",
type=click.UNPROCESSED,
callback=validate_dictionary,
)
@click.option(
"--include_bracketed",
is_flag=True,
@@ -61,13 +72,26 @@ def g2p_cli(context, **kwargs) -> None:
input_path = kwargs["input_path"]
g2p_model_path = kwargs["g2p_model_path"]
output_path = kwargs["output_path"]
dictionary_path = kwargs.get("dictionary_path", None)

if os.path.isdir(input_path):
g2p = PyniniCorpusGenerator(
corpus_directory=input_path,
g2p_model_path=g2p_model_path,
**PyniniCorpusGenerator.parse_parameters(config_path, context.params, context.args),
)
if dictionary_path is not None:
g2p = PyniniDictionaryCorpusGenerator(
corpus_directory=input_path,
dictionary_path=dictionary_path,
g2p_model_path=g2p_model_path,
**PyniniDictionaryCorpusGenerator.parse_parameters(
config_path, context.params, context.args
),
)
else:
g2p = PyniniCorpusGenerator(
corpus_directory=input_path,
g2p_model_path=g2p_model_path,
**PyniniCorpusGenerator.parse_parameters(
config_path, context.params, context.args
),
)
else:
g2p = PyniniWordListGenerator(
word_list_path=input_path,
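
For reference, the three invocation patterns this dispatch now covers (using the placeholder paths and the english_us_arpa model from the docs above):

    mfa g2p ~/mfa_data/my_word_list.txt english_us_arpa ~/mfa_data/new_dictionary.txt                            # word list input
    mfa g2p ~/mfa_data/my_corpus english_us_arpa ~/mfa_data/new_dictionary.txt                                   # corpus input, all words
    mfa g2p ~/mfa_data/my_corpus english_us_arpa ~/mfa_data/g2pped_oovs.txt --dictionary_path english_us_arpa    # corpus input, OOVs only
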
55 changes: 54 additions & 1 deletion montreal_forced_aligner/command_line/model.py
@@ -9,13 +9,21 @@

import click

from montreal_forced_aligner.config import GLOBAL_CONFIG
from montreal_forced_aligner.command_line.utils import (
check_databases,
cleanup_databases,
common_options,
validate_dictionary,
)
from montreal_forced_aligner.config import GLOBAL_CONFIG, MFA_PROFILE_VARIABLE
from montreal_forced_aligner.data import PhoneSetType
from montreal_forced_aligner.dictionary.multispeaker import MultispeakerDictionary
from montreal_forced_aligner.exceptions import (
ModelLoadError,
ModelSaveError,
ModelTypeNotSupportedError,
MultipleModelTypesFoundError,
PhoneMismatchError,
PretrainedModelNotFoundError,
)
from montreal_forced_aligner.models import MODEL_TYPES, Archive, ModelManager, guess_model_type
@@ -26,6 +34,7 @@
"download_model_cli",
"list_model_cli",
"inspect_model_cli",
"add_words_cli",
]


@@ -138,6 +147,50 @@ def inspect_model_cli(model_type: str, model: str) -> None:
m.pretty_print()


@model_cli.command(name="add_words", short_help="Add words to a dictionary")
@click.argument("dictionary_path", type=click.UNPROCESSED, callback=validate_dictionary)
@click.argument("new_pronunciations_path", type=click.UNPROCESSED, callback=validate_dictionary)
@click.help_option("-h", "--help")
@common_options
@click.pass_context
def add_words_cli(context, **kwargs) -> None:
"""
Add words from one pronunciation dictionary to another pronunciation dictionary,
so long as the new pronunciations do not contain any new phones
"""
if kwargs.get("profile", None) is not None:
os.putenv(MFA_PROFILE_VARIABLE, kwargs["profile"])
GLOBAL_CONFIG.current_profile.update(kwargs)
GLOBAL_CONFIG.save()
check_databases()

dictionary_path = kwargs.get("dictionary_path", None)
new_pronunciations_path = kwargs.get("new_pronunciations_path", None)
base_dictionary = MultispeakerDictionary(dictionary_path=dictionary_path)
base_dictionary.dictionary_setup()
new_pronunciations = MultispeakerDictionary(dictionary_path=new_pronunciations_path)
new_pronunciations.dictionary_setup()
new_phones = set()
for phone in new_pronunciations.non_silence_phones:
if phone not in base_dictionary.non_silence_phones:
new_phones.add(phone)
if new_phones:
raise PhoneMismatchError(new_phones)

try:
new_words = new_pronunciations.words_for_export(probability=True)
base_dictionary.add_words(new_words)
base_dictionary.export_lexicon(
base_dictionary._default_dictionary_id,
base_dictionary.dictionary_model.path,
probability=True,
)
except Exception:
raise
finally:
cleanup_databases()


@model_cli.command(name="save", short_help="Save a model")
@click.argument("model_type", type=click.Choice(sorted(MODEL_TYPES)))
@click.argument(
4 changes: 1 addition & 3 deletions montreal_forced_aligner/command_line/utils.py
@@ -5,7 +5,6 @@
import os
import shutil
import subprocess
import time
import typing
from pathlib import Path

@@ -262,7 +261,7 @@ def check_databases(db_name=None) -> None:
isolation_level="AUTOCOMMIT",
).execution_options(logging_token="check_databases_engine")
with engine.connect():
time.sleep(1)
pass
return
except sqlalchemy.exc.OperationalError:
if not os.listdir(db_directory):
@@ -343,7 +342,6 @@ def cleanup_databases() -> None:

def remove_databases() -> None:
"""Remove database"""
time.sleep(1)
GLOBAL_CONFIG.load()

db_directory = os.path.join(
6 changes: 5 additions & 1 deletion montreal_forced_aligner/corpus/base.py
@@ -696,13 +696,17 @@ def normalize_text(self) -> None:
continue
if (w, dict_id) not in word_insert_mappings:
max_mapping_ids[dict_id] += 1
word_type = WordType.oov
if hasattr(self, "brackets"):
if any(w.startswith(b) for b, _ in self.brackets):
word_type = WordType.bracketed
word_insert_mappings[(w, dict_id)] = {
"id": word_key,
"mapping_id": max_mapping_ids[d_id],
"word": w,
"count": 0,
"dictionary_id": dict_id,
"word_type": WordType.oov,
"word_type": word_type,
}
pronunciation_insert_mappings.append(
{
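
A small standalone sketch of what this change does, for readers skimming the diff. The bracket pairs below are an assumption (the real code reads the pairs from `self.brackets` when that attribute exists); the point is that bracketed markers such as <cutoff> are now tagged as bracketed words rather than ordinary OOVs:

    # Sketch only: classify a normalized word the way normalize_text now does.
    BRACKETS = [("[", "]"), ("{", "}"), ("<", ">"), ("(", ")")]  # assumed defaults

    def classify_unknown_word(word: str) -> str:
        if any(word.startswith(start) for start, _ in BRACKETS):
            return "bracketed"  # e.g. "<cutoff>" is no longer counted as a plain OOV
        return "oov"

    print(classify_unknown_word("<cutoff>"))   # bracketed
    print(classify_unknown_word("kombucha"))   # oov
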
13 changes: 7 additions & 6 deletions montreal_forced_aligner/corpus/helper.py
@@ -1,7 +1,7 @@
"""Helper functions for corpus parsing and loading"""
from __future__ import annotations

import datetime
import re
import subprocess
import typing

@@ -127,11 +127,12 @@ def get_wav_info(
elif line.startswith("Sample Rate"):
sample_rate = int(line.split(":")[-1].strip())
elif line.startswith("Duration"):
duration_string = line.split(":", maxsplit=1)[-1].split("=")[0].strip()
duration = (
datetime.datetime.strptime(duration_string, "%H:%M:%S.%f")
- datetime.datetime(1900, 1, 1)
).total_seconds()
m = re.search(r"= (?P<num_samples>\d+) samples", line)
if m:
num_samples = int(m.group("num_samples"))
duration = round(num_samples / sample_rate, 6)
else:
raise SoundFileError(file_path, "Could not parse number of samples")
break
sample_rate_string = ""
if enforce_sample_rate is not None:
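
A minimal sketch of the new duration calculation, using a made-up soxi-style "Duration" line (the exact formatting of sox's output is an assumption here; the real code iterates over sox's info output and has already parsed the sample rate):

    import re

    # Hypothetical sox info line for a 16 kHz file.
    line = "Duration       : 00:00:02.12 = 33958 samples ~ 159.18 CDDA sectors"
    sample_rate = 16000

    # Old approach: parse the HH:MM:SS.ss timestamp, which is only centisecond-precise.
    # New approach (this commit): derive the duration from the exact sample count.
    m = re.search(r"= (?P<num_samples>\d+) samples", line)
    if m:
        duration = round(int(m.group("num_samples")) / sample_rate, 6)
        print(duration)  # 2.122375, instead of the coarser 2.12 from the timestamp
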
26 changes: 24 additions & 2 deletions montreal_forced_aligner/dictionary/mixins.py
@@ -11,8 +11,8 @@
from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple

from montreal_forced_aligner.abc import DatabaseMixin
from montreal_forced_aligner.data import PhoneSetType, PhoneType
from montreal_forced_aligner.db import Phone
from montreal_forced_aligner.data import PhoneSetType, PhoneType, WordType
from montreal_forced_aligner.db import Phone, Word
from montreal_forced_aligner.helper import mfa_open

if TYPE_CHECKING:
@@ -838,6 +838,28 @@ def __init__(self, **kwargs):
self._disambiguation_symbols_int_path = None
self._phones_dir = None
self._lexicon_fst_paths = {}
self._num_words = None
self._num_speech_words = None

@property
def num_words(self) -> int:
"""Number of words (including OOVs and special symbols) in the dictionary"""
if self._num_words is None:
with self.session() as session:
self._num_words = session.query(Word).count()
return self._num_words

@property
def num_speech_words(self) -> int:
"""Number of speech words in the dictionary"""
if self._num_speech_words is None:
with self.session() as session:
self._num_speech_words = (
session.query(Word)
.filter(Word.word_type.in_([WordType.speech, WordType.clitic]))
.count()
)
return self._num_speech_words

@property
def word_boundary_int_path(self) -> Path: