diff --git a/docs/source/changelog/changelog_2.2.rst b/docs/source/changelog/changelog_2.2.rst
index 7a9e92f8..c1e9e7b3 100644
--- a/docs/source/changelog/changelog_2.2.rst
+++ b/docs/source/changelog/changelog_2.2.rst
@@ -5,6 +5,13 @@
 2.2 Changelog
 *************
 
+2.2.2
+=====
+
+- Fixed a rounding issue in parsing sox output for sound file duration
+- Added ``--dictionary_path`` option to :ref:`g2p_dictionary_generating` to allow for generating pronunciations for just those words that are missing in a dictionary
+- Added ``add_words`` subcommand to :ref:`pretrained_models` to allow for easy adding of words and pronunciations from :ref:`g2p_dictionary_generating` to pronunciation dictionaries
+
 2.2.1
 =====
 
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 7285f2d7..86827296 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -86,6 +86,10 @@
         "MFA acoustic models",
         "https://mfa-models.readthedocs.io/en/latest/acoustic/index.html",
     ),
+    "pretrained_tokenizer_models": (
+        "MFA tokenizer models",
+        "https://mfa-models.readthedocs.io/en/latest/tokenizer/index.html",
+    ),
     "pretrained_dictionaries": (
         "MFA dictionaries",
         "https://mfa-models.readthedocs.io/en/latest/dictionary/index.html",
@@ -94,6 +98,10 @@
         "MFA G2P models",
         "https://mfa-models.readthedocs.io/en/latest/g2p/index.html",
     ),
+    "pretrained_ivector_extractor": (
+        "MFA ivector extractors",
+        "https://mfa-models.readthedocs.io/en/latest/ivector/index.html",
+    ),
     "pretrained_language_models": (
         "MFA language models",
         "https://mfa-models.readthedocs.io/en/latest/language_model/index.html",
diff --git a/docs/source/external_links.py b/docs/source/external_links.py
index ff48f1ee..7235634b 100644
--- a/docs/source/external_links.py
+++ b/docs/source/external_links.py
@@ -33,6 +33,7 @@
     "lm": "language model",
     "dictionary": "dictionary",
     "ivector": "ivector extractor",
+    "tokenizer": "tokenizer model",
 }
 
 
diff --git a/docs/source/first_steps/index.rst b/docs/source/first_steps/index.rst
index 3bac7a0c..527cfdf5 100644
--- a/docs/source/first_steps/index.rst
+++ b/docs/source/first_steps/index.rst
@@ -15,26 +15,31 @@ Use cases
 
 There are several broad use cases that you might want to use MFA for. Take a look below and if any are close matches, you should be able to apply the linked instructions to your data.
 
-#. **Use case 1:** You have a :ref:`speech corpus <corpus_structure>`, your language is in the list of :xref:`pretrained_acoustic_models` and the list of :xref:`pretrained_dictionaries`.
+#. **Use case 1:** You have a :ref:`speech corpus <corpus_structure>`, and the language has a :xref:`pretrained acoustic model <pretrained_acoustic_models>` and a :xref:`pretrained dictionary <pretrained_dictionaries>`.
 
-   #. Follow :ref:`first_steps_align_pretrained` to generate aligned TextGrids
+   #. Follow :ref:`first_steps_align_pretrained` to generate aligned TextGrids
 
-#. **Use case 2:** You have a :ref:`speech corpus <corpus_structure>`, the language involved is in the list of :xref:`pretrained_acoustic_models` and the list of :xref:`pretrained_g2p`, but not on the list of :xref:`pretrained_dictionaries`.
+#. **Use case 2:** You have a :ref:`speech corpus <corpus_structure>`, and the language has a :xref:`pretrained acoustic model <pretrained_acoustic_models>` and a :xref:`pretrained dictionary <pretrained_dictionaries>`, but the dictionary's coverage of your corpus is not great; the language also has a :xref:`pretrained G2P model <pretrained_g2p>`.
 
-   #. Follow :ref:`first_steps_g2p_pretrained` to generate a dictionary
-   #. Use the generated dictionary in :ref:`first_steps_align_pretrained` to generate aligned TextGrids
+   #. Follow :ref:`first_steps_g2p_oovs` to generate pronunciations for OOV words in the corpus
+   #. Use the generated dictionary in :ref:`first_steps_align_pretrained` to generate aligned TextGrids
 
-#. **Use case 3:** You have a :ref:`speech corpus <corpus_structure>`, a :ref:`pronunciation dictionary <dictionary_format>`, but there is no :xref:`pretrained_acoustic_models` for the language (or none that have the same phones as the pronunciation dictionary)
+#. **Use case 3:** You have a :ref:`speech corpus <corpus_structure>`, and the language has a :xref:`pretrained acoustic model <pretrained_acoustic_models>` and a :xref:`pretrained G2P model <pretrained_g2p>`, but it doesn't have a :xref:`pretrained dictionary <pretrained_dictionaries>`.
 
-   #. Follow :ref:`first_steps_align_train_acoustic_model` to generate aligned TextGrids
+   #. Follow :ref:`first_steps_g2p_pretrained` to generate a dictionary
+   #. Use the generated dictionary in :ref:`first_steps_align_pretrained` to generate aligned TextGrids
 
-#. **Use case 4:** You have a :ref:`speech corpus <corpus_structure>`, a :ref:`pronunciation dictionary <dictionary_format>`, but it does not have great coverage of the words in the corpus.
+#. **Use case 4:** You have a :ref:`speech corpus <corpus_structure>` and your own :ref:`pronunciation dictionary <dictionary_format>`, but there is no :xref:`pretrained acoustic model <pretrained_acoustic_models>` for the language (or none that have the same phones as the pronunciation dictionary).
 
-   #. Follow :ref:`first_steps_train_g2p` to train a G2P model
-   #. Use the trained G2P model in :ref:`first_steps_g2p_pretrained` to generate a pronunciation dictionary
-   #. Use the generated pronunciation dictionary in :ref:`first_steps_align_train_acoustic_model` to generate aligned TextGrids
+   #. Follow :ref:`first_steps_align_train_acoustic_model` to generate aligned TextGrids
 
-#. **Use case 5:** You have a :ref:`speech corpus <corpus_structure>` and the language involved is in the list of :xref:`pretrained_acoustic_models`, but the language does not mark word boundaries in its orthography.
+#. **Use case 5:** You have a :ref:`speech corpus <corpus_structure>` and your own :ref:`pronunciation dictionary <dictionary_format>`, but it does not have great coverage of the words in the corpus.
+
+   #. Follow :ref:`first_steps_train_g2p` to train a G2P model
+   #. Use the trained G2P model in :ref:`first_steps_g2p_pretrained` to generate a pronunciation dictionary
+   #. Use the generated pronunciation dictionary in :ref:`first_steps_align_train_acoustic_model` to generate aligned TextGrids
+
+#. **Use case 6:** You have a :ref:`speech corpus <corpus_structure>` and the language has a :xref:`pretrained acoustic model <pretrained_acoustic_models>`, but the language does not mark word boundaries in its orthography (and a :xref:`pretrained tokenizer model <pretrained_tokenizer_models>` is available).
 
    #. Follow :ref:`first_steps_tokenize` to tokenize the corpus
    #. Use the tokenized transcripts and follow :ref:`first_steps_align_pretrained`
@@ -98,7 +103,7 @@ Depending on your use case, you might have a list of words to run G2P over, or j
 
    mfa g2p ~/mfa_data/my_corpus english_us_arpa ~/mfa_data/new_dictionary.txt # If using a corpus
    mfa g2p ~/mfa_data/my_word_list.txt english_us_arpa ~/mfa_data/new_dictionary.txt # If using a word list
 
-Running one of the above will output a text file pronunciation dictionary in the format that MFA uses (:ref:`dictionary_format`). I recommend looking over the pronunciations generated and make sure that they look sensible. For languages where the orthography is not transparent, it may be helpful to include :code:`--num_pronunciations 3` so that more pronunciations are generated than just the most likely one. For more details on running G2P, see :ref:`g2p_dictionary_generating`.
+Running one of the above will output a text file pronunciation dictionary in the :ref:`MFA dictionary format <dictionary_format>`. I recommend looking over the generated pronunciations to make sure that they look sensible. For languages where the orthography is not transparent, it may be helpful to include :code:`--num_pronunciations 3` so that more pronunciations are generated than just the most likely one. For more details on running G2P, see :ref:`g2p_dictionary_generating`.
 
 From here you can use this dictionary file as input to any MFA command that uses dictionaries, i.e.
 
@@ -111,6 +116,47 @@ From here you can use this dictionary file as input to any MFA command that uses
 
 Please see :ref:`dict_generating_example` for an example using toy data.
 
+
+.. _first_steps_g2p_oovs:
+
+Generating pronunciations for OOV items in a corpus
+---------------------------------------------------
+
+For the purposes of this example, we'll use the "english_us_arpa" model, but the instructions will be applicable to any pretrained G2P model. We'll also assume that you have done nothing else with MFA other than follow the :ref:`installation` instructions and that you have the :code:`mfa` command working. Finally, we'll assume that your corpus is stored in the folder :code:`~/mfa_data/my_corpus`, so when working with your data, this will be the main thing to update.
+
+First we'll need the pretrained G2P model. These are installed via the :code:`mfa model download` command:
+
+.. code-block::
+
+   mfa model download g2p english_us_arpa
+
+You should be able to run :code:`mfa model inspect g2p english_us_arpa` and it will output information about the :code:`english_us_arpa` G2P model.
+
+To generate pronunciations just for the out-of-vocabulary (OOV) words in your corpus, point :code:`mfa g2p` at the corpus and pass an existing dictionary via :code:`--dictionary_path`:
+
+.. code-block::
+
+   mfa g2p ~/mfa_data/my_corpus english_us_arpa ~/mfa_data/g2pped_oovs.txt --dictionary_path english_us_arpa
+
+Running the above will output a text file in the :ref:`MFA dictionary format <dictionary_format>` containing all the OOV words (ignoring bracketed words). I recommend looking over the generated pronunciations to make sure that they look sensible. For languages where the orthography is not transparent, it may be helpful to include :code:`--num_pronunciations 3` so that more pronunciations are generated than just the most likely one. For more details on running G2P, see :ref:`g2p_dictionary_generating`.
+
+Once you have looked over the dictionary, you can save the new pronunciations via:
+
+.. code-block::
+
+   mfa model add_words english_us_arpa ~/mfa_data/g2pped_oovs.txt
+
+The new pronunciations will be available when you use :code:`english_us_arpa` as the dictionary path in an MFA command, i.e. the modified command from :ref:`first_steps_align_pretrained`:
+
+.. code-block::
+
+   mfa align ~/mfa_data/my_corpus english_us_arpa english_us_arpa ~/mfa_data/my_corpus_aligned
+
+
+.. warning::
+
+   Please do look over the G2P results before adding them to the dictionary, at the very least to spot-check them. Especially for non-transparent orthographies, words with unseen graphemes, homographs, etc., G2P can generate phonotactically illegal forms, so I do not recommend piping G2P output to alignment without human spot checking.
+
 .. _first_steps_align_train_acoustic_model:
 
 Training a new acoustic model on a corpus
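The warning added above asks for human review before ``mfa model add_words`` is run. One way to make that review systematic is to check the generated pronunciations against the phone inventory of the base dictionary, mirroring the ``PhoneMismatchError`` check introduced later in this diff. Below is a minimal, hypothetical Python sketch of such a check; the file paths and the assumption that each entry is a whitespace-separated word, optional numeric probability columns, and phones are illustrative, not part of MFA's API:

.. code-block:: python

   # Spot-check G2P output before running `mfa model add_words`: flag any
   # pronunciation that uses a phone not present in a reference dictionary.
   # Paths are illustrative; the column layout is an assumption about the
   # plain-text dictionary format rather than a guarantee.
   from pathlib import Path


   def parse_dict(path):
       """Yield (word, phones) pairs, skipping any numeric probability columns."""
       for line in Path(path).expanduser().read_text(encoding="utf8").splitlines():
           fields = line.split()
           if not fields:
               continue
           word, rest = fields[0], fields[1:]
           while rest:
               try:
                   float(rest[0])  # probability/silence columns, if present
                   rest = rest[1:]
               except ValueError:
                   break
           yield word, rest


   reference_phones = {
       p for _, phones in parse_dict("~/mfa_data/reference_dictionary.txt") for p in phones
   }
   for word, phones in parse_dict("~/mfa_data/g2pped_oovs.txt"):
       unknown = set(phones) - reference_phones
       if unknown:
           print(f"{word}: check phones {sorted(unknown)}")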
diff --git a/montreal_forced_aligner/command_line/g2p.py b/montreal_forced_aligner/command_line/g2p.py
index 19855af2..985385bb 100644
--- a/montreal_forced_aligner/command_line/g2p.py
+++ b/montreal_forced_aligner/command_line/g2p.py
@@ -10,10 +10,15 @@
     check_databases,
     cleanup_databases,
     common_options,
+    validate_dictionary,
     validate_g2p_model,
 )
 from montreal_forced_aligner.config import GLOBAL_CONFIG, MFA_PROFILE_VARIABLE
-from montreal_forced_aligner.g2p.generator import PyniniCorpusGenerator, PyniniWordListGenerator
+from montreal_forced_aligner.g2p.generator import (
+    PyniniCorpusGenerator,
+    PyniniDictionaryCorpusGenerator,
+    PyniniWordListGenerator,
+)
 
 __all__ = ["g2p_cli"]
 
@@ -38,6 +43,12 @@
     help="Path to config file to use for G2P.",
     type=click.Path(exists=True, file_okay=True, dir_okay=False, path_type=Path),
 )
+@click.option(
+    "--dictionary_path",
+    help="Path to existing pronunciation dictionary to use to find OOVs.",
+    type=click.UNPROCESSED,
+    callback=validate_dictionary,
+)
 @click.option(
     "--include_bracketed",
     is_flag=True,
@@ -61,13 +72,26 @@ def g2p_cli(context, **kwargs) -> None:
     input_path = kwargs["input_path"]
     g2p_model_path = kwargs["g2p_model_path"]
     output_path = kwargs["output_path"]
+    dictionary_path = kwargs.get("dictionary_path", None)
 
     if os.path.isdir(input_path):
-        g2p = PyniniCorpusGenerator(
-            corpus_directory=input_path,
-            g2p_model_path=g2p_model_path,
-            **PyniniCorpusGenerator.parse_parameters(config_path, context.params, context.args),
-        )
+        if dictionary_path is not None:
+            g2p = PyniniDictionaryCorpusGenerator(
+                corpus_directory=input_path,
+                dictionary_path=dictionary_path,
+                g2p_model_path=g2p_model_path,
+                **PyniniDictionaryCorpusGenerator.parse_parameters(
+                    config_path, context.params, context.args
+                ),
+            )
+        else:
+            g2p = PyniniCorpusGenerator(
+                corpus_directory=input_path,
+                g2p_model_path=g2p_model_path,
+                **PyniniCorpusGenerator.parse_parameters(
+                    config_path, context.params, context.args
+                ),
+            )
     else:
         g2p = PyniniWordListGenerator(
             word_list_path=input_path,
diff --git a/montreal_forced_aligner/command_line/model.py b/montreal_forced_aligner/command_line/model.py
index 5d5686cd..df6f5ac6 100644
--- a/montreal_forced_aligner/command_line/model.py
+++ b/montreal_forced_aligner/command_line/model.py
@@ -9,13 +9,21 @@
 
 import click
 
-from montreal_forced_aligner.config import GLOBAL_CONFIG
+from montreal_forced_aligner.command_line.utils import (
+    check_databases,
+    cleanup_databases,
+    common_options,
+    validate_dictionary,
+)
+from montreal_forced_aligner.config import GLOBAL_CONFIG, MFA_PROFILE_VARIABLE
 from montreal_forced_aligner.data import PhoneSetType
+from montreal_forced_aligner.dictionary.multispeaker import MultispeakerDictionary
 from montreal_forced_aligner.exceptions import (
     ModelLoadError,
     ModelSaveError,
     ModelTypeNotSupportedError,
     MultipleModelTypesFoundError,
+    PhoneMismatchError,
     PretrainedModelNotFoundError,
 )
 from montreal_forced_aligner.models import MODEL_TYPES, Archive, ModelManager, guess_model_type
@@ -26,6 +34,7 @@
     "download_model_cli",
     "list_model_cli",
     "inspect_model_cli",
+    "add_words_cli",
 ]
 
 
@@ -138,6 +147,48 @@ def inspect_model_cli(model_type: str, model: str) -> None:
     m.pretty_print()
 
 
+@model_cli.command(name="add_words", short_help="Add words to a dictionary")
+@click.argument("dictionary_path", type=click.UNPROCESSED, callback=validate_dictionary)
+@click.argument("new_pronunciations_path", type=click.UNPROCESSED, callback=validate_dictionary)
+@click.help_option("-h", "--help")
+@common_options
+@click.pass_context
+def add_words_cli(context, **kwargs) -> None:
+    """
+    Add words from one pronunciation dictionary to another pronunciation dictionary,
+    so long as the new pronunciations do not contain any new phones
+    """
+    if kwargs.get("profile", None) is not None:
+        os.putenv(MFA_PROFILE_VARIABLE, kwargs["profile"])
+    GLOBAL_CONFIG.current_profile.update(kwargs)
+    GLOBAL_CONFIG.save()
+    check_databases()
+
+    dictionary_path = kwargs.get("dictionary_path", None)
+    new_pronunciations_path = kwargs.get("new_pronunciations_path", None)
+    base_dictionary = MultispeakerDictionary(dictionary_path=dictionary_path)
+    base_dictionary.dictionary_setup()
+    new_pronunciations = MultispeakerDictionary(dictionary_path=new_pronunciations_path)
+    new_pronunciations.dictionary_setup()
+    new_phones = set()
+    for phone in new_pronunciations.non_silence_phones:
+        if phone not in base_dictionary.non_silence_phones:
+            new_phones.add(phone)
+    if new_phones:
+        raise PhoneMismatchError(new_phones)
+
+    try:
+        new_words = new_pronunciations.words_for_export(probability=True)
+        base_dictionary.add_words(new_words)
+        base_dictionary.export_lexicon(
+            base_dictionary._default_dictionary_id,
+            base_dictionary.dictionary_model.path,
+            probability=True,
+        )
+    finally:
+        cleanup_databases()
+
+
 @model_cli.command(name="save", short_help="Save a model")
 @click.argument("model_type", type=click.Choice(sorted(MODEL_TYPES)))
 @click.argument(
diff --git a/montreal_forced_aligner/command_line/utils.py b/montreal_forced_aligner/command_line/utils.py
index 3571469c..4100a6c2 100644
--- a/montreal_forced_aligner/command_line/utils.py
+++ b/montreal_forced_aligner/command_line/utils.py
@@ -5,7 +5,6 @@
 import os
 import shutil
 import subprocess
-import time
 import typing
 from pathlib import Path
 
@@ -262,7 +261,7 @@ def check_databases(db_name=None) -> None:
             isolation_level="AUTOCOMMIT",
         ).execution_options(logging_token="check_databases_engine")
         with engine.connect():
-            time.sleep(1)
+            pass
         return
     except sqlalchemy.exc.OperationalError:
         if not os.listdir(db_directory):
@@ -343,7 +342,6 @@ def cleanup_databases() -> None:
 
 def remove_databases() -> None:
     """Remove database"""
-    time.sleep(1)
 
     GLOBAL_CONFIG.load()
     db_directory = os.path.join(
diff --git a/montreal_forced_aligner/corpus/base.py b/montreal_forced_aligner/corpus/base.py
index d32e42c7..e4281d94 100644
--- a/montreal_forced_aligner/corpus/base.py
+++ b/montreal_forced_aligner/corpus/base.py
@@ -696,13 +696,17 @@ def normalize_text(self) -> None:
                     continue
                 if (w, dict_id) not in word_insert_mappings:
                     max_mapping_ids[dict_id] += 1
+                    word_type = WordType.oov
+                    if hasattr(self, "brackets"):
+                        if any(w.startswith(b) for b, _ in self.brackets):
+                            word_type = WordType.bracketed
                     word_insert_mappings[(w, dict_id)] = {
                         "id": word_key,
                         "mapping_id": max_mapping_ids[d_id],
                         "word": w,
                         "count": 0,
                         "dictionary_id": dict_id,
-                        "word_type": WordType.oov,
+                        "word_type": word_type,
                     }
                     pronunciation_insert_mappings.append(
                         {
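The ``normalize_text`` change above now tags bracketed words with ``WordType.bracketed`` instead of ``WordType.oov``, which is what lets the new G2P OOV workflow skip them. A tiny standalone sketch of the classification logic, using a hypothetical bracket configuration (not necessarily MFA's defaults):

.. code-block:: python

   # Illustration of the bracketed-word check added to normalize_text() above.
   # The bracket pairs are a hypothetical configuration; the point is the
   # startswith() test against each opening bracket.
   brackets = [("[", "]"), ("{", "}"), ("<", ">")]


   def classify(word: str) -> str:
       """Return the word type label used when inserting new words."""
       if any(word.startswith(open_b) for open_b, _ in brackets):
           return "bracketed"
       return "oov"


   assert classify("[laughter]") == "bracketed"
   assert classify("ajfish") == "oov"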
Rate"): sample_rate = int(line.split(":")[-1].strip()) elif line.startswith("Duration"): - duration_string = line.split(":", maxsplit=1)[-1].split("=")[0].strip() - duration = ( - datetime.datetime.strptime(duration_string, "%H:%M:%S.%f") - - datetime.datetime(1900, 1, 1) - ).total_seconds() + m = re.search(r"= (?P\d+) samples", line) + if m: + num_samples = int(m.group("num_samples")) + duration = round(num_samples / sample_rate, 6) + else: + raise SoundFileError(file_path, "Could not parse number of samples") break sample_rate_string = "" if enforce_sample_rate is not None: diff --git a/montreal_forced_aligner/dictionary/mixins.py b/montreal_forced_aligner/dictionary/mixins.py index a1a1070a..f1f95395 100644 --- a/montreal_forced_aligner/dictionary/mixins.py +++ b/montreal_forced_aligner/dictionary/mixins.py @@ -11,8 +11,8 @@ from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple from montreal_forced_aligner.abc import DatabaseMixin -from montreal_forced_aligner.data import PhoneSetType, PhoneType -from montreal_forced_aligner.db import Phone +from montreal_forced_aligner.data import PhoneSetType, PhoneType, WordType +from montreal_forced_aligner.db import Phone, Word from montreal_forced_aligner.helper import mfa_open if TYPE_CHECKING: @@ -838,6 +838,28 @@ def __init__(self, **kwargs): self._disambiguation_symbols_int_path = None self._phones_dir = None self._lexicon_fst_paths = {} + self._num_words = None + self._num_speech_words = None + + @property + def num_words(self) -> int: + """Number of words (including OOVs and special symbols) in the dictionary""" + if self._num_words is None: + with self.session() as session: + self._num_words = session.query(Word).count() + return self._num_words + + @property + def num_speech_words(self) -> int: + """Number of speech words in the dictionary""" + if self._num_speech_words is None: + with self.session() as session: + self._num_speech_words = ( + session.query(Word) + .filter(Word.word_type.in_([WordType.speech, WordType.clitic])) + .count() + ) + return self._num_speech_words @property def word_boundary_int_path(self) -> Path: diff --git a/montreal_forced_aligner/dictionary/multispeaker.py b/montreal_forced_aligner/dictionary/multispeaker.py index 0ddcd148..aa80ad4c 100644 --- a/montreal_forced_aligner/dictionary/multispeaker.py +++ b/montreal_forced_aligner/dictionary/multispeaker.py @@ -166,7 +166,7 @@ def dictionary_base_names(self) -> Dict[int, str]: self._dictionary_base_names[d_id] = base_name return self._dictionary_base_names - def word_mapping(self, dictionary_id: int = 1) -> Dict[str, int]: + def word_mapping(self, dictionary_id: int = None) -> Dict[str, int]: """ Get the word mapping for a specified dictionary id @@ -180,6 +180,8 @@ def word_mapping(self, dictionary_id: int = 1) -> Dict[str, int]: dict[str, int] Mapping from words to their integer IDs for Kaldi processing """ + if dictionary_id is None: + dictionary_id = self._default_dictionary_id if dictionary_id not in self._words_mappings: self._words_mappings[dictionary_id] = {} with self.session() as session: @@ -1232,29 +1234,95 @@ def export_trained_rules(self, output_directory: str) -> None: with mfa_open(output_rules_path, "w") as f: yaml.dump(dict(dialectal_rules), f, Dumper=yaml.Dumper, allow_unicode=True) - def export_lexicon( + def add_words( + self, new_word_data: typing.List[typing.Dict[str, typing.Any]], dictionary_id: int = None + ) -> None: + """ + Add word data to a dictionary in the form exported from + 
diff --git a/montreal_forced_aligner/dictionary/mixins.py b/montreal_forced_aligner/dictionary/mixins.py
index a1a1070a..f1f95395 100644
--- a/montreal_forced_aligner/dictionary/mixins.py
+++ b/montreal_forced_aligner/dictionary/mixins.py
@@ -11,8 +11,8 @@
 from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple
 
 from montreal_forced_aligner.abc import DatabaseMixin
-from montreal_forced_aligner.data import PhoneSetType, PhoneType
-from montreal_forced_aligner.db import Phone
+from montreal_forced_aligner.data import PhoneSetType, PhoneType, WordType
+from montreal_forced_aligner.db import Phone, Word
 from montreal_forced_aligner.helper import mfa_open
 
 if TYPE_CHECKING:
@@ -838,6 +838,28 @@ def __init__(self, **kwargs):
         self._disambiguation_symbols_int_path = None
         self._phones_dir = None
         self._lexicon_fst_paths = {}
+        self._num_words = None
+        self._num_speech_words = None
+
+    @property
+    def num_words(self) -> int:
+        """Number of words (including OOVs and special symbols) in the dictionary"""
+        if self._num_words is None:
+            with self.session() as session:
+                self._num_words = session.query(Word).count()
+        return self._num_words
+
+    @property
+    def num_speech_words(self) -> int:
+        """Number of speech words in the dictionary"""
+        if self._num_speech_words is None:
+            with self.session() as session:
+                self._num_speech_words = (
+                    session.query(Word)
+                    .filter(Word.word_type.in_([WordType.speech, WordType.clitic]))
+                    .count()
+                )
+        return self._num_speech_words
 
     @property
     def word_boundary_int_path(self) -> Path:
diff --git a/montreal_forced_aligner/dictionary/multispeaker.py b/montreal_forced_aligner/dictionary/multispeaker.py
index 0ddcd148..aa80ad4c 100644
--- a/montreal_forced_aligner/dictionary/multispeaker.py
+++ b/montreal_forced_aligner/dictionary/multispeaker.py
@@ -166,7 +166,7 @@ def dictionary_base_names(self) -> Dict[int, str]:
                 self._dictionary_base_names[d_id] = base_name
         return self._dictionary_base_names
 
-    def word_mapping(self, dictionary_id: int = 1) -> Dict[str, int]:
+    def word_mapping(self, dictionary_id: int = None) -> Dict[str, int]:
         """
         Get the word mapping for a specified dictionary id
 
@@ -180,6 +180,8 @@ def word_mapping(self, dictionary_id: int = 1) -> Dict[str, int]:
         dict[str, int]
             Mapping from words to their integer IDs for Kaldi processing
         """
+        if dictionary_id is None:
+            dictionary_id = self._default_dictionary_id
         if dictionary_id not in self._words_mappings:
             self._words_mappings[dictionary_id] = {}
             with self.session() as session:
@@ -1232,29 +1234,95 @@ def export_trained_rules(self, output_directory: str) -> None:
             with mfa_open(output_rules_path, "w") as f:
                 yaml.dump(dict(dialectal_rules), f, Dumper=yaml.Dumper, allow_unicode=True)
 
-    def export_lexicon(
+    def add_words(
+        self, new_word_data: typing.List[typing.Dict[str, typing.Any]], dictionary_id: int = None
+    ) -> None:
+        """
+        Add word data to a dictionary in the form exported from
+        :meth:`~montreal_forced_aligner.dictionary.multispeaker.MultispeakerDictionaryMixin.words_for_export`
+
+        Parameters
+        ----------
+        new_word_data: list[dict[str,Any]]
+            Word data to add
+        dictionary_id: int, optional
+            Dictionary id to add words to, defaults to the default dictionary
+        """
+        if dictionary_id is None:
+            dictionary_id = self._default_dictionary_id
+        word_mapping = {}
+        pronunciation_mapping = []
+        word_index = self.get_next_primary_key(Word)
+        pronunciation_index = self.get_next_primary_key(Pronunciation)
+        with self.session() as session:
+            word_mapping_index = (
+                session.query(sqlalchemy.func.max(Word.mapping_id))
+                .filter(Word.dictionary_id == dictionary_id)
+                .scalar()
+                + 1
+            )
+            for data in new_word_data:
+                word = data["word"]
+                if word in self.word_mapping(dictionary_id):
+                    continue
+                if word not in word_mapping:
+                    word_mapping[word] = {
+                        "id": word_index,
+                        "mapping_id": word_mapping_index,
+                        "word": word,
+                        "word_type": WordType.speech,
+                        "count": 0,
+                        "dictionary_id": dictionary_id,
+                    }
+                    word_index += 1
+                    word_mapping_index += 1
+                phones = data["pronunciation"]
+                d = {
+                    "id": pronunciation_index,
+                    "base_pronunciation_id": pronunciation_index,
+                    "word_id": word_mapping[word]["id"],
+                    "pronunciation": phones,
+                }
+                pronunciation_index += 1
+                if "probability" in data and data["probability"] is not None:
+                    d["probability"] = data["probability"]
+                    d["silence_after_probability"] = data["silence_after_probability"]
+                    d["silence_before_correction"] = data["silence_before_correction"]
+                    d["non_silence_before_correction"] = data["non_silence_before_correction"]
+
+                pronunciation_mapping.append(d)
+            self._num_speech_words = None
+            session.bulk_insert_mappings(Word, list(word_mapping.values()))
+            session.flush()
+            session.bulk_insert_mappings(Pronunciation, pronunciation_mapping)
+            session.commit()
+
+    def words_for_export(
         self,
-        dictionary_id: int,
-        path: Path,
+        dictionary_id: int = None,
         write_disambiguation: typing.Optional[bool] = False,
         probability: typing.Optional[bool] = False,
-    ) -> None:
+    ) -> typing.List[typing.Dict[str, typing.Any]]:
         """
-        Export pronunciation dictionary to a text file
+        Generate exportable pronunciations
 
         Parameters
         ----------
-        path: :class:`~pathlib.Path`
-            Path to save dictionary
+        dictionary_id: int, optional
+            Dictionary id to export, defaults to the default dictionary
         write_disambiguation: bool, optional
             Flag for whether to include disambiguation information
         probability: bool, optional
             Flag for whether to include probabilities
-        silence_probabilities: bool, optional
-            Flag for whether to include per pronunciation silence probabilities, only valid
-            when ``probability`` is set to True
+
+        Returns
+        -------
+        list[dict[str,Any]]
+            List of pronunciations as dictionaries
         """
-        with mfa_open(path, "w") as f, self.session() as session:
+        if dictionary_id is None:
+            dictionary_id = self._default_dictionary_id
+        with self.session() as session:
             columns = [Word.word, Pronunciation.pronunciation]
             if write_disambiguation:
                 columns.append(Pronunciation.disambiguation)
@@ -1273,8 +1341,32 @@
                 )
                 .order_by(Word.word)
             )
-            for row in pronunciations:
-                data = row.pronunciation_data
+            data = [row for row, in pronunciations]
+        return data
+
+    def export_lexicon(
+        self,
+        dictionary_id: int,
+        path: Path,
+        write_disambiguation: typing.Optional[bool] = False,
+        probability: typing.Optional[bool] = False,
+    ) -> None:
+        """
+        Export pronunciation dictionary to a text file
+
+        Parameters
+        ----------
+        dictionary_id: int
+            Id of the dictionary to export
+        path: :class:`~pathlib.Path`
+            Path to save dictionary
+        write_disambiguation: bool, optional
+            Flag for whether to include disambiguation information
+        probability: bool, optional
+            Flag for whether to include probabilities
+        """
+        with mfa_open(path, "w") as f:
+            for data in self.words_for_export(dictionary_id, write_disambiguation, probability):
                 phones = data["pronunciation"]
                 if write_disambiguation and data["disambiguation"] is not None:
                     phones += f" #{data['disambiguation']}"
diff --git a/montreal_forced_aligner/exceptions.py b/montreal_forced_aligner/exceptions.py
index dce0a5d0..e2affd2b 100644
--- a/montreal_forced_aligner/exceptions.py
+++ b/montreal_forced_aligner/exceptions.py
@@ -268,6 +268,22 @@ class DictionaryError(MFAError):
     pass
 
 
+class PhoneMismatchError(DictionaryError):
+    """
+    Exception class for when a dictionary receives a new phone
+
+    Parameters
+    ----------
+    missing_phones: Collection[str]
+        Phones that are not in the base dictionary
+    """
+
+    def __init__(self, missing_phones: Collection[str]):
+        super().__init__("There were extra phones that were not in the dictionary: ")
+        missing_phones = [f"{self.printer.error_text(x)}" for x in sorted(missing_phones)]
+        self.message_lines.append(comma_join(missing_phones))
+
+
 class NoDefaultSpeakerDictionaryError(DictionaryError):
     """
     Exception class for errors in creating MultispeakerDictionary objects
diff --git a/montreal_forced_aligner/g2p/generator.py b/montreal_forced_aligner/g2p/generator.py
index 1114eb51..64d1efb9 100644
--- a/montreal_forced_aligner/g2p/generator.py
+++ b/montreal_forced_aligner/g2p/generator.py
@@ -20,7 +20,9 @@
 
 from montreal_forced_aligner.abc import DatabaseMixin, TopLevelMfaWorker
 from montreal_forced_aligner.config import GLOBAL_CONFIG
-from montreal_forced_aligner.corpus.text_corpus import TextCorpusMixin
+from montreal_forced_aligner.corpus.text_corpus import DictionaryTextCorpusMixin, TextCorpusMixin
+from montreal_forced_aligner.data import WordType
+from montreal_forced_aligner.db import Word
 from montreal_forced_aligner.exceptions import PyniniGenerationError
 from montreal_forced_aligner.g2p.mixins import G2PTopLevelMixin
 from montreal_forced_aligner.helper import comma_join, mfa_open, score_g2p
@@ -759,3 +761,46 @@ def words_to_g2p(self) -> List[str]:
         if not self.include_bracketed:
             word_list = [x for x in word_list if not self.check_bracketed(x)]
         return word_list
+
+
+class PyniniDictionaryCorpusGenerator(
+    PyniniGenerator, DictionaryTextCorpusMixin, TopLevelMfaWorker
+):
+    """
+    Top-level worker for generating pronunciations for OOV words in a corpus with a Pynini G2P model
+
+    See Also
+    --------
+    :class:`~montreal_forced_aligner.g2p.generator.PyniniGenerator`
+        For Pynini G2P generation parameters
+    :class:`~montreal_forced_aligner.corpus.text_corpus.DictionaryTextCorpusMixin`
+        For corpus and dictionary parsing parameters
+    :class:`~montreal_forced_aligner.abc.TopLevelMfaWorker`
+        For top-level parameters
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self._word_list = None
+
+    def setup(self) -> None:
+        """Set up the pronunciation generator"""
+        if self.initialized:
+            return
+        self.load_corpus()
+        super().setup()
+        self.g2p_model.validate(self.words_to_g2p)
+        self.initialized = True
+
+    @property
+    def words_to_g2p(self) -> List[str]:
+        """Words to produce pronunciations"""
+        if self._word_list is None:
+            with self.session() as session:
+                query = (
+                    session.query(Word.word)
+                    .filter(Word.word_type == WordType.oov, Word.word != self.oov_word)
+                    .order_by(Word.word)
+                )
+                self._word_list = [x for x, in query]
+        return self._word_list
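Together, ``words_for_export`` and ``add_words`` give the programmatic path that the ``mfa model add_words`` command wraps. A minimal sketch of that flow, assuming an initialized MFA environment (the CLI calls ``check_databases()`` first) and two illustrative dictionary paths:

.. code-block:: python

   # Minimal sketch of the add_words flow from add_words_cli above; the two
   # dictionary paths are illustrative. It mirrors the phone check that raises
   # PhoneMismatchError before any words are merged.
   from montreal_forced_aligner.dictionary.multispeaker import MultispeakerDictionary
   from montreal_forced_aligner.exceptions import PhoneMismatchError

   base = MultispeakerDictionary(dictionary_path="~/mfa_data/base_dictionary.txt")
   base.dictionary_setup()
   new = MultispeakerDictionary(dictionary_path="~/mfa_data/g2pped_oovs.txt")
   new.dictionary_setup()

   # Refuse to merge pronunciations that introduce unknown phones.
   extra_phones = new.non_silence_phones - base.non_silence_phones
   if extra_phones:
       raise PhoneMismatchError(extra_phones)

   # Merge the new entries and write the combined lexicon back out.
   base.add_words(new.words_for_export(probability=True))
   base.export_lexicon(
       base._default_dictionary_id,
       base.dictionary_model.path,
       probability=True,
   )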
diff --git a/montreal_forced_aligner/language_modeling/trainer.py b/montreal_forced_aligner/language_modeling/trainer.py index 690b610e..07bed55a 100644 --- a/montreal_forced_aligner/language_modeling/trainer.py +++ b/montreal_forced_aligner/language_modeling/trainer.py @@ -352,9 +352,6 @@ def evaluate(self) -> None: if m: perplexity = float(m.group("perplexity")) self.large_perplexity = perplexity - self.num_sentences = num_sentences - self.num_words = num_words - self.num_oovs = num_oovs logger.info(f"{num_sentences}, {num_words}, {num_oovs}") logger.info(f"Perplexity of large model: {perplexity}") diff --git a/montreal_forced_aligner/tokenization/trainer.py b/montreal_forced_aligner/tokenization/trainer.py index fbc3e116..5aba586e 100644 --- a/montreal_forced_aligner/tokenization/trainer.py +++ b/montreal_forced_aligner/tokenization/trainer.py @@ -308,14 +308,17 @@ def meta(self) -> MetaDict: def train(self) -> None: if os.path.exists(self.fst_path): + self.finalize_training() return super().train() def initialize_training(self) -> None: """Initialize training tokenizer model""" - logger.info("Initializing training...") self.create_new_current_workflow(WorkflowType.tokenizer_training) + if self.fst_path.exists(): + return + logger.info("Initializing training...") with self.session() as session: session.query(M2M2Job).delete() session.query(M2MSymbol).delete() diff --git a/tests/conftest.py b/tests/conftest.py index adbf347a..68d196f4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -777,15 +777,6 @@ def groups_path(config_directory): return config_directory.joinpath("test_groups.yaml") -@pytest.fixture(scope="session") -def speaker_dictionary_path(basic_dict_path, acoustic_dict_path, generated_dir): - data = {"default": acoustic_dict_path, "sickmichael": basic_dict_path} - speaker_dict_path = generated_dir.joinpath("test_basic_acoustic_dicts.yaml") - with mfa_open(speaker_dict_path, "w") as f: - yaml.dump(data, f, Dumper=yaml.Dumper, allow_unicode=True) - return speaker_dict_path - - @pytest.fixture(scope="session") def mono_output_directory(generated_dir): return generated_dir.joinpath("mono_output") diff --git a/tests/data/dictionaries/test_acoustic.txt b/tests/data/dictionaries/test_acoustic.txt index a687430b..587abe26 100644 --- a/tests/data/dictionaries/test_acoustic.txt +++ b/tests/data/dictionaries/test_acoustic.txt @@ -40,3 +40,6 @@ should sh uh d be b iy all aa l thanks th ae ng k s +just jh ah s t +sound s aw n d +environment eh n v ay r ah n m eh n t diff --git a/tests/test_commandline_g2p.py b/tests/test_commandline_g2p.py index 7e7ea3ff..9fd1deaa 100644 --- a/tests/test_commandline_g2p.py +++ b/tests/test_commandline_g2p.py @@ -39,7 +39,44 @@ def test_generate_pretrained( check_databases() d = MultispeakerDictionary(output_path) d.dictionary_setup() - assert len(d.word_mapping(list(d.dictionary_lookup.values())[0])) > 0 + assert d.num_speech_words > 0 + + +def test_generate_pretrained_dictionary( + english_g2p_model, combined_corpus_dir, english_dictionary, temp_dir, generated_dir, db_setup +): + output_path = generated_dir.joinpath("filtered_g2p_out.txt") + command = [ + "g2p", + combined_corpus_dir, + english_g2p_model, + output_path, + "-t", + os.path.join(temp_dir, "dict_g2p_cli"), + "-q", + "--clean", + "--dictionary_path", + english_dictionary, + "--num_pronunciations", + "1", + "--use_mp", + "False", + ] + command = [str(x) for x in command] + result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke( + mfa_cli, command, 
catch_exceptions=True + ) + print(result.stdout) + print(result.stderr) + if result.exception: + print(result.exc_info) + raise result.exception + assert not result.return_value + assert os.path.exists(output_path) + check_databases() + d = MultispeakerDictionary(output_path) + d.dictionary_setup() + assert d.num_speech_words == 2 def test_generate_pretrained_threshold( @@ -73,7 +110,7 @@ def test_generate_pretrained_threshold( d = MultispeakerDictionary(output_path) d.dictionary_setup() - assert len(d.word_mapping(list(d.dictionary_lookup.values())[0])) > 0 + assert d.num_speech_words > 0 def test_train_g2p( @@ -175,7 +212,7 @@ def test_generate_dict( check_databases() d = MultispeakerDictionary(dictionary_path=g2p_basic_output) d.dictionary_setup() - assert len(d.word_mapping(list(d.dictionary_lookup.values())[0])) > 0 + assert d.num_speech_words > 0 def test_generate_dict_phonetisaurus( @@ -213,7 +250,7 @@ def test_generate_dict_phonetisaurus( check_databases() d = MultispeakerDictionary(dictionary_path=g2p_basic_phonetisaurus_output) d.dictionary_setup() - assert len(d.word_mapping(list(d.dictionary_lookup.values())[0])) > 0 + assert d.num_speech_words > 0 def test_generate_dict_text_only( @@ -252,7 +289,7 @@ def test_generate_dict_text_only( check_databases() d = MultispeakerDictionary(dictionary_path=g2p_basic_output) d.dictionary_setup() - assert len(d.word_mapping(list(d.dictionary_lookup.values())[0])) > 0 + assert d.num_speech_words > 0 def test_generate_dict_textgrid( @@ -291,4 +328,4 @@ def test_generate_dict_textgrid( check_databases() d = MultispeakerDictionary(dictionary_path=output_file) d.dictionary_setup() - assert len(d.word_mapping(list(d.dictionary_lookup.values())[0])) > 0 + assert d.num_speech_words > 0 diff --git a/tests/test_commandline_model.py b/tests/test_commandline_model.py index d98e129d..57aa1b2a 100644 --- a/tests/test_commandline_model.py +++ b/tests/test_commandline_model.py @@ -4,7 +4,8 @@ import pytest from montreal_forced_aligner.command_line.mfa import mfa_cli -from montreal_forced_aligner.exceptions import RemoteModelNotFoundError +from montreal_forced_aligner.dictionary import MultispeakerDictionary +from montreal_forced_aligner.exceptions import PhoneMismatchError, RemoteModelNotFoundError from montreal_forced_aligner.models import AcousticModel, DictionaryModel, G2PModel, ModelManager @@ -140,6 +141,88 @@ def test_inspect_model(): assert not result.return_value +def test_add_pronunciations( + hindi_dict_path, japanese_dict_path, basic_dict_path, acoustic_dict_path +): + command = [ + "model", + "save", + "dictionary", + str(hindi_dict_path), + "--name", + "hindi", + "--overwrite", + ] + result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke( + mfa_cli, command, catch_exceptions=True + ) + print(result.stdout) + print(result.stderr) + if result.exception: + print(result.exc_info) + raise result.exception + assert not result.return_value + assert os.path.exists(DictionaryModel.get_pretrained_path("hindi")) + + with pytest.raises(PhoneMismatchError): + command = [ + "model", + "add_words", + "hindi", + str(japanese_dict_path), + ] + result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke( + mfa_cli, command, catch_exceptions=True + ) + print(result.stdout) + print(result.stderr) + if result.exception: + print(result.exc_info) + raise result.exception + assert not result.return_value + command = [ + "model", + "save", + "dictionary", + str(acoustic_dict_path), + "--name", + "acoustic", + "--overwrite", + ] + 
result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke( + mfa_cli, command, catch_exceptions=True + ) + print(result.stdout) + print(result.stderr) + if result.exception: + print(result.exc_info) + raise result.exception + assert not result.return_value + assert os.path.exists(DictionaryModel.get_pretrained_path("acoustic")) + command = [ + "model", + "add_words", + "acoustic", + str(basic_dict_path), + ] + result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke( + mfa_cli, command, catch_exceptions=True + ) + print(result.stdout) + print(result.stderr) + if result.exception: + print(result.exc_info) + raise result.exception + assert not result.return_value + + pretrained_acoustic_path = DictionaryModel.get_pretrained_path("acoustic") + assert pretrained_acoustic_path.exists() + d = MultispeakerDictionary(pretrained_acoustic_path) + d.dictionary_setup() + + assert "hopefully" in d.word_mapping() + + def test_list_model(): command = [ "model", diff --git a/tests/test_corpus.py b/tests/test_corpus.py index 8c7e7d74..fb71ab0d 100644 --- a/tests/test_corpus.py +++ b/tests/test_corpus.py @@ -398,11 +398,6 @@ def test_weird_words(weird_words_dir, generated_dir, basic_dict_path, global_con "ajfish", "asds-asda", "sdasd", - "[me_really]", - "[me____really]", - "[me_really]", - "", - "<_s>", } ) assert (