diff --git a/docs/source/changelog/changelog_2.2.rst b/docs/source/changelog/changelog_2.2.rst
index 7a9e92f8..c1e9e7b3 100644
--- a/docs/source/changelog/changelog_2.2.rst
+++ b/docs/source/changelog/changelog_2.2.rst
@@ -5,6 +5,13 @@
 2.2 Changelog
 *************
 
+2.2.2
+=====
+
+- Fixed a rounding issue in parsing sox output for sound file duration
+- Added ``--dictionary_path`` option to :ref:`g2p_dictionary_generating` to allow for generating pronunciations for just those words that are missing in a dictionary
+- Added ``add_words`` subcommand to :ref:`pretrained_models` to allow for easy adding of words and pronunciations from :ref:`g2p_dictionary_generating` to pronunciation dictionaries
+
 2.2.1
 =====
 
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 7285f2d7..86827296 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -86,6 +86,10 @@
         "MFA acoustic models",
         "https://mfa-models.readthedocs.io/en/latest/acoustic/index.html",
     ),
+    "pretrained_tokenizer_models": (
+        "MFA tokenizer models",
+        "https://mfa-models.readthedocs.io/en/latest/tokenizer/index.html",
+    ),
     "pretrained_dictionaries": (
         "MFA dictionaries",
         "https://mfa-models.readthedocs.io/en/latest/dictionary/index.html",
@@ -94,6 +98,10 @@
         "MFA G2P models",
         "https://mfa-models.readthedocs.io/en/latest/g2p/index.html",
     ),
+    "pretrained_ivector_extractor": (
+        "MFA ivector extractors",
+        "https://mfa-models.readthedocs.io/en/latest/ivector/index.html",
+    ),
     "pretrained_language_models": (
         "MFA language models",
         "https://mfa-models.readthedocs.io/en/latest/language_model/index.html",
diff --git a/docs/source/external_links.py b/docs/source/external_links.py
index ff48f1ee..7235634b 100644
--- a/docs/source/external_links.py
+++ b/docs/source/external_links.py
@@ -33,6 +33,7 @@
     "lm": "language model",
     "dictionary": "dictionary",
     "ivector": "ivector extractor",
+    "tokenizer": "tokenizer model",
 }
 
 
diff --git a/docs/source/first_steps/index.rst b/docs/source/first_steps/index.rst
index 3bac7a0c..527cfdf5 100644
--- a/docs/source/first_steps/index.rst
+++ b/docs/source/first_steps/index.rst
@@ -15,26 +15,31 @@ Use cases
 
 There are several broad use cases that you might want to use MFA for. Take a look below and if any are close matches, you should be able to apply the linked instructions to your data.
 
-#. **Use case 1:** You have a :ref:`speech corpus <corpus_structure>`, your language is in the list of :xref:`pretrained_acoustic_models` and the list of :xref:`pretrained_dictionaries`.
+#. **Use case 1:** You have a :ref:`speech corpus <corpus_structure>`, and the language has a :xref:`pretrained acoustic model <pretrained_acoustic_models>` and a :xref:`pretrained dictionary <pretrained_dictionaries>`.
 
-   #. Follow :ref:`first_steps_align_pretrained` to generate aligned TextGrids
+   #. Follow :ref:`first_steps_align_pretrained` to generate aligned TextGrids
 
-#. **Use case 2:** You have a :ref:`speech corpus <corpus_structure>`, the language involved is in the list of :xref:`pretrained_acoustic_models` and the list of :xref:`pretrained_g2p`, but not on the list of :xref:`pretrained_dictionaries`.
+#. **Use case 2:** You have a :ref:`speech corpus <corpus_structure>`, and the language has a :xref:`pretrained acoustic model <pretrained_acoustic_models>` and a :xref:`pretrained dictionary <pretrained_dictionaries>`, but the dictionary's coverage of your corpus is not great; the language also has a :xref:`pretrained G2P model <pretrained_g2p>`.
 
-   #. Follow :ref:`first_steps_g2p_pretrained` to generate a dictionary
-   #. Use the generated dictionary in :ref:`first_steps_align_pretrained` to generate aligned TextGrids
+   #. Follow :ref:`first_steps_g2p_oovs` to generate pronunciations for OOV words in the corpus
+   #. Use the generated dictionary in :ref:`first_steps_align_pretrained` to generate aligned TextGrids
 
-#. **Use case 3:** You have a :ref:`speech corpus <corpus_structure>`, a :ref:`pronunciation dictionary <dictionary_format>`, but there is no :xref:`pretrained_acoustic_models` for the language (or none that have the same phones as the pronunciation dictionary)
+#. **Use case 3:** You have a :ref:`speech corpus <corpus_structure>`, and the language has a :xref:`pretrained acoustic model <pretrained_acoustic_models>` and a :xref:`pretrained G2P model <pretrained_g2p>`, but it doesn't have a :xref:`pretrained dictionary <pretrained_dictionaries>`.
 
-   #. Follow :ref:`first_steps_align_train_acoustic_model` to generate aligned TextGrids
+   #. Follow :ref:`first_steps_g2p_pretrained` to generate a dictionary
+   #. Use the generated dictionary in :ref:`first_steps_align_pretrained` to generate aligned TextGrids
 
-#. **Use case 4:** You have a :ref:`speech corpus <corpus_structure>`, a :ref:`pronunciation dictionary <dictionary_format>`, but it does not have great coverage of the words in the corpus.
+#. **Use case 4:** You have a :ref:`speech corpus <corpus_structure>` and your own :ref:`pronunciation dictionary <dictionary_format>`, but there is no :xref:`pretrained acoustic model <pretrained_acoustic_models>` for the language (or none that have the same phones as the pronunciation dictionary).
 
-   #. Follow :ref:`first_steps_train_g2p` to train a G2P model
-   #. Use the trained G2P model in :ref:`first_steps_g2p_pretrained` to generate a pronunciation dictionary
-   #. Use the generated pronunciation dictionary in :ref:`first_steps_align_train_acoustic_model` to generate aligned TextGrids
+   #. Follow :ref:`first_steps_align_train_acoustic_model` to generate aligned TextGrids
 
-#. **Use case 5:** You have a :ref:`speech corpus <corpus_structure>` and the language involved is in the list of :xref:`pretrained_acoustic_models`, but the language does not mark word boundaries in its orthography.
+#. **Use case 5:** You have a :ref:`speech corpus <corpus_structure>` and your own :ref:`pronunciation dictionary <dictionary_format>`, but it does not have great coverage of the words in the corpus.
+
+   #. Follow :ref:`first_steps_train_g2p` to train a G2P model
+   #. Use the trained G2P model in :ref:`first_steps_g2p_pretrained` to generate a pronunciation dictionary
+   #. Use the generated pronunciation dictionary in :ref:`first_steps_align_train_acoustic_model` to generate aligned TextGrids
+
+#. **Use case 6:** You have a :ref:`speech corpus <corpus_structure>` and the language has a :xref:`pretrained acoustic model <pretrained_acoustic_models>`, but the language does not mark word boundaries in its orthography (and a :xref:`pretrained tokenizer model <pretrained_tokenizer_models>` is available).
 
    #. Follow :ref:`first_steps_tokenize` to tokenize the corpus
    #. Use the tokenized transcripts and follow :ref:`first_steps_align_pretrained`
@@ -98,7 +103,7 @@ Depending on your use case, you might have a list of words to run G2P over, or j
 
    mfa g2p ~/mfa_data/my_corpus english_us_arpa ~/mfa_data/new_dictionary.txt # If using a corpus
    mfa g2p ~/mfa_data/my_word_list.txt english_us_arpa ~/mfa_data/new_dictionary.txt # If using a word list
 
-Running one of the above will output a text file pronunciation dictionary in the format that MFA uses (:ref:`dictionary_format`). I recommend looking over the pronunciations generated and make sure that they look sensible. For languages where the orthography is not transparent, it may be helpful to include :code:`--num_pronunciations 3` so that more pronunciations are generated than just the most likely one. For more details on running G2P, see :ref:`g2p_dictionary_generating`.
+Running one of the above will output a text file pronunciation dictionary in the :ref:`MFA dictionary format <dictionary_format>`. I recommend looking over the generated pronunciations to make sure that they look sensible. For languages where the orthography is not transparent, it may be helpful to include :code:`--num_pronunciations 3` so that more pronunciations are generated than just the most likely one. For more details on running G2P, see :ref:`g2p_dictionary_generating`.
 
 From here you can use this dictionary file as input to any MFA command that uses dictionaries, i.e.
 
@@ -111,6 +116,47 @@ From here you can use this dictionary file as input to any MFA command that uses
 
 Please see :ref:`dict_generating_example` for an example using toy data.
 
+
+.. _first_steps_g2p_oovs:
+
+Generating pronunciations for OOV items in a corpus
+---------------------------------------------------
+
+For the purposes of this example, we'll use the "english_us_arpa" model, but the instructions will be applicable to any pretrained G2P model. We'll also assume that you have done nothing else with MFA other than follow the :ref:`installation` instructions and that you have the :code:`mfa` command working. Finally, we'll assume that your corpus is stored in the folder :code:`~/mfa_data/my_corpus`, so when working with your data, this will be the main thing to update.
+
+First we'll need the pretrained G2P model. These are installed via the :code:`mfa model download` command:
+
+.. code-block::
+
+   mfa model download g2p english_us_arpa
+
+You should be able to run :code:`mfa model inspect g2p english_us_arpa` and it will output information about the :code:`english_us_arpa` G2P model.
+
+To generate pronunciations just for the out-of-vocabulary (OOV) words in your corpus, point :code:`mfa g2p` at the corpus and pass an existing dictionary via :code:`--dictionary_path`:
+
+.. code-block::
+
+   mfa g2p ~/mfa_data/my_corpus english_us_arpa ~/mfa_data/g2pped_oovs.txt --dictionary_path english_us_arpa
+
+Running the above will output a text file in the :ref:`MFA dictionary format <dictionary_format>` containing all the OOV words (ignoring bracketed words). I recommend looking over the generated pronunciations to make sure that they look sensible. For languages where the orthography is not transparent, it may be helpful to include :code:`--num_pronunciations 3` so that more pronunciations are generated than just the most likely one. For more details on running G2P, see :ref:`g2p_dictionary_generating`.
+
+Once you have looked over the dictionary, you can save the new pronunciations via:
+
+.. code-block::
+
+   mfa model add_words english_us_arpa ~/mfa_data/g2pped_oovs.txt
+
+The new pronunciations will be available when you use :code:`english_us_arpa` as the dictionary path in an MFA command, i.e. the modified command from :ref:`first_steps_align_pretrained`:
+
+.. code-block::
+
+   mfa align ~/mfa_data/my_corpus english_us_arpa english_us_arpa ~/mfa_data/my_corpus_aligned
+
+
+.. warning::
+
+   Please do look over the G2P results before adding them to the dictionary, at the very least to spot-check them. Especially for non-transparent orthographies, words with unseen graphemes, homographs, etc., G2P can generate phonotactically illegal forms, so I do not recommend piping G2P output to alignment without human spot checking.
+
 .. _first_steps_align_train_acoustic_model:
 
 Training a new acoustic model on a corpus
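The warning added above asks for human review before ``mfa model add_words`` is run. One way to make that review systematic is to check the generated pronunciations against the phone inventory of the base dictionary, mirroring the ``PhoneMismatchError`` check introduced later in this diff. Below is a minimal, hypothetical Python sketch of such a check; the file paths and the assumption that each entry is a whitespace-separated word, optional numeric probability columns, and phones are illustrative, not part of MFA's API:

.. code-block:: python

   # Spot-check G2P output before running `mfa model add_words`: flag any
   # pronunciation that uses a phone not present in a reference dictionary.
   # Paths are illustrative; the column layout is an assumption about the
   # plain-text dictionary format rather than a guarantee.
   from pathlib import Path


   def parse_dict(path):
       """Yield (word, phones) pairs, skipping any numeric probability columns."""
       for line in Path(path).expanduser().read_text(encoding="utf8").splitlines():
           fields = line.split()
           if not fields:
               continue
           word, rest = fields[0], fields[1:]
           while rest:
               try:
                   float(rest[0])  # probability/silence columns, if present
                   rest = rest[1:]
               except ValueError:
                   break
           yield word, rest


   reference_phones = {
       p for _, phones in parse_dict("~/mfa_data/reference_dictionary.txt") for p in phones
   }
   for word, phones in parse_dict("~/mfa_data/g2pped_oovs.txt"):
       unknown = set(phones) - reference_phones
       if unknown:
           print(f"{word}: check phones {sorted(unknown)}")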
diff --git a/montreal_forced_aligner/command_line/g2p.py b/montreal_forced_aligner/command_line/g2p.py
index 19855af2..985385bb 100644
--- a/montreal_forced_aligner/command_line/g2p.py
+++ b/montreal_forced_aligner/command_line/g2p.py
@@ -10,10 +10,15 @@
     check_databases,
     cleanup_databases,
     common_options,
+    validate_dictionary,
     validate_g2p_model,
 )
 from montreal_forced_aligner.config import GLOBAL_CONFIG, MFA_PROFILE_VARIABLE
-from montreal_forced_aligner.g2p.generator import PyniniCorpusGenerator, PyniniWordListGenerator
+from montreal_forced_aligner.g2p.generator import (
+    PyniniCorpusGenerator,
+    PyniniDictionaryCorpusGenerator,
+    PyniniWordListGenerator,
+)
 
 __all__ = ["g2p_cli"]
 
@@ -38,6 +43,12 @@
     help="Path to config file to use for G2P.",
     type=click.Path(exists=True, file_okay=True, dir_okay=False, path_type=Path),
 )
+@click.option(
+    "--dictionary_path",
+    help="Path to existing pronunciation dictionary to use to find OOVs.",
+    type=click.UNPROCESSED,
+    callback=validate_dictionary,
+)
 @click.option(
     "--include_bracketed",
     is_flag=True,
@@ -61,13 +72,26 @@ def g2p_cli(context, **kwargs) -> None:
     input_path = kwargs["input_path"]
     g2p_model_path = kwargs["g2p_model_path"]
     output_path = kwargs["output_path"]
+    dictionary_path = kwargs.get("dictionary_path", None)
 
     if os.path.isdir(input_path):
-        g2p = PyniniCorpusGenerator(
-            corpus_directory=input_path,
-            g2p_model_path=g2p_model_path,
-            **PyniniCorpusGenerator.parse_parameters(config_path, context.params, context.args),
-        )
+        if dictionary_path is not None:
+            g2p = PyniniDictionaryCorpusGenerator(
+                corpus_directory=input_path,
+                dictionary_path=dictionary_path,
+                g2p_model_path=g2p_model_path,
+                **PyniniDictionaryCorpusGenerator.parse_parameters(
+                    config_path, context.params, context.args
+                ),
+            )
+        else:
+            g2p = PyniniCorpusGenerator(
+                corpus_directory=input_path,
+                g2p_model_path=g2p_model_path,
+                **PyniniCorpusGenerator.parse_parameters(
+                    config_path, context.params, context.args
+                ),
+            )
     else:
         g2p = PyniniWordListGenerator(
             word_list_path=input_path,
diff --git a/montreal_forced_aligner/command_line/model.py b/montreal_forced_aligner/command_line/model.py
index 5d5686cd..df6f5ac6 100644
--- a/montreal_forced_aligner/command_line/model.py
+++ b/montreal_forced_aligner/command_line/model.py
@@ -9,13 +9,21 @@
 
 import click
 
-from montreal_forced_aligner.config import GLOBAL_CONFIG
+from montreal_forced_aligner.command_line.utils import (
+    check_databases,
+    cleanup_databases,
+    common_options,
+    validate_dictionary,
+)
+from montreal_forced_aligner.config import GLOBAL_CONFIG, MFA_PROFILE_VARIABLE
 from montreal_forced_aligner.data import PhoneSetType
+from montreal_forced_aligner.dictionary.multispeaker import MultispeakerDictionary
 from montreal_forced_aligner.exceptions import (
     ModelLoadError,
     ModelSaveError,
     ModelTypeNotSupportedError,
     MultipleModelTypesFoundError,
+    PhoneMismatchError,
     PretrainedModelNotFoundError,
 )
 from montreal_forced_aligner.models import MODEL_TYPES, Archive, ModelManager, guess_model_type
@@ -26,6 +34,7 @@
     "download_model_cli",
     "list_model_cli",
     "inspect_model_cli",
+    "add_words_cli",
 ]
 
 
@@ -138,6 +147,48 @@ def inspect_model_cli(model_type: str, model: str) -> None:
     m.pretty_print()
 
 
+@model_cli.command(name="add_words", short_help="Add words to a dictionary")
+@click.argument("dictionary_path", type=click.UNPROCESSED, callback=validate_dictionary)
+@click.argument("new_pronunciations_path", type=click.UNPROCESSED, callback=validate_dictionary)
+@click.help_option("-h", "--help")
+@common_options
+@click.pass_context
+def add_words_cli(context, **kwargs) -> None:
+    """
+    Add words from one pronunciation dictionary to another pronunciation dictionary,
+    so long as the new pronunciations do not contain any new phones
+    """
+    if kwargs.get("profile", None) is not None:
+        os.putenv(MFA_PROFILE_VARIABLE, kwargs["profile"])
+    GLOBAL_CONFIG.current_profile.update(kwargs)
+    GLOBAL_CONFIG.save()
+    check_databases()
+
+    dictionary_path = kwargs.get("dictionary_path", None)
+    new_pronunciations_path = kwargs.get("new_pronunciations_path", None)
+    base_dictionary = MultispeakerDictionary(dictionary_path=dictionary_path)
+    base_dictionary.dictionary_setup()
+    new_pronunciations = MultispeakerDictionary(dictionary_path=new_pronunciations_path)
+    new_pronunciations.dictionary_setup()
+    new_phones = set()
+    for phone in new_pronunciations.non_silence_phones:
+        if phone not in base_dictionary.non_silence_phones:
+            new_phones.add(phone)
+    if new_phones:
+        raise PhoneMismatchError(new_phones)
+
+    try:
+        new_words = new_pronunciations.words_for_export(probability=True)
+        base_dictionary.add_words(new_words)
+        base_dictionary.export_lexicon(
+            base_dictionary._default_dictionary_id,
+            base_dictionary.dictionary_model.path,
+            probability=True,
+        )
+    finally:
+        cleanup_databases()
+
+
 @model_cli.command(name="save", short_help="Save a model")
 @click.argument("model_type", type=click.Choice(sorted(MODEL_TYPES)))
 @click.argument(
diff --git a/montreal_forced_aligner/command_line/utils.py b/montreal_forced_aligner/command_line/utils.py
index 3571469c..4100a6c2 100644
--- a/montreal_forced_aligner/command_line/utils.py
+++ b/montreal_forced_aligner/command_line/utils.py
@@ -5,7 +5,6 @@
 import os
 import shutil
 import subprocess
-import time
 import typing
 from pathlib import Path
 
@@ -262,7 +261,7 @@ def check_databases(db_name=None) -> None:
             isolation_level="AUTOCOMMIT",
         ).execution_options(logging_token="check_databases_engine")
         with engine.connect():
-            time.sleep(1)
+            pass
         return
     except sqlalchemy.exc.OperationalError:
         if not os.listdir(db_directory):
@@ -343,7 +342,6 @@ def cleanup_databases() -> None:
 
 def remove_databases() -> None:
     """Remove database"""
-    time.sleep(1)
 
     GLOBAL_CONFIG.load()
     db_directory = os.path.join(
diff --git a/montreal_forced_aligner/corpus/base.py b/montreal_forced_aligner/corpus/base.py
index d32e42c7..e4281d94 100644
--- a/montreal_forced_aligner/corpus/base.py
+++ b/montreal_forced_aligner/corpus/base.py
@@ -696,13 +696,17 @@ def normalize_text(self) -> None:
                     continue
                 if (w, dict_id) not in word_insert_mappings:
                     max_mapping_ids[dict_id] += 1
+                    word_type = WordType.oov
+                    if hasattr(self, "brackets"):
+                        if any(w.startswith(b) for b, _ in self.brackets):
+                            word_type = WordType.bracketed
                     word_insert_mappings[(w, dict_id)] = {
                         "id": word_key,
                         "mapping_id": max_mapping_ids[d_id],
                         "word": w,
                         "count": 0,
                         "dictionary_id": dict_id,
-                        "word_type": WordType.oov,
+                        "word_type": word_type,
                     }
                     pronunciation_insert_mappings.append(
                         {
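The ``normalize_text`` change above now tags bracketed words with ``WordType.bracketed`` instead of ``WordType.oov``, which is what lets the new G2P OOV workflow skip them. A tiny standalone sketch of the classification logic, using a hypothetical bracket configuration (not necessarily MFA's defaults):

.. code-block:: python

   # Illustration of the bracketed-word check added to normalize_text() above.
   # The bracket pairs are a hypothetical configuration; the point is the
   # startswith() test against each opening bracket.
   brackets = [("[", "]"), ("{", "}"), ("<", ">")]


   def classify(word: str) -> str:
       """Return the word type label used when inserting new words."""
       if any(word.startswith(open_b) for open_b, _ in brackets):
           return "bracketed"
       return "oov"


   assert classify("[laughter]") == "bracketed"
   assert classify("ajfish") == "oov"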
Rate"): sample_rate = int(line.split(":")[-1].strip()) elif line.startswith("Duration"): - duration_string = line.split(":", maxsplit=1)[-1].split("=")[0].strip() - duration = ( - datetime.datetime.strptime(duration_string, "%H:%M:%S.%f") - - datetime.datetime(1900, 1, 1) - ).total_seconds() + m = re.search(r"= (?P\d+) samples", line) + if m: + num_samples = int(m.group("num_samples")) + duration = round(num_samples / sample_rate, 6) + else: + raise SoundFileError(file_path, "Could not parse number of samples") break sample_rate_string = "" if enforce_sample_rate is not None: diff --git a/montreal_forced_aligner/dictionary/mixins.py b/montreal_forced_aligner/dictionary/mixins.py index a1a1070a..f1f95395 100644 --- a/montreal_forced_aligner/dictionary/mixins.py +++ b/montreal_forced_aligner/dictionary/mixins.py @@ -11,8 +11,8 @@ from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple from montreal_forced_aligner.abc import DatabaseMixin -from montreal_forced_aligner.data import PhoneSetType, PhoneType -from montreal_forced_aligner.db import Phone +from montreal_forced_aligner.data import PhoneSetType, PhoneType, WordType +from montreal_forced_aligner.db import Phone, Word from montreal_forced_aligner.helper import mfa_open if TYPE_CHECKING: @@ -838,6 +838,28 @@ def __init__(self, **kwargs): self._disambiguation_symbols_int_path = None self._phones_dir = None self._lexicon_fst_paths = {} + self._num_words = None + self._num_speech_words = None + + @property + def num_words(self) -> int: + """Number of words (including OOVs and special symbols) in the dictionary""" + if self._num_words is None: + with self.session() as session: + self._num_words = session.query(Word).count() + return self._num_words + + @property + def num_speech_words(self) -> int: + """Number of speech words in the dictionary""" + if self._num_speech_words is None: + with self.session() as session: + self._num_speech_words = ( + session.query(Word) + .filter(Word.word_type.in_([WordType.speech, WordType.clitic])) + .count() + ) + return self._num_speech_words @property def word_boundary_int_path(self) -> Path: diff --git a/montreal_forced_aligner/dictionary/multispeaker.py b/montreal_forced_aligner/dictionary/multispeaker.py index 0ddcd148..aa80ad4c 100644 --- a/montreal_forced_aligner/dictionary/multispeaker.py +++ b/montreal_forced_aligner/dictionary/multispeaker.py @@ -166,7 +166,7 @@ def dictionary_base_names(self) -> Dict[int, str]: self._dictionary_base_names[d_id] = base_name return self._dictionary_base_names - def word_mapping(self, dictionary_id: int = 1) -> Dict[str, int]: + def word_mapping(self, dictionary_id: int = None) -> Dict[str, int]: """ Get the word mapping for a specified dictionary id @@ -180,6 +180,8 @@ def word_mapping(self, dictionary_id: int = 1) -> Dict[str, int]: dict[str, int] Mapping from words to their integer IDs for Kaldi processing """ + if dictionary_id is None: + dictionary_id = self._default_dictionary_id if dictionary_id not in self._words_mappings: self._words_mappings[dictionary_id] = {} with self.session() as session: @@ -1232,29 +1234,95 @@ def export_trained_rules(self, output_directory: str) -> None: with mfa_open(output_rules_path, "w") as f: yaml.dump(dict(dialectal_rules), f, Dumper=yaml.Dumper, allow_unicode=True) - def export_lexicon( + def add_words( + self, new_word_data: typing.List[typing.Dict[str, typing.Any]], dictionary_id: int = None + ) -> None: + """ + Add word data to a dictionary in the form exported from + 
diff --git a/montreal_forced_aligner/dictionary/mixins.py b/montreal_forced_aligner/dictionary/mixins.py
index a1a1070a..f1f95395 100644
--- a/montreal_forced_aligner/dictionary/mixins.py
+++ b/montreal_forced_aligner/dictionary/mixins.py
@@ -11,8 +11,8 @@
 from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple
 
 from montreal_forced_aligner.abc import DatabaseMixin
-from montreal_forced_aligner.data import PhoneSetType, PhoneType
-from montreal_forced_aligner.db import Phone
+from montreal_forced_aligner.data import PhoneSetType, PhoneType, WordType
+from montreal_forced_aligner.db import Phone, Word
 from montreal_forced_aligner.helper import mfa_open
 
 if TYPE_CHECKING:
@@ -838,6 +838,28 @@ def __init__(self, **kwargs):
         self._disambiguation_symbols_int_path = None
         self._phones_dir = None
         self._lexicon_fst_paths = {}
+        self._num_words = None
+        self._num_speech_words = None
+
+    @property
+    def num_words(self) -> int:
+        """Number of words (including OOVs and special symbols) in the dictionary"""
+        if self._num_words is None:
+            with self.session() as session:
+                self._num_words = session.query(Word).count()
+        return self._num_words
+
+    @property
+    def num_speech_words(self) -> int:
+        """Number of speech words in the dictionary"""
+        if self._num_speech_words is None:
+            with self.session() as session:
+                self._num_speech_words = (
+                    session.query(Word)
+                    .filter(Word.word_type.in_([WordType.speech, WordType.clitic]))
+                    .count()
+                )
+        return self._num_speech_words
 
     @property
     def word_boundary_int_path(self) -> Path:
diff --git a/montreal_forced_aligner/dictionary/multispeaker.py b/montreal_forced_aligner/dictionary/multispeaker.py
index 0ddcd148..aa80ad4c 100644
--- a/montreal_forced_aligner/dictionary/multispeaker.py
+++ b/montreal_forced_aligner/dictionary/multispeaker.py
@@ -166,7 +166,7 @@ def dictionary_base_names(self) -> Dict[int, str]:
                 self._dictionary_base_names[d_id] = base_name
         return self._dictionary_base_names
 
-    def word_mapping(self, dictionary_id: int = 1) -> Dict[str, int]:
+    def word_mapping(self, dictionary_id: int = None) -> Dict[str, int]:
         """
         Get the word mapping for a specified dictionary id
 
@@ -180,6 +180,8 @@ def word_mapping(self, dictionary_id: int = 1) -> Dict[str, int]:
         dict[str, int]
             Mapping from words to their integer IDs for Kaldi processing
         """
+        if dictionary_id is None:
+            dictionary_id = self._default_dictionary_id
         if dictionary_id not in self._words_mappings:
             self._words_mappings[dictionary_id] = {}
             with self.session() as session:
@@ -1232,29 +1234,95 @@ def export_trained_rules(self, output_directory: str) -> None:
             with mfa_open(output_rules_path, "w") as f:
                 yaml.dump(dict(dialectal_rules), f, Dumper=yaml.Dumper, allow_unicode=True)
 
-    def export_lexicon(
+    def add_words(
+        self, new_word_data: typing.List[typing.Dict[str, typing.Any]], dictionary_id: int = None
+    ) -> None:
+        """
+        Add word data to a dictionary in the form exported from
+        :meth:`~montreal_forced_aligner.dictionary.multispeaker.MultispeakerDictionaryMixin.words_for_export`
+
+        Parameters
+        ----------
+        new_word_data: list[dict[str,Any]]
+            Word data to add
+        dictionary_id: int, optional
+            Dictionary id to add words to, defaults to the default dictionary
+        """
+        if dictionary_id is None:
+            dictionary_id = self._default_dictionary_id
+        word_mapping = {}
+        pronunciation_mapping = []
+        word_index = self.get_next_primary_key(Word)
+        pronunciation_index = self.get_next_primary_key(Pronunciation)
+        with self.session() as session:
+            word_mapping_index = (
+                session.query(sqlalchemy.func.max(Word.mapping_id))
+                .filter(Word.dictionary_id == dictionary_id)
+                .scalar()
+                + 1
+            )
+            for data in new_word_data:
+                word = data["word"]
+                if word in self.word_mapping(dictionary_id):
+                    continue
+                if word not in word_mapping:
+                    word_mapping[word] = {
+                        "id": word_index,
+                        "mapping_id": word_mapping_index,
+                        "word": word,
+                        "word_type": WordType.speech,
+                        "count": 0,
+                        "dictionary_id": dictionary_id,
+                    }
+                    word_index += 1
+                    word_mapping_index += 1
+                phones = data["pronunciation"]
+                d = {
+                    "id": pronunciation_index,
+                    "base_pronunciation_id": pronunciation_index,
+                    "word_id": word_mapping[word]["id"],
+                    "pronunciation": phones,
+                }
+                pronunciation_index += 1
+                if "probability" in data and data["probability"] is not None:
+                    d["probability"] = data["probability"]
+                    d["silence_after_probability"] = data["silence_after_probability"]
+                    d["silence_before_correction"] = data["silence_before_correction"]
+                    d["non_silence_before_correction"] = data["non_silence_before_correction"]
+
+                pronunciation_mapping.append(d)
+            self._num_speech_words = None
+            session.bulk_insert_mappings(Word, list(word_mapping.values()))
+            session.flush()
+            session.bulk_insert_mappings(Pronunciation, pronunciation_mapping)
+            session.commit()
+
+    def words_for_export(
         self,
-        dictionary_id: int,
-        path: Path,
+        dictionary_id: int = None,
         write_disambiguation: typing.Optional[bool] = False,
         probability: typing.Optional[bool] = False,
-    ) -> None:
+    ) -> typing.List[typing.Dict[str, typing.Any]]:
         """
-        Export pronunciation dictionary to a text file
+        Generate exportable pronunciations
 
         Parameters
         ----------
-        path: :class:`~pathlib.Path`
-            Path to save dictionary
+        dictionary_id: int, optional
+            Dictionary id to export, defaults to the default dictionary
         write_disambiguation: bool, optional
             Flag for whether to include disambiguation information
         probability: bool, optional
             Flag for whether to include probabilities
-        silence_probabilities: bool, optional
-            Flag for whether to include per pronunciation silence probabilities, only valid
-            when ``probability`` is set to True
+
+        Returns
+        -------
+        list[dict[str,Any]]
+            List of pronunciations as dictionaries
         """
-        with mfa_open(path, "w") as f, self.session() as session:
+        if dictionary_id is None:
+            dictionary_id = self._default_dictionary_id
+        with self.session() as session:
             columns = [Word.word, Pronunciation.pronunciation]
             if write_disambiguation:
                 columns.append(Pronunciation.disambiguation)
@@ -1273,8 +1341,32 @@
                 )
                 .order_by(Word.word)
             )
-            for row in pronunciations:
-                data = row.pronunciation_data
+            data = [row for row, in pronunciations]
+        return data
+
+    def export_lexicon(
+        self,
+        dictionary_id: int,
+        path: Path,
+        write_disambiguation: typing.Optional[bool] = False,
+        probability: typing.Optional[bool] = False,
+    ) -> None:
+        """
+        Export pronunciation dictionary to a text file
+
+        Parameters
+        ----------
+        dictionary_id: int
+            Id of the dictionary to export
+        path: :class:`~pathlib.Path`
+            Path to save dictionary
+        write_disambiguation: bool, optional
+            Flag for whether to include disambiguation information
+        probability: bool, optional
+            Flag for whether to include probabilities
+        """
+        with mfa_open(path, "w") as f:
+            for data in self.words_for_export(dictionary_id, write_disambiguation, probability):
                 phones = data["pronunciation"]
                 if write_disambiguation and data["disambiguation"] is not None:
                     phones += f" #{data['disambiguation']}"
diff --git a/montreal_forced_aligner/exceptions.py b/montreal_forced_aligner/exceptions.py
index dce0a5d0..e2affd2b 100644
--- a/montreal_forced_aligner/exceptions.py
+++ b/montreal_forced_aligner/exceptions.py
@@ -268,6 +268,22 @@ class DictionaryError(MFAError):
     pass
 
 
+class PhoneMismatchError(DictionaryError):
+    """
+    Exception class for when a dictionary receives a new phone
+
+    Parameters
+    ----------
+    missing_phones: Collection[str]
+        Phones that are not in the base dictionary
+    """
+
+    def __init__(self, missing_phones: Collection[str]):
+        super().__init__("There were extra phones that were not in the dictionary: ")
+        missing_phones = [f"{self.printer.error_text(x)}" for x in sorted(missing_phones)]
+        self.message_lines.append(comma_join(missing_phones))
+
+
 class NoDefaultSpeakerDictionaryError(DictionaryError):
     """
     Exception class for errors in creating MultispeakerDictionary objects
diff --git a/montreal_forced_aligner/g2p/generator.py b/montreal_forced_aligner/g2p/generator.py
index 1114eb51..64d1efb9 100644
--- a/montreal_forced_aligner/g2p/generator.py
+++ b/montreal_forced_aligner/g2p/generator.py
@@ -20,7 +20,9 @@
 
 from montreal_forced_aligner.abc import DatabaseMixin, TopLevelMfaWorker
 from montreal_forced_aligner.config import GLOBAL_CONFIG
-from montreal_forced_aligner.corpus.text_corpus import TextCorpusMixin
+from montreal_forced_aligner.corpus.text_corpus import DictionaryTextCorpusMixin, TextCorpusMixin
+from montreal_forced_aligner.data import WordType
+from montreal_forced_aligner.db import Word
 from montreal_forced_aligner.exceptions import PyniniGenerationError
 from montreal_forced_aligner.g2p.mixins import G2PTopLevelMixin
 from montreal_forced_aligner.helper import comma_join, mfa_open, score_g2p
@@ -759,3 +761,46 @@ def words_to_g2p(self) -> List[str]:
         if not self.include_bracketed:
             word_list = [x for x in word_list if not self.check_bracketed(x)]
         return word_list
+
+
+class PyniniDictionaryCorpusGenerator(
+    PyniniGenerator, DictionaryTextCorpusMixin, TopLevelMfaWorker
+):
+    """
+    Top-level worker for generating pronunciations for OOV words in a corpus with a Pynini G2P model
+
+    See Also
+    --------
+    :class:`~montreal_forced_aligner.g2p.generator.PyniniGenerator`
+        For Pynini G2P generation parameters
+    :class:`~montreal_forced_aligner.corpus.text_corpus.DictionaryTextCorpusMixin`
+        For corpus and dictionary parsing parameters
+    :class:`~montreal_forced_aligner.abc.TopLevelMfaWorker`
+        For top-level parameters
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self._word_list = None
+
+    def setup(self) -> None:
+        """Set up the pronunciation generator"""
+        if self.initialized:
+            return
+        self.load_corpus()
+        super().setup()
+        self.g2p_model.validate(self.words_to_g2p)
+        self.initialized = True
+
+    @property
+    def words_to_g2p(self) -> List[str]:
+        """Words to produce pronunciations"""
+        if self._word_list is None:
+            with self.session() as session:
+                query = (
+                    session.query(Word.word)
+                    .filter(Word.word_type == WordType.oov, Word.word != self.oov_word)
+                    .order_by(Word.word)
+                )
+                self._word_list = [x for x, in query]
+        return self._word_list
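Together, ``words_for_export`` and ``add_words`` give the programmatic path that the ``mfa model add_words`` command wraps. A minimal sketch of that flow, assuming an initialized MFA environment (the CLI calls ``check_databases()`` first) and two illustrative dictionary paths:

.. code-block:: python

   # Minimal sketch of the add_words flow from add_words_cli above; the two
   # dictionary paths are illustrative. It mirrors the phone check that raises
   # PhoneMismatchError before any words are merged.
   from montreal_forced_aligner.dictionary.multispeaker import MultispeakerDictionary
   from montreal_forced_aligner.exceptions import PhoneMismatchError

   base = MultispeakerDictionary(dictionary_path="~/mfa_data/base_dictionary.txt")
   base.dictionary_setup()
   new = MultispeakerDictionary(dictionary_path="~/mfa_data/g2pped_oovs.txt")
   new.dictionary_setup()

   # Refuse to merge pronunciations that introduce unknown phones.
   extra_phones = new.non_silence_phones - base.non_silence_phones
   if extra_phones:
       raise PhoneMismatchError(extra_phones)

   # Merge the new entries and write the combined lexicon back out.
   base.add_words(new.words_for_export(probability=True))
   base.export_lexicon(
       base._default_dictionary_id,
       base.dictionary_model.path,
       probability=True,
   )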
diff --git a/montreal_forced_aligner/language_modeling/trainer.py b/montreal_forced_aligner/language_modeling/trainer.py index 690b610e..07bed55a 100644 --- a/montreal_forced_aligner/language_modeling/trainer.py +++ b/montreal_forced_aligner/language_modeling/trainer.py @@ -352,9 +352,6 @@ def evaluate(self) -> None: if m: perplexity = float(m.group("perplexity")) self.large_perplexity = perplexity - self.num_sentences = num_sentences - self.num_words = num_words - self.num_oovs = num_oovs logger.info(f"{num_sentences}, {num_words}, {num_oovs}") logger.info(f"Perplexity of large model: {perplexity}") diff --git a/montreal_forced_aligner/tokenization/trainer.py b/montreal_forced_aligner/tokenization/trainer.py index fbc3e116..5aba586e 100644 --- a/montreal_forced_aligner/tokenization/trainer.py +++ b/montreal_forced_aligner/tokenization/trainer.py @@ -308,14 +308,17 @@ def meta(self) -> MetaDict: def train(self) -> None: if os.path.exists(self.fst_path): + self.finalize_training() return super().train() def initialize_training(self) -> None: """Initialize training tokenizer model""" - logger.info("Initializing training...") self.create_new_current_workflow(WorkflowType.tokenizer_training) + if self.fst_path.exists(): + return + logger.info("Initializing training...") with self.session() as session: session.query(M2M2Job).delete() session.query(M2MSymbol).delete() diff --git a/tests/conftest.py b/tests/conftest.py index adbf347a..68d196f4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -777,15 +777,6 @@ def groups_path(config_directory): return config_directory.joinpath("test_groups.yaml") -@pytest.fixture(scope="session") -def speaker_dictionary_path(basic_dict_path, acoustic_dict_path, generated_dir): - data = {"default": acoustic_dict_path, "sickmichael": basic_dict_path} - speaker_dict_path = generated_dir.joinpath("test_basic_acoustic_dicts.yaml") - with mfa_open(speaker_dict_path, "w") as f: - yaml.dump(data, f, Dumper=yaml.Dumper, allow_unicode=True) - return speaker_dict_path - - @pytest.fixture(scope="session") def mono_output_directory(generated_dir): return generated_dir.joinpath("mono_output") diff --git a/tests/data/dictionaries/test_acoustic.txt b/tests/data/dictionaries/test_acoustic.txt index a687430b..587abe26 100644 --- a/tests/data/dictionaries/test_acoustic.txt +++ b/tests/data/dictionaries/test_acoustic.txt @@ -40,3 +40,6 @@ should sh uh d be b iy all aa l thanks th ae ng k s +just jh ah s t +sound s aw n d +environment eh n v ay r ah n m eh n t diff --git a/tests/test_commandline_g2p.py b/tests/test_commandline_g2p.py index 7e7ea3ff..9fd1deaa 100644 --- a/tests/test_commandline_g2p.py +++ b/tests/test_commandline_g2p.py @@ -39,7 +39,44 @@ def test_generate_pretrained( check_databases() d = MultispeakerDictionary(output_path) d.dictionary_setup() - assert len(d.word_mapping(list(d.dictionary_lookup.values())[0])) > 0 + assert d.num_speech_words > 0 + + +def test_generate_pretrained_dictionary( + english_g2p_model, combined_corpus_dir, english_dictionary, temp_dir, generated_dir, db_setup +): + output_path = generated_dir.joinpath("filtered_g2p_out.txt") + command = [ + "g2p", + combined_corpus_dir, + english_g2p_model, + output_path, + "-t", + os.path.join(temp_dir, "dict_g2p_cli"), + "-q", + "--clean", + "--dictionary_path", + english_dictionary, + "--num_pronunciations", + "1", + "--use_mp", + "False", + ] + command = [str(x) for x in command] + result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke( + mfa_cli, command, 
catch_exceptions=True + ) + print(result.stdout) + print(result.stderr) + if result.exception: + print(result.exc_info) + raise result.exception + assert not result.return_value + assert os.path.exists(output_path) + check_databases() + d = MultispeakerDictionary(output_path) + d.dictionary_setup() + assert d.num_speech_words == 2 def test_generate_pretrained_threshold( @@ -73,7 +110,7 @@ def test_generate_pretrained_threshold( d = MultispeakerDictionary(output_path) d.dictionary_setup() - assert len(d.word_mapping(list(d.dictionary_lookup.values())[0])) > 0 + assert d.num_speech_words > 0 def test_train_g2p( @@ -175,7 +212,7 @@ def test_generate_dict( check_databases() d = MultispeakerDictionary(dictionary_path=g2p_basic_output) d.dictionary_setup() - assert len(d.word_mapping(list(d.dictionary_lookup.values())[0])) > 0 + assert d.num_speech_words > 0 def test_generate_dict_phonetisaurus( @@ -213,7 +250,7 @@ def test_generate_dict_phonetisaurus( check_databases() d = MultispeakerDictionary(dictionary_path=g2p_basic_phonetisaurus_output) d.dictionary_setup() - assert len(d.word_mapping(list(d.dictionary_lookup.values())[0])) > 0 + assert d.num_speech_words > 0 def test_generate_dict_text_only( @@ -252,7 +289,7 @@ def test_generate_dict_text_only( check_databases() d = MultispeakerDictionary(dictionary_path=g2p_basic_output) d.dictionary_setup() - assert len(d.word_mapping(list(d.dictionary_lookup.values())[0])) > 0 + assert d.num_speech_words > 0 def test_generate_dict_textgrid( @@ -291,4 +328,4 @@ def test_generate_dict_textgrid( check_databases() d = MultispeakerDictionary(dictionary_path=output_file) d.dictionary_setup() - assert len(d.word_mapping(list(d.dictionary_lookup.values())[0])) > 0 + assert d.num_speech_words > 0 diff --git a/tests/test_commandline_model.py b/tests/test_commandline_model.py index d98e129d..57aa1b2a 100644 --- a/tests/test_commandline_model.py +++ b/tests/test_commandline_model.py @@ -4,7 +4,8 @@ import pytest from montreal_forced_aligner.command_line.mfa import mfa_cli -from montreal_forced_aligner.exceptions import RemoteModelNotFoundError +from montreal_forced_aligner.dictionary import MultispeakerDictionary +from montreal_forced_aligner.exceptions import PhoneMismatchError, RemoteModelNotFoundError from montreal_forced_aligner.models import AcousticModel, DictionaryModel, G2PModel, ModelManager @@ -140,6 +141,88 @@ def test_inspect_model(): assert not result.return_value +def test_add_pronunciations( + hindi_dict_path, japanese_dict_path, basic_dict_path, acoustic_dict_path +): + command = [ + "model", + "save", + "dictionary", + str(hindi_dict_path), + "--name", + "hindi", + "--overwrite", + ] + result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke( + mfa_cli, command, catch_exceptions=True + ) + print(result.stdout) + print(result.stderr) + if result.exception: + print(result.exc_info) + raise result.exception + assert not result.return_value + assert os.path.exists(DictionaryModel.get_pretrained_path("hindi")) + + with pytest.raises(PhoneMismatchError): + command = [ + "model", + "add_words", + "hindi", + str(japanese_dict_path), + ] + result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke( + mfa_cli, command, catch_exceptions=True + ) + print(result.stdout) + print(result.stderr) + if result.exception: + print(result.exc_info) + raise result.exception + assert not result.return_value + command = [ + "model", + "save", + "dictionary", + str(acoustic_dict_path), + "--name", + "acoustic", + "--overwrite", + ] + 
result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke( + mfa_cli, command, catch_exceptions=True + ) + print(result.stdout) + print(result.stderr) + if result.exception: + print(result.exc_info) + raise result.exception + assert not result.return_value + assert os.path.exists(DictionaryModel.get_pretrained_path("acoustic")) + command = [ + "model", + "add_words", + "acoustic", + str(basic_dict_path), + ] + result = click.testing.CliRunner(mix_stderr=False, echo_stdin=True).invoke( + mfa_cli, command, catch_exceptions=True + ) + print(result.stdout) + print(result.stderr) + if result.exception: + print(result.exc_info) + raise result.exception + assert not result.return_value + + pretrained_acoustic_path = DictionaryModel.get_pretrained_path("acoustic") + assert pretrained_acoustic_path.exists() + d = MultispeakerDictionary(pretrained_acoustic_path) + d.dictionary_setup() + + assert "hopefully" in d.word_mapping() + + def test_list_model(): command = [ "model", diff --git a/tests/test_corpus.py b/tests/test_corpus.py index 8c7e7d74..fb71ab0d 100644 --- a/tests/test_corpus.py +++ b/tests/test_corpus.py @@ -398,11 +398,6 @@ def test_weird_words(weird_words_dir, generated_dir, basic_dict_path, global_con "ajfish", "asds-asda", "sdasd", - "[me_really]", - "[me____really]", - "[me_really]", - "", - "<_s>", } ) assert (