diff --git a/silnlp/nmt/config.py b/silnlp/nmt/config.py
index 910b9345..1d892ef0 100644
--- a/silnlp/nmt/config.py
+++ b/silnlp/nmt/config.py
@@ -493,7 +493,8 @@ def preprocess(self, stats: bool, force_align: bool = False) -> None:
                 LOGGER.error(f"The source file {str(file)} does not exist.")
                 return
 
-        self._build_vocabs(stats)
+        if self.data["tokenize"]:
+            self._build_vocabs(stats)
         tokenizer = self.create_tokenizer()
         self._build_corpora(tokenizer, stats, force_align)
         LOGGER.info("Preprocessing completed")
@@ -557,7 +558,7 @@ def _build_corpora(self, tokenizer: Tokenizer, stats: bool, force_align: bool) -
         dict_count = self._write_dictionary(tokenizer, src_terms_files, trg_terms_files)
         LOGGER.info(f"dictionary size: {dict_count}")
 
-        if stats:
+        if stats and self.data["tokenize"]:
             self._calculate_tokenization_stats()
 
         return train_count
diff --git a/silnlp/nmt/hugging_face_config.py b/silnlp/nmt/hugging_face_config.py
index 6e53f8bf..1d8b9be0 100644
--- a/silnlp/nmt/hugging_face_config.py
+++ b/silnlp/nmt/hugging_face_config.py
@@ -540,7 +540,7 @@ def _build_vocabs(self, stats: bool = False) -> None:
             ["Target", 0],
         ]
 
-        if stats:
+        if stats and self.data["tokenize"]:
             stats_columns = pd.MultiIndex.from_tuples(
                 [
                     (" ", "Translation Side"),