Commit 7ef209b

Merge pull request #584 from sillsdev/issue365
Solve Issues #365 and #390
AmeWenJ authored Nov 9, 2024
2 parents: 31cb9b7 + 6daa28e, commit 7ef209b
Showing 2 changed files with 4 additions and 3 deletions.
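
Both files gate work on the same setting, self.data["tokenize"], i.e. the tokenize flag in the data section of the experiment configuration. A hypothetical parsed config fragment showing where that flag sits (only the "tokenize" key comes from this commit; everything else here is an illustrative assumption):

# Hypothetical parsed "data:" section of an experiment config.
# Only the "tokenize" key appears in this commit; the rest is illustrative.
data = {
    "tokenize": False,  # when false, vocab building and tokenization stats are skipped
    # ... corpus pairs, terms settings, etc. ...
}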
silnlp/nmt/config.py (5 changes: 3 additions & 2 deletions)
@@ -493,7 +493,8 @@ def preprocess(self, stats: bool, force_align: bool = False) -> None:
                 LOGGER.error(f"The source file {str(file)} does not exist.")
                 return

-        self._build_vocabs(stats)
+        if self.data["tokenize"]:
+            self._build_vocabs(stats)
         tokenizer = self.create_tokenizer()
         self._build_corpora(tokenizer, stats, force_align)
         LOGGER.info("Preprocessing completed")
@@ -557,7 +558,7 @@ def _build_corpora(self, tokenizer: Tokenizer, stats: bool, force_align: bool) -
             dict_count = self._write_dictionary(tokenizer, src_terms_files, trg_terms_files)
             LOGGER.info(f"dictionary size: {dict_count}")

-        if stats:
+        if stats and self.data["tokenize"]:
             self._calculate_tokenization_stats()

         return train_count
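
Taken together, the two config.py hunks above make vocabulary building and tokenization statistics conditional on that flag while the rest of preprocessing still runs. A minimal, self-contained sketch of the resulting control flow (the function names mirror the diff, but the bodies are stubs, not the real SILNLP implementation):

# Simplified, hypothetical sketch of the guard added in config.py;
# the real SILNLP Config class does much more than this.

def _build_vocabs(stats: bool) -> None:
    print(f"building vocabs (stats={stats})")

def _calculate_tokenization_stats() -> None:
    print("calculating tokenization stats")

def _build_corpora(data: dict, stats: bool) -> int:
    train_count = 0
    # ... corpus writing elided ...
    if stats and data["tokenize"]:        # token-level stats require tokenization
        _calculate_tokenization_stats()
    return train_count

def preprocess(data: dict, stats: bool) -> None:
    if data["tokenize"]:                  # vocab building only when tokenization is enabled
        _build_vocabs(stats)
    _build_corpora(data, stats)
    print("Preprocessing completed")

preprocess({"tokenize": False}, stats=True)  # guarded steps are skipped
preprocess({"tokenize": True}, stats=True)   # guarded steps run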
silnlp/nmt/hugging_face_config.py (2 changes: 1 addition & 1 deletion)
@@ -540,7 +540,7 @@ def _build_vocabs(self, stats: bool = False) -> None:
             ["Target", 0],
         ]

-        if stats:
+        if stats and self.data["tokenize"]:
             stats_columns = pd.MultiIndex.from_tuples(
                 [
                     (" ", "Translation Side"),
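
For context, the guarded block in hugging_face_config.py assembles a pandas table of tokenization statistics with a two-level column index; skipping it avoids computing statistics that are meaningless when tokenization is disabled. A rough illustration of that kind of MultiIndex table (only the (" ", "Translation Side") column and the ["Target", 0] row appear in the diff; the other columns and numbers are invented here):

import pandas as pd

# Illustrative only: a two-level column index like the one the guarded code builds.
stats_columns = pd.MultiIndex.from_tuples(
    [
        (" ", "Translation Side"),
        ("Tokens", "Count"),        # hypothetical column
        ("Tokens", "Per Segment"),  # hypothetical column
    ]
)

rows = [
    ["Source", 123456, 24.1],  # dummy numbers
    ["Target", 130002, 25.3],
]
print(pd.DataFrame(rows, columns=stats_columns))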
