From da17a7712c946ab1954229dcf6f4b7dd29e89952 Mon Sep 17 00:00:00 2001
From: Chris Granger
Date: Tue, 15 Oct 2024 10:10:43 -0600
Subject: [PATCH] remove youtokentome

---
 .../collections/common/tokenizers/__init__.py |  1 -
 .../tokenizers/youtokentome_tokenizer.py      | 77 -------------------
 2 files changed, 78 deletions(-)
 delete mode 100644 nemo/collections/common/tokenizers/youtokentome_tokenizer.py

diff --git a/nemo/collections/common/tokenizers/__init__.py b/nemo/collections/common/tokenizers/__init__.py
index 7dc3387c005f..750398670d0c 100644
--- a/nemo/collections/common/tokenizers/__init__.py
+++ b/nemo/collections/common/tokenizers/__init__.py
@@ -21,4 +21,3 @@
 from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer
 from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
 from nemo.collections.common.tokenizers.word_tokenizer import WordTokenizer
-from nemo.collections.common.tokenizers.youtokentome_tokenizer import YouTokenToMeTokenizer
diff --git a/nemo/collections/common/tokenizers/youtokentome_tokenizer.py b/nemo/collections/common/tokenizers/youtokentome_tokenizer.py
deleted file mode 100644
index 77375d38558a..000000000000
--- a/nemo/collections/common/tokenizers/youtokentome_tokenizer.py
+++ /dev/null
@@ -1,77 +0,0 @@
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from pathlib import Path
-
-import youtokentome as yttm
-
-from nemo.collections.common.tokenizers import TokenizerSpec
-
-__all__ = ['YouTokenToMeTokenizer']
-
-
-class YouTokenToMeTokenizer(TokenizerSpec):
-    def __init__(self, model_path, bpe_dropout=0.0, legacy=False, r2l=False):
-        model_path = Path(model_path).expanduser()
-        self.tokenizer = yttm.BPE(model=str(model_path))
-        self.vocab_size = len(self.tokenizer.vocab())
-        self.special_tokens = self.tokens_to_ids(["<PAD>", "<UNK>", "<BOS>", "<EOS>"])
-        self.bpe_dropout = bpe_dropout
-        self.legacy = legacy
-        self.r2l = r2l
-
-    def text_to_tokens(self, text):
-        return self.tokenizer.encode(
-            text, output_type=yttm.OutputType.SUBWORD, dropout_prob=self.bpe_dropout, reverse=self.r2l
-        )
-
-    def tokens_to_text(self, tokens):
-        return self.ids_to_text(self.tokens_to_ids(tokens))
-
-    def text_to_ids(self, text):
-        return self.tokenizer.encode(
-            text, output_type=yttm.OutputType.ID, dropout_prob=self.bpe_dropout, reverse=self.r2l
-        )
-
-    def ids_to_text(self, ids):
-        ids_ = [id_ for id_ in ids if id_ not in self.special_tokens]
-        if self.r2l:
-            ids_ = ids_[::-1]
-        return self.tokenizer.decode([ids_])[0]
-
-    def tokens_to_ids(self, tokens):
-        return [self.tokenizer.subword_to_id(token) for token in tokens]
-
-    def ids_to_tokens(self, ids):
-        if self.legacy:
-            ids_ = [id_ for id_ in ids if id_ not in self.special_tokens]
-        else:
-            ids_ = ids
-        return [self.tokenizer.id_to_subword(id_) for id_ in ids_]
-
-    @property
-    def pad_id(self):
-        return self.tokenizer.subword_to_id("<PAD>")
-
-    @property
-    def bos_id(self):
-        return self.tokenizer.subword_to_id("<BOS>")
-
-    @property
-    def eos_id(self):
-        return self.tokenizer.subword_to_id("<EOS>")
-
-    @property
-    def unk_id(self):
-        return self.tokenizer.subword_to_id("<UNK>")
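
With YouTokenToMeTokenizer removed, SentencePieceTokenizer remains the BPE
tokenizer exported from nemo.collections.common.tokenizers (its import is
untouched in the __init__.py hunk above). Below is a minimal migration
sketch, not part of the patch: it assumes a plain-text corpus at corpus.txt
and an 8192-token vocabulary, both hypothetical placeholders, and uses only
the standard sentencepiece training API plus the shared TokenizerSpec
methods (text_to_ids, ids_to_text).

    import sentencepiece as spm

    from nemo.collections.common.tokenizers import SentencePieceTokenizer

    # Train a SentencePiece BPE model, roughly playing the role that
    # yttm.BPE.train() played for YouTokenToMe models.
    spm.SentencePieceTrainer.train(
        input="corpus.txt",        # hypothetical corpus path
        model_prefix="tokenizer",  # writes tokenizer.model / tokenizer.vocab
        vocab_size=8192,           # placeholder vocabulary size
        model_type="bpe",
    )

    # Load the trained model through the TokenizerSpec interface that
    # stays in NeMo.
    tokenizer = SentencePieceTokenizer(model_path="tokenizer.model")

    ids = tokenizer.text_to_ids("hello world")
    print(tokenizer.ids_to_text(ids))  # decodes back to the input text

There is no one-to-one replacement for the removed bpe_dropout option;
SentencePiece provides similar subword regularization via sampling at encode
time (enable_sampling/alpha in the sentencepiece API), though that is a
property of the underlying library rather than something this tokenizer
wrapper is guaranteed to expose.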