Add null tokenizer (#11789) (#11802)
* Add null tokenizer

* Apply isort and black reformatting

* cleanup

---------

Signed-off-by: Sangkug Lym <[email protected]>
Signed-off-by: erhoo82 <[email protected]>
Co-authored-by: Sangkug Lym <[email protected]>
Co-authored-by: erhoo82 <[email protected]>
3 people authored Jan 10, 2025
1 parent 074b23f commit 67f1ab5
Showing 2 changed files with 81 additions and 0 deletions.
75 changes: 75 additions & 0 deletions nemo/collections/common/tokenizers/null_tokenizer.py
@@ -0,0 +1,75 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from megatron.core.datasets.megatron_tokenizer import MegatronTokenizer


class NullTokenizer(MegatronTokenizer):
"""
Synthetic tokenizer for performance benchmarking and debugging
Args:
vocab_size: vocabulary size for embedding
"""

def __init__(self, vocab_size):
super().__init__(None, vocab_size=vocab_size)
self._vocab_size_without_eod = int(vocab_size)
self._eod_id = self._vocab_size_without_eod

def tokenize(self, text):
return [int(x) for x in text.split(' ')]

def detokenize(self, ids):
text = [str(x) for x in ids]
return ' '.join(text)

def offsets(self, ids: list[int], text: str) -> list[int]:
offsets, start_idx = [], 0
for id_ in ids:
offsets.append(start_idx)
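            # advance past this id's digits plus the single-space separator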
start_idx += 1 + len(str(id_))
return offsets

@property
def vocab_size(self):
return self._vocab_size_without_eod + 1

@property
def vocab(self):
raise NotImplementedError

@property
def inv_vocab(self):
raise NotImplementedError

@property
def cls(self):
return -1

@property
def sep(self):
return -1

@property
def mask(self):
return -1

@property
def eod(self):
return self._eod_id

@property
def additional_special_tokens_ids(self):
return None
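
A quick round-trip sketch of the tokenizer above (assumes megatron-core is installed; the values are illustrative):

from nemo.collections.common.tokenizers.null_tokenizer import NullTokenizer

tokenizer = NullTokenizer(vocab_size=256)

ids = tokenizer.tokenize("5 17 3")      # [5, 17, 3]
text = tokenizer.detokenize(ids)        # "5 17 3"
print(tokenizer.offsets(ids, text))     # [0, 2, 5]
print(tokenizer.eod)                    # 256: the first id past the base vocabulary
print(tokenizer.vocab_size)             # 257: base size plus the EOD token
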
6 changes: 6 additions & 0 deletions nemo/collections/nlp/modules/common/tokenizer_utils.py
@@ -156,6 +156,7 @@ def get_nmt_tokenizer(
delimiter: Optional[str] = None,
trust_remote_code: Optional[bool] = False,
chat_template: Optional[Dict] = None,
vocab_size: Optional[int] = None,
):
"""
Args:
@@ -239,6 +240,11 @@ def get_nmt_tokenizer(
from nemo.collections.common.tokenizers.tiktoken_tokenizer import TiktokenTokenizer

return TiktokenTokenizer(vocab_file=vocab_file)
elif library == 'null':
assert vocab_size is not None
from nemo.collections.common.tokenizers.null_tokenizer import NullTokenizer

return NullTokenizer(vocab_size)
else:
raise NotImplementedError(
'Currently we only support "huggingface", "sentencepiece", "megatron", and "byte-level" tokenizer'
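
The new branch is reached through get_nmt_tokenizer; a minimal sketch of the call (the vocab_size value is illustrative):

from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer

# vocab_size is required for the 'null' library; the assert above fails without it.
tokenizer = get_nmt_tokenizer(library='null', vocab_size=131072)
print(tokenizer.vocab_size)  # 131073: base vocabulary plus the EOD token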
