Merge branch 'icu_build_sources' of https://github.com/mryzhov/openvino_tokenizers_public into icu_build_sources
openvinotoolkit · Jan 17, 2025 · 788ef1c · 788ef1c
2 parents d971aac + 02fc991
commit 788ef1c
Show file tree

Hide file tree

Showing 77 changed files with 18,301 additions and 17,203 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (C) 2022-2024 Intel Corporation
+# Copyright (C) 2022-2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 #
 

diff --git a/README.md b/README.md
@@ -416,13 +416,14 @@ int main(int argc, char* argv[]) {
 ## Supported Tokenizer Types
 
 | Huggingface <br/>Tokenizer Type | Tokenizer Model Type | Tokenizer | Detokenizer |
-|---------------------------------|----------------------|----------|------------|
-| Fast                            | WordPiece            | ✅        | ❌          |
-|                                 | BPE                  | ✅        | ✅          |
-|                                 | Unigram              | ❌         | ❌          |
-| Legacy                          | SentencePiece .model | ✅        | ✅          |
-| Custom                          | tiktoken             | ✅        | ✅          |
-| RWKV                            | Trie                 | ✅        | ✅          |
+|---------------------------------|----------------------|----------|-----------|
+| Fast                            | WordPiece            | ✅        | ✅          |
+|                                 | BPE                  | ✅        | ✅         |
+|                                 | Unigram              | ❌         | ❌         |
+|                                 | WordLevel*           | ✅         | ✅         |
+| Legacy                          | SentencePiece .model | ✅        | ✅         |
+| Custom                          | tiktoken             | ✅        | ✅         |
+| RWKV                            | Trie                 | ✅        | ✅         |
 
 ## Test Results
 
@@ -461,8 +462,8 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
     </tr>
     <tr>
       <td >WordPiece</td>
-      <td >98.39</td>
-      <td >747</td>
+      <td >99.34</td>
+      <td >1811</td>
     </tr>
   </tbody>
 </table>
@@ -771,43 +772,43 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
       <td >WordPiece</td>
       <td >ProsusAI/finbert</td>
       <td >100.00</td>
-      <td >109</td>
+      <td >261</td>
     </tr>
     <tr>
       <td >WordPiece</td>
       <td >bert-base-multilingual-cased</td>
       <td >100.00</td>
-      <td >109</td>
+      <td >261</td>
     </tr>
     <tr>
       <td >WordPiece</td>
       <td >cointegrated/rubert-tiny2</td>
       <td >100.00</td>
-      <td >109</td>
+      <td >261</td>
     </tr>
     <tr>
       <td >WordPiece</td>
       <td >distilbert-base-uncased-finetuned-sst-2-english</td>
       <td >100.00</td>
-      <td >109</td>
+      <td >261</td>
     </tr>
     <tr>
       <td >WordPiece</td>
       <td >google/mobilebert-uncased</td>
       <td >100.00</td>
-      <td >93</td>
+      <td >245</td>
     </tr>
     <tr>
       <td >WordPiece</td>
       <td >rasa/LaBSE</td>
-      <td >88.99</td>
-      <td >109</td>
+      <td >95.40</td>
+      <td >261</td>
     </tr>
     <tr>
       <td >WordPiece</td>
       <td >sentence-transformers/all-MiniLM-L6-v2</td>
       <td >100.00</td>
-      <td >109</td>
+      <td >261</td>
     </tr>
   </tbody>
 </table>

diff --git a/cmake/platforms.cmake b/cmake/platforms.cmake
@@ -1,5 +1,5 @@
 
-# Copyright (C) 2023-2024 Intel Corporation
+# Copyright (C) 2023-2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 #
 

diff --git a/cmake/templates/__version__.py.in b/cmake/templates/__version__.py.in
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-# Copyright (C) 2018-2024 Intel Corporation
+# Copyright (C) 2018-2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
 __version__ = "@OpenVINOTokenizers_FULL_VERSION@"
diff --git a/cmake/version.cmake b/cmake/version.cmake
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2024 Intel Corporation
+# Copyright (C) 2018-2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 #
 

diff --git a/js/README.md b/js/README.md
@@ -27,4 +27,4 @@ core.addExtension(openvinoTokenizers.path); // Add tokenizers extension
 
 [License](https://github.com/openvinotoolkit/openvino/blob/master/LICENSE)
 
-Copyright © 2018-2024 Intel Corporation
+Copyright © 2018-2025 Intel Corporation
diff --git a/pyproject.toml b/pyproject.toml
@@ -83,7 +83,7 @@ select = ["C", "E", "F", "I", "W"]
 lines-after-imports = 2
 
 [tool.bandit]
-tests = ["B301", "B302", "B303", "B304", "B305", "B306", "B308", "B310", "B311", "B312", "B313", "B314", "B315", "B316", "B317", "B318", "B319", "B320", "B321", "B323", "B324", "B401", "B402", "B403", "B404", "B405", "B406", "B407", "B408", "B409", "B410", "B411", "B412", "B413"]
+tests = ["B301", "B302", "B303", "B304", "B305", "B306", "B308", "B310", "B311", "B312", "B313", "B314", "B315", "B316", "B317", "B318", "B319", "B321", "B323", "B324", "B401", "B402", "B403", "B404", "B405", "B406", "B407", "B408", "B409", "B411", "B412", "B413"]
 skips = ["B101", "B102", "B103", "B104", "B105", "B106", "B107", "B108", "B110", "B112", "B201", "B501", "B502", "B503", "B504", "B505", "B506", "B507", "B601", "B602", "B603", "B604", "B605", "B606", "B607", "B608", "B609", "B610", "B611", "B701", "B702", "B703"]
 no_shell = ["os.execl", "os.execle", "os.execlp", "os.execlpe", "os.execv", "os.execve", "os.execvp", "os.execvpe", "os.spawnl", "os.spawnle", "os.spawnlp", "os.spawnlpe", "os.spawnv", "os.spawnve", "os.spawnvp", "os.spawnvpe", "os.startfile"]
 shell = ["os.system", "os.popen", "os.popen2", "os.popen3", "os.popen4", "popen2.popen2", "popen2.popen3", "popen2.popen4", "popen2.Popen3", "popen2.Popen4", "commands.getoutput", "commands.getstatusoutput"]

diff --git a/python/openvino_tokenizers/__init__.py b/python/openvino_tokenizers/__init__.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-# Copyright (C) 2018-2024 Intel Corporation
+# Copyright (C) 2018-2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 import functools
 import os

diff --git a/python/openvino_tokenizers/cli.py b/python/openvino_tokenizers/cli.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-# Copyright (C) 2023-2024 Intel Corporation
+# Copyright (C) 2023-2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
 from argparse import Action, ArgumentError, ArgumentParser

diff --git a/python/openvino_tokenizers/constants.py b/python/openvino_tokenizers/constants.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-# Copyright (C) 2023-2024 Intel Corporation
+# Copyright (C) 2023-2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 from enum import Enum
 

diff --git a/python/openvino_tokenizers/convert_tokenizer.py b/python/openvino_tokenizers/convert_tokenizer.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-# Copyright (C) 2018-2024 Intel Corporation
+# Copyright (C) 2018-2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
 import logging

diff --git a/python/openvino_tokenizers/hf_parser.py b/python/openvino_tokenizers/hf_parser.py
@@ -383,50 +383,36 @@ def add_padding(self, use_max_padding: bool = False) -> None:
         str,
         Callable[[Dict[str, Any]], Union[DecodingStep, List[DecodingStep]]],
     ] = {
-        "Replace": lambda decode_dict: RegexDecodingStep.parse_replace_dict(decode_dict),
+        "Replace": RegexDecodingStep.parse_replace_dict,
         "Fuse": lambda decode_dict: FuseStep(),
-        "Strip": lambda decode_dict: RegexDecodingStep.parse_strip_dict(decode_dict),
+        "Strip": RegexDecodingStep.parse_strip_dict,
         "ByteFallback": lambda decode_dict: ByteFallbackStep(),
     }
 
     def decoding(self) -> None:
         skip_tokens = parse_special_tokens(self.original_tokenizer)
+        self.pipeline.add_steps(VocabDecoderStep.from_hf_json(self.tokenizer_json, self.pipeline.vocab, list(skip_tokens), do_skip_tokens=self.skip_special_tokens))
 
-        if self.tokenizer_json["model"]["type"] == "WordLevel":
-            self.pipeline.add_steps(
-                [
-                    VocabDecoderStep(
-                        vocab=[f" {token}" for token in self.pipeline.vocab],
-                        skip_tokens=list(skip_tokens),
-                        do_skip_tokens=self.skip_special_tokens,
-                    ),
-                    FuseStep(),
-                    RegexDecodingStep.strip_forward_space(),
-                ]
-            )
-            if self.clean_up_tokenization_spaces:
-                self.pipeline.add_steps(RegexDecodingStep.clean_up_tokenization_spaces())
-            return
-        elif self.tokenizer_json["decoder"] is None or self.tokenizer_json["model"]["type"] == "WordPiece":
-            return
-
-        self.pipeline.add_steps(
-            VocabDecoderStep(skip_tokens=list(skip_tokens), do_skip_tokens=self.skip_special_tokens)
-        )
-
-        if self.tokenizer_json["decoder"]["type"] == "Sequence":
+        has_decoder = self.tokenizer_json.get("decoder") is not None
+        if has_decoder and self.tokenizer_json["decoder"]["type"] == "Sequence":
             for decoder_dict in self.tokenizer_json["decoder"]["decoders"]:
                 decoder_parser = self.decoding_map.get(decoder_dict.get("type"))
                 if decoder_parser is None:
                     pass
                     # raise ValueError(f"Decoder {decoder_dict} is not supported yet.")
                 else:
                     self.pipeline.add_steps(decoder_parser(decoder_dict))
-        elif self.tokenizer_json["decoder"]["type"] == "ByteLevel":
+        elif has_decoder and self.tokenizer_json["decoder"]["type"] == "ByteLevel":
             self.pipeline.add_steps(CharsToBytesStep())
         else:
             self.pipeline.add_steps(FuseStep())
 
+        # strip forward space because VocabDecoderStep.from_hf_json modifies vocabulary
+        if self.tokenizer_json["model"]["type"] == "WordLevel":
+            self.pipeline.add_steps(RegexDecodingStep.strip_forward_space())
+        elif self.tokenizer_json["model"]["type"] == "WordPiece":
+            self.pipeline.add_steps(RegexDecodingStep.strip_forward_space())
+
         if self.utf8_replace_mode is not None and (self.utf8_replace_mode != UTF8ReplaceMode.DISABLE):
             self.pipeline.add_steps(UTF8ValidateStep(mode=self.utf8_replace_mode))
 

diff --git a/python/openvino_tokenizers/str_pack.py b/python/openvino_tokenizers/str_pack.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-# Copyright (C) 2018-2024 Intel Corporation
+# Copyright (C) 2018-2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
 from io import BytesIO

diff --git a/python/openvino_tokenizers/tokenizer_pipeline.py b/python/openvino_tokenizers/tokenizer_pipeline.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-# Copyright (C) 2018-2024 Intel Corporation
+# Copyright (C) 2018-2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
 from __future__ import annotations
@@ -1084,6 +1084,22 @@ def finalize(self) -> None:
         elif self.skip_tokens is None:
             self.skip_tokens = pipeline.skip_tokens or []
 
+    @classmethod
+    def from_hf_json(cls, tokenizer_json: Dict[str, Any], pipeline_vocab: Optional[List[str]], skip_tokens: Optional[List[int]] = None, do_skip_tokens: bool = True) -> "VocabDecoderStep":
+        model_type = tokenizer_json["model"]["type"]
+
+        if pipeline_vocab is not None and model_type == "WordLevel":
+            vocab = [f" {token}" for token in pipeline_vocab]
+        elif pipeline_vocab is not None and model_type == "WordPiece":
+            vocab = [
+                token if token in ".,!?" else token[2:] if token.startswith("##") else f" {token}"
+                for token in pipeline_vocab
+            ]
+        else:  # Use vocab node from pipeline
+            vocab = None
+
+        return cls(vocab, skip_tokens, do_skip_tokens)
+
     def get_vocab_node_outputs(self) -> Optional[List[Output]]:
         return self.get_pipeline().vocab_node_outputs if self.get_pipeline() is not None else None
 

diff --git a/python/openvino_tokenizers/utils.py b/python/openvino_tokenizers/utils.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-# Copyright (C) 2018-2024 Intel Corporation
+# Copyright (C) 2018-2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
 import logging

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (C) 2018-2024 Intel Corporation
+# Copyright (C) 2018-2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 #
 

diff --git a/src/bpe_tokenizer.cpp b/src/bpe_tokenizer.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2024 Intel Corporation
+// Copyright (C) 2018-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 

diff --git a/src/bpe_tokenizer.hpp b/src/bpe_tokenizer.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2024 Intel Corporation
+// Copyright (C) 2018-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 

diff --git a/src/byte_fallback.cpp b/src/byte_fallback.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2024 Intel Corporation
+// Copyright (C) 2018-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 

diff --git a/src/byte_fallback.hpp b/src/byte_fallback.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2024 Intel Corporation
+// Copyright (C) 2018-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 

diff --git a/src/bytes_to_chars.cpp b/src/bytes_to_chars.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2024 Intel Corporation
+// Copyright (C) 2018-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 

diff --git a/src/bytes_to_chars.hpp b/src/bytes_to_chars.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2024 Intel Corporation
+// Copyright (C) 2018-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 

diff --git a/src/case_fold.cpp b/src/case_fold.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2024 Intel Corporation
+// Copyright (C) 2018-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 

diff --git a/src/case_fold.hpp b/src/case_fold.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2024 Intel Corporation
+// Copyright (C) 2018-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 

diff --git a/src/chars_to_bytes.cpp b/src/chars_to_bytes.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2024 Intel Corporation
+// Copyright (C) 2018-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 

diff --git a/src/chars_to_bytes.hpp b/src/chars_to_bytes.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2024 Intel Corporation
+// Copyright (C) 2018-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 

diff --git a/src/charsmap_normalization.cpp b/src/charsmap_normalization.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2024 Intel Corporation
+// Copyright (C) 2018-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 

diff --git a/src/charsmap_normalization.hpp b/src/charsmap_normalization.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2024 Intel Corporation
+// Copyright (C) 2018-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 

diff --git a/src/combine_segments.cpp b/src/combine_segments.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2024 Intel Corporation
+// Copyright (C) 2018-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 

diff --git a/src/combine_segments.hpp b/src/combine_segments.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2024 Intel Corporation
+// Copyright (C) 2018-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 

diff --git a/src/equal_str.cpp b/src/equal_str.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2024 Intel Corporation
+// Copyright (C) 2018-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 

diff --git a/src/equal_str.hpp b/src/equal_str.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2024 Intel Corporation
+// Copyright (C) 2018-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 

diff --git a/src/fuze.cpp b/src/fuze.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2024 Intel Corporation
+// Copyright (C) 2018-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 

diff --git a/src/fuze.hpp b/src/fuze.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2024 Intel Corporation
+// Copyright (C) 2018-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 

diff --git a/src/normalize_unicode.cpp b/src/normalize_unicode.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2024 Intel Corporation
+// Copyright (C) 2018-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 

diff --git a/src/normalize_unicode.hpp b/src/normalize_unicode.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2024 Intel Corporation
+// Copyright (C) 2018-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 

diff --git a/src/ov_extension.cpp b/src/ov_extension.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2024 Intel Corporation
+// Copyright (C) 2018-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 

diff --git a/src/ragged_tensor_pack.cpp b/src/ragged_tensor_pack.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2024 Intel Corporation
+// Copyright (C) 2018-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
Original file line number	Diff line number	Diff line change
Expand Up		@@ -27,4 +27,4 @@ core.addExtension(openvinoTokenizers.path); // Add tokenizers extension

		[License](https://github.com/openvinotoolkit/openvino/blob/master/LICENSE)

		-Copyright © 2018-2024 Intel Corporation
		+Copyright © 2018-2025 Intel Corporation