Skip to content

Commit

Permalink
Revert "Revert "Revert "Use opset15 version of Str Pack/Unpack (#351)" (
Browse files Browse the repository at this point in the history
#374)…" (#383)

This reverts commit 2e59c96.
  • Loading branch information
rkazants authored Jan 20, 2025
1 parent 2e59c96 commit afaa521
Show file tree
Hide file tree
Showing 13 changed files with 8,431 additions and 8,415 deletions.
19 changes: 2 additions & 17 deletions python/openvino_tokenizers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ def new_fe_init(self, *args, **kwargs):


openvino.runtime.Core.__init__ = new_core_init
openvino.runtime.utils.node_factory.NodeFactory.__init__ = new_factory_init
openvino.frontend.frontend.FrontEnd.__init__ = new_fe_init


Expand All @@ -75,22 +76,6 @@ def _get_factory_callable() -> Callable[[], NodeFactory]:
def inner(opset_version: Optional[str] = None) -> NodeFactory:
nonlocal factory
if opset_version not in factory:
openvino.runtime.utils.node_factory.NodeFactory.__init__ = new_factory_init
factory[opset_version] = NodeFactory() if opset_version is None else NodeFactory(opset_version)

return factory[opset_version]

return inner


def _get_opset_factory_callable() -> Callable[[], NodeFactory]:
# factory without extensions
factory = {}

def inner(opset_version: Optional[str] = None) -> NodeFactory:
nonlocal factory
if opset_version not in factory:
openvino.runtime.utils.node_factory.NodeFactory.__init__ = old_factory_init
factory[opset_version] = NodeFactory() if opset_version is None else NodeFactory(opset_version)

return factory[opset_version]
Expand All @@ -99,10 +84,10 @@ def inner(opset_version: Optional[str] = None) -> NodeFactory:


_get_factory = _get_factory_callable()
_get_opset_factory = _get_opset_factory_callable()

# some files use the _get_factory function
from .__version__ import __version__ # noqa
from .build_tokenizer import build_rwkv_tokenizer # noqa
from .convert_tokenizer import convert_tokenizer # noqa
from .str_pack import pack_strings, unpack_strings # noqa
from .utils import add_greedy_decoding, connect_models # noqa
8 changes: 4 additions & 4 deletions python/openvino_tokenizers/build_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,12 @@ def build_rwkv_tokenizer(
tokenizer_output_type: Type = Type.i64,
detokenizer_input_type: Type = Type.i64,
) -> Tuple[Model, Model]:
from openvino_tokenizers import _get_factory, _get_opset_factory
from openvino_tokenizers import _get_factory

input_node = op.Parameter(Type.string, PartialShape(["?"]))
input_node.set_friendly_name("string_input")

output = _get_opset_factory("opset15").create("StringTensorUnpack", input_node.outputs()).outputs()
output = _get_factory().create("StringTensorUnpack", input_node.outputs()).outputs()
trie_node = TrieTokenizerStep.from_rwkv_vocab(rwkv_vocab)
output = trie_node.get_ov_subgraph(TokenizerPipeline.add_ragged_dimension(output))

Expand Down Expand Up @@ -56,7 +56,7 @@ def build_rwkv_tokenizer(
_get_factory()
.create(
"VocabDecoder",
[*detokenizer_input.outputs(), *BasePipelineStep.create_string_constant_node(trie_node.vocab)],
[*detokenizer_input.outputs(), *BasePipelineStep.create_string_constant_node(trie_node.vocab).outputs()],
)
.outputs()
)
Expand All @@ -65,7 +65,7 @@ def build_rwkv_tokenizer(
if clean_up_tokenization_spaces:
RegexDecodingStep.clean_up_tokenization_spaces().get_ov_subgraph(detokenizer_output)

detokenizer_output = _get_opset_factory("opset15").create("StringTensorPack", detokenizer_output).outputs()
detokenizer_output = _get_factory().create("StringTensorPack", detokenizer_output).outputs()
detokenizer_output[0].tensor.add_names({STRING_OUTPUT_NAME})

detokenizer = Model(detokenizer_output, [detokenizer_input], DETOKENIZER_NAME)
Expand Down
6 changes: 3 additions & 3 deletions python/openvino_tokenizers/hf_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from transformers import PreTrainedTokenizerBase, PreTrainedTokenizerFast
from transformers.convert_slow_tokenizer import import_protobuf

from . import _get_factory, _get_opset_factory
from . import _get_factory
from .constants import (
ATTENTION_MASK_INPUT_NAME,
DETOKENIZER_NAME,
Expand Down Expand Up @@ -810,7 +810,7 @@ def convert_sentencepiece_model_tokenizer(
if params.handle_special_tokens_with_re:
tokens, ids = zip(*sorted(((token, id) for id, token in add_tokens.items()), reverse=True))
added_inputs = [
*BasePipelineStep.create_string_constant_node(tokens),
*BasePipelineStep.create_string_constant_node(tokens).outputs(),
make_constant_node(np.array(ids, dtype=np.int32), Type.i32).output(0),
]
else:
Expand Down Expand Up @@ -1013,7 +1013,7 @@ def get_sp_detokenizer(
if params.utf8_replace_mode is not None and params.utf8_replace_mode != UTF8ReplaceMode.DISABLE:
last_sinks = UTF8ValidateStep(params.utf8_replace_mode).get_ov_subgraph(detokenizer)

string_output = _get_opset_factory("opset15").create("StringTensorPack", last_sinks).outputs()
string_output = _get_factory().create("StringTensorPack", last_sinks).outputs()
string_output[0].tensor.add_names({STRING_OUTPUT_NAME})
tokenizer_detokenizer = Model(string_output, [model_input], DETOKENIZER_NAME)
tokenizer_detokenizer.validate_nodes_and_infer_types()
Expand Down
62 changes: 62 additions & 0 deletions python/openvino_tokenizers/str_pack.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2018-2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

from io import BytesIO
from typing import Iterable, List

import numpy as np
from numpy.typing import NDArray


def to_bytes(number: int) -> bytes:
    """Encode ``number`` as exactly four bytes in little-endian byte order.

    The value is treated as unsigned; Python's ``int.to_bytes`` raises
    ``OverflowError`` for negative values or values that need more than
    four bytes.
    """
    return number.to_bytes(length=4, byteorder="little")


def pack_string(string: str) -> NDArray:
    """Return the UTF-8 encoding of ``string`` as a 1D ``uint8`` array.

    No length prefix or offsets are added: the array holds only the raw
    encoded bytes of the single string.
    """
    utf8_payload = bytes(string, "utf-8")
    return np.frombuffer(utf8_payload, dtype=np.uint8)


def pack_strings(strings: Iterable[str]) -> NDArray:
    """
    Serialize a batch of strings into a single flat ``uint8`` array.

    Layout (every integer is 4 bytes, little-endian):
        [batch_size][0][end_1]...[end_batch_size][utf-8 bytes of all items]
    where ``end_k`` is the cumulative byte length of the first ``k`` items.
    An empty batch is encoded as just the 4-byte batch size (zero).
    ``str`` items are UTF-8 encoded; any other item is written as-is.
    """

    def encode_u32(value: int) -> bytes:
        # 4-byte little-endian field, the unit used for every header entry
        return value.to_bytes(4, "little")

    items = list(strings)
    if not items:
        return np.frombuffer(encode_u32(0), np.uint8)

    # Header starts with the batch size followed by the first begin offset (0).
    chunks = [encode_u32(len(items)), encode_u32(0)]
    payloads = []
    running_end = 0
    for item in items:
        payload = item.encode("utf-8") if isinstance(item, str) else item
        running_end += len(payload)
        chunks.append(encode_u32(running_end))
        payloads.append(payload)

    # All offsets come first, then the concatenated string bytes.
    chunks.extend(payloads)
    return np.frombuffer(b"".join(chunks), np.uint8)


# TODO: handle possible signed values in batch size and offsets
def unpack_strings(u8_tensor: NDArray, decoding_errors: str = "replace") -> List[str]:
    """
    Decode a flat array of uint8 elements into a list of strings; inverse of pack_strings.

    The first 4 bytes give the batch size, followed by ``batch_size + 1``
    4-byte offsets (all little-endian, read as unsigned), followed by the
    concatenated UTF-8 payload. ``decoding_errors`` is passed to
    ``bytes.decode`` as its ``errors`` argument.
    """
    field_width = 4

    def read_field(position: int) -> int:
        # Interpret the 4 bytes starting at ``position`` as a little-endian integer.
        return int.from_bytes(u8_tensor[position : position + field_width], "little")

    count = read_field(0)
    # Payload begins after the batch-size field and the (count + 1) offsets.
    payload_start = field_width * (count + 2)

    decoded = []
    for index in range(count):
        start = read_field(field_width * (index + 1))
        stop = read_field(field_width * (index + 2))
        first_byte = payload_start + start
        raw = bytes(u8_tensor[first_byte : first_byte + (stop - start)])
        decoded.append(raw.decode("utf-8", errors=decoding_errors))
    return decoded
69 changes: 29 additions & 40 deletions python/openvino_tokenizers/tokenizer_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from openvino.runtime.exceptions import OVTypeError, UserInputError
from openvino.runtime.utils.types import as_node, make_constant_node

from . import _get_factory, _get_opset_factory
from . import _get_factory
from .constants import (
ATTENTION_MASK_INPUT_NAME,
DETOKENIZER_NAME,
Expand All @@ -31,13 +31,8 @@
VOCAB_SIZE_CACHE_PROPORTION,
UTF8ReplaceMode,
)
from .utils import (
apply_unicode_to_bytes,
create_unpacked_string,
generate_tokens_with_space_symbols,
has_incompatible_re2_op,
quote_meta,
)
from .str_pack import pack_string, pack_strings
from .utils import apply_unicode_to_bytes, generate_tokens_with_space_symbols, has_incompatible_re2_op, quote_meta


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -71,15 +66,15 @@ def get_ov_subgraph(self, *input_nodes: List[Output]) -> List[Output]:
raise NotImplementedError

@staticmethod
def create_string_constant_node(value: Union[str, Iterable[str]]) -> List[Output]:
def create_string_constant_node(value: Union[str, Iterable[str]]) -> op.Constant:
if isinstance(value, str):
# string scalar
return op.Constant(np.frombuffer(bytes(value, "utf-8"), dtype=np.uint8)).outputs()
elif isinstance(value, Iterable):
# support only 1D strings for now
return create_unpacked_string(value)
ps = pack_string(value)
return op.Constant(ps)
else:
raise ValueError(f"Unsupported value type {type(value)}")
# support only 1D strings for now
ps = pack_strings(value)
return _get_factory().create("StringTensorUnpack", op.Constant(ps).outputs())

def finalize(self) -> None:
"""Called after the entire pipeline has been built"""
Expand Down Expand Up @@ -149,7 +144,7 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
return list(input_nodes)

split_pattern = "|".join(token.regex_repr() for token in self.special_tokens)
input_nodes.extend(self.create_string_constant_node(split_pattern))
input_nodes.extend(self.create_string_constant_node(split_pattern).outputs())

return _get_factory().create("SpecialTokensSplit", input_nodes).outputs()

Expand Down Expand Up @@ -238,10 +233,10 @@ def del_control_chars_regex(cls) -> "RegexNormalizationStep":

def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
input_nodes.extend(
[
*self.create_string_constant_node(self.regex_search_pattern),
*self.create_string_constant_node(self.replace_term),
]
(
self.create_string_constant_node(self.regex_search_pattern),
self.create_string_constant_node(self.replace_term),
)
)
return (
_get_factory().create("RegexNormalization", input_nodes, {"global_replace": self.global_replace}).outputs()
Expand Down Expand Up @@ -362,7 +357,7 @@ def punctuation_splitter(cls, behaviour="isolate") -> "RegexSplitStep":
)

def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
input_nodes.extend(self.create_string_constant_node(self.split_pattern))
input_nodes.extend(self.create_string_constant_node(self.split_pattern).outputs())
return (
_get_factory()
.create(
Expand Down Expand Up @@ -428,7 +423,7 @@ def get_vocab_node_outputs(self) -> Optional[List[Output]]:

def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
pipeline = self.get_pipeline()
pipeline.vocab_node_outputs = self.create_string_constant_node(self.vocab)
pipeline.vocab_node_outputs = self.create_string_constant_node(self.vocab).outputs()

ragged_dims, other_dims = [], input_nodes
if len(input_nodes) > 4:
Expand Down Expand Up @@ -480,7 +475,7 @@ def from_rwkv_vocab(cls, vocab_file_strings: Iterable[str]) -> TrieTokenizerStep
def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
input_nodes.extend(
(
*self.create_string_constant_node(self.vocab),
*self.create_string_constant_node(self.vocab).outputs(),
make_constant_node(np.array(self.indices, dtype=np.int32), Type.i32),
)
)
Expand Down Expand Up @@ -516,7 +511,7 @@ def from_hf_json(cls, tokenizer_json: Dict[str, Any]) -> "WordPieceTokenizationS
def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
input_nodes.extend(
(
*self.create_string_constant_node(self.vocab),
*self.create_string_constant_node(self.vocab).outputs(),
*as_node(self.unk_token_id).outputs(),
)
)
Expand Down Expand Up @@ -648,10 +643,10 @@ def merges_are_pairs(self) -> bool:

def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
pipeline = self.get_pipeline()
pipeline.vocab_node_outputs = self.create_string_constant_node(self.vocab)
pipeline.vocab_node_outputs = self.create_string_constant_node(self.vocab).outputs()

if self.added_tokens:
special_tokens_outputs = self.create_string_constant_node(self.added_tokens)
special_tokens_outputs = self.create_string_constant_node(self.added_tokens).outputs()
else:
special_tokens_outputs = []

Expand All @@ -664,12 +659,12 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
left_merges, right_merges = zip(*self.merges)
input_nodes.extend(
(
*self.create_string_constant_node(left_merges),
*self.create_string_constant_node(right_merges),
*self.create_string_constant_node(left_merges).outputs(),
*self.create_string_constant_node(right_merges).outputs(),
)
)
else:
input_nodes.extend(self.create_string_constant_node(self.merges))
input_nodes.extend(self.create_string_constant_node(self.merges).outputs())

if special_tokens_outputs:
input_nodes.extend(
Expand Down Expand Up @@ -1040,13 +1035,7 @@ def finalize(self) -> None:
self.skip_tokens = pipeline.skip_tokens or []

@classmethod
def from_hf_json(
cls,
tokenizer_json: Dict[str, Any],
pipeline_vocab: Optional[List[str]],
skip_tokens: Optional[List[int]] = None,
do_skip_tokens: bool = True,
) -> "VocabDecoderStep":
def from_hf_json(cls, tokenizer_json: Dict[str, Any], pipeline_vocab: Optional[List[str]], skip_tokens: Optional[List[int]] = None, do_skip_tokens: bool = True) -> "VocabDecoderStep":
model_type = tokenizer_json["model"]["type"]

if pipeline_vocab is not None and model_type == "WordLevel":
Expand All @@ -1068,7 +1057,7 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
if self.vocab is None:
vocab_outputs = self.get_vocab_node_outputs()
else:
vocab_outputs = self.create_string_constant_node(self.vocab)
vocab_outputs = self.create_string_constant_node(self.vocab).outputs()
input_nodes.extend(vocab_outputs)

# Put constant with skip tokens even if do_skip_tokens=False, so that it can be switched on/off at runtime.
Expand Down Expand Up @@ -1189,8 +1178,8 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:

input_nodes.extend(
(
*self.create_string_constant_node(self.regex_search_pattern),
*self.create_string_constant_node(self.replace_term),
*self.create_string_constant_node(self.regex_search_pattern).outputs(),
*self.create_string_constant_node(self.replace_term).outputs(),
)
)
return ragged_dims + _get_factory().create("RegexNormalization", input_nodes).outputs()
Expand Down Expand Up @@ -1245,7 +1234,7 @@ def get_tokenizer_ov_subgraph(self) -> Model:

processing_outputs = []
for input_node in string_inputs:
input_node = _get_opset_factory("opset15").create("StringTensorUnpack", input_node.outputs()).outputs()
input_node = _get_factory().create("StringTensorUnpack", input_node.outputs()).outputs()

ragged = []
if isinstance(self.steps[0], SpecialTokensSplit):
Expand Down Expand Up @@ -1318,7 +1307,7 @@ def create_decoding_pipeline(self, input_nodes: List[Output]) -> List[Output]:
pipeline_step = step.get_ov_subgraph(input_nodes)
input_nodes = pipeline_step

return _get_opset_factory("opset15").create("StringTensorPack", input_nodes).outputs()
return _get_factory().create("StringTensorPack", input_nodes).outputs()

def get_detokenizer_ov_subgraph(self) -> Model:
self.finalize()
Expand Down
Loading

0 comments on commit afaa521

Please sign in to comment.