Skip to content

Commit

Permalink
Merge pull request #15 from apaniukov/add-eos-token-data
Browse files Browse the repository at this point in the history
Add EOS token to rt_info
  • Loading branch information
apaniukov authored Feb 9, 2024
2 parents 86c5ba1 + df9fab9 commit 73e3592
Show file tree
Hide file tree
Showing 9 changed files with 198 additions and 120 deletions.
202 changes: 101 additions & 101 deletions README.md

Large diffs are not rendered by default.

10 changes: 7 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ transformers = [
"transformers[sentencepiece] >= 4.36.0",
"tiktoken"
]
# the chatglm2 custom tokenizer file imports torch, so a torch dependency is needed for tests
torch = [
'torch @ https://download.pytorch.org/whl/cpu-cxx11-abi/torch-2.0.1%2Bcpu.cxx11.abi-cp38-cp38-linux_x86_64.whl ; sys_platform=="linux" and python_version == "3.8"',
'torch @ https://download.pytorch.org/whl/cpu-cxx11-abi/torch-2.0.1%2Bcpu.cxx11.abi-cp39-cp39-linux_x86_64.whl ; sys_platform=="linux" and python_version == "3.9"',
Expand All @@ -38,6 +39,7 @@ dev = [
"bandit",
"pytest",
"pytest_harvest",
"pandas",
"openvino_tokenizers[transformers, torch]"
]
fuzzing = [
Expand All @@ -53,15 +55,17 @@ all = [
convert_tokenizer = "openvino_tokenizers.cli:convert_hf_tokenizer"

[tool.ruff]
line-length = 119

[tool.ruff.lint]
ignore = ["C901", "E501", "E741", "W605"]
select = ["C", "E", "F", "I", "W"]
line-length = 119

[tool.ruff.per-file-ignores]
[tool.ruff.lint.per-file-ignores]
"__init__.py" = ["F401"]
"openvino_tokenizers/hf_parser.py" = ["F821"]

[tool.ruff.isort]
[tool.ruff.lint.isort]
lines-after-imports = 2

[tool.bandit]
Expand Down
1 change: 1 addition & 0 deletions python/openvino_tokenizers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from .str_pack import pack_strings, unpack_strings
from .utils import add_greedy_decoding, connect_models


_ext_name = "openvino_tokenizers"
if sys.platform == "win32":
_ext_name = f"{_ext_name}.dll"
Expand Down
2 changes: 2 additions & 0 deletions python/openvino_tokenizers/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
TOKEN_IDS_OUTPUT_NAME = "token_ids"
STRING_OUTPUT_NAME = "string_output"

EOS_TOKEN_ID_NAME = "eos_token_id"

GREEDY_DECODER_NAME = "greedy_decoder"

TOKENIZER_NAME = "tokenizer"
Expand Down
32 changes: 24 additions & 8 deletions python/openvino_tokenizers/hf_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,14 @@
from openvino.runtime import Node, op
from openvino.runtime.exceptions import OVTypeError
from openvino.runtime.utils.types import as_node, make_constant_node
from transformers import PreTrainedTokenizerBase
from transformers.convert_slow_tokenizer import import_protobuf

from . import _get_factory
from .constants import (
ATTENTION_MASK_INPUT_NAME,
DETOKENIZER_NAME,
EOS_TOKEN_ID_NAME,
STRING_OUTPUT_NAME,
TOKEN_IDS_INPUT_NAME,
TOKEN_TYPE_IDS_INPUT_NAME,
Expand Down Expand Up @@ -93,7 +95,7 @@ def parse_split_step(pretokenizer_dict: Dict[str, Any]) -> RegexSplitStep:


def parse_byte_level_pretokenization_step(
pretokenizer_dict: Dict[str, Any]
pretokenizer_dict: Dict[str, Any],
) -> List[Union[NormalizationStep, PreTokenizatinStep]]:
steps = []
if pretokenizer_dict.get("add_prefix_space"):
Expand Down Expand Up @@ -145,6 +147,7 @@ def parse(
),
]:
add_steps()
self.pipeline.eos_token_id = getattr(self.original_tokenizer, "eos_token_id", None)

return self.pipeline

Expand Down Expand Up @@ -298,7 +301,7 @@ def decoding(
return


def parse_special_tokens(hf_tokenizer: "PreTrainedTokenizerBase") -> Dict[int, str]:
def parse_special_tokens(hf_tokenizer: PreTrainedTokenizerBase) -> Dict[int, str]:
# the order matters
if getattr(hf_tokenizer, "added_tokens_decoder", False):
return {
Expand All @@ -315,7 +318,7 @@ def parse_special_tokens(hf_tokenizer: "PreTrainedTokenizerBase") -> Dict[int, s


def convert_fast_tokenizer(
hf_tokenizer: "PreTrainedTokenizerBase",
hf_tokenizer: PreTrainedTokenizerBase,
number_of_inputs: int = 1,
with_detokenizer: bool = False,
skip_special_tokens: bool = False,
Expand Down Expand Up @@ -348,13 +351,16 @@ def convert_fast_tokenizer(
filtered_outputs.append(ov_tokenizer.output(i))

tokenizer_model = Model(filtered_outputs, ov_tokenizer.get_parameters(), TOKENIZER_NAME)
for path, info in ov_tokenizer.get_rt_info().items():
tokenizer_model.set_rt_info(info.value, path)

if with_detokenizer:
return tokenizer_model, pipeline.get_detokenizer_ov_subgraph()

return tokenizer_model


def is_sentencepiece_model(hf_tokenizer: "PreTrainedTokenizerBase") -> bool:
def is_sentencepiece_model(hf_tokenizer: PreTrainedTokenizerBase) -> bool:
    """Return True when the tokenizer is sentencepiece-based, i.e. its vocab file is a ``.model`` file."""
    vocab_file_name = getattr(hf_tokenizer, "vocab_files_names", {}).get("vocab_file", "")
    return vocab_file_name.endswith(".model")


Expand Down Expand Up @@ -397,7 +403,7 @@ def modify_sentencepiece_model(


def convert_sentencepiece_model_tokenizer(
hf_tokenizer: "PreTrainedTokenizerBase",
hf_tokenizer: PreTrainedTokenizerBase,
add_attention_mask: bool = True,
with_detokenizer: bool = False,
streaming_detokenizer: bool = False,
Expand Down Expand Up @@ -491,18 +497,26 @@ def convert_sentencepiece_model_tokenizer(
tokenizer = Model(outputs, [input_node], TOKENIZER_NAME)
tokenizer.validate_nodes_and_infer_types()

if hf_tokenizer.eos_token_id is not None:
tokenizer.set_rt_info(hf_tokenizer.eos_token_id, EOS_TOKEN_ID_NAME)

if not with_detokenizer:
return tokenizer

if clean_up_tokenization_spaces is None:
clean_up_tokenization_spaces = hf_tokenizer.clean_up_tokenization_spaces

return tokenizer, get_sp_detokenizer(
detokenizer = get_sp_detokenizer(
sp_model_node,
streaming_detokenizer=streaming_detokenizer,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
)

if hf_tokenizer.eos_token_id is not None:
detokenizer.set_rt_info(hf_tokenizer.eos_token_id, EOS_TOKEN_ID_NAME)

return tokenizer, detokenizer


def get_sp_detokenizer(
sp_model_node: Node, streaming_detokenizer: bool = False, clean_up_tokenization_spaces: bool = False
Expand Down Expand Up @@ -531,7 +545,7 @@ def get_sp_detokenizer(
return tokenizer_detokenizer


def is_tiktoken_model(hf_tokenizer: "PreTrainedTokenizerBase") -> bool:
def is_tiktoken_model(hf_tokenizer: PreTrainedTokenizerBase) -> bool:
try:
from tiktoken import Encoding
except ImportError:
Expand All @@ -543,7 +557,7 @@ def is_tiktoken_model(hf_tokenizer: "PreTrainedTokenizerBase") -> bool:


def convert_tiktoken_model_tokenizer(
hf_tokenizer: "PreTrainedTokenizerBase",
hf_tokenizer: PreTrainedTokenizerBase,
with_detokenizer: bool = False,
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: Optional[bool] = None,
Expand Down Expand Up @@ -577,4 +591,6 @@ def convert_tiktoken_model_tokenizer(
if not with_detokenizer:
return pipeline.get_tokenizer_ov_subgraph()

pipeline.eos_token_id = hf_tokenizer.eos_token_id

return pipeline.get_tokenizer_ov_subgraph(), pipeline.get_detokenizer_ov_subgraph()
19 changes: 15 additions & 4 deletions python/openvino_tokenizers/tokenizer_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from .constants import (
ATTENTION_MASK_INPUT_NAME,
DETOKENIZER_NAME,
EOS_TOKEN_ID_NAME,
STRING_OUTPUT_NAME,
TOKEN_IDS_INPUT_NAME,
TOKEN_TYPE_IDS_INPUT_NAME,
Expand All @@ -26,8 +27,9 @@
from .str_pack import pack_string, pack_strings


@dataclass
class BasePipelineStep:
_pipeline = field(default=None, init=False, repr=False)
_pipeline: Optional[weakref.ReferenceType["TokenizerPipeline"]] = field(default=None, init=False, repr=False)

def __str__(self) -> str:
params_string = ", ".join(f"{key}={val!r}" for key, val in self.get_config().items())
Expand All @@ -44,7 +46,7 @@ def get_config(self) -> Dict[str, Any]:
return config

def get_pipeline(self) -> Optional["TokenizerPipeline"]:
return self._pipeline()
return self._pipeline() if self._pipeline is not None else None

def set_pipeline(self, pipeline: "TokenizerPipeline") -> None:
self._pipeline = weakref.ref(pipeline)
Expand Down Expand Up @@ -475,6 +477,9 @@ def set_token_id(self, vocab: Optional[List[str]]) -> None:
if vocab is not None and self.token in vocab:
self._token_id = vocab.index(self.token)

@property
def token_id(self) -> Optional[int]:
return self._token_id

@dataclass
class TokenWithTypeId:
Expand Down Expand Up @@ -658,7 +663,7 @@ def get_ov_subgraph(self, input_nodes: List[Output]) -> List[Output]:
"RaggedToDense",
input_nodes[3 * i : 3 * (i + 1)]
+ max_length.outputs()
+ make_constant_node(0, Type.i32).outputs(),
+ make_constant_node(self.token_id or 0, Type.i32).outputs(),
)
.outputs()
)
Expand Down Expand Up @@ -753,6 +758,7 @@ class TokenizerPipeline:
skip_tokens: Optional[List[int]] = field(default=None, repr=False)
number_of_inputs: int = 1
vocab_node_outputs: Optional[List[Output]] = field(default=None, repr=False)
eos_token_id: Optional[int] = None

def get_config(self) -> Dict[str, Dict[str, Any]]:
return {type(step).__name__: step.get_config() for step in self.steps}
Expand Down Expand Up @@ -793,7 +799,10 @@ def get_tokenizer_ov_subgraph(self) -> Model:
for step in self.post_tokenization_steps:
processing_outputs = step.get_ov_subgraph(processing_outputs)

return Model(processing_outputs, string_inputs, name=TOKENIZER_NAME)
model = Model(processing_outputs, string_inputs, name=TOKENIZER_NAME)
if self.eos_token_id is not None:
model.set_rt_info(self.eos_token_id, EOS_TOKEN_ID_NAME)
return model

@property
def normalization_steps(self) -> List[NormalizationStep]:
Expand Down Expand Up @@ -841,4 +850,6 @@ def get_detokenizer_ov_subgraph(self) -> Model:
outputs = self.create_decoding_pipeline([token_ids])
model = Model(outputs, [input_node], name=DETOKENIZER_NAME)
model.output().tensor.add_names({STRING_OUTPUT_NAME})
if self.eos_token_id is not None:
model.set_rt_info(self.eos_token_id, EOS_TOKEN_ID_NAME)
return model
2 changes: 1 addition & 1 deletion tests/pass_rates.json
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
{
"tokenizers_test.py::test_": 0.9104394066610692
"tokenizers_test.py::test_": 0.9110740586355426
}
2 changes: 1 addition & 1 deletion tests/tokenizer_differential_fuzzing.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import sys
import unicodedata
from functools import lru_cache

import atheris
import numpy as np
from openvino import compile_model
from transformers import AutoTokenizer
import unicodedata


with atheris.instrument_imports():
Expand Down
48 changes: 46 additions & 2 deletions tests/tokenizers_test.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2018-2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
from typing import Optional

import numpy as np
import pytest
from openvino import Core
from openvino import Core, Model
from openvino_tokenizers import convert_tokenizer
from openvino_tokenizers.constants import EOS_TOKEN_ID_NAME
from transformers import AutoTokenizer


Expand Down Expand Up @@ -117,7 +119,7 @@ def unpack_strings(strings):
# "THUDM/chatglm-6b", # hf_tokenizer init error
"THUDM/chatglm2-6b", # detokenizer cannot filter special tokens
"THUDM/chatglm3-6b",
# "t5-base", # crashes tests
# "t5-base", # no <s> token in the vocab, sentencepiece check error
]
tiktiken_models = [
"stabilityai/stablelm-2-1_6b",
Expand Down Expand Up @@ -468,3 +470,45 @@ def test_detokenizer_results_align_with_hf_on_multitoken_symbols_for_streaming()
hf_detokenized_stream += hf_output

assert detokenized_stream == hf_detokenized_stream


def check_eos_id(eos_token_id: Optional[int], *models: Model) -> None:
    """Assert that each model's rt_info carries (or, for None, omits) the expected EOS token id."""
    expect_info = eos_token_id is not None
    for ov_model in models:
        has_info = ov_model.has_rt_info(EOS_TOKEN_ID_NAME)
        if not expect_info:
            assert not has_info
            continue
        assert has_info
        assert ov_model.get_rt_info(EOS_TOKEN_ID_NAME).value == eos_token_id


def test_eos_token_id_rt_info_wordpiece(hf_wordpiece_tokenizers):
    """Converted wordpiece tokenizer must expose the HF EOS token id via rt_info."""
    expected_eos = hf_wordpiece_tokenizers.eos_token_id
    converted = convert_tokenizer(hf_wordpiece_tokenizers)
    check_eos_id(expected_eos, converted)


def test_eos_token_id_rt_info_bpe(hf_bpe_tokenizers):
    """Both converted BPE models (tokenizer and detokenizer) must carry the HF EOS token id."""
    expected_eos = hf_bpe_tokenizers.eos_token_id
    tokenizer, detokenizer = convert_tokenizer(hf_bpe_tokenizers, with_detokenizer=True)
    check_eos_id(expected_eos, tokenizer, detokenizer)


def test_eos_token_id_rt_info_tiktoken(hf_tiktoken_tokenizers):
    """Both converted tiktoken models (tokenizer and detokenizer) must carry the HF EOS token id."""
    expected_eos = hf_tiktoken_tokenizers.eos_token_id
    tokenizer, detokenizer = convert_tokenizer(hf_tiktoken_tokenizers, with_detokenizer=True)
    check_eos_id(expected_eos, tokenizer, detokenizer)


def test_eos_token_id_rt_info_sentencepiece(hf_sentencepiece_tokenizers):
    """Both converted sentencepiece models (tokenizer and detokenizer) must carry the HF EOS token id."""
    expected_eos = hf_sentencepiece_tokenizers.eos_token_id
    tokenizer, detokenizer = convert_tokenizer(hf_sentencepiece_tokenizers, with_detokenizer=True)
    check_eos_id(expected_eos, tokenizer, detokenizer)

0 comments on commit 73e3592

Please sign in to comment.