diff --git a/README.md b/README.md
index 3f73a4695..96a2e7f99 100644
--- a/README.md
+++ b/README.md
@@ -76,6 +76,7 @@ After that you can add binary extension in the code with:
- `core.add_extension("libopenvino_tokenizers.so")` for Linux
and `read`/`compile` converted (de)tokenizers models.
+If you use version `2023.3.0.0`, the binary extension file is called `(lib)user_ov_extension.(dll/dylib/so)`.
## Usage
@@ -269,8 +270,8 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
SentencePiece |
- 76.07 |
- 2896 |
+ 76.33 |
+ 3620 |
Tiktoken |
@@ -456,7 +457,7 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
SentencePiece |
camembert-base_slow |
- 75.14 |
+ 74.03 |
181 |
@@ -471,6 +472,18 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
100.00 |
181 |
+
+ SentencePiece |
+ facebook/musicgen-small |
+ 80.11 |
+ 181 |
+
+
+ SentencePiece |
+ facebook/musicgen-small_slow |
+ 74.03 |
+ 181 |
+
SentencePiece |
microsoft/deberta-v3-base |
@@ -483,6 +496,18 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
100.00 |
181 |
+
+ SentencePiece |
+ t5-base |
+ 81.22 |
+ 181 |
+
+
+ SentencePiece |
+ t5-base_slow |
+ 75.14 |
+ 181 |
+
SentencePiece |
xlm-roberta-base |
diff --git a/python/openvino_tokenizers/cli.py b/python/openvino_tokenizers/cli.py
index e2092558a..5d6dc9732 100644
--- a/python/openvino_tokenizers/cli.py
+++ b/python/openvino_tokenizers/cli.py
@@ -48,6 +48,16 @@ def get_parser() -> ArgumentParser:
action="store_true",
help="Add a detokenizer model to the output",
)
+ parser.add_argument(
+ "--subfolder",
+ required=False,
+ type=str,
+ default="",
+ help=(
+ "Specify in case the tokenizer files are located inside a subfolder of the model repo on huggingface.co. "
+ "Example: `convert_tokenizer SimianLuo/LCM_Dreamshaper_v7 --subfolder tokenizer`"
+ ),
+ )
parser.add_argument(
"--skip-special-tokens",
"--skip_special_tokens",
@@ -139,7 +149,7 @@ def convert_hf_tokenizer() -> None:
args = get_parser().parse_args()
print("Loading Huggingface Tokenizer...")
- hf_tokenizer = AutoTokenizer.from_pretrained(args.name, trust_remote_code=args.trust_remote_code)
+ hf_tokenizer = AutoTokenizer.from_pretrained(args.name, subfolder=args.subfolder, trust_remote_code=args.trust_remote_code)
print("Converting Huggingface Tokenizer to OpenVINO...")
converted = convert_tokenizer(
diff --git a/python/openvino_tokenizers/hf_parser.py b/python/openvino_tokenizers/hf_parser.py
index 738ff6178..2d0a3b091 100644
--- a/python/openvino_tokenizers/hf_parser.py
+++ b/python/openvino_tokenizers/hf_parser.py
@@ -462,7 +462,10 @@ def convert_sentencepiece_model_tokenizer(
getattr(hf_tokenizer, "truncation_side", "") == "right"
or getattr(hf_tokenizer, "padding_side", "") == "right"
)
- add_bos_token = getattr(hf_tokenizer, "add_bos_token", add_eos_token) or False
+
+ add_bos_token = (
+ getattr(hf_tokenizer, "add_bos_token", add_eos_token) and hf_tokenizer.bos_token_id is not None
+ ) or False
tokenizer_node = _get_factory().create(
"SentencepieceTokenizer",
diff --git a/src/regex_normalization.cpp b/src/regex_normalization.cpp
index dd95e85d4..31a9563f0 100644
--- a/src/regex_normalization.cpp
+++ b/src/regex_normalization.cpp
@@ -10,10 +10,20 @@
using namespace ov;
-RegexNormalization::RegexNormalization(const ov::OutputVector& arguments) :
- ov::op::Op(arguments) {
- constructor_validate_and_infer_types();
- }
+RegexNormalization::RegexNormalization(
+ const ov::OutputVector& arguments,
+ bool global_replace
+) : ov::op::Op(arguments),
+m_global_replace(global_replace) {
+ auto search_pattern_const = as_type_ptr<Constant>(arguments[3].get_node_shared_ptr());
+ auto replace_pattern_const = as_type_ptr<Constant>(arguments[4].get_node_shared_ptr());
+ auto search_pattern_buf = static_cast<const char*>(search_pattern_const->get_data_ptr());
+ auto replace_pattern_buf = static_cast<const char*>(replace_pattern_const->get_data_ptr());
+ auto search_pattern = absl::string_view((const char*)search_pattern_buf, search_pattern_const->get_byte_size());
+ m_replace_pattern = absl::string_view((const char*)replace_pattern_buf, replace_pattern_const->get_byte_size());
+ m_search_pattern_re = std::make_shared<re2::RE2>(search_pattern);
+ constructor_validate_and_infer_types();
+}
RegexNormalization::RegexNormalization(
diff --git a/src/regex_normalization.hpp b/src/regex_normalization.hpp
index 1d0b7e993..7562f1ec9 100644
--- a/src/regex_normalization.hpp
+++ b/src/regex_normalization.hpp
@@ -18,7 +18,10 @@ class RegexNormalization : public ov::op::Op {
OPENVINO_OP("RegexNormalization");
RegexNormalization () = default;
- RegexNormalization(const ov::OutputVector& arguments); // not used
+ RegexNormalization(
+ const ov::OutputVector& arguments,
+ bool global_replace = true
+ );
RegexNormalization(
const ov::OutputVector& arguments,
const std::shared_ptr<re2::RE2>& search_pattern_re,
diff --git a/src/tensorflow_translators.cpp b/src/tensorflow_translators.cpp
index 57781f0d9..7313a38b1 100644
--- a/src/tensorflow_translators.cpp
+++ b/src/tensorflow_translators.cpp
@@ -105,11 +105,15 @@ ov::OutputVector translate_normalize_utf8(const ov::frontend::NodeContext& node)
}
ov::OutputVector translate_static_regex_replace(const ov::frontend::NodeContext& node) {
+ auto node_name = node.get_name();
FRONT_END_GENERAL_CHECK(node.get_input_size() == 1, "StaticRegexReplace expects only 1 input");
+ auto replace_global = node.get_attribute<bool>("replace_global", true);
ov::OutputVector inputs = pre_translate_string_tensor_input(node.get_input(0));
inputs.push_back(string_attribute_to_constant(node, "pattern"));
inputs.push_back(string_attribute_to_constant(node, "rewrite"));
- return { post_translate_string_tensor_output(std::make_shared<RegexNormalization>(inputs)->outputs()) };
+ auto string_pack_result = post_translate_string_tensor_output(std::make_shared<RegexNormalization>(inputs, replace_global)->outputs());
+ set_node_name(node_name, string_pack_result.get_node_shared_ptr());
+ return { string_pack_result };
}
ov::OutputVector translate_regex_split_with_offsets(const ov::frontend::NodeContext& node) {
@@ -119,7 +123,7 @@ ov::OutputVector translate_regex_split_with_offsets(const ov::frontend::NodeCont
inputs.push_back(delim_regex_pattern);
// TODO: Use node.get_input(2) with keep_delim_regex_pattern, most likely it should be handled in another RegexSplit with `isolate` behaviour
auto outputs = std::make_shared<RegexSplit>(inputs)->outputs();
- auto flatten_string_tensor = post_translate_string_tensor_output({outputs[2], outputs[3], outputs[4]});
+ auto flatten_string_tensor = post_translate_string_tensor_output({ outputs[2], outputs[3], outputs[4] });
return { post_translate_ragged_tensor_output({outputs[0], outputs[1], flatten_string_tensor}) };
}
@@ -127,14 +131,14 @@ ov::OutputVector translate_wordpiece_tokenize_with_offsets(const ov::frontend::N
FRONT_END_GENERAL_CHECK(node.get_input_size() == 2, "WordpieceTokenizeWithOffsets expects 2 inputs");
ov::OutputVector inputs = pre_translate_ragged_string_tensor_input(node.get_input(0));
- #if USE_STRING_TENSORS
+#if USE_STRING_TENSORS
// It may seem enough to call pre_translate_string_tensor_input that will override Parameter element
// type in case if string tensors are not used.
// But a Parameter is still required to be overridden even if string tensors are used because in TF model
// it is represented not as a string tensor, but as a resource with hash table for lookup that we cannot interpret
// and have to replace by 1D string tensor.
- override_parameter(node.get_input(1).get_node_shared_ptr(), element::string, PartialShape{Dimension()});
- #endif
+ override_parameter(node.get_input(1).get_node_shared_ptr(), element::string, PartialShape{ Dimension() });
+#endif
auto vocab = pre_translate_string_tensor_input(node.get_input(1));
inputs.insert(inputs.end(), vocab.begin(), vocab.end());
diff --git a/src/utils.cpp b/src/utils.cpp
index f53262f2b..f5b329b98 100644
--- a/src/utils.cpp
+++ b/src/utils.cpp
@@ -125,27 +125,15 @@ void override_parameter (std::shared_ptr<ov::Node> node, element::Type type, con
}
}
-// TODO: replace NodeContext and input_index by a single input
-OutputVector pre_translate_string_tensor_input(ov::Output<ov::Node> input) {
+OutputVector pre_translate_string_tensor_input(const ov::Output<ov::Node>& input) {
auto input_node = input.get_node_shared_ptr();
-#if !USE_STRING_TENSORS
- override_parameter(input_node, element::u8, PartialShape{Dimension()});
-#endif
-
if (auto struct_pack = std::dynamic_pointer_cast<StringTensorPack>(input_node)) {
FRONT_END_GENERAL_CHECK(struct_pack->get_input_size() == 3, "Expected 3 inputs to StringTensorPack which represents a string tensor");
return struct_pack->input_values();
- } else {
- #if USE_STRING_TENSORS || true // always
- return std::make_shared<StringTensorUnpack>(OutputVector{input}, "begins_ends")->outputs();
- #else
- // Suppose this is u8 packed string tensor with a single batch dimension
- // Unpack this tensor using standard operations
-
- // Cannot do that because there is not ReinterprectCast operation in OV
- // TODO: Find a way to make it without reinterpretation operation or introduce it as an extension (easy)
- #endif
+ }
+ else {
+ return std::make_shared<StringTensorUnpack>(OutputVector{ input }, "begins_ends")->outputs();
}
}
@@ -221,3 +209,11 @@ std::shared_ptr string_attribute_to_constant (const ov::frontend::NodeCont
return std::make_shared<Constant>(element::u8, Shape{value.length()}, (const void*)value.data());
#endif
}
+
+void set_node_name(const std::string& node_name, const std::shared_ptr<Node>& node) {
+ const auto& outputs = node->outputs();
+ node->set_friendly_name(node_name);
+ for (size_t idx = 0; idx < outputs.size(); ++idx) {
+ outputs[idx].get_tensor().add_names({ node_name + ":" + std::to_string(idx) });
+ }
+}
diff --git a/src/utils.hpp b/src/utils.hpp
index da0634687..7fafb011b 100644
--- a/src/utils.hpp
+++ b/src/utils.hpp
@@ -52,7 +52,7 @@ void unpack_strings_to_tensors(const std::string* strings, const ov::Shape shape
void override_parameter (std::shared_ptr<ov::Node> node, ov::element::Type type, const ov::PartialShape& shape);
-ov::OutputVector pre_translate_string_tensor_input(ov::Output<ov::Node> input);
+ov::OutputVector pre_translate_string_tensor_input(const ov::Output<ov::Node>& input);
ov::OutputVector pre_translate_ragged_tensor_input(ov::Output<ov::Node> input);
@@ -68,3 +68,5 @@ bool evaluate_normalization_helper (
std::function<std::string(const std::string&)> normalizer);
std::shared_ptr string_attribute_to_constant (const ov::frontend::NodeContext& node, const std::string& name);
+
+void set_node_name(const std::string& node_name, const std::shared_ptr<ov::Node>& node);
diff --git a/tests/pass_rates.json b/tests/pass_rates.json
index 2f5988136..206a424f5 100644
--- a/tests/pass_rates.json
+++ b/tests/pass_rates.json
@@ -1,3 +1,3 @@
{
- "tokenizers_test.py::test_": 0.8798110323746006
+ "tokenizers_test.py::test_": 0.8700921600807978
}
\ No newline at end of file
diff --git a/tests/tokenizers_test.py b/tests/tokenizers_test.py
index 42a106d77..1317f4c87 100644
--- a/tests/tokenizers_test.py
+++ b/tests/tokenizers_test.py
@@ -119,7 +119,8 @@ def unpack_strings(strings):
# "THUDM/chatglm-6b", # hf_tokenizer init error
"THUDM/chatglm2-6b", # detokenizer cannot filter special tokens
"THUDM/chatglm3-6b",
- # "t5-base", # no token in the vocab, sentencepiece check error
+ "t5-base",
+ "facebook/musicgen-small",
]
tiktiken_models = [
"stabilityai/stablelm-2-1_6b",