From 9c200c2d09d139860e974296ddee130d989dae60 Mon Sep 17 00:00:00 2001
From: Wovchena
Date: Mon, 30 Oct 2023 15:27:20 +0400
Subject: [PATCH 1/3] Export pack_strings() and unpack_strings()

---
 .../tokenizer/CMakeLists.txt               |  1 +
 .../user_ie_extensions/tokenizer/utils.cpp | 35 -------------------
 .../user_ie_extensions/tokenizer/utils.hpp | 32 ++++++++++++++++-
 3 files changed, 32 insertions(+), 36 deletions(-)

diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt b/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt
index 910b340e8..d2790e155 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt
@@ -65,6 +65,7 @@ target_include_directories(${TARGET_NAME} PRIVATE
     "${sentencepiece_BINARY_DIR}"
     "${FAST_TOKENIZER_INCS}")
+target_include_directories(${TARGET_NAME} PUBLIC .)
 
 if(CMAKE_CL_64)
     target_compile_definitions(sentencepiece-static PRIVATE _CRT_SECURE_NO_WARNINGS _SCL_SECURE_NO_WARNINGS)
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp
index 9ee3e15ba..199cd314e 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp
@@ -223,41 +223,6 @@ std::shared_ptr<Constant> string_attribute_to_constant (const ov::frontend::NodeCont
 #endif
 }
-
-// Pack any container with string to ov::Tensor with element type u8
-// Requirements for BatchOfStrings: .size() with size and .begin(), .end() as iterators, elements with .begin(), .end() and .length()
-// so basically any STL container with std::string is compatible
-// Tensor destination will be reshaped according the input data
-template <typename BatchOfStrings>
-void pack_strings (const BatchOfStrings& strings, ov::Tensor& destination) {
-    auto batch_size = strings.size();
-
-    // First run over all elements: calculate total memory required to hold all strings
-    auto symbols_size = std::accumulate(
-        strings.begin(), strings.end(), size_t(0),
-        [](size_t accum, typename BatchOfStrings::const_reference s)
-        { return accum + s.length(); });
-
-    auto total_size = 4*(1 + 1 + batch_size) + symbols_size;
-    destination.set_shape({total_size});
-
-    auto data = destination.data();
-    auto pbatch_size = reinterpret_cast<int32_t*>(data);
-    auto pindices = pbatch_size + 1;
-    auto psymbols = reinterpret_cast<char*>(pindices + 1 + batch_size);
-    size_t current_symbols_pos = 0;
-
-    *pbatch_size = batch_size;
-    *pindices = 0;
-
-    for(auto s: strings) {
-        psymbols = std::copy(s.begin(), s.end(), psymbols);
-        current_symbols_pos += s.length();
-        *++pindices = current_symbols_pos;
-    }
-}
-
-
 std::vector<std::string> unpack_strings (const ov::Tensor& source) {
     auto strings = source.data<const uint8_t>();
     auto length = source.get_byte_size();
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/utils.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/utils.hpp
index a0d72b5fc..32bb34110 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/utils.hpp
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/utils.hpp
@@ -69,7 +69,37 @@ bool evaluate_normalization_helper (
 
 std::shared_ptr<Constant> string_attribute_to_constant (const ov::frontend::NodeContext& node, const std::string& name);
 
+// Pack any container with string to ov::Tensor with element type u8
+// Requirements for BatchOfStrings: .size() with size and .begin(), .end() as iterators, elements with .begin(), .end() and .length()
+// so basically any STL container with std::string is compatible
+// Tensor destination will be reshaped according the input data
 template <typename BatchOfStrings>
-void pack_strings (const BatchOfStrings& strings, ov::Tensor& destination);
+void pack_strings (const BatchOfStrings& strings, ov::Tensor& destination) {
+    auto batch_size = strings.size();
+
+    // First run over all elements: calculate total memory required to hold all strings
+    auto symbols_size = std::accumulate(
+        strings.begin(), strings.end(), size_t(0),
+        [](size_t accum, typename BatchOfStrings::const_reference s)
+        { return accum + s.length(); });
+
+    auto total_size = 4*(1 + 1 + batch_size) + symbols_size;
+    destination.set_shape({total_size});
+
+    auto data = destination.data();
+    auto pbatch_size = reinterpret_cast<int32_t*>(data);
+    auto pindices = pbatch_size + 1;
+    auto psymbols = reinterpret_cast<char*>(pindices + 1 + batch_size);
+    size_t current_symbols_pos = 0;
+
+    *pbatch_size = batch_size;
+    *pindices = 0;
+
+    for(auto s: strings) {
+        psymbols = std::copy(s.begin(), s.end(), psymbols);
+        current_symbols_pos += s.length();
+        *++pindices = current_symbols_pos;
+    }
+}
 
 std::vector<std::string> unpack_strings(const ov::Tensor& source);

From debcb5d36990891f9312b78ec3d9c8851e4da785 Mon Sep 17 00:00:00 2001
From: Wovchena
Date: Fri, 10 Nov 2023 15:17:39 +0400
Subject: [PATCH 2/3] Move unpack_strings(), create separate include dir

---
 .../tokenizer/CMakeLists.txt                |  2 +-
 .../tokenizer/include/tokenizer/strings.hpp | 60 +++++++++++++++++++
 .../user_ie_extensions/tokenizer/utils.cpp  | 20 -------
 .../user_ie_extensions/tokenizer/utils.hpp  | 35 -----------
 4 files changed, 61 insertions(+), 56 deletions(-)
 create mode 100644 modules/custom_operations/user_ie_extensions/tokenizer/include/tokenizer/strings.hpp

diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt b/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt
index d94ce04ec..5b799dd44 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt
@@ -134,7 +134,7 @@ target_include_directories(${TARGET_NAME} PRIVATE
     # fast_tokenizer
     ${FAST_TOKENIZER_INCS})
 
-target_include_directories(${TARGET_NAME} PUBLIC .)
+target_include_directories(${TARGET_NAME} PUBLIC ./include/)
 
 if(CMAKE_CL_64)
     target_compile_definitions(sentencepiece-static PRIVATE _CRT_SECURE_NO_WARNINGS _SCL_SECURE_NO_WARNINGS)
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/include/tokenizer/strings.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/include/tokenizer/strings.hpp
new file mode 100644
index 000000000..bb1c85a8f
--- /dev/null
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/include/tokenizer/strings.hpp
@@ -0,0 +1,60 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <openvino/runtime/tensor.hpp>
+
+// Pack any container with string to ov::Tensor with element type u8
+// Requirements for BatchOfStrings: .size() with size and .begin(), .end() as iterators, elements with .begin(), .end() and .length()
+// so basically any STL container with std::string is compatible
+// Tensor destination will be reshaped according the input data
+template <typename BatchOfStrings>
+void pack_strings(const BatchOfStrings& strings, ov::Tensor& destination) {
+    auto batch_size = strings.size();
+
+    // First run over all elements: calculate total memory required to hold all strings
+    auto symbols_size = std::accumulate(
+        strings.begin(), strings.end(), size_t(0),
+        [](size_t accum, typename BatchOfStrings::const_reference s)
+        { return accum + s.length(); });
+
+    auto total_size = 4*(1 + 1 + batch_size) + symbols_size;
+    destination.set_shape({total_size});
+
+    auto data = destination.data();
+    auto pbatch_size = reinterpret_cast<int32_t*>(data);
+    auto pindices = pbatch_size + 1;
+    auto psymbols = reinterpret_cast<char*>(pindices + 1 + batch_size);
+    size_t current_symbols_pos = 0;
+
+    *pbatch_size = batch_size;
+    *pindices = 0;
+
+    for(auto s: strings) {
+        psymbols = std::copy(s.begin(), s.end(), psymbols);
+        current_symbols_pos += s.length();
+        *++pindices = current_symbols_pos;
+    }
+}
+
+std::vector<std::string> unpack_strings(const ov::Tensor& source) {
+    auto strings = source.data<const uint8_t>();
+    auto length = source.get_byte_size();
+    // check the format of the input bitstream representing the string tensor
+    OPENVINO_ASSERT(length >= 4, "Incorrect packed string tensor format: no batch size in the packed string tensor");
+    auto batch_size = *reinterpret_cast<const int32_t*>(strings + 0);
+    OPENVINO_ASSERT(length >= 4 + 4 + 4 * batch_size,
+        "Incorrect packed string tensor format: the packed string tensor must contain first string offset and end indices");
+    auto begin_ids = reinterpret_cast<const int32_t*>(strings + 4);
+    auto end_ids = begin_ids + 1;
+    auto symbols = strings + 4 + 4 + 4 * batch_size;
+
+    std::vector<std::string> result;
+    result.reserve(batch_size);
+    for(size_t i = 0; i < batch_size; ++i) {
+        result.push_back(std::string(symbols + begin_ids[i], symbols + end_ids[i]));
+    }
+    return result;
+}
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp b/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp
index 199cd314e..3aaf6989e 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/utils.cpp
@@ -222,23 +222,3 @@ std::shared_ptr<Constant> string_attribute_to_constant (const ov::frontend::NodeCont
     return std::make_shared<Constant>(element::u8, Shape{value.length()}, (const void*)value.data());
 #endif
 }
-
-std::vector<std::string> unpack_strings (const ov::Tensor& source) {
-    auto strings = source.data<const uint8_t>();
-    auto length = source.get_byte_size();
-    // check the format of the input bitstream representing the string tensor
-    OPENVINO_ASSERT(length >= 4, "Incorrect packed string tensor format: no batch size in the packed string tensor");
-    auto batch_size = *reinterpret_cast<const int32_t*>(strings + 0);
-    OPENVINO_ASSERT(length >= 4 + 4 + 4 * batch_size,
-        "Incorrect packed string tensor format: the packed string tensor must contain first string offset and end indices");
-    auto begin_ids = reinterpret_cast<const int32_t*>(strings + 4);
-    auto end_ids = begin_ids + 1;
-    auto symbols = strings + 4 + 4 + 4 * batch_size;
-
-    std::vector<std::string> result;
-    result.reserve(batch_size);
-    for(size_t i = 0; i < batch_size; ++i) {
-        result.push_back(std::string(symbols + begin_ids[i], symbols + end_ids[i]));
-    }
-    return result;
-}
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/utils.hpp b/modules/custom_operations/user_ie_extensions/tokenizer/utils.hpp
index 32bb34110..8ffbc9e04 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/utils.hpp
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/utils.hpp
@@ -68,38 +68,3 @@ bool evaluate_normalization_helper (
     std::function normalizer);
 
 std::shared_ptr<Constant> string_attribute_to_constant (const ov::frontend::NodeContext& node, const std::string& name);
-
-// Pack any container with string to ov::Tensor with element type u8
-// Requirements for BatchOfStrings: .size() with size and .begin(), .end() as iterators, elements with .begin(), .end() and .length()
-// so basically any STL container with std::string is compatible
-// Tensor destination will be reshaped according the input data
-template <typename BatchOfStrings>
-void pack_strings (const BatchOfStrings& strings, ov::Tensor& destination) {
-    auto batch_size = strings.size();
-
-    // First run over all elements: calculate total memory required to hold all strings
-    auto symbols_size = std::accumulate(
-        strings.begin(), strings.end(), size_t(0),
-        [](size_t accum, typename BatchOfStrings::const_reference s)
-        { return accum + s.length(); });
-
-    auto total_size = 4*(1 + 1 + batch_size) + symbols_size;
-    destination.set_shape({total_size});
-
-    auto data = destination.data();
-    auto pbatch_size = reinterpret_cast<int32_t*>(data);
-    auto pindices = pbatch_size + 1;
-    auto psymbols = reinterpret_cast<char*>(pindices + 1 + batch_size);
-    size_t current_symbols_pos = 0;
-
-    *pbatch_size = batch_size;
-    *pindices = 0;
-
-    for(auto s: strings) {
-        psymbols = std::copy(s.begin(), s.end(), psymbols);
-        current_symbols_pos += s.length();
-        *++pindices = current_symbols_pos;
-    }
-}
-
-std::vector<std::string> unpack_strings(const ov::Tensor& source);

From b739ffd0bf097f6d54e29b10090b9265d072ee30 Mon Sep 17 00:00:00 2001
From: Wovchena
Date: Sat, 11 Nov 2023 03:24:20 +0400
Subject: [PATCH 3/3] openvino_extensions

---
 .../user_ie_extensions/CMakeLists.txt       |  1 +
 .../openvino_extensions}/strings.hpp        | 49 ++++++++++---------
 .../tokenizer/CMakeLists.txt                |  2 -
 3 files changed, 26 insertions(+), 26 deletions(-)
 rename modules/custom_operations/user_ie_extensions/{tokenizer/include/tokenizer => include/openvino_extensions}/strings.hpp (54%)

diff --git a/modules/custom_operations/user_ie_extensions/CMakeLists.txt b/modules/custom_operations/user_ie_extensions/CMakeLists.txt
index 26f438b07..c830c0a21 100644
--- a/modules/custom_operations/user_ie_extensions/CMakeLists.txt
+++ b/modules/custom_operations/user_ie_extensions/CMakeLists.txt
@@ -101,3 +101,4 @@ endif()
 
 target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime)
 target_compile_definitions(${TARGET_NAME} PRIVATE ${CUSTOM_OPERATIONS})
+target_include_directories(${TARGET_NAME} PUBLIC ./include/)
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/include/tokenizer/strings.hpp b/modules/custom_operations/user_ie_extensions/include/openvino_extensions/strings.hpp
similarity index 54%
rename from modules/custom_operations/user_ie_extensions/tokenizer/include/tokenizer/strings.hpp
rename to modules/custom_operations/user_ie_extensions/include/openvino_extensions/strings.hpp
index bb1c85a8f..5bfe85e5a 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/include/tokenizer/strings.hpp
+++ b/modules/custom_operations/user_ie_extensions/include/openvino_extensions/strings.hpp
@@ -6,8 +6,9 @@
 
 #include <openvino/runtime/tensor.hpp>
 
+namespace openvino_extensions {
 // Pack any container with string to ov::Tensor with element type u8
-// Requirements for BatchOfStrings: .size() with size and .begin(), .end() as iterators, elements with .begin(), .end() and .length()
+// Requirements for BatchOfStrings: .size() with size and .begin(), .end() as iterators, elements with .begin(), .end() and .size()
 // so basically any STL container with std::string is compatible
 // Tensor destination will be reshaped according the input data
 template <typename BatchOfStrings>
@@ -15,46 +16,46 @@ void pack_strings(const BatchOfStrings& strings, ov::Tensor& destination) {
     auto batch_size = strings.size();
 
     // First run over all elements: calculate total memory required to hold all strings
-    auto symbols_size = std::accumulate(
+    size_t symbols_size = std::accumulate(
         strings.begin(), strings.end(), size_t(0),
-        [](size_t accum, typename BatchOfStrings::const_reference s)
-        { return accum + s.length(); });
+        [](size_t accum, typename BatchOfStrings::const_reference str)
+        { return accum + str.size(); });
 
-    auto total_size = 4*(1 + 1 + batch_size) + symbols_size;
+    size_t total_size = 4 * (1 + 1 + batch_size) + symbols_size;
     destination.set_shape({total_size});
 
-    auto data = destination.data();
-    auto pbatch_size = reinterpret_cast<int32_t*>(data);
-    auto pindices = pbatch_size + 1;
-    auto psymbols = reinterpret_cast<char*>(pindices + 1 + batch_size);
+    int32_t* pindices = reinterpret_cast<int32_t*>(destination.data());
+    pindices[0] = batch_size;
+    pindices[1] = 0;
+    pindices += 2;
+    char* psymbols = reinterpret_cast<char*>(pindices + batch_size);
     size_t current_symbols_pos = 0;
 
-    *pbatch_size = batch_size;
-    *pindices = 0;
-
-    for(auto s: strings) {
-        psymbols = std::copy(s.begin(), s.end(), psymbols);
-        current_symbols_pos += s.length();
-        *++pindices = current_symbols_pos;
+    for (const auto& str: strings) {
+        psymbols = std::copy(str.begin(), str.end(), psymbols);
+        current_symbols_pos += str.size();
+        *pindices = current_symbols_pos;
+        ++pindices;
     }
 }
 
 std::vector<std::string> unpack_strings(const ov::Tensor& source) {
-    auto strings = source.data<const uint8_t>();
-    auto length = source.get_byte_size();
+    int32_t length = source.get_byte_size();
     // check the format of the input bitstream representing the string tensor
     OPENVINO_ASSERT(length >= 4, "Incorrect packed string tensor format: no batch size in the packed string tensor");
-    auto batch_size = *reinterpret_cast<const int32_t*>(strings + 0);
+    const int32_t* pindices = reinterpret_cast<const int32_t*>(source.data());
+    int32_t batch_size = pindices[0];
     OPENVINO_ASSERT(length >= 4 + 4 + 4 * batch_size,
         "Incorrect packed string tensor format: the packed string tensor must contain first string offset and end indices");
-    auto begin_ids = reinterpret_cast<const int32_t*>(strings + 4);
-    auto end_ids = begin_ids + 1;
-    auto symbols = strings + 4 + 4 + 4 * batch_size;
+    const int32_t* begin_ids = pindices + 1;
+    const int32_t* end_ids = pindices + 2;
+    const char* symbols = reinterpret_cast<const char*>(pindices + 2 + batch_size);
 
     std::vector<std::string> result;
     result.reserve(batch_size);
-    for(size_t i = 0; i < batch_size; ++i) {
-        result.push_back(std::string(symbols + begin_ids[i], symbols + end_ids[i]));
+    for (int32_t idx = 0; idx < batch_size; ++idx) {
+        result.emplace_back(symbols + begin_ids[idx], symbols + end_ids[idx]);
     }
     return result;
 }
+}
diff --git a/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt b/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt
index 5b799dd44..fcb5df69b 100644
--- a/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt
+++ b/modules/custom_operations/user_ie_extensions/tokenizer/CMakeLists.txt
@@ -134,8 +134,6 @@ target_include_directories(${TARGET_NAME} PRIVATE
     # fast_tokenizer
    ${FAST_TOKENIZER_INCS})
 
-target_include_directories(${TARGET_NAME} PUBLIC ./include/)
-
 if(CMAKE_CL_64)
     target_compile_definitions(sentencepiece-static PRIVATE _CRT_SECURE_NO_WARNINGS _SCL_SECURE_NO_WARNINGS)
 endif()
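
Usage note (not part of the patch series): a minimal sketch of how a downstream consumer might call the exported helpers once [PATCH 3/3] is applied. It assumes the consuming target picks up the extension's include/ directory (propagated by the PUBLIC target_include_directories above) and links against openvino::runtime; the prompt strings, and the reliance on ov::Tensor::set_shape() reallocating a host-owned tensor, are illustrative assumptions rather than anything mandated by the patches.

    #include <iostream>
    #include <numeric>   // strings.hpp uses std::accumulate
    #include <string>
    #include <vector>

    #include <openvino/runtime/tensor.hpp>
    #include <openvino_extensions/strings.hpp>

    int main() {
        // Batch to pack; any STL container of std::string should satisfy BatchOfStrings.
        std::vector<std::string> prompts{"Hello", "OpenVINO"};

        // pack_strings() reshapes the destination, so an empty u8 tensor is enough here.
        // Resulting layout: [int32 batch_size][int32 end offsets, one per string][raw chars].
        ov::Tensor packed(ov::element::u8, ov::Shape{0});
        openvino_extensions::pack_strings(prompts, packed);

        // Round-trip back to std::string to check the packed layout.
        for (const std::string& s : openvino_extensions::unpack_strings(packed)) {
            std::cout << s << '\n';
        }
        return 0;
    }

One design caveat visible in the patched header: unpack_strings() is defined in strings.hpp without the inline keyword, so including the header from more than one translation unit would produce duplicate-symbol link errors; the sketch above keeps everything in a single file for that reason.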