From e244cbdffe585160425173510eaac60a63050529 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Tue, 5 Nov 2024 16:46:52 +0000 Subject: [PATCH 1/4] Add max_length Option to CLI Convert Tool --- python/openvino_tokenizers/cli.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/python/openvino_tokenizers/cli.py b/python/openvino_tokenizers/cli.py index a34d5e57..a4f11aa2 100644 --- a/python/openvino_tokenizers/cli.py +++ b/python/openvino_tokenizers/cli.py @@ -2,7 +2,7 @@ # Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -from argparse import Action, ArgumentParser +from argparse import Action, ArgumentParser, ArgumentError from pathlib import Path from openvino import Type, save_model @@ -21,6 +21,12 @@ def __call__(self, parser, namespace, values, option_string=None) -> None: setattr(namespace, self.dest, self.string_to_type_dict[values]) +def check_positive_int(value: str) -> int: + int_value = int(value) + if int_value <= 0: + raise ArgumentError(f"Value must be positive integer, got: {value}") + return int_value + class TrueOrPositiveIntAction(Action): def __call__(self, parser, namespace, values, option_string=None) -> None: if values.isnumeric(): @@ -104,6 +110,17 @@ def get_parser() -> ArgumentParser: "Not supported for Sentencepiece-based tokenizers." ), ) + parser.add_argument( + "--max_length", + "--max-length", + required=False, + type=check_positive_int, + help=( + "Set max_length to the tokenizer for truncation operation. " + "Tokenizer won't produce output longer than max_length. " + "The value will be replaced by the max_padding option if set." + ), + ) skip_special_group = parser.add_mutually_exclusive_group() skip_special_group.add_argument( "--not-skip-special-tokens", @@ -250,9 +267,13 @@ def convert_hf_tokenizer() -> None: print("Loading Huggingface Tokenizer...") hf_tokenizer = AutoTokenizer.from_pretrained(args.name, **tokenizer_init_kwargs) + if isinstance(args.max_padding, int) and args.max_padding is not True: print(f"Set max_length to: {args.max_padding}") hf_tokenizer.model_max_length = args.max_padding + elif args.max_length: + print(f"Set max_length to: {args.max_length}") + hf_tokenizer.model_max_length = args.max_length print("Converting Huggingface Tokenizer to OpenVINO...") converted = convert_tokenizer( From 5e1d359482b5e0c5b36e2a29346585ce8579f238 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Tue, 5 Nov 2024 16:49:19 +0000 Subject: [PATCH 2/4] Add max_length Option to CLI Convert Tool --- python/openvino_tokenizers/cli.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/openvino_tokenizers/cli.py b/python/openvino_tokenizers/cli.py index a4f11aa2..21750339 100644 --- a/python/openvino_tokenizers/cli.py +++ b/python/openvino_tokenizers/cli.py @@ -21,10 +21,10 @@ def __call__(self, parser, namespace, values, option_string=None) -> None: setattr(namespace, self.dest, self.string_to_type_dict[values]) -def check_positive_int(value: str) -> int: +def check_max_length_positive_int(value: str) -> int: int_value = int(value) if int_value <= 0: - raise ArgumentError(f"Value must be positive integer, got: {value}") + raise ArgumentError(f"Max length must be positive integer, got: {value}") return int_value class TrueOrPositiveIntAction(Action): @@ -114,7 +114,7 @@ def get_parser() -> ArgumentParser: "--max_length", "--max-length", required=False, - type=check_positive_int, + type=check_max_length_positive_int, help=( "Set max_length to the tokenizer for truncation operation. " "Tokenizer won't produce output longer than max_length. " From 8ccc53638e114ba3ed9c68efc26fd6195eb536d2 Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Tue, 5 Nov 2024 16:49:56 +0000 Subject: [PATCH 3/4] Add max_length Option to CLI Convert Tool --- python/openvino_tokenizers/cli.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/openvino_tokenizers/cli.py b/python/openvino_tokenizers/cli.py index 21750339..680cc8b8 100644 --- a/python/openvino_tokenizers/cli.py +++ b/python/openvino_tokenizers/cli.py @@ -2,7 +2,7 @@ # Copyright (C) 2023-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -from argparse import Action, ArgumentParser, ArgumentError +from argparse import Action, ArgumentError, ArgumentParser from pathlib import Path from openvino import Type, save_model @@ -27,6 +27,7 @@ def check_max_length_positive_int(value: str) -> int: raise ArgumentError(f"Max length must be positive integer, got: {value}") return int_value + class TrueOrPositiveIntAction(Action): def __call__(self, parser, namespace, values, option_string=None) -> None: if values.isnumeric(): From 95c1330defcd8d19f8217f371e4729490a814d0c Mon Sep 17 00:00:00 2001 From: Artur Paniukov Date: Tue, 5 Nov 2024 16:54:11 +0000 Subject: [PATCH 4/4] Add max_length Option to CLI Convert Tool --- python/openvino_tokenizers/cli.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/openvino_tokenizers/cli.py b/python/openvino_tokenizers/cli.py index 680cc8b8..ded5de9c 100644 --- a/python/openvino_tokenizers/cli.py +++ b/python/openvino_tokenizers/cli.py @@ -21,10 +21,10 @@ def __call__(self, parser, namespace, values, option_string=None) -> None: setattr(namespace, self.dest, self.string_to_type_dict[values]) -def check_max_length_positive_int(value: str) -> int: +def check_positive_int(value: str) -> int: int_value = int(value) if int_value <= 0: - raise ArgumentError(f"Max length must be positive integer, got: {value}") + raise ArgumentError(f"Value must be positive integer, got: {value}") return int_value @@ -115,7 +115,7 @@ def get_parser() -> ArgumentParser: "--max_length", "--max-length", required=False, - type=check_max_length_positive_int, + type=check_positive_int, help=( "Set max_length to the tokenizer for truncation operation. " "Tokenizer won't produce output longer than max_length. "