diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml
index 2e0afaa882..b02aac6023 100644
--- a/.github/workflows/causal_lm_cpp.yml
+++ b/.github/workflows/causal_lm_cpp.yml
@@ -22,6 +22,18 @@ env:
   w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.1.0-17911-83c047443de/w_openvino_toolkit_windows_2025.1.0.dev20250116_x86_64.zip
 jobs:
+  code-quality-checks:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v4
+        with:
+          python-version: 3.9
+      - name: Install pre-commit
+        run: pip install pre-commit
+      - name: Run pre-commit (checks for trailing whitespace and non-ASCII symbols in filenames and file content)
+        run: pre-commit run --all-files --show-diff-on-failure
+
   cpp-multinomial-greedy_causal_lm-ubuntu:
     runs-on: ubuntu-20.04-8-cores
     defaults:
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000000..7d724ba222
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,20 @@
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.0.1
+    hooks:
+      - id: trailing-whitespace # checks for files with trailing whitespace, excluding .md and Git-related hidden files
+        exclude: '\.md$|.*\.git.*'
+      - id: check-merge-conflict # checks for files that contain merge conflict strings (such as <<<<<<<, =======, and >>>>>>>)
+      - id: check-json # Ensures that JSON files are syntactically correct
+      - id: end-of-file-fixer # ensures that each file ends with a single trailing newline, excluding Git-related hidden files
+        exclude: '.*\.git.*'
+  - repo: local
+    hooks:
+      - id: forbid-non-ascii-filenames # runs the script that prohibits non-ASCII characters in file names
+        name: Prohibit non-ASCII characters in file names
+        entry: ./pre_commit_scripts/check_non_ascii_filenames.sh
+        language: script
+      - id: forbid-non-ascii-in-files # checks for non-ASCII characters in files (excluding Markdown and hidden files), with characters ± and ? allowed
+        name: Check for non-ASCII characters in files (excluding Markdown and hidden files), with characters ± and ? allowed
+        entry: ./pre_commit_scripts/check_non_ascii_in_files.sh
+        language: script
diff --git a/pre_commit_scripts/check_non_ascii_filenames.sh b/pre_commit_scripts/check_non_ascii_filenames.sh
new file mode 100755
index 0000000000..2bd4a5deb7
--- /dev/null
+++ b/pre_commit_scripts/check_non_ascii_filenames.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# Hash of the empty tree, used as a diff base so that every file shows up as added:
+empty_tree=$(git hash-object -t tree /dev/null)
+
+# Get a list of new files that might have non-ASCII characters:
+problem_files=$(git diff --name-only --diff-filter=A -z "$empty_tree" | LC_ALL=C grep -P "[^\x00-\x7F]")
+
+# Count the number of problematic files:
+count=$(echo "$problem_files" | wc -w)
+
+# Print necessary info based on the result:
+if [ "$count" -ne 0 ]; then
+    echo "Error: Non-ASCII characters found in filenames of new files:"
+    echo "$problem_files"
+    exit 1
+else
+    echo "Success: No non-ASCII filenames found."
+fi
+exit 0
diff --git a/pre_commit_scripts/check_non_ascii_in_files.sh b/pre_commit_scripts/check_non_ascii_in_files.sh
new file mode 100755
index 0000000000..18206cbdb2
--- /dev/null
+++ b/pre_commit_scripts/check_non_ascii_in_files.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+# Define the list of files to check, excluding .md, hidden, and a number of specific files:
+files_to_check=$(git ls-files | grep -vE "^\."
| grep -vE "\.md$" | grep -vE "^(tests/python_tests|tools/who_what_benchmark/(tests|whowhatbench))" | grep -v "tools/llm_bench/llm_bench_utils/ov_model_classes.py") + +# Run git grep to find non-ASCII characters in the selected files and store the results: +results=$(LC_ALL=C git grep -n "[^ -~±�\”\“]" -- $files_to_check) + +# Print the results: +if [ -n "$results" ]; then + echo "Error: Non-ASCII characters found in files:" + echo "$results" + exit 1 +else + echo "Success: No non-ASCII characters found in files." +fi +exit 0 diff --git a/requirements-build.txt b/requirements-build.txt index b961822bb7..dde07fdcde 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -1,3 +1,3 @@ cmake~=3.23.0; platform_system != 'Darwin' or platform_machine == 'x86_64' cmake~=3.24.0; platform_system == 'Darwin' and platform_machine == 'arm64' -pybind11-stubgen==2.5.1 \ No newline at end of file +pybind11-stubgen==2.5.1 diff --git a/samples/cpp/image_generation/CMakeLists.txt b/samples/cpp/image_generation/CMakeLists.txt index 16710d2697..b1274d7551 100644 --- a/samples/cpp/image_generation/CMakeLists.txt +++ b/samples/cpp/image_generation/CMakeLists.txt @@ -107,4 +107,4 @@ set_target_properties(inpainting PROPERTIES install(TARGETS inpainting RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin - EXCLUDE_FROM_ALL) \ No newline at end of file + EXCLUDE_FROM_ALL) diff --git a/samples/cpp/text_generation/CMakeLists.txt b/samples/cpp/text_generation/CMakeLists.txt index efad21a647..31b12da8b5 100644 --- a/samples/cpp/text_generation/CMakeLists.txt +++ b/samples/cpp/text_generation/CMakeLists.txt @@ -58,4 +58,4 @@ set_target_properties(benchmark_genai PROPERTIES install(TARGETS benchmark_genai RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin - EXCLUDE_FROM_ALL) \ No newline at end of file + EXCLUDE_FROM_ALL) diff --git a/samples/cpp/text_generation/beam_search_causal_lm.cpp b/samples/cpp/text_generation/beam_search_causal_lm.cpp index 9e1ee069ad..8850eb2b6c 100644 --- a/samples/cpp/text_generation/beam_search_causal_lm.cpp +++ b/samples/cpp/text_generation/beam_search_causal_lm.cpp @@ -19,7 +19,7 @@ int main(int argc, char* argv[]) try { config.num_beams = 15; config.diversity_penalty = 1.0f; config.num_return_sequences = config.num_beams; - + // Since the streamer is set, the results will // be printed each time a new token is generated. auto beams = pipe.generate(prompts, config); diff --git a/samples/cpp/text_generation/chat_sample.cpp b/samples/cpp/text_generation/chat_sample.cpp index c0d172563c..6f4bc85812 100644 --- a/samples/cpp/text_generation/chat_sample.cpp +++ b/samples/cpp/text_generation/chat_sample.cpp @@ -12,14 +12,14 @@ int main(int argc, char* argv[]) try { std::string device = "CPU"; // GPU, NPU can be used as well ov::genai::LLMPipeline pipe(models_path, device); - + ov::genai::GenerationConfig config; config.max_new_tokens = 100; - std::function streamer = [](std::string word) { + std::function streamer = [](std::string word) { std::cout << word << std::flush; // Return flag corresponds whether generation should be stopped. // false means continue generation. 
- return false; + return false; }; pipe.start_chat(); diff --git a/samples/cpp/text_generation/encrypted_model_causal_lm.cpp b/samples/cpp/text_generation/encrypted_model_causal_lm.cpp index 7926de5552..735f391875 100644 --- a/samples/cpp/text_generation/encrypted_model_causal_lm.cpp +++ b/samples/cpp/text_generation/encrypted_model_causal_lm.cpp @@ -41,7 +41,7 @@ int main(int argc, char* argv[]) try { auto [model_str, model_weights] = decrypt_model(models_path + "/openvino_model.xml", models_path + "/openvino_model.bin"); ov::genai::Tokenizer tokenizer = decrypt_tokenizer(models_path); - + ov::genai::LLMPipeline pipe(model_str, model_weights, tokenizer, device); std::string result = pipe.generate(prompt, ov::genai::max_new_tokens(100)); diff --git a/samples/cpp/text_generation/prompt_lookup_decoding_lm.cpp b/samples/cpp/text_generation/prompt_lookup_decoding_lm.cpp index bf4d81daa2..d4251b67c3 100644 --- a/samples/cpp/text_generation/prompt_lookup_decoding_lm.cpp +++ b/samples/cpp/text_generation/prompt_lookup_decoding_lm.cpp @@ -19,7 +19,7 @@ int main(int argc, char* argv[]) try { std::string model_path = argv[1]; std::string prompt = argv[2]; - + std::string device = "CPU"; ov::genai::LLMPipeline pipe( diff --git a/samples/cpp/visual_language_chat/CMakeLists.txt b/samples/cpp/visual_language_chat/CMakeLists.txt index 6f209ad0c8..e9d8db56e9 100644 --- a/samples/cpp/visual_language_chat/CMakeLists.txt +++ b/samples/cpp/visual_language_chat/CMakeLists.txt @@ -42,4 +42,4 @@ set_target_properties(benchmark_vlm PROPERTIES install(TARGETS benchmark_vlm RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin - EXCLUDE_FROM_ALL) \ No newline at end of file + EXCLUDE_FROM_ALL) diff --git a/samples/cpp/visual_language_chat/benchmark_vlm.cpp b/samples/cpp/visual_language_chat/benchmark_vlm.cpp index 8467738307..322f806577 100644 --- a/samples/cpp/visual_language_chat/benchmark_vlm.cpp +++ b/samples/cpp/visual_language_chat/benchmark_vlm.cpp @@ -42,15 +42,15 @@ int main(int argc, char* argv[]) try { size_t num_warmup = result["num_warmup"].as(); size_t num_iter = result["num_iter"].as(); ov::Tensor image = utils::load_image(image_path); - + ov::genai::GenerationConfig config; config.max_new_tokens = result["max_new_tokens"].as(); ov::genai::VLMPipeline pipe(models_path, device); - + for (size_t i = 0; i < num_warmup; i++) pipe.generate(prompt, ov::genai::image(image), ov::genai::generation_config(config)); - + auto res = pipe.generate(prompt, ov::genai::image(image), ov::genai::generation_config(config)); auto metrics = res.perf_metrics; for (size_t i = 0; i < num_iter - 1; i++) { diff --git a/samples/export-requirements.txt b/samples/export-requirements.txt index 1eaf151466..b48dbbaa70 100644 --- a/samples/export-requirements.txt +++ b/samples/export-requirements.txt @@ -10,4 +10,4 @@ diffusers==0.32.2 # For image generation pipelines timm==1.0.14 # For exporting InternVL2 torchvision # For visual language models transformers>=4.43 # For Whisper -hf_transfer # for faster models download, should used with env var HF_HUB_ENABLE_HF_TRANSFER=1 \ No newline at end of file +hf_transfer # for faster models download, should used with env var HF_HUB_ENABLE_HF_TRANSFER=1 diff --git a/samples/python/image_generation/text2image.py b/samples/python/image_generation/text2image.py index cba1eefd1d..d7126cbe2e 100644 --- a/samples/python/image_generation/text2image.py +++ b/samples/python/image_generation/text2image.py @@ -29,4 +29,4 @@ def main(): if '__main__' == __name__: - main() \ No newline at end of file + 
main() diff --git a/samples/python/text_generation/benchmark_genai.py b/samples/python/text_generation/benchmark_genai.py index d279ab95fc..76e1a0cd4d 100755 --- a/samples/python/text_generation/benchmark_genai.py +++ b/samples/python/text_generation/benchmark_genai.py @@ -12,31 +12,31 @@ def main(): parser.add_argument("-n", "--num_iter", type=int, default=2, help="Number of iterations") parser.add_argument("-mt", "--max_new_tokens", type=int, default=20, help="Maximal number of new tokens") parser.add_argument("-d", "--device", type=str, default="CPU", help="Device") - + args = parser.parse_args() - # Perf metrics is stored in DecodedResults. + # Perf metrics is stored in DecodedResults. # In order to get DecodedResults instead of a string input should be a list. prompt = [args.prompt] models_path = args.model device = args.device num_warmup = args.num_warmup num_iter = args.num_iter - + config = ov_genai.GenerationConfig() config.max_new_tokens = args.max_new_tokens pipe = ov_genai.LLMPipeline(models_path, device) - + for _ in range(num_warmup): pipe.generate(prompt, config) - + res = pipe.generate(prompt, config) perf_metrics = res.perf_metrics for _ in range(num_iter - 1): res = pipe.generate(prompt, config) perf_metrics += res.perf_metrics - + print(f"Load time: {perf_metrics.get_load_time():.2f} ms") print(f"Generate time: {perf_metrics.get_generate_duration().mean:.2f} ± {perf_metrics.get_generate_duration().std:.2f} ms") print(f"Tokenization time: {perf_metrics.get_tokenization_duration().mean:.2f} ± {perf_metrics.get_tokenization_duration().std:.2f} ms") diff --git a/samples/python/text_generation/multinomial_causal_lm.py b/samples/python/text_generation/multinomial_causal_lm.py index c915b89a2f..61bb14b58d 100755 --- a/samples/python/text_generation/multinomial_causal_lm.py +++ b/samples/python/text_generation/multinomial_causal_lm.py @@ -11,18 +11,18 @@ class IterableStreamer(openvino_genai.StreamerBase): """ A custom streamer class for handling token streaming and detokenization with buffering. - + Attributes: tokenizer (Tokenizer): The tokenizer used for encoding and decoding tokens. tokens_cache (list): A buffer to accumulate tokens for detokenization. text_queue (Queue): A synchronized queue for storing decoded text chunks. print_len (int): The length of the printed text to manage incremental decoding. """ - + def __init__(self, tokenizer): """ Initializes the IterableStreamer with the given tokenizer. - + Args: tokenizer (Tokenizer): The tokenizer to use for encoding and decoding tokens. """ @@ -38,14 +38,14 @@ def __iter__(self): Returns the iterator object itself. """ return self - + def __next__(self): """ Returns the next value from the text queue. - + Returns: str: The next decoded text chunk. - + Raises: StopIteration: If there are no more elements in the queue. """ @@ -53,20 +53,20 @@ def __next__(self): if value is None: raise StopIteration return value - + def get_stop_flag(self): """ Checks whether the generation process should be stopped. - + Returns: bool: Always returns False in this implementation. """ return False - + def put_word(self, word: str): """ Puts a word into the text queue. - + Args: word (str): The word to put into the queue. """ @@ -75,10 +75,10 @@ def put_word(self, word: str): def put(self, token_id: int) -> bool: """ Processes a token and manages the decoding buffer. Adds decoded text to the queue. - + Args: token_id (int): The token_id to process. - + Returns: bool: True if generation should be stopped, False otherwise. 
""" @@ -168,7 +168,7 @@ def token_printer(): config.top_p = 0.9 config.top_k = 30 - # Since the streamer is set, the results will be printed + # Since the streamer is set, the results will be printed # every time a new token is generated and put into the streamer queue. pipe.generate(args.prompt, config, text_print_streamer) printer_thread.join() diff --git a/samples/python/text_generation/prompt_lookup_decoding_lm.py b/samples/python/text_generation/prompt_lookup_decoding_lm.py index 726391ba9b..ea06510db1 100755 --- a/samples/python/text_generation/prompt_lookup_decoding_lm.py +++ b/samples/python/text_generation/prompt_lookup_decoding_lm.py @@ -5,10 +5,10 @@ import argparse import openvino_genai -def streamer(subword): - print(subword, end='', flush=True) - # Return flag corresponds whether generation should be stopped. - # False means continue generation. +def streamer(subword): + print(subword, end='', flush=True) + # Return flag corresponds whether generation should be stopped. + # False means continue generation. return False def main(): @@ -20,7 +20,7 @@ def main(): device = 'CPU' pipe = openvino_genai.LLMPipeline(args.model_dir, device, prompt_lookup=True) - + config = openvino_genai.GenerationConfig() config.max_new_tokens = 100 # add parameter to enable prompt lookup decoding to generate `num_assistant_tokens` candidates per iteration @@ -28,7 +28,7 @@ def main(): # Define max_ngram_size config.max_ngram_size = 3 - # Since the streamer is set, the results will be printed + # Since the streamer is set, the results will be printed # every time a new token is generated and put into the streamer queue. pipe.generate(args.prompt, config, streamer) diff --git a/samples/python/text_generation/speculative_decoding_lm.py b/samples/python/text_generation/speculative_decoding_lm.py index 76f925b26c..fcec33e076 100755 --- a/samples/python/text_generation/speculative_decoding_lm.py +++ b/samples/python/text_generation/speculative_decoding_lm.py @@ -8,7 +8,7 @@ def streamer(subword): print(subword, end='', flush=True) - # Return flag corresponds whether generation should be stopped. + # Return flag corresponds whether generation should be stopped. # False means continue generation. return False @@ -27,7 +27,7 @@ def main(): draft_model = openvino_genai.draft_model(args.draft_model_dir, draft_device) pipe = openvino_genai.LLMPipeline(args.model_dir, main_device, draft_model=draft_model) - + config = openvino_genai.GenerationConfig() config.max_new_tokens = 100 # Speculative decoding generation parameters like `num_assistant_tokens` and `assistant_confidence_threshold` are mutually excluded @@ -36,7 +36,7 @@ def main(): # add parameter to enable speculative decoding to generate candidates by draft_model while candidate probability is higher than `assistant_confidence_threshold` # config.assistant_confidence_threshold = 0.4 - # Since the streamer is set, the results will be printed + # Since the streamer is set, the results will be printed # every time a new token is generated and put into the streamer queue. 
pipe.generate(args.prompt, config, streamer) diff --git a/samples/python/whisper_speech_recognition/recorder.py b/samples/python/whisper_speech_recognition/recorder.py index e79f1f9008..7202c98357 100644 --- a/samples/python/whisper_speech_recognition/recorder.py +++ b/samples/python/whisper_speech_recognition/recorder.py @@ -15,7 +15,7 @@ sample_format = pyaudio.paInt16 # 16 bits per sample channels = 1 fs = 16000 # Record at 16k samples per second -seconds = 5 +seconds = 5 filename = "output.wav" p = pyaudio.PyAudio() # Create an interface to PortAudio @@ -34,7 +34,7 @@ data = stream.read(chunk) frames.append(data) -# Stop and close the stream +# Stop and close the stream stream.stop_stream() stream.close() # Terminate the PortAudio interface diff --git a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp index 6c5552a7b5..f9c8520aeb 100644 --- a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp +++ b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp @@ -95,7 +95,7 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline { /** * @brief Constructs a ContinuousBatchingPipeline from already existing model and tokenizer. - * + * * This constructor allows for the creation of a ContinuousBatchingPipeline using an existing model * represented as a string and a weights tensor, along with a manually initialized tokenizer. * This is useful when the model and tokenizer are already loaded or created in memory and do not diff --git a/src/cpp/include/openvino/genai/generation_handle.hpp b/src/cpp/include/openvino/genai/generation_handle.hpp index 71ec815e61..f81102fd88 100644 --- a/src/cpp/include/openvino/genai/generation_handle.hpp +++ b/src/cpp/include/openvino/genai/generation_handle.hpp @@ -31,7 +31,7 @@ struct EncodedGenerationResult { // Status of generation GenerationStatus m_status = GenerationStatus::RUNNING; - + // PerfMetrics but with empty tokenization/detokenization durations. PerfMetrics perf_metrics; }; @@ -73,7 +73,7 @@ class GenerationStream; class OPENVINO_GENAI_EXPORTS GenerationHandleImpl { std::shared_ptr m_generation_stream; ov::genai::GenerationConfig m_sampling_params; - + public: GenerationHandleImpl(std::shared_ptr generation_stream, const ov::genai::GenerationConfig& sampling_params) : m_generation_stream(std::move(generation_stream)), diff --git a/src/cpp/include/openvino/genai/image_generation/text2image_pipeline.hpp b/src/cpp/include/openvino/genai/image_generation/text2image_pipeline.hpp index 11787869cf..e78be0048d 100644 --- a/src/cpp/include/openvino/genai/image_generation/text2image_pipeline.hpp +++ b/src/cpp/include/openvino/genai/image_generation/text2image_pipeline.hpp @@ -18,7 +18,7 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline { /** * Initializes text to image generation pipeline from a folder with models. * Note, such pipeline is not ready to use as models are not compiled internally. - * + * * Typical scenario is to initialize models using this constructor and then reshape pipeline * with 'reshape()' method and then perform compilation using 'compile()' method. 
* @param models_path A models path to read models and config files from @@ -30,7 +30,7 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline { * @param models_path A models path to read models and config files from * @param device A single device used for all models * @param properties Properties to pass to 'compile_model' or other pipeline properties like LoRA adapters - * @note If you want to compile each model on a dedicated device or with specific properties, you can create + * @note If you want to compile each model on a dedicated device or with specific properties, you can create * models individually and then combine a final pipeline using static methods like 'latent_consistency_model' or * 'stable_diffusion_3'. See 'samples/cpp/image_generation/heterogeneous_stable_diffusion.cpp' for example */ @@ -172,7 +172,7 @@ class OPENVINO_GENAI_EXPORTS Text2ImagePipeline { * @param guidance_scale A guidance scale. Note, that it's important whether guidance_scale > 1, which affects whether negative prompts * are used or not. For example, all values > 1 are the same for reshape perspective and may vary in subsequent 'generate()' calls. * @note If pipeline has been already compiled, it cannot be reshaped and an exception is thrown. - * + * * Example how to reshape SD3 or Flux models for specific max sequence length: * @code * ov::genai::Text2ImagePipeline pipe("/path"); diff --git a/src/cpp/include/openvino/genai/scheduler_config.hpp b/src/cpp/include/openvino/genai/scheduler_config.hpp index a551b866c6..a1ee6f0609 100644 --- a/src/cpp/include/openvino/genai/scheduler_config.hpp +++ b/src/cpp/include/openvino/genai/scheduler_config.hpp @@ -46,8 +46,8 @@ struct SchedulerConfig { // Enable caching of KV-blocks. // When turned on all previously calculated KV-caches are kept in memory for future usages. // KV-caches can be rewritten if KV-cache limit is reached, but blocks are not released. - // This results in more RAM usage, maximum RAM usage is determined by cache_size or num_kv_blocks parameters. - // When turend off only KV-cache required for batch calculation is kept in memory and + // This results in more RAM usage, maximum RAM usage is determined by cache_size or num_kv_blocks parameters. + // When turend off only KV-cache required for batch calculation is kept in memory and // when a sequence has finished genegartion its cache is released. bool enable_prefix_caching = false; diff --git a/src/cpp/include/openvino/genai/visual_language/perf_metrics.hpp b/src/cpp/include/openvino/genai/visual_language/perf_metrics.hpp index 86750a13f6..d14c42f970 100644 --- a/src/cpp/include/openvino/genai/visual_language/perf_metrics.hpp +++ b/src/cpp/include/openvino/genai/visual_language/perf_metrics.hpp @@ -31,4 +31,4 @@ struct OPENVINO_GENAI_EXPORTS VLMPerfMetrics : public PerfMetrics { VLMRawPerfMetrics vlm_raw_metrics; }; -} \ No newline at end of file +} diff --git a/src/cpp/include/openvino/genai/visual_language/pipeline.hpp b/src/cpp/include/openvino/genai/visual_language/pipeline.hpp index 8c3d380b0f..f52f4cc8a0 100644 --- a/src/cpp/include/openvino/genai/visual_language/pipeline.hpp +++ b/src/cpp/include/openvino/genai/visual_language/pipeline.hpp @@ -19,7 +19,7 @@ class OPENVINO_GENAI_EXPORTS VLMDecodedResults : public DecodedResults{ VLMPerfMetrics perf_metrics; }; -/// @brief A map of models for VLMPipeline constructor. +/// @brief A map of models for VLMPipeline constructor. /// Key is model name (e.g. 
"vision_embeddings", "text_embeddings", "language", "resampler") /// and value is a pair of model IR as string and weights as tensor. using ModelsMap = std::map>; diff --git a/src/cpp/src/cache_manager.hpp b/src/cpp/src/cache_manager.hpp index 5a0ff9b9f3..3d5ce91ad8 100644 --- a/src/cpp/src/cache_manager.hpp +++ b/src/cpp/src/cache_manager.hpp @@ -13,34 +13,34 @@ #include "openvino/core/shape.hpp" -class TensorMmapAllocator { +class TensorMmapAllocator { size_t m_total_size; void* m_data; - -public: - TensorMmapAllocator(size_t total_size) : - m_total_size(total_size) { } - - void* allocate(size_t bytes, size_t) { - if (m_total_size == bytes) { + +public: + TensorMmapAllocator(size_t total_size) : + m_total_size(total_size) { } + + void* allocate(size_t bytes, size_t) { + if (m_total_size == bytes) { m_data = mmap(NULL, bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); OPENVINO_ASSERT(m_data != MAP_FAILED); - return m_data; - } - throw std::runtime_error{"Unexpected number of bytes was requested to allocate."}; - } - - void deallocate(void*, size_t bytes, size_t) { - if (m_total_size != bytes) { - throw std::runtime_error{"Unexpected number of bytes was requested to deallocate."}; + return m_data; + } + throw std::runtime_error{"Unexpected number of bytes was requested to allocate."}; + } + + void deallocate(void*, size_t bytes, size_t) { + if (m_total_size != bytes) { + throw std::runtime_error{"Unexpected number of bytes was requested to deallocate."}; } munmap(m_data, bytes); - } - - bool is_equal(const TensorMmapAllocator& other) const noexcept { - return this == &other; - } -}; + } + + bool is_equal(const TensorMmapAllocator& other) const noexcept { + return this == &other; + } +}; #endif @@ -115,7 +115,7 @@ class CacheManager { value_roi_size_byte = m_value_cache[decoder_layer_id].get_byte_size(); key_cache_roi_end = static_cast(key_cache.data()) + key_roi_size_byte; value_cache_roi_end = static_cast(value_cache.data()) + value_roi_size_byte; - + // copy current cache data ov::Tensor dst_key_roi(key_cache, start_key, end_key); ov::Tensor dst_value_roi(value_cache, start_value, end_value); @@ -126,8 +126,8 @@ class CacheManager { } #ifdef _WIN32 - // Some optimizations like AVX2, AVX512, AMX require a minimal shape and - // perform multiplying by zero on the excess data. Uninitialized tensor data contain NAN's, + // Some optimizations like AVX2, AVX512, AMX require a minimal shape and + // perform multiplying by zero on the excess data. Uninitialized tensor data contain NAN's, // so NAN * 0 returns non-zero invalid data. // So we need to set zeros to all newly allocated tensors data. 
std::memset(key_cache_roi_end, 0, key_cache.get_byte_size() - key_roi_size_byte); @@ -154,7 +154,7 @@ class CacheManager { key_cache_shape); ov::Tensor value_cache = remote_context.create_tensor(m_device_config.get_cache_precision(), value_cache_shape); - + if (m_key_cache.size() > decoder_layer_id) { ov::Coordinate end_key = m_key_cache[decoder_layer_id].get_shape(); ov::Coordinate end_value = m_value_cache[decoder_layer_id].get_shape(); @@ -199,7 +199,7 @@ class CacheManager { ov::Coordinate key_src_end_roi = key_shape; ov::Coordinate key_dst_start_roi(key_shape.size(), 0); ov::Coordinate key_dst_end_roi = key_shape; - + ov::Coordinate value_src_start_roi(value_shape.size(), 0); ov::Coordinate value_src_end_roi = value_shape; ov::Coordinate value_dst_start_roi(value_shape.size(), 0); diff --git a/src/cpp/src/continuous_batching_adapter.hpp b/src/cpp/src/continuous_batching_adapter.hpp index 00928b342d..d1e6bdbdec 100644 --- a/src/cpp/src/continuous_batching_adapter.hpp +++ b/src/cpp/src/continuous_batching_adapter.hpp @@ -50,7 +50,7 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase { const ov::AnyMap& plugin_config, const ov::genai::GenerationConfig& generation_config ): LLMPipelineImplBase{tokenizer, GenerationConfig()}, m_impl{ - model_str, + model_str, weights_tensor, tokenizer, scheduler_config, @@ -79,7 +79,7 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase { ) override { // Get the currrent timestamp in order to evaluate total generate duration. auto start_time = std::chrono::steady_clock::now(); - + std::vector prompts = std::visit(overloaded{ [](const std::string& prompt) { return std::vector{prompt}; @@ -181,11 +181,11 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase { const GenerationConfig& config = generation_config.has_value() ? *generation_config : m_generation_config; // -1 == config.eos_token_id and config.validate() are handled in m_impl. - std::vector generated = m_impl.generate(input_ids, - std::vector{input_ids.size(), config}, + std::vector generated = m_impl.generate(input_ids, + std::vector{input_ids.size(), config}, streamer ); - + std::vector> plain_tokens; std::vector plain_scores; for (EncodedGenerationResult& res : generated) { @@ -193,7 +193,7 @@ class ContinuousBatchingAdapter final : public LLMPipelineImplBase { std::move(res.m_generation_ids.begin(), res.m_generation_ids.end(), std::back_inserter(plain_tokens)); std::move(res.m_scores.begin(), res.m_scores.end(), std::back_inserter(plain_scores)); } - + PerfMetrics perf_metrics; // For EncodedGenerationResults, all perf_metrics are the same. 
if (generated.size() > 0) { diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp index 99df043090..1d35a5ebff 100644 --- a/src/cpp/src/continuous_batching_impl.cpp +++ b/src/cpp/src/continuous_batching_impl.cpp @@ -318,7 +318,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::generate(const std::vector m_kv_heads_config; size_t m_num_decoder_layers = 0; size_t m_num_kv_blocks = 0, m_cache_size = 0; // KV cache sizes in either blocks or GBs - size_t m_block_size = 0; // block size is per inference device + size_t m_block_size = 0; // block size is per inference device std::string m_device; size_t get_block_size_by_device(const std::string& device) const { diff --git a/src/cpp/src/generation_config.cpp b/src/cpp/src/generation_config.cpp index de23852c9b..cc7a1218b0 100644 --- a/src/cpp/src/generation_config.cpp +++ b/src/cpp/src/generation_config.cpp @@ -213,7 +213,7 @@ void GenerationConfig::validate() const { // Sampling strategies - OPENVINO_ASSERT(num_return_sequences == 1 || (is_multinomial() || is_beam_search()), + OPENVINO_ASSERT(num_return_sequences == 1 || (is_multinomial() || is_beam_search()), "'num_return_sequences' can be more than 1 only in case of beam search or multinomial sampling, but got ", num_return_sequences); // generic penalties, but not supported by beam search currently diff --git a/src/cpp/src/icontinuous_batching.hpp b/src/cpp/src/icontinuous_batching.hpp index 11c9b67e69..e3109af51a 100644 --- a/src/cpp/src/icontinuous_batching.hpp +++ b/src/cpp/src/icontinuous_batching.hpp @@ -65,7 +65,7 @@ class ContinuousBatchingPipeline::IContinuousBatchingPipeline { virtual GenerationHandle add_request(uint64_t request_id, const std::string& prompt, GenerationConfig sampling_params) = 0; - + /** * Checks whether server (pipeline) has non-finished requests and step() should be called within a loop */ @@ -94,7 +94,7 @@ class ContinuousBatchingPipeline::IContinuousBatchingPipeline { /** * Starts chat with a given system prompt - * + * * In chat scenario prompts passed to `generate` method are accumulated inside the pipeline until `finish_chat` is called */ void start_chat(const std::string& system_message); @@ -104,4 +104,4 @@ class ContinuousBatchingPipeline::IContinuousBatchingPipeline { */ void finish_chat(); }; -} \ No newline at end of file +} diff --git a/src/cpp/src/image_generation/flux_pipeline.hpp b/src/cpp/src/image_generation/flux_pipeline.hpp index 83acdeb2fb..9203373d89 100644 --- a/src/cpp/src/image_generation/flux_pipeline.hpp +++ b/src/cpp/src/image_generation/flux_pipeline.hpp @@ -406,7 +406,7 @@ class FluxPipeline : public DiffusionPipeline { } latents = unpack_latents(latents, m_custom_generation_config.height, m_custom_generation_config.width, vae_scale_factor); - + return m_vae->decode(latents); } diff --git a/src/cpp/src/image_generation/inpainting_pipeline.cpp b/src/cpp/src/image_generation/inpainting_pipeline.cpp index 25ac267b69..527417c8da 100644 --- a/src/cpp/src/image_generation/inpainting_pipeline.cpp +++ b/src/cpp/src/image_generation/inpainting_pipeline.cpp @@ -19,7 +19,7 @@ namespace genai { InpaintingPipeline::InpaintingPipeline(const std::filesystem::path& root_dir) { const std::string class_name = get_class_name(root_dir); - if (class_name == "StableDiffusionPipeline" || + if (class_name == "StableDiffusionPipeline" || class_name == "LatentConsistencyModelPipeline" || class_name == "StableDiffusionInpaintPipeline") { m_impl = std::make_shared(PipelineType::INPAINTING, root_dir); diff --git 
a/src/cpp/src/image_generation/models/unet_inference.hpp b/src/cpp/src/image_generation/models/unet_inference.hpp index 639338901b..e7c413da32 100644 --- a/src/cpp/src/image_generation/models/unet_inference.hpp +++ b/src/cpp/src/image_generation/models/unet_inference.hpp @@ -65,4 +65,4 @@ class UNet2DConditionModel::UNetInference { }; } // namespace genai -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp b/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp index 6bc86a5f06..a3f3dd0e43 100644 --- a/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp +++ b/src/cpp/src/image_generation/models/unet_inference_dynamic.hpp @@ -44,4 +44,4 @@ class UNet2DConditionModel::UNetInferenceDynamic : public UNet2DConditionModel:: }; } // namespace genai -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/cpp/src/image_generation/models/unet_inference_static_bs1.hpp b/src/cpp/src/image_generation/models/unet_inference_static_bs1.hpp index fd5d53e1d1..1fc55caadf 100644 --- a/src/cpp/src/image_generation/models/unet_inference_static_bs1.hpp +++ b/src/cpp/src/image_generation/models/unet_inference_static_bs1.hpp @@ -17,7 +17,7 @@ class UNet2DConditionModel::UNetInferenceStaticBS1 : public UNet2DConditionModel const std::string& device, const ov::AnyMap& properties) override { - // All shapes for input/output tensors should be static. + // All shapes for input/output tensors should be static. // Double check this and throw runtime error if it's not the case. for (auto& input : model->inputs()) { OPENVINO_ASSERT(!input.get_partial_shape().is_dynamic(), @@ -142,4 +142,4 @@ class UNet2DConditionModel::UNetInferenceStaticBS1 : public UNet2DConditionModel }; } // namespace genai -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/cpp/src/image_generation/schedulers/ddim.cpp b/src/cpp/src/image_generation/schedulers/ddim.cpp index 2c0199051f..4f49ffa070 100644 --- a/src/cpp/src/image_generation/schedulers/ddim.cpp +++ b/src/cpp/src/image_generation/schedulers/ddim.cpp @@ -36,7 +36,7 @@ DDIMScheduler::Config::Config(const std::filesystem::path& scheduler_config_path read_json_param(data, "rescale_betas_zero_snr", rescale_betas_zero_snr); } -DDIMScheduler::DDIMScheduler(const std::filesystem::path& scheduler_config_path) +DDIMScheduler::DDIMScheduler(const std::filesystem::path& scheduler_config_path) : DDIMScheduler(Config(scheduler_config_path)) { } diff --git a/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.cpp b/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.cpp index 5f711f29ac..5854855745 100644 --- a/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.cpp +++ b/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.cpp @@ -18,7 +18,7 @@ EulerAncestralDiscreteScheduler::Config::Config(const std::filesystem::path& sch nlohmann::json data = nlohmann::json::parse(file); using utils::read_json_param; - + read_json_param(data, "num_train_timesteps", num_train_timesteps); read_json_param(data, "beta_start", beta_start); read_json_param(data, "beta_end", beta_end); @@ -30,7 +30,7 @@ EulerAncestralDiscreteScheduler::Config::Config(const std::filesystem::path& sch read_json_param(data, "rescale_betas_zero_snr", rescale_betas_zero_snr); } -EulerAncestralDiscreteScheduler::EulerAncestralDiscreteScheduler(const std::filesystem::path& scheduler_config_path) 
+EulerAncestralDiscreteScheduler::EulerAncestralDiscreteScheduler(const std::filesystem::path& scheduler_config_path) : EulerAncestralDiscreteScheduler(Config(scheduler_config_path)) { } diff --git a/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.cpp b/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.cpp index 17e50ddc04..5ef2e722e8 100644 --- a/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.cpp +++ b/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.cpp @@ -162,7 +162,7 @@ size_t FlowMatchEulerDiscreteScheduler::_index_for_timestep(float timestep) { void FlowMatchEulerDiscreteScheduler::scale_noise(ov::Tensor sample, float timestep, ov::Tensor noise) { OPENVINO_ASSERT(timestep == -1, "Timestep is not computed yet"); - + size_t index_for_timestep; if (m_begin_index == -1) { index_for_timestep = _index_for_timestep(timestep); diff --git a/src/cpp/src/image_generation/schedulers/lms_discrete.cpp b/src/cpp/src/image_generation/schedulers/lms_discrete.cpp index be7d7a96df..6ebb47ca84 100644 --- a/src/cpp/src/image_generation/schedulers/lms_discrete.cpp +++ b/src/cpp/src/image_generation/schedulers/lms_discrete.cpp @@ -108,7 +108,7 @@ LMSDiscreteScheduler::Config::Config(const std::filesystem::path& scheduler_conf read_json_param(data, "steps_offset", steps_offset); } -LMSDiscreteScheduler::LMSDiscreteScheduler(const std::filesystem::path& scheduler_config_path) +LMSDiscreteScheduler::LMSDiscreteScheduler(const std::filesystem::path& scheduler_config_path) : LMSDiscreteScheduler(Config(scheduler_config_path)) { } @@ -146,7 +146,7 @@ float LMSDiscreteScheduler::get_init_noise_sigma() const { m_config.timestep_spacing == TimestepSpacing::TRAILING) { return max_sigma; } - + return std::sqrt(max_sigma * max_sigma + 1); } @@ -175,7 +175,7 @@ void LMSDiscreteScheduler::set_timesteps(size_t num_inference_steps, float stren } m_sigmas.push_back(0.f); - + // initialize timesteps for (size_t i = 0; i < num_inference_steps; ++i) { int64_t timestep = _sigma_to_t(m_sigmas[i]); @@ -206,7 +206,7 @@ std::map LMSDiscreteScheduler::step(ov::Tensor noise_pr break; case PredictionType::V_PREDICTION: // pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1)) - pred_latent = noise_pred.data()[j] * (-sigma / std::sqrt(sigma * sigma + 1.0f) + + pred_latent = noise_pred.data()[j] * (-sigma / std::sqrt(sigma * sigma + 1.0f) + latents.data()[j] / (sigma * sigma + 1.0f)); break; default: diff --git a/src/cpp/src/image_generation/schedulers/pndm.cpp b/src/cpp/src/image_generation/schedulers/pndm.cpp index 860b65be6f..b69fc6f816 100644 --- a/src/cpp/src/image_generation/schedulers/pndm.cpp +++ b/src/cpp/src/image_generation/schedulers/pndm.cpp @@ -31,7 +31,7 @@ PNDMScheduler::Config::Config(const std::filesystem::path& scheduler_config_path read_json_param(data, "timestep_spacing", timestep_spacing); } -PNDMScheduler::PNDMScheduler(const std::filesystem::path& scheduler_config_path) +PNDMScheduler::PNDMScheduler(const std::filesystem::path& scheduler_config_path) : PNDMScheduler(Config(scheduler_config_path)) { } diff --git a/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp index 017a52a2ff..23277acb9b 100644 --- a/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp +++ b/src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp @@ -355,7 +355,7 @@ class StableDiffusion3Pipeline : public DiffusionPipeline { ov::Tensor 
negative_pooled_prompt_2_embed_out = get_tensor_batch(text_encoder_2_output, 0); ov::Tensor negative_prompt_2_embed_out = get_tensor_batch(text_encoder_2_hidden_state, 0); ov::Tensor negative_t5_prompt_embed_out = get_tensor_batch(text_encoder_3_output, 0); - + ov::Tensor negative_pooled_prompt_embed, negative_prompt_embed, negative_pooled_prompt_2_embed, negative_prompt_2_embed, negative_t5_prompt_embed; if (generation_config.num_images_per_prompt == 1) { diff --git a/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp b/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp index 2c05bdb585..72e69edea7 100644 --- a/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp +++ b/src/cpp/src/image_generation/stable_diffusion_pipeline.hpp @@ -371,7 +371,7 @@ class StableDiffusionPipeline : public DiffusionPipeline { ov::Shape noise_pred_shape = noise_pred_tensor.get_shape(); noise_pred_shape[0] /= batch_size_multiplier; - + if (batch_size_multiplier > 1) { noisy_residual_tensor.set_shape(noise_pred_shape); diff --git a/src/cpp/src/image_generation/text2image_pipeline.cpp b/src/cpp/src/image_generation/text2image_pipeline.cpp index d4a5b0a77b..c504eeee55 100644 --- a/src/cpp/src/image_generation/text2image_pipeline.cpp +++ b/src/cpp/src/image_generation/text2image_pipeline.cpp @@ -20,7 +20,7 @@ namespace genai { Text2ImagePipeline::Text2ImagePipeline(const std::filesystem::path& root_dir) { const std::string class_name = get_class_name(root_dir); - if (class_name == "StableDiffusionPipeline" || + if (class_name == "StableDiffusionPipeline" || class_name == "LatentConsistencyModelPipeline") { m_impl = std::make_shared(PipelineType::TEXT_2_IMAGE, root_dir); } else if (class_name == "StableDiffusionXLPipeline") { diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp index 6ebef7bfba..85f6d1ba8f 100644 --- a/src/cpp/src/llm_pipeline.cpp +++ b/src/cpp/src/llm_pipeline.cpp @@ -18,7 +18,7 @@ namespace genai { namespace { -/* +/* * NPU reads some properties from the config file, but when LLMPipeline is initialized * from the model_str and weights_tensor, there are no files. * In the later case ModelDesc is stored in properties. 
@@ -37,7 +37,7 @@ std::pair split_model_descr( pop_property(main_properties, "name_or_path", model_descr.name_or_path); pop_property(main_properties, "type", model_descr.type); pop_property(main_properties, "num_key_value_heads", model_descr.num_key_value_heads); - + return {main_properties, model_descr}; } @@ -62,7 +62,7 @@ std::pair draft_model( const std::string& device, const ov::AnyMap& properties) { auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties); - + std::filesystem::path openvino_model_name = "openvino_model.xml"; auto model = utils::singleton_core().read_model(models_path / openvino_model_name, {}, plugin_config); auto generation_config = utils::from_config_json_if_exists(models_path); @@ -99,8 +99,8 @@ ov::genai::LLMPipeline::LLMPipeline( const std::string& device, const ov::AnyMap& properties) { auto start_time = std::chrono::steady_clock::now(); - if (properties.find(ov::genai::scheduler_config.name()) != properties.end() || - properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() || + if (properties.find(ov::genai::scheduler_config.name()) != properties.end() || + properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() || properties.find(ov::genai::prompt_lookup.name()) != properties.end()) { auto [plugin_config, scheduler_config] = utils::split_scheduler_config(properties); m_pimpl = std::make_unique(models_path, tokenizer, scheduler_config, device, plugin_config); @@ -118,8 +118,8 @@ ov::genai::LLMPipeline::LLMPipeline( const ov::AnyMap& properties) { auto start_time = std::chrono::steady_clock::now(); - if (properties.find(ov::genai::scheduler_config.name()) != properties.end() || - properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() || + if (properties.find(ov::genai::scheduler_config.name()) != properties.end() || + properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() || properties.find(ov::genai::prompt_lookup.name()) != properties.end()) { auto [device_properties, scheduler_config] = utils::split_scheduler_config(properties); m_pimpl = std::make_unique(models_path, scheduler_config, device, device_properties); @@ -141,8 +141,8 @@ ov::genai::LLMPipeline::LLMPipeline( const ov::genai::GenerationConfig& generation_config) { auto start_time = std::chrono::steady_clock::now(); - if (properties.find(ov::genai::scheduler_config.name()) != properties.end() || - properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() || + if (properties.find(ov::genai::scheduler_config.name()) != properties.end() || + properties.find(utils::DRAFT_MODEL_ARG_NAME) != properties.end() || properties.find(ov::genai::prompt_lookup.name()) != properties.end()){ auto [device_properties, scheduler_config] = utils::split_scheduler_config(properties); @@ -150,20 +150,20 @@ ov::genai::LLMPipeline::LLMPipeline( tokenizer, scheduler_config, device, device_properties, generation_config); } else if (device == "NPU") { // TODO: CVS-158771 Currently, it's a workaround. Probably there is a better solution. - // NPU reads some properties from the config file, but when LLMPipeline is initialized - // from the model_str and weights_tensor, there is no files. + // NPU reads some properties from the config file, but when LLMPipeline is initialized + // from the model_str and weights_tensor, there is no files. // Therefore, we need to pass these properties manually. // This is necessary only for NPU, for other plugins can be ommited. 
// Example of usage: - // ov::AnyMap model_descr_properties = {{"name_or_path", "meta-llama/Llama-2-7b-chat-hf"}, - // {"type", "llama"}, + // ov::AnyMap model_descr_properties = {{"name_or_path", "meta-llama/Llama-2-7b-chat-hf"}, + // {"type", "llama"}, // {"num_key_value_heads", 32}}; // ov::genai::LLMPipeline pipe(model_str,..., model_descr_properties); // This will convert from AnyMap to ModelDesc. auto [filtered_properties, model_descr] = split_model_descr(properties); m_pimpl = static_llm::LLMPipelineFactory::create( - utils::singleton_core().read_model(model_str, weights_tensor), + utils::singleton_core().read_model(model_str, weights_tensor), model_descr, tokenizer, device, diff --git a/src/cpp/src/llm_pipeline_stateful.hpp b/src/cpp/src/llm_pipeline_stateful.hpp index 968c550a86..986d5d5b11 100644 --- a/src/cpp/src/llm_pipeline_stateful.hpp +++ b/src/cpp/src/llm_pipeline_stateful.hpp @@ -22,7 +22,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase { // Tail of previous output in chat mode is missing in KV cache, let's keep it std::optional m_last_disappeared_token = std::nullopt; // If sequence contains some symbols, which could be ambiguously encoded by tokenizer, we need to trim kv cache - // If we use beam search sampling with chat mode we need to remove last answer of the model from kv cache and add best answer to history + // If we use beam search sampling with chat mode we need to remove last answer of the model from kv cache and add best answer to history // so, let's keep info about amount of tokens to trim from kv cache and amount of tokens to keep in history ov::genai::utils::HistoryRemoveManager m_kv_history_manager = {0, 0}; size_t m_kv_cache_seq_length_axis = 2; diff --git a/src/cpp/src/logit_processor.hpp b/src/cpp/src/logit_processor.hpp index a9446591cf..fc8ae41872 100644 --- a/src/cpp/src/logit_processor.hpp +++ b/src/cpp/src/logit_processor.hpp @@ -29,7 +29,7 @@ struct Logits { OPENVINO_ASSERT(m_vector.size() == 0, "Logits vector already initialized"); m_vector.reserve(m_size); for (size_t i = 0; i < m_size; i++) - m_vector.emplace_back(m_data[i], i); + m_vector.emplace_back(m_data[i], i); } bool is_vector_initialized() const { @@ -59,8 +59,8 @@ class TopPFilter : public ILogitTransformer { TopPFilter(double top_p) : m_top_p(top_p) {} bool partial_sort_and_resize(Logits& logits) { - // Since most of the time huge part of logits vector contains minimal values - // expensive sorting of entire vector might be unnecessary, especially for low values of top_p. + // Since most of the time huge part of logits vector contains minimal values + // expensive sorting of entire vector might be unnecessary, especially for low values of top_p. // This method partially sorts vector finding M top elements and stops when top_p condition is met. // It iterates a few times starting with M = 16 and multiplying it by 2 each iteration until M = 1024. // If top_p is found in considered scope it resizes logits vector and returns true. Otherwise it returns false. 
@@ -111,9 +111,9 @@ class TopKFilter : public ILogitTransformer { // If this transform is used along with top_p, it should be applied after it since top_p sorts entire vector and top_k does it only partially void apply(Logits& logits) override { - if (m_top_k >= logits.m_size) + if (m_top_k >= logits.m_size) return; - + // If top_p is also used vector is already initialized and sorted if (!logits.is_vector_initialized()) { // Initialize and partially sort vector @@ -234,7 +234,7 @@ class RepetitionPenaltyTransform : public IPenaltyTransformer { class EOSPenaltyTransform : public ILogitTransformer { public: - EOSPenaltyTransform(const std::set& stop_token_ids, size_t min_generated_tokens) : + EOSPenaltyTransform(const std::set& stop_token_ids, size_t min_generated_tokens) : m_stop_token_ids(stop_token_ids), m_applicable_tensor_len(min_generated_tokens) {} void apply(Logits& logits) override { @@ -243,7 +243,7 @@ class EOSPenaltyTransform : public ILogitTransformer { for (auto stop_token_id: m_stop_token_ids) logits.m_data[stop_token_id] = 0.f; } - + bool is_applicable(size_t generated_tokens_cnt = 0) override { return generated_tokens_cnt < m_applicable_tensor_len; @@ -310,7 +310,7 @@ class PresencePenaltyTransform : public IPenaltyTransformer { class LogitProcessor { protected: std::vector> m_logit_transformers; - + std::shared_ptr> m_unique_generated_token_ids = std::shared_ptr>(new std::map); std::shared_ptr> m_unique_prompt_token_ids = std::shared_ptr>(new std::set); size_t m_generated_tokens = 0; @@ -334,21 +334,21 @@ class LogitProcessor { if (sampling_params.is_multinomial() || sampling_params.is_greedy_decoding()) { if (sampling_params.repetition_penalty != 1.0f) { - std::shared_ptr transformer = + std::shared_ptr transformer = std::shared_ptr(new LogitTransformers::RepetitionPenaltyTransform(sampling_params.repetition_penalty)); transformer->set_unique_prompt_token_ids(m_unique_prompt_token_ids); transformer->set_unique_generated_token_ids(m_unique_generated_token_ids); m_logit_transformers.push_back(transformer); } if (sampling_params.presence_penalty != 0.0f) { - std::shared_ptr transformer = - std::shared_ptr(new LogitTransformers::PresencePenaltyTransform(sampling_params.presence_penalty)); + std::shared_ptr transformer = + std::shared_ptr(new LogitTransformers::PresencePenaltyTransform(sampling_params.presence_penalty)); transformer->set_unique_generated_token_ids(m_unique_generated_token_ids); m_logit_transformers.push_back(transformer); - + } if (sampling_params.frequency_penalty != 0.0f) { - std::shared_ptr transformer = + std::shared_ptr transformer = std::shared_ptr(new LogitTransformers::FrequencyPenaltyTransform(sampling_params.frequency_penalty)); transformer->set_unique_generated_token_ids(m_unique_generated_token_ids); m_logit_transformers.push_back(transformer); diff --git a/src/cpp/src/lora_helper.cpp b/src/cpp/src/lora_helper.cpp index 5d836832dd..1a8c49a281 100644 --- a/src/cpp/src/lora_helper.cpp +++ b/src/cpp/src/lora_helper.cpp @@ -28,4 +28,4 @@ bool update_adapters_from_properties (const AnyMap& properties, std::optional extract_adapters_from_properties (const AnyMap& properties bool update_adapters_from_properties (const AnyMap& properties, std::optional& adapter_config); } -} \ No newline at end of file +} diff --git a/src/cpp/src/lora_names_mapping.cpp b/src/cpp/src/lora_names_mapping.cpp index 03bf2ed93c..9ea26929e0 100644 --- a/src/cpp/src/lora_names_mapping.cpp +++ b/src/cpp/src/lora_names_mapping.cpp @@ -336,4 +336,4 @@ NameMap 
maybe_map_non_diffusers_lora_to_diffusers(const std::set& k } -} \ No newline at end of file +} diff --git a/src/cpp/src/make_tokenizer_stateful.cpp b/src/cpp/src/make_tokenizer_stateful.cpp index 547ecdac92..e18977d2e0 100644 --- a/src/cpp/src/make_tokenizer_stateful.cpp +++ b/src/cpp/src/make_tokenizer_stateful.cpp @@ -24,12 +24,12 @@ bool ov::genai::MakeCombineSegmentsSatateful::run_on_model(const std::shared_ptr if (!combine_seg_node || combine_seg_node->input_value(1).get_element_type() != ov::element::i32) { return false; } - + std::shared_ptr input_1_const = std::dynamic_pointer_cast(combine_seg_node->get_input_node_shared_ptr(1)); if (!input_1_const) { return false; } - + op::util::VariableInfo var_info{ov::Shape{}, ov::element::boolean, ADD_SPECIAL_TOKENS_VAR_ID}; auto variable = std::make_shared(var_info); @@ -41,7 +41,7 @@ bool ov::genai::MakeCombineSegmentsSatateful::run_on_model(const std::shared_ptr combine_seg_node->input(1).replace_source_output(select_node->output(0)); auto assign = std::make_shared(read_value, variable); - + model->add_sinks({assign}); model->add_variables({variable}); return true; @@ -58,7 +58,7 @@ bool ov::genai::MakeVocabDecoderSatateful::run_on_model(const std::shared_ptrinput_value(4).get_element_type().is_integral_number()) return false; - + std::shared_ptr skip_tokens_const = std::dynamic_pointer_cast(vocab_decoder_node->get_input_node_shared_ptr(4)); std::shared_ptr skip_tokens_slice = std::dynamic_pointer_cast(vocab_decoder_node->get_input_node_shared_ptr(4)); if (!skip_tokens_const && !skip_tokens_slice) @@ -67,7 +67,7 @@ bool ov::genai::MakeVocabDecoderSatateful::run_on_model(const std::shared_ptr(ov::element::i32, ov::Shape{1}, std::vector{0}); auto int_max_const = std::make_shared(ov::element::i32, ov::Shape{1}, std::vector{std::numeric_limits::max()}); auto one_const = std::make_shared(ov::element::i32, ov::Shape{1}, std::vector{1}); - + // By default, INT_MAX will multiply with 1 and all skip_tokens will be selected. op::util::VariableInfo var_info{ov::Shape{1}, ov::element::i32, SKIP_SPECIAL_TOKENS_VAR_ID}; auto variable = std::make_shared(var_info); @@ -82,7 +82,7 @@ bool ov::genai::MakeVocabDecoderSatateful::run_on_model(const std::shared_ptr slice_node = std::make_shared(skip_tokens_const, start_const, stop, one_const); vocab_decoder_node->input(4).replace_source_output(slice_node->output(0)); } - + auto assign = std::make_shared(read_value, variable); model->add_sinks({assign}); model->add_variables({variable}); diff --git a/src/cpp/src/make_tokenizer_stateful.hpp b/src/cpp/src/make_tokenizer_stateful.hpp index 9ad06ae07a..7e9eeb271c 100644 --- a/src/cpp/src/make_tokenizer_stateful.hpp +++ b/src/cpp/src/make_tokenizer_stateful.hpp @@ -7,10 +7,10 @@ namespace ov { namespace genai { -/** +/** * @brief This pass modifies tokenizer ov::Model so that special tokens adding will be * enabled or disabled depending on stateful value. - * + * * +--------------+ * | DefaultMode | * +--------------+ @@ -38,10 +38,10 @@ class MakeCombineSegmentsSatateful : public ov::pass::ModelPass { bool run_on_model(const std::shared_ptr& model) override; }; -/** +/** * @brief This pass modifies tokenizer ov::Model so that special tokens adding will be * enabled or disabled depending on stateful value. 
- * + * * +--------------+ * | DefaultMode | * +--------------+ diff --git a/src/cpp/src/model_runner.hpp b/src/cpp/src/model_runner.hpp index 1a161b03c8..8d3d8a7f72 100644 --- a/src/cpp/src/model_runner.hpp +++ b/src/cpp/src/model_runner.hpp @@ -136,7 +136,7 @@ class ModelRunner { int64_t * input_ids_data = input_ids.data(), * position_ids_data = position_ids.data(); - int32_t + int32_t * past_lens_data = past_lens.data(), * subsequence_begins_data = subsequence_begins.data(), * block_indices_begins_data = block_indices_begins.data(); diff --git a/src/cpp/src/perf_metrics.cpp b/src/cpp/src/perf_metrics.cpp index a84b83dd2f..f7276b171d 100644 --- a/src/cpp/src/perf_metrics.cpp +++ b/src/cpp/src/perf_metrics.cpp @@ -15,12 +15,12 @@ ov::genai::MeanStdPair calc_mean_and_std(const std::vector float { return acc + duration.count() / 1000.0f; }); mean /= durations.size(); - + float sum_square_durations = std::accumulate(durations.begin(), durations.end(), 0.0f, [](const float& acc, const ov::genai::MicroSeconds& duration) -> float { auto d = duration.count() / 1000.0f; @@ -105,8 +105,8 @@ void PerfMetrics::evaluate_statistics(std::optional start_time) { num_generated_tokens = batch_sizes[0]; // The very first infer request (prefill stage) is slower than subsequent ones since we process a sequence of tokens. - // To have a clearer TPOT number, the time taken to generate the very first token at the prefill stage - // must not be included in the TPOT calculation. The first duration used for TPOT is from the first token + // To have a clearer TPOT number, the time taken to generate the very first token at the prefill stage + // must not be included in the TPOT calculation. The first duration used for TPOT is from the first token // to the second token, not from the start time to the first token. for (size_t i = 1; i < tok_times.size(); ++i) { // If in 10 ms a batch of 5 new tokens is generated then TPOT is 10 / 5 = 2 tok/ms. @@ -132,7 +132,7 @@ void PerfMetrics::evaluate_statistics(std::optional start_time) { PerfMetrics PerfMetrics::operator+(const PerfMetrics& right) const { OPENVINO_ASSERT(right.load_time == load_time, "generation metrics can be accumulated only for the same pipeline"); - + // Copy left value to res. 
PerfMetrics res = *this; @@ -143,7 +143,7 @@ PerfMetrics PerfMetrics::operator+(const PerfMetrics& right) const { auto& right_durations = right.raw_metrics.m_durations; auto& right_batch_sizes = right.raw_metrics.m_batch_sizes; auto& right_times_to_first_token = right.raw_metrics.m_times_to_first_token; - + new_durations.insert(new_durations.end(), right_durations.begin(), right_durations.end()); new_times_to_first_token.insert(new_times_to_first_token.end(), right_times_to_first_token.begin(), right_times_to_first_token.end()); new_batch_sizes.insert(new_batch_sizes.end(), right_batch_sizes.begin(), right_batch_sizes.end()); @@ -155,7 +155,7 @@ PerfMetrics PerfMetrics::operator+(const PerfMetrics& right) const { auto& right_tok_durations = right.raw_metrics.tokenization_durations; auto& right_detok_durations = right.raw_metrics.detokenization_durations; auto& right_gen_durations = right.raw_metrics.generate_durations; - + new_tok_durations.insert(new_tok_durations.end(), right_tok_durations.begin(), right_tok_durations.end()); new_detok_durations.insert(new_detok_durations.end(), right_detok_durations.begin(), right_detok_durations.end()); new_gen_durations.insert(new_gen_durations.end(), right_gen_durations.begin(), right_gen_durations.end()); diff --git a/src/cpp/src/prompt_lookup/continuous_batching_for_prompt_lookup.cpp b/src/cpp/src/prompt_lookup/continuous_batching_for_prompt_lookup.cpp index aa4ea8a53a..54751ec90d 100644 --- a/src/cpp/src/prompt_lookup/continuous_batching_for_prompt_lookup.cpp +++ b/src/cpp/src/prompt_lookup/continuous_batching_for_prompt_lookup.cpp @@ -95,4 +95,4 @@ size_t ContinuousBatchingPipeline::ContinuousBatchingForPromptLookupImpl::get_pr return m_batch_size; } -} \ No newline at end of file +} diff --git a/src/cpp/src/prompt_lookup/continuous_batching_for_prompt_lookup.hpp b/src/cpp/src/prompt_lookup/continuous_batching_for_prompt_lookup.hpp index 98b2d71586..200fb5302f 100644 --- a/src/cpp/src/prompt_lookup/continuous_batching_for_prompt_lookup.hpp +++ b/src/cpp/src/prompt_lookup/continuous_batching_for_prompt_lookup.hpp @@ -27,7 +27,7 @@ class ContinuousBatchingPipeline::ContinuousBatchingForPromptLookupImpl : public properties, generation_config, true } {}; - + void generate_candidates(); // { generated_len, validation_len } @@ -43,4 +43,4 @@ class ContinuousBatchingPipeline::ContinuousBatchingForPromptLookupImpl : public protected: TokenIds generate_candidates(const TokenIds& input_ids, size_t num_pred_tokens, size_t max_ngram_size); }; -} \ No newline at end of file +} diff --git a/src/cpp/src/prompt_lookup/prompt_lookup_impl.cpp b/src/cpp/src/prompt_lookup/prompt_lookup_impl.cpp index 539680c819..9325efc2ff 100644 --- a/src/cpp/src/prompt_lookup/prompt_lookup_impl.cpp +++ b/src/cpp/src/prompt_lookup/prompt_lookup_impl.cpp @@ -65,16 +65,16 @@ void ContinuousBatchingPipeline::PromptLookupImpl::step() { num_matches = (present_req_len - prev_full_req_len - 1); acceptance_rate = static_cast(num_matches) / static_cast(prev_validation_len); - } + } m_sd_metrics.update_acceptance_rate(request_id, acceptance_rate * 100); m_sd_metrics.update_draft_accepted_tokens(request_id, num_matches); } // update perf metrics const auto num_generated_tokens = m_pipeline->get_processed_tokens_per_iteration(); - if (num_generated_tokens > 0) { + if (num_generated_tokens > 0) { raw_perf_counters.m_batch_sizes.emplace_back(num_generated_tokens); - + auto infer_duration = step_timer.get_duration_microsec(); raw_perf_counters.m_token_infer_durations.emplace_back(infer_duration); 
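The prompt-lookup hunks above reference generate_candidates(input_ids, num_pred_tokens, max_ngram_size) and track an acceptance rate for matched draft tokens. A rough standalone sketch of the n-gram lookup idea, not the library implementation; the real pipeline additionally caps proposals by num_assistant_tokens and validates candidates with the main model:

```python
def generate_candidates(input_ids, num_pred_tokens, max_ngram_size):
    """Return up to num_pred_tokens candidate tokens by matching the trailing
    n-gram of input_ids against earlier occurrences in the same sequence."""
    for ngram_size in range(min(max_ngram_size, len(input_ids) - 1), 0, -1):
        ngram = input_ids[-ngram_size:]
        # Scan earlier positions for the same n-gram and reuse whatever followed it.
        for start in range(len(input_ids) - ngram_size - 1, -1, -1):
            if input_ids[start:start + ngram_size] == ngram:
                follow = input_ids[start + ngram_size:start + ngram_size + num_pred_tokens]
                if follow:
                    return follow
    return []

# Example: the tail [5, 6] occurred earlier, so [7, 8] is proposed as the draft.
print(generate_candidates([1, 5, 6, 7, 8, 2, 5, 6], num_pred_tokens=2, max_ngram_size=3))
```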
@@ -125,8 +125,8 @@ ContinuousBatchingPipeline::PromptLookupImpl::generate(const std::vector generations; for (size_t request_id = 0; request_id < input_ids.size(); ++request_id) { - OPENVINO_ASSERT(1 == input_ids[request_id].get_shape().at(0), "Use multiple tensors to pass a batch."); - OPENVINO_ASSERT(sampling_params[request_id].is_prompt_lookup(), "`max_ngram_size` && `num_assistant_tokens` should be specified for `prompt lookup decoding`"); + OPENVINO_ASSERT(1 == input_ids[request_id].get_shape().at(0), "Use multiple tensors to pass a batch."); + OPENVINO_ASSERT(sampling_params[request_id].is_prompt_lookup(), "`max_ngram_size` && `num_assistant_tokens` should be specified for `prompt lookup decoding`"); generations.push_back(m_pipeline->add_request(request_id, input_ids[request_id], sampling_params[request_id])); } auto all_requests = m_pipeline->get_awaiting_requests(); diff --git a/src/cpp/src/prompt_lookup/prompt_lookup_impl.hpp b/src/cpp/src/prompt_lookup/prompt_lookup_impl.hpp index 0535931d81..1393cecc13 100644 --- a/src/cpp/src/prompt_lookup/prompt_lookup_impl.hpp +++ b/src/cpp/src/prompt_lookup/prompt_lookup_impl.hpp @@ -49,4 +49,4 @@ class ContinuousBatchingPipeline::PromptLookupImpl : public ContinuousBatchingPi SpeculativeDecodingMetrics get_metrics(); }; -} \ No newline at end of file +} diff --git a/src/cpp/src/safetensors.c b/src/cpp/src/safetensors.c index 61559882c6..d128eb1bee 100644 --- a/src/cpp/src/safetensors.c +++ b/src/cpp/src/safetensors.c @@ -1,2 +1,2 @@ #define SAFETENSORS_IMPLEMENTATION -#include "safetensors.h" \ No newline at end of file +#include "safetensors.h" diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index 7a1e079746..642d2a8a80 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -4,7 +4,7 @@ #include "sampler.hpp" namespace ov::genai { -// Modified Knuth–Morris–Pratt algorithm which returns tokens following after every needle occurrence in haystack +// Modified Knuth-Morris-Pratt algorithm which returns tokens following after every needle occurrence in haystack std::vector kmp_search(const std::vector& haystack, const std::vector& needle) { if (needle.empty()) { // no_repeat_ngram_size == 1, ban every token return {haystack.begin(), haystack.end()}; @@ -159,7 +159,7 @@ int match_stop_string2(Tokenizer & tokenizer, const TokenIds & generated_tokens, std::vector last_generated_tokens(generated_tokens.end()-num_tokens, generated_tokens.end()); if (stop_tokens == last_generated_tokens) return num_tokens; - + // Continue checking chunks of 4 tokens num_tokens += 4; while (num_tokens <= generated_tokens.size()) { @@ -188,7 +188,7 @@ void Sampler::GroupBeamSearcher::finalize(SamplerOutput& sampler_output) { // mark current sequence as finished beam.m_sequence->set_status(SequenceStatus::FINISHED); - // Setting length since this function is used when sequence generated tokens number reaches max_new_tokens + // Setting length since this function is used when sequence generated tokens number reaches max_new_tokens beam.m_sequence->set_finish_reason(GenerationFinishReason::LENGTH); // we also need to drop add ongoing / forked sequences from scheduler sampler_output.m_dropped_sequences.push_back(sequence_id); @@ -549,7 +549,7 @@ std::vector Sampler::_try_finish_generation(SequenceGroup::Ptr & sequen std::vector dropped_seq_ids; for (auto& running_sequence : sequence_group->get_running_sequences()) { const auto generated_len = running_sequence->get_generated_len(); - if (sequence_group->get_max_new_tokens() <= generated_len || 
+ if (sequence_group->get_max_new_tokens() <= generated_len || is_stop_token_id_hit(running_sequence->get_generated_ids().back(), sampling_params.stop_token_ids) && !sampling_params.ignore_eos) { // stop sequence by max_new_tokens or stop token (eos included) running_sequence->set_status(SequenceStatus::FINISHED); @@ -680,7 +680,7 @@ bool Sampler::validate_candidate( float p_i = std::exp(*it_log_prob), q_i = std::exp(sampled_token.m_log_prob), probability_ratio = p_i / q_i; - + auto dist = std::uniform_int_distribution<>(0, 100); // equivalent to multinomial with number of trials == 1 float r_i = dist(rng_engine); r_i /= 100; @@ -723,7 +723,7 @@ float get_p_prime(Sequence::Ptr& running_sequence, if (cumulative_prob == 0.f) { return 1.f; } - + float p_n = std::exp(sampled_token.m_log_prob), q_n = std::exp(*it_log_prob), p_prime = std::max(0.f, (p_n - q_n)) / std::log(cumulative_prob); @@ -806,7 +806,7 @@ SamplerOutput Sampler::sample(const std::vector & sequence_g stop_sample_tokens(running_sequence, token_offset, max_num_sampled_token, max_removed_tokens_per_request); break; } - + // do sampling only for token validation/generation. // continue in case of extending draft model sequences by main model generated tokens which // should be taken to KV cache without validation @@ -892,7 +892,7 @@ SamplerOutput Sampler::sample(const std::vector & sequence_g m_beam_search_info.at(request_id).finalize(sampler_output); } } - // Notify handle after sampling is done. + // Notify handle after sampling is done. // For non-streaming this is effective only when the generation is finished. OPENVINO_ASSERT(num_tokens_to_process >= max_removed_tokens_per_request); sequence_group->notify_handle(); @@ -931,7 +931,7 @@ void Sampler::create_logit_processor(uint64_t request_id, const GenerationConfig m_logit_processors.insert({request_id, LogitProcessor(sampling_params, prompt)}); } -void Sampler::clear_request_info(uint64_t request_id) { +void Sampler::clear_request_info(uint64_t request_id) { m_beam_search_info.erase(request_id); m_logit_processors.erase(request_id); m_stop_strings.erase(request_id); diff --git a/src/cpp/src/sequence_group.cpp b/src/cpp/src/sequence_group.cpp index 7b9265db1a..8ca00e23dc 100644 --- a/src/cpp/src/sequence_group.cpp +++ b/src/cpp/src/sequence_group.cpp @@ -20,7 +20,7 @@ size_t Sequence::_make_hash(size_t content_length) { // hash of current block depends on prefix hashes std::vector content; size_t prefix_hashes_needed_count = block_start_idx / block_size; - OPENVINO_ASSERT(prefix_hashes_needed_count <= m_prefix_hashes.size()); + OPENVINO_ASSERT(prefix_hashes_needed_count <= m_prefix_hashes.size()); content.insert(content.end(), m_prefix_hashes.begin(), m_prefix_hashes.begin() + prefix_hashes_needed_count); // get tokens corresponding to current block @@ -38,7 +38,7 @@ size_t Sequence::_make_hash(size_t content_length) { return std::hash{}(std::string_view(data, size)); } -// Each KV block can be uniquely identified by +// Each KV block can be uniquely identified by // the tokens within the block and the tokens in the prefix before the block. 
// hash(prefix tokens + block tokens) <--> KV Block size_t Sequence::get_hash(size_t content_length) { @@ -56,8 +56,8 @@ size_t Sequence::get_hash(size_t content_length) { if (content_len % block_size == 0) { return m_prefix_hashes[content_len / block_size - 1]; } - + return _make_hash(content_len); } } // namespace genai -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/cpp/src/sequence_group.hpp b/src/cpp/src/sequence_group.hpp index 19d29c92ac..c117879412 100644 --- a/src/cpp/src/sequence_group.hpp +++ b/src/cpp/src/sequence_group.hpp @@ -210,7 +210,7 @@ class SequenceGroup : public std::enable_shared_from_this { bool m_has_echoed = false; uint64_t m_next_sequence_id = 0; - + // amount of processed tokens, e.g. prompt can be processed using multiple consequence inferences // so, we need to track which part of the prompt we have already processed size_t m_num_processed_tokens = 0; @@ -485,7 +485,7 @@ class SequenceGroup : public std::enable_shared_from_this { size_t get_num_tokens_to_validate() { return m_num_validation_tokens; } - + void set_stream_window_size(size_t k) { m_stream_window_size = k; } @@ -666,7 +666,7 @@ class SequenceGroup : public std::enable_shared_from_this { } } - + // Special notification path for max_new_tokens == 0 where we don't expect to return any new tokens, but only process prompt void notify_handle_echo_only() { // This method is called after scheduling and before sampling, diff --git a/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.cpp index bec2b75e0d..c0040009aa 100644 --- a/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.cpp +++ b/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.cpp @@ -86,7 +86,7 @@ get_prefix_len( const size_t candidate_sequence_gen_len = candidate_token_ids.size(), running_sequence_gen_len = running_sequence->get_generated_len(); - + // to find the len of prefix size_t sequence_prefix_len = std::min(candidate_sequence_gen_len, running_sequence_gen_len); for (size_t i = 0; i < sequence_prefix_len; ++i) { @@ -106,7 +106,7 @@ size_t remove_tokens_from_sequence(Sequence::Ptr& sequence, size_t min_generated_tokens, LogitProcessor& logit_proccessor) { - const auto generated_token_ids = sequence->get_generated_ids(); + const auto generated_token_ids = sequence->get_generated_ids(); const auto sequence_generated_len = generated_token_ids.size(); OPENVINO_ASSERT(sequence_generated_len >= min_generated_tokens); @@ -192,7 +192,7 @@ init_request( return min_candidate_len; } -UpdateRequestResult +UpdateRequestResult ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl::init_request_by_candidate( uint64_t request_id, const GeneratedSequences& candidates) { @@ -200,7 +200,7 @@ ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl::init_r if (request->get_request_id() != request_id) { continue; } - + UpdateRequestResult result; m_sampler->create_logit_processor(request_id, request->get_sampling_parameters(), request->get_prompt_ids()); auto& logit_processor = m_sampler->get_logit_processor(request_id); @@ -339,4 +339,4 @@ void ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl::m } } } -} \ No newline at end of file +} diff --git a/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.hpp 
b/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.hpp index e4e4be63d8..226d1d9600 100644 --- a/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.hpp +++ b/src/cpp/src/speculative_decoding/continuous_batching_for_speculative_decoding_impl.hpp @@ -40,4 +40,4 @@ class ContinuousBatchingPipeline::ContinuousBatchingForSpeculativeDecodingImpl : void finish_request(SequenceGroup::Ptr request); void _pull_awaiting_requests() override {}; }; -} \ No newline at end of file +} diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp index ddb3d0ae10..2cdfe2c772 100644 --- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp +++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.cpp @@ -17,7 +17,7 @@ bool are_tokenizers_equal(Tokenizer& lhs, Tokenizer& rhs) { std::string test_string = "Could you please tell me something about OpenVINO.GenAI?"; ov::Tensor encoded_string_lhs = lhs.encode(test_string).input_ids, encoded_string_rhs = rhs.encode(test_string).input_ids; - + ov::Shape shape_lhs = encoded_string_lhs.get_shape(), shape_rhs = encoded_string_rhs.get_shape(); @@ -25,7 +25,7 @@ bool are_tokenizers_equal(Tokenizer& lhs, Tokenizer& rhs) { lhs.get_bos_token_id() == rhs.get_bos_token_id() && lhs.get_pad_token_id() == rhs.get_pad_token_id(); } -ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(const ov::genai::ModelDesc& main_model_desc, +ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(const ov::genai::ModelDesc& main_model_desc, const ov::genai::ModelDesc& draft_model_desc) { auto main_model = main_model_desc.model; auto draft_model = draft_model_desc.model; @@ -77,7 +77,7 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::SpeculativeDecodingImpl(con // todo: remove this condition after support of CVS-154103 OPENVINO_ASSERT(are_tokenizers_equal(main_model_tokenizer, draft_model_tokenizer), "Tokenizers for draft and main models are different!"); - + m_tokenizer = main_model_tokenizer; // to create `main_pipeline` with enabled validation_mode and `draft_pipeline` with disabled validation mode @@ -194,7 +194,7 @@ void ContinuousBatchingPipeline::SpeculativeDecodingImpl::step() { const auto num_generated_tokens = m_main_pipeline->get_processed_tokens_per_iteration(); if (num_generated_tokens > 0) { auto infer_duration = step_timer.get_duration_microsec(); - + raw_perf_counters.m_token_infer_durations.emplace_back(infer_duration); raw_perf_counters.m_inference_durations[0] += MicroSeconds(infer_duration); raw_perf_counters.m_new_token_times.emplace_back(main_timer.get_end_time()); @@ -286,7 +286,7 @@ ContinuousBatchingPipeline::SpeculativeDecodingImpl::generate(const std::vector< } } }; - + streamer_ptr->end(); }; diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp b/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp index 4023519287..74d66fba5e 100644 --- a/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp +++ b/src/cpp/src/speculative_decoding/speculative_decoding_impl.hpp @@ -30,7 +30,7 @@ struct ModelDesc { properties(properties), scheduler_config(scheduler_config), generation_config(generation_config) {} - + ModelDesc() = default; }; @@ -48,7 +48,7 @@ class ContinuousBatchingPipeline::SpeculativeDecodingImpl : public ContinuousBat void drop_requests(); bool is_requests_empty(); std::vector get_awaiting_requests(); - + 
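SpeculativeDecodingImpl above wires a main and a draft continuous-batching pipeline together and asserts that their tokenizers match. At the Python API level this is reached by attaching a draft model when constructing the pipeline; a hedged usage sketch with placeholder model paths (the keyword-property form shown here is the commonly documented one, but check the installed package for the exact signature):

```python
# Hedged usage sketch: enabling speculative decoding by attaching a draft model.
# Paths are placeholders; both models must be OpenVINO GenAI exports with
# compatible tokenizers, which the C++ code above asserts explicitly.
import openvino_genai as ov_genai

main_path = "TinyLlama-1.1B-Chat-ov"      # placeholder path
draft_path = "TinyLlama-160M-ov"          # placeholder path

pipe = ov_genai.LLMPipeline(
    main_path,
    "CPU",
    draft_model=ov_genai.draft_model(draft_path, "CPU"),
)

config = ov_genai.GenerationConfig()
config.max_new_tokens = 64
config.num_assistant_tokens = 5           # how many draft tokens to propose per step

print(pipe.generate("What is OpenVINO?", config))
```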
public: SpeculativeDecodingImpl(const ov::genai::ModelDesc& main_model_desc, const ov::genai::ModelDesc& draft_model_desc); @@ -71,4 +71,4 @@ class ContinuousBatchingPipeline::SpeculativeDecodingImpl : public ContinuousBat SpeculativeDecodingMetrics get_speculative_decoding_metrics(); }; -} \ No newline at end of file +} diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_metrics.cpp b/src/cpp/src/speculative_decoding/speculative_decoding_metrics.cpp index 2357b99fd7..6191b39f13 100644 --- a/src/cpp/src/speculative_decoding/speculative_decoding_metrics.cpp +++ b/src/cpp/src/speculative_decoding/speculative_decoding_metrics.cpp @@ -154,4 +154,4 @@ void SpeculativeDecodingMetrics::clean_up() { total_duration = 0; } -} \ No newline at end of file +} diff --git a/src/cpp/src/speculative_decoding/speculative_decoding_metrics.hpp b/src/cpp/src/speculative_decoding/speculative_decoding_metrics.hpp index d4a14a1692..a1ef0857b2 100644 --- a/src/cpp/src/speculative_decoding/speculative_decoding_metrics.hpp +++ b/src/cpp/src/speculative_decoding/speculative_decoding_metrics.hpp @@ -43,4 +43,4 @@ class SpeculativeDecodingMetrics { void clean_up(); }; -} \ No newline at end of file +} diff --git a/src/cpp/src/text_callback_streamer.cpp b/src/cpp/src/text_callback_streamer.cpp index aee909dfb8..2c29c20109 100644 --- a/src/cpp/src/text_callback_streamer.cpp +++ b/src/cpp/src/text_callback_streamer.cpp @@ -16,7 +16,7 @@ bool TextCallbackStreamer::put(int64_t token) { m_tokens_cache.push_back(token); std::string text = m_tokenizer.decode(m_tokens_cache); m_decoded_lengths.push_back(text.length()); - + if (!text.empty() && '\n' == text.back() && text.size() > m_printed_len) { // Flush the cache after the new line symbol res << std::string_view{text.data() + m_printed_len, text.size() - m_printed_len}; @@ -27,7 +27,7 @@ bool TextCallbackStreamer::put(int64_t token) { } constexpr size_t delay_n_tokens = 3; - // In some cases adding the next token can shorten the text, + // In some cases adding the next token can shorten the text, // e.g. when apostrophe removing regex had worked after adding new tokens. // Printing several last tokens is delayed. if (m_decoded_lengths.size() < delay_n_tokens) { @@ -65,4 +65,4 @@ void TextCallbackStreamer::end() { ov::genai::StreamerBase::~StreamerBase() = default; } // namespace genai -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/cpp/src/text_callback_streamer.hpp b/src/cpp/src/text_callback_streamer.hpp index 2c5fab5700..b5cb9caebd 100644 --- a/src/cpp/src/text_callback_streamer.hpp +++ b/src/cpp/src/text_callback_streamer.hpp @@ -26,4 +26,4 @@ class TextCallbackStreamer: public StreamerBase { }; } // namespace genai -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp index a8cf844cb7..5d2732723c 100644 --- a/src/cpp/src/utils.cpp +++ b/src/cpp/src/utils.cpp @@ -170,7 +170,7 @@ ProcessorConfig from_any_map( } /** - * scheduler_config is a separate config for continuous batching pipeline. + * scheduler_config is a separate config for continuous batching pipeline. * This routine splits scheduler_config from plugin_config. 
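The utils.cpp comment above describes pulling the scheduler config out of the plugin properties before the remainder is forwarded to the device. A conceptual Python analogue of that split, with the "scheduler_config" key name assumed purely for illustration:

```python
def split_scheduler_config(properties: dict) -> tuple[dict, dict]:
    """Partition a properties mapping into (scheduler_config, plugin_properties).

    Conceptual analogue of the C++ helper described above; the
    "scheduler_config" key name is an assumption for illustration.
    """
    plugin_properties = dict(properties)
    scheduler_config = plugin_properties.pop("scheduler_config", {})
    return scheduler_config, plugin_properties

sched, plugin = split_scheduler_config(
    {"scheduler_config": {"cache_size": 2}, "PERFORMANCE_HINT": "LATENCY"}
)
print(sched)    # {'cache_size': 2}
print(plugin)   # {'PERFORMANCE_HINT': 'LATENCY'}
```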
*/ std::pair split_scheduler_config(const ov::AnyMap& properties) { diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp index 8c56c39a8c..5819e5d45c 100644 --- a/src/cpp/src/utils.hpp +++ b/src/cpp/src/utils.hpp @@ -16,7 +16,7 @@ namespace utils { // Variable template that checks if a type has begin() and end() member functions template constexpr bool is_container = false; - + template constexpr bool is_container().begin()), diff --git a/src/cpp/src/visual_language/clip.cpp b/src/cpp/src/visual_language/clip.cpp index 9347f63074..40a0f50d93 100644 --- a/src/cpp/src/visual_language/clip.cpp +++ b/src/cpp/src/visual_language/clip.cpp @@ -258,7 +258,7 @@ clip_image_f32 clip_image_preprocess(clip_ctx& ctx, const clip_image_u8& img) { } std::vector get_image_patches( - const clip_image_u8& image, + const clip_image_u8& image, const std::vector>& image_grid_pinpoints, const std::pair& size, int patch_size @@ -274,7 +274,7 @@ std::vector get_image_patches( int base_patch_height = size.second; clip_image_u8 base_patch; bicubic_resize(image, base_patch, base_patch_width, base_patch_height); - + patches.push_back(base_patch); // Select best resolution for patching diff --git a/src/cpp/src/visual_language/clip.hpp b/src/cpp/src/visual_language/clip.hpp index e00ac2fc40..c0b08e4e60 100644 --- a/src/cpp/src/visual_language/clip.hpp +++ b/src/cpp/src/visual_language/clip.hpp @@ -37,7 +37,7 @@ void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_wi clip_image_f32 clip_image_preprocess(struct clip_ctx& ctx, const clip_image_u8& img); std::vector get_image_patches( - const clip_image_u8& image, + const clip_image_u8& image, const std::vector>& image_grid_pinpoints, const std::pair& size, int patch_size diff --git a/src/cpp/src/visual_language/embedding_model.cpp b/src/cpp/src/visual_language/embedding_model.cpp index ebc2f80bbd..d49a7439ad 100644 --- a/src/cpp/src/visual_language/embedding_model.cpp +++ b/src/cpp/src/visual_language/embedding_model.cpp @@ -64,4 +64,4 @@ void EmbeddingsModel::merge_postprocess(std::shared_ptr model, float } } // namespace genai -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/cpp/src/visual_language/embedding_model.hpp b/src/cpp/src/visual_language/embedding_model.hpp index 5e85e03026..5d675405b0 100644 --- a/src/cpp/src/visual_language/embedding_model.hpp +++ b/src/cpp/src/visual_language/embedding_model.hpp @@ -46,4 +46,4 @@ class EmbeddingsModel { }; } // namespace genai -} // namespace ov \ No newline at end of file +} // namespace ov diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index 66b17e5804..8bfd06da20 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -40,7 +40,7 @@ class InputsEmbedder::IInputsEmbedder { // Tail of previous output for LM in chat mode is missing in KV cache. 
std::optional m_last_disappeared_token = std::nullopt; // If sequence contains some symbols, which could be ambiguous encoded by tokenizer, we need to trim kv cache - // If we use beam search sampling with chat mode we need to remove last answer of the model from kv cache and add best answer to history + // If we use beam search sampling with chat mode we need to remove last answer of the model from kv cache and add best answer to history // so, let's keep info about amount of tokens to trim from kv cache and amount of tokens to keep in history ov::genai::utils::HistoryRemoveManager m_kv_history_manager = {0, 0}; @@ -78,7 +78,7 @@ class InputsEmbedder::IInputsEmbedder { } m_last_disappeared_token = last_disappeared_token; - + std::copy(encoded_result.begin(), encoded_result.end(), std::back_inserter(m_tokenized_history)); } @@ -124,7 +124,7 @@ class InputsEmbedder::IInputsEmbedder { m_vision_encoder(model_dir, m_vlm_config.model_type, device, device_config), m_embedding(model_dir, m_vlm_config.scale_emb, device, device_config), m_tokenizer{model_dir, device_config} { } - + IInputsEmbedder( const VLMConfig& vlm_config, const ModelsMap& models_map, @@ -630,7 +630,7 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder { std::string image_token = m_vlm_config.im_start; // Adapted from llava-1.5-7b-hf chat_template.json std::string chat_template_fallback = "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' ' }}{% else %}{{ 'ASSISTANT: ' + message['content'] + ' ' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}"; - + std::vector single_images = to_single_image_tensors(images); std::string formatted_prompt; @@ -750,7 +750,7 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { std::string formatted_prompt; std::vector image_embeds; image_embeds.reserve(single_images.size()); - + ov::Tensor image_newline; for (const auto& image : single_images) { @@ -1071,20 +1071,20 @@ class InputsEmbedderInternVLChat : public InputsEmbedder::IInputsEmbedder { std::string image_start_token = m_vlm_config.image_start_token; std::string image_context_token = m_vlm_config.image_context_token; std::string image_end_token = m_vlm_config.image_end_token; - + std::vector single_images = to_single_image_tensors(images); std::string formatted_prompt; std::vector image_embeds; image_embeds.reserve(single_images.size()); - + for (const auto& image : single_images) { EncodedImage encoded_image = m_vision_encoder.encode(image); ov::Tensor single_image_embeds = encoded_image.resized_source; const size_t num_patches = single_image_embeds.get_shape().at(0); const size_t num_image_tokens = single_image_embeds.get_shape().at(1); - + formatted_prompt += image_start_token; for (int i = 0; i < num_patches * num_image_tokens; ++i) { formatted_prompt += image_context_token; @@ -1155,7 +1155,7 @@ class InputsEmbedderInternVLChat : public InputsEmbedder::IInputsEmbedder { std::copy_n(image_embeds_data + image_context_token_idx * embed_dim, embed_dim, merged_embeds_data + offset); - + ++image_context_token_idx; if (image_context_token_idx == num_all_image_tokens) { @@ -1593,7 +1593,7 @@ class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder { InputsEmbedderQwen2VL( const VLMConfig& vlm_config, const ModelsMap& models_map, - const Tokenizer& tokenizer, + const Tokenizer& tokenizer, const std::filesystem::path& config_dir_path, const std::string& device, const ov::AnyMap device_config) : @@ -1605,7 +1605,7 @@ class 
InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder { device_config ).create_infer_request(); } - + virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) override { std::string formatted_prompt; @@ -1614,7 +1614,7 @@ class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder { std::vector> images_grid_thw; image_embeds.reserve(single_images.size()); images_grid_thw.reserve(single_images.size()); - + for (const auto& image : single_images) { EncodedImage encoded_image = m_vision_encoder.encode(image); ov::Tensor single_image_embeds = encoded_image.resized_source; @@ -1644,7 +1644,7 @@ class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder { if (images.empty()) { return text_embeds; } - + auto start_tokenizer_time = std::chrono::steady_clock::now(); ov::Tensor encoded_vision_start_token = m_tokenizer.encode(m_vlm_config.vision_start_token, ov::genai::add_special_tokens(false)).input_ids; ov::Tensor encoded_image_pad_token = m_tokenizer.encode(m_vlm_config.image_pad_token, ov::genai::add_special_tokens(false)).input_ids; @@ -1722,7 +1722,7 @@ class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder { } } - // Concatenate image embeddings + // Concatenate image embeddings ov::Tensor concatenated_images; if (image_embeds.size() == 1) { concatenated_images = image_embeds.at(0); @@ -1732,10 +1732,10 @@ class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder { total_length += embed.get_shape().at(0); } size_t hidden_dim = image_embeds.at(0).get_shape().at(1); - + concatenated_images = ov::Tensor(image_embeds.at(0).get_element_type(), {total_length, hidden_dim}); float* concat_data = concatenated_images.data(); - + size_t offset = 0; for (const auto& embed : image_embeds) { size_t embed_size = embed.get_shape().at(0) * embed.get_shape().at(1); @@ -1763,7 +1763,7 @@ class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder { const int64_t* input_ids_data = input_ids.data(); float* merged_embeds_data = merged_embeds.data(); const float* vision_embeds_data = processed_vision_embeds.data(); - + size_t vision_embed_idx = 0; for (size_t batch_idx = 0; batch_idx < batch_size; ++batch_idx) { for (size_t seq_idx = 0; seq_idx < seq_length; ++seq_idx) { @@ -1781,7 +1781,7 @@ class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder { return merged_embeds; } - ov::Tensor get_rotary_pos_emb(const std::vector>& grids_thw) { + ov::Tensor get_rotary_pos_emb(const std::vector>& grids_thw) { const size_t spatial_merge_size = m_vision_encoder.m_processor_config.merge_size; std::vector> all_pos_ids; @@ -1795,7 +1795,7 @@ class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder { total_positions += t * h * w; max_grid_size = std::max({max_grid_size, h, w}); - + // Create height position IDs std::vector hpos_ids(h * w); for (size_t hi = 0; hi < h; ++hi) { @@ -1855,7 +1855,7 @@ class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder { // Calculate rotary embeddings for max_grid_size const size_t dim = 1280 / 16 / 2; // config.vision_config.embed_dim / self.config.vision_config.num_heads / 2 const float theta = 10000.0f; - + std::vector inv_freq(dim / 2); for (size_t i = 0; i < dim / 2; ++i) { inv_freq[i] = 1.0f / std::pow(theta, static_cast(i) / static_cast(dim / 2)); @@ -1889,7 +1889,7 @@ class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder { const int64_t vision_start_token_id ) { const size_t spatial_merge_size = 
m_vision_encoder.m_processor_config.merge_size; - + const int64_t* input_ids = input_ids_tensor.data(); size_t batch_size = input_ids_tensor.get_shape().at(0); size_t seq_len = input_ids_tensor.get_shape().at(1); @@ -1903,7 +1903,7 @@ class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder { ov::Tensor position_ids{ov::element::i64, {3, batch_size, seq_len}}; int64_t* pos_data = position_ids.data(); - + size_t st = 0; int64_t next_pos = 0; size_t grid_idx = 0; diff --git a/src/cpp/src/visual_language/inputs_embedder.hpp b/src/cpp/src/visual_language/inputs_embedder.hpp index 4462c58185..86971fa78d 100644 --- a/src/cpp/src/visual_language/inputs_embedder.hpp +++ b/src/cpp/src/visual_language/inputs_embedder.hpp @@ -58,7 +58,7 @@ class InputsEmbedder { // adds currently generated text to chat history void update_chat_history(const std::string& decoded_results); - // finishes chat and clears a chat history + // finishes chat and clears a chat history void finish_chat(); private: class IInputsEmbedder; diff --git a/src/cpp/src/visual_language/perf_metrics.cpp b/src/cpp/src/visual_language/perf_metrics.cpp index a5894078bf..dc281e70d2 100644 --- a/src/cpp/src/visual_language/perf_metrics.cpp +++ b/src/cpp/src/visual_language/perf_metrics.cpp @@ -33,4 +33,4 @@ VLMPerfMetrics VLMPerfMetrics::operator+(const VLMPerfMetrics& right) const { right_prepare_embeddings_durations.end()); return result; } -} \ No newline at end of file +} diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp index 95e3064548..632fffdd79 100644 --- a/src/cpp/src/visual_language/pipeline.cpp +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -21,7 +21,7 @@ using namespace ov::genai; namespace { - + template struct overloaded : Ts... {using Ts::operator()...;}; template overloaded(Ts...) 
-> overloaded; @@ -127,7 +127,7 @@ class ov::genai::VLMPipeline::VLMPipelineImpl { }, m_generation_config{generation_config}, m_is_chat_conversation{false} { - + m_inputs_embedder = std::make_shared( m_vlm_config, models_map, tokenizer, config_dir_path, device, properties); @@ -250,7 +250,7 @@ class ov::genai::VLMPipeline::VLMPipelineImpl { res_raw_counters.generate_durations.emplace_back(PerfMetrics::get_microsec(generate_end_time - generate_start_time)); res_raw_counters.detokenization_durations.emplace_back(PerfMetrics::get_microsec(decode_end_time - decode_start_time)); res_raw_counters.tokenization_durations.insert(res_raw_counters.tokenization_durations.end(), raw_counters.tokenization_durations.begin(), raw_counters.tokenization_durations.end()); - + // VLM specific perf metrics decoded.perf_metrics.vlm_raw_metrics.prepare_embeddings_durations.emplace_back(PerfMetrics::get_microsec(end_get_inputs_embeds - start_get_inputs_embeds)); diff --git a/src/cpp/src/visual_language/processor_config.cpp b/src/cpp/src/visual_language/processor_config.cpp index 527557061e..487002db10 100644 --- a/src/cpp/src/visual_language/processor_config.cpp +++ b/src/cpp/src/visual_language/processor_config.cpp @@ -20,7 +20,7 @@ ov::genai::ProcessorConfig::ProcessorConfig(const std::filesystem::path& json_pa if (parsed.contains("norm_std")) { norm_std = parsed.at("norm_std").get>(); } - + // Setting llava config params if (parsed.contains("image_mean")) { image_mean = parsed.at("image_mean").get>(); diff --git a/src/cpp/src/visual_language/vision_encoder.cpp b/src/cpp/src/visual_language/vision_encoder.cpp index 04ddd63145..4b55f9705c 100644 --- a/src/cpp/src/visual_language/vision_encoder.cpp +++ b/src/cpp/src/visual_language/vision_encoder.cpp @@ -331,7 +331,7 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o ov::Tensor pixel_values{ov::element::f32, {n_images, channels, patch_size, max_size / patch_size}}; size_t d3_all_pixel = pixel_values.get_shape().at(3); float* pixel_value_data = pixel_values.data(); - + //image chw to 1*c*kernel*hw/kernel and padding zero clip_image_f32& resized_preprocessed = preprocessed.at(0).at(0); size_t img_h = resized_preprocessed.ny; @@ -346,7 +346,7 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o for (size_t k_idx = 0; k_idx < patch_size; k_idx++) { std::copy(clip_value_data, clip_value_data + d3_clip_pixel, pixel_value_data); clip_value_data += d3_clip_pixel; - pixel_value_data += d3_all_pixel; + pixel_value_data += d3_all_pixel; } } @@ -359,7 +359,7 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o img_w = elem.nx; ov::Tensor clip_img{ov::element::f32, {1, channels, img_h, img_w}, elem.buf.data()}; ov::Tensor clip_pixel_values = preprocess_for_encoder(clip_img, patch_size); - + d3_clip_pixel = clip_pixel_values.get_shape().at(3); clip_value_data = clip_pixel_values.data(); pixel_value_data = pixel_values.data() + batch_pixel * channels * patch_size * d3_all_pixel; @@ -473,7 +473,7 @@ clip_image_f32 preprocess_clip_image_llava(const clip_image_u8& image, const Pro for (int y = 0; y < crop_height; ++y) { for (int x = 0; x < crop_width; ++x) { for (int c = 0; c < 3; ++c) { - cropped_image.buf[(y * crop_width + x) * 3 + c] = + cropped_image.buf[(y * crop_width + x) * 3 + c] = resized_image.buf[((start_y + y) * resized_image.nx + (start_x + x)) * 3 + c]; } } @@ -850,7 +850,7 @@ ImageSize smart_resize_qwen2vl(size_t height, size_t width, size_t factor, size_ } size_t h_bar = 
std::round(static_cast(height) / factor) * factor; - size_t w_bar = std::round(static_cast(width) / factor) * factor; + size_t w_bar = std::round(static_cast(width) / factor) * factor; if (h_bar * w_bar > max_pixels) { double beta = std::sqrt((height * width) / static_cast(max_pixels)); @@ -861,7 +861,7 @@ ImageSize smart_resize_qwen2vl(size_t height, size_t width, size_t factor, size_ h_bar = std::ceil(height * beta / factor) * factor; w_bar = std::ceil(width * beta / factor) * factor; } - + return ImageSize{h_bar, w_bar}; } @@ -876,24 +876,24 @@ ov::Tensor reshape_image_patches_qwen2vl( const size_t spatial_merge_size ) { ov::Shape output_shape{ - grid_t, - temporal_patch_size, - channel, - grid_h / spatial_merge_size, - spatial_merge_size, - patch_size, - grid_w / spatial_merge_size, - spatial_merge_size, - patch_size + grid_t, + temporal_patch_size, + channel, + grid_h / spatial_merge_size, + spatial_merge_size, + patch_size, + grid_w / spatial_merge_size, + spatial_merge_size, + patch_size }; - + ov::Tensor reshaped_patches(patches.get_element_type(), output_shape); const float* input_data = patches.data(); float* output_data = reshaped_patches.data(); size_t input_idx = 0; - + for (size_t gt = 0; gt < output_shape.at(0); ++gt) { for (size_t tp = 0; tp < output_shape.at(1); ++tp) { for (size_t c = 0; c < output_shape.at(2); ++c) { @@ -932,7 +932,7 @@ ov::Tensor transpose_image_patches_qwen2vl(const ov::Tensor& reshaped_patches) { // Input dimensions order: [0,1,2,3,4,5,6,7,8] // Output dimensions order: [0,3,6,4,7,2,1,5,8] auto input_shape = reshaped_patches.get_shape(); - + ov::Shape output_shape = { input_shape.at(0), // grid_t input_shape.at(3), // grid_h / spatial_merge_size @@ -946,14 +946,14 @@ ov::Tensor transpose_image_patches_qwen2vl(const ov::Tensor& reshaped_patches) { }; ov::Tensor transposed_patches(reshaped_patches.get_element_type(), output_shape); - + const float* src = reshaped_patches.data(); float* dst = transposed_patches.data(); - + size_t shape_size = input_shape.size(); std::vector input_strides(shape_size); std::vector output_strides(shape_size); - + input_strides[shape_size - 1] = 1; output_strides[shape_size - 1] = 1; for(int i = 7; i >= 0; i--) { @@ -969,7 +969,7 @@ ov::Tensor transpose_image_patches_qwen2vl(const ov::Tensor& reshaped_patches) { input_indices[i] = remaining / input_strides[i]; remaining %= input_strides[i]; } - + std::vector output_indices = { input_indices.at(0), input_indices.at(3), @@ -981,15 +981,15 @@ ov::Tensor transpose_image_patches_qwen2vl(const ov::Tensor& reshaped_patches) { input_indices.at(5), input_indices.at(8) }; - + size_t dst_idx = 0; for(int i = 0; i < shape_size; i++) { dst_idx += output_indices[i] * output_strides[i]; } - + dst[dst_idx] = src[idx]; } - + return transposed_patches; } } @@ -1119,8 +1119,8 @@ EncodedImage VisionEncoder::encode_qwen2vl(const ov::Tensor& image, const Proces auto original_width = image_shape.at(2); ImageSize target_image_size = smart_resize_qwen2vl( - original_height, - original_width, + original_height, + original_width, config.patch_size * config.merge_size, config.min_pixels, config.max_pixels @@ -1142,7 +1142,7 @@ EncodedImage VisionEncoder::encode_qwen2vl(const ov::Tensor& image, const Proces auto orig_shape = patches.get_shape(); ov::Tensor tiled_patches(patches.get_element_type(), {config.temporal_patch_size, orig_shape.at(1), orig_shape.at(2), orig_shape.at(3)}); - + for (size_t i = 0; i < config.temporal_patch_size; i++) { std::memcpy( tiled_patches.data() + i * 
patches.get_byte_size() / sizeof(float), @@ -1155,7 +1155,7 @@ EncodedImage VisionEncoder::encode_qwen2vl(const ov::Tensor& image, const Proces auto patches_shape = patches.get_shape(); size_t channel = patches_shape.at(1); - + size_t grid_t = patches_shape.at(0) / config.temporal_patch_size; size_t grid_h = target_image_size.height / config.patch_size; size_t grid_w = target_image_size.width / config.patch_size; diff --git a/src/cpp/src/visual_language/vlm_config.hpp b/src/cpp/src/visual_language/vlm_config.hpp index 7a052b8537..6f6cd09a5a 100644 --- a/src/cpp/src/visual_language/vlm_config.hpp +++ b/src/cpp/src/visual_language/vlm_config.hpp @@ -57,14 +57,14 @@ class VLMConfig { /// @brief phi3_v new line token embedding to separate images. std::vector sub_GN = std::vector(4096, 0.0f); std::vector glb_GN = std::vector(4096, 0.0f); - + /// @brief A string token denoting start of vision embeddings for Qwen2VL model. std::string vision_start_token = "<|vision_start|>"; /// @brief A placeholder for image embeddings in text for Qwen2VL model. std::string image_pad_token = "<|image_pad|>"; /// @brief A string token denoting end of vision embeddings for Qwen2VL model. std::string vision_end_token = "<|vision_end|>"; - + /// @brief Default constructor. VLMConfig() = default; /// @brief Construct VLMConfig from values in json_path. diff --git a/src/cpp/src/visual_language/vlm_model_type.hpp b/src/cpp/src/visual_language/vlm_model_type.hpp index 93387cacbc..258ce452c0 100644 --- a/src/cpp/src/visual_language/vlm_model_type.hpp +++ b/src/cpp/src/visual_language/vlm_model_type.hpp @@ -36,4 +36,4 @@ inline VLMModelType to_vlm_model_type(const std::string& value) { } OPENVINO_THROW("Unsupported '", value, "' VLM model type"); } -} \ No newline at end of file +} diff --git a/src/cpp/src/whisper_pipeline_static.cpp b/src/cpp/src/whisper_pipeline_static.cpp index e49a25e2d2..9357ed9c28 100644 --- a/src/cpp/src/whisper_pipeline_static.cpp +++ b/src/cpp/src/whisper_pipeline_static.cpp @@ -199,7 +199,7 @@ void zero_past_key_values(ov::InferRequest& request) { void prepare_decoder_with_past(ov::InferRequest& decoder_with_past, ov::InferRequest& decoder, const size_t init_ids_size) { // NB: Prepare attetion mask to be in a format [0, 0, 0, 1, 1, 1, 1, ..., 0, 1] - // Mask should be inverted for decoder_with_past + // Mask should be inverted for decoder_with_past auto attention_mask = decoder_with_past.get_tensor("attention_mask"); auto* attention_mask_ptr = attention_mask.data(); std::fill(attention_mask_ptr, attention_mask_ptr + init_ids_size, 0); @@ -548,7 +548,7 @@ WhisperPipeline::StaticWhisperPipeline::StaticWhisperPipeline(const std::filesys ov::genai::utils::print_compiled_model_properties(compiled_model, "Static Whisper encoder model"); m_models.encoder = compiled_model.create_infer_request(); - // Will compile decoder model when it's needed + // Will compile decoder model when it's needed m_decoder_cache = DecoderCache(decoder_model, properties); compiled_model = core.compile_model(decoder_with_past_model, "NPU", properties); diff --git a/src/docs/DEBUG_LOG.md b/src/docs/DEBUG_LOG.md index 5ed3f35d17..146072a6c4 100644 --- a/src/docs/DEBUG_LOG.md +++ b/src/docs/DEBUG_LOG.md @@ -40,4 +40,4 @@ the properties of the compiled model are printed as follows: AFFINITY: CORE EXECUTION_DEVICES: CPU: Intel(R) Xeon(R) Platinum 8468 -``` \ No newline at end of file +``` diff --git a/src/python/openvino_genai/__init__.py b/src/python/openvino_genai/__init__.py index 0ad7ba3f12..ff14ecc331 100644 --- 
a/src/python/openvino_genai/__init__.py +++ b/src/python/openvino_genai/__init__.py @@ -28,7 +28,7 @@ # LLM pipeline from .py_openvino_genai import ( - LLMPipeline, + LLMPipeline, draft_model, ) diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index f1898d1232..da09b2a36b 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -28,15 +28,15 @@ class AdapterConfig: class Mode: """ Members: - + MODE_AUTO - + MODE_DYNAMIC - + MODE_STATIC_RANK - + MODE_STATIC - + MODE_FUSE """ MODE_AUTO: typing.ClassVar[AdapterConfig.Mode] # value = @@ -107,11 +107,11 @@ class AggregationMode: Represents the mode of per-token score aggregation when determining least important tokens for eviction from cache :param AggregationMode.SUM: In this mode the importance scores of each token will be summed after each step of generation :param AggregationMode.NORM_SUM: Same as SUM, but the importance scores are additionally divided by the lifetime (in tokens generated) of a given token in cache - + Members: - + SUM - + NORM_SUM """ NORM_SUM: typing.ClassVar[AggregationMode] # value = @@ -312,20 +312,20 @@ class CLIPTextModelWithProjection: ... class CacheEvictionConfig: """ - + Configuration struct for the cache eviction algorithm. :param start_size: Number of tokens in the *beginning* of KV cache that should be retained in the KV cache for this sequence during generation. Must be non-zero and a multiple of the KV cache block size for this pipeline. :type start_size: int - + :param recent_size: Number of tokens in the *end* of KV cache that should be retained in the KV cache for this sequence during generation. Must be non-zero and a multiple of the KV cache block size for this pipeline. :type recent_size: int - + :param max_cache_size: Maximum number of tokens that should be kept in the KV cache. The evictable block area will be located between the "start" and "recent" blocks and its size will be calculated as (`max_cache_size` - `start_size` - `recent_size`). Must be non-zero, larger than (`start_size` + `recent_size`), and a multiple of the KV cache block size for this pipeline. Note that since only the completely filled blocks are evicted, the actual maximum per-sequence KV cache size in tokens may be up to (`max_cache_size` + `SchedulerConfig.block_size - 1`). :type max_cache_size: int - + :param aggregation_mode: The mode used to compute the importance of tokens for eviction :type aggregation_mode: openvino_genai.AggregationMode - + :param apply_rotation: Whether to apply cache rotation (RoPE-based) after each eviction. Set this to false if your model has different RoPE scheme from the one used in the original llama model and you experience accuracy issues with cache eviction enabled. @@ -345,7 +345,7 @@ class CacheEvictionConfig: ... class ChunkStreamerBase: """ - + Base class for chunk streamers. In order to use inherit from from this class. """ def __init__(self) -> None: @@ -408,11 +408,11 @@ class CppStdGenerator(Generator): ... class DecodedResults: """ - + Structure to store resulting batched text outputs and scores for each batch. The first num_return_sequences elements correspond to the first batch element. - - Parameters: + + Parameters: texts: vector of resulting sequences. scores: scores for each sequence. metrics: performance metrics with tpot, ttft, etc. of type ov::genai::PerfMetrics. @@ -432,10 +432,10 @@ class DecodedResults: ... 
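Per the CacheEvictionConfig docstring above, the evictable block area is max_cache_size minus start_size minus recent_size, with every size a multiple of the KV cache block size. A hedged construction sketch; the keyword-style constructor is an assumption, so verify against the installed stubs:

```python
import openvino_genai as ov_genai

# All sizes are in tokens and are assumed here to be multiples of a 32-token KV block.
start_size, recent_size, max_cache_size = 32, 128, 672

# The evictable area sits between the retained "start" and "recent" blocks.
evictable = max_cache_size - start_size - recent_size   # 512 tokens in this example

# Keyword-style construction is an assumption for illustration.
eviction_config = ov_genai.CacheEvictionConfig(
    start_size=start_size,
    recent_size=recent_size,
    max_cache_size=max_cache_size,
    aggregation_mode=ov_genai.AggregationMode.NORM_SUM,
)
print(f"evictable block area: {evictable} tokens")
```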
class EncodedGenerationResult: """ - + GenerationResult stores resulting batched tokens and scores. - - Parameters: + + Parameters: request_id: obsolete when handle API is approved as handle will connect results with prompts. generation_ids: in a generic case we have multiple generation results per initial prompt depending on sampling parameters (e.g. beam search or parallel sampling). @@ -448,7 +448,7 @@ class EncodedGenerationResult: DROPPED_BY_HANDLE = 4 - Status set when generation handle is dropped. perf_metrics: Performance metrics for each generation result. - + """ m_generation_ids: list[list[int]] m_scores: list[float] @@ -462,14 +462,14 @@ class EncodedGenerationResult: ... class EncodedResults: """ - + Structure to store resulting batched tokens and scores for each batch sequence. The first num_return_sequences elements correspond to the first batch element. In the case if results decoded with beam search and random sampling scores contain sum of logarithmic probabilities for each token in the sequence. In the case of greedy decoding scores are filled with zeros. - - Parameters: + + Parameters: tokens: sequence of resulting tokens. scores: sum of logarithmic probabilities of all tokens in the sequence. metrics: performance metrics with tpot, ttft, etc. of type ov::genai::PerfMetrics. @@ -532,11 +532,11 @@ class FluxTransformer2DModel: ... class GenerationConfig: """ - + Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will be used while greedy and beam search parameters will not affect decoding at all. - + Parameters: max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt + max_new_tokens. Its effect is overridden by `max_new_tokens`, if also set. @@ -550,11 +550,11 @@ class GenerationConfig: echo: if set to true, the model will echo the prompt in the output. logprobs: number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned. Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0). - + repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty. presence_penalty: reduces absolute log prob if the token was generated at least once. frequency_penalty: reduces absolute log prob as many times as the token was generated. - + Beam search specific parameters: num_beams: number of beams for beam search. 1 disables beam search. num_beam_groups: number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. @@ -569,7 +569,7 @@ class GenerationConfig: "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; "openvino_genai.StopCriteria.HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates; "openvino_genai.StopCriteria.NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm). - + Random sampling parameters: temperature: the value used to modulate token probabilities for random sampling. top_p: if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. 
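Since the GenerationConfig docstring above groups parameters by decoding method, a short sketch of configuring the three modes using only fields named in that docstring:

```python
import openvino_genai as ov_genai

# Greedy decoding: only the generic parameters are used.
greedy = ov_genai.GenerationConfig()
greedy.max_new_tokens = 64

# Multinomial sampling: do_sample activates the random-sampling group.
sampling = ov_genai.GenerationConfig()
sampling.max_new_tokens = 64
sampling.do_sample = True
sampling.temperature = 0.7
sampling.top_p = 0.9

# Beam search: beam parameters take effect only when num_beams > 1.
beam = ov_genai.GenerationConfig()
beam.max_new_tokens = 64
beam.num_beams = 4
beam.num_beam_groups = 2
```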
@@ -633,11 +633,11 @@ class GenerationConfig: class GenerationFinishReason: """ Members: - + NONE - + STOP - + LENGTH """ LENGTH: typing.ClassVar[GenerationFinishReason] # value = @@ -690,10 +690,10 @@ class GenerationOutput: score: float class GenerationResult: """ - + GenerationResult stores resulting batched tokens and scores. - - Parameters: + + Parameters: request_id: obsolete when handle API is approved as handle will connect results with prompts. generation_ids: in a generic case we have multiple generation results per initial prompt depending on sampling parameters (e.g. beam search or parallel sampling). @@ -706,7 +706,7 @@ class GenerationResult: DROPPED_BY_HANDLE = 4 - Status set when generation handle is dropped. perf_metrics: Performance metrics for each generation result. - + """ m_generation_ids: list[str] m_scores: list[float] @@ -726,15 +726,15 @@ class GenerationResult: class GenerationStatus: """ Members: - + RUNNING - + FINISHED - + IGNORED - + DROPPED_BY_PIPELINE - + DROPPED_BY_HANDLE """ DROPPED_BY_HANDLE: typing.ClassVar[GenerationStatus] # value = @@ -819,12 +819,12 @@ class Image2ImagePipeline: def generate(self, prompt: str, image: openvino._pyopenvino.Tensor, **kwargs) -> openvino._pyopenvino.Tensor: """ Generates images for text-to-image models. - + :param prompt: input prompt :type prompt: str - + :param kwargs: arbitrary keyword arguments with keys corresponding to generate params. - + Expected parameters list: prompt_2: str - second prompt, prompt_3: str - third prompt, @@ -842,7 +842,7 @@ class Image2ImagePipeline: adapters: LoRA adapters, strength: strength for image to image generation. 1.0f means initial image is fully noised, max_sequence_length: int - length of t5_encoder_model input - + :return: ov.Tensor with resulting images :rtype: ov.Tensor """ @@ -920,12 +920,12 @@ class InpaintingPipeline: def generate(self, prompt: str, image: openvino._pyopenvino.Tensor, mask_image: openvino._pyopenvino.Tensor, **kwargs) -> openvino._pyopenvino.Tensor: """ Generates images for text-to-image models. - + :param prompt: input prompt :type prompt: str - + :param kwargs: arbitrary keyword arguments with keys corresponding to generate params. - + Expected parameters list: prompt_2: str - second prompt, prompt_3: str - third prompt, @@ -943,7 +943,7 @@ class InpaintingPipeline: adapters: LoRA adapters, strength: strength for image to image generation. 1.0f means initial image is fully noised, max_sequence_length: int - length of t5_encoder_model input - + :return: ov.Tensor with resulting images :rtype: ov.Tensor """ @@ -962,27 +962,27 @@ class LLMPipeline: def __call__(self, inputs: openvino._pyopenvino.Tensor | TokenizedInputs | str | list[str], generation_config: GenerationConfig | None = None, streamer: typing.Callable[[str], bool] | StreamerBase | None = None, **kwargs) -> EncodedResults | DecodedResults: """ Generates sequences or tokens for LLMs. If input is a string or list of strings then resulting sequences will be already detokenized. - + :param inputs: inputs in the form of string, list of strings or tokenized input_ids :type inputs: str, List[str], ov.genai.TokenizedInputs, or ov.Tensor - + :param generation_config: generation_config :type generation_config: GenerationConfig or a Dict - + :param streamer: streamer either as a lambda with a boolean returning flag whether generation should be stopped :type : Callable[[str], bool], ov.genai.StreamerBase - + :param kwargs: arbitrary keyword arguments with keys corresponding to GenerationConfig fields. 
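A minimal end-to-end sketch of the call described above, with generation parameters passed as kwargs and tokens streamed through a callable; the model path is a placeholder, and the callable returns False so that generation is not stopped, matching the boolean convention in the docstring:

```python
import openvino_genai as ov_genai

models_path = "path/to/exported/model"   # placeholder

pipe = ov_genai.LLMPipeline(models_path, "CPU")

def streamer(subword: str) -> bool:
    print(subword, end="", flush=True)
    return False   # False means "do not stop", keep generating

# kwargs map onto GenerationConfig fields, as the docstring above states.
result = pipe.generate("What is OpenVINO?", max_new_tokens=100, streamer=streamer)
```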
:type : Dict - + :return: return results in encoded, or decoded form depending on inputs type :rtype: DecodedResults, EncodedResults, str - - + + Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group and generic parameters are used. For example, if do_sample is set to true, then only generic parameters and random sampling parameters will be used while greedy and beam search parameters will not affect decoding at all. - + Parameters: max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt + max_new_tokens. Its effect is overridden by `max_new_tokens`, if also set. @@ -996,11 +996,11 @@ class LLMPipeline: echo: if set to true, the model will echo the prompt in the output. logprobs: number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned. Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0). - + repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty. presence_penalty: reduces absolute log prob if the token was generated at least once. frequency_penalty: reduces absolute log prob as many times as the token was generated. - + Beam search specific parameters: num_beams: number of beams for beam search. 1 disables beam search. num_beam_groups: number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. @@ -1015,7 +1015,7 @@ class LLMPipeline: "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; "openvino_genai.StopCriteria.HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates; "openvino_genai.StopCriteria.NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm). - + Random sampling parameters: temperature: the value used to modulate token probabilities for random sampling. top_p: if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. @@ -1047,27 +1047,27 @@ class LLMPipeline: def generate(self, inputs: openvino._pyopenvino.Tensor | TokenizedInputs | str | list[str], generation_config: GenerationConfig | None = None, streamer: typing.Callable[[str], bool] | StreamerBase | None = None, **kwargs) -> EncodedResults | DecodedResults: """ Generates sequences or tokens for LLMs. If input is a string or list of strings then resulting sequences will be already detokenized. - + :param inputs: inputs in the form of string, list of strings or tokenized input_ids :type inputs: str, List[str], ov.genai.TokenizedInputs, or ov.Tensor - + :param generation_config: generation_config :type generation_config: GenerationConfig or a Dict - + :param streamer: streamer either as a lambda with a boolean returning flag whether generation should be stopped :type : Callable[[str], bool], ov.genai.StreamerBase - + :param kwargs: arbitrary keyword arguments with keys corresponding to GenerationConfig fields. :type : Dict - + :return: return results in encoded, or decoded form depending on inputs type :rtype: DecodedResults, EncodedResults, str - - + + Structure to keep generation config parameters. For a selected method of decoding, only parameters from that group and generic parameters are used. 
For example, if do_sample is set to true, then only generic parameters and random sampling parameters will be used while greedy and beam search parameters will not affect decoding at all. - + Parameters: max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt + max_new_tokens. Its effect is overridden by `max_new_tokens`, if also set. @@ -1081,11 +1081,11 @@ class LLMPipeline: echo: if set to true, the model will echo the prompt in the output. logprobs: number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned. Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0). - + repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty. presence_penalty: reduces absolute log prob if the token was generated at least once. frequency_penalty: reduces absolute log prob as many times as the token was generated. - + Beam search specific parameters: num_beams: number of beams for beam search. 1 disables beam search. num_beam_groups: number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. @@ -1100,7 +1100,7 @@ class LLMPipeline: "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; "openvino_genai.StopCriteria.HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates; "openvino_genai.StopCriteria.NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm). - + Random sampling parameters: temperature: the value used to modulate token probabilities for random sampling. top_p: if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. @@ -1129,9 +1129,9 @@ class MeanStdPair: ... class PerfMetrics: """ - + Holds performance metrics for each generate call. - + PerfMetrics holds fields with mean and standard deviations for the following metrics: - Time To the First Token (TTFT), ms - Time per Output Token (TPOT), ms/token @@ -1139,42 +1139,42 @@ class PerfMetrics: - Tokenization duration, ms - Detokenization duration, ms - Throughput, tokens/s - + Additional fields include: - Load time, ms - Number of generated tokens - Number of tokens in the input prompt - + Preferable way to access values is via get functions. Getters calculate mean and std values from raw_metrics and return pairs. If mean and std were already calculated, getters return cached values. - + :param get_load_time: Returns the load time in milliseconds. :type get_load_time: float - + :param get_num_generated_tokens: Returns the number of generated tokens. :type get_num_generated_tokens: int - + :param get_num_input_tokens: Returns the number of tokens in the input prompt. :type get_num_input_tokens: int - + :param get_ttft: Returns the mean and standard deviation of TTFT in milliseconds. :type get_ttft: MeanStdPair - + :param get_tpot: Returns the mean and standard deviation of TPOT in milliseconds. :type get_tpot: MeanStdPair - + :param get_throughput: Returns the mean and standard deviation of throughput in tokens per second. :type get_throughput: MeanStdPair - + :param get_generate_duration: Returns the mean and standard deviation of generate durations in milliseconds. 
:type get_generate_duration: MeanStdPair - + :param get_tokenization_duration: Returns the mean and standard deviation of tokenization durations in milliseconds. :type get_tokenization_duration: MeanStdPair - + :param get_detokenization_duration: Returns the mean and standard deviation of detokenization durations in milliseconds. :type get_detokenization_duration: MeanStdPair - + :param raw_metrics: A structure of RawPerfMetrics type that holds raw metrics. :type raw_metrics: RawPerfMetrics """ @@ -1211,23 +1211,23 @@ class PerfMetrics: ... class PipelineMetrics: """ - + Contains general pipeline metrics, either aggregated throughout the lifetime of the generation pipeline or measured at the previous generation step. - + :param requests: Number of requests to be processed by the pipeline. :type requests: int - + :param scheduled_requests: Number of requests that were scheduled for processing at the previous step of the pipeline. :type scheduled_requests: int - + :param cache_usage: Percentage of KV cache usage in the last generation step. :type cache_usage: float - + :param max_cache_usage: Max KV cache usage during the lifetime of the pipeline in % :type max_cache_usage: float - - + + :param avg_cache_usage: Running average of the KV cache usage (in %) during the lifetime of the pipeline, with max window size of 1000 steps :type avg_cache_usage: float """ @@ -1250,33 +1250,33 @@ class PipelineMetrics: ... class RawPerfMetrics: """ - + Structure with raw performance metrics for each generation before any statistics are calculated. - + :param generate_durations: Durations for each generate call in milliseconds. :type generate_durations: List[float] - + :param tokenization_durations: Durations for the tokenization process in milliseconds. :type tokenization_durations: List[float] - + :param detokenization_durations: Durations for the detokenization process in milliseconds. :type detokenization_durations: List[float] - + :param m_times_to_first_token: Times to the first token for each call in milliseconds. :type m_times_to_first_token: List[float] - + :param m_new_token_times: Timestamps of generation every token or batch of tokens in milliseconds. :type m_new_token_times: List[double] - + :param token_infer_durations : Inference time for each token in milliseconds. :type batch_sizes: List[float] - + :param m_batch_sizes: Batch sizes for each generate call. :type m_batch_sizes: List[int] - + :param m_durations: Total durations for each generate call in milliseconds. :type m_durations: List[float] - + :param inference_durations : Total inference duration for each generate call in milliseconds. :type batch_sizes: List[float] """ @@ -1365,21 +1365,21 @@ class Scheduler: class Type: """ Members: - + AUTO - + LCM - + LMS_DISCRETE - + DDIM - + EULER_DISCRETE - + FLOW_MATCH_EULER_DISCRETE - + PNDM - + EULER_ANCESTRAL_DISCRETE """ AUTO: typing.ClassVar[Scheduler.Type] # value = @@ -1422,17 +1422,17 @@ class Scheduler: ... class SchedulerConfig: """ - + SchedulerConfig to construct ContinuousBatchingPipeline - - Parameters: + + Parameters: max_num_batched_tokens: a maximum number of tokens to batch (in contrast to max_batch_size which combines independent sequences, we consider total amount of tokens in a batch). num_kv_blocks: total number of KV blocks available to scheduler logic. cache_size: total size of KV cache in GB. block_size: block size for KV cache. dynamic_split_fuse: whether to split prompt / generate to different scheduling phases. 
- + vLLM-like settings: max_num_seqs: max number of scheduled sequences (you can think of it as "max batch size"). enable_prefix_caching: Enable caching of KV-blocks. @@ -1454,21 +1454,21 @@ class SchedulerConfig: ... class StopCriteria: """ - + StopCriteria controls the stopping condition for grouped beam search. - + The following values are possible: "openvino_genai.StopCriteria.EARLY" stops as soon as there are `num_beams` complete candidates. "openvino_genai.StopCriteria.HEURISTIC" stops when is it unlikely to find better candidates. "openvino_genai.StopCriteria.NEVER" stops when there cannot be better candidates. - - + + Members: - + EARLY - + HEURISTIC - + NEVER """ EARLY: typing.ClassVar[StopCriteria] # value = @@ -1503,7 +1503,7 @@ class StopCriteria: ... class StreamerBase: """ - + Base class for streamers. In order to use inherit from from this class and implement put, and methods. """ def __init__(self) -> None: @@ -1608,12 +1608,12 @@ class Text2ImagePipeline: def generate(self, prompt: str, **kwargs) -> openvino._pyopenvino.Tensor: """ Generates images for text-to-image models. - + :param prompt: input prompt :type prompt: str - + :param kwargs: arbitrary keyword arguments with keys corresponding to generate params. - + Expected parameters list: prompt_2: str - second prompt, prompt_3: str - third prompt, @@ -1631,7 +1631,7 @@ class Text2ImagePipeline: adapters: LoRA adapters, strength: strength for image to image generation. 1.0f means initial image is fully noised, max_sequence_length: int - length of t5_encoder_model input - + :return: ov.Tensor with resulting images :rtype: ov.Tensor """ @@ -1766,10 +1766,10 @@ class UNet2DConditionModel: ... class VLMDecodedResults(DecodedResults): """ - + Structure to store resulting batched text outputs and scores for each batch. The first num_return_sequences elements correspond to the first batch element. - + Parameters: texts: vector of resulting sequences. scores: scores for each sequence. @@ -1790,12 +1790,12 @@ class VLMDecodedResults(DecodedResults): ... class VLMPerfMetrics(PerfMetrics): """ - + Structure with raw performance metrics for each generation before any statistics are calculated. - + :param get_prepare_embeddings_duration: Returns mean and standard deviation of embeddings preparation duration in milliseconds :type get_prepare_embeddings_duration: MeanStdPair - + :param vlm_raw_metrics: VLM specific raw metrics :type VLMRawPerfMetrics: """ @@ -1824,22 +1824,22 @@ class VLMPipeline: def generate(self, prompt: str, images: list[openvino._pyopenvino.Tensor], generation_config: GenerationConfig, streamer: typing.Callable[[str], bool] | StreamerBase | None = None, **kwargs) -> VLMDecodedResults: """ Generates sequences for VLMs. - + :param prompt: input prompt :type prompt: str - + :param images: image or list of images :type images: List[ov.Tensor] or ov.Tensor - + :param generation_config: generation_config :type generation_config: GenerationConfig or a Dict - + :param streamer: streamer either as a lambda with a boolean returning flag whether generation should be stopped :type : Callable[[str], bool], ov.genai.StreamerBase - + :param kwargs: arbitrary keyword arguments with keys corresponding to GenerationConfig fields. 
:type : Dict - + :return: return results in decoded form :rtype: VLMDecodedResults """ @@ -1847,22 +1847,22 @@ class VLMPipeline: def generate(self, prompt: str, images: openvino._pyopenvino.Tensor, generation_config: GenerationConfig, streamer: typing.Callable[[str], bool] | StreamerBase | None = None, **kwargs) -> VLMDecodedResults: """ Generates sequences for VLMs. - + :param prompt: input prompt :type prompt: str - + :param images: image or list of images :type images: List[ov.Tensor] or ov.Tensor - + :param generation_config: generation_config :type generation_config: GenerationConfig or a Dict - + :param streamer: streamer either as a lambda with a boolean returning flag whether generation should be stopped :type : Callable[[str], bool], ov.genai.StreamerBase - + :param kwargs: arbitrary keyword arguments with keys corresponding to GenerationConfig fields. :type : Dict - + :return: return results in decoded form :rtype: VLMDecodedResults """ @@ -1870,18 +1870,18 @@ class VLMPipeline: def generate(self, prompt: str, **kwargs) -> VLMDecodedResults: """ Generates sequences for VLMs. - + :param prompt: input prompt :type prompt: str - + :param kwargs: arbitrary keyword arguments with keys corresponding to generate params. - + Expected parameters list: image: ov.Tensor - input image, images: List[ov.Tensor] - input images, generation_config: GenerationConfig, streamer: Callable[[str], bool], ov.genai.StreamerBase - streamer either as a lambda with a boolean returning flag whether generation should be stopped - + :return: return results in decoded form :rtype: VLMDecodedResults """ @@ -1897,9 +1897,9 @@ class VLMPipeline: ... class VLMRawPerfMetrics: """ - + Structure with VLM specific raw performance metrics for each generation before any statistics are calculated. - + :param prepare_embeddings_durations: Durations of embeddings preparation. :type prepare_embeddings_durations: List[MicroSeconds] """ @@ -1910,9 +1910,9 @@ class VLMRawPerfMetrics: ... class WhisperDecodedResultChunk: """ - + Structure to store decoded text with corresponding timestamps - + :param start_ts chunk start time in seconds :param end_ts chunk end time in seconds :param text chunk text @@ -1930,9 +1930,9 @@ class WhisperDecodedResultChunk: ... class WhisperDecodedResults: """ - + Structure to store resulting text outputs and scores. - + Parameters: texts: vector of resulting sequences. scores: scores for each sequence. @@ -1955,47 +1955,47 @@ class WhisperDecodedResults: ... class WhisperGenerationConfig(GenerationConfig): """ - + WhisperGenerationConfig - + Whisper specific parameters: :param decoder_start_token_id: Corresponds to the ”<|startoftranscript|>” token. :type decoder_start_token_id: int - + :param pad_token_id: Padding token id. :type pad_token_id: int - + :param translate_token_id: Translate token id. :type translate_token_id: int - + :param transcribe_token_id: Transcribe token id. :type transcribe_token_id: int - + :param no_timestamps_token_id: No timestamps token id. :type no_timestamps_token_id: int - + :param prev_sot_token_id: Corresponds to the ”<|startofprev|>” token. :type prev_sot_token_id: int - + :param is_multilingual: :type is_multilingual: bool - + :param begin_suppress_tokens: A list containing tokens that will be suppressed at the beginning of the sampling process. :type begin_suppress_tokens: list[int] - + :param suppress_tokens: A list containing the non-speech tokens that will be suppressed during generation. 
:type suppress_tokens: list[int] - + :param language: Language token to use for generation in the form of <|en|>. You can find all the possible language tokens in the generation_config.json lang_to_id dictionary. :type language: Optional[str] - + :param lang_to_id: Language token to token_id map. Initialized from the generation_config.json lang_to_id dictionary. :type lang_to_id: Dict[str, int] - + :param task: Task to use for generation, either “translate” or “transcribe” :type task: int - + :param return_timestamps: If `true` the pipeline will return timestamps along the text for *segments* of words in the text. For instance, if you get WhisperDecodedResultChunk @@ -2005,29 +2005,29 @@ class WhisperGenerationConfig(GenerationConfig): then it means the model predicts that the segment "Hi there!" was spoken after `0.5` and before `1.5` seconds. Note that a segment of text refers to a sequence of one or more words, rather than individual words. :type return_timestamps: bool - + :param initial_prompt: Initial prompt tokens passed as a previous transcription (after `<|startofprev|>` token) to the first processing window. Can be used to steer the model to use particular spellings or styles. - + Example: auto result = pipeline.generate(raw_speech); // He has gone and gone for good answered Paul Icrom who... - + auto result = pipeline.generate(raw_speech, ov::genai::initial_prompt("Polychrome")); // He has gone and gone for good answered Polychrome who... :type initial_prompt: Optional[str] - + :param hotwords: Hotwords tokens passed as a previous transcription (after `<|startofprev|>` token) to the all processing windows. Can be used to steer the model to use particular spellings or styles. - + Example: auto result = pipeline.generate(raw_speech); // He has gone and gone for good answered Paul Icrom who... - + auto result = pipeline.generate(raw_speech, ov::genai::hotwords("Polychrome")); // He has gone and gone for good answered Polychrome who... :type hotwords: Optional[str] - + Generic parameters: max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt + max_new_tokens. Its effect is overridden by `max_new_tokens`, if also set. @@ -2041,11 +2041,11 @@ class WhisperGenerationConfig(GenerationConfig): echo: if set to true, the model will echo the prompt in the output. logprobs: number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned. Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0). - + repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty. presence_penalty: reduces absolute log prob if the token was generated at least once. frequency_penalty: reduces absolute log prob as many times as the token was generated. - + Beam search specific parameters: num_beams: number of beams for beam search. 1 disables beam search. num_beam_groups: number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. @@ -2060,7 +2060,7 @@ class WhisperGenerationConfig(GenerationConfig): "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; "openvino_genai.StopCriteria.HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates; "openvino_genai.StopCriteria.NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm). 
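A short sketch of how the Whisper-specific fields above are typically passed to WhisperPipeline.generate(); the model directory and the silent waveform are placeholders, and reading timestamped segments assumes the documented WhisperDecodedResultChunk entries are exposed through a chunks attribute:

    import openvino_genai as ov_genai

    # Placeholder path to a Whisper model exported for OpenVINO GenAI.
    pipe = ov_genai.WhisperPipeline("./whisper-base", "CPU")

    # Raw speech: 16 kHz float samples normalized to roughly [-1, 1]; silence as a stand-in.
    raw_speech = [0.0] * 16000

    result = pipe.generate(
        raw_speech,
        max_new_tokens=100,
        return_timestamps=True,
        initial_prompt="OpenVINO",  # biases spelling via the <|startofprev|> context
    )

    for text in result.texts:
        print(text)
    for chunk in result.chunks or []:
        print(f"[{chunk.start_ts:.2f}s - {chunk.end_ts:.2f}s] {chunk.text}")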
- + Random sampling parameters: temperature: the value used to modulate token probabilities for random sampling. top_p: if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. @@ -2096,12 +2096,12 @@ class WhisperGenerationConfig(GenerationConfig): ... class WhisperPerfMetrics(PerfMetrics): """ - + Structure with raw performance metrics for each generation before any statistics are calculated. - + :param get_features_extraction_duration: Returns mean and standard deviation of features extraction duration in milliseconds :type get_features_extraction_duration: MeanStdPair - + :param whisper_raw_metrics: Whisper specific raw metrics :type WhisperRawPerfMetrics: """ @@ -2125,64 +2125,64 @@ class WhisperPipeline: def generate(self, raw_speech_input: list[float], generation_config: WhisperGenerationConfig | None = None, streamer: typing.Callable[[str], bool] | ChunkStreamerBase | None = None, **kwargs) -> WhisperDecodedResults: """ High level generate that receives raw speech as a vector of floats and returns decoded output. - + :param raw_speech_input: inputs in the form of list of floats. Required to be normalized to near [-1, 1] range and have 16k Hz sampling rate. :type raw_speech_input: List[float] - + :param generation_config: generation_config :type generation_config: WhisperGenerationConfig or a Dict - + :param streamer: streamer either as a lambda with a boolean returning flag whether generation should be stopped. Streamer supported for short-form audio (< 30 seconds) with `return_timestamps=False` only :type : Callable[[str], bool], ov.genai.StreamerBase - + :param kwargs: arbitrary keyword arguments with keys corresponding to WhisperGenerationConfig fields. :type : Dict - + :return: return results in decoded form :rtype: WhisperDecodedResults - - + + WhisperGenerationConfig - + Whisper specific parameters: :param decoder_start_token_id: Corresponds to the ”<|startoftranscript|>” token. :type decoder_start_token_id: int - + :param pad_token_id: Padding token id. :type pad_token_id: int - + :param translate_token_id: Translate token id. :type translate_token_id: int - + :param transcribe_token_id: Transcribe token id. :type transcribe_token_id: int - + :param no_timestamps_token_id: No timestamps token id. :type no_timestamps_token_id: int - + :param prev_sot_token_id: Corresponds to the ”<|startofprev|>” token. :type prev_sot_token_id: int - + :param is_multilingual: :type is_multilingual: bool - + :param begin_suppress_tokens: A list containing tokens that will be suppressed at the beginning of the sampling process. :type begin_suppress_tokens: list[int] - + :param suppress_tokens: A list containing the non-speech tokens that will be suppressed during generation. :type suppress_tokens: list[int] - + :param language: Language token to use for generation in the form of <|en|>. You can find all the possible language tokens in the generation_config.json lang_to_id dictionary. :type language: Optional[str] - + :param lang_to_id: Language token to token_id map. Initialized from the generation_config.json lang_to_id dictionary. :type lang_to_id: Dict[str, int] - + :param task: Task to use for generation, either “translate” or “transcribe” :type task: int - + :param return_timestamps: If `true` the pipeline will return timestamps along the text for *segments* of words in the text. 
For instance, if you get WhisperDecodedResultChunk @@ -2192,29 +2192,29 @@ class WhisperPipeline: then it means the model predicts that the segment "Hi there!" was spoken after `0.5` and before `1.5` seconds. Note that a segment of text refers to a sequence of one or more words, rather than individual words. :type return_timestamps: bool - + :param initial_prompt: Initial prompt tokens passed as a previous transcription (after `<|startofprev|>` token) to the first processing window. Can be used to steer the model to use particular spellings or styles. - + Example: auto result = pipeline.generate(raw_speech); // He has gone and gone for good answered Paul Icrom who... - + auto result = pipeline.generate(raw_speech, ov::genai::initial_prompt("Polychrome")); // He has gone and gone for good answered Polychrome who... :type initial_prompt: Optional[str] - + :param hotwords: Hotwords tokens passed as a previous transcription (after `<|startofprev|>` token) to the all processing windows. Can be used to steer the model to use particular spellings or styles. - + Example: auto result = pipeline.generate(raw_speech); // He has gone and gone for good answered Paul Icrom who... - + auto result = pipeline.generate(raw_speech, ov::genai::hotwords("Polychrome")); // He has gone and gone for good answered Polychrome who... :type hotwords: Optional[str] - + Generic parameters: max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt + max_new_tokens. Its effect is overridden by `max_new_tokens`, if also set. @@ -2228,11 +2228,11 @@ class WhisperPipeline: echo: if set to true, the model will echo the prompt in the output. logprobs: number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned. Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0). - + repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty. presence_penalty: reduces absolute log prob if the token was generated at least once. frequency_penalty: reduces absolute log prob as many times as the token was generated. - + Beam search specific parameters: num_beams: number of beams for beam search. 1 disables beam search. num_beam_groups: number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. @@ -2247,7 +2247,7 @@ class WhisperPipeline: "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; "openvino_genai.StopCriteria.HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates; "openvino_genai.StopCriteria.NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm). - + Random sampling parameters: temperature: the value used to modulate token probabilities for random sampling. top_p: if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. @@ -2263,9 +2263,9 @@ class WhisperPipeline: ... class WhisperRawPerfMetrics: """ - + Structure with whisper specific raw performance metrics for each generation before any statistics are calculated. - + :param features_extraction_durations: Duration for each features extraction call. 
:type features_extraction_durations: List[MicroSeconds] """ diff --git a/src/python/py_continuous_batching_pipeline.cpp b/src/python/py_continuous_batching_pipeline.cpp index aad40a2204..03ffa959be 100644 --- a/src/python/py_continuous_batching_pipeline.cpp +++ b/src/python/py_continuous_batching_pipeline.cpp @@ -53,7 +53,7 @@ auto cache_eviction_config_docstring = R"( auto scheduler_config_docstring = R"( SchedulerConfig to construct ContinuousBatchingPipeline - Parameters: + Parameters: max_num_batched_tokens: a maximum number of tokens to batch (in contrast to max_batch_size which combines independent sequences, we consider total amount of tokens in a batch). num_kv_blocks: total number of KV blocks available to scheduler logic. @@ -74,7 +74,7 @@ auto scheduler_config_docstring = R"( auto generation_result_docstring = R"( GenerationResult stores resulting batched tokens and scores. - Parameters: + Parameters: request_id: obsolete when handle API is approved as handle will connect results with prompts. generation_ids: in a generic case we have multiple generation results per initial prompt depending on sampling parameters (e.g. beam search or parallel sampling). @@ -136,7 +136,7 @@ py::object __call_cb_generate(ContinuousBatchingPipeline& pipe, { py::gil_scoped_release rel; encoded_results = pipe.generate(input_ids, sampling_params, streamer); - } + } results = py::cast(encoded_results); }, [&](std::vector prompts) { @@ -144,7 +144,7 @@ py::object __call_cb_generate(ContinuousBatchingPipeline& pipe, { py::gil_scoped_release rel; generated_results = pipe.generate(prompts, sampling_params, streamer); - } + } results = py::cast(generated_results); }}, inputs); diff --git a/src/python/py_llm_pipeline.cpp b/src/python/py_llm_pipeline.cpp index 89dd29004f..e5e20fd4e6 100644 --- a/src/python/py_llm_pipeline.cpp +++ b/src/python/py_llm_pipeline.cpp @@ -109,7 +109,7 @@ extern char generation_config_docstring[]; void init_llm_pipeline(py::module_& m) { py::class_(m, "LLMPipeline", "This class is used for generation with LLMs") - // init(model_path, tokenizer, device, config, kwargs) should be defined before init(model_path, device, config, kwargs) + // init(model_path, tokenizer, device, config, kwargs) should be defined before init(model_path, device, config, kwargs) // to prevent tokenizer treated as kwargs argument .def(py::init([]( const std::filesystem::path& models_path, @@ -121,8 +121,8 @@ void init_llm_pipeline(py::module_& m) { ScopedVar env_manager(pyutils::ov_tokenizers_module_path()); ov::AnyMap properties = pyutils::kwargs_to_any_map(kwargs); if (config.size()) { - PyErr_WarnEx(PyExc_DeprecationWarning, - "'config' parameters is deprecated, please use kwargs to pass config properties instead.", + PyErr_WarnEx(PyExc_DeprecationWarning, + "'config' parameters is deprecated, please use kwargs to pass config properties instead.", 1); auto config_properties = pyutils::properties_to_any_map(config); properties.insert(config_properties.begin(), config_properties.end()); @@ -151,8 +151,8 @@ void init_llm_pipeline(py::module_& m) { ScopedVar env_manager(pyutils::ov_tokenizers_module_path()); ov::AnyMap properties = pyutils::kwargs_to_any_map(kwargs); if (config.size()) { - PyErr_WarnEx(PyExc_DeprecationWarning, - "'config' parameters is deprecated, please use kwargs to pass config properties instead.", + PyErr_WarnEx(PyExc_DeprecationWarning, + "'config' parameters is deprecated, please use kwargs to pass config properties instead.", 1); auto config_properties = 
pyutils::properties_to_any_map(config); properties.insert(config_properties.begin(), config_properties.end()); diff --git a/src/python/py_lora_adapter.cpp b/src/python/py_lora_adapter.cpp index ddaf6e2081..289fb84733 100644 --- a/src/python/py_lora_adapter.cpp +++ b/src/python/py_lora_adapter.cpp @@ -26,7 +26,7 @@ void init_lora_adapter(py::module_& m) { path (os.PathLike): Path to adapter file in safetensors format. )") .def( - "__bool__", + "__bool__", [](ov::genai::Adapter& self ) { return bool(self); @@ -47,7 +47,7 @@ void init_lora_adapter(py::module_& m) { py::arg_v("mode", ov::genai::AdapterConfig::Mode::MODE_AUTO, "AdapterConfig.Mode.MODE_AUTO")); adapter_config.def(py::init([]( - const ov::genai::Adapter& adapter, + const ov::genai::Adapter& adapter, float alpha, ov::genai::AdapterConfig::Mode mode) { return std::make_unique(adapter, alpha, mode); @@ -55,17 +55,17 @@ void init_lora_adapter(py::module_& m) { py::arg("adapter"), py::arg("alpha"), py::arg_v("mode", ov::genai::AdapterConfig::Mode::MODE_AUTO, "AdapterConfig.Mode.MODE_AUTO")); - + adapter_config.def(py::init([]( - const ov::genai::Adapter& adapter, + const ov::genai::Adapter& adapter, ov::genai::AdapterConfig::Mode mode) { return std::make_unique(adapter, mode); }), py::arg("adapter"), py::arg_v("mode", ov::genai::AdapterConfig::Mode::MODE_AUTO, "AdapterConfig.Mode.MODE_AUTO")); - + adapter_config.def(py::init([]( - const std::vector& adapters, + const std::vector& adapters, ov::genai::AdapterConfig::Mode mode) { return std::make_unique(adapters, mode); }), @@ -73,14 +73,14 @@ void init_lora_adapter(py::module_& m) { py::arg_v("mode", ov::genai::AdapterConfig::Mode::MODE_AUTO, "AdapterConfig.Mode.MODE_AUTO")); adapter_config.def(py::init([]( - const std::vector>& adapters, + const std::vector>& adapters, ov::genai::AdapterConfig::Mode mode) { return std::make_unique(adapters, mode); }), py::arg("adapters"), py::arg_v("mode", ov::genai::AdapterConfig::Mode::MODE_AUTO, "AdapterConfig.Mode.MODE_AUTO")); adapter_config.def( - "__bool__", + "__bool__", [](ov::genai::AdapterConfig& self ) { return bool(self); diff --git a/src/python/py_openvino_genai.cpp b/src/python/py_openvino_genai.cpp index 8b8bd831b0..4cd966ea1b 100644 --- a/src/python/py_openvino_genai.cpp +++ b/src/python/py_openvino_genai.cpp @@ -41,7 +41,7 @@ auto decoded_results_docstring = R"( Structure to store resulting batched text outputs and scores for each batch. The first num_return_sequences elements correspond to the first batch element. - Parameters: + Parameters: texts: vector of resulting sequences. scores: scores for each sequence. metrics: performance metrics with tpot, ttft, etc. of type ov::genai::PerfMetrics. @@ -54,7 +54,7 @@ auto encoded_results_docstring = R"( sum of logarithmic probabilities for each token in the sequence. In the case of greedy decoding scores are filled with zeros. - Parameters: + Parameters: tokens: sequence of resulting tokens. scores: sum of logarithmic probabilities of all tokens in the sequence. metrics: performance metrics with tpot, ttft, etc. of type ov::genai::PerfMetrics. 
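The decoded and encoded results docstrings above list texts, scores, and performance metrics; reading them back from Python looks roughly like the sketch below. The model directory is hypothetical, and access to the metrics follows the PerfMetrics getters documented earlier (get_ttft, get_tpot, get_throughput returning mean/std pairs):

    import openvino_genai as ov_genai

    pipe = ov_genai.LLMPipeline("./model_dir", "CPU")  # hypothetical model directory
    res = pipe.generate(["What is OpenVINO?"], max_new_tokens=32)

    # DecodedResults: one text and one cumulative log-prob score per returned sequence
    # (scores are zeros for greedy decoding, as noted above).
    for text, score in zip(res.texts, res.scores):
        print(f"{score:.3f}: {text}")

    # Aggregated performance metrics, in ms (throughput in tokens/s).
    pm = res.perf_metrics
    print(f"TTFT {pm.get_ttft().mean:.2f} ms, "
          f"TPOT {pm.get_tpot().mean:.2f} ms/token, "
          f"throughput {pm.get_throughput().mean:.2f} tok/s")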
@@ -100,7 +100,7 @@ PYBIND11_MODULE(py_openvino_genai, m) { py::str res; if (valid_utf8_strings.size() == 1) return valid_utf8_strings[0]; - + for (size_t i = 0; i < valid_utf8_strings.size() - 1; i++) { res += py::str(std::to_string(dr.scores[i])) + py::str(": ") + valid_utf8_strings[i] + py::str("\n"); } diff --git a/src/python/py_perf_metrics.cpp b/src/python/py_perf_metrics.cpp index 17e71150ac..76b0f8335e 100644 --- a/src/python/py_perf_metrics.cpp +++ b/src/python/py_perf_metrics.cpp @@ -111,8 +111,8 @@ std::vector timestamp_to_ms(const T& instance, U T::*member) { const auto& timestamps = instance.*member; res.reserve(timestamps.size()); std::transform(timestamps.begin(), timestamps.end(), std::back_inserter(res), - [](const auto& timestamp) { - return std::chrono::duration(timestamp.time_since_epoch()).count(); + [](const auto& timestamp) { + return std::chrono::duration(timestamp.time_since_epoch()).count(); }); return res; } @@ -125,11 +125,11 @@ void init_perf_metrics(py::module_& m) { .def_property_readonly("generate_durations", [](const RawPerfMetrics &rw) { return pyutils::get_ms(rw, &RawPerfMetrics::generate_durations); }) - .def_property_readonly("tokenization_durations", [](const RawPerfMetrics &rw) { + .def_property_readonly("tokenization_durations", [](const RawPerfMetrics &rw) { return pyutils::get_ms(rw, &RawPerfMetrics::tokenization_durations); }) - .def_property_readonly("detokenization_durations", [](const RawPerfMetrics &rw) { - return pyutils::get_ms(rw, &RawPerfMetrics::detokenization_durations); + .def_property_readonly("detokenization_durations", [](const RawPerfMetrics &rw) { + return pyutils::get_ms(rw, &RawPerfMetrics::detokenization_durations); }) .def_property_readonly("m_times_to_first_token", [](const RawPerfMetrics &rw) { return pyutils::get_ms(rw, &RawPerfMetrics::m_times_to_first_token); diff --git a/src/python/py_tokenizer.cpp b/src/python/py_tokenizer.cpp index 0dd9f3d715..45c5a1c97c 100644 --- a/src/python/py_tokenizer.cpp +++ b/src/python/py_tokenizer.cpp @@ -34,8 +34,8 @@ void init_tokenizer(py::module_& m) { ScopedVar env_manager(pyutils::ov_tokenizers_module_path()); auto kwargs_properties = pyutils::kwargs_to_any_map(kwargs); if (properties.size()) { - PyErr_WarnEx(PyExc_DeprecationWarning, - "'properties' parameters is deprecated, please use kwargs to pass config properties instead.", + PyErr_WarnEx(PyExc_DeprecationWarning, + "'properties' parameters is deprecated, please use kwargs to pass config properties instead.", 1); auto map_properties = pyutils::properties_to_any_map(properties); kwargs_properties.insert(map_properties.begin(), map_properties.end()); diff --git a/src/python/py_utils.cpp b/src/python/py_utils.cpp index 91c3ce8b97..727184137e 100644 --- a/src/python/py_utils.cpp +++ b/src/python/py_utils.cpp @@ -68,7 +68,7 @@ ov::AnyMap py_object_to_any_map(const py::object& py_obj) { } ov::Any py_object_to_any(const py::object& py_obj, std::string property_name) { - // These properties should be casted to ov::AnyMap, instead of std::map. + // These properties should be casted to ov::AnyMap, instead of std::map. 
std::set any_map_properties = { "GENERATE_CONFIG", "PREFILL_CONFIG", diff --git a/src/python/py_whisper_pipeline.cpp b/src/python/py_whisper_pipeline.cpp index aac14c258a..b3bb35072d 100644 --- a/src/python/py_whisper_pipeline.cpp +++ b/src/python/py_whisper_pipeline.cpp @@ -77,7 +77,7 @@ auto whisper_decoded_result_chunk = R"( auto whisper_generation_config_docstring = R"( WhisperGenerationConfig - + Whisper specific parameters: :param decoder_start_token_id: Corresponds to the ”<|startoftranscript|>” token. :type decoder_start_token_id: int diff --git a/tests/cpp/block_manager.cpp b/tests/cpp/block_manager.cpp index 9501eaefb5..41e313c414 100644 --- a/tests/cpp/block_manager.cpp +++ b/tests/cpp/block_manager.cpp @@ -103,4 +103,4 @@ TEST(TestBlockManager, CanFreeBlocksFromSequence) { size_t seq_id = sequence_group->get_sequences()[0]->get_id(); bm.free_blocks_from_sequence(seq_id, { {0}, {1}, {2} }); EXPECT_EQ(bm.num_free_blocks(), 6); -} \ No newline at end of file +} diff --git a/tests/cpp/cache_manager.cpp b/tests/cpp/cache_manager.cpp index 0c483f0ec1..a354063b64 100644 --- a/tests/cpp/cache_manager.cpp +++ b/tests/cpp/cache_manager.cpp @@ -64,7 +64,7 @@ TEST(TestCacheManager, test_cache_size_param) { auto cache_manager = std::make_shared(device_config, request, core); auto block_manager = BlockManager(device_config.get_num_kv_blocks(), false, device_config.get_block_size(), device_config.get_num_layers()); cache_manager->allocate_cache_if_needed(block_manager.get_total_number_of_kv_blocks()); - + ASSERT_EQ(get_total_allocated_bytes(cache_manager, num_decoder_layers), 2146959360); } @@ -134,4 +134,4 @@ TEST(TestCacheManager, test_dynamic_cache_increase) { // check that cache does not increase if new blocks were not allocated cache_manager->allocate_cache_if_needed(block_manager.get_total_number_of_kv_blocks()); OPENVINO_ASSERT(get_total_allocated_bytes(cache_manager, num_decoder_layers), 200 * block_size_in_bytes); -} \ No newline at end of file +} diff --git a/tests/cpp/data/cache_rotation_poc_ref_coefficients_per_block_0.txt b/tests/cpp/data/cache_rotation_poc_ref_coefficients_per_block_0.txt index ac9a793240..7188962b0a 100644 --- a/tests/cpp/data/cache_rotation_poc_ref_coefficients_per_block_0.txt +++ b/tests/cpp/data/cache_rotation_poc_ref_coefficients_per_block_0.txt @@ -2,29 +2,29 @@ 64 53 27 -18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 +18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 8 18 -0.9990234375 0.36181640625 0.8564453125 0.039520263671875 0.99853515625 -0.9423828125 -0.98876953125 -0.8720703125 0.705078125 -0.830078125 0.74462890625 0.5908203125 -0.888671875 0.73095703125 -0.9853515625 0.4990234375 0.923828125 0.09765625 -0.6533203125 -0.96875 -0.97900390625 -0.8544921875 -0.69482421875 -0.544921875 -0.418701171875 -0.318115234375 -0.240478515625 -0.1812744140625 -0.13623046875 -0.102294921875 -0.07672119140625 -0.05755615234375 -0.0311431884765625 -0.93212890625 -0.515625 0.9990234375 -0.048828125 -0.335205078125 0.1468505859375 0.4892578125 0.708984375 0.5576171875 0.66748046875 0.806640625 0.45849609375 -0.6826171875 0.1710205078125 0.86669921875 -0.382568359375 -0.99560546875 -0.7568359375 -0.2484130859375 0.2030029296875 0.51953125 0.71923828125 0.83837890625 0.908203125 0.9482421875 0.97021484375 0.9833984375 0.99072265625 0.99462890625 0.99658203125 0.99853515625 +0.9990234375 0.36181640625 0.8564453125 0.039520263671875 0.99853515625 -0.9423828125 -0.98876953125 -0.8720703125 0.705078125 
-0.830078125 0.74462890625 0.5908203125 -0.888671875 0.73095703125 -0.9853515625 0.4990234375 0.923828125 0.09765625 -0.6533203125 -0.96875 -0.97900390625 -0.8544921875 -0.69482421875 -0.544921875 -0.418701171875 -0.318115234375 -0.240478515625 -0.1812744140625 -0.13623046875 -0.102294921875 -0.07672119140625 -0.05755615234375 +0.0311431884765625 -0.93212890625 -0.515625 0.9990234375 -0.048828125 -0.335205078125 0.1468505859375 0.4892578125 0.708984375 0.5576171875 0.66748046875 0.806640625 0.45849609375 -0.6826171875 0.1710205078125 0.86669921875 -0.382568359375 -0.99560546875 -0.7568359375 -0.2484130859375 0.2030029296875 0.51953125 0.71923828125 0.83837890625 0.908203125 0.9482421875 0.97021484375 0.9833984375 0.99072265625 0.99462890625 0.99658203125 0.99853515625 19 -1.0 0.362060546875 0.85693359375 0.03955078125 0.99853515625 -0.9423828125 -0.9892578125 -0.87255859375 0.705078125 -0.83056640625 0.74462890625 0.59130859375 -0.88916015625 0.73095703125 -0.98486328125 0.4990234375 0.923828125 0.09796142578125 -0.6533203125 -0.96875 -0.97900390625 -0.8544921875 -0.69482421875 -0.544921875 -0.418701171875 -0.318359375 -0.240478515625 -0.18115234375 -0.1361083984375 -0.10235595703125 -0.07672119140625 -0.057525634765625 -0.0311431884765625 -0.93212890625 -0.515625 0.9990234375 -0.0489501953125 -0.3349609375 0.1468505859375 0.48974609375 0.70947265625 0.55712890625 0.66748046875 0.80712890625 0.45849609375 -0.6826171875 0.1707763671875 0.86669921875 -0.38232421875 -0.99560546875 -0.7568359375 -0.2484130859375 0.203125 0.51953125 0.71923828125 0.83837890625 0.908203125 0.9482421875 0.970703125 0.9833984375 0.9912109375 0.9951171875 0.99658203125 0.998046875 +1.0 0.362060546875 0.85693359375 0.03955078125 0.99853515625 -0.9423828125 -0.9892578125 -0.87255859375 0.705078125 -0.83056640625 0.74462890625 0.59130859375 -0.88916015625 0.73095703125 -0.98486328125 0.4990234375 0.923828125 0.09796142578125 -0.6533203125 -0.96875 -0.97900390625 -0.8544921875 -0.69482421875 -0.544921875 -0.418701171875 -0.318359375 -0.240478515625 -0.18115234375 -0.1361083984375 -0.10235595703125 -0.07672119140625 -0.057525634765625 +0.0311431884765625 -0.93212890625 -0.515625 0.9990234375 -0.0489501953125 -0.3349609375 0.1468505859375 0.48974609375 0.70947265625 0.55712890625 0.66748046875 0.80712890625 0.45849609375 -0.6826171875 0.1707763671875 0.86669921875 -0.38232421875 -0.99560546875 -0.7568359375 -0.2484130859375 0.203125 0.51953125 0.71923828125 0.83837890625 0.908203125 0.9482421875 0.970703125 0.9833984375 0.9912109375 0.9951171875 0.99658203125 0.998046875 20 -0.99951171875 0.36181640625 0.8564453125 0.039337158203125 0.99853515625 -0.9423828125 -0.98876953125 -0.8720703125 0.70458984375 -0.83056640625 0.744140625 0.5908203125 -0.888671875 0.73046875 -0.9853515625 0.49853515625 0.92431640625 0.09765625 -0.6533203125 -0.96875 -0.97900390625 -0.8544921875 -0.6943359375 -0.54443359375 -0.418701171875 -0.318359375 -0.2406005859375 -0.1810302734375 -0.1363525390625 -0.10223388671875 -0.07672119140625 -0.05755615234375 -0.031036376953125 -0.9326171875 -0.515625 0.9990234375 -0.0491943359375 -0.335205078125 0.1468505859375 0.489501953125 0.708984375 0.5576171875 0.66748046875 0.806640625 0.458251953125 -0.6826171875 0.1712646484375 0.86669921875 -0.382568359375 -0.99560546875 -0.7568359375 -0.248291015625 0.203125 0.51953125 0.71923828125 0.83837890625 0.908203125 0.94775390625 0.970703125 0.9833984375 0.99072265625 0.9951171875 0.99755859375 0.99853515625 +0.99951171875 0.36181640625 0.8564453125 
0.039337158203125 0.99853515625 -0.9423828125 -0.98876953125 -0.8720703125 0.70458984375 -0.83056640625 0.744140625 0.5908203125 -0.888671875 0.73046875 -0.9853515625 0.49853515625 0.92431640625 0.09765625 -0.6533203125 -0.96875 -0.97900390625 -0.8544921875 -0.6943359375 -0.54443359375 -0.418701171875 -0.318359375 -0.2406005859375 -0.1810302734375 -0.1363525390625 -0.10223388671875 -0.07672119140625 -0.05755615234375 +0.031036376953125 -0.9326171875 -0.515625 0.9990234375 -0.0491943359375 -0.335205078125 0.1468505859375 0.489501953125 0.708984375 0.5576171875 0.66748046875 0.806640625 0.458251953125 -0.6826171875 0.1712646484375 0.86669921875 -0.382568359375 -0.99560546875 -0.7568359375 -0.248291015625 0.203125 0.51953125 0.71923828125 0.83837890625 0.908203125 0.94775390625 0.970703125 0.9833984375 0.99072265625 0.9951171875 0.99755859375 0.99853515625 21 -0.99951171875 0.361572265625 0.85693359375 0.03936767578125 0.99853515625 -0.9423828125 -0.98876953125 -0.8720703125 0.705078125 -0.83056640625 0.744140625 0.5908203125 -0.888671875 0.73095703125 -0.98583984375 0.498779296875 0.923828125 0.097412109375 -0.6533203125 -0.96875 -0.97900390625 -0.8544921875 -0.69482421875 -0.544921875 -0.41845703125 -0.318115234375 -0.240478515625 -0.180908203125 -0.1361083984375 -0.10235595703125 -0.07666015625 -0.057586669921875 -0.031005859375 -0.93212890625 -0.51611328125 0.9990234375 -0.048614501953125 -0.335205078125 0.1468505859375 0.489501953125 0.70947265625 0.5576171875 0.66796875 0.806640625 0.45849609375 -0.6826171875 0.1710205078125 0.86669921875 -0.38232421875 -0.99560546875 -0.7568359375 -0.248291015625 0.2034912109375 0.51953125 0.71923828125 0.8388671875 0.908203125 0.94775390625 0.970703125 0.9833984375 0.99072265625 0.9951171875 0.9970703125 0.998046875 +0.99951171875 0.361572265625 0.85693359375 0.03936767578125 0.99853515625 -0.9423828125 -0.98876953125 -0.8720703125 0.705078125 -0.83056640625 0.744140625 0.5908203125 -0.888671875 0.73095703125 -0.98583984375 0.498779296875 0.923828125 0.097412109375 -0.6533203125 -0.96875 -0.97900390625 -0.8544921875 -0.69482421875 -0.544921875 -0.41845703125 -0.318115234375 -0.240478515625 -0.180908203125 -0.1361083984375 -0.10235595703125 -0.07666015625 -0.057586669921875 +0.031005859375 -0.93212890625 -0.51611328125 0.9990234375 -0.048614501953125 -0.335205078125 0.1468505859375 0.489501953125 0.70947265625 0.5576171875 0.66796875 0.806640625 0.45849609375 -0.6826171875 0.1710205078125 0.86669921875 -0.38232421875 -0.99560546875 -0.7568359375 -0.248291015625 0.2034912109375 0.51953125 0.71923828125 0.8388671875 0.908203125 0.94775390625 0.970703125 0.9833984375 0.99072265625 0.9951171875 0.9970703125 0.998046875 22 -0.99951171875 0.36181640625 0.8564453125 0.03936767578125 0.99853515625 -0.9423828125 -0.9892578125 -0.8720703125 0.705078125 -0.830078125 0.74462890625 0.5908203125 -0.888671875 0.73095703125 -0.98486328125 0.498779296875 0.923828125 0.097900390625 -0.65380859375 -0.96826171875 -0.97900390625 -0.8544921875 -0.69482421875 -0.54443359375 -0.41845703125 -0.318359375 -0.2406005859375 -0.18115234375 -0.1361083984375 -0.10223388671875 -0.07672119140625 -0.057586669921875 -0.03094482421875 -0.93212890625 -0.515625 0.9990234375 -0.04888916015625 -0.335205078125 0.1468505859375 0.4892578125 0.70947265625 0.55712890625 0.66796875 0.806640625 0.458251953125 -0.6826171875 0.1708984375 0.86669921875 -0.38232421875 -0.9951171875 -0.75634765625 -0.248291015625 0.2030029296875 0.51953125 0.71923828125 0.83837890625 0.90771484375 0.94775390625 
0.970703125 0.9833984375 0.990234375 0.9951171875 0.9970703125 0.99853515625 +0.99951171875 0.36181640625 0.8564453125 0.03936767578125 0.99853515625 -0.9423828125 -0.9892578125 -0.8720703125 0.705078125 -0.830078125 0.74462890625 0.5908203125 -0.888671875 0.73095703125 -0.98486328125 0.498779296875 0.923828125 0.097900390625 -0.65380859375 -0.96826171875 -0.97900390625 -0.8544921875 -0.69482421875 -0.54443359375 -0.41845703125 -0.318359375 -0.2406005859375 -0.18115234375 -0.1361083984375 -0.10223388671875 -0.07672119140625 -0.057586669921875 +0.03094482421875 -0.93212890625 -0.515625 0.9990234375 -0.04888916015625 -0.335205078125 0.1468505859375 0.4892578125 0.70947265625 0.55712890625 0.66796875 0.806640625 0.458251953125 -0.6826171875 0.1708984375 0.86669921875 -0.38232421875 -0.9951171875 -0.75634765625 -0.248291015625 0.2030029296875 0.51953125 0.71923828125 0.83837890625 0.90771484375 0.94775390625 0.970703125 0.9833984375 0.990234375 0.9951171875 0.9970703125 0.99853515625 23 -0.99951171875 0.36181640625 0.85693359375 0.03936767578125 0.9990234375 -0.94189453125 -0.98876953125 -0.8720703125 0.70458984375 -0.830078125 0.74462890625 0.5908203125 -0.888671875 0.73046875 -0.9853515625 0.498779296875 0.923828125 0.09783935546875 -0.6533203125 -0.96875 -0.9794921875 -0.8544921875 -0.6953125 -0.54443359375 -0.41845703125 -0.318359375 -0.240478515625 -0.18115234375 -0.1361083984375 -0.102294921875 -0.07672119140625 -0.057647705078125 -0.031005859375 -0.93212890625 -0.515625 0.9990234375 -0.048797607421875 -0.334716796875 0.14697265625 0.4892578125 0.708984375 0.5576171875 0.66796875 0.80615234375 0.45849609375 -0.6826171875 0.1708984375 0.86669921875 -0.38232421875 -0.9951171875 -0.7568359375 -0.248291015625 0.203125 0.51953125 0.71923828125 0.83837890625 0.908203125 0.94775390625 0.97021484375 0.9833984375 0.99072265625 0.99462890625 0.9970703125 0.99853515625 +0.99951171875 0.36181640625 0.85693359375 0.03936767578125 0.9990234375 -0.94189453125 -0.98876953125 -0.8720703125 0.70458984375 -0.830078125 0.74462890625 0.5908203125 -0.888671875 0.73046875 -0.9853515625 0.498779296875 0.923828125 0.09783935546875 -0.6533203125 -0.96875 -0.9794921875 -0.8544921875 -0.6953125 -0.54443359375 -0.41845703125 -0.318359375 -0.240478515625 -0.18115234375 -0.1361083984375 -0.102294921875 -0.07672119140625 -0.057647705078125 +0.031005859375 -0.93212890625 -0.515625 0.9990234375 -0.048797607421875 -0.334716796875 0.14697265625 0.4892578125 0.708984375 0.5576171875 0.66796875 0.80615234375 0.45849609375 -0.6826171875 0.1708984375 0.86669921875 -0.38232421875 -0.9951171875 -0.7568359375 -0.248291015625 0.203125 0.51953125 0.71923828125 0.83837890625 0.908203125 0.94775390625 0.97021484375 0.9833984375 0.99072265625 0.99462890625 0.9970703125 0.99853515625 24 -0.99951171875 0.3623046875 0.8564453125 0.03955078125 0.99853515625 -0.9423828125 -0.9892578125 -0.8720703125 0.705078125 -0.83056640625 0.74462890625 0.5908203125 -0.888671875 0.73095703125 -0.9853515625 0.498779296875 0.923828125 0.097900390625 -0.6533203125 -0.96875 -0.97900390625 -0.8544921875 -0.69482421875 -0.54443359375 -0.4189453125 -0.318359375 -0.240478515625 -0.18115234375 -0.1361083984375 -0.10223388671875 -0.0767822265625 -0.05755615234375 -0.031097412109375 -0.93212890625 -0.51611328125 0.99951171875 -0.04888916015625 -0.3349609375 0.1466064453125 0.4892578125 0.708984375 0.5576171875 0.66796875 0.80712890625 0.4580078125 -0.6826171875 0.1707763671875 0.8671875 -0.382568359375 -0.99560546875 -0.75634765625 -0.248291015625 0.203369140625 
0.52001953125 0.71875 0.83837890625 0.908203125 0.94775390625 0.97021484375 0.9833984375 0.99072265625 0.9951171875 0.99658203125 0.998046875 +0.99951171875 0.3623046875 0.8564453125 0.03955078125 0.99853515625 -0.9423828125 -0.9892578125 -0.8720703125 0.705078125 -0.83056640625 0.74462890625 0.5908203125 -0.888671875 0.73095703125 -0.9853515625 0.498779296875 0.923828125 0.097900390625 -0.6533203125 -0.96875 -0.97900390625 -0.8544921875 -0.69482421875 -0.54443359375 -0.4189453125 -0.318359375 -0.240478515625 -0.18115234375 -0.1361083984375 -0.10223388671875 -0.0767822265625 -0.05755615234375 +0.031097412109375 -0.93212890625 -0.51611328125 0.99951171875 -0.04888916015625 -0.3349609375 0.1466064453125 0.4892578125 0.708984375 0.5576171875 0.66796875 0.80712890625 0.4580078125 -0.6826171875 0.1707763671875 0.8671875 -0.382568359375 -0.99560546875 -0.75634765625 -0.248291015625 0.203369140625 0.52001953125 0.71875 0.83837890625 0.908203125 0.94775390625 0.97021484375 0.9833984375 0.99072265625 0.9951171875 0.99658203125 0.998046875 25 -0.99951171875 0.36181640625 0.8564453125 0.039398193359375 0.9990234375 -0.94189453125 -0.9892578125 -0.87255859375 0.705078125 -0.830078125 0.74462890625 0.5908203125 -0.88916015625 0.73095703125 -0.9853515625 0.4990234375 0.92431640625 0.0977783203125 -0.6533203125 -0.96875 -0.97900390625 -0.8544921875 -0.6943359375 -0.54443359375 -0.418701171875 -0.318359375 -0.2403564453125 -0.1812744140625 -0.13623046875 -0.102294921875 -0.07684326171875 -0.057586669921875 -0.030853271484375 -0.93212890625 -0.515625 0.9990234375 -0.04901123046875 -0.3349609375 0.146728515625 0.489501953125 0.708984375 0.5576171875 0.66796875 0.806640625 0.458251953125 -0.68212890625 0.1707763671875 0.86669921875 -0.382568359375 -0.9951171875 -0.7568359375 -0.2481689453125 0.2034912109375 0.51953125 0.71875 0.83837890625 0.90771484375 0.94775390625 0.970703125 0.9833984375 0.9912109375 0.99462890625 0.99755859375 0.998046875 +0.99951171875 0.36181640625 0.8564453125 0.039398193359375 0.9990234375 -0.94189453125 -0.9892578125 -0.87255859375 0.705078125 -0.830078125 0.74462890625 0.5908203125 -0.88916015625 0.73095703125 -0.9853515625 0.4990234375 0.92431640625 0.0977783203125 -0.6533203125 -0.96875 -0.97900390625 -0.8544921875 -0.6943359375 -0.54443359375 -0.418701171875 -0.318359375 -0.2403564453125 -0.1812744140625 -0.13623046875 -0.102294921875 -0.07684326171875 -0.057586669921875 +0.030853271484375 -0.93212890625 -0.515625 0.9990234375 -0.04901123046875 -0.3349609375 0.146728515625 0.489501953125 0.708984375 0.5576171875 0.66796875 0.806640625 0.458251953125 -0.68212890625 0.1707763671875 0.86669921875 -0.382568359375 -0.9951171875 -0.7568359375 -0.2481689453125 0.2034912109375 0.51953125 0.71875 0.83837890625 0.90771484375 0.94775390625 0.970703125 0.9833984375 0.9912109375 0.99462890625 0.99755859375 0.998046875 diff --git a/tests/cpp/logit_filtering.cpp b/tests/cpp/logit_filtering.cpp index dcb98113f3..06cc3e7b5b 100644 --- a/tests/cpp/logit_filtering.cpp +++ b/tests/cpp/logit_filtering.cpp @@ -342,4 +342,3 @@ const std::vector EOS_PENALTY_TRANSFORM_TEST_CASE INSTANTIATE_TEST_SUITE_P(VariousInputs, EOSPenaltyTransformTest, testing::ValuesIn(EOS_PENALTY_TRANSFORM_TEST_CASES)); - diff --git a/tests/cpp/sampler.cpp b/tests/cpp/sampler.cpp index 3741880827..19201f4fd1 100644 --- a/tests/cpp/sampler.cpp +++ b/tests/cpp/sampler.cpp @@ -42,7 +42,7 @@ TEST(SamplerValidationMode, gen_phase_to_cut_whole_seq) { }; // to emulate processed prompt and add next token [ 0 ] - 
sequence_groups.front()->get_sequences().front()->append_token(0, 1.f); + sequence_groups.front()->get_sequences().front()->append_token(0, 1.f); sequence_groups.front()->update_processed_tokens_num(5); // append candidates [ 2, 3, 4 ] @@ -86,7 +86,7 @@ TEST(SamplerValidationMode, gen_phase_to_cut_part_seq) { }; // to emulate processed prompt and add next token [ 0 ] - sequence_groups.front()->get_sequences().front()->append_token(0, 1.f); + sequence_groups.front()->get_sequences().front()->append_token(0, 1.f); sequence_groups.front()->update_processed_tokens_num(5); // append candidates [ 1, 2, 2 ] @@ -131,7 +131,7 @@ TEST(SamplerValidationMode, gen_phase) { }; // to emulate processed prompt and add next token [ 0 ] - sequence_groups.front()->get_sequences().front()->append_token(0, 1.f); + sequence_groups.front()->get_sequences().front()->append_token(0, 1.f); sequence_groups.front()->update_processed_tokens_num(5); // append candidates [ 1, 2, 3 ] diff --git a/tests/cpp/speculative_decoding.cpp b/tests/cpp/speculative_decoding.cpp index 1cf8db0fab..16cd75563f 100644 --- a/tests/cpp/speculative_decoding.cpp +++ b/tests/cpp/speculative_decoding.cpp @@ -19,7 +19,7 @@ class CBForSDTest : public testing::Test, public ov::genai::ContinuousBatchingPi sampling_params.num_assistant_tokens = 1; ov::genai::SequenceGroup::Ptr sequence_group = std::make_shared(request_id, input_ids, - sampling_params, + sampling_params, 32); { @@ -43,7 +43,7 @@ TEST_F(CBForSDTest, init_sequence_by_not_empty__one_sequence) { std::vector tokens = { 0, 1, 2 }; std::vector log_probs = { 0.1f, 0.2f, 0.3f }; ov::genai::GeneratedSequences candidate{{ 0, ov::genai::GeneratedSequence(tokens, log_probs) }}; - + auto before = m_pipeline.get_generated_requests(); auto update_result = m_pipeline.update_request(0, candidate, true); ASSERT_EQ(update_result.removed_tokens_cnt, 0); @@ -64,7 +64,7 @@ TEST_F(CBForSDTest, init_sequence_by_empty__one_sequence) { std::vector tokens = {}; std::vector log_probs = {}; ov::genai::GeneratedSequences candidate{{ 0, ov::genai::GeneratedSequence(tokens, log_probs) }}; - + auto before = m_pipeline.get_generated_requests(); auto update_result = m_pipeline.update_request(0, candidate, true); ASSERT_EQ(update_result.removed_tokens_cnt, 0); @@ -85,7 +85,7 @@ TEST_F(CBForSDTest, no_updated_tokens__one_sequence) { std::vector tokens = { 0, 1, 2 }; std::vector log_probs = { 0.1f, 0.2f, 0.3f }; ov::genai::GeneratedSequences candidate{{ 0, ov::genai::GeneratedSequence(tokens, log_probs) }}; - + auto update_result = m_pipeline.update_request(0, candidate, true); ASSERT_EQ(update_result.removed_tokens_cnt, 0); ASSERT_EQ(update_result.inserted_tokens_cnt, 3); @@ -112,7 +112,7 @@ TEST_F(CBForSDTest, remove_tokens__one_sequence) { std::vector tokens = { 0, 1, 2 }; std::vector log_probs = { 0.1f, 0.2f, 0.3f }; ov::genai::GeneratedSequences candidate{{ 0, ov::genai::GeneratedSequence(tokens, log_probs) }}; - + auto update_result = m_pipeline.update_request(0, candidate, true); ASSERT_EQ(update_result.removed_tokens_cnt, 0); ASSERT_EQ(update_result.inserted_tokens_cnt, 3); @@ -141,7 +141,7 @@ TEST_F(CBForSDTest, remove_and_replace_tokens__one_sequence) { std::vector tokens = { 0, 1, 2 }; std::vector log_probs = { 0.1f, 0.2f, 0.3f }; ov::genai::GeneratedSequences candidate{{ 0, ov::genai::GeneratedSequence(tokens, log_probs) }}; - + auto update_result = m_pipeline.update_request(0, candidate, true); ASSERT_EQ(update_result.removed_tokens_cnt, 0); ASSERT_EQ(update_result.inserted_tokens_cnt, 3); @@ -170,7 +170,7 
@@ TEST_F(CBForSDTest, add_tokens__one_sequence) { std::vector tokens = { 0, 1, 2 }; std::vector log_probs = { 0.1f, 0.2f, 0.3f }; ov::genai::GeneratedSequences candidate{{ 0, ov::genai::GeneratedSequence(tokens, log_probs) }}; - + auto update_result = m_pipeline.update_request(0, candidate, true); ASSERT_EQ(update_result.removed_tokens_cnt, 0); ASSERT_EQ(update_result.inserted_tokens_cnt, 3); @@ -204,7 +204,7 @@ TEST_F(CBForSDTest, update_empty_sequence_by_not_empty__two_sequence) { { 0, ov::genai::GeneratedSequence(tokens_0, log_probs_0) }, { 1, ov::genai::GeneratedSequence(tokens_1, log_probs_1) } }; - + auto before = m_pipeline.get_generated_requests(); auto update_result = m_pipeline.update_request(0, candidate, true); ASSERT_EQ(update_result.removed_tokens_cnt, 0); @@ -233,7 +233,7 @@ TEST_F(CBForSDTest, init_sequence_by_not_empty__two_sequence) { { 0, ov::genai::GeneratedSequence(tokens_0, log_probs_0) }, { 1, ov::genai::GeneratedSequence(tokens_1, log_probs_1) } }; - + auto before = m_pipeline.get_generated_requests(); auto update_result = m_pipeline.init_request_by_candidate(0, candidate); ASSERT_EQ(update_result.removed_tokens_cnt, 0); @@ -260,7 +260,7 @@ TEST_F(CBForSDTest, init_sequence_by_empty__two_sequence) { { 0, ov::genai::GeneratedSequence(tokens, log_probs) }, { 1, ov::genai::GeneratedSequence(tokens, log_probs) }, }; - + auto before = m_pipeline.get_generated_requests(); auto update_result = m_pipeline.init_request_by_candidate(0, candidate); ASSERT_EQ(update_result.removed_tokens_cnt, 0); @@ -286,7 +286,7 @@ TEST_F(CBForSDTest, no_updated_tokens__two_sequence) { { 0, ov::genai::GeneratedSequence(tokens_0, log_probs_0) }, { 1, ov::genai::GeneratedSequence(tokens_1, log_probs_1) }, }; - + auto update_result = m_pipeline.init_request_by_candidate(0, candidate); ASSERT_EQ(update_result.removed_tokens_cnt, 0); ASSERT_EQ(update_result.inserted_tokens_cnt, 2); @@ -319,7 +319,7 @@ TEST_F(CBForSDTest, remove_tokens__two_sequence) { { 0, ov::genai::GeneratedSequence(tokens, log_probs) }, { 1, ov::genai::GeneratedSequence(tokens, log_probs) }, }; - + auto update_result = m_pipeline.init_request_by_candidate(0, candidate); ASSERT_EQ(update_result.removed_tokens_cnt, 0); ASSERT_EQ(update_result.inserted_tokens_cnt, 3); @@ -358,7 +358,7 @@ TEST_F(CBForSDTest, remove_and_replace_tokens__two_sequence) { { 0, ov::genai::GeneratedSequence(tokens, log_probs) }, { 1, ov::genai::GeneratedSequence(tokens, log_probs) }, }; - + auto update_result = m_pipeline.init_request_by_candidate(0, candidate); ASSERT_EQ(update_result.removed_tokens_cnt, 0); ASSERT_EQ(update_result.inserted_tokens_cnt, 3); @@ -397,7 +397,7 @@ TEST_F(CBForSDTest, add_tokens__two_sequence) { { 0, ov::genai::GeneratedSequence(tokens, log_probs) }, { 1, ov::genai::GeneratedSequence(tokens, log_probs) }, }; - + auto update_result = m_pipeline.init_request_by_candidate(0, candidate); ASSERT_EQ(update_result.removed_tokens_cnt, 0); ASSERT_EQ(update_result.inserted_tokens_cnt, 3); @@ -426,4 +426,3 @@ TEST_F(CBForSDTest, add_tokens__two_sequence) { ASSERT_EQ(after.at(0).at(1).token_ids, tokens); ASSERT_EQ(after.at(0).at(1).log_probs, log_probs); } - diff --git a/tests/cpp/utils.cpp b/tests/cpp/utils.cpp index 14b110b993..cad71c006f 100644 --- a/tests/cpp/utils.cpp +++ b/tests/cpp/utils.cpp @@ -18,4 +18,4 @@ TEST(TestIsContainer, test_is_container) { EXPECT_EQ(is_container>, true); EXPECT_EQ(is_container, true); EXPECT_EQ(is_container>, true); -} \ No newline at end of file +} diff --git a/tests/python_tests/common.py 
b/tests/python_tests/common.py index b0b6a70e93..255212c90c 100644 --- a/tests/python_tests/common.py +++ b/tests/python_tests/common.py @@ -302,11 +302,11 @@ def run_continuous_batching( models_path : Path, scheduler_config : SchedulerConfig, prompts: List[str], - generation_configs : List[GenerationConfig] | GenerationConfig + generation_configs : List[GenerationConfig] | GenerationConfig ) -> List[GenerationResult]: if type(generation_configs) is not list: generation_configs = [generation_configs] * len(prompts) - + cb_pipe = ContinuousBatchingPipeline(models_path, scheduler_config=scheduler_config, device='CPU', tokenizer_properties={}, properties=get_default_properties()) output = cb_pipe.generate(prompts, generation_configs) @@ -346,12 +346,12 @@ def __init__(self): def accumulate(self, subword) -> bool: self.results.append(subword) return False - + def get_results(self) -> List[GenerationResult]: streaming_result = GenerationResult() streaming_result.m_generation_ids = [''.join(self.results)] return [streaming_result] - + def reset(self): self.results = [] @@ -368,7 +368,7 @@ def run_llm_pipeline( if use_cb: properties['scheduler_config'] = SchedulerConfig() ov_pipe = LLMPipeline(models_path, device='CPU', **properties) - + if streamer is None and not (generation_config.is_beam_search() or generation_config.num_return_sequences > 1) and len(prompts) == 1: # We can use streamer only if we have a single prompt and not beam search. streamer = StreamerWithResults() @@ -377,8 +377,8 @@ def run_llm_pipeline( streamer.reset() generate_outputs : DecodedResults = ov_pipe.generate( - inputs=prompts, - generation_config=generation_config, + inputs=prompts, + generation_config=generation_config, streamer=streamer.accumulate if isinstance(streamer, StreamerWithResults) else streamer ) @@ -398,7 +398,7 @@ def run_llm_pipeline( del ov_pipe shutil.rmtree(models_path) - + if isinstance(streamer, StreamerWithResults): compare_generation_results(prompts, generation_results, streamer.get_results(), generation_config) @@ -456,12 +456,12 @@ def convert_models(opt_model : OVModelForCausalLM, hf_tokenizer : AutoTokenizer, tokenizer, detokenizer = convert_tokenizer(hf_tokenizer, with_detokenizer=True) serialize(tokenizer, models_path / "openvino_tokenizer.xml") serialize(detokenizer, models_path / "openvino_detokenizer.xml") - -def run_llm_pipeline_with_ref(model_id: str, - prompts: List[str], - generation_config: GenerationConfig | dict, - tmp_path: Path, + +def run_llm_pipeline_with_ref(model_id: str, + prompts: List[str], + generation_config: GenerationConfig | dict, + tmp_path: Path, use_cb : bool = False, streamer: StreamerWithResults | Callable | StreamerBase = None): models_path : Path = tmp_path / model_id diff --git a/tests/python_tests/conftest.py b/tests/python_tests/conftest.py index e159045601..bd623f1f5a 100644 --- a/tests/python_tests/conftest.py +++ b/tests/python_tests/conftest.py @@ -22,4 +22,3 @@ def pytest_configure(config: pytest.Config): marker = 'precommit' if config.getoption('-m') == 'precommit' else 'nightly' pytest.run_marker = marker pytest.selected_model_ids = config.getoption('--model_ids', default=None) - diff --git a/tests/python_tests/data/long_prompts.txt b/tests/python_tests/data/long_prompts.txt index 793036541f..d149792eb3 100644 --- a/tests/python_tests/data/long_prompts.txt +++ b/tests/python_tests/data/long_prompts.txt @@ -7,7 +7,7 @@ Mira would then make her way to her favorite corner, a snug little nook bathed i In conclusion, quantum entanglement is a captivating 
area of study that has reshaped our understanding of the physical universe. It is a testament to the oddities of quantum mechanics, where particles can be deeply connected regardless of the distances that separate them. With ongoing research and technological advancements, the concept of entanglement continues to inspire new theories and applications, offering a glimpse into a future where quantum systems may revolutionize how we process information and interact with the world around us. As we delve deeper into the quantum realm, we uncover not just the intricacies of particles and forces but also fundamental truths about the nature of reality itself. In quantum mechanics, particles such as electrons or photons exist in a state of superposition. This means they do not have definite properties until measured. For example, an electron can simultaneously have a spin of "up" and "down" until an observation is made. When two particles are entangled, their superposed states are linked. If one particle is measured and found to have a specific property, the other particle’s state is determined instantaneously—the spin of the second particle will be opposite that of the first, regardless of the distance between them. Moreover, quantum entanglement is a critical component of quantum communication. It enables secure transmission of information through techniques like quantum key distribution (QKD). In QKD, two parties can share a secret key using entangled particles. Any attempt by an eavesdropper to intercept or measure the particles will disturb their states, revealing the presence of an unauthorized observer. This technology promises a significant advancement in data security, offering virtually unbreakable encryption. -Once home, Mira spread out her findings across her bedroom floor . The map was rudimentary, marked with simple symbols: a sun, a tree, and an ominous 'X' at the end . It felt like a treasure map, and Mira's imagination began to race . After her parents went to bed, she gathered supplies: a flashlight, a notebook, and a snack for the journey . With her heart racing at the thought of adventure, she headed out into the cool night . The moon illuminated her path as Mira made her way up the hillside, following the map's directions . The night was quiet, with only the sound of rustling leaves and the distant hoot of an owl . As she climbed higher, she felt a growing sense of purpose . “The heart that beats beneath the stones,” she muttered, trying to decipher what the words could mean . After some time, she arrived at a clearing where the ground was carpeted with moss and dotted with smooth stones . The map indicated that she needed to look closely . Mira knelt down to inspect the area and, just as she was about to give up, she heard a soft thump, like the beat of a drum . Surprised, she looked around and found a particularly large stone slightly displaced from the others . The crystal became her talisman, reminding her of her promise and the magic of storytelling—a bridge between the ordinary and the extraordinary, where dreams take flight and every book waited to be opened . +Once home, Mira spread out her findings across her bedroom floor . The map was rudimentary, marked with simple symbols: a sun, a tree, and an ominous 'X' at the end . It felt like a treasure map, and Mira's imagination began to race . After her parents went to bed, she gathered supplies: a flashlight, a notebook, and a snack for the journey . 
With her heart racing at the thought of adventure, she headed out into the cool night . The moon illuminated her path as Mira made her way up the hillside, following the map's directions . The night was quiet, with only the sound of rustling leaves and the distant hoot of an owl . As she climbed higher, she felt a growing sense of purpose . “The heart that beats beneath the stones,” she muttered, trying to decipher what the words could mean . After some time, she arrived at a clearing where the ground was carpeted with moss and dotted with smooth stones . The map indicated that she needed to look closely . Mira knelt down to inspect the area and, just as she was about to give up, she heard a soft thump, like the beat of a drum . Surprised, she looked around and found a particularly large stone slightly displaced from the others . The crystal became her talisman, reminding her of her promise and the magic of storytelling—a bridge between the ordinary and the extraordinary, where dreams take flight and every book waited to be opened . Quantum entanglement is one of the most intriguing phenomena in the realm of quantum mechanics, a branch of physics that describes the behavior of matter and energy at the smallest scales. Developed in the early 20th century, quantum mechanics fundamentally altered our perception of the universe. Unlike classical physics, which dictates that particles have defined positions and velocities, quantum mechanics introduces a level of uncertainty and non-locality. One of the cornerstones of this theory is the concept of entanglement, which Albert Einstein famously referred to as "spooky action at a distance." ### The Fascinating World of Bioluminescence #### Introduction Bioluminescence is a natural phenomenon that occurs in various organisms, characterized by the ability to emit light . This incredible adaptation can be found in a range of living beings, including certain species of fungi, bacteria, and marine animals . The light produced can serve various purposes such as predation, communication, and camouflage . This article explores the mechanisms, examples, and ecological significance of bioluminescence, shedding light on its role in the natural world . The process of bioluminescence involves a biochemical reaction between a light-emitting molecule known as luciferin and an enzyme called luciferase . This reaction occurs within specialized cells or organelles and typically requires oxygen . The specific structure of luciferin varies among different organisms, leading to a wide range of colors emitted, from blue and green to red and yellow . The basic biochemical reaction can be summarized as follows: 1 . **Formation of Luciferin-Oxygen Complex**: When luciferin reacts with oxygen in the presence of luciferase, it forms an unstable complex . 2 . 
diff --git a/tests/python_tests/models/nightly b/tests/python_tests/models/nightly index 72b707bd63..13749a1991 100644 --- a/tests/python_tests/models/nightly +++ b/tests/python_tests/models/nightly @@ -48,4 +48,4 @@ hf-internal-testing/tiny-random-StableLmForCausalLM hf-internal-testing/tiny-random-PhiForCausalLM hf-internal-testing/tiny-random-CodeGenForCausalLM hf-internal-testing/tiny-random-Starcoder2ForCausalLM -hf-internal-testing/tiny-random-OPTForCausalLM \ No newline at end of file +hf-internal-testing/tiny-random-OPTForCausalLM diff --git a/tests/python_tests/models/precommit b/tests/python_tests/models/precommit index 0b913d3b01..8adac460e0 100644 --- a/tests/python_tests/models/precommit +++ b/tests/python_tests/models/precommit @@ -1,3 +1,3 @@ hf-tiny-model-private/tiny-random-CodeGenForCausalLM hf-tiny-model-private/tiny-random-GPT2LMHeadModel -hf-tiny-model-private/tiny-random-OPTForCausalLM \ No newline at end of file +hf-tiny-model-private/tiny-random-OPTForCausalLM diff --git a/tests/python_tests/models/real_models b/tests/python_tests/models/real_models index 5fd8fe0500..a15878f63e 100644 --- a/tests/python_tests/models/real_models +++ b/tests/python_tests/models/real_models @@ -127,4 +127,4 @@ tiiuae/falcon-rw-7b togethercomputer/RedPajama-INCITE-Chat-3B-v1 # xverse/XVERSE-7B-Chat: Transformers - Exception: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 78 column 3 # xverse/XVERSE-MoE-A4.2B: Transformers - Exception: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 78 column 3 -Deci/DeciLM-7B \ No newline at end of file +Deci/DeciLM-7B diff --git a/tests/python_tests/test_continuous_batching.py b/tests/python_tests/test_continuous_batching.py index 7cf5bc5355..ce5fd9f232 100644 --- a/tests/python_tests/test_continuous_batching.py +++ b/tests/python_tests/test_continuous_batching.py @@ -342,7 +342,7 @@ def test_preemption_with_multinomial_n_seq(tmp_path, dynamic_split_fuse): def get_data_by_pipeline_type(model_path: Path, pipeline_type: str): device = "CPU" - prompt = "Prompt example is" + prompt = "Prompt example is" generation_config = GenerationConfig() generation_config.max_new_tokens = 10 generation_config.do_sample = True diff --git a/tests/python_tests/test_generation_config.py b/tests/python_tests/test_generation_config.py index 72da672713..16bc064724 100644 --- a/tests/python_tests/test_generation_config.py +++ b/tests/python_tests/test_generation_config.py @@ -77,7 +77,7 @@ def test_valid_configs(generation_config_kwargs): # stop conditions dict(), # no stop conditions at all dict(ignore_eos=True), # no 'max_new_tokens', no 'max_length' with 'ignore_eos' - dict(stop_token_ids={-1}), # value in 'stop_token_ids' must be non-negative + dict(stop_token_ids={-1}), # value in 'stop_token_ids' must be non-negative dict(max_new_tokens=0), # max new tokens cannot be empty (only when 'echo' is True) dict(max_new_tokens=10, min_new_tokens=20), # 'max_new_tokens' must be >= 'min_new_tokens' # penalties diff --git a/tests/python_tests/test_kv_cache_eviction.py b/tests/python_tests/test_kv_cache_eviction.py index 84baa79e02..216a58e1e0 100644 --- a/tests/python_tests/test_kv_cache_eviction.py +++ b/tests/python_tests/test_kv_cache_eviction.py @@ -189,4 +189,3 @@ def get_beam_search_seq_len_300() -> GenerationConfig: @pytest.mark.precommit def test_dynamic_memory_allocation(tmp_path, params): run_cb_pipeline_with_ref(tmp_path, "facebook/opt-125m", scheduler_params=params[0], generation_config=params[1]) - 
diff --git a/tests/python_tests/test_llm_pipeline.py b/tests/python_tests/test_llm_pipeline.py index 8968f2a083..64fbc65bcc 100644 --- a/tests/python_tests/test_llm_pipeline.py +++ b/tests/python_tests/test_llm_pipeline.py @@ -483,5 +483,5 @@ def test_left_pad(): } models[2].pad_token = models[2].eos_token - + run_llm_pipeline_with_ref(model_id=models[0], prompts=prompts, generation_config=generation_config_dict, tmp_path=models[1]) diff --git a/tests/python_tests/test_sampling.py b/tests/python_tests/test_sampling.py index 7a3aced29a..62015d4418 100644 --- a/tests/python_tests/test_sampling.py +++ b/tests/python_tests/test_sampling.py @@ -63,8 +63,8 @@ def test_stop_strings(tmp_path, generation_config): ids=["basic", "repetition_penalty", "long_max_new_tokens"]) @pytest.mark.parametrize("prompt", [ 'What is OpenVINO?', - 'table is made of', - 'The Sun is yellow because', + 'table is made of', + 'The Sun is yellow because', '你好! 你好嗎?', 'I have an interview about product speccing with the company Weekend Health. Give me an example of a question they might ask with regards about a new feature' ]) @@ -74,9 +74,9 @@ def test_greedy(tmp_path, generation_config, prompt, use_cb): if sys.platform.startswith('win') and prompt.startswith('你'): pytest.skip("For unknown reason this prompt fails on Win") - run_llm_pipeline_with_ref(model_id=model_id, - prompts=[prompt], - generation_config=generation_config, + run_llm_pipeline_with_ref(model_id=model_id, + prompts=[prompt], + generation_config=generation_config, tmp_path=tmp_path, use_cb=use_cb) diff --git a/tests/python_tests/test_tokenizer.py b/tests/python_tests/test_tokenizer.py index d71534c2f1..677e2e6ed8 100644 --- a/tests/python_tests/test_tokenizer.py +++ b/tests/python_tests/test_tokenizer.py @@ -37,14 +37,14 @@ def load_genai_tokenizer_with_configs(configs: List[Tuple], temp_path): def get_chat_templates(): - # Returns chat templates saved in tokenizer_configs.py, + # Returns chat templates saved in tokenizer_configs.py, # but skips some models that currently are not processed correctly. skipped_models = { # TODO: openchat/openchat_3.5 and berkeley-nest/Starling-LM-7B-alpha have the same template. # Need to enable and unskip, since it's preset in continuous batching and has >100 000 downloads. "openchat/openchat-3.5-0106", - + # These models fail even on HF so no need to check if applying chat matches. 
"vibhorag101/llama-2-13b-chat-hf-phr_mental_therapy", "codellama/CodeLlama-34b-Instruct-hf", @@ -83,7 +83,7 @@ def get_chat_templates(): "shenzhi-wang/Llama3-8B-Chinese-Chat", # AssertionError "nlpai-lab/KULLM3", "HuggingFaceH4/zephyr-7b-gemma-sft-v0.1", - "MediaTek-Research/Breeze-7B-Instruct-v0_1", + "MediaTek-Research/Breeze-7B-Instruct-v0_1", "shanchen/llama3-8B-slerp-biomed-chat-chinese", # AssertionError "MLP-KTLim/llama-3-Korean-Bllossom-8B", "aloobun/CosmicBun-8B", # Chat template is not supported by Jinja2Cpp @@ -240,7 +240,7 @@ def test_encode_decode_with_special_tokens_option(prompt): hf_res_no_spec = hf_tokenizer(prompt, return_tensors="np", add_special_tokens=False)["input_ids"] assert np.all(ov_res_add_spec == hf_res_add_spec) assert np.all(ov_res_no_spec == hf_res_no_spec) - + # Check that add_special_tokens flag indeed made any difference assert ov_res_add_spec.size != ov_res_no_spec.size assert hf_res_add_spec.size != hf_res_no_spec.size @@ -383,4 +383,3 @@ def test_load_special_tokens_from_special_tokens_map_json_with_string_repr(model assert tok.get_bos_token_id() == token_str_int_map['bos_token'] if 'eos_token' in token_str_int_map: assert tok.get_eos_token_id() == token_str_int_map['eos_token'] - diff --git a/tests/python_tests/test_whisper_pipeline_static.py b/tests/python_tests/test_whisper_pipeline_static.py index ec6b3d5cfa..90b3384754 100644 --- a/tests/python_tests/test_whisper_pipeline_static.py +++ b/tests/python_tests/test_whisper_pipeline_static.py @@ -10,7 +10,7 @@ import pytest import pathlib -# This test suite is designed specifically to validate the functionality +# This test suite is designed specifically to validate the functionality # and robustness of the WhisperStaticPipeline on NPUW:CPU. config = {"NPU_USE_NPUW" : "YES", "NPUW_DEVICES" : "CPU", @@ -51,7 +51,7 @@ def load_and_save_whisper_model(params, stateful=False, **tokenizer_kwargs): opt_model.config.save_pretrained(path) opt_model.save_pretrained(path) processor.save_pretrained(path) - + return model_id, path def get_results_cpu_npu(model_path, audio_sample, **config_kwargs): diff --git a/tools/cacheviz/__init__.py b/tools/cacheviz/__init__.py index b755d885d3..32e4c6b5ce 100644 --- a/tools/cacheviz/__init__.py +++ b/tools/cacheviz/__init__.py @@ -1,3 +1,2 @@ # Copyright (C) 2023-2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 - diff --git a/tools/cacheviz/cacheviz.py b/tools/cacheviz/cacheviz.py index 841a5eeb65..da83ddc4ce 100644 --- a/tools/cacheviz/cacheviz.py +++ b/tools/cacheviz/cacheviz.py @@ -316,6 +316,3 @@ def on_press(event): if __name__ == "__main__": main() - - - diff --git a/tools/cacheviz/requirements.txt b/tools/cacheviz/requirements.txt index 9af70e35fa..e272c6488f 100644 --- a/tools/cacheviz/requirements.txt +++ b/tools/cacheviz/requirements.txt @@ -1,2 +1,2 @@ argparse -matplotlib \ No newline at end of file +matplotlib diff --git a/tools/continuous_batching/accuracy/continuous_batching_accuracy.cpp b/tools/continuous_batching/accuracy/continuous_batching_accuracy.cpp index d644ba9418..1d05b35e5a 100644 --- a/tools/continuous_batching/accuracy/continuous_batching_accuracy.cpp +++ b/tools/continuous_batching/accuracy/continuous_batching_accuracy.cpp @@ -120,7 +120,7 @@ int main(int argc, char* argv[]) try { std::cout << "Partial result:" << std::endl; print_generation_result(generation_result); } - break; + break; default: break; } diff --git a/tools/continuous_batching/accuracy/continuous_batching_speculative_decoding.cpp 
b/tools/continuous_batching/accuracy/continuous_batching_speculative_decoding.cpp index eeb3c0f070..850cddfb47 100644 --- a/tools/continuous_batching/accuracy/continuous_batching_speculative_decoding.cpp +++ b/tools/continuous_batching/accuracy/continuous_batching_speculative_decoding.cpp @@ -13,7 +13,7 @@ void print_cb_generation_result(const ov::genai::GenerationResult& generation_re } std::vector get_spec_decoding_generation_config_examples() { - + // sampling param for speculative decoding ov::genai::GenerationConfig generation_config_greedy_constant = ov::genai::greedy(); { @@ -105,7 +105,7 @@ int main(int argc, char* argv[]) try { scheduler_config.dynamic_split_fuse = dynamic_split_fuse; // vLLM specific params scheduler_config.max_num_seqs = 2; - + ov::genai::ContinuousBatchingPipeline pipe(models_path, scheduler_config, device, {ov::genai::draft_model(draft_models_path, device)}); std::vector generation_results = pipe.generate(prompts, generation_config); @@ -130,7 +130,7 @@ int main(int argc, char* argv[]) try { std::cout << "Partial result:" << std::endl; print_cb_generation_result(generation_result); } - break; + break; default: break; } diff --git a/tools/continuous_batching/benchmark/continuous_batching_benchmark.cpp b/tools/continuous_batching/benchmark/continuous_batching_benchmark.cpp index d7cad80fd0..ac40a49fca 100644 --- a/tools/continuous_batching/benchmark/continuous_batching_benchmark.cpp +++ b/tools/continuous_batching/benchmark/continuous_batching_benchmark.cpp @@ -146,7 +146,7 @@ class GenerationInfo { std::chrono::milliseconds cumulated_tpot; std::chrono::milliseconds mean_tpot; size_t num_output_tokens; - + std::chrono::steady_clock::time_point start_time; std::chrono::steady_clock::time_point last_read_time; @@ -269,7 +269,7 @@ class GenerationInfoCollector { for (GenerationInfo& generation_info : generations_info) { if (!generation_info.is_active()) continue; - + if (generation_info.is_finished()) { num_finished++; generation_info.set_inactive(); @@ -287,8 +287,8 @@ class GenerationInfoCollector { std::chrono::milliseconds mean_tpot = std::chrono::milliseconds::zero(); size_t total_input_len = 0; size_t total_output_len = 0; - - + + for (GenerationInfo& generation_info : generations_info){ auto generation_metrics = generation_info.get_metrics(); mean_ttft += generation_metrics.mean_ttft; @@ -304,7 +304,7 @@ class GenerationInfoCollector { std::cout << "Input throughput: " << total_input_len / total_duration.count() << " tokens / s" << std::endl; std::cout << "Output throughput: " << total_output_len / total_duration.count() << " tokens / s" << std::endl; std::cout << "Mean TTFT: " << mean_ttft.count() << " ms" << std::endl; - std::cout << "Mean TPOT: " << mean_tpot.count() << " ms" << std::endl; + std::cout << "Mean TPOT: " << mean_tpot.count() << " ms" << std::endl; } }; @@ -512,7 +512,7 @@ int main(int argc, char* argv[]) try { std::cout << "ERROR: Wrong json parameter in device_config." << std::endl; return EXIT_FAILURE; } - + // Benchmarking std::cout << "Loading models, creating pipelines, preparing environment..."
<< std::endl; ov::genai::ContinuousBatchingPipeline pipe(models_path, scheduler_config, device, device_config_map); @@ -526,7 +526,7 @@ int main(int argc, char* argv[]) try { std::thread trafficSimulatorThread(trafficSimulator, &pipe, &dataset, request_rate, &generation_info_collector, is_speculative_decoding_enabled); trafficSimulatorThread.join(); } - + std::thread lmmEngineThread(llmEngineLoop, &pipe, &dataset, &finishGenerationThread); std::thread statisticsReporterThread(statisticsReporter, &generation_info_collector, num_prompts); if (request_rate != "inf") { diff --git a/tools/llm_bench/doc/NOTES.md b/tools/llm_bench/doc/NOTES.md index 8d84b4e8c8..96f936cd76 100644 --- a/tools/llm_bench/doc/NOTES.md +++ b/tools/llm_bench/doc/NOTES.md @@ -71,4 +71,4 @@ ConnectionError: Couldn't reach 'wikitext' on the Hub (SSLError) ``` root cause: The wikitext data set was not downloaded correctly, or the Hugging Face Hub network could not be connected normally.
Solution:
-Refer to https://huggingface.co/docs/datasets/loading#arrow , copy wikitext data set to ~/.cache/huggingface/datasets/ folder, set the environment variable HF_DATASETS_OFFLINE to 1. \ No newline at end of file +Refer to https://huggingface.co/docs/datasets/loading#arrow , copy wikitext data set to ~/.cache/huggingface/datasets/ folder, set the environment variable HF_DATASETS_OFFLINE to 1. diff --git a/tools/llm_bench/doc/PROMPT.md b/tools/llm_bench/doc/PROMPT.md index 5418bf0bb5..0613dd0073 100644 --- a/tools/llm_bench/doc/PROMPT.md +++ b/tools/llm_bench/doc/PROMPT.md @@ -41,4 +41,4 @@ Prompt file example: ## 5. Visual Language Models Supported parameters that can be set are: * `media` - image file path -* `prompt`- input text prompt \ No newline at end of file +* `prompt`- input text prompt diff --git a/tools/llm_bench/llm_bench_utils/hook_beam_search.py b/tools/llm_bench/llm_bench_utils/hook_beam_search.py index 49f9db8236..5252122eed 100644 --- a/tools/llm_bench/llm_bench_utils/hook_beam_search.py +++ b/tools/llm_bench/llm_bench_utils/hook_beam_search.py @@ -509,4 +509,4 @@ def new_forward(self, model): def new_get_multimodal_embeddings(self, model): model._orig_get_multimodal_embeddings = model.get_multimodal_embeddings - model.get_multimodal_embeddings = types.MethodType(new_get_multimodal_embeddings, model) \ No newline at end of file + model.get_multimodal_embeddings = types.MethodType(new_get_multimodal_embeddings, model) diff --git a/tools/llm_bench/llm_bench_utils/hook_common.py b/tools/llm_bench/llm_bench_utils/hook_common.py index 3b0d623737..f0b363c8e0 100644 --- a/tools/llm_bench/llm_bench_utils/hook_common.py +++ b/tools/llm_bench/llm_bench_utils/hook_common.py @@ -26,4 +26,4 @@ def get_bench_hook(num_beams, ov_model): else: log.warning(f'The minimum version of transformers to get 1st and 2nd tokens latency of {search_type} is: {min_version}') bench_hook = None - return bench_hook \ No newline at end of file + return bench_hook diff --git a/tools/llm_bench/llm_bench_utils/hook_greedy_search.py b/tools/llm_bench/llm_bench_utils/hook_greedy_search.py index 86a0eec4ad..85302f18da 100644 --- a/tools/llm_bench/llm_bench_utils/hook_greedy_search.py +++ b/tools/llm_bench/llm_bench_utils/hook_greedy_search.py @@ -385,8 +385,8 @@ def new_forward(self, model): if trans_version >= version.parse('4.45.0'): model._sample = hook_sample_v45.new_sample.__get__(model, model.__class__) elif trans_version >= version.parse('4.43.0'): - model._sample = hook_sample_v43.new_sample.__get__(model, model.__class__) - + model._sample = hook_sample_v43.new_sample.__get__(model, model.__class__) + def new_get_multimodal_embeddings(self, model): model._orig_get_multimodal_embeddings = model.get_multimodal_embeddings - model.get_multimodal_embeddings = types.MethodType(new_get_multimodal_embeddings, model) \ No newline at end of file + model.get_multimodal_embeddings = types.MethodType(new_get_multimodal_embeddings, model) diff --git a/tools/llm_bench/llm_bench_utils/hook_sample.py b/tools/llm_bench/llm_bench_utils/hook_sample.py index a1e970bf4a..4dc10025a7 100644 --- a/tools/llm_bench/llm_bench_utils/hook_sample.py +++ b/tools/llm_bench/llm_bench_utils/hook_sample.py @@ -226,4 +226,4 @@ def new_sample( past_key_values=model_kwargs.get("past_key_values"), ) else: - return input_ids \ No newline at end of file + return input_ids diff --git a/tools/llm_bench/llm_bench_utils/hook_sample_v43.py b/tools/llm_bench/llm_bench_utils/hook_sample_v43.py index 94d0aa8370..f3b1b51469 100644 ---
a/tools/llm_bench/llm_bench_utils/hook_sample_v43.py +++ b/tools/llm_bench/llm_bench_utils/hook_sample_v43.py @@ -230,4 +230,4 @@ def new_sample( past_key_values=model_kwargs.get("past_key_values"), ) else: - return input_ids \ No newline at end of file + return input_ids diff --git a/tools/llm_bench/llm_bench_utils/hook_sample_v45.py b/tools/llm_bench/llm_bench_utils/hook_sample_v45.py index 2a43717e9b..9ed7d80e88 100644 --- a/tools/llm_bench/llm_bench_utils/hook_sample_v45.py +++ b/tools/llm_bench/llm_bench_utils/hook_sample_v45.py @@ -222,4 +222,4 @@ def new_sample( past_key_values=model_kwargs.get("past_key_values"), ) else: - return input_ids \ No newline at end of file + return input_ids diff --git a/tools/llm_bench/llm_bench_utils/ov_model_classes.py b/tools/llm_bench/llm_bench_utils/ov_model_classes.py index ccb4e9af1f..e6b7707ac2 100644 --- a/tools/llm_bench/llm_bench_utils/ov_model_classes.py +++ b/tools/llm_bench/llm_bench_utils/ov_model_classes.py @@ -78,7 +78,7 @@ def forward( past_key_values = tuple( past_key_value for pkv_per_layer in past_key_values for past_key_value in pkv_per_layer ) - + # Add the past_key_values to the decoder inputs inputs = dict(zip(self.key_value_input_names, past_key_values)) @@ -440,7 +440,7 @@ def forward( past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, **kwargs, ) -> CausalLMOutputWithPast: - + if not self.is_v1: return super().forward(input_ids=input_ids, attention_mask=attention_mask, past_key_values=past_key_values, **kwargs) self.compile() diff --git a/tools/llm_bench/prompts/llama-2-7b-chat_l.jsonl b/tools/llm_bench/prompts/llama-2-7b-chat_l.jsonl index 4bf82b10b6..3a08db69dd 100644 --- a/tools/llm_bench/prompts/llama-2-7b-chat_l.jsonl +++ b/tools/llm_bench/prompts/llama-2-7b-chat_l.jsonl @@ -1 +1 @@ -{"prompt": "[INST] <> A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. <> You will act as a Christian, and fully summarize following text:\nSometimes it's nice to take a minute in the pew by yourself beforehand. You have this beautiful church probably almost all to yourself. Can you feel its energy resonating through you? Can you feel the majesty of the Lord's kingdom and how you're a part of it? Take a moment to kneel and pray with your head down and hands clasped together. Reflect on your faith and how you feel currently. Think about how you've been responding to God's call and how you've been living in the light of his love. When the priest is ready for you, of course. You'll probably see him there by his lonesome or someone else walk out just before you. Sit down either across from him or behind the screen -- it's totally up to you whether or not you prefer to remain anonymous. He won't treat you any differently either way. Make the sign of the cross upon his prompt, saying, \"Bless me, Father, for I have sinned. It has been 10 years since my last confession.\" This is your standard, traditional phrasing. However, if you just sit down and say hello, that's fine, too. The priest knows what he's doing. The Byzantine Rite is a bit different. The priest may sit to your side and put his epitrachelion on your head. He may then also do the Prayer of Absolution. But the idea remains the exact same -- just go wherever he takes you. Once you sit down and you've made the sign of the cross, just sit back and follow the priest's lead. 
He'll ask you how long it's been since your last confession (if you don't voluntarily offer that information), how you are feeling, maybe how your faith is going, and then ask you what sins you would like to talk about with him and God. It's just a casual conversation! Do not fret. There is absolutely zero pressure on your part. Again, as long as you come there with the intention of leaving with a clean heart, you're more than welcome in the church. There is no wrong way to go about confession! This part is intimidating, but think about it this way: the priest you're talking to has probably heard just about everything before. Whatever you have to say will not blow his mind. So when he asks, start rattling them off, from the most serious to the least. If he asks any questions, answer them, but do not feel the need to go into detail. A simple, \"I did so and so,\" will suffice. Your priest is going to be very understanding. If you don't remember the exact timeframe, that's fine. If you don't remember your motivation, that's fine. All your priest cares about is that you're being as honest as possible and that your heart is in the right place. He'll talk you through everything, possibly asking about your intentions, but mainly just letting you know that God loves you, sin and all. If he has any ideas to bring you closer to God, he may suggest them at this juncture. He's there to help, after all. He will then ask you to make an Act of Contrition. That goes like this: My God, I am sorry for my sins with all my heart.In choosing to do wrong and failing to do good,I have sinned against You whom I should loveabove all things. I firmly intend, with your help,to do penance, to sin no more, andto avoid whatever leads me to sin.Our Savior Jesus Christ suffered and died for us.In his name, my God, have mercy.If you are a Roman Catholic, your act of contrition will go like this: Oh my God, I am very sorry for having offended thee. But most of all, because they offend you, my God, who is all good and deserving of all my love. I firmly resolve with the help of thy grace, to sin no more, and to avoid the near occasion of sin. Amen. Don't worry! It won't be anything huge. Take the absolution to heart -- you now have a brand new, clean slate to work with. \"Penance\" is your expression of regret and repentance, showing God that you're truly sorry and that you wish for nothing more than to be forgiven. Thanks. [/INST]"} \ No newline at end of file +{"prompt": "[INST] <> A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. <> You will act as a Christian, and fully summarize following text:\nSometimes it's nice to take a minute in the pew by yourself beforehand. You have this beautiful church probably almost all to yourself. Can you feel its energy resonating through you? Can you feel the majesty of the Lord's kingdom and how you're a part of it? Take a moment to kneel and pray with your head down and hands clasped together. Reflect on your faith and how you feel currently. Think about how you've been responding to God's call and how you've been living in the light of his love. When the priest is ready for you, of course. You'll probably see him there by his lonesome or someone else walk out just before you. Sit down either across from him or behind the screen -- it's totally up to you whether or not you prefer to remain anonymous. He won't treat you any differently either way. 
Make the sign of the cross upon his prompt, saying, \"Bless me, Father, for I have sinned. It has been 10 years since my last confession.\" This is your standard, traditional phrasing. However, if you just sit down and say hello, that's fine, too. The priest knows what he's doing. The Byzantine Rite is a bit different. The priest may sit to your side and put his epitrachelion on your head. He may then also do the Prayer of Absolution. But the idea remains the exact same -- just go wherever he takes you. Once you sit down and you've made the sign of the cross, just sit back and follow the priest's lead. He'll ask you how long it's been since your last confession (if you don't voluntarily offer that information), how you are feeling, maybe how your faith is going, and then ask you what sins you would like to talk about with him and God. It's just a casual conversation! Do not fret. There is absolutely zero pressure on your part. Again, as long as you come there with the intention of leaving with a clean heart, you're more than welcome in the church. There is no wrong way to go about confession! This part is intimidating, but think about it this way: the priest you're talking to has probably heard just about everything before. Whatever you have to say will not blow his mind. So when he asks, start rattling them off, from the most serious to the least. If he asks any questions, answer them, but do not feel the need to go into detail. A simple, \"I did so and so,\" will suffice. Your priest is going to be very understanding. If you don't remember the exact timeframe, that's fine. If you don't remember your motivation, that's fine. All your priest cares about is that you're being as honest as possible and that your heart is in the right place. He'll talk you through everything, possibly asking about your intentions, but mainly just letting you know that God loves you, sin and all. If he has any ideas to bring you closer to God, he may suggest them at this juncture. He's there to help, after all. He will then ask you to make an Act of Contrition. That goes like this: My God, I am sorry for my sins with all my heart.In choosing to do wrong and failing to do good,I have sinned against You whom I should loveabove all things. I firmly intend, with your help,to do penance, to sin no more, andto avoid whatever leads me to sin.Our Savior Jesus Christ suffered and died for us.In his name, my God, have mercy.If you are a Roman Catholic, your act of contrition will go like this: Oh my God, I am very sorry for having offended thee. But most of all, because they offend you, my God, who is all good and deserving of all my love. I firmly resolve with the help of thy grace, to sin no more, and to avoid the near occasion of sin. Amen. Don't worry! It won't be anything huge. Take the absolution to heart -- you now have a brand new, clean slate to work with. \"Penance\" is your expression of regret and repentance, showing God that you're truly sorry and that you wish for nothing more than to be forgiven. Thanks. 
[/INST]"} diff --git a/tools/llm_bench/requirements.txt b/tools/llm_bench/requirements.txt index 6bf8d8cddf..48e64804dc 100644 --- a/tools/llm_bench/requirements.txt +++ b/tools/llm_bench/requirements.txt @@ -10,7 +10,7 @@ pillow torch transformers>=4.40.0 diffusers>=0.22.0 -#optimum is in dependency list of optimum-intel +#optimum is in dependency list of optimum-intel git+https://github.com/huggingface/optimum-intel.git@main#egg=optimum-intel git+https://github.com/openvinotoolkit/nncf.git@develop#egg=nncf packaging