From 68d3e487ea3dc6547fb6686678b68a7b3d8fcbee Mon Sep 17 00:00:00 2001
From: Alexander Suvorov
Date: Fri, 3 Jan 2025 15:23:50 +0100
Subject: [PATCH 01/27] use decoder interface

---
 src/cpp/src/whisper/models/decoder.cpp        |  26 +++
 src/cpp/src/whisper/models/decoder.hpp        |  35 ++++
 .../src/whisper/models/statefull_decoder.cpp  |  73 ++++++++
 .../src/whisper/models/statefull_decoder.hpp  |  29 +++
 .../src/whisper/models/with_past_decoder.cpp  | 102 +++++++++++
 .../src/whisper/models/with_past_decoder.hpp  |  32 ++++
 src/cpp/src/whisper/whisper.cpp               | 170 +++++-------------
 src/cpp/src/whisper/whisper.hpp               |   8 +-
 src/cpp/src/whisper/whisper_models.hpp        |   2 +-
 src/cpp/src/whisper_pipeline.cpp              |  18 +-
 10 files changed, 356 insertions(+), 139 deletions(-)
 create mode 100644 src/cpp/src/whisper/models/decoder.cpp
 create mode 100644 src/cpp/src/whisper/models/decoder.hpp
 create mode 100644 src/cpp/src/whisper/models/statefull_decoder.cpp
 create mode 100644 src/cpp/src/whisper/models/statefull_decoder.hpp
 create mode 100644 src/cpp/src/whisper/models/with_past_decoder.cpp
 create mode 100644 src/cpp/src/whisper/models/with_past_decoder.hpp

diff --git a/src/cpp/src/whisper/models/decoder.cpp b/src/cpp/src/whisper/models/decoder.cpp
new file mode 100644
index 0000000000..32a8f2eff6
--- /dev/null
+++ b/src/cpp/src/whisper/models/decoder.cpp
@@ -0,0 +1,26 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "decoder.hpp"
+
+#include <filesystem>
+
+#include "statefull_decoder.hpp"
+#include "utils.hpp"
+#include "with_past_decoder.hpp"
+
+namespace ov::genai {
+std::shared_ptr<WhisperDecoder> WhisperDecoder::from_path(const std::filesystem::path& models_path,
+                                                          const std::string& device,
+                                                          const ov::AnyMap& properties) {
+    bool has_decoder_with_past = std::filesystem::exists(models_path / "openvino_decoder_with_past_model.xml");
+
+    if (has_decoder_with_past) {
+        return std::make_shared<WhisperWithPastDecoder>(models_path, device, properties);
+    }
+
+    return std::make_shared<WhisperStatefullDecoder>(models_path, device, properties);
+}
+
+WhisperDecoder::~WhisperDecoder() = default;
+} // namespace ov::genai
diff --git a/src/cpp/src/whisper/models/decoder.hpp b/src/cpp/src/whisper/models/decoder.hpp
new file mode 100644
index 0000000000..d82ce5047a
--- /dev/null
+++ b/src/cpp/src/whisper/models/decoder.hpp
@@ -0,0 +1,35 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <filesystem>
+
+#include "openvino/genai/whisper_generation_config.hpp"
+#include "openvino/runtime/core.hpp"
+
+namespace ov::genai {
+class WhisperDecoder {
+public:
+    static std::shared_ptr<WhisperDecoder> from_path(const std::filesystem::path& models_path,
+                                                     const std::string& device,
+                                                     const ov::AnyMap& properties);
+
+    virtual std::pair<int64_t, float> detect_language(const ov::Tensor& encoder_hidden_state,
+                                                      const int64_t decoder_start_token_id) {
+        OPENVINO_THROW("detect_language method not implemented");
+    };
+
+    virtual std::pair<ov::Tensor, float> decode(const ov::Tensor& encoder_hidden_state,
+                                                const std::vector<int64_t>& input_ids,
+                                                const size_t cache_position) {
+        OPENVINO_THROW("decode method not implemented");
+    };
+
+    virtual void reset_state() {
+        OPENVINO_THROW("reset_state method not implemented");
+    }
+
+    virtual ~WhisperDecoder();
+};
+} // namespace ov::genai
diff --git a/src/cpp/src/whisper/models/statefull_decoder.cpp b/src/cpp/src/whisper/models/statefull_decoder.cpp
new file mode 100644
index 0000000000..2f235178a4
--- /dev/null
+++ b/src/cpp/src/whisper/models/statefull_decoder.cpp
@@ -0,0 +1,73 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "statefull_decoder.hpp"
+
+#include "utils.hpp"
+
+namespace ov::genai {
+WhisperStatefullDecoder::WhisperStatefullDecoder(const std::filesystem::path& models_path,
+                                                 const std::string& device,
+                                                 const ov::AnyMap& properties) {
+    ov::Core core = utils::singleton_core();
+
+    auto model = core.read_model((models_path / "openvino_decoder_model.xml").string());
+
+    // todo: remove once stateful model has dynamic input_ids seq_len
+    std::map<std::string, ov::PartialShape> name_to_shape;
+    for (const ov::Output<ov::Node>& input : model->inputs()) {
+        ov::PartialShape shape = input.get_partial_shape();
+        if (input.get_any_name().find("input_ids") != std::string::npos) {
+            shape[1] = -1;
+            name_to_shape[input.get_any_name()] = shape;
+        }
+    }
+    model->reshape(name_to_shape);
+
+    auto compiled_model = core.compile_model(model, device, properties);
+
+    utils::print_compiled_model_properties(compiled_model, "whisper decoder model");
+    m_request = compiled_model.create_infer_request();
+}
+
+std::pair<int64_t, float> WhisperStatefullDecoder::detect_language(const ov::Tensor& encoder_hidden_state,
+                                                                   const int64_t decoder_start_token_id) {
+    auto [output_tensor, infer_ms] = decode(encoder_hidden_state, {decoder_start_token_id}, 0);
+
+    int64_t output_token = ov::genai::utils::argmax(output_tensor, 0);
+
+    reset_state();
+
+    return {output_token, infer_ms};
+}
+
+std::pair<ov::Tensor, float> WhisperStatefullDecoder::decode(const ov::Tensor& encoder_hidden_state,
+                                                             const std::vector<int64_t>& input_ids,
+                                                             const size_t cache_position) {
+    m_request.set_tensor("encoder_hidden_states", encoder_hidden_state);
+
+    ov::Tensor input_ids_tensor(ov::element::i64, {1, input_ids.size()}, (void*)input_ids.data());
+    m_request.set_tensor("input_ids", input_ids_tensor);
+
+    ov::Tensor cache_position_tensor = m_request.get_tensor("cache_position");
+    cache_position_tensor.set_shape({input_ids.size()});
+
+    auto cache_data = cache_position_tensor.data<int64_t>();
+    std::iota(cache_data, cache_data + cache_position_tensor.get_size(), cache_position);
+
+    m_request.get_tensor("beam_idx").set_shape({1});
+    m_request.get_tensor("beam_idx").data<int32_t>()[0] = 0;
+
+    const auto infer_start = std::chrono::steady_clock::now();
+    m_request.infer();
+    const auto infer_ms = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start);
+
+    auto output_tensor = m_request.get_tensor("logits");
+
+    return {output_tensor, infer_ms};
+};
+
+void WhisperStatefullDecoder::reset_state() {
+    m_request.reset_state();
+}
+} // namespace ov::genai
diff --git a/src/cpp/src/whisper/models/statefull_decoder.hpp b/src/cpp/src/whisper/models/statefull_decoder.hpp
new file mode 100644
index 0000000000..569031b9fa
--- /dev/null
+++ b/src/cpp/src/whisper/models/statefull_decoder.hpp
@@ -0,0 +1,29 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "decoder.hpp"
+#include "openvino/runtime/core.hpp"
+
+namespace ov::genai {
+
+class WhisperStatefullDecoder : public WhisperDecoder {
+public:
+    explicit WhisperStatefullDecoder(const std::filesystem::path& models_path,
+                                     const std::string& device,
+                                     const ov::AnyMap& properties);
+
+    std::pair<int64_t, float> detect_language(const ov::Tensor& encoder_hidden_state,
+                                              const int64_t decoder_start_token_id) override;
+
+    std::pair<ov::Tensor, float> decode(const ov::Tensor& encoder_hidden_state,
+                                        const std::vector<int64_t>& input_ids,
+                                        const size_t cache_position) override;
+
+    void reset_state() override;
+
+private:
+    ov::InferRequest m_request;
+};
+} // namespace ov::genai
diff --git a/src/cpp/src/whisper/models/with_past_decoder.cpp
b/src/cpp/src/whisper/models/with_past_decoder.cpp new file mode 100644 index 0000000000..a32c9a45be --- /dev/null +++ b/src/cpp/src/whisper/models/with_past_decoder.cpp @@ -0,0 +1,102 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "with_past_decoder.hpp" + +#include + +#include "utils.hpp" + +namespace { +void set_past_key_value(ov::InferRequest& source, ov::InferRequest& dest) { + // source outputs: + // present.0.decoder.key + // present.0.decoder.value + // present.0.encoder.key + // present.0.encoder.value + + // dest inputs: + // past_key_values.0.decoder.key + // past_key_values.0.decoder.value + // past_key_values.0.encoder.key + // past_key_values.0.encoder.value + + for (auto& source_output : source.get_compiled_model().outputs()) { + std::string source_output_name = source_output.get_any_name(); + if (source_output_name.find("logits") != std::string::npos) { + continue; + } + + std::string with_past_input_name = + std::regex_replace(source_output_name, std::regex("present"), "past_key_values"); + + auto kv_tensor = source.get_tensor(source_output_name); + dest.set_tensor(with_past_input_name, ov::Tensor{kv_tensor}); + } +} +} // namespace + +namespace ov::genai { +WhisperWithPastDecoder::WhisperWithPastDecoder(const std::filesystem::path& models_path, + const std::string& device, + const ov::AnyMap& properties) { + ov::Core core = utils::singleton_core(); + + auto compiled_model = core.compile_model(models_path / "openvino_decoder_model.xml", device, properties); + utils::print_compiled_model_properties(compiled_model, "whisper decoder model"); + m_request_decoder = compiled_model.create_infer_request(); + + compiled_model = core.compile_model(models_path / "openvino_decoder_with_past_model.xml", device, properties); + utils::print_compiled_model_properties(compiled_model, "whisper decoder with past model"); + m_request_decoder_with_past = compiled_model.create_infer_request(); +} + +std::pair WhisperWithPastDecoder::detect_language(const ov::Tensor& encoder_hidden_state, + const int64_t decoder_start_token_id) { + auto [output_tensor, infer_ms] = decode(encoder_hidden_state, {decoder_start_token_id}, 0); + + int64_t output_token = ov::genai::utils::argmax(output_tensor, 0); + + reset_state(); + + return {output_token, infer_ms}; +} + +std::pair WhisperWithPastDecoder::decode(const ov::Tensor& encoder_hidden_state, + const std::vector& input_ids, + const size_t cache_position) { + const bool initial_step = cache_position == 0; + ov::InferRequest& request = initial_step ? 
m_request_decoder : m_request_decoder_with_past; + + request.set_tensor("encoder_hidden_states", encoder_hidden_state); + + const ov::Tensor input_ids_tensor(ov::element::i64, {1, input_ids.size()}, (void*)input_ids.data()); + request.set_tensor("input_ids", input_ids_tensor); + + if (!initial_step) { + ov::Tensor cache_position_tensor = request.get_tensor("cache_position"); + cache_position_tensor.set_shape({1}); + cache_position_tensor.data()[0] = cache_position; + } + + const auto infer_start = std::chrono::steady_clock::now(); + request.infer(); + const auto infer_ms = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start); + + auto output_tensor = request.get_tensor("logits"); + + if (initial_step) { + set_past_key_value(m_request_decoder, m_request_decoder_with_past); + } else if (!m_decoder_with_past_kv_value_set) { + set_past_key_value(m_request_decoder_with_past, m_request_decoder_with_past); + m_decoder_with_past_kv_value_set = true; + } + + return {output_tensor, infer_ms}; +} + +void WhisperWithPastDecoder::reset_state() { + m_request_decoder_with_past.reset_state(); + m_decoder_with_past_kv_value_set = false; +} +} // namespace ov::genai diff --git a/src/cpp/src/whisper/models/with_past_decoder.hpp b/src/cpp/src/whisper/models/with_past_decoder.hpp new file mode 100644 index 0000000000..b6f6924af6 --- /dev/null +++ b/src/cpp/src/whisper/models/with_past_decoder.hpp @@ -0,0 +1,32 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "decoder.hpp" +#include "openvino/runtime/core.hpp" + +namespace ov::genai { + +class WhisperWithPastDecoder : public WhisperDecoder { +public: + explicit WhisperWithPastDecoder(const std::filesystem::path& models_path, + const std::string& device, + const ov::AnyMap& properties); + + std::pair detect_language(const ov::Tensor& encoder_hidden_state, + const int64_t decoder_start_token_id) override; + + std::pair decode(const ov::Tensor& encoder_hidden_state, + const std::vector& input_ids, + const size_t cache_position) override; + + void reset_state() override; + +private: + ov::InferRequest m_request_decoder; + ov::InferRequest m_request_decoder_with_past; + bool m_decoder_with_past_kv_value_set = false; +}; + +} // namespace ov::genai diff --git a/src/cpp/src/whisper/whisper.cpp b/src/cpp/src/whisper/whisper.cpp index 04993f288c..9dffe01bd2 100644 --- a/src/cpp/src/whisper/whisper.cpp +++ b/src/cpp/src/whisper/whisper.cpp @@ -10,6 +10,7 @@ #include "context_tokens.hpp" #include "logit_processor.hpp" +#include "models/decoder.hpp" #include "openvino/genai/perf_metrics.hpp" #include "openvino/genai/whisper_generation_config.hpp" #include "openvino/genai/whisper_pipeline.hpp" @@ -53,89 +54,34 @@ ov::Tensor encode(ov::InferRequest& request, return request.get_tensor("last_hidden_state"); } -void set_past_key_value(ov::InferRequest& source, ov::InferRequest& dest) { - // source outputs: - // present.0.decoder.key - // present.0.decoder.value - // present.0.encoder.key - // present.0.encoder.value - - // dest inputs: - // past_key_values.0.decoder.key - // past_key_values.0.decoder.value - // past_key_values.0.encoder.key - // past_key_values.0.encoder.value - - for (auto& source_output : source.get_compiled_model().outputs()) { - std::string source_output_name = source_output.get_any_name(); - if (source_output_name.find("logits") != std::string::npos) { - continue; - } - - std::string with_past_input_name = - std::regex_replace(source_output_name, 
std::regex("present"), "past_key_values"); - - auto kv_tensor = source.get_tensor(source_output_name); - dest.set_tensor(with_past_input_name, ov::Tensor{kv_tensor}); - } -} - int64_t decode(ov::Tensor& encoder_hidden_state, - ov::InferRequest& decoder, - std::vector& input_ids, + std::shared_ptr decoder, + const std::vector& input_ids, + const size_t cache_position, const ov::genai::WhisperGenerationConfig& config, ov::genai::RawPerfMetrics& raw_metrics, - const bool apply_logit_processors = true, - const bool return_timestamps = false) { - decoder.set_tensor("encoder_hidden_states", ov::Tensor{encoder_hidden_state}); - - ov::Tensor input_ids_tensor(ov::element::i64, {1, input_ids.size()}, input_ids.data()); - decoder.set_tensor("input_ids", input_ids_tensor); - - ov::genai::utils::infer_with_perf_metrics(decoder, raw_metrics); - - auto output_tensor = decoder.get_tensor("logits"); + const bool return_timestamps, + const bool initial_step, + const std::vector& generated_tokens) { + auto [output_tensor, infer_ms] = decoder->decode(encoder_hidden_state, input_ids, cache_position); + const auto infer_end = std::chrono::steady_clock::now(); + raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms); + raw_metrics.m_token_infer_durations.emplace_back(infer_ms); + raw_metrics.m_new_token_times.emplace_back(infer_end); + raw_metrics.m_batch_sizes.emplace_back(1); - if (apply_logit_processors) { + if (initial_step) { ov::genai::do_suppress_tokens(output_tensor, 0, config.begin_suppress_tokens); - ov::genai::do_suppress_tokens(output_tensor, 0, config.suppress_tokens); - - if (return_timestamps) { - ov::genai::process_whisper_timestamp_logits(output_tensor, 0, config, {}, true); - } } - int64_t output_token = ov::genai::utils::argmax(output_tensor, 0); - - return output_token; -} - -int64_t decode_with_past(ov::Tensor& encoder_hidden_state, - ov::InferRequest& decoder_with_past, - int64_t input_id, - const size_t cache_position, - const ov::genai::WhisperGenerationConfig& config, - ov::genai::RawPerfMetrics& raw_metrics, - const bool return_timestamps, - const std::vector& generated_tokens) { - decoder_with_past.set_tensor("encoder_hidden_states", ov::Tensor{encoder_hidden_state}); - - std::vector input_ids = {input_id}; - ov::Tensor input_ids_tensor(ov::element::i64, {1, 1}, input_ids.data()); - decoder_with_past.set_tensor("input_ids", input_ids_tensor); - - ov::Tensor cache_position_tensor = decoder_with_past.get_tensor("cache_position"); - cache_position_tensor.set_shape({1}); - cache_position_tensor.data()[0] = cache_position; - - ov::genai::utils::infer_with_perf_metrics(decoder_with_past, raw_metrics); - - auto output_tensor = decoder_with_past.get_tensor("logits"); - ov::genai::do_suppress_tokens(output_tensor, 0, config.suppress_tokens); if (return_timestamps) { - ov::genai::process_whisper_timestamp_logits(output_tensor, 0, config, generated_tokens); + if (initial_step) { + ov::genai::process_whisper_timestamp_logits(output_tensor, 0, config, {}, true); + } else { + ov::genai::process_whisper_timestamp_logits(output_tensor, 0, config, generated_tokens); + } } int64_t output_token = ov::genai::utils::argmax(output_tensor, 0); @@ -143,34 +89,11 @@ int64_t decode_with_past(ov::Tensor& encoder_hidden_state, return output_token; } -int64_t detect_language(ov::Tensor& encoder_hidden_state, - ov::InferRequest& decoder, - const ov::genai::WhisperGenerationConfig& config, - ov::genai::RawPerfMetrics& raw_metrics) { - std::vector input_ids{config.decoder_start_token_id}; - - 
decoder.set_tensor("encoder_hidden_states", ov::Tensor{encoder_hidden_state}); - - ov::Tensor input_ids_tensor(ov::element::i64, {1, input_ids.size()}, input_ids.data()); - decoder.set_tensor("input_ids", input_ids_tensor); - - const auto infer_start = std::chrono::steady_clock::now(); - decoder.infer(); - const auto infer_ms = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start); - raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms); - - auto output_tensor = decoder.get_tensor("logits"); - - int64_t output_token = ov::genai::utils::argmax(output_tensor, 0); - - return output_token; -} - -std::vector prepare_init_tokens(ov::Tensor& encoder_hidden_state, - ov::InferRequest decoder, - const ov::genai::WhisperGenerationConfig& config, - const bool return_timestamps, - ov::genai::RawPerfMetrics& raw_metrics) { +std::vector prepare_init_ids(ov::Tensor& encoder_hidden_state, + std::shared_ptr decoder, + const ov::genai::WhisperGenerationConfig& config, + const bool return_timestamps, + ov::genai::RawPerfMetrics& raw_metrics) { if (!config.is_multilingual) { if (return_timestamps) { return std::vector{config.decoder_start_token_id}; @@ -186,7 +109,9 @@ std::vector prepare_init_tokens(ov::Tensor& encoder_hidden_state, language_token_id = config.lang_to_id.at(language); } } else { - language_token_id = detect_language(encoder_hidden_state, decoder, config, raw_metrics); + auto [language, infer_ms] = decoder->detect_language(encoder_hidden_state, config.decoder_start_token_id); + language_token_id = language; + raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms); } int64_t task_token_id = config.transcribe_token_id; @@ -206,14 +131,14 @@ std::vector prepare_init_tokens(ov::Tensor& encoder_hidden_state, std::pair> full_decode(ov::Tensor& encoder_hidden_state, const ov::genai::WhisperGenerationConfig& config, - ov::genai::WhisperInitializedModels& models, + std::shared_ptr decoder, std::vector init_ids, const size_t max_new_tokens, const bool return_timestamps, ov::genai::RawPerfMetrics& raw_metrics, const std::shared_ptr streamer) { int64_t output_token = - decode(encoder_hidden_state, models.decoder, init_ids, config, raw_metrics, true, return_timestamps); + decode(encoder_hidden_state, decoder, init_ids, 0, config, raw_metrics, return_timestamps, true, {}); std::vector output_tokens{output_token}; @@ -225,21 +150,16 @@ std::pair> full_decode(ov::Tensor& encoder_hidden_sta return {false, output_tokens}; } - set_past_key_value(models.decoder, models.decoder_with_past); - for (size_t i = 0; i < max_new_tokens - 1; i++) { - auto output_token = decode_with_past(encoder_hidden_state, - models.decoder_with_past, - output_tokens.back(), - init_ids.size() + i, - config, - raw_metrics, - return_timestamps, - output_tokens); - - if (i == 0) { - set_past_key_value(models.decoder_with_past, models.decoder_with_past); - } + auto output_token = decode(encoder_hidden_state, + decoder, + {output_tokens.back()}, + init_ids.size() + i, + config, + raw_metrics, + return_timestamps, + false, + output_tokens); if (output_token == config.eos_token_id) { break; @@ -264,7 +184,8 @@ WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig& const ov::genai::WhisperConfig& model_config, const WhisperContextTokens& context_tokens, const RawSpeechInput& raw_speech, - ov::genai::WhisperInitializedModels& models, + ov::InferRequest& encoder, + std::shared_ptr decoder, WhisperFeatureExtractor& feature_extractor, const std::shared_ptr streamer) { size_t 
max_new_tokens = config.get_max_new_tokens(); @@ -301,7 +222,7 @@ WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig& auto input_features_chunk = input_features.get_data_with_offset(chunk_offset, feature_extractor.nb_max_frames); - ov::Tensor hidden_state_tensor = encode(models.encoder, + ov::Tensor hidden_state_tensor = encode(encoder, input_features_chunk, feature_extractor.feature_size, feature_extractor.nb_max_frames, @@ -309,8 +230,7 @@ WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig& // prepare init_ids just once for whole input if (init_tokens.empty()) { - init_tokens = - prepare_init_tokens(hidden_state_tensor, models.decoder, config, return_timestamps, raw_metrics); + init_tokens = prepare_init_ids(hidden_state_tensor, decoder, config, return_timestamps, raw_metrics); } std::vector chunk_init_tokens = ov::genai::get_prompt_tokens(context_tokens, config, chunk_offset); @@ -318,14 +238,14 @@ WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig& auto [cancelled, chunk_output_tokens] = full_decode(hidden_state_tensor, config, - models, + decoder, chunk_init_tokens, max_new_tokens - output_tokens.size(), return_timestamps, raw_metrics, streamer); - models.decoder_with_past.reset_state(); + decoder->reset_state(); if (return_timestamps) { auto extracted_segments = ov::genai::extract_segments(chunk_output_tokens, @@ -333,7 +253,7 @@ WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig& feature_extractor.nb_max_frames, time_precision); - ov::genai::utils::filter_non_segment_metrics(raw_metrics, output_tokens.size(), extracted_segments.segment_ranges); + utils::filter_non_segment_metrics(raw_metrics, output_tokens.size(), extracted_segments.segment_ranges); segments.insert(segments.end(), extracted_segments.segments.begin(), extracted_segments.segments.end()); diff --git a/src/cpp/src/whisper/whisper.hpp b/src/cpp/src/whisper/whisper.hpp index 81f559db9f..fbdf56d171 100644 --- a/src/cpp/src/whisper/whisper.hpp +++ b/src/cpp/src/whisper/whisper.hpp @@ -6,6 +6,7 @@ #include #include "context_tokens.hpp" +#include "models/decoder.hpp" #include "openvino/genai/whisper_generation_config.hpp" #include "openvino/genai/whisper_pipeline.hpp" #include "whisper_config.hpp" @@ -30,9 +31,10 @@ struct WhisperGenerateResult { WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig& config, const ov::genai::WhisperConfig& model_config, const WhisperContextTokens& context_tokens, - const ov::genai::RawSpeechInput& raw_speech, - ov::genai::WhisperInitializedModels& models, - ov::genai::WhisperFeatureExtractor& feature_extractor, + const RawSpeechInput& raw_speech, + ov::InferRequest& encoder, + std::shared_ptr decoder, + WhisperFeatureExtractor& feature_extractor, const std::shared_ptr streamer); } // namespace genai diff --git a/src/cpp/src/whisper/whisper_models.hpp b/src/cpp/src/whisper/whisper_models.hpp index 576bdb9dc7..9a915e92f4 100644 --- a/src/cpp/src/whisper/whisper_models.hpp +++ b/src/cpp/src/whisper/whisper_models.hpp @@ -3,7 +3,7 @@ #pragma once -#include +#include namespace ov { namespace genai { diff --git a/src/cpp/src/whisper_pipeline.cpp b/src/cpp/src/whisper_pipeline.cpp index f0fb34cdf6..de9ca0e4e0 100644 --- a/src/cpp/src/whisper_pipeline.cpp +++ b/src/cpp/src/whisper_pipeline.cpp @@ -10,6 +10,7 @@ #include "utils.hpp" #include "whisper/context_tokens.hpp" +#include "whisper/models/decoder.hpp" #include "whisper/streamer.hpp" #include 
"whisper/whisper.hpp" #include "whisper/whisper_config.hpp" @@ -47,7 +48,8 @@ namespace genai { class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::WhisperPipelineImplBase { public: - ov::genai::WhisperInitializedModels m_models; + ov::InferRequest m_encoder; + std::shared_ptr m_decoder; WhisperPipelineStatefulImpl(const std::filesystem::path& models_path, const std::string& device, @@ -61,14 +63,9 @@ class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::Whi compiled_model = core.compile_model((models_path / "openvino_encoder_model.xml").string(), device, compile_properties); ov::genai::utils::print_compiled_model_properties(compiled_model, "whisper encoder model"); - m_models.encoder = compiled_model.create_infer_request(); - compiled_model = - core.compile_model((models_path / "openvino_decoder_model.xml").string(), device, compile_properties); - ov::genai::utils::print_compiled_model_properties(compiled_model, "whisper decoder model"); - m_models.decoder = compiled_model.create_infer_request(); - compiled_model = core.compile_model(models_path / "openvino_decoder_with_past_model.xml", device, compile_properties); - m_models.decoder_with_past = compiled_model.create_infer_request(); - ov::genai::utils::print_compiled_model_properties(compiled_model, "whisper decoder with past model"); + m_encoder = compiled_model.create_infer_request(); + + m_decoder = WhisperDecoder::from_path(models_path, device, compile_properties); // If eos_token_id was not provided, take value if (m_generation_config.eos_token_id == -1) { @@ -98,7 +95,8 @@ class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::Whi m_model_config, context_tokens, raw_speech_input, - m_models, + m_encoder, + m_decoder, m_feature_extractor, streamer_ptr); auto decode_start_time = std::chrono::steady_clock::now(); From b2df4a654733d54dbebde70f888f1b2f44f59d87 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Fri, 3 Jan 2025 15:39:40 +0100 Subject: [PATCH 02/27] remove reshape --- src/cpp/src/whisper/models/statefull_decoder.cpp | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/src/cpp/src/whisper/models/statefull_decoder.cpp b/src/cpp/src/whisper/models/statefull_decoder.cpp index 2f235178a4..bc2c91c91f 100644 --- a/src/cpp/src/whisper/models/statefull_decoder.cpp +++ b/src/cpp/src/whisper/models/statefull_decoder.cpp @@ -11,20 +11,7 @@ WhisperStatefullDecoder::WhisperStatefullDecoder(const std::filesystem::path& mo const ov::AnyMap& properties) { ov::Core core = utils::singleton_core(); - auto model = core.read_model((models_path / "openvino_decoder_model.xml").string()); - - // todo: remove once stateful model has dynamic input_ids seq_len - std::map name_to_shape; - for (const ov::Output& input : model->inputs()) { - ov::PartialShape shape = input.get_partial_shape(); - if (input.get_any_name().find("input_ids") != std::string::npos) { - shape[1] = -1; - name_to_shape[input.get_any_name()] = shape; - } - } - model->reshape(name_to_shape); - - auto compiled_model = core.compile_model(model, device, properties); + auto compiled_model = core.compile_model(models_path / "openvino_decoder_model.xml", device, properties); utils::print_compiled_model_properties(compiled_model, "whisper decoder model"); m_request = compiled_model.create_infer_request(); From 17e3ea750886c8aa6bdb9e79bc990862cd215dcc Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Fri, 3 Jan 2025 17:00:37 +0100 Subject: [PATCH 03/27] use stateful seq2seq barnch 
--- samples/export-requirements.txt | 2 +- src/cpp/src/whisper/models/decoder.cpp | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/samples/export-requirements.txt b/samples/export-requirements.txt index af38558656..2639f86890 100644 --- a/samples/export-requirements.txt +++ b/samples/export-requirements.txt @@ -2,7 +2,7 @@ --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly openvino-tokenizers~=2025.0.0.0.dev -optimum-intel @ git+https://github.com/huggingface/optimum-intel.git +optimum-intel @ git+https://github.com/eaidova/optimum-intel@ea/stateful_seq2seq numpy<2.0.0; sys_platform == 'darwin' einops==0.8.0 # For Qwen transformers_stream_generator==0.0.5 # For Qwen diff --git a/src/cpp/src/whisper/models/decoder.cpp b/src/cpp/src/whisper/models/decoder.cpp index 32a8f2eff6..9cc61d80f9 100644 --- a/src/cpp/src/whisper/models/decoder.cpp +++ b/src/cpp/src/whisper/models/decoder.cpp @@ -16,6 +16,7 @@ std::shared_ptr WhisperDecoder::from_path(const std::filesystem: bool has_decoder_with_past = std::filesystem::exists(models_path / "openvino_decoder_with_past_model.xml"); if (has_decoder_with_past) { + // todo: add deprecation notice return std::make_shared(models_path, device, properties); } From 806b01a97807ddc828feeab134e5c23678596b16 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Fri, 3 Jan 2025 17:07:53 +0100 Subject: [PATCH 04/27] Address review comments --- src/cpp/src/whisper/models/decoder.hpp | 12 +++--------- src/cpp/src/whisper/models/statefull_decoder.hpp | 6 +++--- src/cpp/src/whisper/models/with_past_decoder.hpp | 6 +++--- src/cpp/src/whisper_pipeline.cpp | 7 ++++--- 4 files changed, 13 insertions(+), 18 deletions(-) diff --git a/src/cpp/src/whisper/models/decoder.hpp b/src/cpp/src/whisper/models/decoder.hpp index d82ce5047a..cd58e54729 100644 --- a/src/cpp/src/whisper/models/decoder.hpp +++ b/src/cpp/src/whisper/models/decoder.hpp @@ -16,19 +16,13 @@ class WhisperDecoder { const ov::AnyMap& properties); virtual std::pair detect_language(const ov::Tensor& encoder_hidden_state, - const int64_t decoder_start_token_id) { - OPENVINO_THROW("detect_language method not implemented"); - }; + const int64_t decoder_start_token_id) = 0; virtual std::pair decode(const ov::Tensor& encoder_hidden_state, const std::vector& input_ids, - const size_t cache_position) { - OPENVINO_THROW("decode method not implemented"); - }; + const size_t cache_position) = 0; - virtual void reset_state() { - OPENVINO_THROW("reset_state method not implemented"); - } + virtual void reset_state() = 0; virtual ~WhisperDecoder(); }; diff --git a/src/cpp/src/whisper/models/statefull_decoder.hpp b/src/cpp/src/whisper/models/statefull_decoder.hpp index 569031b9fa..6f1c9eb002 100644 --- a/src/cpp/src/whisper/models/statefull_decoder.hpp +++ b/src/cpp/src/whisper/models/statefull_decoder.hpp @@ -10,9 +10,9 @@ namespace ov::genai { class WhisperStatefullDecoder : public WhisperDecoder { public: - explicit WhisperStatefullDecoder(const std::filesystem::path& models_path, - const std::string& device, - const ov::AnyMap& properties); + WhisperStatefullDecoder(const std::filesystem::path& models_path, + const std::string& device, + const ov::AnyMap& properties); std::pair detect_language(const ov::Tensor& encoder_hidden_state, const int64_t decoder_start_token_id) override; diff --git a/src/cpp/src/whisper/models/with_past_decoder.hpp b/src/cpp/src/whisper/models/with_past_decoder.hpp index 
b6f6924af6..c7af1cdaa2 100644 --- a/src/cpp/src/whisper/models/with_past_decoder.hpp +++ b/src/cpp/src/whisper/models/with_past_decoder.hpp @@ -10,9 +10,9 @@ namespace ov::genai { class WhisperWithPastDecoder : public WhisperDecoder { public: - explicit WhisperWithPastDecoder(const std::filesystem::path& models_path, - const std::string& device, - const ov::AnyMap& properties); + WhisperWithPastDecoder(const std::filesystem::path& models_path, + const std::string& device, + const ov::AnyMap& properties); std::pair detect_language(const ov::Tensor& encoder_hidden_state, const int64_t decoder_start_token_id) override; diff --git a/src/cpp/src/whisper_pipeline.cpp b/src/cpp/src/whisper_pipeline.cpp index b7c474258e..ffd792c889 100644 --- a/src/cpp/src/whisper_pipeline.cpp +++ b/src/cpp/src/whisper_pipeline.cpp @@ -48,9 +48,6 @@ namespace genai { class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::WhisperPipelineImplBase { public: - ov::InferRequest m_encoder; - std::shared_ptr m_decoder; - WhisperPipelineStatefulImpl(const std::filesystem::path& models_path, const std::string& device, const ov::AnyMap& properties) @@ -134,6 +131,10 @@ class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::Whi return result; } + +private: + ov::InferRequest m_encoder; + std::shared_ptr m_decoder; }; std::pair streamer(ChunkStreamerVariant func) { From e041a33d1fb5d724efad3803409df98e598f07dd Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Fri, 3 Jan 2025 17:37:33 +0100 Subject: [PATCH 05/27] Rename --- src/cpp/src/whisper/whisper.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/cpp/src/whisper/whisper.cpp b/src/cpp/src/whisper/whisper.cpp index 9dffe01bd2..3ab873609d 100644 --- a/src/cpp/src/whisper/whisper.cpp +++ b/src/cpp/src/whisper/whisper.cpp @@ -89,11 +89,11 @@ int64_t decode(ov::Tensor& encoder_hidden_state, return output_token; } -std::vector prepare_init_ids(ov::Tensor& encoder_hidden_state, - std::shared_ptr decoder, - const ov::genai::WhisperGenerationConfig& config, - const bool return_timestamps, - ov::genai::RawPerfMetrics& raw_metrics) { +std::vector prepare_init_tokens(ov::Tensor& encoder_hidden_state, + std::shared_ptr decoder, + const ov::genai::WhisperGenerationConfig& config, + const bool return_timestamps, + ov::genai::RawPerfMetrics& raw_metrics) { if (!config.is_multilingual) { if (return_timestamps) { return std::vector{config.decoder_start_token_id}; @@ -109,8 +109,8 @@ std::vector prepare_init_ids(ov::Tensor& encoder_hidden_state, language_token_id = config.lang_to_id.at(language); } } else { - auto [language, infer_ms] = decoder->detect_language(encoder_hidden_state, config.decoder_start_token_id); - language_token_id = language; + auto [language_token, infer_ms] = decoder->detect_language(encoder_hidden_state, config.decoder_start_token_id); + language_token_id = language_token; raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms); } @@ -132,13 +132,13 @@ std::vector prepare_init_ids(ov::Tensor& encoder_hidden_state, std::pair> full_decode(ov::Tensor& encoder_hidden_state, const ov::genai::WhisperGenerationConfig& config, std::shared_ptr decoder, - std::vector init_ids, + const std::vector& init_tokens, const size_t max_new_tokens, const bool return_timestamps, ov::genai::RawPerfMetrics& raw_metrics, const std::shared_ptr streamer) { int64_t output_token = - decode(encoder_hidden_state, decoder, init_ids, 0, config, raw_metrics, return_timestamps, true, {}); + 
decode(encoder_hidden_state, decoder, init_tokens, 0, config, raw_metrics, return_timestamps, true, {}); std::vector output_tokens{output_token}; @@ -154,7 +154,7 @@ std::pair> full_decode(ov::Tensor& encoder_hidden_sta auto output_token = decode(encoder_hidden_state, decoder, {output_tokens.back()}, - init_ids.size() + i, + init_tokens.size() + i, config, raw_metrics, return_timestamps, @@ -228,9 +228,9 @@ WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig& feature_extractor.nb_max_frames, raw_metrics); - // prepare init_ids just once for whole input + // prepare init_tokens just once for whole input if (init_tokens.empty()) { - init_tokens = prepare_init_ids(hidden_state_tensor, decoder, config, return_timestamps, raw_metrics); + init_tokens = prepare_init_tokens(hidden_state_tensor, decoder, config, return_timestamps, raw_metrics); } std::vector chunk_init_tokens = ov::genai::get_prompt_tokens(context_tokens, config, chunk_offset); From 9502d9b2b4420190d5c7f6a2fcc5271994f7424f Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Fri, 3 Jan 2025 17:43:41 +0100 Subject: [PATCH 06/27] Use commit --- samples/export-requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/export-requirements.txt b/samples/export-requirements.txt index 2639f86890..aa57675218 100644 --- a/samples/export-requirements.txt +++ b/samples/export-requirements.txt @@ -2,7 +2,7 @@ --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly openvino-tokenizers~=2025.0.0.0.dev -optimum-intel @ git+https://github.com/eaidova/optimum-intel@ea/stateful_seq2seq +optimum-intel @ git+https://github.com/eaidova/optimum-intel@770001994b08893c6611de7e569bfea2a7bac4f9 numpy<2.0.0; sys_platform == 'darwin' einops==0.8.0 # For Qwen transformers_stream_generator==0.0.5 # For Qwen From 6c30fa46398fe193d904e153a0d623c761c35294 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Fri, 3 Jan 2025 18:19:58 +0100 Subject: [PATCH 07/27] Set tests reqs --- samples/export-requirements.txt | 2 +- tests/python_tests/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/samples/export-requirements.txt b/samples/export-requirements.txt index aa57675218..af38558656 100644 --- a/samples/export-requirements.txt +++ b/samples/export-requirements.txt @@ -2,7 +2,7 @@ --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly openvino-tokenizers~=2025.0.0.0.dev -optimum-intel @ git+https://github.com/eaidova/optimum-intel@770001994b08893c6611de7e569bfea2a7bac4f9 +optimum-intel @ git+https://github.com/huggingface/optimum-intel.git numpy<2.0.0; sys_platform == 'darwin' einops==0.8.0 # For Qwen transformers_stream_generator==0.0.5 # For Qwen diff --git a/tests/python_tests/requirements.txt b/tests/python_tests/requirements.txt index c851c71ee5..78cacd61ae 100644 --- a/tests/python_tests/requirements.txt +++ b/tests/python_tests/requirements.txt @@ -1,6 +1,6 @@ --extra-index-url https://download.pytorch.org/whl/cpu diffusers==0.32.1 -optimum-intel @ git+https://github.com/huggingface/optimum-intel.git +optimum-intel @ git+https://github.com/eaidova/optimum-intel@ea/stateful_seq2seq numpy<2.0.0; platform_system == "Darwin" and platform_machine == "x86_64" onnx==1.17.0 pytest From e38cf5c37ee944dbc87f749e9696375a6beb6b2f Mon Sep 17 00:00:00 2001 From: Alexander Suvorov 
Date: Thu, 9 Jan 2025 15:52:37 +0100 Subject: [PATCH 08/27] Add with_past model tests --- tests/python_tests/test_whisper_pipeline.py | 185 ++++++++++++-------- 1 file changed, 114 insertions(+), 71 deletions(-) diff --git a/tests/python_tests/test_whisper_pipeline.py b/tests/python_tests/test_whisper_pipeline.py index c046d1ae2c..06d5e56b3c 100644 --- a/tests/python_tests/test_whisper_pipeline.py +++ b/tests/python_tests/test_whisper_pipeline.py @@ -52,50 +52,25 @@ def get_whisper_models_list(tiny_only=False): # used whisper models are relatively small # cache them in memory to speedup tests @functools.lru_cache() -def read_whisper_model(params, **tokenizer_kwargs): +def read_whisper_model(params, stateful=True): model_id, path = params + if not stateful: + path = pathlib.Path(f"{path}_with_past") - processor = WhisperProcessor.from_pretrained(model_id, trust_remote_code=True) - - if (path / "openvino_encoder_model.xml").exists(): - opt_model = OVModelForSpeechSeq2Seq.from_pretrained( - path, - trust_remote_code=True, - compile=False, - device="CPU", - load_in_8bit=False, - ) - else: - - tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) - ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer( - tokenizer, - with_detokenizer=True, - clean_up_tokenization_spaces=False, - **tokenizer_kwargs, - ) + if not (path / "openvino_encoder_model.xml").exists(): + save_model(model_id=model_id, tmp_path=path, stateful=stateful) - openvino.save_model(ov_tokenizer, path / "openvino_tokenizer.xml") - openvino.save_model(ov_detokenizer, path / "openvino_detokenizer.xml") - - # to store tokenizer config jsons with special tokens - tokenizer.save_pretrained(path) + opt_model = OVModelForSpeechSeq2Seq.from_pretrained( + path, + trust_remote_code=True, + compile=False, + device="CPU", + load_in_8bit=False, + ) - opt_model = OVModelForSpeechSeq2Seq.from_pretrained( - model_id, - export=True, - trust_remote_code=True, - stateful=False, - compile=False, - device="CPU", - load_in_8bit=False, - ) - opt_model.generation_config.save_pretrained(path) - opt_model.config.save_pretrained(path) - opt_model.save_pretrained(path) - processor.save_pretrained(path) + processor = WhisperProcessor.from_pretrained(model_id, trust_remote_code=True) - opt_pipe = pipeline( + hf_pipe = pipeline( "automatic-speech-recognition", model=opt_model, tokenizer=processor.tokenizer, @@ -105,11 +80,42 @@ def read_whisper_model(params, **tokenizer_kwargs): return ( model_id, path, - opt_pipe, + hf_pipe, ov_genai.WhisperPipeline(path, "CPU", **{"ENABLE_MMAP": False}), ) +def save_model(model_id: str, tmp_path: pathlib.Path, stateful=True): + tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) + ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer( + tokenizer, + with_detokenizer=True, + clean_up_tokenization_spaces=False, + ) + + openvino.save_model(ov_tokenizer, tmp_path / "openvino_tokenizer.xml") + openvino.save_model(ov_detokenizer, tmp_path / "openvino_detokenizer.xml") + + # to store tokenizer config jsons with special tokens + tokenizer.save_pretrained(tmp_path) + + opt_model = OVModelForSpeechSeq2Seq.from_pretrained( + model_id, + export=True, + trust_remote_code=True, + stateful=stateful, + compile=False, + device="CPU", + load_in_8bit=False, + ) + opt_model.generation_config.save_pretrained(tmp_path) + opt_model.config.save_pretrained(tmp_path) + opt_model.save_pretrained(tmp_path) + + processor = WhisperProcessor.from_pretrained(model_id, 
trust_remote_code=True) + processor.save_pretrained(tmp_path) + + def run_huggingface( pipeline, sample, @@ -179,6 +185,9 @@ def run_pipeline_with_ref( streamer: typing.Callable[[str], bool] | None = None, ): _, _, hf_pipe, genai_pipe = read_whisper_model((model_id, tmp_path)) + _, _, hf_with_past_pipe, genai_with_past_pipe = read_whisper_model( + (model_id, tmp_path), stateful=False + ) if type(sample) is np.ndarray and len(sample.shape) == 1: sample = np.expand_dims(sample, 0) @@ -189,6 +198,12 @@ def run_pipeline_with_ref( compare_results(hf_result, genai_result) + genai_with_past_result = run_genai( + genai_with_past_pipe, _sample, generation_config, streamer + ) + + compare_results(hf_result, genai_with_past_result) + def compare_results(hf_result, genai_result): assert genai_result.texts[0] == hf_result["text"] @@ -274,9 +289,9 @@ def test_whisper_config_constructor(model_descr): @pytest.mark.parametrize("test_sample", get_samples_from_dataset(length=1)) @pytest.mark.precommit def test_whisper_constructors(model_descr, test_sample): - model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) + model_id, path, hf_pipe, genai_pipe = read_whisper_model(model_descr) - expected = opt_pipe(test_sample)["text"] + expected = hf_pipe(test_sample)["text"] genai_result = ov_genai.WhisperPipeline( models_path=path, device="CPU", **{"ENABLE_MMAP": False} @@ -294,17 +309,17 @@ def test_whisper_constructors(model_descr, test_sample): @pytest.mark.parametrize("test_sample", get_samples_from_dataset(length=1)) @pytest.mark.precommit def test_max_new_tokens(model_descr, test_sample): - model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) + model_id, path, hf_pipe, genai_pipe = read_whisper_model(model_descr) - expected = opt_pipe(test_sample, max_new_tokens=10) + expected = hf_pipe(test_sample, max_new_tokens=10) - genai_result = pipe.generate(test_sample, max_new_tokens=10) + genai_result = genai_pipe.generate(test_sample, max_new_tokens=10) compare_results(expected, genai_result) - config = pipe.get_generation_config() + config = genai_pipe.get_generation_config() config.max_new_tokens = 10 - genai_result = pipe.generate(test_sample, config) + genai_result = genai_pipe.generate(test_sample, config) compare_results(expected, genai_result) @@ -318,23 +333,23 @@ def test_max_new_tokens(model_descr, test_sample): ) @pytest.mark.precommit def test_language_mode(model_descr, test_samples): - model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) + model_id, path, hf_pipe, genai_pipe = read_whisper_model(model_descr) samples, language = test_samples - expected = opt_pipe( + expected = hf_pipe( samples[0], max_new_tokens=30, generate_kwargs={"language": language} ) - genai_result = pipe.generate( + genai_result = genai_pipe.generate( samples[0], max_new_tokens=30, language=f"<|{language}|>" ) compare_results(expected, genai_result) - config = pipe.get_generation_config() + config = genai_pipe.get_generation_config() config.max_new_tokens = 30 config.language = f"<|{language}|>" - genai_result = pipe.generate(samples[0], config) + genai_result = genai_pipe.generate(samples[0], config) compare_results(expected, genai_result) @@ -345,46 +360,46 @@ def test_language_mode(model_descr, test_samples): ) @pytest.mark.precommit def test_task_mode(model_descr, test_sample): - model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) + model_id, path, hf_pipe, genai_pipe = read_whisper_model(model_descr) - expected = opt_pipe( + expected = hf_pipe( test_sample, max_new_tokens=30, 
generate_kwargs={"language": "fr", "task": "translate"}, ) - genai_result = pipe.generate( + genai_result = genai_pipe.generate( test_sample, max_new_tokens=30, language="<|fr|>", task="translate" ) compare_results(expected, genai_result) - config = pipe.get_generation_config() + config = genai_pipe.get_generation_config() config.max_new_tokens = 30 config.language = "<|fr|>" config.task = "translate" - genai_result = pipe.generate(test_sample, config) + genai_result = genai_pipe.generate(test_sample, config) compare_results(expected, genai_result) # seems to be equivalent to translate task - expected = opt_pipe( + expected = hf_pipe( test_sample, max_new_tokens=30, generate_kwargs={"language": "en", "task": "transcribe"}, ) - genai_result = pipe.generate( + genai_result = genai_pipe.generate( test_sample, max_new_tokens=30, language="<|en|>", task="transcribe" ) compare_results(expected, genai_result) - config = pipe.get_generation_config() + config = genai_pipe.get_generation_config() config.max_new_tokens = 30 config.language = "<|en|>" config.task = "transcribe" - genai_result = pipe.generate(test_sample, config) + genai_result = genai_pipe.generate(test_sample, config) compare_results(expected, genai_result) @@ -400,12 +415,12 @@ def test_task_mode(model_descr, test_sample): ) @pytest.mark.precommit def test_language_autodetect(model_descr, test_sample): - model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) + model_id, path, hf_pipe, genai_pipe = read_whisper_model(model_descr) - input_features = opt_pipe.feature_extractor(test_sample) - language_id = opt_pipe.model.detect_language(input_features["input_features"])[0] + input_features = hf_pipe.feature_extractor(test_sample) + language_id = hf_pipe.model.detect_language(input_features["input_features"])[0] # ensure detected language us not english - assert language_id != pipe.get_generation_config().lang_to_id["<|en|>"] + assert language_id != genai_pipe.get_generation_config().lang_to_id["<|en|>"] run_pipeline_with_ref( model_id=model_descr[0], @@ -469,6 +484,34 @@ def test_longform_audio(model_descr, test_sample): assert "".join(streamer_result) == hf_result["text"] +@pytest.mark.parametrize("model_descr", get_whisper_models_list()) +@pytest.mark.parametrize( + "test_sample", get_samples_from_dataset(length=10, long_form=True) +) +@pytest.mark.precommit +def test_longform_audio_with_past(model_descr, test_sample): + _, _, hf_pipe, genai_pipe = read_whisper_model(model_descr, stateful=True) + + streamer_result = [] + + genai_result = run_genai( + genai_pipe, + test_sample, + config=ov_genai.WhisperGenerationConfig(return_timestamps=True), + streamer=lambda x: streamer_result.append(x), + ) + + hf_result = run_huggingface( + hf_pipe, + test_sample, + config=ov_genai.WhisperGenerationConfig(return_timestamps=True), + ) + + compare_results(hf_result, genai_result) + + assert "".join(streamer_result) == hf_result["text"] + + @pytest.mark.parametrize("model_descr", get_whisper_models_list()) @pytest.mark.precommit def test_shortform(model_descr): @@ -494,19 +537,19 @@ def test_shortform(model_descr): ) @pytest.mark.precommit def test_initial_prompt_hotwords(model_descr, test_sample): - model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) + model_id, path, hf_pipe, genai_pipe = read_whisper_model(model_descr) - result = pipe.generate(test_sample) + result = genai_pipe.generate(test_sample) assert "Joel Keaton" in result.texts[0] assert "Joel Kyton" not in result.texts[0] - result = pipe.generate(test_sample, 
initial_prompt="Joel Kyton") + result = genai_pipe.generate(test_sample, initial_prompt="Joel Kyton") assert "Joel Keaton" not in result.texts[0] assert "Joel Kyton" in result.texts[0] - result = pipe.generate(test_sample, hotwords="Joel Kyton") + result = genai_pipe.generate(test_sample, hotwords="Joel Kyton") assert "Joel Keaton" not in result.texts[0] assert "Joel Kyton" in result.texts[0] @@ -521,9 +564,9 @@ def test_initial_prompt_hotwords(model_descr, test_sample): ) @pytest.mark.precommit def test_perf_metrics(model_descr, test_sample): - model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) + model_id, path, hf_pipe, genai_pipe = read_whisper_model(model_descr) - result = pipe.generate(test_sample) + result = genai_pipe.generate(test_sample) perf_metrics = result.perf_metrics From acc656ff4dfcdba87332d8d340fb5b1a100d1fcc Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Thu, 9 Jan 2025 16:00:17 +0100 Subject: [PATCH 09/27] remove comment --- src/cpp/src/whisper/models/decoder.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/cpp/src/whisper/models/decoder.cpp b/src/cpp/src/whisper/models/decoder.cpp index 9cc61d80f9..32a8f2eff6 100644 --- a/src/cpp/src/whisper/models/decoder.cpp +++ b/src/cpp/src/whisper/models/decoder.cpp @@ -16,7 +16,6 @@ std::shared_ptr WhisperDecoder::from_path(const std::filesystem: bool has_decoder_with_past = std::filesystem::exists(models_path / "openvino_decoder_with_past_model.xml"); if (has_decoder_with_past) { - // todo: add deprecation notice return std::make_shared(models_path, device, properties); } From 3728884db607d27458d2e50e2529a26f1a58fece Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Thu, 9 Jan 2025 16:06:37 +0100 Subject: [PATCH 10/27] bump tokenizers --- thirdparty/openvino_tokenizers | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers index bcfd3eda25..d5f0abf827 160000 --- a/thirdparty/openvino_tokenizers +++ b/thirdparty/openvino_tokenizers @@ -1 +1 @@ -Subproject commit bcfd3eda25ae3ec423502a4074e35c774506c732 +Subproject commit d5f0abf8271f3cd8fc98d747b3e569fbeacca532 From 445ce5a507c24df5e701f194e9d2f883db2d1e7a Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Thu, 9 Jan 2025 16:25:50 +0100 Subject: [PATCH 11/27] Fix typo --- src/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/README.md b/src/README.md index 6466b431d0..ffacd420e4 100644 --- a/src/README.md +++ b/src/README.md @@ -179,7 +179,7 @@ int main(int argc, char* argv[]) { Streaming with a custom class: -C++ template for a stremer. +C++ template for a streamer. ```cpp #include "openvino/genai/streamer_base.hpp" #include "openvino/genai/llm_pipeline.hpp" From 5bdd69561c9a79240e995a958cf5d63eb8578641 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Fri, 10 Jan 2025 09:37:34 +0100 Subject: [PATCH 12/27] Add deprecation message --- .github/workflows/windows.yml | 3 +-- src/cpp/src/logger.hpp | 17 +++++++++++++++++ .../src/whisper/models/with_past_decoder.cpp | 5 +++++ 3 files changed, 23 insertions(+), 2 deletions(-) create mode 100644 src/cpp/src/logger.hpp diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index e65972110b..835fd924ca 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -311,10 +311,9 @@ jobs: python -m pip install . 
--verbose --find-links ${env:OV_INSTALL_DIR}/wheels python -m pip install ./tools/who_what_benchmark --find-links ${env:OV_INSTALL_DIR}/wheels - # will install transformers 4.46.3 version # transformers 4.46.3 will enable return_timestamps tests # this check enabled for windows only. Ticket: 160205. - python -m pip install git+https://github.com/huggingface/optimum-intel.git@753f84db6e0966580eb9eaa74a808213be730631 + python -m pip install transformers==4.46.3 python -m pytest -v ./tests/python_tests/test_whisper_pipeline.py -k "not test_smoke" diff --git a/src/cpp/src/logger.hpp b/src/cpp/src/logger.hpp new file mode 100644 index 0000000000..503a419e5e --- /dev/null +++ b/src/cpp/src/logger.hpp @@ -0,0 +1,17 @@ +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include +#include + +namespace ov::genai { + +class Logger { +public: + static void warn(std::string message) { + std::cout << "[WARN] " << message << '\n'; + }; +}; + +} // namespace ov::genai diff --git a/src/cpp/src/whisper/models/with_past_decoder.cpp b/src/cpp/src/whisper/models/with_past_decoder.cpp index a32c9a45be..7f62ea5657 100644 --- a/src/cpp/src/whisper/models/with_past_decoder.cpp +++ b/src/cpp/src/whisper/models/with_past_decoder.cpp @@ -5,6 +5,7 @@ #include +#include "logger.hpp" #include "utils.hpp" namespace { @@ -40,6 +41,10 @@ namespace ov::genai { WhisperWithPastDecoder::WhisperWithPastDecoder(const std::filesystem::path& models_path, const std::string& device, const ov::AnyMap& properties) { + Logger::warn("Whisper decoder models with past is deprecated. Support will be removed in 2026.0.0 release.\n" + "To obtain stateful decoder model use latest `optimum-intel` package:\n" + "pip install optimum-intel@git+https://github.com/huggingface/optimum-intel.git\n" + "optimum-cli export openvino --trust-remote-code --model openai/whisper-tiny whisper-tiny"); ov::Core core = utils::singleton_core(); auto compiled_model = core.compile_model(models_path / "openvino_decoder_model.xml", device, properties); From 7f2a1532b9ea120fbcc842a47c30f0f3c5e63655 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Tue, 21 Jan 2025 14:34:21 +0100 Subject: [PATCH 13/27] Use sampler for whisper pipeline --- .../genai/whisper_generation_config.hpp | 23 +- src/cpp/src/debug_utils.hpp | 3 +- src/cpp/src/lm_encoding.cpp | 12 +- src/cpp/src/logger.hpp | 2 +- src/cpp/src/sampler.cpp | 16 +- src/cpp/src/sampler.hpp | 2 +- src/cpp/src/utils.cpp | 17 -- src/cpp/src/utils.hpp | 2 - src/cpp/src/whisper/logit_processor.cpp | 1 - src/cpp/src/whisper/models/decoder.hpp | 8 +- .../src/whisper/models/statefull_decoder.cpp | 99 +++++-- .../src/whisper/models/statefull_decoder.hpp | 13 +- .../src/whisper/models/with_past_decoder.cpp | 107 ------- .../src/whisper/models/with_past_decoder.hpp | 15 +- src/cpp/src/whisper/whisper.cpp | 260 +++++++++++------- src/cpp/src/whisper/whisper.hpp | 4 +- src/cpp/src/whisper_generation_config.cpp | 50 +--- src/cpp/src/whisper_pipeline.cpp | 9 +- 18 files changed, 317 insertions(+), 326 deletions(-) delete mode 100644 src/cpp/src/whisper/models/with_past_decoder.cpp diff --git a/src/cpp/include/openvino/genai/whisper_generation_config.hpp b/src/cpp/include/openvino/genai/whisper_generation_config.hpp index 44d611923d..18f4cfb45d 100644 --- a/src/cpp/include/openvino/genai/whisper_generation_config.hpp +++ b/src/cpp/include/openvino/genai/whisper_generation_config.hpp @@ -6,6 +6,7 @@ #include #include +#include "generation_config.hpp" #include 
"openvino/genai/tokenizer.hpp" #include "openvino/runtime/compiled_model.hpp" @@ -15,28 +16,14 @@ namespace genai { /** * @brief Structure to keep whisper generation config parameters. */ -class OPENVINO_GENAI_EXPORTS WhisperGenerationConfig { +class OPENVINO_GENAI_EXPORTS WhisperGenerationConfig : public GenerationConfig { public: WhisperGenerationConfig() = default; explicit WhisperGenerationConfig(const std::filesystem::path& json_path); - // Generic - - // the maximum length the generated tokens can have. Corresponds to the length of the input prompt + - // `max_new_tokens`. Its effect is overridden by `max_new_tokens`, if also set. - size_t max_new_tokens = SIZE_MAX; - // the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. - // max_new_tokens has priority over max_length. - size_t max_length = SIZE_MAX; - - // Whisper specific - // Corresponds to the ”<|startoftranscript|>” token. int64_t decoder_start_token_id = 50258; - // End of stream token id. - int64_t eos_token_id = 50257; - // Padding token id. int64_t pad_token_id = 50257; @@ -110,12 +97,6 @@ class OPENVINO_GENAI_EXPORTS WhisperGenerationConfig { // A list containing the non-speech tokens that will be suppressed during generation. std::vector suppress_tokens; - /** @brief sets eos_token_id to tokenizer_eos_token_id if eos_token_id is less than 0. - * Otherwise verifies eos_token_id == tokenizer_eos_token_id. - */ - void set_eos_token_id(int64_t tokenizer_eos_token_id); - size_t get_max_new_tokens(size_t prompt_length = 0) const; - void update_generation_config(const ov::AnyMap& config_map = {}); template diff --git a/src/cpp/src/debug_utils.hpp b/src/cpp/src/debug_utils.hpp index 415f8c0480..1b76a818b3 100644 --- a/src/cpp/src/debug_utils.hpp +++ b/src/cpp/src/debug_utils.hpp @@ -12,7 +12,7 @@ template void print_array(T * array, size_t size) { std::cout << " => [ "; - for (size_t i = 0; i < size; ++i) { + for (size_t i = 0; i < std::min(size, size_t(10)); ++i) { std::cout << array[i] << " "; } std::cout << " ] " << std::endl; @@ -20,6 +20,7 @@ void print_array(T * array, size_t size) { inline void print_tensor(std::string name, ov::Tensor tensor) { std::cout << name; + std::cout << " " << tensor.get_shape().to_string(); if (tensor.get_element_type() == ov::element::i32) { print_array(tensor.data(), tensor.get_size()); } else if (tensor.get_element_type() == ov::element::i64) { diff --git a/src/cpp/src/lm_encoding.cpp b/src/cpp/src/lm_encoding.cpp index 9ef876d8aa..a326f8748c 100644 --- a/src/cpp/src/lm_encoding.cpp +++ b/src/cpp/src/lm_encoding.cpp @@ -14,10 +14,13 @@ #include "lm_encoding.hpp" #include "openvino/genai/perf_metrics.hpp" +namespace { -namespace ov { -namespace genai { - +/** + * Set position ids tensor data for next token inference based on provided attention mask + * Supports multi batch + * Supports sparse attention_mask + */ void update_position_ids(ov::Tensor&& position_ids, const ov::Tensor&& attention_mask) { const size_t batch_size = attention_mask.get_shape().at(0); const size_t sequence_length = attention_mask.get_shape().at(1); @@ -48,7 +51,10 @@ void update_attention_mask_with_beams(ov::Tensor&& attention_mask, std::vector()[result_prompt_offset + new_shape.at(1) - 1] = 1; } } +} +namespace ov { +namespace genai { std::pair> get_lm_encoded_results( ov::InferRequest& m_llm, diff --git a/src/cpp/src/logger.hpp b/src/cpp/src/logger.hpp index 503a419e5e..fbf0657a87 100644 --- a/src/cpp/src/logger.hpp +++ b/src/cpp/src/logger.hpp @@ -9,7 +9,7 @@ namespace ov::genai { 
class Logger { public: - static void warn(std::string message) { + static void warn(const std::string& message) { std::cout << "[WARN] " << message << '\n'; }; }; diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index 54850f657b..1c0dd504fe 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -407,7 +407,7 @@ void Sampler::GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, } // check whether group has finished - group.is_done(m_parameters); + group.is_done(m_parameters, m_sequence_group->get_prompt_len()); // group cannot continue if there are no valid child beams if (child_beams_per_group[group_id].size() == 0) { @@ -548,14 +548,14 @@ std::vector Sampler::_try_finish_generation(SequenceGroup::Ptr & sequen std::vector dropped_seq_ids; for (auto& running_sequence : sequence_group->get_running_sequences()) { const auto generated_len = running_sequence->get_generated_len(); - if (sampling_params.max_new_tokens <= generated_len || + if (sampling_params.get_max_new_tokens(sequence_group->get_prompt_len()) <= generated_len || is_stop_token_id_hit(running_sequence->get_generated_ids().back(), sampling_params.stop_token_ids) && !sampling_params.ignore_eos) { // stop sequence by max_new_tokens or stop token (eos included) running_sequence->set_status(SequenceStatus::FINISHED); if (is_stop_token_id_hit(running_sequence->get_generated_ids().back(), sampling_params.stop_token_ids) && !sampling_params.ignore_eos) { running_sequence->set_finish_reason(GenerationFinishReason::STOP); - } else if (sampling_params.max_new_tokens == generated_len) { + } else if (sampling_params.get_max_new_tokens(sequence_group->get_prompt_len()) == generated_len) { running_sequence->set_finish_reason(GenerationFinishReason::LENGTH); } @@ -798,8 +798,8 @@ SamplerOutput Sampler::sample(const std::vector & sequence_g // max counter of needed to be sampled tokens OPENVINO_ASSERT(running_sequence->get_generated_len() >= token_offset); size_t generated_and_verified_len = running_sequence->get_generated_len() - token_offset; - OPENVINO_ASSERT(sampling_params.max_new_tokens >= generated_and_verified_len); - size_t max_num_sampled_token = sampling_params.max_new_tokens - generated_and_verified_len; + OPENVINO_ASSERT(sampling_params.get_max_new_tokens(sequence_group->get_prompt_len()) >= generated_and_verified_len); + size_t max_num_sampled_token = sampling_params.get_max_new_tokens(sequence_group->get_prompt_len()) - generated_and_verified_len; if (max_num_sampled_token == 0) { stop_sample_tokens(running_sequence, token_offset, max_num_sampled_token, max_removed_tokens_per_request); break; @@ -885,7 +885,7 @@ SamplerOutput Sampler::sample(const std::vector & sequence_g // check max length stop criteria std::vector running_sequences = sequence_group->get_running_sequences(); if (!sequence_group->has_finished() && - running_sequences[0]->get_generated_len() == sampling_params.max_new_tokens) { + running_sequences[0]->get_generated_len() == sampling_params.get_max_new_tokens(sequence_group->get_prompt_len())) { // stop sequence by max_new_tokens m_beam_search_info.at(request_id).finalize(sampler_output); } @@ -954,7 +954,7 @@ int64_t Sampler::GroupBeamSearcher::Group::finish(Beam beam, const ov::genai::Ge return preeempted_sequence_id; } -void Sampler::GroupBeamSearcher::Group::is_done(const ov::genai::GenerationConfig& sampling_params) { +void Sampler::GroupBeamSearcher::Group::is_done(const ov::genai::GenerationConfig& sampling_params, size_t prompt_length) { 
assert(sampling_params.num_beams % sampling_params.num_beam_groups == 0 && "number of beams should be divisible by number of groups"); size_t group_size = sampling_params.num_beams / sampling_params.num_beam_groups; @@ -975,7 +975,7 @@ void Sampler::GroupBeamSearcher::Group::is_done(const ov::genai::GenerationConfi return; } case ov::genai::StopCriteria::NEVER: { - size_t length = sampling_params.length_penalty > 0.0 ? sampling_params.max_new_tokens : cur_len; + size_t length = sampling_params.length_penalty > 0.0 ? sampling_params.get_max_new_tokens(prompt_length) : cur_len; float highest_attainable_score = best_sum_logprobs / std::pow(float(length), sampling_params.length_penalty); done = worst_score >= highest_attainable_score; return; diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp index 7796f93d1e..1872b6c1e4 100644 --- a/src/cpp/src/sampler.hpp +++ b/src/cpp/src/sampler.hpp @@ -112,7 +112,7 @@ class Sampler::GroupBeamSearcher { bool done = false; int64_t finish(Beam beam, const ov::genai::GenerationConfig& sampling_params); - void is_done(const ov::genai::GenerationConfig& sampling_params); + void is_done(const ov::genai::GenerationConfig& sampling_params, size_t prompt_length); }; SequenceGroup::Ptr m_sequence_group; diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp index 9261aa7a4a..c73e47f153 100644 --- a/src/cpp/src/utils.cpp +++ b/src/cpp/src/utils.cpp @@ -128,23 +128,6 @@ void set_attention_mask(ov::Tensor&& attention_mask, std::vector next_b } } -/** - * Set position ids tensor data for next token inference based on provided attention mask - * Supports multi batch - * Supports sparse attention_mask - */ -void update_position_ids(ov::Tensor&& position_ids, const ov::Tensor&& attention_mask) { - const size_t batch_size = attention_mask.get_shape().at(0); - const size_t atten_length = attention_mask.get_shape().at(1); - position_ids.set_shape({batch_size, 1}); - - for (size_t batch = 0; batch < batch_size; batch++) { - int64_t* start = attention_mask.data() + batch * atten_length; - // todo: be careful with start + atten_length, probably need to replace with start + atten_length -1 - position_ids.data()[batch] = std::accumulate(start, start + atten_length, 0); - } -} - /** * Get attention mask tensor for next token inference * Supports multi batch diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp index ad0e1a05d4..235768ef18 100644 --- a/src/cpp/src/utils.hpp +++ b/src/cpp/src/utils.hpp @@ -53,8 +53,6 @@ void initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attenti ov::Tensor extend_attention(ov::Tensor attention_mask); -void update_position_ids(ov::Tensor&& position_ids, const ov::Tensor&& attention_mask); - template struct OmitOptional { using value = T; }; template struct OmitOptional> { using value = T; }; diff --git a/src/cpp/src/whisper/logit_processor.cpp b/src/cpp/src/whisper/logit_processor.cpp index d3d9552f57..38bc66b1cf 100644 --- a/src/cpp/src/whisper/logit_processor.cpp +++ b/src/cpp/src/whisper/logit_processor.cpp @@ -28,7 +28,6 @@ void process_whisper_timestamp_logits(ov::Tensor& logits, const std::vector& generated_tokens, bool initial_step = false) { const size_t batch_size = logits.get_shape().at(0); - OPENVINO_ASSERT(batch_size == 1, "Batch != 1 is not supported"); size_t vocab_size = logits.get_shape().back(); size_t batch_offset = batch_idx * logits.get_shape()[1] * vocab_size; diff --git a/src/cpp/src/whisper/models/decoder.hpp b/src/cpp/src/whisper/models/decoder.hpp index cd58e54729..acb10d92b5 100644 
--- a/src/cpp/src/whisper/models/decoder.hpp +++ b/src/cpp/src/whisper/models/decoder.hpp @@ -15,12 +15,12 @@ class WhisperDecoder { const std::string& device, const ov::AnyMap& properties); - virtual std::pair detect_language(const ov::Tensor& encoder_hidden_state, + virtual std::pair detect_language(const Tensor& encoder_hidden_state, const int64_t decoder_start_token_id) = 0; - virtual std::pair decode(const ov::Tensor& encoder_hidden_state, - const std::vector& input_ids, - const size_t cache_position) = 0; + virtual std::pair decode(const Tensor& encoder_hidden_state, + const Tensor& input_ids, + const Tensor& beam_idx) = 0; virtual void reset_state() = 0; diff --git a/src/cpp/src/whisper/models/statefull_decoder.cpp b/src/cpp/src/whisper/models/statefull_decoder.cpp index bc2c91c91f..ce029d3057 100644 --- a/src/cpp/src/whisper/models/statefull_decoder.cpp +++ b/src/cpp/src/whisper/models/statefull_decoder.cpp @@ -3,6 +3,7 @@ #include "statefull_decoder.hpp" +#include "debug_utils.hpp" #include "utils.hpp" namespace ov::genai { @@ -19,7 +20,13 @@ WhisperStatefullDecoder::WhisperStatefullDecoder(const std::filesystem::path& mo std::pair WhisperStatefullDecoder::detect_language(const ov::Tensor& encoder_hidden_state, const int64_t decoder_start_token_id) { - auto [output_tensor, infer_ms] = decode(encoder_hidden_state, {decoder_start_token_id}, 0); + Tensor input_ids_tensor{ov::element::i64, {1, 1}}; + input_ids_tensor.data()[0] = decoder_start_token_id; + + Tensor beam_idx_tensor{ov::element::i32, {1}}; + beam_idx_tensor.data()[0] = 0; + + auto [output_tensor, infer_ms] = decode(encoder_hidden_state, input_ids_tensor, beam_idx_tensor); int64_t output_token = ov::genai::utils::argmax(output_tensor, 0); @@ -28,22 +35,16 @@ std::pair WhisperStatefullDecoder::detect_language(const ov::Ten return {output_token, infer_ms}; } -std::pair WhisperStatefullDecoder::decode(const ov::Tensor& encoder_hidden_state, - const std::vector& input_ids, - const size_t cache_position) { - m_request.set_tensor("encoder_hidden_states", encoder_hidden_state); - - ov::Tensor input_ids_tensor(ov::element::i64, {1, input_ids.size()}, (void*)input_ids.data()); - m_request.set_tensor("input_ids", input_ids_tensor); - - ov::Tensor cache_position_tensor = m_request.get_tensor("cache_position"); - cache_position_tensor.set_shape({input_ids.size()}); - - auto cache_data = cache_position_tensor.data(); - std::iota(cache_data, cache_data + cache_position_tensor.get_size(), cache_position); +std::pair WhisperStatefullDecoder::decode(const Tensor& encoder_hidden_state, + const Tensor& input_ids, + const Tensor& beam_idx) { + const size_t batch_size = input_ids.get_shape().at(0); + const size_t seq_len = input_ids.get_shape().at(1); - m_request.get_tensor("beam_idx").set_shape({1}); - m_request.get_tensor("beam_idx").data()[0] = 0; + _set_encoder_hidden_states_tensor(encoder_hidden_state, batch_size); + _set_cache_position_tensor(seq_len); + m_request.set_tensor("input_ids", input_ids); + m_request.set_tensor("beam_idx", beam_idx); const auto infer_start = std::chrono::steady_clock::now(); m_request.infer(); @@ -54,7 +55,71 @@ std::pair WhisperStatefullDecoder::decode(const ov::Tensor& e return {output_tensor, infer_ms}; }; +/** + * Encoder hidden states expected to be with batch 1 + * Copy encoder hidden state tensor from batch 1 to requested batch_size. + * Set new encoder hidden states tensor to infer request. 
+ */ +void WhisperStatefullDecoder::_set_encoder_hidden_states_tensor(const Tensor& encoder_hidden_state, + const size_t batch_size) { + _reset_encoder_past_key_values_states(encoder_hidden_state, batch_size); + + OPENVINO_ASSERT(encoder_hidden_state.get_shape().at(0) == 1); + Shape shape{encoder_hidden_state.get_shape()}; + shape[0] = batch_size; + + Tensor new_encoder_hidden_states{ov::element::f32, shape}; + + auto new_encoder_hidden_states_data = new_encoder_hidden_states.data(); + auto encoder_hidden_state_data = encoder_hidden_state.data(); + + for (size_t batch = 0; batch < batch_size; batch++) { + const size_t batch_offset = batch * encoder_hidden_state.get_size(); + std::memcpy(new_encoder_hidden_states_data + batch_offset, + encoder_hidden_state_data, + encoder_hidden_state.get_byte_size()); + } + + m_request.set_tensor("encoder_hidden_states", new_encoder_hidden_states); +} + +// Ensure encoder past_key values states are reset if batch size changed. This is workaround for Ticket: +void WhisperStatefullDecoder::_reset_encoder_past_key_values_states(const Tensor& encoder_hidden_state, + const size_t batch_size) { + const size_t current_batch_size = m_request.get_tensor("encoder_hidden_states").get_shape().at(0); + // batch hasn't changed, skip + if (current_batch_size == 0 || current_batch_size == batch_size) { + return; + } + + const size_t encoder_state_length_dim = encoder_hidden_state.get_shape().at(1); + for (auto& state : m_request.query_state()) { + // find encoder states by dimension + const Shape& state_shape = state.get_state().get_shape(); + if (state_shape.at(2) == encoder_state_length_dim) { + state.reset(); + } + } +} + +void WhisperStatefullDecoder::_set_cache_position_tensor(const size_t seq_len) { + ov::Tensor cache_position_tensor = m_request.get_tensor("cache_position"); + + int64_t start_cache_position = 0; + + if (cache_position_tensor.get_size() != 0) { + start_cache_position = cache_position_tensor.data()[cache_position_tensor.get_size() - 1] + 1; + } + + cache_position_tensor.set_shape({seq_len}); + + auto cache_data = cache_position_tensor.data(); + std::iota(cache_data, cache_data + seq_len, start_cache_position); +}; + void WhisperStatefullDecoder::reset_state() { m_request.reset_state(); -} + m_request.set_tensor("cache_position", ov::Tensor{ov::element::i64, {0}}); +}; + } // namespace ov::genai diff --git a/src/cpp/src/whisper/models/statefull_decoder.hpp b/src/cpp/src/whisper/models/statefull_decoder.hpp index 6f1c9eb002..4d4572c33d 100644 --- a/src/cpp/src/whisper/models/statefull_decoder.hpp +++ b/src/cpp/src/whisper/models/statefull_decoder.hpp @@ -14,15 +14,20 @@ class WhisperStatefullDecoder : public WhisperDecoder { const std::string& device, const ov::AnyMap& properties); - std::pair detect_language(const ov::Tensor& encoder_hidden_state, + std::pair detect_language(const Tensor& encoder_hidden_state, const int64_t decoder_start_token_id) override; - std::pair decode(const ov::Tensor& encoder_hidden_state, - const std::vector& input_ids, - const size_t cache_position) override; + std::pair decode(const Tensor& encoder_hidden_state, + const Tensor& input_ids, + const Tensor& beam_idx) override; void reset_state() override; +private: + void _set_encoder_hidden_states_tensor(const Tensor& encoder_hidden_state, const size_t batch_size); + void _reset_encoder_past_key_values_states(const Tensor& encoder_hidden_state, const size_t batch_size); + void _set_cache_position_tensor(const size_t seq_len); + private: ov::InferRequest m_request; }; diff 
--git a/src/cpp/src/whisper/models/with_past_decoder.cpp b/src/cpp/src/whisper/models/with_past_decoder.cpp deleted file mode 100644 index 7f62ea5657..0000000000 --- a/src/cpp/src/whisper/models/with_past_decoder.cpp +++ /dev/null @@ -1,107 +0,0 @@ -// Copyright (C) 2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "with_past_decoder.hpp" - -#include - -#include "logger.hpp" -#include "utils.hpp" - -namespace { -void set_past_key_value(ov::InferRequest& source, ov::InferRequest& dest) { - // source outputs: - // present.0.decoder.key - // present.0.decoder.value - // present.0.encoder.key - // present.0.encoder.value - - // dest inputs: - // past_key_values.0.decoder.key - // past_key_values.0.decoder.value - // past_key_values.0.encoder.key - // past_key_values.0.encoder.value - - for (auto& source_output : source.get_compiled_model().outputs()) { - std::string source_output_name = source_output.get_any_name(); - if (source_output_name.find("logits") != std::string::npos) { - continue; - } - - std::string with_past_input_name = - std::regex_replace(source_output_name, std::regex("present"), "past_key_values"); - - auto kv_tensor = source.get_tensor(source_output_name); - dest.set_tensor(with_past_input_name, ov::Tensor{kv_tensor}); - } -} -} // namespace - -namespace ov::genai { -WhisperWithPastDecoder::WhisperWithPastDecoder(const std::filesystem::path& models_path, - const std::string& device, - const ov::AnyMap& properties) { - Logger::warn("Whisper decoder models with past is deprecated. Support will be removed in 2026.0.0 release.\n" - "To obtain stateful decoder model use latest `optimum-intel` package:\n" - "pip install optimum-intel@git+https://github.com/huggingface/optimum-intel.git\n" - "optimum-cli export openvino --trust-remote-code --model openai/whisper-tiny whisper-tiny"); - ov::Core core = utils::singleton_core(); - - auto compiled_model = core.compile_model(models_path / "openvino_decoder_model.xml", device, properties); - utils::print_compiled_model_properties(compiled_model, "whisper decoder model"); - m_request_decoder = compiled_model.create_infer_request(); - - compiled_model = core.compile_model(models_path / "openvino_decoder_with_past_model.xml", device, properties); - utils::print_compiled_model_properties(compiled_model, "whisper decoder with past model"); - m_request_decoder_with_past = compiled_model.create_infer_request(); -} - -std::pair WhisperWithPastDecoder::detect_language(const ov::Tensor& encoder_hidden_state, - const int64_t decoder_start_token_id) { - auto [output_tensor, infer_ms] = decode(encoder_hidden_state, {decoder_start_token_id}, 0); - - int64_t output_token = ov::genai::utils::argmax(output_tensor, 0); - - reset_state(); - - return {output_token, infer_ms}; -} - -std::pair WhisperWithPastDecoder::decode(const ov::Tensor& encoder_hidden_state, - const std::vector& input_ids, - const size_t cache_position) { - const bool initial_step = cache_position == 0; - ov::InferRequest& request = initial_step ? 
m_request_decoder : m_request_decoder_with_past; - - request.set_tensor("encoder_hidden_states", encoder_hidden_state); - - const ov::Tensor input_ids_tensor(ov::element::i64, {1, input_ids.size()}, (void*)input_ids.data()); - request.set_tensor("input_ids", input_ids_tensor); - - if (!initial_step) { - ov::Tensor cache_position_tensor = request.get_tensor("cache_position"); - cache_position_tensor.set_shape({1}); - cache_position_tensor.data()[0] = cache_position; - } - - const auto infer_start = std::chrono::steady_clock::now(); - request.infer(); - const auto infer_ms = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start); - - auto output_tensor = request.get_tensor("logits"); - - if (initial_step) { - set_past_key_value(m_request_decoder, m_request_decoder_with_past); - } else if (!m_decoder_with_past_kv_value_set) { - set_past_key_value(m_request_decoder_with_past, m_request_decoder_with_past); - m_decoder_with_past_kv_value_set = true; - } - - return {output_tensor, infer_ms}; -} - -void WhisperWithPastDecoder::reset_state() { - m_request_decoder_with_past.reset_state(); - m_decoder_with_past_kv_value_set = false; -} -} // namespace ov::genai diff --git a/src/cpp/src/whisper/models/with_past_decoder.hpp b/src/cpp/src/whisper/models/with_past_decoder.hpp index c7af1cdaa2..8bd47bb981 100644 --- a/src/cpp/src/whisper/models/with_past_decoder.hpp +++ b/src/cpp/src/whisper/models/with_past_decoder.hpp @@ -14,19 +14,26 @@ class WhisperWithPastDecoder : public WhisperDecoder { const std::string& device, const ov::AnyMap& properties); - std::pair detect_language(const ov::Tensor& encoder_hidden_state, + std::pair detect_language(const Tensor& encoder_hidden_state, const int64_t decoder_start_token_id) override; - std::pair decode(const ov::Tensor& encoder_hidden_state, - const std::vector& input_ids, - const size_t cache_position) override; + std::pair decode(const Tensor& encoder_hidden_state, + const Tensor& input_ids, + const Tensor& beam_idx) override; void reset_state() override; private: ov::InferRequest m_request_decoder; ov::InferRequest m_request_decoder_with_past; + bool m_initial_step = true; bool m_decoder_with_past_kv_value_set = false; + size_t m_cache_position = 0; + + void _set_encoder_hidden_states_tensor(const Tensor& encoder_hidden_state, + const size_t batch_size, + InferRequest& request); + void _set_cache_position_tensor(const size_t seq_len); }; } // namespace ov::genai diff --git a/src/cpp/src/whisper/whisper.cpp b/src/cpp/src/whisper/whisper.cpp index 3ab873609d..96e464d115 100644 --- a/src/cpp/src/whisper/whisper.cpp +++ b/src/cpp/src/whisper/whisper.cpp @@ -14,6 +14,7 @@ #include "openvino/genai/perf_metrics.hpp" #include "openvino/genai/whisper_generation_config.hpp" #include "openvino/genai/whisper_pipeline.hpp" +#include "sampler.hpp" #include "timestamps.hpp" #include "utils.hpp" #include "whisper_config.hpp" @@ -25,6 +26,156 @@ using ov::genai::MicroSeconds; namespace { +void process_whisper_logits(ov::Tensor logits, + const ov::genai::WhisperGenerationConfig& config, + const bool return_timestamps, + const std::map>& batch_to_generated_ids) { + const bool initial_step = batch_to_generated_ids.empty(); + const size_t batch_size = logits.get_shape().at(0); + + for (size_t batch = 0; batch < batch_size; batch++) { + if (initial_step) { + ov::genai::do_suppress_tokens(logits, batch, config.begin_suppress_tokens); + } + + ov::genai::do_suppress_tokens(logits, batch, config.suppress_tokens); + + if (return_timestamps) { + const auto& 
generated_ids = initial_step ? std::vector{} : batch_to_generated_ids.at(batch); + ov::genai::process_whisper_timestamp_logits(logits, batch, config, generated_ids, initial_step); + } + } +} + +std::pair decode(std::shared_ptr decoder, + const std::vector& input_ids, + const ov::Tensor& encoder_hidden_state, + const std::shared_ptr streamer_ptr, + ov::genai::Sampler& sampler, + ov::genai::SequenceGroup::Ptr sequence_group, + const bool return_timestamps, + const ov::genai::WhisperGenerationConfig& config, + ov::genai::RawPerfMetrics& raw_metrics) { + const auto handle = std::make_shared(sequence_group->get_generation_stream(), + sequence_group->get_sampling_parameters()); + + auto stream_generated_tokens = [&streamer_ptr, &handle, &return_timestamps]() { + if (return_timestamps || !streamer_ptr || !handle->can_read()) { + return; + } + + std::unordered_map token = handle->back(); + for (const auto& gen_token : token.begin()->second.generated_ids) { + if (streamer_ptr->put(gen_token)) { + handle->drop(); + break; + } + } + }; + + const size_t batch_size = 1; + + ov::Tensor beam_idx = ov::Tensor(ov::element::i32, {batch_size}); + std::fill_n(beam_idx.data(), batch_size, 0); + + const ov::Tensor input_ids_tensor{ov::element::i64, {1, input_ids.size()}, (void*)input_ids.data()}; + + auto [logits, infer_ms] = decoder->decode(encoder_hidden_state, input_ids_tensor, beam_idx); + + const auto infer_end = std::chrono::steady_clock::now(); + raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms); + raw_metrics.m_token_infer_durations.emplace_back(infer_ms); + raw_metrics.m_new_token_times.emplace_back(infer_end); + raw_metrics.m_batch_sizes.emplace_back(batch_size); + + process_whisper_logits(logits, config, return_timestamps, {}); + + // since we have applied `Slice` operation to last MatMul, model output sequence length is 1 + // so, we need to update sequence groups to think that they already have processed all prompt tokens except last + // ones and schedule only `output_sequence_len` ones + int64_t output_sequence_len = logits.get_shape().at(1); + sequence_group->update_processed_tokens_num(sequence_group->get_prompt_len() - output_sequence_len); + sequence_group->schedule_tokens(output_sequence_len); + + sampler.sample({sequence_group}, logits); + stream_generated_tokens(); + + // "Generation" phase + while (!sequence_group->has_finished()) { + std::map> batch_to_generated_ids{}; + + sequence_group->schedule_tokens(1); + // compute aggregated values + size_t num_sequences = sequence_group->num_running_seqs(); + size_t total_num_tokens = sequence_group->get_num_scheduled_tokens() * num_sequences; + + ov::Tensor new_input_ids(ov::element::i64, {total_num_tokens, 1}); + int64_t* input_ids_data = new_input_ids.data(); + + std::vector next_beams; + + std::vector running_sequences = sequence_group->get_running_sequences(); + size_t num_scheduled_tokens = sequence_group->get_num_scheduled_tokens(); + size_t num_processed_tokens = sequence_group->get_num_processed_tokens(); + + std::map beam_idxs = sampler.get_beam_idxs(sequence_group); + + for (auto sequence : running_sequences) { + for (size_t batch = 0, position_id = num_processed_tokens; batch < num_scheduled_tokens; + ++batch, ++position_id) { + // compute token for current sequence + if (position_id < sequence_group->get_prompt_len()) { + input_ids_data[batch] = sequence_group->get_prompt_ids()[position_id]; + } else { + input_ids_data[batch] = + sequence->get_generated_ids()[position_id - sequence_group->get_prompt_len()]; + } + } + + 
// apply strides to shift to a next sequence + input_ids_data += num_scheduled_tokens; + + auto beam_idx = beam_idxs[sequence->get_id()]; + next_beams.push_back(beam_idx); + batch_to_generated_ids[next_beams.size() - 1] = sequence->get_generated_ids(); + } + + auto [logits, infer_ms] = decoder->decode(encoder_hidden_state, + new_input_ids, + ov::Tensor{ov::element::i32, {total_num_tokens}, next_beams.data()}); + + const auto infer_end = std::chrono::steady_clock::now(); + raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms); + raw_metrics.m_token_infer_durations.emplace_back(infer_ms); + raw_metrics.m_new_token_times.emplace_back(infer_end); + raw_metrics.m_batch_sizes.emplace_back(batch_size); + + process_whisper_logits(logits, config, return_timestamps, batch_to_generated_ids); + + sampler.sample({sequence_group}, logits); + stream_generated_tokens(); + } + + ov::genai::EncodedResults results; + + const auto sampling_params = sequence_group->get_sampling_parameters(); + + // there is also check in generation config validate function + OPENVINO_ASSERT(config.num_return_sequences == 1); + const auto& sequences = sequence_group->get_finished_sequences(); + const auto& sequence = sequences[0]; + + const float score = sampling_params.is_beam_search() ? sequence->get_beam_search_score(sampling_params) + : sequence->get_cumulative_log_prob(); + + results.tokens.push_back(sequence->get_generated_ids()); + results.scores.push_back(score); + + sampler.clear_request_info(sequence_group->get_request_id()); + + return {results, sequence_group->handle_dropped()}; +} + ov::Tensor encode(ov::InferRequest& request, std::vector& mel_data, const size_t feature_size, @@ -54,41 +205,6 @@ ov::Tensor encode(ov::InferRequest& request, return request.get_tensor("last_hidden_state"); } -int64_t decode(ov::Tensor& encoder_hidden_state, - std::shared_ptr decoder, - const std::vector& input_ids, - const size_t cache_position, - const ov::genai::WhisperGenerationConfig& config, - ov::genai::RawPerfMetrics& raw_metrics, - const bool return_timestamps, - const bool initial_step, - const std::vector& generated_tokens) { - auto [output_tensor, infer_ms] = decoder->decode(encoder_hidden_state, input_ids, cache_position); - const auto infer_end = std::chrono::steady_clock::now(); - raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms); - raw_metrics.m_token_infer_durations.emplace_back(infer_ms); - raw_metrics.m_new_token_times.emplace_back(infer_end); - raw_metrics.m_batch_sizes.emplace_back(1); - - if (initial_step) { - ov::genai::do_suppress_tokens(output_tensor, 0, config.begin_suppress_tokens); - } - - ov::genai::do_suppress_tokens(output_tensor, 0, config.suppress_tokens); - - if (return_timestamps) { - if (initial_step) { - ov::genai::process_whisper_timestamp_logits(output_tensor, 0, config, {}, true); - } else { - ov::genai::process_whisper_timestamp_logits(output_tensor, 0, config, generated_tokens); - } - } - - int64_t output_token = ov::genai::utils::argmax(output_tensor, 0); - - return output_token; -} - std::vector prepare_init_tokens(ov::Tensor& encoder_hidden_state, std::shared_ptr decoder, const ov::genai::WhisperGenerationConfig& config, @@ -129,52 +245,6 @@ std::vector prepare_init_tokens(ov::Tensor& encoder_hidden_state, config.no_timestamps_token_id}; } -std::pair> full_decode(ov::Tensor& encoder_hidden_state, - const ov::genai::WhisperGenerationConfig& config, - std::shared_ptr decoder, - const std::vector& init_tokens, - const size_t max_new_tokens, - const bool return_timestamps, 
- ov::genai::RawPerfMetrics& raw_metrics, - const std::shared_ptr streamer) { - int64_t output_token = - decode(encoder_hidden_state, decoder, init_tokens, 0, config, raw_metrics, return_timestamps, true, {}); - - std::vector output_tokens{output_token}; - - if (!return_timestamps && streamer && streamer->put(output_token)) { - return {true, output_tokens}; - } - - if (max_new_tokens == 1) { - return {false, output_tokens}; - } - - for (size_t i = 0; i < max_new_tokens - 1; i++) { - auto output_token = decode(encoder_hidden_state, - decoder, - {output_tokens.back()}, - init_tokens.size() + i, - config, - raw_metrics, - return_timestamps, - false, - output_tokens); - - if (output_token == config.eos_token_id) { - break; - } - - output_tokens.push_back(output_token); - - if (!return_timestamps && streamer && streamer->put(output_token)) { - return {true, output_tokens}; - } - } - - return {false, output_tokens}; -} - } // namespace namespace ov { @@ -187,7 +257,8 @@ WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig& ov::InferRequest& encoder, std::shared_ptr decoder, WhisperFeatureExtractor& feature_extractor, - const std::shared_ptr streamer) { + const std::shared_ptr streamer, + Sampler& sampler) { size_t max_new_tokens = config.get_max_new_tokens(); WhisperGenerateResult result; @@ -216,10 +287,6 @@ WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig& size_t segment_offset = 0; for (size_t chunk_offset = 0; chunk_offset < input_features.n_frames; chunk_offset += segment_offset) { - if (output_tokens.size() >= max_new_tokens) { - break; - } - auto input_features_chunk = input_features.get_data_with_offset(chunk_offset, feature_extractor.nb_max_frames); ov::Tensor hidden_state_tensor = encode(encoder, @@ -236,16 +303,19 @@ WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig& std::vector chunk_init_tokens = ov::genai::get_prompt_tokens(context_tokens, config, chunk_offset); chunk_init_tokens.insert(chunk_init_tokens.end(), init_tokens.begin(), init_tokens.end()); - auto [cancelled, chunk_output_tokens] = full_decode(hidden_state_tensor, - config, - decoder, - chunk_init_tokens, - max_new_tokens - output_tokens.size(), - return_timestamps, - raw_metrics, - streamer); - + SequenceGroup::Ptr sequence_group = std::make_shared(0, chunk_init_tokens, config, 1); + + auto [result, cancelled] = decode(decoder, + chunk_init_tokens, + hidden_state_tensor, + streamer, + sampler, + sequence_group, + return_timestamps, + config, + raw_metrics); decoder->reset_state(); + std::vector chunk_output_tokens = result.tokens[0]; if (return_timestamps) { auto extracted_segments = ov::genai::extract_segments(chunk_output_tokens, diff --git a/src/cpp/src/whisper/whisper.hpp b/src/cpp/src/whisper/whisper.hpp index fbdf56d171..96c17a4216 100644 --- a/src/cpp/src/whisper/whisper.hpp +++ b/src/cpp/src/whisper/whisper.hpp @@ -9,6 +9,7 @@ #include "models/decoder.hpp" #include "openvino/genai/whisper_generation_config.hpp" #include "openvino/genai/whisper_pipeline.hpp" +#include "sampler.hpp" #include "whisper_config.hpp" #include "whisper_feature_extractor.hpp" #include "whisper_models.hpp" @@ -35,7 +36,8 @@ WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig& ov::InferRequest& encoder, std::shared_ptr decoder, WhisperFeatureExtractor& feature_extractor, - const std::shared_ptr streamer); + const std::shared_ptr streamer, + Sampler& sampler); } // namespace genai } // namespace ov diff --git 
a/src/cpp/src/whisper_generation_config.cpp b/src/cpp/src/whisper_generation_config.cpp index beb663caaf..733ce74028 100644 --- a/src/cpp/src/whisper_generation_config.cpp +++ b/src/cpp/src/whisper_generation_config.cpp @@ -14,7 +14,8 @@ namespace ov { namespace genai { -WhisperGenerationConfig::WhisperGenerationConfig(const std::filesystem::path& json_path) { +WhisperGenerationConfig::WhisperGenerationConfig(const std::filesystem::path& json_path) + : GenerationConfig::GenerationConfig(json_path) { using ov::genai::utils::read_json_param; std::ifstream f(json_path); @@ -22,12 +23,9 @@ WhisperGenerationConfig::WhisperGenerationConfig(const std::filesystem::path& js nlohmann::json data = nlohmann::json::parse(f); - read_json_param(data, "max_new_tokens", max_new_tokens); - read_json_param(data, "max_length", max_length); read_json_param(data, "begin_suppress_tokens", begin_suppress_tokens); read_json_param(data, "suppress_tokens", suppress_tokens); read_json_param(data, "decoder_start_token_id", decoder_start_token_id); - read_json_param(data, "eos_token_id", eos_token_id); read_json_param(data, "pad_token_id", pad_token_id); read_json_param(data, "no_timestamps_token_id", no_timestamps_token_id); read_json_param(data, "max_initial_timestamp_index", max_initial_timestamp_index); @@ -42,28 +40,12 @@ WhisperGenerationConfig::WhisperGenerationConfig(const std::filesystem::path& js read_json_param(data, "lang_to_id", lang_to_id); } -void WhisperGenerationConfig::set_eos_token_id(int64_t tokenizer_eos_token_id) { - if (eos_token_id < 0) { - eos_token_id = tokenizer_eos_token_id; - } else { - OPENVINO_ASSERT(eos_token_id == tokenizer_eos_token_id, - "EOS token ID is different in generation config (", - eos_token_id, - ") and tokenizer (", - tokenizer_eos_token_id, - ")"); - } -} - void WhisperGenerationConfig::update_generation_config(const ov::AnyMap& config_map) { using ov::genai::utils::read_anymap_param; - read_anymap_param(config_map, "max_new_tokens", max_new_tokens); - read_anymap_param(config_map, "max_length", max_length); read_anymap_param(config_map, "begin_suppress_tokens", begin_suppress_tokens); read_anymap_param(config_map, "suppress_tokens", suppress_tokens); read_anymap_param(config_map, "decoder_start_token_id", decoder_start_token_id); - read_anymap_param(config_map, "eos_token_id", eos_token_id); read_anymap_param(config_map, "pad_token_id", pad_token_id); read_anymap_param(config_map, "transcribe_token_id", transcribe_token_id); read_anymap_param(config_map, "translate_token_id", translate_token_id); @@ -76,27 +58,12 @@ void WhisperGenerationConfig::update_generation_config(const ov::AnyMap& config_ read_anymap_param(config_map, "return_timestamps", return_timestamps); read_anymap_param(config_map, "initial_prompt", initial_prompt); read_anymap_param(config_map, "hotwords", hotwords); -} -size_t WhisperGenerationConfig::get_max_new_tokens(size_t prompt_length) const { - // max_new_tokens has priority over max_length, only if max_new_tokens was not specified use max_length - if (max_new_tokens != SIZE_MAX) { - return max_new_tokens; - } else { - return max_length - prompt_length; - } + GenerationConfig::update_generation_config(config_map); } void WhisperGenerationConfig::validate() const { - OPENVINO_ASSERT(max_new_tokens > 0, "'max_new_tokens' must be greater than 0"); - - // max_new_tokens has priority over max_length - // if max_new_tokens is defined no need to check max_length - OPENVINO_ASSERT(max_new_tokens != SIZE_MAX || max_length > 0, - "'max_length' must be greater 
than 0 or 'max_new_tokens' should be defined"); - - OPENVINO_ASSERT(eos_token_id != -1 || max_new_tokens != SIZE_MAX || max_length != SIZE_MAX, - "Either 'eos_token_id', or 'max_new_tokens', or 'max_length' should be defined."); + GenerationConfig::validate(); if (is_multilingual && language.has_value()) { OPENVINO_ASSERT(lang_to_id.count(*language), @@ -114,6 +81,15 @@ void WhisperGenerationConfig::validate() const { OPENVINO_ASSERT(!language.has_value(), "Cannot specify 'language' for not multilingual model."); OPENVINO_ASSERT(!task.has_value(), "Cannot specify 'task' for not multilingual model."); } + + if (is_beam_search()) { + OPENVINO_ASSERT(num_return_sequences == 1, + "'num_return_sequences' must be 1. Provided: ", + num_return_sequences, + "."); + } + + OPENVINO_ASSERT(!is_assisting_generation(), "Assisted generation is not supported."); } } // namespace genai } // namespace ov diff --git a/src/cpp/src/whisper_pipeline.cpp b/src/cpp/src/whisper_pipeline.cpp index ffd792c889..1e0d5a9e9a 100644 --- a/src/cpp/src/whisper_pipeline.cpp +++ b/src/cpp/src/whisper_pipeline.cpp @@ -51,7 +51,8 @@ class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::Whi WhisperPipelineStatefulImpl(const std::filesystem::path& models_path, const std::string& device, const ov::AnyMap& properties) - : WhisperPipelineImplBase{models_path} { + : WhisperPipelineImplBase{models_path}, + m_sampler(m_tokenizer) { ov::Core core = utils::singleton_core(); ov::CompiledModel compiled_model = @@ -65,6 +66,8 @@ class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::Whi if (m_generation_config.eos_token_id == -1) { m_generation_config.set_eos_token_id(m_tokenizer.get_eos_token_id()); } + + m_sampler.set_seed(m_generation_config.rng_seed); } WhisperDecodedResults generate(const RawSpeechInput& raw_speech_input, @@ -96,7 +99,8 @@ class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::Whi m_encoder, m_decoder, m_feature_extractor, - streamer_ptr); + streamer_ptr, + m_sampler); auto decode_start_time = std::chrono::steady_clock::now(); WhisperDecodedResults result{std::vector{m_tokenizer.decode(generate_result.output_tokens)}, std::vector{1.f}}; generate_result.perf_metrics.raw_metrics.detokenization_durations.emplace_back( @@ -135,6 +139,7 @@ class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::Whi private: ov::InferRequest m_encoder; std::shared_ptr m_decoder; + Sampler m_sampler; }; std::pair streamer(ChunkStreamerVariant func) { From c36840101015d5bb279e4b22b71f7ba0c4eabb96 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Tue, 21 Jan 2025 16:13:03 +0100 Subject: [PATCH 14/27] Add with past decoder --- .../src/whisper/models/with_past_decoder.cpp | 202 ++++++++++++++++++ .../src/whisper/models/with_past_decoder.hpp | 1 - tests/python_tests/requirements.txt | 2 +- 3 files changed, 203 insertions(+), 2 deletions(-) create mode 100644 src/cpp/src/whisper/models/with_past_decoder.cpp diff --git a/src/cpp/src/whisper/models/with_past_decoder.cpp b/src/cpp/src/whisper/models/with_past_decoder.cpp new file mode 100644 index 0000000000..00ac7bc9d7 --- /dev/null +++ b/src/cpp/src/whisper/models/with_past_decoder.cpp @@ -0,0 +1,202 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "with_past_decoder.hpp" + +#include + +#include "logger.hpp" +#include "utils.hpp" + +namespace { + +bool are_past_key_values_empty(ov::InferRequest& request) { + for (const auto& input : 
request.get_compiled_model().inputs()) { + const std::string name = input.get_any_name(); + if (name.find("past_key_values") == std::string::npos) { + continue; + } + + ov::Tensor tensor = request.get_tensor(name); + + return tensor.get_size() == 0; + } + + OPENVINO_THROW("Past key value tensor not found"); +} + +void reset_past_key_values(ov::InferRequest& request) { + for (const auto& input : request.get_compiled_model().inputs()) { + const std::string name = input.get_any_name(); + if (name.find("past_key_values") == std::string::npos) { + continue; + } + + ov::Shape shape{request.get_tensor(name).get_shape()}; + shape[0] = 0; + + request.set_tensor(name, ov::Tensor{ov::element::f32, shape}); + } +} + +void copy_with_beam_gather(const ov::Tensor& source, ov::Tensor& dest, const ov::Tensor& beam_idx) { + const size_t dest_batch_size = beam_idx.get_shape().at(0); + + ov::Shape dest_shape{source.get_shape()}; + dest_shape[0] = dest_batch_size; + dest.set_shape(dest_shape); + + OPENVINO_ASSERT(dest_shape.size() == 4); + + const size_t batch_dim_size = dest_shape[1] * dest_shape[2] * dest_shape[3]; + + const auto beam_idx_data = beam_idx.data(); + const auto source_data = source.data(); + auto dest_data = dest.data(); + + for (size_t dest_batch = 0; dest_batch < dest_batch_size; dest_batch++) { + const size_t source_batch = beam_idx_data[dest_batch]; + + const auto source_start = source_data + (source_batch * batch_dim_size); + const auto dest_start = dest_data + (dest_batch * batch_dim_size); + std::memcpy(dest_start, source_start, sizeof(float) * batch_dim_size); + } +} + +void set_past_key_value(ov::InferRequest& source, ov::InferRequest& dest, const ov::Tensor& beam_idx) { + // source outputs: + // present.0.decoder.key + // present.0.decoder.value + // present.0.encoder.key + // present.0.encoder.value + + // dest inputs: + // past_key_values.0.decoder.key + // past_key_values.0.decoder.value + // past_key_values.0.encoder.key + // past_key_values.0.encoder.value + + for (auto& source_output : source.get_compiled_model().outputs()) { + std::string source_output_name = source_output.get_any_name(); + if (source_output_name.find("present") == std::string::npos) { + continue; + } + + std::string dest_input_name = std::regex_replace(source_output_name, std::regex("present"), "past_key_values"); + + auto source_tensor = source.get_tensor(source_output_name); + auto dest_tensor = dest.get_tensor(dest_input_name); + + copy_with_beam_gather(source_tensor, dest_tensor, beam_idx); + } +} +} // namespace + +namespace ov::genai { +WhisperWithPastDecoder::WhisperWithPastDecoder(const std::filesystem::path& models_path, + const std::string& device, + const ov::AnyMap& properties) { + Logger::warn("Whisper decoder models with past is deprecated. 
Support will be removed in 2026.0.0 release.\n" + "To obtain stateful decoder model use latest `optimum-intel` package:\n" + "pip install optimum-intel@git+https://github.com/huggingface/optimum-intel.git\n" + "optimum-cli export openvino --trust-remote-code --model openai/whisper-tiny whisper-tiny"); + ov::Core core = utils::singleton_core(); + + auto compiled_model = core.compile_model(models_path / "openvino_decoder_model.xml", device, properties); + utils::print_compiled_model_properties(compiled_model, "whisper decoder model"); + m_request_decoder = compiled_model.create_infer_request(); + + compiled_model = core.compile_model(models_path / "openvino_decoder_with_past_model.xml", device, properties); + utils::print_compiled_model_properties(compiled_model, "whisper decoder with past model"); + m_request_decoder_with_past = compiled_model.create_infer_request(); +} + +std::pair WhisperWithPastDecoder::detect_language(const ov::Tensor& encoder_hidden_state, + const int64_t decoder_start_token_id) { + Tensor input_ids_tensor{ov::element::i64, {1, 1}}; + input_ids_tensor.data()[0] = decoder_start_token_id; + + Tensor beam_idx_tensor{ov::element::i32, {1}}; + beam_idx_tensor.data()[0] = 0; + + auto [output_tensor, infer_ms] = decode(encoder_hidden_state, input_ids_tensor, beam_idx_tensor); + + int64_t output_token = ov::genai::utils::argmax(output_tensor, 0); + + reset_state(); + + return {output_token, infer_ms}; +} + +std::pair WhisperWithPastDecoder::decode(const Tensor& encoder_hidden_state, + const Tensor& input_ids, + const Tensor& beam_idx) { + ov::InferRequest& request = m_initial_step ? m_request_decoder : m_request_decoder_with_past; + + const size_t batch_size = input_ids.get_shape().at(0); + const size_t seq_length = input_ids.get_shape().at(1); + + _set_encoder_hidden_states_tensor(encoder_hidden_state, batch_size, request); + request.set_tensor("input_ids", input_ids); + + if (!m_initial_step) { + ov::Tensor cache_position_tensor = request.get_tensor("cache_position"); + cache_position_tensor.set_shape({1}); + cache_position_tensor.data()[0] = m_cache_position; + } + + if (!m_initial_step) { + if (are_past_key_values_empty(m_request_decoder_with_past)) { + set_past_key_value(m_request_decoder, m_request_decoder_with_past, beam_idx); + } else { + set_past_key_value(m_request_decoder_with_past, m_request_decoder_with_past, beam_idx); + } + } + + const auto infer_start = std::chrono::steady_clock::now(); + request.infer(); + const auto infer_ms = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start); + + auto output_tensor = request.get_tensor("logits"); + + m_initial_step = false; + m_cache_position += seq_length; + + return {output_tensor, infer_ms}; +} + +/** + * Encoder hidden states expected to be with batch 1 + * Copy encoder hidden state tensor from batch 1 to requested batch_size. + * Set new encoder hidden states tensor to infer request. 
+ */ +void WhisperWithPastDecoder::_set_encoder_hidden_states_tensor(const Tensor& encoder_hidden_state, + const size_t batch_size, + InferRequest& request) { + OPENVINO_ASSERT(encoder_hidden_state.get_shape().at(0) == 1); + Shape shape{encoder_hidden_state.get_shape()}; + shape[0] = batch_size; + + Tensor new_encoder_hidden_states{ov::element::f32, shape}; + + auto new_encoder_hidden_states_data = new_encoder_hidden_states.data(); + auto encoder_hidden_state_data = encoder_hidden_state.data(); + + for (size_t batch = 0; batch < batch_size; batch++) { + const size_t batch_offset = batch * encoder_hidden_state.get_size(); + std::memcpy(new_encoder_hidden_states_data + batch_offset, + encoder_hidden_state_data, + encoder_hidden_state.get_byte_size()); + } + + request.set_tensor("encoder_hidden_states", new_encoder_hidden_states); +} + +void WhisperWithPastDecoder::reset_state() { + reset_past_key_values(m_request_decoder_with_past); + m_request_decoder_with_past.reset_state(); + m_decoder_with_past_kv_value_set = false; + m_initial_step = true; + m_cache_position = 0; +} +} // namespace ov::genai \ No newline at end of file diff --git a/src/cpp/src/whisper/models/with_past_decoder.hpp b/src/cpp/src/whisper/models/with_past_decoder.hpp index 8bd47bb981..e94f74a6c7 100644 --- a/src/cpp/src/whisper/models/with_past_decoder.hpp +++ b/src/cpp/src/whisper/models/with_past_decoder.hpp @@ -33,7 +33,6 @@ class WhisperWithPastDecoder : public WhisperDecoder { void _set_encoder_hidden_states_tensor(const Tensor& encoder_hidden_state, const size_t batch_size, InferRequest& request); - void _set_cache_position_tensor(const size_t seq_len); }; } // namespace ov::genai diff --git a/tests/python_tests/requirements.txt b/tests/python_tests/requirements.txt index 78cacd61ae..c851c71ee5 100644 --- a/tests/python_tests/requirements.txt +++ b/tests/python_tests/requirements.txt @@ -1,6 +1,6 @@ --extra-index-url https://download.pytorch.org/whl/cpu diffusers==0.32.1 -optimum-intel @ git+https://github.com/eaidova/optimum-intel@ea/stateful_seq2seq +optimum-intel @ git+https://github.com/huggingface/optimum-intel.git numpy<2.0.0; platform_system == "Darwin" and platform_machine == "x86_64" onnx==1.17.0 pytest From 2e061aa8c472326105998d367f55c1863f8bb34c Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Wed, 22 Jan 2025 09:43:17 +0100 Subject: [PATCH 15/27] Refactor with past decoder --- .../src/whisper/models/statefull_decoder.cpp | 2 + .../src/whisper/models/with_past_decoder.cpp | 99 +++++++++++-------- .../src/whisper/models/with_past_decoder.hpp | 5 +- 3 files changed, 61 insertions(+), 45 deletions(-) diff --git a/src/cpp/src/whisper/models/statefull_decoder.cpp b/src/cpp/src/whisper/models/statefull_decoder.cpp index ce029d3057..7d6837d323 100644 --- a/src/cpp/src/whisper/models/statefull_decoder.cpp +++ b/src/cpp/src/whisper/models/statefull_decoder.cpp @@ -41,7 +41,9 @@ std::pair WhisperStatefullDecoder::decode(const Tensor& encod const size_t batch_size = input_ids.get_shape().at(0); const size_t seq_len = input_ids.get_shape().at(1); + // todo: skip copy if already set and batch didn't changed _set_encoder_hidden_states_tensor(encoder_hidden_state, batch_size); + _set_cache_position_tensor(seq_len); m_request.set_tensor("input_ids", input_ids); m_request.set_tensor("beam_idx", beam_idx); diff --git a/src/cpp/src/whisper/models/with_past_decoder.cpp b/src/cpp/src/whisper/models/with_past_decoder.cpp index 00ac7bc9d7..4541f2e194 100644 --- a/src/cpp/src/whisper/models/with_past_decoder.cpp +++ 
b/src/cpp/src/whisper/models/with_past_decoder.cpp @@ -10,35 +10,6 @@ namespace { -bool are_past_key_values_empty(ov::InferRequest& request) { - for (const auto& input : request.get_compiled_model().inputs()) { - const std::string name = input.get_any_name(); - if (name.find("past_key_values") == std::string::npos) { - continue; - } - - ov::Tensor tensor = request.get_tensor(name); - - return tensor.get_size() == 0; - } - - OPENVINO_THROW("Past key value tensor not found"); -} - -void reset_past_key_values(ov::InferRequest& request) { - for (const auto& input : request.get_compiled_model().inputs()) { - const std::string name = input.get_any_name(); - if (name.find("past_key_values") == std::string::npos) { - continue; - } - - ov::Shape shape{request.get_tensor(name).get_shape()}; - shape[0] = 0; - - request.set_tensor(name, ov::Tensor{ov::element::f32, shape}); - } -} - void copy_with_beam_gather(const ov::Tensor& source, ov::Tensor& dest, const ov::Tensor& beam_idx) { const size_t dest_batch_size = beam_idx.get_shape().at(0); @@ -63,7 +34,7 @@ void copy_with_beam_gather(const ov::Tensor& source, ov::Tensor& dest, const ov: } } -void set_past_key_value(ov::InferRequest& source, ov::InferRequest& dest, const ov::Tensor& beam_idx) { +void copy_past_key_value(ov::InferRequest& source, ov::InferRequest& dest, const ov::Tensor& beam_idx) { // source outputs: // present.0.decoder.key // present.0.decoder.value @@ -90,6 +61,21 @@ void set_past_key_value(ov::InferRequest& source, ov::InferRequest& dest, const copy_with_beam_gather(source_tensor, dest_tensor, beam_idx); } } + +void link_past_key_value(ov::InferRequest& source, ov::InferRequest& dest) { + for (auto& source_output : source.get_compiled_model().outputs()) { + std::string source_output_name = source_output.get_any_name(); + if (source_output_name.find("present") == std::string::npos) { + continue; + } + + std::string dest_input_name = std::regex_replace(source_output_name, std::regex("present"), "past_key_values"); + auto source_tensor = source.get_tensor(source_output_name); + + dest.set_tensor(dest_input_name, source_tensor); + } +} + } // namespace namespace ov::genai { @@ -131,27 +117,23 @@ std::pair WhisperWithPastDecoder::detect_language(const ov::Tens std::pair WhisperWithPastDecoder::decode(const Tensor& encoder_hidden_state, const Tensor& input_ids, const Tensor& beam_idx) { - ov::InferRequest& request = m_initial_step ? m_request_decoder : m_request_decoder_with_past; + const bool is_initial_step = m_cache_position == 0; + ov::InferRequest& request = is_initial_step ? 
m_request_decoder : m_request_decoder_with_past; const size_t batch_size = input_ids.get_shape().at(0); const size_t seq_length = input_ids.get_shape().at(1); + // todo: skip copy if already set and batch didn't changed _set_encoder_hidden_states_tensor(encoder_hidden_state, batch_size, request); request.set_tensor("input_ids", input_ids); - if (!m_initial_step) { + if (!is_initial_step) { ov::Tensor cache_position_tensor = request.get_tensor("cache_position"); cache_position_tensor.set_shape({1}); cache_position_tensor.data()[0] = m_cache_position; } - if (!m_initial_step) { - if (are_past_key_values_empty(m_request_decoder_with_past)) { - set_past_key_value(m_request_decoder, m_request_decoder_with_past, beam_idx); - } else { - set_past_key_value(m_request_decoder_with_past, m_request_decoder_with_past, beam_idx); - } - } + _set_past_key_value(beam_idx); const auto infer_start = std::chrono::steady_clock::now(); request.infer(); @@ -159,7 +141,6 @@ std::pair WhisperWithPastDecoder::decode(const Tensor& encoder_hi auto output_tensor = request.get_tensor("logits"); - m_initial_step = false; m_cache_position += seq_length; return {output_tensor, infer_ms}; @@ -192,11 +173,43 @@ void WhisperWithPastDecoder::_set_encoder_hidden_states_tensor(const Tensor& enc request.set_tensor("encoder_hidden_states", new_encoder_hidden_states); } +void WhisperWithPastDecoder::_set_past_key_value(const Tensor& beam_idx) { + const bool is_initial_step = m_cache_position == 0; + if (is_initial_step) { + return; + } + + const size_t batch_size = beam_idx.get_shape().at(0); + // no copy needed, just 'link' output tensor with input tensor + const bool can_link_past_key_value = batch_size == 1 && beam_idx.data()[0] == 0; + + if (!m_initial_past_key_value_set) { + if (can_link_past_key_value) { + link_past_key_value(m_request_decoder, m_request_decoder_with_past); + } else { + copy_past_key_value(m_request_decoder, m_request_decoder_with_past, beam_idx); + } + + m_initial_past_key_value_set = true; + return; + } + + if (m_past_key_value_linked) { + return; + } + + if (can_link_past_key_value) { + link_past_key_value(m_request_decoder_with_past, m_request_decoder_with_past); + m_past_key_value_linked = true; + } else { + copy_past_key_value(m_request_decoder_with_past, m_request_decoder_with_past, beam_idx); + } +}; + void WhisperWithPastDecoder::reset_state() { - reset_past_key_values(m_request_decoder_with_past); m_request_decoder_with_past.reset_state(); - m_decoder_with_past_kv_value_set = false; - m_initial_step = true; m_cache_position = 0; + m_initial_past_key_value_set = false; + m_past_key_value_linked = false; } } // namespace ov::genai \ No newline at end of file diff --git a/src/cpp/src/whisper/models/with_past_decoder.hpp b/src/cpp/src/whisper/models/with_past_decoder.hpp index e94f74a6c7..7eb3990cca 100644 --- a/src/cpp/src/whisper/models/with_past_decoder.hpp +++ b/src/cpp/src/whisper/models/with_past_decoder.hpp @@ -26,13 +26,14 @@ class WhisperWithPastDecoder : public WhisperDecoder { private: ov::InferRequest m_request_decoder; ov::InferRequest m_request_decoder_with_past; - bool m_initial_step = true; - bool m_decoder_with_past_kv_value_set = false; size_t m_cache_position = 0; + bool m_initial_past_key_value_set = false; + bool m_past_key_value_linked = false; void _set_encoder_hidden_states_tensor(const Tensor& encoder_hidden_state, const size_t batch_size, InferRequest& request); + void _set_past_key_value(const Tensor& beam_idx); }; } // namespace ov::genai From 
4eaa9a715f7ad0ab8c3d1a3556a9c13117816afa Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Wed, 22 Jan 2025 10:06:29 +0100 Subject: [PATCH 16/27] Do not copy encoder_hidden_states if not needed --- .../src/whisper/models/statefull_decoder.cpp | 24 ++++++++++++------- .../src/whisper/models/with_past_decoder.cpp | 12 ++++++++++ 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/src/cpp/src/whisper/models/statefull_decoder.cpp b/src/cpp/src/whisper/models/statefull_decoder.cpp index 7d6837d323..55b111286b 100644 --- a/src/cpp/src/whisper/models/statefull_decoder.cpp +++ b/src/cpp/src/whisper/models/statefull_decoder.cpp @@ -41,7 +41,6 @@ std::pair WhisperStatefullDecoder::decode(const Tensor& encod const size_t batch_size = input_ids.get_shape().at(0); const size_t seq_len = input_ids.get_shape().at(1); - // todo: skip copy if already set and batch didn't changed _set_encoder_hidden_states_tensor(encoder_hidden_state, batch_size); _set_cache_position_tensor(seq_len); @@ -64,7 +63,15 @@ std::pair WhisperStatefullDecoder::decode(const Tensor& encod */ void WhisperStatefullDecoder::_set_encoder_hidden_states_tensor(const Tensor& encoder_hidden_state, const size_t batch_size) { - _reset_encoder_past_key_values_states(encoder_hidden_state, batch_size); + const size_t current_batch_size = m_request.get_tensor("encoder_hidden_states").get_shape().at(0); + // batch hasn't changed, skip + if (current_batch_size == batch_size) { + return; + } + + if (current_batch_size != 0) { + _reset_encoder_past_key_values_states(encoder_hidden_state, batch_size); + } OPENVINO_ASSERT(encoder_hidden_state.get_shape().at(0) == 1); Shape shape{encoder_hidden_state.get_shape()}; shape[0] = batch_size; @@ -85,15 +92,10 @@ void WhisperStatefullDecoder::_set_encoder_hidden_states_tensor(const Tensor& en m_request.set_tensor("encoder_hidden_states", new_encoder_hidden_states); } -// Ensure encoder past_key values states are reset if batch size changed. This is workaround for Ticket: +// past_key_values states do not shrink/grow when the batch size changes. Reset past_key_values states as a workaround. 
+// Ticket: void WhisperStatefullDecoder::_reset_encoder_past_key_values_states(const Tensor& encoder_hidden_state, const size_t batch_size) { - const size_t current_batch_size = m_request.get_tensor("encoder_hidden_states").get_shape().at(0); - // batch hasn't changed, skip - if (current_batch_size == 0 || current_batch_size == batch_size) { - return; - } - const size_t encoder_state_length_dim = encoder_hidden_state.get_shape().at(1); for (auto& state : m_request.query_state()) { // find encoder states by dimension @@ -122,6 +124,10 @@ void WhisperStatefullDecoder::_set_cache_position_tensor(const size_t seq_len) { void WhisperStatefullDecoder::reset_state() { m_request.reset_state(); m_request.set_tensor("cache_position", ov::Tensor{ov::element::i64, {0}}); + + Shape encoder_hidden_states_shape{m_request.get_tensor("encoder_hidden_states").get_shape()}; + encoder_hidden_states_shape[0] = 0; + m_request.set_tensor("encoder_hidden_states", ov::Tensor{ov::element::f32, encoder_hidden_states_shape}); }; } // namespace ov::genai diff --git a/src/cpp/src/whisper/models/with_past_decoder.cpp b/src/cpp/src/whisper/models/with_past_decoder.cpp index 4541f2e194..11f1ef7713 100644 --- a/src/cpp/src/whisper/models/with_past_decoder.cpp +++ b/src/cpp/src/whisper/models/with_past_decoder.cpp @@ -154,6 +154,12 @@ std::pair WhisperWithPastDecoder::decode(const Tensor& encoder_hi void WhisperWithPastDecoder::_set_encoder_hidden_states_tensor(const Tensor& encoder_hidden_state, const size_t batch_size, InferRequest& request) { + const size_t current_batch_size = request.get_tensor("encoder_hidden_states").get_shape().at(0); + // batch hasn't changed, skip + if (current_batch_size == batch_size) { + return; + } + OPENVINO_ASSERT(encoder_hidden_state.get_shape().at(0) == 1); Shape shape{encoder_hidden_state.get_shape()}; shape[0] = batch_size; @@ -211,5 +217,11 @@ void WhisperWithPastDecoder::reset_state() { m_cache_position = 0; m_initial_past_key_value_set = false; m_past_key_value_linked = false; + + Shape encoder_hidden_states_shape{m_request_decoder_with_past.get_tensor("encoder_hidden_states").get_shape()}; + encoder_hidden_states_shape[0] = 0; + m_request_decoder.set_tensor("encoder_hidden_states", ov::Tensor{ov::element::f32, encoder_hidden_states_shape}); + m_request_decoder_with_past.set_tensor("encoder_hidden_states", + ov::Tensor{ov::element::f32, encoder_hidden_states_shape}); } } // namespace ov::genai \ No newline at end of file From 50fb8298508780eeb2048bd129be9078a5f3f60e Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Wed, 22 Jan 2025 10:55:10 +0100 Subject: [PATCH 17/27] Add stubs --- .../openvino/genai/generation_config.hpp | 4 +- .../genai/whisper_generation_config.hpp | 4 +- .../openvino_genai/py_openvino_genai.pyi | 115 +++++++++++++----- src/python/py_whisper_pipeline.cpp | 67 +++++++--- 4 files changed, 136 insertions(+), 54 deletions(-) diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp index 3a75fc02ea..5fe2d73259 100644 --- a/src/cpp/include/openvino/genai/generation_config.hpp +++ b/src/cpp/include/openvino/genai/generation_config.hpp @@ -143,7 +143,7 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { OPENVINO_DEPRECATED("Please, use `is_assisting_generation()` instead of `is_speculative_decoding()`. 
This method will be removed in 2026.0.0 release") bool is_speculative_decoding() const; - void update_generation_config(const ov::AnyMap& properties); + virtual void update_generation_config(const ov::AnyMap& properties); template util::EnableIfAllStringAny update_generation_config(Properties&&... properties) { @@ -152,7 +152,7 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { /// @brief checks that are no conflicting parameters, e.g. do_sample=true and num_beams > 1. /// @throws Exception if config is invalid. - void validate() const; + virtual void validate() const; }; /* diff --git a/src/cpp/include/openvino/genai/whisper_generation_config.hpp b/src/cpp/include/openvino/genai/whisper_generation_config.hpp index 18b4202609..4443fd01db 100644 --- a/src/cpp/include/openvino/genai/whisper_generation_config.hpp +++ b/src/cpp/include/openvino/genai/whisper_generation_config.hpp @@ -97,7 +97,7 @@ class OPENVINO_GENAI_EXPORTS WhisperGenerationConfig : public GenerationConfig { // A list containing the non-speech tokens that will be suppressed during generation. std::vector suppress_tokens; - void update_generation_config(const ov::AnyMap& config_map = {}); + void update_generation_config(const ov::AnyMap& config_map = {}) override; template util::EnableIfAllStringAny update_generation_config(Properties&&... properties) { @@ -106,7 +106,7 @@ class OPENVINO_GENAI_EXPORTS WhisperGenerationConfig : public GenerationConfig { /// @brief checks that are no conflicting parameters. /// @throws Exception if config is invalid. - void validate() const; + void validate() const override; }; /* diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index bba366401e..bbe581e184 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -348,11 +348,11 @@ class ChunkStreamerBase: """ End is called at the end of generation. It can be used to flush cache if your own streamer has one """ - def put(self, arg0: int) -> bool: + def put(self, token: int) -> bool: """ Put is called every time new token is generated. Returns a bool flag to indicate whether generation should be stopped, if return true generation stops """ - def put_chunk(self, arg0: list[int]) -> bool: + def put_chunk(self, tokens: list[int]) -> bool: """ Put is called every time new token chunk is generated. Returns a bool flag to indicate whether generation should be stopped, if return true generation stops """ @@ -1944,22 +1944,12 @@ class WhisperDecodedResults: @property def texts(self) -> list[str]: ... -class WhisperGenerationConfig: +class WhisperGenerationConfig(GenerationConfig): """ WhisperGenerationConfig - :param max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt + - `max_new_tokens`. Its effect is overridden by `max_new_tokens`, if also set. - :type max_length: int - - :param max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length. - :type max_new_tokens: int - - :param eos_token_id: End of stream token id. - :type eos_token_id: int - + Whisper specific parameters: - :param decoder_start_token_id: Corresponds to the ”<|startoftranscript|>” token. :type decoder_start_token_id: int @@ -2028,18 +2018,55 @@ class WhisperGenerationConfig: auto result = pipeline.generate(raw_speech, ov::genai::hotwords("Polychrome")); // He has gone and gone for good answered Polychrome who... 
:type hotwords: Optional[str] + + Generic parameters: + max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt + + max_new_tokens. Its effect is overridden by `max_new_tokens`, if also set. + max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length. + min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens. + ignore_eos: if set to true, then generation will not stop even if token is met. + eos_token_id: token_id of (end of sentence) + stop_strings: a set of strings that will cause pipeline to stop generating further tokens. + include_stop_str_in_output: if set to true stop string that matched generation will be included in generation output (default: false) + stop_token_ids: a set of tokens that will cause pipeline to stop generating further tokens. + echo: if set to true, the model will echo the prompt in the output. + logprobs: number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned. + Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0). + + repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty. + presence_penalty: reduces absolute log prob if the token was generated at least once. + frequency_penalty: reduces absolute log prob as many times as the token was generated. + + Beam search specific parameters: + num_beams: number of beams for beam search. 1 disables beam search. + num_beam_groups: number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. + diversity_penalty: value is subtracted from a beam's score if it generates the same token as any beam from other group at a particular time. + length_penalty: exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to + the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log + likelihood of the sequence (i.e. negative), length_penalty > 0.0 promotes longer sequences, while + length_penalty < 0.0 encourages shorter sequences. + num_return_sequences: the number of sequences to return for grouped beam search decoding. + no_repeat_ngram_size: if set to int > 0, all ngrams of that size can only occur once. + stop_criteria: controls the stopping condition for grouped beam search. It accepts the following values: + "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; + "openvino_genai.StopCriteria.HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates; + "openvino_genai.StopCriteria.NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm). + + Random sampling parameters: + temperature: the value used to modulate token probabilities for random sampling. + top_p: if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. + top_k: the number of highest probability vocabulary tokens to keep for top-k-filtering. + do_sample: whether or not to use multinomial random sampling that add up to `top_p` or higher are kept. + num_return_sequences: the number of sequences to generate from a single prompt. 
""" begin_suppress_tokens: list[int] decoder_start_token_id: int - eos_token_id: int hotwords: str | None initial_prompt: str | None is_multilingual: bool lang_to_id: dict[str, int] language: str | None max_initial_timestamp_index: int - max_length: int - max_new_tokens: int no_timestamps_token_id: int pad_token_id: int prev_sot_token_id: int @@ -2056,8 +2083,6 @@ class WhisperGenerationConfig: @typing.overload def __init__(self, **kwargs) -> None: ... - def set_eos_token_id(self, tokenizer_eos_token_id: int) -> None: - ... def update_generation_config(self, **kwargs) -> None: ... class WhisperPerfMetrics(PerfMetrics): @@ -2110,18 +2135,8 @@ class WhisperPipeline: WhisperGenerationConfig - :param max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt + - `max_new_tokens`. Its effect is overridden by `max_new_tokens`, if also set. - :type max_length: int - - :param max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length. - :type max_new_tokens: int - - :param eos_token_id: End of stream token id. - :type eos_token_id: int - + Whisper specific parameters: - :param decoder_start_token_id: Corresponds to the ”<|startoftranscript|>” token. :type decoder_start_token_id: int @@ -2190,6 +2205,46 @@ class WhisperPipeline: auto result = pipeline.generate(raw_speech, ov::genai::hotwords("Polychrome")); // He has gone and gone for good answered Polychrome who... :type hotwords: Optional[str] + + Generic parameters: + max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt + + max_new_tokens. Its effect is overridden by `max_new_tokens`, if also set. + max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length. + min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens. + ignore_eos: if set to true, then generation will not stop even if token is met. + eos_token_id: token_id of (end of sentence) + stop_strings: a set of strings that will cause pipeline to stop generating further tokens. + include_stop_str_in_output: if set to true stop string that matched generation will be included in generation output (default: false) + stop_token_ids: a set of tokens that will cause pipeline to stop generating further tokens. + echo: if set to true, the model will echo the prompt in the output. + logprobs: number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned. + Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0). + + repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty. + presence_penalty: reduces absolute log prob if the token was generated at least once. + frequency_penalty: reduces absolute log prob as many times as the token was generated. + + Beam search specific parameters: + num_beams: number of beams for beam search. 1 disables beam search. + num_beam_groups: number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. + diversity_penalty: value is subtracted from a beam's score if it generates the same token as any beam from other group at a particular time. + length_penalty: exponential penalty to the length that is used with beam-based generation. 
It is applied as an exponent to + the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log + likelihood of the sequence (i.e. negative), length_penalty > 0.0 promotes longer sequences, while + length_penalty < 0.0 encourages shorter sequences. + num_return_sequences: the number of sequences to return for grouped beam search decoding. + no_repeat_ngram_size: if set to int > 0, all ngrams of that size can only occur once. + stop_criteria: controls the stopping condition for grouped beam search. It accepts the following values: + "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; + "openvino_genai.StopCriteria.HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates; + "openvino_genai.StopCriteria.NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm). + + Random sampling parameters: + temperature: the value used to modulate token probabilities for random sampling. + top_p: if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. + top_k: the number of highest probability vocabulary tokens to keep for top-k-filtering. + do_sample: whether or not to use multinomial random sampling that add up to `top_p` or higher are kept. + num_return_sequences: the number of sequences to generate from a single prompt. """ def get_generation_config(self) -> WhisperGenerationConfig: ... diff --git a/src/python/py_whisper_pipeline.cpp b/src/python/py_whisper_pipeline.cpp index 55728409e8..d6ddaedbcf 100644 --- a/src/python/py_whisper_pipeline.cpp +++ b/src/python/py_whisper_pipeline.cpp @@ -17,6 +17,7 @@ namespace py = pybind11; using ov::genai::ChunkStreamerBase; using ov::genai::ChunkStreamerVariant; using ov::genai::DecodedResults; +using ov::genai::GenerationConfig; using ov::genai::OptionalWhisperGenerationConfig; using ov::genai::PerfMetrics; using ov::genai::RawSpeechInput; @@ -76,18 +77,8 @@ auto whisper_decoded_result_chunk = R"( auto whisper_generation_config_docstring = R"( WhisperGenerationConfig - :param max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt + - `max_new_tokens`. Its effect is overridden by `max_new_tokens`, if also set. - :type max_length: int - - :param max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length. - :type max_new_tokens: int - - :param eos_token_id: End of stream token id. - :type eos_token_id: int - + Whisper specific parameters: - :param decoder_start_token_id: Corresponds to the ”<|startoftranscript|>” token. :type decoder_start_token_id: int @@ -156,6 +147,46 @@ auto whisper_generation_config_docstring = R"( auto result = pipeline.generate(raw_speech, ov::genai::hotwords("Polychrome")); // He has gone and gone for good answered Polychrome who... :type hotwords: Optional[str] + + Generic parameters: + max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt + + max_new_tokens. Its effect is overridden by `max_new_tokens`, if also set. + max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length. 
+ min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens. + ignore_eos: if set to true, then generation will not stop even if token is met. + eos_token_id: token_id of (end of sentence) + stop_strings: a set of strings that will cause pipeline to stop generating further tokens. + include_stop_str_in_output: if set to true stop string that matched generation will be included in generation output (default: false) + stop_token_ids: a set of tokens that will cause pipeline to stop generating further tokens. + echo: if set to true, the model will echo the prompt in the output. + logprobs: number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned. + Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0). + + repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty. + presence_penalty: reduces absolute log prob if the token was generated at least once. + frequency_penalty: reduces absolute log prob as many times as the token was generated. + + Beam search specific parameters: + num_beams: number of beams for beam search. 1 disables beam search. + num_beam_groups: number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. + diversity_penalty: value is subtracted from a beam's score if it generates the same token as any beam from other group at a particular time. + length_penalty: exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to + the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log + likelihood of the sequence (i.e. negative), length_penalty > 0.0 promotes longer sequences, while + length_penalty < 0.0 encourages shorter sequences. + num_return_sequences: the number of sequences to return for grouped beam search decoding. + no_repeat_ngram_size: if set to int > 0, all ngrams of that size can only occur once. + stop_criteria: controls the stopping condition for grouped beam search. It accepts the following values: + "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; + "openvino_genai.StopCriteria.HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates; + "openvino_genai.StopCriteria.NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm). + + Random sampling parameters: + temperature: the value used to modulate token probabilities for random sampling. + top_p: if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. + top_k: the number of highest probability vocabulary tokens to keep for top-k-filtering. + do_sample: whether or not to use multinomial random sampling that add up to `top_p` or higher are kept. + num_return_sequences: the number of sequences to generate from a single prompt. )"; auto streamer_base_docstring = R"( @@ -274,17 +305,16 @@ void init_whisper_pipeline(py::module_& m) { "End is called at the end of generation. 
It can be used to flush cache if your own streamer has one"); // Binding for WhisperGenerationConfig - py::class_(m, "WhisperGenerationConfig", whisper_generation_config_docstring) + py::class_(m, + "WhisperGenerationConfig", + whisper_generation_config_docstring) .def(py::init(), py::arg("json_path"), "path where generation_config.json is stored") .def(py::init([](const py::kwargs& kwargs) { return *update_whisper_config_from_kwargs(WhisperGenerationConfig(), kwargs); })) - .def_readwrite("max_new_tokens", &WhisperGenerationConfig::max_new_tokens) - .def_readwrite("max_length", &WhisperGenerationConfig::max_length) .def_readwrite("begin_suppress_tokens", &WhisperGenerationConfig::begin_suppress_tokens) .def_readwrite("suppress_tokens", &WhisperGenerationConfig::suppress_tokens) .def_readwrite("decoder_start_token_id", &WhisperGenerationConfig::decoder_start_token_id) - .def_readwrite("eos_token_id", &WhisperGenerationConfig::eos_token_id) .def_readwrite("pad_token_id", &WhisperGenerationConfig::pad_token_id) .def_readwrite("translate_token_id", &WhisperGenerationConfig::translate_token_id) .def_readwrite("transcribe_token_id", &WhisperGenerationConfig::transcribe_token_id) @@ -298,12 +328,9 @@ void init_whisper_pipeline(py::module_& m) { .def_readwrite("return_timestamps", &WhisperGenerationConfig::return_timestamps) .def_readwrite("initial_prompt", &WhisperGenerationConfig::initial_prompt) .def_readwrite("hotwords", &WhisperGenerationConfig::hotwords) - .def("set_eos_token_id", &WhisperGenerationConfig::set_eos_token_id, py::arg("tokenizer_eos_token_id")) - .def("update_generation_config", []( - ov::genai::WhisperGenerationConfig& config, - const py::kwargs& kwargs) { + .def("update_generation_config", [](ov::genai::WhisperGenerationConfig& config, const py::kwargs& kwargs) { config.update_generation_config(pyutils::kwargs_to_any_map(kwargs)); - });; + }); py::class_(m, "WhisperRawPerfMetrics", raw_perf_metrics_docstring) .def(py::init<>()) From 502174285d349a325a99e7b763126245bd31ad9e Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Wed, 22 Jan 2025 10:59:06 +0100 Subject: [PATCH 18/27] Remove comment --- src/cpp/src/whisper/models/with_past_decoder.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/cpp/src/whisper/models/with_past_decoder.cpp b/src/cpp/src/whisper/models/with_past_decoder.cpp index a6c68bffcb..60e1adcd3c 100644 --- a/src/cpp/src/whisper/models/with_past_decoder.cpp +++ b/src/cpp/src/whisper/models/with_past_decoder.cpp @@ -123,7 +123,6 @@ std::pair WhisperWithPastDecoder::decode(const Tensor& encoder_hi const size_t batch_size = input_ids.get_shape().at(0); const size_t seq_length = input_ids.get_shape().at(1); - // todo: skip copy if already set and batch didn't changed _set_encoder_hidden_states_tensor(encoder_hidden_state, batch_size, request); request.set_tensor("input_ids", input_ids); From 04318d10a6ded8290bf01ae90ed7d94532a2eeab Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Wed, 22 Jan 2025 11:24:38 +0100 Subject: [PATCH 19/27] Add args name --- src/python/py_whisper_pipeline.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/python/py_whisper_pipeline.cpp b/src/python/py_whisper_pipeline.cpp index d6ddaedbcf..aac14c258a 100644 --- a/src/python/py_whisper_pipeline.cpp +++ b/src/python/py_whisper_pipeline.cpp @@ -295,11 +295,13 @@ void init_whisper_pipeline(py::module_& m) { .def("put", &ChunkStreamerBase::put, "Put is called every time new token is generated. 
Returns a bool flag to indicate whether generation " - "should be stopped, if return true generation stops") + "should be stopped, if return true generation stops", + py::arg("token")) .def("put_chunk", &ChunkStreamerBase::put_chunk, "Put is called every time new token chunk is generated. Returns a bool flag to indicate whether " - "generation should be stopped, if return true generation stops") + "generation should be stopped, if return true generation stops", + py::arg("tokens")) .def("end", &ChunkStreamerBase::end, "End is called at the end of generation. It can be used to flush cache if your own streamer has one"); From 9e08d180fdd48685fd5a755eb6e7cdc1a2637648 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Wed, 22 Jan 2025 14:28:27 +0100 Subject: [PATCH 20/27] add tests --- src/cpp/src/whisper/whisper.cpp | 2 +- src/cpp/src/whisper_generation_config.cpp | 10 ++--- tests/python_tests/test_whisper_pipeline.py | 50 ++++++++++++++++++++- 3 files changed, 53 insertions(+), 9 deletions(-) diff --git a/src/cpp/src/whisper/whisper.cpp b/src/cpp/src/whisper/whisper.cpp index 26fb735903..cbc26e490e 100644 --- a/src/cpp/src/whisper/whisper.cpp +++ b/src/cpp/src/whisper/whisper.cpp @@ -101,7 +101,7 @@ std::pair decode(std::shared_ptrhas_finished()) { + while (!sequence_group->has_finished() && !sequence_group->handle_dropped()) { std::map> batch_to_generated_ids{}; sequence_group->schedule_tokens(1); diff --git a/src/cpp/src/whisper_generation_config.cpp b/src/cpp/src/whisper_generation_config.cpp index 85b3635ee6..ec12170cf9 100644 --- a/src/cpp/src/whisper_generation_config.cpp +++ b/src/cpp/src/whisper_generation_config.cpp @@ -82,12 +82,10 @@ void WhisperGenerationConfig::validate() const { OPENVINO_ASSERT(!task.has_value(), "Cannot specify 'task' for not multilingual model."); } - if (is_beam_search()) { - OPENVINO_ASSERT(num_return_sequences == 1, - "'num_return_sequences' must be 1. Provided: ", - num_return_sequences, - "."); - } + OPENVINO_ASSERT(num_return_sequences == 1, + "'num_return_sequences' must be 1. 
Provided: ", + num_return_sequences, + "."); OPENVINO_ASSERT(!is_assisting_generation(), "Assisted generation is not supported."); } diff --git a/tests/python_tests/test_whisper_pipeline.py b/tests/python_tests/test_whisper_pipeline.py index 4fe239b358..3893becd7e 100644 --- a/tests/python_tests/test_whisper_pipeline.py +++ b/tests/python_tests/test_whisper_pipeline.py @@ -126,9 +126,14 @@ def run_huggingface( return pipeline( sample, - max_new_tokens=min(config.max_new_tokens, 444), return_timestamps=config.return_timestamps, - generate_kwargs={"language": config.language, "task": config.task}, + generate_kwargs={ + "language": config.language, + "task": config.task, + "max_new_tokens": min(config.max_new_tokens, 444), + "top_p": config.top_p, + "do_sample": config.do_sample, + }, ) @@ -147,6 +152,8 @@ def run_genai( genai_config.return_timestamps = config.return_timestamps genai_config.task = config.task genai_config.language = f"<|{config.language}|>" if config.language else None + genai_config.do_sample = config.do_sample + genai_config.top_p = config.top_p return pipeline.generate(sample, genai_config, streamer=streamer) @@ -555,6 +562,45 @@ def test_initial_prompt_hotwords(model_descr, test_sample): assert "Joel Kyton" in result.texts[0] +@pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) +@pytest.mark.parametrize("test_sample", get_samples_from_dataset(length=1)) +@pytest.mark.precommit +def test_random_sampling(model_descr, test_sample): + _, _, hf_pipe, genai_pipe = read_whisper_model(model_descr) + + config = ov_genai.WhisperGenerationConfig(do_sample=True, top_p=0.01) + + genai_result = run_genai( + genai_pipe, + test_sample, + config=config, + ) + + hf_result = run_huggingface( + hf_pipe, + test_sample, + config=config, + ) + + compare_results(hf_result, genai_result) + + config.top_p = 0.6 + + genai_result = run_genai( + genai_pipe, + test_sample, + config=config, + ) + + hf_result = run_huggingface( + hf_pipe, + test_sample, + config=config, + ) + + assert genai_result.texts[0] != hf_result["text"] + + @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) @pytest.mark.parametrize( "test_sample", From 56bf11c1c07558e764049a541788d54697307f40 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Wed, 22 Jan 2025 17:35:00 +0100 Subject: [PATCH 21/27] Apply review comments --- src/cpp/include/openvino/genai/generation_config.hpp | 4 ++-- src/cpp/src/whisper/whisper.cpp | 8 +++----- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp index 5fe2d73259..3a75fc02ea 100644 --- a/src/cpp/include/openvino/genai/generation_config.hpp +++ b/src/cpp/include/openvino/genai/generation_config.hpp @@ -143,7 +143,7 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { OPENVINO_DEPRECATED("Please, use `is_assisting_generation()` instead of `is_speculative_decoding()`. This method will be removed in 2026.0.0 release") bool is_speculative_decoding() const; - virtual void update_generation_config(const ov::AnyMap& properties); + void update_generation_config(const ov::AnyMap& properties); template util::EnableIfAllStringAny update_generation_config(Properties&&... properties) { @@ -152,7 +152,7 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { /// @brief checks that are no conflicting parameters, e.g. do_sample=true and num_beams > 1. /// @throws Exception if config is invalid. 
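The tightened validation earlier in this series (num_return_sequences must be 1 for Whisper even without beam search) can be exercised directly against validate(). This is a standalone sketch, not part of the patch; it assumes a default-constructed WhisperGenerationConfig passes the multilingual checks and that the failed assertion surfaces as a std::exception:

    #include <iostream>
    #include "openvino/genai/whisper_generation_config.hpp"

    int main() {
        ov::genai::WhisperGenerationConfig config;
        config.num_return_sequences = 2;  // anything other than 1 is now rejected

        try {
            config.validate();
            std::cerr << "unexpected: validate() accepted num_return_sequences == 2\n";
        } catch (const std::exception& ex) {
            std::cout << "rejected as expected: " << ex.what() << "\n";
        }
        return 0;
    }

The new random-sampling tests above lean on the same idea: do_sample and top_p come from the inherited GenerationConfig surface rather than from Whisper-specific fields.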
- virtual void validate() const; + void validate() const; }; /* diff --git a/src/cpp/src/whisper/whisper.cpp b/src/cpp/src/whisper/whisper.cpp index cbc26e490e..4031149163 100644 --- a/src/cpp/src/whisper/whisper.cpp +++ b/src/cpp/src/whisper/whisper.cpp @@ -90,12 +90,10 @@ std::pair decode(std::shared_ptrupdate_processed_tokens_num(sequence_group->get_prompt_len() - output_sequence_len); - sequence_group->schedule_tokens(output_sequence_len); + sequence_group->schedule_tokens(sequence_group->get_prompt_len()); + sequence_group->set_output_seq_len(output_sequence_len); sampler.sample({sequence_group}, logits); stream_generated_tokens(); From 50eb509cc9a61f2a71291a38391307d13d942a2b Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Thu, 23 Jan 2025 10:51:28 +0100 Subject: [PATCH 22/27] move set_encoder_states to base class --- .../genai/whisper_generation_config.hpp | 4 +- src/cpp/src/whisper/models/decoder.cpp | 33 ++++++++++++ src/cpp/src/whisper/models/decoder.hpp | 5 ++ .../src/whisper/models/statefull_decoder.cpp | 52 +------------------ .../src/whisper/models/statefull_decoder.hpp | 2 - .../src/whisper/models/with_past_decoder.cpp | 33 ------------ .../src/whisper/models/with_past_decoder.hpp | 3 -- 7 files changed, 41 insertions(+), 91 deletions(-) diff --git a/src/cpp/include/openvino/genai/whisper_generation_config.hpp b/src/cpp/include/openvino/genai/whisper_generation_config.hpp index 4443fd01db..18b4202609 100644 --- a/src/cpp/include/openvino/genai/whisper_generation_config.hpp +++ b/src/cpp/include/openvino/genai/whisper_generation_config.hpp @@ -97,7 +97,7 @@ class OPENVINO_GENAI_EXPORTS WhisperGenerationConfig : public GenerationConfig { // A list containing the non-speech tokens that will be suppressed during generation. std::vector suppress_tokens; - void update_generation_config(const ov::AnyMap& config_map = {}) override; + void update_generation_config(const ov::AnyMap& config_map = {}); template util::EnableIfAllStringAny update_generation_config(Properties&&... properties) { @@ -106,7 +106,7 @@ class OPENVINO_GENAI_EXPORTS WhisperGenerationConfig : public GenerationConfig { /// @brief checks that are no conflicting parameters. /// @throws Exception if config is invalid. - void validate() const override; + void validate() const; }; /* diff --git a/src/cpp/src/whisper/models/decoder.cpp b/src/cpp/src/whisper/models/decoder.cpp index 32a8f2eff6..0b2a083908 100644 --- a/src/cpp/src/whisper/models/decoder.cpp +++ b/src/cpp/src/whisper/models/decoder.cpp @@ -22,5 +22,38 @@ std::shared_ptr WhisperDecoder::from_path(const std::filesystem: return std::make_shared(models_path, device, properties); } +/** + * Encoder hidden states expected to be with batch 1 + * Copy encoder hidden state tensor from batch 1 to requested batch_size. + * Set new encoder hidden states tensor to infer request. 
+ */ +void WhisperDecoder::_set_encoder_hidden_states_tensor(const Tensor& encoder_hidden_state, + const size_t batch_size, + InferRequest& request) { + const size_t current_batch_size = request.get_tensor("encoder_hidden_states").get_shape().at(0); + // batch hasn't changed, skip + if (current_batch_size == batch_size) { + return; + } + + OPENVINO_ASSERT(encoder_hidden_state.get_shape().at(0) == 1); + Shape shape{encoder_hidden_state.get_shape()}; + shape[0] = batch_size; + + Tensor new_encoder_hidden_states{ov::element::f32, shape}; + + auto new_encoder_hidden_states_data = new_encoder_hidden_states.data(); + auto encoder_hidden_state_data = encoder_hidden_state.data(); + + for (size_t batch = 0; batch < batch_size; batch++) { + const size_t batch_offset = batch * encoder_hidden_state.get_size(); + std::memcpy(new_encoder_hidden_states_data + batch_offset, + encoder_hidden_state_data, + encoder_hidden_state.get_byte_size()); + } + + request.set_tensor("encoder_hidden_states", new_encoder_hidden_states); +} + WhisperDecoder::~WhisperDecoder() = default; } // namespace ov::genai diff --git a/src/cpp/src/whisper/models/decoder.hpp b/src/cpp/src/whisper/models/decoder.hpp index acb10d92b5..66e86a0733 100644 --- a/src/cpp/src/whisper/models/decoder.hpp +++ b/src/cpp/src/whisper/models/decoder.hpp @@ -25,5 +25,10 @@ class WhisperDecoder { virtual void reset_state() = 0; virtual ~WhisperDecoder(); + +protected: + void _set_encoder_hidden_states_tensor(const Tensor& encoder_hidden_state, + const size_t batch_size, + InferRequest& request); }; } // namespace ov::genai diff --git a/src/cpp/src/whisper/models/statefull_decoder.cpp b/src/cpp/src/whisper/models/statefull_decoder.cpp index 55b111286b..9c0c4a0b3f 100644 --- a/src/cpp/src/whisper/models/statefull_decoder.cpp +++ b/src/cpp/src/whisper/models/statefull_decoder.cpp @@ -41,7 +41,7 @@ std::pair WhisperStatefullDecoder::decode(const Tensor& encod const size_t batch_size = input_ids.get_shape().at(0); const size_t seq_len = input_ids.get_shape().at(1); - _set_encoder_hidden_states_tensor(encoder_hidden_state, batch_size); + _set_encoder_hidden_states_tensor(encoder_hidden_state, batch_size, m_request); _set_cache_position_tensor(seq_len); m_request.set_tensor("input_ids", input_ids); @@ -56,56 +56,6 @@ std::pair WhisperStatefullDecoder::decode(const Tensor& encod return {output_tensor, infer_ms}; }; -/** - * Encoder hidden states expected to be with batch 1 - * Copy encoder hidden state tensor from batch 1 to requested batch_size. - * Set new encoder hidden states tensor to infer request. 
- */ -void WhisperStatefullDecoder::_set_encoder_hidden_states_tensor(const Tensor& encoder_hidden_state, - const size_t batch_size) { - const size_t current_batch_size = m_request.get_tensor("encoder_hidden_states").get_shape().at(0); - // batch hasn't changed, skip - if (current_batch_size == batch_size) { - return; - } - - if (current_batch_size != 0) { - _reset_encoder_past_key_values_states(encoder_hidden_state, batch_size); - } - - OPENVINO_ASSERT(encoder_hidden_state.get_shape().at(0) == 1); - Shape shape{encoder_hidden_state.get_shape()}; - shape[0] = batch_size; - - Tensor new_encoder_hidden_states{ov::element::f32, shape}; - - auto new_encoder_hidden_states_data = new_encoder_hidden_states.data(); - auto encoder_hidden_state_data = encoder_hidden_state.data(); - - for (size_t batch = 0; batch < batch_size; batch++) { - const size_t batch_offset = batch * encoder_hidden_state.get_size(); - std::memcpy(new_encoder_hidden_states_data + batch_offset, - encoder_hidden_state_data, - encoder_hidden_state.get_byte_size()); - } - - m_request.set_tensor("encoder_hidden_states", new_encoder_hidden_states); -} - -// Past_key value states are not shring/grow when batch is changed. Reset past_key values states as a workaround. -// Ticket: -void WhisperStatefullDecoder::_reset_encoder_past_key_values_states(const Tensor& encoder_hidden_state, - const size_t batch_size) { - const size_t encoder_state_length_dim = encoder_hidden_state.get_shape().at(1); - for (auto& state : m_request.query_state()) { - // find encoder states by dimension - const Shape& state_shape = state.get_state().get_shape(); - if (state_shape.at(2) == encoder_state_length_dim) { - state.reset(); - } - } -} - void WhisperStatefullDecoder::_set_cache_position_tensor(const size_t seq_len) { ov::Tensor cache_position_tensor = m_request.get_tensor("cache_position"); diff --git a/src/cpp/src/whisper/models/statefull_decoder.hpp b/src/cpp/src/whisper/models/statefull_decoder.hpp index 4d4572c33d..44156fc6aa 100644 --- a/src/cpp/src/whisper/models/statefull_decoder.hpp +++ b/src/cpp/src/whisper/models/statefull_decoder.hpp @@ -24,8 +24,6 @@ class WhisperStatefullDecoder : public WhisperDecoder { void reset_state() override; private: - void _set_encoder_hidden_states_tensor(const Tensor& encoder_hidden_state, const size_t batch_size); - void _reset_encoder_past_key_values_states(const Tensor& encoder_hidden_state, const size_t batch_size); void _set_cache_position_tensor(const size_t seq_len); private: diff --git a/src/cpp/src/whisper/models/with_past_decoder.cpp b/src/cpp/src/whisper/models/with_past_decoder.cpp index 60e1adcd3c..2ab07112fa 100644 --- a/src/cpp/src/whisper/models/with_past_decoder.cpp +++ b/src/cpp/src/whisper/models/with_past_decoder.cpp @@ -145,39 +145,6 @@ std::pair WhisperWithPastDecoder::decode(const Tensor& encoder_hi return {output_tensor, infer_ms}; } -/** - * Encoder hidden states expected to be with batch 1 - * Copy encoder hidden state tensor from batch 1 to requested batch_size. - * Set new encoder hidden states tensor to infer request. 
- */ -void WhisperWithPastDecoder::_set_encoder_hidden_states_tensor(const Tensor& encoder_hidden_state, - const size_t batch_size, - InferRequest& request) { - const size_t current_batch_size = request.get_tensor("encoder_hidden_states").get_shape().at(0); - // batch hasn't changed, skip - if (current_batch_size == batch_size) { - return; - } - - OPENVINO_ASSERT(encoder_hidden_state.get_shape().at(0) == 1); - Shape shape{encoder_hidden_state.get_shape()}; - shape[0] = batch_size; - - Tensor new_encoder_hidden_states{ov::element::f32, shape}; - - auto new_encoder_hidden_states_data = new_encoder_hidden_states.data(); - auto encoder_hidden_state_data = encoder_hidden_state.data(); - - for (size_t batch = 0; batch < batch_size; batch++) { - const size_t batch_offset = batch * encoder_hidden_state.get_size(); - std::memcpy(new_encoder_hidden_states_data + batch_offset, - encoder_hidden_state_data, - encoder_hidden_state.get_byte_size()); - } - - request.set_tensor("encoder_hidden_states", new_encoder_hidden_states); -} - void WhisperWithPastDecoder::_set_past_key_value(const Tensor& beam_idx) { const bool is_initial_step = m_cache_position == 0; if (is_initial_step) { diff --git a/src/cpp/src/whisper/models/with_past_decoder.hpp b/src/cpp/src/whisper/models/with_past_decoder.hpp index 7eb3990cca..3cf4404092 100644 --- a/src/cpp/src/whisper/models/with_past_decoder.hpp +++ b/src/cpp/src/whisper/models/with_past_decoder.hpp @@ -30,9 +30,6 @@ class WhisperWithPastDecoder : public WhisperDecoder { bool m_initial_past_key_value_set = false; bool m_past_key_value_linked = false; - void _set_encoder_hidden_states_tensor(const Tensor& encoder_hidden_state, - const size_t batch_size, - InferRequest& request); void _set_past_key_value(const Tensor& beam_idx); }; From abde309857f2209e092d9920e0f8fe2672a3c013 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Thu, 23 Jan 2025 11:05:26 +0100 Subject: [PATCH 23/27] Move detect_language to base decoder --- .../whisper_speech_recognition.cpp | 2 +- src/cpp/src/whisper/models/decoder.cpp | 17 +++++++++++++++++ src/cpp/src/whisper/models/decoder.hpp | 3 +-- .../src/whisper/models/statefull_decoder.cpp | 17 ----------------- .../src/whisper/models/statefull_decoder.hpp | 3 --- .../src/whisper/models/with_past_decoder.cpp | 17 ----------------- .../src/whisper/models/with_past_decoder.hpp | 3 --- 7 files changed, 19 insertions(+), 43 deletions(-) diff --git a/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp b/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp index 3b2b4ff466..cbb932a74d 100644 --- a/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp +++ b/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp @@ -18,7 +18,7 @@ int main(int argc, char* argv[]) try { ov::genai::WhisperGenerationConfig config = pipeline.get_generation_config(); config.max_new_tokens = 100; // increase this based on your speech length // 'task' and 'language' parameters are supported for multilingual models only - config.language = "<|en|>"; // can switch to <|zh|> for Chinese language + // config.language = "<|en|>"; // can switch to <|zh|> for Chinese language config.task = "transcribe"; config.return_timestamps = true; diff --git a/src/cpp/src/whisper/models/decoder.cpp b/src/cpp/src/whisper/models/decoder.cpp index 0b2a083908..1c8df0edd9 100644 --- a/src/cpp/src/whisper/models/decoder.cpp +++ b/src/cpp/src/whisper/models/decoder.cpp @@ -22,6 +22,23 @@ std::shared_ptr WhisperDecoder::from_path(const 
std::filesystem: return std::make_shared(models_path, device, properties); } +std::pair WhisperDecoder::detect_language(const ov::Tensor& encoder_hidden_state, + const int64_t decoder_start_token_id) { + Tensor input_ids_tensor{ov::element::i64, {1, 1}}; + input_ids_tensor.data()[0] = decoder_start_token_id; + + Tensor beam_idx_tensor{ov::element::i32, {1}}; + beam_idx_tensor.data()[0] = 0; + + auto [output_tensor, infer_ms] = decode(encoder_hidden_state, input_ids_tensor, beam_idx_tensor); + + int64_t output_token = ov::genai::utils::argmax(output_tensor, 0); + + reset_state(); + + return {output_token, infer_ms}; +} + /** * Encoder hidden states expected to be with batch 1 * Copy encoder hidden state tensor from batch 1 to requested batch_size. diff --git a/src/cpp/src/whisper/models/decoder.hpp b/src/cpp/src/whisper/models/decoder.hpp index 66e86a0733..6eeba2b387 100644 --- a/src/cpp/src/whisper/models/decoder.hpp +++ b/src/cpp/src/whisper/models/decoder.hpp @@ -15,8 +15,7 @@ class WhisperDecoder { const std::string& device, const ov::AnyMap& properties); - virtual std::pair detect_language(const Tensor& encoder_hidden_state, - const int64_t decoder_start_token_id) = 0; + std::pair detect_language(const Tensor& encoder_hidden_state, const int64_t decoder_start_token_id); virtual std::pair decode(const Tensor& encoder_hidden_state, const Tensor& input_ids, diff --git a/src/cpp/src/whisper/models/statefull_decoder.cpp b/src/cpp/src/whisper/models/statefull_decoder.cpp index 9c0c4a0b3f..5208f496fb 100644 --- a/src/cpp/src/whisper/models/statefull_decoder.cpp +++ b/src/cpp/src/whisper/models/statefull_decoder.cpp @@ -18,23 +18,6 @@ WhisperStatefullDecoder::WhisperStatefullDecoder(const std::filesystem::path& mo m_request = compiled_model.create_infer_request(); } -std::pair WhisperStatefullDecoder::detect_language(const ov::Tensor& encoder_hidden_state, - const int64_t decoder_start_token_id) { - Tensor input_ids_tensor{ov::element::i64, {1, 1}}; - input_ids_tensor.data()[0] = decoder_start_token_id; - - Tensor beam_idx_tensor{ov::element::i32, {1}}; - beam_idx_tensor.data()[0] = 0; - - auto [output_tensor, infer_ms] = decode(encoder_hidden_state, input_ids_tensor, beam_idx_tensor); - - int64_t output_token = ov::genai::utils::argmax(output_tensor, 0); - - reset_state(); - - return {output_token, infer_ms}; -} - std::pair WhisperStatefullDecoder::decode(const Tensor& encoder_hidden_state, const Tensor& input_ids, const Tensor& beam_idx) { diff --git a/src/cpp/src/whisper/models/statefull_decoder.hpp b/src/cpp/src/whisper/models/statefull_decoder.hpp index 44156fc6aa..c8c733e943 100644 --- a/src/cpp/src/whisper/models/statefull_decoder.hpp +++ b/src/cpp/src/whisper/models/statefull_decoder.hpp @@ -14,9 +14,6 @@ class WhisperStatefullDecoder : public WhisperDecoder { const std::string& device, const ov::AnyMap& properties); - std::pair detect_language(const Tensor& encoder_hidden_state, - const int64_t decoder_start_token_id) override; - std::pair decode(const Tensor& encoder_hidden_state, const Tensor& input_ids, const Tensor& beam_idx) override; diff --git a/src/cpp/src/whisper/models/with_past_decoder.cpp b/src/cpp/src/whisper/models/with_past_decoder.cpp index 2ab07112fa..1ade0dea6b 100644 --- a/src/cpp/src/whisper/models/with_past_decoder.cpp +++ b/src/cpp/src/whisper/models/with_past_decoder.cpp @@ -97,23 +97,6 @@ WhisperWithPastDecoder::WhisperWithPastDecoder(const std::filesystem::path& mode m_request_decoder_with_past = compiled_model.create_infer_request(); } -std::pair 
WhisperWithPastDecoder::detect_language(const ov::Tensor& encoder_hidden_state, - const int64_t decoder_start_token_id) { - Tensor input_ids_tensor{ov::element::i64, {1, 1}}; - input_ids_tensor.data()[0] = decoder_start_token_id; - - Tensor beam_idx_tensor{ov::element::i32, {1}}; - beam_idx_tensor.data()[0] = 0; - - auto [output_tensor, infer_ms] = decode(encoder_hidden_state, input_ids_tensor, beam_idx_tensor); - - int64_t output_token = ov::genai::utils::argmax(output_tensor, 0); - - reset_state(); - - return {output_token, infer_ms}; -} - std::pair WhisperWithPastDecoder::decode(const Tensor& encoder_hidden_state, const Tensor& input_ids, const Tensor& beam_idx) { diff --git a/src/cpp/src/whisper/models/with_past_decoder.hpp b/src/cpp/src/whisper/models/with_past_decoder.hpp index 3cf4404092..1610c60d4e 100644 --- a/src/cpp/src/whisper/models/with_past_decoder.hpp +++ b/src/cpp/src/whisper/models/with_past_decoder.hpp @@ -14,9 +14,6 @@ class WhisperWithPastDecoder : public WhisperDecoder { const std::string& device, const ov::AnyMap& properties); - std::pair detect_language(const Tensor& encoder_hidden_state, - const int64_t decoder_start_token_id) override; - std::pair decode(const Tensor& encoder_hidden_state, const Tensor& input_ids, const Tensor& beam_idx) override; From 3348ad5f5cc91ec9a616b7b8bbb28003bce1616a Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Thu, 23 Jan 2025 11:05:58 +0100 Subject: [PATCH 24/27] revert sample --- .../whisper_speech_recognition/whisper_speech_recognition.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp b/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp index cbb932a74d..3b2b4ff466 100644 --- a/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp +++ b/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp @@ -18,7 +18,7 @@ int main(int argc, char* argv[]) try { ov::genai::WhisperGenerationConfig config = pipeline.get_generation_config(); config.max_new_tokens = 100; // increase this based on your speech length // 'task' and 'language' parameters are supported for multilingual models only - // config.language = "<|en|>"; // can switch to <|zh|> for Chinese language + config.language = "<|en|>"; // can switch to <|zh|> for Chinese language config.task = "transcribe"; config.return_timestamps = true; From d63e70079939696f6fa51fa516b872ee77f75be4 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Thu, 23 Jan 2025 14:20:09 +0100 Subject: [PATCH 25/27] Move whisper utils --- src/cpp/src/utils.cpp | 16 ---------------- src/cpp/src/utils.hpp | 2 -- src/cpp/src/whisper/models/decoder.cpp | 2 +- src/cpp/src/whisper/whisper_utils.cpp | 16 ++++++++++++++++ src/cpp/src/whisper/whisper_utils.hpp | 2 ++ 5 files changed, 19 insertions(+), 19 deletions(-) diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp index dd3051b8b0..a8cf844cb7 100644 --- a/src/cpp/src/utils.cpp +++ b/src/cpp/src/utils.cpp @@ -46,22 +46,6 @@ void print_tensor(const ov::Tensor& tensor) { std::cout << "]" << std::endl; } -int64_t argmax(const ov::Tensor& logits, const size_t batch_idx) { - if (logits.get_shape()[0] <= batch_idx) { - OPENVINO_THROW("logits batch size doesn't match the number of beams"); - } - - size_t vocab_size = logits.get_shape().back(); - size_t batch_offset = batch_idx * logits.get_shape()[1] * vocab_size; - size_t sequence_offset = (logits.get_shape()[1] - 1) * vocab_size; - const float* logits_data = logits.data() 
+ batch_offset + sequence_offset; - - int64_t out_token = std::max_element(logits_data, logits_data + vocab_size) - logits_data; - float max_logit = logits_data[out_token]; - - return out_token; -} - /** * Initializes position ids based on attention mask and starting position */ diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp index c25b2c3913..8c56c39a8c 100644 --- a/src/cpp/src/utils.hpp +++ b/src/cpp/src/utils.hpp @@ -47,8 +47,6 @@ Tensor init_attention_mask(const Tensor& position_ids); void print_tensor(const ov::Tensor& tensor); -int64_t argmax(const ov::Tensor& logits, const size_t batch_idx); - void initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask, int64_t start_pos = 0); ov::Tensor extend_attention(ov::Tensor attention_mask); diff --git a/src/cpp/src/whisper/models/decoder.cpp b/src/cpp/src/whisper/models/decoder.cpp index 1c8df0edd9..c09a84ccdd 100644 --- a/src/cpp/src/whisper/models/decoder.cpp +++ b/src/cpp/src/whisper/models/decoder.cpp @@ -6,7 +6,7 @@ #include #include "statefull_decoder.hpp" -#include "utils.hpp" +#include "whisper/whisper_utils.hpp" #include "with_past_decoder.hpp" namespace ov::genai { diff --git a/src/cpp/src/whisper/whisper_utils.cpp b/src/cpp/src/whisper/whisper_utils.cpp index 6e56a1439d..f41d3d11d8 100644 --- a/src/cpp/src/whisper/whisper_utils.cpp +++ b/src/cpp/src/whisper/whisper_utils.cpp @@ -41,6 +41,22 @@ void filter_non_segment_metrics(ov::genai::RawPerfMetrics& raw_metrics, filter_by_ranges(raw_metrics.m_batch_sizes, offset, ranges); } +int64_t argmax(const ov::Tensor& logits, const size_t batch_idx) { + if (logits.get_shape()[0] <= batch_idx) { + OPENVINO_THROW("logits batch size doesn't match the number of beams"); + } + + size_t vocab_size = logits.get_shape().back(); + size_t batch_offset = batch_idx * logits.get_shape()[1] * vocab_size; + size_t sequence_offset = (logits.get_shape()[1] - 1) * vocab_size; + const float* logits_data = logits.data() + batch_offset + sequence_offset; + + int64_t out_token = std::max_element(logits_data, logits_data + vocab_size) - logits_data; + float max_logit = logits_data[out_token]; + + return out_token; +} + } // namespace utils } // namespace genai } // namespace ov diff --git a/src/cpp/src/whisper/whisper_utils.hpp b/src/cpp/src/whisper/whisper_utils.hpp index 234feed6a8..8fd0a080c6 100644 --- a/src/cpp/src/whisper/whisper_utils.hpp +++ b/src/cpp/src/whisper/whisper_utils.hpp @@ -17,6 +17,8 @@ void filter_non_segment_metrics(ov::genai::RawPerfMetrics& raw_metrics, size_t offset, std::vector>& ranges); +int64_t argmax(const ov::Tensor& logits, const size_t batch_idx); + } // namespace utils } // namespace genai } // namespace ov From 6411b173abd2141f147e0471fe73c2e9c71ca0a2 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Thu, 23 Jan 2025 14:35:27 +0100 Subject: [PATCH 26/27] Add get_max_new_tokens for sequence group --- src/cpp/src/sampler.cpp | 19 +++++++++++-------- src/cpp/src/sampler.hpp | 2 +- src/cpp/src/sequence_group.hpp | 6 +++++- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index a1957b0630..7a1e079746 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -408,7 +408,7 @@ void Sampler::GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, } // check whether group has finished - group.is_done(m_parameters, m_sequence_group->get_prompt_len()); + group.is_done(); // group cannot continue if there are no valid child beams if 
(child_beams_per_group[group_id].size() == 0) { @@ -549,14 +549,14 @@ std::vector Sampler::_try_finish_generation(SequenceGroup::Ptr & sequen std::vector dropped_seq_ids; for (auto& running_sequence : sequence_group->get_running_sequences()) { const auto generated_len = running_sequence->get_generated_len(); - if (sampling_params.get_max_new_tokens(sequence_group->get_prompt_len()) <= generated_len || + if (sequence_group->get_max_new_tokens() <= generated_len || is_stop_token_id_hit(running_sequence->get_generated_ids().back(), sampling_params.stop_token_ids) && !sampling_params.ignore_eos) { // stop sequence by max_new_tokens or stop token (eos included) running_sequence->set_status(SequenceStatus::FINISHED); if (is_stop_token_id_hit(running_sequence->get_generated_ids().back(), sampling_params.stop_token_ids) && !sampling_params.ignore_eos) { running_sequence->set_finish_reason(GenerationFinishReason::STOP); - } else if (sampling_params.get_max_new_tokens(sequence_group->get_prompt_len()) == generated_len) { + } else if (sequence_group->get_max_new_tokens() == generated_len) { running_sequence->set_finish_reason(GenerationFinishReason::LENGTH); } @@ -800,8 +800,8 @@ SamplerOutput Sampler::sample(const std::vector & sequence_g // max counter of needed to be sampled tokens OPENVINO_ASSERT(running_sequence->get_generated_len() >= token_offset); size_t generated_and_verified_len = running_sequence->get_generated_len() - token_offset; - OPENVINO_ASSERT(sampling_params.get_max_new_tokens(sequence_group->get_prompt_len()) >= generated_and_verified_len); - size_t max_num_sampled_token = sampling_params.get_max_new_tokens(sequence_group->get_prompt_len()) - generated_and_verified_len; + OPENVINO_ASSERT(sequence_group->get_max_new_tokens() >= generated_and_verified_len); + size_t max_num_sampled_token = sequence_group->get_max_new_tokens() - generated_and_verified_len; if (max_num_sampled_token == 0) { stop_sample_tokens(running_sequence, token_offset, max_num_sampled_token, max_removed_tokens_per_request); break; @@ -887,7 +887,7 @@ SamplerOutput Sampler::sample(const std::vector & sequence_g // check max length stop criteria std::vector running_sequences = sequence_group->get_running_sequences(); if (!sequence_group->has_finished() && - running_sequences[0]->get_generated_len() == sampling_params.get_max_new_tokens(sequence_group->get_prompt_len())) { + running_sequences[0]->get_generated_len() == sequence_group->get_max_new_tokens()) { // stop sequence by max_new_tokens m_beam_search_info.at(request_id).finalize(sampler_output); } @@ -956,7 +956,10 @@ int64_t Sampler::GroupBeamSearcher::Group::finish(Beam beam, const ov::genai::Ge return preeempted_sequence_id; } -void Sampler::GroupBeamSearcher::Group::is_done(const ov::genai::GenerationConfig& sampling_params, size_t prompt_length) { +void Sampler::GroupBeamSearcher::Group::is_done() { + const auto sequence_group = ongoing.front().m_sequence->get_sequence_group_ptr(); + const ov::genai::GenerationConfig& sampling_params = sequence_group->get_sampling_parameters(); + assert(sampling_params.num_beams % sampling_params.num_beam_groups == 0 && "number of beams should be divisible by number of groups"); size_t group_size = sampling_params.num_beams / sampling_params.num_beam_groups; @@ -977,7 +980,7 @@ void Sampler::GroupBeamSearcher::Group::is_done(const ov::genai::GenerationConfi return; } case ov::genai::StopCriteria::NEVER: { - size_t length = sampling_params.length_penalty > 0.0 ? 
sampling_params.get_max_new_tokens(prompt_length) : cur_len; + size_t length = sampling_params.length_penalty > 0.0 ? sequence_group->get_max_new_tokens() : cur_len; float highest_attainable_score = best_sum_logprobs / std::pow(float(length), sampling_params.length_penalty); done = worst_score >= highest_attainable_score; return; diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp index 3b7d98a7d8..9768e0a7af 100644 --- a/src/cpp/src/sampler.hpp +++ b/src/cpp/src/sampler.hpp @@ -114,7 +114,7 @@ class Sampler::GroupBeamSearcher { bool done = false; int64_t finish(Beam beam, const ov::genai::GenerationConfig& sampling_params); - void is_done(const ov::genai::GenerationConfig& sampling_params, size_t prompt_length); + void is_done(); }; SequenceGroup::Ptr m_sequence_group; diff --git a/src/cpp/src/sequence_group.hpp b/src/cpp/src/sequence_group.hpp index fef9757b43..19d29c92ac 100644 --- a/src/cpp/src/sequence_group.hpp +++ b/src/cpp/src/sequence_group.hpp @@ -689,7 +689,11 @@ class SequenceGroup : public std::enable_shared_from_this { GenerationOutputs outputs; outputs.emplace(0, output); m_generation_stream->push(std::move(outputs)); - } + } + + size_t get_max_new_tokens() { + return m_sampling_params.get_max_new_tokens(get_prompt_len()); + } }; inline std::shared_ptr Sequence::get_sequence_group_ptr() const { From f9cb4613b87d0dc4ad1e4d2882007b99102bab04 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Thu, 23 Jan 2025 14:40:33 +0100 Subject: [PATCH 27/27] Use sg get_max_new_tokens --- src/cpp/src/scheduler.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/scheduler.hpp b/src/cpp/src/scheduler.hpp index 86f705f759..eabbee935f 100644 --- a/src/cpp/src/scheduler.hpp +++ b/src/cpp/src/scheduler.hpp @@ -493,7 +493,7 @@ class Scheduler { for (auto idx = 0; idx < sequence_groups.size(); idx++) { auto seq_length = sequence_groups[idx]->get_prompt_len() * m_kv_blocks_initial_multiplier; auto gen_config = sequence_groups[idx]->get_sampling_parameters(); - seq_length = std::min(seq_length, sequence_groups[idx]->get_prompt_len() + gen_config.get_max_new_tokens(sequence_groups[idx]->get_prompt_len())); + seq_length = std::min(seq_length, sequence_groups[idx]->get_prompt_len() + sequence_groups[idx]->get_max_new_tokens()); size_t blocks_num = std::ceil((float)seq_length / m_block_manager->get_block_size()); if (gen_config.is_beam_search()) { blocks_num *= gen_config.num_beams;
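The get_max_new_tokens() helper that these last two patches thread through the sampler and scheduler has a simple resolution rule. Below is a reduced sketch, with member names following the library but the structs stripped down; the SIZE_MAX-based defaulting mirrors the existing GenerationConfig behaviour as understood here and is an assumption, not part of the patch:

    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <limits>

    struct GenerationConfig {
        size_t max_new_tokens = std::numeric_limits<size_t>::max();
        size_t max_length = std::numeric_limits<size_t>::max();

        // max_new_tokens wins when set; otherwise the budget is derived from
        // max_length minus the prompt length (guarded against underflow).
        size_t get_max_new_tokens(size_t prompt_length) const {
            if (max_new_tokens != std::numeric_limits<size_t>::max()) {
                return max_new_tokens;
            }
            return max_length - std::min(max_length, prompt_length);
        }
    };

    struct SequenceGroup {
        size_t prompt_len = 0;
        GenerationConfig sampling_params;

        // The helper added in patch 26: callers no longer pass the prompt length
        // to every get_max_new_tokens() call site.
        size_t get_max_new_tokens() const {
            return sampling_params.get_max_new_tokens(prompt_len);
        }
    };

    int main() {
        SequenceGroup group;
        group.prompt_len = 32;
        group.sampling_params.max_new_tokens = 100;
        std::cout << group.get_max_new_tokens() << "\n";  // prints 100
        return 0;
    }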