From 68d3e487ea3dc6547fb6686678b68a7b3d8fcbee Mon Sep 17 00:00:00 2001
From: Alexander Suvorov
Date: Fri, 3 Jan 2025 15:23:50 +0100
Subject: [PATCH 01/27] use decoder interface

---
 src/cpp/src/whisper/models/decoder.cpp        |  26 +++
 src/cpp/src/whisper/models/decoder.hpp        |  35 ++++
 .../src/whisper/models/statefull_decoder.cpp  |  73 ++++++++
 .../src/whisper/models/statefull_decoder.hpp  |  29 +++
 .../src/whisper/models/with_past_decoder.cpp  | 102 +++++++++++
 .../src/whisper/models/with_past_decoder.hpp  |  32 ++++
 src/cpp/src/whisper/whisper.cpp               | 170 +++++-------------
 src/cpp/src/whisper/whisper.hpp               |   8 +-
 src/cpp/src/whisper/whisper_models.hpp        |   2 +-
 src/cpp/src/whisper_pipeline.cpp              |  18 +-
 10 files changed, 356 insertions(+), 139 deletions(-)
 create mode 100644 src/cpp/src/whisper/models/decoder.cpp
 create mode 100644 src/cpp/src/whisper/models/decoder.hpp
 create mode 100644 src/cpp/src/whisper/models/statefull_decoder.cpp
 create mode 100644 src/cpp/src/whisper/models/statefull_decoder.hpp
 create mode 100644 src/cpp/src/whisper/models/with_past_decoder.cpp
 create mode 100644 src/cpp/src/whisper/models/with_past_decoder.hpp

diff --git a/src/cpp/src/whisper/models/decoder.cpp b/src/cpp/src/whisper/models/decoder.cpp
new file mode 100644
index 0000000000..32a8f2eff6
--- /dev/null
+++ b/src/cpp/src/whisper/models/decoder.cpp
@@ -0,0 +1,26 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "decoder.hpp"
+
+#include <filesystem>
+
+#include "statefull_decoder.hpp"
+#include "utils.hpp"
+#include "with_past_decoder.hpp"
+
+namespace ov::genai {
+std::shared_ptr<WhisperDecoder> WhisperDecoder::from_path(const std::filesystem::path& models_path,
+                                                          const std::string& device,
+                                                          const ov::AnyMap& properties) {
+    bool has_decoder_with_past = std::filesystem::exists(models_path / "openvino_decoder_with_past_model.xml");
+
+    if (has_decoder_with_past) {
+        return std::make_shared<WhisperWithPastDecoder>(models_path, device, properties);
+    }
+
+    return std::make_shared<WhisperStatefullDecoder>(models_path, device, properties);
+}
+
+WhisperDecoder::~WhisperDecoder() = default;
+} // namespace ov::genai
diff --git a/src/cpp/src/whisper/models/decoder.hpp b/src/cpp/src/whisper/models/decoder.hpp
new file mode 100644
index 0000000000..d82ce5047a
--- /dev/null
+++ b/src/cpp/src/whisper/models/decoder.hpp
@@ -0,0 +1,35 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <filesystem>
+
+#include "openvino/genai/whisper_generation_config.hpp"
+#include "openvino/runtime/core.hpp"
+
+namespace ov::genai {
+class WhisperDecoder {
+public:
+    static std::shared_ptr<WhisperDecoder> from_path(const std::filesystem::path& models_path,
+                                                     const std::string& device,
+                                                     const ov::AnyMap& properties);
+
+    virtual std::pair<int64_t, float> detect_language(const ov::Tensor& encoder_hidden_state,
+                                                      const int64_t decoder_start_token_id) {
+        OPENVINO_THROW("detect_language method not implemented");
+    };
+
+    virtual std::pair<ov::Tensor, float> decode(const ov::Tensor& encoder_hidden_state,
+                                                const std::vector<int64_t>& input_ids,
+                                                const size_t cache_position) {
+        OPENVINO_THROW("decode method not implemented");
+    };
+
+    virtual void reset_state() {
+        OPENVINO_THROW("reset_state method not implemented");
+    }
+
+    virtual ~WhisperDecoder();
+};
+} // namespace ov::genai
diff --git a/src/cpp/src/whisper/models/statefull_decoder.cpp b/src/cpp/src/whisper/models/statefull_decoder.cpp
new file mode 100644
index 0000000000..2f235178a4
--- /dev/null
+++ b/src/cpp/src/whisper/models/statefull_decoder.cpp
@@ -0,0 +1,73 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "statefull_decoder.hpp"
+
+#include "utils.hpp"
+
+namespace ov::genai {
+WhisperStatefullDecoder::WhisperStatefullDecoder(const std::filesystem::path& models_path,
+                                                 const std::string& device,
+                                                 const ov::AnyMap& properties) {
+    ov::Core core = utils::singleton_core();
+
+    auto model = core.read_model((models_path / "openvino_decoder_model.xml").string());
+
+    // todo: remove once stateful model has dynamic input_ids seq_len
+    std::map<std::string, ov::PartialShape> name_to_shape;
+    for (const ov::Output<ov::Node>& input : model->inputs()) {
+        ov::PartialShape shape = input.get_partial_shape();
+        if (input.get_any_name().find("input_ids") != std::string::npos) {
+            shape[1] = -1;
+            name_to_shape[input.get_any_name()] = shape;
+        }
+    }
+    model->reshape(name_to_shape);
+
+    auto compiled_model = core.compile_model(model, device, properties);
+
+    utils::print_compiled_model_properties(compiled_model, "whisper decoder model");
+    m_request = compiled_model.create_infer_request();
+}
+
+std::pair<int64_t, float> WhisperStatefullDecoder::detect_language(const ov::Tensor& encoder_hidden_state,
+                                                                   const int64_t decoder_start_token_id) {
+    auto [output_tensor, infer_ms] = decode(encoder_hidden_state, {decoder_start_token_id}, 0);
+
+    int64_t output_token = ov::genai::utils::argmax(output_tensor, 0);
+
+    reset_state();
+
+    return {output_token, infer_ms};
+}
+
+std::pair<ov::Tensor, float> WhisperStatefullDecoder::decode(const ov::Tensor& encoder_hidden_state,
+                                                             const std::vector<int64_t>& input_ids,
+                                                             const size_t cache_position) {
+    m_request.set_tensor("encoder_hidden_states", encoder_hidden_state);
+
+    ov::Tensor input_ids_tensor(ov::element::i64, {1, input_ids.size()}, (void*)input_ids.data());
+    m_request.set_tensor("input_ids", input_ids_tensor);
+
+    ov::Tensor cache_position_tensor = m_request.get_tensor("cache_position");
+    cache_position_tensor.set_shape({input_ids.size()});
+
+    auto cache_data = cache_position_tensor.data<int64_t>();
+    std::iota(cache_data, cache_data + cache_position_tensor.get_size(), cache_position);
+
+    m_request.get_tensor("beam_idx").set_shape({1});
+    m_request.get_tensor("beam_idx").data<int32_t>()[0] = 0;
+
+    const auto infer_start = std::chrono::steady_clock::now();
+    m_request.infer();
+    const auto infer_ms = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start);
+
+    auto output_tensor = m_request.get_tensor("logits");
+
+    return {output_tensor, infer_ms};
+};
+
+void WhisperStatefullDecoder::reset_state() {
+    m_request.reset_state();
+}
+} // namespace ov::genai
diff --git a/src/cpp/src/whisper/models/statefull_decoder.hpp b/src/cpp/src/whisper/models/statefull_decoder.hpp
new file mode 100644
index 0000000000..569031b9fa
--- /dev/null
+++ b/src/cpp/src/whisper/models/statefull_decoder.hpp
@@ -0,0 +1,29 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "decoder.hpp"
+#include "openvino/runtime/core.hpp"
+
+namespace ov::genai {
+
+class WhisperStatefullDecoder : public WhisperDecoder {
+public:
+    explicit WhisperStatefullDecoder(const std::filesystem::path& models_path,
+                                     const std::string& device,
+                                     const ov::AnyMap& properties);
+
+    std::pair<int64_t, float> detect_language(const ov::Tensor& encoder_hidden_state,
+                                              const int64_t decoder_start_token_id) override;
+
+    std::pair<ov::Tensor, float> decode(const ov::Tensor& encoder_hidden_state,
+                                        const std::vector<int64_t>& input_ids,
+                                        const size_t cache_position) override;
+
+    void reset_state() override;
+
+private:
+    ov::InferRequest m_request;
+};
+} // namespace ov::genai
diff --git a/src/cpp/src/whisper/models/with_past_decoder.cpp
b/src/cpp/src/whisper/models/with_past_decoder.cpp new file mode 100644 index 0000000000..a32c9a45be --- /dev/null +++ b/src/cpp/src/whisper/models/with_past_decoder.cpp @@ -0,0 +1,102 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "with_past_decoder.hpp" + +#include + +#include "utils.hpp" + +namespace { +void set_past_key_value(ov::InferRequest& source, ov::InferRequest& dest) { + // source outputs: + // present.0.decoder.key + // present.0.decoder.value + // present.0.encoder.key + // present.0.encoder.value + + // dest inputs: + // past_key_values.0.decoder.key + // past_key_values.0.decoder.value + // past_key_values.0.encoder.key + // past_key_values.0.encoder.value + + for (auto& source_output : source.get_compiled_model().outputs()) { + std::string source_output_name = source_output.get_any_name(); + if (source_output_name.find("logits") != std::string::npos) { + continue; + } + + std::string with_past_input_name = + std::regex_replace(source_output_name, std::regex("present"), "past_key_values"); + + auto kv_tensor = source.get_tensor(source_output_name); + dest.set_tensor(with_past_input_name, ov::Tensor{kv_tensor}); + } +} +} // namespace + +namespace ov::genai { +WhisperWithPastDecoder::WhisperWithPastDecoder(const std::filesystem::path& models_path, + const std::string& device, + const ov::AnyMap& properties) { + ov::Core core = utils::singleton_core(); + + auto compiled_model = core.compile_model(models_path / "openvino_decoder_model.xml", device, properties); + utils::print_compiled_model_properties(compiled_model, "whisper decoder model"); + m_request_decoder = compiled_model.create_infer_request(); + + compiled_model = core.compile_model(models_path / "openvino_decoder_with_past_model.xml", device, properties); + utils::print_compiled_model_properties(compiled_model, "whisper decoder with past model"); + m_request_decoder_with_past = compiled_model.create_infer_request(); +} + +std::pair WhisperWithPastDecoder::detect_language(const ov::Tensor& encoder_hidden_state, + const int64_t decoder_start_token_id) { + auto [output_tensor, infer_ms] = decode(encoder_hidden_state, {decoder_start_token_id}, 0); + + int64_t output_token = ov::genai::utils::argmax(output_tensor, 0); + + reset_state(); + + return {output_token, infer_ms}; +} + +std::pair WhisperWithPastDecoder::decode(const ov::Tensor& encoder_hidden_state, + const std::vector& input_ids, + const size_t cache_position) { + const bool initial_step = cache_position == 0; + ov::InferRequest& request = initial_step ? 
m_request_decoder : m_request_decoder_with_past; + + request.set_tensor("encoder_hidden_states", encoder_hidden_state); + + const ov::Tensor input_ids_tensor(ov::element::i64, {1, input_ids.size()}, (void*)input_ids.data()); + request.set_tensor("input_ids", input_ids_tensor); + + if (!initial_step) { + ov::Tensor cache_position_tensor = request.get_tensor("cache_position"); + cache_position_tensor.set_shape({1}); + cache_position_tensor.data()[0] = cache_position; + } + + const auto infer_start = std::chrono::steady_clock::now(); + request.infer(); + const auto infer_ms = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start); + + auto output_tensor = request.get_tensor("logits"); + + if (initial_step) { + set_past_key_value(m_request_decoder, m_request_decoder_with_past); + } else if (!m_decoder_with_past_kv_value_set) { + set_past_key_value(m_request_decoder_with_past, m_request_decoder_with_past); + m_decoder_with_past_kv_value_set = true; + } + + return {output_tensor, infer_ms}; +} + +void WhisperWithPastDecoder::reset_state() { + m_request_decoder_with_past.reset_state(); + m_decoder_with_past_kv_value_set = false; +} +} // namespace ov::genai diff --git a/src/cpp/src/whisper/models/with_past_decoder.hpp b/src/cpp/src/whisper/models/with_past_decoder.hpp new file mode 100644 index 0000000000..b6f6924af6 --- /dev/null +++ b/src/cpp/src/whisper/models/with_past_decoder.hpp @@ -0,0 +1,32 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "decoder.hpp" +#include "openvino/runtime/core.hpp" + +namespace ov::genai { + +class WhisperWithPastDecoder : public WhisperDecoder { +public: + explicit WhisperWithPastDecoder(const std::filesystem::path& models_path, + const std::string& device, + const ov::AnyMap& properties); + + std::pair detect_language(const ov::Tensor& encoder_hidden_state, + const int64_t decoder_start_token_id) override; + + std::pair decode(const ov::Tensor& encoder_hidden_state, + const std::vector& input_ids, + const size_t cache_position) override; + + void reset_state() override; + +private: + ov::InferRequest m_request_decoder; + ov::InferRequest m_request_decoder_with_past; + bool m_decoder_with_past_kv_value_set = false; +}; + +} // namespace ov::genai diff --git a/src/cpp/src/whisper/whisper.cpp b/src/cpp/src/whisper/whisper.cpp index 04993f288c..9dffe01bd2 100644 --- a/src/cpp/src/whisper/whisper.cpp +++ b/src/cpp/src/whisper/whisper.cpp @@ -10,6 +10,7 @@ #include "context_tokens.hpp" #include "logit_processor.hpp" +#include "models/decoder.hpp" #include "openvino/genai/perf_metrics.hpp" #include "openvino/genai/whisper_generation_config.hpp" #include "openvino/genai/whisper_pipeline.hpp" @@ -53,89 +54,34 @@ ov::Tensor encode(ov::InferRequest& request, return request.get_tensor("last_hidden_state"); } -void set_past_key_value(ov::InferRequest& source, ov::InferRequest& dest) { - // source outputs: - // present.0.decoder.key - // present.0.decoder.value - // present.0.encoder.key - // present.0.encoder.value - - // dest inputs: - // past_key_values.0.decoder.key - // past_key_values.0.decoder.value - // past_key_values.0.encoder.key - // past_key_values.0.encoder.value - - for (auto& source_output : source.get_compiled_model().outputs()) { - std::string source_output_name = source_output.get_any_name(); - if (source_output_name.find("logits") != std::string::npos) { - continue; - } - - std::string with_past_input_name = - std::regex_replace(source_output_name, 
std::regex("present"), "past_key_values"); - - auto kv_tensor = source.get_tensor(source_output_name); - dest.set_tensor(with_past_input_name, ov::Tensor{kv_tensor}); - } -} - int64_t decode(ov::Tensor& encoder_hidden_state, - ov::InferRequest& decoder, - std::vector& input_ids, + std::shared_ptr decoder, + const std::vector& input_ids, + const size_t cache_position, const ov::genai::WhisperGenerationConfig& config, ov::genai::RawPerfMetrics& raw_metrics, - const bool apply_logit_processors = true, - const bool return_timestamps = false) { - decoder.set_tensor("encoder_hidden_states", ov::Tensor{encoder_hidden_state}); - - ov::Tensor input_ids_tensor(ov::element::i64, {1, input_ids.size()}, input_ids.data()); - decoder.set_tensor("input_ids", input_ids_tensor); - - ov::genai::utils::infer_with_perf_metrics(decoder, raw_metrics); - - auto output_tensor = decoder.get_tensor("logits"); + const bool return_timestamps, + const bool initial_step, + const std::vector& generated_tokens) { + auto [output_tensor, infer_ms] = decoder->decode(encoder_hidden_state, input_ids, cache_position); + const auto infer_end = std::chrono::steady_clock::now(); + raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms); + raw_metrics.m_token_infer_durations.emplace_back(infer_ms); + raw_metrics.m_new_token_times.emplace_back(infer_end); + raw_metrics.m_batch_sizes.emplace_back(1); - if (apply_logit_processors) { + if (initial_step) { ov::genai::do_suppress_tokens(output_tensor, 0, config.begin_suppress_tokens); - ov::genai::do_suppress_tokens(output_tensor, 0, config.suppress_tokens); - - if (return_timestamps) { - ov::genai::process_whisper_timestamp_logits(output_tensor, 0, config, {}, true); - } } - int64_t output_token = ov::genai::utils::argmax(output_tensor, 0); - - return output_token; -} - -int64_t decode_with_past(ov::Tensor& encoder_hidden_state, - ov::InferRequest& decoder_with_past, - int64_t input_id, - const size_t cache_position, - const ov::genai::WhisperGenerationConfig& config, - ov::genai::RawPerfMetrics& raw_metrics, - const bool return_timestamps, - const std::vector& generated_tokens) { - decoder_with_past.set_tensor("encoder_hidden_states", ov::Tensor{encoder_hidden_state}); - - std::vector input_ids = {input_id}; - ov::Tensor input_ids_tensor(ov::element::i64, {1, 1}, input_ids.data()); - decoder_with_past.set_tensor("input_ids", input_ids_tensor); - - ov::Tensor cache_position_tensor = decoder_with_past.get_tensor("cache_position"); - cache_position_tensor.set_shape({1}); - cache_position_tensor.data()[0] = cache_position; - - ov::genai::utils::infer_with_perf_metrics(decoder_with_past, raw_metrics); - - auto output_tensor = decoder_with_past.get_tensor("logits"); - ov::genai::do_suppress_tokens(output_tensor, 0, config.suppress_tokens); if (return_timestamps) { - ov::genai::process_whisper_timestamp_logits(output_tensor, 0, config, generated_tokens); + if (initial_step) { + ov::genai::process_whisper_timestamp_logits(output_tensor, 0, config, {}, true); + } else { + ov::genai::process_whisper_timestamp_logits(output_tensor, 0, config, generated_tokens); + } } int64_t output_token = ov::genai::utils::argmax(output_tensor, 0); @@ -143,34 +89,11 @@ int64_t decode_with_past(ov::Tensor& encoder_hidden_state, return output_token; } -int64_t detect_language(ov::Tensor& encoder_hidden_state, - ov::InferRequest& decoder, - const ov::genai::WhisperGenerationConfig& config, - ov::genai::RawPerfMetrics& raw_metrics) { - std::vector input_ids{config.decoder_start_token_id}; - - 
decoder.set_tensor("encoder_hidden_states", ov::Tensor{encoder_hidden_state}); - - ov::Tensor input_ids_tensor(ov::element::i64, {1, input_ids.size()}, input_ids.data()); - decoder.set_tensor("input_ids", input_ids_tensor); - - const auto infer_start = std::chrono::steady_clock::now(); - decoder.infer(); - const auto infer_ms = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start); - raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms); - - auto output_tensor = decoder.get_tensor("logits"); - - int64_t output_token = ov::genai::utils::argmax(output_tensor, 0); - - return output_token; -} - -std::vector prepare_init_tokens(ov::Tensor& encoder_hidden_state, - ov::InferRequest decoder, - const ov::genai::WhisperGenerationConfig& config, - const bool return_timestamps, - ov::genai::RawPerfMetrics& raw_metrics) { +std::vector prepare_init_ids(ov::Tensor& encoder_hidden_state, + std::shared_ptr decoder, + const ov::genai::WhisperGenerationConfig& config, + const bool return_timestamps, + ov::genai::RawPerfMetrics& raw_metrics) { if (!config.is_multilingual) { if (return_timestamps) { return std::vector{config.decoder_start_token_id}; @@ -186,7 +109,9 @@ std::vector prepare_init_tokens(ov::Tensor& encoder_hidden_state, language_token_id = config.lang_to_id.at(language); } } else { - language_token_id = detect_language(encoder_hidden_state, decoder, config, raw_metrics); + auto [language, infer_ms] = decoder->detect_language(encoder_hidden_state, config.decoder_start_token_id); + language_token_id = language; + raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms); } int64_t task_token_id = config.transcribe_token_id; @@ -206,14 +131,14 @@ std::vector prepare_init_tokens(ov::Tensor& encoder_hidden_state, std::pair> full_decode(ov::Tensor& encoder_hidden_state, const ov::genai::WhisperGenerationConfig& config, - ov::genai::WhisperInitializedModels& models, + std::shared_ptr decoder, std::vector init_ids, const size_t max_new_tokens, const bool return_timestamps, ov::genai::RawPerfMetrics& raw_metrics, const std::shared_ptr streamer) { int64_t output_token = - decode(encoder_hidden_state, models.decoder, init_ids, config, raw_metrics, true, return_timestamps); + decode(encoder_hidden_state, decoder, init_ids, 0, config, raw_metrics, return_timestamps, true, {}); std::vector output_tokens{output_token}; @@ -225,21 +150,16 @@ std::pair> full_decode(ov::Tensor& encoder_hidden_sta return {false, output_tokens}; } - set_past_key_value(models.decoder, models.decoder_with_past); - for (size_t i = 0; i < max_new_tokens - 1; i++) { - auto output_token = decode_with_past(encoder_hidden_state, - models.decoder_with_past, - output_tokens.back(), - init_ids.size() + i, - config, - raw_metrics, - return_timestamps, - output_tokens); - - if (i == 0) { - set_past_key_value(models.decoder_with_past, models.decoder_with_past); - } + auto output_token = decode(encoder_hidden_state, + decoder, + {output_tokens.back()}, + init_ids.size() + i, + config, + raw_metrics, + return_timestamps, + false, + output_tokens); if (output_token == config.eos_token_id) { break; @@ -264,7 +184,8 @@ WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig& const ov::genai::WhisperConfig& model_config, const WhisperContextTokens& context_tokens, const RawSpeechInput& raw_speech, - ov::genai::WhisperInitializedModels& models, + ov::InferRequest& encoder, + std::shared_ptr decoder, WhisperFeatureExtractor& feature_extractor, const std::shared_ptr streamer) { size_t 
max_new_tokens = config.get_max_new_tokens(); @@ -301,7 +222,7 @@ WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig& auto input_features_chunk = input_features.get_data_with_offset(chunk_offset, feature_extractor.nb_max_frames); - ov::Tensor hidden_state_tensor = encode(models.encoder, + ov::Tensor hidden_state_tensor = encode(encoder, input_features_chunk, feature_extractor.feature_size, feature_extractor.nb_max_frames, @@ -309,8 +230,7 @@ WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig& // prepare init_ids just once for whole input if (init_tokens.empty()) { - init_tokens = - prepare_init_tokens(hidden_state_tensor, models.decoder, config, return_timestamps, raw_metrics); + init_tokens = prepare_init_ids(hidden_state_tensor, decoder, config, return_timestamps, raw_metrics); } std::vector chunk_init_tokens = ov::genai::get_prompt_tokens(context_tokens, config, chunk_offset); @@ -318,14 +238,14 @@ WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig& auto [cancelled, chunk_output_tokens] = full_decode(hidden_state_tensor, config, - models, + decoder, chunk_init_tokens, max_new_tokens - output_tokens.size(), return_timestamps, raw_metrics, streamer); - models.decoder_with_past.reset_state(); + decoder->reset_state(); if (return_timestamps) { auto extracted_segments = ov::genai::extract_segments(chunk_output_tokens, @@ -333,7 +253,7 @@ WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig& feature_extractor.nb_max_frames, time_precision); - ov::genai::utils::filter_non_segment_metrics(raw_metrics, output_tokens.size(), extracted_segments.segment_ranges); + utils::filter_non_segment_metrics(raw_metrics, output_tokens.size(), extracted_segments.segment_ranges); segments.insert(segments.end(), extracted_segments.segments.begin(), extracted_segments.segments.end()); diff --git a/src/cpp/src/whisper/whisper.hpp b/src/cpp/src/whisper/whisper.hpp index 81f559db9f..fbdf56d171 100644 --- a/src/cpp/src/whisper/whisper.hpp +++ b/src/cpp/src/whisper/whisper.hpp @@ -6,6 +6,7 @@ #include #include "context_tokens.hpp" +#include "models/decoder.hpp" #include "openvino/genai/whisper_generation_config.hpp" #include "openvino/genai/whisper_pipeline.hpp" #include "whisper_config.hpp" @@ -30,9 +31,10 @@ struct WhisperGenerateResult { WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig& config, const ov::genai::WhisperConfig& model_config, const WhisperContextTokens& context_tokens, - const ov::genai::RawSpeechInput& raw_speech, - ov::genai::WhisperInitializedModels& models, - ov::genai::WhisperFeatureExtractor& feature_extractor, + const RawSpeechInput& raw_speech, + ov::InferRequest& encoder, + std::shared_ptr decoder, + WhisperFeatureExtractor& feature_extractor, const std::shared_ptr streamer); } // namespace genai diff --git a/src/cpp/src/whisper/whisper_models.hpp b/src/cpp/src/whisper/whisper_models.hpp index 576bdb9dc7..9a915e92f4 100644 --- a/src/cpp/src/whisper/whisper_models.hpp +++ b/src/cpp/src/whisper/whisper_models.hpp @@ -3,7 +3,7 @@ #pragma once -#include +#include namespace ov { namespace genai { diff --git a/src/cpp/src/whisper_pipeline.cpp b/src/cpp/src/whisper_pipeline.cpp index f0fb34cdf6..de9ca0e4e0 100644 --- a/src/cpp/src/whisper_pipeline.cpp +++ b/src/cpp/src/whisper_pipeline.cpp @@ -10,6 +10,7 @@ #include "utils.hpp" #include "whisper/context_tokens.hpp" +#include "whisper/models/decoder.hpp" #include "whisper/streamer.hpp" #include 
"whisper/whisper.hpp" #include "whisper/whisper_config.hpp" @@ -47,7 +48,8 @@ namespace genai { class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::WhisperPipelineImplBase { public: - ov::genai::WhisperInitializedModels m_models; + ov::InferRequest m_encoder; + std::shared_ptr m_decoder; WhisperPipelineStatefulImpl(const std::filesystem::path& models_path, const std::string& device, @@ -61,14 +63,9 @@ class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::Whi compiled_model = core.compile_model((models_path / "openvino_encoder_model.xml").string(), device, compile_properties); ov::genai::utils::print_compiled_model_properties(compiled_model, "whisper encoder model"); - m_models.encoder = compiled_model.create_infer_request(); - compiled_model = - core.compile_model((models_path / "openvino_decoder_model.xml").string(), device, compile_properties); - ov::genai::utils::print_compiled_model_properties(compiled_model, "whisper decoder model"); - m_models.decoder = compiled_model.create_infer_request(); - compiled_model = core.compile_model(models_path / "openvino_decoder_with_past_model.xml", device, compile_properties); - m_models.decoder_with_past = compiled_model.create_infer_request(); - ov::genai::utils::print_compiled_model_properties(compiled_model, "whisper decoder with past model"); + m_encoder = compiled_model.create_infer_request(); + + m_decoder = WhisperDecoder::from_path(models_path, device, compile_properties); // If eos_token_id was not provided, take value if (m_generation_config.eos_token_id == -1) { @@ -98,7 +95,8 @@ class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::Whi m_model_config, context_tokens, raw_speech_input, - m_models, + m_encoder, + m_decoder, m_feature_extractor, streamer_ptr); auto decode_start_time = std::chrono::steady_clock::now(); From b2df4a654733d54dbebde70f888f1b2f44f59d87 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Fri, 3 Jan 2025 15:39:40 +0100 Subject: [PATCH 02/27] remove reshape --- src/cpp/src/whisper/models/statefull_decoder.cpp | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/src/cpp/src/whisper/models/statefull_decoder.cpp b/src/cpp/src/whisper/models/statefull_decoder.cpp index 2f235178a4..bc2c91c91f 100644 --- a/src/cpp/src/whisper/models/statefull_decoder.cpp +++ b/src/cpp/src/whisper/models/statefull_decoder.cpp @@ -11,20 +11,7 @@ WhisperStatefullDecoder::WhisperStatefullDecoder(const std::filesystem::path& mo const ov::AnyMap& properties) { ov::Core core = utils::singleton_core(); - auto model = core.read_model((models_path / "openvino_decoder_model.xml").string()); - - // todo: remove once stateful model has dynamic input_ids seq_len - std::map name_to_shape; - for (const ov::Output& input : model->inputs()) { - ov::PartialShape shape = input.get_partial_shape(); - if (input.get_any_name().find("input_ids") != std::string::npos) { - shape[1] = -1; - name_to_shape[input.get_any_name()] = shape; - } - } - model->reshape(name_to_shape); - - auto compiled_model = core.compile_model(model, device, properties); + auto compiled_model = core.compile_model(models_path / "openvino_decoder_model.xml", device, properties); utils::print_compiled_model_properties(compiled_model, "whisper decoder model"); m_request = compiled_model.create_infer_request(); From 17e3ea750886c8aa6bdb9e79bc990862cd215dcc Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Fri, 3 Jan 2025 17:00:37 +0100 Subject: [PATCH 03/27] use stateful seq2seq barnch 
--- samples/export-requirements.txt | 2 +- src/cpp/src/whisper/models/decoder.cpp | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/samples/export-requirements.txt b/samples/export-requirements.txt index af38558656..2639f86890 100644 --- a/samples/export-requirements.txt +++ b/samples/export-requirements.txt @@ -2,7 +2,7 @@ --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly openvino-tokenizers~=2025.0.0.0.dev -optimum-intel @ git+https://github.com/huggingface/optimum-intel.git +optimum-intel @ git+https://github.com/eaidova/optimum-intel@ea/stateful_seq2seq numpy<2.0.0; sys_platform == 'darwin' einops==0.8.0 # For Qwen transformers_stream_generator==0.0.5 # For Qwen diff --git a/src/cpp/src/whisper/models/decoder.cpp b/src/cpp/src/whisper/models/decoder.cpp index 32a8f2eff6..9cc61d80f9 100644 --- a/src/cpp/src/whisper/models/decoder.cpp +++ b/src/cpp/src/whisper/models/decoder.cpp @@ -16,6 +16,7 @@ std::shared_ptr WhisperDecoder::from_path(const std::filesystem: bool has_decoder_with_past = std::filesystem::exists(models_path / "openvino_decoder_with_past_model.xml"); if (has_decoder_with_past) { + // todo: add deprecation notice return std::make_shared(models_path, device, properties); } From 806b01a97807ddc828feeab134e5c23678596b16 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Fri, 3 Jan 2025 17:07:53 +0100 Subject: [PATCH 04/27] Address review comments --- src/cpp/src/whisper/models/decoder.hpp | 12 +++--------- src/cpp/src/whisper/models/statefull_decoder.hpp | 6 +++--- src/cpp/src/whisper/models/with_past_decoder.hpp | 6 +++--- src/cpp/src/whisper_pipeline.cpp | 7 ++++--- 4 files changed, 13 insertions(+), 18 deletions(-) diff --git a/src/cpp/src/whisper/models/decoder.hpp b/src/cpp/src/whisper/models/decoder.hpp index d82ce5047a..cd58e54729 100644 --- a/src/cpp/src/whisper/models/decoder.hpp +++ b/src/cpp/src/whisper/models/decoder.hpp @@ -16,19 +16,13 @@ class WhisperDecoder { const ov::AnyMap& properties); virtual std::pair detect_language(const ov::Tensor& encoder_hidden_state, - const int64_t decoder_start_token_id) { - OPENVINO_THROW("detect_language method not implemented"); - }; + const int64_t decoder_start_token_id) = 0; virtual std::pair decode(const ov::Tensor& encoder_hidden_state, const std::vector& input_ids, - const size_t cache_position) { - OPENVINO_THROW("decode method not implemented"); - }; + const size_t cache_position) = 0; - virtual void reset_state() { - OPENVINO_THROW("reset_state method not implemented"); - } + virtual void reset_state() = 0; virtual ~WhisperDecoder(); }; diff --git a/src/cpp/src/whisper/models/statefull_decoder.hpp b/src/cpp/src/whisper/models/statefull_decoder.hpp index 569031b9fa..6f1c9eb002 100644 --- a/src/cpp/src/whisper/models/statefull_decoder.hpp +++ b/src/cpp/src/whisper/models/statefull_decoder.hpp @@ -10,9 +10,9 @@ namespace ov::genai { class WhisperStatefullDecoder : public WhisperDecoder { public: - explicit WhisperStatefullDecoder(const std::filesystem::path& models_path, - const std::string& device, - const ov::AnyMap& properties); + WhisperStatefullDecoder(const std::filesystem::path& models_path, + const std::string& device, + const ov::AnyMap& properties); std::pair detect_language(const ov::Tensor& encoder_hidden_state, const int64_t decoder_start_token_id) override; diff --git a/src/cpp/src/whisper/models/with_past_decoder.hpp b/src/cpp/src/whisper/models/with_past_decoder.hpp index 
b6f6924af6..c7af1cdaa2 100644 --- a/src/cpp/src/whisper/models/with_past_decoder.hpp +++ b/src/cpp/src/whisper/models/with_past_decoder.hpp @@ -10,9 +10,9 @@ namespace ov::genai { class WhisperWithPastDecoder : public WhisperDecoder { public: - explicit WhisperWithPastDecoder(const std::filesystem::path& models_path, - const std::string& device, - const ov::AnyMap& properties); + WhisperWithPastDecoder(const std::filesystem::path& models_path, + const std::string& device, + const ov::AnyMap& properties); std::pair detect_language(const ov::Tensor& encoder_hidden_state, const int64_t decoder_start_token_id) override; diff --git a/src/cpp/src/whisper_pipeline.cpp b/src/cpp/src/whisper_pipeline.cpp index b7c474258e..ffd792c889 100644 --- a/src/cpp/src/whisper_pipeline.cpp +++ b/src/cpp/src/whisper_pipeline.cpp @@ -48,9 +48,6 @@ namespace genai { class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::WhisperPipelineImplBase { public: - ov::InferRequest m_encoder; - std::shared_ptr m_decoder; - WhisperPipelineStatefulImpl(const std::filesystem::path& models_path, const std::string& device, const ov::AnyMap& properties) @@ -134,6 +131,10 @@ class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::Whi return result; } + +private: + ov::InferRequest m_encoder; + std::shared_ptr m_decoder; }; std::pair streamer(ChunkStreamerVariant func) { From e041a33d1fb5d724efad3803409df98e598f07dd Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Fri, 3 Jan 2025 17:37:33 +0100 Subject: [PATCH 05/27] Rename --- src/cpp/src/whisper/whisper.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/cpp/src/whisper/whisper.cpp b/src/cpp/src/whisper/whisper.cpp index 9dffe01bd2..3ab873609d 100644 --- a/src/cpp/src/whisper/whisper.cpp +++ b/src/cpp/src/whisper/whisper.cpp @@ -89,11 +89,11 @@ int64_t decode(ov::Tensor& encoder_hidden_state, return output_token; } -std::vector prepare_init_ids(ov::Tensor& encoder_hidden_state, - std::shared_ptr decoder, - const ov::genai::WhisperGenerationConfig& config, - const bool return_timestamps, - ov::genai::RawPerfMetrics& raw_metrics) { +std::vector prepare_init_tokens(ov::Tensor& encoder_hidden_state, + std::shared_ptr decoder, + const ov::genai::WhisperGenerationConfig& config, + const bool return_timestamps, + ov::genai::RawPerfMetrics& raw_metrics) { if (!config.is_multilingual) { if (return_timestamps) { return std::vector{config.decoder_start_token_id}; @@ -109,8 +109,8 @@ std::vector prepare_init_ids(ov::Tensor& encoder_hidden_state, language_token_id = config.lang_to_id.at(language); } } else { - auto [language, infer_ms] = decoder->detect_language(encoder_hidden_state, config.decoder_start_token_id); - language_token_id = language; + auto [language_token, infer_ms] = decoder->detect_language(encoder_hidden_state, config.decoder_start_token_id); + language_token_id = language_token; raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms); } @@ -132,13 +132,13 @@ std::vector prepare_init_ids(ov::Tensor& encoder_hidden_state, std::pair> full_decode(ov::Tensor& encoder_hidden_state, const ov::genai::WhisperGenerationConfig& config, std::shared_ptr decoder, - std::vector init_ids, + const std::vector& init_tokens, const size_t max_new_tokens, const bool return_timestamps, ov::genai::RawPerfMetrics& raw_metrics, const std::shared_ptr streamer) { int64_t output_token = - decode(encoder_hidden_state, decoder, init_ids, 0, config, raw_metrics, return_timestamps, true, {}); + 
decode(encoder_hidden_state, decoder, init_tokens, 0, config, raw_metrics, return_timestamps, true, {}); std::vector output_tokens{output_token}; @@ -154,7 +154,7 @@ std::pair> full_decode(ov::Tensor& encoder_hidden_sta auto output_token = decode(encoder_hidden_state, decoder, {output_tokens.back()}, - init_ids.size() + i, + init_tokens.size() + i, config, raw_metrics, return_timestamps, @@ -228,9 +228,9 @@ WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig& feature_extractor.nb_max_frames, raw_metrics); - // prepare init_ids just once for whole input + // prepare init_tokens just once for whole input if (init_tokens.empty()) { - init_tokens = prepare_init_ids(hidden_state_tensor, decoder, config, return_timestamps, raw_metrics); + init_tokens = prepare_init_tokens(hidden_state_tensor, decoder, config, return_timestamps, raw_metrics); } std::vector chunk_init_tokens = ov::genai::get_prompt_tokens(context_tokens, config, chunk_offset); From 9502d9b2b4420190d5c7f6a2fcc5271994f7424f Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Fri, 3 Jan 2025 17:43:41 +0100 Subject: [PATCH 06/27] Use commit --- samples/export-requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/export-requirements.txt b/samples/export-requirements.txt index 2639f86890..aa57675218 100644 --- a/samples/export-requirements.txt +++ b/samples/export-requirements.txt @@ -2,7 +2,7 @@ --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly openvino-tokenizers~=2025.0.0.0.dev -optimum-intel @ git+https://github.com/eaidova/optimum-intel@ea/stateful_seq2seq +optimum-intel @ git+https://github.com/eaidova/optimum-intel@770001994b08893c6611de7e569bfea2a7bac4f9 numpy<2.0.0; sys_platform == 'darwin' einops==0.8.0 # For Qwen transformers_stream_generator==0.0.5 # For Qwen From 6c30fa46398fe193d904e153a0d623c761c35294 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Fri, 3 Jan 2025 18:19:58 +0100 Subject: [PATCH 07/27] Set tests reqs --- samples/export-requirements.txt | 2 +- tests/python_tests/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/samples/export-requirements.txt b/samples/export-requirements.txt index aa57675218..af38558656 100644 --- a/samples/export-requirements.txt +++ b/samples/export-requirements.txt @@ -2,7 +2,7 @@ --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/pre-release --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly openvino-tokenizers~=2025.0.0.0.dev -optimum-intel @ git+https://github.com/eaidova/optimum-intel@770001994b08893c6611de7e569bfea2a7bac4f9 +optimum-intel @ git+https://github.com/huggingface/optimum-intel.git numpy<2.0.0; sys_platform == 'darwin' einops==0.8.0 # For Qwen transformers_stream_generator==0.0.5 # For Qwen diff --git a/tests/python_tests/requirements.txt b/tests/python_tests/requirements.txt index c851c71ee5..78cacd61ae 100644 --- a/tests/python_tests/requirements.txt +++ b/tests/python_tests/requirements.txt @@ -1,6 +1,6 @@ --extra-index-url https://download.pytorch.org/whl/cpu diffusers==0.32.1 -optimum-intel @ git+https://github.com/huggingface/optimum-intel.git +optimum-intel @ git+https://github.com/eaidova/optimum-intel@ea/stateful_seq2seq numpy<2.0.0; platform_system == "Darwin" and platform_machine == "x86_64" onnx==1.17.0 pytest From e38cf5c37ee944dbc87f749e9696375a6beb6b2f Mon Sep 17 00:00:00 2001 From: Alexander Suvorov 
Date: Thu, 9 Jan 2025 15:52:37 +0100 Subject: [PATCH 08/27] Add with_past model tests --- tests/python_tests/test_whisper_pipeline.py | 185 ++++++++++++-------- 1 file changed, 114 insertions(+), 71 deletions(-) diff --git a/tests/python_tests/test_whisper_pipeline.py b/tests/python_tests/test_whisper_pipeline.py index c046d1ae2c..06d5e56b3c 100644 --- a/tests/python_tests/test_whisper_pipeline.py +++ b/tests/python_tests/test_whisper_pipeline.py @@ -52,50 +52,25 @@ def get_whisper_models_list(tiny_only=False): # used whisper models are relatively small # cache them in memory to speedup tests @functools.lru_cache() -def read_whisper_model(params, **tokenizer_kwargs): +def read_whisper_model(params, stateful=True): model_id, path = params + if not stateful: + path = pathlib.Path(f"{path}_with_past") - processor = WhisperProcessor.from_pretrained(model_id, trust_remote_code=True) - - if (path / "openvino_encoder_model.xml").exists(): - opt_model = OVModelForSpeechSeq2Seq.from_pretrained( - path, - trust_remote_code=True, - compile=False, - device="CPU", - load_in_8bit=False, - ) - else: - - tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) - ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer( - tokenizer, - with_detokenizer=True, - clean_up_tokenization_spaces=False, - **tokenizer_kwargs, - ) + if not (path / "openvino_encoder_model.xml").exists(): + save_model(model_id=model_id, tmp_path=path, stateful=stateful) - openvino.save_model(ov_tokenizer, path / "openvino_tokenizer.xml") - openvino.save_model(ov_detokenizer, path / "openvino_detokenizer.xml") - - # to store tokenizer config jsons with special tokens - tokenizer.save_pretrained(path) + opt_model = OVModelForSpeechSeq2Seq.from_pretrained( + path, + trust_remote_code=True, + compile=False, + device="CPU", + load_in_8bit=False, + ) - opt_model = OVModelForSpeechSeq2Seq.from_pretrained( - model_id, - export=True, - trust_remote_code=True, - stateful=False, - compile=False, - device="CPU", - load_in_8bit=False, - ) - opt_model.generation_config.save_pretrained(path) - opt_model.config.save_pretrained(path) - opt_model.save_pretrained(path) - processor.save_pretrained(path) + processor = WhisperProcessor.from_pretrained(model_id, trust_remote_code=True) - opt_pipe = pipeline( + hf_pipe = pipeline( "automatic-speech-recognition", model=opt_model, tokenizer=processor.tokenizer, @@ -105,11 +80,42 @@ def read_whisper_model(params, **tokenizer_kwargs): return ( model_id, path, - opt_pipe, + hf_pipe, ov_genai.WhisperPipeline(path, "CPU", **{"ENABLE_MMAP": False}), ) +def save_model(model_id: str, tmp_path: pathlib.Path, stateful=True): + tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) + ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer( + tokenizer, + with_detokenizer=True, + clean_up_tokenization_spaces=False, + ) + + openvino.save_model(ov_tokenizer, tmp_path / "openvino_tokenizer.xml") + openvino.save_model(ov_detokenizer, tmp_path / "openvino_detokenizer.xml") + + # to store tokenizer config jsons with special tokens + tokenizer.save_pretrained(tmp_path) + + opt_model = OVModelForSpeechSeq2Seq.from_pretrained( + model_id, + export=True, + trust_remote_code=True, + stateful=stateful, + compile=False, + device="CPU", + load_in_8bit=False, + ) + opt_model.generation_config.save_pretrained(tmp_path) + opt_model.config.save_pretrained(tmp_path) + opt_model.save_pretrained(tmp_path) + + processor = WhisperProcessor.from_pretrained(model_id, 
trust_remote_code=True) + processor.save_pretrained(tmp_path) + + def run_huggingface( pipeline, sample, @@ -179,6 +185,9 @@ def run_pipeline_with_ref( streamer: typing.Callable[[str], bool] | None = None, ): _, _, hf_pipe, genai_pipe = read_whisper_model((model_id, tmp_path)) + _, _, hf_with_past_pipe, genai_with_past_pipe = read_whisper_model( + (model_id, tmp_path), stateful=False + ) if type(sample) is np.ndarray and len(sample.shape) == 1: sample = np.expand_dims(sample, 0) @@ -189,6 +198,12 @@ def run_pipeline_with_ref( compare_results(hf_result, genai_result) + genai_with_past_result = run_genai( + genai_with_past_pipe, _sample, generation_config, streamer + ) + + compare_results(hf_result, genai_with_past_result) + def compare_results(hf_result, genai_result): assert genai_result.texts[0] == hf_result["text"] @@ -274,9 +289,9 @@ def test_whisper_config_constructor(model_descr): @pytest.mark.parametrize("test_sample", get_samples_from_dataset(length=1)) @pytest.mark.precommit def test_whisper_constructors(model_descr, test_sample): - model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) + model_id, path, hf_pipe, genai_pipe = read_whisper_model(model_descr) - expected = opt_pipe(test_sample)["text"] + expected = hf_pipe(test_sample)["text"] genai_result = ov_genai.WhisperPipeline( models_path=path, device="CPU", **{"ENABLE_MMAP": False} @@ -294,17 +309,17 @@ def test_whisper_constructors(model_descr, test_sample): @pytest.mark.parametrize("test_sample", get_samples_from_dataset(length=1)) @pytest.mark.precommit def test_max_new_tokens(model_descr, test_sample): - model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) + model_id, path, hf_pipe, genai_pipe = read_whisper_model(model_descr) - expected = opt_pipe(test_sample, max_new_tokens=10) + expected = hf_pipe(test_sample, max_new_tokens=10) - genai_result = pipe.generate(test_sample, max_new_tokens=10) + genai_result = genai_pipe.generate(test_sample, max_new_tokens=10) compare_results(expected, genai_result) - config = pipe.get_generation_config() + config = genai_pipe.get_generation_config() config.max_new_tokens = 10 - genai_result = pipe.generate(test_sample, config) + genai_result = genai_pipe.generate(test_sample, config) compare_results(expected, genai_result) @@ -318,23 +333,23 @@ def test_max_new_tokens(model_descr, test_sample): ) @pytest.mark.precommit def test_language_mode(model_descr, test_samples): - model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) + model_id, path, hf_pipe, genai_pipe = read_whisper_model(model_descr) samples, language = test_samples - expected = opt_pipe( + expected = hf_pipe( samples[0], max_new_tokens=30, generate_kwargs={"language": language} ) - genai_result = pipe.generate( + genai_result = genai_pipe.generate( samples[0], max_new_tokens=30, language=f"<|{language}|>" ) compare_results(expected, genai_result) - config = pipe.get_generation_config() + config = genai_pipe.get_generation_config() config.max_new_tokens = 30 config.language = f"<|{language}|>" - genai_result = pipe.generate(samples[0], config) + genai_result = genai_pipe.generate(samples[0], config) compare_results(expected, genai_result) @@ -345,46 +360,46 @@ def test_language_mode(model_descr, test_samples): ) @pytest.mark.precommit def test_task_mode(model_descr, test_sample): - model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) + model_id, path, hf_pipe, genai_pipe = read_whisper_model(model_descr) - expected = opt_pipe( + expected = hf_pipe( test_sample, max_new_tokens=30, 
generate_kwargs={"language": "fr", "task": "translate"}, ) - genai_result = pipe.generate( + genai_result = genai_pipe.generate( test_sample, max_new_tokens=30, language="<|fr|>", task="translate" ) compare_results(expected, genai_result) - config = pipe.get_generation_config() + config = genai_pipe.get_generation_config() config.max_new_tokens = 30 config.language = "<|fr|>" config.task = "translate" - genai_result = pipe.generate(test_sample, config) + genai_result = genai_pipe.generate(test_sample, config) compare_results(expected, genai_result) # seems to be equivalent to translate task - expected = opt_pipe( + expected = hf_pipe( test_sample, max_new_tokens=30, generate_kwargs={"language": "en", "task": "transcribe"}, ) - genai_result = pipe.generate( + genai_result = genai_pipe.generate( test_sample, max_new_tokens=30, language="<|en|>", task="transcribe" ) compare_results(expected, genai_result) - config = pipe.get_generation_config() + config = genai_pipe.get_generation_config() config.max_new_tokens = 30 config.language = "<|en|>" config.task = "transcribe" - genai_result = pipe.generate(test_sample, config) + genai_result = genai_pipe.generate(test_sample, config) compare_results(expected, genai_result) @@ -400,12 +415,12 @@ def test_task_mode(model_descr, test_sample): ) @pytest.mark.precommit def test_language_autodetect(model_descr, test_sample): - model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) + model_id, path, hf_pipe, genai_pipe = read_whisper_model(model_descr) - input_features = opt_pipe.feature_extractor(test_sample) - language_id = opt_pipe.model.detect_language(input_features["input_features"])[0] + input_features = hf_pipe.feature_extractor(test_sample) + language_id = hf_pipe.model.detect_language(input_features["input_features"])[0] # ensure detected language us not english - assert language_id != pipe.get_generation_config().lang_to_id["<|en|>"] + assert language_id != genai_pipe.get_generation_config().lang_to_id["<|en|>"] run_pipeline_with_ref( model_id=model_descr[0], @@ -469,6 +484,34 @@ def test_longform_audio(model_descr, test_sample): assert "".join(streamer_result) == hf_result["text"] +@pytest.mark.parametrize("model_descr", get_whisper_models_list()) +@pytest.mark.parametrize( + "test_sample", get_samples_from_dataset(length=10, long_form=True) +) +@pytest.mark.precommit +def test_longform_audio_with_past(model_descr, test_sample): + _, _, hf_pipe, genai_pipe = read_whisper_model(model_descr, stateful=True) + + streamer_result = [] + + genai_result = run_genai( + genai_pipe, + test_sample, + config=ov_genai.WhisperGenerationConfig(return_timestamps=True), + streamer=lambda x: streamer_result.append(x), + ) + + hf_result = run_huggingface( + hf_pipe, + test_sample, + config=ov_genai.WhisperGenerationConfig(return_timestamps=True), + ) + + compare_results(hf_result, genai_result) + + assert "".join(streamer_result) == hf_result["text"] + + @pytest.mark.parametrize("model_descr", get_whisper_models_list()) @pytest.mark.precommit def test_shortform(model_descr): @@ -494,19 +537,19 @@ def test_shortform(model_descr): ) @pytest.mark.precommit def test_initial_prompt_hotwords(model_descr, test_sample): - model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) + model_id, path, hf_pipe, genai_pipe = read_whisper_model(model_descr) - result = pipe.generate(test_sample) + result = genai_pipe.generate(test_sample) assert "Joel Keaton" in result.texts[0] assert "Joel Kyton" not in result.texts[0] - result = pipe.generate(test_sample, 
initial_prompt="Joel Kyton") + result = genai_pipe.generate(test_sample, initial_prompt="Joel Kyton") assert "Joel Keaton" not in result.texts[0] assert "Joel Kyton" in result.texts[0] - result = pipe.generate(test_sample, hotwords="Joel Kyton") + result = genai_pipe.generate(test_sample, hotwords="Joel Kyton") assert "Joel Keaton" not in result.texts[0] assert "Joel Kyton" in result.texts[0] @@ -521,9 +564,9 @@ def test_initial_prompt_hotwords(model_descr, test_sample): ) @pytest.mark.precommit def test_perf_metrics(model_descr, test_sample): - model_id, path, opt_pipe, pipe = read_whisper_model(model_descr) + model_id, path, hf_pipe, genai_pipe = read_whisper_model(model_descr) - result = pipe.generate(test_sample) + result = genai_pipe.generate(test_sample) perf_metrics = result.perf_metrics From acc656ff4dfcdba87332d8d340fb5b1a100d1fcc Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Thu, 9 Jan 2025 16:00:17 +0100 Subject: [PATCH 09/27] remove comment --- src/cpp/src/whisper/models/decoder.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/cpp/src/whisper/models/decoder.cpp b/src/cpp/src/whisper/models/decoder.cpp index 9cc61d80f9..32a8f2eff6 100644 --- a/src/cpp/src/whisper/models/decoder.cpp +++ b/src/cpp/src/whisper/models/decoder.cpp @@ -16,7 +16,6 @@ std::shared_ptr WhisperDecoder::from_path(const std::filesystem: bool has_decoder_with_past = std::filesystem::exists(models_path / "openvino_decoder_with_past_model.xml"); if (has_decoder_with_past) { - // todo: add deprecation notice return std::make_shared(models_path, device, properties); } From 3728884db607d27458d2e50e2529a26f1a58fece Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Thu, 9 Jan 2025 16:06:37 +0100 Subject: [PATCH 10/27] bump tokenizers --- thirdparty/openvino_tokenizers | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers index bcfd3eda25..d5f0abf827 160000 --- a/thirdparty/openvino_tokenizers +++ b/thirdparty/openvino_tokenizers @@ -1 +1 @@ -Subproject commit bcfd3eda25ae3ec423502a4074e35c774506c732 +Subproject commit d5f0abf8271f3cd8fc98d747b3e569fbeacca532 From 445ce5a507c24df5e701f194e9d2f883db2d1e7a Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Thu, 9 Jan 2025 16:25:50 +0100 Subject: [PATCH 11/27] Fix typo --- src/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/README.md b/src/README.md index 6466b431d0..ffacd420e4 100644 --- a/src/README.md +++ b/src/README.md @@ -179,7 +179,7 @@ int main(int argc, char* argv[]) { Streaming with a custom class: -C++ template for a stremer. +C++ template for a streamer. ```cpp #include "openvino/genai/streamer_base.hpp" #include "openvino/genai/llm_pipeline.hpp" From 5bdd69561c9a79240e995a958cf5d63eb8578641 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Fri, 10 Jan 2025 09:37:34 +0100 Subject: [PATCH 12/27] Add deprecation message --- .github/workflows/windows.yml | 3 +-- src/cpp/src/logger.hpp | 17 +++++++++++++++++ .../src/whisper/models/with_past_decoder.cpp | 5 +++++ 3 files changed, 23 insertions(+), 2 deletions(-) create mode 100644 src/cpp/src/logger.hpp diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index e65972110b..835fd924ca 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -311,10 +311,9 @@ jobs: python -m pip install . 
--verbose --find-links ${env:OV_INSTALL_DIR}/wheels python -m pip install ./tools/who_what_benchmark --find-links ${env:OV_INSTALL_DIR}/wheels - # will install transformers 4.46.3 version # transformers 4.46.3 will enable return_timestamps tests # this check enabled for windows only. Ticket: 160205. - python -m pip install git+https://github.com/huggingface/optimum-intel.git@753f84db6e0966580eb9eaa74a808213be730631 + python -m pip install transformers==4.46.3 python -m pytest -v ./tests/python_tests/test_whisper_pipeline.py -k "not test_smoke" diff --git a/src/cpp/src/logger.hpp b/src/cpp/src/logger.hpp new file mode 100644 index 0000000000..503a419e5e --- /dev/null +++ b/src/cpp/src/logger.hpp @@ -0,0 +1,17 @@ +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once +#include +#include + +namespace ov::genai { + +class Logger { +public: + static void warn(std::string message) { + std::cout << "[WARN] " << message << '\n'; + }; +}; + +} // namespace ov::genai diff --git a/src/cpp/src/whisper/models/with_past_decoder.cpp b/src/cpp/src/whisper/models/with_past_decoder.cpp index a32c9a45be..7f62ea5657 100644 --- a/src/cpp/src/whisper/models/with_past_decoder.cpp +++ b/src/cpp/src/whisper/models/with_past_decoder.cpp @@ -5,6 +5,7 @@ #include +#include "logger.hpp" #include "utils.hpp" namespace { @@ -40,6 +41,10 @@ namespace ov::genai { WhisperWithPastDecoder::WhisperWithPastDecoder(const std::filesystem::path& models_path, const std::string& device, const ov::AnyMap& properties) { + Logger::warn("Whisper decoder models with past is deprecated. Support will be removed in 2026.0.0 release.\n" + "To obtain stateful decoder model use latest `optimum-intel` package:\n" + "pip install optimum-intel@git+https://github.com/huggingface/optimum-intel.git\n" + "optimum-cli export openvino --trust-remote-code --model openai/whisper-tiny whisper-tiny"); ov::Core core = utils::singleton_core(); auto compiled_model = core.compile_model(models_path / "openvino_decoder_model.xml", device, properties); From 7f2a1532b9ea120fbcc842a47c30f0f3c5e63655 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Tue, 21 Jan 2025 14:34:21 +0100 Subject: [PATCH 13/27] Use sampler for whisper pipeline --- .../genai/whisper_generation_config.hpp | 23 +- src/cpp/src/debug_utils.hpp | 3 +- src/cpp/src/lm_encoding.cpp | 12 +- src/cpp/src/logger.hpp | 2 +- src/cpp/src/sampler.cpp | 16 +- src/cpp/src/sampler.hpp | 2 +- src/cpp/src/utils.cpp | 17 -- src/cpp/src/utils.hpp | 2 - src/cpp/src/whisper/logit_processor.cpp | 1 - src/cpp/src/whisper/models/decoder.hpp | 8 +- .../src/whisper/models/statefull_decoder.cpp | 99 +++++-- .../src/whisper/models/statefull_decoder.hpp | 13 +- .../src/whisper/models/with_past_decoder.cpp | 107 ------- .../src/whisper/models/with_past_decoder.hpp | 15 +- src/cpp/src/whisper/whisper.cpp | 260 +++++++++++------- src/cpp/src/whisper/whisper.hpp | 4 +- src/cpp/src/whisper_generation_config.cpp | 50 +--- src/cpp/src/whisper_pipeline.cpp | 9 +- 18 files changed, 317 insertions(+), 326 deletions(-) delete mode 100644 src/cpp/src/whisper/models/with_past_decoder.cpp diff --git a/src/cpp/include/openvino/genai/whisper_generation_config.hpp b/src/cpp/include/openvino/genai/whisper_generation_config.hpp index 44d611923d..18f4cfb45d 100644 --- a/src/cpp/include/openvino/genai/whisper_generation_config.hpp +++ b/src/cpp/include/openvino/genai/whisper_generation_config.hpp @@ -6,6 +6,7 @@ #include #include +#include "generation_config.hpp" #include 
"openvino/genai/tokenizer.hpp" #include "openvino/runtime/compiled_model.hpp" @@ -15,28 +16,14 @@ namespace genai { /** * @brief Structure to keep whisper generation config parameters. */ -class OPENVINO_GENAI_EXPORTS WhisperGenerationConfig { +class OPENVINO_GENAI_EXPORTS WhisperGenerationConfig : public GenerationConfig { public: WhisperGenerationConfig() = default; explicit WhisperGenerationConfig(const std::filesystem::path& json_path); - // Generic - - // the maximum length the generated tokens can have. Corresponds to the length of the input prompt + - // `max_new_tokens`. Its effect is overridden by `max_new_tokens`, if also set. - size_t max_new_tokens = SIZE_MAX; - // the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. - // max_new_tokens has priority over max_length. - size_t max_length = SIZE_MAX; - - // Whisper specific - // Corresponds to the ”<|startoftranscript|>” token. int64_t decoder_start_token_id = 50258; - // End of stream token id. - int64_t eos_token_id = 50257; - // Padding token id. int64_t pad_token_id = 50257; @@ -110,12 +97,6 @@ class OPENVINO_GENAI_EXPORTS WhisperGenerationConfig { // A list containing the non-speech tokens that will be suppressed during generation. std::vector suppress_tokens; - /** @brief sets eos_token_id to tokenizer_eos_token_id if eos_token_id is less than 0. - * Otherwise verifies eos_token_id == tokenizer_eos_token_id. - */ - void set_eos_token_id(int64_t tokenizer_eos_token_id); - size_t get_max_new_tokens(size_t prompt_length = 0) const; - void update_generation_config(const ov::AnyMap& config_map = {}); template diff --git a/src/cpp/src/debug_utils.hpp b/src/cpp/src/debug_utils.hpp index 415f8c0480..1b76a818b3 100644 --- a/src/cpp/src/debug_utils.hpp +++ b/src/cpp/src/debug_utils.hpp @@ -12,7 +12,7 @@ template void print_array(T * array, size_t size) { std::cout << " => [ "; - for (size_t i = 0; i < size; ++i) { + for (size_t i = 0; i < std::min(size, size_t(10)); ++i) { std::cout << array[i] << " "; } std::cout << " ] " << std::endl; @@ -20,6 +20,7 @@ void print_array(T * array, size_t size) { inline void print_tensor(std::string name, ov::Tensor tensor) { std::cout << name; + std::cout << " " << tensor.get_shape().to_string(); if (tensor.get_element_type() == ov::element::i32) { print_array(tensor.data(), tensor.get_size()); } else if (tensor.get_element_type() == ov::element::i64) { diff --git a/src/cpp/src/lm_encoding.cpp b/src/cpp/src/lm_encoding.cpp index 9ef876d8aa..a326f8748c 100644 --- a/src/cpp/src/lm_encoding.cpp +++ b/src/cpp/src/lm_encoding.cpp @@ -14,10 +14,13 @@ #include "lm_encoding.hpp" #include "openvino/genai/perf_metrics.hpp" +namespace { -namespace ov { -namespace genai { - +/** + * Set position ids tensor data for next token inference based on provided attention mask + * Supports multi batch + * Supports sparse attention_mask + */ void update_position_ids(ov::Tensor&& position_ids, const ov::Tensor&& attention_mask) { const size_t batch_size = attention_mask.get_shape().at(0); const size_t sequence_length = attention_mask.get_shape().at(1); @@ -48,7 +51,10 @@ void update_attention_mask_with_beams(ov::Tensor&& attention_mask, std::vector()[result_prompt_offset + new_shape.at(1) - 1] = 1; } } +} +namespace ov { +namespace genai { std::pair> get_lm_encoded_results( ov::InferRequest& m_llm, diff --git a/src/cpp/src/logger.hpp b/src/cpp/src/logger.hpp index 503a419e5e..fbf0657a87 100644 --- a/src/cpp/src/logger.hpp +++ b/src/cpp/src/logger.hpp @@ -9,7 +9,7 @@ namespace ov::genai { 
class Logger { public: - static void warn(std::string message) { + static void warn(const std::string& message) { std::cout << "[WARN] " << message << '\n'; }; }; diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index 54850f657b..1c0dd504fe 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -407,7 +407,7 @@ void Sampler::GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, } // check whether group has finished - group.is_done(m_parameters); + group.is_done(m_parameters, m_sequence_group->get_prompt_len()); // group cannot continue if there are no valid child beams if (child_beams_per_group[group_id].size() == 0) { @@ -548,14 +548,14 @@ std::vector Sampler::_try_finish_generation(SequenceGroup::Ptr & sequen std::vector dropped_seq_ids; for (auto& running_sequence : sequence_group->get_running_sequences()) { const auto generated_len = running_sequence->get_generated_len(); - if (sampling_params.max_new_tokens <= generated_len || + if (sampling_params.get_max_new_tokens(sequence_group->get_prompt_len()) <= generated_len || is_stop_token_id_hit(running_sequence->get_generated_ids().back(), sampling_params.stop_token_ids) && !sampling_params.ignore_eos) { // stop sequence by max_new_tokens or stop token (eos included) running_sequence->set_status(SequenceStatus::FINISHED); if (is_stop_token_id_hit(running_sequence->get_generated_ids().back(), sampling_params.stop_token_ids) && !sampling_params.ignore_eos) { running_sequence->set_finish_reason(GenerationFinishReason::STOP); - } else if (sampling_params.max_new_tokens == generated_len) { + } else if (sampling_params.get_max_new_tokens(sequence_group->get_prompt_len()) == generated_len) { running_sequence->set_finish_reason(GenerationFinishReason::LENGTH); } @@ -798,8 +798,8 @@ SamplerOutput Sampler::sample(const std::vector & sequence_g // max counter of needed to be sampled tokens OPENVINO_ASSERT(running_sequence->get_generated_len() >= token_offset); size_t generated_and_verified_len = running_sequence->get_generated_len() - token_offset; - OPENVINO_ASSERT(sampling_params.max_new_tokens >= generated_and_verified_len); - size_t max_num_sampled_token = sampling_params.max_new_tokens - generated_and_verified_len; + OPENVINO_ASSERT(sampling_params.get_max_new_tokens(sequence_group->get_prompt_len()) >= generated_and_verified_len); + size_t max_num_sampled_token = sampling_params.get_max_new_tokens(sequence_group->get_prompt_len()) - generated_and_verified_len; if (max_num_sampled_token == 0) { stop_sample_tokens(running_sequence, token_offset, max_num_sampled_token, max_removed_tokens_per_request); break; @@ -885,7 +885,7 @@ SamplerOutput Sampler::sample(const std::vector & sequence_g // check max length stop criteria std::vector running_sequences = sequence_group->get_running_sequences(); if (!sequence_group->has_finished() && - running_sequences[0]->get_generated_len() == sampling_params.max_new_tokens) { + running_sequences[0]->get_generated_len() == sampling_params.get_max_new_tokens(sequence_group->get_prompt_len())) { // stop sequence by max_new_tokens m_beam_search_info.at(request_id).finalize(sampler_output); } @@ -954,7 +954,7 @@ int64_t Sampler::GroupBeamSearcher::Group::finish(Beam beam, const ov::genai::Ge return preeempted_sequence_id; } -void Sampler::GroupBeamSearcher::Group::is_done(const ov::genai::GenerationConfig& sampling_params) { +void Sampler::GroupBeamSearcher::Group::is_done(const ov::genai::GenerationConfig& sampling_params, size_t prompt_length) { 
assert(sampling_params.num_beams % sampling_params.num_beam_groups == 0 && "number of beams should be divisible by number of groups"); size_t group_size = sampling_params.num_beams / sampling_params.num_beam_groups; @@ -975,7 +975,7 @@ void Sampler::GroupBeamSearcher::Group::is_done(const ov::genai::GenerationConfi return; } case ov::genai::StopCriteria::NEVER: { - size_t length = sampling_params.length_penalty > 0.0 ? sampling_params.max_new_tokens : cur_len; + size_t length = sampling_params.length_penalty > 0.0 ? sampling_params.get_max_new_tokens(prompt_length) : cur_len; float highest_attainable_score = best_sum_logprobs / std::pow(float(length), sampling_params.length_penalty); done = worst_score >= highest_attainable_score; return; diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp index 7796f93d1e..1872b6c1e4 100644 --- a/src/cpp/src/sampler.hpp +++ b/src/cpp/src/sampler.hpp @@ -112,7 +112,7 @@ class Sampler::GroupBeamSearcher { bool done = false; int64_t finish(Beam beam, const ov::genai::GenerationConfig& sampling_params); - void is_done(const ov::genai::GenerationConfig& sampling_params); + void is_done(const ov::genai::GenerationConfig& sampling_params, size_t prompt_length); }; SequenceGroup::Ptr m_sequence_group; diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp index 9261aa7a4a..c73e47f153 100644 --- a/src/cpp/src/utils.cpp +++ b/src/cpp/src/utils.cpp @@ -128,23 +128,6 @@ void set_attention_mask(ov::Tensor&& attention_mask, std::vector next_b } } -/** - * Set position ids tensor data for next token inference based on provided attention mask - * Supports multi batch - * Supports sparse attention_mask - */ -void update_position_ids(ov::Tensor&& position_ids, const ov::Tensor&& attention_mask) { - const size_t batch_size = attention_mask.get_shape().at(0); - const size_t atten_length = attention_mask.get_shape().at(1); - position_ids.set_shape({batch_size, 1}); - - for (size_t batch = 0; batch < batch_size; batch++) { - int64_t* start = attention_mask.data() + batch * atten_length; - // todo: be careful with start + atten_length, probably need to replace with start + atten_length -1 - position_ids.data()[batch] = std::accumulate(start, start + atten_length, 0); - } -} - /** * Get attention mask tensor for next token inference * Supports multi batch diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp index ad0e1a05d4..235768ef18 100644 --- a/src/cpp/src/utils.hpp +++ b/src/cpp/src/utils.hpp @@ -53,8 +53,6 @@ void initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attenti ov::Tensor extend_attention(ov::Tensor attention_mask); -void update_position_ids(ov::Tensor&& position_ids, const ov::Tensor&& attention_mask); - template struct OmitOptional { using value = T; }; template struct OmitOptional> { using value = T; }; diff --git a/src/cpp/src/whisper/logit_processor.cpp b/src/cpp/src/whisper/logit_processor.cpp index d3d9552f57..38bc66b1cf 100644 --- a/src/cpp/src/whisper/logit_processor.cpp +++ b/src/cpp/src/whisper/logit_processor.cpp @@ -28,7 +28,6 @@ void process_whisper_timestamp_logits(ov::Tensor& logits, const std::vector& generated_tokens, bool initial_step = false) { const size_t batch_size = logits.get_shape().at(0); - OPENVINO_ASSERT(batch_size == 1, "Batch != 1 is not supported"); size_t vocab_size = logits.get_shape().back(); size_t batch_offset = batch_idx * logits.get_shape()[1] * vocab_size; diff --git a/src/cpp/src/whisper/models/decoder.hpp b/src/cpp/src/whisper/models/decoder.hpp index cd58e54729..acb10d92b5 100644 
--- a/src/cpp/src/whisper/models/decoder.hpp +++ b/src/cpp/src/whisper/models/decoder.hpp @@ -15,12 +15,12 @@ class WhisperDecoder { const std::string& device, const ov::AnyMap& properties); - virtual std::pair detect_language(const ov::Tensor& encoder_hidden_state, + virtual std::pair detect_language(const Tensor& encoder_hidden_state, const int64_t decoder_start_token_id) = 0; - virtual std::pair decode(const ov::Tensor& encoder_hidden_state, - const std::vector& input_ids, - const size_t cache_position) = 0; + virtual std::pair decode(const Tensor& encoder_hidden_state, + const Tensor& input_ids, + const Tensor& beam_idx) = 0; virtual void reset_state() = 0; diff --git a/src/cpp/src/whisper/models/statefull_decoder.cpp b/src/cpp/src/whisper/models/statefull_decoder.cpp index bc2c91c91f..ce029d3057 100644 --- a/src/cpp/src/whisper/models/statefull_decoder.cpp +++ b/src/cpp/src/whisper/models/statefull_decoder.cpp @@ -3,6 +3,7 @@ #include "statefull_decoder.hpp" +#include "debug_utils.hpp" #include "utils.hpp" namespace ov::genai { @@ -19,7 +20,13 @@ WhisperStatefullDecoder::WhisperStatefullDecoder(const std::filesystem::path& mo std::pair WhisperStatefullDecoder::detect_language(const ov::Tensor& encoder_hidden_state, const int64_t decoder_start_token_id) { - auto [output_tensor, infer_ms] = decode(encoder_hidden_state, {decoder_start_token_id}, 0); + Tensor input_ids_tensor{ov::element::i64, {1, 1}}; + input_ids_tensor.data()[0] = decoder_start_token_id; + + Tensor beam_idx_tensor{ov::element::i32, {1}}; + beam_idx_tensor.data()[0] = 0; + + auto [output_tensor, infer_ms] = decode(encoder_hidden_state, input_ids_tensor, beam_idx_tensor); int64_t output_token = ov::genai::utils::argmax(output_tensor, 0); @@ -28,22 +35,16 @@ std::pair WhisperStatefullDecoder::detect_language(const ov::Ten return {output_token, infer_ms}; } -std::pair WhisperStatefullDecoder::decode(const ov::Tensor& encoder_hidden_state, - const std::vector& input_ids, - const size_t cache_position) { - m_request.set_tensor("encoder_hidden_states", encoder_hidden_state); - - ov::Tensor input_ids_tensor(ov::element::i64, {1, input_ids.size()}, (void*)input_ids.data()); - m_request.set_tensor("input_ids", input_ids_tensor); - - ov::Tensor cache_position_tensor = m_request.get_tensor("cache_position"); - cache_position_tensor.set_shape({input_ids.size()}); - - auto cache_data = cache_position_tensor.data(); - std::iota(cache_data, cache_data + cache_position_tensor.get_size(), cache_position); +std::pair WhisperStatefullDecoder::decode(const Tensor& encoder_hidden_state, + const Tensor& input_ids, + const Tensor& beam_idx) { + const size_t batch_size = input_ids.get_shape().at(0); + const size_t seq_len = input_ids.get_shape().at(1); - m_request.get_tensor("beam_idx").set_shape({1}); - m_request.get_tensor("beam_idx").data()[0] = 0; + _set_encoder_hidden_states_tensor(encoder_hidden_state, batch_size); + _set_cache_position_tensor(seq_len); + m_request.set_tensor("input_ids", input_ids); + m_request.set_tensor("beam_idx", beam_idx); const auto infer_start = std::chrono::steady_clock::now(); m_request.infer(); @@ -54,7 +55,71 @@ std::pair WhisperStatefullDecoder::decode(const ov::Tensor& e return {output_tensor, infer_ms}; }; +/** + * Encoder hidden states expected to be with batch 1 + * Copy encoder hidden state tensor from batch 1 to requested batch_size. + * Set new encoder hidden states tensor to infer request. 
+ */ +void WhisperStatefullDecoder::_set_encoder_hidden_states_tensor(const Tensor& encoder_hidden_state, + const size_t batch_size) { + _reset_encoder_past_key_values_states(encoder_hidden_state, batch_size); + + OPENVINO_ASSERT(encoder_hidden_state.get_shape().at(0) == 1); + Shape shape{encoder_hidden_state.get_shape()}; + shape[0] = batch_size; + + Tensor new_encoder_hidden_states{ov::element::f32, shape}; + + auto new_encoder_hidden_states_data = new_encoder_hidden_states.data(); + auto encoder_hidden_state_data = encoder_hidden_state.data(); + + for (size_t batch = 0; batch < batch_size; batch++) { + const size_t batch_offset = batch * encoder_hidden_state.get_size(); + std::memcpy(new_encoder_hidden_states_data + batch_offset, + encoder_hidden_state_data, + encoder_hidden_state.get_byte_size()); + } + + m_request.set_tensor("encoder_hidden_states", new_encoder_hidden_states); +} + +// Ensure encoder past_key values states are reset if batch size changed. This is workaround for Ticket: +void WhisperStatefullDecoder::_reset_encoder_past_key_values_states(const Tensor& encoder_hidden_state, + const size_t batch_size) { + const size_t current_batch_size = m_request.get_tensor("encoder_hidden_states").get_shape().at(0); + // batch hasn't changed, skip + if (current_batch_size == 0 || current_batch_size == batch_size) { + return; + } + + const size_t encoder_state_length_dim = encoder_hidden_state.get_shape().at(1); + for (auto& state : m_request.query_state()) { + // find encoder states by dimension + const Shape& state_shape = state.get_state().get_shape(); + if (state_shape.at(2) == encoder_state_length_dim) { + state.reset(); + } + } +} + +void WhisperStatefullDecoder::_set_cache_position_tensor(const size_t seq_len) { + ov::Tensor cache_position_tensor = m_request.get_tensor("cache_position"); + + int64_t start_cache_position = 0; + + if (cache_position_tensor.get_size() != 0) { + start_cache_position = cache_position_tensor.data()[cache_position_tensor.get_size() - 1] + 1; + } + + cache_position_tensor.set_shape({seq_len}); + + auto cache_data = cache_position_tensor.data(); + std::iota(cache_data, cache_data + seq_len, start_cache_position); +}; + void WhisperStatefullDecoder::reset_state() { m_request.reset_state(); -} + m_request.set_tensor("cache_position", ov::Tensor{ov::element::i64, {0}}); +}; + } // namespace ov::genai diff --git a/src/cpp/src/whisper/models/statefull_decoder.hpp b/src/cpp/src/whisper/models/statefull_decoder.hpp index 6f1c9eb002..4d4572c33d 100644 --- a/src/cpp/src/whisper/models/statefull_decoder.hpp +++ b/src/cpp/src/whisper/models/statefull_decoder.hpp @@ -14,15 +14,20 @@ class WhisperStatefullDecoder : public WhisperDecoder { const std::string& device, const ov::AnyMap& properties); - std::pair detect_language(const ov::Tensor& encoder_hidden_state, + std::pair detect_language(const Tensor& encoder_hidden_state, const int64_t decoder_start_token_id) override; - std::pair decode(const ov::Tensor& encoder_hidden_state, - const std::vector& input_ids, - const size_t cache_position) override; + std::pair decode(const Tensor& encoder_hidden_state, + const Tensor& input_ids, + const Tensor& beam_idx) override; void reset_state() override; +private: + void _set_encoder_hidden_states_tensor(const Tensor& encoder_hidden_state, const size_t batch_size); + void _reset_encoder_past_key_values_states(const Tensor& encoder_hidden_state, const size_t batch_size); + void _set_cache_position_tensor(const size_t seq_len); + private: ov::InferRequest m_request; }; diff 
--git a/src/cpp/src/whisper/models/with_past_decoder.cpp b/src/cpp/src/whisper/models/with_past_decoder.cpp deleted file mode 100644 index 7f62ea5657..0000000000 --- a/src/cpp/src/whisper/models/with_past_decoder.cpp +++ /dev/null @@ -1,107 +0,0 @@ -// Copyright (C) 2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 - -#include "with_past_decoder.hpp" - -#include - -#include "logger.hpp" -#include "utils.hpp" - -namespace { -void set_past_key_value(ov::InferRequest& source, ov::InferRequest& dest) { - // source outputs: - // present.0.decoder.key - // present.0.decoder.value - // present.0.encoder.key - // present.0.encoder.value - - // dest inputs: - // past_key_values.0.decoder.key - // past_key_values.0.decoder.value - // past_key_values.0.encoder.key - // past_key_values.0.encoder.value - - for (auto& source_output : source.get_compiled_model().outputs()) { - std::string source_output_name = source_output.get_any_name(); - if (source_output_name.find("logits") != std::string::npos) { - continue; - } - - std::string with_past_input_name = - std::regex_replace(source_output_name, std::regex("present"), "past_key_values"); - - auto kv_tensor = source.get_tensor(source_output_name); - dest.set_tensor(with_past_input_name, ov::Tensor{kv_tensor}); - } -} -} // namespace - -namespace ov::genai { -WhisperWithPastDecoder::WhisperWithPastDecoder(const std::filesystem::path& models_path, - const std::string& device, - const ov::AnyMap& properties) { - Logger::warn("Whisper decoder models with past is deprecated. Support will be removed in 2026.0.0 release.\n" - "To obtain stateful decoder model use latest `optimum-intel` package:\n" - "pip install optimum-intel@git+https://github.com/huggingface/optimum-intel.git\n" - "optimum-cli export openvino --trust-remote-code --model openai/whisper-tiny whisper-tiny"); - ov::Core core = utils::singleton_core(); - - auto compiled_model = core.compile_model(models_path / "openvino_decoder_model.xml", device, properties); - utils::print_compiled_model_properties(compiled_model, "whisper decoder model"); - m_request_decoder = compiled_model.create_infer_request(); - - compiled_model = core.compile_model(models_path / "openvino_decoder_with_past_model.xml", device, properties); - utils::print_compiled_model_properties(compiled_model, "whisper decoder with past model"); - m_request_decoder_with_past = compiled_model.create_infer_request(); -} - -std::pair WhisperWithPastDecoder::detect_language(const ov::Tensor& encoder_hidden_state, - const int64_t decoder_start_token_id) { - auto [output_tensor, infer_ms] = decode(encoder_hidden_state, {decoder_start_token_id}, 0); - - int64_t output_token = ov::genai::utils::argmax(output_tensor, 0); - - reset_state(); - - return {output_token, infer_ms}; -} - -std::pair WhisperWithPastDecoder::decode(const ov::Tensor& encoder_hidden_state, - const std::vector& input_ids, - const size_t cache_position) { - const bool initial_step = cache_position == 0; - ov::InferRequest& request = initial_step ? 
m_request_decoder : m_request_decoder_with_past; - - request.set_tensor("encoder_hidden_states", encoder_hidden_state); - - const ov::Tensor input_ids_tensor(ov::element::i64, {1, input_ids.size()}, (void*)input_ids.data()); - request.set_tensor("input_ids", input_ids_tensor); - - if (!initial_step) { - ov::Tensor cache_position_tensor = request.get_tensor("cache_position"); - cache_position_tensor.set_shape({1}); - cache_position_tensor.data()[0] = cache_position; - } - - const auto infer_start = std::chrono::steady_clock::now(); - request.infer(); - const auto infer_ms = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start); - - auto output_tensor = request.get_tensor("logits"); - - if (initial_step) { - set_past_key_value(m_request_decoder, m_request_decoder_with_past); - } else if (!m_decoder_with_past_kv_value_set) { - set_past_key_value(m_request_decoder_with_past, m_request_decoder_with_past); - m_decoder_with_past_kv_value_set = true; - } - - return {output_tensor, infer_ms}; -} - -void WhisperWithPastDecoder::reset_state() { - m_request_decoder_with_past.reset_state(); - m_decoder_with_past_kv_value_set = false; -} -} // namespace ov::genai diff --git a/src/cpp/src/whisper/models/with_past_decoder.hpp b/src/cpp/src/whisper/models/with_past_decoder.hpp index c7af1cdaa2..8bd47bb981 100644 --- a/src/cpp/src/whisper/models/with_past_decoder.hpp +++ b/src/cpp/src/whisper/models/with_past_decoder.hpp @@ -14,19 +14,26 @@ class WhisperWithPastDecoder : public WhisperDecoder { const std::string& device, const ov::AnyMap& properties); - std::pair detect_language(const ov::Tensor& encoder_hidden_state, + std::pair detect_language(const Tensor& encoder_hidden_state, const int64_t decoder_start_token_id) override; - std::pair decode(const ov::Tensor& encoder_hidden_state, - const std::vector& input_ids, - const size_t cache_position) override; + std::pair decode(const Tensor& encoder_hidden_state, + const Tensor& input_ids, + const Tensor& beam_idx) override; void reset_state() override; private: ov::InferRequest m_request_decoder; ov::InferRequest m_request_decoder_with_past; + bool m_initial_step = true; bool m_decoder_with_past_kv_value_set = false; + size_t m_cache_position = 0; + + void _set_encoder_hidden_states_tensor(const Tensor& encoder_hidden_state, + const size_t batch_size, + InferRequest& request); + void _set_cache_position_tensor(const size_t seq_len); }; } // namespace ov::genai diff --git a/src/cpp/src/whisper/whisper.cpp b/src/cpp/src/whisper/whisper.cpp index 3ab873609d..96e464d115 100644 --- a/src/cpp/src/whisper/whisper.cpp +++ b/src/cpp/src/whisper/whisper.cpp @@ -14,6 +14,7 @@ #include "openvino/genai/perf_metrics.hpp" #include "openvino/genai/whisper_generation_config.hpp" #include "openvino/genai/whisper_pipeline.hpp" +#include "sampler.hpp" #include "timestamps.hpp" #include "utils.hpp" #include "whisper_config.hpp" @@ -25,6 +26,156 @@ using ov::genai::MicroSeconds; namespace { +void process_whisper_logits(ov::Tensor logits, + const ov::genai::WhisperGenerationConfig& config, + const bool return_timestamps, + const std::map>& batch_to_generated_ids) { + const bool initial_step = batch_to_generated_ids.empty(); + const size_t batch_size = logits.get_shape().at(0); + + for (size_t batch = 0; batch < batch_size; batch++) { + if (initial_step) { + ov::genai::do_suppress_tokens(logits, batch, config.begin_suppress_tokens); + } + + ov::genai::do_suppress_tokens(logits, batch, config.suppress_tokens); + + if (return_timestamps) { + const auto& 
generated_ids = initial_step ? std::vector{} : batch_to_generated_ids.at(batch); + ov::genai::process_whisper_timestamp_logits(logits, batch, config, generated_ids, initial_step); + } + } +} + +std::pair decode(std::shared_ptr decoder, + const std::vector& input_ids, + const ov::Tensor& encoder_hidden_state, + const std::shared_ptr streamer_ptr, + ov::genai::Sampler& sampler, + ov::genai::SequenceGroup::Ptr sequence_group, + const bool return_timestamps, + const ov::genai::WhisperGenerationConfig& config, + ov::genai::RawPerfMetrics& raw_metrics) { + const auto handle = std::make_shared(sequence_group->get_generation_stream(), + sequence_group->get_sampling_parameters()); + + auto stream_generated_tokens = [&streamer_ptr, &handle, &return_timestamps]() { + if (return_timestamps || !streamer_ptr || !handle->can_read()) { + return; + } + + std::unordered_map token = handle->back(); + for (const auto& gen_token : token.begin()->second.generated_ids) { + if (streamer_ptr->put(gen_token)) { + handle->drop(); + break; + } + } + }; + + const size_t batch_size = 1; + + ov::Tensor beam_idx = ov::Tensor(ov::element::i32, {batch_size}); + std::fill_n(beam_idx.data(), batch_size, 0); + + const ov::Tensor input_ids_tensor{ov::element::i64, {1, input_ids.size()}, (void*)input_ids.data()}; + + auto [logits, infer_ms] = decoder->decode(encoder_hidden_state, input_ids_tensor, beam_idx); + + const auto infer_end = std::chrono::steady_clock::now(); + raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms); + raw_metrics.m_token_infer_durations.emplace_back(infer_ms); + raw_metrics.m_new_token_times.emplace_back(infer_end); + raw_metrics.m_batch_sizes.emplace_back(batch_size); + + process_whisper_logits(logits, config, return_timestamps, {}); + + // since we have applied `Slice` operation to last MatMul, model output sequence length is 1 + // so, we need to update sequence groups to think that they already have processed all prompt tokens except last + // ones and schedule only `output_sequence_len` ones + int64_t output_sequence_len = logits.get_shape().at(1); + sequence_group->update_processed_tokens_num(sequence_group->get_prompt_len() - output_sequence_len); + sequence_group->schedule_tokens(output_sequence_len); + + sampler.sample({sequence_group}, logits); + stream_generated_tokens(); + + // "Generation" phase + while (!sequence_group->has_finished()) { + std::map> batch_to_generated_ids{}; + + sequence_group->schedule_tokens(1); + // compute aggregated values + size_t num_sequences = sequence_group->num_running_seqs(); + size_t total_num_tokens = sequence_group->get_num_scheduled_tokens() * num_sequences; + + ov::Tensor new_input_ids(ov::element::i64, {total_num_tokens, 1}); + int64_t* input_ids_data = new_input_ids.data(); + + std::vector next_beams; + + std::vector running_sequences = sequence_group->get_running_sequences(); + size_t num_scheduled_tokens = sequence_group->get_num_scheduled_tokens(); + size_t num_processed_tokens = sequence_group->get_num_processed_tokens(); + + std::map beam_idxs = sampler.get_beam_idxs(sequence_group); + + for (auto sequence : running_sequences) { + for (size_t batch = 0, position_id = num_processed_tokens; batch < num_scheduled_tokens; + ++batch, ++position_id) { + // compute token for current sequence + if (position_id < sequence_group->get_prompt_len()) { + input_ids_data[batch] = sequence_group->get_prompt_ids()[position_id]; + } else { + input_ids_data[batch] = + sequence->get_generated_ids()[position_id - sequence_group->get_prompt_len()]; + } + } + + 
// apply strides to shift to a next sequence + input_ids_data += num_scheduled_tokens; + + auto beam_idx = beam_idxs[sequence->get_id()]; + next_beams.push_back(beam_idx); + batch_to_generated_ids[next_beams.size() - 1] = sequence->get_generated_ids(); + } + + auto [logits, infer_ms] = decoder->decode(encoder_hidden_state, + new_input_ids, + ov::Tensor{ov::element::i32, {total_num_tokens}, next_beams.data()}); + + const auto infer_end = std::chrono::steady_clock::now(); + raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms); + raw_metrics.m_token_infer_durations.emplace_back(infer_ms); + raw_metrics.m_new_token_times.emplace_back(infer_end); + raw_metrics.m_batch_sizes.emplace_back(batch_size); + + process_whisper_logits(logits, config, return_timestamps, batch_to_generated_ids); + + sampler.sample({sequence_group}, logits); + stream_generated_tokens(); + } + + ov::genai::EncodedResults results; + + const auto sampling_params = sequence_group->get_sampling_parameters(); + + // there is also check in generation config validate function + OPENVINO_ASSERT(config.num_return_sequences == 1); + const auto& sequences = sequence_group->get_finished_sequences(); + const auto& sequence = sequences[0]; + + const float score = sampling_params.is_beam_search() ? sequence->get_beam_search_score(sampling_params) + : sequence->get_cumulative_log_prob(); + + results.tokens.push_back(sequence->get_generated_ids()); + results.scores.push_back(score); + + sampler.clear_request_info(sequence_group->get_request_id()); + + return {results, sequence_group->handle_dropped()}; +} + ov::Tensor encode(ov::InferRequest& request, std::vector& mel_data, const size_t feature_size, @@ -54,41 +205,6 @@ ov::Tensor encode(ov::InferRequest& request, return request.get_tensor("last_hidden_state"); } -int64_t decode(ov::Tensor& encoder_hidden_state, - std::shared_ptr decoder, - const std::vector& input_ids, - const size_t cache_position, - const ov::genai::WhisperGenerationConfig& config, - ov::genai::RawPerfMetrics& raw_metrics, - const bool return_timestamps, - const bool initial_step, - const std::vector& generated_tokens) { - auto [output_tensor, infer_ms] = decoder->decode(encoder_hidden_state, input_ids, cache_position); - const auto infer_end = std::chrono::steady_clock::now(); - raw_metrics.m_inference_durations[0] += MicroSeconds(infer_ms); - raw_metrics.m_token_infer_durations.emplace_back(infer_ms); - raw_metrics.m_new_token_times.emplace_back(infer_end); - raw_metrics.m_batch_sizes.emplace_back(1); - - if (initial_step) { - ov::genai::do_suppress_tokens(output_tensor, 0, config.begin_suppress_tokens); - } - - ov::genai::do_suppress_tokens(output_tensor, 0, config.suppress_tokens); - - if (return_timestamps) { - if (initial_step) { - ov::genai::process_whisper_timestamp_logits(output_tensor, 0, config, {}, true); - } else { - ov::genai::process_whisper_timestamp_logits(output_tensor, 0, config, generated_tokens); - } - } - - int64_t output_token = ov::genai::utils::argmax(output_tensor, 0); - - return output_token; -} - std::vector prepare_init_tokens(ov::Tensor& encoder_hidden_state, std::shared_ptr decoder, const ov::genai::WhisperGenerationConfig& config, @@ -129,52 +245,6 @@ std::vector prepare_init_tokens(ov::Tensor& encoder_hidden_state, config.no_timestamps_token_id}; } -std::pair> full_decode(ov::Tensor& encoder_hidden_state, - const ov::genai::WhisperGenerationConfig& config, - std::shared_ptr decoder, - const std::vector& init_tokens, - const size_t max_new_tokens, - const bool return_timestamps, 
- ov::genai::RawPerfMetrics& raw_metrics, - const std::shared_ptr streamer) { - int64_t output_token = - decode(encoder_hidden_state, decoder, init_tokens, 0, config, raw_metrics, return_timestamps, true, {}); - - std::vector output_tokens{output_token}; - - if (!return_timestamps && streamer && streamer->put(output_token)) { - return {true, output_tokens}; - } - - if (max_new_tokens == 1) { - return {false, output_tokens}; - } - - for (size_t i = 0; i < max_new_tokens - 1; i++) { - auto output_token = decode(encoder_hidden_state, - decoder, - {output_tokens.back()}, - init_tokens.size() + i, - config, - raw_metrics, - return_timestamps, - false, - output_tokens); - - if (output_token == config.eos_token_id) { - break; - } - - output_tokens.push_back(output_token); - - if (!return_timestamps && streamer && streamer->put(output_token)) { - return {true, output_tokens}; - } - } - - return {false, output_tokens}; -} - } // namespace namespace ov { @@ -187,7 +257,8 @@ WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig& ov::InferRequest& encoder, std::shared_ptr decoder, WhisperFeatureExtractor& feature_extractor, - const std::shared_ptr streamer) { + const std::shared_ptr streamer, + Sampler& sampler) { size_t max_new_tokens = config.get_max_new_tokens(); WhisperGenerateResult result; @@ -216,10 +287,6 @@ WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig& size_t segment_offset = 0; for (size_t chunk_offset = 0; chunk_offset < input_features.n_frames; chunk_offset += segment_offset) { - if (output_tokens.size() >= max_new_tokens) { - break; - } - auto input_features_chunk = input_features.get_data_with_offset(chunk_offset, feature_extractor.nb_max_frames); ov::Tensor hidden_state_tensor = encode(encoder, @@ -236,16 +303,19 @@ WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig& std::vector chunk_init_tokens = ov::genai::get_prompt_tokens(context_tokens, config, chunk_offset); chunk_init_tokens.insert(chunk_init_tokens.end(), init_tokens.begin(), init_tokens.end()); - auto [cancelled, chunk_output_tokens] = full_decode(hidden_state_tensor, - config, - decoder, - chunk_init_tokens, - max_new_tokens - output_tokens.size(), - return_timestamps, - raw_metrics, - streamer); - + SequenceGroup::Ptr sequence_group = std::make_shared(0, chunk_init_tokens, config, 1); + + auto [result, cancelled] = decode(decoder, + chunk_init_tokens, + hidden_state_tensor, + streamer, + sampler, + sequence_group, + return_timestamps, + config, + raw_metrics); decoder->reset_state(); + std::vector chunk_output_tokens = result.tokens[0]; if (return_timestamps) { auto extracted_segments = ov::genai::extract_segments(chunk_output_tokens, diff --git a/src/cpp/src/whisper/whisper.hpp b/src/cpp/src/whisper/whisper.hpp index fbdf56d171..96c17a4216 100644 --- a/src/cpp/src/whisper/whisper.hpp +++ b/src/cpp/src/whisper/whisper.hpp @@ -9,6 +9,7 @@ #include "models/decoder.hpp" #include "openvino/genai/whisper_generation_config.hpp" #include "openvino/genai/whisper_pipeline.hpp" +#include "sampler.hpp" #include "whisper_config.hpp" #include "whisper_feature_extractor.hpp" #include "whisper_models.hpp" @@ -35,7 +36,8 @@ WhisperGenerateResult whisper_generate(const ov::genai::WhisperGenerationConfig& ov::InferRequest& encoder, std::shared_ptr decoder, WhisperFeatureExtractor& feature_extractor, - const std::shared_ptr streamer); + const std::shared_ptr streamer, + Sampler& sampler); } // namespace genai } // namespace ov diff --git 
a/src/cpp/src/whisper_generation_config.cpp b/src/cpp/src/whisper_generation_config.cpp index beb663caaf..733ce74028 100644 --- a/src/cpp/src/whisper_generation_config.cpp +++ b/src/cpp/src/whisper_generation_config.cpp @@ -14,7 +14,8 @@ namespace ov { namespace genai { -WhisperGenerationConfig::WhisperGenerationConfig(const std::filesystem::path& json_path) { +WhisperGenerationConfig::WhisperGenerationConfig(const std::filesystem::path& json_path) + : GenerationConfig::GenerationConfig(json_path) { using ov::genai::utils::read_json_param; std::ifstream f(json_path); @@ -22,12 +23,9 @@ WhisperGenerationConfig::WhisperGenerationConfig(const std::filesystem::path& js nlohmann::json data = nlohmann::json::parse(f); - read_json_param(data, "max_new_tokens", max_new_tokens); - read_json_param(data, "max_length", max_length); read_json_param(data, "begin_suppress_tokens", begin_suppress_tokens); read_json_param(data, "suppress_tokens", suppress_tokens); read_json_param(data, "decoder_start_token_id", decoder_start_token_id); - read_json_param(data, "eos_token_id", eos_token_id); read_json_param(data, "pad_token_id", pad_token_id); read_json_param(data, "no_timestamps_token_id", no_timestamps_token_id); read_json_param(data, "max_initial_timestamp_index", max_initial_timestamp_index); @@ -42,28 +40,12 @@ WhisperGenerationConfig::WhisperGenerationConfig(const std::filesystem::path& js read_json_param(data, "lang_to_id", lang_to_id); } -void WhisperGenerationConfig::set_eos_token_id(int64_t tokenizer_eos_token_id) { - if (eos_token_id < 0) { - eos_token_id = tokenizer_eos_token_id; - } else { - OPENVINO_ASSERT(eos_token_id == tokenizer_eos_token_id, - "EOS token ID is different in generation config (", - eos_token_id, - ") and tokenizer (", - tokenizer_eos_token_id, - ")"); - } -} - void WhisperGenerationConfig::update_generation_config(const ov::AnyMap& config_map) { using ov::genai::utils::read_anymap_param; - read_anymap_param(config_map, "max_new_tokens", max_new_tokens); - read_anymap_param(config_map, "max_length", max_length); read_anymap_param(config_map, "begin_suppress_tokens", begin_suppress_tokens); read_anymap_param(config_map, "suppress_tokens", suppress_tokens); read_anymap_param(config_map, "decoder_start_token_id", decoder_start_token_id); - read_anymap_param(config_map, "eos_token_id", eos_token_id); read_anymap_param(config_map, "pad_token_id", pad_token_id); read_anymap_param(config_map, "transcribe_token_id", transcribe_token_id); read_anymap_param(config_map, "translate_token_id", translate_token_id); @@ -76,27 +58,12 @@ void WhisperGenerationConfig::update_generation_config(const ov::AnyMap& config_ read_anymap_param(config_map, "return_timestamps", return_timestamps); read_anymap_param(config_map, "initial_prompt", initial_prompt); read_anymap_param(config_map, "hotwords", hotwords); -} -size_t WhisperGenerationConfig::get_max_new_tokens(size_t prompt_length) const { - // max_new_tokens has priority over max_length, only if max_new_tokens was not specified use max_length - if (max_new_tokens != SIZE_MAX) { - return max_new_tokens; - } else { - return max_length - prompt_length; - } + GenerationConfig::update_generation_config(config_map); } void WhisperGenerationConfig::validate() const { - OPENVINO_ASSERT(max_new_tokens > 0, "'max_new_tokens' must be greater than 0"); - - // max_new_tokens has priority over max_length - // if max_new_tokens is defined no need to check max_length - OPENVINO_ASSERT(max_new_tokens != SIZE_MAX || max_length > 0, - "'max_length' must be greater 
than 0 or 'max_new_tokens' should be defined"); - - OPENVINO_ASSERT(eos_token_id != -1 || max_new_tokens != SIZE_MAX || max_length != SIZE_MAX, - "Either 'eos_token_id', or 'max_new_tokens', or 'max_length' should be defined."); + GenerationConfig::validate(); if (is_multilingual && language.has_value()) { OPENVINO_ASSERT(lang_to_id.count(*language), @@ -114,6 +81,15 @@ void WhisperGenerationConfig::validate() const { OPENVINO_ASSERT(!language.has_value(), "Cannot specify 'language' for not multilingual model."); OPENVINO_ASSERT(!task.has_value(), "Cannot specify 'task' for not multilingual model."); } + + if (is_beam_search()) { + OPENVINO_ASSERT(num_return_sequences == 1, + "'num_return_sequences' must be 1. Provided: ", + num_return_sequences, + "."); + } + + OPENVINO_ASSERT(!is_assisting_generation(), "Assisted generation is not supported."); } } // namespace genai } // namespace ov diff --git a/src/cpp/src/whisper_pipeline.cpp b/src/cpp/src/whisper_pipeline.cpp index ffd792c889..1e0d5a9e9a 100644 --- a/src/cpp/src/whisper_pipeline.cpp +++ b/src/cpp/src/whisper_pipeline.cpp @@ -51,7 +51,8 @@ class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::Whi WhisperPipelineStatefulImpl(const std::filesystem::path& models_path, const std::string& device, const ov::AnyMap& properties) - : WhisperPipelineImplBase{models_path} { + : WhisperPipelineImplBase{models_path}, + m_sampler(m_tokenizer) { ov::Core core = utils::singleton_core(); ov::CompiledModel compiled_model = @@ -65,6 +66,8 @@ class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::Whi if (m_generation_config.eos_token_id == -1) { m_generation_config.set_eos_token_id(m_tokenizer.get_eos_token_id()); } + + m_sampler.set_seed(m_generation_config.rng_seed); } WhisperDecodedResults generate(const RawSpeechInput& raw_speech_input, @@ -96,7 +99,8 @@ class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::Whi m_encoder, m_decoder, m_feature_extractor, - streamer_ptr); + streamer_ptr, + m_sampler); auto decode_start_time = std::chrono::steady_clock::now(); WhisperDecodedResults result{std::vector{m_tokenizer.decode(generate_result.output_tokens)}, std::vector{1.f}}; generate_result.perf_metrics.raw_metrics.detokenization_durations.emplace_back( @@ -135,6 +139,7 @@ class WhisperPipeline::WhisperPipelineStatefulImpl : public WhisperPipeline::Whi private: ov::InferRequest m_encoder; std::shared_ptr m_decoder; + Sampler m_sampler; }; std::pair streamer(ChunkStreamerVariant func) { From c36840101015d5bb279e4b22b71f7ba0c4eabb96 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Tue, 21 Jan 2025 16:13:03 +0100 Subject: [PATCH 14/27] Add with past decoder --- .../src/whisper/models/with_past_decoder.cpp | 202 ++++++++++++++++++ .../src/whisper/models/with_past_decoder.hpp | 1 - tests/python_tests/requirements.txt | 2 +- 3 files changed, 203 insertions(+), 2 deletions(-) create mode 100644 src/cpp/src/whisper/models/with_past_decoder.cpp diff --git a/src/cpp/src/whisper/models/with_past_decoder.cpp b/src/cpp/src/whisper/models/with_past_decoder.cpp new file mode 100644 index 0000000000..00ac7bc9d7 --- /dev/null +++ b/src/cpp/src/whisper/models/with_past_decoder.cpp @@ -0,0 +1,202 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#include "with_past_decoder.hpp" + +#include + +#include "logger.hpp" +#include "utils.hpp" + +namespace { + +bool are_past_key_values_empty(ov::InferRequest& request) { + for (const auto& input : 
request.get_compiled_model().inputs()) { + const std::string name = input.get_any_name(); + if (name.find("past_key_values") == std::string::npos) { + continue; + } + + ov::Tensor tensor = request.get_tensor(name); + + return tensor.get_size() == 0; + } + + OPENVINO_THROW("Past key value tensor not found"); +} + +void reset_past_key_values(ov::InferRequest& request) { + for (const auto& input : request.get_compiled_model().inputs()) { + const std::string name = input.get_any_name(); + if (name.find("past_key_values") == std::string::npos) { + continue; + } + + ov::Shape shape{request.get_tensor(name).get_shape()}; + shape[0] = 0; + + request.set_tensor(name, ov::Tensor{ov::element::f32, shape}); + } +} + +void copy_with_beam_gather(const ov::Tensor& source, ov::Tensor& dest, const ov::Tensor& beam_idx) { + const size_t dest_batch_size = beam_idx.get_shape().at(0); + + ov::Shape dest_shape{source.get_shape()}; + dest_shape[0] = dest_batch_size; + dest.set_shape(dest_shape); + + OPENVINO_ASSERT(dest_shape.size() == 4); + + const size_t batch_dim_size = dest_shape[1] * dest_shape[2] * dest_shape[3]; + + const auto beam_idx_data = beam_idx.data(); + const auto source_data = source.data(); + auto dest_data = dest.data(); + + for (size_t dest_batch = 0; dest_batch < dest_batch_size; dest_batch++) { + const size_t source_batch = beam_idx_data[dest_batch]; + + const auto source_start = source_data + (source_batch * batch_dim_size); + const auto dest_start = dest_data + (dest_batch * batch_dim_size); + std::memcpy(dest_start, source_start, sizeof(float) * batch_dim_size); + } +} + +void set_past_key_value(ov::InferRequest& source, ov::InferRequest& dest, const ov::Tensor& beam_idx) { + // source outputs: + // present.0.decoder.key + // present.0.decoder.value + // present.0.encoder.key + // present.0.encoder.value + + // dest inputs: + // past_key_values.0.decoder.key + // past_key_values.0.decoder.value + // past_key_values.0.encoder.key + // past_key_values.0.encoder.value + + for (auto& source_output : source.get_compiled_model().outputs()) { + std::string source_output_name = source_output.get_any_name(); + if (source_output_name.find("present") == std::string::npos) { + continue; + } + + std::string dest_input_name = std::regex_replace(source_output_name, std::regex("present"), "past_key_values"); + + auto source_tensor = source.get_tensor(source_output_name); + auto dest_tensor = dest.get_tensor(dest_input_name); + + copy_with_beam_gather(source_tensor, dest_tensor, beam_idx); + } +} +} // namespace + +namespace ov::genai { +WhisperWithPastDecoder::WhisperWithPastDecoder(const std::filesystem::path& models_path, + const std::string& device, + const ov::AnyMap& properties) { + Logger::warn("Whisper decoder models with past is deprecated. 
Support will be removed in 2026.0.0 release.\n" + "To obtain stateful decoder model use latest `optimum-intel` package:\n" + "pip install optimum-intel@git+https://github.com/huggingface/optimum-intel.git\n" + "optimum-cli export openvino --trust-remote-code --model openai/whisper-tiny whisper-tiny"); + ov::Core core = utils::singleton_core(); + + auto compiled_model = core.compile_model(models_path / "openvino_decoder_model.xml", device, properties); + utils::print_compiled_model_properties(compiled_model, "whisper decoder model"); + m_request_decoder = compiled_model.create_infer_request(); + + compiled_model = core.compile_model(models_path / "openvino_decoder_with_past_model.xml", device, properties); + utils::print_compiled_model_properties(compiled_model, "whisper decoder with past model"); + m_request_decoder_with_past = compiled_model.create_infer_request(); +} + +std::pair WhisperWithPastDecoder::detect_language(const ov::Tensor& encoder_hidden_state, + const int64_t decoder_start_token_id) { + Tensor input_ids_tensor{ov::element::i64, {1, 1}}; + input_ids_tensor.data()[0] = decoder_start_token_id; + + Tensor beam_idx_tensor{ov::element::i32, {1}}; + beam_idx_tensor.data()[0] = 0; + + auto [output_tensor, infer_ms] = decode(encoder_hidden_state, input_ids_tensor, beam_idx_tensor); + + int64_t output_token = ov::genai::utils::argmax(output_tensor, 0); + + reset_state(); + + return {output_token, infer_ms}; +} + +std::pair WhisperWithPastDecoder::decode(const Tensor& encoder_hidden_state, + const Tensor& input_ids, + const Tensor& beam_idx) { + ov::InferRequest& request = m_initial_step ? m_request_decoder : m_request_decoder_with_past; + + const size_t batch_size = input_ids.get_shape().at(0); + const size_t seq_length = input_ids.get_shape().at(1); + + _set_encoder_hidden_states_tensor(encoder_hidden_state, batch_size, request); + request.set_tensor("input_ids", input_ids); + + if (!m_initial_step) { + ov::Tensor cache_position_tensor = request.get_tensor("cache_position"); + cache_position_tensor.set_shape({1}); + cache_position_tensor.data()[0] = m_cache_position; + } + + if (!m_initial_step) { + if (are_past_key_values_empty(m_request_decoder_with_past)) { + set_past_key_value(m_request_decoder, m_request_decoder_with_past, beam_idx); + } else { + set_past_key_value(m_request_decoder_with_past, m_request_decoder_with_past, beam_idx); + } + } + + const auto infer_start = std::chrono::steady_clock::now(); + request.infer(); + const auto infer_ms = ov::genai::PerfMetrics::get_microsec(std::chrono::steady_clock::now() - infer_start); + + auto output_tensor = request.get_tensor("logits"); + + m_initial_step = false; + m_cache_position += seq_length; + + return {output_tensor, infer_ms}; +} + +/** + * Encoder hidden states expected to be with batch 1 + * Copy encoder hidden state tensor from batch 1 to requested batch_size. + * Set new encoder hidden states tensor to infer request. 
+ */ +void WhisperWithPastDecoder::_set_encoder_hidden_states_tensor(const Tensor& encoder_hidden_state, + const size_t batch_size, + InferRequest& request) { + OPENVINO_ASSERT(encoder_hidden_state.get_shape().at(0) == 1); + Shape shape{encoder_hidden_state.get_shape()}; + shape[0] = batch_size; + + Tensor new_encoder_hidden_states{ov::element::f32, shape}; + + auto new_encoder_hidden_states_data = new_encoder_hidden_states.data(); + auto encoder_hidden_state_data = encoder_hidden_state.data(); + + for (size_t batch = 0; batch < batch_size; batch++) { + const size_t batch_offset = batch * encoder_hidden_state.get_size(); + std::memcpy(new_encoder_hidden_states_data + batch_offset, + encoder_hidden_state_data, + encoder_hidden_state.get_byte_size()); + } + + request.set_tensor("encoder_hidden_states", new_encoder_hidden_states); +} + +void WhisperWithPastDecoder::reset_state() { + reset_past_key_values(m_request_decoder_with_past); + m_request_decoder_with_past.reset_state(); + m_decoder_with_past_kv_value_set = false; + m_initial_step = true; + m_cache_position = 0; +} +} // namespace ov::genai \ No newline at end of file diff --git a/src/cpp/src/whisper/models/with_past_decoder.hpp b/src/cpp/src/whisper/models/with_past_decoder.hpp index 8bd47bb981..e94f74a6c7 100644 --- a/src/cpp/src/whisper/models/with_past_decoder.hpp +++ b/src/cpp/src/whisper/models/with_past_decoder.hpp @@ -33,7 +33,6 @@ class WhisperWithPastDecoder : public WhisperDecoder { void _set_encoder_hidden_states_tensor(const Tensor& encoder_hidden_state, const size_t batch_size, InferRequest& request); - void _set_cache_position_tensor(const size_t seq_len); }; } // namespace ov::genai diff --git a/tests/python_tests/requirements.txt b/tests/python_tests/requirements.txt index 78cacd61ae..c851c71ee5 100644 --- a/tests/python_tests/requirements.txt +++ b/tests/python_tests/requirements.txt @@ -1,6 +1,6 @@ --extra-index-url https://download.pytorch.org/whl/cpu diffusers==0.32.1 -optimum-intel @ git+https://github.com/eaidova/optimum-intel@ea/stateful_seq2seq +optimum-intel @ git+https://github.com/huggingface/optimum-intel.git numpy<2.0.0; platform_system == "Darwin" and platform_machine == "x86_64" onnx==1.17.0 pytest From 2e061aa8c472326105998d367f55c1863f8bb34c Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Wed, 22 Jan 2025 09:43:17 +0100 Subject: [PATCH 15/27] Refactor with past decoder --- .../src/whisper/models/statefull_decoder.cpp | 2 + .../src/whisper/models/with_past_decoder.cpp | 99 +++++++++++-------- .../src/whisper/models/with_past_decoder.hpp | 5 +- 3 files changed, 61 insertions(+), 45 deletions(-) diff --git a/src/cpp/src/whisper/models/statefull_decoder.cpp b/src/cpp/src/whisper/models/statefull_decoder.cpp index ce029d3057..7d6837d323 100644 --- a/src/cpp/src/whisper/models/statefull_decoder.cpp +++ b/src/cpp/src/whisper/models/statefull_decoder.cpp @@ -41,7 +41,9 @@ std::pair WhisperStatefullDecoder::decode(const Tensor& encod const size_t batch_size = input_ids.get_shape().at(0); const size_t seq_len = input_ids.get_shape().at(1); + // todo: skip copy if already set and batch didn't changed _set_encoder_hidden_states_tensor(encoder_hidden_state, batch_size); + _set_cache_position_tensor(seq_len); m_request.set_tensor("input_ids", input_ids); m_request.set_tensor("beam_idx", beam_idx); diff --git a/src/cpp/src/whisper/models/with_past_decoder.cpp b/src/cpp/src/whisper/models/with_past_decoder.cpp index 00ac7bc9d7..4541f2e194 100644 --- a/src/cpp/src/whisper/models/with_past_decoder.cpp +++ 
b/src/cpp/src/whisper/models/with_past_decoder.cpp @@ -10,35 +10,6 @@ namespace { -bool are_past_key_values_empty(ov::InferRequest& request) { - for (const auto& input : request.get_compiled_model().inputs()) { - const std::string name = input.get_any_name(); - if (name.find("past_key_values") == std::string::npos) { - continue; - } - - ov::Tensor tensor = request.get_tensor(name); - - return tensor.get_size() == 0; - } - - OPENVINO_THROW("Past key value tensor not found"); -} - -void reset_past_key_values(ov::InferRequest& request) { - for (const auto& input : request.get_compiled_model().inputs()) { - const std::string name = input.get_any_name(); - if (name.find("past_key_values") == std::string::npos) { - continue; - } - - ov::Shape shape{request.get_tensor(name).get_shape()}; - shape[0] = 0; - - request.set_tensor(name, ov::Tensor{ov::element::f32, shape}); - } -} - void copy_with_beam_gather(const ov::Tensor& source, ov::Tensor& dest, const ov::Tensor& beam_idx) { const size_t dest_batch_size = beam_idx.get_shape().at(0); @@ -63,7 +34,7 @@ void copy_with_beam_gather(const ov::Tensor& source, ov::Tensor& dest, const ov: } } -void set_past_key_value(ov::InferRequest& source, ov::InferRequest& dest, const ov::Tensor& beam_idx) { +void copy_past_key_value(ov::InferRequest& source, ov::InferRequest& dest, const ov::Tensor& beam_idx) { // source outputs: // present.0.decoder.key // present.0.decoder.value @@ -90,6 +61,21 @@ void set_past_key_value(ov::InferRequest& source, ov::InferRequest& dest, const copy_with_beam_gather(source_tensor, dest_tensor, beam_idx); } } + +void link_past_key_value(ov::InferRequest& source, ov::InferRequest& dest) { + for (auto& source_output : source.get_compiled_model().outputs()) { + std::string source_output_name = source_output.get_any_name(); + if (source_output_name.find("present") == std::string::npos) { + continue; + } + + std::string dest_input_name = std::regex_replace(source_output_name, std::regex("present"), "past_key_values"); + auto source_tensor = source.get_tensor(source_output_name); + + dest.set_tensor(dest_input_name, source_tensor); + } +} + } // namespace namespace ov::genai { @@ -131,27 +117,23 @@ std::pair WhisperWithPastDecoder::detect_language(const ov::Tens std::pair WhisperWithPastDecoder::decode(const Tensor& encoder_hidden_state, const Tensor& input_ids, const Tensor& beam_idx) { - ov::InferRequest& request = m_initial_step ? m_request_decoder : m_request_decoder_with_past; + const bool is_initial_step = m_cache_position == 0; + ov::InferRequest& request = is_initial_step ? 
m_request_decoder : m_request_decoder_with_past; const size_t batch_size = input_ids.get_shape().at(0); const size_t seq_length = input_ids.get_shape().at(1); + // todo: skip copy if already set and batch didn't changed _set_encoder_hidden_states_tensor(encoder_hidden_state, batch_size, request); request.set_tensor("input_ids", input_ids); - if (!m_initial_step) { + if (!is_initial_step) { ov::Tensor cache_position_tensor = request.get_tensor("cache_position"); cache_position_tensor.set_shape({1}); cache_position_tensor.data()[0] = m_cache_position; } - if (!m_initial_step) { - if (are_past_key_values_empty(m_request_decoder_with_past)) { - set_past_key_value(m_request_decoder, m_request_decoder_with_past, beam_idx); - } else { - set_past_key_value(m_request_decoder_with_past, m_request_decoder_with_past, beam_idx); - } - } + _set_past_key_value(beam_idx); const auto infer_start = std::chrono::steady_clock::now(); request.infer(); @@ -159,7 +141,6 @@ std::pair WhisperWithPastDecoder::decode(const Tensor& encoder_hi auto output_tensor = request.get_tensor("logits"); - m_initial_step = false; m_cache_position += seq_length; return {output_tensor, infer_ms}; @@ -192,11 +173,43 @@ void WhisperWithPastDecoder::_set_encoder_hidden_states_tensor(const Tensor& enc request.set_tensor("encoder_hidden_states", new_encoder_hidden_states); } +void WhisperWithPastDecoder::_set_past_key_value(const Tensor& beam_idx) { + const bool is_initial_step = m_cache_position == 0; + if (is_initial_step) { + return; + } + + const size_t batch_size = beam_idx.get_shape().at(0); + // no copy needed, just 'link' output tensor with input tensor + const bool can_link_past_key_value = batch_size == 1 && beam_idx.data()[0] == 0; + + if (!m_initial_past_key_value_set) { + if (can_link_past_key_value) { + link_past_key_value(m_request_decoder, m_request_decoder_with_past); + } else { + copy_past_key_value(m_request_decoder, m_request_decoder_with_past, beam_idx); + } + + m_initial_past_key_value_set = true; + return; + } + + if (m_past_key_value_linked) { + return; + } + + if (can_link_past_key_value) { + link_past_key_value(m_request_decoder_with_past, m_request_decoder_with_past); + m_past_key_value_linked = true; + } else { + copy_past_key_value(m_request_decoder_with_past, m_request_decoder_with_past, beam_idx); + } +}; + void WhisperWithPastDecoder::reset_state() { - reset_past_key_values(m_request_decoder_with_past); m_request_decoder_with_past.reset_state(); - m_decoder_with_past_kv_value_set = false; - m_initial_step = true; m_cache_position = 0; + m_initial_past_key_value_set = false; + m_past_key_value_linked = false; } } // namespace ov::genai \ No newline at end of file diff --git a/src/cpp/src/whisper/models/with_past_decoder.hpp b/src/cpp/src/whisper/models/with_past_decoder.hpp index e94f74a6c7..7eb3990cca 100644 --- a/src/cpp/src/whisper/models/with_past_decoder.hpp +++ b/src/cpp/src/whisper/models/with_past_decoder.hpp @@ -26,13 +26,14 @@ class WhisperWithPastDecoder : public WhisperDecoder { private: ov::InferRequest m_request_decoder; ov::InferRequest m_request_decoder_with_past; - bool m_initial_step = true; - bool m_decoder_with_past_kv_value_set = false; size_t m_cache_position = 0; + bool m_initial_past_key_value_set = false; + bool m_past_key_value_linked = false; void _set_encoder_hidden_states_tensor(const Tensor& encoder_hidden_state, const size_t batch_size, InferRequest& request); + void _set_past_key_value(const Tensor& beam_idx); }; } // namespace ov::genai From 
4eaa9a715f7ad0ab8c3d1a3556a9c13117816afa Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Wed, 22 Jan 2025 10:06:29 +0100 Subject: [PATCH 16/27] Do not copy encoder_hidden_states if not needed --- .../src/whisper/models/statefull_decoder.cpp | 24 ++++++++++++------- .../src/whisper/models/with_past_decoder.cpp | 12 ++++++++++ 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/src/cpp/src/whisper/models/statefull_decoder.cpp b/src/cpp/src/whisper/models/statefull_decoder.cpp index 7d6837d323..55b111286b 100644 --- a/src/cpp/src/whisper/models/statefull_decoder.cpp +++ b/src/cpp/src/whisper/models/statefull_decoder.cpp @@ -41,7 +41,6 @@ std::pair WhisperStatefullDecoder::decode(const Tensor& encod const size_t batch_size = input_ids.get_shape().at(0); const size_t seq_len = input_ids.get_shape().at(1); - // todo: skip copy if already set and batch didn't changed _set_encoder_hidden_states_tensor(encoder_hidden_state, batch_size); _set_cache_position_tensor(seq_len); @@ -64,7 +63,15 @@ std::pair WhisperStatefullDecoder::decode(const Tensor& encod */ void WhisperStatefullDecoder::_set_encoder_hidden_states_tensor(const Tensor& encoder_hidden_state, const size_t batch_size) { - _reset_encoder_past_key_values_states(encoder_hidden_state, batch_size); + const size_t current_batch_size = m_request.get_tensor("encoder_hidden_states").get_shape().at(0); + // batch hasn't changed, skip + if (current_batch_size == batch_size) { + return; + } + + if (current_batch_size != 0) { + _reset_encoder_past_key_values_states(encoder_hidden_state, batch_size); + } OPENVINO_ASSERT(encoder_hidden_state.get_shape().at(0) == 1); Shape shape{encoder_hidden_state.get_shape()}; shape[0] = batch_size; @@ -85,15 +92,10 @@ void WhisperStatefullDecoder::_set_encoder_hidden_states_tensor(const Tensor& en m_request.set_tensor("encoder_hidden_states", new_encoder_hidden_states); } -// Ensure encoder past_key values states are reset if batch size changed. This is workaround for Ticket: +// past_key_values states do not shrink/grow when the batch size changes. Reset past_key_values states as a workaround. 
+// Ticket: void WhisperStatefullDecoder::_reset_encoder_past_key_values_states(const Tensor& encoder_hidden_state, const size_t batch_size) { - const size_t current_batch_size = m_request.get_tensor("encoder_hidden_states").get_shape().at(0); - // batch hasn't changed, skip - if (current_batch_size == 0 || current_batch_size == batch_size) { - return; - } - const size_t encoder_state_length_dim = encoder_hidden_state.get_shape().at(1); for (auto& state : m_request.query_state()) { // find encoder states by dimension @@ -122,6 +124,10 @@ void WhisperStatefullDecoder::_set_cache_position_tensor(const size_t seq_len) { void WhisperStatefullDecoder::reset_state() { m_request.reset_state(); m_request.set_tensor("cache_position", ov::Tensor{ov::element::i64, {0}}); + + Shape encoder_hidden_states_shape{m_request.get_tensor("encoder_hidden_states").get_shape()}; + encoder_hidden_states_shape[0] = 0; + m_request.set_tensor("encoder_hidden_states", ov::Tensor{ov::element::f32, encoder_hidden_states_shape}); }; } // namespace ov::genai diff --git a/src/cpp/src/whisper/models/with_past_decoder.cpp b/src/cpp/src/whisper/models/with_past_decoder.cpp index 4541f2e194..11f1ef7713 100644 --- a/src/cpp/src/whisper/models/with_past_decoder.cpp +++ b/src/cpp/src/whisper/models/with_past_decoder.cpp @@ -154,6 +154,12 @@ std::pair WhisperWithPastDecoder::decode(const Tensor& encoder_hi void WhisperWithPastDecoder::_set_encoder_hidden_states_tensor(const Tensor& encoder_hidden_state, const size_t batch_size, InferRequest& request) { + const size_t current_batch_size = request.get_tensor("encoder_hidden_states").get_shape().at(0); + // batch hasn't changed, skip + if (current_batch_size == batch_size) { + return; + } + OPENVINO_ASSERT(encoder_hidden_state.get_shape().at(0) == 1); Shape shape{encoder_hidden_state.get_shape()}; shape[0] = batch_size; @@ -211,5 +217,11 @@ void WhisperWithPastDecoder::reset_state() { m_cache_position = 0; m_initial_past_key_value_set = false; m_past_key_value_linked = false; + + Shape encoder_hidden_states_shape{m_request_decoder_with_past.get_tensor("encoder_hidden_states").get_shape()}; + encoder_hidden_states_shape[0] = 0; + m_request_decoder.set_tensor("encoder_hidden_states", ov::Tensor{ov::element::f32, encoder_hidden_states_shape}); + m_request_decoder_with_past.set_tensor("encoder_hidden_states", + ov::Tensor{ov::element::f32, encoder_hidden_states_shape}); } } // namespace ov::genai \ No newline at end of file From 50fb8298508780eeb2048bd129be9078a5f3f60e Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Wed, 22 Jan 2025 10:55:10 +0100 Subject: [PATCH 17/27] Add stubs --- .../openvino/genai/generation_config.hpp | 4 +- .../genai/whisper_generation_config.hpp | 4 +- .../openvino_genai/py_openvino_genai.pyi | 115 +++++++++++++----- src/python/py_whisper_pipeline.cpp | 67 +++++++--- 4 files changed, 136 insertions(+), 54 deletions(-) diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp index 3a75fc02ea..5fe2d73259 100644 --- a/src/cpp/include/openvino/genai/generation_config.hpp +++ b/src/cpp/include/openvino/genai/generation_config.hpp @@ -143,7 +143,7 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { OPENVINO_DEPRECATED("Please, use `is_assisting_generation()` instead of `is_speculative_decoding()`. 
This method will be removed in 2026.0.0 release") bool is_speculative_decoding() const; - void update_generation_config(const ov::AnyMap& properties); + virtual void update_generation_config(const ov::AnyMap& properties); template util::EnableIfAllStringAny update_generation_config(Properties&&... properties) { @@ -152,7 +152,7 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { /// @brief checks that are no conflicting parameters, e.g. do_sample=true and num_beams > 1. /// @throws Exception if config is invalid. - void validate() const; + virtual void validate() const; }; /* diff --git a/src/cpp/include/openvino/genai/whisper_generation_config.hpp b/src/cpp/include/openvino/genai/whisper_generation_config.hpp index 18b4202609..4443fd01db 100644 --- a/src/cpp/include/openvino/genai/whisper_generation_config.hpp +++ b/src/cpp/include/openvino/genai/whisper_generation_config.hpp @@ -97,7 +97,7 @@ class OPENVINO_GENAI_EXPORTS WhisperGenerationConfig : public GenerationConfig { // A list containing the non-speech tokens that will be suppressed during generation. std::vector suppress_tokens; - void update_generation_config(const ov::AnyMap& config_map = {}); + void update_generation_config(const ov::AnyMap& config_map = {}) override; template util::EnableIfAllStringAny update_generation_config(Properties&&... properties) { @@ -106,7 +106,7 @@ class OPENVINO_GENAI_EXPORTS WhisperGenerationConfig : public GenerationConfig { /// @brief checks that are no conflicting parameters. /// @throws Exception if config is invalid. - void validate() const; + void validate() const override; }; /* diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index bba366401e..bbe581e184 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -348,11 +348,11 @@ class ChunkStreamerBase: """ End is called at the end of generation. It can be used to flush cache if your own streamer has one """ - def put(self, arg0: int) -> bool: + def put(self, token: int) -> bool: """ Put is called every time new token is generated. Returns a bool flag to indicate whether generation should be stopped, if return true generation stops """ - def put_chunk(self, arg0: list[int]) -> bool: + def put_chunk(self, tokens: list[int]) -> bool: """ Put is called every time new token chunk is generated. Returns a bool flag to indicate whether generation should be stopped, if return true generation stops """ @@ -1944,22 +1944,12 @@ class WhisperDecodedResults: @property def texts(self) -> list[str]: ... -class WhisperGenerationConfig: +class WhisperGenerationConfig(GenerationConfig): """ WhisperGenerationConfig - :param max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt + - `max_new_tokens`. Its effect is overridden by `max_new_tokens`, if also set. - :type max_length: int - - :param max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length. - :type max_new_tokens: int - - :param eos_token_id: End of stream token id. - :type eos_token_id: int - + Whisper specific parameters: - :param decoder_start_token_id: Corresponds to the ”<|startoftranscript|>” token. :type decoder_start_token_id: int @@ -2028,18 +2018,55 @@ class WhisperGenerationConfig: auto result = pipeline.generate(raw_speech, ov::genai::hotwords("Polychrome")); // He has gone and gone for good answered Polychrome who... 
:type hotwords: Optional[str] + + Generic parameters: + max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt + + max_new_tokens. Its effect is overridden by `max_new_tokens`, if also set. + max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length. + min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens. + ignore_eos: if set to true, then generation will not stop even if token is met. + eos_token_id: token_id of (end of sentence) + stop_strings: a set of strings that will cause pipeline to stop generating further tokens. + include_stop_str_in_output: if set to true stop string that matched generation will be included in generation output (default: false) + stop_token_ids: a set of tokens that will cause pipeline to stop generating further tokens. + echo: if set to true, the model will echo the prompt in the output. + logprobs: number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned. + Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0). + + repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty. + presence_penalty: reduces absolute log prob if the token was generated at least once. + frequency_penalty: reduces absolute log prob as many times as the token was generated. + + Beam search specific parameters: + num_beams: number of beams for beam search. 1 disables beam search. + num_beam_groups: number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. + diversity_penalty: value is subtracted from a beam's score if it generates the same token as any beam from other group at a particular time. + length_penalty: exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to + the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log + likelihood of the sequence (i.e. negative), length_penalty > 0.0 promotes longer sequences, while + length_penalty < 0.0 encourages shorter sequences. + num_return_sequences: the number of sequences to return for grouped beam search decoding. + no_repeat_ngram_size: if set to int > 0, all ngrams of that size can only occur once. + stop_criteria: controls the stopping condition for grouped beam search. It accepts the following values: + "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; + "openvino_genai.StopCriteria.HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates; + "openvino_genai.StopCriteria.NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm). + + Random sampling parameters: + temperature: the value used to modulate token probabilities for random sampling. + top_p: if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. + top_k: the number of highest probability vocabulary tokens to keep for top-k-filtering. + do_sample: whether or not to use multinomial random sampling that add up to `top_p` or higher are kept. + num_return_sequences: the number of sequences to generate from a single prompt. 
""" begin_suppress_tokens: list[int] decoder_start_token_id: int - eos_token_id: int hotwords: str | None initial_prompt: str | None is_multilingual: bool lang_to_id: dict[str, int] language: str | None max_initial_timestamp_index: int - max_length: int - max_new_tokens: int no_timestamps_token_id: int pad_token_id: int prev_sot_token_id: int @@ -2056,8 +2083,6 @@ class WhisperGenerationConfig: @typing.overload def __init__(self, **kwargs) -> None: ... - def set_eos_token_id(self, tokenizer_eos_token_id: int) -> None: - ... def update_generation_config(self, **kwargs) -> None: ... class WhisperPerfMetrics(PerfMetrics): @@ -2110,18 +2135,8 @@ class WhisperPipeline: WhisperGenerationConfig - :param max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt + - `max_new_tokens`. Its effect is overridden by `max_new_tokens`, if also set. - :type max_length: int - - :param max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length. - :type max_new_tokens: int - - :param eos_token_id: End of stream token id. - :type eos_token_id: int - + Whisper specific parameters: - :param decoder_start_token_id: Corresponds to the ”<|startoftranscript|>” token. :type decoder_start_token_id: int @@ -2190,6 +2205,46 @@ class WhisperPipeline: auto result = pipeline.generate(raw_speech, ov::genai::hotwords("Polychrome")); // He has gone and gone for good answered Polychrome who... :type hotwords: Optional[str] + + Generic parameters: + max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt + + max_new_tokens. Its effect is overridden by `max_new_tokens`, if also set. + max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length. + min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens. + ignore_eos: if set to true, then generation will not stop even if token is met. + eos_token_id: token_id of (end of sentence) + stop_strings: a set of strings that will cause pipeline to stop generating further tokens. + include_stop_str_in_output: if set to true stop string that matched generation will be included in generation output (default: false) + stop_token_ids: a set of tokens that will cause pipeline to stop generating further tokens. + echo: if set to true, the model will echo the prompt in the output. + logprobs: number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned. + Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0). + + repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty. + presence_penalty: reduces absolute log prob if the token was generated at least once. + frequency_penalty: reduces absolute log prob as many times as the token was generated. + + Beam search specific parameters: + num_beams: number of beams for beam search. 1 disables beam search. + num_beam_groups: number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. + diversity_penalty: value is subtracted from a beam's score if it generates the same token as any beam from other group at a particular time. + length_penalty: exponential penalty to the length that is used with beam-based generation. 
It is applied as an exponent to + the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log + likelihood of the sequence (i.e. negative), length_penalty > 0.0 promotes longer sequences, while + length_penalty < 0.0 encourages shorter sequences. + num_return_sequences: the number of sequences to return for grouped beam search decoding. + no_repeat_ngram_size: if set to int > 0, all ngrams of that size can only occur once. + stop_criteria: controls the stopping condition for grouped beam search. It accepts the following values: + "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; + "openvino_genai.StopCriteria.HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates; + "openvino_genai.StopCriteria.NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm). + + Random sampling parameters: + temperature: the value used to modulate token probabilities for random sampling. + top_p: if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. + top_k: the number of highest probability vocabulary tokens to keep for top-k-filtering. + do_sample: whether or not to use multinomial random sampling that add up to `top_p` or higher are kept. + num_return_sequences: the number of sequences to generate from a single prompt. """ def get_generation_config(self) -> WhisperGenerationConfig: ... diff --git a/src/python/py_whisper_pipeline.cpp b/src/python/py_whisper_pipeline.cpp index 55728409e8..d6ddaedbcf 100644 --- a/src/python/py_whisper_pipeline.cpp +++ b/src/python/py_whisper_pipeline.cpp @@ -17,6 +17,7 @@ namespace py = pybind11; using ov::genai::ChunkStreamerBase; using ov::genai::ChunkStreamerVariant; using ov::genai::DecodedResults; +using ov::genai::GenerationConfig; using ov::genai::OptionalWhisperGenerationConfig; using ov::genai::PerfMetrics; using ov::genai::RawSpeechInput; @@ -76,18 +77,8 @@ auto whisper_decoded_result_chunk = R"( auto whisper_generation_config_docstring = R"( WhisperGenerationConfig - :param max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt + - `max_new_tokens`. Its effect is overridden by `max_new_tokens`, if also set. - :type max_length: int - - :param max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length. - :type max_new_tokens: int - - :param eos_token_id: End of stream token id. - :type eos_token_id: int - + Whisper specific parameters: - :param decoder_start_token_id: Corresponds to the ”<|startoftranscript|>” token. :type decoder_start_token_id: int @@ -156,6 +147,46 @@ auto whisper_generation_config_docstring = R"( auto result = pipeline.generate(raw_speech, ov::genai::hotwords("Polychrome")); // He has gone and gone for good answered Polychrome who... :type hotwords: Optional[str] + + Generic parameters: + max_length: the maximum length the generated tokens can have. Corresponds to the length of the input prompt + + max_new_tokens. Its effect is overridden by `max_new_tokens`, if also set. + max_new_tokens: the maximum numbers of tokens to generate, excluding the number of tokens in the prompt. max_new_tokens has priority over max_length. 
+ min_new_tokens: set 0 probability for eos_token_id for the first eos_token_id generated tokens. + ignore_eos: if set to true, then generation will not stop even if token is met. + eos_token_id: token_id of (end of sentence) + stop_strings: a set of strings that will cause pipeline to stop generating further tokens. + include_stop_str_in_output: if set to true stop string that matched generation will be included in generation output (default: false) + stop_token_ids: a set of tokens that will cause pipeline to stop generating further tokens. + echo: if set to true, the model will echo the prompt in the output. + logprobs: number of top logprobs computed for each position, if set to 0, logprobs are not computed and value 0.0 is returned. + Currently only single top logprob can be returned, so any logprobs > 1 is treated as logprobs == 1. (default: 0). + + repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty. + presence_penalty: reduces absolute log prob if the token was generated at least once. + frequency_penalty: reduces absolute log prob as many times as the token was generated. + + Beam search specific parameters: + num_beams: number of beams for beam search. 1 disables beam search. + num_beam_groups: number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. + diversity_penalty: value is subtracted from a beam's score if it generates the same token as any beam from other group at a particular time. + length_penalty: exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to + the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log + likelihood of the sequence (i.e. negative), length_penalty > 0.0 promotes longer sequences, while + length_penalty < 0.0 encourages shorter sequences. + num_return_sequences: the number of sequences to return for grouped beam search decoding. + no_repeat_ngram_size: if set to int > 0, all ngrams of that size can only occur once. + stop_criteria: controls the stopping condition for grouped beam search. It accepts the following values: + "openvino_genai.StopCriteria.EARLY", where the generation stops as soon as there are `num_beams` complete candidates; + "openvino_genai.StopCriteria.HEURISTIC" is applied and the generation stops when is it very unlikely to find better candidates; + "openvino_genai.StopCriteria.NEVER", where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm). + + Random sampling parameters: + temperature: the value used to modulate token probabilities for random sampling. + top_p: if set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. + top_k: the number of highest probability vocabulary tokens to keep for top-k-filtering. + do_sample: whether or not to use multinomial random sampling that add up to `top_p` or higher are kept. + num_return_sequences: the number of sequences to generate from a single prompt. )"; auto streamer_base_docstring = R"( @@ -274,17 +305,16 @@ void init_whisper_pipeline(py::module_& m) { "End is called at the end of generation. 
It can be used to flush cache if your own streamer has one"); // Binding for WhisperGenerationConfig - py::class_(m, "WhisperGenerationConfig", whisper_generation_config_docstring) + py::class_(m, + "WhisperGenerationConfig", + whisper_generation_config_docstring) .def(py::init(), py::arg("json_path"), "path where generation_config.json is stored") .def(py::init([](const py::kwargs& kwargs) { return *update_whisper_config_from_kwargs(WhisperGenerationConfig(), kwargs); })) - .def_readwrite("max_new_tokens", &WhisperGenerationConfig::max_new_tokens) - .def_readwrite("max_length", &WhisperGenerationConfig::max_length) .def_readwrite("begin_suppress_tokens", &WhisperGenerationConfig::begin_suppress_tokens) .def_readwrite("suppress_tokens", &WhisperGenerationConfig::suppress_tokens) .def_readwrite("decoder_start_token_id", &WhisperGenerationConfig::decoder_start_token_id) - .def_readwrite("eos_token_id", &WhisperGenerationConfig::eos_token_id) .def_readwrite("pad_token_id", &WhisperGenerationConfig::pad_token_id) .def_readwrite("translate_token_id", &WhisperGenerationConfig::translate_token_id) .def_readwrite("transcribe_token_id", &WhisperGenerationConfig::transcribe_token_id) @@ -298,12 +328,9 @@ void init_whisper_pipeline(py::module_& m) { .def_readwrite("return_timestamps", &WhisperGenerationConfig::return_timestamps) .def_readwrite("initial_prompt", &WhisperGenerationConfig::initial_prompt) .def_readwrite("hotwords", &WhisperGenerationConfig::hotwords) - .def("set_eos_token_id", &WhisperGenerationConfig::set_eos_token_id, py::arg("tokenizer_eos_token_id")) - .def("update_generation_config", []( - ov::genai::WhisperGenerationConfig& config, - const py::kwargs& kwargs) { + .def("update_generation_config", [](ov::genai::WhisperGenerationConfig& config, const py::kwargs& kwargs) { config.update_generation_config(pyutils::kwargs_to_any_map(kwargs)); - });; + }); py::class_(m, "WhisperRawPerfMetrics", raw_perf_metrics_docstring) .def(py::init<>()) From 502174285d349a325a99e7b763126245bd31ad9e Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Wed, 22 Jan 2025 10:59:06 +0100 Subject: [PATCH 18/27] Remove comment --- src/cpp/src/whisper/models/with_past_decoder.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/cpp/src/whisper/models/with_past_decoder.cpp b/src/cpp/src/whisper/models/with_past_decoder.cpp index a6c68bffcb..60e1adcd3c 100644 --- a/src/cpp/src/whisper/models/with_past_decoder.cpp +++ b/src/cpp/src/whisper/models/with_past_decoder.cpp @@ -123,7 +123,6 @@ std::pair WhisperWithPastDecoder::decode(const Tensor& encoder_hi const size_t batch_size = input_ids.get_shape().at(0); const size_t seq_length = input_ids.get_shape().at(1); - // todo: skip copy if already set and batch didn't changed _set_encoder_hidden_states_tensor(encoder_hidden_state, batch_size, request); request.set_tensor("input_ids", input_ids); From 04318d10a6ded8290bf01ae90ed7d94532a2eeab Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Wed, 22 Jan 2025 11:24:38 +0100 Subject: [PATCH 19/27] Add args name --- src/python/py_whisper_pipeline.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/python/py_whisper_pipeline.cpp b/src/python/py_whisper_pipeline.cpp index d6ddaedbcf..aac14c258a 100644 --- a/src/python/py_whisper_pipeline.cpp +++ b/src/python/py_whisper_pipeline.cpp @@ -295,11 +295,13 @@ void init_whisper_pipeline(py::module_& m) { .def("put", &ChunkStreamerBase::put, "Put is called every time new token is generated. 
Returns a bool flag to indicate whether generation " - "should be stopped, if return true generation stops") + "should be stopped, if return true generation stops", + py::arg("token")) .def("put_chunk", &ChunkStreamerBase::put_chunk, "Put is called every time new token chunk is generated. Returns a bool flag to indicate whether " - "generation should be stopped, if return true generation stops") + "generation should be stopped, if return true generation stops", + py::arg("tokens")) .def("end", &ChunkStreamerBase::end, "End is called at the end of generation. It can be used to flush cache if your own streamer has one"); From 9e08d180fdd48685fd5a755eb6e7cdc1a2637648 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Wed, 22 Jan 2025 14:28:27 +0100 Subject: [PATCH 20/27] add tests --- src/cpp/src/whisper/whisper.cpp | 2 +- src/cpp/src/whisper_generation_config.cpp | 10 ++--- tests/python_tests/test_whisper_pipeline.py | 50 ++++++++++++++++++++- 3 files changed, 53 insertions(+), 9 deletions(-) diff --git a/src/cpp/src/whisper/whisper.cpp b/src/cpp/src/whisper/whisper.cpp index 26fb735903..cbc26e490e 100644 --- a/src/cpp/src/whisper/whisper.cpp +++ b/src/cpp/src/whisper/whisper.cpp @@ -101,7 +101,7 @@ std::pair decode(std::shared_ptrhas_finished()) { + while (!sequence_group->has_finished() && !sequence_group->handle_dropped()) { std::map> batch_to_generated_ids{}; sequence_group->schedule_tokens(1); diff --git a/src/cpp/src/whisper_generation_config.cpp b/src/cpp/src/whisper_generation_config.cpp index 85b3635ee6..ec12170cf9 100644 --- a/src/cpp/src/whisper_generation_config.cpp +++ b/src/cpp/src/whisper_generation_config.cpp @@ -82,12 +82,10 @@ void WhisperGenerationConfig::validate() const { OPENVINO_ASSERT(!task.has_value(), "Cannot specify 'task' for not multilingual model."); } - if (is_beam_search()) { - OPENVINO_ASSERT(num_return_sequences == 1, - "'num_return_sequences' must be 1. Provided: ", - num_return_sequences, - "."); - } + OPENVINO_ASSERT(num_return_sequences == 1, + "'num_return_sequences' must be 1. 
Provided: ", + num_return_sequences, + "."); OPENVINO_ASSERT(!is_assisting_generation(), "Assisted generation is not supported."); } diff --git a/tests/python_tests/test_whisper_pipeline.py b/tests/python_tests/test_whisper_pipeline.py index 4fe239b358..3893becd7e 100644 --- a/tests/python_tests/test_whisper_pipeline.py +++ b/tests/python_tests/test_whisper_pipeline.py @@ -126,9 +126,14 @@ def run_huggingface( return pipeline( sample, - max_new_tokens=min(config.max_new_tokens, 444), return_timestamps=config.return_timestamps, - generate_kwargs={"language": config.language, "task": config.task}, + generate_kwargs={ + "language": config.language, + "task": config.task, + "max_new_tokens": min(config.max_new_tokens, 444), + "top_p": config.top_p, + "do_sample": config.do_sample, + }, ) @@ -147,6 +152,8 @@ def run_genai( genai_config.return_timestamps = config.return_timestamps genai_config.task = config.task genai_config.language = f"<|{config.language}|>" if config.language else None + genai_config.do_sample = config.do_sample + genai_config.top_p = config.top_p return pipeline.generate(sample, genai_config, streamer=streamer) @@ -555,6 +562,45 @@ def test_initial_prompt_hotwords(model_descr, test_sample): assert "Joel Kyton" in result.texts[0] +@pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) +@pytest.mark.parametrize("test_sample", get_samples_from_dataset(length=1)) +@pytest.mark.precommit +def test_random_sampling(model_descr, test_sample): + _, _, hf_pipe, genai_pipe = read_whisper_model(model_descr) + + config = ov_genai.WhisperGenerationConfig(do_sample=True, top_p=0.01) + + genai_result = run_genai( + genai_pipe, + test_sample, + config=config, + ) + + hf_result = run_huggingface( + hf_pipe, + test_sample, + config=config, + ) + + compare_results(hf_result, genai_result) + + config.top_p = 0.6 + + genai_result = run_genai( + genai_pipe, + test_sample, + config=config, + ) + + hf_result = run_huggingface( + hf_pipe, + test_sample, + config=config, + ) + + assert genai_result.texts[0] != hf_result["text"] + + @pytest.mark.parametrize("model_descr", get_whisper_models_list(tiny_only=True)) @pytest.mark.parametrize( "test_sample", From 56bf11c1c07558e764049a541788d54697307f40 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Wed, 22 Jan 2025 17:35:00 +0100 Subject: [PATCH 21/27] Apply review comments --- src/cpp/include/openvino/genai/generation_config.hpp | 4 ++-- src/cpp/src/whisper/whisper.cpp | 8 +++----- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp index 5fe2d73259..3a75fc02ea 100644 --- a/src/cpp/include/openvino/genai/generation_config.hpp +++ b/src/cpp/include/openvino/genai/generation_config.hpp @@ -143,7 +143,7 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { OPENVINO_DEPRECATED("Please, use `is_assisting_generation()` instead of `is_speculative_decoding()`. This method will be removed in 2026.0.0 release") bool is_speculative_decoding() const; - virtual void update_generation_config(const ov::AnyMap& properties); + void update_generation_config(const ov::AnyMap& properties); template util::EnableIfAllStringAny update_generation_config(Properties&&... properties) { @@ -152,7 +152,7 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { /// @brief checks that are no conflicting parameters, e.g. do_sample=true and num_beams > 1. /// @throws Exception if config is invalid. 
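The tightened validation earlier in this series (num_return_sequences must be 1 for Whisper even without beam search) can be exercised directly against validate(). This is a standalone sketch, not part of the patch; it assumes a default-constructed WhisperGenerationConfig passes the multilingual checks and that the failed assertion surfaces as a std::exception:

    #include <iostream>
    #include "openvino/genai/whisper_generation_config.hpp"

    int main() {
        ov::genai::WhisperGenerationConfig config;
        config.num_return_sequences = 2;  // anything other than 1 is now rejected

        try {
            config.validate();
            std::cerr << "unexpected: validate() accepted num_return_sequences == 2\n";
        } catch (const std::exception& ex) {
            std::cout << "rejected as expected: " << ex.what() << "\n";
        }
        return 0;
    }

The new random-sampling tests above lean on the same idea: do_sample and top_p come from the inherited GenerationConfig surface rather than from Whisper-specific fields.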
- virtual void validate() const; + void validate() const; }; /* diff --git a/src/cpp/src/whisper/whisper.cpp b/src/cpp/src/whisper/whisper.cpp index cbc26e490e..4031149163 100644 --- a/src/cpp/src/whisper/whisper.cpp +++ b/src/cpp/src/whisper/whisper.cpp @@ -90,12 +90,10 @@ std::pair decode(std::shared_ptrupdate_processed_tokens_num(sequence_group->get_prompt_len() - output_sequence_len); - sequence_group->schedule_tokens(output_sequence_len); + sequence_group->schedule_tokens(sequence_group->get_prompt_len()); + sequence_group->set_output_seq_len(output_sequence_len); sampler.sample({sequence_group}, logits); stream_generated_tokens(); From 50eb509cc9a61f2a71291a38391307d13d942a2b Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Thu, 23 Jan 2025 10:51:28 +0100 Subject: [PATCH 22/27] move set_encoder_states to base class --- .../genai/whisper_generation_config.hpp | 4 +- src/cpp/src/whisper/models/decoder.cpp | 33 ++++++++++++ src/cpp/src/whisper/models/decoder.hpp | 5 ++ .../src/whisper/models/statefull_decoder.cpp | 52 +------------------ .../src/whisper/models/statefull_decoder.hpp | 2 - .../src/whisper/models/with_past_decoder.cpp | 33 ------------ .../src/whisper/models/with_past_decoder.hpp | 3 -- 7 files changed, 41 insertions(+), 91 deletions(-) diff --git a/src/cpp/include/openvino/genai/whisper_generation_config.hpp b/src/cpp/include/openvino/genai/whisper_generation_config.hpp index 4443fd01db..18b4202609 100644 --- a/src/cpp/include/openvino/genai/whisper_generation_config.hpp +++ b/src/cpp/include/openvino/genai/whisper_generation_config.hpp @@ -97,7 +97,7 @@ class OPENVINO_GENAI_EXPORTS WhisperGenerationConfig : public GenerationConfig { // A list containing the non-speech tokens that will be suppressed during generation. std::vector suppress_tokens; - void update_generation_config(const ov::AnyMap& config_map = {}) override; + void update_generation_config(const ov::AnyMap& config_map = {}); template util::EnableIfAllStringAny update_generation_config(Properties&&... properties) { @@ -106,7 +106,7 @@ class OPENVINO_GENAI_EXPORTS WhisperGenerationConfig : public GenerationConfig { /// @brief checks that are no conflicting parameters. /// @throws Exception if config is invalid. - void validate() const override; + void validate() const; }; /* diff --git a/src/cpp/src/whisper/models/decoder.cpp b/src/cpp/src/whisper/models/decoder.cpp index 32a8f2eff6..0b2a083908 100644 --- a/src/cpp/src/whisper/models/decoder.cpp +++ b/src/cpp/src/whisper/models/decoder.cpp @@ -22,5 +22,38 @@ std::shared_ptr WhisperDecoder::from_path(const std::filesystem: return std::make_shared(models_path, device, properties); } +/** + * Encoder hidden states expected to be with batch 1 + * Copy encoder hidden state tensor from batch 1 to requested batch_size. + * Set new encoder hidden states tensor to infer request. 
+ */ +void WhisperDecoder::_set_encoder_hidden_states_tensor(const Tensor& encoder_hidden_state, + const size_t batch_size, + InferRequest& request) { + const size_t current_batch_size = request.get_tensor("encoder_hidden_states").get_shape().at(0); + // batch hasn't changed, skip + if (current_batch_size == batch_size) { + return; + } + + OPENVINO_ASSERT(encoder_hidden_state.get_shape().at(0) == 1); + Shape shape{encoder_hidden_state.get_shape()}; + shape[0] = batch_size; + + Tensor new_encoder_hidden_states{ov::element::f32, shape}; + + auto new_encoder_hidden_states_data = new_encoder_hidden_states.data(); + auto encoder_hidden_state_data = encoder_hidden_state.data(); + + for (size_t batch = 0; batch < batch_size; batch++) { + const size_t batch_offset = batch * encoder_hidden_state.get_size(); + std::memcpy(new_encoder_hidden_states_data + batch_offset, + encoder_hidden_state_data, + encoder_hidden_state.get_byte_size()); + } + + request.set_tensor("encoder_hidden_states", new_encoder_hidden_states); +} + WhisperDecoder::~WhisperDecoder() = default; } // namespace ov::genai diff --git a/src/cpp/src/whisper/models/decoder.hpp b/src/cpp/src/whisper/models/decoder.hpp index acb10d92b5..66e86a0733 100644 --- a/src/cpp/src/whisper/models/decoder.hpp +++ b/src/cpp/src/whisper/models/decoder.hpp @@ -25,5 +25,10 @@ class WhisperDecoder { virtual void reset_state() = 0; virtual ~WhisperDecoder(); + +protected: + void _set_encoder_hidden_states_tensor(const Tensor& encoder_hidden_state, + const size_t batch_size, + InferRequest& request); }; } // namespace ov::genai diff --git a/src/cpp/src/whisper/models/statefull_decoder.cpp b/src/cpp/src/whisper/models/statefull_decoder.cpp index 55b111286b..9c0c4a0b3f 100644 --- a/src/cpp/src/whisper/models/statefull_decoder.cpp +++ b/src/cpp/src/whisper/models/statefull_decoder.cpp @@ -41,7 +41,7 @@ std::pair WhisperStatefullDecoder::decode(const Tensor& encod const size_t batch_size = input_ids.get_shape().at(0); const size_t seq_len = input_ids.get_shape().at(1); - _set_encoder_hidden_states_tensor(encoder_hidden_state, batch_size); + _set_encoder_hidden_states_tensor(encoder_hidden_state, batch_size, m_request); _set_cache_position_tensor(seq_len); m_request.set_tensor("input_ids", input_ids); @@ -56,56 +56,6 @@ std::pair WhisperStatefullDecoder::decode(const Tensor& encod return {output_tensor, infer_ms}; }; -/** - * Encoder hidden states expected to be with batch 1 - * Copy encoder hidden state tensor from batch 1 to requested batch_size. - * Set new encoder hidden states tensor to infer request. 
- */ -void WhisperStatefullDecoder::_set_encoder_hidden_states_tensor(const Tensor& encoder_hidden_state, - const size_t batch_size) { - const size_t current_batch_size = m_request.get_tensor("encoder_hidden_states").get_shape().at(0); - // batch hasn't changed, skip - if (current_batch_size == batch_size) { - return; - } - - if (current_batch_size != 0) { - _reset_encoder_past_key_values_states(encoder_hidden_state, batch_size); - } - - OPENVINO_ASSERT(encoder_hidden_state.get_shape().at(0) == 1); - Shape shape{encoder_hidden_state.get_shape()}; - shape[0] = batch_size; - - Tensor new_encoder_hidden_states{ov::element::f32, shape}; - - auto new_encoder_hidden_states_data = new_encoder_hidden_states.data(); - auto encoder_hidden_state_data = encoder_hidden_state.data(); - - for (size_t batch = 0; batch < batch_size; batch++) { - const size_t batch_offset = batch * encoder_hidden_state.get_size(); - std::memcpy(new_encoder_hidden_states_data + batch_offset, - encoder_hidden_state_data, - encoder_hidden_state.get_byte_size()); - } - - m_request.set_tensor("encoder_hidden_states", new_encoder_hidden_states); -} - -// Past_key value states are not shring/grow when batch is changed. Reset past_key values states as a workaround. -// Ticket: -void WhisperStatefullDecoder::_reset_encoder_past_key_values_states(const Tensor& encoder_hidden_state, - const size_t batch_size) { - const size_t encoder_state_length_dim = encoder_hidden_state.get_shape().at(1); - for (auto& state : m_request.query_state()) { - // find encoder states by dimension - const Shape& state_shape = state.get_state().get_shape(); - if (state_shape.at(2) == encoder_state_length_dim) { - state.reset(); - } - } -} - void WhisperStatefullDecoder::_set_cache_position_tensor(const size_t seq_len) { ov::Tensor cache_position_tensor = m_request.get_tensor("cache_position"); diff --git a/src/cpp/src/whisper/models/statefull_decoder.hpp b/src/cpp/src/whisper/models/statefull_decoder.hpp index 4d4572c33d..44156fc6aa 100644 --- a/src/cpp/src/whisper/models/statefull_decoder.hpp +++ b/src/cpp/src/whisper/models/statefull_decoder.hpp @@ -24,8 +24,6 @@ class WhisperStatefullDecoder : public WhisperDecoder { void reset_state() override; private: - void _set_encoder_hidden_states_tensor(const Tensor& encoder_hidden_state, const size_t batch_size); - void _reset_encoder_past_key_values_states(const Tensor& encoder_hidden_state, const size_t batch_size); void _set_cache_position_tensor(const size_t seq_len); private: diff --git a/src/cpp/src/whisper/models/with_past_decoder.cpp b/src/cpp/src/whisper/models/with_past_decoder.cpp index 60e1adcd3c..2ab07112fa 100644 --- a/src/cpp/src/whisper/models/with_past_decoder.cpp +++ b/src/cpp/src/whisper/models/with_past_decoder.cpp @@ -145,39 +145,6 @@ std::pair WhisperWithPastDecoder::decode(const Tensor& encoder_hi return {output_tensor, infer_ms}; } -/** - * Encoder hidden states expected to be with batch 1 - * Copy encoder hidden state tensor from batch 1 to requested batch_size. - * Set new encoder hidden states tensor to infer request. 
- */ -void WhisperWithPastDecoder::_set_encoder_hidden_states_tensor(const Tensor& encoder_hidden_state, - const size_t batch_size, - InferRequest& request) { - const size_t current_batch_size = request.get_tensor("encoder_hidden_states").get_shape().at(0); - // batch hasn't changed, skip - if (current_batch_size == batch_size) { - return; - } - - OPENVINO_ASSERT(encoder_hidden_state.get_shape().at(0) == 1); - Shape shape{encoder_hidden_state.get_shape()}; - shape[0] = batch_size; - - Tensor new_encoder_hidden_states{ov::element::f32, shape}; - - auto new_encoder_hidden_states_data = new_encoder_hidden_states.data(); - auto encoder_hidden_state_data = encoder_hidden_state.data(); - - for (size_t batch = 0; batch < batch_size; batch++) { - const size_t batch_offset = batch * encoder_hidden_state.get_size(); - std::memcpy(new_encoder_hidden_states_data + batch_offset, - encoder_hidden_state_data, - encoder_hidden_state.get_byte_size()); - } - - request.set_tensor("encoder_hidden_states", new_encoder_hidden_states); -} - void WhisperWithPastDecoder::_set_past_key_value(const Tensor& beam_idx) { const bool is_initial_step = m_cache_position == 0; if (is_initial_step) { diff --git a/src/cpp/src/whisper/models/with_past_decoder.hpp b/src/cpp/src/whisper/models/with_past_decoder.hpp index 7eb3990cca..3cf4404092 100644 --- a/src/cpp/src/whisper/models/with_past_decoder.hpp +++ b/src/cpp/src/whisper/models/with_past_decoder.hpp @@ -30,9 +30,6 @@ class WhisperWithPastDecoder : public WhisperDecoder { bool m_initial_past_key_value_set = false; bool m_past_key_value_linked = false; - void _set_encoder_hidden_states_tensor(const Tensor& encoder_hidden_state, - const size_t batch_size, - InferRequest& request); void _set_past_key_value(const Tensor& beam_idx); }; From abde309857f2209e092d9920e0f8fe2672a3c013 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Thu, 23 Jan 2025 11:05:26 +0100 Subject: [PATCH 23/27] Move detect_language to base decoder --- .../whisper_speech_recognition.cpp | 2 +- src/cpp/src/whisper/models/decoder.cpp | 17 +++++++++++++++++ src/cpp/src/whisper/models/decoder.hpp | 3 +-- .../src/whisper/models/statefull_decoder.cpp | 17 ----------------- .../src/whisper/models/statefull_decoder.hpp | 3 --- .../src/whisper/models/with_past_decoder.cpp | 17 ----------------- .../src/whisper/models/with_past_decoder.hpp | 3 --- 7 files changed, 19 insertions(+), 43 deletions(-) diff --git a/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp b/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp index 3b2b4ff466..cbb932a74d 100644 --- a/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp +++ b/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp @@ -18,7 +18,7 @@ int main(int argc, char* argv[]) try { ov::genai::WhisperGenerationConfig config = pipeline.get_generation_config(); config.max_new_tokens = 100; // increase this based on your speech length // 'task' and 'language' parameters are supported for multilingual models only - config.language = "<|en|>"; // can switch to <|zh|> for Chinese language + // config.language = "<|en|>"; // can switch to <|zh|> for Chinese language config.task = "transcribe"; config.return_timestamps = true; diff --git a/src/cpp/src/whisper/models/decoder.cpp b/src/cpp/src/whisper/models/decoder.cpp index 0b2a083908..1c8df0edd9 100644 --- a/src/cpp/src/whisper/models/decoder.cpp +++ b/src/cpp/src/whisper/models/decoder.cpp @@ -22,6 +22,23 @@ std::shared_ptr WhisperDecoder::from_path(const 
std::filesystem: return std::make_shared(models_path, device, properties); } +std::pair WhisperDecoder::detect_language(const ov::Tensor& encoder_hidden_state, + const int64_t decoder_start_token_id) { + Tensor input_ids_tensor{ov::element::i64, {1, 1}}; + input_ids_tensor.data()[0] = decoder_start_token_id; + + Tensor beam_idx_tensor{ov::element::i32, {1}}; + beam_idx_tensor.data()[0] = 0; + + auto [output_tensor, infer_ms] = decode(encoder_hidden_state, input_ids_tensor, beam_idx_tensor); + + int64_t output_token = ov::genai::utils::argmax(output_tensor, 0); + + reset_state(); + + return {output_token, infer_ms}; +} + /** * Encoder hidden states expected to be with batch 1 * Copy encoder hidden state tensor from batch 1 to requested batch_size. diff --git a/src/cpp/src/whisper/models/decoder.hpp b/src/cpp/src/whisper/models/decoder.hpp index 66e86a0733..6eeba2b387 100644 --- a/src/cpp/src/whisper/models/decoder.hpp +++ b/src/cpp/src/whisper/models/decoder.hpp @@ -15,8 +15,7 @@ class WhisperDecoder { const std::string& device, const ov::AnyMap& properties); - virtual std::pair detect_language(const Tensor& encoder_hidden_state, - const int64_t decoder_start_token_id) = 0; + std::pair detect_language(const Tensor& encoder_hidden_state, const int64_t decoder_start_token_id); virtual std::pair decode(const Tensor& encoder_hidden_state, const Tensor& input_ids, diff --git a/src/cpp/src/whisper/models/statefull_decoder.cpp b/src/cpp/src/whisper/models/statefull_decoder.cpp index 9c0c4a0b3f..5208f496fb 100644 --- a/src/cpp/src/whisper/models/statefull_decoder.cpp +++ b/src/cpp/src/whisper/models/statefull_decoder.cpp @@ -18,23 +18,6 @@ WhisperStatefullDecoder::WhisperStatefullDecoder(const std::filesystem::path& mo m_request = compiled_model.create_infer_request(); } -std::pair WhisperStatefullDecoder::detect_language(const ov::Tensor& encoder_hidden_state, - const int64_t decoder_start_token_id) { - Tensor input_ids_tensor{ov::element::i64, {1, 1}}; - input_ids_tensor.data()[0] = decoder_start_token_id; - - Tensor beam_idx_tensor{ov::element::i32, {1}}; - beam_idx_tensor.data()[0] = 0; - - auto [output_tensor, infer_ms] = decode(encoder_hidden_state, input_ids_tensor, beam_idx_tensor); - - int64_t output_token = ov::genai::utils::argmax(output_tensor, 0); - - reset_state(); - - return {output_token, infer_ms}; -} - std::pair WhisperStatefullDecoder::decode(const Tensor& encoder_hidden_state, const Tensor& input_ids, const Tensor& beam_idx) { diff --git a/src/cpp/src/whisper/models/statefull_decoder.hpp b/src/cpp/src/whisper/models/statefull_decoder.hpp index 44156fc6aa..c8c733e943 100644 --- a/src/cpp/src/whisper/models/statefull_decoder.hpp +++ b/src/cpp/src/whisper/models/statefull_decoder.hpp @@ -14,9 +14,6 @@ class WhisperStatefullDecoder : public WhisperDecoder { const std::string& device, const ov::AnyMap& properties); - std::pair detect_language(const Tensor& encoder_hidden_state, - const int64_t decoder_start_token_id) override; - std::pair decode(const Tensor& encoder_hidden_state, const Tensor& input_ids, const Tensor& beam_idx) override; diff --git a/src/cpp/src/whisper/models/with_past_decoder.cpp b/src/cpp/src/whisper/models/with_past_decoder.cpp index 2ab07112fa..1ade0dea6b 100644 --- a/src/cpp/src/whisper/models/with_past_decoder.cpp +++ b/src/cpp/src/whisper/models/with_past_decoder.cpp @@ -97,23 +97,6 @@ WhisperWithPastDecoder::WhisperWithPastDecoder(const std::filesystem::path& mode m_request_decoder_with_past = compiled_model.create_infer_request(); } -std::pair 
WhisperWithPastDecoder::detect_language(const ov::Tensor& encoder_hidden_state, - const int64_t decoder_start_token_id) { - Tensor input_ids_tensor{ov::element::i64, {1, 1}}; - input_ids_tensor.data()[0] = decoder_start_token_id; - - Tensor beam_idx_tensor{ov::element::i32, {1}}; - beam_idx_tensor.data()[0] = 0; - - auto [output_tensor, infer_ms] = decode(encoder_hidden_state, input_ids_tensor, beam_idx_tensor); - - int64_t output_token = ov::genai::utils::argmax(output_tensor, 0); - - reset_state(); - - return {output_token, infer_ms}; -} - std::pair WhisperWithPastDecoder::decode(const Tensor& encoder_hidden_state, const Tensor& input_ids, const Tensor& beam_idx) { diff --git a/src/cpp/src/whisper/models/with_past_decoder.hpp b/src/cpp/src/whisper/models/with_past_decoder.hpp index 3cf4404092..1610c60d4e 100644 --- a/src/cpp/src/whisper/models/with_past_decoder.hpp +++ b/src/cpp/src/whisper/models/with_past_decoder.hpp @@ -14,9 +14,6 @@ class WhisperWithPastDecoder : public WhisperDecoder { const std::string& device, const ov::AnyMap& properties); - std::pair detect_language(const Tensor& encoder_hidden_state, - const int64_t decoder_start_token_id) override; - std::pair decode(const Tensor& encoder_hidden_state, const Tensor& input_ids, const Tensor& beam_idx) override; From 3348ad5f5cc91ec9a616b7b8bbb28003bce1616a Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Thu, 23 Jan 2025 11:05:58 +0100 Subject: [PATCH 24/27] revert sample --- .../whisper_speech_recognition/whisper_speech_recognition.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp b/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp index cbb932a74d..3b2b4ff466 100644 --- a/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp +++ b/samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp @@ -18,7 +18,7 @@ int main(int argc, char* argv[]) try { ov::genai::WhisperGenerationConfig config = pipeline.get_generation_config(); config.max_new_tokens = 100; // increase this based on your speech length // 'task' and 'language' parameters are supported for multilingual models only - // config.language = "<|en|>"; // can switch to <|zh|> for Chinese language + config.language = "<|en|>"; // can switch to <|zh|> for Chinese language config.task = "transcribe"; config.return_timestamps = true; From d63e70079939696f6fa51fa516b872ee77f75be4 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Thu, 23 Jan 2025 14:20:09 +0100 Subject: [PATCH 25/27] Move whisper utils --- src/cpp/src/utils.cpp | 16 ---------------- src/cpp/src/utils.hpp | 2 -- src/cpp/src/whisper/models/decoder.cpp | 2 +- src/cpp/src/whisper/whisper_utils.cpp | 16 ++++++++++++++++ src/cpp/src/whisper/whisper_utils.hpp | 2 ++ 5 files changed, 19 insertions(+), 19 deletions(-) diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp index dd3051b8b0..a8cf844cb7 100644 --- a/src/cpp/src/utils.cpp +++ b/src/cpp/src/utils.cpp @@ -46,22 +46,6 @@ void print_tensor(const ov::Tensor& tensor) { std::cout << "]" << std::endl; } -int64_t argmax(const ov::Tensor& logits, const size_t batch_idx) { - if (logits.get_shape()[0] <= batch_idx) { - OPENVINO_THROW("logits batch size doesn't match the number of beams"); - } - - size_t vocab_size = logits.get_shape().back(); - size_t batch_offset = batch_idx * logits.get_shape()[1] * vocab_size; - size_t sequence_offset = (logits.get_shape()[1] - 1) * vocab_size; - const float* logits_data = logits.data() 
+ batch_offset + sequence_offset; - - int64_t out_token = std::max_element(logits_data, logits_data + vocab_size) - logits_data; - float max_logit = logits_data[out_token]; - - return out_token; -} - /** * Initializes position ids based on attention mask and starting position */ diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp index c25b2c3913..8c56c39a8c 100644 --- a/src/cpp/src/utils.hpp +++ b/src/cpp/src/utils.hpp @@ -47,8 +47,6 @@ Tensor init_attention_mask(const Tensor& position_ids); void print_tensor(const ov::Tensor& tensor); -int64_t argmax(const ov::Tensor& logits, const size_t batch_idx); - void initialize_position_ids(ov::Tensor& position_ids, const ov::Tensor& attention_mask, int64_t start_pos = 0); ov::Tensor extend_attention(ov::Tensor attention_mask); diff --git a/src/cpp/src/whisper/models/decoder.cpp b/src/cpp/src/whisper/models/decoder.cpp index 1c8df0edd9..c09a84ccdd 100644 --- a/src/cpp/src/whisper/models/decoder.cpp +++ b/src/cpp/src/whisper/models/decoder.cpp @@ -6,7 +6,7 @@ #include #include "statefull_decoder.hpp" -#include "utils.hpp" +#include "whisper/whisper_utils.hpp" #include "with_past_decoder.hpp" namespace ov::genai { diff --git a/src/cpp/src/whisper/whisper_utils.cpp b/src/cpp/src/whisper/whisper_utils.cpp index 6e56a1439d..f41d3d11d8 100644 --- a/src/cpp/src/whisper/whisper_utils.cpp +++ b/src/cpp/src/whisper/whisper_utils.cpp @@ -41,6 +41,22 @@ void filter_non_segment_metrics(ov::genai::RawPerfMetrics& raw_metrics, filter_by_ranges(raw_metrics.m_batch_sizes, offset, ranges); } +int64_t argmax(const ov::Tensor& logits, const size_t batch_idx) { + if (logits.get_shape()[0] <= batch_idx) { + OPENVINO_THROW("logits batch size doesn't match the number of beams"); + } + + size_t vocab_size = logits.get_shape().back(); + size_t batch_offset = batch_idx * logits.get_shape()[1] * vocab_size; + size_t sequence_offset = (logits.get_shape()[1] - 1) * vocab_size; + const float* logits_data = logits.data() + batch_offset + sequence_offset; + + int64_t out_token = std::max_element(logits_data, logits_data + vocab_size) - logits_data; + float max_logit = logits_data[out_token]; + + return out_token; +} + } // namespace utils } // namespace genai } // namespace ov diff --git a/src/cpp/src/whisper/whisper_utils.hpp b/src/cpp/src/whisper/whisper_utils.hpp index 234feed6a8..8fd0a080c6 100644 --- a/src/cpp/src/whisper/whisper_utils.hpp +++ b/src/cpp/src/whisper/whisper_utils.hpp @@ -17,6 +17,8 @@ void filter_non_segment_metrics(ov::genai::RawPerfMetrics& raw_metrics, size_t offset, std::vector>& ranges); +int64_t argmax(const ov::Tensor& logits, const size_t batch_idx); + } // namespace utils } // namespace genai } // namespace ov From 6411b173abd2141f147e0471fe73c2e9c71ca0a2 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Thu, 23 Jan 2025 14:35:27 +0100 Subject: [PATCH 26/27] Add get_max_new_tokens for sequence group --- src/cpp/src/sampler.cpp | 19 +++++++++++-------- src/cpp/src/sampler.hpp | 2 +- src/cpp/src/sequence_group.hpp | 6 +++++- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp index a1957b0630..7a1e079746 100644 --- a/src/cpp/src/sampler.cpp +++ b/src/cpp/src/sampler.cpp @@ -408,7 +408,7 @@ void Sampler::GroupBeamSearcher::select_next_tokens(const ov::Tensor& logits, } // check whether group has finished - group.is_done(m_parameters, m_sequence_group->get_prompt_len()); + group.is_done(); // group cannot continue if there are no valid child beams if 
(child_beams_per_group[group_id].size() == 0) { @@ -549,14 +549,14 @@ std::vector Sampler::_try_finish_generation(SequenceGroup::Ptr & sequen std::vector dropped_seq_ids; for (auto& running_sequence : sequence_group->get_running_sequences()) { const auto generated_len = running_sequence->get_generated_len(); - if (sampling_params.get_max_new_tokens(sequence_group->get_prompt_len()) <= generated_len || + if (sequence_group->get_max_new_tokens() <= generated_len || is_stop_token_id_hit(running_sequence->get_generated_ids().back(), sampling_params.stop_token_ids) && !sampling_params.ignore_eos) { // stop sequence by max_new_tokens or stop token (eos included) running_sequence->set_status(SequenceStatus::FINISHED); if (is_stop_token_id_hit(running_sequence->get_generated_ids().back(), sampling_params.stop_token_ids) && !sampling_params.ignore_eos) { running_sequence->set_finish_reason(GenerationFinishReason::STOP); - } else if (sampling_params.get_max_new_tokens(sequence_group->get_prompt_len()) == generated_len) { + } else if (sequence_group->get_max_new_tokens() == generated_len) { running_sequence->set_finish_reason(GenerationFinishReason::LENGTH); } @@ -800,8 +800,8 @@ SamplerOutput Sampler::sample(const std::vector & sequence_g // max counter of needed to be sampled tokens OPENVINO_ASSERT(running_sequence->get_generated_len() >= token_offset); size_t generated_and_verified_len = running_sequence->get_generated_len() - token_offset; - OPENVINO_ASSERT(sampling_params.get_max_new_tokens(sequence_group->get_prompt_len()) >= generated_and_verified_len); - size_t max_num_sampled_token = sampling_params.get_max_new_tokens(sequence_group->get_prompt_len()) - generated_and_verified_len; + OPENVINO_ASSERT(sequence_group->get_max_new_tokens() >= generated_and_verified_len); + size_t max_num_sampled_token = sequence_group->get_max_new_tokens() - generated_and_verified_len; if (max_num_sampled_token == 0) { stop_sample_tokens(running_sequence, token_offset, max_num_sampled_token, max_removed_tokens_per_request); break; @@ -887,7 +887,7 @@ SamplerOutput Sampler::sample(const std::vector & sequence_g // check max length stop criteria std::vector running_sequences = sequence_group->get_running_sequences(); if (!sequence_group->has_finished() && - running_sequences[0]->get_generated_len() == sampling_params.get_max_new_tokens(sequence_group->get_prompt_len())) { + running_sequences[0]->get_generated_len() == sequence_group->get_max_new_tokens()) { // stop sequence by max_new_tokens m_beam_search_info.at(request_id).finalize(sampler_output); } @@ -956,7 +956,10 @@ int64_t Sampler::GroupBeamSearcher::Group::finish(Beam beam, const ov::genai::Ge return preeempted_sequence_id; } -void Sampler::GroupBeamSearcher::Group::is_done(const ov::genai::GenerationConfig& sampling_params, size_t prompt_length) { +void Sampler::GroupBeamSearcher::Group::is_done() { + const auto sequence_group = ongoing.front().m_sequence->get_sequence_group_ptr(); + const ov::genai::GenerationConfig& sampling_params = sequence_group->get_sampling_parameters(); + assert(sampling_params.num_beams % sampling_params.num_beam_groups == 0 && "number of beams should be divisible by number of groups"); size_t group_size = sampling_params.num_beams / sampling_params.num_beam_groups; @@ -977,7 +980,7 @@ void Sampler::GroupBeamSearcher::Group::is_done(const ov::genai::GenerationConfi return; } case ov::genai::StopCriteria::NEVER: { - size_t length = sampling_params.length_penalty > 0.0 ? 
sampling_params.get_max_new_tokens(prompt_length) : cur_len; + size_t length = sampling_params.length_penalty > 0.0 ? sequence_group->get_max_new_tokens() : cur_len; float highest_attainable_score = best_sum_logprobs / std::pow(float(length), sampling_params.length_penalty); done = worst_score >= highest_attainable_score; return; diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp index 3b7d98a7d8..9768e0a7af 100644 --- a/src/cpp/src/sampler.hpp +++ b/src/cpp/src/sampler.hpp @@ -114,7 +114,7 @@ class Sampler::GroupBeamSearcher { bool done = false; int64_t finish(Beam beam, const ov::genai::GenerationConfig& sampling_params); - void is_done(const ov::genai::GenerationConfig& sampling_params, size_t prompt_length); + void is_done(); }; SequenceGroup::Ptr m_sequence_group; diff --git a/src/cpp/src/sequence_group.hpp b/src/cpp/src/sequence_group.hpp index fef9757b43..19d29c92ac 100644 --- a/src/cpp/src/sequence_group.hpp +++ b/src/cpp/src/sequence_group.hpp @@ -689,7 +689,11 @@ class SequenceGroup : public std::enable_shared_from_this { GenerationOutputs outputs; outputs.emplace(0, output); m_generation_stream->push(std::move(outputs)); - } + } + + size_t get_max_new_tokens() { + return m_sampling_params.get_max_new_tokens(get_prompt_len()); + } }; inline std::shared_ptr Sequence::get_sequence_group_ptr() const { From f9cb4613b87d0dc4ad1e4d2882007b99102bab04 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Thu, 23 Jan 2025 14:40:33 +0100 Subject: [PATCH 27/27] Use sg get_max_new_tokens --- src/cpp/src/scheduler.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/scheduler.hpp b/src/cpp/src/scheduler.hpp index 86f705f759..eabbee935f 100644 --- a/src/cpp/src/scheduler.hpp +++ b/src/cpp/src/scheduler.hpp @@ -493,7 +493,7 @@ class Scheduler { for (auto idx = 0; idx < sequence_groups.size(); idx++) { auto seq_length = sequence_groups[idx]->get_prompt_len() * m_kv_blocks_initial_multiplier; auto gen_config = sequence_groups[idx]->get_sampling_parameters(); - seq_length = std::min(seq_length, sequence_groups[idx]->get_prompt_len() + gen_config.get_max_new_tokens(sequence_groups[idx]->get_prompt_len())); + seq_length = std::min(seq_length, sequence_groups[idx]->get_prompt_len() + sequence_groups[idx]->get_max_new_tokens()); size_t blocks_num = std::ceil((float)seq_length / m_block_manager->get_block_size()); if (gen_config.is_beam_search()) { blocks_num *= gen_config.num_beams;
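The get_max_new_tokens() helper that these last two patches thread through the sampler and scheduler has a simple resolution rule. Below is a reduced sketch, with member names following the library but the structs stripped down; the SIZE_MAX-based defaulting mirrors the existing GenerationConfig behaviour as understood here and is an assumption, not part of the patch:

    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <limits>

    struct GenerationConfig {
        size_t max_new_tokens = std::numeric_limits<size_t>::max();
        size_t max_length = std::numeric_limits<size_t>::max();

        // max_new_tokens wins when set; otherwise the budget is derived from
        // max_length minus the prompt length (guarded against underflow).
        size_t get_max_new_tokens(size_t prompt_length) const {
            if (max_new_tokens != std::numeric_limits<size_t>::max()) {
                return max_new_tokens;
            }
            return max_length - std::min(max_length, prompt_length);
        }
    };

    struct SequenceGroup {
        size_t prompt_len = 0;
        GenerationConfig sampling_params;

        // The helper added in patch 26: callers no longer pass the prompt length
        // to every get_max_new_tokens() call site.
        size_t get_max_new_tokens() const {
            return sampling_params.get_max_new_tokens(prompt_len);
        }
    };

    int main() {
        SequenceGroup group;
        group.prompt_len = 32;
        group.sampling_params.max_new_tokens = 100;
        std::cout << group.get_max_new_tokens() << "\n";  // prints 100
        return 0;
    }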