From 88d42d8fc30fd2ea50f12c9f345b1e06fe248cfa Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Fri, 1 Nov 2024 17:50:56 +0400 Subject: [PATCH 01/28] Phi-3.5-vision-instruc --- src/cpp/src/visual_language/clip.cpp | 2 +- src/cpp/src/visual_language/clip.hpp | 1 + .../src/visual_language/inputs_embedder.cpp | 121 ++++++++++ .../src/visual_language/inputs_embedder.hpp | 1 + .../src/visual_language/processor_config.cpp | 4 + .../src/visual_language/processor_config.hpp | 8 +- .../src/visual_language/vision_encoder.cpp | 211 ++++++++++++++++++ .../src/visual_language/vision_encoder.hpp | 4 + .../src/visual_language/vlm_model_type.hpp | 4 +- src/docs/SUPPORTED_MODELS.md | 10 + 10 files changed, 363 insertions(+), 3 deletions(-) diff --git a/src/cpp/src/visual_language/clip.cpp b/src/cpp/src/visual_language/clip.cpp index d7b3c6fb05..c02201ab80 100644 --- a/src/cpp/src/visual_language/clip.cpp +++ b/src/cpp/src/visual_language/clip.cpp @@ -28,7 +28,7 @@ inline float clip_lerp(float s, float e, float t) { return s + (e - s) * t; } // Bilinear resize function -static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) { +void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) { dst.nx = target_width; dst.ny = target_height; dst.buf.resize(3 * target_width * target_height); diff --git a/src/cpp/src/visual_language/clip.hpp b/src/cpp/src/visual_language/clip.hpp index 9494a48fd9..6c7acc24ab 100644 --- a/src/cpp/src/visual_language/clip.hpp +++ b/src/cpp/src/visual_language/clip.hpp @@ -33,6 +33,7 @@ struct clip_image_f32 { bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img); bool bicubic_resize(const clip_image_u8& img, clip_image_u8& dst, int target_width, int target_height); +void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height); /** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */ clip_image_f32 clip_image_preprocess(struct clip_ctx& ctx, const clip_image_u8& img); diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index b01f45917b..50fa57aa7b 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -1007,6 +1007,125 @@ class InputsEmbedderInternVLChat : public InputsEmbedder::IInputsEmbedder { } }; +class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { +public: + InputsEmbedderPhi3V( + const VLMConfig& vlm_config, + const std::filesystem::path& model_dir, + const std::string& device, + const ov::AnyMap device_config + ) : IInputsEmbedder(vlm_config, model_dir, device, device_config), m_image_id{0} {} + + virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images) override { + std::string images_prompt; + std::vector embeds; + for (const ov::Tensor& image : to_single_image_tensors(images)) { + EncodedImage encoded_image = m_vision_encoder.encode(image); + } + ov::Tensor inputs_embeds; + // if (m_vlm_config.use_image_id) { + // images_prompt += m_vlm_config.im_id_start + std::to_string(m_image_id) + m_vlm_config.im_id_end; + // ++m_image_id; + // } + // std::string unk64; + // for (size_t idx = 0; idx < m_vlm_config.query_num; ++idx) { + // unk64 += m_vlm_config.unk; + // } + // images_prompt += m_vlm_config.im_start + unk64 + m_vlm_config.im_end; + 
// if (encoded_image.slices) { + // ov::Shape slices_shape = encoded_image.slices.get_shape(); + // for (size_t row_idx = 0; row_idx < slices_shape.at(0); ++row_idx) { + // for (size_t col_idx = 0; col_idx < slices_shape.at(1); ++col_idx) { + // images_prompt += m_vlm_config.slice_start + unk64 + m_vlm_config.slice_end; + // } + // images_prompt += '\n'; + // } + // } + // if ('\n' != *(images_prompt.end() - 1)) { + // // Image wasn't sliced, add \n to the end of image anyway. + // // Strangely, \n isn't placed between . + // images_prompt += '\n'; + // } + // embeds.push_back(std::move(encoded_image)); + // } + // images_prompt += prompt; + + // ov::Tensor encoded_input = get_encoded_input_ids(images_prompt); + + // ov::Tensor inputs_embeds = m_embedding.infer(encoded_input); + // OPENVINO_ASSERT( + // m_vlm_config.hidden_size == inputs_embeds.get_shape().at(2), + // "Unexpected embedding size" + // ); + // ov::Tensor special_tokens = m_tokenizer.encode( + // m_vlm_config.im_start + // + m_vlm_config.im_end + // + m_vlm_config.slice_start + // + m_vlm_config.slice_end + // ).input_ids; + // OPENVINO_ASSERT( + // 4 == special_tokens.get_shape().at(1), + // "Every special token must be represented with a single int." + // ); + // int64_t im_start_id = special_tokens.data()[0]; + // int64_t im_end_id = special_tokens.data()[1]; + // int64_t slice_start_id = special_tokens.data()[2]; + // int64_t slice_end_id = special_tokens.data()[3]; + // int64_t im_start_pos = 0, slice_start_pos = 0; + // int64_t* begin = encoded_input.data(); + // int64_t* ids = begin; + // size_t encoded_input_size = encoded_input.get_size(); + // int64_t* end = ids + encoded_input_size; + // float* inputs_embeds_data = inputs_embeds.data(); + // for (const EncodedImage& encoded_image : embeds) { + // const ov::Tensor& resampled_source = resample(encoded_image.resized_source, {encoded_image.resized_source_size}); + // float* emb = resampled_source.data(); + // ids = std::find(ids, end, im_start_id); + // OPENVINO_ASSERT(end != ids); + // ++ids; + // std::copy_n(emb, resampled_source.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); + // ids += m_vlm_config.query_num; + // if (encoded_image.slices) { + // size_t token_idx = 0; + // const ov::Shape& slices_shape = encoded_image.slices.get_shape(); + // for (size_t i = 0; i < slices_shape.at(0); ++i) { + // for (size_t ja = 0; ja < slices_shape.at(1); ++ja) { + // size_t d2 = slices_shape.at(2); + // size_t d3 = slices_shape.at(3); + // ov::Tensor encoded_view{ov::element::f32, {1, d2, d3}, encoded_image.slices.data() + (i * slices_shape.at(1) + ja) * d2 * d3}; + // const ov::Tensor& vision_embed_tensor_i_j = resample(encoded_view, {encoded_image.slices_size}); + // ids = std::find(ids, end, slice_start_id); + // OPENVINO_ASSERT(end != ids); + // ++ids; + // std::copy_n(vision_embed_tensor_i_j.data(), vision_embed_tensor_i_j.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); + // ids += m_vlm_config.query_num; + // } + // } + // } + // } + + if (!m_is_chat_conversation) { + m_image_id = 0; + } + + return inputs_embeds; + } + + virtual void start_chat(const std::string& system_message) override { + IInputsEmbedder::start_chat(system_message); + m_image_id = 0; + } + + virtual void finish_chat() override { + IInputsEmbedder::finish_chat(); + m_image_id = 0; + } + +private: + // Used to insert <|image_i|>\n per image (not a slice). 
+ size_t m_image_id; +}; + InputsEmbedder::InputsEmbedder(const VLMConfig& vlm_config, const std::filesystem::path& model_dir, const std::string& device, @@ -1019,6 +1138,8 @@ InputsEmbedder::InputsEmbedder(const VLMConfig& vlm_config, m_impl = std::make_shared(vlm_config, model_dir, device, device_config); } else if (vlm_config.model_type == VLMModelType::INTERNVL_CHAT) { m_impl = std::make_shared(vlm_config, model_dir, device, device_config); + } else if (vlm_config.model_type == VLMModelType::PHI3_V) { + m_impl = std::make_shared(vlm_config, model_dir, device, device_config); } else { OPENVINO_THROW("Unsupported model type in VLM InputsEmbedder class. Please, create feature request on new model support"); } diff --git a/src/cpp/src/visual_language/inputs_embedder.hpp b/src/cpp/src/visual_language/inputs_embedder.hpp index 15df273ee8..8e4442c407 100644 --- a/src/cpp/src/visual_language/inputs_embedder.hpp +++ b/src/cpp/src/visual_language/inputs_embedder.hpp @@ -45,6 +45,7 @@ class InputsEmbedder { friend class InputsEmbedderLLaVA; friend class InputsEmbedderLLaVANext; friend class InputsEmbedderInternVLChat; + friend class InputsEmbedderPhi3V; }; } // namespace ov::genai diff --git a/src/cpp/src/visual_language/processor_config.cpp b/src/cpp/src/visual_language/processor_config.cpp index 7b953e5bed..e0d29a02c4 100644 --- a/src/cpp/src/visual_language/processor_config.cpp +++ b/src/cpp/src/visual_language/processor_config.cpp @@ -41,4 +41,8 @@ ov::genai::ProcessorConfig::ProcessorConfig(const std::filesystem::path& json_pa if (parsed.contains("image_grid_pinpoints")) { image_grid_pinpoints = parsed.at("image_grid_pinpoints").get>>(); } + read_json_param(parsed, "num_crops", phi3_v.num_crops); + if (parsed.contains("img_processor")) { + phi3_v.num_img_tokens = parsed.at("img_processor").at("num_img_tokens"); + } } diff --git a/src/cpp/src/visual_language/processor_config.hpp b/src/cpp/src/visual_language/processor_config.hpp index 83cf9870a3..c7eac68204 100644 --- a/src/cpp/src/visual_language/processor_config.hpp +++ b/src/cpp/src/visual_language/processor_config.hpp @@ -35,9 +35,10 @@ class ProcessorConfig { /// llava calls it image_std. std::array norm_std{1.0f, 1.0f, 1.0f}; - // llava specific config params + // A renamed version of norm_mean. std::array image_mean{0.0f, 0.0f, 0.0f}; std::array image_std{1.0f, 1.0f, 1.0f}; + // llava specific config params size_t crop_size_height = 336; size_t crop_size_width = 336; size_t size_shortest_edge = 336; @@ -45,6 +46,11 @@ class ProcessorConfig { // llava-next specific config params std::vector> image_grid_pinpoints{{336, 672}, {672, 336}, {672, 672}, {1008, 336}, {336, 1008}}; + struct { + size_t num_crops = 4; + size_t num_img_tokens = 144; + } phi3_v; + /// @brief Default constructor ProcessorConfig() = default; /// @brief Construct ProcessorConfig from values in json_path. 
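For orientation, the two phi3_v fields added above are what later sizes the per-image feature sequence. A minimal sketch (the free function and its name are illustrative and not part of this patch; it only restates the arithmetic that appears in get_pixel_values_phi3_v further down), assuming height and width are the HD-transformed dimensions already padded to multiples of 336:

    #include <cmath>
    #include <cstddef>

    // 144 and 336 correspond to the num_img_tokens default above and the
    // INPUT_IMAGE_SIZE constant introduced in vision_encoder.cpp below.
    size_t phi3_v_prompt_tokens_per_image(size_t height, size_t width, size_t num_img_tokens = 144) {
        const size_t tile = 336;
        size_t rows = height / tile, cols = width / tile;
        size_t side = static_cast<size_t>(std::sqrt(num_img_tokens)); // 12 when num_img_tokens == 144
        return rows * cols * num_img_tokens + 1 + (rows + 1) * side;
    }
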
diff --git a/src/cpp/src/visual_language/vision_encoder.cpp b/src/cpp/src/visual_language/vision_encoder.cpp index 0b6b169f18..98705f63d0 100644 --- a/src/cpp/src/visual_language/vision_encoder.cpp +++ b/src/cpp/src/visual_language/vision_encoder.cpp @@ -613,8 +613,209 @@ ov::Tensor get_pixel_values_internvl(const ov::Tensor& image, const ProcessorCon } return output_tensor; } + +namespace phi3_v { +constexpr size_t INPUT_IMAGE_SIZE = 336; + +ov::Tensor padding_336(const ov::Tensor& unpadded) { + ov::Shape _1ss3 = unpadded.get_shape(); + size_t s1 = _1ss3.at(1), s2 = _1ss3.at(2); + // TODO: test horizontal and vertical images + if (s1 < s2) { + size_t tar = size_t(std::ceil(float(s1) / INPUT_IMAGE_SIZE) * INPUT_IMAGE_SIZE); + size_t top_padding = (tar - s1) / 2; + ov::Tensor padded{ov::element::u8, {1, tar, s2, 3}}; + uint8_t* padded_data = padded.data(); + std::fill_n(padded_data, padded.get_size(), 255); + std::copy_n(unpadded.data(), unpadded.get_size(), padded_data + top_padding * s2 * 3); + return padded; + } + size_t tar = size_t(std::ceil(float(s2) / INPUT_IMAGE_SIZE) * INPUT_IMAGE_SIZE); + size_t left_padding = (tar - s2) / 2; + ov::Tensor padded{ov::element::u8, {1, s1, tar, 3}}; + uint8_t* padded_data = padded.data(); + std::fill_n(padded_data, padded.get_size(), 255); + uint8_t* unpadded_data = unpadded.data(); + for (size_t row = 0; row < s1; ++row) { + std::copy_n(unpadded_data + row * s2 * 3, s2 * 3, padded_data + row * tar * 3 + left_padding * 3); + } + return padded; +} + +ov::Tensor HD_transform(const ov::Tensor& uint8, size_t num_crops) { + ov::Shape _1hwc = uint8.get_shape(); + size_t height = _1hwc.at(1), width = _1hwc.at(2); + bool trans = false; + if (width < height) { + std::swap(height, width); + trans = true; + } + float ratio = float(width) / height; + unsigned scale = 1; + while (scale * std::ceil(scale / ratio) <= num_crops) { + ++scale; + } + --scale; + size_t new_w = scale * INPUT_IMAGE_SIZE; + size_t new_h = new_w / ratio; + clip_image_u8 src{}, dst{}; + uint8_t* uint8_data = uint8.data(); + if (trans) { + src = clip_image_u8{height, width, {uint8_data, uint8_data + uint8.get_size()}}; + bilinear_resize(src, dst, new_h, new_w); + // std::cout << new_h << ' ' << new_w << '\n'; + return padding_336(ov::Tensor{ov::element::u8, {1, new_w, new_h, 3}, dst.buf.data()}); + } + src = clip_image_u8{width, height, {uint8_data, uint8_data + uint8.get_size()}}; + bilinear_resize(src, dst, new_w, new_h); + // std::cout << new_w << ' ' << new_h << '\n'; + // 672, 448 + return padding_336(ov::Tensor{ov::element::u8, {1, new_h, new_w, 3}, dst.buf.data()}); +} + +ov::Tensor mean_scale(const ov::Tensor& uint8, const ProcessorConfig& config) { + uint8_t* uint_8_data = uint8.data(); + ov::Tensor float_normalized{ov::element::f32, uint8.get_shape()}; + float* float_data = float_normalized.data(); + OPENVINO_ASSERT(0 == uint8.get_size() % 3, "RGB"); + for (size_t idx = 0; idx < uint8.get_size(); idx += 3) { + float_data[idx] = (float(uint_8_data[idx]) / 255.0f - config.norm_mean[0]) / config.norm_std[0]; + float_data[idx + 1] = (float(uint_8_data[idx + 1]) / 255.0f - config.norm_mean[1]) / config.norm_std[1]; + float_data[idx + 2] = (float(uint_8_data[idx + 2]) / 255.0f - config.norm_mean[2]) / config.norm_std[2]; + } + return float_normalized; } +ov::Tensor channels_first(const ov::Tensor& _1hw3) { + ov::Shape shape = _1hw3.get_shape(); + ov::Tensor _13hw = ov::Tensor{ov::element::f32, {1, 3, shape.at(1), shape.at(2)}}; + float* _1hw3_data = _1hw3.data(); + float* _13hw_data = 
_13hw.data(); + for (size_t plane = 0; plane < 3; ++plane) { + for (size_t row = 0; row < shape.at(1); ++row) { + for (size_t col = 0; col < shape.at(2); ++col) { + _13hw_data[plane * shape.at(1) * shape.at(2) + row * shape.at(2) + col] = _1hw3_data[row * shape.at(2) * 3 + col * 3 + plane]; + } + } + } + return _13hw; +} + +// Reimplementation of Python im.reshape(1, 3, h//336, 336, w//336, 336).permute(0,2,4,1,3,5).reshape(-1, 3, 336, 336) +ov::Tensor slice_image(const ov::Tensor& image) { + ov::Shape shape = image.get_shape(); + size_t N = shape[0]; + size_t C = shape[1]; + size_t H = shape[2]; + size_t W = shape[3]; + + size_t num_h_slices = H / INPUT_IMAGE_SIZE; + size_t num_w_slices = W / INPUT_IMAGE_SIZE; + + // Step 1: Define and populate the reshaped tensor in the correct shape order + ov::Tensor reshaped{ov::element::f32, {N, num_h_slices, num_w_slices, C, INPUT_IMAGE_SIZE, INPUT_IMAGE_SIZE}}; + float* reshaped_data = reshaped.data(); + float* image_data = image.data(); + + // Populate the reshaped tensor + for (size_t n = 0; n < N; ++n) { + for (size_t h = 0; h < num_h_slices; ++h) { + for (size_t w = 0; w < num_w_slices; ++w) { + for (size_t c = 0; c < C; ++c) { + for (size_t i = 0; i < INPUT_IMAGE_SIZE; ++i) { + for (size_t j = 0; j < INPUT_IMAGE_SIZE; ++j) { + size_t src_idx = n * C * H * W + c * H * W + (h * INPUT_IMAGE_SIZE + i) * W + (w * INPUT_IMAGE_SIZE + j); + size_t dst_idx = n * num_h_slices * num_w_slices * C * INPUT_IMAGE_SIZE * INPUT_IMAGE_SIZE + + h * num_w_slices * C * INPUT_IMAGE_SIZE * INPUT_IMAGE_SIZE + + w * C * INPUT_IMAGE_SIZE * INPUT_IMAGE_SIZE + + c * INPUT_IMAGE_SIZE * INPUT_IMAGE_SIZE + + i * INPUT_IMAGE_SIZE + j; + reshaped_data[dst_idx] = image_data[src_idx]; + } + } + } + } + } + } + + // Step 2: Define the permuted tensor in the final shape + ov::Tensor permuted{ov::element::f32, {N * num_h_slices * num_w_slices, C, INPUT_IMAGE_SIZE, INPUT_IMAGE_SIZE}}; + float* permuted_data = permuted.data(); + + // Perform permutation by flattening N, num_h_slices, and num_w_slices + for (size_t n = 0; n < N; ++n) { + for (size_t h = 0; h < num_h_slices; ++h) { + for (size_t w = 0; w < num_w_slices; ++w) { + for (size_t c = 0; c < C; ++c) { + for (size_t i = 0; i < INPUT_IMAGE_SIZE; ++i) { + for (size_t j = 0; j < INPUT_IMAGE_SIZE; ++j) { + size_t src_idx = n * num_h_slices * num_w_slices * C * INPUT_IMAGE_SIZE * INPUT_IMAGE_SIZE + + h * num_w_slices * C * INPUT_IMAGE_SIZE * INPUT_IMAGE_SIZE + + w * C * INPUT_IMAGE_SIZE * INPUT_IMAGE_SIZE + + c * INPUT_IMAGE_SIZE * INPUT_IMAGE_SIZE + + i * INPUT_IMAGE_SIZE + j; + size_t dst_idx = (n * num_h_slices * num_w_slices + h * num_w_slices + w) * C * INPUT_IMAGE_SIZE * INPUT_IMAGE_SIZE + + c * INPUT_IMAGE_SIZE * INPUT_IMAGE_SIZE + + i * INPUT_IMAGE_SIZE + j; + permuted_data[dst_idx] = reshaped_data[src_idx]; + } + } + } + } + } + } + + return permuted; +} + +ov::Tensor concatenate_batch(const ov::Tensor& float_first, const ov::Tensor& float_second) { + ov::Shape shape_first = float_first.get_shape(); + ov::Shape shape_second = float_second.get_shape(); + OPENVINO_ASSERT(shape_first.at(1) == shape_second.at(1), "Channels must be the same"); + OPENVINO_ASSERT(shape_first.at(2) == shape_second.at(2), "Height must be the same"); + OPENVINO_ASSERT(shape_first.at(3) == shape_second.at(3), "Width must be the same"); + ov::Tensor concatenated{ov::element::f32, {shape_first.at(0) + shape_second.at(0), shape_first.at(1), shape_first.at(2), shape_first.at(3)}}; + float* concatenated_data = concatenated.data(); + float* first_data = 
float_first.data(); + float* second_data = float_second.data(); + std::copy(first_data, first_data + float_first.get_size(), concatenated_data); + std::copy(second_data, second_data + float_second.get_size(), concatenated_data + float_first.get_size()); + return concatenated; +} + +ov::Tensor pad_to_max_num_crops_tensor(const ov::Tensor& nchw, size_t max_crops) { + ov::Shape shape = nchw.get_shape(); + size_t num_crops = shape[0]; + if (num_crops >= max_crops) { + return nchw; + } + ov::Tensor padded{ov::element::f32, {max_crops, shape[1], shape[2], shape[3]}}; + float* padded_data = padded.data(); + float* nchw_data = nchw.data(); + std::copy_n(nchw_data, nchw.get_size(), padded_data); + return padded; +} + +std::tuple get_pixel_values_phi3_v(const ov::Tensor& image, const ProcessorConfig& config) { + ov::Tensor hd_image = HD_transform(image, config.phi3_v.num_crops); // TODO: this is just resize_and_pad_image() from clip.hpp. + ImageSize image_size{hd_image.get_shape().at(2), hd_image.get_shape().at(1)}; + clip_image_u8 img{hd_image.get_shape().at(2), hd_image.get_shape().at(1), {hd_image.data(), hd_image.data() + hd_image.get_size()}}; + clip_image_u8 dst; + bicubic_resize(img, dst, INPUT_IMAGE_SIZE, INPUT_IMAGE_SIZE); + ov::Tensor global_image{ov::element::u8, {1, INPUT_IMAGE_SIZE, INPUT_IMAGE_SIZE, 3}, dst.buf.data()}; + global_image = mean_scale(global_image, config); + hd_image = mean_scale(hd_image, config); + global_image = channels_first(global_image); + hd_image = channels_first(hd_image); + ov::Tensor slices = slice_image(hd_image); + ov::Tensor concatenated = concatenate_batch(global_image, slices); + ov::Tensor pixel_values = pad_to_max_num_crops_tensor(concatenated, config.phi3_v.num_crops); + size_t num_img_tokens = (image_size.height / INPUT_IMAGE_SIZE) * (image_size.width / INPUT_IMAGE_SIZE) * config.phi3_v.num_img_tokens + 1 + (image_size.height / INPUT_IMAGE_SIZE + 1) * size_t(std::sqrt(config.phi3_v.num_img_tokens)); + return {std::move(pixel_values), image_size, num_img_tokens}; +} +} // namespace phi3_v +} // anonymous namespace + VisionEncoder::VisionEncoder(const std::filesystem::path& model_dir, const VLMModelType model_type, const std::string& device, const ov::AnyMap device_config, ov::Core core) : model_type(model_type) { m_vision_encoder = core.compile_model(model_dir / "openvino_vision_embeddings_model.xml", device, device_config).create_infer_request(); @@ -632,6 +833,8 @@ EncodedImage VisionEncoder::encode(const ov::Tensor& image, const ProcessorConfi return encode_llava_next(image, config); } else if (model_type == VLMModelType::INTERNVL_CHAT) { return encode_internvl(image, config); + } else if (model_type == VLMModelType::PHI3_V) { + return encode_phi3_v(image, config); } else { OPENVINO_THROW("Unsupported type of VisionEncoder"); } @@ -705,3 +908,11 @@ EncodedImage VisionEncoder::encode_internvl(const ov::Tensor& image, const Proce return {std::move(image_features), resized_source_size}; } + +EncodedImage VisionEncoder::encode_phi3_v(const ov::Tensor& image, const ProcessorConfig& config) { + auto [pixel_values, image_size, num_img_tokens] = phi3_v::get_pixel_values_phi3_v(image, config); + // m_vision_encoder.set_input_tensor(); + m_vision_encoder.infer(); + + return {}; +} diff --git a/src/cpp/src/visual_language/vision_encoder.hpp b/src/cpp/src/visual_language/vision_encoder.hpp index a95abb838c..ffb5e82d44 100644 --- a/src/cpp/src/visual_language/vision_encoder.hpp +++ b/src/cpp/src/visual_language/vision_encoder.hpp @@ -142,5 +142,9 @@ class 
VisionEncoder { EncodedImage encode_internvl( const ov::Tensor& image, const ProcessorConfig& config ); + + EncodedImage encode_phi3_v( + const ov::Tensor& image, const ProcessorConfig& config + ); }; } diff --git a/src/cpp/src/visual_language/vlm_model_type.hpp b/src/cpp/src/visual_language/vlm_model_type.hpp index e4b5e823b6..86b23f50f8 100644 --- a/src/cpp/src/visual_language/vlm_model_type.hpp +++ b/src/cpp/src/visual_language/vlm_model_type.hpp @@ -16,6 +16,7 @@ enum class VLMModelType { LLAVA, LLAVA_NEXT, INTERNVL_CHAT, + PHI3_V, }; inline VLMModelType to_vlm_model_type(const std::string& value) { @@ -23,7 +24,8 @@ inline VLMModelType to_vlm_model_type(const std::string& value) { {"minicpmv", VLMModelType::MINICPM}, {"llava", VLMModelType::LLAVA}, {"llava_next", VLMModelType::LLAVA_NEXT}, - {"internvl_chat", VLMModelType::INTERNVL_CHAT} + {"internvl_chat", VLMModelType::INTERNVL_CHAT}, + {"phi3_v", VLMModelType::PHI3_V} }; auto it = model_types_map.find(value); diff --git a/src/docs/SUPPORTED_MODELS.md b/src/docs/SUPPORTED_MODELS.md index af3f8c064a..e36f15324d 100644 --- a/src/docs/SUPPORTED_MODELS.md +++ b/src/docs/SUPPORTED_MODELS.md @@ -257,6 +257,16 @@ The pipeline can work with other similar topologies produced by `optimum-intel` + + Phi3VForCausalLM + phi3_v + + + + From 9d7c7a0ad523ca1026e09cca8ad76462ed93886e Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Thu, 7 Nov 2024 16:05:17 +0400 Subject: [PATCH 02/28] encode --- .../visual_language_chat.cpp | 22 +++++++++---------- .../src/visual_language/vision_encoder.cpp | 8 +++---- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/samples/cpp/visual_language_chat/visual_language_chat.cpp b/samples/cpp/visual_language_chat/visual_language_chat.cpp index 3a655374e9..7e334f7502 100644 --- a/samples/cpp/visual_language_chat/visual_language_chat.cpp +++ b/samples/cpp/visual_language_chat/visual_language_chat.cpp @@ -9,7 +9,7 @@ bool print_subword(std::string&& subword) { return !(std::cout << subword << std::flush); } -int main(int argc, char* argv[]) try { +int main(int argc, char* argv[]) { if (3 != argc) { throw std::runtime_error(std::string{"Usage "} + argv[0] + " "); } @@ -48,14 +48,14 @@ int main(int argc, char* argv[]) try { "question:\n"; } pipe.finish_chat(); -} catch (const std::exception& error) { - try { - std::cerr << error.what() << '\n'; - } catch (const std::ios_base::failure&) {} - return EXIT_FAILURE; -} catch (...) { - try { - std::cerr << "Non-exception object thrown\n"; - } catch (const std::ios_base::failure&) {} - return EXIT_FAILURE; +// } catch (const std::exception& error) { +// try { +// std::cerr << error.what() << '\n'; +// } catch (const std::ios_base::failure&) {} +// return EXIT_FAILURE; +// } catch (...) 
{ +// try { +// std::cerr << "Non-exception object thrown\n"; +// } catch (const std::ios_base::failure&) {} +// return EXIT_FAILURE; } diff --git a/src/cpp/src/visual_language/vision_encoder.cpp b/src/cpp/src/visual_language/vision_encoder.cpp index 610d9dee67..90d16f743e 100644 --- a/src/cpp/src/visual_language/vision_encoder.cpp +++ b/src/cpp/src/visual_language/vision_encoder.cpp @@ -905,9 +905,9 @@ EncodedImage VisionEncoder::encode_internvl(const ov::Tensor& image, const Proce } EncodedImage VisionEncoder::encode_phi3_v(const ov::Tensor& image, const ProcessorConfig& config) { - auto [pixel_values, image_size, num_img_tokens] = phi3_v::get_pixel_values_phi3_v(image, config); - // m_vision_encoder.set_input_tensor(); + // TODO: drop num_img_tokens + const auto& [pixel_values, image_size, num_img_tokens] = phi3_v::get_pixel_values_phi3_v(image, config); + m_vision_encoder.set_input_tensor(pixel_values); m_vision_encoder.infer(); - - return {}; + return {m_vision_encoder.get_output_tensor(), image_size}; } From 21dc4984ff08db835bead81c7519025d15d100d9 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Wed, 27 Nov 2024 13:34:02 +0400 Subject: [PATCH 03/28] Add hd_feature_transformer --- .../src/visual_language/inputs_embedder.cpp | 196 +++++++++++++++++- .../src/visual_language/vision_encoder.cpp | 2 + 2 files changed, 194 insertions(+), 4 deletions(-) diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index c6c9f68ed7..e6fefe2d44 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -6,6 +6,7 @@ #include "visual_language/clip.hpp" #include "visual_language/vision_encoder.hpp" #include "visual_language/embedding_model.hpp" +#include "openvino/opsets/opset13.hpp" #include "utils.hpp" @@ -1006,20 +1007,207 @@ class InputsEmbedderInternVLChat : public InputsEmbedder::IInputsEmbedder { } }; +namespace { +namespace phi3_v { +ov::InferRequest create_hd_feature_transformer() { + using namespace ov; + using namespace element; + using namespace opset13; + using namespace std; + auto t0 = make_shared(f32, PartialShape{-1, 576, 1024}); + auto t1 = make_shared(i32, PartialShape{}); + auto t2 = make_shared(i32, PartialShape{}); + auto t3 = make_shared(t0); + auto t4 = make_shared(i64, Shape{}, vector{0}); + auto t5 = make_shared(i64, Shape{}, vector{0}); + auto t6 = make_shared(t3, t4, t5); + auto t7 = make_shared(i64, Shape{1}, vector{1}); + auto t8 = make_shared(t6, t7, false); + auto t9 = make_shared(i64, Shape{}, vector{1}); + auto t10 = make_shared(i64, Shape{}, vector{0}); + auto t11 = make_shared(t3, t9, t10); + auto t12 = make_shared(t11, element::f32); + auto t13 = make_shared(f32, Shape{}, vector{0.5}); + auto t14 = make_shared(t12, t13, "numpy"); + auto t15 = make_shared(t14, element::i32); + auto t16 = make_shared(t15, element::i64); + auto t17 = make_shared(i32, Shape{}, vector{0}); + auto t18 = make_shared(t16, t17); + auto t19 = make_shared(i64, Shape{1}, vector{2}); + auto t20 = make_shared(i64, Shape{}, vector{0}); + auto t21 = make_shared(t3, t19, t20); + auto t22 = make_shared(NodeVector{t8, t18, t18, t21}, 0); + auto t23 = make_shared(t0, t22, false); + auto t24 = make_shared(i64, Shape{}, vector{2}); + auto t25 = make_shared(t16, t24, "numpy"); + auto t26 = make_shared(t25); + auto t27 = make_shared(i32, Shape{}, vector{0}); + auto t28 = make_shared(t26, t27); + auto t29 = make_shared(i64, Shape{1}, vector{2}); + auto t30 = make_shared(i64, Shape{1}, 
vector{2}); + auto t31 = make_shared(NodeVector{t8, t28, t29, t28, t30, t21}, 0); + auto t32 = make_shared(t23, t31, false); + auto t33 = make_shared(i64, Shape{6}, vector{0, 1, 3, 2, 4, 5}); + auto t34 = make_shared(t32, t33); + auto t35 = make_shared(i64, Shape{1}, vector{-1}); + auto t36 = make_shared(i64, Shape{1}, vector{4}); + auto t37 = make_shared(t21, t36, "numpy"); + auto t38 = make_shared(NodeVector{t8, t35, t37}, 0); + auto t39 = make_shared(t34, t38, false); + auto t40 = make_shared(t1, t2, "numpy"); + auto t41 = make_shared(t40, element::i64); + auto t42 = make_shared(t6, t41, "numpy"); + auto t43 = make_shared(t42); + auto t44 = make_shared(i64, Shape{}, vector{0}); + auto t45 = make_shared(t43, t44); + auto t46 = make_shared(t1, element::i64); + auto t47 = make_shared(t46, t44); + auto t48 = make_shared(t2, element::i64); + auto t49 = make_shared(t48, t44); + auto t50 = make_shared(i64, Shape{1}, vector{-1}); + auto t51 = make_shared(NodeVector{t45, t47, t49, t28, t28, t50}, 0); + auto t52 = make_shared(t39, t51, false); + auto t53 = make_shared(i64, Shape{6}, vector{0, 1, 3, 2, 4, 5}); + auto t54 = make_shared(t52, t53); + auto t55 = make_shared(t1, t15, "numpy"); + auto t56 = make_shared(t55, element::i64); + auto t57 = make_shared(i64, Shape{}, vector{2}); + auto t58 = make_shared(t56, t57, "numpy"); + auto t59 = make_shared(t58); + auto t60 = make_shared(i32, Shape{}, vector{0}); + auto t61 = make_shared(t59, t60); + auto t62 = make_shared(t2, t15, "numpy"); + auto t63 = make_shared(t62, element::i64); + auto t64 = make_shared(i64, Shape{}, vector{2}); + auto t65 = make_shared(t63, t64, "numpy"); + auto t66 = make_shared(t65); + auto t67 = make_shared(t66, t60); + auto t68 = make_shared(NodeVector{t45, t61, t67, t37}, 0); + auto t69 = make_shared(t54, t68, false); + + // t0 = opset.Parameter({'shape': [-1, 576, 1024], 'element_type': 'f32'}, # -> f32[?,576,1024] + // t1 = opset.Parameter({'shape': [], 'element_type': 'i32'}, # -> i32[] + // t2 = opset.Parameter({'shape': [], 'element_type': 'i32'}, # -> i32[] + // t3 = opset.ShapeOf([t0], {'output_type': 'i64'}, # f32[?,576,1024] -> i64[3] + // t4 = opset.Constant(model, 4, # -> i64[](0) + // t5 = opset.Constant(model, 5, # -> i64[](0) + // t6 = opset.Gather([t3, t4, t5], {'batch_dims': 0}, # i64[3], i64[], i64[] -> i64[] + // t7 = opset.Constant(model, 7, # -> i64[1]([1]) + // t8 = opset.Reshape([t6, t7], {'special_zero': False}, # i64[], i64[1] -> i64[1] + // t9 = opset.Constant(model, 9, # -> i64[](1) + // t10 = opset.Constant(model, 10, # -> i64[](0) + // t11 = opset.Gather([t3, t9, t10], {'batch_dims': 0}, # i64[3], i64[], i64[] -> i64[] + // t12 = opset.Convert([t11], {'destination_type': 'f32'}, # i64[] -> f32[] + // t13 = opset.Constant(model, 13, # -> f32[](0.5) + // t14 = opset.Power([t12, t13], {'auto_broadcast': 'numpy'}, # f32[], f32[] -> f32[] + // t15 = opset.Convert([t14], {'destination_type': 'i32'}, # f32[] -> i32[] + // t16 = opset.Convert([t15], {'destination_type': 'i64'}, # i32[] -> i64[] + // t17 = opset.Constant(model, 17, # -> i32[](0) + // t18 = opset.Unsqueeze([t16, t17], {}, # i64[], i32[] -> i64[1] + // t19 = opset.Constant(model, 19, # -> i64[1]([2]) + // t20 = opset.Constant(model, 20, # -> i64[](0) + // t21 = opset.Gather([t3, t19, t20], {'batch_dims': 0}, # i64[3], i64[1], i64[] -> i64[1] + // t22 = opset.Concat([t8, t18, t18, t21], {'axis': 0}, # i64[1], i64[1], i64[1], i64[1] -> i64[4] + // t23 = opset.Reshape([t0, t22], {'special_zero': False}, # f32[?,576,1024], i64[4] -> 
f32[?,24,24,1024] + // t24 = opset.Constant(model, 24, # -> i64[](2) + // t25 = opset.Divide([t16, t24], {'auto_broadcast': 'numpy', 'm_pythondiv': True}, # i64[], i64[] -> i64[] + // t26 = opset.Floor([t25], {}, # i64[] -> i64[] + // t27 = opset.Constant(model, 27, # -> i32[](0) + // t28 = opset.Unsqueeze([t26, t27], {}, # i64[], i32[] -> i64[1] + // t29 = opset.Constant(model, 29, # -> i64[1]([2]) + // t30 = opset.Constant(model, 30, # -> i64[1]([2]) + // t31 = opset.Concat([t8, t28, t29, t28, t30, t21], {'axis': 0}, # i64[1], i64[1], i64[1], i64[1], i64[1], i64[1] -> i64[6] + // t32 = opset.Reshape([t23, t31], {'special_zero': False}, # f32[?,24,24,1024], i64[6] -> f32[?,12,2,12,2,1024] + // t33 = opset.Constant(model, 33, + // t34 = opset.Transpose([t32, t33], {}, # f32[?,12,2,12,2,1024], i64[6] -> f32[?,12,12,2,2,1024] + // t35 = opset.Constant(model, 35, # -> i64[1]([-1]) + // t36 = opset.Constant(model, 36, # -> i64[1]([4]) + // t37 = opset.Multiply([t21, t36], {'auto_broadcast': 'numpy'}, # i64[1], i64[1] -> i64[1] + // t38 = opset.Concat([t8, t35, t37], {'axis': 0}, # i64[1], i64[1], i64[1] -> i64[3] + // t39 = opset.Reshape([t34, t38], {'special_zero': False}, # f32[?,12,12,2,2,1024], i64[3] -> f32[?,?,4096] + // t40 = opset.Multiply([t1, t2], {'auto_broadcast': 'numpy'}, # i32[], i32[] -> i32[] + // t41 = opset.Convert([t40], {'destination_type': 'i64'}, # i32[] -> i64[] + // t42 = opset.Divide([t6, t41], {'auto_broadcast': 'numpy', 'm_pythondiv': True}, # i64[], i64[] -> i64[] + // t43 = opset.Floor([t42], {}, # i64[] -> i64[] + // t44 = opset.Constant(model, 44, # -> i32[](0) + // t45 = opset.Unsqueeze([t43, t44], {}, # i64[], i32[] -> i64[1] + // t46 = opset.Convert([t1], {'destination_type': 'i64'}, # i32[] -> i64[] + // t47 = opset.Unsqueeze([t46, t44], {}, # i64[], i32[] -> i64[1] + // t48 = opset.Convert([t2], {'destination_type': 'i64'}, # i32[] -> i64[] + // t49 = opset.Unsqueeze([t48, t44], {}, # i64[], i32[] -> i64[1] + // t50 = opset.Constant(model, 50, # -> i64[1]([-1]) + // t51 = opset.Concat([t45, t47, t49, t28, t28, t50], {'axis': 0}, # i64[1], i64[1], i64[1], i64[1], i64[1], i64[1] -> i64[6] + // t52 = opset.Reshape([t39, t51], {'special_zero': False}, # f32[?,?,4096], i64[6] -> f32[?,?,?,?,?,?] + // t53 = opset.Constant(model, 53, + // t54 = opset.Transpose([t52, t53], {}, # f32[?,?,?,?,?,?], i64[6] -> f32[?,?,?,?,?,?] + // t55 = opset.Multiply([t1, t15], {'auto_broadcast': 'numpy'}, # i32[], i32[] -> i32[] + // t56 = opset.Convert([t55], {'destination_type': 'i64'}, # i32[] -> i64[] + // t57 = opset.Constant(model, 57, # -> i64[](2) + // t58 = opset.Divide([t56, t57], {'auto_broadcast': 'numpy', 'm_pythondiv': True}, # i64[], i64[] -> i64[] + // t59 = opset.Floor([t58], {}, # i64[] -> i64[] + // t60 = opset.Constant(model, 60, # -> i32[](0) + // t61 = opset.Unsqueeze([t59, t60], {}, # i64[], i32[] -> i64[1] + // t62 = opset.Multiply([t2, t15], {'auto_broadcast': 'numpy'}, # i32[], i32[] -> i32[] + // t63 = opset.Convert([t62], {'destination_type': 'i64'}, # i32[] -> i64[] + // t64 = opset.Constant(model, 64, # -> i64[](2) + // t65 = opset.Divide([t63, t64], {'auto_broadcast': 'numpy', 'm_pythondiv': True}, # i64[], i64[] -> i64[] + // t66 = opset.Floor([t65], {}, # i64[] -> i64[] + // t67 = opset.Unsqueeze([t66, t60], {}, # i64[], i32[] -> i64[1] + // t68 = opset.Concat([t45, t61, t67, t37], {'axis': 0}, # i64[1], i64[1], i64[1], i64[1] -> i64[4] + // t69 = opset.Reshape([t54, t68], {'special_zero': False}, # f32[?,?,?,?,?,?], i64[4] -> f32[?,?,?,?] 
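    // Shape walk-through of the graph above (illustrative, for the case
    // h_crop = w_crop = 2, i.e. four sub-crops, so t0 = [4, 576, 1024]):
    //   t23: [4, 24, 24, 1024]
    //   t32: [4, 12, 2, 12, 2, 1024]
    //   t34: [4, 12, 12, 2, 2, 1024]   (permute 0,1,3,2,4,5)
    //   t39: [4, 144, 4096]            (2x2 patch merge -> 4 * 1024 channels)
    //   t52: [1, 2, 2, 12, 12, 4096]
    //   t54: [1, 2, 12, 2, 12, 4096]   (permute 0,1,3,2,4,5)
    //   t69: [1, 24, 24, 4096]         (num_images, h_crop*12, w_crop*12, 4096)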
+ shared_ptr model = make_shared(make_shared(t69), ParameterVector{t0, t1, t2}); + ov::InferRequest hd_feature_transformer = utils::singleton_core().compile_model( + model, "CPU" + ).create_infer_request(); + // hd_feature_transformer.set_input_tensor(0, ov::Tensor{f32, {4, 576, 1024}}); + // ov::Tensor h_crop = ov::Tensor{i32, {}}; + // h_crop.data()[0] = 2; + // hd_feature_transformer.set_input_tensor(1, h_crop); + // ov::Tensor w_crop = ov::Tensor{i32, {}}; + // w_crop.data()[0] = 2; + // hd_feature_transformer.set_input_tensor(2, w_crop); + // hd_feature_transformer.infer(); + return hd_feature_transformer; +} + +ov::Tensor reshape_hd_patches_2x2merge(const ov::Tensor& image_features, size_t h_crop, size_t w_crop) { + ov::Shape shape = image_features.get_shape(); + OPENVINO_ASSERT(3 == shape.size()); + OPENVINO_ASSERT(1 == shape.at(0)); + OPENVINO_ASSERT(24 * 24 == shape.at(1)); + OPENVINO_ASSERT(1024 == shape.at(2)); + return {}; +} + +// image_features.resized_source: (num_crops+1, 24*24, 1024) +ov::Tensor hd_feature_transform(const EncodedImage& image_features) { + ov::Tensor global_image_features{ov::element::f32, {1, 24*24, 1024}, image_features.resized_source.data()}; + // global feature can be viewed as a special HD case with num_crops 1x1 + ov::Tensor global_image_features_hd = reshape_hd_patches_2x2merge(global_image_features, 1, 1); + return {}; +} +} +} + class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { public: + ov::InferRequest m_hd_feature_transformer; + InputsEmbedderPhi3V( const VLMConfig& vlm_config, const std::filesystem::path& model_dir, const std::string& device, const ov::AnyMap device_config - ) : IInputsEmbedder(vlm_config, model_dir, device, device_config), m_image_id{0} {} + ): + IInputsEmbedder(vlm_config, model_dir, device, device_config), m_image_id{0}, + m_hd_feature_transformer{phi3_v::create_hd_feature_transformer()} {} virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images) override { std::string images_prompt; std::vector embeds; for (const ov::Tensor& image : to_single_image_tensors(images)) { EncodedImage encoded_image = m_vision_encoder.encode(image); + ov::Tensor image_features_proj = phi3_v::hd_feature_transform(encoded_image); } ov::Tensor inputs_embeds; // if (m_vlm_config.use_image_id) { @@ -1055,17 +1243,17 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { // OPENVINO_ASSERT( // m_vlm_config.hidden_size == inputs_embeds.get_shape().at(2), // "Unexpected embedding size" - // ); + //; // ov::Tensor special_tokens = m_tokenizer.encode( // m_vlm_config.im_start // + m_vlm_config.im_end // + m_vlm_config.slice_start // + m_vlm_config.slice_end - // ).input_ids; + //.input_ids; // OPENVINO_ASSERT( // 4 == special_tokens.get_shape().at(1), // "Every special token must be represented with a single int." 
- // ); + //; // int64_t im_start_id = special_tokens.data()[0]; // int64_t im_end_id = special_tokens.data()[1]; // int64_t slice_start_id = special_tokens.data()[2]; diff --git a/src/cpp/src/visual_language/vision_encoder.cpp b/src/cpp/src/visual_language/vision_encoder.cpp index 07f4935f0c..6601ad3763 100644 --- a/src/cpp/src/visual_language/vision_encoder.cpp +++ b/src/cpp/src/visual_language/vision_encoder.cpp @@ -909,5 +909,7 @@ EncodedImage VisionEncoder::encode_phi3_v(const ov::Tensor& image, const Process const auto& [pixel_values, image_size, num_img_tokens] = phi3_v::get_pixel_values_phi3_v(image, config); m_vision_encoder.set_input_tensor(pixel_values); m_vision_encoder.infer(); + // 2, 5, 3, 336, 336 2, 5, 576, 1024 + std::cout << pixel_values.get_shape() << ' ' << m_vision_encoder.get_output_tensor().get_shape() << '\n'; return {m_vision_encoder.get_output_tensor(), image_size}; } From b34b14ef80b04196ab8b8bf44d36725e21f8d2b3 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Fri, 29 Nov 2024 14:12:33 +0400 Subject: [PATCH 04/28] actual data infer --- .../src/visual_language/inputs_embedder.cpp | 38 +++++++++++++++---- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index e6fefe2d44..302b3980c8 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -1009,6 +1009,21 @@ class InputsEmbedderInternVLChat : public InputsEmbedder::IInputsEmbedder { namespace { namespace phi3_v { +// Reimplementation of python +// N, L, C = image_features.shape +// assert L == 24 * 24 and C == 1024 and N % (h_crop * w_crop) == 0 +// num_images = N // (h_crop * w_crop) +// H = int(L**0.5) +// print(L, H) +// image_features_hd = ( +// image_features.reshape(N, H, H, C) # N, 24, 24, 1024 +// .reshape(N, H // 2, 2, H // 2, 2, C) # N, 12, 2, 12, 2, 1024 +// .permute(0, 1, 3, 2, 4, 5) # N, 12, 12, 2, 2, 1024 +// .reshape(N, -1, 4 * C) # N, 144, 4096 +// .reshape(num_images, h_crop, w_crop, H // 2, H // 2, -1) # n_img, h_crop, w_crop, 12, 12, 4096 +// .permute(0, 1, 3, 2, 4, 5) # n_img, h_crop, 12, w_crop, 12, 4096 +// .reshape(num_images, h_crop * H // 2, w_crop * H // 2, 4 * C) # n_img, h_crop*12, w_crop*12, 4096 +// ) ov::InferRequest create_hd_feature_transformer() { using namespace ov; using namespace element; @@ -1159,31 +1174,38 @@ ov::InferRequest create_hd_feature_transformer() { ov::InferRequest hd_feature_transformer = utils::singleton_core().compile_model( model, "CPU" ).create_infer_request(); - // hd_feature_transformer.set_input_tensor(0, ov::Tensor{f32, {4, 576, 1024}}); + // hd_feature_transformer.set_input_tensor(0, ov::Tensor{f32, {1, 576, 1024}}); // ov::Tensor h_crop = ov::Tensor{i32, {}}; - // h_crop.data()[0] = 2; + // h_crop.data()[0] = 1; // hd_feature_transformer.set_input_tensor(1, h_crop); // ov::Tensor w_crop = ov::Tensor{i32, {}}; - // w_crop.data()[0] = 2; + // w_crop.data()[0] = 1; // hd_feature_transformer.set_input_tensor(2, w_crop); // hd_feature_transformer.infer(); + // std::cout << hd_feature_transformer.get_output_tensor().get_shape() << '\n'; // [1,24,24,4096] return hd_feature_transformer; } -ov::Tensor reshape_hd_patches_2x2merge(const ov::Tensor& image_features, size_t h_crop, size_t w_crop) { +ov::Tensor reshape_hd_patches_2x2merge(const ov::Tensor& image_features, size_t h_crop, size_t w_crop, InferRequest& hd_feature_transformer) { ov::Shape shape = image_features.get_shape(); 
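    // image_features is expected to be (N, 24*24, 1024), where N is 1 for the global
    // image or h_crop * w_crop for the sub-crops; the transformer produces
    // (num_images, h_crop * 12, w_crop * 12, 4096), matching the python reference above.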
OPENVINO_ASSERT(3 == shape.size()); OPENVINO_ASSERT(1 == shape.at(0)); OPENVINO_ASSERT(24 * 24 == shape.at(1)); OPENVINO_ASSERT(1024 == shape.at(2)); - return {}; + hd_feature_transformer.set_input_tensor(0, image_features); + ov::Tensor height{ov::element::i32, {}, &h_crop}; + hd_feature_transformer.set_input_tensor(1, height); + ov::Tensor width{ov::element::i32, {}, &w_crop}; + hd_feature_transformer.set_input_tensor(2, width); + hd_feature_transformer.infer(); + return hd_feature_transformer.get_output_tensor(); } // image_features.resized_source: (num_crops+1, 24*24, 1024) -ov::Tensor hd_feature_transform(const EncodedImage& image_features) { +ov::Tensor hd_feature_transform(const EncodedImage& image_features, InferRequest& hd_feature_transformer) { ov::Tensor global_image_features{ov::element::f32, {1, 24*24, 1024}, image_features.resized_source.data()}; // global feature can be viewed as a special HD case with num_crops 1x1 - ov::Tensor global_image_features_hd = reshape_hd_patches_2x2merge(global_image_features, 1, 1); + ov::Tensor global_image_features_hd = reshape_hd_patches_2x2merge(global_image_features, 1, 1, hd_feature_transformer); return {}; } } @@ -1207,7 +1229,7 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { std::vector embeds; for (const ov::Tensor& image : to_single_image_tensors(images)) { EncodedImage encoded_image = m_vision_encoder.encode(image); - ov::Tensor image_features_proj = phi3_v::hd_feature_transform(encoded_image); + ov::Tensor image_features_proj = phi3_v::hd_feature_transform(encoded_image, m_hd_feature_transformer); } ov::Tensor inputs_embeds; // if (m_vlm_config.use_image_id) { From 2da865838efb0a54bbdf93f386b935147a2278d4 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Tue, 10 Dec 2024 14:48:47 +0400 Subject: [PATCH 05/28] align tokenizers --- thirdparty/openvino_tokenizers | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers index 306dcd8dae..904046825b 160000 --- a/thirdparty/openvino_tokenizers +++ b/thirdparty/openvino_tokenizers @@ -1 +1 @@ -Subproject commit 306dcd8daec36bbc680c50c68de1e954f42b0ab8 +Subproject commit 904046825b6378bae74f16f302b40599aa88d5b3 From 27d913dbc7ff34746c6287083d8c089181cb7c5c Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Wed, 11 Dec 2024 15:55:28 +0400 Subject: [PATCH 06/28] skip resize --- .../src/visual_language/inputs_embedder.cpp | 66 ++++++++++++++++++- .../src/visual_language/vision_encoder.cpp | 42 +++++++++--- src/cpp/src/visual_language/vlm_config.cpp | 4 ++ src/cpp/src/visual_language/vlm_config.hpp | 2 + 4 files changed, 102 insertions(+), 12 deletions(-) diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index 305233aeb4..d6272f4185 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -1289,11 +1289,71 @@ ov::Tensor reshape_hd_patches_2x2merge(const ov::Tensor& image_features, size_t return hd_feature_transformer.get_output_tensor(); } +// image_features_hd: (num_images, h_crop*12, w_crop*12, 4096) +// output: (num_images, (h_crop*12) * (w_crop*12+1), 4096) +ov::Tensor add_image_newline(const ov::Tensor& image_features_hd, const std::vector& sub_GN) { + const ov::Shape& nhwc = image_features_hd.get_shape(); // [N, 12*h_crop, 12*w_crop, 4096] + const float* in = image_features_hd.data(); + ov::Tensor image_features_hd_new_line{ov::element::f32, {nhwc.at(0), nhwc.at(1) * 
(nhwc.at(2) + 1), nhwc.at(3)}}; + float* out = image_features_hd_new_line.data(); + for (size_t batch_id = 0; batch_id < nhwc.at(0); ++batch_id) { + for (size_t row_id = 0; row_id < nhwc.at(1); ++row_id) { + for (size_t col_id = 0; col_id < nhwc.at(2); ++col_id) { + std::copy_n( + in + batch_id * nhwc.at(1) * nhwc.at(2) * nhwc.at(3) + row_id * nhwc.at(2) * nhwc.at(3) + col_id * nhwc.at(3), + nhwc.at(3), + out + batch_id * nhwc.at(1) * (nhwc.at(2) + 1) * nhwc.at(3) + row_id * (nhwc.at(2) + 1) * nhwc.at(3) + col_id * nhwc.at(3) + ); + } + std::copy( + sub_GN.begin(), + sub_GN.end(), + out + batch_id * nhwc.at(1) * (nhwc.at(2) + 1) * nhwc.at(3) + row_id * (nhwc.at(2) + 1) * nhwc.at(3) + nhwc.at(2) * nhwc.at(3) + ); + } + } + // std::cout << "AAAAAAAAAAAAAAAAAAAAAa\n"; + // std::cout << out[12*4096-1]<<'\n'; + // std::cout << out[12*4096+1]<<'\n'; + // std::cout << out[12*4096+4095]<<'\n'; + // std::cout << out[12*4096+4096]<<'\n'; + // std::cout << out[13*2*4096]<<'\n'; + // std::cout << out[(13*2+12)*4096]<<'\n'; + // std::cout << "BBBBBBBBBBBBBBBBB\n"; + return image_features_hd_new_line; +} + // image_features.resized_source: (num_crops+1, 24*24, 1024) -ov::Tensor hd_feature_transform(const EncodedImage& image_features, InferRequest& hd_feature_transformer) { - ov::Tensor global_image_features{ov::element::f32, {1, 24*24, 1024}, image_features.resized_source.data()}; +ov::Tensor hd_feature_transform(const EncodedImage& image_features, InferRequest& hd_feature_transformer, const std::vector& sub_GN) { + // std::cout << image_features.resized_source.data()[576*1024 + 0] << '\n'; + // std::cout << image_features.resized_source.data()[576*1024 + 1] << '\n'; + // std::cout << image_features.resized_source.data()[576*1024 + 1025] << '\n'; + // std::cout << image_features.resized_source.data()[576*1024 + 4090] << '\n'; + // std::cout << image_features.resized_source.data()[576*1024 + 80000] << '\n'; +// [5,3,336,336] [5,576,1024] +// 0.134461 +// -0.867309 +// -0.274503 +// 1.73786 +// 0.13117 +// [5,3,336,336] [5,576,1024] +// -1.01567 +// -0.291421 +// -0.260488 +// 0.743025 +// 1.4099 + const ov::Shape& image_features_shape = image_features.resized_source.get_shape(); + ov::Tensor global_image_features{ov::element::f32, {1, image_features_shape.at(1), image_features_shape.at(2)}, image_features.resized_source.data()}; // global feature can be viewed as a special HD case with num_crops 1x1 ov::Tensor global_image_features_hd = reshape_hd_patches_2x2merge(global_image_features, 1, 1, hd_feature_transformer); + ov::Tensor global_image_features_hd_newline = add_image_newline(global_image_features_hd, sub_GN); + constexpr size_t INPUT_IMAGE_SIZE = 336; + size_t h_crop = image_features.resized_source_size.height / INPUT_IMAGE_SIZE; + size_t w_crop = image_features.resized_source_size.width / INPUT_IMAGE_SIZE; + size_t num_crops = h_crop * w_crop; + + // NOTE: real num_crops is padded + // (num_crops, 24*24, 1024) return {}; } } @@ -1317,7 +1377,7 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { std::vector embeds; for (const ov::Tensor& image : to_single_image_tensors(images)) { EncodedImage encoded_image = m_vision_encoder.encode(image); - ov::Tensor image_features_proj = phi3_v::hd_feature_transform(encoded_image, m_hd_feature_transformer); + ov::Tensor image_features_proj = phi3_v::hd_feature_transform(encoded_image, m_hd_feature_transformer, m_vlm_config.sub_GN); } ov::Tensor inputs_embeds; // if (m_vlm_config.use_image_id) { diff --git 
a/src/cpp/src/visual_language/vision_encoder.cpp b/src/cpp/src/visual_language/vision_encoder.cpp index 492b4eca95..584490f632 100644 --- a/src/cpp/src/visual_language/vision_encoder.cpp +++ b/src/cpp/src/visual_language/vision_encoder.cpp @@ -658,13 +658,11 @@ ov::Tensor HD_transform(const ov::Tensor& uint8, size_t num_crops) { if (trans) { src = clip_image_u8{height, width, {uint8_data, uint8_data + uint8.get_size()}}; bilinear_resize(src, dst, new_h, new_w); - // std::cout << new_h << ' ' << new_w << '\n'; return padding_336(ov::Tensor{ov::element::u8, {1, new_w, new_h, 3}, dst.buf.data()}); } src = clip_image_u8{width, height, {uint8_data, uint8_data + uint8.get_size()}}; - bilinear_resize(src, dst, new_w, new_h); - // std::cout << new_w << ' ' << new_h << '\n'; - // 672, 448 + // bilinear_resize(src, dst, new_w, new_h); + dst = src; // TODO: put resize back return padding_336(ov::Tensor{ov::element::u8, {1, new_h, new_w, 3}, dst.buf.data()}); } @@ -674,9 +672,9 @@ ov::Tensor mean_scale(const ov::Tensor& uint8, const ProcessorConfig& config) { float* float_data = float_normalized.data(); OPENVINO_ASSERT(0 == uint8.get_size() % 3, "RGB"); for (size_t idx = 0; idx < uint8.get_size(); idx += 3) { - float_data[idx] = (float(uint_8_data[idx]) / 255.0f - config.norm_mean[0]) / config.norm_std[0]; - float_data[idx + 1] = (float(uint_8_data[idx + 1]) / 255.0f - config.norm_mean[1]) / config.norm_std[1]; - float_data[idx + 2] = (float(uint_8_data[idx + 2]) / 255.0f - config.norm_mean[2]) / config.norm_std[2]; + float_data[idx] = (float(uint_8_data[idx]) / 255.0f - config.image_mean[0]) / config.image_std[0]; + float_data[idx + 1] = (float(uint_8_data[idx + 1]) / 255.0f - config.image_mean[1]) / config.image_std[1]; + float_data[idx + 2] = (float(uint_8_data[idx + 2]) / 255.0f - config.image_mean[2]) / config.image_std[2]; } return float_normalized; } @@ -922,9 +920,35 @@ EncodedImage VisionEncoder::encode_internvl(const ov::Tensor& image, const Proce EncodedImage VisionEncoder::encode_phi3_v(const ov::Tensor& image, const ProcessorConfig& config) { // TODO: drop num_img_tokens const auto& [pixel_values, image_size, num_img_tokens] = phi3_v::get_pixel_values_phi3_v(image, config); + // std::cout << pixel_values.data()[3*336*336+0] << '\n'; + // std::cout << pixel_values.data()[3*336*336+1] << '\n'; + // std::cout << pixel_values.data()[3*336*336+100] << '\n'; +// -1.79226 +// -1.74847 +// -1.14993 +// 0.645675 +// 0.660273 +// 1.09823 m_vision_encoder.set_input_tensor(pixel_values); m_vision_encoder.infer(); - // 2, 5, 3, 336, 336 2, 5, 576, 1024 - std::cout << pixel_values.get_shape() << ' ' << m_vision_encoder.get_output_tensor().get_shape() << '\n'; + // std::cout << pixel_values.get_shape() << ' ' << m_vision_encoder.get_output_tensor().get_shape() << '\n'; + // ov::Tensor out = m_vision_encoder.get_output_tensor(); + // std::cout << out.data()[576*1024 + 0] << '\n'; + // std::cout << out.data()[576*1024 + 1] << '\n'; + // std::cout << out.data()[576*1024 + 1025] << '\n'; + // std::cout << out.data()[576*1024 + 4090] << '\n'; + // std::cout << out.data()[576*1024 + 80000] << '\n'; +// [5,3,336,336] [5,576,1024] +// 0.134461 +// -0.867309 +// -0.274503 +// 1.73786 +// 0.13117 +// [5,3,336,336] [5,576,1024] +// -1.01567 +// -0.291421 +// -0.260488 +// 0.743025 +// 1.4099 return {m_vision_encoder.get_output_tensor(), image_size}; } diff --git a/src/cpp/src/visual_language/vlm_config.cpp b/src/cpp/src/visual_language/vlm_config.cpp index c4022ab80e..da825b6fce 100644 --- 
a/src/cpp/src/visual_language/vlm_config.cpp +++ b/src/cpp/src/visual_language/vlm_config.cpp @@ -19,4 +19,8 @@ ov::genai::VLMConfig::VLMConfig(const std::filesystem::path& json_path) { // Setting llava_next specific config params read_json_param(parsed, "image_newline", image_newline); + // phi3_v + if (parsed.contains("sub_GN")) { + sub_GN = parsed.at("sub_GN").get>>>>().at(0).at(0).at(0); + } } diff --git a/src/cpp/src/visual_language/vlm_config.hpp b/src/cpp/src/visual_language/vlm_config.hpp index c126d5495e..5e59f3605f 100644 --- a/src/cpp/src/visual_language/vlm_config.hpp +++ b/src/cpp/src/visual_language/vlm_config.hpp @@ -54,6 +54,8 @@ class VLMConfig { std::string image_context_token = ""; /// @brief A string token denoting end of image embeddings for InternVL2 model. std::string image_end_token = ""; + /// @brief phi3_v new line token embedding to separate images. + std::vector sub_GN = std::vector(4096, 0.0f); /// @brief Default constructor. VLMConfig() = default; From 66f75d55a4861811aede8233b805f90203e6b920 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Thu, 12 Dec 2024 14:52:11 +0400 Subject: [PATCH 07/28] vision --- .../src/visual_language/inputs_embedder.cpp | 84 +++++++++++++++++-- src/cpp/src/visual_language/vlm_config.cpp | 5 ++ src/cpp/src/visual_language/vlm_config.hpp | 1 + 3 files changed, 84 insertions(+), 6 deletions(-) diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index d6272f4185..ed6ca87dc1 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -1277,7 +1277,6 @@ ov::InferRequest create_hd_feature_transformer() { ov::Tensor reshape_hd_patches_2x2merge(const ov::Tensor& image_features, size_t h_crop, size_t w_crop, InferRequest& hd_feature_transformer) { ov::Shape shape = image_features.get_shape(); OPENVINO_ASSERT(3 == shape.size()); - OPENVINO_ASSERT(1 == shape.at(0)); OPENVINO_ASSERT(24 * 24 == shape.at(1)); OPENVINO_ASSERT(1024 == shape.at(2)); hd_feature_transformer.set_input_tensor(0, image_features); @@ -1323,8 +1322,22 @@ ov::Tensor add_image_newline(const ov::Tensor& image_features_hd, const std::vec return image_features_hd_new_line; } +ov::Tensor concatenate_2d(const ov::Tensor& first_1lf, const std::vector& second_f, const ov::Tensor& third_1lf) { + size_t first_l = first_1lf.get_shape().at(1); + constexpr size_t second_l = 1; + size_t third_l = third_1lf.get_shape().at(1); + size_t features = first_1lf.get_shape().at(2); + OPENVINO_ASSERT(second_f.size() == features); + ov::Tensor out_1lf{ov::element::f32, {1, first_l + second_l + third_l, features}}; + float* out = out_1lf.data(); + std::copy_n(first_1lf.data(), first_l * features, out); + std::copy(second_f.begin(), second_f.end(), out + first_l * features); + std::copy_n(third_1lf.data(), third_l * features, out + (first_l + second_l) * features); + return out_1lf; +} + // image_features.resized_source: (num_crops+1, 24*24, 1024) -ov::Tensor hd_feature_transform(const EncodedImage& image_features, InferRequest& hd_feature_transformer, const std::vector& sub_GN) { +ov::Tensor hd_feature_transform(const EncodedImage& image_features, InferRequest& hd_feature_transformer, const std::vector& sub_GN, const std::vector& glb_GN, ov::InferRequest& vision_projection) { // std::cout << image_features.resized_source.data()[576*1024 + 0] << '\n'; // std::cout << image_features.resized_source.data()[576*1024 + 1] << '\n'; // std::cout << 
image_features.resized_source.data()[576*1024 + 1025] << '\n'; @@ -1346,7 +1359,7 @@ ov::Tensor hd_feature_transform(const EncodedImage& image_features, InferRequest ov::Tensor global_image_features{ov::element::f32, {1, image_features_shape.at(1), image_features_shape.at(2)}, image_features.resized_source.data()}; // global feature can be viewed as a special HD case with num_crops 1x1 ov::Tensor global_image_features_hd = reshape_hd_patches_2x2merge(global_image_features, 1, 1, hd_feature_transformer); - ov::Tensor global_image_features_hd_newline = add_image_newline(global_image_features_hd, sub_GN); + ov::Tensor global_image_features_hd_newline = add_image_newline(global_image_features_hd, sub_GN); // [1,12*(12+1),4096] constexpr size_t INPUT_IMAGE_SIZE = 336; size_t h_crop = image_features.resized_source_size.height / INPUT_IMAGE_SIZE; size_t w_crop = image_features.resized_source_size.width / INPUT_IMAGE_SIZE; @@ -1354,7 +1367,46 @@ ov::Tensor hd_feature_transform(const EncodedImage& image_features, InferRequest // NOTE: real num_crops is padded // (num_crops, 24*24, 1024) - return {}; + ov::Tensor sub_image_features{ov::element::f32, { + num_crops, + image_features_shape.at(1), + image_features_shape.at(2) + }, image_features.resized_source.data() + image_features_shape.at(1) * image_features_shape.at(2)}; + ov::Tensor sub_image_features_hd = reshape_hd_patches_2x2merge(sub_image_features, h_crop, w_crop, hd_feature_transformer); // [1, 24, 24, 4096] + // std::cout<()[0]<<'\n'; + // std::cout<()[1]<<'\n'; + // std::cout<()[4096]<<'\n'; + // std::cout<()[12*13*4096]<<'\n'; + // std::cout<()[12*13*4096+1]<<'\n'; +// 0.134461 +// -0.867309 +// 0.342726 +// -0.0916849 +// -2.65548 +// -1.01567 +// -0.291421 +// -0.993172 +// -1.0575 +// -0.299 + ov::Tensor sub_image_features_hd_newline = add_image_newline(sub_image_features_hd, sub_GN); // [1,h_crop*12*(w_crop*12+1), 4096] + // std::cout << sub_image_features_hd_newline.get_shape()<<'\n'; + // std::cout<()[0]<<'\n'; +// std::cout<()[1]<<'\n'; +// std::cout<()[4096]<<'\n'; +// std::cout<()[12*13*4096]<<'\n'; +// std::cout<()[12*13*4096+1]<<'\n'; +// 0.134461 +// -0.867309 +// 0.342726 +// 0.0147288 +// -1.87735 +// -1.01567 +// -0.291421 +// -0.993172 +// -1.03232 +// -0.183072 + return concatenate_2d(sub_image_features_hd_newline, glb_GN, global_image_features_hd_newline); // [1,l,4096] } } } @@ -1362,6 +1414,7 @@ ov::Tensor hd_feature_transform(const EncodedImage& image_features, InferRequest class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { public: ov::InferRequest m_hd_feature_transformer; + ov::InferRequest m_vision_projection; InputsEmbedderPhi3V( const VLMConfig& vlm_config, @@ -1370,14 +1423,33 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { const ov::AnyMap device_config ): IInputsEmbedder(vlm_config, model_dir, device, device_config), m_image_id{0}, - m_hd_feature_transformer{phi3_v::create_hd_feature_transformer()} {} + m_hd_feature_transformer{phi3_v::create_hd_feature_transformer()}, + m_vision_projection{utils::singleton_core().compile_model(model_dir / "openvino_vision_projection_model.xml", device).create_infer_request()} {} ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) override { std::string images_prompt; std::vector embeds; for (const ov::Tensor& image : to_single_image_tensors(images)) { EncodedImage encoded_image = m_vision_encoder.encode(image); - ov::Tensor image_features_proj = 
phi3_v::hd_feature_transform(encoded_image, m_hd_feature_transformer, m_vlm_config.sub_GN);
+            ov::Tensor image_features_proj = phi3_v::hd_feature_transform(encoded_image, m_hd_feature_transformer, m_vlm_config.sub_GN, m_vlm_config.glb_GN, m_vision_projection);
+            std::cout << image_features_proj.data<float>()[0]<<'\n';
+            std::cout << image_features_proj.data<float>()[4096]<<'\n';
+            std::cout << image_features_proj.data<float>()[4097]<<'\n';
+            std::cout << image_features_proj.data<float>()[700*4096]<<'\n';
+            std::cout << image_features_proj.data<float>()[700*4097]<<'\n';
+            std::cout << image_features_proj.data<float>()[757*4096-1]<<'\n';
+            // 0.134461
+// 0.342726
+// 0.0631084
+// 0.434334
+// 0.650556
+// 0
+// -1.01567
+// -0.993172
+// -0.226981
+// -1.89643
+// -0.907323
+// 0
         }
         ov::Tensor inputs_embeds;
         // if (m_vlm_config.use_image_id) {
         //     images_prompt += m_vlm_config.im_id_start + std::to_string(m_image_id) + m_vlm_config.im_id_end;
diff --git a/src/cpp/src/visual_language/vlm_config.cpp b/src/cpp/src/visual_language/vlm_config.cpp
index da825b6fce..5af1b0d9b6 100644
--- a/src/cpp/src/visual_language/vlm_config.cpp
+++ b/src/cpp/src/visual_language/vlm_config.cpp
@@ -23,4 +23,9 @@ ov::genai::VLMConfig::VLMConfig(const std::filesystem::path& json_path) {
     if (parsed.contains("sub_GN")) {
         sub_GN = parsed.at("sub_GN").get<std::vector<std::vector<std::vector<std::vector<float>>>>>().at(0).at(0).at(0);
     }
+    OPENVINO_ASSERT(sub_GN.size() == 4096);
+    if (parsed.contains("glb_GN")) {
+        glb_GN = parsed.at("glb_GN").get<std::vector<std::vector<std::vector<float>>>>().at(0).at(0);
+    }
+    OPENVINO_ASSERT(glb_GN.size() == 4096);
 }
diff --git a/src/cpp/src/visual_language/vlm_config.hpp b/src/cpp/src/visual_language/vlm_config.hpp
index 5e59f3605f..08de40321e 100644
--- a/src/cpp/src/visual_language/vlm_config.hpp
+++ b/src/cpp/src/visual_language/vlm_config.hpp
@@ -56,6 +56,7 @@ class VLMConfig {
     std::string image_end_token = "";
     /// @brief phi3_v new line token embedding to separate images.
     std::vector<float> sub_GN = std::vector<float>(4096, 0.0f);
+    std::vector<float> glb_GN = std::vector<float>(4096, 0.0f);
     /// @brief Default constructor.
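For reference, a minimal standalone sketch of the JSON layout this parsing assumes (the [1][1][1][N] nesting for sub_GN and [1][1][N] for glb_GN is inferred from the .at(0) chains; the library choice and the values are illustrative, not taken from a real exported config):

    // Sketch only: assumes nlohmann::json, which the .get<...>() style above suggests.
    #include <nlohmann/json.hpp>
    #include <vector>

    int main() {
        nlohmann::json parsed = nlohmann::json::parse(R"({
            "sub_GN": [[[[0.1, 0.2, 0.3]]]],
            "glb_GN": [[[0.4, 0.5, 0.6]]]
        })");
        // Strip the leading singleton dimensions, as done in VLMConfig::VLMConfig above.
        auto sub_GN = parsed.at("sub_GN")
            .get<std::vector<std::vector<std::vector<std::vector<float>>>>>()
            .at(0).at(0).at(0);   // -> {0.1f, 0.2f, 0.3f}
        auto glb_GN = parsed.at("glb_GN")
            .get<std::vector<std::vector<std::vector<float>>>>()
            .at(0).at(0);         // -> {0.4f, 0.5f, 0.6f}
        return sub_GN.size() == 3 && glb_GN.size() == 3 ? 0 : 1;
    }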
VLMConfig() = default; From c7fc21c6e5566ae4064cddd703891ae595481642 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Thu, 12 Dec 2024 18:57:25 +0400 Subject: [PATCH 08/28] regex --- .../src/visual_language/inputs_embedder.cpp | 167 +++++++++--------- 1 file changed, 79 insertions(+), 88 deletions(-) diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index ed6ca87dc1..c2591757ab 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -10,12 +10,7 @@ #include "openvino/opsets/opset13.hpp" #include "utils.hpp" - -namespace { - -constexpr size_t BATCH_SIZE = 1; - -} // namespace +#include namespace ov::genai { @@ -618,6 +613,7 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder { } size_t merged_seq_length = text_embeds_seq_length + total_image_seq_length - num_image_tokens; + constexpr size_t BATCH_SIZE = 1; ov::Tensor merged_embeds(text_embeds.get_element_type(), {BATCH_SIZE, merged_seq_length, hidden_size}); float* merged_data = merged_embeds.data(); @@ -1311,14 +1307,6 @@ ov::Tensor add_image_newline(const ov::Tensor& image_features_hd, const std::vec ); } } - // std::cout << "AAAAAAAAAAAAAAAAAAAAAa\n"; - // std::cout << out[12*4096-1]<<'\n'; - // std::cout << out[12*4096+1]<<'\n'; - // std::cout << out[12*4096+4095]<<'\n'; - // std::cout << out[12*4096+4096]<<'\n'; - // std::cout << out[13*2*4096]<<'\n'; - // std::cout << out[(13*2+12)*4096]<<'\n'; - // std::cout << "BBBBBBBBBBBBBBBBB\n"; return image_features_hd_new_line; } @@ -1338,23 +1326,6 @@ ov::Tensor concatenate_2d(const ov::Tensor& first_1lf, const std::vector& // image_features.resized_source: (num_crops+1, 24*24, 1024) ov::Tensor hd_feature_transform(const EncodedImage& image_features, InferRequest& hd_feature_transformer, const std::vector& sub_GN, const std::vector& glb_GN, ov::InferRequest& vision_projection) { - // std::cout << image_features.resized_source.data()[576*1024 + 0] << '\n'; - // std::cout << image_features.resized_source.data()[576*1024 + 1] << '\n'; - // std::cout << image_features.resized_source.data()[576*1024 + 1025] << '\n'; - // std::cout << image_features.resized_source.data()[576*1024 + 4090] << '\n'; - // std::cout << image_features.resized_source.data()[576*1024 + 80000] << '\n'; -// [5,3,336,336] [5,576,1024] -// 0.134461 -// -0.867309 -// -0.274503 -// 1.73786 -// 0.13117 -// [5,3,336,336] [5,576,1024] -// -1.01567 -// -0.291421 -// -0.260488 -// 0.743025 -// 1.4099 const ov::Shape& image_features_shape = image_features.resized_source.get_shape(); ov::Tensor global_image_features{ov::element::f32, {1, image_features_shape.at(1), image_features_shape.at(2)}, image_features.resized_source.data()}; // global feature can be viewed as a special HD case with num_crops 1x1 @@ -1373,41 +1344,74 @@ ov::Tensor hd_feature_transform(const EncodedImage& image_features, InferRequest image_features_shape.at(2) }, image_features.resized_source.data() + image_features_shape.at(1) * image_features_shape.at(2)}; ov::Tensor sub_image_features_hd = reshape_hd_patches_2x2merge(sub_image_features, h_crop, w_crop, hd_feature_transformer); // [1, 24, 24, 4096] - // std::cout<()[0]<<'\n'; - // std::cout<()[1]<<'\n'; - // std::cout<()[4096]<<'\n'; - // std::cout<()[12*13*4096]<<'\n'; - // std::cout<()[12*13*4096+1]<<'\n'; -// 0.134461 -// -0.867309 -// 0.342726 -// -0.0916849 -// -2.65548 -// -1.01567 -// -0.291421 -// -0.993172 -// -1.0575 -// -0.299 ov::Tensor 
sub_image_features_hd_newline = add_image_newline(sub_image_features_hd, sub_GN); // [1,h_crop*12*(w_crop*12+1), 4096]
-    // std::cout << sub_image_features_hd_newline.get_shape()<<'\n';
-    // std::cout<<sub_image_features_hd_newline.data<float>()[0]<<'\n';
-// std::cout<<sub_image_features_hd_newline.data<float>()[1]<<'\n';
-// std::cout<<sub_image_features_hd_newline.data<float>()[4096]<<'\n';
-// std::cout<<sub_image_features_hd_newline.data<float>()[12*13*4096]<<'\n';
-// std::cout<<sub_image_features_hd_newline.data<float>()[12*13*4096+1]<<'\n';
-// 0.134461
-// -0.867309
-// 0.342726
-// 0.0147288
-// -1.87735
-// -1.01567
-// -0.291421
-// -0.993172
-// -1.03232
-// -0.183072
     return concatenate_2d(sub_image_features_hd_newline, glb_GN, global_image_features_hd_newline); // [1,l,4096]
 }
+
+std::vector<ov::Tensor> split_tokenize(const std::string& text, ov::genai::Tokenizer& tokenizer) {
+    constexpr int make_suffix_iterator = -1;
+    std::regex rgx{R"(<\|image_\d+\|>)"};
+    std::sregex_token_iterator iter{
+        text.begin(),
+        text.end(),
+        rgx,
+        make_suffix_iterator
+    };
+    std::vector<ov::Tensor> tokenized;
+    for ( ; iter != std::sregex_token_iterator{}; ++iter) {
+        if (iter->str().empty()) {
+            continue;
+        }
+        tokenized.push_back(tokenizer.encode(*iter).input_ids);
+    }
+    return tokenized;
+}
+
+// ov::Tensor apply_template_and_tokenize(bool is_chat_conversation, const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = "") {
+//     ov::Tensor encoded_input_ids;
+//     if (is_chat_conversation) {
+//         // KV cache in model already contains prompts and answers from previous iterations.
+//         // So only new prompt wrapped into chat template to be sent into model. Tokenizer always returns
+//         // token_ids = {, ...}. So if tokenizer applies only to the new prompt,
+//         // will be inserted on every iteration.
+//         // So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new prompt
+//         // and takes only the difference between them.
+//         // The chat history cannot be saved as already encoded tokens because generate call doesn't return token, but
+//         // KV cache contains it. So we have to add it manually or get it by tokenization all chat history.
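For clarity, a small standalone sketch of what the regex split in split_tokenize does, shown on plain std::string chunks instead of tokenized tensors (the prompt text is made up for illustration):

    #include <iostream>
    #include <regex>
    #include <string>
    #include <vector>

    int main() {
        std::string text = "<|image_1|>\n<|image_2|>\nDescribe both images.";
        std::regex rgx{R"(<\|image_\d+\|>)"};
        std::vector<std::string> chunks;
        // Submatch index -1 selects the parts of the input that did not match,
        // i.e. the text between (and after) the <|image_N|> tags.
        for (std::sregex_token_iterator iter{text.begin(), text.end(), rgx, -1}, end; iter != end; ++iter) {
            if (!iter->str().empty()) {
                chunks.push_back(iter->str());
            }
        }
        std::cout << chunks.size() << '\n';  // 2: "\n" and "\nDescribe both images."
    }

split_tokenize applies the same split and then encodes each non-empty chunk separately, so the image tags themselves never reach the tokenizer.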
+// m_history.push_back({{"role", "user"}, {"content", prompt}}); +// constexpr bool add_generation_prompt = true; +// std::string new_templated_chat_history; +// try { +// new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); +// } catch (const std::exception& error) { +// // Use fallback chat template if it was not found in tokenizer_config.json +// new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt, chat_template_fallback); +// } +// auto start_tokenizer_time = std::chrono::steady_clock::now(); +// ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history).input_ids; +// if (m_is_cache_empty) { +// encoded_input_ids = new_chat_tokens; +// // after first `get_inputs_embeds` is called, we supposed LLM is inferred and cache is not empty +// m_is_cache_empty = false; +// } else { +// TokenizedInputs prev_chat_tokens = m_tokenizer.encode( +// m_templated_chat_history +// ); +// encoded_input_ids = utils::subtract_chat_tokenized_inputs( +// {new_chat_tokens}, prev_chat_tokens +// ).input_ids; +// } +// auto end_tokenizer_time = std::chrono::steady_clock::now(); +// metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); +// m_templated_chat_history = std::move(new_templated_chat_history); +// } else { +// auto start_tokenizer_time = std::chrono::steady_clock::now(); +// encoded_input_ids = m_tokenizer.encode(prompt).input_ids; +// auto end_tokenizer_time = std::chrono::steady_clock::now(); +// metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); +// } +// return encoded_input_ids; +// } } } @@ -1415,6 +1419,8 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { public: ov::InferRequest m_hd_feature_transformer; ov::InferRequest m_vision_projection; + // Used to insert <|image_i|>\n per image (not a slice). 
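+    // The tags are 1-based: the first image in the prompt is inserted as <|image_1|>.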
+ size_t m_image_id = 1; InputsEmbedderPhi3V( const VLMConfig& vlm_config, @@ -1427,30 +1433,19 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { m_vision_projection{utils::singleton_core().compile_model(model_dir / "openvino_vision_projection_model.xml", device).create_infer_request()} {} ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) override { - std::string images_prompt; - std::vector embeds; + // TODO: perfmetrics + std::cout << prompt<<'\n'; + std::stringstream images_prompt; + std::vector images_features_proj; for (const ov::Tensor& image : to_single_image_tensors(images)) { EncodedImage encoded_image = m_vision_encoder.encode(image); - ov::Tensor image_features_proj = phi3_v::hd_feature_transform(encoded_image, m_hd_feature_transformer, m_vlm_config.sub_GN, m_vlm_config.glb_GN, m_vision_projection); - std::cout << image_features_proj.data()[0]<<'\n'; - std::cout << image_features_proj.data()[4096]<<'\n'; - std::cout << image_features_proj.data()[4097]<<'\n'; - std::cout << image_features_proj.data()[700*4096]<<'\n'; - std::cout << image_features_proj.data()[700*4097]<<'\n'; - std::cout << image_features_proj.data()[757*4096-1]<<'\n'; - // 0.134461 -// 0.342726 -// 0.0631084 -// 0.434334 -// 0.650556 -// 0 -// -1.01567 -// -0.993172 -// -0.226981 -// -1.89643 -// -0.907323 -// 0 + images_features_proj.push_back(phi3_v::hd_feature_transform(encoded_image, m_hd_feature_transformer, m_vlm_config.sub_GN, m_vlm_config.glb_GN, m_vision_projection)); + images_prompt << "<|image_" << m_image_id << "|>\n"; + ++m_image_id; } + images_prompt << prompt; + phi3_v::split_tokenize(images_prompt.str(), m_tokenizer); + ov::Tensor inputs_embeds; // if (m_vlm_config.use_image_id) { // images_prompt += m_vlm_config.im_id_start + std::to_string(m_image_id) + m_vlm_config.im_id_end; @@ -1549,10 +1544,6 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { IInputsEmbedder::finish_chat(); m_image_id = 0; } - -private: - // Used to insert <|image_i|>\n per image (not a slice). - size_t m_image_id; }; InputsEmbedder::InputsEmbedder(const VLMConfig& vlm_config, From 83834a24a027f4243de3a670bdf8fd79c165fa08 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Wed, 8 Jan 2025 14:29:50 +0400 Subject: [PATCH 09/28] code style --- .../src/visual_language/inputs_embedder.cpp | 931 +++++++++--------- 1 file changed, 491 insertions(+), 440 deletions(-) diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index b1027c533b..4404ddfe27 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -1,16 +1,16 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#include "openvino/genai/visual_language/perf_metrics.hpp" #include "visual_language/inputs_embedder.hpp" -#include "visual_language/clip.hpp" -#include "visual_language/vision_encoder.hpp" -#include "visual_language/embedding_model.hpp" -#include "openvino/opsets/opset13.hpp" +#include +#include "openvino/genai/visual_language/perf_metrics.hpp" +#include "openvino/opsets/opset13.hpp" #include "utils.hpp" -#include +#include "visual_language/clip.hpp" +#include "visual_language/embedding_model.hpp" +#include "visual_language/vision_encoder.hpp" namespace ov::genai { @@ -40,12 +40,15 @@ class InputsEmbedder::IInputsEmbedder { // Tail of previous output for LM in chat mode is missing in KV cache. 
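    // It is either re-prepended to the next input ids or accounted for when trimming the KV cache (see get_encoded_input_ids).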
std::optional m_last_disappeared_token = std::nullopt; // If sequence contains some symbols, which could be ambiguous encoded by tokenizer, we need to trim kv cache - // If we use beam search sampling with chat mode we need to remove last answer of the model from kv cache and add best answer to history - // so, let's keep info about amount of tokens to trim from kv cache and amount of tokens to keep in history + // If we use beam search sampling with chat mode we need to remove last answer of the model from kv cache and add + // best answer to history so, let's keep info about amount of tokens to trim from kv cache and amount of tokens to + // keep in history ov::genai::utils::HistoryRemoveManager m_kv_history_manager = {0, 0}; public: - virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) = 0; + virtual ov::Tensor get_inputs_embeds(const std::string& prompt, + const std::vector& images, + ov::genai::VLMPerfMetrics& metrics) = 0; EmbeddingsModel get_embedding_model() const { return m_embedding; @@ -63,7 +66,10 @@ class InputsEmbedder::IInputsEmbedder { return m_kv_history_manager.num_tokens_to_remove_from_kv_cache; } - void update_tokenized_history(const std::vector& encoded_result, std::optional last_disappeared_token, bool is_beam_search, size_t last_answer_len) { + void update_tokenized_history(const std::vector& encoded_result, + std::optional last_disappeared_token, + bool is_beam_search, + size_t last_answer_len) { if (is_beam_search) { m_kv_history_manager.trusted_history_length = m_tokenized_history.size(); m_kv_history_manager.num_tokens_to_remove_from_kv_cache = last_answer_len; @@ -72,7 +78,7 @@ class InputsEmbedder::IInputsEmbedder { } m_last_disappeared_token = last_disappeared_token; - + std::copy(encoded_result.begin(), encoded_result.end(), std::back_inserter(m_tokenized_history)); } @@ -109,52 +115,48 @@ class InputsEmbedder::IInputsEmbedder { } protected: - IInputsEmbedder( - const VLMConfig& vlm_config, - const std::filesystem::path& model_dir, - const std::string& device, - const ov::AnyMap device_config) : - m_vlm_config{vlm_config}, - m_vision_encoder(model_dir, m_vlm_config.model_type, device, device_config), - m_embedding(model_dir, m_vlm_config.scale_emb, device, device_config), - m_tokenizer{model_dir, device_config} { } - - IInputsEmbedder( - const VLMConfig& vlm_config, - const ModelsMap& models_map, - const Tokenizer& tokenizer, - const std::filesystem::path& config_dir_path, - const std::string& device, - const ov::AnyMap device_config) : - m_vlm_config{vlm_config}, - m_vision_encoder( - get_model_weights_pair(models_map, "vision_embeddings").first, - get_model_weights_pair(models_map, "vision_embeddings").second, - config_dir_path, - m_vlm_config.model_type, - device, - device_config - ), - m_embedding( - get_model_weights_pair(models_map, "text_embeddings").first, - get_model_weights_pair(models_map, "text_embeddings").second, - m_vlm_config.scale_emb, - device, - device_config - ), - m_tokenizer(tokenizer) { } - - ov::Tensor get_encoded_input_ids(const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = "") { + IInputsEmbedder(const VLMConfig& vlm_config, + const std::filesystem::path& model_dir, + const std::string& device, + const ov::AnyMap device_config) + : m_vlm_config{vlm_config}, + m_vision_encoder(model_dir, m_vlm_config.model_type, device, device_config), + m_embedding(model_dir, m_vlm_config.scale_emb, device, device_config), 
+ m_tokenizer{model_dir, device_config} {} + + IInputsEmbedder(const VLMConfig& vlm_config, + const ModelsMap& models_map, + const Tokenizer& tokenizer, + const std::filesystem::path& config_dir_path, + const std::string& device, + const ov::AnyMap device_config) + : m_vlm_config{vlm_config}, + m_vision_encoder(get_model_weights_pair(models_map, "vision_embeddings").first, + get_model_weights_pair(models_map, "vision_embeddings").second, + config_dir_path, + m_vlm_config.model_type, + device, + device_config), + m_embedding(get_model_weights_pair(models_map, "text_embeddings").first, + get_model_weights_pair(models_map, "text_embeddings").second, + m_vlm_config.scale_emb, + device, + device_config), + m_tokenizer(tokenizer) {} + + ov::Tensor get_encoded_input_ids(const std::string& prompt, + ov::genai::VLMPerfMetrics& metrics, + const std::string& chat_template_fallback = "") { ov::Tensor encoded_input_ids; if (m_is_chat_conversation) { // KV cache in model already contains prompts and answers from previous iterations. // So only new prompt wrapped into chat template to be sent into model. Tokenizer always returns // token_ids = {, ...}. So if tokenizer applies only to the new prompt, // will be inserted on every iteration. - // So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new prompt - // and takes only the difference between them. - // The chat history cannot be saved as already encoded tokens because generate call doesn't return token, but - // KV cache contains it. So we have to add it manually or get it by tokenization all chat history. + // So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new + // prompt and takes only the difference between them. The chat history cannot be saved as already encoded + // tokens because generate call doesn't return token, but KV cache contains it. So we have to add it + // manually or get it by tokenization all chat history. 
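+            // Example: if the previous templated history tokenizes to [bos, a, b, c] and the new one
+            // (including the latest user turn) tokenizes to [bos, a, b, c, d, e], only the suffix [d, e]
+            // is sent to the model, because the KV cache already covers [bos, a, b, c].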
m_history.push_back({{"role", "user"}, {"content", prompt}}); constexpr bool add_generation_prompt = true; std::string new_templated_chat_history; @@ -162,19 +164,24 @@ class InputsEmbedder::IInputsEmbedder { new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); } catch (const std::exception& error) { // Use fallback chat template if it was not found in tokenizer_config.json - new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt, chat_template_fallback); + new_templated_chat_history = + m_tokenizer.apply_chat_template(m_history, add_generation_prompt, chat_template_fallback); } auto start_tokenizer_time = std::chrono::steady_clock::now(); - ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(false)).input_ids; - TokenizedInputs prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(false)); + ov::Tensor new_chat_tokens = + m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(false)).input_ids; + TokenizedInputs prev_chat_tokens = + m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(false)); // some symbols combinations can be encoded by the tokenizer in different ways - // if we met sequence with such combination of symbols, we cannot correctly subtract the new history from the old history - // so let's check it out, find the trusted part and use it in on the next step + // if we met sequence with such combination of symbols, we cannot correctly subtract the new history from + // the old history so let's check it out, find the trusted part and use it in on the next step size_t trusted_history_length = 0; if (!m_tokenized_history.empty()) { std::set stop_tokens = {m_tokenizer.get_eos_token_id()}; - trusted_history_length = ov::genai::utils::get_first_history_difference(prev_chat_tokens.input_ids, m_tokenized_history, stop_tokens); + trusted_history_length = ov::genai::utils::get_first_history_difference(prev_chat_tokens.input_ids, + m_tokenized_history, + stop_tokens); } if (m_tokenized_history.empty()) { @@ -182,81 +189,94 @@ class InputsEmbedder::IInputsEmbedder { } else if (trusted_history_length != SIZE_MAX || m_kv_history_manager.does_kv_cache_need_to_update()) { // does_kv_cache_need_to_update will be true here if beam search is activated - // in beam search mode we want to remove all history about last model answer from kv cache and add the best answer directly - // if we have difference in model answer and decoded answer it anyway will be less then entire history, so let's use data from m_kv_history_manager + // in beam search mode we want to remove all history about last model answer from kv cache and add the + // best answer directly if we have difference in model answer and decoded answer it anyway will be less + // then entire history, so let's use data from m_kv_history_manager if (m_kv_history_manager.does_kv_cache_need_to_update()) { trusted_history_length = m_kv_history_manager.trusted_history_length; } else { - m_kv_history_manager.num_tokens_to_remove_from_kv_cache = m_tokenized_history.size() - trusted_history_length; - // if prev generation was finished because of max len was reached, kv cache is missed one last token, let's keep it - m_kv_history_manager.num_tokens_to_remove_from_kv_cache -= m_last_disappeared_token.has_value() ? 
1 : 0; + m_kv_history_manager.num_tokens_to_remove_from_kv_cache = + m_tokenized_history.size() - trusted_history_length; + // if prev generation was finished because of max len was reached, kv cache is missed one last + // token, let's keep it + m_kv_history_manager.num_tokens_to_remove_from_kv_cache -= + m_last_disappeared_token.has_value() ? 1 : 0; } ov::Tensor new_tensor = ov::Tensor(new_chat_tokens.get_element_type(), {1, new_chat_tokens.get_shape().at(1) - trusted_history_length}, new_chat_tokens.data() + trusted_history_length); encoded_input_ids = ov::Tensor(new_chat_tokens.get_element_type(), - {1, new_chat_tokens.get_shape().at(1) - trusted_history_length}); + {1, new_chat_tokens.get_shape().at(1) - trusted_history_length}); new_tensor.copy_to(encoded_input_ids); } else { - encoded_input_ids = utils::subtract_chat_tokenized_inputs( - {new_chat_tokens}, prev_chat_tokens - ).input_ids; + encoded_input_ids = + utils::subtract_chat_tokenized_inputs({new_chat_tokens}, prev_chat_tokens).input_ids; if (m_last_disappeared_token.has_value()) - encoded_input_ids = ov::genai::utils::push_front_inputs(encoded_input_ids, *m_last_disappeared_token); + encoded_input_ids = + ov::genai::utils::push_front_inputs(encoded_input_ids, *m_last_disappeared_token); } auto end_tokenizer_time = std::chrono::steady_clock::now(); - metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); + metrics.raw_metrics.tokenization_durations.emplace_back( + PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); m_templated_chat_history = std::move(new_templated_chat_history); m_tokenized_history.clear(); - std::copy_n(new_chat_tokens.data(), new_chat_tokens.get_size(), std::back_inserter(m_tokenized_history)); + std::copy_n(new_chat_tokens.data(), + new_chat_tokens.get_size(), + std::back_inserter(m_tokenized_history)); } else { auto start_tokenizer_time = std::chrono::steady_clock::now(); encoded_input_ids = m_tokenizer.encode(prompt).input_ids; auto end_tokenizer_time = std::chrono::steady_clock::now(); - metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); + metrics.raw_metrics.tokenization_durations.emplace_back( + PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); m_tokenized_history.clear(); - std::copy_n(encoded_input_ids.data(), encoded_input_ids.get_size(), std::back_inserter(m_tokenized_history)); + std::copy_n(encoded_input_ids.data(), + encoded_input_ids.get_size(), + std::back_inserter(m_tokenized_history)); } return encoded_input_ids; } /** - * @brief Unpads an image tensor of a padded and resized image. - * Used for packing image features of llava_next models. - * - * @param tensor An image tensor with a shape (embed_dim, height, width) - * @param original_size A size of original image - * @return An unpadded image tensor with a shape (embed_dim, new_height, new_width) - */ + * @brief Unpads an image tensor of a padded and resized image. + * Used for packing image features of llava_next models. + * + * @param tensor An image tensor with a shape (embed_dim, height, width) + * @param original_size A size of original image + * @return An unpadded image tensor with a shape (embed_dim, new_height, new_width) + */ /** - * @brief Converts a vector of batched images ([NHWC]) into a vector of individual image tensors ([1HWC]). - * - * @param images A vector of tensors representing the images. 
Each tensor can have a shape of either [NHWC] or [HWC]. - * @return A vector of tensors where each tensor represents a single image with a shape of [1, H, W, C]. - */ + * @brief Converts a vector of batched images ([NHWC]) into a vector of individual image tensors ([1HWC]). + * + * @param images A vector of tensors representing the images. Each tensor can have a shape of either [NHWC] or + * [HWC]. + * @return A vector of tensors where each tensor represents a single image with a shape of [1, H, W, C]. + */ std::vector to_single_image_tensors(const std::vector& images) { std::vector single_image_tensors; for (const auto& image : images) { ov::Tensor reshaped_image = image; ov::Shape image_shape = image.get_shape(); switch (image_shape.size()) { - case 3: - reshaped_image.set_shape({1, image_shape.at(0), image_shape.at(1), image_shape.at(2)}); - break; - case 4: break; - default: OPENVINO_THROW("Input image must have [NHWC] or [HWC] layout"); + case 3: + reshaped_image.set_shape({1, image_shape.at(0), image_shape.at(1), image_shape.at(2)}); + break; + case 4: + break; + default: + OPENVINO_THROW("Input image must have [NHWC] or [HWC] layout"); } ov::Shape reshaped_image_shape = reshaped_image.get_shape(); for (size_t batch_idx = 0; batch_idx < reshaped_image_shape.at(0); ++batch_idx) { ov::Tensor single_image{ reshaped_image.get_element_type(), {1, reshaped_image_shape.at(1), reshaped_image_shape.at(2), reshaped_image_shape.at(3)}, - reshaped_image.data() + batch_idx * reshaped_image_shape.at(1) * reshaped_image_shape.at(2) * reshaped_image_shape.at(3) - }; + reshaped_image.data() + batch_idx * reshaped_image_shape.at(1) * + reshaped_image_shape.at(2) * reshaped_image_shape.at(3)}; single_image_tensors.push_back(std::move(single_image)); } } @@ -277,12 +297,11 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { size_t m_image_id = 0; public: - InputsEmbedderMiniCPM( - const VLMConfig& vlm_config, - const std::filesystem::path& model_dir, - const std::string& device, - const ov::AnyMap device_config) : - IInputsEmbedder(vlm_config, model_dir, device, device_config) { + InputsEmbedderMiniCPM(const VLMConfig& vlm_config, + const std::filesystem::path& model_dir, + const std::string& device, + const ov::AnyMap device_config) + : IInputsEmbedder(vlm_config, model_dir, device, device_config) { auto compiled_model = utils::singleton_core().compile_model(model_dir / "openvino_resampler_model.xml", device, device_config); ov::genai::utils::print_compiled_model_properties(compiled_model, "VLM resampler model"); @@ -291,25 +310,26 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70}); } - InputsEmbedderMiniCPM( - const VLMConfig& vlm_config, - const ModelsMap& models_map, - const Tokenizer& tokenizer, - const std::filesystem::path& config_dir_path, - const std::string& device, - const ov::AnyMap device_config) : - IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { - m_resampler = utils::singleton_core().compile_model( - get_model_weights_pair(models_map, "resampler").first, - get_model_weights_pair(models_map, "resampler").second, - device, - device_config - ).create_infer_request(); - - m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70}); - } + InputsEmbedderMiniCPM(const VLMConfig& vlm_config, + const ModelsMap& models_map, + const Tokenizer& tokenizer, + const std::filesystem::path& config_dir_path, + const 
std::string& device, + const ov::AnyMap device_config) + : IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { + m_resampler = utils::singleton_core() + .compile_model(get_model_weights_pair(models_map, "resampler").first, + get_model_weights_pair(models_map, "resampler").second, + device, + device_config) + .create_infer_request(); - virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) override { + m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70}); + } + + virtual ov::Tensor get_inputs_embeds(const std::string& prompt, + const std::vector& images, + ov::genai::VLMPerfMetrics& metrics) override { std::string images_prompt; std::vector embeds; @@ -347,24 +367,18 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { ov::Tensor encoded_input = get_encoded_input_ids(images_prompt, metrics); ov::Tensor inputs_embeds = m_embedding.infer(encoded_input); - OPENVINO_ASSERT( - m_vlm_config.hidden_size == inputs_embeds.get_shape().at(2), - "Unexpected embedding size" - ); + OPENVINO_ASSERT(m_vlm_config.hidden_size == inputs_embeds.get_shape().at(2), "Unexpected embedding size"); auto start_tokenizer_time = std::chrono::steady_clock::now(); - ov::Tensor special_tokens = m_tokenizer.encode( - m_vlm_config.im_start - + m_vlm_config.im_end - + m_vlm_config.slice_start - + m_vlm_config.slice_end - ).input_ids; + ov::Tensor special_tokens = + m_tokenizer + .encode(m_vlm_config.im_start + m_vlm_config.im_end + m_vlm_config.slice_start + m_vlm_config.slice_end) + .input_ids; auto end_tokenizer_time = std::chrono::steady_clock::now(); OPENVINO_ASSERT(metrics.raw_metrics.tokenization_durations.size() > 0); - metrics.raw_metrics.tokenization_durations[metrics.raw_metrics.tokenization_durations.size() - 1] += ov::genai::MicroSeconds(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); - OPENVINO_ASSERT( - 4 == special_tokens.get_shape().at(1), - "Every special token must be represented with a single int." 
- ); + metrics.raw_metrics.tokenization_durations[metrics.raw_metrics.tokenization_durations.size() - 1] += + ov::genai::MicroSeconds(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); + OPENVINO_ASSERT(4 == special_tokens.get_shape().at(1), + "Every special token must be represented with a single int."); int64_t im_start_id = special_tokens.data()[0]; int64_t im_end_id = special_tokens.data()[1]; int64_t slice_start_id = special_tokens.data()[2]; @@ -376,12 +390,15 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { int64_t* end = ids + encoded_input_size; float* inputs_embeds_data = inputs_embeds.data(); for (const EncodedImage& encoded_image : embeds) { - const ov::Tensor& resampled_source = resample(encoded_image.resized_source, {encoded_image.resized_source_size}); + const ov::Tensor& resampled_source = + resample(encoded_image.resized_source, {encoded_image.resized_source_size}); float* emb = resampled_source.data(); ids = std::find(ids, end, im_start_id); OPENVINO_ASSERT(end != ids); ++ids; - std::copy_n(emb, resampled_source.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); + std::copy_n(emb, + resampled_source.get_size(), + inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); ids += m_vlm_config.query_num; if (encoded_image.slices) { size_t token_idx = 0; @@ -390,12 +407,17 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { for (size_t ja = 0; ja < slices_shape.at(1); ++ja) { size_t d2 = slices_shape.at(2); size_t d3 = slices_shape.at(3); - ov::Tensor encoded_view{ov::element::f32, {1, d2, d3}, encoded_image.slices.data() + (i * slices_shape.at(1) + ja) * d2 * d3}; + ov::Tensor encoded_view{ + ov::element::f32, + {1, d2, d3}, + encoded_image.slices.data() + (i * slices_shape.at(1) + ja) * d2 * d3}; const ov::Tensor& vision_embed_tensor_i_j = resample(encoded_view, {encoded_image.slices_size}); ids = std::find(ids, end, slice_start_id); OPENVINO_ASSERT(end != ids); ++ids; - std::copy_n(vision_embed_tensor_i_j.data(), vision_embed_tensor_i_j.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); + std::copy_n(vision_embed_tensor_i_j.data(), + vision_embed_tensor_i_j.get_size(), + inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); ids += m_vlm_config.query_num; } } @@ -425,11 +447,7 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { std::transform(target_sizes.begin(), target_sizes.end(), patch_len.begin(), [](const ImageSize& height_width) { return height_width.height * height_width.width; }); - adjust_pos_cache( - target_sizes, - m_vlm_config.hidden_size, - m_pos_embed_cache - ); + adjust_pos_cache(target_sizes, m_vlm_config.hidden_size, m_pos_embed_cache); size_t max_patch_len = *std::max_element(patch_len.begin(), patch_len.end()); ov::Tensor key_padding_mask(ov::element::f32, {bs, max_patch_len}); float* mask_data = key_padding_mask.data(); @@ -444,11 +462,9 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { size_t target_w = target_sizes.at(i).width; for (size_t h_idx = 0; h_idx < target_h; ++h_idx) { for (size_t w_idx = 0; w_idx < target_w; ++w_idx) { - std::copy_n( - cache_data + (h_idx * _d1 + w_idx) * embed_len, - embed_len, - pos_embed_data + (h_idx * target_w + w_idx) * bs * embed_len + i * embed_len - ); + std::copy_n(cache_data + (h_idx * _d1 + w_idx) * embed_len, + embed_len, + pos_embed_data + (h_idx * target_w + w_idx) * bs * embed_len + i * 
embed_len); } } for (size_t flat = target_h * target_w; flat < max_patch_len; ++flat) { @@ -457,8 +473,8 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { std::fill_n(mask_data + i * max_patch_len, patch_len[i], 0.0f); std::fill_n(mask_data + i * max_patch_len + patch_len[i], max_patch_len - patch_len[i], 1.0f); } - m_resampler.set_tensor("image_feature", encoded_image); // [N, H*W, old_hidden_size] - m_resampler.set_tensor("pos_embed", pos_embed); // [H*W, N, new_hidden_size] + m_resampler.set_tensor("image_feature", encoded_image); // [N, H*W, old_hidden_size] + m_resampler.set_tensor("pos_embed", pos_embed); // [H*W, N, new_hidden_size] m_resampler.set_tensor("key_padding_mask", key_padding_mask); // [N, H*W] m_resampler.infer(); return m_resampler.get_output_tensor(); // [N, query_num, new_hidden_size] @@ -478,12 +494,12 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { for (size_t j = 0; j < res_d_1; ++j) { size_t k = 0; for (; k < first.get_shape().at(2); ++k) { - res_data[i * res_d_1 * res_d_2 + j * res_d_2 + k] - = first_data[i * res_d_1 * first.get_shape().at(2) + j * first.get_shape().at(2) + k]; + res_data[i * res_d_1 * res_d_2 + j * res_d_2 + k] = + first_data[i * res_d_1 * first.get_shape().at(2) + j * first.get_shape().at(2) + k]; } for (size_t l = 0; l < second.get_shape().at(2); ++l, ++k) { - res_data[i * res_d_1 * res_d_2 + j * res_d_2 + k] - = second_data[i * res_d_1 * second.get_shape().at(2) + j * second.get_shape().at(2) + l]; + res_data[i * res_d_1 * res_d_2 + j * res_d_2 + k] = + second_data[i * res_d_1 * second.get_shape().at(2) + j * second.get_shape().at(2) + l]; } } } @@ -529,16 +545,14 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { ov::Shape grid_shape = grid.get_shape(); float* grid_data = grid.data(); ov::Shape plane_shape{grid_shape.at(1), grid_shape.at(2)}; - ov::Tensor emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, ov::Tensor{ - ov::element::f32, - plane_shape, - grid_data - }); // (H, W, D/2) - ov::Tensor emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, ov::Tensor{ - ov::element::f32, - plane_shape, - grid_data + plane_shape.at(0) * plane_shape.at(1) - }); // (H, W, D/2) + ov::Tensor emb_h = + get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, + ov::Tensor{ov::element::f32, plane_shape, grid_data}); // (H, W, D/2) + ov::Tensor emb_w = get_1d_sincos_pos_embed_from_grid_new( + embed_dim / 2, + ov::Tensor{ov::element::f32, + plane_shape, + grid_data + plane_shape.at(0) * plane_shape.at(1)}); // (H, W, D/2) return concatenate_last_dim(emb_h, emb_w); } @@ -560,17 +574,19 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { return get_2d_sincos_pos_embed_from_grid(embed_dim, grid); } - void adjust_pos_cache( - const std::vector& target_sizes, - size_t hidden_size, - ov::Tensor& pos_embed_cache - ) { - size_t max_h = std::max_element(target_sizes.begin(), target_sizes.end(), [](const ImageSize& left, const ImageSize& right) { - return left.height < right.height; - })->height; - size_t max_w = std::max_element(target_sizes.begin(), target_sizes.end(), [](const ImageSize& left, const ImageSize& right) { - return left.width < right.width; - })->width; + void adjust_pos_cache(const std::vector& target_sizes, size_t hidden_size, ov::Tensor& pos_embed_cache) { + size_t max_h = std::max_element(target_sizes.begin(), + target_sizes.end(), + [](const ImageSize& left, const ImageSize& right) { + return left.height < right.height; + }) + ->height; + 
size_t max_w = std::max_element(target_sizes.begin(), + target_sizes.end(), + [](const ImageSize& left, const ImageSize& right) { + return left.width < right.width; + }) + ->width; size_t allocated_height, allocated_width; if (pos_embed_cache) { const ov::Shape& allocated_shape = pos_embed_cache.get_shape(); @@ -582,36 +598,37 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { if (max_h > allocated_height || max_w > allocated_width) { allocated_height = std::max(max_h, allocated_height); allocated_width = std::max(max_w, allocated_width); - pos_embed_cache = get_2d_sincos_pos_embed( - hidden_size, {allocated_height, allocated_width} - ); + pos_embed_cache = get_2d_sincos_pos_embed(hidden_size, {allocated_height, allocated_width}); } } }; class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder { public: - InputsEmbedderLLaVA( - const VLMConfig& vlm_config, - const std::filesystem::path& model_dir, - const std::string& device, - const ov::AnyMap device_config) : - IInputsEmbedder(vlm_config, model_dir, device, device_config) { } - - InputsEmbedderLLaVA( - const VLMConfig& vlm_config, - const ModelsMap& models_map, - const Tokenizer& tokenizer, - const std::filesystem::path& config_dir_path, - const std::string& device, - const ov::AnyMap device_config) : - IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { } - - virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) override { + InputsEmbedderLLaVA(const VLMConfig& vlm_config, + const std::filesystem::path& model_dir, + const std::string& device, + const ov::AnyMap device_config) + : IInputsEmbedder(vlm_config, model_dir, device, device_config) {} + + InputsEmbedderLLaVA(const VLMConfig& vlm_config, + const ModelsMap& models_map, + const Tokenizer& tokenizer, + const std::filesystem::path& config_dir_path, + const std::string& device, + const ov::AnyMap device_config) + : IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) {} + + virtual ov::Tensor get_inputs_embeds(const std::string& prompt, + const std::vector& images, + ov::genai::VLMPerfMetrics& metrics) override { std::string image_token = m_vlm_config.im_start; // Adapted from llava-1.5-7b-hf chat_template.json - std::string chat_template_fallback = "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' ' }}{% else %}{{ 'ASSISTANT: ' + message['content'] + ' ' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}"; - + std::string chat_template_fallback = + "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' ' " + "}}{% else %}{{ 'ASSISTANT: ' + message['content'] + ' ' }}{% endif %}{% endfor %}{% if " + "add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}"; + std::vector single_images = to_single_image_tensors(images); std::string formatted_prompt; @@ -632,21 +649,21 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder { return text_embeds; } auto start_tokenizer_time = std::chrono::steady_clock::now(); - ov::Tensor encoded_image_token = m_tokenizer.encode(m_vlm_config.im_start, ov::genai::add_special_tokens(false)).input_ids; + ov::Tensor encoded_image_token = + m_tokenizer.encode(m_vlm_config.im_start, ov::genai::add_special_tokens(false)).input_ids; auto end_tokenizer_time = std::chrono::steady_clock::now(); 
OPENVINO_ASSERT(metrics.raw_metrics.tokenization_durations.size() > 0); - metrics.raw_metrics.tokenization_durations[metrics.raw_metrics.tokenization_durations.size() - 1] += ov::genai::MicroSeconds(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); + metrics.raw_metrics.tokenization_durations[metrics.raw_metrics.tokenization_durations.size() - 1] += + ov::genai::MicroSeconds(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); int64_t image_token_id = encoded_image_token.data()[encoded_image_token.get_size() - 1]; return merge_text_and_image_embeddings_llava(input_ids, text_embeds, image_embeds, image_token_id); } protected: - ov::Tensor merge_text_and_image_embeddings_llava( - const ov::Tensor& input_ids, - const ov::Tensor& text_embeds, - const std::vector& image_embeds, - int64_t image_token_id - ) { + ov::Tensor merge_text_and_image_embeddings_llava(const ov::Tensor& input_ids, + const ov::Tensor& text_embeds, + const std::vector& image_embeds, + int64_t image_token_id) { auto text_embeds_shape = text_embeds.get_shape(); size_t text_embeds_seq_length = text_embeds_shape[1]; size_t hidden_size = text_embeds_shape[2]; @@ -661,22 +678,18 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder { } } auto num_images = image_embeds.size(); - OPENVINO_ASSERT( - num_image_tokens == num_images, - "Number of image tokens in input_ids different from num_images." - ); + OPENVINO_ASSERT(num_image_tokens == num_images, + "Number of image tokens in input_ids different from num_images."); size_t total_image_seq_length = 0; for (const auto& single_image_embeds : image_embeds) { - OPENVINO_ASSERT( - text_embeds_shape[2] == single_image_embeds.get_shape().at(2), - "Incompatible shapes between text_embeds and image_embeds" - ); + OPENVINO_ASSERT(text_embeds_shape[2] == single_image_embeds.get_shape().at(2), + "Incompatible shapes between text_embeds and image_embeds"); total_image_seq_length += single_image_embeds.get_shape().at(1); } size_t merged_seq_length = text_embeds_seq_length + total_image_seq_length - num_image_tokens; - constexpr size_t BATCH_SIZE = 1; + constexpr size_t BATCH_SIZE = 1; ov::Tensor merged_embeds(text_embeds.get_element_type(), {BATCH_SIZE, merged_seq_length, hidden_size}); float* merged_data = merged_embeds.data(); @@ -687,15 +700,11 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder { const float* image_embeds_data = image_embeds[image_idx].data(); size_t image_seq_length = image_embeds[image_idx].get_shape()[1]; - std::copy_n(image_embeds_data, - image_seq_length * hidden_size, - merged_data + merged_idx * hidden_size); + std::copy_n(image_embeds_data, image_seq_length * hidden_size, merged_data + merged_idx * hidden_size); merged_idx += image_seq_length; image_idx++; } else { - std::copy_n(text_embeds_data + s * hidden_size, - hidden_size, - merged_data + merged_idx * hidden_size); + std::copy_n(text_embeds_data + s * hidden_size, hidden_size, merged_data + merged_idx * hidden_size); merged_idx++; } } @@ -705,33 +714,36 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder { class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { public: - InputsEmbedderLLaVANext( - const VLMConfig& vlm_config, - const std::filesystem::path& model_dir, - const std::string& device, - const ov::AnyMap device_config) : - InputsEmbedderLLaVA(vlm_config, model_dir, device, device_config) { } - - InputsEmbedderLLaVANext( - const VLMConfig& vlm_config, - const ModelsMap& models_map, - const Tokenizer& 
tokenizer, - const std::filesystem::path& config_dir_path, - const std::string& device, - const ov::AnyMap device_config) : - InputsEmbedderLLaVA(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { } - - virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) override { + InputsEmbedderLLaVANext(const VLMConfig& vlm_config, + const std::filesystem::path& model_dir, + const std::string& device, + const ov::AnyMap device_config) + : InputsEmbedderLLaVA(vlm_config, model_dir, device, device_config) {} + + InputsEmbedderLLaVANext(const VLMConfig& vlm_config, + const ModelsMap& models_map, + const Tokenizer& tokenizer, + const std::filesystem::path& config_dir_path, + const std::string& device, + const ov::AnyMap device_config) + : InputsEmbedderLLaVA(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) {} + + virtual ov::Tensor get_inputs_embeds(const std::string& prompt, + const std::vector& images, + ov::genai::VLMPerfMetrics& metrics) override { std::string image_token = m_vlm_config.im_start; // Adapted from llava-1.5-7b-hf chat_template.json - std::string chat_template_fallback = "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' ' }}{% else %}{{ 'ASSISTANT: ' + message['content'] + ' ' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}"; + std::string chat_template_fallback = + "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' ' " + "}}{% else %}{{ 'ASSISTANT: ' + message['content'] + ' ' }}{% endif %}{% endfor %}{% if " + "add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}"; std::vector single_images = to_single_image_tensors(images); std::string formatted_prompt; std::vector image_embeds; image_embeds.reserve(single_images.size()); - + ov::Tensor image_newline; for (const auto& image : single_images) { @@ -744,9 +756,10 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { std::copy(m_vlm_config.image_newline.begin(), m_vlm_config.image_newline.end(), image_newline_data); } - ImageSize original_image_size{image.get_shape().at(1), image.get_shape().at(2)}; // [height, width] + ImageSize original_image_size{image.get_shape().at(1), image.get_shape().at(2)}; // [height, width] - ov::Tensor packed_features = pack_image_features_llava_next(encoded_image, original_image_size, image_newline); + ov::Tensor packed_features = + pack_image_features_llava_next(encoded_image, original_image_size, image_newline); image_embeds.push_back(std::move(packed_features)); formatted_prompt += image_token + "\n"; @@ -760,29 +773,29 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { return text_embeds; } auto start_tokenizer_time = std::chrono::steady_clock::now(); - ov::Tensor encoded_image_token = m_tokenizer.encode(m_vlm_config.im_start, ov::genai::add_special_tokens(false)).input_ids; + ov::Tensor encoded_image_token = + m_tokenizer.encode(m_vlm_config.im_start, ov::genai::add_special_tokens(false)).input_ids; auto end_tokenizer_time = std::chrono::steady_clock::now(); OPENVINO_ASSERT(metrics.raw_metrics.tokenization_durations.size() > 0); - metrics.raw_metrics.tokenization_durations[metrics.raw_metrics.tokenization_durations.size() - 1] += ov::genai::MicroSeconds(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); + metrics.raw_metrics.tokenization_durations[metrics.raw_metrics.tokenization_durations.size() - 1] 
+= + ov::genai::MicroSeconds(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); int64_t image_token_id = encoded_image_token.data()[encoded_image_token.get_size() - 1]; return merge_text_and_image_embeddings_llava(input_ids, text_embeds, image_embeds, image_token_id); } private: /** - * @brief Processes base and patches image features extracted from encoded image. - * Used in getting inputs embeds for llava_next models. - * - * @param encoded_image An encoded image retrieved from vision encoder - * @param original_image_size A size of the original image - * @param image_newline An image newline tensor with a shape (embed_dim) - * @return A tensor with a shape (1, new_seq_len, embed_dim) - */ - ov::Tensor pack_image_features_llava_next( - const EncodedImage& encoded_image, - const ImageSize& original_image_size, - const ov::Tensor& image_newline - ) { + * @brief Processes base and patches image features extracted from encoded image. + * Used in getting inputs embeds for llava_next models. + * + * @param encoded_image An encoded image retrieved from vision encoder + * @param original_image_size A size of the original image + * @param image_newline An image newline tensor with a shape (embed_dim) + * @return A tensor with a shape (1, new_seq_len, embed_dim) + */ + ov::Tensor pack_image_features_llava_next(const EncodedImage& encoded_image, + const ImageSize& original_image_size, + const ov::Tensor& image_newline) { auto image_feature = encoded_image.resized_source; auto image_feature_shape = image_feature.get_shape(); size_t num_patches = image_feature_shape[0]; @@ -800,11 +813,12 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { std::copy(src_data, src_data + patch_seq_len * embed_dim, dst_data); // Extract other grid patches - ov::Tensor patches_image_feature(image_feature.get_element_type(), {num_patches - 1, patch_seq_len, embed_dim}); + ov::Tensor patches_image_feature(image_feature.get_element_type(), + {num_patches - 1, patch_seq_len, embed_dim}); dst_data = patches_image_feature.data(); std::copy(src_data + patch_seq_len * embed_dim, - src_data + num_patches * patch_seq_len * embed_dim, - dst_data); + src_data + num_patches * patch_seq_len * embed_dim, + dst_data); // Process grid patches image feature size_t height = encoded_image.resized_source_size.height; @@ -812,7 +826,11 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { size_t num_patch_height = encoded_image.patches_grid.first; size_t num_patch_width = encoded_image.patches_grid.second; - ov::Tensor reshaped_image_feature = reshape_and_rearrange_image_feature(patches_image_feature, num_patch_height, num_patch_width, height, width); + ov::Tensor reshaped_image_feature = reshape_and_rearrange_image_feature(patches_image_feature, + num_patch_height, + num_patch_width, + height, + width); ov::Tensor unpadded_image_feature = unpad_image(reshaped_image_feature, original_image_size); @@ -820,7 +838,8 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { ov::Tensor processed_image_feature = flatten_and_transpose(image_feature_with_newline); - // Concatenate base image feature ([1, seq_len_1, emded_dim]) and patches image feature ([seq_len_2, embed_dim]) + // Concatenate base image feature ([1, seq_len_1, emded_dim]) and patches image feature ([seq_len_2, + // embed_dim]) auto base_shape = base_image_feature.get_shape(); auto processed_shape = processed_image_feature.get_shape(); @@ -832,32 +851,30 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { 
std::copy(base_data, base_data + base_shape[1] * embed_dim, result.data()); // Copy processed image feature data std::copy(processed_data, - processed_data + processed_shape[0] * embed_dim, - result.data() + base_shape[1] * embed_dim); + processed_data + processed_shape[0] * embed_dim, + result.data() + base_shape[1] * embed_dim); return result; } else { // If there is only one patch, return the original (base) image feature concatenated with image_newline ov::Tensor result(image_feature.get_element_type(), {1, patch_seq_len + 1, embed_dim}); // Copy base image feature data std::copy(image_feature_data + embed_dim, - image_feature_data + patch_seq_len * embed_dim, - result.data()); + image_feature_data + patch_seq_len * embed_dim, + result.data()); // Append image_newline data - std::copy(newline_data, - newline_data + embed_dim, - result.data() + patch_seq_len * embed_dim); + std::copy(newline_data, newline_data + embed_dim, result.data() + patch_seq_len * embed_dim); return result; } } /** - * @brief Adds image newline tensor to patches image feature tensor. - * Used for packing image features of llava_next models. - * - * @param image_feature A tensor with a shape (embed_dim, height, width) - * @param image_newline A tensor with a shape (embed_dim) - * @return A tensor with a shape (embed_dim, height, width + 1) - */ + * @brief Adds image newline tensor to patches image feature tensor. + * Used for packing image features of llava_next models. + * + * @param image_feature A tensor with a shape (embed_dim, height, width) + * @param image_newline A tensor with a shape (embed_dim) + * @return A tensor with a shape (embed_dim, height, width + 1) + */ ov::Tensor add_image_newline(const ov::Tensor& image_feature, const ov::Tensor& image_newline) { auto shape = image_feature.get_shape(); @@ -867,7 +884,8 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { size_t height = shape[1]; size_t width = shape[2]; - OPENVINO_ASSERT(image_newline.get_shape()[0] == embed_dim, "image_newline dimension must match embed_dim of image_feature"); + OPENVINO_ASSERT(image_newline.get_shape()[0] == embed_dim, + "image_newline dimension must match embed_dim of image_feature"); const float* image_feature_data = image_feature.data(); const float* newline_data = image_newline.data(); @@ -878,11 +896,9 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { for (size_t e = 0; e < embed_dim; ++e) { for (size_t h = 0; h < height; ++h) { // Copy original image feature data - std::copy( - image_feature_data + (e * height * width + h * width), - image_feature_data + (e * height * width + (h + 1) * width), - feature_with_newline_data + (e * height * (width + 1) + h * (width + 1)) - ); + std::copy(image_feature_data + (e * height * width + h * width), + image_feature_data + (e * height * width + (h + 1) * width), + feature_with_newline_data + (e * height * (width + 1) + h * (width + 1))); // Add image newline feature_with_newline_data[e * height * (width + 1) + h * (width + 1) + width] = newline_data[e]; } @@ -892,12 +908,12 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { } /** - * @brief Flattens and transposes tensor. - * Used for packing image features of llava_next models. - * - * @param tensor A tensor with a shape (embed_dim, height, width) - * @return A tensor with a shape (height * width, embed_dim) - */ + * @brief Flattens and transposes tensor. + * Used for packing image features of llava_next models. 
+ * + * @param tensor A tensor with a shape (embed_dim, height, width) + * @return A tensor with a shape (height * width, embed_dim) + */ ov::Tensor flatten_and_transpose(const ov::Tensor& tensor) { auto shape = tensor.get_shape(); OPENVINO_ASSERT(shape.size() == 3, "Flattening tensor must have 3 dimensions"); @@ -921,7 +937,6 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { return flatten_feature; } - ov::Tensor reshape_and_rearrange_image_feature(const ov::Tensor& image_feature, int num_patch_height, int num_patch_width, @@ -934,15 +949,11 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { size_t patch_seq_len = shape[1]; size_t embed_dim = shape[2]; - OPENVINO_ASSERT( - num_patches == num_patch_height * num_patch_width, - "Number of patches does not match the specified grid size" - ); + OPENVINO_ASSERT(num_patches == num_patch_height * num_patch_width, + "Number of patches does not match the specified grid size"); - OPENVINO_ASSERT( - patch_seq_len == height * width, - "Patch sequence length does not match the specified height and width" - ); + OPENVINO_ASSERT(patch_seq_len == height * width, + "Patch sequence length does not match the specified height and width"); // Reshape tensor data and permute dimensions // [num_patches, patch_seq_len, embed_dim] -> [embed_dim, num_patch_height, height, num_patch_width, width] @@ -965,20 +976,19 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { ov::Tensor result(image_feature.get_element_type(), {static_cast(embed_dim), static_cast(num_patch_height * height), - static_cast(num_patch_width * width)} - ); + static_cast(num_patch_width * width)}); std::copy(reshaped_data.begin(), reshaped_data.end(), result.data()); return result; } /** - * @brief Unpads an image tensor of a padded and resized image. - * Used for packing image features of llava_next models. - * - * @param tensor An image tensor with a shape (embed_dim, height, width) - * @param original_size A size of original image - * @return An unpadded image tensor with a shape (embed_dim, new_height, new_width) - */ + * @brief Unpads an image tensor of a padded and resized image. + * Used for packing image features of llava_next models. 
+ * + * @param tensor An image tensor with a shape (embed_dim, height, width) + * @param original_size A size of original image + * @return An unpadded image tensor with a shape (embed_dim, new_height, new_width) + */ ov::Tensor unpad_image(const ov::Tensor& tensor, const ImageSize& original_size) { size_t original_height = original_size.height; size_t original_width = original_size.width; @@ -1003,9 +1013,9 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { for (int h = 0; h < unpadded_height_dim; ++h) { std::copy( tensor.data() + (e * current_height * current_width + (padding + h) * current_width), - tensor.data() + (e * current_height * current_width + (padding + h) * current_width + current_width), - unpadded_tensor.data() + (e * unpadded_height_dim * current_width + h * current_width) - ); + tensor.data() + + (e * current_height * current_width + (padding + h) * current_width + current_width), + unpadded_tensor.data() + (e * unpadded_height_dim * current_width + h * current_width)); } } } else { @@ -1017,11 +1027,11 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { for (size_t e = 0; e < embed_dim; ++e) { for (int h = 0; h < current_height; ++h) { - std::copy( - tensor.data() + (e * current_height * current_width + h * current_width + padding), - tensor.data() + (e * current_height * current_width + h * current_width + padding + unpadded_width_dim), - unpadded_tensor.data() + (e * current_height * unpadded_width_dim + h * unpadded_width_dim) - ); + std::copy(tensor.data() + (e * current_height * current_width + h * current_width + padding), + tensor.data() + (e * current_height * current_width + h * current_width + padding + + unpadded_width_dim), + unpadded_tensor.data() + + (e * current_height * unpadded_width_dim + h * unpadded_width_dim)); } } } @@ -1032,40 +1042,40 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { class InputsEmbedderInternVLChat : public InputsEmbedder::IInputsEmbedder { public: - InputsEmbedderInternVLChat( - const VLMConfig& vlm_config, - const std::filesystem::path& model_dir, - const std::string& device, - const ov::AnyMap device_config) : - IInputsEmbedder(vlm_config, model_dir, device, device_config) { } - - InputsEmbedderInternVLChat( - const VLMConfig& vlm_config, - const ModelsMap& models_map, - const Tokenizer& tokenizer, - const std::filesystem::path& config_dir_path, - const std::string& device, - const ov::AnyMap device_config) : - IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { } - - virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) override { + InputsEmbedderInternVLChat(const VLMConfig& vlm_config, + const std::filesystem::path& model_dir, + const std::string& device, + const ov::AnyMap device_config) + : IInputsEmbedder(vlm_config, model_dir, device, device_config) {} + + InputsEmbedderInternVLChat(const VLMConfig& vlm_config, + const ModelsMap& models_map, + const Tokenizer& tokenizer, + const std::filesystem::path& config_dir_path, + const std::string& device, + const ov::AnyMap device_config) + : IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) {} + + virtual ov::Tensor get_inputs_embeds(const std::string& prompt, + const std::vector& images, + ov::genai::VLMPerfMetrics& metrics) override { std::string image_start_token = m_vlm_config.image_start_token; std::string image_context_token = m_vlm_config.image_context_token; std::string 
image_end_token = m_vlm_config.image_end_token; - + std::vector single_images = to_single_image_tensors(images); std::string formatted_prompt; std::vector image_embeds; image_embeds.reserve(single_images.size()); - + for (const auto& image : single_images) { EncodedImage encoded_image = m_vision_encoder.encode(image); ov::Tensor single_image_embeds = encoded_image.resized_source; const size_t num_patches = single_image_embeds.get_shape().at(0); const size_t num_image_tokens = single_image_embeds.get_shape().at(1); - + formatted_prompt += image_start_token; for (int i = 0; i < num_patches * num_image_tokens; ++i) { formatted_prompt += image_context_token; @@ -1083,21 +1093,22 @@ class InputsEmbedderInternVLChat : public InputsEmbedder::IInputsEmbedder { return text_embeds; } auto start_tokenizer_time = std::chrono::steady_clock::now(); - ov::Tensor encoded_image_context_token = m_tokenizer.encode(image_context_token, ov::genai::add_special_tokens(false)).input_ids; + ov::Tensor encoded_image_context_token = + m_tokenizer.encode(image_context_token, ov::genai::add_special_tokens(false)).input_ids; auto end_tokenizer_time = std::chrono::steady_clock::now(); OPENVINO_ASSERT(metrics.raw_metrics.tokenization_durations.size() > 0); - metrics.raw_metrics.tokenization_durations[metrics.raw_metrics.tokenization_durations.size() - 1] += ov::genai::MicroSeconds(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); - int64_t image_context_token_id = encoded_image_context_token.data()[encoded_image_context_token.get_size() - 1]; + metrics.raw_metrics.tokenization_durations[metrics.raw_metrics.tokenization_durations.size() - 1] += + ov::genai::MicroSeconds(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); + int64_t image_context_token_id = + encoded_image_context_token.data()[encoded_image_context_token.get_size() - 1]; return merge_text_and_image_embeddings_internvl(input_ids, text_embeds, image_embeds, image_context_token_id); } protected: - ov::Tensor merge_text_and_image_embeddings_internvl( - const ov::Tensor& input_ids, - const ov::Tensor& text_embeds, - const std::vector& image_embeds, - int64_t image_context_token_id - ) { + ov::Tensor merge_text_and_image_embeddings_internvl(const ov::Tensor& input_ids, + const ov::Tensor& text_embeds, + const std::vector& image_embeds, + int64_t image_context_token_id) { auto text_embeds_shape = text_embeds.get_shape(); size_t batch_size = text_embeds_shape.at(0); size_t seq_len = text_embeds_shape.at(1); @@ -1131,12 +1142,14 @@ class InputsEmbedderInternVLChat : public InputsEmbedder::IInputsEmbedder { if (image_context_tokens_mask[flat_idx]) { const ov::Tensor& single_image_embeds = image_embeds[image_idx]; - const size_t num_all_image_tokens = single_image_embeds.get_shape().at(0) * single_image_embeds.get_shape().at(1); // num_patches * num_image_tokens + const size_t num_all_image_tokens = + single_image_embeds.get_shape().at(0) * + single_image_embeds.get_shape().at(1); // num_patches * num_image_tokens const float* image_embeds_data = single_image_embeds.data(); std::copy_n(image_embeds_data + image_context_token_idx * embed_dim, embed_dim, merged_embeds_data + offset); - + ++image_context_token_idx; if (image_context_token_idx == num_all_image_tokens) { @@ -1277,30 +1290,23 @@ ov::InferRequest create_hd_feature_transformer() { // t28 = opset.Unsqueeze([t26, t27], {}, # i64[], i32[] -> i64[1] // t29 = opset.Constant(model, 29, # -> i64[1]([2]) // t30 = opset.Constant(model, 30, # -> i64[1]([2]) - // t31 = 
opset.Concat([t8, t28, t29, t28, t30, t21], {'axis': 0}, # i64[1], i64[1], i64[1], i64[1], i64[1], i64[1] -> i64[6] - // t32 = opset.Reshape([t23, t31], {'special_zero': False}, # f32[?,24,24,1024], i64[6] -> f32[?,12,2,12,2,1024] - // t33 = opset.Constant(model, 33, - // t34 = opset.Transpose([t32, t33], {}, # f32[?,12,2,12,2,1024], i64[6] -> f32[?,12,12,2,2,1024] - // t35 = opset.Constant(model, 35, # -> i64[1]([-1]) - // t36 = opset.Constant(model, 36, # -> i64[1]([4]) - // t37 = opset.Multiply([t21, t36], {'auto_broadcast': 'numpy'}, # i64[1], i64[1] -> i64[1] - // t38 = opset.Concat([t8, t35, t37], {'axis': 0}, # i64[1], i64[1], i64[1] -> i64[3] - // t39 = opset.Reshape([t34, t38], {'special_zero': False}, # f32[?,12,12,2,2,1024], i64[3] -> f32[?,?,4096] - // t40 = opset.Multiply([t1, t2], {'auto_broadcast': 'numpy'}, # i32[], i32[] -> i32[] - // t41 = opset.Convert([t40], {'destination_type': 'i64'}, # i32[] -> i64[] - // t42 = opset.Divide([t6, t41], {'auto_broadcast': 'numpy', 'm_pythondiv': True}, # i64[], i64[] -> i64[] - // t43 = opset.Floor([t42], {}, # i64[] -> i64[] - // t44 = opset.Constant(model, 44, # -> i32[](0) - // t45 = opset.Unsqueeze([t43, t44], {}, # i64[], i32[] -> i64[1] - // t46 = opset.Convert([t1], {'destination_type': 'i64'}, # i32[] -> i64[] - // t47 = opset.Unsqueeze([t46, t44], {}, # i64[], i32[] -> i64[1] - // t48 = opset.Convert([t2], {'destination_type': 'i64'}, # i32[] -> i64[] - // t49 = opset.Unsqueeze([t48, t44], {}, # i64[], i32[] -> i64[1] - // t50 = opset.Constant(model, 50, # -> i64[1]([-1]) - // t51 = opset.Concat([t45, t47, t49, t28, t28, t50], {'axis': 0}, # i64[1], i64[1], i64[1], i64[1], i64[1], i64[1] -> i64[6] - // t52 = opset.Reshape([t39, t51], {'special_zero': False}, # f32[?,?,4096], i64[6] -> f32[?,?,?,?,?,?] - // t53 = opset.Constant(model, 53, - // t54 = opset.Transpose([t52, t53], {}, # f32[?,?,?,?,?,?], i64[6] -> f32[?,?,?,?,?,?] 
+ // t31 = opset.Concat([t8, t28, t29, t28, t30, t21], {'axis': 0}, # i64[1], i64[1], i64[1], i64[1], i64[1], i64[1] + // -> i64[6] t32 = opset.Reshape([t23, t31], {'special_zero': False}, # f32[?,24,24,1024], i64[6] -> + // f32[?,12,2,12,2,1024] t33 = opset.Constant(model, 33, t34 = opset.Transpose([t32, t33], {}, # + // f32[?,12,2,12,2,1024], i64[6] -> f32[?,12,12,2,2,1024] t35 = opset.Constant(model, 35, # -> i64[1]([-1]) t36 = + // opset.Constant(model, 36, # -> i64[1]([4]) t37 = opset.Multiply([t21, t36], {'auto_broadcast': 'numpy'}, # + // i64[1], i64[1] -> i64[1] t38 = opset.Concat([t8, t35, t37], {'axis': 0}, # i64[1], i64[1], i64[1] -> i64[3] t39 + // = opset.Reshape([t34, t38], {'special_zero': False}, # f32[?,12,12,2,2,1024], i64[3] -> f32[?,?,4096] t40 = + // opset.Multiply([t1, t2], {'auto_broadcast': 'numpy'}, # i32[], i32[] -> i32[] t41 = opset.Convert([t40], + // {'destination_type': 'i64'}, # i32[] -> i64[] t42 = opset.Divide([t6, t41], {'auto_broadcast': 'numpy', + // 'm_pythondiv': True}, # i64[], i64[] -> i64[] t43 = opset.Floor([t42], {}, # i64[] -> i64[] t44 = + // opset.Constant(model, 44, # -> i32[](0) t45 = opset.Unsqueeze([t43, t44], {}, # i64[], i32[] -> i64[1] t46 = + // opset.Convert([t1], {'destination_type': 'i64'}, # i32[] -> i64[] t47 = opset.Unsqueeze([t46, t44], {}, # + // i64[], i32[] -> i64[1] t48 = opset.Convert([t2], {'destination_type': 'i64'}, # i32[] -> i64[] t49 = + // opset.Unsqueeze([t48, t44], {}, # i64[], i32[] -> i64[1] t50 = opset.Constant(model, 50, # -> i64[1]([-1]) t51 + // = opset.Concat([t45, t47, t49, t28, t28, t50], {'axis': 0}, # i64[1], i64[1], i64[1], i64[1], i64[1], i64[1] -> + // i64[6] t52 = opset.Reshape([t39, t51], {'special_zero': False}, # f32[?,?,4096], i64[6] -> f32[?,?,?,?,?,?] t53 + // = opset.Constant(model, 53, t54 = opset.Transpose([t52, t53], {}, # f32[?,?,?,?,?,?], i64[6] -> f32[?,?,?,?,?,?] // t55 = opset.Multiply([t1, t15], {'auto_broadcast': 'numpy'}, # i32[], i32[] -> i32[] // t56 = opset.Convert([t55], {'destination_type': 'i64'}, # i32[] -> i64[] // t57 = opset.Constant(model, 57, # -> i64[](2) @@ -1317,9 +1323,8 @@ ov::InferRequest create_hd_feature_transformer() { // t68 = opset.Concat([t45, t61, t67, t37], {'axis': 0}, # i64[1], i64[1], i64[1], i64[1] -> i64[4] // t69 = opset.Reshape([t54, t68], {'special_zero': False}, # f32[?,?,?,?,?,?], i64[4] -> f32[?,?,?,?] 
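    // Illustrative shape walk-through of the graph traced above (a reader-facing summary,
    // not part of the patch; num_crops = h_crop * w_crop per image):
    //   image_features            f32[N, 24*24, 1024]
    //   reshape                   f32[N, 12, 2, 12, 2, 1024]
    //   transpose                 f32[N, 12, 12, 2, 2, 1024]
    //   flatten last three dims   f32[N, 12*12, 4096]    (each 2x2 block of patches -> one vector)
    //   regroup crops             f32[num_images, h_crop*12, w_crop*12, 4096]
    // This is the transformation that reshape_hd_patches_2x2merge() below runs through
    // this compiled model.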
shared_ptr model = make_shared(make_shared(t69), ParameterVector{t0, t1, t2}); - ov::InferRequest hd_feature_transformer = utils::singleton_core().compile_model( - model, "CPU" - ).create_infer_request(); + ov::InferRequest hd_feature_transformer = + utils::singleton_core().compile_model(model, "CPU").create_infer_request(); // hd_feature_transformer.set_input_tensor(0, ov::Tensor{f32, {1, 576, 1024}}); // ov::Tensor h_crop = ov::Tensor{i32, {}}; // h_crop.data()[0] = 1; @@ -1332,7 +1337,10 @@ ov::InferRequest create_hd_feature_transformer() { return hd_feature_transformer; } -ov::Tensor reshape_hd_patches_2x2merge(const ov::Tensor& image_features, size_t h_crop, size_t w_crop, InferRequest& hd_feature_transformer) { +ov::Tensor reshape_hd_patches_2x2merge(const ov::Tensor& image_features, + size_t h_crop, + size_t w_crop, + InferRequest& hd_feature_transformer) { ov::Shape shape = image_features.get_shape(); OPENVINO_ASSERT(3 == shape.size()); OPENVINO_ASSERT(24 * 24 == shape.at(1)); @@ -1356,23 +1364,24 @@ ov::Tensor add_image_newline(const ov::Tensor& image_features_hd, const std::vec for (size_t batch_id = 0; batch_id < nhwc.at(0); ++batch_id) { for (size_t row_id = 0; row_id < nhwc.at(1); ++row_id) { for (size_t col_id = 0; col_id < nhwc.at(2); ++col_id) { - std::copy_n( - in + batch_id * nhwc.at(1) * nhwc.at(2) * nhwc.at(3) + row_id * nhwc.at(2) * nhwc.at(3) + col_id * nhwc.at(3), - nhwc.at(3), - out + batch_id * nhwc.at(1) * (nhwc.at(2) + 1) * nhwc.at(3) + row_id * (nhwc.at(2) + 1) * nhwc.at(3) + col_id * nhwc.at(3) - ); + std::copy_n(in + batch_id * nhwc.at(1) * nhwc.at(2) * nhwc.at(3) + row_id * nhwc.at(2) * nhwc.at(3) + + col_id * nhwc.at(3), + nhwc.at(3), + out + batch_id * nhwc.at(1) * (nhwc.at(2) + 1) * nhwc.at(3) + + row_id * (nhwc.at(2) + 1) * nhwc.at(3) + col_id * nhwc.at(3)); } - std::copy( - sub_GN.begin(), - sub_GN.end(), - out + batch_id * nhwc.at(1) * (nhwc.at(2) + 1) * nhwc.at(3) + row_id * (nhwc.at(2) + 1) * nhwc.at(3) + nhwc.at(2) * nhwc.at(3) - ); + std::copy(sub_GN.begin(), + sub_GN.end(), + out + batch_id * nhwc.at(1) * (nhwc.at(2) + 1) * nhwc.at(3) + + row_id * (nhwc.at(2) + 1) * nhwc.at(3) + nhwc.at(2) * nhwc.at(3)); } } return image_features_hd_new_line; } -ov::Tensor concatenate_2d(const ov::Tensor& first_1lf, const std::vector& second_f, const ov::Tensor& third_1lf) { +ov::Tensor concatenate_2d(const ov::Tensor& first_1lf, + const std::vector& second_f, + const ov::Tensor& third_1lf) { size_t first_l = first_1lf.get_shape().at(1); constexpr size_t second_l = 1; size_t third_l = third_1lf.get_shape().at(1); @@ -1387,12 +1396,20 @@ ov::Tensor concatenate_2d(const ov::Tensor& first_1lf, const std::vector& } // image_features.resized_source: (num_crops+1, 24*24, 1024) -ov::Tensor hd_feature_transform(const EncodedImage& image_features, InferRequest& hd_feature_transformer, const std::vector& sub_GN, const std::vector& glb_GN, ov::InferRequest& vision_projection) { +ov::Tensor hd_feature_transform(const EncodedImage& image_features, + InferRequest& hd_feature_transformer, + const std::vector& sub_GN, + const std::vector& glb_GN, + ov::InferRequest& vision_projection) { const ov::Shape& image_features_shape = image_features.resized_source.get_shape(); - ov::Tensor global_image_features{ov::element::f32, {1, image_features_shape.at(1), image_features_shape.at(2)}, image_features.resized_source.data()}; + ov::Tensor global_image_features{ov::element::f32, + {1, image_features_shape.at(1), image_features_shape.at(2)}, + image_features.resized_source.data()}; // 
global feature can be viewed as a special HD case with num_crops 1x1 - ov::Tensor global_image_features_hd = reshape_hd_patches_2x2merge(global_image_features, 1, 1, hd_feature_transformer); - ov::Tensor global_image_features_hd_newline = add_image_newline(global_image_features_hd, sub_GN); // [1,12*(12+1),4096] + ov::Tensor global_image_features_hd = + reshape_hd_patches_2x2merge(global_image_features, 1, 1, hd_feature_transformer); + ov::Tensor global_image_features_hd_newline = + add_image_newline(global_image_features_hd, sub_GN); // [1,12*(12+1),4096] constexpr size_t INPUT_IMAGE_SIZE = 336; size_t h_crop = image_features.resized_source_size.height / INPUT_IMAGE_SIZE; size_t w_crop = image_features.resized_source_size.width / INPUT_IMAGE_SIZE; @@ -1400,27 +1417,23 @@ ov::Tensor hd_feature_transform(const EncodedImage& image_features, InferRequest // NOTE: real num_crops is padded // (num_crops, 24*24, 1024) - ov::Tensor sub_image_features{ov::element::f32, { - num_crops, - image_features_shape.at(1), - image_features_shape.at(2) - }, image_features.resized_source.data() + image_features_shape.at(1) * image_features_shape.at(2)}; - ov::Tensor sub_image_features_hd = reshape_hd_patches_2x2merge(sub_image_features, h_crop, w_crop, hd_feature_transformer); // [1, 24, 24, 4096] - ov::Tensor sub_image_features_hd_newline = add_image_newline(sub_image_features_hd, sub_GN); // [1,h_crop*12*(w_crop*12+1), 4096] + ov::Tensor sub_image_features{ + ov::element::f32, + {num_crops, image_features_shape.at(1), image_features_shape.at(2)}, + image_features.resized_source.data() + image_features_shape.at(1) * image_features_shape.at(2)}; + ov::Tensor sub_image_features_hd = + reshape_hd_patches_2x2merge(sub_image_features, h_crop, w_crop, hd_feature_transformer); // [1, 24, 24, 4096] + ov::Tensor sub_image_features_hd_newline = + add_image_newline(sub_image_features_hd, sub_GN); // [1,h_crop*12*(w_crop*12+1), 4096] return concatenate_2d(sub_image_features_hd_newline, glb_GN, global_image_features_hd_newline); // [1,l,4096] } std::vector split_tokenize(const std::string& text, ov::genai::Tokenizer& tokenizer) { constexpr int make_suffix_iterator = -1; std::regex rgx{R"(<\|image_\d+\|>)"}; - std::sregex_token_iterator iter{ - text.begin(), - text.end(), - rgx, - make_suffix_iterator - }; + std::sregex_token_iterator iter{text.begin(), text.end(), rgx, make_suffix_iterator}; std::vector tokenized; - for ( ; iter != std::sregex_token_iterator{}; ++iter) { + for (; iter != std::sregex_token_iterator{}; ++iter) { if (iter->str().empty()) { continue; } @@ -1429,16 +1442,19 @@ std::vector split_tokenize(const std::string& text, ov::genai::Token return tokenized; } -// ov::Tensor apply_template_and_tokenize(bool is_chat_conversation, const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = "") { +// ov::Tensor apply_template_and_tokenize(bool is_chat_conversation, const std::string& prompt, +// ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = "") { // ov::Tensor encoded_input_ids; // if (is_chat_conversation) { // // KV cache in model already contains prompts and answers from previous iterations. // // So only new prompt wrapped into chat template to be sent into model. Tokenizer always returns // // token_ids = {, ...}. So if tokenizer applies only to the new prompt, // // will be inserted on every iteration. 
-// // So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new prompt +// // So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new +// prompt // // and takes only the difference between them. -// // The chat history cannot be saved as already encoded tokens because generate call doesn't return token, but +// // The chat history cannot be saved as already encoded tokens because generate call doesn't return +// token, but // // KV cache contains it. So we have to add it manually or get it by tokenization all chat history. // m_history.push_back({{"role", "user"}, {"content", prompt}}); // constexpr bool add_generation_prompt = true; @@ -1447,7 +1463,8 @@ std::vector split_tokenize(const std::string& text, ov::genai::Token // new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); // } catch (const std::exception& error) { // // Use fallback chat template if it was not found in tokenizer_config.json -// new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt, chat_template_fallback); +// new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt, +// chat_template_fallback); // } // auto start_tokenizer_time = std::chrono::steady_clock::now(); // ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history).input_ids; @@ -1464,18 +1481,19 @@ std::vector split_tokenize(const std::string& text, ov::genai::Token // ).input_ids; // } // auto end_tokenizer_time = std::chrono::steady_clock::now(); -// metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); -// m_templated_chat_history = std::move(new_templated_chat_history); +// metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - +// start_tokenizer_time)); m_templated_chat_history = std::move(new_templated_chat_history); // } else { // auto start_tokenizer_time = std::chrono::steady_clock::now(); // encoded_input_ids = m_tokenizer.encode(prompt).input_ids; // auto end_tokenizer_time = std::chrono::steady_clock::now(); -// metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); +// metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - +// start_tokenizer_time)); // } // return encoded_input_ids; // } -} -} +} // namespace phi3_v +} // namespace class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { public: @@ -1484,24 +1502,31 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { // Used to insert <|image_i|>\n per image (not a slice). 
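    // For example (illustrative): with two input images the text prompt is prefixed with
    // "<|image_<id>|>\n<|image_<id+1>|>\n"; the user prompt itself must not contain
    // <|image_i|> tags, which is enforced by the assert in get_inputs_embeds() below.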
size_t m_image_id = 1; - InputsEmbedderPhi3V( - const VLMConfig& vlm_config, - const std::filesystem::path& model_dir, - const std::string& device, - const ov::AnyMap device_config - ): - IInputsEmbedder(vlm_config, model_dir, device, device_config), m_image_id{0}, - m_hd_feature_transformer{phi3_v::create_hd_feature_transformer()}, - m_vision_projection{utils::singleton_core().compile_model(model_dir / "openvino_vision_projection_model.xml", device).create_infer_request()} {} - - ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) override { + InputsEmbedderPhi3V(const VLMConfig& vlm_config, + const std::filesystem::path& model_dir, + const std::string& device, + const ov::AnyMap device_config) + : IInputsEmbedder(vlm_config, model_dir, device, device_config), + m_image_id{0}, + m_hd_feature_transformer{phi3_v::create_hd_feature_transformer()}, + m_vision_projection{utils::singleton_core() + .compile_model(model_dir / "openvino_vision_projection_model.xml", device) + .create_infer_request()} {} + + ov::Tensor get_inputs_embeds(const std::string& prompt, + const std::vector& images, + ov::genai::VLMPerfMetrics& metrics) override { // TODO: perfmetrics - std::cout << prompt<<'\n'; + OPENVINO_ASSERT(!std::regex_search(prompt, std::regex{R"(<\|image_\d+\|>)"}), "<|image_i|> can't be used in the prompt because it's reserved for images"); std::stringstream images_prompt; std::vector images_features_proj; for (const ov::Tensor& image : to_single_image_tensors(images)) { EncodedImage encoded_image = m_vision_encoder.encode(image); - images_features_proj.push_back(phi3_v::hd_feature_transform(encoded_image, m_hd_feature_transformer, m_vlm_config.sub_GN, m_vlm_config.glb_GN, m_vision_projection)); + images_features_proj.push_back(phi3_v::hd_feature_transform(encoded_image, + m_hd_feature_transformer, + m_vlm_config.sub_GN, + m_vlm_config.glb_GN, + m_vision_projection)); images_prompt << "<|image_" << m_image_id << "|>\n"; ++m_image_id; } @@ -1564,27 +1589,26 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { // int64_t* end = ids + encoded_input_size; // float* inputs_embeds_data = inputs_embeds.data(); // for (const EncodedImage& encoded_image : embeds) { - // const ov::Tensor& resampled_source = resample(encoded_image.resized_source, {encoded_image.resized_source_size}); - // float* emb = resampled_source.data(); - // ids = std::find(ids, end, im_start_id); - // OPENVINO_ASSERT(end != ids); + // const ov::Tensor& resampled_source = resample(encoded_image.resized_source, + // {encoded_image.resized_source_size}); float* emb = resampled_source.data(); ids = std::find(ids, + // end, im_start_id); OPENVINO_ASSERT(end != ids); // ++ids; - // std::copy_n(emb, resampled_source.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); - // ids += m_vlm_config.query_num; - // if (encoded_image.slices) { + // std::copy_n(emb, resampled_source.get_size(), inputs_embeds_data + std::distance(begin, ids) * + // m_vlm_config.hidden_size); ids += m_vlm_config.query_num; if (encoded_image.slices) { // size_t token_idx = 0; // const ov::Shape& slices_shape = encoded_image.slices.get_shape(); // for (size_t i = 0; i < slices_shape.at(0); ++i) { // for (size_t ja = 0; ja < slices_shape.at(1); ++ja) { // size_t d2 = slices_shape.at(2); // size_t d3 = slices_shape.at(3); - // ov::Tensor encoded_view{ov::element::f32, {1, d2, d3}, encoded_image.slices.data() + (i * slices_shape.at(1) + ja) * d2 * d3}; - 
// const ov::Tensor& vision_embed_tensor_i_j = resample(encoded_view, {encoded_image.slices_size}); - // ids = std::find(ids, end, slice_start_id); - // OPENVINO_ASSERT(end != ids); + // ov::Tensor encoded_view{ov::element::f32, {1, d2, d3}, encoded_image.slices.data() + + // (i * slices_shape.at(1) + ja) * d2 * d3}; const ov::Tensor& vision_embed_tensor_i_j = + // resample(encoded_view, {encoded_image.slices_size}); ids = std::find(ids, end, + // slice_start_id); OPENVINO_ASSERT(end != ids); // ++ids; - // std::copy_n(vision_embed_tensor_i_j.data(), vision_embed_tensor_i_j.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); - // ids += m_vlm_config.query_num; + // std::copy_n(vision_embed_tensor_i_j.data(), vision_embed_tensor_i_j.get_size(), + // inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); ids += + // m_vlm_config.query_num; // } // } // } @@ -1623,7 +1647,8 @@ InputsEmbedder::InputsEmbedder(const VLMConfig& vlm_config, } else if (vlm_config.model_type == VLMModelType::PHI3_V) { m_impl = std::make_shared(vlm_config, model_dir, device, device_config); } else { - OPENVINO_THROW("Unsupported model type in VLM InputsEmbedder class. Please, create feature request on new model support"); + OPENVINO_THROW( + "Unsupported model type in VLM InputsEmbedder class. Please, create feature request on new model support"); } } @@ -1634,19 +1659,42 @@ InputsEmbedder::InputsEmbedder(const VLMConfig& vlm_config, const std::string& device, const ov::AnyMap device_config) { if (vlm_config.model_type == VLMModelType::MINICPM) { - m_impl = std::make_shared(vlm_config, models_map, tokenizer, config_dir_path, device, device_config); + m_impl = std::make_shared(vlm_config, + models_map, + tokenizer, + config_dir_path, + device, + device_config); } else if (vlm_config.model_type == VLMModelType::LLAVA) { - m_impl = std::make_shared(vlm_config, models_map, tokenizer, config_dir_path, device, device_config); + m_impl = std::make_shared(vlm_config, + models_map, + tokenizer, + config_dir_path, + device, + device_config); } else if (vlm_config.model_type == VLMModelType::LLAVA_NEXT) { - m_impl = std::make_shared(vlm_config, models_map, tokenizer, config_dir_path, device, device_config); + m_impl = std::make_shared(vlm_config, + models_map, + tokenizer, + config_dir_path, + device, + device_config); } else if (vlm_config.model_type == VLMModelType::INTERNVL_CHAT) { - m_impl = std::make_shared(vlm_config, models_map, tokenizer, config_dir_path, device, device_config); + m_impl = std::make_shared(vlm_config, + models_map, + tokenizer, + config_dir_path, + device, + device_config); } else { - OPENVINO_THROW("Unsupported model type in VLM InputsEmbedder class. Please, create feature request on new model support"); + OPENVINO_THROW( + "Unsupported model type in VLM InputsEmbedder class. 
Please, create feature request on new model support"); } } -ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) { +ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, + const std::vector& images, + ov::genai::VLMPerfMetrics& metrics) { return m_impl->get_inputs_embeds(prompt, images, metrics); } @@ -1658,7 +1706,10 @@ std::vector InputsEmbedder::get_tokenized_history() const { return m_impl->get_tokenized_history(); } -void InputsEmbedder::update_tokenized_history(const std::vector& encoded_result, std::optional last_disappeared_token, bool is_beam_search, size_t last_answer_len) { +void InputsEmbedder::update_tokenized_history(const std::vector& encoded_result, + std::optional last_disappeared_token, + bool is_beam_search, + size_t last_answer_len) { return m_impl->update_tokenized_history(encoded_result, last_disappeared_token, is_beam_search, last_answer_len); } @@ -1682,4 +1733,4 @@ void InputsEmbedder::finish_chat() { return m_impl->finish_chat(); } -} // namespace ov::genai +} // namespace ov::genai From 524982f4715f99f4a882761945a047937726d26f Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Wed, 8 Jan 2025 14:56:30 +0400 Subject: [PATCH 10/28] Revert "code style" This reverts commit 83834a24a027f4243de3a670bdf8fd79c165fa08. --- .../src/visual_language/inputs_embedder.cpp | 931 +++++++++--------- 1 file changed, 440 insertions(+), 491 deletions(-) diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index 4404ddfe27..b1027c533b 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -1,16 +1,16 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 +#include "openvino/genai/visual_language/perf_metrics.hpp" #include "visual_language/inputs_embedder.hpp" -#include - -#include "openvino/genai/visual_language/perf_metrics.hpp" -#include "openvino/opsets/opset13.hpp" -#include "utils.hpp" #include "visual_language/clip.hpp" -#include "visual_language/embedding_model.hpp" #include "visual_language/vision_encoder.hpp" +#include "visual_language/embedding_model.hpp" +#include "openvino/opsets/opset13.hpp" + +#include "utils.hpp" +#include namespace ov::genai { @@ -40,15 +40,12 @@ class InputsEmbedder::IInputsEmbedder { // Tail of previous output for LM in chat mode is missing in KV cache. 
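    // Illustrative example: if the previous generate() call stopped because the length
    // limit was reached, the last sampled token was returned to the caller but never fed
    // back to the model, so it is remembered here and pushed to the front of the next
    // request's input ids.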
std::optional m_last_disappeared_token = std::nullopt; // If sequence contains some symbols, which could be ambiguous encoded by tokenizer, we need to trim kv cache - // If we use beam search sampling with chat mode we need to remove last answer of the model from kv cache and add - // best answer to history so, let's keep info about amount of tokens to trim from kv cache and amount of tokens to - // keep in history + // If we use beam search sampling with chat mode we need to remove last answer of the model from kv cache and add best answer to history + // so, let's keep info about amount of tokens to trim from kv cache and amount of tokens to keep in history ov::genai::utils::HistoryRemoveManager m_kv_history_manager = {0, 0}; public: - virtual ov::Tensor get_inputs_embeds(const std::string& prompt, - const std::vector& images, - ov::genai::VLMPerfMetrics& metrics) = 0; + virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) = 0; EmbeddingsModel get_embedding_model() const { return m_embedding; @@ -66,10 +63,7 @@ class InputsEmbedder::IInputsEmbedder { return m_kv_history_manager.num_tokens_to_remove_from_kv_cache; } - void update_tokenized_history(const std::vector& encoded_result, - std::optional last_disappeared_token, - bool is_beam_search, - size_t last_answer_len) { + void update_tokenized_history(const std::vector& encoded_result, std::optional last_disappeared_token, bool is_beam_search, size_t last_answer_len) { if (is_beam_search) { m_kv_history_manager.trusted_history_length = m_tokenized_history.size(); m_kv_history_manager.num_tokens_to_remove_from_kv_cache = last_answer_len; @@ -78,7 +72,7 @@ class InputsEmbedder::IInputsEmbedder { } m_last_disappeared_token = last_disappeared_token; - + std::copy(encoded_result.begin(), encoded_result.end(), std::back_inserter(m_tokenized_history)); } @@ -115,48 +109,52 @@ class InputsEmbedder::IInputsEmbedder { } protected: - IInputsEmbedder(const VLMConfig& vlm_config, - const std::filesystem::path& model_dir, - const std::string& device, - const ov::AnyMap device_config) - : m_vlm_config{vlm_config}, - m_vision_encoder(model_dir, m_vlm_config.model_type, device, device_config), - m_embedding(model_dir, m_vlm_config.scale_emb, device, device_config), - m_tokenizer{model_dir, device_config} {} - - IInputsEmbedder(const VLMConfig& vlm_config, - const ModelsMap& models_map, - const Tokenizer& tokenizer, - const std::filesystem::path& config_dir_path, - const std::string& device, - const ov::AnyMap device_config) - : m_vlm_config{vlm_config}, - m_vision_encoder(get_model_weights_pair(models_map, "vision_embeddings").first, - get_model_weights_pair(models_map, "vision_embeddings").second, - config_dir_path, - m_vlm_config.model_type, - device, - device_config), - m_embedding(get_model_weights_pair(models_map, "text_embeddings").first, - get_model_weights_pair(models_map, "text_embeddings").second, - m_vlm_config.scale_emb, - device, - device_config), - m_tokenizer(tokenizer) {} - - ov::Tensor get_encoded_input_ids(const std::string& prompt, - ov::genai::VLMPerfMetrics& metrics, - const std::string& chat_template_fallback = "") { + IInputsEmbedder( + const VLMConfig& vlm_config, + const std::filesystem::path& model_dir, + const std::string& device, + const ov::AnyMap device_config) : + m_vlm_config{vlm_config}, + m_vision_encoder(model_dir, m_vlm_config.model_type, device, device_config), + m_embedding(model_dir, m_vlm_config.scale_emb, device, device_config), + 
m_tokenizer{model_dir, device_config} { } + + IInputsEmbedder( + const VLMConfig& vlm_config, + const ModelsMap& models_map, + const Tokenizer& tokenizer, + const std::filesystem::path& config_dir_path, + const std::string& device, + const ov::AnyMap device_config) : + m_vlm_config{vlm_config}, + m_vision_encoder( + get_model_weights_pair(models_map, "vision_embeddings").first, + get_model_weights_pair(models_map, "vision_embeddings").second, + config_dir_path, + m_vlm_config.model_type, + device, + device_config + ), + m_embedding( + get_model_weights_pair(models_map, "text_embeddings").first, + get_model_weights_pair(models_map, "text_embeddings").second, + m_vlm_config.scale_emb, + device, + device_config + ), + m_tokenizer(tokenizer) { } + + ov::Tensor get_encoded_input_ids(const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = "") { ov::Tensor encoded_input_ids; if (m_is_chat_conversation) { // KV cache in model already contains prompts and answers from previous iterations. // So only new prompt wrapped into chat template to be sent into model. Tokenizer always returns // token_ids = {, ...}. So if tokenizer applies only to the new prompt, // will be inserted on every iteration. - // So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new - // prompt and takes only the difference between them. The chat history cannot be saved as already encoded - // tokens because generate call doesn't return token, but KV cache contains it. So we have to add it - // manually or get it by tokenization all chat history. + // So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new prompt + // and takes only the difference between them. + // The chat history cannot be saved as already encoded tokens because generate call doesn't return token, but + // KV cache contains it. So we have to add it manually or get it by tokenization all chat history. 
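            // Worked example with made-up token ids: if the previous templated history encodes
            // to {<bos>, 5, 7, 9} and the history with the new user prompt appended encodes to
            // {<bos>, 5, 7, 9, 11, 13}, only the suffix {11, 13} needs to be embedded and sent
            // to the model, because the KV cache already holds the common prefix.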
m_history.push_back({{"role", "user"}, {"content", prompt}}); constexpr bool add_generation_prompt = true; std::string new_templated_chat_history; @@ -164,24 +162,19 @@ class InputsEmbedder::IInputsEmbedder { new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); } catch (const std::exception& error) { // Use fallback chat template if it was not found in tokenizer_config.json - new_templated_chat_history = - m_tokenizer.apply_chat_template(m_history, add_generation_prompt, chat_template_fallback); + new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt, chat_template_fallback); } auto start_tokenizer_time = std::chrono::steady_clock::now(); - ov::Tensor new_chat_tokens = - m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(false)).input_ids; - TokenizedInputs prev_chat_tokens = - m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(false)); + ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(false)).input_ids; + TokenizedInputs prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(false)); // some symbols combinations can be encoded by the tokenizer in different ways - // if we met sequence with such combination of symbols, we cannot correctly subtract the new history from - // the old history so let's check it out, find the trusted part and use it in on the next step + // if we met sequence with such combination of symbols, we cannot correctly subtract the new history from the old history + // so let's check it out, find the trusted part and use it in on the next step size_t trusted_history_length = 0; if (!m_tokenized_history.empty()) { std::set stop_tokens = {m_tokenizer.get_eos_token_id()}; - trusted_history_length = ov::genai::utils::get_first_history_difference(prev_chat_tokens.input_ids, - m_tokenized_history, - stop_tokens); + trusted_history_length = ov::genai::utils::get_first_history_difference(prev_chat_tokens.input_ids, m_tokenized_history, stop_tokens); } if (m_tokenized_history.empty()) { @@ -189,94 +182,81 @@ class InputsEmbedder::IInputsEmbedder { } else if (trusted_history_length != SIZE_MAX || m_kv_history_manager.does_kv_cache_need_to_update()) { // does_kv_cache_need_to_update will be true here if beam search is activated - // in beam search mode we want to remove all history about last model answer from kv cache and add the - // best answer directly if we have difference in model answer and decoded answer it anyway will be less - // then entire history, so let's use data from m_kv_history_manager + // in beam search mode we want to remove all history about last model answer from kv cache and add the best answer directly + // if we have difference in model answer and decoded answer it anyway will be less then entire history, so let's use data from m_kv_history_manager if (m_kv_history_manager.does_kv_cache_need_to_update()) { trusted_history_length = m_kv_history_manager.trusted_history_length; } else { - m_kv_history_manager.num_tokens_to_remove_from_kv_cache = - m_tokenized_history.size() - trusted_history_length; - // if prev generation was finished because of max len was reached, kv cache is missed one last - // token, let's keep it - m_kv_history_manager.num_tokens_to_remove_from_kv_cache -= - m_last_disappeared_token.has_value() ? 
1 : 0; + m_kv_history_manager.num_tokens_to_remove_from_kv_cache = m_tokenized_history.size() - trusted_history_length; + // if prev generation was finished because of max len was reached, kv cache is missed one last token, let's keep it + m_kv_history_manager.num_tokens_to_remove_from_kv_cache -= m_last_disappeared_token.has_value() ? 1 : 0; } ov::Tensor new_tensor = ov::Tensor(new_chat_tokens.get_element_type(), {1, new_chat_tokens.get_shape().at(1) - trusted_history_length}, new_chat_tokens.data() + trusted_history_length); encoded_input_ids = ov::Tensor(new_chat_tokens.get_element_type(), - {1, new_chat_tokens.get_shape().at(1) - trusted_history_length}); + {1, new_chat_tokens.get_shape().at(1) - trusted_history_length}); new_tensor.copy_to(encoded_input_ids); } else { - encoded_input_ids = - utils::subtract_chat_tokenized_inputs({new_chat_tokens}, prev_chat_tokens).input_ids; + encoded_input_ids = utils::subtract_chat_tokenized_inputs( + {new_chat_tokens}, prev_chat_tokens + ).input_ids; if (m_last_disappeared_token.has_value()) - encoded_input_ids = - ov::genai::utils::push_front_inputs(encoded_input_ids, *m_last_disappeared_token); + encoded_input_ids = ov::genai::utils::push_front_inputs(encoded_input_ids, *m_last_disappeared_token); } auto end_tokenizer_time = std::chrono::steady_clock::now(); - metrics.raw_metrics.tokenization_durations.emplace_back( - PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); + metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); m_templated_chat_history = std::move(new_templated_chat_history); m_tokenized_history.clear(); - std::copy_n(new_chat_tokens.data(), - new_chat_tokens.get_size(), - std::back_inserter(m_tokenized_history)); + std::copy_n(new_chat_tokens.data(), new_chat_tokens.get_size(), std::back_inserter(m_tokenized_history)); } else { auto start_tokenizer_time = std::chrono::steady_clock::now(); encoded_input_ids = m_tokenizer.encode(prompt).input_ids; auto end_tokenizer_time = std::chrono::steady_clock::now(); - metrics.raw_metrics.tokenization_durations.emplace_back( - PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); + metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); m_tokenized_history.clear(); - std::copy_n(encoded_input_ids.data(), - encoded_input_ids.get_size(), - std::back_inserter(m_tokenized_history)); + std::copy_n(encoded_input_ids.data(), encoded_input_ids.get_size(), std::back_inserter(m_tokenized_history)); } return encoded_input_ids; } /** - * @brief Unpads an image tensor of a padded and resized image. - * Used for packing image features of llava_next models. - * - * @param tensor An image tensor with a shape (embed_dim, height, width) - * @param original_size A size of original image - * @return An unpadded image tensor with a shape (embed_dim, new_height, new_width) - */ + * @brief Unpads an image tensor of a padded and resized image. + * Used for packing image features of llava_next models. + * + * @param tensor An image tensor with a shape (embed_dim, height, width) + * @param original_size A size of original image + * @return An unpadded image tensor with a shape (embed_dim, new_height, new_width) + */ /** - * @brief Converts a vector of batched images ([NHWC]) into a vector of individual image tensors ([1HWC]). - * - * @param images A vector of tensors representing the images. 
Each tensor can have a shape of either [NHWC] or - * [HWC]. - * @return A vector of tensors where each tensor represents a single image with a shape of [1, H, W, C]. - */ + * @brief Converts a vector of batched images ([NHWC]) into a vector of individual image tensors ([1HWC]). + * + * @param images A vector of tensors representing the images. Each tensor can have a shape of either [NHWC] or [HWC]. + * @return A vector of tensors where each tensor represents a single image with a shape of [1, H, W, C]. + */ std::vector to_single_image_tensors(const std::vector& images) { std::vector single_image_tensors; for (const auto& image : images) { ov::Tensor reshaped_image = image; ov::Shape image_shape = image.get_shape(); switch (image_shape.size()) { - case 3: - reshaped_image.set_shape({1, image_shape.at(0), image_shape.at(1), image_shape.at(2)}); - break; - case 4: - break; - default: - OPENVINO_THROW("Input image must have [NHWC] or [HWC] layout"); + case 3: + reshaped_image.set_shape({1, image_shape.at(0), image_shape.at(1), image_shape.at(2)}); + break; + case 4: break; + default: OPENVINO_THROW("Input image must have [NHWC] or [HWC] layout"); } ov::Shape reshaped_image_shape = reshaped_image.get_shape(); for (size_t batch_idx = 0; batch_idx < reshaped_image_shape.at(0); ++batch_idx) { ov::Tensor single_image{ reshaped_image.get_element_type(), {1, reshaped_image_shape.at(1), reshaped_image_shape.at(2), reshaped_image_shape.at(3)}, - reshaped_image.data() + batch_idx * reshaped_image_shape.at(1) * - reshaped_image_shape.at(2) * reshaped_image_shape.at(3)}; + reshaped_image.data() + batch_idx * reshaped_image_shape.at(1) * reshaped_image_shape.at(2) * reshaped_image_shape.at(3) + }; single_image_tensors.push_back(std::move(single_image)); } } @@ -297,11 +277,12 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { size_t m_image_id = 0; public: - InputsEmbedderMiniCPM(const VLMConfig& vlm_config, - const std::filesystem::path& model_dir, - const std::string& device, - const ov::AnyMap device_config) - : IInputsEmbedder(vlm_config, model_dir, device, device_config) { + InputsEmbedderMiniCPM( + const VLMConfig& vlm_config, + const std::filesystem::path& model_dir, + const std::string& device, + const ov::AnyMap device_config) : + IInputsEmbedder(vlm_config, model_dir, device, device_config) { auto compiled_model = utils::singleton_core().compile_model(model_dir / "openvino_resampler_model.xml", device, device_config); ov::genai::utils::print_compiled_model_properties(compiled_model, "VLM resampler model"); @@ -310,26 +291,25 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70}); } - InputsEmbedderMiniCPM(const VLMConfig& vlm_config, - const ModelsMap& models_map, - const Tokenizer& tokenizer, - const std::filesystem::path& config_dir_path, - const std::string& device, - const ov::AnyMap device_config) - : IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { - m_resampler = utils::singleton_core() - .compile_model(get_model_weights_pair(models_map, "resampler").first, - get_model_weights_pair(models_map, "resampler").second, - device, - device_config) - .create_infer_request(); - - m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70}); - } + InputsEmbedderMiniCPM( + const VLMConfig& vlm_config, + const ModelsMap& models_map, + const Tokenizer& tokenizer, + const std::filesystem::path& config_dir_path, + const 
std::string& device, + const ov::AnyMap device_config) : + IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { + m_resampler = utils::singleton_core().compile_model( + get_model_weights_pair(models_map, "resampler").first, + get_model_weights_pair(models_map, "resampler").second, + device, + device_config + ).create_infer_request(); + + m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70}); + } - virtual ov::Tensor get_inputs_embeds(const std::string& prompt, - const std::vector& images, - ov::genai::VLMPerfMetrics& metrics) override { + virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) override { std::string images_prompt; std::vector embeds; @@ -367,18 +347,24 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { ov::Tensor encoded_input = get_encoded_input_ids(images_prompt, metrics); ov::Tensor inputs_embeds = m_embedding.infer(encoded_input); - OPENVINO_ASSERT(m_vlm_config.hidden_size == inputs_embeds.get_shape().at(2), "Unexpected embedding size"); + OPENVINO_ASSERT( + m_vlm_config.hidden_size == inputs_embeds.get_shape().at(2), + "Unexpected embedding size" + ); auto start_tokenizer_time = std::chrono::steady_clock::now(); - ov::Tensor special_tokens = - m_tokenizer - .encode(m_vlm_config.im_start + m_vlm_config.im_end + m_vlm_config.slice_start + m_vlm_config.slice_end) - .input_ids; + ov::Tensor special_tokens = m_tokenizer.encode( + m_vlm_config.im_start + + m_vlm_config.im_end + + m_vlm_config.slice_start + + m_vlm_config.slice_end + ).input_ids; auto end_tokenizer_time = std::chrono::steady_clock::now(); OPENVINO_ASSERT(metrics.raw_metrics.tokenization_durations.size() > 0); - metrics.raw_metrics.tokenization_durations[metrics.raw_metrics.tokenization_durations.size() - 1] += - ov::genai::MicroSeconds(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); - OPENVINO_ASSERT(4 == special_tokens.get_shape().at(1), - "Every special token must be represented with a single int."); + metrics.raw_metrics.tokenization_durations[metrics.raw_metrics.tokenization_durations.size() - 1] += ov::genai::MicroSeconds(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); + OPENVINO_ASSERT( + 4 == special_tokens.get_shape().at(1), + "Every special token must be represented with a single int." 
+ ); int64_t im_start_id = special_tokens.data()[0]; int64_t im_end_id = special_tokens.data()[1]; int64_t slice_start_id = special_tokens.data()[2]; @@ -390,15 +376,12 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { int64_t* end = ids + encoded_input_size; float* inputs_embeds_data = inputs_embeds.data(); for (const EncodedImage& encoded_image : embeds) { - const ov::Tensor& resampled_source = - resample(encoded_image.resized_source, {encoded_image.resized_source_size}); + const ov::Tensor& resampled_source = resample(encoded_image.resized_source, {encoded_image.resized_source_size}); float* emb = resampled_source.data(); ids = std::find(ids, end, im_start_id); OPENVINO_ASSERT(end != ids); ++ids; - std::copy_n(emb, - resampled_source.get_size(), - inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); + std::copy_n(emb, resampled_source.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); ids += m_vlm_config.query_num; if (encoded_image.slices) { size_t token_idx = 0; @@ -407,17 +390,12 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { for (size_t ja = 0; ja < slices_shape.at(1); ++ja) { size_t d2 = slices_shape.at(2); size_t d3 = slices_shape.at(3); - ov::Tensor encoded_view{ - ov::element::f32, - {1, d2, d3}, - encoded_image.slices.data() + (i * slices_shape.at(1) + ja) * d2 * d3}; + ov::Tensor encoded_view{ov::element::f32, {1, d2, d3}, encoded_image.slices.data() + (i * slices_shape.at(1) + ja) * d2 * d3}; const ov::Tensor& vision_embed_tensor_i_j = resample(encoded_view, {encoded_image.slices_size}); ids = std::find(ids, end, slice_start_id); OPENVINO_ASSERT(end != ids); ++ids; - std::copy_n(vision_embed_tensor_i_j.data(), - vision_embed_tensor_i_j.get_size(), - inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); + std::copy_n(vision_embed_tensor_i_j.data(), vision_embed_tensor_i_j.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); ids += m_vlm_config.query_num; } } @@ -447,7 +425,11 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { std::transform(target_sizes.begin(), target_sizes.end(), patch_len.begin(), [](const ImageSize& height_width) { return height_width.height * height_width.width; }); - adjust_pos_cache(target_sizes, m_vlm_config.hidden_size, m_pos_embed_cache); + adjust_pos_cache( + target_sizes, + m_vlm_config.hidden_size, + m_pos_embed_cache + ); size_t max_patch_len = *std::max_element(patch_len.begin(), patch_len.end()); ov::Tensor key_padding_mask(ov::element::f32, {bs, max_patch_len}); float* mask_data = key_padding_mask.data(); @@ -462,9 +444,11 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { size_t target_w = target_sizes.at(i).width; for (size_t h_idx = 0; h_idx < target_h; ++h_idx) { for (size_t w_idx = 0; w_idx < target_w; ++w_idx) { - std::copy_n(cache_data + (h_idx * _d1 + w_idx) * embed_len, - embed_len, - pos_embed_data + (h_idx * target_w + w_idx) * bs * embed_len + i * embed_len); + std::copy_n( + cache_data + (h_idx * _d1 + w_idx) * embed_len, + embed_len, + pos_embed_data + (h_idx * target_w + w_idx) * bs * embed_len + i * embed_len + ); } } for (size_t flat = target_h * target_w; flat < max_patch_len; ++flat) { @@ -473,8 +457,8 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { std::fill_n(mask_data + i * max_patch_len, patch_len[i], 0.0f); std::fill_n(mask_data + i * max_patch_len + patch_len[i], max_patch_len - 
patch_len[i], 1.0f); } - m_resampler.set_tensor("image_feature", encoded_image); // [N, H*W, old_hidden_size] - m_resampler.set_tensor("pos_embed", pos_embed); // [H*W, N, new_hidden_size] + m_resampler.set_tensor("image_feature", encoded_image); // [N, H*W, old_hidden_size] + m_resampler.set_tensor("pos_embed", pos_embed); // [H*W, N, new_hidden_size] m_resampler.set_tensor("key_padding_mask", key_padding_mask); // [N, H*W] m_resampler.infer(); return m_resampler.get_output_tensor(); // [N, query_num, new_hidden_size] @@ -494,12 +478,12 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { for (size_t j = 0; j < res_d_1; ++j) { size_t k = 0; for (; k < first.get_shape().at(2); ++k) { - res_data[i * res_d_1 * res_d_2 + j * res_d_2 + k] = - first_data[i * res_d_1 * first.get_shape().at(2) + j * first.get_shape().at(2) + k]; + res_data[i * res_d_1 * res_d_2 + j * res_d_2 + k] + = first_data[i * res_d_1 * first.get_shape().at(2) + j * first.get_shape().at(2) + k]; } for (size_t l = 0; l < second.get_shape().at(2); ++l, ++k) { - res_data[i * res_d_1 * res_d_2 + j * res_d_2 + k] = - second_data[i * res_d_1 * second.get_shape().at(2) + j * second.get_shape().at(2) + l]; + res_data[i * res_d_1 * res_d_2 + j * res_d_2 + k] + = second_data[i * res_d_1 * second.get_shape().at(2) + j * second.get_shape().at(2) + l]; } } } @@ -545,14 +529,16 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { ov::Shape grid_shape = grid.get_shape(); float* grid_data = grid.data(); ov::Shape plane_shape{grid_shape.at(1), grid_shape.at(2)}; - ov::Tensor emb_h = - get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, - ov::Tensor{ov::element::f32, plane_shape, grid_data}); // (H, W, D/2) - ov::Tensor emb_w = get_1d_sincos_pos_embed_from_grid_new( - embed_dim / 2, - ov::Tensor{ov::element::f32, - plane_shape, - grid_data + plane_shape.at(0) * plane_shape.at(1)}); // (H, W, D/2) + ov::Tensor emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, ov::Tensor{ + ov::element::f32, + plane_shape, + grid_data + }); // (H, W, D/2) + ov::Tensor emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, ov::Tensor{ + ov::element::f32, + plane_shape, + grid_data + plane_shape.at(0) * plane_shape.at(1) + }); // (H, W, D/2) return concatenate_last_dim(emb_h, emb_w); } @@ -574,19 +560,17 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { return get_2d_sincos_pos_embed_from_grid(embed_dim, grid); } - void adjust_pos_cache(const std::vector& target_sizes, size_t hidden_size, ov::Tensor& pos_embed_cache) { - size_t max_h = std::max_element(target_sizes.begin(), - target_sizes.end(), - [](const ImageSize& left, const ImageSize& right) { - return left.height < right.height; - }) - ->height; - size_t max_w = std::max_element(target_sizes.begin(), - target_sizes.end(), - [](const ImageSize& left, const ImageSize& right) { - return left.width < right.width; - }) - ->width; + void adjust_pos_cache( + const std::vector& target_sizes, + size_t hidden_size, + ov::Tensor& pos_embed_cache + ) { + size_t max_h = std::max_element(target_sizes.begin(), target_sizes.end(), [](const ImageSize& left, const ImageSize& right) { + return left.height < right.height; + })->height; + size_t max_w = std::max_element(target_sizes.begin(), target_sizes.end(), [](const ImageSize& left, const ImageSize& right) { + return left.width < right.width; + })->width; size_t allocated_height, allocated_width; if (pos_embed_cache) { const ov::Shape& allocated_shape = pos_embed_cache.get_shape(); @@ 
-598,37 +582,36 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { if (max_h > allocated_height || max_w > allocated_width) { allocated_height = std::max(max_h, allocated_height); allocated_width = std::max(max_w, allocated_width); - pos_embed_cache = get_2d_sincos_pos_embed(hidden_size, {allocated_height, allocated_width}); + pos_embed_cache = get_2d_sincos_pos_embed( + hidden_size, {allocated_height, allocated_width} + ); } } }; class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder { public: - InputsEmbedderLLaVA(const VLMConfig& vlm_config, - const std::filesystem::path& model_dir, - const std::string& device, - const ov::AnyMap device_config) - : IInputsEmbedder(vlm_config, model_dir, device, device_config) {} - - InputsEmbedderLLaVA(const VLMConfig& vlm_config, - const ModelsMap& models_map, - const Tokenizer& tokenizer, - const std::filesystem::path& config_dir_path, - const std::string& device, - const ov::AnyMap device_config) - : IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) {} - - virtual ov::Tensor get_inputs_embeds(const std::string& prompt, - const std::vector& images, - ov::genai::VLMPerfMetrics& metrics) override { + InputsEmbedderLLaVA( + const VLMConfig& vlm_config, + const std::filesystem::path& model_dir, + const std::string& device, + const ov::AnyMap device_config) : + IInputsEmbedder(vlm_config, model_dir, device, device_config) { } + + InputsEmbedderLLaVA( + const VLMConfig& vlm_config, + const ModelsMap& models_map, + const Tokenizer& tokenizer, + const std::filesystem::path& config_dir_path, + const std::string& device, + const ov::AnyMap device_config) : + IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { } + + virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) override { std::string image_token = m_vlm_config.im_start; // Adapted from llava-1.5-7b-hf chat_template.json - std::string chat_template_fallback = - "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' ' " - "}}{% else %}{{ 'ASSISTANT: ' + message['content'] + ' ' }}{% endif %}{% endfor %}{% if " - "add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}"; - + std::string chat_template_fallback = "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' ' }}{% else %}{{ 'ASSISTANT: ' + message['content'] + ' ' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}"; + std::vector single_images = to_single_image_tensors(images); std::string formatted_prompt; @@ -649,21 +632,21 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder { return text_embeds; } auto start_tokenizer_time = std::chrono::steady_clock::now(); - ov::Tensor encoded_image_token = - m_tokenizer.encode(m_vlm_config.im_start, ov::genai::add_special_tokens(false)).input_ids; + ov::Tensor encoded_image_token = m_tokenizer.encode(m_vlm_config.im_start, ov::genai::add_special_tokens(false)).input_ids; auto end_tokenizer_time = std::chrono::steady_clock::now(); OPENVINO_ASSERT(metrics.raw_metrics.tokenization_durations.size() > 0); - metrics.raw_metrics.tokenization_durations[metrics.raw_metrics.tokenization_durations.size() - 1] += - ov::genai::MicroSeconds(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); + 
metrics.raw_metrics.tokenization_durations[metrics.raw_metrics.tokenization_durations.size() - 1] += ov::genai::MicroSeconds(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); int64_t image_token_id = encoded_image_token.data()[encoded_image_token.get_size() - 1]; return merge_text_and_image_embeddings_llava(input_ids, text_embeds, image_embeds, image_token_id); } protected: - ov::Tensor merge_text_and_image_embeddings_llava(const ov::Tensor& input_ids, - const ov::Tensor& text_embeds, - const std::vector& image_embeds, - int64_t image_token_id) { + ov::Tensor merge_text_and_image_embeddings_llava( + const ov::Tensor& input_ids, + const ov::Tensor& text_embeds, + const std::vector& image_embeds, + int64_t image_token_id + ) { auto text_embeds_shape = text_embeds.get_shape(); size_t text_embeds_seq_length = text_embeds_shape[1]; size_t hidden_size = text_embeds_shape[2]; @@ -678,18 +661,22 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder { } } auto num_images = image_embeds.size(); - OPENVINO_ASSERT(num_image_tokens == num_images, - "Number of image tokens in input_ids different from num_images."); + OPENVINO_ASSERT( + num_image_tokens == num_images, + "Number of image tokens in input_ids different from num_images." + ); size_t total_image_seq_length = 0; for (const auto& single_image_embeds : image_embeds) { - OPENVINO_ASSERT(text_embeds_shape[2] == single_image_embeds.get_shape().at(2), - "Incompatible shapes between text_embeds and image_embeds"); + OPENVINO_ASSERT( + text_embeds_shape[2] == single_image_embeds.get_shape().at(2), + "Incompatible shapes between text_embeds and image_embeds" + ); total_image_seq_length += single_image_embeds.get_shape().at(1); } size_t merged_seq_length = text_embeds_seq_length + total_image_seq_length - num_image_tokens; - constexpr size_t BATCH_SIZE = 1; + constexpr size_t BATCH_SIZE = 1; ov::Tensor merged_embeds(text_embeds.get_element_type(), {BATCH_SIZE, merged_seq_length, hidden_size}); float* merged_data = merged_embeds.data(); @@ -700,11 +687,15 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder { const float* image_embeds_data = image_embeds[image_idx].data(); size_t image_seq_length = image_embeds[image_idx].get_shape()[1]; - std::copy_n(image_embeds_data, image_seq_length * hidden_size, merged_data + merged_idx * hidden_size); + std::copy_n(image_embeds_data, + image_seq_length * hidden_size, + merged_data + merged_idx * hidden_size); merged_idx += image_seq_length; image_idx++; } else { - std::copy_n(text_embeds_data + s * hidden_size, hidden_size, merged_data + merged_idx * hidden_size); + std::copy_n(text_embeds_data + s * hidden_size, + hidden_size, + merged_data + merged_idx * hidden_size); merged_idx++; } } @@ -714,36 +705,33 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder { class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { public: - InputsEmbedderLLaVANext(const VLMConfig& vlm_config, - const std::filesystem::path& model_dir, - const std::string& device, - const ov::AnyMap device_config) - : InputsEmbedderLLaVA(vlm_config, model_dir, device, device_config) {} - - InputsEmbedderLLaVANext(const VLMConfig& vlm_config, - const ModelsMap& models_map, - const Tokenizer& tokenizer, - const std::filesystem::path& config_dir_path, - const std::string& device, - const ov::AnyMap device_config) - : InputsEmbedderLLaVA(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) {} - - virtual ov::Tensor get_inputs_embeds(const std::string& 
prompt, - const std::vector& images, - ov::genai::VLMPerfMetrics& metrics) override { + InputsEmbedderLLaVANext( + const VLMConfig& vlm_config, + const std::filesystem::path& model_dir, + const std::string& device, + const ov::AnyMap device_config) : + InputsEmbedderLLaVA(vlm_config, model_dir, device, device_config) { } + + InputsEmbedderLLaVANext( + const VLMConfig& vlm_config, + const ModelsMap& models_map, + const Tokenizer& tokenizer, + const std::filesystem::path& config_dir_path, + const std::string& device, + const ov::AnyMap device_config) : + InputsEmbedderLLaVA(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { } + + virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) override { std::string image_token = m_vlm_config.im_start; // Adapted from llava-1.5-7b-hf chat_template.json - std::string chat_template_fallback = - "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' ' " - "}}{% else %}{{ 'ASSISTANT: ' + message['content'] + ' ' }}{% endif %}{% endfor %}{% if " - "add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}"; + std::string chat_template_fallback = "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' ' }}{% else %}{{ 'ASSISTANT: ' + message['content'] + ' ' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}"; std::vector single_images = to_single_image_tensors(images); std::string formatted_prompt; std::vector image_embeds; image_embeds.reserve(single_images.size()); - + ov::Tensor image_newline; for (const auto& image : single_images) { @@ -756,10 +744,9 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { std::copy(m_vlm_config.image_newline.begin(), m_vlm_config.image_newline.end(), image_newline_data); } - ImageSize original_image_size{image.get_shape().at(1), image.get_shape().at(2)}; // [height, width] + ImageSize original_image_size{image.get_shape().at(1), image.get_shape().at(2)}; // [height, width] - ov::Tensor packed_features = - pack_image_features_llava_next(encoded_image, original_image_size, image_newline); + ov::Tensor packed_features = pack_image_features_llava_next(encoded_image, original_image_size, image_newline); image_embeds.push_back(std::move(packed_features)); formatted_prompt += image_token + "\n"; @@ -773,29 +760,29 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { return text_embeds; } auto start_tokenizer_time = std::chrono::steady_clock::now(); - ov::Tensor encoded_image_token = - m_tokenizer.encode(m_vlm_config.im_start, ov::genai::add_special_tokens(false)).input_ids; + ov::Tensor encoded_image_token = m_tokenizer.encode(m_vlm_config.im_start, ov::genai::add_special_tokens(false)).input_ids; auto end_tokenizer_time = std::chrono::steady_clock::now(); OPENVINO_ASSERT(metrics.raw_metrics.tokenization_durations.size() > 0); - metrics.raw_metrics.tokenization_durations[metrics.raw_metrics.tokenization_durations.size() - 1] += - ov::genai::MicroSeconds(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); + metrics.raw_metrics.tokenization_durations[metrics.raw_metrics.tokenization_durations.size() - 1] += ov::genai::MicroSeconds(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); int64_t image_token_id = encoded_image_token.data()[encoded_image_token.get_size() - 1]; return merge_text_and_image_embeddings_llava(input_ids, text_embeds, image_embeds, 
image_token_id); } private: /** - * @brief Processes base and patches image features extracted from encoded image. - * Used in getting inputs embeds for llava_next models. - * - * @param encoded_image An encoded image retrieved from vision encoder - * @param original_image_size A size of the original image - * @param image_newline An image newline tensor with a shape (embed_dim) - * @return A tensor with a shape (1, new_seq_len, embed_dim) - */ - ov::Tensor pack_image_features_llava_next(const EncodedImage& encoded_image, - const ImageSize& original_image_size, - const ov::Tensor& image_newline) { + * @brief Processes base and patches image features extracted from encoded image. + * Used in getting inputs embeds for llava_next models. + * + * @param encoded_image An encoded image retrieved from vision encoder + * @param original_image_size A size of the original image + * @param image_newline An image newline tensor with a shape (embed_dim) + * @return A tensor with a shape (1, new_seq_len, embed_dim) + */ + ov::Tensor pack_image_features_llava_next( + const EncodedImage& encoded_image, + const ImageSize& original_image_size, + const ov::Tensor& image_newline + ) { auto image_feature = encoded_image.resized_source; auto image_feature_shape = image_feature.get_shape(); size_t num_patches = image_feature_shape[0]; @@ -813,12 +800,11 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { std::copy(src_data, src_data + patch_seq_len * embed_dim, dst_data); // Extract other grid patches - ov::Tensor patches_image_feature(image_feature.get_element_type(), - {num_patches - 1, patch_seq_len, embed_dim}); + ov::Tensor patches_image_feature(image_feature.get_element_type(), {num_patches - 1, patch_seq_len, embed_dim}); dst_data = patches_image_feature.data(); std::copy(src_data + patch_seq_len * embed_dim, - src_data + num_patches * patch_seq_len * embed_dim, - dst_data); + src_data + num_patches * patch_seq_len * embed_dim, + dst_data); // Process grid patches image feature size_t height = encoded_image.resized_source_size.height; @@ -826,11 +812,7 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { size_t num_patch_height = encoded_image.patches_grid.first; size_t num_patch_width = encoded_image.patches_grid.second; - ov::Tensor reshaped_image_feature = reshape_and_rearrange_image_feature(patches_image_feature, - num_patch_height, - num_patch_width, - height, - width); + ov::Tensor reshaped_image_feature = reshape_and_rearrange_image_feature(patches_image_feature, num_patch_height, num_patch_width, height, width); ov::Tensor unpadded_image_feature = unpad_image(reshaped_image_feature, original_image_size); @@ -838,8 +820,7 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { ov::Tensor processed_image_feature = flatten_and_transpose(image_feature_with_newline); - // Concatenate base image feature ([1, seq_len_1, emded_dim]) and patches image feature ([seq_len_2, - // embed_dim]) + // Concatenate base image feature ([1, seq_len_1, emded_dim]) and patches image feature ([seq_len_2, embed_dim]) auto base_shape = base_image_feature.get_shape(); auto processed_shape = processed_image_feature.get_shape(); @@ -851,30 +832,32 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { std::copy(base_data, base_data + base_shape[1] * embed_dim, result.data()); // Copy processed image feature data std::copy(processed_data, - processed_data + processed_shape[0] * embed_dim, - result.data() + base_shape[1] * embed_dim); + processed_data + processed_shape[0] * embed_dim, + 
result.data() + base_shape[1] * embed_dim); return result; } else { // If there is only one patch, return the original (base) image feature concatenated with image_newline ov::Tensor result(image_feature.get_element_type(), {1, patch_seq_len + 1, embed_dim}); // Copy base image feature data std::copy(image_feature_data + embed_dim, - image_feature_data + patch_seq_len * embed_dim, - result.data()); + image_feature_data + patch_seq_len * embed_dim, + result.data()); // Append image_newline data - std::copy(newline_data, newline_data + embed_dim, result.data() + patch_seq_len * embed_dim); + std::copy(newline_data, + newline_data + embed_dim, + result.data() + patch_seq_len * embed_dim); return result; } } /** - * @brief Adds image newline tensor to patches image feature tensor. - * Used for packing image features of llava_next models. - * - * @param image_feature A tensor with a shape (embed_dim, height, width) - * @param image_newline A tensor with a shape (embed_dim) - * @return A tensor with a shape (embed_dim, height, width + 1) - */ + * @brief Adds image newline tensor to patches image feature tensor. + * Used for packing image features of llava_next models. + * + * @param image_feature A tensor with a shape (embed_dim, height, width) + * @param image_newline A tensor with a shape (embed_dim) + * @return A tensor with a shape (embed_dim, height, width + 1) + */ ov::Tensor add_image_newline(const ov::Tensor& image_feature, const ov::Tensor& image_newline) { auto shape = image_feature.get_shape(); @@ -884,8 +867,7 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { size_t height = shape[1]; size_t width = shape[2]; - OPENVINO_ASSERT(image_newline.get_shape()[0] == embed_dim, - "image_newline dimension must match embed_dim of image_feature"); + OPENVINO_ASSERT(image_newline.get_shape()[0] == embed_dim, "image_newline dimension must match embed_dim of image_feature"); const float* image_feature_data = image_feature.data(); const float* newline_data = image_newline.data(); @@ -896,9 +878,11 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { for (size_t e = 0; e < embed_dim; ++e) { for (size_t h = 0; h < height; ++h) { // Copy original image feature data - std::copy(image_feature_data + (e * height * width + h * width), - image_feature_data + (e * height * width + (h + 1) * width), - feature_with_newline_data + (e * height * (width + 1) + h * (width + 1))); + std::copy( + image_feature_data + (e * height * width + h * width), + image_feature_data + (e * height * width + (h + 1) * width), + feature_with_newline_data + (e * height * (width + 1) + h * (width + 1)) + ); // Add image newline feature_with_newline_data[e * height * (width + 1) + h * (width + 1) + width] = newline_data[e]; } @@ -908,12 +892,12 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { } /** - * @brief Flattens and transposes tensor. - * Used for packing image features of llava_next models. - * - * @param tensor A tensor with a shape (embed_dim, height, width) - * @return A tensor with a shape (height * width, embed_dim) - */ + * @brief Flattens and transposes tensor. + * Used for packing image features of llava_next models. 
+ * + * @param tensor A tensor with a shape (embed_dim, height, width) + * @return A tensor with a shape (height * width, embed_dim) + */ ov::Tensor flatten_and_transpose(const ov::Tensor& tensor) { auto shape = tensor.get_shape(); OPENVINO_ASSERT(shape.size() == 3, "Flattening tensor must have 3 dimensions"); @@ -937,6 +921,7 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { return flatten_feature; } + ov::Tensor reshape_and_rearrange_image_feature(const ov::Tensor& image_feature, int num_patch_height, int num_patch_width, @@ -949,11 +934,15 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { size_t patch_seq_len = shape[1]; size_t embed_dim = shape[2]; - OPENVINO_ASSERT(num_patches == num_patch_height * num_patch_width, - "Number of patches does not match the specified grid size"); + OPENVINO_ASSERT( + num_patches == num_patch_height * num_patch_width, + "Number of patches does not match the specified grid size" + ); - OPENVINO_ASSERT(patch_seq_len == height * width, - "Patch sequence length does not match the specified height and width"); + OPENVINO_ASSERT( + patch_seq_len == height * width, + "Patch sequence length does not match the specified height and width" + ); // Reshape tensor data and permute dimensions // [num_patches, patch_seq_len, embed_dim] -> [embed_dim, num_patch_height, height, num_patch_width, width] @@ -976,19 +965,20 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { ov::Tensor result(image_feature.get_element_type(), {static_cast(embed_dim), static_cast(num_patch_height * height), - static_cast(num_patch_width * width)}); + static_cast(num_patch_width * width)} + ); std::copy(reshaped_data.begin(), reshaped_data.end(), result.data()); return result; } /** - * @brief Unpads an image tensor of a padded and resized image. - * Used for packing image features of llava_next models. - * - * @param tensor An image tensor with a shape (embed_dim, height, width) - * @param original_size A size of original image - * @return An unpadded image tensor with a shape (embed_dim, new_height, new_width) - */ + * @brief Unpads an image tensor of a padded and resized image. + * Used for packing image features of llava_next models. 
+ * + * @param tensor An image tensor with a shape (embed_dim, height, width) + * @param original_size A size of original image + * @return An unpadded image tensor with a shape (embed_dim, new_height, new_width) + */ ov::Tensor unpad_image(const ov::Tensor& tensor, const ImageSize& original_size) { size_t original_height = original_size.height; size_t original_width = original_size.width; @@ -1013,9 +1003,9 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { for (int h = 0; h < unpadded_height_dim; ++h) { std::copy( tensor.data() + (e * current_height * current_width + (padding + h) * current_width), - tensor.data() + - (e * current_height * current_width + (padding + h) * current_width + current_width), - unpadded_tensor.data() + (e * unpadded_height_dim * current_width + h * current_width)); + tensor.data() + (e * current_height * current_width + (padding + h) * current_width + current_width), + unpadded_tensor.data() + (e * unpadded_height_dim * current_width + h * current_width) + ); } } } else { @@ -1027,11 +1017,11 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { for (size_t e = 0; e < embed_dim; ++e) { for (int h = 0; h < current_height; ++h) { - std::copy(tensor.data() + (e * current_height * current_width + h * current_width + padding), - tensor.data() + (e * current_height * current_width + h * current_width + padding + - unpadded_width_dim), - unpadded_tensor.data() + - (e * current_height * unpadded_width_dim + h * unpadded_width_dim)); + std::copy( + tensor.data() + (e * current_height * current_width + h * current_width + padding), + tensor.data() + (e * current_height * current_width + h * current_width + padding + unpadded_width_dim), + unpadded_tensor.data() + (e * current_height * unpadded_width_dim + h * unpadded_width_dim) + ); } } } @@ -1042,40 +1032,40 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { class InputsEmbedderInternVLChat : public InputsEmbedder::IInputsEmbedder { public: - InputsEmbedderInternVLChat(const VLMConfig& vlm_config, - const std::filesystem::path& model_dir, - const std::string& device, - const ov::AnyMap device_config) - : IInputsEmbedder(vlm_config, model_dir, device, device_config) {} - - InputsEmbedderInternVLChat(const VLMConfig& vlm_config, - const ModelsMap& models_map, - const Tokenizer& tokenizer, - const std::filesystem::path& config_dir_path, - const std::string& device, - const ov::AnyMap device_config) - : IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) {} - - virtual ov::Tensor get_inputs_embeds(const std::string& prompt, - const std::vector& images, - ov::genai::VLMPerfMetrics& metrics) override { + InputsEmbedderInternVLChat( + const VLMConfig& vlm_config, + const std::filesystem::path& model_dir, + const std::string& device, + const ov::AnyMap device_config) : + IInputsEmbedder(vlm_config, model_dir, device, device_config) { } + + InputsEmbedderInternVLChat( + const VLMConfig& vlm_config, + const ModelsMap& models_map, + const Tokenizer& tokenizer, + const std::filesystem::path& config_dir_path, + const std::string& device, + const ov::AnyMap device_config) : + IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { } + + virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) override { std::string image_start_token = m_vlm_config.image_start_token; std::string image_context_token = m_vlm_config.image_context_token; std::string 
image_end_token = m_vlm_config.image_end_token; - + std::vector single_images = to_single_image_tensors(images); std::string formatted_prompt; std::vector image_embeds; image_embeds.reserve(single_images.size()); - + for (const auto& image : single_images) { EncodedImage encoded_image = m_vision_encoder.encode(image); ov::Tensor single_image_embeds = encoded_image.resized_source; const size_t num_patches = single_image_embeds.get_shape().at(0); const size_t num_image_tokens = single_image_embeds.get_shape().at(1); - + formatted_prompt += image_start_token; for (int i = 0; i < num_patches * num_image_tokens; ++i) { formatted_prompt += image_context_token; @@ -1093,22 +1083,21 @@ class InputsEmbedderInternVLChat : public InputsEmbedder::IInputsEmbedder { return text_embeds; } auto start_tokenizer_time = std::chrono::steady_clock::now(); - ov::Tensor encoded_image_context_token = - m_tokenizer.encode(image_context_token, ov::genai::add_special_tokens(false)).input_ids; + ov::Tensor encoded_image_context_token = m_tokenizer.encode(image_context_token, ov::genai::add_special_tokens(false)).input_ids; auto end_tokenizer_time = std::chrono::steady_clock::now(); OPENVINO_ASSERT(metrics.raw_metrics.tokenization_durations.size() > 0); - metrics.raw_metrics.tokenization_durations[metrics.raw_metrics.tokenization_durations.size() - 1] += - ov::genai::MicroSeconds(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); - int64_t image_context_token_id = - encoded_image_context_token.data()[encoded_image_context_token.get_size() - 1]; + metrics.raw_metrics.tokenization_durations[metrics.raw_metrics.tokenization_durations.size() - 1] += ov::genai::MicroSeconds(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); + int64_t image_context_token_id = encoded_image_context_token.data()[encoded_image_context_token.get_size() - 1]; return merge_text_and_image_embeddings_internvl(input_ids, text_embeds, image_embeds, image_context_token_id); } protected: - ov::Tensor merge_text_and_image_embeddings_internvl(const ov::Tensor& input_ids, - const ov::Tensor& text_embeds, - const std::vector& image_embeds, - int64_t image_context_token_id) { + ov::Tensor merge_text_and_image_embeddings_internvl( + const ov::Tensor& input_ids, + const ov::Tensor& text_embeds, + const std::vector& image_embeds, + int64_t image_context_token_id + ) { auto text_embeds_shape = text_embeds.get_shape(); size_t batch_size = text_embeds_shape.at(0); size_t seq_len = text_embeds_shape.at(1); @@ -1142,14 +1131,12 @@ class InputsEmbedderInternVLChat : public InputsEmbedder::IInputsEmbedder { if (image_context_tokens_mask[flat_idx]) { const ov::Tensor& single_image_embeds = image_embeds[image_idx]; - const size_t num_all_image_tokens = - single_image_embeds.get_shape().at(0) * - single_image_embeds.get_shape().at(1); // num_patches * num_image_tokens + const size_t num_all_image_tokens = single_image_embeds.get_shape().at(0) * single_image_embeds.get_shape().at(1); // num_patches * num_image_tokens const float* image_embeds_data = single_image_embeds.data(); std::copy_n(image_embeds_data + image_context_token_idx * embed_dim, embed_dim, merged_embeds_data + offset); - + ++image_context_token_idx; if (image_context_token_idx == num_all_image_tokens) { @@ -1290,23 +1277,30 @@ ov::InferRequest create_hd_feature_transformer() { // t28 = opset.Unsqueeze([t26, t27], {}, # i64[], i32[] -> i64[1] // t29 = opset.Constant(model, 29, # -> i64[1]([2]) // t30 = opset.Constant(model, 30, # -> i64[1]([2]) - // t31 = 
opset.Concat([t8, t28, t29, t28, t30, t21], {'axis': 0}, # i64[1], i64[1], i64[1], i64[1], i64[1], i64[1] - // -> i64[6] t32 = opset.Reshape([t23, t31], {'special_zero': False}, # f32[?,24,24,1024], i64[6] -> - // f32[?,12,2,12,2,1024] t33 = opset.Constant(model, 33, t34 = opset.Transpose([t32, t33], {}, # - // f32[?,12,2,12,2,1024], i64[6] -> f32[?,12,12,2,2,1024] t35 = opset.Constant(model, 35, # -> i64[1]([-1]) t36 = - // opset.Constant(model, 36, # -> i64[1]([4]) t37 = opset.Multiply([t21, t36], {'auto_broadcast': 'numpy'}, # - // i64[1], i64[1] -> i64[1] t38 = opset.Concat([t8, t35, t37], {'axis': 0}, # i64[1], i64[1], i64[1] -> i64[3] t39 - // = opset.Reshape([t34, t38], {'special_zero': False}, # f32[?,12,12,2,2,1024], i64[3] -> f32[?,?,4096] t40 = - // opset.Multiply([t1, t2], {'auto_broadcast': 'numpy'}, # i32[], i32[] -> i32[] t41 = opset.Convert([t40], - // {'destination_type': 'i64'}, # i32[] -> i64[] t42 = opset.Divide([t6, t41], {'auto_broadcast': 'numpy', - // 'm_pythondiv': True}, # i64[], i64[] -> i64[] t43 = opset.Floor([t42], {}, # i64[] -> i64[] t44 = - // opset.Constant(model, 44, # -> i32[](0) t45 = opset.Unsqueeze([t43, t44], {}, # i64[], i32[] -> i64[1] t46 = - // opset.Convert([t1], {'destination_type': 'i64'}, # i32[] -> i64[] t47 = opset.Unsqueeze([t46, t44], {}, # - // i64[], i32[] -> i64[1] t48 = opset.Convert([t2], {'destination_type': 'i64'}, # i32[] -> i64[] t49 = - // opset.Unsqueeze([t48, t44], {}, # i64[], i32[] -> i64[1] t50 = opset.Constant(model, 50, # -> i64[1]([-1]) t51 - // = opset.Concat([t45, t47, t49, t28, t28, t50], {'axis': 0}, # i64[1], i64[1], i64[1], i64[1], i64[1], i64[1] -> - // i64[6] t52 = opset.Reshape([t39, t51], {'special_zero': False}, # f32[?,?,4096], i64[6] -> f32[?,?,?,?,?,?] t53 - // = opset.Constant(model, 53, t54 = opset.Transpose([t52, t53], {}, # f32[?,?,?,?,?,?], i64[6] -> f32[?,?,?,?,?,?] 
+ // t31 = opset.Concat([t8, t28, t29, t28, t30, t21], {'axis': 0}, # i64[1], i64[1], i64[1], i64[1], i64[1], i64[1] -> i64[6] + // t32 = opset.Reshape([t23, t31], {'special_zero': False}, # f32[?,24,24,1024], i64[6] -> f32[?,12,2,12,2,1024] + // t33 = opset.Constant(model, 33, + // t34 = opset.Transpose([t32, t33], {}, # f32[?,12,2,12,2,1024], i64[6] -> f32[?,12,12,2,2,1024] + // t35 = opset.Constant(model, 35, # -> i64[1]([-1]) + // t36 = opset.Constant(model, 36, # -> i64[1]([4]) + // t37 = opset.Multiply([t21, t36], {'auto_broadcast': 'numpy'}, # i64[1], i64[1] -> i64[1] + // t38 = opset.Concat([t8, t35, t37], {'axis': 0}, # i64[1], i64[1], i64[1] -> i64[3] + // t39 = opset.Reshape([t34, t38], {'special_zero': False}, # f32[?,12,12,2,2,1024], i64[3] -> f32[?,?,4096] + // t40 = opset.Multiply([t1, t2], {'auto_broadcast': 'numpy'}, # i32[], i32[] -> i32[] + // t41 = opset.Convert([t40], {'destination_type': 'i64'}, # i32[] -> i64[] + // t42 = opset.Divide([t6, t41], {'auto_broadcast': 'numpy', 'm_pythondiv': True}, # i64[], i64[] -> i64[] + // t43 = opset.Floor([t42], {}, # i64[] -> i64[] + // t44 = opset.Constant(model, 44, # -> i32[](0) + // t45 = opset.Unsqueeze([t43, t44], {}, # i64[], i32[] -> i64[1] + // t46 = opset.Convert([t1], {'destination_type': 'i64'}, # i32[] -> i64[] + // t47 = opset.Unsqueeze([t46, t44], {}, # i64[], i32[] -> i64[1] + // t48 = opset.Convert([t2], {'destination_type': 'i64'}, # i32[] -> i64[] + // t49 = opset.Unsqueeze([t48, t44], {}, # i64[], i32[] -> i64[1] + // t50 = opset.Constant(model, 50, # -> i64[1]([-1]) + // t51 = opset.Concat([t45, t47, t49, t28, t28, t50], {'axis': 0}, # i64[1], i64[1], i64[1], i64[1], i64[1], i64[1] -> i64[6] + // t52 = opset.Reshape([t39, t51], {'special_zero': False}, # f32[?,?,4096], i64[6] -> f32[?,?,?,?,?,?] + // t53 = opset.Constant(model, 53, + // t54 = opset.Transpose([t52, t53], {}, # f32[?,?,?,?,?,?], i64[6] -> f32[?,?,?,?,?,?] // t55 = opset.Multiply([t1, t15], {'auto_broadcast': 'numpy'}, # i32[], i32[] -> i32[] // t56 = opset.Convert([t55], {'destination_type': 'i64'}, # i32[] -> i64[] // t57 = opset.Constant(model, 57, # -> i64[](2) @@ -1323,8 +1317,9 @@ ov::InferRequest create_hd_feature_transformer() { // t68 = opset.Concat([t45, t61, t67, t37], {'axis': 0}, # i64[1], i64[1], i64[1], i64[1] -> i64[4] // t69 = opset.Reshape([t54, t68], {'special_zero': False}, # f32[?,?,?,?,?,?], i64[4] -> f32[?,?,?,?] 
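    // Summary of the graph above (shapes inferred from the listing, for reference):
    // the transformer takes image_features f32[N, 576, 1024] plus the scalar crop
    // counts h_crop and w_crop, reshapes the 576 tokens of each crop into a 24x24
    // grid, merges every 2x2 block of 1024-dim patch features into one 4096-dim
    // vector, and regroups the N = num_images * h_crop * w_crop crops into
    // f32[num_images, h_crop*12, w_crop*12, 4096].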
shared_ptr model = make_shared(make_shared(t69), ParameterVector{t0, t1, t2}); - ov::InferRequest hd_feature_transformer = - utils::singleton_core().compile_model(model, "CPU").create_infer_request(); + ov::InferRequest hd_feature_transformer = utils::singleton_core().compile_model( + model, "CPU" + ).create_infer_request(); // hd_feature_transformer.set_input_tensor(0, ov::Tensor{f32, {1, 576, 1024}}); // ov::Tensor h_crop = ov::Tensor{i32, {}}; // h_crop.data()[0] = 1; @@ -1337,10 +1332,7 @@ ov::InferRequest create_hd_feature_transformer() { return hd_feature_transformer; } -ov::Tensor reshape_hd_patches_2x2merge(const ov::Tensor& image_features, - size_t h_crop, - size_t w_crop, - InferRequest& hd_feature_transformer) { +ov::Tensor reshape_hd_patches_2x2merge(const ov::Tensor& image_features, size_t h_crop, size_t w_crop, InferRequest& hd_feature_transformer) { ov::Shape shape = image_features.get_shape(); OPENVINO_ASSERT(3 == shape.size()); OPENVINO_ASSERT(24 * 24 == shape.at(1)); @@ -1364,24 +1356,23 @@ ov::Tensor add_image_newline(const ov::Tensor& image_features_hd, const std::vec for (size_t batch_id = 0; batch_id < nhwc.at(0); ++batch_id) { for (size_t row_id = 0; row_id < nhwc.at(1); ++row_id) { for (size_t col_id = 0; col_id < nhwc.at(2); ++col_id) { - std::copy_n(in + batch_id * nhwc.at(1) * nhwc.at(2) * nhwc.at(3) + row_id * nhwc.at(2) * nhwc.at(3) + - col_id * nhwc.at(3), - nhwc.at(3), - out + batch_id * nhwc.at(1) * (nhwc.at(2) + 1) * nhwc.at(3) + - row_id * (nhwc.at(2) + 1) * nhwc.at(3) + col_id * nhwc.at(3)); + std::copy_n( + in + batch_id * nhwc.at(1) * nhwc.at(2) * nhwc.at(3) + row_id * nhwc.at(2) * nhwc.at(3) + col_id * nhwc.at(3), + nhwc.at(3), + out + batch_id * nhwc.at(1) * (nhwc.at(2) + 1) * nhwc.at(3) + row_id * (nhwc.at(2) + 1) * nhwc.at(3) + col_id * nhwc.at(3) + ); } - std::copy(sub_GN.begin(), - sub_GN.end(), - out + batch_id * nhwc.at(1) * (nhwc.at(2) + 1) * nhwc.at(3) + - row_id * (nhwc.at(2) + 1) * nhwc.at(3) + nhwc.at(2) * nhwc.at(3)); + std::copy( + sub_GN.begin(), + sub_GN.end(), + out + batch_id * nhwc.at(1) * (nhwc.at(2) + 1) * nhwc.at(3) + row_id * (nhwc.at(2) + 1) * nhwc.at(3) + nhwc.at(2) * nhwc.at(3) + ); } } return image_features_hd_new_line; } -ov::Tensor concatenate_2d(const ov::Tensor& first_1lf, - const std::vector& second_f, - const ov::Tensor& third_1lf) { +ov::Tensor concatenate_2d(const ov::Tensor& first_1lf, const std::vector& second_f, const ov::Tensor& third_1lf) { size_t first_l = first_1lf.get_shape().at(1); constexpr size_t second_l = 1; size_t third_l = third_1lf.get_shape().at(1); @@ -1396,20 +1387,12 @@ ov::Tensor concatenate_2d(const ov::Tensor& first_1lf, } // image_features.resized_source: (num_crops+1, 24*24, 1024) -ov::Tensor hd_feature_transform(const EncodedImage& image_features, - InferRequest& hd_feature_transformer, - const std::vector& sub_GN, - const std::vector& glb_GN, - ov::InferRequest& vision_projection) { +ov::Tensor hd_feature_transform(const EncodedImage& image_features, InferRequest& hd_feature_transformer, const std::vector& sub_GN, const std::vector& glb_GN, ov::InferRequest& vision_projection) { const ov::Shape& image_features_shape = image_features.resized_source.get_shape(); - ov::Tensor global_image_features{ov::element::f32, - {1, image_features_shape.at(1), image_features_shape.at(2)}, - image_features.resized_source.data()}; + ov::Tensor global_image_features{ov::element::f32, {1, image_features_shape.at(1), image_features_shape.at(2)}, image_features.resized_source.data()}; // global feature can be 
viewed as a special HD case with num_crops 1x1 - ov::Tensor global_image_features_hd = - reshape_hd_patches_2x2merge(global_image_features, 1, 1, hd_feature_transformer); - ov::Tensor global_image_features_hd_newline = - add_image_newline(global_image_features_hd, sub_GN); // [1,12*(12+1),4096] + ov::Tensor global_image_features_hd = reshape_hd_patches_2x2merge(global_image_features, 1, 1, hd_feature_transformer); + ov::Tensor global_image_features_hd_newline = add_image_newline(global_image_features_hd, sub_GN); // [1,12*(12+1),4096] constexpr size_t INPUT_IMAGE_SIZE = 336; size_t h_crop = image_features.resized_source_size.height / INPUT_IMAGE_SIZE; size_t w_crop = image_features.resized_source_size.width / INPUT_IMAGE_SIZE; @@ -1417,23 +1400,27 @@ ov::Tensor hd_feature_transform(const EncodedImage& image_features, // NOTE: real num_crops is padded // (num_crops, 24*24, 1024) - ov::Tensor sub_image_features{ - ov::element::f32, - {num_crops, image_features_shape.at(1), image_features_shape.at(2)}, - image_features.resized_source.data() + image_features_shape.at(1) * image_features_shape.at(2)}; - ov::Tensor sub_image_features_hd = - reshape_hd_patches_2x2merge(sub_image_features, h_crop, w_crop, hd_feature_transformer); // [1, 24, 24, 4096] - ov::Tensor sub_image_features_hd_newline = - add_image_newline(sub_image_features_hd, sub_GN); // [1,h_crop*12*(w_crop*12+1), 4096] + ov::Tensor sub_image_features{ov::element::f32, { + num_crops, + image_features_shape.at(1), + image_features_shape.at(2) + }, image_features.resized_source.data() + image_features_shape.at(1) * image_features_shape.at(2)}; + ov::Tensor sub_image_features_hd = reshape_hd_patches_2x2merge(sub_image_features, h_crop, w_crop, hd_feature_transformer); // [1, 24, 24, 4096] + ov::Tensor sub_image_features_hd_newline = add_image_newline(sub_image_features_hd, sub_GN); // [1,h_crop*12*(w_crop*12+1), 4096] return concatenate_2d(sub_image_features_hd_newline, glb_GN, global_image_features_hd_newline); // [1,l,4096] } std::vector split_tokenize(const std::string& text, ov::genai::Tokenizer& tokenizer) { constexpr int make_suffix_iterator = -1; std::regex rgx{R"(<\|image_\d+\|>)"}; - std::sregex_token_iterator iter{text.begin(), text.end(), rgx, make_suffix_iterator}; + std::sregex_token_iterator iter{ + text.begin(), + text.end(), + rgx, + make_suffix_iterator + }; std::vector tokenized; - for (; iter != std::sregex_token_iterator{}; ++iter) { + for ( ; iter != std::sregex_token_iterator{}; ++iter) { if (iter->str().empty()) { continue; } @@ -1442,19 +1429,16 @@ std::vector split_tokenize(const std::string& text, ov::genai::Token return tokenized; } -// ov::Tensor apply_template_and_tokenize(bool is_chat_conversation, const std::string& prompt, -// ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = "") { +// ov::Tensor apply_template_and_tokenize(bool is_chat_conversation, const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = "") { // ov::Tensor encoded_input_ids; // if (is_chat_conversation) { // // KV cache in model already contains prompts and answers from previous iterations. // // So only new prompt wrapped into chat template to be sent into model. Tokenizer always returns // // token_ids = {, ...}. So if tokenizer applies only to the new prompt, // // will be inserted on every iteration. 
-// // So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new -// prompt +// // So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new prompt // // and takes only the difference between them. -// // The chat history cannot be saved as already encoded tokens because generate call doesn't return -// token, but +// // The chat history cannot be saved as already encoded tokens because generate call doesn't return token, but // // KV cache contains it. So we have to add it manually or get it by tokenization all chat history. // m_history.push_back({{"role", "user"}, {"content", prompt}}); // constexpr bool add_generation_prompt = true; @@ -1463,8 +1447,7 @@ std::vector split_tokenize(const std::string& text, ov::genai::Token // new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); // } catch (const std::exception& error) { // // Use fallback chat template if it was not found in tokenizer_config.json -// new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt, -// chat_template_fallback); +// new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt, chat_template_fallback); // } // auto start_tokenizer_time = std::chrono::steady_clock::now(); // ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history).input_ids; @@ -1481,19 +1464,18 @@ std::vector split_tokenize(const std::string& text, ov::genai::Token // ).input_ids; // } // auto end_tokenizer_time = std::chrono::steady_clock::now(); -// metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - -// start_tokenizer_time)); m_templated_chat_history = std::move(new_templated_chat_history); +// metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); +// m_templated_chat_history = std::move(new_templated_chat_history); // } else { // auto start_tokenizer_time = std::chrono::steady_clock::now(); // encoded_input_ids = m_tokenizer.encode(prompt).input_ids; // auto end_tokenizer_time = std::chrono::steady_clock::now(); -// metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - -// start_tokenizer_time)); +// metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); // } // return encoded_input_ids; // } -} // namespace phi3_v -} // namespace +} +} class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { public: @@ -1502,31 +1484,24 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { // Used to insert <|image_i|>\n per image (not a slice). 
size_t m_image_id = 1; - InputsEmbedderPhi3V(const VLMConfig& vlm_config, - const std::filesystem::path& model_dir, - const std::string& device, - const ov::AnyMap device_config) - : IInputsEmbedder(vlm_config, model_dir, device, device_config), - m_image_id{0}, - m_hd_feature_transformer{phi3_v::create_hd_feature_transformer()}, - m_vision_projection{utils::singleton_core() - .compile_model(model_dir / "openvino_vision_projection_model.xml", device) - .create_infer_request()} {} - - ov::Tensor get_inputs_embeds(const std::string& prompt, - const std::vector& images, - ov::genai::VLMPerfMetrics& metrics) override { + InputsEmbedderPhi3V( + const VLMConfig& vlm_config, + const std::filesystem::path& model_dir, + const std::string& device, + const ov::AnyMap device_config + ): + IInputsEmbedder(vlm_config, model_dir, device, device_config), m_image_id{0}, + m_hd_feature_transformer{phi3_v::create_hd_feature_transformer()}, + m_vision_projection{utils::singleton_core().compile_model(model_dir / "openvino_vision_projection_model.xml", device).create_infer_request()} {} + + ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) override { // TODO: perfmetrics - OPENVINO_ASSERT(!std::regex_search(prompt, std::regex{R"(<\|image_\d+\|>)"}), "<|image_i|> can't be used in the prompt because it's reserved for images"); + std::cout << prompt<<'\n'; std::stringstream images_prompt; std::vector images_features_proj; for (const ov::Tensor& image : to_single_image_tensors(images)) { EncodedImage encoded_image = m_vision_encoder.encode(image); - images_features_proj.push_back(phi3_v::hd_feature_transform(encoded_image, - m_hd_feature_transformer, - m_vlm_config.sub_GN, - m_vlm_config.glb_GN, - m_vision_projection)); + images_features_proj.push_back(phi3_v::hd_feature_transform(encoded_image, m_hd_feature_transformer, m_vlm_config.sub_GN, m_vlm_config.glb_GN, m_vision_projection)); images_prompt << "<|image_" << m_image_id << "|>\n"; ++m_image_id; } @@ -1589,26 +1564,27 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { // int64_t* end = ids + encoded_input_size; // float* inputs_embeds_data = inputs_embeds.data(); // for (const EncodedImage& encoded_image : embeds) { - // const ov::Tensor& resampled_source = resample(encoded_image.resized_source, - // {encoded_image.resized_source_size}); float* emb = resampled_source.data(); ids = std::find(ids, - // end, im_start_id); OPENVINO_ASSERT(end != ids); + // const ov::Tensor& resampled_source = resample(encoded_image.resized_source, {encoded_image.resized_source_size}); + // float* emb = resampled_source.data(); + // ids = std::find(ids, end, im_start_id); + // OPENVINO_ASSERT(end != ids); // ++ids; - // std::copy_n(emb, resampled_source.get_size(), inputs_embeds_data + std::distance(begin, ids) * - // m_vlm_config.hidden_size); ids += m_vlm_config.query_num; if (encoded_image.slices) { + // std::copy_n(emb, resampled_source.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); + // ids += m_vlm_config.query_num; + // if (encoded_image.slices) { // size_t token_idx = 0; // const ov::Shape& slices_shape = encoded_image.slices.get_shape(); // for (size_t i = 0; i < slices_shape.at(0); ++i) { // for (size_t ja = 0; ja < slices_shape.at(1); ++ja) { // size_t d2 = slices_shape.at(2); // size_t d3 = slices_shape.at(3); - // ov::Tensor encoded_view{ov::element::f32, {1, d2, d3}, encoded_image.slices.data() + - // (i * slices_shape.at(1) + ja) * d2 * 
d3}; const ov::Tensor& vision_embed_tensor_i_j = - // resample(encoded_view, {encoded_image.slices_size}); ids = std::find(ids, end, - // slice_start_id); OPENVINO_ASSERT(end != ids); + // ov::Tensor encoded_view{ov::element::f32, {1, d2, d3}, encoded_image.slices.data() + (i * slices_shape.at(1) + ja) * d2 * d3}; + // const ov::Tensor& vision_embed_tensor_i_j = resample(encoded_view, {encoded_image.slices_size}); + // ids = std::find(ids, end, slice_start_id); + // OPENVINO_ASSERT(end != ids); // ++ids; - // std::copy_n(vision_embed_tensor_i_j.data(), vision_embed_tensor_i_j.get_size(), - // inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); ids += - // m_vlm_config.query_num; + // std::copy_n(vision_embed_tensor_i_j.data(), vision_embed_tensor_i_j.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); + // ids += m_vlm_config.query_num; // } // } // } @@ -1647,8 +1623,7 @@ InputsEmbedder::InputsEmbedder(const VLMConfig& vlm_config, } else if (vlm_config.model_type == VLMModelType::PHI3_V) { m_impl = std::make_shared(vlm_config, model_dir, device, device_config); } else { - OPENVINO_THROW( - "Unsupported model type in VLM InputsEmbedder class. Please, create feature request on new model support"); + OPENVINO_THROW("Unsupported model type in VLM InputsEmbedder class. Please, create feature request on new model support"); } } @@ -1659,42 +1634,19 @@ InputsEmbedder::InputsEmbedder(const VLMConfig& vlm_config, const std::string& device, const ov::AnyMap device_config) { if (vlm_config.model_type == VLMModelType::MINICPM) { - m_impl = std::make_shared(vlm_config, - models_map, - tokenizer, - config_dir_path, - device, - device_config); + m_impl = std::make_shared(vlm_config, models_map, tokenizer, config_dir_path, device, device_config); } else if (vlm_config.model_type == VLMModelType::LLAVA) { - m_impl = std::make_shared(vlm_config, - models_map, - tokenizer, - config_dir_path, - device, - device_config); + m_impl = std::make_shared(vlm_config, models_map, tokenizer, config_dir_path, device, device_config); } else if (vlm_config.model_type == VLMModelType::LLAVA_NEXT) { - m_impl = std::make_shared(vlm_config, - models_map, - tokenizer, - config_dir_path, - device, - device_config); + m_impl = std::make_shared(vlm_config, models_map, tokenizer, config_dir_path, device, device_config); } else if (vlm_config.model_type == VLMModelType::INTERNVL_CHAT) { - m_impl = std::make_shared(vlm_config, - models_map, - tokenizer, - config_dir_path, - device, - device_config); + m_impl = std::make_shared(vlm_config, models_map, tokenizer, config_dir_path, device, device_config); } else { - OPENVINO_THROW( - "Unsupported model type in VLM InputsEmbedder class. Please, create feature request on new model support"); + OPENVINO_THROW("Unsupported model type in VLM InputsEmbedder class. 
Please, create feature request on new model support"); } } -ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, - const std::vector& images, - ov::genai::VLMPerfMetrics& metrics) { +ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) { return m_impl->get_inputs_embeds(prompt, images, metrics); } @@ -1706,10 +1658,7 @@ std::vector InputsEmbedder::get_tokenized_history() const { return m_impl->get_tokenized_history(); } -void InputsEmbedder::update_tokenized_history(const std::vector& encoded_result, - std::optional last_disappeared_token, - bool is_beam_search, - size_t last_answer_len) { +void InputsEmbedder::update_tokenized_history(const std::vector& encoded_result, std::optional last_disappeared_token, bool is_beam_search, size_t last_answer_len) { return m_impl->update_tokenized_history(encoded_result, last_disappeared_token, is_beam_search, last_answer_len); } @@ -1733,4 +1682,4 @@ void InputsEmbedder::finish_chat() { return m_impl->finish_chat(); } -} // namespace ov::genai +} // namespace ov::genai From edb2dc13e68724a875ddd5332d136ac54289fa95 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Thu, 9 Jan 2025 14:41:15 +0400 Subject: [PATCH 11/28] working chat --- .../src/visual_language/inputs_embedder.cpp | 214 ++++++------------ 1 file changed, 74 insertions(+), 140 deletions(-) diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index b1027c533b..1f1e162127 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -676,7 +676,7 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder { } size_t merged_seq_length = text_embeds_seq_length + total_image_seq_length - num_image_tokens; - constexpr size_t BATCH_SIZE = 1; + constexpr size_t BATCH_SIZE = 1; ov::Tensor merged_embeds(text_embeds.get_element_type(), {BATCH_SIZE, merged_seq_length, hidden_size}); float* merged_data = merged_embeds.data(); @@ -1407,7 +1407,13 @@ ov::Tensor hd_feature_transform(const EncodedImage& image_features, InferRequest }, image_features.resized_source.data() + image_features_shape.at(1) * image_features_shape.at(2)}; ov::Tensor sub_image_features_hd = reshape_hd_patches_2x2merge(sub_image_features, h_crop, w_crop, hd_feature_transformer); // [1, 24, 24, 4096] ov::Tensor sub_image_features_hd_newline = add_image_newline(sub_image_features_hd, sub_GN); // [1,h_crop*12*(w_crop*12+1), 4096] - return concatenate_2d(sub_image_features_hd_newline, glb_GN, global_image_features_hd_newline); // [1,l,4096] + ov::Tensor image_embeddings = concatenate_2d(sub_image_features_hd_newline, glb_GN, global_image_features_hd_newline); // [1,l,4096] + vision_projection.set_input_tensor(image_embeddings); + vision_projection.infer(); + ov::Tensor out = vision_projection.get_output_tensor(); + ov::Tensor res{out.get_element_type(), out.get_shape()}; + out.copy_to(res); + return res; } std::vector split_tokenize(const std::string& text, ov::genai::Tokenizer& tokenizer) { @@ -1428,52 +1434,6 @@ std::vector split_tokenize(const std::string& text, ov::genai::Token } return tokenized; } - -// ov::Tensor apply_template_and_tokenize(bool is_chat_conversation, const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = "") { -// ov::Tensor encoded_input_ids; -// if (is_chat_conversation) { -// // KV cache in model already contains prompts and answers from 
previous iterations. -// // So only new prompt wrapped into chat template to be sent into model. Tokenizer always returns -// // token_ids = {, ...}. So if tokenizer applies only to the new prompt, -// // will be inserted on every iteration. -// // So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new prompt -// // and takes only the difference between them. -// // The chat history cannot be saved as already encoded tokens because generate call doesn't return token, but -// // KV cache contains it. So we have to add it manually or get it by tokenization all chat history. -// m_history.push_back({{"role", "user"}, {"content", prompt}}); -// constexpr bool add_generation_prompt = true; -// std::string new_templated_chat_history; -// try { -// new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); -// } catch (const std::exception& error) { -// // Use fallback chat template if it was not found in tokenizer_config.json -// new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt, chat_template_fallback); -// } -// auto start_tokenizer_time = std::chrono::steady_clock::now(); -// ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history).input_ids; -// if (m_is_cache_empty) { -// encoded_input_ids = new_chat_tokens; -// // after first `get_inputs_embeds` is called, we supposed LLM is inferred and cache is not empty -// m_is_cache_empty = false; -// } else { -// TokenizedInputs prev_chat_tokens = m_tokenizer.encode( -// m_templated_chat_history -// ); -// encoded_input_ids = utils::subtract_chat_tokenized_inputs( -// {new_chat_tokens}, prev_chat_tokens -// ).input_ids; -// } -// auto end_tokenizer_time = std::chrono::steady_clock::now(); -// metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); -// m_templated_chat_history = std::move(new_templated_chat_history); -// } else { -// auto start_tokenizer_time = std::chrono::steady_clock::now(); -// encoded_input_ids = m_tokenizer.encode(prompt).input_ids; -// auto end_tokenizer_time = std::chrono::steady_clock::now(); -// metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); -// } -// return encoded_input_ids; -// } } } @@ -1495,100 +1455,74 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { m_vision_projection{utils::singleton_core().compile_model(model_dir / "openvino_vision_projection_model.xml", device).create_infer_request()} {} ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) override { - // TODO: perfmetrics - std::cout << prompt<<'\n'; - std::stringstream images_prompt; + OPENVINO_ASSERT(images.empty() || m_history.empty(), "Images can only be provided for initial prompt"); std::vector images_features_proj; - for (const ov::Tensor& image : to_single_image_tensors(images)) { - EncodedImage encoded_image = m_vision_encoder.encode(image); - images_features_proj.push_back(phi3_v::hd_feature_transform(encoded_image, m_hd_feature_transformer, m_vlm_config.sub_GN, m_vlm_config.glb_GN, m_vision_projection)); - images_prompt << "<|image_" << m_image_id << "|>\n"; - ++m_image_id; + std::vector tokens; + if (m_history.empty()) { + std::stringstream images_prompt; + for (const ov::Tensor& image : to_single_image_tensors(images)) { + EncodedImage encoded_image = 
m_vision_encoder.encode(image); + images_features_proj.push_back(phi3_v::hd_feature_transform(encoded_image, m_hd_feature_transformer, m_vlm_config.sub_GN, m_vlm_config.glb_GN, m_vision_projection)); + images_prompt << "<|image_" << m_image_id << "|>\n"; + ++m_image_id; + } + images_prompt << prompt; + std::string new_templated_chat_history; + if (m_is_chat_conversation) { + m_history.push_back({{"role", "user"}, {"content", images_prompt.str()}}); + constexpr bool add_generation_prompt = true; + m_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); + } else { + m_templated_chat_history = images_prompt.str(); + } + auto start_tokenizer_time = std::chrono::steady_clock::now(); + tokens = phi3_v::split_tokenize(m_templated_chat_history, m_tokenizer); + + auto end_tokenizer_time = std::chrono::steady_clock::now(); + metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); + if (m_is_chat_conversation) { + for (const ov::Tensor& chunk : tokens) { + m_tokenized_history.insert(m_tokenized_history.end(), chunk.data(), chunk.data() + chunk.get_size()); + } + } + } else { + tokens = {get_encoded_input_ids(prompt, metrics)}; + } + OPENVINO_ASSERT(tokens.size() - 1 == images_features_proj.size()); + size_t features_length = 0; + for (size_t im_id = 0; im_id < images_features_proj.size(); ++im_id) { + size_t text_length = tokens.at(im_id).get_shape().at(1); + size_t im_length = images_features_proj.at(im_id).get_shape().at(1); + features_length += text_length + im_length; } - images_prompt << prompt; - phi3_v::split_tokenize(images_prompt.str(), m_tokenizer); - - ov::Tensor inputs_embeds; - // if (m_vlm_config.use_image_id) { - // images_prompt += m_vlm_config.im_id_start + std::to_string(m_image_id) + m_vlm_config.im_id_end; - // ++m_image_id; - // } - // std::string unk64; - // for (size_t idx = 0; idx < m_vlm_config.query_num; ++idx) { - // unk64 += m_vlm_config.unk; - // } - // images_prompt += m_vlm_config.im_start + unk64 + m_vlm_config.im_end; - // if (encoded_image.slices) { - // ov::Shape slices_shape = encoded_image.slices.get_shape(); - // for (size_t row_idx = 0; row_idx < slices_shape.at(0); ++row_idx) { - // for (size_t col_idx = 0; col_idx < slices_shape.at(1); ++col_idx) { - // images_prompt += m_vlm_config.slice_start + unk64 + m_vlm_config.slice_end; - // } - // images_prompt += '\n'; - // } - // } - // if ('\n' != *(images_prompt.end() - 1)) { - // // Image wasn't sliced, add \n to the end of image anyway. - // // Strangely, \n isn't placed between . - // images_prompt += '\n'; - // } - // embeds.push_back(std::move(encoded_image)); - // } - // images_prompt += prompt; - - // ov::Tensor encoded_input = get_encoded_input_ids(images_prompt); - - // ov::Tensor inputs_embeds = m_embedding.infer(encoded_input); - // OPENVINO_ASSERT( - // m_vlm_config.hidden_size == inputs_embeds.get_shape().at(2), - // "Unexpected embedding size" - //; - // ov::Tensor special_tokens = m_tokenizer.encode( - // m_vlm_config.im_start - // + m_vlm_config.im_end - // + m_vlm_config.slice_start - // + m_vlm_config.slice_end - //.input_ids; - // OPENVINO_ASSERT( - // 4 == special_tokens.get_shape().at(1), - // "Every special token must be represented with a single int." 
- //; - // int64_t im_start_id = special_tokens.data()[0]; - // int64_t im_end_id = special_tokens.data()[1]; - // int64_t slice_start_id = special_tokens.data()[2]; - // int64_t slice_end_id = special_tokens.data()[3]; - // int64_t im_start_pos = 0, slice_start_pos = 0; - // int64_t* begin = encoded_input.data(); - // int64_t* ids = begin; - // size_t encoded_input_size = encoded_input.get_size(); - // int64_t* end = ids + encoded_input_size; - // float* inputs_embeds_data = inputs_embeds.data(); - // for (const EncodedImage& encoded_image : embeds) { - // const ov::Tensor& resampled_source = resample(encoded_image.resized_source, {encoded_image.resized_source_size}); - // float* emb = resampled_source.data(); - // ids = std::find(ids, end, im_start_id); - // OPENVINO_ASSERT(end != ids); - // ++ids; - // std::copy_n(emb, resampled_source.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); - // ids += m_vlm_config.query_num; - // if (encoded_image.slices) { - // size_t token_idx = 0; - // const ov::Shape& slices_shape = encoded_image.slices.get_shape(); - // for (size_t i = 0; i < slices_shape.at(0); ++i) { - // for (size_t ja = 0; ja < slices_shape.at(1); ++ja) { - // size_t d2 = slices_shape.at(2); - // size_t d3 = slices_shape.at(3); - // ov::Tensor encoded_view{ov::element::f32, {1, d2, d3}, encoded_image.slices.data() + (i * slices_shape.at(1) + ja) * d2 * d3}; - // const ov::Tensor& vision_embed_tensor_i_j = resample(encoded_view, {encoded_image.slices_size}); - // ids = std::find(ids, end, slice_start_id); - // OPENVINO_ASSERT(end != ids); - // ++ids; - // std::copy_n(vision_embed_tensor_i_j.data(), vision_embed_tensor_i_j.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); - // ids += m_vlm_config.query_num; - // } - // } - // } - // } + features_length += tokens.back().get_shape().at(1); + ov::Tensor inputs_embeds{ov::element::f32, {1, features_length, m_vlm_config.hidden_size}}; + size_t offset = 0; + for (size_t im_id = 0; im_id < images_features_proj.size(); ++im_id) { + const ov::Tensor& text_embeds = m_embedding.infer(tokens.at(im_id)); + const ov::Tensor& image_embeds = images_features_proj.at(im_id); + size_t text_length = text_embeds.get_shape().at(1); + size_t im_length = image_embeds.get_shape().at(1); + std::copy_n( + text_embeds.data(), + text_embeds.get_size(), + inputs_embeds.data() + offset * m_vlm_config.hidden_size + ); + offset += text_length; + std::copy_n( + image_embeds.data(), + image_embeds.get_size(), + inputs_embeds.data() + offset * m_vlm_config.hidden_size + ); + offset += im_length; + } + const ov::Tensor& text_embeds = m_embedding.infer(tokens.back()); + size_t text_length = text_embeds.get_shape().at(1); + std::copy_n( + text_embeds.data(), + text_embeds.get_size(), + inputs_embeds.data() + offset * m_vlm_config.hidden_size + ); if (!m_is_chat_conversation) { m_image_id = 0; From f4c8bb8096a604cd09274a0d5be1738f383b781a Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Thu, 9 Jan 2025 14:49:19 +0400 Subject: [PATCH 12/28] Put resize back --- src/cpp/src/visual_language/vision_encoder.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/cpp/src/visual_language/vision_encoder.cpp b/src/cpp/src/visual_language/vision_encoder.cpp index 0ca433b992..36de524b54 100644 --- a/src/cpp/src/visual_language/vision_encoder.cpp +++ b/src/cpp/src/visual_language/vision_encoder.cpp @@ -697,8 +697,7 @@ ov::Tensor HD_transform(const ov::Tensor& uint8, size_t num_crops) { 
return padding_336(ov::Tensor{ov::element::u8, {1, new_w, new_h, 3}, dst.buf.data()}); } src = clip_image_u8{width, height, {uint8_data, uint8_data + uint8.get_size()}}; - // bilinear_resize(src, dst, new_w, new_h); - dst = src; // TODO: put resize back + bilinear_resize(src, dst, new_w, new_h); return padding_336(ov::Tensor{ov::element::u8, {1, new_h, new_w, 3}, dst.buf.data()}); } From 2d988ab5a091d89d05b4569045674e5fc13d1420 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Thu, 9 Jan 2025 15:59:39 +0400 Subject: [PATCH 13/28] clean up --- .../visual_language_chat.cpp | 22 ++--- .../src/visual_language/inputs_embedder.cpp | 86 +------------------ .../src/visual_language/vision_encoder.cpp | 36 +------- tests/python_tests/test_vlm_pipeline.py | 23 +++-- 4 files changed, 30 insertions(+), 137 deletions(-) diff --git a/samples/cpp/visual_language_chat/visual_language_chat.cpp b/samples/cpp/visual_language_chat/visual_language_chat.cpp index 186e58df9e..e426965e66 100644 --- a/samples/cpp/visual_language_chat/visual_language_chat.cpp +++ b/samples/cpp/visual_language_chat/visual_language_chat.cpp @@ -9,7 +9,7 @@ bool print_subword(std::string&& subword) { return !(std::cout << subword << std::flush); } -int main(int argc, char* argv[]) { +int main(int argc, char* argv[]) try { if (3 != argc) { throw std::runtime_error(std::string{"Usage "} + argv[0] + " "); } @@ -48,14 +48,14 @@ int main(int argc, char* argv[]) { "question:\n"; } pipe.finish_chat(); -// } catch (const std::exception& error) { -// try { -// std::cerr << error.what() << '\n'; -// } catch (const std::ios_base::failure&) {} -// return EXIT_FAILURE; -// } catch (...) { -// try { -// std::cerr << "Non-exception object thrown\n"; -// } catch (const std::ios_base::failure&) {} -// return EXIT_FAILURE; +} catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} catch (...) 
{ + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; } diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index 1f1e162127..a8ac0f119c 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -1245,91 +1245,10 @@ ov::InferRequest create_hd_feature_transformer() { auto t67 = make_shared(t66, t60); auto t68 = make_shared(NodeVector{t45, t61, t67, t37}, 0); auto t69 = make_shared(t54, t68, false); - - // t0 = opset.Parameter({'shape': [-1, 576, 1024], 'element_type': 'f32'}, # -> f32[?,576,1024] - // t1 = opset.Parameter({'shape': [], 'element_type': 'i32'}, # -> i32[] - // t2 = opset.Parameter({'shape': [], 'element_type': 'i32'}, # -> i32[] - // t3 = opset.ShapeOf([t0], {'output_type': 'i64'}, # f32[?,576,1024] -> i64[3] - // t4 = opset.Constant(model, 4, # -> i64[](0) - // t5 = opset.Constant(model, 5, # -> i64[](0) - // t6 = opset.Gather([t3, t4, t5], {'batch_dims': 0}, # i64[3], i64[], i64[] -> i64[] - // t7 = opset.Constant(model, 7, # -> i64[1]([1]) - // t8 = opset.Reshape([t6, t7], {'special_zero': False}, # i64[], i64[1] -> i64[1] - // t9 = opset.Constant(model, 9, # -> i64[](1) - // t10 = opset.Constant(model, 10, # -> i64[](0) - // t11 = opset.Gather([t3, t9, t10], {'batch_dims': 0}, # i64[3], i64[], i64[] -> i64[] - // t12 = opset.Convert([t11], {'destination_type': 'f32'}, # i64[] -> f32[] - // t13 = opset.Constant(model, 13, # -> f32[](0.5) - // t14 = opset.Power([t12, t13], {'auto_broadcast': 'numpy'}, # f32[], f32[] -> f32[] - // t15 = opset.Convert([t14], {'destination_type': 'i32'}, # f32[] -> i32[] - // t16 = opset.Convert([t15], {'destination_type': 'i64'}, # i32[] -> i64[] - // t17 = opset.Constant(model, 17, # -> i32[](0) - // t18 = opset.Unsqueeze([t16, t17], {}, # i64[], i32[] -> i64[1] - // t19 = opset.Constant(model, 19, # -> i64[1]([2]) - // t20 = opset.Constant(model, 20, # -> i64[](0) - // t21 = opset.Gather([t3, t19, t20], {'batch_dims': 0}, # i64[3], i64[1], i64[] -> i64[1] - // t22 = opset.Concat([t8, t18, t18, t21], {'axis': 0}, # i64[1], i64[1], i64[1], i64[1] -> i64[4] - // t23 = opset.Reshape([t0, t22], {'special_zero': False}, # f32[?,576,1024], i64[4] -> f32[?,24,24,1024] - // t24 = opset.Constant(model, 24, # -> i64[](2) - // t25 = opset.Divide([t16, t24], {'auto_broadcast': 'numpy', 'm_pythondiv': True}, # i64[], i64[] -> i64[] - // t26 = opset.Floor([t25], {}, # i64[] -> i64[] - // t27 = opset.Constant(model, 27, # -> i32[](0) - // t28 = opset.Unsqueeze([t26, t27], {}, # i64[], i32[] -> i64[1] - // t29 = opset.Constant(model, 29, # -> i64[1]([2]) - // t30 = opset.Constant(model, 30, # -> i64[1]([2]) - // t31 = opset.Concat([t8, t28, t29, t28, t30, t21], {'axis': 0}, # i64[1], i64[1], i64[1], i64[1], i64[1], i64[1] -> i64[6] - // t32 = opset.Reshape([t23, t31], {'special_zero': False}, # f32[?,24,24,1024], i64[6] -> f32[?,12,2,12,2,1024] - // t33 = opset.Constant(model, 33, - // t34 = opset.Transpose([t32, t33], {}, # f32[?,12,2,12,2,1024], i64[6] -> f32[?,12,12,2,2,1024] - // t35 = opset.Constant(model, 35, # -> i64[1]([-1]) - // t36 = opset.Constant(model, 36, # -> i64[1]([4]) - // t37 = opset.Multiply([t21, t36], {'auto_broadcast': 'numpy'}, # i64[1], i64[1] -> i64[1] - // t38 = opset.Concat([t8, t35, t37], {'axis': 0}, # i64[1], i64[1], i64[1] -> i64[3] - // t39 = opset.Reshape([t34, t38], {'special_zero': False}, # f32[?,12,12,2,2,1024], i64[3] -> 
f32[?,?,4096] - // t40 = opset.Multiply([t1, t2], {'auto_broadcast': 'numpy'}, # i32[], i32[] -> i32[] - // t41 = opset.Convert([t40], {'destination_type': 'i64'}, # i32[] -> i64[] - // t42 = opset.Divide([t6, t41], {'auto_broadcast': 'numpy', 'm_pythondiv': True}, # i64[], i64[] -> i64[] - // t43 = opset.Floor([t42], {}, # i64[] -> i64[] - // t44 = opset.Constant(model, 44, # -> i32[](0) - // t45 = opset.Unsqueeze([t43, t44], {}, # i64[], i32[] -> i64[1] - // t46 = opset.Convert([t1], {'destination_type': 'i64'}, # i32[] -> i64[] - // t47 = opset.Unsqueeze([t46, t44], {}, # i64[], i32[] -> i64[1] - // t48 = opset.Convert([t2], {'destination_type': 'i64'}, # i32[] -> i64[] - // t49 = opset.Unsqueeze([t48, t44], {}, # i64[], i32[] -> i64[1] - // t50 = opset.Constant(model, 50, # -> i64[1]([-1]) - // t51 = opset.Concat([t45, t47, t49, t28, t28, t50], {'axis': 0}, # i64[1], i64[1], i64[1], i64[1], i64[1], i64[1] -> i64[6] - // t52 = opset.Reshape([t39, t51], {'special_zero': False}, # f32[?,?,4096], i64[6] -> f32[?,?,?,?,?,?] - // t53 = opset.Constant(model, 53, - // t54 = opset.Transpose([t52, t53], {}, # f32[?,?,?,?,?,?], i64[6] -> f32[?,?,?,?,?,?] - // t55 = opset.Multiply([t1, t15], {'auto_broadcast': 'numpy'}, # i32[], i32[] -> i32[] - // t56 = opset.Convert([t55], {'destination_type': 'i64'}, # i32[] -> i64[] - // t57 = opset.Constant(model, 57, # -> i64[](2) - // t58 = opset.Divide([t56, t57], {'auto_broadcast': 'numpy', 'm_pythondiv': True}, # i64[], i64[] -> i64[] - // t59 = opset.Floor([t58], {}, # i64[] -> i64[] - // t60 = opset.Constant(model, 60, # -> i32[](0) - // t61 = opset.Unsqueeze([t59, t60], {}, # i64[], i32[] -> i64[1] - // t62 = opset.Multiply([t2, t15], {'auto_broadcast': 'numpy'}, # i32[], i32[] -> i32[] - // t63 = opset.Convert([t62], {'destination_type': 'i64'}, # i32[] -> i64[] - // t64 = opset.Constant(model, 64, # -> i64[](2) - // t65 = opset.Divide([t63, t64], {'auto_broadcast': 'numpy', 'm_pythondiv': True}, # i64[], i64[] -> i64[] - // t66 = opset.Floor([t65], {}, # i64[] -> i64[] - // t67 = opset.Unsqueeze([t66, t60], {}, # i64[], i32[] -> i64[1] - // t68 = opset.Concat([t45, t61, t67, t37], {'axis': 0}, # i64[1], i64[1], i64[1], i64[1] -> i64[4] - // t69 = opset.Reshape([t54, t68], {'special_zero': False}, # f32[?,?,?,?,?,?], i64[4] -> f32[?,?,?,?] 
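        // In short, the graph assembled above (t0..t69) performs the 2x2 patch merge: CLIP features of
        // shape [N, 576, 1024] are viewed as [N, 24, 24, 1024], merged 2x2 into [N, 144, 4096], and
        // regrouped per image into [num_images, h_crop * 12, w_crop * 12, 4096], where
        // N == num_images * h_crop * w_crop.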
shared_ptr model = make_shared(make_shared(t69), ParameterVector{t0, t1, t2}); - ov::InferRequest hd_feature_transformer = utils::singleton_core().compile_model( + return utils::singleton_core().compile_model( model, "CPU" ).create_infer_request(); - // hd_feature_transformer.set_input_tensor(0, ov::Tensor{f32, {1, 576, 1024}}); - // ov::Tensor h_crop = ov::Tensor{i32, {}}; - // h_crop.data()[0] = 1; - // hd_feature_transformer.set_input_tensor(1, h_crop); - // ov::Tensor w_crop = ov::Tensor{i32, {}}; - // w_crop.data()[0] = 1; - // hd_feature_transformer.set_input_tensor(2, w_crop); - // hd_feature_transformer.infer(); - // std::cout << hd_feature_transformer.get_output_tensor().get_shape() << '\n'; // [1,24,24,4096] - return hd_feature_transformer; } ov::Tensor reshape_hd_patches_2x2merge(const ov::Tensor& image_features, size_t h_crop, size_t w_crop, InferRequest& hd_feature_transformer) { @@ -1458,7 +1377,7 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { OPENVINO_ASSERT(images.empty() || m_history.empty(), "Images can only be provided for initial prompt"); std::vector images_features_proj; std::vector tokens; - if (m_history.empty()) { + if (!images.empty()) { std::stringstream images_prompt; for (const ov::Tensor& image : to_single_image_tensors(images)) { EncodedImage encoded_image = m_vision_encoder.encode(image); @@ -1467,7 +1386,6 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { ++m_image_id; } images_prompt << prompt; - std::string new_templated_chat_history; if (m_is_chat_conversation) { m_history.push_back({{"role", "user"}, {"content", images_prompt.str()}}); constexpr bool add_generation_prompt = true; diff --git a/src/cpp/src/visual_language/vision_encoder.cpp b/src/cpp/src/visual_language/vision_encoder.cpp index 36de524b54..931849916f 100644 --- a/src/cpp/src/visual_language/vision_encoder.cpp +++ b/src/cpp/src/visual_language/vision_encoder.cpp @@ -824,7 +824,7 @@ ov::Tensor pad_to_max_num_crops_tensor(const ov::Tensor& nchw, size_t max_crops) return padded; } -std::tuple get_pixel_values_phi3_v(const ov::Tensor& image, const ProcessorConfig& config) { +std::tuple get_pixel_values_phi3_v(const ov::Tensor& image, const ProcessorConfig& config) { ov::Tensor hd_image = HD_transform(image, config.phi3_v.num_crops); // TODO: this is just resize_and_pad_image() from clip.hpp. 
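    // What follows: the HD-transformed image is also resized to a single 336x336 "global" view,
    // sliced into 336x336 crops, and the global view plus crops are concatenated and padded up to
    // config.phi3_v.num_crops before being passed to the vision encoder as pixel_values.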
ImageSize image_size{hd_image.get_shape().at(2), hd_image.get_shape().at(1)}; clip_image_u8 img{hd_image.get_shape().at(2), hd_image.get_shape().at(1), {hd_image.data(), hd_image.data() + hd_image.get_size()}}; @@ -838,8 +838,7 @@ std::tuple get_pixel_values_phi3_v(const ov::Tens ov::Tensor slices = slice_image(hd_image); ov::Tensor concatenated = concatenate_batch(global_image, slices); ov::Tensor pixel_values = pad_to_max_num_crops_tensor(concatenated, config.phi3_v.num_crops); - size_t num_img_tokens = (image_size.height / INPUT_IMAGE_SIZE) * (image_size.width / INPUT_IMAGE_SIZE) * config.phi3_v.num_img_tokens + 1 + (image_size.height / INPUT_IMAGE_SIZE + 1) * size_t(std::sqrt(config.phi3_v.num_img_tokens)); - return {std::move(pixel_values), image_size, num_img_tokens}; + return {std::move(pixel_values), image_size}; } } // namespace phi3_v } // anonymous namespace @@ -955,37 +954,8 @@ EncodedImage VisionEncoder::encode_internvl(const ov::Tensor& image, const Proce } EncodedImage VisionEncoder::encode_phi3_v(const ov::Tensor& image, const ProcessorConfig& config) { - // TODO: drop num_img_tokens - const auto& [pixel_values, image_size, num_img_tokens] = phi3_v::get_pixel_values_phi3_v(image, config); - // std::cout << pixel_values.data()[3*336*336+0] << '\n'; - // std::cout << pixel_values.data()[3*336*336+1] << '\n'; - // std::cout << pixel_values.data()[3*336*336+100] << '\n'; -// -1.79226 -// -1.74847 -// -1.14993 -// 0.645675 -// 0.660273 -// 1.09823 + const auto& [pixel_values, image_size] = phi3_v::get_pixel_values_phi3_v(image, config); m_vision_encoder.set_input_tensor(pixel_values); m_vision_encoder.infer(); - // std::cout << pixel_values.get_shape() << ' ' << m_vision_encoder.get_output_tensor().get_shape() << '\n'; - // ov::Tensor out = m_vision_encoder.get_output_tensor(); - // std::cout << out.data()[576*1024 + 0] << '\n'; - // std::cout << out.data()[576*1024 + 1] << '\n'; - // std::cout << out.data()[576*1024 + 1025] << '\n'; - // std::cout << out.data()[576*1024 + 4090] << '\n'; - // std::cout << out.data()[576*1024 + 80000] << '\n'; -// [5,3,336,336] [5,576,1024] -// 0.134461 -// -0.867309 -// -0.274503 -// 1.73786 -// 0.13117 -// [5,3,336,336] [5,576,1024] -// -1.01567 -// -0.291421 -// -0.260488 -// 0.743025 -// 1.4099 return {m_vision_encoder.get_output_tensor(), image_size}; } diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py index 81c181bc54..53f678fc6a 100644 --- a/tests/python_tests/test_vlm_pipeline.py +++ b/tests/python_tests/test_vlm_pipeline.py @@ -9,17 +9,17 @@ from openvino_genai import VLMPipeline, GenerationConfig from common import get_image_by_link, get_beam_search, get_multinomial_all_parameters -def get_ov_model(cache): - model_dir = cache.mkdir("tiny-random-minicpmv-2_6") +def get_ov_model(model_id, cache): + model_dir = cache.mkdir(model_id.split('/')[-1]) if (model_dir / "openvino_language_model.xml").exists(): return model_dir - model_id = "katuni4ka/tiny-random-minicpmv-2_6" processor = transformers.AutoProcessor.from_pretrained(model_id, trust_remote_code=True) processor.tokenizer.save_pretrained(model_dir) ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(processor.tokenizer, with_detokenizer=True) openvino.save_model(ov_tokenizer, model_dir / "openvino_tokenizer.xml") openvino.save_model(ov_detokenizer, model_dir / "openvino_detokenizer.xml") model = OVModelForVisualCausalLM.from_pretrained(model_id, compile=False, device="CPU", export=True, trust_remote_code=True) + 
processor.chat_template = processor.tokenizer.chat_template # It seems that tiny-random-phi3-vision is saved incorrectly. That line works this around. processor.save_pretrained(model_dir) model.save_pretrained(model_dir) return model_dir @@ -44,12 +44,16 @@ def get_ov_model(cache): @pytest.mark.precommit @pytest.mark.nightly -def test_vlm_pipeline(cache): +@pytest.mark.parametrize("model_id", [ + "katuni4ka/tiny-random-minicpmv-2_6", + "katuni4ka/tiny-random-phi3-vision", +]) +def test_vlm_pipeline(model_id, cache): def streamer(word: str) -> bool: return False - models_path = get_ov_model(cache) - generation_config = GenerationConfig(max_new_tokens=30) + models_path = get_ov_model(model_id, cache) + generation_config = GenerationConfig(max_new_tokens=100) for links in image_links_for_testing: images = [] @@ -70,7 +74,7 @@ def streamer(word: str) -> bool: @pytest.mark.precommit @pytest.mark.nightly def test_vlm_get_tokenizer(cache): - models_path = get_ov_model(cache) + models_path = get_ov_model("katuni4ka/tiny-random-minicpmv-2_6", cache) pipe = VLMPipeline(models_path, "CPU") tokenizer = pipe.get_tokenizer() tokenizer.encode("") @@ -83,15 +87,16 @@ def test_vlm_get_tokenizer(cache): get_multinomial_all_parameters(), ]) def test_sampling(config, cache): - models_path = get_ov_model(cache) + models_path = get_ov_model("katuni4ka/tiny-random-minicpmv-2_6", cache) image = get_image_by_link(image_links[0]) pipe = VLMPipeline(models_path, "CPU") pipe.generate(prompts[0], image=image, generation_config=config) @pytest.mark.precommit +@pytest.mark.nightly def test_perf_metrics(cache): import numpy as np - models_path = get_ov_model(cache) + models_path = get_ov_model("katuni4ka/tiny-random-minicpmv-2_6", cache) images = [get_image_by_link(image_links[0])] From 6e24a25c8ab094f1b583fe7b06f2884bff20e6e4 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Thu, 9 Jan 2025 16:07:19 +0400 Subject: [PATCH 14/28] clean up --- SUPPORTED_MODELS.md | 1 + src/cpp/src/visual_language/vision_encoder.cpp | 3 +-- tests/python_tests/test_vlm_pipeline.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/SUPPORTED_MODELS.md b/SUPPORTED_MODELS.md index 71f382d529..9f404f7a1c 100644 --- a/SUPPORTED_MODELS.md +++ b/SUPPORTED_MODELS.md @@ -361,6 +361,7 @@ In addition to image generation models, `InpaintingPipeline` supports specialize Phi3VForCausalLM phi3_v + Not supported
  • microsoft/Phi-3-vision-128k-instruct
  • diff --git a/src/cpp/src/visual_language/vision_encoder.cpp b/src/cpp/src/visual_language/vision_encoder.cpp index 931849916f..c5ca2b1025 100644 --- a/src/cpp/src/visual_language/vision_encoder.cpp +++ b/src/cpp/src/visual_language/vision_encoder.cpp @@ -651,7 +651,6 @@ constexpr size_t INPUT_IMAGE_SIZE = 336; ov::Tensor padding_336(const ov::Tensor& unpadded) { ov::Shape _1ss3 = unpadded.get_shape(); size_t s1 = _1ss3.at(1), s2 = _1ss3.at(2); - // TODO: test horizontal and vertical images if (s1 < s2) { size_t tar = size_t(std::ceil(float(s1) / INPUT_IMAGE_SIZE) * INPUT_IMAGE_SIZE); size_t top_padding = (tar - s1) / 2; @@ -825,7 +824,7 @@ ov::Tensor pad_to_max_num_crops_tensor(const ov::Tensor& nchw, size_t max_crops) } std::tuple get_pixel_values_phi3_v(const ov::Tensor& image, const ProcessorConfig& config) { - ov::Tensor hd_image = HD_transform(image, config.phi3_v.num_crops); // TODO: this is just resize_and_pad_image() from clip.hpp. + ov::Tensor hd_image = HD_transform(image, config.phi3_v.num_crops); ImageSize image_size{hd_image.get_shape().at(2), hd_image.get_shape().at(1)}; clip_image_u8 img{hd_image.get_shape().at(2), hd_image.get_shape().at(1), {hd_image.data(), hd_image.data() + hd_image.get_size()}}; clip_image_u8 dst; diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py index 52345cbd06..3867806fe4 100644 --- a/tests/python_tests/test_vlm_pipeline.py +++ b/tests/python_tests/test_vlm_pipeline.py @@ -53,7 +53,7 @@ def streamer(word: str) -> bool: return False models_path = get_ov_model(model_id, cache) - generation_config = GenerationConfig(max_new_tokens=100) + generation_config = GenerationConfig(max_new_tokens=30) for links in image_links_for_testing: images = [] From 3fd78e461290d88fecaa3413079ed967a422ac16 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Thu, 9 Jan 2025 16:24:06 +0400 Subject: [PATCH 15/28] fix compilation --- src/cpp/src/visual_language/inputs_embedder.cpp | 2 +- src/cpp/src/visual_language/vision_encoder.cpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index a8ac0f119c..8755f3683c 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -1371,7 +1371,7 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { ): IInputsEmbedder(vlm_config, model_dir, device, device_config), m_image_id{0}, m_hd_feature_transformer{phi3_v::create_hd_feature_transformer()}, - m_vision_projection{utils::singleton_core().compile_model(model_dir / "openvino_vision_projection_model.xml", device).create_infer_request()} {} + m_vision_projection{utils::singleton_core().compile_model(model_dir / "openvino_vision_projection_model.xml", device, {}).create_infer_request()} {} ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) override { OPENVINO_ASSERT(images.empty() || m_history.empty(), "Images can only be provided for initial prompt"); diff --git a/src/cpp/src/visual_language/vision_encoder.cpp b/src/cpp/src/visual_language/vision_encoder.cpp index c5ca2b1025..56165e392c 100644 --- a/src/cpp/src/visual_language/vision_encoder.cpp +++ b/src/cpp/src/visual_language/vision_encoder.cpp @@ -691,11 +691,11 @@ ov::Tensor HD_transform(const ov::Tensor& uint8, size_t num_crops) { clip_image_u8 src{}, dst{}; uint8_t* uint8_data = uint8.data(); if (trans) { - src = 
clip_image_u8{height, width, {uint8_data, uint8_data + uint8.get_size()}}; + src = clip_image_u8{int(height), int(width), {uint8_data, uint8_data + uint8.get_size()}}; bilinear_resize(src, dst, new_h, new_w); return padding_336(ov::Tensor{ov::element::u8, {1, new_w, new_h, 3}, dst.buf.data()}); } - src = clip_image_u8{width, height, {uint8_data, uint8_data + uint8.get_size()}}; + src = clip_image_u8{int(width), int(height), {uint8_data, uint8_data + uint8.get_size()}}; bilinear_resize(src, dst, new_w, new_h); return padding_336(ov::Tensor{ov::element::u8, {1, new_h, new_w, 3}, dst.buf.data()}); } @@ -826,7 +826,7 @@ ov::Tensor pad_to_max_num_crops_tensor(const ov::Tensor& nchw, size_t max_crops) std::tuple get_pixel_values_phi3_v(const ov::Tensor& image, const ProcessorConfig& config) { ov::Tensor hd_image = HD_transform(image, config.phi3_v.num_crops); ImageSize image_size{hd_image.get_shape().at(2), hd_image.get_shape().at(1)}; - clip_image_u8 img{hd_image.get_shape().at(2), hd_image.get_shape().at(1), {hd_image.data(), hd_image.data() + hd_image.get_size()}}; + clip_image_u8 img{int(hd_image.get_shape().at(2)), int(hd_image.get_shape().at(1)), {hd_image.data(), hd_image.data() + hd_image.get_size()}}; clip_image_u8 dst; bicubic_resize(img, dst, INPUT_IMAGE_SIZE, INPUT_IMAGE_SIZE); ov::Tensor global_image{ov::element::u8, {1, INPUT_IMAGE_SIZE, INPUT_IMAGE_SIZE, 3}, dst.buf.data()}; From 91b170fa9f34c9556f9c0169c1b51e6c0641e53b Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Thu, 9 Jan 2025 17:39:43 +0400 Subject: [PATCH 16/28] fix prefix --- .../src/visual_language/inputs_embedder.cpp | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index 8755f3683c..bf51abe3f9 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -144,7 +144,7 @@ class InputsEmbedder::IInputsEmbedder { ), m_tokenizer(tokenizer) { } - ov::Tensor get_encoded_input_ids(const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = "") { + ov::Tensor get_encoded_input_ids(const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = "", bool add_special_tokens_for_chat = false) { ov::Tensor encoded_input_ids; if (m_is_chat_conversation) { // KV cache in model already contains prompts and answers from previous iterations. 
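            // Note on the new add_special_tokens_for_chat flag: the default (false) keeps the previous
            // behaviour for all models, while the Phi-3.5-vision path below passes true so that the
            // tokenizer's special prefix tokens are kept when re-encoding the chat history - presumably
            // the prefix issue this patch addresses.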
@@ -165,8 +165,8 @@ class InputsEmbedder::IInputsEmbedder { new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt, chat_template_fallback); } auto start_tokenizer_time = std::chrono::steady_clock::now(); - ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(false)).input_ids; - TokenizedInputs prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(false)); + ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(add_special_tokens_for_chat)).input_ids; + TokenizedInputs prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(add_special_tokens_for_chat)); // some symbols combinations can be encoded by the tokenizer in different ways // if we met sequence with such combination of symbols, we cannot correctly subtract the new history from the old history @@ -1349,7 +1349,8 @@ std::vector split_tokenize(const std::string& text, ov::genai::Token if (iter->str().empty()) { continue; } - tokenized.push_back(tokenizer.encode(*iter).input_ids); + std::string substr = *iter; + tokenized.push_back(tokenizer.encode(substr, ov::genai::add_special_tokens(true)).input_ids); } return tokenized; } @@ -1377,7 +1378,7 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { OPENVINO_ASSERT(images.empty() || m_history.empty(), "Images can only be provided for initial prompt"); std::vector images_features_proj; std::vector tokens; - if (!images.empty()) { + if (m_history.empty()) { std::stringstream images_prompt; for (const ov::Tensor& image : to_single_image_tensors(images)) { EncodedImage encoded_image = m_vision_encoder.encode(image); @@ -1394,17 +1395,18 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { m_templated_chat_history = images_prompt.str(); } auto start_tokenizer_time = std::chrono::steady_clock::now(); + ov::Tensor unmodified_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(true)).input_ids; tokens = phi3_v::split_tokenize(m_templated_chat_history, m_tokenizer); auto end_tokenizer_time = std::chrono::steady_clock::now(); metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); if (m_is_chat_conversation) { - for (const ov::Tensor& chunk : tokens) { - m_tokenized_history.insert(m_tokenized_history.end(), chunk.data(), chunk.data() + chunk.get_size()); - } + m_tokenized_history = std::vector{unmodified_tokens.data(), unmodified_tokens.data() + unmodified_tokens.get_size()}; } } else { - tokens = {get_encoded_input_ids(prompt, metrics)}; + constexpr char ignored[] = ""; + constexpr bool add_special_tokens = true; + tokens = {get_encoded_input_ids(prompt, metrics, ignored, add_special_tokens)}; } OPENVINO_ASSERT(tokens.size() - 1 == images_features_proj.size()); size_t features_length = 0; From 793e4c828feee5e1c2e7adcf2cd0ba7c2a6b8d20 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Fri, 10 Jan 2025 14:32:16 +0400 Subject: [PATCH 17/28] Add instructions to reproduce --- .../src/visual_language/inputs_embedder.cpp | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index bf51abe3f9..bae27533a3 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -1170,6 
+1170,39 @@ namespace phi3_v { // .permute(0, 1, 3, 2, 4, 5) # n_img, h_crop, 12, w_crop, 12, 4096 // .reshape(num_images, h_crop * H // 2, w_crop * H // 2, 4 * C) # n_img, h_crop*12, w_crop*12, 4096 // ) +// Obtained in the following way +// import torch +// import openvino as ov +// import numpy as np +// class Model(torch.nn.Module): +// def forward(self, image_features, h_crop, w_crop): +// """ +// image_features: (num_images*num_crops, 24*24, 1024) +// output: (num_images, h_crop*12, w_crop*12, 4096), h_crop*w_crop == num_crops +// """ +// N, L, C = image_features.shape +// num_images = N // (h_crop * w_crop) +// H = (torch.tensor(L, dtype=torch.float32)**0.5).int() +// image_features_hd = ( +// image_features.reshape(N, H, H, C) # N, 24, 24, 1024 +// .reshape(N, H // 2, 2, H // 2, 2, C) # N, 12, 2, 12, 2, 1024 +// .permute(0, 1, 3, 2, 4, 5) # N, 12, 12, 2, 2, 1024 +// .reshape(N, -1, 4 * C) # N, 144, 4096 +// .reshape(num_images, h_crop, w_crop, H // 2, H // 2, -1) # n_img, h_crop, w_crop, 12, 12, 4096 +// .permute(0, 1, 3, 2, 4, 5) # n_img, h_crop, 12, w_crop, 12, 4096 +// .reshape(num_images, h_crop * H // 2, w_crop * H // 2, 4 * C) # n_img, h_crop*12, w_crop*12, 4096 +// return {"o": image_features_hd} +// model = Model() +// example_input = {"image_features": torch.rand((4, 576, 1024), dtype=torch.float32), "h_crop": torch.tensor(2, dtype=torch.int32), "w_crop": torch.tensor(2, dtype=torch.int32)} +// ov_model = ov.convert_model(model, example_input=example_input, input=ov.PartialShape([-1, 576, 1024])) +// # ov_model.outputs[0].get_tensor().set_names({"out"}) +// ov.save_model(ov_model, "reshape_hd_patches_2x2merge.xml") +// inp = np.arange(4 * 576 * 1024).reshape([4, 576, 1024]) +// test = ov.Core().compile_model(ov_model, "CPU") +// print(ov_model) +// print(test([inp, 2, 2])["o"].flatten()) +// 2. Run https://github.com/slyalin/openvino_devtools/blob/bcd4a51b1354b24b2316ac3e1c77b2f87ae7a497/openvino_devtools/ov2py.py with the IR. +// 3. Translate the printed Python implementation to C++. ov::InferRequest create_hd_feature_transformer() { using namespace ov; using namespace element; From bab2d46d751df925c562b6373a56e0f6e0e9b89c Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Tue, 14 Jan 2025 17:38:25 +0400 Subject: [PATCH 18/28] Split get_encoded_input_ids --- .../src/visual_language/inputs_embedder.cpp | 57 ++++++++++++------- 1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index 5639f46537..448155d7a5 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -144,17 +144,8 @@ class InputsEmbedder::IInputsEmbedder { ), m_tokenizer(tokenizer) { } - ov::Tensor get_encoded_input_ids(const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = {}, bool add_special_tokens_for_chat = false) { - ov::Tensor encoded_input_ids; + std::pair apply_chat_template_tokenize(const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = {} bool add_special_tokens_for_chat = false) { if (m_is_chat_conversation) { - // KV cache in model already contains prompts and answers from previous iterations. - // So only new prompt wrapped into chat template to be sent into model. Tokenizer always returns - // token_ids = {, ...}. So if tokenizer applies only to the new prompt, - // will be inserted on every iteration. 
- // So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new prompt - // and takes only the difference between them. - // The chat history cannot be saved as already encoded tokens because generate call doesn't return token, but - // KV cache contains it. So we have to add it manually or get it by tokenization all chat history. m_history.push_back({{"role", "user"}, {"content", prompt}}); constexpr bool add_generation_prompt = true; std::string new_templated_chat_history; @@ -166,7 +157,31 @@ class InputsEmbedder::IInputsEmbedder { } auto start_tokenizer_time = std::chrono::steady_clock::now(); ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(add_special_tokens_for_chat)).input_ids; - TokenizedInputs prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(add_special_tokens_for_chat)); + ov::Tensor prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(add_special_tokens_for_chat)).input_ids; + auto end_tokenizer_time = std::chrono::steady_clock::now(); + metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); + m_templated_chat_history = std::move(new_templated_chat_history); + return {new_chat_tokens, prev_chat_tokens}; + } else { + auto start_tokenizer_time = std::chrono::steady_clock::now(); + ov::Tensor encoded_input_ids = m_tokenizer.encode(prompt).input_ids; + auto end_tokenizer_time = std::chrono::steady_clock::now(); + metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); + return {encoded_input_ids, ov::Tensor()}; + } + } + + ov::Tensor update_history(const ov::Tensor& new_chat_tokens, const ov::Tensor& prev_chat_tokens) { + if (m_is_chat_conversation) { + ov::Tensor encoded_input_ids; + // KV cache in model already contains prompts and answers from previous iterations. + // So only new prompt wrapped into chat template to be sent into model. Tokenizer always returns + // token_ids = {, ...}. So if tokenizer applies only to the new prompt, + // will be inserted on every iteration. + // So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new prompt + // and takes only the difference between them. + // The chat history cannot be saved as already encoded tokens because generate call doesn't return token, but + // KV cache contains it. So we have to add it manually or get it by tokenization all chat history. 
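            // Net effect in the common case: only the tail of new_chat_tokens that extends past the
            // trusted/previous history is returned (optionally prefixed with the last disappeared
            // token), so just the newest turn gets embedded and appended to the KV cache.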
// some symbols combinations can be encoded by the tokenizer in different ways // if we met sequence with such combination of symbols, we cannot correctly subtract the new history from the old history @@ -174,7 +189,7 @@ class InputsEmbedder::IInputsEmbedder { size_t trusted_history_length = 0; if (!m_tokenized_history.empty()) { std::set stop_tokens = {m_tokenizer.get_eos_token_id()}; - trusted_history_length = ov::genai::utils::get_first_history_difference(prev_chat_tokens.input_ids, m_tokenized_history, stop_tokens); + trusted_history_length = ov::genai::utils::get_first_history_difference(prev_chat_tokens, m_tokenized_history, stop_tokens); } if (m_tokenized_history.empty()) { @@ -200,27 +215,25 @@ class InputsEmbedder::IInputsEmbedder { new_tensor.copy_to(encoded_input_ids); } else { encoded_input_ids = utils::subtract_chat_tokenized_inputs( - {new_chat_tokens}, prev_chat_tokens + {new_chat_tokens}, {prev_chat_tokens} ).input_ids; if (m_last_disappeared_token.has_value()) encoded_input_ids = ov::genai::utils::push_front_inputs(encoded_input_ids, *m_last_disappeared_token); } - auto end_tokenizer_time = std::chrono::steady_clock::now(); - metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); - m_templated_chat_history = std::move(new_templated_chat_history); m_tokenized_history.clear(); std::copy_n(new_chat_tokens.data(), new_chat_tokens.get_size(), std::back_inserter(m_tokenized_history)); + return encoded_input_ids; } else { - auto start_tokenizer_time = std::chrono::steady_clock::now(); - encoded_input_ids = m_tokenizer.encode(prompt).input_ids; - auto end_tokenizer_time = std::chrono::steady_clock::now(); - metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); m_tokenized_history.clear(); - std::copy_n(encoded_input_ids.data(), encoded_input_ids.get_size(), std::back_inserter(m_tokenized_history)); + std::copy_n(new_chat_tokens.data(), new_chat_tokens.get_size(), std::back_inserter(m_tokenized_history)); + return new_chat_tokens; } + } - return encoded_input_ids; + ov::Tensor get_encoded_input_ids(const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = "", bool add_special_tokens_for_chat = false) { + const auto [new_chat_tokens, prev_chat_tokens] = apply_chat_template_tokenize(prompt, metrics, chat_template_fallback, add_special_tokens_for_chat); + return update_history(new_chat_tokens, prev_chat_tokens); } /** From b3ca05aeec0e8c9f88dcd14844b87e130b30019c Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Tue, 14 Jan 2025 18:12:12 +0400 Subject: [PATCH 19/28] sintax --- src/cpp/src/visual_language/inputs_embedder.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index 448155d7a5..4e417ddb34 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -144,7 +144,7 @@ class InputsEmbedder::IInputsEmbedder { ), m_tokenizer(tokenizer) { } - std::pair apply_chat_template_tokenize(const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = {} bool add_special_tokens_for_chat = false) { + std::pair apply_chat_template_tokenize(const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = {}, bool add_special_tokens_for_chat = 
false) { if (m_is_chat_conversation) { m_history.push_back({{"role", "user"}, {"content", prompt}}); constexpr bool add_generation_prompt = true; From 02d36b52a94e4ca12a2d0981d7f17df14358581b Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Wed, 15 Jan 2025 14:38:40 +0400 Subject: [PATCH 20/28] Phi-3.5-vision-instruc history --- .../visual_language_chat.cpp | 22 ++-- .../src/visual_language/inputs_embedder.cpp | 103 +++++++++++++----- 2 files changed, 87 insertions(+), 38 deletions(-) diff --git a/samples/cpp/visual_language_chat/visual_language_chat.cpp b/samples/cpp/visual_language_chat/visual_language_chat.cpp index e426965e66..186e58df9e 100644 --- a/samples/cpp/visual_language_chat/visual_language_chat.cpp +++ b/samples/cpp/visual_language_chat/visual_language_chat.cpp @@ -9,7 +9,7 @@ bool print_subword(std::string&& subword) { return !(std::cout << subword << std::flush); } -int main(int argc, char* argv[]) try { +int main(int argc, char* argv[]) { if (3 != argc) { throw std::runtime_error(std::string{"Usage "} + argv[0] + " "); } @@ -48,14 +48,14 @@ int main(int argc, char* argv[]) try { "question:\n"; } pipe.finish_chat(); -} catch (const std::exception& error) { - try { - std::cerr << error.what() << '\n'; - } catch (const std::ios_base::failure&) {} - return EXIT_FAILURE; -} catch (...) { - try { - std::cerr << "Non-exception object thrown\n"; - } catch (const std::ios_base::failure&) {} - return EXIT_FAILURE; +// } catch (const std::exception& error) { +// try { +// std::cerr << error.what() << '\n'; +// } catch (const std::ios_base::failure&) {} +// return EXIT_FAILURE; +// } catch (...) { +// try { +// std::cerr << "Non-exception object thrown\n"; +// } catch (const std::ios_base::failure&) {} +// return EXIT_FAILURE; } diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index 4e417ddb34..d52490c1b6 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -1400,6 +1400,53 @@ std::vector split_tokenize(const std::string& text, ov::genai::Token } return tokenized; } + +ov::Tensor insert_image_placeholders(const std::vector& chunks, size_t tokens_per_image) { + size_t merged_length = 0; + for (const ov::Tensor& chunk : chunks) { + merged_length += chunk.get_shape().at(1); + } + merged_length += chunks.empty() ? 
0 : (chunks.size() - 1) * tokens_per_image; + ov::Tensor merged{ov::element::i64, {1, merged_length}}; + size_t offset = 0; + int64_t image_id = -1; + for (const ov::Tensor& chunk : chunks) { + size_t length = chunk.get_shape().at(1); + std::copy_n( + chunk.data(), + length, + merged.data() + offset + ); + offset += length; + if (offset < merged_length) { + std::fill_n( + merged.data() + offset, + tokens_per_image, + image_id + ); + offset += tokens_per_image; + --image_id; + } + } + return merged; +} + +std::vector drop_image_placeholders(const ov::Tensor& tokens) { + std::vector chunks; + size_t offset = 0; + while (offset < tokens.get_shape().at(1)) { + size_t length = 0; + while (offset + length < tokens.get_shape().at(1) && tokens.data()[offset + length] >= 0) { + ++length; + } + chunks.emplace_back(ov::element::i64, ov::Shape{1, length}, tokens.data() + offset); + offset += length; + while (offset < tokens.get_shape().at(1) && tokens.data()[offset] < 0) { + ++offset; + } + } + return chunks; +} } } @@ -1423,42 +1470,44 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) override { OPENVINO_ASSERT(images.empty() || m_history.empty(), "Images can only be provided for initial prompt"); std::vector images_features_proj; - std::vector tokens; - if (m_history.empty()) { - std::stringstream images_prompt; - for (const ov::Tensor& image : to_single_image_tensors(images)) { - EncodedImage encoded_image = m_vision_encoder.encode(image); - images_features_proj.push_back(phi3_v::hd_feature_transform(encoded_image, m_hd_feature_transformer, m_vlm_config.sub_GN, m_vlm_config.glb_GN, m_vision_projection)); - images_prompt << "<|image_" << m_image_id << "|>\n"; - ++m_image_id; - } - images_prompt << prompt; - if (m_is_chat_conversation) { - m_history.push_back({{"role", "user"}, {"content", images_prompt.str()}}); - constexpr bool add_generation_prompt = true; - m_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); - } else { - m_templated_chat_history = images_prompt.str(); - } + std::stringstream images_prompt; + for (const ov::Tensor& image : to_single_image_tensors(images)) { + EncodedImage encoded_image = m_vision_encoder.encode(image); + images_features_proj.push_back(phi3_v::hd_feature_transform(encoded_image, m_hd_feature_transformer, m_vlm_config.sub_GN, m_vlm_config.glb_GN, m_vision_projection)); + images_prompt << "<|image_" << m_image_id << "|>\n"; + ++m_image_id; + } + images_prompt << prompt; + std::vector new_chat_tokens; + std::vector prev_chat_tokens; + if (m_is_chat_conversation) { + m_history.push_back({{"role", "user"}, {"content", images_prompt.str()}}); + constexpr bool add_generation_prompt = true; + std::string new_templated_chat_history; + new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); auto start_tokenizer_time = std::chrono::steady_clock::now(); - ov::Tensor unmodified_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(true)).input_ids; - tokens = phi3_v::split_tokenize(m_templated_chat_history, m_tokenizer); - + new_chat_tokens = phi3_v::split_tokenize(new_templated_chat_history, m_tokenizer); + prev_chat_tokens = phi3_v::split_tokenize(m_templated_chat_history, m_tokenizer); auto end_tokenizer_time = std::chrono::steady_clock::now(); 
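            // (The placeholder round trip used below: split_tokenize() yields one token chunk per text
            // segment around each <|image_i|> tag, insert_image_placeholders() glues the chunks together
            // with tokens_per_image negative ids per image - e.g. chunks {A, B} with tokens_per_image
            // == 2 become [A, -1, -1, B] - so the regular history-diff logic can run on a single tensor,
            // and drop_image_placeholders() then splits the result back into per-chunk tensors.)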
metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); - if (m_is_chat_conversation) { - m_tokenized_history = std::vector{unmodified_tokens.data(), unmodified_tokens.data() + unmodified_tokens.get_size()}; - } + m_templated_chat_history = std::move(new_templated_chat_history); } else { - constexpr char ignored[] = ""; - constexpr bool add_special_tokens = true; - tokens = {get_encoded_input_ids(prompt, metrics, ignored, add_special_tokens)}; + auto start_tokenizer_time = std::chrono::steady_clock::now(); + new_chat_tokens = phi3_v::split_tokenize(images_prompt.str(), m_tokenizer); + auto end_tokenizer_time = std::chrono::steady_clock::now(); + metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); } - OPENVINO_ASSERT(tokens.size() - 1 == images_features_proj.size()); + size_t tokens_per_image = images_features_proj.empty() ? 0 : images_features_proj.at(0).get_shape().at(1); + ov::Tensor new_merged_tokens = phi3_v::insert_image_placeholders(new_chat_tokens, tokens_per_image); + ov::Tensor prev_merged_tokens = phi3_v::insert_image_placeholders(prev_chat_tokens, tokens_per_image); + ov::Tensor new_tokens = update_history(new_merged_tokens, prev_merged_tokens); + std::vector tokens = phi3_v::drop_image_placeholders(new_tokens); + OPENVINO_ASSERT(tokens.size() == images_features_proj.size() + 1); size_t features_length = 0; for (size_t im_id = 0; im_id < images_features_proj.size(); ++im_id) { size_t text_length = tokens.at(im_id).get_shape().at(1); size_t im_length = images_features_proj.at(im_id).get_shape().at(1); + OPENVINO_ASSERT(im_length == tokens_per_image); features_length += text_length + im_length; } features_length += tokens.back().get_shape().at(1); From 9336dac8d348a413f54eca82888062810e05dedf Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Fri, 17 Jan 2025 12:04:43 +0400 Subject: [PATCH 21/28] Save tokens per image --- src/cpp/src/visual_language/inputs_embedder.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index d52490c1b6..4f788a6e73 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -1456,6 +1456,7 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { ov::InferRequest m_vision_projection; // Used to insert <|image_i|>\n per image (not a slice). size_t m_image_id = 1; + size_t m_tokens_per_image = 0; InputsEmbedderPhi3V( const VLMConfig& vlm_config, @@ -1497,9 +1498,11 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { auto end_tokenizer_time = std::chrono::steady_clock::now(); metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); } - size_t tokens_per_image = images_features_proj.empty() ? 
0 : images_features_proj.at(0).get_shape().at(1); - ov::Tensor new_merged_tokens = phi3_v::insert_image_placeholders(new_chat_tokens, tokens_per_image); - ov::Tensor prev_merged_tokens = phi3_v::insert_image_placeholders(prev_chat_tokens, tokens_per_image); + if (0 == m_tokens_per_image && !images_features_proj.empty()) { + m_tokens_per_image = images_features_proj.at(0).get_shape().at(1); + } + ov::Tensor new_merged_tokens = phi3_v::insert_image_placeholders(new_chat_tokens, m_tokens_per_image); + ov::Tensor prev_merged_tokens = phi3_v::insert_image_placeholders(prev_chat_tokens, m_tokens_per_image); ov::Tensor new_tokens = update_history(new_merged_tokens, prev_merged_tokens); std::vector tokens = phi3_v::drop_image_placeholders(new_tokens); OPENVINO_ASSERT(tokens.size() == images_features_proj.size() + 1); @@ -1507,7 +1510,7 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { for (size_t im_id = 0; im_id < images_features_proj.size(); ++im_id) { size_t text_length = tokens.at(im_id).get_shape().at(1); size_t im_length = images_features_proj.at(im_id).get_shape().at(1); - OPENVINO_ASSERT(im_length == tokens_per_image); + OPENVINO_ASSERT(im_length == m_tokens_per_image); features_length += text_length + im_length; } features_length += tokens.back().get_shape().at(1); From 0652749d0dd120dc5696d7f725feaa8d66e1d95e Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Mon, 20 Jan 2025 11:20:18 +0400 Subject: [PATCH 22/28] Resolve merge conflict --- src/cpp/src/visual_language/inputs_embedder.cpp | 4 ---- thirdparty/openvino_tokenizers | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index c7ecba38cb..24e932c0fd 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -221,10 +221,6 @@ class InputsEmbedder::IInputsEmbedder { if (m_last_disappeared_token.has_value()) encoded_input_ids = ov::genai::utils::push_front_inputs(encoded_input_ids, *m_last_disappeared_token); } -<<<<<<< HEAD -======= - m_templated_chat_history = std::move(new_templated_chat_history); ->>>>>>> phi-3.5-vision-instruct m_tokenized_history.clear(); std::copy_n(new_chat_tokens.data(), new_chat_tokens.get_size(), std::back_inserter(m_tokenized_history)); return encoded_input_ids; diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers index 708712d84d..d5f0abf827 160000 --- a/thirdparty/openvino_tokenizers +++ b/thirdparty/openvino_tokenizers @@ -1 +1 @@ -Subproject commit 708712d84d3201f816c5e44532c9e1b14e4d8be8 +Subproject commit d5f0abf8271f3cd8fc98d747b3e569fbeacca532 From d546486a234729edd33da607975f79554e49b2d6 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Mon, 20 Jan 2025 13:45:28 +0400 Subject: [PATCH 23/28] clean up --- .../visual_language_chat.cpp | 22 +++++++++---------- .../src/visual_language/inputs_embedder.cpp | 10 ++++----- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/samples/cpp/visual_language_chat/visual_language_chat.cpp b/samples/cpp/visual_language_chat/visual_language_chat.cpp index 186e58df9e..e426965e66 100644 --- a/samples/cpp/visual_language_chat/visual_language_chat.cpp +++ b/samples/cpp/visual_language_chat/visual_language_chat.cpp @@ -9,7 +9,7 @@ bool print_subword(std::string&& subword) { return !(std::cout << subword << std::flush); } -int main(int argc, char* argv[]) { +int main(int argc, char* argv[]) try { if (3 != argc) { throw 
std::runtime_error(std::string{"Usage "} + argv[0] + " "); } @@ -48,14 +48,14 @@ int main(int argc, char* argv[]) { "question:\n"; } pipe.finish_chat(); -// } catch (const std::exception& error) { -// try { -// std::cerr << error.what() << '\n'; -// } catch (const std::ios_base::failure&) {} -// return EXIT_FAILURE; -// } catch (...) { -// try { -// std::cerr << "Non-exception object thrown\n"; -// } catch (const std::ios_base::failure&) {} -// return EXIT_FAILURE; +} catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} catch (...) { + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; } diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index 24e932c0fd..eddb0eaa95 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -144,7 +144,7 @@ class InputsEmbedder::IInputsEmbedder { ), m_tokenizer(tokenizer) { } - std::pair apply_chat_template_tokenize(const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = {}, bool add_special_tokens_for_chat = false) { + std::pair apply_chat_template_tokenize(const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = {}) { if (m_is_chat_conversation) { m_history.push_back({{"role", "user"}, {"content", prompt}}); constexpr bool add_generation_prompt = true; @@ -156,8 +156,8 @@ class InputsEmbedder::IInputsEmbedder { new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt, chat_template_fallback); } auto start_tokenizer_time = std::chrono::steady_clock::now(); - ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(add_special_tokens_for_chat)).input_ids; - ov::Tensor prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(add_special_tokens_for_chat)).input_ids; + ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(false)).input_ids; + ov::Tensor prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(false)).input_ids; auto end_tokenizer_time = std::chrono::steady_clock::now(); metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); m_templated_chat_history = std::move(new_templated_chat_history); @@ -231,8 +231,8 @@ class InputsEmbedder::IInputsEmbedder { } } - ov::Tensor get_encoded_input_ids(const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = "", bool add_special_tokens_for_chat = false) { - const auto [new_chat_tokens, prev_chat_tokens] = apply_chat_template_tokenize(prompt, metrics, chat_template_fallback, add_special_tokens_for_chat); + ov::Tensor get_encoded_input_ids(const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = "") { + const auto [new_chat_tokens, prev_chat_tokens] = apply_chat_template_tokenize(prompt, metrics, chat_template_fallback); return update_history(new_chat_tokens, prev_chat_tokens); } From 98f73e23a7ddb04b80e74b75257167a05e0ad30e Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Mon, 20 Jan 2025 14:47:04 +0400 Subject: [PATCH 24/28] clean up --- 
src/cpp/src/visual_language/inputs_embedder.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index d36b16b6f2..66b17e5804 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -1454,7 +1454,7 @@ std::vector drop_image_placeholders(const ov::Tensor& tokens) { return chunks; } } // namespace phi3_v -} +} // anonymous namespace class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { public: From a5c63dd5f1ad87e66238152c6ac9b021e203eb43 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Tue, 21 Jan 2025 11:48:56 +0400 Subject: [PATCH 25/28] Remove comment --- src/cpp/src/visual_language/vision_encoder.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/visual_language/vision_encoder.cpp b/src/cpp/src/visual_language/vision_encoder.cpp index 520262f3f8..04ddd63145 100644 --- a/src/cpp/src/visual_language/vision_encoder.cpp +++ b/src/cpp/src/visual_language/vision_encoder.cpp @@ -992,7 +992,7 @@ ov::Tensor transpose_image_patches_qwen2vl(const ov::Tensor& reshaped_patches) { return transposed_patches; } -} // anonymous namespace +} VisionEncoder::VisionEncoder(const std::filesystem::path& model_dir, const VLMModelType model_type, const std::string& device, const ov::AnyMap device_config) : model_type(model_type) { From 0d08310ca1cc021fa984ef1207a77c9b6dc1f462 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Wed, 22 Jan 2025 19:03:10 +0400 Subject: [PATCH 26/28] Freeze mac OV_BRANCH --- .github/workflows/mac.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index f377d3e6a5..ecc73cc369 100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -17,7 +17,7 @@ concurrency: env: PYTHON_VERSION: '3.10' - OV_BRANCH: 'master' + OV_BRANCH: 7f56fcd4658c6a427111ac835e809ddd87f0cad2 OV_TARBALL: '' jobs: From 066d972f2fcb19a5657d30443347c6a84ac0c292 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Wed, 22 Jan 2025 20:05:29 +0400 Subject: [PATCH 27/28] Add notes --- SUPPORTED_MODELS.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/SUPPORTED_MODELS.md b/SUPPORTED_MODELS.md index a13d1f6d7c..82b43a2fa1 100644 --- a/SUPPORTED_MODELS.md +++ b/SUPPORTED_MODELS.md @@ -312,6 +312,7 @@ In addition to image generation models, `InpaintingPipeline` supports specialize Models LoRA support Example HuggingFace Models + Notes InternVL2 @@ -329,6 +330,7 @@ In addition to image generation models, `InpaintingPipeline` supports specialize
  • OpenGVLab/InternVL2_5-8B
+
LLaVA
@@ -339,6 +341,7 @@ In addition to image generation models, `InpaintingPipeline` supports specialize
  • llava-hf/llava-1.5-7b-hf
  • +
LLaVA-NeXT
@@ -351,6 +354,7 @@ In addition to image generation models, `InpaintingPipeline` supports specialize
  • llava-hf/llama3-llava-next-8b-hf
  • +
MiniCPMV
@@ -361,6 +365,7 @@ In addition to image generation models, `InpaintingPipeline` supports specialize
  • openbmb/MiniCPM-V-2_6
  • +
Phi3VForCausalLM
@@ -372,6 +377,7 @@ In addition to image generation models, `InpaintingPipeline` supports specialize
  • microsoft/Phi-3.5-vision-instruct
  • + GPU is not supported
Qwen2-VL
@@ -383,6 +389,7 @@ In addition to image generation models, `InpaintingPipeline` supports specialize
  • Qwen/Qwen2-VL-7B-Instruct
  • +

From 03a29fc504668f84096e13ea04d5a04d2a41041c Mon Sep 17 00:00:00 2001
From: Vladimir Zlobin
Date: Thu, 23 Jan 2025 01:13:42 +0400
Subject: [PATCH 28/28] Extend notes

---
 SUPPORTED_MODELS.md | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/SUPPORTED_MODELS.md b/SUPPORTED_MODELS.md
index 82b43a2fa1..3064fb58c1 100644
--- a/SUPPORTED_MODELS.md
+++ b/SUPPORTED_MODELS.md
@@ -377,7 +377,10 @@ In addition to image generation models, `InpaintingPipeline` supports specialize
  • microsoft/Phi-3.5-vision-instruct
  • - GPU is not supported + +
  • GPU isn't supported
  • +
  • These models' configs aren't consistent. It's required to override the default eos_token_id with the one from a tokenizer: generation_config.set_eos_token_id(pipe.get_tokenizer().get_eos_token_id()).
  • +
Qwen2-VL