Fill prompt for sampler analysis with real tokens in VLM pipeline
sbalandi committed Dec 12, 2024
1 parent 266c4b6 commit 2b26160
Showing 5 changed files with 42 additions and 11 deletions.
8 changes: 8 additions & 0 deletions src/cpp/src/utils.cpp
@@ -353,6 +353,14 @@ void trim_kv_cache(ov::InferRequest request, uint64_t remove_from_end, std::opti
}
}

ov::Tensor push_front_inputs(const ov::Tensor& base_tensor, std::vector<int64_t> add_to_front) {
ov::Tensor new_tensor = ov::Tensor{ov::element::i64, {base_tensor.get_shape().at(0), base_tensor.get_shape().at(1) + add_to_front.size()}};
auto new_tensor_data = new_tensor.data<int64_t>();
std::copy(add_to_front.begin(), add_to_front.end(), new_tensor_data);
std::copy_n(base_tensor.data<int64_t>(), base_tensor.get_size(), new_tensor_data + add_to_front.size());
return new_tensor;
}

} // namespace utils
} // namespace genai
} // namespace ov
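
For orientation, the new push_front_inputs helper simply allocates a wider i64 tensor and copies the prepended tokens in front of the existing input ids. Below is a minimal sketch of how it behaves, assuming the OpenVINO runtime headers are available; the main() harness and the concrete token values are illustrative and not part of the commit.

#include <openvino/runtime/tensor.hpp>

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Copied from the commit: prepend add_to_front before the existing token ids.
ov::Tensor push_front_inputs(const ov::Tensor& base_tensor, std::vector<int64_t> add_to_front) {
    ov::Tensor new_tensor = ov::Tensor{ov::element::i64, {base_tensor.get_shape().at(0), base_tensor.get_shape().at(1) + add_to_front.size()}};
    auto new_tensor_data = new_tensor.data<int64_t>();
    std::copy(add_to_front.begin(), add_to_front.end(), new_tensor_data);
    std::copy_n(base_tensor.data<int64_t>(), base_tensor.get_size(), new_tensor_data + add_to_front.size());
    return new_tensor;
}

int main() {
    // A 1 x 3 batch of token ids, e.g. the freshly tokenized user turn.
    ov::Tensor input_ids{ov::element::i64, {1, 3}};
    const int64_t ids[] = {101, 102, 103};
    std::copy_n(ids, 3, input_ids.data<int64_t>());

    // Prepend the token that was sampled last turn but never made it into the KV cache.
    ov::Tensor extended = push_front_inputs(input_ids, {42});

    for (size_t i = 0; i < extended.get_size(); ++i)
        std::cout << extended.data<int64_t>()[i] << ' ';  // prints: 42 101 102 103
    std::cout << '\n';
    return 0;
}
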
2 changes: 2 additions & 0 deletions src/cpp/src/utils.hpp
@@ -102,6 +102,8 @@ size_t get_first_history_difference(const ov::Tensor& encoded_history, const std

void trim_kv_cache(ov::InferRequest request, uint64_t remove_from_end, std::optional<AdapterController> adapter_controller);

ov::Tensor push_front_inputs(const ov::Tensor& base_tensor, std::vector<int64_t> add_to_front);

} // namespace utils
} // namespace genai
} // namespace ov
22 changes: 18 additions & 4 deletions src/cpp/src/visual_language/inputs_embedder.cpp
@@ -9,6 +9,7 @@
#include "visual_language/embedding_model.hpp"

#include "utils.hpp"
#include "debug_utils.hpp"

namespace {

@@ -44,6 +45,8 @@ class InputsEmbedder::IInputsEmbedder {
// The number of elements that need to be removed from the end of the KV cache;
// the removed elements will be added back to the input ids
size_t m_to_remove_from_hist = 0;
// The tail of the previous output for the LM in chat mode is missing from the KV cache.
std::optional<int64_t> m_last_disappeared_token = std::nullopt;

public:
virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) = 0;
@@ -64,9 +67,13 @@ class InputsEmbedder::IInputsEmbedder {
return m_to_remove_from_hist;
}

void update_tokenized_chat_history(std::vector<int64_t> encoded_result) {
void update_tokenized_chat_history(std::vector<int64_t> encoded_result, bool token_will_disappeared) {
std::copy(encoded_result.begin(), encoded_result.end(), std::back_inserter(m_tokenized_chat_history));
m_to_remove_from_hist = 0;
if (token_will_disappeared)
m_last_disappeared_token = encoded_result.back();
else
m_last_disappeared_token = std::nullopt;
}

virtual void start_chat(const std::string& system_message) {
@@ -173,7 +180,8 @@ class InputsEmbedder::IInputsEmbedder {
if (m_tokenized_chat_history.empty()) {
encoded_input_ids = new_chat_tokens;
} else if (last_same_hist_token != SIZE_MAX) {
m_to_remove_from_hist = m_tokenized_chat_history.size() - last_same_hist_token;
// the KV cache is missing the last token, so we need to keep it
m_to_remove_from_hist = m_tokenized_chat_history.size() - last_same_hist_token - 1;

ov::Tensor new_tensor = ov::Tensor(new_chat_tokens.get_element_type(),
{1, new_chat_tokens.get_shape().at(1) - last_same_hist_token},
@@ -183,6 +191,12 @@ class InputsEmbedder::IInputsEmbedder {
encoded_input_ids = utils::subtract_chat_tokenized_inputs(
{new_chat_tokens}, prev_chat_tokens
).input_ids;

if (m_last_disappeared_token.has_value() && *m_last_disappeared_token == encoded_input_ids.data<int64_t>()[0]) {
m_last_disappeared_token = std::nullopt;
}

if (m_last_disappeared_token.has_value())
encoded_input_ids = ov::genai::utils::push_front_inputs(encoded_input_ids, std::vector<int64_t>{*m_last_disappeared_token});
}
auto end_tokenizer_time = std::chrono::steady_clock::now();
metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time));
@@ -1175,8 +1189,8 @@ std::vector<int64_t> InputsEmbedder::get_tokenized_chat_history() const {
return m_impl->get_tokenized_chat_history();
}

void InputsEmbedder::update_tokenized_chat_history(std::vector<int64_t> encoded_result) {
return m_impl->update_tokenized_chat_history(encoded_result);
void InputsEmbedder::update_tokenized_chat_history(std::vector<int64_t> encoded_result, bool token_will_disappeared) {
return m_impl->update_tokenized_chat_history(encoded_result, token_will_disappeared);
}

size_t InputsEmbedder::get_amount_to_remove_from_hist() const {
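
Why can a token "disappear"? When generation stops on the length limit, the last sampled token is returned to the caller and appended to the tokenized chat history, but the language model never runs a forward pass on it, so it is absent from the KV cache and the next turn has to re-feed it. The sketch below is a simplified, standard-C++ model of the bookkeeping above; the ChatHistory struct and its member names are illustrative stand-ins, not the library's API.

#include <cstdint>
#include <iostream>
#include <optional>
#include <vector>

// Simplified model of the history bookkeeping in IInputsEmbedder.
struct ChatHistory {
    std::vector<int64_t> tokenized_history;
    std::optional<int64_t> last_disappeared_token;
    size_t to_remove_from_hist = 0;

    // Mirrors update_tokenized_chat_history(): remember the tail token when it
    // was returned to the user but never fed through the model (e.g. stop on LENGTH).
    void update(const std::vector<int64_t>& encoded_result, bool last_token_missing_from_kv_cache) {
        tokenized_history.insert(tokenized_history.end(), encoded_result.begin(), encoded_result.end());
        to_remove_from_hist = 0;
        if (last_token_missing_from_kv_cache)
            last_disappeared_token = encoded_result.back();
        else
            last_disappeared_token = std::nullopt;
    }

    // Mirrors the prompt preparation: if a token is missing from the KV cache,
    // push it to the front of the next turn's input ids so it gets processed.
    std::vector<int64_t> prepare_next_inputs(std::vector<int64_t> new_turn_ids) const {
        if (last_disappeared_token.has_value())
            new_turn_ids.insert(new_turn_ids.begin(), *last_disappeared_token);
        return new_turn_ids;
    }
};

int main() {
    ChatHistory history;
    // Turn 1: generation hit the length limit, so token 777 never entered the KV cache.
    history.update({501, 502, 777}, /*last_token_missing_from_kv_cache=*/true);
    // Turn 2: the remembered token is prepended to the new prompt ids.
    for (int64_t id : history.prepare_next_inputs({601, 602}))
        std::cout << id << ' ';  // prints: 777 601 602
    std::cout << '\n';
    return 0;
}
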
6 changes: 5 additions & 1 deletion src/cpp/src/visual_language/inputs_embedder.hpp
@@ -42,15 +42,19 @@ class InputsEmbedder {

// returns tokenized chat history
std::vector<int64_t> get_tokenized_chat_history() const;

// adds new results to the tokenized chat history
void update_tokenized_chat_history(std::vector<int64_t> encoded_result);
void update_tokenized_chat_history(std::vector<int64_t> encoded_result, bool token_will_disappeared);

// returns the number of elements that need to be removed from the end of the KV cache
size_t get_amount_to_remove_from_hist() const;

// starts chat and adds optional system_message to chat history
void start_chat(const std::string& system_message);

// adds currently generated text to chat history
void update_chat_history(const std::string& decoded_results);

// finishes chat and clears the chat history
void finish_chat();
private:
15 changes: 9 additions & 6 deletions src/cpp/src/visual_language/pipeline.cpp
@@ -17,6 +17,8 @@
#include "utils.hpp"
#include "lm_encoding.hpp"

#include "debug_utils.hpp"

using namespace ov::genai;

namespace {
@@ -156,19 +158,18 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
auto to_remove_from_hist = m_inputs_embedder->get_amount_to_remove_from_hist();
ov::genai::utils::trim_kv_cache(m_language, to_remove_from_hist, std::nullopt);

Sampler sampler = Sampler(m_tokenizer);

std::vector<SequenceGroup::Ptr> requests;
size_t request_id = 0;
size_t block_size = 1; // not used
bool enable_prefix_caching = false;

auto tokenized_chat_history = m_inputs_embedder->get_tokenized_chat_history();
size_t history_size = m_language.get_tensor("attention_mask").get_shape().at(1) - to_remove_from_hist;
size_t inputs_embeds_size = inputs_embeds.get_shape().at(1);

ov::Tensor prompt_ids(ov::element::i64, { history_size + inputs_embeds_size });
std::fill_n(prompt_ids.data<int64_t>(), prompt_ids.get_size(), 0);
auto tokenized_chat_history = m_inputs_embedder->get_tokenized_chat_history();
ov::Tensor prompt_ids(ov::element::i64, { tokenized_chat_history.size() });
std::fill_n(prompt_ids.data<int64_t>(), prompt_ids.get_size(), m_tokenizer.get_pad_token_id());
std::copy(tokenized_chat_history.begin(), tokenized_chat_history.end(), prompt_ids.data<int64_t>());

SequenceGroup::Ptr sequence_group = std::make_shared<SequenceGroup>(request_id, prompt_ids, generation_config, block_size, enable_prefix_caching);
sequence_group->set_sequence_group_ptr(sequence_group);
@@ -197,6 +198,8 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
ov::Tensor position_ids = ov::Tensor{ov::element::i64, { 1, inputs_embeds_size }};
std::iota(position_ids.data<int64_t>(), position_ids.data<int64_t>() + position_ids.get_size(), history_size);

Sampler sampler = Sampler(m_tokenizer);

ov::genai::EncodedResults encoded_result;
int32_t m_selected_beam = 0;
std::tie(encoded_result, m_selected_beam) = ov::genai::get_lm_encoded_results(m_language, inputs_embeds, new_atten_mask, streamer_ptr, sampler, requests,
@@ -236,7 +239,7 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
decoded.perf_metrics.m_evaluated = false;
decoded.perf_metrics.evaluate_statistics(generate_start_time);

m_inputs_embedder->update_tokenized_chat_history(encoded_result.tokens[0]);
m_inputs_embedder->update_tokenized_chat_history(encoded_result.tokens[0], requests[0]->get_finished_sequences()[0]->get_finish_reason() == GenerationFinishReason::LENGTH);

return decoded;
}
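
The pipeline change matches the commit title: the prompt handed to the sampler for analysis used to be a zero-filled tensor sized from the KV-cache length plus the new embeddings, whereas now prompt_ids is sized from the tokenized chat history, pre-filled with the pad token, and then overwritten with the real token ids. A small standard-C++ sketch of that fill-then-copy step follows; the pad id and token values are made up, and the SequenceGroup/Sampler wiring is omitted.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    // Stand-ins for m_tokenizer.get_pad_token_id() and the tokenized chat history.
    const int64_t pad_token_id = 0;
    const std::vector<int64_t> tokenized_chat_history = {15, 27, 3, 901, 88};

    // Mirrors the pipeline change: size prompt_ids from the history, fill it with
    // the pad token as a safety net, then copy the real tokens over the padding.
    std::vector<int64_t> prompt_ids(tokenized_chat_history.size());
    std::fill_n(prompt_ids.begin(), prompt_ids.size(), pad_token_id);
    std::copy(tokenized_chat_history.begin(), tokenized_chat_history.end(), prompt_ids.begin());

    // The sampler now sees real token ids instead of a zero-filled prompt.
    for (int64_t id : prompt_ids)
        std::cout << id << ' ';  // prints: 15 27 3 901 88
    std::cout << '\n';
    return 0;
}
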
