From 266c4b6ee37a5f3cf8bb0f578e05ea253ff7ebf5 Mon Sep 17 00:00:00 2001
From: sbalandi
Date: Wed, 11 Dec 2024 19:23:57 +0000
Subject: [PATCH] Apply comments

---
 src/cpp/src/llm_pipeline.cpp                  | 98 ++++++++++---------
 src/cpp/src/utils.cpp                         | 14 ++-
 src/cpp/src/utils.hpp                         |  2 +-
 .../src/visual_language/inputs_embedder.cpp   |  6 +-
 src/cpp/src/visual_language/pipeline.cpp      |  4 +-
 5 files changed, 71 insertions(+), 53 deletions(-)

diff --git a/src/cpp/src/llm_pipeline.cpp b/src/cpp/src/llm_pipeline.cpp
index 46c5e2cf0c..c62537bc07 100644
--- a/src/cpp/src/llm_pipeline.cpp
+++ b/src/cpp/src/llm_pipeline.cpp
@@ -43,6 +43,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
     std::string m_templated_chat_history = {};
     std::vector<int64_t> m_tokenized_chat_history;
     ov::genai::utils::GenerationChatInputsType m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
+    size_t m_to_remove_from_hist = 0;

     StatefulLLMPipeline(
         const ov::InferRequest& request,
@@ -111,6 +112,11 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
         auto start_time = std::chrono::steady_clock::now();
         GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config;

+        // If eos_token_id was not provided, take value from default m_generation_config
+        if (config.eos_token_id == -1)
+            config.set_eos_token_id(m_generation_config.eos_token_id);
+        config.validate();
+
         TokenizedInputs encoded_input;

         if (auto input_vector = std::get_if<std::vector<std::string>>(&inputs)) {
@@ -138,25 +144,34 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {

             // some symbols combinations can be encoded by the tokenizer in different ways
             // if we met sequence with such combination of symbols, we cannot correctly subtract the new history from the old history
-            // and find the difference as a prompt, so let's check it out and use the whole history in this case
+            // so let's check it out, find the trusted part and use it on the next step
+            size_t last_same_hist_token = 0;
             if (!m_tokenized_chat_history.empty()) {
-                auto stop_tokens = config.stop_token_ids;
-                // config could be reset by user and stop_tokens could be empty
-                // but model/tokenizer still will rely to eos token, so let's add it
-                stop_tokens.insert(m_tokenizer.get_eos_token_id());
-                size_t last_same_hist_token = ov::genai::utils::get_first_history_difference(prev_chat_tokens.input_ids, m_tokenized_chat_history, stop_tokens);
+                std::set<int64_t> stop_tokens = config.stop_token_ids;
+                last_same_hist_token = ov::genai::utils::get_first_history_difference(prev_chat_tokens.input_ids, m_tokenized_chat_history, stop_tokens);
                 m_trust_encoded_history = last_same_hist_token == SIZE_MAX;
             }

-            if (!m_trust_encoded_history) {
-                reset_kv_state();
-                m_selected_beam = std::nullopt;
-            }
+            if (m_tokenized_chat_history.empty()) {
+                encoded_input = new_chat_tokens;
+            } else if (last_same_hist_token != SIZE_MAX) {
+                m_to_remove_from_hist = m_tokenized_chat_history.size() - last_same_hist_token;

-            if (!m_tokenized_chat_history.empty() && m_trust_encoded_history) {
-                encoded_input = utils::subtract_chat_tokenized_inputs(new_chat_tokens, prev_chat_tokens);
+                ov::Tensor new_tensor = ov::Tensor(new_chat_tokens.input_ids.get_element_type(),
+                                                   {1, new_chat_tokens.input_ids.get_shape().at(1) - last_same_hist_token},
+                                                   new_chat_tokens.input_ids.data<int64_t>() + last_same_hist_token);
+
+                ov::Tensor new_attention_mask(ov::element::i64, new_tensor.get_shape());
+                std::fill_n(new_attention_mask.data<int64_t>(), new_tensor.get_shape()[1], 1);
+
+                encoded_input.input_ids = ov::Tensor(new_chat_tokens.input_ids.get_element_type(),
+                                                     {1, new_chat_tokens.input_ids.get_shape().at(1) - last_same_hist_token});
+                new_tensor.copy_to(encoded_input.input_ids);
+                encoded_input.attention_mask = new_attention_mask;
+
+                m_selected_beam = std::nullopt;
             } else {
-                encoded_input = new_chat_tokens;
+                encoded_input = utils::subtract_chat_tokenized_inputs(new_chat_tokens, prev_chat_tokens);
             }
             m_templated_chat_history = new_templated_chat_history;
             m_tokenized_chat_history.clear();
@@ -169,6 +184,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
                 encoded_input = m_tokenizer.encode(prompt);
             }
         }
+
         auto encode_stop_time = std::chrono::steady_clock::now();
         auto encoded_results = generate(encoded_input, config, streamer);

@@ -270,29 +286,25 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
                         "(input_ids, attention_mask, position_ids, beam_idx) "
                         "but you have '" + std::to_string(num_inputs) + "' inputs");

-        ov::Tensor tokenized_chat_history = ov::Tensor(ov::element::i64, {1, m_tokenized_chat_history.size()}, m_tokenized_chat_history.data());
+        ov::genai::utils::trim_kv_cache(m_model_runner, m_to_remove_from_hist, m_adapter_controller);
+
         size_t kv_cache_len = 0;
         ov::Tensor concatenated_attention_mask;
         if (is_chat_conversation && !m_tokenized_chat_history.empty()) {
-            if (m_trust_encoded_history) {
-                OPENVINO_ASSERT(batch_size == 1, "continuation of generation is possible only for batch 1");
-                // If history is saved in KV cache, concatenate new attention_mask with the already existing.
-                // Between subsequent runs attention_mask should not be modified.
-                auto atten_mask_history = m_model_runner.get_tensor("attention_mask");
-                auto prompt_len = attention_mask.get_shape()[1];
-                kv_cache_len = atten_mask_history.get_shape()[1];
-
-                ov::Tensor new_atten_mask = ov::Tensor{ov::element::i64, {batch_size, kv_cache_len + prompt_len}};
-                auto start_atten_hst = atten_mask_history.data<int64_t>() + kv_cache_len * (*m_selected_beam);
-                std::copy(start_atten_hst, start_atten_hst + kv_cache_len,
-                          new_atten_mask.data<int64_t>());
-                std::copy(attention_mask.data<int64_t>(), attention_mask.data<int64_t>() + prompt_len,
-                          new_atten_mask.data<int64_t>() + kv_cache_len);
-                concatenated_attention_mask = new_atten_mask;
-            } else {
-                attention_mask = ov::genai::utils::init_attention_mask(tokenized_chat_history);
-                concatenated_attention_mask = attention_mask;
-            }
+            OPENVINO_ASSERT(batch_size == 1, "continuation of generation is possible only for batch 1");
+            // If history is saved in KV cache, concatenate new attention_mask with the already existing.
+            // Between subsequent runs attention_mask should not be modified.
+            auto atten_mask_history = m_model_runner.get_tensor("attention_mask");
+            auto prompt_len = attention_mask.get_shape()[1];
+            kv_cache_len = atten_mask_history.get_shape()[1] - m_to_remove_from_hist;
+
+            ov::Tensor new_atten_mask = ov::Tensor{ov::element::i64, {batch_size, kv_cache_len + prompt_len}};
+            auto start_atten_hst = atten_mask_history.data<int64_t>() + kv_cache_len * (*m_selected_beam);
+            std::copy(start_atten_hst, start_atten_hst + kv_cache_len,
+                      new_atten_mask.data<int64_t>());
+            std::copy(attention_mask.data<int64_t>(), attention_mask.data<int64_t>() + prompt_len,
+                      new_atten_mask.data<int64_t>() + kv_cache_len);
+            concatenated_attention_mask = new_atten_mask;
         } else {
             concatenated_attention_mask = attention_mask;
         }
@@ -300,27 +312,22 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
         bool position_ids_available = (num_inputs == 4);
         std::optional<ov::Tensor> position_ids = std::nullopt;
         if (position_ids_available) {
-            if (is_chat_conversation && !m_trust_encoded_history) {
-                position_ids = ov::Tensor{ov::element::i64, tokenized_chat_history.get_shape()};
-            } else {
-                position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()};
-            }
-            utils::initialize_position_ids(*position_ids, attention_mask, kv_cache_len);
+            position_ids = ov::Tensor{ov::element::i64, input_ids.get_shape()};
+            utils::initialize_position_ids(*position_ids, attention_mask, kv_cache_len);
         }

         if(m_adapter_controller) {
             m_adapter_controller->apply(m_model_runner, config.adapters);
         }

-        auto input_tokens = input_ids;
         if (is_chat_conversation && !m_trust_encoded_history) {
-            input_tokens = tokenized_chat_history;
             m_trust_encoded_history = true;
+            m_to_remove_from_hist = 0;
         }

         ov::genai::EncodedResults result;
         if (config.is_beam_search() && is_chat_conversation) {
-            std::tie(result, m_selected_beam) = beam_search(m_model_runner, input_tokens, concatenated_attention_mask,
+            std::tie(result, m_selected_beam) = beam_search(m_model_runner, input_ids, concatenated_attention_mask,
                                                             config, position_ids, m_selected_beam);
         } else {
             std::vector<SequenceGroup::Ptr> requests;
@@ -330,6 +337,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
             for (size_t request_id = 0; request_id < batch_size; request_id++) {
                 SequenceGroup::Ptr sequence_group;
                 if (is_chat_conversation) {
+                    ov::Tensor tokenized_chat_history = ov::Tensor(ov::element::i64, {1, m_tokenized_chat_history.size()}, m_tokenized_chat_history.data());
                     sequence_group = std::make_shared<SequenceGroup>(request_id, tokenized_chat_history, config, block_size, enable_prefix_caching);
                 } else {
                     size_t seq_len = input_ids.get_shape().at(1);
@@ -345,8 +353,8 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
             }

             Sampler sampler = Sampler(m_tokenizer);
-            std::tie(result, m_selected_beam) = ov::genai::get_lm_encoded_results(m_model_runner, input_tokens, concatenated_attention_mask, streamer_ptr,
-                                                                                  sampler, requests, position_ids, std::nullopt, m_selected_beam);
+            std::tie(result, m_selected_beam) = ov::genai::get_lm_encoded_results(m_model_runner, input_ids, concatenated_attention_mask, streamer_ptr,
+                                                                                  sampler, requests, position_ids, std::nullopt, m_selected_beam);
         }

         if (is_chat_conversation) {
@@ -371,6 +379,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
         is_chat_conversation = true;
         m_selected_beam = std::nullopt;
         m_trust_encoded_history = true;
+        m_to_remove_from_hist = 0;
         m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
         if (!m_tokenized_chat_history.empty()) {
             reset_kv_state();
@@ -391,6 +400,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
         is_chat_conversation = false;
         m_selected_beam = std::nullopt;
         m_trust_encoded_history = true;
+        m_to_remove_from_hist = 0;
         m_chat_input_type = ov::genai::utils::GenerationChatInputsType::UNDEF;
         if (!m_tokenized_chat_history.empty()) {
             reset_kv_state();
diff --git a/src/cpp/src/utils.cpp b/src/cpp/src/utils.cpp
index 7f08d66a4a..ab2852b530 100644
--- a/src/cpp/src/utils.cpp
+++ b/src/cpp/src/utils.cpp
@@ -325,10 +325,17 @@ size_t get_first_history_difference(const ov::Tensor& encoded_history, const std::vector<int64_t> tokenized_history, std::set<int64_t> stop_tokens) {
     return idx;
 }

-void trim_kv_cache(ov::InferRequest request, uint64_t remove_from_end) {
+void trim_kv_cache(ov::InferRequest request, uint64_t remove_from_end, std::optional<AdapterController> adapter_controller) {
+    // nothing to trim in this case
+    if (remove_from_end == 0)
+        return;
+
     // TODO: add handling for case with LoRA adapters enabled
     auto states = request.query_state();
     for (auto& state : states) {
+        if(adapter_controller && adapter_controller->has_state_name(state.get_name()))
+            continue;
+
         ov::Tensor old_tensor = state.get_state();
         // [BATCH_SIZE, num_kv_heads, seq_len, head_size]
         auto shape = old_tensor.get_shape();
@@ -337,7 +344,10 @@ void trim_kv_cache(ov::InferRequest request, uint64_t remove_from_end) {
         ov::Coordinate new_shape_begin{0, 0, 0, 0};
         ov::Coordinate new_shape_end{shape};

-        auto new_tensor = ov::Tensor(old_tensor, new_shape_begin, new_shape_end);
+        auto trimmed_tensor = ov::Tensor(old_tensor, new_shape_begin, new_shape_end);
+
+        ov::Tensor new_tensor(old_tensor.get_element_type(), shape);
+        trimmed_tensor.copy_to(new_tensor);

         state.set_state(new_tensor);
     }
diff --git a/src/cpp/src/utils.hpp b/src/cpp/src/utils.hpp
index d984be7c23..fc52454490 100644
--- a/src/cpp/src/utils.hpp
+++ b/src/cpp/src/utils.hpp
@@ -100,7 +100,7 @@ void read_rt_info(std::shared_ptr<ov::Model>& model, const char* name, T& value)

 size_t get_first_history_difference(const ov::Tensor& encoded_history, const std::vector<int64_t> tokenized_history, std::set<int64_t> stop_tokens);

-void trim_kv_cache(ov::InferRequest request, uint64_t remove_from_end);
+void trim_kv_cache(ov::InferRequest request, uint64_t remove_from_end, std::optional<AdapterController> adapter_controller);

 } // namespace utils
 } // namespace genai
diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp
index 4da5049ba8..dfdb1521ef 100644
--- a/src/cpp/src/visual_language/inputs_embedder.cpp
+++ b/src/cpp/src/visual_language/inputs_embedder.cpp
@@ -158,8 +158,8 @@ class InputsEmbedder::IInputsEmbedder {
             new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt, chat_template_fallback);
         }
         auto start_tokenizer_time = std::chrono::steady_clock::now();
-        ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history).input_ids;
-        TokenizedInputs prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history);
+        ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(false)).input_ids;
+        TokenizedInputs prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(false));

         // some symbols combinations can be encoded by the tokenizer in different ways
         // if we met sequence with such combination of symbols, we cannot correctly subtract the new history from the old history
@@ -173,7 +173,7 @@ class InputsEmbedder::IInputsEmbedder {
         if (m_tokenized_chat_history.empty()) {
             encoded_input_ids = new_chat_tokens;
         } else if (last_same_hist_token != SIZE_MAX) {
-            m_to_remove_from_hist = m_tokenized_chat_history.size() - 1 - last_same_hist_token;
+            m_to_remove_from_hist = m_tokenized_chat_history.size() - last_same_hist_token;

             ov::Tensor new_tensor = ov::Tensor(new_chat_tokens.get_element_type(),
                                                {1, new_chat_tokens.get_shape().at(1) - last_same_hist_token},
diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp
index 5f0a6e020d..7656daba94 100644
--- a/src/cpp/src/visual_language/pipeline.cpp
+++ b/src/cpp/src/visual_language/pipeline.cpp
@@ -154,9 +154,7 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
         auto end_get_inputs_embeds = std::chrono::steady_clock::now();

         auto to_remove_from_hist = m_inputs_embedder->get_amount_to_remove_from_hist();
-        if (to_remove_from_hist > 0) {
-            ov::genai::utils::trim_kv_cache(m_language, to_remove_from_hist);
-        }
+        ov::genai::utils::trim_kv_cache(m_language, to_remove_from_hist, std::nullopt);

         Sampler sampler = Sampler(m_tokenizer);
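
Note on the trusted-history handling introduced above: the sketch below only illustrates the prefix check that get_first_history_difference() provides for these code paths. It is not the implementation from src/cpp/src/utils.cpp; the function name and signature here are illustrative, and the real helper also takes the generation config's stop_token_ids into account, which is omitted.

// Sketch: compare the freshly re-tokenized chat history with the cached token history.
// SIZE_MAX means the cached tokens are a prefix of the new encoding, so the KV cache can
// be trusted as-is; any smaller value is the length of the trusted prefix, and
// m_tokenized_chat_history.size() - result is what m_to_remove_from_hist later passes
// to trim_kv_cache().
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

#include <openvino/runtime/tensor.hpp>

size_t first_history_difference_sketch(const ov::Tensor& encoded_history,
                                       const std::vector<int64_t>& tokenized_history) {
    const int64_t* encoded = encoded_history.data<int64_t>();
    size_t encoded_len = encoded_history.get_size();
    size_t common_len = std::min(encoded_len, tokenized_history.size());
    for (size_t i = 0; i < common_len; ++i) {
        if (encoded[i] != tokenized_history[i])
            return i;  // trusted prefix ends here
    }
    // no mismatch on the shared prefix: the cached history is fully trusted only if it
    // does not extend past the new encoding
    return tokenized_history.size() <= encoded_len ? SIZE_MAX : common_len;
}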
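
Likewise, the KV-cache trimming done by the extended trim_kv_cache() can be pictured with the sketch below. It assumes, as the in-code comment above states, that each variable state holds a tensor laid out as [BATCH_SIZE, num_kv_heads, seq_len, head_size]; the LoRA-adapter skip via adapter_controller->has_state_name() is left out, and the function name is again illustrative rather than the library API.

#include <cstdint>

#include <openvino/runtime/infer_request.hpp>
#include <openvino/runtime/tensor.hpp>

void trim_kv_cache_sketch(ov::InferRequest& request, uint64_t remove_from_end) {
    if (remove_from_end == 0)
        return;  // nothing to trim

    auto states = request.query_state();
    for (auto& state : states) {
        ov::Tensor old_tensor = state.get_state();
        auto shape = old_tensor.get_shape();  // [batch, num_kv_heads, seq_len, head_size]
        shape[2] -= remove_from_end;          // drop the untrusted tail along the seq_len axis

        ov::Coordinate begin{0, 0, 0, 0};
        ov::Coordinate end{shape};
        ov::Tensor trimmed(old_tensor, begin, end);  // ROI view over the kept region

        // Materialize the view into a fresh contiguous tensor before writing it back,
        // mirroring the trimmed_tensor.copy_to(new_tensor) change in the hunk above,
        // which replaced passing the ROI tensor to set_state() directly.
        ov::Tensor new_tensor(old_tensor.get_element_type(), shape);
        trimmed.copy_to(new_tensor);
        state.set_state(new_tensor);
    }
}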