Fill prompt for sampler analysis with real tokens in VLM pipeline
sbalandi committed Dec 12, 2024
1 parent 266c4b6 commit 2b26160
Showing 5 changed files with 42 additions and 11 deletions.
8 changes: 8 additions & 0 deletions src/cpp/src/utils.cpp
@@ -353,6 +353,14 @@ void trim_kv_cache(ov::InferRequest request, uint64_t remove_from_end, std::opti
}
}

ov::Tensor push_front_inputs(const ov::Tensor& base_tensor, std::vector<int64_t> add_to_front) {
ov::Tensor new_tensor = ov::Tensor{ov::element::i64, {base_tensor.get_shape().at(0), base_tensor.get_shape().at(1) + add_to_front.size()}};
auto new_tensor_data = new_tensor.data<int64_t>();
std::copy(add_to_front.begin(), add_to_front.end(), new_tensor_data);
std::copy_n(base_tensor.data<int64_t>(), base_tensor.get_size(), new_tensor_data + add_to_front.size());
return new_tensor;
}

} // namespace utils
} // namespace genai
} // namespace ov
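
For orientation, the new push_front_inputs helper simply allocates a wider i64 tensor and copies the prepended tokens in front of the existing input ids. Below is a minimal sketch of how it behaves, assuming the OpenVINO runtime headers are available; the main() harness and the concrete token values are illustrative and not part of the commit.

#include <openvino/runtime/tensor.hpp>

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Copied from the commit: prepend add_to_front before the existing token ids.
ov::Tensor push_front_inputs(const ov::Tensor& base_tensor, std::vector<int64_t> add_to_front) {
    ov::Tensor new_tensor = ov::Tensor{ov::element::i64, {base_tensor.get_shape().at(0), base_tensor.get_shape().at(1) + add_to_front.size()}};
    auto new_tensor_data = new_tensor.data<int64_t>();
    std::copy(add_to_front.begin(), add_to_front.end(), new_tensor_data);
    std::copy_n(base_tensor.data<int64_t>(), base_tensor.get_size(), new_tensor_data + add_to_front.size());
    return new_tensor;
}

int main() {
    // A 1 x 3 batch of token ids, e.g. the freshly tokenized user turn.
    ov::Tensor input_ids{ov::element::i64, {1, 3}};
    const int64_t ids[] = {101, 102, 103};
    std::copy_n(ids, 3, input_ids.data<int64_t>());

    // Prepend the token that was sampled last turn but never made it into the KV cache.
    ov::Tensor extended = push_front_inputs(input_ids, {42});

    for (size_t i = 0; i < extended.get_size(); ++i)
        std::cout << extended.data<int64_t>()[i] << ' ';  // prints: 42 101 102 103
    std::cout << '\n';
    return 0;
}
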
2 changes: 2 additions & 0 deletions src/cpp/src/utils.hpp
@@ -102,6 +102,8 @@ size_t get_first_history_difference(const ov::Tensor& encoded_history, const std

void trim_kv_cache(ov::InferRequest request, uint64_t remove_from_end, std::optional<AdapterController> adapter_controller);

ov::Tensor push_front_inputs(const ov::Tensor& base_tensor, std::vector<int64_t> add_to_front);

} // namespace utils
} // namespace genai
} // namespace ov
22 changes: 18 additions & 4 deletions src/cpp/src/visual_language/inputs_embedder.cpp
@@ -9,6 +9,7 @@
#include "visual_language/embedding_model.hpp"

#include "utils.hpp"
#include "debug_utils.hpp"

namespace {

@@ -44,6 +45,8 @@ class InputsEmbedder::IInputsEmbedder {
// The number of elements that need to be removed from the end of the KV cache;
// the removed elements will be added back to the input ids
size_t m_to_remove_from_hist = 0;
// The tail of the previous output for the LM in chat mode is missing from the KV cache.
std::optional<int64_t> m_last_disappeared_token = std::nullopt;

public:
virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics) = 0;
@@ -64,9 +67,13 @@ class InputsEmbedder::IInputsEmbedder {
return m_to_remove_from_hist;
}

void update_tokenized_chat_history(std::vector<int64_t> encoded_result) {
void update_tokenized_chat_history(std::vector<int64_t> encoded_result, bool token_will_disappeared) {
std::copy(encoded_result.begin(), encoded_result.end(), std::back_inserter(m_tokenized_chat_history));
m_to_remove_from_hist = 0;
if (token_will_disappeared)
m_last_disappeared_token = encoded_result.back();
else
m_last_disappeared_token = std::nullopt;
}

virtual void start_chat(const std::string& system_message) {
@@ -173,7 +180,8 @@ class InputsEmbedder::IInputsEmbedder {
if (m_tokenized_chat_history.empty()) {
encoded_input_ids = new_chat_tokens;
} else if (last_same_hist_token != SIZE_MAX) {
m_to_remove_from_hist = m_tokenized_chat_history.size() - last_same_hist_token;
// the KV cache is missing the last token, so we need to keep it
m_to_remove_from_hist = m_tokenized_chat_history.size() - last_same_hist_token - 1;

ov::Tensor new_tensor = ov::Tensor(new_chat_tokens.get_element_type(),
{1, new_chat_tokens.get_shape().at(1) - last_same_hist_token},
@@ -183,6 +191,12 @@ class InputsEmbedder::IInputsEmbedder {
encoded_input_ids = utils::subtract_chat_tokenized_inputs(
{new_chat_tokens}, prev_chat_tokens
).input_ids;

if (m_last_disappeared_token.has_value() && *m_last_disappeared_token == encoded_input_ids.data<int64_t>()[0]) {
m_last_disappeared_token = std::nullopt;
}

if (m_last_disappeared_token.has_value())
encoded_input_ids = ov::genai::utils::push_front_inputs(encoded_input_ids, std::vector<int64_t>{*m_last_disappeared_token});
}
auto end_tokenizer_time = std::chrono::steady_clock::now();
metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time));
@@ -1175,8 +1189,8 @@ std::vector<int64_t> InputsEmbedder::get_tokenized_chat_history() const {
return m_impl->get_tokenized_chat_history();
}

void InputsEmbedder::update_tokenized_chat_history(std::vector<int64_t> encoded_result) {
return m_impl->update_tokenized_chat_history(encoded_result);
void InputsEmbedder::update_tokenized_chat_history(std::vector<int64_t> encoded_result, bool token_will_disappeared) {
return m_impl->update_tokenized_chat_history(encoded_result, token_will_disappeared);
}

size_t InputsEmbedder::get_amount_to_remove_from_hist() const {
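
Why can a token "disappear"? When generation stops on the length limit, the last sampled token is returned to the caller and appended to the tokenized chat history, but the language model never runs a forward pass on it, so it is absent from the KV cache and the next turn has to re-feed it. The sketch below is a simplified, standard-C++ model of the bookkeeping above; the ChatHistory struct and its member names are illustrative stand-ins, not the library's API.

#include <cstdint>
#include <iostream>
#include <optional>
#include <vector>

// Simplified model of the history bookkeeping in IInputsEmbedder.
struct ChatHistory {
    std::vector<int64_t> tokenized_history;
    std::optional<int64_t> last_disappeared_token;
    size_t to_remove_from_hist = 0;

    // Mirrors update_tokenized_chat_history(): remember the tail token when it
    // was returned to the user but never fed through the model (e.g. stop on LENGTH).
    void update(const std::vector<int64_t>& encoded_result, bool last_token_missing_from_kv_cache) {
        tokenized_history.insert(tokenized_history.end(), encoded_result.begin(), encoded_result.end());
        to_remove_from_hist = 0;
        if (last_token_missing_from_kv_cache)
            last_disappeared_token = encoded_result.back();
        else
            last_disappeared_token = std::nullopt;
    }

    // Mirrors the prompt preparation: if a token is missing from the KV cache,
    // push it to the front of the next turn's input ids so it gets processed.
    std::vector<int64_t> prepare_next_inputs(std::vector<int64_t> new_turn_ids) const {
        if (last_disappeared_token.has_value())
            new_turn_ids.insert(new_turn_ids.begin(), *last_disappeared_token);
        return new_turn_ids;
    }
};

int main() {
    ChatHistory history;
    // Turn 1: generation hit the length limit, so token 777 never entered the KV cache.
    history.update({501, 502, 777}, /*last_token_missing_from_kv_cache=*/true);
    // Turn 2: the remembered token is prepended to the new prompt ids.
    for (int64_t id : history.prepare_next_inputs({601, 602}))
        std::cout << id << ' ';  // prints: 777 601 602
    std::cout << '\n';
    return 0;
}
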
6 changes: 5 additions & 1 deletion src/cpp/src/visual_language/inputs_embedder.hpp
@@ -42,15 +42,19 @@ class InputsEmbedder {

// returns tokenized chat history
std::vector<int64_t> get_tokenized_chat_history() const;

// adds new results to the tokenized chat history
void update_tokenized_chat_history(std::vector<int64_t> encoded_result);
void update_tokenized_chat_history(std::vector<int64_t> encoded_result, bool token_will_disappeared);

// returns the number of elements that need to be removed from the end of the KV cache
size_t get_amount_to_remove_from_hist() const;

// starts chat and adds optional system_message to chat history
void start_chat(const std::string& system_message);

// adds currently generated text to chat history
void update_chat_history(const std::string& decoded_results);

// finishes chat and clears the chat history
void finish_chat();
private:
15 changes: 9 additions & 6 deletions src/cpp/src/visual_language/pipeline.cpp
@@ -17,6 +17,8 @@
#include "utils.hpp"
#include "lm_encoding.hpp"

#include "debug_utils.hpp"

using namespace ov::genai;

namespace {
@@ -156,19 +158,18 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
auto to_remove_from_hist = m_inputs_embedder->get_amount_to_remove_from_hist();
ov::genai::utils::trim_kv_cache(m_language, to_remove_from_hist, std::nullopt);

Sampler sampler = Sampler(m_tokenizer);

std::vector<SequenceGroup::Ptr> requests;
size_t request_id = 0;
size_t block_size = 1; // not used
bool enable_prefix_caching = false;

auto tokenized_chat_history = m_inputs_embedder->get_tokenized_chat_history();
size_t history_size = m_language.get_tensor("attention_mask").get_shape().at(1) - to_remove_from_hist;
size_t inputs_embeds_size = inputs_embeds.get_shape().at(1);

ov::Tensor prompt_ids(ov::element::i64, { history_size + inputs_embeds_size });
std::fill_n(prompt_ids.data<int64_t>(), prompt_ids.get_size(), 0);
auto tokenized_chat_history = m_inputs_embedder->get_tokenized_chat_history();
ov::Tensor prompt_ids(ov::element::i64, { tokenized_chat_history.size() });
std::fill_n(prompt_ids.data<int64_t>(), prompt_ids.get_size(), m_tokenizer.get_pad_token_id());
std::copy(tokenized_chat_history.begin(), tokenized_chat_history.end(), prompt_ids.data<int64_t>());

SequenceGroup::Ptr sequence_group = std::make_shared<SequenceGroup>(request_id, prompt_ids, generation_config, block_size, enable_prefix_caching);
sequence_group->set_sequence_group_ptr(sequence_group);
@@ -197,6 +198,8 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
ov::Tensor position_ids = ov::Tensor{ov::element::i64, { 1, inputs_embeds_size }};
std::iota(position_ids.data<int64_t>(), position_ids.data<int64_t>() + position_ids.get_size(), history_size);

Sampler sampler = Sampler(m_tokenizer);

ov::genai::EncodedResults encoded_result;
int32_t m_selected_beam = 0;
std::tie(encoded_result, m_selected_beam) = ov::genai::get_lm_encoded_results(m_language, inputs_embeds, new_atten_mask, streamer_ptr, sampler, requests,
@@ -236,7 +239,7 @@ class ov::genai::VLMPipeline::VLMPipelineImpl {
decoded.perf_metrics.m_evaluated = false;
decoded.perf_metrics.evaluate_statistics(generate_start_time);

m_inputs_embedder->update_tokenized_chat_history(encoded_result.tokens[0]);
m_inputs_embedder->update_tokenized_chat_history(encoded_result.tokens[0], requests[0]->get_finished_sequences()[0]->get_finish_reason() == GenerationFinishReason::LENGTH);

return decoded;
}
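
The pipeline change matches the commit title: the prompt handed to the sampler for analysis used to be a zero-filled tensor sized from the KV-cache length plus the new embeddings, whereas now prompt_ids is sized from the tokenized chat history, pre-filled with the pad token, and then overwritten with the real token ids. A small standard-C++ sketch of that fill-then-copy step follows; the pad id and token values are made up, and the SequenceGroup/Sampler wiring is omitted.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    // Stand-ins for m_tokenizer.get_pad_token_id() and the tokenized chat history.
    const int64_t pad_token_id = 0;
    const std::vector<int64_t> tokenized_chat_history = {15, 27, 3, 901, 88};

    // Mirrors the pipeline change: size prompt_ids from the history, fill it with
    // the pad token as a safety net, then copy the real tokens over the padding.
    std::vector<int64_t> prompt_ids(tokenized_chat_history.size());
    std::fill_n(prompt_ids.begin(), prompt_ids.size(), pad_token_id);
    std::copy(tokenized_chat_history.begin(), tokenized_chat_history.end(), prompt_ids.begin());

    // The sampler now sees real token ids instead of a zero-filled prompt.
    for (int64_t id : prompt_ids)
        std::cout << id << ' ';  // prints: 15 27 3 901 88
    std::cout << '\n';
    return 0;
}
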
