Commit b082d2b: Apply comments

olpipi committed Dec 11, 2024
1 parent cf6e61b commit b082d2b
Showing 8 changed files with 30 additions and 40 deletions.
7 changes: 3 additions & 4 deletions src/cpp/src/continuous_batching_impl.cpp
@@ -400,7 +400,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_fill_prompt_log_probs(
const float * logits_data = logits.data<float>();
ov::Shape logits_shape = logits.get_shape();
OPENVINO_ASSERT(logits_shape.size() == 3);
- size_t batch_seq_len = logits_shape[1], vocab_size = logits_shape[2];
+ size_t vocab_size = logits_shape[2];
for (size_t sequence_group_id = 0, currently_processed_tokens = 0; sequence_group_id < sequence_groups.size(); ++sequence_group_id) {
SequenceGroup::Ptr sequence_group = sequence_groups[sequence_group_id];
// requests not scheduled, in decoding phase or not echoing are not processed
@@ -410,8 +410,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_fill_prompt_log_probs(

size_t num_running_sequences = sequence_group->num_running_seqs();
OPENVINO_ASSERT(num_running_sequences == 1);
- size_t actual_seq_len = sequence_group->get_num_scheduled_tokens();
- size_t padded_amount_of_processed_tokens = std::max(actual_seq_len, batch_seq_len);
+ size_t actual_seq_len = sequence_group->get_seq_len_to_sample();

const float * sequence_group_logits_data = logits_data + vocab_size * currently_processed_tokens;

@@ -454,7 +453,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_fill_prompt_log_probs(

sequence_group->append_prompt_log_prob(token_logit - max_value - log_sum);
}
- currently_processed_tokens += padded_amount_of_processed_tokens * num_running_sequences;
+ currently_processed_tokens += actual_seq_len * num_running_sequences;
// For max_new_tokens == 0, we don't reach sampling so need to notify handle separately
if(sequence_group->get_sampling_parameters().max_new_tokens == 0) {
sequence_group->notify_handle_echo_only();
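Editorial note (not part of the commit): the hunks above drop the padded batch_seq_len bookkeeping because, once logits are gathered before the lm_head MatMul, the logits tensor holds only the rows that will actually be consumed, so the running offset can advance by the un-padded actual_seq_len. The prompt log-prob appended at the end of the loop is a numerically stable log-softmax evaluated for one token id. The sketch below is a minimal standalone illustration of that formula (toy data, hypothetical helper name prompt_log_prob); it is not code from this repository.

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

// log p(token) = logit[token] - max - log(sum(exp(logit - max))), shifted by max for stability.
float prompt_log_prob(const std::vector<float>& logits_row, size_t token_id) {
    const float max_value = *std::max_element(logits_row.begin(), logits_row.end());
    float sum_exp = 0.0f;
    for (float logit : logits_row)
        sum_exp += std::exp(logit - max_value);
    return logits_row[token_id] - max_value - std::log(sum_exp);
}

int main() {
    std::vector<float> row = {1.0f, 2.0f, 0.5f, 3.0f};   // toy vocabulary of 4 tokens
    std::cout << prompt_log_prob(row, 3) << std::endl;    // log-probability of token 3
    return 0;
}
```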
4 changes: 2 additions & 2 deletions src/cpp/src/llm_pipeline.cpp
@@ -68,13 +68,13 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
auto model = core.read_model(models_path / "openvino_model.xml");
m_generation_config.adapters->set_tensor_name_prefix("base_model.model.model.");
m_adapter_controller = AdapterController(model, *m_generation_config.adapters, device); // TODO: Make the prefix name configurable
- utils::slice_matmul_statefull_model(model);
+ utils::apply_slice_before_matmul_transformation(model);
m_model_runner = core.compile_model(model, device, compile_plugin_config).create_infer_request();
} else {
auto [core_plugin_config, compile_plugin_config] = ov::genai::utils::split_core_compile_config(plugin_config);
core.set_property(core_plugin_config);
auto model = core.read_model(models_path / "openvino_model.xml");
- utils::slice_matmul_statefull_model(model);
+ utils::apply_slice_before_matmul_transformation(model);
m_model_runner = core.compile_model(model, device, compile_plugin_config).create_infer_request();
}

11 changes: 4 additions & 7 deletions src/cpp/src/model_runner.hpp
@@ -118,7 +118,7 @@ class ModelRunner {
int64_t gathering_current_index = 0;
std::vector<int64_t> gather_indice_values;
try {
- std::ignore = m_request.get_tensor("gather_indices");
+ std::ignore = m_request.get_tensor("sampled_tokens_indices");
matmul_gathering_is_required = true;
} catch (const ov::Exception&) {}

@@ -132,10 +132,7 @@ class ModelRunner {
size_t prompt_len = sequence_group->get_prompt_len();
size_t seq_len_after_gather = 0;
bool echo_output = sequence_group->get_sampling_parameters().echo;

- // spec: In case of multiple input tokens for current sequence (prompt_len > 1),
- // context_len corresponds to first token within subgroup of scheduled tokens
- size_t group_context_len = group_position_id;
+ bool sampling_is_required = sequence_group->requires_sampling();

for (size_t seq_id = 0; seq_id < num_running_sequences; ++seq_id) {
Sequence::CPtr sequence = running_sequences[seq_id];
@@ -146,7 +143,7 @@ class ModelRunner {
sequence_group->get_prompt_ids()[position_id] :
sequence->get_generated_ids()[position_id - sequence_group->get_prompt_len()];

- if (matmul_gathering_is_required) {
+ if (matmul_gathering_is_required && sampling_is_required) {
if (group_position_id + token_id >= prompt_len - 1 || echo_output) {
gather_indice_values.push_back(gathering_current_index);
seq_len_after_gather++;
@@ -189,7 +186,7 @@ class ModelRunner {
if (matmul_gathering_is_required) {
ov::Tensor gather_indices(ov::element::i64, {gather_indice_values.size()});
std::memcpy(gather_indices.data(), gather_indice_values.data(), gather_indice_values.size() * sizeof(int64_t));
m_request.set_tensor("gather_indices", gather_indices);
m_request.set_tensor("sampled_tokens_indices", gather_indices);
}

// print_tensor("input_ids", input_ids);
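Editorial note (not part of the commit): with this change the model runner fills the "sampled_tokens_indices" input only when the sequence group actually requires sampling, keeping just the rows whose logits the sampler will read: every token from the last prompt token onward, or all tokens when echo is enabled. A minimal standalone sketch of that selection rule follows (hypothetical helper name select_sampled_token_indices, toy numbers; not code from this repository).

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Out of the flat batch of scheduled tokens, keep only the rows the sampler will consume.
std::vector<int64_t> select_sampled_token_indices(size_t prompt_len,
                                                  size_t first_position,  // position of the first scheduled token
                                                  size_t num_scheduled,   // tokens scheduled this step
                                                  bool echo,
                                                  int64_t flat_offset) {  // running index into the flat batch
    std::vector<int64_t> indices;
    for (size_t t = 0; t < num_scheduled; ++t) {
        if (echo || first_position + t >= prompt_len - 1)
            indices.push_back(flat_offset + static_cast<int64_t>(t));
    }
    return indices;
}

int main() {
    // A 7-token prompt processed in one chunk: only the last prompt token's logits row is needed.
    for (int64_t idx : select_sampled_token_indices(7, 0, 7, /*echo=*/false, /*flat_offset=*/0))
        std::cout << idx << " ";   // prints "6"
    std::cout << std::endl;
    return 0;
}
```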
5 changes: 2 additions & 3 deletions src/cpp/src/sampler.cpp
@@ -746,7 +746,7 @@ SamplerOutput Sampler::sample(std::vector<SequenceGroup::Ptr> & sequence_groups,
const float * logits_data = logits.data<float>();
ov::Shape logits_shape = logits.get_shape();
OPENVINO_ASSERT(logits_shape.size() == 3);
- size_t batch_seq_len = logits_shape[1], vocab_size = logits_shape[2];
+ size_t vocab_size = logits_shape[2];

SamplerOutput sampler_output;
for (size_t sequence_group_id = 0, currently_processed_tokens = 0; sequence_group_id < sequence_groups.size(); ++sequence_group_id) {
@@ -757,7 +757,6 @@ SamplerOutput Sampler::sample(std::vector<SequenceGroup::Ptr> & sequence_groups,
size_t num_running_sequences = sequence_group->num_running_seqs();
size_t actual_seq_len = sequence_group->is_matmul_sliced() ?
sequence_group->get_seq_len_to_sample() : sequence_group->get_num_scheduled_tokens(); // points to a token which needs to be sampled
- size_t padded_amount_of_processed_tokens = std::max(actual_seq_len, batch_seq_len);
const ov::genai::GenerationConfig& sampling_params = sequence_group->get_sampling_parameters();

const auto request_id = sequence_group->get_request_id();
@@ -900,7 +899,7 @@ SamplerOutput Sampler::sample(std::vector<SequenceGroup::Ptr> & sequence_groups,
}

// accumulate a number of processed tokens
- currently_processed_tokens += padded_amount_of_processed_tokens * num_running_sequences;
+ currently_processed_tokens += actual_seq_len * num_running_sequences;
}

return sampler_output;
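Editorial note (not part of the commit): as in continuous_batching_impl.cpp, the sampler now advances its offset into the flat logits tensor by the un-padded actual_seq_len, since the gathered logits contain no padding rows. The toy sketch below (assumed single running sequence per group, made-up sizes; not code from this repository) shows how each group's slice is located.

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
    // Three sequence groups, each with one running sequence sampled this step.
    std::vector<size_t> actual_seq_len = {1, 4, 1};   // rows that reached the sampler per group
    size_t vocab_size = 32000;
    size_t currently_processed_tokens = 0;
    for (size_t group = 0; group < actual_seq_len.size(); ++group) {
        size_t num_running_sequences = 1;
        // This group's logits start at row `currently_processed_tokens` of the flat tensor.
        std::cout << "group " << group << " logits offset: "
                  << vocab_size * currently_processed_tokens << " floats" << std::endl;
        currently_processed_tokens += actual_seq_len[group] * num_running_sequences;
    }
    return 0;
}
```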
19 changes: 17 additions & 2 deletions src/cpp/src/utils.cpp
@@ -8,6 +8,7 @@

#include "openvino/op/add.hpp"
#include "openvino/op/divide.hpp"
#include "openvino/op/gather.hpp"
#include "openvino/op/multiply.hpp"
#include "openvino/op/matmul.hpp"
#include "openvino/op/slice.hpp"
@@ -235,6 +236,7 @@ ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::Token
return {new_input_ids, new_attention_mask};
}

+ namespace {
std::shared_ptr<ov::Node> find_llm_matmul(const std::shared_ptr<ov::Model>& model) {
auto last_node = model->output(0).get_node()->input_value(0).get_node_shared_ptr();
std::shared_ptr<ov::Node> matmul = std::dynamic_pointer_cast<ov::op::v0::MatMul>(last_node);
@@ -253,8 +255,9 @@ std::shared_ptr<ov::Node> find_llm_matmul(const std::shared_ptr<ov::Model>& mode
}
return matmul;
}
+ } // namespace

- void slice_matmul_statefull_model(std::shared_ptr<ov::Model> model) {
+ void apply_slice_before_matmul_transformation(std::shared_ptr<ov::Model> model) {
auto matmul = find_llm_matmul(model);
if (matmul && matmul->input(0).get_partial_shape().rank().get_length() == 3) {
auto start = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{-1});
@@ -266,11 +269,23 @@ void slice_matmul_statefull_model(std::shared_ptr<ov::Model> model) {
}
}

+ void apply_gather_before_matmul_transformation(std::shared_ptr<ov::Model> model) {
+ auto matmul = ov::genai::utils::find_llm_matmul(model);
+ if (matmul && matmul->input(0).get_partial_shape().rank().get_length() == 3) {
+ auto indices = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::PartialShape{-1});
+ indices->set_friendly_name("sampled_tokens_indices");
+ indices->output(0).get_tensor().set_names({"sampled_tokens_indices"});
+ auto axis = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{0});
+ auto gather = std::make_shared<ov::op::v8::Gather>(matmul->input_value(0), indices, axis);
+ matmul->input(0).replace_source_output(gather);
+ model->add_parameters({indices});
+ }
+ }

ov::Core singleton_core() {
static ov::Core core;
return core;
}

} // namespace utils
} // namespace genai
} // namespace ov
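Editorial note (not part of the commit): the new apply_gather_before_matmul_transformation adds an i64 "sampled_tokens_indices" Parameter and a Gather along axis 0 in front of the lm_head MatMul, so only the selected token rows are projected to the vocabulary. The sketch below builds a toy 3D-input model and applies the pass; it assumes the internal src/cpp/src/utils.hpp header is reachable in the build, and the shapes and names other than "sampled_tokens_indices" are made up.

```cpp
#include <iostream>
#include <memory>
#include <vector>

#include "openvino/core/model.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/matmul.hpp"
#include "openvino/op/parameter.hpp"
#include "utils.hpp"  // internal ov::genai::utils header (assumed available to this build)

int main() {
    // Toy hidden states [tokens, 1, 8] feeding a toy lm_head weight [8, 32].
    auto hidden = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{-1, 1, 8});
    hidden->set_friendly_name("hidden_states");
    auto weight = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{8, 32},
                                               std::vector<float>(8 * 32, 0.01f));
    auto logits = std::make_shared<ov::op::v0::MatMul>(hidden, weight);
    auto model = std::make_shared<ov::Model>(ov::OutputVector{logits}, ov::ParameterVector{hidden});

    // Insert Gather("sampled_tokens_indices") in front of the MatMul found by find_llm_matmul.
    ov::genai::utils::apply_gather_before_matmul_transformation(model);

    // The model now exposes an extra i64 input selecting which token rows reach the MatMul.
    for (const auto& param : model->get_parameters())
        std::cout << param->get_friendly_name() << std::endl;  // hidden_states, sampled_tokens_indices
    return 0;
}
```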
4 changes: 2 additions & 2 deletions src/cpp/src/utils.hpp
@@ -82,9 +82,9 @@ std::pair<ov::AnyMap, ov::AnyMap> split_core_compile_config(const ov::AnyMap& pr

ov::genai::TokenizedInputs subtract_chat_tokenized_inputs(const ov::genai::TokenizedInputs& minuend, const ov::genai::TokenizedInputs& subtrahend);

- std::shared_ptr<ov::Node> find_llm_matmul(const std::shared_ptr<ov::Model>& model);
+ void apply_slice_before_matmul_transformation(std::shared_ptr<ov::Model> model);

- void slice_matmul_statefull_model(std::shared_ptr<ov::Model> model);
+ void apply_gather_before_matmul_transformation(std::shared_ptr<ov::Model> model);

ov::Core singleton_core();

18 changes: 0 additions & 18 deletions src/cpp/src/utils/paged_attention_transformations.cpp
@@ -7,10 +7,6 @@
#include "openvino/pass/sdpa_to_paged_attention.hpp"
#include "utils.hpp"

#include "openvino/op/constant.hpp"
#include "openvino/op/gather.hpp"
#include "openvino/op/parameter.hpp"

namespace ov {
namespace genai {
namespace utils {
@@ -82,20 +78,6 @@ void apply_paged_attention_transformations(std::shared_ptr<ov::Model> model, Dev
apply_paged_attention_transformations(model, per_layer_cache_control);
set_kv_cache_type_and_shape(model, device_config);
}

- void apply_gather_before_matmul_transformation(std::shared_ptr<ov::Model> model) {
- auto matmul = ov::genai::utils::find_llm_matmul(model);
- if (matmul && matmul->input(0).get_partial_shape().rank().get_length() == 3) {
- auto indices = std::make_shared<ov::op::v0::Parameter>(ov::element::i64, ov::PartialShape{-1});
- indices->set_friendly_name("gather_indices");
- indices->output(0).get_tensor().set_names({"gather_indices"});
- auto axis = std::make_shared<ov::op::v0::Constant>(ov::element::i64, ov::Shape{1}, std::vector<int64_t>{0});
- auto gather = std::make_shared<ov::op::v8::Gather>(matmul->input_value(0), indices, axis);
- matmul->input(0).replace_source_output(gather);
- model->add_parameters({indices});
- }
- }

} // namespace utils
} // namespace genai
} // namespace ov
2 changes: 0 additions & 2 deletions src/cpp/src/utils/paged_attention_transformations.hpp
@@ -23,8 +23,6 @@ void apply_paged_attention_transformations(std::shared_ptr<ov::Model> model, Dev

void apply_paged_attention_transformations(std::shared_ptr<ov::Model> model, bool per_layer_cache_control = false);

- void apply_gather_before_matmul_transformation(std::shared_ptr<ov::Model> model);

size_t get_kv_cache_size(const std::shared_ptr<ov::Model> model);

void set_kv_cache_type_and_shape(std::shared_ptr<ov::Model> model, DeviceConfig& device_config);