Skip to content

Commit

Permalink
Fix
Browse files Browse the repository at this point in the history
  • Loading branch information
ilya-lavrenov committed Jan 27, 2025
1 parent cc8ea52 commit 34ecb9b
Show file tree
Hide file tree
Showing 8 changed files with 29 additions and 26 deletions.
4 changes: 2 additions & 2 deletions .github/labeler.yml
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,8 @@
- 'src/cpp/src/generation_handle.cpp'
- 'src/cpp/src/generation_stream.hpp'
- 'src/cpp/src/model_runner.hpp'
- 'src/cpp/src/utils/paged_attention_transformations.cpp'
- 'src/cpp/src/utils/paged_attention_transformations.hpp'
- 'src/cpp/src/paged_attention_transformations.cpp'
- 'src/cpp/src/paged_attention_transformations.hpp'
- 'src/cpp/src/scheduler.hpp'
- 'src/cpp/src/sequence_group.cpp'
- 'src/cpp/src/sequence_group.hpp'
Expand Down
25 changes: 15 additions & 10 deletions src/cpp/src/cache_manager.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,15 +134,19 @@ class CacheManager {
for (size_t decoder_layer_id = 0; decoder_layer_id < m_num_decoder_layers; ++decoder_layer_id) {
ov::Shape value_cache_shape = set_kv_blocks(m_value_shapes[decoder_layer_id], num_kv_blocks);
ov::Shape key_cache_shape = set_kv_blocks(m_key_shapes[decoder_layer_id], num_kv_blocks);

ov::element::Type key_precision = get_key_cache_precision(decoder_layer_id);
ov::element::Type value_precision = get_value_cache_precision(decoder_layer_id);

#ifdef _WIN32
ov::Tensor key_cache(get_key_cache_precision(decoder_layer_id), key_cache_shape);
ov::Tensor value_cache(m_device_config.get_cache_precision(decoder_layer_id), value_cache_shape);
ov::Tensor key_cache(key_precision, key_cache_shape);
ov::Tensor value_cache(value_precision, value_cache_shape);
#else
auto key_size = ov::shape_size(key_cache_shape) * get_key_cache_precision(decoder_layer_id).size();
auto value_size = ov::shape_size(value_cache_shape) * get_value_cache_precision(decoder_layer_id).size();
auto key_size = ov::shape_size(key_cache_shape) * key_precision.size();
auto value_size = ov::shape_size(value_cache_shape) * value_precision.size();

ov::Tensor key_cache = ov::Tensor(get_key_cache_precision(decoder_layer_id), key_cache_shape, TensorMmapAllocator(key_size));
ov::Tensor value_cache = ov::Tensor(get_value_cache_precision(decoder_layer_id), value_cache_shape, TensorMmapAllocator(value_size));
ov::Tensor key_cache(key_precision, key_cache_shape, TensorMmapAllocator(key_size));
ov::Tensor value_cache(value_precision, value_cache_shape, TensorMmapAllocator(value_size));
#endif

auto key_cache_roi_end = static_cast<unsigned char*>(key_cache.data());
Expand Down Expand Up @@ -180,8 +184,7 @@ class CacheManager {
if (m_key_cache.size() > decoder_layer_id) {
m_key_cache[decoder_layer_id] = key_cache;
m_value_cache[decoder_layer_id] = value_cache;
}
else {
} else {
m_key_cache.emplace_back(key_cache);
m_value_cache.emplace_back(value_cache);
}
Expand All @@ -190,9 +193,11 @@ class CacheManager {
}
} else {
auto remote_context = m_request.get_compiled_model().get_context();

for (size_t decoder_layer_id = 0; decoder_layer_id < m_num_decoder_layers; ++decoder_layer_id) {
ov::Shape value_cache_shape = set_kv_blocks(m_value_shapes[decoder_layer_id], num_kv_blocks);
ov::Shape key_cache_shape = set_kv_blocks(m_key_shapes[decoder_layer_id], num_kv_blocks);

ov::Tensor key_cache = remote_context.create_tensor(get_key_cache_precision(decoder_layer_id), key_cache_shape);
ov::Tensor value_cache = remote_context.create_tensor(get_value_cache_precision(decoder_layer_id), value_cache_shape);

Expand All @@ -208,11 +213,11 @@ class CacheManager {

m_key_cache[decoder_layer_id] = key_cache;
m_value_cache[decoder_layer_id] = value_cache;
}
else {
} else {
m_key_cache.emplace_back(key_cache);
m_value_cache.emplace_back(value_cache);
}

update_request_tensor(decoder_layer_id);
}
}
Expand Down
4 changes: 3 additions & 1 deletion src/cpp/src/continuous_batching_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#include "text_callback_streamer.hpp"
#include "continuous_batching_impl.hpp"
#include "utils.hpp"
#include "utils/paged_attention_transformations.hpp"
#include "paged_attention_transformations.hpp"
#include "lora_helper.hpp"
#include "cache_state_dumper.hpp"
#include "utils.hpp"
Expand Down Expand Up @@ -86,6 +86,8 @@ void apply_kv_cache_precision(const std::shared_ptr<ov::Model>& model, const std
k->set_element_type(m_kv_cache_type);
v->set_element_type(m_kv_cache_type);
}

model->validate_nodes_and_infer_types();
}

} // namespace
Expand Down
12 changes: 4 additions & 8 deletions src/cpp/src/device_config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ class DeviceConfig {
std::vector<ov::PartialShape> m_key_cache_shape, m_value_cache_shape;
std::vector<KVHeadConfig> m_kv_heads_config;
size_t m_num_decoder_layers = 0;
size_t m_num_kv_blocks = 0, m_cache_size = 0; // KV cache sizes in either blocks or GBs
size_t m_block_size = 0; // block size is per inference device
std::string m_device;

Expand All @@ -38,14 +37,8 @@ class DeviceConfig {
DeviceConfig(const SchedulerConfig& scheduling_config, const std::string& device, const ov::AnyMap& plugin_config = {}) {
m_device = device;

// keep information about blocsk
// keep information about blocks
m_block_size = get_block_size_by_device(device);

if (scheduling_config.num_kv_blocks > 0) {
m_num_kv_blocks = scheduling_config.num_kv_blocks;
} else if (scheduling_config.cache_size > 0) {
m_cache_size = scheduling_config.cache_size;
}
}

void set_kv_head_configs(const std::vector<KVHeadConfig>& kv_heads_config) {
Expand All @@ -54,6 +47,9 @@ class DeviceConfig {
m_key_cache_shape.reserve(m_num_decoder_layers);
m_value_cache_shape.reserve(m_num_decoder_layers);

// TODO: can we move this int8 KV cache head_size patching into the plugins as well?
// E.g. the PA model is passed to the plugin as is, without setting shapes in a plugin-specific order (i.e. where the block dimension is),
// and the plugin provides the updated dimensions after compile_model
if (m_device == "CPU") {
// Scale, zero point and quantized data will be stored together.
// The layout for per token per head:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// Copyright (C) 2023-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include "utils/paged_attention_transformations.hpp"
#include "paged_attention_transformations.hpp"

#include "openvino/pass/manager.hpp"
#include "openvino/pass/sdpa_to_paged_attention.hpp"
Expand Down Expand Up @@ -69,8 +69,8 @@ void set_kv_cache_type_and_shape(std::shared_ptr<ov::Model> model, DeviceConfig&
auto v = value_cache_params[std::string("value_cache.") + std::to_string(idx)];

// allow a plugin to automatically set KV cache precisions
k->set_element_type(ov::element::undefined);
v->set_element_type(ov::element::undefined);
k->set_element_type(ov::element::dynamic);
v->set_element_type(ov::element::dynamic);

// set device specific KV cache shapes back to a PA model
k->set_partial_shape(device_config.get_key_cache_shape(idx));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@

#include "text_callback_streamer.hpp"
#include "speculative_decoding_impl.hpp"
#include "paged_attention_transformations.hpp"
#include "utils.hpp"
#include "utils/paged_attention_transformations.hpp"


namespace ov::genai {
Expand Down
2 changes: 1 addition & 1 deletion tests/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ file(GLOB src_files "${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/sequence_group.cpp"
"${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/sampler.cpp"
"${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/speculative_decoding/*.cpp"
"${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/prompt_lookup/*.cpp"
"${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/utils/*.cpp"
"${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/paged_attention_transformations.cpp"
"${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/utils.cpp"
"${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/continuous_batching*.cpp"
"${OpenVINOGenAI_SOURCE_DIR}/src/cpp/src/icontinuous_batching.cpp"
Expand Down

0 comments on commit 34ecb9b

Please sign in to comment.