diff --git a/src/cpp/src/device_config.hpp b/src/cpp/src/device_config.hpp
index cbf3fe70c5..52789ff013 100644
--- a/src/cpp/src/device_config.hpp
+++ b/src/cpp/src/device_config.hpp
@@ -10,22 +10,27 @@
 #include "openvino/genai/scheduler_config.hpp"
 
 namespace ov::genai {
+
+/**
+ * Per-layer KV cache size configuration
+ */
+struct KVHeadConfig {
+    size_t num_v_heads, num_k_heads;
+    size_t v_head_size, k_head_size;
+};
+
 class DeviceConfig {
     ov::element::Type m_kv_cache_type;
     std::vector<ov::PartialShape> m_key_cache_shape, m_value_cache_shape;
-    std::vector<size_t> m_num_kv_heads;
-    ov::Shape::value_type m_head_size, m_num_decoder_layers;
-    size_t m_num_kv_blocks = 0;
-    size_t m_block_size = 0;
-    size_t m_cache_size = 0;
+    std::vector<KVHeadConfig> m_kv_heads_config;
+    size_t m_num_decoder_layers = 0;
+    size_t m_num_kv_blocks = 0, m_cache_size = 0; // KV cache sizes in either blocks or GBs
+    size_t m_block_size = 0; // block size is per inference device
     std::string m_device;
 
     size_t get_block_size_by_device(const std::string& device) const {
-        const size_t cpu_block_size = 32;
-        const size_t gpu_block_size = 16;
-
-        bool is_gpu = device.find("GPU") != std::string::npos;
-
+        const size_t cpu_block_size = 32, gpu_block_size = 16;
+        const bool is_gpu = device.find("GPU") != std::string::npos;
         return is_gpu ? gpu_block_size : cpu_block_size;
     }
@@ -83,17 +88,14 @@ class DeviceConfig {
 
         if (scheduling_config.num_kv_blocks > 0) {
             m_num_kv_blocks = scheduling_config.num_kv_blocks;
-        }
-        else if (scheduling_config.cache_size > 0) {
+        } else if (scheduling_config.cache_size > 0) {
             m_cache_size = scheduling_config.cache_size;
         }
     }
 
-    void set_model_params(std::vector<size_t> num_kv_heads, size_t head_size, size_t num_decoder_layers) {
-        m_head_size = head_size;
-        m_num_decoder_layers = num_decoder_layers;
-
-        m_num_kv_heads.assign(num_kv_heads.begin(), num_kv_heads.end());
+    void set_kv_head_configs(std::vector<KVHeadConfig> kv_heads_config) {
+        m_kv_heads_config = kv_heads_config;
+        m_num_decoder_layers = m_kv_heads_config.size();
 
         m_key_cache_shape.reserve(m_num_decoder_layers);
         m_value_cache_shape.reserve(m_num_decoder_layers);
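Usage sketch for the new entry point (illustrative only; the model geometry below is made up and does not appear in this patch). Because each layer carries its own KVHeadConfig, key and value caches can now be sized independently per layer, which the old set_model_params(num_kv_heads, head_size, num_decoder_layers) signature could not express:

    // Hypothetical 4-layer model whose key heads are wider than its value heads.
    std::vector<ov::genai::KVHeadConfig> configs(4);
    for (auto& config : configs) {
        config.num_k_heads = 8;   config.k_head_size = 96;
        config.num_v_heads = 8;   config.v_head_size = 64;
    }
    device_config.set_kv_head_configs(configs);  // device_config: an already-constructed DeviceConfig
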
@@ -103,35 +105,37 @@ class DeviceConfig {
             // |scale(f32)|zeropoint(f32)|quantized data(u8,idx_1)|quantized data(u8,idx_2)|...|quantized data(u8,idx_head_size)|
             // so, we have to extend head_size by 8, which is sizeof(float)
             // for scale and sizeof(float) for zeropoint
-            if (m_kv_cache_type == ov::element::u8)
-                m_head_size += 8;
+            if (m_kv_cache_type == ov::element::u8) {
+                for (size_t layer_id = 0; layer_id < m_num_decoder_layers; ++layer_id) {
+                    m_kv_heads_config[layer_id].k_head_size += 8;
+                    m_kv_heads_config[layer_id].v_head_size += 8;
+                }
+            }
         }
 
         if (m_num_kv_blocks == 0 && m_cache_size > 0) {
-            size_t block_size = 0;
-            size_t size_in_bytes = m_cache_size * 1024 * 1024 * 1024;
-            for (size_t layer_id = 0; layer_id < m_num_decoder_layers; layer_id++) {
-                block_size += 2 * m_num_kv_heads[layer_id] * m_block_size * m_head_size * m_kv_cache_type.size();
-            }
-            m_num_kv_blocks = size_in_bytes / block_size;
+            size_t size_in_bytes = m_cache_size * 1024 * 1024 * 1024; // convert GBs to bytes
+            m_num_kv_blocks = size_in_bytes / get_block_size_in_bytes();
         }
 
         for (size_t layer_id = 0; layer_id < m_num_decoder_layers; layer_id++) {
+            const KVHeadConfig& config = m_kv_heads_config[layer_id];
+
             m_value_cache_shape.push_back(ov::PartialShape{ov::Dimension::dynamic(),
-                                                           ov::Dimension(m_num_kv_heads[layer_id]),
+                                                           ov::Dimension(config.num_v_heads),
                                                            ov::Dimension(m_block_size),
-                                                           ov::Dimension(m_head_size)});
+                                                           ov::Dimension(config.v_head_size)});
 
             if (m_device.find("GPU") == std::string::npos) {
                 m_key_cache_shape.push_back(ov::PartialShape{ov::Dimension::dynamic(),
-                                                             ov::Dimension(m_num_kv_heads[layer_id]),
+                                                             ov::Dimension(config.num_k_heads),
                                                              ov::Dimension(m_block_size),
-                                                             ov::Dimension(m_head_size)});
-            } else if (m_device.find("GPU") != std::string::npos) {
+                                                             ov::Dimension(config.k_head_size)});
+            } else if (m_device.find("GPU") != std::string::npos) {
+                // Update key shape, as the key's shape is different from the value's shape
                 m_key_cache_shape.push_back(ov::PartialShape{ov::Dimension::dynamic(),
-                                                             ov::Dimension(m_num_kv_heads[layer_id]),
-                                                             ov::Dimension(m_head_size),
+                                                             ov::Dimension(config.num_k_heads),
+                                                             ov::Dimension(config.k_head_size),
                                                              ov::Dimension(m_block_size)});
             }
         }
@@ -168,11 +172,13 @@ class DeviceConfig {
     }
 
     size_t get_block_size_in_bytes() const {
-        size_t block_size = 0;
+        size_t block_size_in_bytes = 0;
         for (size_t layer_id = 0; layer_id < m_num_decoder_layers; layer_id++) {
-            block_size += 2 * m_num_kv_heads[layer_id] * m_block_size * m_head_size * get_cache_precision().size();
+            const KVHeadConfig& config = m_kv_heads_config[layer_id];
+            block_size_in_bytes += config.k_head_size * config.num_k_heads + config.v_head_size * config.num_v_heads;
         }
-        return block_size;
+        block_size_in_bytes *= get_block_size() * get_cache_precision().size();
+        return block_size_in_bytes;
     }
 };
 }
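To sanity-check the refactored size math against the old inline loop: with the uniform geometry used by the tests below (12 layers, 12 key and 12 value heads, head size 64, an f16 cache, and the CPU block size of 32), the two formulations agree. The arithmetic here is derived for this review, not taken from the patch:

    // Bytes per KV block, computed the way get_block_size_in_bytes() now does it.
    constexpr size_t num_layers = 12, num_heads = 12, head_size = 64;
    constexpr size_t block_size = 32, elem_size = 2;           // f16 on CPU
    constexpr size_t elems_per_layer = head_size * num_heads   // K plane
                                     + head_size * num_heads;  // V plane
    constexpr size_t block_bytes = elems_per_layer * num_layers * block_size * elem_size;
    static_assert(block_bytes == 1179648, "~1.125 MiB per block");
    // With cache_size = 2 (GB), the cache budget then yields:
    static_assert((2ull * 1024 * 1024 * 1024) / block_bytes == 1820, "expected m_num_kv_blocks");

The old code multiplied by m_block_size inside the per-layer loop; the new code factors block size and element size out of the sum, which is equivalent because both are layer-invariant.
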
diff --git a/src/cpp/src/utils/paged_attention_transformations.cpp b/src/cpp/src/utils/paged_attention_transformations.cpp
index baef7d8dd6..0d62bb10e9 100644
--- a/src/cpp/src/utils/paged_attention_transformations.cpp
+++ b/src/cpp/src/utils/paged_attention_transformations.cpp
@@ -31,37 +31,35 @@ void apply_paged_attention_transformations(std::shared_ptr<ov::Model> model, boo
 }
 
 void set_kv_cache_type_and_shape(std::shared_ptr<ov::Model> model, DeviceConfig& device_config) {
-    const ov::ParameterVector& parameters = model->get_parameters();
     std::map<std::string, std::shared_ptr<ov::op::v0::Parameter>> key_cache_params, value_cache_params;
-    for (const auto& param_ptr : parameters) {
+    for (const auto& param_ptr : model->get_parameters()) {
         const auto& name = param_ptr->get_friendly_name();
         if (name.find("key_cache.") == 0) {
             key_cache_params[name] = param_ptr;
-        }
-        else if (name.find("value_cache.") == 0) {
+        } else if (name.find("value_cache.") == 0) {
             value_cache_params[name] = param_ptr;
         }
     }
 
-    OPENVINO_ASSERT(key_cache_params.size() > 0);
-    OPENVINO_ASSERT(key_cache_params.size() == value_cache_params.size());
+    OPENVINO_ASSERT(key_cache_params.size() == value_cache_params.size() && key_cache_params.size() > 0);
 
-    size_t num_layers = key_cache_params.size();
-    // extract num_kv_heads and head_size
-    std::string key_cache_param_name = "key_cache.0";
-    OPENVINO_ASSERT(key_cache_params.count(key_cache_param_name) != 0, "key_cache.0 tensor not found among model parameters");
-    ov::PartialShape k_shape = key_cache_params[key_cache_param_name]->get_partial_shape();
-    OPENVINO_ASSERT(k_shape.rank().get_length() == 3, "KV cache shape is expected to have rank 3, while shape is ", k_shape);
-    size_t head_size = k_shape[2].get_length();
-    std::vector<size_t> num_kv_heads(num_layers);
-    for (size_t idx = 0; idx < num_layers; idx++) {
-        size_t num_heads = key_cache_params[std::string("key_cache.") + std::to_string(idx)]->get_partial_shape()[1].get_length();
-        num_kv_heads[idx] = num_heads;
+    size_t num_decoder_layers = key_cache_params.size();
+    std::vector<KVHeadConfig> kv_heads_config(num_decoder_layers);
+
+    for (size_t idx = 0; idx < num_decoder_layers; idx++) {
+        KVHeadConfig& config = kv_heads_config[idx];
+
+        auto key_shape = key_cache_params[std::string("key_cache.") + std::to_string(idx)]->get_partial_shape();
+        config.num_k_heads = key_shape[1].get_length();
+        config.k_head_size = key_shape[2].get_length();
+
+        auto value_shape = value_cache_params[std::string("value_cache.") + std::to_string(idx)]->get_partial_shape();
+        config.num_v_heads = value_shape[1].get_length();
+        config.v_head_size = value_shape[2].get_length();
     }
-    device_config.set_model_params(num_kv_heads, head_size, num_layers);
+    device_config.set_kv_head_configs(kv_heads_config);
 
-    for (size_t idx = 0; idx < num_layers; idx++) {
+    for (size_t idx = 0; idx < num_decoder_layers; idx++) {
         auto k = key_cache_params[std::string("key_cache.") + std::to_string(idx)];
         auto v = value_cache_params[std::string("value_cache.") + std::to_string(idx)];
         k->set_element_type(device_config.get_cache_precision());
@@ -80,4 +78,4 @@ void apply_paged_attention_transformations(std::shared_ptr<ov::Model> model, Dev
 } // namespace utils
 } // namespace genai
-} // namespace ov
\ No newline at end of file
+} // namespace ov
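For context on the contract consumed above: after the PagedAttention transformation, the model exposes rank-3 cache inputs named key_cache.<idx> and value_cache.<idx>, with the head count at dimension 1 and the head size at dimension 2. A minimal hand-built sketch of one such parameter (toy values, constructed directly rather than by the transformation):

    #include <openvino/op/parameter.hpp>

    auto key_param = std::make_shared<ov::op::v0::Parameter>(
        ov::element::f16,
        ov::PartialShape{ov::Dimension::dynamic(), /*num_k_heads*/ 12, /*k_head_size*/ 64});
    key_param->set_friendly_name("key_cache.0");  // layer index is encoded in the name
    // The extraction loop above would record num_k_heads = 12 and
    // k_head_size = 64 into kv_heads_config[0].
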
diff --git a/tests/cpp/cache_manager.cpp b/tests/cpp/cache_manager.cpp
index 7d855ded12..0c483f0ec1 100644
--- a/tests/cpp/cache_manager.cpp
+++ b/tests/cpp/cache_manager.cpp
@@ -56,9 +56,9 @@ TEST(TestCacheManager, test_cache_size_param) {
     const std::string device = "CPU";
     ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU");
-    size_t num_decoder_layers = 12;
-    std::vector<size_t> num_kv_heads(12, 12);
-    device_config.set_model_params(num_kv_heads, 64, num_decoder_layers);
+    const size_t num_decoder_layers = 12;
+    const std::vector<KVHeadConfig> kv_heads_config(num_decoder_layers, KVHeadConfig { 12, 12, 64, 64 });
+    device_config.set_kv_head_configs(kv_heads_config);
 
     ov::InferRequest request = core.compile_model(get_dummy_model(core, num_decoder_layers)).create_infer_request();
     auto cache_manager = std::make_shared<CacheManager>(device_config, request, core);
@@ -79,9 +79,9 @@ TEST(TestCacheManager, test_kv_blocks_param) {
     const std::string device = "CPU";
     ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU");
-    size_t num_decoder_layers = 12;
-    std::vector<size_t> num_kv_heads(12, 12);
-    device_config.set_model_params(num_kv_heads, 64, num_decoder_layers);
+    const size_t num_decoder_layers = 12;
+    const std::vector<KVHeadConfig> kv_heads_config(num_decoder_layers, KVHeadConfig { 12, 12, 64, 64 });
+    device_config.set_kv_head_configs(kv_heads_config);
 
     ov::InferRequest request = core.compile_model(get_dummy_model(core, num_decoder_layers)).create_infer_request();
     auto cache_manager = std::make_shared<CacheManager>(device_config, request, core);
@@ -100,15 +100,16 @@ TEST(TestCacheManager, test_dynamic_cache_increase) {
     const std::string device = "CPU";
     ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU");
-    size_t num_decoder_layers = 12;
-    size_t head_size = 64;
-    std::vector<size_t> num_kv_heads(12, 12);
-    device_config.set_model_params(num_kv_heads, head_size, num_decoder_layers);
+    const size_t num_decoder_layers = 12;
+    const std::vector<KVHeadConfig> kv_heads_config(num_decoder_layers, KVHeadConfig { 12, 12, 64, 64 });
+    device_config.set_kv_head_configs(kv_heads_config);
+
     size_t block_size_in_bytes = 0;
     for (size_t layer_id = 0; layer_id < num_decoder_layers; layer_id++) {
-        block_size_in_bytes += 2 * num_kv_heads[layer_id] * device_config.get_block_size() * head_size * device_config.get_cache_precision().size();
+        KVHeadConfig config = kv_heads_config[layer_id];
+        block_size_in_bytes += config.k_head_size * config.num_k_heads + config.v_head_size * config.num_v_heads;
     }
-
+    block_size_in_bytes *= device_config.get_block_size() * device_config.get_cache_precision().size();
 
     ov::InferRequest request = core.compile_model(get_dummy_model(core, num_decoder_layers)).create_infer_request();
     auto cache_manager = std::make_shared<CacheManager>(device_config, request, core);
diff --git a/tests/cpp/device_config.cpp b/tests/cpp/device_config.cpp
index 93e06f02e7..a97037b1e8 100644
--- a/tests/cpp/device_config.cpp
+++ b/tests/cpp/device_config.cpp
@@ -18,13 +18,15 @@ TEST(TestDeviceConfig, kv_cache_precision_u8) {
     const std::string device = "CPU";
     size_t num_decoder_layers = 12;
     size_t head_size = 64, head_size_u8 = head_size + 8;
-    std::vector<size_t> num_kv_heads(12, 12);
 
-    ov::genai::DeviceConfig device_config_default(core, scheduler_config, "CPU");
-    device_config_default.set_model_params(num_kv_heads, head_size_u8, num_decoder_layers);
+    ov::genai::KVHeadConfig kv_head_config { 12, 12, head_size_u8, head_size_u8 };
+    ov::genai::KVHeadConfig kv_head_config_u8 { 12, 12, head_size, head_size };
 
+    ov::genai::DeviceConfig device_config_default(core, scheduler_config, "CPU");
     ov::genai::DeviceConfig device_config_u8(core, scheduler_config, "CPU", { ov::hint::kv_cache_precision(ov::element::u8) });
-    device_config_u8.set_model_params(num_kv_heads, head_size, num_decoder_layers);
+
+    device_config_default.set_kv_head_configs(std::vector<ov::genai::KVHeadConfig>(num_decoder_layers, kv_head_config));
+    device_config_u8.set_kv_head_configs(std::vector<ov::genai::KVHeadConfig>(num_decoder_layers, kv_head_config_u8));
 
     const auto ratio = ov::element::f16.size() / ov::element::u8.size();
     ASSERT_EQ(device_config_default.get_num_kv_blocks() * ratio, device_config_u8.get_num_kv_blocks());
diff --git a/tests/cpp/scheduler.cpp b/tests/cpp/scheduler.cpp
index ecd53fa665..201318347a 100644
--- a/tests/cpp/scheduler.cpp
+++ b/tests/cpp/scheduler.cpp
@@ -47,9 +47,9 @@ std::shared_ptr<CacheManager> init_cache_manager(SchedulerConfig scheduler_confi
     size_t num_decoder_layers = 12;
     ov::InferRequest request = core.compile_model(get_model(core, num_decoder_layers)).create_infer_request();
     size_t head_size = 64, head_size_u8 = head_size + 8;
-    std::vector<size_t> num_kv_heads(12, 12);
+    std::vector<KVHeadConfig> kv_head_configs(num_decoder_layers, KVHeadConfig { 12, 12, head_size_u8, head_size_u8 });
     ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU");
-    device_config.set_model_params(num_kv_heads, head_size_u8, num_decoder_layers);
+    device_config.set_kv_head_configs(kv_head_configs);
     return std::make_shared<CacheManager>(device_config, request, core);
 }
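The updated tests keep every layer identical; below is a sketch of a test the new API additionally admits, with heterogeneous layers, mirroring the byte-count arithmetic of TestCacheManager. This test is not part of the patch, and it assumes get_block_size_in_bytes() is publicly callable:

    TEST(TestDeviceConfig, heterogeneous_kv_head_configs) {
        ov::Core core;
        ov::genai::SchedulerConfig scheduler_config;
        scheduler_config.num_kv_blocks = 8;
        ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU");

        // Two layers with different head counts and head sizes.
        const std::vector<ov::genai::KVHeadConfig> configs = {
            ov::genai::KVHeadConfig { 12, 12, 64, 64 },
            ov::genai::KVHeadConfig { 4, 4, 128, 128 }
        };
        device_config.set_kv_head_configs(configs);

        // Recompute the expected block size the same way the patch's tests do.
        size_t expected = 0;
        for (const auto& config : configs)
            expected += config.k_head_size * config.num_k_heads + config.v_head_size * config.num_v_heads;
        expected *= device_config.get_block_size() * device_config.get_cache_precision().size();
        ASSERT_EQ(device_config.get_block_size_in_bytes(), expected);
    }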