CB: support different num K/V heads and head_sizes per decoder layer
ilya-lavrenov committed Jan 21, 2025
1 parent fe6311d commit be10375
Showing 5 changed files with 81 additions and 74 deletions.
76 changes: 41 additions & 35 deletions src/cpp/src/device_config.hpp
@@ -10,22 +10,27 @@
 #include "openvino/genai/scheduler_config.hpp"
 
 namespace ov::genai {
+
+/**
+ * Per layer KV cache size configuration
+ */
+struct KVHeadConfig {
+    size_t num_v_heads, num_k_heads;
+    size_t v_head_size, k_head_size;
+};
+
 class DeviceConfig {
     ov::element::Type m_kv_cache_type;
     std::vector<ov::PartialShape> m_key_cache_shape, m_value_cache_shape;
-    std::vector<ov::Shape::value_type> m_num_kv_heads;
-    ov::Shape::value_type m_head_size, m_num_decoder_layers;
-    size_t m_num_kv_blocks = 0;
-    size_t m_block_size = 0;
-    size_t m_cache_size = 0;
+    std::vector<KVHeadConfig> m_kv_heads_config;
+    size_t m_num_decoder_layers = 0;
+    size_t m_num_kv_blocks = 0, m_cache_size = 0; // KV cache sizes in either blocks or GBs
+    size_t m_block_size = 0; // block size is per inference device
     std::string m_device;
 
     size_t get_block_size_by_device(const std::string& device) const {
-        const size_t cpu_block_size = 32;
-        const size_t gpu_block_size = 16;
-
-        bool is_gpu = device.find("GPU") != std::string::npos;
-
+        const size_t cpu_block_size = 32, gpu_block_size = 16;
+        const bool is_gpu = device.find("GPU") != std::string::npos;
         return is_gpu ? gpu_block_size : cpu_block_size;
     }
 
@@ -83,17 +88,14 @@ class DeviceConfig {
 
         if (scheduling_config.num_kv_blocks > 0) {
             m_num_kv_blocks = scheduling_config.num_kv_blocks;
-        }
-        else if (scheduling_config.cache_size > 0) {
+        } else if (scheduling_config.cache_size > 0) {
             m_cache_size = scheduling_config.cache_size;
         }
     }
 
-    void set_model_params(std::vector<size_t> num_kv_heads, size_t head_size, size_t num_decoder_layers) {
-        m_head_size = head_size;
-        m_num_decoder_layers = num_decoder_layers;
-
-        m_num_kv_heads.assign(num_kv_heads.begin(), num_kv_heads.end());
+    void set_kv_head_configs(std::vector<KVHeadConfig> kv_heads_config) {
+        m_kv_heads_config = kv_heads_config;
+        m_num_decoder_layers = m_kv_heads_config.size();
         m_key_cache_shape.reserve(m_num_decoder_layers);
        m_value_cache_shape.reserve(m_num_decoder_layers);
 
@@ -103,35 +105,37 @@
             // |scale(f32)|zeropoint(f32)|quantized data(u8,idx_1)|quantized data(u8,idx_2)|...|quantized data(u8,idx_head_size)|
             // so, we have to extend head_size by 8, which is sizeof(float)
             // for scale and sizeof(float) for zeropoint
-            if (m_kv_cache_type == ov::element::u8)
-                m_head_size += 8;
+            if (m_kv_cache_type == ov::element::u8) {
+                for (size_t layer_id = 0; layer_id < m_num_decoder_layers; ++layer_id) {
+                    m_kv_heads_config[layer_id].k_head_size += 8;
+                    m_kv_heads_config[layer_id].v_head_size += 8;
+                }
+            }
         }
 
         if (m_num_kv_blocks == 0 && m_cache_size > 0) {
-            size_t block_size = 0;
-            size_t size_in_bytes = m_cache_size * 1024 * 1024 * 1024;
-            for (size_t layer_id = 0; layer_id < m_num_decoder_layers; layer_id++) {
-                block_size += 2 * m_num_kv_heads[layer_id] * m_block_size * m_head_size * m_kv_cache_type.size();
-            }
-            m_num_kv_blocks = size_in_bytes / block_size;
+            size_t size_in_bytes = m_cache_size * 1024 * 1024 * 1024; // convert GBs to bytes
+            m_num_kv_blocks = size_in_bytes / get_block_size_in_bytes();
         }
 
         for (size_t layer_id = 0; layer_id < m_num_decoder_layers; layer_id++) {
+            const KVHeadConfig& config = m_kv_heads_config[layer_id];
+
             m_value_cache_shape.push_back(ov::PartialShape{ov::Dimension::dynamic(),
-                                                           ov::Dimension(m_num_kv_heads[layer_id]),
+                                                           ov::Dimension(config.num_v_heads),
                                                            ov::Dimension(m_block_size),
-                                                           ov::Dimension(m_head_size)});
+                                                           ov::Dimension(config.v_head_size)});
 
             if (m_device.find("GPU") == std::string::npos) {
                 m_key_cache_shape.push_back(ov::PartialShape{ov::Dimension::dynamic(),
-                                                             ov::Dimension(m_num_kv_heads[layer_id]),
+                                                             ov::Dimension(config.num_k_heads),
                                                              ov::Dimension(m_block_size),
-                                                             ov::Dimension(m_head_size)});
-            } else if (m_device.find("GPU") != std::string::npos) {
+                                                             ov::Dimension(config.k_head_size)});
+            } else if (m_device.find("GPU") != std::string::npos) {
                 // Update key shape, as the key's shape is different from the value's shape
                 m_key_cache_shape.push_back(ov::PartialShape{ov::Dimension::dynamic(),
-                                                             ov::Dimension(m_num_kv_heads[layer_id]),
-                                                             ov::Dimension(m_head_size),
+                                                             ov::Dimension(config.num_k_heads),
+                                                             ov::Dimension(config.k_head_size),
                                                              ov::Dimension(m_block_size)});
             }
         }
@@ -168,11 +172,13 @@
     }
 
     size_t get_block_size_in_bytes() const {
-        size_t block_size = 0;
+        size_t block_size_in_bytes = 0;
         for (size_t layer_id = 0; layer_id < m_num_decoder_layers; layer_id++) {
-            block_size += 2 * m_num_kv_heads[layer_id] * m_block_size * m_head_size * get_cache_precision().size();
+            const KVHeadConfig& config = m_kv_heads_config[layer_id];
+            block_size_in_bytes += config.k_head_size * config.num_k_heads + config.v_head_size * config.num_v_heads;
         }
-        return block_size;
+        block_size_in_bytes *= get_block_size() * get_cache_precision().size();
+        return block_size_in_bytes;
     }
 };
 }
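
To make the new per-layer size arithmetic concrete, here is a minimal standalone sketch (not part of the commit) that mirrors get_block_size_in_bytes() and the cache_size-to-block-count conversion above. The function name block_size_in_bytes and the main() driver are illustrative; the numbers are the ones used in the unit tests below: 12 identical layers, 12 K and V heads, head size 64, CPU block size 32, f16 (2-byte) cache precision.

#include <cstddef>
#include <vector>

// Same layout as the KVHeadConfig struct introduced in this commit.
struct KVHeadConfig {
    size_t num_v_heads, num_k_heads;
    size_t v_head_size, k_head_size;
};

// One block stores `block_size` tokens of both K and V cache for every layer;
// K and V now contribute independent head-count * head-size terms per layer.
size_t block_size_in_bytes(const std::vector<KVHeadConfig>& layers,
                           size_t block_size, size_t precision_size) {
    size_t elements_per_token = 0;
    for (const KVHeadConfig& config : layers)
        elements_per_token += config.k_head_size * config.num_k_heads +
                              config.v_head_size * config.num_v_heads;
    return elements_per_token * block_size * precision_size;
}

int main() {
    std::vector<KVHeadConfig> layers(12, KVHeadConfig{12, 12, 64, 64});
    size_t block_bytes = block_size_in_bytes(layers, 32, 2);
    // (64*12 + 64*12) elements * 12 layers * 32 tokens * 2 bytes = 1'179'648
    // bytes per block, so a 2 GB cache_size yields
    // 2 * 1024^3 / 1'179'648 = 1820 KV blocks.
    size_t num_kv_blocks = (size_t{2} * 1024 * 1024 * 1024) / block_bytes;
    return num_kv_blocks == 1820 ? 0 : 1;
}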
40 changes: 19 additions & 21 deletions src/cpp/src/utils/paged_attention_transformations.cpp
@@ -31,37 +31,35 @@ void apply_paged_attention_transformations(std::shared_ptr<ov::Model> model, boo
 }
 
 void set_kv_cache_type_and_shape(std::shared_ptr<ov::Model> model, DeviceConfig& device_config) {
-    const ov::ParameterVector& parameters = model->get_parameters();
-
     std::map<std::string, std::shared_ptr<ov::op::v0::Parameter>> key_cache_params, value_cache_params;
-    for (const auto& param_ptr : parameters) {
+    for (const auto& param_ptr : model->get_parameters()) {
         const auto& name = param_ptr->get_friendly_name();
         if (name.find("key_cache.") == 0) {
             key_cache_params[name] = param_ptr;
-        }
-        else if (name.find("value_cache.") == 0) {
+        } else if (name.find("value_cache.") == 0) {
             value_cache_params[name] = param_ptr;
         }
     }
 
-    OPENVINO_ASSERT(key_cache_params.size() > 0);
-    OPENVINO_ASSERT(key_cache_params.size() == value_cache_params.size());
+    OPENVINO_ASSERT(key_cache_params.size() == value_cache_params.size() && key_cache_params.size() > 0);
 
-    size_t num_layers = key_cache_params.size();
-    // extract num_kv_heads and head_size
-    std::string key_cache_param_name = "key_cache.0";
-    OPENVINO_ASSERT(key_cache_params.count(key_cache_param_name) != 0, "key_cache.0 tensor not found among model parameters");
-    ov::PartialShape k_shape = key_cache_params[key_cache_param_name]->get_partial_shape();
-    OPENVINO_ASSERT(k_shape.rank().get_length() == 3, "KV cache shape is expected to have rank 3, while shape is ", k_shape);
-    size_t head_size = k_shape[2].get_length();
-    std::vector<size_t> num_kv_heads(num_layers);
-    for (size_t idx = 0; idx < num_layers; idx++) {
-        size_t num_heads = key_cache_params[std::string("key_cache.") + std::to_string(idx)]->get_partial_shape()[1].get_length();
-        num_kv_heads[idx] = num_heads;
+    size_t num_decoder_layers = key_cache_params.size();
+    std::vector<KVHeadConfig> kv_heads_config(num_decoder_layers);
+
+    for (size_t idx = 0; idx < num_decoder_layers; idx++) {
+        KVHeadConfig& config = kv_heads_config[idx];
+
+        auto key_shape = key_cache_params[std::string("key_cache.") + std::to_string(idx)]->get_partial_shape();
+        config.num_k_heads = key_shape[1].get_length();
+        config.k_head_size = key_shape[2].get_length();
+
+        auto value_shape = value_cache_params[std::string("value_cache.") + std::to_string(idx)]->get_partial_shape();
+        config.num_v_heads = value_shape[1].get_length();
+        config.v_head_size = value_shape[2].get_length();
     }
-    device_config.set_model_params(num_kv_heads, head_size, num_layers);
+    device_config.set_kv_head_configs(kv_heads_config);
 
-    for (size_t idx = 0; idx < num_layers; idx++) {
+    for (size_t idx = 0; idx < num_decoder_layers; idx++) {
         auto k = key_cache_params[std::string("key_cache.") + std::to_string(idx)];
         auto v = value_cache_params[std::string("value_cache.") + std::to_string(idx)];
         k->set_element_type(device_config.get_cache_precision());
@@ -80,4 +78,4 @@ void apply_paged_attention_transformations(std::shared_ptr<ov::Model> model, Dev
 
 } // namespace utils
 } // namespace genai
-} // namespace ov
\ No newline at end of file
+} // namespace ov
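
Because each KVHeadConfig is now read from its own key_cache.<idx> / value_cache.<idx> parameter pair, layers no longer have to agree with each other, and the key cache no longer has to match the value cache in head count or head size. As a hypothetical illustration (the struct is copied from the commit, but the model values and helper function are invented), a caller can now describe a model like this:

#include <cstddef>
#include <vector>

// Same layout as the KVHeadConfig struct introduced in device_config.hpp.
struct KVHeadConfig {
    size_t num_v_heads, num_k_heads;
    size_t v_head_size, k_head_size;
};

// Hypothetical 12-layer model: layer 0 keeps full attention, later layers
// use grouped-query attention, and the key head size differs from the
// value head size on every layer.
std::vector<KVHeadConfig> make_heterogeneous_config() {
    std::vector<KVHeadConfig> kv_heads_config;
    kv_heads_config.push_back({/*num_v_heads=*/32, /*num_k_heads=*/32,
                               /*v_head_size=*/128, /*k_head_size=*/192});
    for (size_t layer_id = 1; layer_id < 12; ++layer_id)
        kv_heads_config.push_back({8, 8, 128, 192});
    return kv_heads_config; // pass to DeviceConfig::set_kv_head_configs()
}

The removed set_model_params(num_kv_heads, head_size, num_decoder_layers) could not express this, since one head_size was shared by K and V across all layers.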
25 changes: 13 additions & 12 deletions tests/cpp/cache_manager.cpp
@@ -56,9 +56,9 @@ TEST(TestCacheManager, test_cache_size_param) {
 
     const std::string device = "CPU";
     ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU");
-    size_t num_decoder_layers = 12;
-    std::vector<size_t> num_kv_heads(12, 12);
-    device_config.set_model_params(num_kv_heads, 64, num_decoder_layers);
+    const size_t num_decoder_layers = 12;
+    const std::vector<KVHeadConfig> kv_heads_config(num_decoder_layers, KVHeadConfig { 12, 12, 64, 64 });
+    device_config.set_kv_head_configs(kv_heads_config);
 
     ov::InferRequest request = core.compile_model(get_dummy_model(core, num_decoder_layers)).create_infer_request();
     auto cache_manager = std::make_shared<ov::genai::CacheManager>(device_config, request, core);
@@ -79,9 +79,9 @@ TEST(TestCacheManager, test_kv_blocks_param) {
 
     const std::string device = "CPU";
     ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU");
-    size_t num_decoder_layers = 12;
-    std::vector<size_t> num_kv_heads(12, 12);
-    device_config.set_model_params(num_kv_heads, 64, num_decoder_layers);
+    const size_t num_decoder_layers = 12;
+    const std::vector<KVHeadConfig> kv_heads_config(num_decoder_layers, KVHeadConfig { 12, 12, 64, 64 });
+    device_config.set_kv_head_configs(kv_heads_config);
 
     ov::InferRequest request = core.compile_model(get_dummy_model(core, num_decoder_layers)).create_infer_request();
     auto cache_manager = std::make_shared<ov::genai::CacheManager>(device_config, request, core);
@@ -100,15 +100,16 @@ TEST(TestCacheManager, test_dynamic_cache_increase) {
 
     const std::string device = "CPU";
     ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU");
-    size_t num_decoder_layers = 12;
-    size_t head_size = 64;
-    std::vector<size_t> num_kv_heads(12, 12);
-    device_config.set_model_params(num_kv_heads, head_size, num_decoder_layers);
+    const size_t num_decoder_layers = 12;
+    const std::vector<KVHeadConfig> kv_heads_config(num_decoder_layers, KVHeadConfig { 12, 12, 64, 64 });
+    device_config.set_kv_head_configs(kv_heads_config);
 
     size_t block_size_in_bytes = 0;
     for (size_t layer_id = 0; layer_id < num_decoder_layers; layer_id++) {
-        block_size_in_bytes += 2 * num_kv_heads[layer_id] * device_config.get_block_size() * head_size * device_config.get_cache_precision().size();
+        KVHeadConfig config = kv_heads_config[layer_id];
+        block_size_in_bytes += config.k_head_size * config.num_k_heads + config.v_head_size * config.num_v_heads;
     }
+    block_size_in_bytes *= device_config.get_block_size() * device_config.get_cache_precision().size();
 
     ov::InferRequest request = core.compile_model(get_dummy_model(core, num_decoder_layers)).create_infer_request();
     auto cache_manager = std::make_shared<ov::genai::CacheManager>(device_config, request, core);
10 changes: 6 additions & 4 deletions tests/cpp/device_config.cpp
@@ -18,13 +18,15 @@ TEST(TestDeviceConfig, kv_cache_precision_u8) {
     const std::string device = "CPU";
     size_t num_decoder_layers = 12;
     size_t head_size = 64, head_size_u8 = head_size + 8;
-    std::vector<size_t> num_kv_heads(12, 12);
-
-    ov::genai::DeviceConfig device_config_default(core, scheduler_config, "CPU");
-    device_config_default.set_model_params(num_kv_heads, head_size_u8, num_decoder_layers);
+    ov::genai::KVHeadConfig kv_head_config { 12, 12, head_size_u8, head_size_u8 };
+    ov::genai::KVHeadConfig kv_head_config_u8 { 12, 12, head_size, head_size };
 
+    ov::genai::DeviceConfig device_config_default(core, scheduler_config, "CPU");
     ov::genai::DeviceConfig device_config_u8(core, scheduler_config, "CPU", { ov::hint::kv_cache_precision(ov::element::u8) });
-    device_config_u8.set_model_params(num_kv_heads, head_size, num_decoder_layers);
+
+    device_config_default.set_kv_head_configs(std::vector<ov::genai::KVHeadConfig>(num_decoder_layers, kv_head_config));
+    device_config_u8.set_kv_head_configs(std::vector<ov::genai::KVHeadConfig>(num_decoder_layers, kv_head_config_u8));
 
     const auto ratio = ov::element::f16.size() / ov::element::u8.size();
     ASSERT_EQ(device_config_default.get_num_kv_blocks() * ratio, device_config_u8.get_num_kv_blocks());
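
The asymmetric head sizes in this test are deliberate: the u8 path extends every head by 8 bytes (f32 scale plus f32 zero point, see device_config.hpp above), so passing head_size_u8 = 72 to the f16 config up front equalizes the per-token element counts and leaves precision size as the only difference between the two configs. A small sketch of the arithmetic (values taken from the test; the sketch itself is not part of the commit):

#include <cassert>
#include <cstddef>

int main() {
    // f16 config: head size passed as 72 (64 + 8), 2 bytes per element.
    size_t f16_bytes_per_token_per_head = 72 * 2;      // 144
    // u8 config: head size passed as 64 and extended internally by 8
    // to hold the f32 scale and f32 zero point, 1 byte per element.
    size_t u8_bytes_per_token_per_head = (64 + 8) * 1; // 72
    // Block sizes differ exactly by the precision ratio, so a fixed
    // cache_size yields twice as many u8 blocks, which is what the
    // ASSERT_EQ above checks.
    assert(f16_bytes_per_token_per_head == 2 * u8_bytes_per_token_per_head);
    return 0;
}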
4 changes: 2 additions & 2 deletions tests/cpp/scheduler.cpp
@@ -47,9 +47,9 @@ std::shared_ptr<CacheManager> init_cache_manager(SchedulerConfig scheduler_confi
     size_t num_decoder_layers = 12;
     ov::InferRequest request = core.compile_model(get_model(core, num_decoder_layers)).create_infer_request();
     size_t head_size = 64, head_size_u8 = head_size + 8;
-    std::vector<size_t> num_kv_heads(12, 12);
+    std::vector<KVHeadConfig> kv_head_configs(num_decoder_layers, KVHeadConfig { 12, 12, head_size_u8, head_size_u8 });
     ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU");
-    device_config.set_model_params(num_kv_heads, head_size_u8, num_decoder_layers);
+    device_config.set_kv_head_configs(kv_head_configs);
     return std::make_shared<CacheManager>(device_config, request, core);
 }
 
