Skip to content

Commit

Permalink
[GPU] Changed default value for kv cache precision (#28502)
Browse files Browse the repository at this point in the history
### Details:
- Set default precision of kv cache as fp16 to match inference precision
- Update the logic that handles multiple apply_user_properties() calls on the
same object so that everything is skipped once the config is already finalized.
- For platforms with XMX support, set `kv_cache_precision` to
`inference_precision` so that a property query returns the correct result

---------

Signed-off-by: Vladimir Paramuzov <[email protected]>
  • Loading branch information
vladimir-paramuzov authored Jan 23, 2025
1 parent 900d7d3 commit e6a98a2
Show file tree
Hide file tree
Showing 6 changed files with 28 additions and 25 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,6 @@ class ExecutionConfig {
void apply_performance_hints(const cldnn::device_info& info);
void apply_priority_hints(const cldnn::device_info& info);
void apply_debug_options(const cldnn::device_info& info);
void update_specific_default_properties(const cldnn::device_info& info);

template <typename T, PropertyMutability mutability>
void apply_rt_info_property(const ov::Property<T, mutability>& property, const ov::RTMap& rt_info) {
Expand All @@ -169,7 +168,7 @@ class ExecutionConfig {
std::map<std::string, PropertyVisibility> supported_properties;
std::map<std::string, BaseValidator::Ptr> property_validators;

bool specific_default_properties_is_set = false;
bool finalized = false;
};

} // namespace intel_gpu
Expand Down
1 change: 1 addition & 0 deletions src/plugins/intel_gpu/src/graph/program.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -530,6 +530,7 @@ void program::init_graph() {
node->get_output_layouts();
if (node->is_type<lstm_seq>()) {
_config.set_property(ov::intel_gpu::use_onednn(true));
_config.set_property(ov::intel_gpu::queue_type(QueueTypes::in_order));
}
}
// Perform initial shape_of subgraphs markup
Expand Down
4 changes: 3 additions & 1 deletion src/plugins/intel_gpu/src/plugin/plugin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -240,10 +240,12 @@ std::shared_ptr<ov::ICompiledModel> Plugin::compile_model(const std::shared_ptr<
auto context_impl = get_context_impl(context);
auto device_id = ov::DeviceIDParser{context_impl->get_device_name()}.get_device_id();

OPENVINO_ASSERT(m_configs_map.find(device_id) != m_configs_map.end(), "[GPU] LoadExeNetworkImpl: Couldn't find config for GPU with id ", device_id);
OPENVINO_ASSERT(m_configs_map.find(device_id) != m_configs_map.end(), "[GPU] compile_model: Couldn't find config for GPU with id ", device_id);

ExecutionConfig config = m_configs_map.at(device_id);
config.set_user_property(orig_config);
if (model->has_rt_info("runtime_options"))
config.apply_rt_info(context_impl->get_engine().get_device_info(), model->get_rt_info<ov::AnyMap>("runtime_options"), is_llm(model));
config.apply_user_properties(context_impl->get_engine().get_device_info());

set_cache_info(model, config);
Expand Down
41 changes: 21 additions & 20 deletions src/plugins/intel_gpu/src/runtime/execution_config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "intel_gpu/runtime/execution_config.hpp"
#include "intel_gpu/runtime/debug_configuration.hpp"
#include "openvino/runtime/internal_properties.hpp"
#include "openvino/runtime/properties.hpp"

#include <thread>

Expand Down Expand Up @@ -59,7 +60,7 @@ void ExecutionConfig::set_default() {
std::make_tuple(ov::cache_mode, ov::CacheMode::OPTIMIZE_SPEED),
std::make_tuple(ov::cache_encryption_callbacks, EncryptionCallbacks{}),
std::make_tuple(ov::hint::dynamic_quantization_group_size, 0),
std::make_tuple(ov::hint::kv_cache_precision, ov::element::undefined),
std::make_tuple(ov::hint::kv_cache_precision, ov::element::f16),
std::make_tuple(ov::intel_gpu::hint::enable_kernels_reuse, false),
std::make_tuple(ov::weights_path, ""),
std::make_tuple(ov::hint::activations_scale_factor, -1.f),
Expand Down Expand Up @@ -230,26 +231,9 @@ void ExecutionConfig::apply_hints(const cldnn::device_info& info) {
apply_debug_options(info);
}

// Applies device-dependent default values for the KV-cache precision and the
// dynamic quantization group size, based on info.supports_immad.
// The body is guarded by specific_default_properties_is_set, so only the first
// call on a given object changes anything; later calls return immediately.
void ExecutionConfig::update_specific_default_properties(const cldnn::device_info& info) {
// These default properties should be set once.
if (specific_default_properties_is_set)
return;
// Mark as done before touching any property so a repeated call is a no-op.
specific_default_properties_is_set = true;

// Enable KV-cache compression by default for non-systolic platforms MFDNN-11755
// Applied only while the property still reads ov::element::undefined and the
// device reports supports_immad == false; the precision then becomes i8.
if (get_property(ov::hint::kv_cache_precision) == ov::element::undefined && !info.supports_immad) {
set_property(ov::hint::kv_cache_precision(ov::element::i8));
}

// Enable dynamic quantization by default for non-systolic platforms
// A current value of 0 is treated as "not configured" and replaced with 32.
// NOTE(review): whether an explicit user-provided 0 can reach this check is not
// visible from this hunk - confirm against the callers.
if (get_property(ov::hint::dynamic_quantization_group_size) == 0 && !info.supports_immad) {
set_property(ov::hint::dynamic_quantization_group_size(32));
}
}

void ExecutionConfig::apply_user_properties(const cldnn::device_info& info) {
// Update specific default properties, call once before internal_properties updated.
update_specific_default_properties(info);
if (finalized)
return;

// Copy internal properties before applying hints to ensure that
// a property set by hint won't be overridden by a value in user config.
Expand Down Expand Up @@ -280,6 +264,23 @@ void ExecutionConfig::apply_user_properties(const cldnn::device_info& info) {
}
}

if (!is_set_by_user(ov::hint::kv_cache_precision) || get_property(ov::hint::kv_cache_precision) == ov::element::undefined) {
if (info.supports_immad) { // MFDNN-11755
set_property(ov::hint::kv_cache_precision(get_property(ov::hint::inference_precision)));
} else {
// Enable KV-cache compression by default for non-systolic platforms only
set_property(ov::hint::kv_cache_precision(ov::element::i8));
}
}

// Enable dynamic quantization by default for non-systolic platforms
if (!is_set_by_user(ov::hint::dynamic_quantization_group_size) &&
get_property(ov::hint::dynamic_quantization_group_size) == 0 && !info.supports_immad) {
set_property(ov::hint::dynamic_quantization_group_size(32));
}

finalized = true;

user_properties.clear();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -539,7 +539,7 @@ class KVCacheIssueTests: public ::testing::Test {
auto core = ov::test::utils::PluginCache::get().core();

ov::AnyMap properties = {
ov::hint::kv_cache_precision(ov::element::undefined)
ov::hint::kv_cache_precision(ov::element::f16)
};

const size_t n_batch = 1;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ class SDPAWithKVCacheTest : public ::testing::Test, public ::testing::WithParamI
if (p.compressed) {
properties.emplace(ov::hint::kv_cache_precision(ov::element::i8));
} else {
properties.emplace(ov::hint::kv_cache_precision(ov::element::undefined));
properties.emplace(ov::hint::kv_cache_precision(ov::element::f16));
}

const size_t n_heads = 16;
Expand Down

0 comments on commit e6a98a2

Please sign in to comment.