From cad055489ba4de164f1f11fb0ece014e793a4309 Mon Sep 17 00:00:00 2001
From: Alexandra Sidorova
Date: Thu, 23 Jan 2025 10:03:28 +0400
Subject: [PATCH] [Snippets][CPU] Added PagedAttentionExtension to check

---
 .../transformations/transformation_pipeline.cpp   | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
index ee954c018e6332..880cdd54c42812 100644
--- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
+++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
@@ -1034,17 +1034,10 @@ void Transformations::MainSnippets(void) {
 #if defined(OPENVINO_ARCH_X86_64)
     // Currently, Snippets don't provide efficient execution for single token inference in LLM case.
     // To avoid performance degradations, we disable MHA tokenization into Subgraphs in LLMs'.
-    // We consider the presence of `ScaledDotProductAttentionWithKVCache` op in the model as a sign that this model is
-    // LLM.
-    const auto is_LLM = [this]() {
-        // Note: the variable `ops` should not exist during `SnippetsTokenization` execution.
-        // Otherwise, it will extend the life time of ops (since they're stored as shared ptrs) and
-        // they will be visible in the model during the tokenization passes even after removing or replacing.
-        const auto ops = model->get_ops();
-        return std::any_of(ops.cbegin(), ops.cend(), [](const std::shared_ptr<ov::Node>& op) {
-            return ov::is_type<ScaledDotProductAttentionWithKVCache>(op);
-        });
-    }();
+    // We consider the presence of `ScaledDotProductAttentionWithKVCache` and `PagedAttentionExtension` ops
+    // in the model as a sign that this model is LLM.
+    const auto is_LLM = ov::op::util::has_op_with_type<ScaledDotProductAttentionWithKVCache>(model) ||
+                        ov::op::util::has_op_with_type<ov::op::PagedAttentionExtension>(model);
 
     // CPU Plugin Subgraph supports f32, bf16, quantized and fp16(on avx_512_core_amx_fp16 target) BRGEMM
     const auto is_infer_prc_supported_by_MHA =
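
Background on the replacement: `ov::op::util::has_op_with_type` comes from OpenVINO's common transformation utilities and performs the same model scan the deleted lambda did, but with the `get_ops()` snapshot scoped inside the helper, so the node shared pointers are released before the tokenization passes run. The following is a minimal sketch of that pattern, not the library's actual implementation; the name `has_op_with_type_sketch` is illustrative.

    #include <algorithm>
    #include <memory>

    #include "openvino/core/model.hpp"
    #include "openvino/core/type.hpp"

    // Sketch: scan the model once and report whether any op is of type OpType.
    // The ops snapshot returned by get_ops() lives only inside this function,
    // so no shared_ptr copies of the nodes outlive the call; the lifetime
    // hazard described in the removed lambda's comment is avoided by design.
    template <typename OpType>
    bool has_op_with_type_sketch(const std::shared_ptr<const ov::Model>& model) {
        const auto ops = model->get_ops();  // snapshot destroyed on return
        return std::any_of(ops.cbegin(), ops.cend(), [](const std::shared_ptr<ov::Node>& op) {
            return ov::is_type<OpType>(op);
        });
    }

    // Usage mirroring the new call site in the patch:
    //   const auto is_LLM = has_op_with_type_sketch<ov::op::PagedAttentionExtension>(model);

Centralizing the scan in a helper also keeps the lifetime caveat, documented at length in the removed lambda, in one place instead of restating it at each call site.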