Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Group Query Attention support with OV base OPs #28163

Draft
wants to merge 4 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "openvino/op/group_query_attention.hpp"
#include "openvino/pass/matcher_pass.hpp"
#include "transformations_visibility.hpp"

namespace ov {
namespace pass {

class TRANSFORMATIONS_API GroupQueryAttentionDecomposition;

} // namespace pass
} // namespace ov

/// \brief Matcher pass that decomposes a v15::GroupQueryAttention node into a
/// subgraph of base OpenVINO operations (registered in CommonOptimizations).
class ov::pass::GroupQueryAttentionDecomposition : public ov::pass::MatcherPass {
public:
OPENVINO_RTTI("GroupQueryAttentionDecomposition", "0");
GroupQueryAttentionDecomposition();
// Builds the replacement subgraph for the matched node and returns its outputs.
ov::OutputVector decompose(std::shared_ptr<ov::op::v15::GroupQueryAttention> node);
};
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@
#include "transformations/op_conversions/eye_decomposition.hpp"
#include "transformations/op_conversions/gelu7_downgrade.hpp"
#include "transformations/op_conversions/group_normalization_decomposition.hpp"
#include "transformations/op_conversions/group_query_attention_decomposition.hpp"
#include "transformations/op_conversions/hsigmoid_decomposition.hpp"
#include "transformations/op_conversions/hswish_decomposition.hpp"
#include "transformations/op_conversions/log_softmax_decomposition.hpp"
Expand Down Expand Up @@ -156,6 +157,7 @@ bool ov::pass::CommonOptimizations::run_on_model(const std::shared_ptr<ov::Model
REGISTER_DISABLED_PASS(manager, ConvertInterpolate1ToInterpolate4)

auto decomp = manager.register_pass<GraphRewrite>();
ADD_MATCHER(decomp, GroupQueryAttentionDecomposition)
ADD_MATCHER(decomp, ScaledDotProductAttentionDecomposition)
ADD_MATCHER(decomp, Gelu7Downgrade)
ADD_MATCHER(decomp, BidirectionalSequenceDecomposition)
Expand Down

Large diffs are not rendered by default.

54 changes: 54 additions & 0 deletions src/core/include/openvino/op/group_query_attention.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once

#include "openvino/op/op.hpp"

namespace ov {
namespace op {
namespace v15 {

// This is an experimental operation that is implemented in the plugins.
class OPENVINO_API GroupQueryAttention : public Op {
public:
    OPENVINO_OP("GroupQueryAttention", "opset15", op::Op);

    GroupQueryAttention() = default;
    /// \brief Constructs a GroupQueryAttention operation.
    /// \param args               Operation inputs (optional inputs are represented by Null nodes).
    /// \param num_heads          Number of query attention heads.
    /// \param kv_num_heads       Number of key/value attention heads.
    /// \param scale              Attention scale; 0 means "use the default" (see decomposition).
    /// \param do_rotary          Whether rotary position embedding is applied.
    /// \param rotary_interleaved Whether rotary embedding uses interleaved layout.
    GroupQueryAttention(const ov::OutputVector& args,
                        unsigned int num_heads,
                        unsigned int kv_num_heads,
                        float scale,
                        bool do_rotary,
                        bool rotary_interleaved);
    void validate_and_infer_types() override;
    bool visit_attributes(AttributeVisitor& visitor) override;
    std::shared_ptr<ov::Node> clone_with_new_inputs(const ov::OutputVector& new_args) const override;

    unsigned int get_num_heads() const {
        return m_num_heads;
    }
    unsigned int get_kv_num_heads() const {
        return m_kv_num_heads;
    }
    float get_scale() const {
        return m_scale;
    }
    bool get_do_rotary() const {
        return m_do_rotary;
    }
    bool get_rotary_interleaved() const {
        return m_rotary_interleaved;
    }

private:
    // In-class initializers ensure a default-constructed node (GroupQueryAttention() = default)
    // does not carry indeterminate head counts; previously these two were uninitialized.
    unsigned int m_num_heads = 0;
    unsigned int m_kv_num_heads = 0;
    float m_scale = 0;
    bool m_do_rotary = false;
    bool m_rotary_interleaved = false;
};

} // namespace v15
} // namespace op
} // namespace ov
47 changes: 47 additions & 0 deletions src/core/include/openvino/op/null.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
// Copyright (C) 2018-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "openvino/op/op.hpp"

namespace ov {
namespace op {
namespace v15 {

/// \brief Represents a missing optional input or output of an ONNX node
///
/// Some ONNX operators have inputs or outputs that are marked as optional,
/// which means that a referring node MAY forgo providing values for such inputs
/// or computing these outputs.
/// An empty string is used in place of a name of such input or output.
///
/// More:
/// https://github.com/onnx/onnx/blob/master/docs/IR.md#optional-inputs-and-outputs
class OPENVINO_API Null : public Op {
public:
    OPENVINO_OP("Null", "opset15", op::Op);
    Null() {
        // A Null still exposes one (typeless) output so it can be plugged into
        // the input vector of a consuming node.
        set_output_size(1);
    }

    /// \brief Returns true when \p node is a v15::Null placeholder.
    static bool is_null(const ov::Node* node) {
        return ov::as_type<const ov::op::v15::Null>(node) != nullptr;
    }

    static bool is_null(const std::shared_ptr<ov::Node>& node) {
        return is_null(node.get());
    }

    static bool is_null(const Output<ov::Node>& output) {
        return is_null(output.get_node());
    }

    // `override` already implies virtual; the redundant `virtual` keyword was dropped
    // (C++ Core Guidelines C.128). new_args is intentionally ignored: Null has no inputs.
    std::shared_ptr<Node> clone_with_new_inputs(const ov::OutputVector& new_args) const override {
        return std::make_shared<ov::op::v15::Null>();
    }
};
} // namespace v15
} // namespace op
} // namespace ov
2 changes: 2 additions & 0 deletions src/core/include/openvino/op/ops.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,8 @@
#include "openvino/op/roll.hpp"
#include "openvino/op/round.hpp"
#include "openvino/op/scaled_dot_product_attention.hpp"
#include "openvino/op/null.hpp"
#include "openvino/op/group_query_attention.hpp"
#include "openvino/op/scatter_elements_update.hpp"
#include "openvino/op/scatter_nd_update.hpp"
#include "openvino/op/scatter_update.hpp"
Expand Down
2 changes: 2 additions & 0 deletions src/core/include/openvino/opsets/opset15_tbl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -234,3 +234,5 @@ _OPENVINO_OP_REG(BitwiseLeftShift, ov::op::v15)
_OPENVINO_OP_REG(BitwiseRightShift, ov::op::v15)
_OPENVINO_OP_REG(SliceScatter, ov::op::v15)
_OPENVINO_OP_REG(SearchSorted, ov::op::v15)
_OPENVINO_OP_REG(GroupQueryAttention, ov::op::v15)
_OPENVINO_OP_REG(Null, ov::op::v15)
92 changes: 92 additions & 0 deletions src/core/src/op/group_query_attention.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "openvino/op/group_query_attention.hpp"

#include "itt.hpp"
#include "openvino/op/null.hpp"

using namespace std;
namespace ov {
namespace op {
namespace v15 {

// Constructor: stores the attention configuration, then runs shape/type
// validation on the provided inputs via constructor_validate_and_infer_types().
GroupQueryAttention::GroupQueryAttention(const OutputVector& args,
unsigned int num_heads,
unsigned int kv_num_heads,
float scale,
bool do_rotary,
bool rotary_interleaved)
: Op(args),
m_num_heads(num_heads),
m_kv_num_heads(kv_num_heads),
m_scale(scale),
m_do_rotary(do_rotary),
m_rotary_interleaved(rotary_interleaved) {
constructor_validate_and_infer_types();
}

// Derives the per-head size from the packed hidden dimension, which holds
// Q (num_heads) plus K and V (kv_num_heads each) projections.
// NOTE(review): assumes input_shape[2] is static — get_length() on a dynamic
// dimension would throw; confirm callers guarantee this.
int64_t get_head_size(const PartialShape& input_shape, int num_heads, int kv_num_heads) {
    const int64_t packed_hidden = input_shape[2].get_length();
    const int64_t total_heads = num_heads + 2 * kv_num_heads;
    return packed_hidden / total_heads;
}

std::vector<int64_t> get_qkv_sizes(const PartialShape& input_shape, int num_heads, int kv_num_heads) {
int64_t per_head_size = get_head_size(input_shape, num_heads, kv_num_heads);
const std::vector<int64_t> qkv_sizes = {num_heads * per_head_size,
kv_num_heads * per_head_size,
kv_num_heads * per_head_size};
return qkv_sizes;
}

// Infers output shapes/types:
//   output 0: attention result    {batch, seq_len, head_size * num_heads}
//   output 1: present key cache   {batch, kv_num_heads, past_kv_len + seq_len, head_size}
//   output 2: present value cache {batch, kv_num_heads, past_kv_len + seq_len, head_size}
// NOTE(review): assumes input 0 has rank >= 3 and input 3 (past key) rank >= 3 — confirm.
void GroupQueryAttention::validate_and_infer_types() {
OV_OP_SCOPE(v15_GroupQueryAttention_validate_and_infer_types);
PartialShape input_shape = get_input_partial_shape(0);
Dimension batch_size = input_shape[0];
Dimension sequence_len = input_shape[1];
Dimension head_size;
// When K and V inputs are Null, input 0 packs Q, K and V together; otherwise it holds Q only.
if (Null::is_null(input_value(1)) && Null::is_null(input_value(2))) {
head_size = get_head_size(input_shape, m_num_heads, m_kv_num_heads);
} else {
head_size = input_shape[2].get_length() / m_num_heads;
}
Dimension output_kv_len;
PartialShape kv_past_shape = get_input_partial_shape(3);
// FIXME: https://github.com/openvinotoolkit/openvino/pull/27648
if (kv_past_shape[2].is_static()) {
output_kv_len = kv_past_shape[2] + sequence_len;
} else {
output_kv_len = ov::Dimension();
}
auto element_type = get_input_element_type(0);
// Fixed typo in the user-facing error message ("suuports" -> "supports").
NODE_VALIDATION_CHECK(this,
element_type == element::f32 || element_type == element::f16,
"GroupQueryAttention only supports f32 and f16");
set_output_type(0, element_type, PartialShape{batch_size, sequence_len, head_size * m_num_heads});
set_output_type(1, element_type, PartialShape{batch_size, m_kv_num_heads, output_kv_len, head_size});
set_output_type(2, element_type, PartialShape{batch_size, m_kv_num_heads, output_kv_len, head_size});
}

// Exposes the operation's attributes to the visitor (used for IR
// serialization/deserialization). Attributes are visited by name.
bool GroupQueryAttention::visit_attributes(AttributeVisitor& visitor) {
OV_OP_SCOPE(v15_GroupQueryAttention_visit_attributes);
visitor.on_attribute("do_rotary", m_do_rotary);
visitor.on_attribute("kv_num_heads", m_kv_num_heads);
visitor.on_attribute("num_heads", m_num_heads);
visitor.on_attribute("rotary_interleaved", m_rotary_interleaved);
visitor.on_attribute("scale", m_scale);
return true;
}

// Creates a copy of this node wired to new_args, preserving all attributes.
std::shared_ptr<ov::Node> GroupQueryAttention::clone_with_new_inputs(const ov::OutputVector& new_args) const {
OV_OP_SCOPE(v15_GroupQueryAttention_clone_with_new_inputs);
// Validate the argument count before cloning, as OpenVINO op clones conventionally do.
check_new_args_count(this, new_args);
return std::make_shared<GroupQueryAttention>(new_args,
m_num_heads,
m_kv_num_heads,
m_scale,
m_do_rotary,
m_rotary_interleaved);
}

} // namespace v15
} // namespace op
} // namespace ov
2 changes: 1 addition & 1 deletion src/core/tests/opset.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ INSTANTIATE_TEST_SUITE_P(opset,
OpsetTestParams{ov::get_opset12, 178},
OpsetTestParams{ov::get_opset13, 186},
OpsetTestParams{ov::get_opset14, 188},
OpsetTestParams{ov::get_opset15, 199},
OpsetTestParams{ov::get_opset15, 201},
OpsetTestParams{ov::get_opset16, 4}),
OpsetTestNameGenerator{});

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "openvino/op/group_query_attention.hpp"
#include "openvino/op/null.hpp"

#include "core/null_node.hpp"
#include "core/operator_set.hpp"
#include "openvino/frontend/exception.hpp"

using namespace ov::op;
using ov::Shape;

namespace ov {
namespace frontend {
namespace onnx {
namespace com_microsoft {

namespace opset_1 {
// Translates the com.microsoft.GroupQueryAttention ONNX contrib op into the
// v15::GroupQueryAttention OpenVINO op, replacing absent optional inputs with
// v15::Null placeholders.
ov::OutputVector group_query_attention(const ov::frontend::onnx::Node& node) {
const auto onnx_op_inputs = node.get_ov_inputs();
const auto num_heads = node.get_attribute_value<int64_t>("num_heads");
const auto kv_num_heads = node.get_attribute_value<int64_t>("kv_num_heads");
const auto scale = node.get_attribute_value<float>("scale", 0.0f);
const auto do_rotary = node.get_attribute_value<int64_t>("do_rotary", 0);
// "rotary_interleaved" is an int attribute in the contrib-op spec (like "do_rotary");
// it was previously read as float, which mismatches the attribute type.
const auto rotary_interleaved = node.get_attribute_value<int64_t>("rotary_interleaved", 0);

OutputVector ov_op_inputs;
ov_op_inputs.reserve(onnx_op_inputs.size());
for (const auto& input : onnx_op_inputs) {
ov_op_inputs.push_back(ov::op::util::is_null(input) ? std::make_shared<v15::Null>() : input);
}
// Explicit conversions document the attribute -> constructor-parameter narrowing.
return std::make_shared<v15::GroupQueryAttention>(ov_op_inputs,
static_cast<unsigned int>(num_heads),
static_cast<unsigned int>(kv_num_heads),
scale,
do_rotary != 0,
rotary_interleaved != 0)
->outputs();
}

ONNX_OP("GroupQueryAttention", OPSET_SINCE(1), com_microsoft::opset_1::group_query_attention, MICROSOFT_DOMAIN);

} // namespace opset_1
} // namespace com_microsoft
} // namespace onnx
} // namespace frontend
} // namespace ov
Loading
Loading