Commit

Merge branch 'master' into feature/e2e-package-electron-test
vishniakov-nikolai committed Jul 22, 2024
2 parents f15d3d4 + ffc135c commit 9a7af63
Showing 32 changed files with 510 additions and 158 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/linux.yml
@@ -687,7 +687,7 @@ jobs:
Overall_Status:
name: ci/gha_overall_status
    needs: [Smart_CI, Build, Debian_Packages, Samples, Conformance, ONNX_Runtime, CXX_Unit_Tests, Python_Unit_Tests, TensorFlow_Layer_Tests,
-           CPU_Functional_Tests, TensorFlow_Models_Tests_Precommit, PyTorch_Models_Tests, NVIDIA_Plugin, Openvino_tokenizers, iGPU]
+           CPU_Functional_Tests, TensorFlow_Models_Tests_Precommit, PyTorch_Models_Tests, NVIDIA_Plugin, Openvino_tokenizers]
if: ${{ always() }}
runs-on: ubuntu-latest
steps:

@@ -20,7 +20,7 @@ bool has_valid_pattern(const ov::Output<ov::Node>& node_out) {
const auto const_node = std::dynamic_pointer_cast<ov::op::v0::Constant>(node_out.get_node_shared_ptr());
if (!const_node) {
// Lower bound of the value
-        auto lb = ov::evaluate_lower_bound(node_out);
+        auto lb = ov::util::evaluate_lower_bound(node_out);
if (!lb)
return false;
const auto lb_const_node =
@@ -36,7 +36,7 @@ bool has_valid_pattern(const ov::Output<ov::Node>& node_out) {
return true;

// Upper bound of the value
-    auto ub = ov::evaluate_upper_bound(node_out);
+    auto ub = ov::util::evaluate_upper_bound(node_out);
if (!ub)
return false;


@@ -164,7 +164,7 @@ pass::AbsSinking::AbsSinking() {
graph_got_changed = true;
}
for (const auto& abs : abs_ops) {
-        auto bounds = ov::evaluate_both_bounds(abs->input_value(0));
+        auto bounds = ov::util::evaluate_both_bounds(abs->input_value(0));
if (ov::util::reduce_and(ov::util::greater_equal(bounds.first, 0))) {
replace_output_update_name(abs->output(0), abs->input_value(0));
graph_got_changed = true;
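
For intuition, a minimal sketch of the property this pass relies on (editorial, not part of the diff; includes omitted): Abs is a no-op whenever the proven lower bound of its input is non-negative, as with ShapeOf results.

    // Illustrative C++ against the OpenVINO core API.
    auto param = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{{1, 4}, 3});
    auto shape_of = std::make_shared<ov::op::v3::ShapeOf>(param);  // shape values are always >= 0
    auto abs = std::make_shared<ov::op::v0::Abs>(shape_of);
    auto bounds = ov::util::evaluate_both_bounds(abs->input_value(0));
    // bounds.first is {1, 3}, everywhere >= 0, so AbsSinking may replace
    // abs->output(0) with abs->input_value(0).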

@@ -215,7 +215,7 @@ void optimize_value_usage(ov::Output<ov::Node>& output, STS_map& symbol_shape_so
get_alternative_source_from_value_or_shape_source(symbol_shape_source, symbol, output, symbol_value_source);

if (alternative_source.get_node_shared_ptr() != nullptr) {
-        evaluate_both_bounds(alternative_source);
+        ov::util::evaluate_both_bounds(alternative_source);
output.replace(alternative_source);
} else {
// in case we can not optimize it -- it is symbol which appeared just now on the value path
3 changes: 3 additions & 0 deletions src/core/dev_api/openvino/core/bound_evaluation_util.hpp
@@ -14,6 +14,8 @@ namespace ov {
/// \return True if bounds can be propagated for output and order vector has valid data, otherwise false.
OPENVINO_API bool could_propagate(const Output<Node>& output, std::vector<Node*>& order);

+namespace util {
+
/// \brief Evaluates lower value estimation of the output tensor. Traverses graph up to deduce
/// estimation through it.
/// \param output Node output pointing to the tensor for estimation.
@@ -31,4 +31,5 @@ OPENVINO_API Tensor evaluate_upper_bound(const Output<Node>& output);
/// \param output Node output pointing to the tensor for estimation.
/// \return pair with Tensors for lower and upper value estimation.
OPENVINO_API std::pair<Tensor, Tensor> evaluate_both_bounds(const Output<Node>& output);
+} // namespace util
} // namespace ov
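
A minimal usage sketch of the relocated helpers (editorial; the example graph is an illustrative assumption, not part of the diff):

    #include "openvino/core/bound_evaluation_util.hpp"
    #include "openvino/op/parameter.hpp"
    #include "openvino/op/shape_of.hpp"

    // A ShapeOf over a partially static input has deducible value bounds.
    auto data = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{{2, 8}, 16});
    auto shape = std::make_shared<ov::op::v3::ShapeOf>(data);
    // These helpers were previously spelled ov::evaluate_*; they now live in
    // ov::util, while ov::could_propagate stays directly in ov.
    auto bounds = ov::util::evaluate_both_bounds(shape->output(0));
    ov::Tensor lower = bounds.first;   // holds {2, 16}
    ov::Tensor upper = bounds.second;  // holds {8, 16}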
2 changes: 1 addition & 1 deletion src/core/shape_inference/include/utils.hpp
@@ -366,7 +366,7 @@ ov::optional<TResult> get_input_bounds(const ov::Node* op, size_t port, const IT
out->reserve(lowers.size());
std::transform(lowers.cbegin(), lowers.cend(), lowers.cbegin(), std::back_inserter(*out), make_bound(et));
} else if (port < op->get_input_size()) {
-        auto bounds = ov::evaluate_both_bounds(op->get_input_source_output(port));
+        auto bounds = ov::util::evaluate_both_bounds(op->get_input_source_output(port));

if (bounds.first && bounds.second) {
const auto& et = bounds.first.get_element_type();
16 changes: 8 additions & 8 deletions src/core/src/bound_evaluate.cpp
@@ -299,15 +299,15 @@ bool ov::could_propagate(const Output<Node>& output, std::vector<Node*>& result)
return status;
}

-ov::Tensor ov::evaluate_lower_bound(const Output<Node>& output) {
+ov::Tensor ov::util::evaluate_lower_bound(const Output<Node>& output) {
return evaluate_bound(output, false);
}

-ov::Tensor ov::evaluate_upper_bound(const Output<Node>& output) {
+ov::Tensor ov::util::evaluate_upper_bound(const Output<Node>& output) {
return evaluate_bound(output, true);
}

-std::pair<ov::Tensor, ov::Tensor> ov::evaluate_both_bounds(const Output<Node>& output) {
+std::pair<ov::Tensor, ov::Tensor> ov::util::evaluate_both_bounds(const Output<Node>& output) {
const auto& output_tensor = output.get_tensor();
if (output_tensor.get_lower_value() && output_tensor.get_upper_value())
return {output_tensor.get_lower_value(), output_tensor.get_upper_value()};
@@ -381,10 +381,10 @@ bool ov::interval_bound_evaluator(const Node* node,
OPENVINO_ASSERT(node->get_input_size() == 2);

const auto num_of_outputs = node->get_output_size();
-    auto low_0 = ov::evaluate_lower_bound(node->get_input_source_output(0));
-    auto low_1 = ov::evaluate_lower_bound(node->get_input_source_output(1));
-    auto up_0 = ov::evaluate_upper_bound(node->get_input_source_output(0));
-    auto up_1 = ov::evaluate_upper_bound(node->get_input_source_output(1));
+    auto low_0 = ov::util::evaluate_lower_bound(node->get_input_source_output(0));
+    auto low_1 = ov::util::evaluate_lower_bound(node->get_input_source_output(1));
+    auto up_0 = ov::util::evaluate_upper_bound(node->get_input_source_output(0));
+    auto up_1 = ov::util::evaluate_upper_bound(node->get_input_source_output(1));
if (!low_0 || !low_1 || !up_0 || !up_1)
return false;

@@ -534,7 +534,7 @@ bool ov::has_and_set_equal_bounds(const Output<Node>& source) {
if (op::util::is_constant(source.get_node_shared_ptr()))
return true;

-    auto bounds = ov::evaluate_both_bounds(source);
+    auto bounds = ov::util::evaluate_both_bounds(source);
return are_same_tensor(bounds.first, bounds.second);
}

8 changes: 4 additions & 4 deletions src/core/src/op/divide.cpp
@@ -80,16 +80,16 @@ bool evaluate_bound(const Node* node, TensorVector& output_values, bool is_upper
OPENVINO_ASSERT(PartialShape::broadcast_merge_into(input_shape, input2.get_partial_shape(), node->get_autob()),
"Argument shapes in divide operation are inconsistent.");

-    const auto input1_low = evaluate_lower_bound(input1);
+    const auto input1_low = ov::util::evaluate_lower_bound(input1);
if (!input1_low)
return false;
-    const auto input1_up = evaluate_upper_bound(input1);
+    const auto input1_up = ov::util::evaluate_upper_bound(input1);
if (!input1_up)
return false;
-    const auto input2_low = evaluate_lower_bound(input2);
+    const auto input2_low = ov::util::evaluate_lower_bound(input2);
if (!input2_low)
return false;
-    const auto input2_up = evaluate_upper_bound(input2);
+    const auto input2_up = ov::util::evaluate_upper_bound(input2);
if (!input2_up)
return false;


4 changes: 2 additions & 2 deletions src/core/src/op/mod.cpp
@@ -78,8 +78,8 @@ namespace {
* @return Vector with inputs bounds tensors.
*/
TensorVector get_bounds(const Node* const op) {
-    auto&& v_bounds = ov::evaluate_both_bounds(op->input_value(0));
-    auto&& m_bounds = ov::evaluate_both_bounds(op->input_value(1));
+    auto&& v_bounds = ov::util::evaluate_both_bounds(op->input_value(0));
+    auto&& m_bounds = ov::util::evaluate_both_bounds(op->input_value(1));
return {std::move(v_bounds.first),
std::move(v_bounds.second),
std::move(m_bounds.first),

4 changes: 2 additions & 2 deletions src/core/tests/bound_evaluate.cpp
@@ -42,15 +42,15 @@ TEST_F(EvaluateBoundTest, no_exception_when_node_has_output_with_dynamic_rank) {
fn_op->set_output_type(1, element::i32, PartialShape{{1, 4}});
fn_op->validate_and_infer_types();

-    EXPECT_NO_THROW(evaluate_both_bounds(fn_op));
+    EXPECT_NO_THROW(ov::util::evaluate_both_bounds(fn_op));
}

TEST_F(EvaluateBoundTest, no_exception_when_node_has_output_with_dynamic_element_type) {
fn_op->set_output_type(0, element::dynamic, PartialShape{4});
fn_op->set_output_type(1, element::dynamic, PartialShape{4});
fn_op->validate_and_infer_types();

-    EXPECT_NO_THROW(evaluate_both_bounds(fn_op));
+    EXPECT_NO_THROW(ov::util::evaluate_both_bounds(fn_op));
}

using BoundEvaluatorTest = ::testing::Test;

@@ -116,25 +116,13 @@ KERNEL(quantize_input)(


#if !REALIGN_FP16_OFFSET
-#   if OUTPUT_3D
-#       define MAIN_LOOP_ELEMENTS_COUNT INPUT0_SIZE_Y
-#   else
-#       define MAIN_LOOP_ELEMENTS_COUNT INPUT0_ELEMENTS_COUNT
-#   endif
+#define MAIN_LOOP_ELEMENTS_COUNT IFM_SIZE
#else
-// For REALIGN_FP16_OFFSET one feature is processed separately before entering main loop to correct alignment.
-#   if OUTPUT_3D
-#       define MAIN_LOOP_ELEMENTS_COUNT (INPUT0_SIZE_Y - 1)
-#   else
-#       define MAIN_LOOP_ELEMENTS_COUNT (INPUT0_ELEMENTS_COUNT - 1)
-#   endif
+// For REALIGN_FP16_OFFSET one feature is processed separately before entering main loop to correct alignment.
+#define MAIN_LOOP_ELEMENTS_COUNT (IFM_SIZE - 1)
#endif

-#if OUTPUT_3D
-#   define INPUT_ELEMENTS_COUNT INPUT0_SIZE_Y
-#else
-#   define INPUT_ELEMENTS_COUNT INPUT0_ELEMENTS_COUNT
-#endif
+#define INPUT_ELEMENTS_COUNT IFM_SIZE

#if IS_DYNAMIC && COMPRESSED_WEIGHTS_INT4
#pragma disable_includes_optimization
@@ -316,9 +304,7 @@ inline void FUNC(fc_bf_tiled_kernel_default)(
// NOTE: Manually unrolling multiplication loop leads to lower register pressure and allows for bigger block sizes,
// but significantly degrades readability and generality of code.
//       It also doesn't show a noticeable performance improvement on tested configurations.
-#if DECOMPRESSION_SCALE_POST_OP
-        ACCUMULATOR_VEC_TYPE acc_tmp[TILE_B] = { };
-#endif
+        ACCUMULATOR_VEC_TYPE acc_tmp[TILE_B] = { };

#if USE_SLM && COMPRESSED_WEIGHTS_INT4
#if TILE_OFM != 2
@@ -481,9 +467,9 @@ inline void FUNC(fc_bf_tiled_kernel_default)(
#endif
#else
#if TILE_OFM > 1
-                    ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += in_val * ((ACCUMULATOR_TYPE*)(&wei))[W_IDX];
+                    ((ACCUMULATOR_TYPE*)(&acc_tmp[bi]))[fi] += in_val * ((ACCUMULATOR_TYPE*)(&wei))[W_IDX];
#else
-                    acc[bi] += in_val * ((ACCUMULATOR_TYPE*)(&wei))[W_IDX];
+                    acc_tmp[bi] += in_val * ((ACCUMULATOR_TYPE*)(&wei))[W_IDX];
#endif
#endif
}
@@ -539,6 +525,18 @@ inline void FUNC(fc_bf_tiled_kernel_default)(
}
}
#endif

+#if !DECOMPRESSION_SCALE_POST_OP
+        unroll_for (uint bi = 0; bi < TILE_B; ++bi) {
+            unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) {
+#if TILE_OFM > 1
+                ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += ((ACCUMULATOR_TYPE*)(&acc_tmp[bi]))[fi];
+#else
+                acc[bi] += acc_tmp[bi];
+#endif
+            }
+        }
+#endif
}
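
Editorial note on this hunk: the inner tile now always accumulates into acc_tmp (see the acc -> acc_tmp change above); when DECOMPRESSION_SCALE_POST_OP is enabled, the pre-existing scale post-op path presumably folds the scaled partial sums into acc, and the added block performs the plain, unscaled merge for the non-post-op path.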
// =====================================================================================================================================
// Leftovers

@@ -15,14 +15,20 @@ static constexpr size_t min_slm_size = 256;
namespace kernel_selector {

static std::pair<size_t, size_t> get_input_bf_size(const fully_connected_params& params) {
-    size_t input_f = params.inputs[0].Feature().v;
-    size_t input_batch = params.inputs[0].Batch().v;
+    auto& input = params.inputs[0];
+    size_t input_f = input.Feature().v;
+    size_t input_batch = input.Batch().v;

// 3D input
if (params.outputs[0].GetLayout() == DataLayout::bfyx) {
-        input_f = params.inputs[0].Y().v;
-        input_batch = params.inputs[0].Batch().v * params.inputs[0].Feature().v;
+        input_f = input.Y().v;
+        input_batch = input.Batch().v * input.Feature().v;
}

+    // In some models input_f can be dynamic for input0; it refers to the IFM value of the weights.
+    if (input.is_dynamic() && input_f == 0 && params.weights.IFM().v != 0)
+        input_f = params.weights.IFM().v;

return {input_batch, input_f};
}
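
A concrete reading (editorial): for a 3D case with bfyx output and an input of B=1, F=2, Y=129, this returns {batch, f} = {1 * 2, 129} = {2, 129}; and when the input arrives shape-agnostic with Y still dynamic (0), the new fallback takes f from params.weights.IFM() instead, since the FC weights' IFM must equal the input feature size.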

@@ -153,8 +159,7 @@ bool FullyConnected_bf_tiled::Validate(const Params& params) const {

// Dynamic kernel doesn't support dynamic weights yet
if (fc_params.is_shape_agnostic && input.is_dynamic()) {
-        if ((output.GetLayout() == DataLayout::bfyx && input.Y().v == 0) ||
-            (output.GetLayout() == DataLayout::bf && input.Feature().v == 0))
+        if (get_input_bf_size(fc_params).second == 0)
return false;
}

@@ -509,6 +514,7 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para
jit.AddConstant(MakeJitConstant("DYNAMIC_QUANTIZE", 0));
}

jit.AddConstant(MakeJitConstant("IFM_SIZE", get_input_bf_size(params).second));
jit.AddConstant(MakeJitConstant("SIMD", simd));
jit.AddConstant(MakeJitConstant("TILE_B", dispatchData.tile_m));
jit.AddConstant(MakeJitConstant("HALF_TILE_B", dispatchData.tile_m/2));
@@ -539,16 +545,18 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para

// for 3d output we are treating spatial as features
if (params.outputs[0].GetLayout() == DataLayout::bfyx) {
+        auto tile_in_b_pitch = (params.inputs[0].Feature().pitch == 0) ? get_input_bf_size(params).second : params.inputs[0].Feature().pitch;
jit.AddConstant(MakeJitConstant("TILE_OUT_F_NUM", params.outputs[0].Y().v));
jit.AddConstant(MakeJitConstant("TILE_OUT_F_PITCH", params.outputs[0].Y().pitch));
jit.AddConstant(MakeJitConstant("TILE_IN_B_PITCH", params.inputs[0].Feature().pitch));
jit.AddConstant(MakeJitConstant("TILE_IN_B_PITCH", tile_in_b_pitch));
jit.AddConstant(MakeJitConstant("TILE_OUT_B_PITCH", params.outputs[0].Feature().pitch));
jit.AddConstant(MakeJitConstant("OUTPUT_3D", true));
jit.AddConstant(MakeJitConstant("BATCH_SIZE", "(OUTPUT_BATCH_NUM * OUTPUT_FEATURE_NUM)"));
} else {
+        auto tile_in_b_pitch = (params.inputs[0].Batch().pitch == 0) ? get_input_bf_size(params).second : params.inputs[0].Batch().pitch;
jit.AddConstant(MakeJitConstant("TILE_OUT_F_NUM", params.outputs[0].Feature().v));
jit.AddConstant(MakeJitConstant("TILE_OUT_F_PITCH", params.outputs[0].Feature().pitch));
jit.AddConstant(MakeJitConstant("TILE_IN_B_PITCH", params.inputs[0].Batch().pitch));
jit.AddConstant(MakeJitConstant("TILE_IN_B_PITCH", tile_in_b_pitch));
jit.AddConstant(MakeJitConstant("TILE_OUT_B_PITCH", params.outputs[0].Batch().pitch));
jit.AddConstant(MakeJitConstant("BATCH_SIZE", "(OUTPUT_BATCH_NUM)"));
}
@@ -614,6 +622,12 @@ void FullyConnected_bf_tiled::GetUpdateDispatchDataFunc(KernelData& kd) const {
kd.kernels[execute_kernel_idx].params.workGroups.local = dispatchData.lws;
kd.kernels[execute_kernel_idx].skip_execution = KernelData::SkipKernelExecution(prim_params);

auto& input = prim_params.inputs[0];
if (prim_params.outputs[0].GetLayout() == DataLayout::bfyx)
OPENVINO_ASSERT(input.X().pad.Total() == 0 && input.Y().pad.Total() == 0, "[GPU] Invalid padding in spatial axes observed in FC bf tiled.");
else
OPENVINO_ASSERT(input.Feature().pad.Total() == 0, "[GPU] Invalid padding in f axis observed in FC bf tiled.");

if (!kd.internalBufferSizes.empty()) {
// Pre-quantizing kernel was generated. Update the kernel and intermediate buffers or disable it.
if (execute_type == KernelType::DEFAULT) {
@@ -784,7 +798,8 @@ KernelsData FullyConnected_bf_tiled::GetMultiKernelsData(const Params &params,
{
auto& quan_kernel = kd.kernels[0];
DispatchData dyn_quan_dispatch = dispatchData;
-        dyn_quan_dispatch.gws = {std::max((fc_params.inputs[0].PhysicalSize() / quantize_grp_size), (size_t)1), 1, 1};
+        auto input_size = std::max(fc_params.inputs[0].PhysicalSize(), get_input_bf_size(fc_params).second);
+        dyn_quan_dispatch.gws = {input_size / quantize_grp_size, 1, 1};
dyn_quan_dispatch.lws = {16, 1, 1};
quan_kernel.params.workGroups.global = dyn_quan_dispatch.gws;
quan_kernel.params.workGroups.local = dyn_quan_dispatch.lws;
@@ -814,8 +829,8 @@ KernelsData FullyConnected_bf_tiled::GetMultiKernelsData(const Params &params,
quan_kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 0});
quan_kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0});
quan_kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1});
-        kd.internalBufferSizes.push_back(fc_params.inputs[0].PhysicalSize());
-        kd.internalBufferSizes.push_back(fc_params.inputs[0].PhysicalSize() / quantize_grp_size * 2);
+        kd.internalBufferSizes.push_back(input_size);
+        kd.internalBufferSizes.push_back(input_size / quantize_grp_size * 2);
kernel_number++;
}
kd.internalBufferDataType = Datatype::F16;
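
A worked example of the buffer sizing (editorial; assumes the first internal buffer holds quantized activations at one byte per value and the second holds one f16 scale per quantization group, consistent with internalBufferDataType = Datatype::F16): for input_size = 4096 and quantize_grp_size = 128 there are 4096 / 128 = 32 groups, so the two buffers are sized 4096 and 32 * 2 = 64 bytes respectively.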

@@ -266,9 +266,10 @@ const std::vector<ShapeRelatedParams> IS3D_smoke = {
},

{ov::test::static_shapes_to_test_representation({{1, 429}, {1, 429, 1}}), {true, true}},

{
{
-            {{-1, -1}, {{1, 129}, {2, 129}, {1, 129}, {2, 129}}},
+            {{-1, -1, -1}, {{1, 1, 129}, {1, 2, 129}, {1, 1, 129}, {1, 2, 129}}},
{{1, 129, 1}, {{1, 129, 1}, {1, 129, 1}, {1, 129, 1}, {1, 129, 1}}}
},
{true, true}