diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 4358af5707b077..e75cc2a1867e55 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -687,7 +687,7 @@ jobs: Overall_Status: name: ci/gha_overall_status needs: [Smart_CI, Build, Debian_Packages, Samples, Conformance, ONNX_Runtime, CXX_Unit_Tests, Python_Unit_Tests, TensorFlow_Layer_Tests, - CPU_Functional_Tests, TensorFlow_Models_Tests_Precommit, PyTorch_Models_Tests, NVIDIA_Plugin, Openvino_tokenizers, iGPU] + CPU_Functional_Tests, TensorFlow_Models_Tests_Precommit, PyTorch_Models_Tests, NVIDIA_Plugin, Openvino_tokenizers] if: ${{ always() }} runs-on: ubuntu-latest steps: diff --git a/src/common/transformations/src/transformations/common_optimizations/reshape_sequence_fusion.cpp b/src/common/transformations/src/transformations/common_optimizations/reshape_sequence_fusion.cpp index de616d8996f720..e51715a34a9cdf 100644 --- a/src/common/transformations/src/transformations/common_optimizations/reshape_sequence_fusion.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/reshape_sequence_fusion.cpp @@ -20,7 +20,7 @@ bool has_valid_pattern(const ov::Output& node_out) { const auto const_node = std::dynamic_pointer_cast(node_out.get_node_shared_ptr()); if (!const_node) { // Lower bound of the value - auto lb = ov::evaluate_lower_bound(node_out); + auto lb = ov::util::evaluate_lower_bound(node_out); if (!lb) return false; const auto lb_const_node = @@ -36,7 +36,7 @@ bool has_valid_pattern(const ov::Output& node_out) { return true; // Upper bound of the value - auto ub = ov::evaluate_upper_bound(node_out); + auto ub = ov::util::evaluate_upper_bound(node_out); if (!ub) return false; diff --git a/src/common/transformations/src/transformations/common_optimizations/simplify_shape_of_sub_graph.cpp b/src/common/transformations/src/transformations/common_optimizations/simplify_shape_of_sub_graph.cpp index 9089474dd42bd3..d4bb02227c56ac 100644 --- a/src/common/transformations/src/transformations/common_optimizations/simplify_shape_of_sub_graph.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/simplify_shape_of_sub_graph.cpp @@ -164,7 +164,7 @@ pass::AbsSinking::AbsSinking() { graph_got_changed = true; } for (const auto& abs : abs_ops) { - auto bounds = ov::evaluate_both_bounds(abs->input_value(0)); + auto bounds = ov::util::evaluate_both_bounds(abs->input_value(0)); if (ov::util::reduce_and(ov::util::greater_equal(bounds.first, 0))) { replace_output_update_name(abs->output(0), abs->input_value(0)); graph_got_changed = true; diff --git a/src/common/transformations/src/transformations/symbolic_transformations/symbol_optimization.cpp b/src/common/transformations/src/transformations/symbolic_transformations/symbol_optimization.cpp index 469150c02a857c..27790904f4360b 100644 --- a/src/common/transformations/src/transformations/symbolic_transformations/symbol_optimization.cpp +++ b/src/common/transformations/src/transformations/symbolic_transformations/symbol_optimization.cpp @@ -215,7 +215,7 @@ void optimize_value_usage(ov::Output& output, STS_map& symbol_shape_so get_alternative_source_from_value_or_shape_source(symbol_shape_source, symbol, output, symbol_value_source); if (alternative_source.get_node_shared_ptr() != nullptr) { - evaluate_both_bounds(alternative_source); + ov::util::evaluate_both_bounds(alternative_source); output.replace(alternative_source); } else { // in case we can not optimize it -- it is symbol which appeared just now on the value path 
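The hunks above (and the call-site updates that follow) relocate the bound-evaluation helpers evaluate_lower_bound, evaluate_upper_bound and evaluate_both_bounds from the ov namespace into ov::util. A minimal caller sketch against the relocated declarations; the helper name bounds_are_known is illustrative only and not part of the patch:

    #include "openvino/core/bound_evaluation_util.hpp"

    // Returns true when both the lower and the upper bound of `out` could be deduced.
    static bool bounds_are_known(const ov::Output<ov::Node>& out) {
        const auto bounds = ov::util::evaluate_both_bounds(out);  // now lives in ov::util
        // Empty tensors mean bound propagation failed for this output.
        return static_cast<bool>(bounds.first) && static_cast<bool>(bounds.second);
    }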
diff --git a/src/core/dev_api/openvino/core/bound_evaluation_util.hpp b/src/core/dev_api/openvino/core/bound_evaluation_util.hpp index 86c8e9fa81e694..936a2fe891e182 100644 --- a/src/core/dev_api/openvino/core/bound_evaluation_util.hpp +++ b/src/core/dev_api/openvino/core/bound_evaluation_util.hpp @@ -14,6 +14,8 @@ namespace ov { /// \return True if bounds can be propagated for output and order vector has valid data, otherwise false. OPENVINO_API bool could_propagate(const Output& output, std::vector& order); +namespace util { + /// \brief Evaluates lower value estimation of the output tensor. Traverses graph up to deduce /// estimation through it. /// \param Node output pointing to the tensor for estimation. @@ -31,4 +33,5 @@ OPENVINO_API Tensor evaluate_upper_bound(const Output& output); /// \param output Node output pointing to the tensor for estimation. /// \return pair with Tensors for lower and upper value estimation. OPENVINO_API std::pair evaluate_both_bounds(const Output& output); +} // namespace util } // namespace ov diff --git a/src/core/shape_inference/include/utils.hpp b/src/core/shape_inference/include/utils.hpp index 11702c8f4e1b42..9298498d00bdc5 100644 --- a/src/core/shape_inference/include/utils.hpp +++ b/src/core/shape_inference/include/utils.hpp @@ -366,7 +366,7 @@ ov::optional get_input_bounds(const ov::Node* op, size_t port, const IT out->reserve(lowers.size()); std::transform(lowers.cbegin(), lowers.cend(), lowers.cbegin(), std::back_inserter(*out), make_bound(et)); } else if (port < op->get_input_size()) { - auto bounds = ov::evaluate_both_bounds(op->get_input_source_output(port)); + auto bounds = ov::util::evaluate_both_bounds(op->get_input_source_output(port)); if (bounds.first && bounds.second) { const auto& et = bounds.first.get_element_type(); diff --git a/src/core/src/bound_evaluate.cpp b/src/core/src/bound_evaluate.cpp index d86b01c81165cb..55c31538e91b86 100644 --- a/src/core/src/bound_evaluate.cpp +++ b/src/core/src/bound_evaluate.cpp @@ -299,15 +299,15 @@ bool ov::could_propagate(const Output& output, std::vector& result) return status; } -ov::Tensor ov::evaluate_lower_bound(const Output& output) { +ov::Tensor ov::util::evaluate_lower_bound(const Output& output) { return evaluate_bound(output, false); } -ov::Tensor ov::evaluate_upper_bound(const Output& output) { +ov::Tensor ov::util::evaluate_upper_bound(const Output& output) { return evaluate_bound(output, true); } -std::pair ov::evaluate_both_bounds(const Output& output) { +std::pair ov::util::evaluate_both_bounds(const Output& output) { const auto& output_tensor = output.get_tensor(); if (output_tensor.get_lower_value() && output_tensor.get_upper_value()) return {output_tensor.get_lower_value(), output_tensor.get_upper_value()}; @@ -381,10 +381,10 @@ bool ov::interval_bound_evaluator(const Node* node, OPENVINO_ASSERT(node->get_input_size() == 2); const auto num_of_outputs = node->get_output_size(); - auto low_0 = ov::evaluate_lower_bound(node->get_input_source_output(0)); - auto low_1 = ov::evaluate_lower_bound(node->get_input_source_output(1)); - auto up_0 = ov::evaluate_upper_bound(node->get_input_source_output(0)); - auto up_1 = ov::evaluate_upper_bound(node->get_input_source_output(1)); + auto low_0 = ov::util::evaluate_lower_bound(node->get_input_source_output(0)); + auto low_1 = ov::util::evaluate_lower_bound(node->get_input_source_output(1)); + auto up_0 = ov::util::evaluate_upper_bound(node->get_input_source_output(0)); + auto up_1 = 
ov::util::evaluate_upper_bound(node->get_input_source_output(1)); if (!low_0 || !low_1 || !up_0 || !up_1) return false; @@ -534,7 +534,7 @@ bool ov::has_and_set_equal_bounds(const Output& source) { if (op::util::is_constant(source.get_node_shared_ptr())) return true; - auto bounds = ov::evaluate_both_bounds(source); + auto bounds = ov::util::evaluate_both_bounds(source); return are_same_tensor(bounds.first, bounds.second); } diff --git a/src/core/src/op/divide.cpp b/src/core/src/op/divide.cpp index b00b731b296351..dfbc25c0bd0553 100644 --- a/src/core/src/op/divide.cpp +++ b/src/core/src/op/divide.cpp @@ -80,16 +80,16 @@ bool evaluate_bound(const Node* node, TensorVector& output_values, bool is_upper OPENVINO_ASSERT(PartialShape::broadcast_merge_into(input_shape, input2.get_partial_shape(), node->get_autob()), "Argument shapes in divide operation are inconsistent."); - const auto input1_low = evaluate_lower_bound(input1); + const auto input1_low = ov::util::evaluate_lower_bound(input1); if (!input1_low) return false; - const auto input1_up = evaluate_upper_bound(input1); + const auto input1_up = ov::util::evaluate_upper_bound(input1); if (!input1_up) return false; - const auto input2_low = evaluate_lower_bound(input2); + const auto input2_low = ov::util::evaluate_lower_bound(input2); if (!input2_low) return false; - const auto input2_up = evaluate_upper_bound(input2); + const auto input2_up = ov::util::evaluate_upper_bound(input2); if (!input2_up) return false; diff --git a/src/core/src/op/mod.cpp b/src/core/src/op/mod.cpp index 1d15ffb60d4971..57306aa0373863 100644 --- a/src/core/src/op/mod.cpp +++ b/src/core/src/op/mod.cpp @@ -78,8 +78,8 @@ namespace { * @return Vector with inputs bounds tensors. */ TensorVector get_bounds(const Node* const op) { - auto&& v_bounds = ov::evaluate_both_bounds(op->input_value(0)); - auto&& m_bounds = ov::evaluate_both_bounds(op->input_value(1)); + auto&& v_bounds = ov::util::evaluate_both_bounds(op->input_value(0)); + auto&& m_bounds = ov::util::evaluate_both_bounds(op->input_value(1)); return {std::move(v_bounds.first), std::move(v_bounds.second), std::move(m_bounds.first), diff --git a/src/core/tests/bound_evaluate.cpp b/src/core/tests/bound_evaluate.cpp index 038c1bae444e9d..4c5c7681a45a0d 100644 --- a/src/core/tests/bound_evaluate.cpp +++ b/src/core/tests/bound_evaluate.cpp @@ -42,7 +42,7 @@ TEST_F(EvaluateBoundTest, no_exception_when_node_has_output_with_dynamic_rank) { fn_op->set_output_type(1, element::i32, PartialShape{{1, 4}}); fn_op->validate_and_infer_types(); - EXPECT_NO_THROW(evaluate_both_bounds(fn_op)); + EXPECT_NO_THROW(ov::util::evaluate_both_bounds(fn_op)); } TEST_F(EvaluateBoundTest, no_exception_when_node_has_output_with_dynamic_element_type) { @@ -50,7 +50,7 @@ TEST_F(EvaluateBoundTest, no_exception_when_node_has_output_with_dynamic_element fn_op->set_output_type(1, element::dynamic, PartialShape{4}); fn_op->validate_and_infer_types(); - EXPECT_NO_THROW(evaluate_both_bounds(fn_op)); + EXPECT_NO_THROW(ov::util::evaluate_both_bounds(fn_op)); } using BoundEvaluatorTest = ::testing::Test; diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl index f22c1ee136c004..3dae1669ccc23f 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl @@ -116,25 +116,13 @@ KERNEL(quantize_input)( #if 
!REALIGN_FP16_OFFSET -# if OUTPUT_3D -# define MAIN_LOOP_ELEMENTS_COUNT INPUT0_SIZE_Y -# else -# define MAIN_LOOP_ELEMENTS_COUNT INPUT0_ELEMENTS_COUNT -# endif + #define MAIN_LOOP_ELEMENTS_COUNT IFM_SIZE #else -// For REALIGN_FP16_OFFSET one feature is processed separately before entering main loop to correct alignment. -# if OUTPUT_3D -# define MAIN_LOOP_ELEMENTS_COUNT (INPUT0_SIZE_Y - 1) -# else -# define MAIN_LOOP_ELEMENTS_COUNT (INPUT0_ELEMENTS_COUNT - 1) -# endif + // For REALIGN_FP16_OFFSET one feature is processed separately before entering main loop to correct alignment. + #define MAIN_LOOP_ELEMENTS_COUNT (IFM_SIZE - 1) #endif -#if OUTPUT_3D -# define INPUT_ELEMENTS_COUNT INPUT0_SIZE_Y -#else -# define INPUT_ELEMENTS_COUNT INPUT0_ELEMENTS_COUNT -#endif +#define INPUT_ELEMENTS_COUNT IFM_SIZE #if IS_DYNAMIC && COMPRESSED_WEIGHTS_INT4 #pragma disable_includes_optimization @@ -316,9 +304,7 @@ inline void FUNC(fc_bf_tiled_kernel_default)( // NOTE: Manually unrolling multiplication loop leads to lower register pressure and allows for bigger block sizes, // but significantly degrades readability and generality of code. // It doesn't also show noticable performance improvement on tested configurations. - #if DECOMPRESSION_SCALE_POST_OP - ACCUMULATOR_VEC_TYPE acc_tmp[TILE_B] = { }; - #endif + ACCUMULATOR_VEC_TYPE acc_tmp[TILE_B] = { }; #if USE_SLM && COMPRESSED_WEIGHTS_INT4 #if TILE_OFM != 2 @@ -481,9 +467,9 @@ inline void FUNC(fc_bf_tiled_kernel_default)( #endif #else #if TILE_OFM > 1 - ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += in_val * ((ACCUMULATOR_TYPE*)(&wei))[W_IDX]; + ((ACCUMULATOR_TYPE*)(&acc_tmp[bi]))[fi] += in_val * ((ACCUMULATOR_TYPE*)(&wei))[W_IDX]; #else - acc[bi] += in_val * ((ACCUMULATOR_TYPE*)(&wei))[W_IDX]; + acc_tmp[bi] += in_val * ((ACCUMULATOR_TYPE*)(&wei))[W_IDX]; #endif #endif } @@ -539,6 +525,18 @@ inline void FUNC(fc_bf_tiled_kernel_default)( } } #endif + +#if !DECOMPRESSION_SCALE_POST_OP + unroll_for (uint bi = 0; bi < TILE_B; ++bi) { + unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) { + #if TILE_OFM > 1 + ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += ((ACCUMULATOR_TYPE*)(&acc_tmp[bi]))[fi]; + #else + acc[bi] += acc_tmp[bi]; + #endif + } + } +#endif } // ===================================================================================================================================== // Leftovers diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp index c6b0acda06c56a..07d81dce5e3f23 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp @@ -15,14 +15,20 @@ static constexpr size_t min_slm_size = 256; namespace kernel_selector { static std::pair get_input_bf_size(const fully_connected_params& params) { - size_t input_f = params.inputs[0].Feature().v; - size_t input_batch = params.inputs[0].Batch().v; + auto& input = params.inputs[0]; + size_t input_f = input.Feature().v; + size_t input_batch = input.Batch().v; + // 3D input if (params.outputs[0].GetLayout() == DataLayout::bfyx) { - input_f = params.inputs[0].Y().v; - input_batch = params.inputs[0].Batch().v * params.inputs[0].Feature().v; + input_f = input.Y().v; + input_batch = input.Batch().v * input.Feature().v; } + // In Some model, input_f could be dynamic in input0. 
In that case, the IFM value of the weights is used instead. + if (input.is_dynamic() && input_f == 0 && params.weights.IFM().v != 0) + input_f = params.weights.IFM().v; + return {input_batch, input_f}; } @@ -153,8 +159,7 @@ bool FullyConnected_bf_tiled::Validate(const Params& params) const { // Dynamic kernel doesn't support dynamic weights yet if (fc_params.is_shape_agnostic && input.is_dynamic()) { - if ((output.GetLayout() == DataLayout::bfyx && input.Y().v == 0) || - (output.GetLayout() == DataLayout::bf && input.Feature().v == 0)) + if (get_input_bf_size(fc_params).second == 0) return false; } @@ -509,6 +514,7 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para jit.AddConstant(MakeJitConstant("DYNAMIC_QUANTIZE", 0)); } + jit.AddConstant(MakeJitConstant("IFM_SIZE", get_input_bf_size(params).second)); jit.AddConstant(MakeJitConstant("SIMD", simd)); jit.AddConstant(MakeJitConstant("TILE_B", dispatchData.tile_m)); jit.AddConstant(MakeJitConstant("HALF_TILE_B", dispatchData.tile_m/2)); @@ -539,16 +545,18 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para // for 3d output we are treating spatial as features if (params.outputs[0].GetLayout() == DataLayout::bfyx) { + auto tile_in_b_pitch = (params.inputs[0].Feature().pitch == 0) ? get_input_bf_size(params).second : params.inputs[0].Feature().pitch; jit.AddConstant(MakeJitConstant("TILE_OUT_F_NUM", params.outputs[0].Y().v)); jit.AddConstant(MakeJitConstant("TILE_OUT_F_PITCH", params.outputs[0].Y().pitch)); - jit.AddConstant(MakeJitConstant("TILE_IN_B_PITCH", params.inputs[0].Feature().pitch)); + jit.AddConstant(MakeJitConstant("TILE_IN_B_PITCH", tile_in_b_pitch)); jit.AddConstant(MakeJitConstant("TILE_OUT_B_PITCH", params.outputs[0].Feature().pitch)); jit.AddConstant(MakeJitConstant("OUTPUT_3D", true)); jit.AddConstant(MakeJitConstant("BATCH_SIZE", "(OUTPUT_BATCH_NUM * OUTPUT_FEATURE_NUM)")); } else { + auto tile_in_b_pitch = (params.inputs[0].Batch().pitch == 0) ? get_input_bf_size(params).second : params.inputs[0].Batch().pitch; jit.AddConstant(MakeJitConstant("TILE_OUT_F_NUM", params.outputs[0].Feature().v)); jit.AddConstant(MakeJitConstant("TILE_OUT_F_PITCH", params.outputs[0].Feature().pitch)); - jit.AddConstant(MakeJitConstant("TILE_IN_B_PITCH", params.inputs[0].Batch().pitch)); + jit.AddConstant(MakeJitConstant("TILE_IN_B_PITCH", tile_in_b_pitch)); jit.AddConstant(MakeJitConstant("TILE_OUT_B_PITCH", params.outputs[0].Batch().pitch)); jit.AddConstant(MakeJitConstant("BATCH_SIZE", "(OUTPUT_BATCH_NUM)")); } @@ -614,6 +622,12 @@ void FullyConnected_bf_tiled::GetUpdateDispatchDataFunc(KernelData& kd) const { kd.kernels[execute_kernel_idx].params.workGroups.local = dispatchData.lws; kd.kernels[execute_kernel_idx].skip_execution = KernelData::SkipKernelExecution(prim_params); + auto& input = prim_params.inputs[0]; + if (prim_params.outputs[0].GetLayout() == DataLayout::bfyx) + OPENVINO_ASSERT(input.X().pad.Total() == 0 && input.Y().pad.Total() == 0, "[GPU] Invalid padding in spatial axes observed in FC bf tiled."); + else + OPENVINO_ASSERT(input.Feature().pad.Total() == 0, "[GPU] Invalid padding in f axis observed in FC bf tiled."); + if (!kd.internalBufferSizes.empty()) { // Pre-quantizing kernel was generated. Update the kernel and intermediate buffers or disable it.
if (execute_type == KernelType::DEFAULT) { @@ -784,7 +798,8 @@ KernelsData FullyConnected_bf_tiled::GetMultiKernelsData(const Params &params, { auto& quan_kernel = kd.kernels[0]; DispatchData dyn_quan_dispatch = dispatchData; - dyn_quan_dispatch.gws = {std::max((fc_params.inputs[0].PhysicalSize() / quantize_grp_size), (size_t)1), 1, 1}; + auto input_size = std::max(fc_params.inputs[0].PhysicalSize(), get_input_bf_size(fc_params).second); + dyn_quan_dispatch.gws = {input_size / quantize_grp_size, 1, 1}; dyn_quan_dispatch.lws = {16, 1, 1}; quan_kernel.params.workGroups.global = dyn_quan_dispatch.gws; quan_kernel.params.workGroups.local = dyn_quan_dispatch.lws; @@ -814,8 +829,8 @@ KernelsData FullyConnected_bf_tiled::GetMultiKernelsData(const Params &params, quan_kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 0}); quan_kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); quan_kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1}); - kd.internalBufferSizes.push_back(fc_params.inputs[0].PhysicalSize()); - kd.internalBufferSizes.push_back(fc_params.inputs[0].PhysicalSize() / quantize_grp_size * 2); + kd.internalBufferSizes.push_back(input_size); + kd.internalBufferSizes.push_back(input_size / quantize_grp_size * 2); kernel_number++; } kd.internalBufferDataType = Datatype::F16; diff --git a/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/matmul.cpp b/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/matmul.cpp index c88a3826fe0f8f..36b6370a85c2f4 100644 --- a/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/matmul.cpp +++ b/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/matmul.cpp @@ -266,9 +266,10 @@ const std::vector IS3D_smoke = { }, {ov::test::static_shapes_to_test_representation({{1, 429}, {1, 429, 1}}), {true, true}}, + { { - {{-1, -1}, {{1, 129}, {2, 129}, {1, 129}, {2, 129}}}, + {{-1, -1, -1}, {{1, 1, 129}, {1, 2, 129}, {1, 1, 129}, {1, 2, 129}}}, {{1, 129, 1}, {{1, 129, 1}, {1, 129, 1}, {1, 129, 1}, {1, 129, 1}}} }, {true, true} diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp index 13607316545d78..3e4c1635229c86 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp @@ -1255,7 +1255,7 @@ class fully_connected_gpu_tests: public ::testing::Test { } } - void test_compressed_int4_scale_dyn_quan(bool is_caching_test, bool is_dynamic, int batch = 1) { + void test_compressed_int4_scale_dyn_quan(bool is_caching_test, bool is_dynamic, int batch = 1, bool is_wei_dyn = false) { tests::random_generator rg(GET_SUITE_NAME); auto& engine = get_test_engine(); @@ -1285,6 +1285,11 @@ class fully_connected_gpu_tests: public ::testing::Test { auto scale_data = rg.generate_random_1d(ofm_num * ifm_num / scales_group_size, -4.0f, 4.0f); set_values(scale_mem, scale_data); + if (is_wei_dyn) { + // ifm_num is dynamic + dyn_input_ps = is_3d ? ov::PartialShape{ -1, -1, -1 } : ov::PartialShape{ -1, -1}; + } + auto in_layout = is_dynamic ?
layout{ dyn_input_ps, data_types::f16, format::bfyx } : layout{ input_ps, data_types::f16, format::bfyx }; @@ -1302,7 +1307,8 @@ class fully_connected_gpu_tests: public ::testing::Test { auto config = get_test_default_config(engine); config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); - config.set_property(ov::intel_gpu::optimize_data(true)); + ov::intel_gpu::ImplementationDesc fc_impl_desc = { format::bfyx, "fully_connected_gpu_bfyx_ref", impl_types::ocl }; + config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"fc_prim", fc_impl_desc} })); network network(engine, topology, config); network.set_input_data("input", input_mem); @@ -1365,13 +1371,13 @@ class fully_connected_gpu_tests: public ::testing::Test { } - void test_compressed_int4_scale(bool is_caching_test, bool is_dynamic, long int batch_num, long int scales_group_size = 128) { + void test_compressed_int4_scale(bool is_caching_test, bool is_dynamic, long int batch_num, long int scales_group_size = 128, bool is_wei_dyn = false) { tests::random_generator rg(GET_SUITE_NAME); auto& engine = get_test_engine(); auto supports_immad = engine.get_device_info().supports_immad; long int ifm_num = 256; - long int ofm_num = 256; + long int ofm_num = 512; auto input_mem = engine.allocate_memory({ { batch_num, ifm_num}, data_types::f16, format::bfyx }); auto weights_mem = engine.allocate_memory({ {ofm_num, ifm_num}, data_types::u4, format::bfyx }); @@ -1392,6 +1398,11 @@ class fully_connected_gpu_tests: public ::testing::Test { auto in_layout = is_dynamic ? layout{ {-1, ifm_num}, data_types::f16, format::bfyx } : layout{ {batch_num, ifm_num}, data_types::f16, format::bfyx }; + if (is_dynamic && is_wei_dyn) { + // ifm_num is dynamic + in_layout = layout{ {-1, -1}, data_types::f16, format::bfyx }; + } + auto dcomp_zp_name = supports_immad ? 
"dcomp_zp" : ""; auto fc_prim = fully_connected("fc_prim", input_info("input"), "weights", "", "scale", dcomp_zp_name, data_types::f16, 2, 2); @@ -1409,6 +1420,8 @@ class fully_connected_gpu_tests: public ::testing::Test { auto config = get_test_default_config(engine); config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + ov::intel_gpu::ImplementationDesc fc_impl_desc = { format::bfyx, "fully_connected_gpu_bfyx_ref", impl_types::ocl }; + config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"fc_prim", fc_impl_desc} })); network network(engine, topology, config); network.set_input_data("input", input_mem); @@ -1461,6 +1474,66 @@ class fully_connected_gpu_tests: public ::testing::Test { ASSERT_NEAR(output_ptr_ref[i], output_ptr[i], 9.0) << "i = " << i; } + void test_compressed_int4_accumulation(bool is_caching_test, bool is_dynamic, long int batch_num) { + auto& engine = get_test_engine(); + + long int ifm_num = 4096; + long int ofm_num = 4; + + auto input_mem = engine.allocate_memory({ { batch_num, ifm_num}, data_types::f16, format::bfyx }); + auto weights_mem = engine.allocate_memory({ {ofm_num, ifm_num}, data_types::u4, format::bfyx }); + auto scale_mem = engine.allocate_memory({ {ofm_num, ifm_num}, data_types::f16, format::bfyx }); + + auto input_data = std::vector(input_mem->count()); + std::fill(input_data.begin(), input_data.end(), 1); + set_values(input_mem, input_data); + + auto weigths_data = std::vector(weights_mem->count() / 2); + std::fill(weigths_data.begin(), weigths_data.end(), 0x11); + set_values(weights_mem, weigths_data); + + auto scale_data = std::vector(scale_mem->count()); + std::fill(scale_data.begin(), scale_data.end(), 1); + set_values(scale_mem, scale_data); + + auto in_layout = is_dynamic ? 
layout{ {-1, ifm_num}, data_types::f16, format::bfyx } + : layout{ {batch_num, ifm_num}, data_types::f16, format::bfyx }; + primitive_id empty_id = ""; + + auto fc_prim = fully_connected("fc_prim", input_info("input"), "weights", empty_id, "scale", empty_id, data_types::f16); + + topology topology( + input_layout("input", in_layout), + data("weights", weights_mem), + data("scale", scale_mem), + fc_prim + ); + + auto config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + config.set_property(ov::intel_gpu::optimize_data(true)); + network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test); + + // Impl is selected only when it is running from cldnn + if (is_dynamic && !engine.get_device_info().supports_immad) { + auto inst = network->get_primitive("fc_prim"); + auto impl = inst->get_impl(); + ASSERT_TRUE(impl != NULL); + ASSERT_EQ(impl->get_kernels().size(), 1); + } + + network->set_input_data("input", input_mem); + + auto outputs = network->execute(); + ASSERT_EQ(outputs.size(), size_t(1)); + ASSERT_EQ(outputs.begin()->first, "fc_prim"); + + auto output_mem = outputs.begin()->second.get_memory(); + cldnn::mem_lock output_ptr (output_mem, get_test_stream()); + for (size_t i = 0; i < output_ptr.size(); i++) + ASSERT_NEAR(ov::float16(ifm_num), output_ptr[i], 9.0) << "i = " << i; + } + void test_compressed_int4_scale_reuse(bool is_caching_test, bool is_dynamic, long int batch_num, long int scales_group_size = 128) { tests::random_generator rg(GET_SUITE_NAME); auto& engine = get_test_engine(); @@ -3259,6 +3332,10 @@ TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic) { this->test_compressed_int4_scale(false, true, 260); } +TEST_F(fully_connected_gpu_tests, compressed_int4_dynamic_acc) { + this->test_compressed_int4_accumulation(false, true, 512); +} + TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_cached) { this->test_compressed_int4_scale(true, true, 260); } @@ -3323,6 +3400,32 @@ TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dyn_cache_dynamic) { this->test_compressed_int4_scale_dyn_quan(true, true, 512); } +TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_f_input) { + this->test_compressed_int4_scale(false, true, 256, true); +} + +TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_f_input_cached) { + this->test_compressed_int4_scale(true, true, 260, true); +} +TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_f_input_b1g64) { + this->test_compressed_int4_scale(false, true, 1, 64, true); +} + +TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_f_input_b1g128) { + this->test_compressed_int4_scale(false, true, 1, 128, true); +} + +TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dyn_quan_dynamic_f_input_single_batch) { + this->test_compressed_int4_scale_dyn_quan(false, true, 1, true); +} + +TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dyn_quan_dynamic_f_input) { + this->test_compressed_int4_scale_dyn_quan(false, true, 512, true); +} + +TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dyn_quan_dynamic_f_input_unaligned) { + this->test_compressed_int4_scale_dyn_quan(false, true, 511, true); +} TEST_F(fully_connected_gpu_tests, compressed_scale_bias) { diff --git a/src/plugins/intel_npu/src/al/include/npu.hpp b/src/plugins/intel_npu/src/al/include/npu.hpp index 925b80ca7734fe..5d46ae3ae2a4ac 100644 --- a/src/plugins/intel_npu/src/al/include/npu.hpp +++ 
b/src/plugins/intel_npu/src/al/include/npu.hpp @@ -92,6 +92,11 @@ class IDevice : public std::enable_shared_from_this { ov::intel_npu::MemType mem_type = ov::intel_npu::MemType::L0_INTERNAL_BUF, void* mem = nullptr); + virtual ov::SoPtr createHostTensor(std::shared_ptr context, + const ov::element::Type& element_type, + const ov::Shape& shape, + const Config& config); + protected: virtual ~IDevice() = default; }; diff --git a/src/plugins/intel_npu/src/al/src/npu.cpp b/src/plugins/intel_npu/src/al/src/npu.cpp index 3b8c670ffd3404..8da55475e9b4f7 100644 --- a/src/plugins/intel_npu/src/al/src/npu.cpp +++ b/src/plugins/intel_npu/src/al/src/npu.cpp @@ -81,4 +81,11 @@ ov::SoPtr IDevice::createRemoteTensor(std::shared_ptr IDevice::createHostTensor(std::shared_ptr, + const ov::element::Type&, + const ov::Shape&, + const Config&) { + OPENVINO_THROW("Create Host Tensor is not supported"); +} + } // namespace intel_npu diff --git a/src/plugins/intel_npu/src/backend/include/zero_device.hpp b/src/plugins/intel_npu/src/backend/include/zero_device.hpp index f198453b932d83..fc4ac58f7643c5 100644 --- a/src/plugins/intel_npu/src/backend/include/zero_device.hpp +++ b/src/plugins/intel_npu/src/backend/include/zero_device.hpp @@ -47,6 +47,11 @@ class ZeroDevice : public IDevice { ov::intel_npu::MemType mem_type = ov::intel_npu::MemType::L0_INTERNAL_BUF, void* mem = nullptr) override; + ov::SoPtr createHostTensor(std::shared_ptr context, + const ov::element::Type& element_type, + const ov::Shape& shape, + const Config& config) override; + ZeroDevice& operator=(const ZeroDevice&) = delete; ZeroDevice(const ZeroDevice&) = delete; diff --git a/src/plugins/intel_npu/src/backend/include/zero_host_tensor.hpp b/src/plugins/intel_npu/src/backend/include/zero_host_tensor.hpp new file mode 100644 index 00000000000000..ce28bf572541bc --- /dev/null +++ b/src/plugins/intel_npu/src/backend/include/zero_host_tensor.hpp @@ -0,0 +1,39 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "intel_npu/al/config/config.hpp" +#include "openvino/runtime/itensor.hpp" +#include "zero_init.hpp" +#include "zero_remote_tensor.hpp" + +namespace intel_npu { + +class ZeroHostTensor : public ov::ITensor { +public: + ZeroHostTensor(std::shared_ptr context, + std::shared_ptr init_structs, + const ov::element::Type element_type, + const ov::Shape& shape, + const Config& config); + + ~ZeroHostTensor() override = default; + + void* data(const ov::element::Type& element_type) const override; + const ov::element::Type& get_element_type() const override; + + const ov::Shape& get_shape() const override; + + const ov::Strides& get_strides() const override; + + void set_shape(ov::Shape new_shape) override; + + std::shared_ptr get_impl() const; + +private: + std::shared_ptr m_impl; +}; + +} // namespace intel_npu diff --git a/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp b/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp index 725a0e96c76f6e..cbf3a9466364be 100644 --- a/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp +++ b/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp @@ -53,8 +53,9 @@ class ZeroInferRequest final : public SyncInferRequest { * @brief Check the received remote tensor and copy it to the Level Zero tensor * @param tensor Reference to a tensor. * @param name Friendly name of the tensor. + * @param isParameter True if tensor is a parameter. 
*/ - void set_remote_tensor_data(std::shared_ptr tensor, const std::string& name); + void set_remote_tensor_data(std::shared_ptr tensor, const std::string& name, bool isParameter); void check_network_precision(const ov::element::Type_t precision) const override; void create_pipeline(); @@ -77,8 +78,7 @@ class ZeroInferRequest final : public SyncInferRequest { // specific operations on the plugin in this case. size_t _batchSize = DEFAULT_BATCH_SIZE; - bool _createPipeline = true; - bool _updateCommandList = false; + bool _pipelineIsCreated = false; }; } // namespace intel_npu diff --git a/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp b/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp index 78bca3718711e3..b8724dcdd53f73 100644 --- a/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp +++ b/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp @@ -16,7 +16,6 @@ struct TensorData { void* mem; size_t size; bool levelZeroTensorCreatedLocally = true; - bool changed = false; }; struct Pipeline { @@ -32,7 +31,7 @@ struct Pipeline { virtual void pull(size_t batch_index) = 0; virtual void reset(size_t batch_index) const = 0; - virtual void updateCommandList(std::unordered_map& tensors_data, size_t batch_size) = 0; + virtual void updateCommandList(const TensorData& tensors_data, uint32_t index, size_t batch_size) = 0; protected: zeroMemory::MemoryManagementUnit _deviceInputs; diff --git a/src/plugins/intel_npu/src/backend/include/zero_wrappers.hpp b/src/plugins/intel_npu/src/backend/include/zero_wrappers.hpp index 2b432619fff4f8..76cfce8fecfa26 100644 --- a/src/plugins/intel_npu/src/backend/include/zero_wrappers.hpp +++ b/src/plugins/intel_npu/src/backend/include/zero_wrappers.hpp @@ -87,7 +87,7 @@ class CommandList { void appendGraphInitialize(const ze_graph_handle_t& graph_handle) const; void appendGraphExecute(const ze_graph_handle_t& graph_handle, const ze_graph_profiling_query_handle_t& profiling_query_handle) const; - void updateMutableCommandList(const void* pNext = nullptr) const; + void updateMutableCommandList(uint32_t arg_index, const void* arg_value) const; void appendNpuTimestamp(uint64_t* timestamp_buff) const; void appendBarrier() const; void close() const; @@ -96,9 +96,6 @@ class CommandList { inline ze_command_list_handle_t handle() const { return _handle; } - uint64_t getCommandListId() const { - return _command_id; - } private: ze_command_list_handle_t _handle = nullptr; diff --git a/src/plugins/intel_npu/src/backend/src/zero_device.cpp b/src/plugins/intel_npu/src/backend/src/zero_device.cpp index 56ee453b7d77c2..595ce734b533e9 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_device.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_device.cpp @@ -9,6 +9,7 @@ #include "intel_npu/al/itt.hpp" #include "intel_npu/utils/zero/zero_api.hpp" #include "zero_executor.hpp" +#include "zero_host_tensor.hpp" #include "zero_infer_request.hpp" #include "zero_remote_tensor.hpp" #include "zero_utils.hpp" @@ -193,3 +194,10 @@ ov::SoPtr ZeroDevice::createRemoteTensor(std::shared_ptr(context, _initStructs, element_type, shape, config, tensor_type, mem_type, mem)}; }; + +ov::SoPtr ZeroDevice::createHostTensor(std::shared_ptr context, + const ov::element::Type& element_type, + const ov::Shape& shape, + const Config& config) { + return {std::make_shared(context, _initStructs, element_type, shape, config)}; +}; diff --git a/src/plugins/intel_npu/src/backend/src/zero_host_tensor.cpp b/src/plugins/intel_npu/src/backend/src/zero_host_tensor.cpp new file mode 
100644 index 00000000000000..e4ebe2c1d5a8ba --- /dev/null +++ b/src/plugins/intel_npu/src/backend/src/zero_host_tensor.cpp @@ -0,0 +1,48 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "zero_host_tensor.hpp" + +#include "openvino/runtime/intel_npu/remote_properties.hpp" + +namespace intel_npu { + +ZeroHostTensor::ZeroHostTensor(std::shared_ptr context, + std::shared_ptr init_structs, + const ov::element::Type element_type, + const ov::Shape& shape, + const Config& config) + : m_impl(std::make_shared(context, + init_structs, + element_type, + shape, + config, + ov::intel_npu::TensorType::BINDED, + ov::intel_npu::MemType::L0_INTERNAL_BUF)) {} + +void* ZeroHostTensor::data(const ov::element::Type&) const { + return m_impl->get_properties().find(ov::intel_npu::mem_handle.name())->second.as(); +} + +const ov::element::Type& ZeroHostTensor::get_element_type() const { + return m_impl->get_element_type(); +} + +const ov::Shape& ZeroHostTensor::get_shape() const { + return m_impl->get_shape(); +} + +const ov::Strides& ZeroHostTensor::get_strides() const { + return m_impl->get_strides(); +} + +void ZeroHostTensor::set_shape(ov::Shape new_shape) { + m_impl->set_shape(new_shape); +} + +std::shared_ptr ZeroHostTensor::get_impl() const { + return m_impl; +} + +} // namespace intel_npu diff --git a/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp b/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp index 36738f32e9f6c3..773827a4864724 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp @@ -402,13 +402,26 @@ void ZeroInferRequest::set_tensor_data(std::shared_ptr tensor, cons if (setTensorData) { _tensorsData[name] = TensorData{_copyAllTensors.at(name)->data(), _copyAllTensors.at(name)->get_byte_size(), - levelZeroTensorCreatedLocally, - !_createPipeline}; - _updateCommandList = true; + levelZeroTensorCreatedLocally}; + + if (_pipelineIsCreated) { + _logger.debug("ZeroInferRequest::infer_async - update command list"); + + intel_npu::ZeroExecutor::ArgumentDescriptor desc; + if (isParameter) { + desc = _executor->inputs_desc_map().at(name); + } else { + desc = _executor->outputs_desc_map().at(name); + } + + _pipeline->updateCommandList(_tensorsData[name], desc.idx, _batchSize); + } } } -void ZeroInferRequest::set_remote_tensor_data(std::shared_ptr tensor, const std::string& name) { +void ZeroInferRequest::set_remote_tensor_data(std::shared_ptr tensor, + const std::string& name, + bool isParameter) { auto l0_context = reinterpret_cast( extract_object(tensor->get_context()->get_property(), ov::intel_npu::l0_context)); if (_initStructs->getContext() != l0_context) { @@ -421,8 +434,20 @@ void ZeroInferRequest::set_remote_tensor_data(std::shared_ptr } _copyAllTensors[name] = tensor; - _tensorsData[name] = TensorData{data, tensor->get_byte_size(), false, !_createPipeline}; - _updateCommandList = true; + _tensorsData[name] = TensorData{data, tensor->get_byte_size(), false}; + + if (_pipelineIsCreated) { + _logger.debug("ZeroInferRequest::infer_async - update command list"); + + intel_npu::ZeroExecutor::ArgumentDescriptor desc; + if (isParameter) { + desc = _executor->inputs_desc_map().at(name); + } else { + desc = _executor->outputs_desc_map().at(name); + } + + _pipeline->updateCommandList(_tensorsData[name], desc.idx, _batchSize); + } } void ZeroInferRequest::set_tensor(const ov::Output& port, const ov::SoPtr& tensor) { @@ -444,7 +469,9 @@ void 
ZeroInferRequest::set_tensor(const ov::Output& port, const ov::op::util::is_parameter(port.get_node())); } else { _logger.debug("ZeroInferRequest::set_tensor - set new remote tensor"); - set_remote_tensor_data(remoteTensor, port.get_node()->get_friendly_name()); + set_remote_tensor_data(remoteTensor, + port.get_node()->get_friendly_name(), + ov::op::util::is_parameter(port.get_node())); } } } @@ -489,23 +516,11 @@ void ZeroInferRequest::infer_async() { OV_ITT_SCOPED_TASK(itt::domains::LevelZeroBackend, "infer_async"); _executor->mutexLock(); - - if (_createPipeline) { + if (!_pipelineIsCreated) { create_pipeline(); - _createPipeline = false; - _updateCommandList = false; + _pipelineIsCreated = true; } - - if (_initStructs->getMutableCommandListVersion()) { - if (_updateCommandList) { - _logger.debug("ZeroInferRequest::infer_async - update command list"); - _pipeline->updateCommandList(_tensorsData, _batchSize); - - _updateCommandList = false; - } - } - _executor->mutexUnlock(); for (const std::string& name : _inputAndStateInputNames) { diff --git a/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp b/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp index 3a4ea554d157ec..f98e84a34a0a46 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp @@ -143,7 +143,7 @@ struct DiscretePipeline final : public Pipeline { } }; - void updateCommandList(std::unordered_map&, size_t) override{}; + void updateCommandList(const TensorData&, uint32_t, size_t) override {} private: const Config _config; @@ -274,60 +274,11 @@ struct IntegratedPipeline final : public Pipeline { _logger.debug("IntegratedPipeline - rest() completed"); }; - void updateCommandList(std::unordered_map& tensors_data, size_t batch_size) override { - std::vector mutable_argument_desc; - int32_t changed_tensors = 0; - - for (const auto& desc : tensors_data) { - if (desc.second.changed == true) { - changed_tensors++; - } - } - - mutable_argument_desc.reserve(changed_tensors); - - auto set_mutable_desc = - [&](int32_t mutable_desc_index, uint64_t command_list_id, uint32_t arg_index, const void* arg_value) { - mutable_argument_desc.emplace_back(ze_mutable_graph_argument_exp_desc_t{ - ZE_STRUCTURE_TYPE_MUTABLE_GRAPH_ARGUMENT_EXP_DESC, - mutable_desc_index ? 
&mutable_argument_desc.at(mutable_desc_index - 1) : nullptr, - command_list_id, - arg_index, - arg_value}); - }; - + void updateCommandList(const TensorData& tensors_data, uint32_t index, size_t batch_size) override { for (size_t i = 0; i < batch_size; i++) { - int32_t mutable_argument_desc_index = -1; - - for (const auto& desc : _executor->inputs_desc_map()) { - TensorData& inputTensorData = tensors_data.at(desc.first); - - if (inputTensorData.changed == true) { - set_mutable_desc( - ++mutable_argument_desc_index, - _command_lists.at(i)->getCommandListId(), - desc.second.idx, - static_cast(inputTensorData.mem) + (i * inputTensorData.size) / batch_size); - - inputTensorData.changed = false; - } - } - - for (const auto& desc : _executor->outputs_desc_map()) { - TensorData& outputTensorData = tensors_data.at(desc.first); - - if (outputTensorData.changed == true) { - set_mutable_desc( - ++mutable_argument_desc_index, - _command_lists.at(i)->getCommandListId(), - desc.second.idx, - static_cast(outputTensorData.mem) + (i * outputTensorData.size) / batch_size); - - outputTensorData.changed = false; - } - } - - _command_lists.at(i)->updateMutableCommandList(&mutable_argument_desc.at(mutable_argument_desc_index)); + _command_lists.at(i)->updateMutableCommandList( + index, + static_cast(tensors_data.mem) + (i * tensors_data.size) / batch_size); _command_lists.at(i)->close(); } }; diff --git a/src/plugins/intel_npu/src/backend/src/zero_wrappers.cpp b/src/plugins/intel_npu/src/backend/src/zero_wrappers.cpp index 77ebd858cc3e07..2cd249aad19a92 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_wrappers.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_wrappers.cpp @@ -114,11 +114,16 @@ CommandList::~CommandList() { _log.error("zeCommandListDestroy failed %#X", uint64_t(result)); } } -void CommandList::updateMutableCommandList(const void* pNext) const { - ze_mutable_commands_exp_desc_t mutable_commands_exp_desc_t = { - static_cast(ZE_MUTABLE_COMMAND_EXP_FLAG_GRAPH_ARGUMENT), - pNext, - 0}; +void CommandList::updateMutableCommandList(uint32_t arg_index, const void* arg_value) const { + ze_mutable_graph_argument_exp_desc_t desc = {ZE_STRUCTURE_TYPE_MUTABLE_GRAPH_ARGUMENT_EXP_DESC, + nullptr, + _command_id, + arg_index, + arg_value}; + + ze_mutable_commands_exp_desc_t mutable_commands_exp_desc_t = {ZE_STRUCTURE_TYPE_MUTABLE_COMMANDS_EXP_DESC, + &desc, + 0}; zeroUtils::throwOnFail("zeCommandListUpdateMutableCommandsExp", zeCommandListUpdateMutableCommandsExp(_handle, &mutable_commands_exp_desc_t)); diff --git a/src/plugins/intel_npu/src/plugin/include/remote_context.hpp b/src/plugins/intel_npu/src/plugin/include/remote_context.hpp index 398884dcb673ac..2fce44526c358e 100644 --- a/src/plugins/intel_npu/src/plugin/include/remote_context.hpp +++ b/src/plugins/intel_npu/src/plugin/include/remote_context.hpp @@ -43,6 +43,14 @@ class RemoteContextImpl : public ov::IRemoteContext { const ov::Shape& shape, const ov::AnyMap& params) override; + /** + * @brief This method is used to create a host tensor object friendly for the device in current context. + * @param type Tensor element type. + * @param shape Tensor shape. + * @return A tensor instance with device friendly memory. 
+ */ + ov::SoPtr create_host_tensor(const ov::element::Type type, const ov::Shape& shape) override; + private: std::shared_ptr get_this_shared_ptr(); diff --git a/src/plugins/intel_npu/src/plugin/src/remote_context.cpp b/src/plugins/intel_npu/src/plugin/src/remote_context.cpp index 25683be31fe9e4..9539826f985147 100644 --- a/src/plugins/intel_npu/src/plugin/src/remote_context.cpp +++ b/src/plugins/intel_npu/src/plugin/src/remote_context.cpp @@ -84,6 +84,15 @@ ov::SoPtr RemoteContextImpl::create_tensor(const ov::element: mem_handle_object); } +ov::SoPtr RemoteContextImpl::create_host_tensor(const ov::element::Type type, const ov::Shape& shape) { + auto device = _backends->getDevice(_config.get()); + if (device == nullptr) { + OPENVINO_THROW("Device is not available"); + } + + return device->createHostTensor(get_this_shared_ptr(), type, shape, _config); +} + const std::string& RemoteContextImpl::get_device_name() const { return _device_name; } diff --git a/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_api.hpp b/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_api.hpp index 3de0dedd8d6878..6cb9e23d203c11 100644 --- a/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_api.hpp +++ b/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_api.hpp @@ -8,6 +8,8 @@ #include +#include "openvino/core/except.hpp" + #ifndef _WIN32 # define LIB_ZE_LOADER_SUFFIX ".1" #endif @@ -26,8 +28,6 @@ namespace intel_npu { symbol_statement(zeCommandListCreate) \ symbol_statement(zeCommandListDestroy) \ symbol_statement(zeCommandListReset) \ - symbol_statement(zeCommandListGetNextCommandIdExp) \ - symbol_statement(zeCommandListUpdateMutableCommandsExp) \ symbol_statement(zeCommandQueueCreate) \ symbol_statement(zeCommandQueueDestroy) \ symbol_statement(zeCommandQueueExecuteCommandLists) \ @@ -58,6 +58,11 @@ namespace intel_npu { symbol_statement(zeMemAllocHost) \ symbol_statement(zeMemFree) \ symbol_statement(zeMemGetAllocProperties) + +//unsupported symbols with older ze_loader versions +#define weak_symbols_list() \ + symbol_statement(zeCommandListGetNextCommandIdExp) \ + symbol_statement(zeCommandListUpdateMutableCommandsExp) // clang-format on class ZeroApi { @@ -73,6 +78,7 @@ class ZeroApi { } #define symbol_statement(symbol) decltype(&::symbol) symbol; symbols_list(); + weak_symbols_list(); #undef symbol_statement private: @@ -84,11 +90,17 @@ class ZeroApi { #define symbol_statement(symbol) \ template \ inline typename std::invoke_result::type wrapped_##symbol(Args... 
args) { \ - return ZeroApi::getInstance().symbol(std::forward(args)...); \ + auto& ref = ZeroApi::getInstance(); \ + if (ref.symbol == nullptr) { \ + OPENVINO_THROW("Unsupported symbol " #symbol); \ + } \ + return ref.symbol(std::forward(args)...); \ } symbols_list(); +weak_symbols_list(); #undef symbol_statement #define symbol_statement(symbol) inline decltype(&::symbol) symbol = wrapped_##symbol; symbols_list(); +weak_symbols_list(); #undef symbol_statement } // namespace intel_npu diff --git a/src/plugins/intel_npu/src/utils/src/zero/zero_api.cpp b/src/plugins/intel_npu/src/utils/src/zero/zero_api.cpp index fd3e128b3afc94..991e8d5f9f9e65 100644 --- a/src/plugins/intel_npu/src/utils/src/zero/zero_api.cpp +++ b/src/plugins/intel_npu/src/utils/src/zero/zero_api.cpp @@ -4,7 +4,6 @@ #include "intel_npu/utils/zero/zero_api.hpp" -#include "openvino/core/except.hpp" #include "openvino/util/file_util.hpp" #include "openvino/util/shared_object.hpp" @@ -29,14 +28,24 @@ ZeroApi::ZeroApi() { try { #define symbol_statement(symbol) \ this->symbol = reinterpret_cast(ov::util::get_symbol(lib, #symbol)); - symbols_list() + symbols_list(); #undef symbol_statement } catch (const std::runtime_error& error) { OPENVINO_THROW(error.what()); } +#define symbol_statement(symbol) \ + try { \ + this->symbol = reinterpret_cast(ov::util::get_symbol(lib, #symbol)); \ + } catch (const std::runtime_error&) { \ + this->symbol = nullptr; \ + } + weak_symbols_list(); +#undef symbol_statement + #define symbol_statement(symbol) symbol = this->symbol; symbols_list(); + weak_symbols_list(); #undef symbol_statement } diff --git a/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.hpp b/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.hpp index 660eb875f72d38..6b7372223c6bea 100644 --- a/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.hpp +++ b/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.hpp @@ -473,6 +473,55 @@ TEST_P(BatchingRunTests, SetInputTensorInfer_Caching) { for (size_t i = 0; i < shape_size; ++i) { EXPECT_NEAR(actual[i], 6.f, 1e-5) << "Expected=6, actual=" << actual[i] << " for index " << i; } + + delete[] buffer; +} + +TEST_P(BatchingRunTests, CheckTwoRunsInfer) { + auto batch_shape = Shape{4, 2, 2, 2}; + auto shape_size = ov::shape_size(batch_shape); + auto model = createBatchingModel(element::f32, batch_shape, "N..."); + float* buffer = new float[shape_size]; + + auto context = core->get_default_context(target_device); + + compiled_model = core->compile_model(model, target_device, configuration); + ov::InferRequest inference_request; + inference_request = compiled_model.create_infer_request(); + + ov::Tensor tensor{element::f32, batch_shape, buffer}; + + inference_request.set_input_tensor(tensor); + auto actual_tensor = inference_request.get_output_tensor(0); + auto* actual = actual_tensor.data(); + auto* input_data = tensor.data(); + for (size_t i = 0; i < shape_size; ++i) { + input_data[i] = 5.f; + } + inference_request.infer(); // Adds '1' to each element + for (size_t i = 0; i < shape_size; ++i) { + EXPECT_NEAR(actual[i], 6.f, 1e-5) << "Expected=6, actual=" << actual[i] << " for index " << i; + } + + auto l0_host_input_tensor = context.create_host_tensor(ov::element::f32, batch_shape); + auto l0_host_output_tensor = context.create_host_tensor(ov::element::f32, actual_tensor.get_shape()); + + auto* input_data_host_tensor = l0_host_input_tensor.data(); + input_data = reinterpret_cast(input_data_host_tensor); + for (size_t i = 0; i < 
shape_size; ++i) { + input_data[i] = 5.f; + } + inference_request.set_input_tensor(l0_host_input_tensor); + inference_request.set_output_tensor(l0_host_output_tensor); + inference_request.infer(); + + auto* actual_host_tensor = l0_host_output_tensor.data(); + actual = reinterpret_cast(actual_host_tensor); + for (size_t i = 0; i < shape_size; ++i) { + EXPECT_NEAR(actual[i], 6.f, 1e-5) << "Expected=6, actual=" << actual[i] << " for index " << i; + } + + delete[] buffer; } } // namespace behavior diff --git a/src/plugins/intel_npu/tests/functional/behavior/remote_tensor_tests/remote_run.hpp b/src/plugins/intel_npu/tests/functional/behavior/remote_tensor_tests/remote_run.hpp index a6023e6e678d3d..a58da0253a9d74 100644 --- a/src/plugins/intel_npu/tests/functional/behavior/remote_tensor_tests/remote_run.hpp +++ b/src/plugins/intel_npu/tests/functional/behavior/remote_tensor_tests/remote_run.hpp @@ -128,7 +128,7 @@ TEST_P(RemoteRunTests, CheckRemoteTensorInternalBufChangingTensors) { // set output remote tensor auto remote_output_tensor = inference_request.get_output_tensor(); - auto output_remote_tensor = context.create_l0_host_tensor(ov::element::f32, remote_output_tensor.get_shape()); + auto output_remote_tensor = context.create_tensor(ov::element::f32, remote_output_tensor.get_shape()); remote_output_tensor = {}; OV_ASSERT_NO_THROW(inference_request.set_output_tensor(output_remote_tensor)); @@ -202,8 +202,7 @@ TEST_P(RemoteRunTests, CheckOutputDataFromTwoRunsInOutRemoteTensors1) { auto remote_input_tensor = context.create_l0_host_tensor(ov::element::f32, input_shape, ov::intel_npu::TensorType::INPUT); - remote_output_tensor = context.create_l0_host_tensor(ov::element::f32, output_shape) - .as(); + remote_output_tensor = context.create_l0_host_tensor(ov::element::f32, output_shape); memset(remote_input_tensor.get(), 99, byte_size); OV_ASSERT_NO_THROW(inference_request.set_input_tensor(remote_input_tensor)); @@ -305,8 +304,7 @@ TEST_P(RemoteRunTests, CheckOutputDataFromTwoRunsInOutRemoteTensors3) { auto remote_input_tensor = context.create_l0_host_tensor(ov::element::f32, input_shape, ov::intel_npu::TensorType::INPUT); - auto remote_output_tensor = - context.create_l0_host_tensor(ov::element::f32, output_shape).as(); + auto remote_output_tensor = context.create_l0_host_tensor(ov::element::f32, output_shape); memset(remote_input_tensor.get(), 99, byte_size); OV_ASSERT_NO_THROW(inference_request.set_input_tensor(remote_input_tensor)); @@ -318,6 +316,74 @@ TEST_P(RemoteRunTests, CheckOutputDataFromTwoRunsInOutRemoteTensors3) { EXPECT_EQ(memcmp(first_output.data(), second_output, first_output.get_byte_size()), 0); } +TEST_P(RemoteRunTests, CheckOutputDataFromTwoRunsInOutRemoteTensorsHostTensor1) { + // Skip test according to plugin specific disabledTestPatterns() (if any) + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + ov::InferRequest inference_request; + ov::Tensor first_output; + + auto context = core->get_default_context(target_device).as(); + + OV_ASSERT_NO_THROW(compiled_model = core->compile_model(ov_model, target_device, configuration)); + OV_ASSERT_NO_THROW(inference_request = compiled_model.create_infer_request()); + auto tensor = inference_request.get_input_tensor(); + memset(tensor.data(), 99, tensor.get_byte_size()); + OV_ASSERT_NO_THROW(inference_request.infer()); + first_output = inference_request.get_output_tensor(); + + auto l0_host_input_tensor = context.create_host_tensor(ov::element::f32, tensor.get_shape()); + auto l0_host_output_tensor = context.create_host_tensor(ov::element::f32, 
first_output.get_shape()); + + memset(l0_host_input_tensor.data(), 99, tensor.get_byte_size()); + OV_ASSERT_NO_THROW(inference_request.set_input_tensor(l0_host_input_tensor)); + OV_ASSERT_NO_THROW(inference_request.set_output_tensor(l0_host_output_tensor)); + OV_ASSERT_NO_THROW(inference_request.infer()); + + EXPECT_NE(first_output.data(), l0_host_output_tensor.data()); + EXPECT_EQ(memcmp(first_output.data(), l0_host_output_tensor.data(), first_output.get_byte_size()), 0); +} + +TEST_P(RemoteRunTests, CheckOutputDataFromTwoRunsInOutRemoteTensorsHostTensor2) { + // Skip test according to plugin specific disabledTestPatterns() (if any) + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + ov::InferRequest inference_request; + + auto context = core->get_default_context(target_device).as(); + + OV_ASSERT_NO_THROW(compiled_model = core->compile_model(ov_model, target_device, configuration)); + OV_ASSERT_NO_THROW(inference_request = compiled_model.create_infer_request()); + auto input_tensor = inference_request.get_input_tensor(); + auto output_tensor = inference_request.get_output_tensor(); + const auto byte_size = input_tensor.get_byte_size(); + auto input_shape = input_tensor.get_shape(); + auto output_shape = output_tensor.get_shape(); + input_tensor = {}; + output_tensor = {}; + + auto remote_input_tensor = + context.create_l0_host_tensor(ov::element::f32, input_shape, ov::intel_npu::TensorType::INPUT); + auto remote_output_tensor = + context.create_l0_host_tensor(ov::element::f32, output_shape, ov::intel_npu::TensorType::INPUT); + memset(remote_input_tensor.get(), 1, byte_size); + OV_ASSERT_NO_THROW(inference_request.set_input_tensor(remote_input_tensor)); + OV_ASSERT_NO_THROW(inference_request.set_output_tensor(remote_output_tensor)); + OV_ASSERT_NO_THROW(inference_request.infer()); + + auto l0_host_input_tensor = context.create_host_tensor(ov::element::f32, input_shape); + auto l0_host_output_tensor = context.create_host_tensor(ov::element::f32, output_shape); + + memset(l0_host_input_tensor.data(), 99, byte_size); + OV_ASSERT_NO_THROW(inference_request.set_input_tensor(l0_host_input_tensor)); + OV_ASSERT_NO_THROW(inference_request.set_output_tensor(l0_host_output_tensor)); + OV_ASSERT_NO_THROW(inference_request.infer()); + + EXPECT_NE(remote_output_tensor.get(), l0_host_output_tensor.data()); + EXPECT_NE(memcmp(remote_output_tensor.get(), l0_host_output_tensor.data(), remote_output_tensor.get_byte_size()), + 0); +} + } // namespace behavior } // namespace test } // namespace ov
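A condensed sketch of the host-tensor flow exercised by the new NPU tests above. It assumes only ov::RemoteContext::create_host_tensor and the standard infer-request calls already used in those tests; the helper name, its arguments, and the static-shape assumption are placeholders for illustration:

    #include <algorithm>
    #include "openvino/openvino.hpp"

    // Runs one inference using Level Zero host tensors obtained from the remote context.
    static void infer_with_host_tensors(ov::CompiledModel& compiled, ov::RemoteContext& context) {
        ov::InferRequest request = compiled.create_infer_request();
        const auto input_shape = compiled.input().get_shape();    // assumes a static, single-input model
        const auto output_shape = compiled.output().get_shape();

        // create_host_tensor routes to the new createHostTensor / ZeroHostTensor path on NPU.
        ov::Tensor host_input = context.create_host_tensor(ov::element::f32, input_shape);
        ov::Tensor host_output = context.create_host_tensor(ov::element::f32, output_shape);

        std::fill_n(host_input.data<float>(), ov::shape_size(input_shape), 5.f);
        request.set_input_tensor(host_input);
        request.set_output_tensor(host_output);
        request.infer();  // results are written directly into host_output's device-friendly memory
    }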