From fd2117aff0045441338543bb5366ff90a7922f62 Mon Sep 17 00:00:00 2001
From: fishbell
Date: Fri, 24 Jan 2025 14:53:03 +0800
Subject: [PATCH] remove blocked format support from quantize vload8 kernel

The vload8-optimized quantize kernel processes the tensor as a flat
vector, so blocked formats with feature padding would spend work on
padded elements and need extra guarding in the kernel body. Restrict
the kernel to simple layouts whose physical size is a multiple of the
vector width, drop the blocked-format JIT constants and layout checks,
and add a unit test for the optimized path.

Signed-off-by: fishbell
---
 .../quantize_gpu_scale_shift_vload8_opt.cl    | 28 +++------
 ...quantize_kernel_scale_shift_vload8_opt.cpp | 49 +----------
 .../unit/test_cases/quantize_gpu_test.cpp     | 82 ++++++++++++++++++-
 3 files changed, 93 insertions(+), 66 deletions(-)

diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/quantize_gpu_scale_shift_vload8_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/quantize_gpu_scale_shift_vload8_opt.cl
index efed489fbcf736..409c4a09f32366 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/quantize_gpu_scale_shift_vload8_opt.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/quantize_gpu_scale_shift_vload8_opt.cl
@@ -41,15 +41,6 @@ KERNEL(quantize_gpu_scale_shift_vload8_opt)(OPTIONAL_SHAPE_INFO_ARG
 
     OUTPUT_VEC_TYPE res;
 
-    INPUT1_TYPE input_scale_val = IN_SCALE_VAL;
-
-    INPUT1_TYPE input_shift_val = IN_SHIFT_VAL;
-
-    INPUT1_TYPE output_scale_val = OUT_SCALE_VAL;
-
-    INPUT1_TYPE output_shift_val = OUT_SHIFT_VAL;
-
-
 #if HAS_CLAMP
 #if CAN_USE_OUTPUT_RANGE
     INPUT1_TYPE output_low_val = OUT_LO_VAL;
@@ -67,9 +58,9 @@ KERNEL(quantize_gpu_scale_shift_vload8_opt)(OPTIONAL_SHAPE_INFO_ARG
 
 #if CAN_USE_OUTPUT_RANGE
 #if HAS_PRE_SHIFT
-    INPUT1_VEC_TYPE val = TO_VECTOR_TYPE(INPUT1_TYPE, 8)(in0) * input_scale_val + input_shift_val;
+    INPUT1_VEC_TYPE val = TO_VECTOR_TYPE(INPUT1_TYPE, 8)(in0) * IN_SCALE_VAL + IN_SHIFT_VAL;
 #else
-    INPUT1_VEC_TYPE val = TO_VECTOR_TYPE(INPUT1_TYPE, 8)(in0) * input_scale_val;
+    INPUT1_VEC_TYPE val = TO_VECTOR_TYPE(INPUT1_TYPE, 8)(in0) * IN_SCALE_VAL;
 #endif
 
 #if HAS_OUTPUT_RANGE_ROUND
@@ -77,11 +68,11 @@ KERNEL(quantize_gpu_scale_shift_vload8_opt)(OPTIONAL_SHAPE_INFO_ARG
 #endif
 
 #if HAS_POST_SCALE
-    val *= output_scale_val;
+    val *= OUT_SCALE_VAL;
 #endif
 
 #if HAS_POST_SHIFT
-    val += output_shift_val;
+    val += OUT_SHIFT_VAL;
 #endif
 
 #if HAS_CLAMP
@@ -107,17 +98,17 @@ KERNEL(quantize_gpu_scale_shift_vload8_opt)(OPTIONAL_SHAPE_INFO_ARG
 #endif
 
 #if HAS_PRE_SHIFT
-    val = round(val * input_scale_val + input_shift_val);
+    val = round(val * IN_SCALE_VAL + IN_SHIFT_VAL);
 #else
-    val = round(val * input_scale_val);
+    val = round(val * IN_SCALE_VAL);
 #endif
 
 #if HAS_POST_SCALE
-    val *= output_scale_val;
+    val *= OUT_SCALE_VAL;
 #endif
 
 #if HAS_POST_SHIFT
-    val += output_shift_val;
+    val += OUT_SHIFT_VAL;
 #endif
 
 #endif // CAN_USE_OUTPUT_RANGE
@@ -127,7 +118,4 @@ KERNEL(quantize_gpu_scale_shift_vload8_opt)(OPTIONAL_SHAPE_INFO_ARG
 
 // *********************************** //
-#if FEATURE_BLOCKED_FORMAT
-    if (of < OUTPUT_FEATURE_NUM)
-#endif
 #if OUTPUT_IS_FP
     res = TO_VECTOR_TYPE_SAT(OUTPUT_TYPE, 8)(val);
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/quantize/quantize_kernel_scale_shift_vload8_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/quantize/quantize_kernel_scale_shift_vload8_opt.cpp
index a99ec9f8b05069..6539928c9a76f5 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/quantize/quantize_kernel_scale_shift_vload8_opt.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/quantize/quantize_kernel_scale_shift_vload8_opt.cpp
@@ -51,27 +51,6 @@ JitConstants QuantizeKernelScaleShift_vload8::GetJitConstants(const quantize_params& params,
                                                               const CommonDispatchData& dispatchData) const {
     JitConstants jit = Parent::GetJitConstants(params, dispatchData);
 
-    if (params.outputs[0].GetLayout() == DataLayout::b_fs_yx_fsv16 ||
-        params.outputs[0].GetLayout() == DataLayout::b_fs_yx_fsv16 ||
-        params.outputs[0].GetLayout() == DataLayout::b_fs_zyx_fsv32 ||
-        params.outputs[0].GetLayout() == DataLayout::bs_fs_yx_bsv16_fsv16 ||
-        params.outputs[0].GetLayout() == DataLayout::bs_fs_yx_bsv16_fsv32 ||
-        params.outputs[0].GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv16 ||
-        params.outputs[0].GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv32 ||
-        params.outputs[0].GetLayout() == DataLayout::bs_fs_zyx_bsv16_fsv16 ||
-        params.outputs[0].GetLayout() == DataLayout::bs_fs_zyx_bsv16_fsv32 ||
-        params.outputs[0].GetLayout() == DataLayout::bs_fs_zyx_bsv32_fsv16 ||
-        params.outputs[0].GetLayout() == DataLayout::bs_fs_zyx_bsv32_fsv32) {
-        jit.AddConstant(MakeJitConstant("FEATURE_BLOCKED_FORMAT", true));
-        jit.AddConstant(MakeJitConstant("GWS_BATCH", 2));
-        jit.AddConstant(MakeJitConstant("GWS_FEATURE", 1));
-        jit.AddConstant(MakeJitConstant("GWS_YX", 0));
-        jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", sub_group_size));
-    } else {
-        auto tensor_jits = GetTensorFriendlyWorkGroupsJit(params.outputs[0]);
-        jit.Merge(tensor_jits);
-    }
-
     auto can_use_output_range = params.per_tensor_output_range && params.out_lo < params.out_hi;
     auto has_output_range_round =
         !(params.outputs[0].GetDType() == Datatype::INT8 || params.outputs[0].GetDType() == Datatype::UINT8);
@@ -106,31 +85,11 @@ bool QuantizeKernelScaleShift_vload8::Validate(const Params& p) const {
         !params.per_tensor_output_scale || !params.per_tensor_output_shift ||
         (params.has_pre_shift && !params.per_tensor_input_shift))
         return false;
-    // TBD, do we really need the strick block_size checking to support blocked foramt?
-    for (size_t i = 0; i < params.inputs.size(); i++) {
-        const auto input_layout = params.inputs[i].GetLayout();
-        const auto batch_size = params.inputs[i].Batch().v;
-        const auto feature_size = params.inputs[i].Feature().v;
-        if ((input_layout == DataLayout::b_fs_yx_fsv16 && feature_size % 16 != 0) ||
-            (input_layout == DataLayout::b_fs_yx_fsv32 && feature_size % 32 != 0) ||
-            (input_layout == DataLayout::b_fs_zyx_fsv16 && feature_size % 16 != 0) ||
-            (input_layout == DataLayout::b_fs_yx_fsv4 && feature_size % 8 != 0) ||
-            input_layout == DataLayout::fs_b_yx_fsv32 ||
-            (input_layout == DataLayout::bs_fs_yx_bsv32_fsv16 && (feature_size % 16 != 0 || batch_size % 32 != 0)) ||
-            (input_layout == DataLayout::bs_fs_yx_bsv32_fsv32 && (feature_size % 32 != 0 || batch_size % 32 != 0)))
-            return false;
-    }
-    if ((params.outputs[0].GetLayout() == DataLayout::b_fs_yx_fsv16 && params.outputs[0].Feature().v % 16 != 0) ||
-        (params.outputs[0].GetLayout() == DataLayout::b_fs_yx_fsv32 && params.outputs[0].Feature().v % 32 != 0) ||
-        (params.outputs[0].GetLayout() == DataLayout::b_fs_zyx_fsv16 && params.outputs[0].Feature().v % 16 != 0) ||
-        (params.outputs[0].GetLayout() == DataLayout::b_fs_yx_fsv4 && params.outputs[0].Feature().v % 8 != 0) ||
-        params.outputs[0].GetLayout() == DataLayout::fs_b_yx_fsv32 ||
-        (params.outputs[0].GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv16 &&
-         (params.outputs[0].Feature().v % 16 != 0 || params.outputs[0].Batch().v % 32 != 0)) ||
-        (params.outputs[0].GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv32 &&
-         (params.outputs[0].Feature().v % 32 != 0 || params.outputs[0].Batch().v % 32 != 0)))
+    // Blocked (non-simple) layouts are not supported: with feature padding the
+    // flat vload8 access pattern would also process the padding elements.
+    if (!params.outputs[0].SimpleLayout() || params.outputs[0].GetLayout() != params.inputs[0].GetLayout() ||
+        params.outputs[0].PhysicalSize() % 8 != 0)
         return false;
-        // TBD maybe need more stric check?
     return true;
 }
 
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/quantize_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/quantize_gpu_test.cpp
index bb44a1022f368d..437af0b503b7a7 100644
--- a/src/plugins/intel_gpu/tests/unit/test_cases/quantize_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/quantize_gpu_test.cpp
@@ -749,6 +749,87 @@ TEST(quantize_gpu, dynamic) {
     }
 }
 
+TEST(quantize_gpu, opt_vec_kernel) {
+    auto& engine = get_test_engine();
+
+    auto input = engine.allocate_memory({ { 1, 16, 2, 2 }, data_types::f32, format::bfyx });
+    auto input_low = engine.allocate_memory({ { 1, 1, 1, 1 }, data_types::f32, format::bfyx });
+    auto input_high = engine.allocate_memory({ { 1, 1, 1, 1 }, data_types::f32, format::bfyx });
+    auto output_low = engine.allocate_memory({ { 1, 1, 1, 1 }, data_types::f32, format::bfyx });
+    auto output_high = engine.allocate_memory({ { 1, 1, 1, 1 }, data_types::f32, format::bfyx });
+
+    layout in_dyn_layout { ov::PartialShape::dynamic(4), data_types::f32, format::bfyx };
+
+    set_values(input, { -1.0f, 2.1f, 3.0f, 4.0f,
+                        5.0f, 2.0f, 2.0f, 3.0f,
+                        4.0f, 6.0f, 3.0f, 3.0f,
+                        3.0f, 5.0f, 1.0f, 1.0f,
+
+                        1.0f, 1.0f, 1.0f, 1.0f,
+                        4.0f, 6.0f, 3.0f, 3.0f,
+                        3.0f, 5.0f, 1.0f, 1.0f,
+                        1.0f, 1.0f, 1.0f, 1.0f,
+
+                        1.0f, 2.0f, 3.0f, 4.0f,
+                        5.0f, 2.0f, 2.0f, 3.0f,
+                        4.0f, 6.0f, 3.0f, 3.0f,
+                        3.0f, 5.0f, 1.0f, 1.0f,
+
+                        1.0f, 1.0f, 1.0f, 1.0f,
+                        4.0f, 6.0f, 3.0f, 3.0f,
+                        3.0f, 5.0f, 1.0f, 1.0f,
+                        1.0f, 1.0f, 1.0f, 1.0f });
+
+    set_values(input_low, { 0.0f });
+    set_values(input_high, { 10.0f });
+
+    set_values(output_low, { 0.0f });
+    set_values(output_high, { 255.0f });
+
+    // Quantization maps [0, 10] to [0, 255], i.e. scale = 25.5: e.g. round(2.1 * 25.5) = 54.
+    // Exact ties such as 3.0 * 25.5 = 76.5 depend on the rounding mode, hence ASSERT_NEAR below.
+    std::vector<uint8_t> ref_data = {0, 54, 76, 102, 128, 51, 51, 76, 102, 153, 76, 76, 76, 128, 26, 26,
+                                     26, 26, 26, 26, 102, 153, 76, 76, 76, 128, 26, 26, 26, 26, 26, 26,
+                                     26, 51, 76, 102, 128, 51, 51, 76, 102, 153, 76, 76, 76, 128, 26, 26,
+                                     26, 26, 26, 26, 102, 153, 76, 76, 76, 128, 26, 26, 26, 26, 26, 26};
+
+    topology topology;
+    topology.add(
+        input_layout("input", in_dyn_layout),
+        data("input_low", input_low),
+        data("input_high", input_high),
+        data("output_low", output_low),
+        data("output_high", output_high),
+        quantize("quantize", input_info("input"), input_info("input_low"), input_info("input_high"), input_info("output_low"), input_info("output_high"), 255, data_types::u8)
+    );
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    network network(engine, topology, config);
+    network.set_input_data("input", input);
+
+    auto inst = network.get_primitive("quantize");
+    auto impl = inst->get_impl();
+    ASSERT_TRUE(impl != nullptr);
+    ASSERT_TRUE(impl->is_dynamic());
+
+    auto outputs = network.execute();
+
+    auto output = outputs.at("quantize").get_memory();
+    cldnn::mem_lock<uint8_t> output_ptr(output, get_test_stream());
+
+    // Check that the layout and the memory contain the logical size of the tensor
+    ASSERT_EQ(output->count(), (size_t)64);
+    ASSERT_EQ(output->get_layout().count(), (size_t)64);
+
+    ASSERT_EQ(output->size(), ref_data.size() * sizeof(uint8_t));
+
+    for (size_t i = 0; i < ref_data.size(); ++i) {
+        ASSERT_NEAR(output_ptr[i], ref_data[i], 1) << " index = " << i;
+    }
+}
+
 TEST(quantize_gpu, dynamic_fsv16) {
     auto& engine = get_test_engine();
 
@@ -1050,7 +1131,6 @@ struct quantize_random_test : testing::TestWithParam