From fd2117aff0045441338543bb5366ff90a7922f62 Mon Sep 17 00:00:00 2001
From: fishbell
Date: Fri, 24 Jan 2025 14:53:03 +0800
Subject: [PATCH] remove blocked format support from quantize vload8 kernel

The vload8-optimized quantize kernel processes the tensor as a flat
vector, so blocked formats with feature padding would spend work on
padded elements and need extra guarding in the kernel body. Restrict
the kernel to simple layouts whose physical size is a multiple of the
vector width, drop the blocked-format JIT constants and layout checks,
and add a unit test for the optimized path.

Signed-off-by: fishbell
---
 .../quantize_gpu_scale_shift_vload8_opt.cl    | 28 +++------
 ...quantize_kernel_scale_shift_vload8_opt.cpp | 49 +----------
 .../unit/test_cases/quantize_gpu_test.cpp     | 82 ++++++++++++++++++-
 3 files changed, 93 insertions(+), 66 deletions(-)

diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/quantize_gpu_scale_shift_vload8_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/quantize_gpu_scale_shift_vload8_opt.cl
index efed489fbcf736..409c4a09f32366 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/quantize_gpu_scale_shift_vload8_opt.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/quantize_gpu_scale_shift_vload8_opt.cl
@@ -41,15 +41,6 @@ KERNEL(quantize_gpu_scale_shift_vload8_opt)(OPTIONAL_SHAPE_INFO_ARG
 
     OUTPUT_VEC_TYPE res;
 
-    INPUT1_TYPE input_scale_val = IN_SCALE_VAL;
-
-    INPUT1_TYPE input_shift_val = IN_SHIFT_VAL;
-
-    INPUT1_TYPE output_scale_val = OUT_SCALE_VAL;
-
-    INPUT1_TYPE output_shift_val = OUT_SHIFT_VAL;
-
-
 #if HAS_CLAMP
 #if CAN_USE_OUTPUT_RANGE
     INPUT1_TYPE output_low_val = OUT_LO_VAL;
@@ -67,9 +58,9 @@ KERNEL(quantize_gpu_scale_shift_vload8_opt)(OPTIONAL_SHAPE_INFO_ARG
 
 #if CAN_USE_OUTPUT_RANGE
 #if HAS_PRE_SHIFT
-    INPUT1_VEC_TYPE val = TO_VECTOR_TYPE(INPUT1_TYPE, 8)(in0) * input_scale_val + input_shift_val;
+    INPUT1_VEC_TYPE val = TO_VECTOR_TYPE(INPUT1_TYPE, 8)(in0) * IN_SCALE_VAL + IN_SHIFT_VAL;
 #else
-    INPUT1_VEC_TYPE val = TO_VECTOR_TYPE(INPUT1_TYPE, 8)(in0) * input_scale_val;
+    INPUT1_VEC_TYPE val = TO_VECTOR_TYPE(INPUT1_TYPE, 8)(in0) * IN_SCALE_VAL;
 #endif
 
 #if HAS_OUTPUT_RANGE_ROUND
@@ -77,11 +68,11 @@ KERNEL(quantize_gpu_scale_shift_vload8_opt)(OPTIONAL_SHAPE_INFO_ARG
 #endif
 
 #if HAS_POST_SCALE
-    val *= output_scale_val;
+    val *= OUT_SCALE_VAL;
 #endif
 
 #if HAS_POST_SHIFT
-    val += output_shift_val;
+    val += OUT_SHIFT_VAL;
 #endif
 
 #if HAS_CLAMP
@@ -107,17 +98,17 @@ KERNEL(quantize_gpu_scale_shift_vload8_opt)(OPTIONAL_SHAPE_INFO_ARG
 #endif
 
 #if HAS_PRE_SHIFT
-    val = round(val * input_scale_val + input_shift_val);
+    val = round(val * IN_SCALE_VAL + IN_SHIFT_VAL);
 #else
-    val = round(val * input_scale_val);
+    val = round(val * IN_SCALE_VAL);
 #endif
 
 #if HAS_POST_SCALE
-    val *= output_scale_val;
+    val *= OUT_SCALE_VAL;
 #endif
 
 #if HAS_POST_SHIFT
-    val += output_shift_val;
+    val += OUT_SHIFT_VAL;
 #endif
 
 #endif // CAN_USE_OUTPUT_RANGE
@@ -127,7 +118,4 @@ KERNEL(quantize_gpu_scale_shift_vload8_opt)(OPTIONAL_SHAPE_INFO_ARG
 
 // *********************************** //
-#if FEATURE_BLOCKED_FORMAT
-    if (of < OUTPUT_FEATURE_NUM)
-#endif
 #if OUTPUT_IS_FP
     res = TO_VECTOR_TYPE_SAT(OUTPUT_TYPE, 8)(val);
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/quantize/quantize_kernel_scale_shift_vload8_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/quantize/quantize_kernel_scale_shift_vload8_opt.cpp
index a99ec9f8b05069..6539928c9a76f5 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/quantize/quantize_kernel_scale_shift_vload8_opt.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/quantize/quantize_kernel_scale_shift_vload8_opt.cpp
@@ -51,27 +51,6 @@ JitConstants QuantizeKernelScaleShift_vload8::GetJitConstants(const quantize_params& params,
                                                               const CommonDispatchData& dispatchData) const {
     JitConstants jit = Parent::GetJitConstants(params, dispatchData);
 
-    if (params.outputs[0].GetLayout() == DataLayout::b_fs_yx_fsv16 ||
-        params.outputs[0].GetLayout() == DataLayout::b_fs_yx_fsv16 ||
-        params.outputs[0].GetLayout() == DataLayout::b_fs_zyx_fsv32 ||
-        params.outputs[0].GetLayout() == DataLayout::bs_fs_yx_bsv16_fsv16 ||
-        params.outputs[0].GetLayout() == DataLayout::bs_fs_yx_bsv16_fsv32 ||
-        params.outputs[0].GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv16 ||
-        params.outputs[0].GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv32 ||
-        params.outputs[0].GetLayout() == DataLayout::bs_fs_zyx_bsv16_fsv16 ||
-        params.outputs[0].GetLayout() == DataLayout::bs_fs_zyx_bsv16_fsv32 ||
-        params.outputs[0].GetLayout() == DataLayout::bs_fs_zyx_bsv32_fsv16 ||
-        params.outputs[0].GetLayout() == DataLayout::bs_fs_zyx_bsv32_fsv32) {
-        jit.AddConstant(MakeJitConstant("FEATURE_BLOCKED_FORMAT", true));
-        jit.AddConstant(MakeJitConstant("GWS_BATCH", 2));
-        jit.AddConstant(MakeJitConstant("GWS_FEATURE", 1));
-        jit.AddConstant(MakeJitConstant("GWS_YX", 0));
-        jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", sub_group_size));
-    } else {
-        auto tensor_jits = GetTensorFriendlyWorkGroupsJit(params.outputs[0]);
-        jit.Merge(tensor_jits);
-    }
-
     auto can_use_output_range = params.per_tensor_output_range && params.out_lo < params.out_hi;
     auto has_output_range_round =
         !(params.outputs[0].GetDType() == Datatype::INT8 || params.outputs[0].GetDType() == Datatype::UINT8);
@@ -106,31 +85,11 @@ bool QuantizeKernelScaleShift_vload8::Validate(const Params& p) const {
         !params.per_tensor_output_scale || !params.per_tensor_output_shift ||
         (params.has_pre_shift && !params.per_tensor_input_shift))
         return false;
-    // TBD, do we really need the strick block_size checking to support blocked foramt?
-    for (size_t i = 0; i < params.inputs.size(); i++) {
-        const auto input_layout = params.inputs[i].GetLayout();
-        const auto batch_size = params.inputs[i].Batch().v;
-        const auto feature_size = params.inputs[i].Feature().v;
-        if ((input_layout == DataLayout::b_fs_yx_fsv16 && feature_size % 16 != 0) ||
-            (input_layout == DataLayout::b_fs_yx_fsv32 && feature_size % 32 != 0) ||
-            (input_layout == DataLayout::b_fs_zyx_fsv16 && feature_size % 16 != 0) ||
-            (input_layout == DataLayout::b_fs_yx_fsv4 && feature_size % 8 != 0) ||
-            input_layout == DataLayout::fs_b_yx_fsv32 ||
-            (input_layout == DataLayout::bs_fs_yx_bsv32_fsv16 && (feature_size % 16 != 0 || batch_size % 32 != 0)) ||
-            (input_layout == DataLayout::bs_fs_yx_bsv32_fsv32 && (feature_size % 32 != 0 || batch_size % 32 != 0)))
-            return false;
-    }
-    if ((params.outputs[0].GetLayout() == DataLayout::b_fs_yx_fsv16 && params.outputs[0].Feature().v % 16 != 0) ||
-        (params.outputs[0].GetLayout() == DataLayout::b_fs_yx_fsv32 && params.outputs[0].Feature().v % 32 != 0) ||
-        (params.outputs[0].GetLayout() == DataLayout::b_fs_zyx_fsv16 && params.outputs[0].Feature().v % 16 != 0) ||
-        (params.outputs[0].GetLayout() == DataLayout::b_fs_yx_fsv4 && params.outputs[0].Feature().v % 8 != 0) ||
-        params.outputs[0].GetLayout() == DataLayout::fs_b_yx_fsv32 ||
-        (params.outputs[0].GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv16 &&
-         (params.outputs[0].Feature().v % 16 != 0 || params.outputs[0].Batch().v % 32 != 0)) ||
-        (params.outputs[0].GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv32 &&
-         (params.outputs[0].Feature().v % 32 != 0 || params.outputs[0].Batch().v % 32 != 0)))
+    // Blocked (non-simple) layouts are not supported: with feature padding the
+    // flat vload8 access pattern would also process the padding elements.
+    if (!params.outputs[0].SimpleLayout() || params.outputs[0].GetLayout() != params.inputs[0].GetLayout() ||
+        params.outputs[0].PhysicalSize() % 8 != 0)
         return false;
-        // TBD maybe need more stric check?
     return true;
 }
 
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/quantize_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/quantize_gpu_test.cpp
index bb44a1022f368d..437af0b503b7a7 100644
--- a/src/plugins/intel_gpu/tests/unit/test_cases/quantize_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/quantize_gpu_test.cpp
@@ -749,6 +749,87 @@ TEST(quantize_gpu, dynamic) {
     }
 }
 
+TEST(quantize_gpu, opt_vec_kernel) {
+    auto& engine = get_test_engine();
+
+    auto input = engine.allocate_memory({ { 1, 16, 2, 2 }, data_types::f32, format::bfyx });
+    auto input_low = engine.allocate_memory({ { 1, 1, 1, 1 }, data_types::f32, format::bfyx });
+    auto input_high = engine.allocate_memory({ { 1, 1, 1, 1 }, data_types::f32, format::bfyx });
+    auto output_low = engine.allocate_memory({ { 1, 1, 1, 1 }, data_types::f32, format::bfyx });
+    auto output_high = engine.allocate_memory({ { 1, 1, 1, 1 }, data_types::f32, format::bfyx });
+
+    layout in_dyn_layout { ov::PartialShape::dynamic(4), data_types::f32, format::bfyx };
+
+    set_values(input, { -1.0f, 2.1f, 3.0f, 4.0f,
+                        5.0f, 2.0f, 2.0f, 3.0f,
+                        4.0f, 6.0f, 3.0f, 3.0f,
+                        3.0f, 5.0f, 1.0f, 1.0f,
+
+                        1.0f, 1.0f, 1.0f, 1.0f,
+                        4.0f, 6.0f, 3.0f, 3.0f,
+                        3.0f, 5.0f, 1.0f, 1.0f,
+                        1.0f, 1.0f, 1.0f, 1.0f,
+
+                        1.0f, 2.0f, 3.0f, 4.0f,
+                        5.0f, 2.0f, 2.0f, 3.0f,
+                        4.0f, 6.0f, 3.0f, 3.0f,
+                        3.0f, 5.0f, 1.0f, 1.0f,
+
+                        1.0f, 1.0f, 1.0f, 1.0f,
+                        4.0f, 6.0f, 3.0f, 3.0f,
+                        3.0f, 5.0f, 1.0f, 1.0f,
+                        1.0f, 1.0f, 1.0f, 1.0f });
+
+    set_values(input_low, { 0.0f });
+    set_values(input_high, { 10.0f });
+
+    set_values(output_low, { 0.0f });
+    set_values(output_high, { 255.0f });
+
+    // Quantization maps [0, 10] to [0, 255], i.e. scale = 25.5: e.g. round(2.1 * 25.5) = 54.
+    // Exact ties such as 3.0 * 25.5 = 76.5 depend on the rounding mode, hence ASSERT_NEAR below.
+    std::vector<uint8_t> ref_data = {0, 54, 76, 102, 128, 51, 51, 76, 102, 153, 76, 76, 76, 128, 26, 26,
+                                     26, 26, 26, 26, 102, 153, 76, 76, 76, 128, 26, 26, 26, 26, 26, 26,
+                                     26, 51, 76, 102, 128, 51, 51, 76, 102, 153, 76, 76, 76, 128, 26, 26,
+                                     26, 26, 26, 26, 102, 153, 76, 76, 76, 128, 26, 26, 26, 26, 26, 26};
+
+    topology topology;
+    topology.add(
+        input_layout("input", in_dyn_layout),
+        data("input_low", input_low),
+        data("input_high", input_high),
+        data("output_low", output_low),
+        data("output_high", output_high),
+        quantize("quantize", input_info("input"), input_info("input_low"), input_info("input_high"), input_info("output_low"), input_info("output_high"), 255, data_types::u8)
+    );
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    network network(engine, topology, config);
+    network.set_input_data("input", input);
+
+    auto inst = network.get_primitive("quantize");
+    auto impl = inst->get_impl();
+    ASSERT_TRUE(impl != nullptr);
+    ASSERT_TRUE(impl->is_dynamic());
+
+    auto outputs = network.execute();
+
+    auto output = outputs.at("quantize").get_memory();
+    cldnn::mem_lock<uint8_t> output_ptr(output, get_test_stream());
+
+    // Check that the layout and the memory contain the logical size of the tensor
+    ASSERT_EQ(output->count(), (size_t)64);
+    ASSERT_EQ(output->get_layout().count(), (size_t)64);
+
+    ASSERT_EQ(output->size(), ref_data.size() * sizeof(uint8_t));
+
+    for (size_t i = 0; i < ref_data.size(); ++i) {
+        ASSERT_NEAR(output_ptr[i], ref_data[i], 1) << " index = " << i;
+    }
+}
+
 TEST(quantize_gpu, dynamic_fsv16) {
     auto& engine = get_test_engine();
 
@@ -1050,7 +1131,6 @@ struct quantize_random_test : testing::TestWithParam