Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[do not review] for perf eval — opt quantize #28655

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "include/batch_headers/fetch_data.cl"

// Helper macros for the vload8-based quantize kernel.
// All vector math below is done on 8-wide vectors; these wrappers build the
// matching OpenCL convert_<type>8[_sat][_rte] conversion names via CAT.
#define TO_OUTPUT_TYPE CAT(convert_, OUTPUT_TYPE)
#define INPUT0_VEC_TYPE MAKE_VECTOR_TYPE(INPUT0_TYPE, 8)
#define INPUT1_VEC_TYPE MAKE_VECTOR_TYPE(INPUT1_TYPE, 8)
#define OUTPUT_VEC_TYPE MAKE_VECTOR_TYPE(OUTPUT_TYPE, 8)

// convert_<elem_type>8(x): plain (unsaturated) 8-wide conversion.
#define TO_VECTOR_TYPE_IMPL_8(elem_type) CAT(convert_##elem_type, 8)
#define TO_VECTOR_TYPE(elem_type, size) CAT(TO_VECTOR_TYPE_IMPL_, size)(elem_type)

// convert_<elem_type>8_sat / _sat_rte: saturating conversions, used for the
// final store (RTE rounding for integer outputs).
#define TO_VECTOR_TYPE_IMPL_SAT_8(elem_type) CAT(convert_##elem_type, 8##_sat)
#define TO_VECTOR_TYPE_IMPL_SAT_RTE_8(elem_type) CAT(convert_##elem_type, 8##_sat_rte)
#define TO_VECTOR_TYPE_SAT(elem_type, size) CAT(TO_VECTOR_TYPE_IMPL_SAT_, size)(elem_type)
#define TO_VECTOR_TYPE_SAT_RTE(elem_type, size) CAT(TO_VECTOR_TYPE_IMPL_SAT_RTE_, size)(elem_type)
// One 8-element load per work item, indexed by the flat global id.
#define VLOAD_DECLS vload8(global_id, input)
#ifdef SUB_GROUP_SIZE
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
#endif
#ifndef IS_DYNAMIC
__attribute__((reqd_work_group_size(LWS_0, LWS_1, LWS_2)))
#endif
// Optimized per-tensor quantize (scale/shift) kernel.
// Each work item loads 8 consecutive elements with vload8, applies the
// quantization arithmetic using JIT-time scalar constants (IN_*/OUT_* values),
// and writes 8 results with vstore8. The range/scale/shift buffer arguments
// are part of the common quantize signature but are unused here because the
// host side only selects this kernel for per-tensor parameters.
KERNEL(quantize_gpu_scale_shift_vload8_opt)(OPTIONAL_SHAPE_INFO_ARG
                                            const __global INPUT0_TYPE* input,
                                            const __global INPUT1_TYPE* input_low,
                                            const __global INPUT2_TYPE* input_high,
                                            const __global INPUT3_TYPE* output_low,
                                            const __global INPUT4_TYPE* output_high,
                                            const __global INPUT5_TYPE* input_scale,
                                            const __global INPUT6_TYPE* input_shift,
                                            const __global INPUT7_TYPE* output_scale,
                                            const __global INPUT8_TYPE* output_shift,
                                            __global OUTPUT_TYPE* output)
{
    // Flat index of this work item's 8-element chunk.
    const int global_id = get_global_id(0);

    const INPUT0_VEC_TYPE in0 = VLOAD_DECLS;

    OUTPUT_VEC_TYPE res;

#if HAS_CLAMP
#if CAN_USE_OUTPUT_RANGE
    INPUT1_TYPE output_low_val = OUT_LO_VAL;
    INPUT1_TYPE output_high_val = OUT_HI_VAL;
#else
    INPUT1_TYPE input_low_val = IN_LO_VAL;
    INPUT1_TYPE input_high_val = IN_HI_VAL;
#endif // CAN_USE_OUTPUT_RANGE
#endif // HAS_CLAMP

// ************************************************************* //
// Calculations for optimized branch with the output range usage //
// ************************************************************* //

#if CAN_USE_OUTPUT_RANGE

#if HAS_PRE_SHIFT
    INPUT1_VEC_TYPE val = TO_VECTOR_TYPE(INPUT1_TYPE, 8)(in0) * IN_SCALE_VAL + IN_SHIFT_VAL;
#else
    INPUT1_VEC_TYPE val = TO_VECTOR_TYPE(INPUT1_TYPE, 8)(in0) * IN_SCALE_VAL;
#endif

#if HAS_OUTPUT_RANGE_ROUND
    // Explicit round is only needed for float outputs; integer outputs get
    // RTE rounding from the saturating convert at the end.
    val = round(val);
#endif

#if HAS_POST_SCALE
    val *= OUT_SCALE_VAL;
#endif

#if HAS_POST_SHIFT
    val += OUT_SHIFT_VAL;
#endif

#if HAS_CLAMP
#if HAS_MIN_CLAMP && HAS_MAX_CLAMP
    val = clamp(val, output_low_val, output_high_val);
#elif HAS_MIN_CLAMP
    val = max(val, output_low_val);
#else // HAS_MAX_CLAMP
    val = min(val, output_high_val);
#endif
#endif // HAS_CLAMP

// ************************************************************** //
// Calculations for alternative branch with the input range usage //
// ************************************************************** //

#else // CAN_USE_OUTPUT_RANGE

#if HAS_CLAMP
    INPUT1_VEC_TYPE val = clamp(TO_VECTOR_TYPE(INPUT1_TYPE, 8)(in0), input_low_val, input_high_val);
#else
    INPUT1_VEC_TYPE val = TO_VECTOR_TYPE(INPUT1_TYPE, 8)(in0);
#endif

#if HAS_PRE_SHIFT
    val = round(val * IN_SCALE_VAL + IN_SHIFT_VAL);
#else
    val = round(val * IN_SCALE_VAL);
#endif

#if HAS_POST_SCALE
    val *= OUT_SCALE_VAL;
#endif

#if HAS_POST_SHIFT
    val += OUT_SHIFT_VAL;
#endif

#endif // CAN_USE_OUTPUT_RANGE

// *********************************** //
// Common section with results writing //
// *********************************** //

#if OUTPUT_IS_FP
    res = TO_VECTOR_TYPE_SAT(OUTPUT_TYPE, 8)(val);
#else
    // Integer outputs: saturate and round-to-nearest-even in one convert.
    res = TO_VECTOR_TYPE_SAT_RTE(OUTPUT_TYPE, 8)(val);
#endif

    vstore8(res, global_id, output);
}

// Undefine every helper defined at the top of this file so the macros do not
// leak into other kernels compiled in the same program. (The previous version
// undefined TO_OUTPUT_TYPE_SAT_RTE, which was never defined, and leaked the
// rest.)
#undef TO_OUTPUT_TYPE
#undef INPUT0_VEC_TYPE
#undef INPUT1_VEC_TYPE
#undef OUTPUT_VEC_TYPE
#undef TO_VECTOR_TYPE_IMPL_8
#undef TO_VECTOR_TYPE
#undef TO_VECTOR_TYPE_IMPL_SAT_8
#undef TO_VECTOR_TYPE_IMPL_SAT_RTE_8
#undef TO_VECTOR_TYPE_SAT
#undef TO_VECTOR_TYPE_SAT_RTE
#undef VLOAD_DECLS
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "quantize_kernel_scale_shift_vload8_opt.h"

#include <iostream>
#include <string>

#include "kernel_selector_utils.h"

// Tuning constants for this kernel variant.
// NOTE(review): neither constant is referenced anywhere in this file —
// confirm they are needed (e.g. for a future blocked-format path) or remove
// them to avoid unused-variable warnings.
static const size_t sub_group_size = 32;
static const size_t feature_size = 32;

namespace kernel_selector {
// Declares the capabilities of this kernel for the kernel selector:
// 8-bit integer and fp16/fp32 inputs and outputs, any layout, batching,
// mixed in/out types, the scale-shift-optimized quantize mode, and
// dynamic shapes.
ParamsKey QuantizeKernelScaleShift_vload8::GetSupportedKey() const {
    ParamsKey k;
    k.EnableInputDataType(Datatype::UINT8);
    k.EnableInputDataType(Datatype::INT8);
    k.EnableInputDataType(Datatype::F16);
    k.EnableInputDataType(Datatype::F32);
    k.EnableOutputDataType(Datatype::F16);
    k.EnableOutputDataType(Datatype::F32);
    k.EnableOutputDataType(Datatype::UINT8);
    k.EnableOutputDataType(Datatype::INT8);
    k.EnableAllInputLayout();
    k.EnableAllOutputLayout();
    k.EnableTensorOffset();
    k.EnableTensorPitches();
    k.EnableBatching();
    k.EnableDifferentTypes();
    k.EnableQuantizeScaleShiftOpt();
    k.EnableDynamicShapesSupport();
    return k;
}

// Computes the dispatch configuration: a 1D global work size where each work
// item processes 8 elements (vload8/vstore8), with the local size chosen by
// the common heuristic. Validate() guarantees the element count is a multiple
// of 8; the max() guard only protects the degenerate empty-tensor case.
CommonDispatchData QuantizeKernelScaleShift_vload8::SetDefault(const quantize_params& params) const {
    CommonDispatchData dispatchData;
    // Previous revision wrapped this in a dead `if (true)` placeholder for a
    // possible blocked-format special case; blocked formats are rejected in
    // Validate(), so a single path suffices.
    dispatchData.gws[0] = std::max(params.outputs[0].LogicalSize() / 8, static_cast<size_t>(1));
    dispatchData.gws[1] = 1;
    dispatchData.gws[2] = 1;
    dispatchData.lws = GetOptimalLocalWorkGroupSizes({dispatchData.gws[0], dispatchData.gws[1], dispatchData.gws[2]},
                                                     params.engineInfo);
    return dispatchData;
}

// Builds the JIT constant set for the kernel: the base quantize constants plus
// the per-tensor scalar values (ranges, scales, shifts) and the flags that
// select between the output-range and input-range code paths in the .cl file.
JitConstants QuantizeKernelScaleShift_vload8::GetJitConstants(const quantize_params& params,
                                                              const CommonDispatchData& dispatchData) const {
    JitConstants jit = Parent::GetJitConstants(params, dispatchData);

    // The output-range fast path needs a valid (lo < hi) per-tensor range.
    const bool can_use_output_range = params.per_tensor_output_range && params.out_lo < params.out_hi;
    // Integer outputs round via the saturating RTE convert on store, so the
    // extra round() in the kernel is only required for fp outputs.
    const auto out_dtype = params.outputs[0].GetDType();
    const bool has_output_range_round = !(out_dtype == Datatype::INT8 || out_dtype == Datatype::UINT8);

    jit.AddConstant(MakeJitConstant("HAS_POST_SCALE", params.has_post_scale));
    jit.AddConstant(MakeJitConstant("HAS_POST_SHIFT", params.has_post_shift));
    jit.AddConstant(MakeJitConstant("HAS_PRE_SHIFT", params.has_pre_shift));
    jit.AddConstant(MakeJitConstant("HAS_CLAMP", params.has_clamp));
    jit.AddConstant(MakeJitConstant("HAS_MIN_CLAMP", params.has_min_clamp));
    jit.AddConstant(MakeJitConstant("HAS_MAX_CLAMP", params.has_max_clamp));
    jit.AddConstant(MakeJitConstant("IN_LO_VAL", params.in_lo));
    jit.AddConstant(MakeJitConstant("IN_HI_VAL", params.in_hi));
    jit.AddConstant(MakeJitConstant("OUT_LO_VAL", params.out_lo));
    jit.AddConstant(MakeJitConstant("OUT_HI_VAL", params.out_hi));
    jit.AddConstant(MakeJitConstant("IN_SCALE_VAL", params.in_scale));
    jit.AddConstant(MakeJitConstant("IN_SHIFT_VAL", params.in_shift));
    jit.AddConstant(MakeJitConstant("OUT_SCALE_VAL", params.out_scale));
    jit.AddConstant(MakeJitConstant("OUT_SHIFT_VAL", params.out_shift));
    jit.AddConstant(MakeJitConstant("CAN_USE_OUTPUT_RANGE", can_use_output_range));
    jit.AddConstant(MakeJitConstant("HAS_OUTPUT_RANGE_ROUND", has_output_range_round));

    return jit;
}

// Accepts only the cases this kernel is optimized for:
//  - the full 9-input quantize signature,
//  - all quantization parameters per-tensor (scalar), so they can be baked
//    into the kernel as JIT constants,
//  - matching simple (non-blocked) input/output layouts whose total element
//    count is divisible by 8, since the kernel reads/writes 8 elements per
//    work item with no tail handling.
bool QuantizeKernelScaleShift_vload8::Validate(const Params& p) const {
    const quantize_params& params = static_cast<const quantize_params&>(p);
    if (params.inputs.size() != 9)
        return false;

    // This kernel is optimized for per-tensor quantization params for now.
    if (!params.per_tensor_input_range || !params.per_tensor_output_range || !params.per_tensor_input_scale ||
        !params.per_tensor_output_scale || !params.per_tensor_output_shift ||
        (params.has_pre_shift && !params.per_tensor_input_shift))
        return false;

    // Blocked formats are rejected: their padding would make straight 8-wide
    // vector loads read garbage between feature blocks.
    if (!params.outputs[0].SimpleLayout() || params.outputs[0].GetLayout() != params.inputs[0].GetLayout() ||
        params.outputs[0].PhysicalSize() % 8 != 0)
        return false;

    return true;
}

// Assembles the KernelData (dispatch sizes, JIT constants, kernel string and
// argument descriptors) for a validated set of quantize parameters, or returns
// an empty list when the parameters are unsupported.
KernelsData QuantizeKernelScaleShift_vload8::GetKernelsData(const Params& params) const {
    assert(params.GetType() == KernelType::QUANTIZE);

    // Validate before constructing KernelData so unsupported cases bail out
    // without the cost of cloning the params (the previous revision validated
    // after KernelData::Default had already copied them).
    if (!Validate(params)) {
        return {};
    }

    KernelData kd = KernelData::Default<quantize_params>(params);
    quantize_params& nparams = *static_cast<quantize_params*>(kd.params.get());

    auto dispatchData = SetDefault(nparams);
    auto entry_point = GetEntryPoint(kernelName, nparams.layerID, params);
    auto cldnn_jit = GetJitConstants(nparams, dispatchData);
    auto jit = CreateJit(kernelName, cldnn_jit, entry_point);

    GetUpdateDispatchDataFunc(kd);

    auto& kernel = kd.kernels[0];

    kernel.params.workGroups.global = dispatchData.gws;
    kernel.params.workGroups.local = dispatchData.lws;
    kernel.code.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, EXE_MODE_DEFAULT);
    kernel.params.arguments =
        GetArgsDesc(static_cast<int>(nparams.inputs.size()), false, false, 0, 1, nparams.has_dynamic_tensors());

    return {kd};
}

// Fixed selector priority: preferred over the reference kernel but still
// below more specialized implementations (lower FORCE_PRIORITY_* wins).
KernelsPriority QuantizeKernelScaleShift_vload8::GetKernelsPriority(const Params& /*params*/) const {
    return FORCE_PRIORITY_8;
}
} // namespace kernel_selector
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "quantize_kernel_base.h"

namespace kernel_selector {

// Quantize kernel variant that processes 8 elements per work item with
// vload8/vstore8. Only selected for per-tensor quantization parameters and
// simple layouts whose element count is divisible by 8 (see Validate()).
class QuantizeKernelScaleShift_vload8 : public QuantizeKernelBase {
public:
    using Parent = QuantizeKernelBase;

    QuantizeKernelScaleShift_vload8() : QuantizeKernelBase("quantize_gpu_scale_shift_vload8_opt") {}
    virtual ~QuantizeKernelScaleShift_vload8() = default;

    // Dispatch sizes: 1D gws of LogicalSize()/8 work items.
    CommonDispatchData SetDefault(const quantize_params& params) const override;
    KernelsPriority GetKernelsPriority(const Params& params) const override;
    ParamsKey GetSupportedKey() const override;
    KernelsData GetKernelsData(const Params& params) const override;

protected:
    // Rejects non-per-tensor params, blocked layouts, and sizes not divisible by 8.
    bool Validate(const Params& p) const override;
    JitConstants GetJitConstants(const quantize_params& params, const CommonDispatchData& dispatchData) const override;
};
} // namespace kernel_selector
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,14 @@
#include "quantize_kernel_selector.h"
#include "quantize_kernel_ref.h"
#include "quantize_kernel_scale_shift_opt.h"
#include "quantize_kernel_scale_shift_vload8_opt.h"

namespace kernel_selector {

// Registers all quantize kernel implementations with the selector; the best
// candidate is later chosen by priority among those whose Validate() passes.
quantize_kernel_selector::quantize_kernel_selector() {
    Attach<QuantizeKernelRef>();
    Attach<QuantizeKernelScaleShift>();
    Attach<QuantizeKernelScaleShift_vload8>();
}

KernelsData quantize_kernel_selector::GetBestKernels(const Params& params) const {
Expand Down
Loading
Loading