Skip to content

Commit

Permalink
fix: revert acl_winograd_convolution to stateful
Browse files Browse the repository at this point in the history
partially reverts 16d6dd4: "cpu: aarch64: Enable stateless ACL depthwise convolution"

reverts commit 03db3e4: "cpu: aarch64: Call stateless ACL API from winograd convolution"

reverts commit 513f882: "cpu: aarch64: hot fix for aux tensor management of stateless gemm-conv and winograd conv without lock."

Signed-off-by: Siddhartha Menon <[email protected]>
  • Loading branch information
Sqvid authored and theComputeKid committed Jan 9, 2025
1 parent 423ffd4 commit 73c2053
Show file tree
Hide file tree
Showing 7 changed files with 230 additions and 196 deletions.
6 changes: 1 addition & 5 deletions src/common/memory_tracking.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*******************************************************************************
* Copyright 2018-2024 Intel Corporation
* Copyright 2024 Arm Ltd. and affiliates
* Copyright 2024-2025 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -205,8 +205,6 @@ enum {
key_conv_ncsp_matmul_dst,
key_conv_ncsp_diff_sp_sum,
key_conv_padded_bias,
key_conv_permuted_inputs,
key_conv_permuted_outputs,
key_conv_permuted_weights,
key_conv_rtus_space,
key_conv_store_wsp,
Expand Down Expand Up @@ -317,11 +315,9 @@ enum {
key_softmax_interim_store,
key_sum_reduction,
key_sum_srcs_cvt,
key_wino_transformed_weights,
key_wino_U,
key_wino_V,
key_wino_M,
key_wino_workspace,
// These two keys should always be the last ones,
// even though they are not in alphabetical order
key_nested,
Expand Down
54 changes: 52 additions & 2 deletions src/cpu/aarch64/acl_convolution_utils.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2020-2024 Arm Ltd. and affiliates
* Copyright 2020-2025 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -14,7 +14,7 @@
* limitations under the License.
*******************************************************************************/

#include "acl_convolution_utils.hpp"
#include "cpu/aarch64/acl_convolution_utils.hpp"
#include "common/convolution_pd.hpp"
#include "common/utils.hpp"
#include "oneapi/dnnl/dnnl.h"
Expand Down Expand Up @@ -283,6 +283,56 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,

return status::success;
}

// Initialize the ACL convolution configuration (acp) for a Winograd-based
// forward convolution, filling memory descriptors/tags via acl_init_conf().
//
// Returns status::unimplemented when the shape is outside the Winograd
// sweet spot (so the implementation list can fall through to GEMM-based
// convolution), or when strides/padding/dilation are unsupported.
status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
        memory_desc_t &weights_md, memory_desc_t &dst_md,
        memory_desc_t &bias_md, const convolution_desc_t &cd,
        const primitive_attr_t &attr) {

    // Under these conditions, fallback to faster GEMM-based convolution
    // unless the user explicitly specifies Winograd algorithm
    // clang-format off
    if (one_of(true, src_md.dims[2] > 112, // ih
                src_md.dims[3] > 112, // iw
                src_md.dims[1] < 64, // ic
                dst_md.dims[1] < 64, // oc
                dnnl_get_max_threads() > 28)
            && cd.alg_kind == alg_kind::convolution_auto) {
        return status::unimplemented;
    }
    // clang-format on

    // General Compute Library checks, memory tags are also set there
    acp.alg_winograd = true;
    CHECK(acl_init_conf(acp, src_md, weights_md, dst_md, bias_md, cd, attr));

    const bool shape_ok
            // only unit strides allowed
            // Use `unsigned int` explicitly: the POSIX `uint` typedef is
            // non-standard and unavailable on some toolchains (e.g. MSVC).
            = (acp.padstride_info.stride()
                      == std::pair<unsigned int, unsigned int> {1, 1})
            // Note: Compute Library supports arbitrary padding for wino kernels
            // but we only allow small padding to be consistent with oneDNN
            && (acp.padstride_info.pad().first <= 1) // padding left/right
            && (acp.padstride_info.pad().second <= 1) // padding top/bottom
            // only non-dilated convolutions allowed
            && (acp.dilation_info == arm_compute::Size2D(1, 1));

    ACL_CHECK_SUPPORT(!shape_ok, "shape not supported by winograd kernels");

    // clang-format off
    // Validate convolution manually to check for return status
    ACL_CHECK_VALID(arm_compute::NEWinogradConvolutionLayer::validate(
            &acp.src_tensor_info,
            &acp.wei_tensor_info,
            acp.with_bias ? &acp.bia_tensor_info : nullptr,
            &acp.dst_tensor_info,
            acp.padstride_info,
            acp.act_info,
            true)); // enable_fast_math flag in ACL Winograd
    // clang-format on

    return status::success;
}

} // namespace acl_convolution_utils

} // namespace aarch64
Expand Down
58 changes: 57 additions & 1 deletion src/cpu/aarch64/acl_convolution_utils.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2020-2024 Arm Ltd. and affiliates
* Copyright 2020-2025 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -30,6 +30,10 @@ namespace aarch64 {

// Per-primitive state for the stateful ACL convolution path: the configured
// operator plus the tensor handles it was configured against.
template <typename ConvOp>
struct acl_obj_t {
// Non-owning ACL tensor handles. Backing memory is attached at execute
// time via allocator()->import_memory() and detached again with
// allocator()->free() after the run (see execute_forward_conv_acl).
arm_compute::Tensor src_tensor;
arm_compute::Tensor wei_tensor;
arm_compute::Tensor bia_tensor;
arm_compute::Tensor dst_tensor;
// The ACL convolution operator, configured once at primitive init.
ConvOp conv;
// Auxiliary memory the operator reports it needs — presumably used to
// book matching oneDNN scratchpad entries; confirm against the callers
// that populate it (e.g. conv.workspace()).
arm_compute::experimental::MemoryRequirements aux_mem_req;
};
Expand Down Expand Up @@ -64,6 +68,11 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
memory_desc_t &bias_md, const convolution_desc_t &cd,
const primitive_attr_t &attr);

status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
memory_desc_t &weights_md, memory_desc_t &dst_md,
memory_desc_t &bias_md, const convolution_desc_t &cd,
const primitive_attr_t &attr);

} // namespace acl_convolution_utils

// Keys are anonymous with local linkage. So deduce the type automagically.
Expand Down Expand Up @@ -175,6 +184,53 @@ status_t execute_forward_conv_acl(const exec_ctx_t &ctx,
return status::success;
}

// Forward execution for the stateful ACL convolution path: binds the oneDNN
// src/weights/bias/dst buffers to the primitive's pre-configured ACL tensors,
// runs the operator, then applies any unfused post-ops on the destination.
//
// NOTE(review): acl_conv_obj is shared mutable state (the tensors are
// re-imported on every call), so concurrent executions of the same primitive
// presumably need external serialization — confirm at the primitive level.
template <typename conv_obj_t, typename conv_pd_t, typename src_data_t,
typename wei_data_t = src_data_t, typename dst_data_t = src_data_t,
typename bia_data_t = src_data_t>
status_t execute_forward_conv_acl(
const exec_ctx_t &ctx, conv_obj_t &acl_conv_obj, const conv_pd_t *pd) {
bool with_bias = pd->acp_.with_bias;
bool use_dst_acc_for_sum = pd->acp_.use_dst_acc_for_sum;

auto src_base = CTX_IN_MEM(const src_data_t *, DNNL_ARG_SRC);
auto wei_base = CTX_IN_MEM(const wei_data_t *, DNNL_ARG_WEIGHTS);

// import_memory() and free() methods do not allocate/free any additional
// memory, only acquire/release pointers.
acl_conv_obj.src_tensor.allocator()->import_memory(
const_cast<src_data_t *>(src_base));
acl_conv_obj.wei_tensor.allocator()->import_memory(
const_cast<wei_data_t *>(wei_base));

const auto scratchpad = ctx.get_scratchpad_grantor();

// If we have an unfused sum post op, put the result in a scratchpad tensor.
// Result will be summed to the dst during acl_post_ops.execute
auto dst_base = use_dst_acc_for_sum
? scratchpad.get<void>(memory_tracking::names::key_generic_acc)
: CTX_OUT_MEM(dst_data_t *, DNNL_ARG_DST);
acl_conv_obj.dst_tensor.allocator()->import_memory(dst_base);

if (with_bias) {
auto bia_base = CTX_IN_MEM(const bia_data_t *, DNNL_ARG_BIAS);
acl_conv_obj.bia_tensor.allocator()->import_memory(
const_cast<bia_data_t *>(bia_base));
}

// Run the convolution with all tensors bound.
acl_conv_obj.conv.run();

// Release input bindings first; dst must stay bound until post-ops have
// consumed it below.
acl_conv_obj.src_tensor.allocator()->free();
acl_conv_obj.wei_tensor.allocator()->free();
if (with_bias) { acl_conv_obj.bia_tensor.allocator()->free(); }

// Post-ops operate on the convolution output in place; when
// use_dst_acc_for_sum is set this points at the scratchpad accumulator.
void *dst = acl_conv_obj.dst_tensor.buffer();
pd->post_ops.execute(ctx, dst);

acl_conv_obj.dst_tensor.allocator()->free();

return status::success;
}

} // namespace aarch64
} // namespace cpu
} // namespace impl
Expand Down
32 changes: 5 additions & 27 deletions src/cpu/aarch64/acl_gemm_convolution.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2020-2024 Arm Ltd. and affiliates
* Copyright 2020-2025 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -89,44 +89,22 @@ template <data_type_t src_t, data_type_t wei_t, data_type_t dst_t,
data_type_t bia_t>
status_t acl_gemm_convolution_fwd_t<src_t, wei_t, dst_t, bia_t>::init(
engine_t *engine) {
// commented due to hot fix solution for stateless API which should be replaced soon.
// auto acp_ = pd()->acp_;
// acl_obj_->conv.configure(&acp_.src_tensor_info, &acp_.wei_tensor_info,
// acp_.with_bias ? &acp_.bia_tensor_info : nullptr,
// &acp_.dst_tensor_info, acp_.padstride_info, acp_.weights_info,
// acp_.dilation_info, acp_.act_info, acp_.fast_math);
// acl_obj_->aux_mem_req = acl_obj_->conv.workspace();
return status::success;
}

template <data_type_t src_type, data_type_t wei_type, data_type_t dst_type,
data_type_t bia_type>
std::unique_ptr<acl_obj_t<typename acl_gemm_convolution_fwd_t<src_type,
wei_type, dst_type, bia_type>::Op>>
acl_gemm_convolution_fwd_t<src_type, wei_type, dst_type,
bia_type>::reinitialize_acl_obj() const {
auto acp_ = pd()->acp_;
std::unique_ptr<acl_obj_t<Op>> acl_obj = std::make_unique<acl_obj_t<Op>>();
acl_obj->conv.configure(&acp_.src_tensor_info, &acp_.wei_tensor_info,
acl_obj_->conv.configure(&acp_.src_tensor_info, &acp_.wei_tensor_info,
acp_.with_bias ? &acp_.bia_tensor_info : nullptr,
&acp_.dst_tensor_info, acp_.padstride_info, acp_.weights_info,
acp_.dilation_info, acp_.act_info, acp_.fast_math);
acl_obj->aux_mem_req = acl_obj->conv.workspace();
return acl_obj;
acl_obj_->aux_mem_req = acl_obj_->conv.workspace();
return status::success;
}

template <data_type_t src_t, data_type_t wei_t, data_type_t dst_t,
data_type_t bia_t>
status_t
acl_gemm_convolution_fwd_t<src_t, wei_t, dst_t, bia_t>::execute_forward(
const exec_ctx_t &ctx) const {
// Temporary hotfix: We're using a local acl_obj instance in this method
// instead of the class member acl_obj_. This hotfix is to bypass persistent aux mem requirements but is not the ideal solution.
// It should be refactored or removed in the future when a more permanent fix is implemented.
auto acl_obj = reinitialize_acl_obj();

return execute_forward_conv_acl<acl_obj_t<Op>, pd_t, src_data_t, wei_data_t,
dst_data_t, bia_data_t>(ctx, acl_obj.get(), pd(), gemm_conv_keys);
dst_data_t, bia_data_t>(ctx, acl_obj_.get(), pd(), gemm_conv_keys);
}

using namespace data_type;
Expand Down
17 changes: 4 additions & 13 deletions src/cpu/aarch64/acl_gemm_convolution.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2020-2024 Arm Ltd. and affiliates
* Copyright 2020-2025 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -47,10 +47,8 @@ struct acl_gemm_convolution_fwd_t : public primitive_t {
acl_post_ops_t post_ops;
};

// hot fix solution for stateless API which should be replaced soon.
// acl_gemm_convolution_fwd_t(const pd_t *apd)
// : primitive_t(apd), acl_obj_(std::make_unique<acl_obj_t<Op>>()) {}
acl_gemm_convolution_fwd_t(const pd_t *apd) : primitive_t(apd) {}
acl_gemm_convolution_fwd_t(const pd_t *apd)
: primitive_t(apd), acl_obj_(std::make_unique<acl_obj_t<Op>>()) {}

status_t init(engine_t *engine) override;

Expand All @@ -65,15 +63,8 @@ struct acl_gemm_convolution_fwd_t : public primitive_t {

private:
status_t execute_forward(const exec_ctx_t &ctx) const;

// hot fix solution for stateless API which should be replaced soon.
std::unique_ptr<acl_obj_t<Op>> reinitialize_acl_obj() const;

const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }

// commented due to hot fix solution for stateless API which should be replaced soon.
// std::unique_ptr<acl_obj_t<Op>> acl_obj_;

std::unique_ptr<acl_obj_t<Op>> acl_obj_;
}; // acl_gemm_convolution_fwd_t

} // namespace aarch64
Expand Down
Loading

0 comments on commit 73c2053

Please sign in to comment.