diff --git a/.github/workflows/check-format.yml b/.github/workflows/check-format.yml
index 48cf30aa635..52867d7b414 100644
--- a/.github/workflows/check-format.yml
+++ b/.github/workflows/check-format.yml
@@ -17,7 +17,7 @@ defaults:
 jobs:
   check-format:
     name: Check format
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-24.04
     if: github.repository_owner == 'Samsung'
 
     steps:
@@ -29,15 +29,10 @@ jobs:
       with:
         python-version: '3.x'
 
-    # C format: clang-format-16
+    # C format: clang-format-16 (already installed)
     # Python format: yapf==0.40.2
     - name: Install packages
       run: |
-        sudo apt-get install -y gnupg2 software-properties-common
-        wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
-        sudo add-apt-repository "deb http://apt.llvm.org/focal/ llvm-toolchain-focal-16 main"
-        sudo apt-get update && sudo apt-get install -qqy clang-format-16
-        python -m pip install --upgrade pip
         pip install yapf==0.40.2
 
     - name: Check
diff --git a/Makefile.template b/Makefile.template
index 6e0c29590c7..4a93c1acf3d 100644
--- a/Makefile.template
+++ b/Makefile.template
@@ -202,7 +202,7 @@ runtime_tar_internal:
 	tar -zcf $(WORKSPACE)/onert-test-package.tar.gz -C $(INSTALL_PATH) $(shell ls $(INSTALL_PATH) -I lib -I include)
 
 acl_tar_internal:
-	tar -zcf $(WORKSPACE)/onert-acl.tar.gz -C ${OVERLAY_FOLDER} lib/libarm_compute.so lib/libarm_compute_core.so lib/libarm_compute_graph.so
+	tar -zcf $(WORKSPACE)/onert-acl.tar.gz -C ${OVERLAY_FOLDER} lib/libarm_compute.so lib/libarm_compute_graph.so
 
 install_acl_internal:
 # Workaround to install acl for test (ignore error when there is no file to copy)
diff --git a/compiler/circledump/src/OpPrinter.cpp b/compiler/circledump/src/OpPrinter.cpp
index 61a0941ea67..e14db02d0eb 100644
--- a/compiler/circledump/src/OpPrinter.cpp
+++ b/compiler/circledump/src/OpPrinter.cpp
@@ -825,6 +825,20 @@ class InstanceNormPrinter : public OpPrinter
   }
 };
 
+class RmsNormPrinter : public OpPrinter
+{
+public:
+  void options(const circle::Operator *op, std::ostream &os) const override
+  {
+    if (auto *params = op->builtin_options_as_RmsNormOptions())
+    {
+      os << "    ";
+      os << "epsilon(" << params->epsilon() << ") ";
+      os << std::endl;
+    }
+  }
+};
+
 OpPrinterRegistry::OpPrinterRegistry()
 {
   _op_map[circle::BuiltinOperator_ADD] = make_unique<AddPrinter>();
@@ -912,6 +926,7 @@ OpPrinterRegistry::OpPrinterRegistry()
   _op_map[circle::BuiltinOperator_BCQ_GATHER] = make_unique<BCQGatherPrinter>();
   _op_map[circle::BuiltinOperator_GRU] = make_unique<GRUPrinter>();
   _op_map[circle::BuiltinOperator_INSTANCE_NORM] = make_unique<InstanceNormPrinter>();
+  _op_map[circle::BuiltinOperator_RMS_NORM] = make_unique<RmsNormPrinter>();
 }
 
 } // namespace circledump
diff --git a/compiler/common-artifacts/exclude.lst b/compiler/common-artifacts/exclude.lst
index 4358bc02cdd..fcb5caa48a4 100644
--- a/compiler/common-artifacts/exclude.lst
+++ b/compiler/common-artifacts/exclude.lst
@@ -9,6 +9,7 @@ optimize(Add_STR_000) # STRING is not supported
 optimize(Add_STR_001) # STRING is not supported
 
 ## CircleRecipes
+optimize(RmsNorm_000)
 
 #[[ tcgenerate : Exclude from test data generation(TestDataGenerator) ]]
 ## TensorFlowLiteRecipes
@@ -178,3 +179,4 @@ tcgenerate(CircleFullyConnected_U4_002)
 tcgenerate(GRU_000) # luci-interpreter does not support custom GRU
 tcgenerate(InstanceNorm_000)
 tcgenerate(InstanceNorm_001)
+tcgenerate(RmsNorm_000)
diff --git a/compiler/logo/src/Passes/RemoveDeadNodeWithQueryPass.cpp b/compiler/logo/src/Passes/RemoveDeadNodeWithQueryPass.cpp
index 5c745212a29..00a14e70928 100644
--- a/compiler/logo/src/Passes/RemoveDeadNodeWithQueryPass.cpp
+++ b/compiler/logo/src/Passes/RemoveDeadNodeWithQueryPass.cpp
@@ -43,15 +43,17 @@ bool RemoveDeadNodeWithQueryPass::run(loco::Graph *g)
   }
 
   // Find the nodes that should not be dead node in candidates
-  for (auto node : candidates)
+  for (auto it = candidates.begin(); it != candidates.end();)
   {
-    if (auto service = node->dialect()->service<DeadNodeQueryService>())
+    if (auto service = (*it)->dialect()->service<DeadNodeQueryService>())
     {
-      if (!service->isDeadNode(node))
+      if (!service->isDeadNode(*it))
       {
-        candidates.erase(node);
+        it = candidates.erase(it);
+        continue;
       }
     }
+    ++it;
   }
 
   for (auto node : candidates)
diff --git a/compiler/luci/export/src/CircleBuiltinTypesExtractor.h b/compiler/luci/export/src/CircleBuiltinTypesExtractor.h
index efc2a510649..1e1adfca5f5 100644
--- a/compiler/luci/export/src/CircleBuiltinTypesExtractor.h
+++ b/compiler/luci/export/src/CircleBuiltinTypesExtractor.h
@@ -548,6 +548,10 @@ class BuiltinOptionsExtractor final
                                               to_circle_actfunc(node->fusedActivationFunction()))
       .Union();
   }
+  flatbuffers::Offset<void> visit(luci::CircleRmsNorm *node)
+  {
+    return circle::CreateRmsNormOptions(_builder, node->epsilon()).Union();
+  }
 
 protected:
   flatbuffers::FlatBufferBuilder &_builder;
diff --git a/compiler/luci/export/src/CircleExporterUtils.h b/compiler/luci/export/src/CircleExporterUtils.h
index 6d0ebd6cb29..49822d5d775 100644
--- a/compiler/luci/export/src/CircleExporterUtils.h
+++ b/compiler/luci/export/src/CircleExporterUtils.h
@@ -66,7 +66,7 @@ CircleTensorIndex get_tensor_index(loco::Node *node);
 // check if Flatbuffer builder can no longer hold the given amount of the data
 inline bool check_size_limit(const flatbuffers::FlatBufferBuilder &fb, const uint64_t data_size)
 {
-  return data_size > FLATBUFFERS_SIZE_MAX - fb.GetSize();
+  return FLATBUFFERS_SIZE_MAX < data_size + fb.GetSize();
 }
 
 } // namespace luci
diff --git a/compiler/luci/export/src/CircleOps.lst b/compiler/luci/export/src/CircleOps.lst
index 8c693baca23..91b079ac91a 100644
--- a/compiler/luci/export/src/CircleOps.lst
+++ b/compiler/luci/export/src/CircleOps.lst
@@ -141,6 +141,7 @@ CIRCLE_NODE(CircleBCQFullyConnected, BuiltinOperator_BCQ_FULLY_CONNECTED, Builti
 CIRCLE_NODE(CircleBCQGather, BuiltinOperator_BCQ_GATHER, BuiltinOptions_BCQGatherOptions)
 CIRCLE_NODE(CircleGRU, BuiltinOperator_GRU, BuiltinOptions_GRUOptions)
 CIRCLE_NODE(CircleInstanceNorm, BuiltinOperator_INSTANCE_NORM, BuiltinOptions_InstanceNormOptions)
+CIRCLE_NODE(CircleRmsNorm, BuiltinOperator_RMS_NORM, BuiltinOptions_RmsNormOptions)
 // Virtual node(s)
 CIRCLE_VNODE(CircleBidirectionalSequenceLSTMOut)
 CIRCLE_VNODE(CircleConst)
diff --git a/compiler/luci/import/include/luci/Import/Nodes.h b/compiler/luci/import/include/luci/Import/Nodes.h
index f3f4871b469..6fcd5d975d0 100644
--- a/compiler/luci/import/include/luci/Import/Nodes.h
+++ b/compiler/luci/import/include/luci/Import/Nodes.h
@@ -107,6 +107,7 @@
 #include "Nodes/CircleResizeNearestNeighbor.h"
 #include "Nodes/CircleReverseSequence.h"
 #include "Nodes/CircleReverseV2.h"
+#include "Nodes/CircleRmsNorm.h"
 #include "Nodes/CircleRound.h"
 #include "Nodes/CircleRsqrt.h"
 #include "Nodes/CircleScatterNd.h"
diff --git a/compiler/luci/import/include/luci/Import/Nodes/CircleRmsNorm.h b/compiler/luci/import/include/luci/Import/Nodes/CircleRmsNorm.h
new file mode 100644
index 00000000000..a2ebcdf657b
--- /dev/null
+++ b/compiler/luci/import/include/luci/Import/Nodes/CircleRmsNorm.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_IMPORT_OP_CIRCLE_RMS_NORM_H__
+#define __LUCI_IMPORT_OP_CIRCLE_RMS_NORM_H__
+
+#include "luci/Import/GraphBuilder.h"
+
+namespace luci
+{
+
+class CircleRmsNormGraphBuilder : public GraphBuilder
+{
+public:
+  bool validate(const ValidateArgs &args) const final;
+
+private:
+  CircleNode *build_node(const circle::OperatorT &op, const std::vector<CircleNode *> &inputs,
+                         loco::Graph *graph) const final;
+};
+
+} // namespace luci
+
+#endif // __LUCI_IMPORT_OP_CIRCLE_RMS_NORM_H__
diff --git a/compiler/luci/import/src/GraphBuilderRegistry.cpp b/compiler/luci/import/src/GraphBuilderRegistry.cpp
index 29edf8348f3..1e2e8837029 100644
--- a/compiler/luci/import/src/GraphBuilderRegistry.cpp
+++ b/compiler/luci/import/src/GraphBuilderRegistry.cpp
@@ -116,6 +116,7 @@ GraphBuilderRegistry::GraphBuilderRegistry()
   CIRCLE_NODE(RESIZE_NEAREST_NEIGHBOR, CircleResizeNearestNeighborGraphBuilder); // 97
   CIRCLE_NODE(REVERSE_SEQUENCE, CircleReverseSequenceGraphBuilder); // 112
   CIRCLE_NODE(REVERSE_V2, CircleReverseV2GraphBuilder); // 105
+  CIRCLE_NODE(RMS_NORM, CircleRmsNormGraphBuilder); // 255
   CIRCLE_NODE(ROUND, CircleRoundGraphBuilder); // 116
   CIRCLE_NODE(RSQRT, CircleRsqrtGraphBuilder); // 76
   CIRCLE_NODE(SCATTER_ND, CircleScatterNdGraphBuilder); // 122
diff --git a/compiler/luci/import/src/Nodes/CircleRmsNorm.cpp b/compiler/luci/import/src/Nodes/CircleRmsNorm.cpp
new file mode 100644
index 00000000000..28fef764a65
--- /dev/null
+++ b/compiler/luci/import/src/Nodes/CircleRmsNorm.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Import/Nodes/CircleRmsNorm.h"
+
+#include <luci/IR/Nodes/CircleRmsNorm.h>
+
+#include <loco.h>
+
+namespace luci
+{
+
+bool CircleRmsNormGraphBuilder::validate(const ValidateArgs &args) const
+{
+  // TODO check dtypes
+  return GraphBuilder::validate(args, 3);
+}
+
+CircleNode *CircleRmsNormGraphBuilder::build_node(const circle::OperatorT &op,
+                                                  const std::vector<CircleNode *> &inputs,
+                                                  loco::Graph *graph) const
+{
+  auto *node = graph->nodes()->create<CircleRmsNorm>();
+  node->input(inputs.at(0));
+  node->gamma(inputs.at(1));
+  node->beta(inputs.at(2));
+
+  const auto *options = op.builtin_options.AsRmsNormOptions();
+  node->epsilon(options->epsilon);
+
+  return node;
+}
+
+} // namespace luci
diff --git a/compiler/luci/logex/src/CircleNodeSummaryBuilder.cpp b/compiler/luci/logex/src/CircleNodeSummaryBuilder.cpp
index 2ff37afe165..8f27737e969 100644
--- a/compiler/luci/logex/src/CircleNodeSummaryBuilder.cpp
+++ b/compiler/luci/logex/src/CircleNodeSummaryBuilder.cpp
@@ -223,6 +223,7 @@ CircleNodeSummaryBuilder::create_builder(const luci::CircleNode *node)
     CIRCLE_NODE(RESIZE_NEAREST_NEIGHBOR, CircleResizeNearestNeighborSummaryBuilder)
     CIRCLE_NODE(REVERSE_SEQUENCE, CircleReverseSequenceSummaryBuilder)
     CIRCLE_NODE(REVERSE_V2, CircleReverseV2SummaryBuilder)
+    CIRCLE_NODE(RMS_NORM, CircleRmsNormSummaryBuilder)
     CIRCLE_NODE(ROUND, CircleRoundSummaryBuilder)
     CIRCLE_NODE(RSQRT, CircleRsqrtSummaryBuilder)
     CIRCLE_NODE(SCATTER_ND, CircleScatterNdSummaryBuilder)
diff --git a/compiler/luci/logex/src/CircleNodeSummaryBuilders.cpp b/compiler/luci/logex/src/CircleNodeSummaryBuilders.cpp
index f0a92ef91d1..1d605d3946c 100644
--- a/compiler/luci/logex/src/CircleNodeSummaryBuilders.cpp
+++ b/compiler/luci/logex/src/CircleNodeSummaryBuilders.cpp
@@ -890,6 +890,18 @@ std::vector<std::string> CircleReverseV2SummaryBuilder::get_input_names(const lu
   return {"tensor", "axis"};
 }
 
+std::vector<std::string> CircleRmsNormSummaryBuilder::get_input_names(const luci::CircleNode *)
+{
+  return {"input", "gamma", "beta"};
+}
+
+void CircleRmsNormSummaryBuilder::build_attributes(const luci::CircleNode *node,
+                                                   locop::NodeSummary &s)
+{
+  auto rmsnorm = loco::must_cast<const luci::CircleRmsNorm *>(node);
+  s.args().append("epsilon", std::to_string(rmsnorm->epsilon()));
+}
+
 std::vector<std::string> CircleScatterNdSummaryBuilder::get_input_names(const luci::CircleNode *)
 {
   return {"indices", "updates", "shape"};
diff --git a/compiler/luci/logex/src/CircleNodeSummaryBuilders.h b/compiler/luci/logex/src/CircleNodeSummaryBuilders.h
index f489e9b6eb6..9ca64d49064 100644
--- a/compiler/luci/logex/src/CircleNodeSummaryBuilders.h
+++ b/compiler/luci/logex/src/CircleNodeSummaryBuilders.h
@@ -583,6 +583,13 @@ class CircleReverseV2SummaryBuilder final : public CircleNodeSummaryBuilder
   std::vector<std::string> get_input_names(const luci::CircleNode *);
 };
 
+class CircleRmsNormSummaryBuilder final : public CircleNodeSummaryBuilder
+{
+private:
+  std::vector<std::string> get_input_names(const luci::CircleNode *);
+  void build_attributes(const luci::CircleNode *node, locop::NodeSummary &s);
+};
+
 class CircleRoundSummaryBuilder final : public CircleNodeWithXSummaryBuilder
 {
 };
diff --git a/compiler/luci/partition/include/luci/ConnectNode.h b/compiler/luci/partition/include/luci/ConnectNode.h
index 7539aaf6bee..592dd3b4d29 100644
--- a/compiler/luci/partition/include/luci/ConnectNode.h
+++ b/compiler/luci/partition/include/luci/ConnectNode.h
@@ -187,6 +187,7 @@ class ConnectNode final : public luci::CircleNodeVisitor<void>
   void visit(const luci::CircleBCQGather *) final;
   void visit(const luci::CircleGRU *) final;
   void visit(const luci::CircleInstanceNorm *) final;
+  void visit(const luci::CircleRmsNorm *) final;
 
   // NOTE CircleInput and CircleOutput are not handled here as these need
   // link with graph I/O
diff --git a/compiler/luci/partition/src/Nodes/CircleRmsNorm.cpp b/compiler/luci/partition/src/Nodes/CircleRmsNorm.cpp
new file mode 100644
index 00000000000..fa7f58af357
--- /dev/null
+++ b/compiler/luci/partition/src/Nodes/CircleRmsNorm.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/ConnectNode.h"
+
+namespace
+{
+
+void connect(luci::ConnectNode *cn, const luci::CircleRmsNorm *node)
+{
+  auto *cloned = loco::must_cast<luci::CircleRmsNorm *>(cn->find_clone(node));
+
+  luci::CircleNode *input = loco::must_cast<luci::CircleNode *>(node->input());
+  luci::CircleNode *gamma = loco::must_cast<luci::CircleNode *>(node->gamma());
+  luci::CircleNode *beta = loco::must_cast<luci::CircleNode *>(node->beta());
+
+  cloned->input(cn->find_clone(input));
+  cloned->gamma(cn->find_clone(gamma));
+  cloned->beta(cn->find_clone(beta));
+}
+
+} // namespace
+
+namespace luci
+{
+
+void ConnectNode::visit(const luci::CircleRmsNorm *node) { connect(this, node); }
+
+} // namespace luci
diff --git a/compiler/luci/partition/src/Nodes/CircleRmsNorm.test.cpp b/compiler/luci/partition/src/Nodes/CircleRmsNorm.test.cpp
new file mode 100644
index 00000000000..625e66c2a14
--- /dev/null
+++ b/compiler/luci/partition/src/Nodes/CircleRmsNorm.test.cpp
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/ConnectNode.h"
+
+#include "ConnectNode.test.h"
+
+#include <luci/Service/CircleNodeClone.h>
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+using namespace luci::test;
+
+class NodeGraphlet : public NodeGraphletT<luci::CircleRmsNorm>
+{
+public:
+  NodeGraphlet() = default;
+
+public:
+  void init(loco::Graph *g) override { NodeGraphletT<luci::CircleRmsNorm>::init(g); }
+};
+
+class TestNodeGraph : public TestIsOGraph<3>, public NodeGraphlet
+{
+public:
+  TestNodeGraph() = default;
+
+public:
+  void init(const ShapeU32 shape)
+  {
+    TestIsOGraph<3>::init({shape, shape, shape}, shape);
+    NodeGraphlet::init(g());
+
+    node()->input(input(0));
+    node()->gamma(input(1));
+    node()->beta(input(2));
+
+    output()->from(node());
+  }
+};
+
+} // namespace
+
+TEST(ConnectNodeTest, connect_RmsNorm)
+{
+  TestNodeGraph tng;
+  tng.init({2, 3});
+
+  ConnectionTestHelper cth;
+  cth.prepare_inputs(&tng);
+
+  auto *node = tng.node();
+  ASSERT_NO_THROW(loco::must_cast<luci::CircleRmsNorm *>(node));
+
+  auto *clone = luci::clone_node(node, cth.graph_clone());
+  ASSERT_NO_THROW(loco::must_cast<luci::CircleRmsNorm *>(clone));
+
+  cth.clone_connect(node, clone);
+
+  ASSERT_EQ(3, clone->arity());
+  ASSERT_EQ(cth.inputs(0), clone->arg(0));
+  ASSERT_EQ(cth.inputs(1), clone->arg(1));
+  ASSERT_EQ(cth.inputs(2), clone->arg(2));
+}
+
+TEST(ConnectNodeTest, connect_RmsNorm_NEG)
+{
+  TestNodeGraph tng;
+  tng.init({2, 3});
+
+  ConnectionTestHelper cth;
+  cth.prepare_inputs_miss(&tng);
+
+  auto *node = tng.node();
+  ASSERT_NO_THROW(loco::must_cast<luci::CircleRmsNorm *>(node));
+
+  auto *clone = luci::clone_node(node, cth.graph_clone());
+  ASSERT_NO_THROW(loco::must_cast<luci::CircleRmsNorm *>(clone));
+
+  EXPECT_ANY_THROW(cth.clone_connect(node, clone));
+}
diff --git a/compiler/luci/pass/include/luci/CircleOptimizer.h b/compiler/luci/pass/include/luci/CircleOptimizer.h
index d4f675f36fe..a9bf652e323 100644
--- a/compiler/luci/pass/include/luci/CircleOptimizer.h
+++ b/compiler/luci/pass/include/luci/CircleOptimizer.h
@@ -78,6 +78,7 @@ class CircleOptimizer final
     FusePRelu,
     FuseGelu,
     FuseRsqrt,
+    FuseRmsNorm,
     ShuffleWeightTo16x1Float32,
     RemoveRedundantTranspose,
     ReplaceMulAddWithDepthwiseConv,
diff --git a/compiler/luci/pass/include/luci/Pass/FuseRmsNormPass.h b/compiler/luci/pass/include/luci/Pass/FuseRmsNormPass.h
new file mode 100644
index 00000000000..54acc1a26ef
--- /dev/null
+++ b/compiler/luci/pass/include/luci/Pass/FuseRmsNormPass.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_FUSE_RMSNORM_PASS_H__
+#define __LUCI_FUSE_RMSNORM_PASS_H__
+
+#include <logo/Pass.h>
+
+namespace luci
+{
+
+/**
+ * @brief  Class to fuse a certain subgraph pattern into CircleRmsNorm
+ */
+struct FuseRmsNormPass final : public logo::Pass
+{
+  const char *name(void) const final { return "luci::FuseRmsNormPass"; }
+
+  bool run(loco::Graph *g) final;
+};
+
+} // namespace luci
+
+#endif // __LUCI_FUSE_RMSNORM_PASS_H__
diff --git a/compiler/luci/pass/src/CircleOptimizer.cpp b/compiler/luci/pass/src/CircleOptimizer.cpp
index bf18b973d6d..154b1f75963 100644
--- a/compiler/luci/pass/src/CircleOptimizer.cpp
+++ b/compiler/luci/pass/src/CircleOptimizer.cpp
@@ -56,6 +56,7 @@
 #include "luci/Pass/FuseSliceWithTConvPass.h"
 #include "luci/Pass/FuseHorizontalFullyConnectedPass.h"
 #include "luci/Pass/FuseTransposeWithMeanPass.h"
+#include "luci/Pass/FuseRmsNormPass.h"
 #include "luci/Pass/MakeBatchNormGammaPositivePass.h"
 #include "luci/Pass/RemoveDuplicateConstPass.h"
 #include "luci/Pass/RemoveFakeQuantPass.h"
@@ -344,6 +345,7 @@ void CircleOptimizer::optimize(loco::Graph *g) const
   option_to_pass[Options::Algorithm::FuseRsqrt] = &createPassInstance<luci::FuseRsqrtPass>;
   option_to_pass[Options::Algorithm::FuseHorizontalFullyConnected] = &createPassInstance<luci::FuseHorizontalFullyConnectedPass>;
   option_to_pass[Options::Algorithm::FuseTransposeWithMean] = &createPassInstance<luci::FuseTransposeWithMeanPass>;
+  option_to_pass[Options::Algorithm::FuseRmsNorm] = &createPassInstance<luci::FuseRmsNormPass>;
   option_to_pass[Options::Algorithm::FoldAddV2] = &createPassInstance<luci::FoldAddV2Pass>;
   option_to_pass[Options::Algorithm::FoldCast] = &createPassInstance<luci::FoldCastPass>;
   option_to_pass[Options::Algorithm::FoldDensify] = &createPassInstance<luci::FoldDensifyPass>;
diff --git a/compiler/luci/pass/src/ConvertNCHWToNHWCPass.cpp b/compiler/luci/pass/src/ConvertNCHWToNHWCPass.cpp
index 74abb7e343d..2f6e2552437 100644
--- a/compiler/luci/pass/src/ConvertNCHWToNHWCPass.cpp
+++ b/compiler/luci/pass/src/ConvertNCHWToNHWCPass.cpp
@@ -431,38 +431,6 @@ luci::CircleConst *create_NHWC_from_NCHW(luci::CircleConst *constant)
   return nhwc_const;
 }
 
-// NOTE Following conditions can be extended later
-//
-// Find PAD with an NCHW pattern described below
-//   - Paddings shape : [4, 2]
-//   - Paddings value : [[0, 0], [0, 0], [h_t, h_b], [w_t, w_b]]]
-bool is_NCHW(const luci::CirclePad *node)
-{
-  const auto paddings = dynamic_cast<luci::CircleConst *>(node->paddings());
-  // Non-const paddings is not supported
-  if (paddings == nullptr)
-    return false;
-
-  if (paddings->rank() != 2)
-    return false;
-
-  if (paddings->dim(0).value() != 4 || paddings->dim(1).value() != 2)
-    return false;
-
-  // Only check the first two dimensions
-  for (uint32_t dim = 0; dim < 2; dim++)
-  {
-    for (uint32_t i = 0; i < 2; i++)
-    {
-      auto data = paddings->at<loco::DataType::S32>(dim * 2 + i);
-      if (data != 0)
-        return false;
-    }
-  }
-
-  return true;
-}
-
 template <loco::DataType T> bool check_NC_padding_zero(const luci::CircleConst *node)
 {
   assert(node->dtype() == T); // FIX_CALLER_UNLESS
@@ -480,8 +448,12 @@ template <loco::DataType T> bool check_NC_padding_zero(const luci::CircleConst *
   return true;
 }
 
-// NOTE Copied from is_NCHW(CirclePad)
-bool is_NCHW(const luci::CirclePadV2 *node)
+// NOTE Following conditions can be extended later
+//
+// Find PAD with an NCHW pattern described below
+//   - Paddings shape : [4, 2]
+//   - Paddings value : [[0, 0], [0, 0], [h_t, h_b], [w_t, w_b]]
+template <class T> bool is_NCHW_pad_op(const T *node)
 {
   const auto paddings = dynamic_cast<luci::CircleConst *>(node->paddings());
   // Non-const paddings is not supported
   if (paddings == nullptr)
     return false;
 
   if (paddings->rank() != 2)
     return false;
 
   if (paddings->dim(0).value() != 4 || paddings->dim(1).value() != 2)
     return false;
 
@@ -513,34 +485,6 @@ bool is_NCHW(const luci::CirclePadV2 *node)
   return true;
 }
 
-// NOTE Copied from is_NCHW(CirclePad)
-bool is_NCHW(const luci::CircleMirrorPad *node)
-{
-  const auto paddings = dynamic_cast<luci::CircleConst *>(node->paddings());
-  // Non-const paddings is not supported
-  if (paddings == nullptr)
-    return false;
-
-  if (paddings->rank() != 2)
-    return false;
-
-  if (paddings->dim(0).value() != 4 || paddings->dim(1).value() != 2)
-    return false;
-
-  // Only check the first two dimensions
-  for (uint32_t dim = 0; dim < 2; dim++)
-  {
-    for (uint32_t i = 0; i < 2; i++)
-    {
-      auto data = paddings->at<loco::DataType::S32>(dim * 2 + i);
-      if (data != 0)
-        return false;
-    }
-  }
-
-  return true;
-}
-
 bool is_const(const loco::Node *node)
 {
   if (not dynamic_cast<const luci::CircleConst *>(node))
@@ -715,6 +659,106 @@ template <class T> bool convert_eltwise_binary(T *node)
   return true;
 }
 
+template <class T> bool convert_reduction(T *node)
+{
+  auto input = loco::must_cast<luci::CircleNode *>(node->input());
+  if (input->rank() != 4)
+    return false;
+
+  auto rindices = dynamic_cast<luci::CircleConst *>(node->reduction_indices());
+  if (not rindices)
+    return false;
+
+  auto nhwc_rindices = create_NHWC_rindices(rindices);
+  if (not nhwc_rindices)
+    return false;
+
+  auto pre_trans = create_pre_transpose(node);
+  pre_trans->a(input);
+  node->input(pre_trans);
+
+  // Do shape inference for this node again.
+  node->shape_status(luci::ShapeStatus::UNDEFINED);
+
+  node->reduction_indices(nhwc_rindices);
+
+  if (node->keep_dims())
+  {
+    auto post_trans = create_post_transpose(node);
+    loco::replace(node).with(post_trans);
+
+    post_trans->a(node);
+
+    return true;
+  }
+
+  // node->keep_dims() == false
+  // 1D output never needs a transpose
+  if (node->rank() <= 1)
+    return true;
+
+  std::vector<bool> reduced_dims_nhwc(4, false);
+  uint32_t num_reduced_indices = nhwc_rindices->size<loco::DataType::S32>();
+
+  for (uint32_t ri = 0; ri < num_reduced_indices; ++ri)
+  {
+    reduced_dims_nhwc[nhwc_rindices->at<loco::DataType::S32>(ri)] = true;
+  }
+
+  // if channel dimension has been reduced, we don't need a transpose
+  if (reduced_dims_nhwc[3])
+    return true;
+
+  // likewise, if both space dimensions are reduced, no transpose is needed
+  if (reduced_dims_nhwc[1] && reduced_dims_nhwc[2])
+    return true;
+
+  std::vector<int32_t> post_trans_ind;
+  // case 1: only N is reduced
+  if (num_reduced_indices == 1 && reduced_dims_nhwc[0])
+    post_trans_ind = {2, 0, 1};
+
+  // case 2: only H or W is reduced
+  if (num_reduced_indices == 1 && (reduced_dims_nhwc[1] || reduced_dims_nhwc[2]))
+    post_trans_ind = {0, 2, 1};
+
+  // case 3: N and either H or W are reduced
+  if (num_reduced_indices == 2)
+    post_trans_ind = {1, 0};
+
+  auto post_trans = create_Nd_transpose(node, post_trans_ind);
+  loco::replace(node).with(post_trans);
+
+  post_trans->a(node);
+
+  return true;
+}
+
+template <class T> bool convert_pad(T *node)
+{
+  if (!is_NCHW_pad_op(node))
+    return false;
+
+  const auto pred_node = loco::must_cast<luci::CircleNode *>(node->input());
+  auto pre_trans = create_pre_transpose(node);
+  pre_trans->a(pred_node);
+  node->input(pre_trans);
+
+  auto nchw_paddings = loco::must_cast<luci::CircleConst *>(node->paddings());
+  const auto nhwc_paddings = create_NHWC_paddings(nchw_paddings);
+  node->paddings(nhwc_paddings);
+
+  // Do shape inference for this node again.
+ node->shape_status(luci::ShapeStatus::UNDEFINED); + + auto post_trans = create_post_transpose(node); + loco::replace(node).with(post_trans); + + post_trans->a(node); + + return true; +} + class ConvertNCHWToNHWC final : public luci::CircleNodeMutableVisitor { // Default @@ -854,80 +898,7 @@ class ConvertNCHWToNHWC final : public luci::CircleNodeMutableVisitor return true; } - bool visit(luci::CircleMean *node) - { - auto input = loco::must_cast(node->input()); - if (input->rank() != 4) - return false; - - auto rindices = dynamic_cast(node->reduction_indices()); - if (not rindices) - return false; - - auto nhwc_rindices = create_NHWC_rindices(rindices); - if (not nhwc_rindices) - return false; - - auto pre_trans = create_pre_transpose(node); - pre_trans->a(input); - node->input(pre_trans); - - // Do shape inference for this node again. - node->shape_status(luci::ShapeStatus::UNDEFINED); - - node->reduction_indices(nhwc_rindices); - - if (node->keep_dims()) - { - auto post_trans = create_post_transpose(node); - loco::replace(node).with(post_trans); - - post_trans->a(node); - - return true; - } - - // node->keep_dims() == false - // 1D output never needs a transpose - if (node->rank() <= 1) - return true; - - std::vector reduced_dims_nhwc(4, false); - uint32_t num_reduced_indices = nhwc_rindices->size(); - - for (uint32_t ri = 0; ri < num_reduced_indices; ++ri) - { - reduced_dims_nhwc[nhwc_rindices->at(ri)] = true; - } - - // if channel dimension has been reduced, we don't need a transpose - if (reduced_dims_nhwc[3]) - return true; - - // likewise, if both space dimensions are reduced, no transpose is needed - if (reduced_dims_nhwc[1] && reduced_dims_nhwc[2]) - return true; - - std::vector post_trans_ind; - // case 1: only N is reduced - if (num_reduced_indices == 1 && reduced_dims_nhwc[0]) - post_trans_ind = {2, 0, 1}; - - // case 2: only H or W is reduced - if (num_reduced_indices == 1 && (reduced_dims_nhwc[1] || reduced_dims_nhwc[2])) - post_trans_ind = {0, 2, 1}; - - // case 3: N and either H or W are reduced - if (num_reduced_indices == 2) - post_trans_ind = {1, 0}; - - auto post_trans = create_Nd_transpose(node, post_trans_ind); - loco::replace(node).with(post_trans); - - post_trans->a(node); - - return true; - } + bool visit(luci::CircleMean *node) { return convert_reduction(node); } bool visit(luci::CircleMinimum *node) { @@ -959,236 +930,19 @@ class ConvertNCHWToNHWC final : public luci::CircleNodeMutableVisitor return true; } - bool visit(luci::CircleMirrorPad *node) - { - if (!is_NCHW(node)) - return false; - - const auto pred_node = loco::must_cast(node->input()); - auto pre_trans = create_pre_transpose(node); - pre_trans->a(pred_node); - node->input(pre_trans); - - auto nchw_paddings = loco::must_cast(node->paddings()); - const auto nhwc_paddings = create_NHWC_paddings(nchw_paddings); - node->paddings(nhwc_paddings); - - // Do shape inference for this node again. 
- node->shape_status(luci::ShapeStatus::UNDEFINED); - - auto post_trans = create_post_transpose(node); - loco::replace(node).with(post_trans); - - post_trans->a(node); - - return true; - } + bool visit(luci::CircleMirrorPad *node) { return convert_pad(node); } bool visit(luci::CircleMul *node) { return convert_eltwise_binary(node); } bool visit(luci::CircleNeg *node) { return convert_unary_x(node); } - bool visit(luci::CirclePad *node) - { - if (!is_NCHW(node)) - return false; - - const auto pred_node = loco::must_cast(node->input()); - auto pre_trans = create_pre_transpose(node); - pre_trans->a(pred_node); - node->input(pre_trans); - - auto nchw_paddings = loco::must_cast(node->paddings()); - const auto nhwc_paddings = create_NHWC_paddings(nchw_paddings); - node->paddings(nhwc_paddings); - - // Do shape inference for this node again. - node->shape_status(luci::ShapeStatus::UNDEFINED); - - auto post_trans = create_post_transpose(node); - loco::replace(node).with(post_trans); - - post_trans->a(node); - - return true; - } - - bool visit(luci::CirclePadV2 *node) - { - if (!is_NCHW(node)) - return false; - - const auto pred_node = loco::must_cast(node->input()); - auto pre_trans = create_pre_transpose(node); - pre_trans->a(pred_node); - node->input(pre_trans); - - auto nchw_paddings = loco::must_cast(node->paddings()); - const auto nhwc_paddings = create_NHWC_paddings(nchw_paddings); - node->paddings(nhwc_paddings); - - // Do shape inference for this node again. - node->shape_status(luci::ShapeStatus::UNDEFINED); - - auto post_trans = create_post_transpose(node); - loco::replace(node).with(post_trans); - - post_trans->a(node); - - return true; - } - - // TODO Reduce duplicate code with CircleMean - bool visit(luci::CircleReduceMax *node) - { - auto input = loco::must_cast(node->input()); - if (input->rank() != 4) - return false; - - auto rindices = dynamic_cast(node->reduction_indices()); - if (not rindices) - return false; - - auto nhwc_rindices = create_NHWC_rindices(rindices); - if (not nhwc_rindices) - return false; - - auto pre_trans = create_pre_transpose(node); - pre_trans->a(input); - node->input(pre_trans); - - // Do shape inference for this node again. 
- node->shape_status(luci::ShapeStatus::UNDEFINED); - - node->reduction_indices(nhwc_rindices); - - if (node->keep_dims()) - { - auto post_trans = create_post_transpose(node); - loco::replace(node).with(post_trans); - - post_trans->a(node); - - return true; - } - - // The below codes handle the cases where node->keep_dims() == false - // 1D output never needs a transpose - if (node->rank() <= 1) - return true; - - std::vector reduced_dims_nhwc(4, false); - uint32_t num_reduced_indices = nhwc_rindices->size(); - - for (uint32_t ri = 0; ri < num_reduced_indices; ++ri) - { - reduced_dims_nhwc[nhwc_rindices->at(ri)] = true; - } - - // if channel dimension has been reduced, we don't need a transpose - if (reduced_dims_nhwc[3]) - return true; - - // likewise, if both space dimensions are reduced, no transpose is needed - if (reduced_dims_nhwc[1] && reduced_dims_nhwc[2]) - return true; - - std::vector post_trans_ind; - // case 1: only N is reduced - if (num_reduced_indices == 1 && reduced_dims_nhwc[0]) - post_trans_ind = {2, 0, 1}; - - // case 2: only H or W is reduced - if (num_reduced_indices == 1 && (reduced_dims_nhwc[1] || reduced_dims_nhwc[2])) - post_trans_ind = {0, 2, 1}; - - // case 3: N and either H or W are reduced - if (num_reduced_indices == 2) - post_trans_ind = {1, 0}; - - auto post_trans = create_Nd_transpose(node, post_trans_ind); - loco::replace(node).with(post_trans); - - post_trans->a(node); - - return true; - } - - // TODO Reduce duplicate codes with CircleReduceMax - bool visit(luci::CircleReduceMin *node) - { - auto input = loco::must_cast(node->input()); - if (input->rank() != 4) - return false; - - auto rindices = dynamic_cast(node->reduction_indices()); - if (not rindices) - return false; - - auto nhwc_rindices = create_NHWC_rindices(rindices); - if (not nhwc_rindices) - return false; + bool visit(luci::CirclePad *node) { return convert_pad(node); } - auto pre_trans = create_pre_transpose(node); - pre_trans->a(input); - node->input(pre_trans); - - // Do shape inference for this node again. 
- node->shape_status(luci::ShapeStatus::UNDEFINED); + bool visit(luci::CirclePadV2 *node) { return convert_pad(node); } - node->reduction_indices(nhwc_rindices); + bool visit(luci::CircleReduceMax *node) { return convert_reduction(node); } - if (node->keep_dims()) - { - auto post_trans = create_post_transpose(node); - loco::replace(node).with(post_trans); - - post_trans->a(node); - - return true; - } - - // The below codes handle the cases where node->keep_dims() == false - // 1D output never needs a transpose - if (node->rank() <= 1) - return true; - - std::vector reduced_dims_nhwc(4, false); - uint32_t num_reduced_indices = nhwc_rindices->size(); - - for (uint32_t ri = 0; ri < num_reduced_indices; ++ri) - { - reduced_dims_nhwc[nhwc_rindices->at(ri)] = true; - } - - // if channel dimension has been reduced, we don't need a transpose - if (reduced_dims_nhwc[3]) - return true; - - // likewise, if both space dimensions are reduced, no transpose is needed - if (reduced_dims_nhwc[1] && reduced_dims_nhwc[2]) - return true; - - std::vector post_trans_ind; - // case 1: only N is reduced - if (num_reduced_indices == 1 && reduced_dims_nhwc[0]) - post_trans_ind = {2, 0, 1}; - - // case 2: only H or W is reduced - if (num_reduced_indices == 1 && (reduced_dims_nhwc[1] || reduced_dims_nhwc[2])) - post_trans_ind = {0, 2, 1}; - - // case 3: N and either H or W are reduced - if (num_reduced_indices == 2) - post_trans_ind = {1, 0}; - - auto post_trans = create_Nd_transpose(node, post_trans_ind); - loco::replace(node).with(post_trans); - - post_trans->a(node); - - return true; - } + bool visit(luci::CircleReduceMin *node) { return convert_reduction(node); } bool visit(luci::CircleRelu *node) { return convert_unary_features(node); } diff --git a/compiler/luci/pass/src/ConvertToFakeQuantizedModelPass.cpp b/compiler/luci/pass/src/ConvertToFakeQuantizedModelPass.cpp index 4a0bc663369..6f40891feb3 100644 --- a/compiler/luci/pass/src/ConvertToFakeQuantizedModelPass.cpp +++ b/compiler/luci/pass/src/ConvertToFakeQuantizedModelPass.cpp @@ -219,6 +219,7 @@ struct FakeQuantize final : public luci::CircleNodeMutableVisitor void visit(luci::CircleRelu6 *node) { fq_activation(node); } void visit(luci::CircleResizeBilinear *node) { fq_activation(node); } void visit(luci::CircleResizeNearestNeighbor *node) { fq_activation(node); } + void visit(luci::CircleRmsNorm *node) { fq_activation(node); } void visit(luci::CircleRsqrt *node) { fq_activation(node); } void visit(luci::CircleSoftmax *node) { fq_activation(node); } void visit(luci::CircleSqrt *node) { fq_activation(node); } diff --git a/compiler/luci/pass/src/FuseRmsNormPass.cpp b/compiler/luci/pass/src/FuseRmsNormPass.cpp new file mode 100644 index 00000000000..f2ecbffa77f --- /dev/null +++ b/compiler/luci/pass/src/FuseRmsNormPass.cpp @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "luci/Pass/FuseRmsNormPass.h"
+#include "helpers/NodeFiller.h"
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/Profile/CircleNodeOrigin.h>
+#include <loco.h>
+
+#include <cassert>
+#include <vector>
+
+namespace
+{
+
+/**
+ * The diagram below shows the RMS normalization pattern to fuse.
+ * - this pattern will be replaced with one RmsNorm
+ *
+ *           [In]
+ *            |
+ *            V
+ *     +---- ifm ----+
+ *     |      |      |
+ *     |      V      |
+ *     |     mul <---+
+ *     |      |
+ *     |      V
+ *     |     mean
+ *     |      |
+ *     |      V
+ *     |  add_epsilon
+ *     |      |
+ *     |      V
+ *     |    rsqrt
+ *     |      |
+ *     |      V
+ *     +---> mul_input
+ *            |
+ *            V
+ *          [Out]
+ */
+
+class RmsNormPattern final
+{
+public:
+  RmsNormPattern(luci::CircleMul *candidate)
+  {
+    assert(candidate); // FIX_CALLER_UNLESS
+    _mul_input = candidate;
+  }
+
+public:
+  bool matched();
+
+public:
+  luci::CircleNode *_ifm = nullptr;
+  luci::CircleMul *_mul_pow = nullptr;
+  luci::CircleMean *_mean = nullptr;
+  luci::CircleAdd *_add_epsilon = nullptr;
+  luci::CircleRsqrt *_rsqrt = nullptr;
+  luci::CircleMul *_mul_input = nullptr;
+  luci::CircleConst *_const_epsilon = nullptr;
+  luci::CircleConst *_const_gamma = nullptr;
+  luci::CircleConst *_const_beta = nullptr;
+};
+
+#define CHECK_OR_FALSE(condition) \
+  if (not(condition))             \
+    return false;
+
+luci::CircleConst *make_const_one(loco::Graph *graph, float value)
+{
+  auto const_one = graph->nodes()->create<luci::CircleConst>();
+  const_one->dtype(loco::DataType::FLOAT32);
+  const_one->rank(1);
+  const_one->dim(0) = 1;
+  const_one->shape_status(luci::ShapeStatus::VALID);
+  const_one->size<loco::DataType::FLOAT32>(1);
+  const_one->at<loco::DataType::FLOAT32>(0) = value;
+  return const_one;
+}
+
+bool RmsNormPattern::matched()
+{
+  CHECK_OR_FALSE(luci::fill(&_ifm, &_rsqrt).with_commutative_args_of(_mul_input));
+  _add_epsilon = dynamic_cast<luci::CircleAdd *>(_rsqrt->x());
+  CHECK_OR_FALSE(_add_epsilon);
+  CHECK_OR_FALSE(luci::fill(&_mean, &_const_epsilon).with_commutative_args_of(_add_epsilon));
+  CHECK_OR_FALSE(_const_epsilon->dtype() == loco::DataType::FLOAT32);
+  _mul_pow = dynamic_cast<luci::CircleMul *>(_mean->input());
+  CHECK_OR_FALSE(_mul_pow);
+  CHECK_OR_FALSE(_mul_pow->x() == _ifm);
+  CHECK_OR_FALSE(_mul_pow->y() == _ifm);
+
+  assert(_const_gamma == nullptr);
+  assert(_const_beta == nullptr);
+
+  /*
+    NOTE: Current FuseRmsNormPass assumes no gamma(scale) and beta(bias),
+    but the RmsNorm kernel expects gamma and beta.
+    So, default gamma(1.0) and beta(0.0) are created here.
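+
+    For reference, the fused kernel is expected to compute
+      y = gamma * x / sqrt(mean(x^2) + epsilon) + beta
+    which, with the default gamma(1.0) and beta(0.0), reduces to the
+    matched subgraph: x * rsqrt(mean(x^2) + epsilon).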
+  */
+  auto graph = _mul_input->graph();
+  _const_gamma = make_const_one(graph, 1.0f);
+  _const_beta = make_const_one(graph, 0.0f);
+  _const_gamma->name(_mul_input->name() + "/gamma");
+  _const_beta->name(_mul_input->name() + "/beta");
+
+  return true;
+}
+#undef CHECK_OR_FALSE
+
+class FuseRmsNorm final
+{
+public:
+  FuseRmsNorm(const RmsNormPattern *p) : _p(p) {}
+
+public:
+  void apply(void);
+
+private:
+  luci::CircleRmsNorm *create_rms_norm(loco::Graph *graph);
+
+private:
+  const RmsNormPattern *_p = nullptr;
+};
+
+luci::CircleRmsNorm *FuseRmsNorm::create_rms_norm(loco::Graph *graph)
+{
+  assert(graph);
+
+  auto rms_norm = graph->nodes()->create<luci::CircleRmsNorm>();
+  rms_norm->input(_p->_ifm);
+  rms_norm->gamma(_p->_const_gamma);
+  rms_norm->beta(_p->_const_beta);
+  float epsilon = _p->_const_epsilon->at<loco::DataType::FLOAT32>(0);
+  rms_norm->epsilon(epsilon);
+
+  rms_norm->name("FusedRmsNorm/" + _p->_mul_input->name());
+
+  return rms_norm;
+}
+
+void FuseRmsNorm::apply()
+{
+  auto graph = _p->_mul_input->graph();
+
+  auto rms_norm = create_rms_norm(graph);
+
+  // set origin
+  std::vector<std::shared_ptr<luci::CircleNodeOrigin>> origin_vec{
+    luci::get_origin(_p->_mul_pow), luci::get_origin(_p->_mean),
+    luci::get_origin(_p->_add_epsilon), luci::get_origin(_p->_rsqrt),
+    luci::get_origin(_p->_mul_input),
+  };
+
+  luci::add_origin(rms_norm, luci::composite_origin(origin_vec));
+
+  replace(_p->_mul_input).with(rms_norm);
+}
+
+} // namespace
+
+namespace
+{
+
+bool fuse_rms_norm(luci::CircleMul *mul)
+{
+  assert(mul);
+
+  RmsNormPattern pattern(mul);
+  if (pattern.matched())
+  {
+    FuseRmsNorm fuse(&pattern);
+    fuse.apply();
+    return true;
+  }
+
+  return false;
+}
+
+} // namespace
+
+namespace luci
+{
+
+bool FuseRmsNormPass::run(loco::Graph *g)
+{
+  bool changed = false;
+
+  for (auto node : loco::active_nodes(loco::output_nodes(g)))
+  {
+    auto mul = dynamic_cast<luci::CircleMul *>(node);
+    if (not mul)
+      continue;
+
+    if (fuse_rms_norm(mul))
+      changed = true;
+  }
+  return changed;
+}
+
+} // namespace luci
diff --git a/compiler/luci/pass/src/FuseRmsNormPass.test.cpp b/compiler/luci/pass/src/FuseRmsNormPass.test.cpp
new file mode 100644
index 00000000000..07bb97fe01a
--- /dev/null
+++ b/compiler/luci/pass/src/FuseRmsNormPass.test.cpp
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/FuseRmsNormPass.h"
+
+#include <gtest/gtest.h>
+
+TEST(FuseRmsNormPassTest, name)
+{
+  luci::FuseRmsNormPass pass;
+  auto const name = pass.name();
+  ASSERT_NE(nullptr, name);
+}
diff --git a/compiler/luci/pass/src/QuantizeActivation.h b/compiler/luci/pass/src/QuantizeActivation.h
index 162ec2c66ae..bd8bd2a8167 100644
--- a/compiler/luci/pass/src/QuantizeActivation.h
+++ b/compiler/luci/pass/src/QuantizeActivation.h
@@ -75,6 +75,7 @@ struct QuantizeConstInputActivation final : public luci::CircleNodeMutableVisitor<void>
   SKIP(luci::CircleFullyConnected)
   SKIP(luci::CircleInstanceNorm)
   SKIP(luci::CirclePRelu)
+  SKIP(luci::CircleRmsNorm)
   SKIP(luci::CircleTransposeConv)
 
   // Handled in PropagateQParamBackwardPass
diff --git a/compiler/luci/pass/src/QuantizePreCheckerPass.cpp b/compiler/luci/pass/src/QuantizePreCheckerPass.cpp
index 4b3b7e33095..1eea4f66d5d 100644
--- a/compiler/luci/pass/src/QuantizePreCheckerPass.cpp
+++ b/compiler/luci/pass/src/QuantizePreCheckerPass.cpp
@@ -84,6 +84,7 @@ struct ConstInputChecker final : public luci::CircleNodeMutableVisitor<void>
   CHECK_NODE_WITH_TWO_INPUT_CONST(luci::CircleDepthwiseConv2D, filter, bias)
   CHECK_NODE_WITH_TWO_INPUT_CONST(luci::CircleFullyConnected, weights, bias)
   CHECK_NODE_WITH_TWO_INPUT_CONST(luci::CircleInstanceNorm, gamma, beta)
+  CHECK_NODE_WITH_TWO_INPUT_CONST(luci::CircleRmsNorm, gamma, beta)
 
   // Ops that receive three const nodes as an inputs
   CHECK_NODE_WITH_THREE_INPUT_CONST(luci::CircleTransposeConv, inputSizes, filter, bias)
diff --git a/compiler/luci/pass/src/QuantizePreCheckerPass.test.cpp b/compiler/luci/pass/src/QuantizePreCheckerPass.test.cpp
index 8f6a96f3330..3f6295f4a2e 100644
--- a/compiler/luci/pass/src/QuantizePreCheckerPass.test.cpp
+++ b/compiler/luci/pass/src/QuantizePreCheckerPass.test.cpp
@@ -192,6 +192,49 @@ class SimpleInstanceNormGraph
   luci::CircleOutput *output = nullptr;
 };
 
+class SimpleRmsNormGraph
+{
+public:
+  SimpleRmsNormGraph(bool make_valid)
+  {
+    rms_norm_node = g.nodes()->create<luci::CircleRmsNorm>();
+    input_1 = g.nodes()->create<luci::CircleInput>();
+    gamma = g.nodes()->create<luci::CircleConst>();
+
+    rms_norm_node->input(input_1);
+    rms_norm_node->gamma(gamma);
+
+    if (make_valid)
+    {
+      beta = g.nodes()->create<luci::CircleConst>();
+      rms_norm_node->beta(beta);
+    }
+    else
+    {
+      input_2 = g.nodes()->create<luci::CircleInput>();
+      rms_norm_node->beta(input_2);
+    }
+
+    output = g.nodes()->create<luci::CircleOutput>();
+
+    auto graph_output = g.outputs()->create();
+    output->index(graph_output->index());
+
+    output->from(rms_norm_node);
+  }
+
+public:
+  loco::Graph g;
+
+private:
+  luci::CircleRmsNorm *rms_norm_node = nullptr;
+  luci::CircleInput *input_1 = nullptr;
+  luci::CircleInput *input_2 = nullptr;
+  luci::CircleConst *gamma = nullptr;
+  luci::CircleConst *beta = nullptr;
+  luci::CircleOutput *output = nullptr;
+};
+
 class SimpleTransposeConvGraph
 {
 public:
@@ -363,6 +406,25 @@ TEST(QuantizePreCheckerPassTest, instance_norm_NEG)
   EXPECT_ANY_THROW(checker.run(&invalid_graph.g));
 }
 
+// Test RmsNorm
+TEST(QuantizePreCheckerPassTest, rms_norm)
+{
+  SimpleRmsNormGraph valid_graph(true);
+
+  luci::QuantizePreCheckerPass checker{};
+
+  EXPECT_NO_THROW(checker.run(&valid_graph.g));
+}
+
+TEST(QuantizePreCheckerPassTest, rms_norm_NEG)
+{
+  SimpleRmsNormGraph invalid_graph(false);
+
+  luci::QuantizePreCheckerPass checker{};
+
+  EXPECT_ANY_THROW(checker.run(&invalid_graph.g));
+}
+
 // Test TransposeConv
 TEST(QuantizePreCheckerPassTest, transpose_conv)
 {
diff --git a/compiler/luci/pass/src/QuantizeWeights.cpp b/compiler/luci/pass/src/QuantizeWeights.cpp
index 17a887cfa4f..5350e21a4ca 100644
--- a/compiler/luci/pass/src/QuantizeWeights.cpp
+++ b/compiler/luci/pass/src/QuantizeWeights.cpp
@@ -507,6 +507,36 @@ void QuantizeWeights::visit(luci::CircleInstanceNorm *node)
   }
 }
 
+void QuantizeWeights::visit(luci::CircleRmsNorm *node)
+{
+  LOGGER(l);
+  INFO(l) << "QuantizeWeights QuantizeWeights::visit node: " << node->name() << std::endl;
+
+  auto gamma = loco::must_cast<luci::CircleConst *>(node->gamma());
+  auto beta = loco::must_cast<luci::CircleConst *>(node->beta());
+
+  if (!is_quantized(gamma))
+  {
+    assert(gamma->dtype() == loco::DataType::FLOAT32);
+    auto new_gamma = luci::clone(gamma);
+    if (granularity == QuantizationGranularity::LayerWise)
+      quant_const(new_gamma, output_type);
+    else if (granularity == QuantizationGranularity::ChannelWise)
+      quant_const_per_channel(new_gamma, output_type);
+    node->gamma(new_gamma);
+  }
+  if (!is_quantized(beta))
+  {
+    assert(beta->dtype() == loco::DataType::FLOAT32);
+    auto new_beta = luci::clone(beta);
+    if (granularity == QuantizationGranularity::LayerWise)
+      quant_const(new_beta, output_type);
+    else if (granularity == QuantizationGranularity::ChannelWise)
+      quant_const_per_channel(new_beta, output_type);
+    node->beta(new_beta);
+  }
+}
+
 void QuantizeWeights::visit(luci::CirclePRelu *node)
 {
   LOGGER(l);
diff --git a/compiler/luci/pass/src/QuantizeWeights.h b/compiler/luci/pass/src/QuantizeWeights.h
index f62cd40f3cb..b3913f2e809 100644
--- a/compiler/luci/pass/src/QuantizeWeights.h
+++ b/compiler/luci/pass/src/QuantizeWeights.h
@@ -44,6 +44,7 @@ struct QuantizeWeights final : public luci::CircleNodeMutableVisitor<void>
   void visit(luci::CircleConv2D *node);
   void visit(luci::CircleDepthwiseConv2D *node);
   void visit(luci::CircleInstanceNorm *node);
+  void visit(luci::CircleRmsNorm *node);
   void visit(luci::CirclePRelu *node);
   void visit(luci::CircleTransposeConv *node);
   void visit(luci::CircleFullyConnected *node);
diff --git a/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp b/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp
index bdb50d67a87..695e8b1eeeb 100644
--- a/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp
+++ b/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp
@@ -301,6 +301,7 @@ struct InsertQuantizeOp final : public luci::CircleNodeMutableVisitor<void>
   INSERT_QUANTIZE_TO_UNARY_OP(luci::CircleResizeBilinear, input)
   INSERT_QUANTIZE_TO_UNARY_OP(luci::CircleResizeNearestNeighbor, input)
   INSERT_QUANTIZE_TO_UNARY_OP(luci::CircleReverseSequence, input)
+  INSERT_QUANTIZE_TO_UNARY_OP(luci::CircleRmsNorm, input)
   INSERT_QUANTIZE_TO_UNARY_OP(luci::CircleRsqrt, x)
   INSERT_QUANTIZE_TO_UNARY_OP(luci::CircleSlice, input)
   INSERT_QUANTIZE_TO_UNARY_OP(luci::CircleSoftmax, logits)
@@ -548,12 +549,14 @@ void QuantizeWithMinMaxPass::set_output_type(loco::Graph *g) const
 *
 * Why quantization sequence was determined as above?
 * - Activation and weights should be quantized before bias (1->2->3). Input/Output
-*   dtype can be updated at the end (4->5).
+*   dtype is updated after all the other nodes are quantized (4->5).
 * - During activation quantization,
 *   - Backward propagation is performed earlier than forward propagation. This allows
-*     backward-propagated qpram to be overwritten during forward propagation.
-*     We made this decision as Ops for forward propagation (reshape, transpose, ..)
-*     are more common than backward propagation. TODO Check this decision is safe.
+*     backward-propagated qparam to be overwritten during forward propagation.
+*     We made the decision because it's more common to propagate qparam forward (reshape,
+*     transpose) than backward (concat, pad_v2, ..).
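+*     For example, forward propagation copies a Reshape's input qparam to its output,
+*     while backward propagation copies a Concat's output qparam to its inputs.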
*   - QuantizeSpecialActivation is called before forward propagation to make sure that
*     the pre-defined qparam values are propagated.
*/
diff --git a/compiler/luci/pass/src/VerifyQuantizedNodeGranularity.h b/compiler/luci/pass/src/VerifyQuantizedNodeGranularity.h
index cc618bf0e2f..6fc6a26ba46 100644
--- a/compiler/luci/pass/src/VerifyQuantizedNodeGranularity.h
+++ b/compiler/luci/pass/src/VerifyQuantizedNodeGranularity.h
@@ -93,6 +93,8 @@ class VerifyQuantizedNodeGranularity : public luci::CircleNodeVisitor<bool>
 
   virtual bool visit(const luci::CircleInstanceNorm *node) = 0;
 
+  virtual bool visit(const luci::CircleRmsNorm *node) = 0;
+
   bool visit(const luci::CirclePack *node)
   {
     RETURN_FALSE_UNLESS(is_lwq(node))
@@ -511,6 +513,15 @@ class VerifyQuantizedNodeChannelWiseGranularity final : public VerifyQuantizedNo
     return true;
   }
 
+  bool visit(const luci::CircleRmsNorm *node)
+  {
+    RETURN_FALSE_UNLESS(is_lwq(node))
+    RETURN_FALSE_UNLESS(is_lwq(node->input()))
+    RETURN_FALSE_UNLESS(is_cwq_const(node->gamma(), rank(node->gamma()) - 1))
+    RETURN_FALSE_UNLESS(is_cwq_const(node->beta(), rank(node->beta()) - 1))
+    return true;
+  }
+
   bool visit(const luci::CirclePRelu *node)
   {
     RETURN_FALSE_UNLESS(is_lwq(node))
@@ -595,6 +606,15 @@ class VerifyQuantizedNodeLayerWiseGranularity final : public VerifyQuantizedNode
     return true;
   }
 
+  bool visit(const luci::CircleRmsNorm *node)
+  {
+    RETURN_FALSE_UNLESS(is_lwq(node))
+    RETURN_FALSE_UNLESS(is_lwq(node->input()))
+    RETURN_FALSE_UNLESS(is_lwq_const(node->gamma()))
+    RETURN_FALSE_UNLESS(is_lwq_const(node->beta()))
+    return true;
+  }
+
   bool visit(const luci::CirclePRelu *node)
   {
     RETURN_FALSE_UNLESS(is_lwq(node))
diff --git a/compiler/luci/pass/src/VerifyQuantizedNodeType.cpp b/compiler/luci/pass/src/VerifyQuantizedNodeType.cpp
index 4bad9522b85..1f0ff43b779 100644
--- a/compiler/luci/pass/src/VerifyQuantizedNodeType.cpp
+++ b/compiler/luci/pass/src/VerifyQuantizedNodeType.cpp
@@ -364,6 +364,12 @@ bool VerifyQuantizedNodeTypeBase<Qtype, Btype>::visit(const luci::CircleResizeNe
   return true;
 }
 
+template <loco::DataType Qtype, loco::DataType Btype>
+bool VerifyQuantizedNodeTypeBase<Qtype, Btype>::visit(const luci::CircleRmsNorm *node)
+{
+  return group_has_type(node, Qtype);
+}
+
 template <loco::DataType Qtype, loco::DataType Btype>
 bool VerifyQuantizedNodeTypeBase<Qtype, Btype>::visit(const luci::CircleRsqrt *node)
 {
diff --git a/compiler/luci/pass/src/VerifyQuantizedNodeType.h b/compiler/luci/pass/src/VerifyQuantizedNodeType.h
index 03f1e1d8640..15ec384413c 100644
--- a/compiler/luci/pass/src/VerifyQuantizedNodeType.h
+++ b/compiler/luci/pass/src/VerifyQuantizedNodeType.h
@@ -110,6 +110,7 @@ class VerifyQuantizedNodeTypeBase : public luci::CircleNodeVisitor<bool>,
   bool visit(const luci::CircleReshape *node);
   bool visit(const luci::CircleResizeBilinear *node);
   bool visit(const luci::CircleResizeNearestNeighbor *node);
+  bool visit(const luci::CircleRmsNorm *node);
   bool visit(const luci::CircleRsqrt *node);
   bool visit(const luci::CircleSlice *node);
   bool visit(const luci::CircleSpaceToBatchND *node);
diff --git a/compiler/luci/service/src/CircleCloneNode.h b/compiler/luci/service/src/CircleCloneNode.h
index e2f61e1eb0e..64c9e4f486f 100644
--- a/compiler/luci/service/src/CircleCloneNode.h
+++ b/compiler/luci/service/src/CircleCloneNode.h
@@ -259,6 +259,7 @@ class CloneNode final : public luci::CircleNodeVisitor<luci::CircleNode *>
   luci::CircleNode *visit(const luci::CircleBCQGather *) final;
   luci::CircleNode *visit(const luci::CircleInstanceNorm *) final;
   luci::CircleNode *visit(const luci::CircleGRU *) final;
+  luci::CircleNode *visit(const luci::CircleRmsNorm *) final;
 
   // NOTE CircleInput and CircleOutput are not handled here as these need
  // link with graph I/O
diff --git a/compiler/luci/service/src/CircleShapeInferenceHelper.cpp b/compiler/luci/service/src/CircleShapeInferenceHelper.cpp
index 3d78a31a12e..c10746b86cc 100644
--- a/compiler/luci/service/src/CircleShapeInferenceHelper.cpp
+++ b/compiler/luci/service/src/CircleShapeInferenceHelper.cpp
@@ -168,14 +168,17 @@ loco::TensorShape pad_shape(const loco::TensorShape &input_shape, const luci::Ci
   // TODO support other data type
   LUCI_ASSERT(paddings->dtype() == S32 || paddings->dtype() == S64, "Support int 32/64 for now");
-  LUCI_ASSERT(paddings->rank() == 2, "paddings should be rank 2");
+  if (paddings->rank() != 2)
+    INTERNAL_EXN("paddings should be rank 2");
 
   int32_t n = paddings->dim(0).value();
   int32_t v = paddings->dim(1).value();
 
-  LUCI_ASSERT(v == 2, "paddings should be [n, 2]");
-  LUCI_ASSERT(n == int32_t(input_shape.rank()),
-              "paddings [n, 2] should have same value of input rank");
+  if (v != 2)
+    INTERNAL_EXN("paddings should be [n, 2]");
+
+  if (n != int32_t(input_shape.rank()))
+    INTERNAL_EXN("paddings [n, 2] should have n equal to the input rank");
 
   loco::TensorShape output_shape;
 
diff --git a/compiler/luci/service/src/CircleShapeInferenceRule.cpp b/compiler/luci/service/src/CircleShapeInferenceRule.cpp
index 42c45353361..a094b681d0c 100644
--- a/compiler/luci/service/src/CircleShapeInferenceRule.cpp
+++ b/compiler/luci/service/src/CircleShapeInferenceRule.cpp
@@ -2198,6 +2198,13 @@ class ShapeInferenceAlgorithm final : public luci::CircleNodeVisitor<loco::NodeShape>
 
+  loco::NodeShape visit(const luci::CircleRmsNorm *node) final
+  {
+    const auto input_shape = luci::shape_get(node->input()).as<loco::TensorShape>();
+
+    return loco::NodeShape{input_shape};
+  }
+
   // Virtual
   loco::NodeShape visit(const luci::CircleInput *node) final { return infer_input(node); }
diff --git a/compiler/luci/service/src/CircleTypeInferenceRule.cpp b/compiler/luci/service/src/CircleTypeInferenceRule.cpp
index 78dde1004b5..6b656567071 100644
--- a/compiler/luci/service/src/CircleTypeInferenceRule.cpp
+++ b/compiler/luci/service/src/CircleTypeInferenceRule.cpp
@@ -579,6 +579,11 @@ struct TypeInferenceAlgorithm final : public luci::CircleNodeVisitor<loco::DataType>
     return luci::dtype_get(node->input());
   }
 
+  loco::DataType visit(const luci::CircleRmsNorm *node) final
+  {
+    return luci::dtype_get(node->input());
+  }
+
   // Virtual
   loco::DataType visit(const luci::CircleInput *node) final { return node->dtype(); }
diff --git a/compiler/luci/service/src/Nodes/CirclePad.test.cpp b/compiler/luci/service/src/Nodes/CirclePad.test.cpp
index 070b9b31075..5b221b55861 100644
--- a/compiler/luci/service/src/Nodes/CirclePad.test.cpp
+++ b/compiler/luci/service/src/Nodes/CirclePad.test.cpp
@@ -124,3 +124,87 @@ TEST(ShapeRuleTest, pad_non_const_paddings)
   ASSERT_EQ(0, shape.dim(2).value());
   ASSERT_EQ(0, shape.dim(3).value());
 }
+
+TEST(ShapeRuleTest, paddings_invalid_rank_NEG)
+{
+  auto g = loco::make_graph();
+  auto node_pad = g->nodes()->create<luci::CirclePad>();
+
+  auto node_paddings = g->nodes()->create<luci::CircleConst>();
+  auto node_input = g->nodes()->create<luci::CircleInput>();
+
+  loco::TensorShape shape;
+  luci::sinf::Rule shape_inf_rule;
+
+  node_input->shape({1, 2, 3, 4});
+  node_input->shape_status(luci::ShapeStatus::VALID);
+
+  node_paddings->dtype(loco::DataType::S64);
+  node_paddings->shape({4, 2, 3});
+  node_paddings->shape_status(luci::ShapeStatus::VALID);
+
+  const loco::DataType S64 = loco::DataType::S64;
+  uint32_t t = 64 * 8;
+  node_paddings->size<S64>(t);
+
+  node_pad->input(node_input);
+  node_pad->paddings(node_paddings);
+
+  ASSERT_ANY_THROW(shape_inf_rule.infer(node_pad, shape));
+}
+
+TEST(ShapeRuleTest, paddings_invalid_shape_1_NEG)
+{
+  auto g = loco::make_graph();
+  auto node_pad = g->nodes()->create<luci::CirclePad>();
+
+  auto node_paddings = g->nodes()->create<luci::CircleConst>();
+  auto node_input = g->nodes()->create<luci::CircleInput>();
+
+  loco::TensorShape shape;
+  luci::sinf::Rule shape_inf_rule;
+
+  node_input->shape({1, 2, 3, 4});
+  node_input->shape_status(luci::ShapeStatus::VALID);
+
+  node_paddings->dtype(loco::DataType::S64);
+  node_paddings->shape({4, 4});
+  node_paddings->shape_status(luci::ShapeStatus::VALID);
+
+  const loco::DataType S64 = loco::DataType::S64;
+  uint32_t t = 64 * 8;
+  node_paddings->size<S64>(t);
+
+  node_pad->input(node_input);
+  node_pad->paddings(node_paddings);
+
+  ASSERT_ANY_THROW(shape_inf_rule.infer(node_pad, shape));
+}
+
+TEST(ShapeRuleTest, paddings_invalid_shape_2_NEG)
+{
+  auto g = loco::make_graph();
+  auto node_pad = g->nodes()->create<luci::CirclePad>();
+
+  auto node_paddings = g->nodes()->create<luci::CircleConst>();
+  auto node_input = g->nodes()->create<luci::CircleInput>();
+
+  loco::TensorShape shape;
+  luci::sinf::Rule shape_inf_rule;
+
+  node_input->shape({1, 2, 3, 4});
+  node_input->shape_status(luci::ShapeStatus::VALID);
+
+  node_paddings->dtype(loco::DataType::S64);
+  node_paddings->shape({5, 2});
+  node_paddings->shape_status(luci::ShapeStatus::VALID);
+
+  const loco::DataType S64 = loco::DataType::S64;
+  uint32_t t = 64 * 8;
+  node_paddings->size<S64>(t);
+
+  node_pad->input(node_input);
+  node_pad->paddings(node_paddings);
+
+  ASSERT_ANY_THROW(shape_inf_rule.infer(node_pad, shape));
+}
diff --git a/compiler/luci/service/src/Nodes/CircleReshape.cpp b/compiler/luci/service/src/Nodes/CircleReshape.cpp
index 0de10960b51..778f8d45762 100644
--- a/compiler/luci/service/src/Nodes/CircleReshape.cpp
+++ b/compiler/luci/service/src/Nodes/CircleReshape.cpp
@@ -87,6 +87,10 @@ loco::TensorShape Algorithm::visit(const luci::CircleReshape *node)
     for (uint32_t axis = 0; axis < shape_by_input.rank(); ++axis)
     {
       shape_by_input.dim(axis) = const_shape_node->at<S32>(axis);
+      if (const_shape_node->at<S32>(axis) < 0)
+      {
+        shape_by_input.dim(axis).unset();
+      }
     }
   }
   else
@@ -139,7 +143,7 @@ loco::TensorShape Algorithm::visit(const luci::CircleReshape *node)
   for (uint32_t dim_index = 0; dim_index < output_shape.rank(); ++dim_index)
   {
     const uint32_t dim_value = output_shape.dim(dim_index).value();
-    if (static_cast<int>(dim_value) == -1)
+    if (not output_shape.dim(dim_index).known())
     {
       LUCI_ASSERT(unknown_dim_index == UINT32_MAX, "More than one unknown dimension");
       unknown_dim_index = dim_index;
diff --git a/compiler/luci/service/src/Nodes/CircleRmsNorm.cpp b/compiler/luci/service/src/Nodes/CircleRmsNorm.cpp
new file mode 100644
index 00000000000..0fdf2bdf3d8
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleRmsNorm.cpp
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "CircleCloneNode.h" + +namespace luci +{ + +luci::CircleNode *CloneNode::visit(const luci::CircleRmsNorm *node) +{ + auto *cloned = _graph->nodes()->create<luci::CircleRmsNorm>(); + if (cloned != nullptr) + { + cloned->epsilon(node->epsilon()); + } + return cloned; +} + +} // namespace luci diff --git a/compiler/luci/service/src/Nodes/CircleRmsNorm.test.cpp b/compiler/luci/service/src/Nodes/CircleRmsNorm.test.cpp new file mode 100644 index 00000000000..9bd0bc891da --- /dev/null +++ b/compiler/luci/service/src/Nodes/CircleRmsNorm.test.cpp @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "luci/Service/CircleNodeClone.h" + +#include <gtest/gtest.h> + +TEST(CloneNodeTest, clone_RmsNorm) +{ + auto g = loco::make_graph(); + auto node_fc = g->nodes()->create<luci::CircleRmsNorm>(); + node_fc->epsilon(3); + + auto gc = loco::make_graph(); + auto cloned = luci::clone_node(node_fc, gc.get()); + ASSERT_NE(nullptr, cloned); + ASSERT_EQ(gc.get(), cloned->graph()); + + auto cloned_fc = dynamic_cast<luci::CircleRmsNorm *>(cloned); + ASSERT_NE(nullptr, cloned_fc); + ASSERT_EQ(node_fc->epsilon(), cloned_fc->epsilon()); +} diff --git a/compiler/one-cmds/one-codegen b/compiler/one-cmds/one-codegen index 8551056f6a8..84e7b31d906 100644 --- a/compiler/one-cmds/one-codegen +++ b/compiler/one-cmds/one-codegen @@ -35,6 +35,9 @@ import onelib.utils as oneutils # TODO Find better way to suppress trackback on error sys.tracebacklimit = 0 +COMMAND_KEYS = ['__command', 'command'] +BACKEND_KEY = 'BACKEND' + def _get_parser(backends_list): codegen_usage = 'one-codegen [-h] [-v] [-C CONFIG] [-b BACKEND | -T TARGET] [--] [COMMANDS FOR BACKEND]' @@ -81,7 +84,8 @@ def _verify_arg(parser, args, cfg_args, cfg_target_args, backend_args, unknown_a # overwrite the value if it exists as command line option has higher priority. if oneutils.is_valid_attr(args, 'target'): target_to_run = args.target - given_backend = backends.get_backend_from_target_conf(target_to_run) + given_backend = backends.get_value_from_target_conf( + target_to_run, BACKEND_KEY) if not given_backend: parser.error(f'Not found {target_to_run} target.') else: @@ -213,8 +217,11 @@ def main(): assert (oneutils.is_valid_attr(cfg_args, 'command')) setattr(cfg_args, args.backend, cfg_args.command) else: + given_backend = None # get backend information - given_backend = backends.get_backend_from_target_conf(target_to_run) + if target_to_run: + given_backend = backends.get_value_from_target_conf( + target_to_run, BACKEND_KEY) # check if command schema for the backend exists # 1. if it exists, run the command according to the schema. # 2. if it doesn't exist, insert "--target ${TARGET}" at the beginning of the given command. @@ -251,7 +258,9 @@ def main(): # [15], [16] else: assert oneutils.is_valid_attr(args, 'target') - given_backends = [backends.get_backend_from_target_conf(target_to_run)] + given_backends = [ + backends.get_value_from_target_conf(target_to_run, BACKEND_KEY) + ] # make commands # 1.
if command schema exists diff --git a/compiler/one-cmds/one-profile b/compiler/one-cmds/one-profile index 2477a350bf2..585517f7c4c 100644 --- a/compiler/one-cmds/one-profile +++ b/compiler/one-cmds/one-profile @@ -35,6 +35,9 @@ import onelib.utils as oneutils # TODO Find better way to suppress trackback on error sys.tracebacklimit = 0 +COMMAND_KEYS = ['__command', 'command'] +BACKEND_KEY = 'BACKEND' + def _get_backends_list(): """ @@ -120,7 +123,8 @@ def _verify_arg(parser, args, cfg_args, cfg_target_args, backend_args, unknown_a # overwrite the value if it exists as command line option has higher priority. if oneutils.is_valid_attr(args, 'target'): target_to_run = args.target - given_backend = backends.get_backend_from_target_conf(target_to_run) + given_backend = backends.get_value_from_target_conf( + target_to_run, BACKEND_KEY) if not given_backend: parser.error(f'Not found {target_to_run} target.') else: @@ -248,8 +252,11 @@ def main(): assert (oneutils.is_valid_attr(cfg_args, 'command')) setattr(cfg_args, args.backend, cfg_args.command) else: + given_backend = None # get backend information - given_backend = backends.get_backend_from_target_conf(target_to_run) + if target_to_run: + given_backend = backends.get_value_from_target_conf( + target_to_run, BACKEND_KEY) # check if command schema exists # 1. if it exists, run the command according to the schema. # 2. if it doesn't exist, insert "--target ${TARGET}" at the beginning of the given command. @@ -286,7 +293,9 @@ def main(): # [15], [16] else: assert oneutils.is_valid_attr(args, 'target') - given_backends = [backends.get_backend_from_target_conf(target_to_run)] + given_backends = [ + backends.get_value_from_target_conf(target_to_run, BACKEND_KEY) + ] # make commands # 1. if command schema exists diff --git a/compiler/one-cmds/onelib/argumentparse.py b/compiler/one-cmds/onelib/argumentparse.py index 7b266cf5955..bc2d626a996 100644 --- a/compiler/one-cmds/onelib/argumentparse.py +++ b/compiler/one-cmds/onelib/argumentparse.py @@ -139,7 +139,46 @@ def print_help(self): oneutils.run([driver_path, '-h'], err_prefix=self.driver) + def get_option_names(self, *, flatten=False, without_dash=False): + """ + Get registered option names. + + :param flatten: a single option can have multiple names. + If it is True, such options are returned flattened. + :param without_dash: optional arguments have a leading dash on their names. + If it is True, option names are returned without such dashes. + + For example, say there are options like these.
+ + parser.add_argument("--verbose", action=NormalOption, dtype=bool) + parser.add_argument("--output", "--output_path", action=NormalOption) + + [EXAMPLES] + get_option_names() + [[--verbose], [--output, --output_path]] + get_option_names(without_dash=True) + [[verbose], [output, output_path]] + get_option_names(flatten=True) + [--verbose, --output, --output_path] + get_option_names(flatten=True, without_dash=True) + [verbose, output, output_path] + """ + names = [] + for action in self._actions: + names.append(action[0]) + + if flatten: + names = [name for name_l in names for name in name_l] + if without_dash: + names = [name.lstrip('-') for name in names] + + return names + def check_if_valid_option_name(self, *args, **kwargs): + existing_options = self.get_option_names(flatten=True, without_dash=True) + args_without_dash = [arg.lstrip('-') for arg in args] + if any(arg in existing_options for arg in args_without_dash): + raise RuntimeError('Duplicate option names') if not 'action' in kwargs: raise RuntimeError('"action" keyword argument is required') diff --git a/compiler/one-cmds/onelib/backends.py b/compiler/one-cmds/onelib/backends.py index 4403a07cbb6..f7336bde233 100644 --- a/compiler/one-cmds/onelib/backends.py +++ b/compiler/one-cmds/onelib/backends.py @@ -28,6 +28,7 @@ ├── include ├── lib ├── optimization +├── target └── test The list where `one-XXXX` finds its backends @@ -36,6 +37,21 @@ NOTE If there are backends of the same name in different places, the closer to the top in the list, the higher the priority. + +[About TARGET and BACKEND] + "Target" refers to a concrete instance of the system and + "Backend" refers to an architecture. Say there is an NPU that has + multiple cores. Its cores may have different global buffer + sizes, DSPM sizes, clock rates, etc., which are described in + each "Target" configuration file. Even though they + are different targets, they may follow the same architecture, which means + they have the same "Backend". + +[Path for TARGET configuration] + - /usr/share/one/target/${TARGET}.ini + +[Path for BACKEND tools] + - /usr/share/one/backends/${BACKEND} """ @@ -62,11 +78,11 @@ def get_list(cmdname): return backends_list -def get_backend_from_target_conf(target: str): +def get_value_from_target_conf(target: str, key: str): dir_path = os.path.dirname(os.path.realpath(__file__)) target_conf_path = dir_path + f'/../../target/{target}.ini' if not os.path.isfile(target_conf_path): - return None + raise FileNotFoundError(f"Not found given target configuration: {target}") # target config doesn't have section. # but, configparser needs configs to have one or more sections.
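A minimal standalone sketch of the section-less INI trick used here (the helper name read_target_conf is hypothetical, and overriding optionxform is an assumption made so the upper-case TARGET/BACKEND keys are not lowercased by configparser's defaults):

import configparser

DUMMY_SECTION = 'dummy_section'

def read_target_conf(target_conf_path: str) -> dict:
    # ${TARGET}.ini files carry no section header, but configparser requires
    # at least one section, so prepend a dummy one before parsing.
    with open(target_conf_path) as f:
        config_str = f'[{DUMMY_SECTION}]\n' + f.read()
    parser = configparser.ConfigParser()
    parser.optionxform = str  # keep TARGET/BACKEND keys as written
    parser.read_string(config_str)
    return dict(parser[DUMMY_SECTION])

# e.g. read_target_conf('/usr/share/one/target/sample.ini') could yield
# {'TARGET': 'sample', 'BACKEND': 'dummy'} for a hypothetical sample.ini.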
@@ -77,11 +93,16 @@ def get_backend_from_target_conf(target: str): parser.read_string(config_str) assert parser.has_section(DUMMY_SECTION) - BACKEND_KEY = 'BACKEND' - if BACKEND_KEY in parser[DUMMY_SECTION]: - return parser[DUMMY_SECTION][BACKEND_KEY] + # Check if target file is valid + TARGET_KEY = 'TARGET' + assert TARGET_KEY in parser[DUMMY_SECTION] + if target != parser[DUMMY_SECTION][TARGET_KEY]: + raise RuntimeError("Invalid target file.") - return None + if key in parser[DUMMY_SECTION]: + return parser[DUMMY_SECTION][key] + + raise RuntimeError(f"Not found '{key}' key in target configuration.") def search_driver(driver): diff --git a/compiler/one-cmds/tests/one-codegen_006.test b/compiler/one-cmds/tests/one-codegen_006.test index d5e3dc86843..6797a3a6a4d 100644 --- a/compiler/one-cmds/tests/one-codegen_006.test +++ b/compiler/one-cmds/tests/one-codegen_006.test @@ -52,7 +52,7 @@ clean_envir() trap_err_onexit() { echo "${filename_ext} FAILED" - rm -rf ../bin/dummy-compile + clean_envir exit 255 } diff --git a/compiler/one-cmds/tests/one-codegen_010.test b/compiler/one-cmds/tests/one-codegen_010.test index a81310e2899..2f942eb4f8b 100644 --- a/compiler/one-cmds/tests/one-codegen_010.test +++ b/compiler/one-cmds/tests/one-codegen_010.test @@ -52,7 +52,7 @@ clean_envir() trap_err_onexit() { echo "${filename_ext} FAILED" - rm -rf ../bin/dummy-compile + clean_envir exit 255 } diff --git a/compiler/one-cmds/tests/one-codegen_011.test b/compiler/one-cmds/tests/one-codegen_011.test index 0b7f9174fdb..5d78d5bc562 100644 --- a/compiler/one-cmds/tests/one-codegen_011.test +++ b/compiler/one-cmds/tests/one-codegen_011.test @@ -53,7 +53,7 @@ clean_envir() trap_err_onexit() { echo "${filename_ext} FAILED" - rm -rf ../bin/dummy-compile + clean_envir exit 255 } diff --git a/compiler/one-cmds/tests/one-codegen_neg_006.cfg b/compiler/one-cmds/tests/one-codegen_neg_006.cfg new file mode 100644 index 00000000000..afa3051d34f --- /dev/null +++ b/compiler/one-cmds/tests/one-codegen_neg_006.cfg @@ -0,0 +1,9 @@ +[onecc] +one-codegen=True + +[backend] +target=one-codegen_neg_006 + +[one-codegen] +o=one-codegen_neg_006.tvn +input=one-codegen_neg_006.circle diff --git a/compiler/one-cmds/tests/one-codegen_neg_006.ini b/compiler/one-cmds/tests/one-codegen_neg_006.ini new file mode 100644 index 00000000000..c128e39f277 --- /dev/null +++ b/compiler/one-cmds/tests/one-codegen_neg_006.ini @@ -0,0 +1,2 @@ +TARGET=one-codegen_neg_006 +BACKEND=dummy diff --git a/compiler/one-cmds/tests/one-codegen_neg_006.py b/compiler/one-cmds/tests/one-codegen_neg_006.py new file mode 100644 index 00000000000..71aba159b01 --- /dev/null +++ b/compiler/one-cmds/tests/one-codegen_neg_006.py @@ -0,0 +1,14 @@ +from onelib import argumentparse +from onelib.argumentparse import DriverName, NormalOption, TargetOption + + +def command_schema(): + parser = argumentparse.ArgumentParser() + parser.add_argument("dummy-compile", action=DriverName) + parser.add_argument("--target", action=TargetOption) + parser.add_argument("--DSP-quota", action=NormalOption) + parser.add_argument("-o", action=NormalOption) + parser.add_argument("--op", "-o", action=NormalOption) # duplicate names + parser.add_argument("input", action=NormalOption) + + return parser diff --git a/compiler/one-cmds/tests/one-codegen_neg_006.test b/compiler/one-cmds/tests/one-codegen_neg_006.test new file mode 100644 index 00000000000..526f9799465 --- /dev/null +++ b/compiler/one-cmds/tests/one-codegen_neg_006.test @@ -0,0 +1,116 @@ +#!/bin/bash + +# Copyright (c) 2024 Samsung 
Electronics Co., Ltd. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# command schema has duplicate names. + +: ' +This test assumes below directories. + +[one hierarchy] + one + ├── backends + │   └── command + │   └── dummy + │   └── codegen.py + ├── bin + ├── doc + ├── include + ├── lib + ├── optimization + ├── target + └── test # pwd +' + +BACKENDS_ALREADY_EXIST=true +CMD_ALREADY_EXIST=true +DUMMY_ALREADY_EXIST=true +TARGET_ALREADY_EXIST=true + +BACKEND_NAME="dummy" + +filename_ext="$(basename -- $0)" +filename="${filename_ext%.*}" + +configfile="one-codegen_neg_006.cfg" +outputfile="one-codegen_neg_006.tvn" +targetfile="one-codegen_neg_006.ini" +commandschema="one-codegen_neg_006.py" + +clean_envir() +{ + rm -rf ../bin/dummy-compile + rm -rf ../target/${targetfile} + rm -rf "../backends/command/${BACKEND_NAME}/codegen.py" + if [ "$TARGET_ALREADY_EXIST" = false ]; then + rm -rf ../target/ + fi + if [ "$DUMMY_ALREADY_EXIST" = false ]; then + rm -rf "../backends/command/${BACKEND_NAME}/" + fi + if [ "$CMD_ALREADY_EXIST" = false ]; then + rm -rf ../backends/command/ + fi + if [ "$BACKENDS_ALREADY_EXIST" = false ]; then + rm -rf ../backends/ + fi +} + +trap_err_onexit() +{ + if grep -q "Duplicate option names" "${filename}.log"; then + echo "${filename_ext} SUCCESS" + clean_envir + exit 0 + fi + + echo "${filename_ext} FAILED" + clean_envir + exit 255 +} + +trap trap_err_onexit ERR + +rm -f ${filename}.log +rm -rf ${outputfile} + +if [ ! -d "../target/" ]; then + mkdir -p ../target/ + TARGET_ALREADY_EXIST=false +fi +if [ ! -d "../backends/" ]; then + mkdir -p ../backends/ + BACKENDS_ALREADY_EXIST=false +fi +if [ ! -d "../backends/command/" ]; then + mkdir -p ../backends/command/ + CMD_ALREADY_EXIST=false +fi +if [ ! 
-d "../backends/command/${BACKEND_NAME}/" ]; then + mkdir -p ../backends/command/${BACKEND_NAME}/ + DUMMY_ALREADY_EXIST=false +fi + +# copy dummy tools to bin folder +cp dummy-compile ../bin/dummy-compile +cp ${targetfile} ../target/ +cp ${commandschema} "../backends/command/${BACKEND_NAME}/codegen.py" + +# run test +onecc -C ${configfile} > ${filename}.log 2>&1 + +clean_envir +echo "${filename_ext} FAILED" +exit 255 diff --git a/compiler/one-cmds/tests/one-profile_006.test b/compiler/one-cmds/tests/one-profile_006.test index 3afb575a238..03a54d52e0e 100644 --- a/compiler/one-cmds/tests/one-profile_006.test +++ b/compiler/one-cmds/tests/one-profile_006.test @@ -52,7 +52,7 @@ clean_envir() trap_err_onexit() { echo "${filename_ext} FAILED" - rm -rf ../bin/dummy-profile + clean_envir exit 255 } diff --git a/compiler/one-cmds/tests/one-profile_010.test b/compiler/one-cmds/tests/one-profile_010.test index 681b9d6dc75..b0c4953bfd2 100644 --- a/compiler/one-cmds/tests/one-profile_010.test +++ b/compiler/one-cmds/tests/one-profile_010.test @@ -52,7 +52,7 @@ clean_envir() trap_err_onexit() { echo "${filename_ext} FAILED" - rm -rf ../bin/dummy-profile + clean_envir exit 255 } diff --git a/compiler/one-cmds/tests/one-profile_011.test b/compiler/one-cmds/tests/one-profile_011.test index db2e1c81196..4c156d76c2a 100644 --- a/compiler/one-cmds/tests/one-profile_011.test +++ b/compiler/one-cmds/tests/one-profile_011.test @@ -52,7 +52,7 @@ clean_envir() trap_err_onexit() { echo "${filename_ext} FAILED" - rm -rf ../bin/dummy-profile + clean_envir exit 255 } diff --git a/compiler/one-cmds/tests/onecc_057.test b/compiler/one-cmds/tests/onecc_057.test index f0076093b2e..83eea6fbf9a 100644 --- a/compiler/one-cmds/tests/onecc_057.test +++ b/compiler/one-cmds/tests/onecc_057.test @@ -19,11 +19,16 @@ filename_ext="$(basename -- $0)" filename="${filename_ext%.*}" -trap_err_onexit() +clean_envir() { - echo "${filename_ext} FAILED" rm -rf ../bin/dummyV2-profile rm -rf ../bin/dummyV3-profile +} + +trap_err_onexit() +{ + echo "${filename_ext} FAILED" + clean_envir exit 255 } @@ -45,7 +50,6 @@ if ! 
grep -q "dummyV3-profile with onecc_057_overwrite" "${filename}.log"; then trap_err_onexit fi -rm -rf ../bin/dummyV2-profile -rm -rf ../bin/dummyV3-profile +clean_envir echo "${filename_ext} SUCCESS" diff --git a/compiler/one-cmds/tests/onecc_060.ini b/compiler/one-cmds/tests/onecc_060.ini index 6d3a9ac3849..23f2a32b75a 100644 --- a/compiler/one-cmds/tests/onecc_060.ini +++ b/compiler/one-cmds/tests/onecc_060.ini @@ -1,2 +1,2 @@ -TARGET=rose +TARGET=onecc_060 BACKEND=dummy diff --git a/compiler/one-cmds/tests/onecc_neg_038.test b/compiler/one-cmds/tests/onecc_neg_038.test index 13c7d75f406..f629fe0c82d 100644 --- a/compiler/one-cmds/tests/onecc_neg_038.test +++ b/compiler/one-cmds/tests/onecc_neg_038.test @@ -69,6 +69,8 @@ clean_envir() trap_err_onexit() { + clean_envir + if grep -q "Only either of option type is allowed: positional or optional" "${filename}.log"; then echo "${filename_ext} SUCCESS" exit 0 @@ -108,5 +110,7 @@ cp onecc_neg_038.py "../backends/command/${BACKEND_NAME}/codegen.py" # run test onecc -C ${configfile} > ${filename}.log 2>&1 +clean_envir + echo "${filename_ext} FAILED" exit 255 diff --git a/compiler/one-cmds/tests/onecc_neg_039.cfg b/compiler/one-cmds/tests/onecc_neg_039.cfg new file mode 100644 index 00000000000..18275d484f8 --- /dev/null +++ b/compiler/one-cmds/tests/onecc_neg_039.cfg @@ -0,0 +1,9 @@ +[onecc] +one-codegen=True + +[backend] +target=onecc_neg_039 + +[one-codegen] +backend=dummy +command=-o onecc_neg_039.tvn onecc_neg_039.circle diff --git a/compiler/one-cmds/tests/onecc_neg_039.ini b/compiler/one-cmds/tests/onecc_neg_039.ini new file mode 100644 index 00000000000..e2bc54b04e1 --- /dev/null +++ b/compiler/one-cmds/tests/onecc_neg_039.ini @@ -0,0 +1,2 @@ +TARGET=rose # invalid name +BACKEND=dummy diff --git a/compiler/one-cmds/tests/onecc_neg_039.test b/compiler/one-cmds/tests/onecc_neg_039.test new file mode 100644 index 00000000000..75a365dea99 --- /dev/null +++ b/compiler/one-cmds/tests/onecc_neg_039.test @@ -0,0 +1,84 @@ +#!/bin/bash + +# Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Invalid target file + +: ' +This test assumes below directories. + +[one hierarchy] + one + ├── backends + ├── bin + ├── doc + ├── include + ├── lib + ├── optimization + ├── target + └── test # pwd +' + +TARGET_ALREADY_EXIST=true + +filename_ext="$(basename -- $0)" +filename="${filename_ext%.*}" + +configfile="onecc_neg_039.cfg" +outputfile="onecc_neg_039.tvn" +targetfile="onecc_neg_039.ini" + +clean_envir() +{ + rm -rf ../bin/dummy-compile + rm -rf ../target/${targetfile} + if [ "$TARGET_ALREADY_EXIST" = false ]; then + rm -rf ../target/ + fi +} + +trap_err_onexit() +{ + if grep -q "Invalid target file" "${filename}.log"; then + echo "${filename_ext} SUCCESS" + clean_envir + exit 0 + fi + + echo "${filename_ext} FAILED" + clean_envir + exit 255 +} + +trap trap_err_onexit ERR + +rm -f ${filename}.log +rm -rf ${outputfile} + +if [ ! 
-d "../target/" ]; then + mkdir -p ../target/ + TARGET_ALREADY_EXIST=false +fi + +# copy dummy tools to bin folder +cp dummy-compile ../bin/dummy-compile +cp ${targetfile} ../target/ + +# run test +onecc -C ${configfile} > ${filename}.log 2>&1 + +echo "${filename_ext} FAILED" +clean_envir +exit 255 diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h index c75ae9a5086..119a5e5d1b8 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h @@ -50,46 +50,16 @@ #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" #include "arm_compute/runtime/MemoryGroup.h" -#include "src/core/CL/kernels/CLTransposeKernel.h" +#include "arm_compute/runtime/CL/functions/CLTranspose.h" namespace arm_compute { -/** Basic function to reshape the weights of Fully Connected layer with OpenCL. This function calls - * the following kernels: - * - * -# @ref CLTransposeKernel - * - * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. - */ -class CLFullyConnectedHybridLayerReshapeWeights : public ICLSimpleFunction -{ -public: - /** Set the input and output tensors. - * - * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported: - * S8. - * @param[out] output Destination tensor which stores the transposed input tensor. Data type - * supported: Same as @p input. - */ - void configure(const ICLTensor *input, ICLTensor *output); - /** Static function to check if given info will lead to a valid configuration of @ref - * CLFullyConnectedHybridLayerReshapeWeights - * - * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported: - * S8. - * @param[in] output Destination tensor which stores the transposed input tensor. Data type - * supported: Same as @p input. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output); -}; /** Basic function to compute a Fully Connected layer on OpenCL. 
This function calls the following * OpenCL kernels: * * -# @ref CLIm2ColKernel (called when the input comes from a convolutional layer) - * -# @ref CLFullyConnectedHybridLayerReshapeWeights (if @p are_weights_reshaped is set to false + * -# @ref CLTranspose (if @p are_weights_reshaped is set to false * and transpose_weights is set to true ) (called once) * -# @ref CLGEMMLowpMatrixMultiplyCore (if quantized symmetric) * -# @ref CLGEMMMatrixAccumulateBiasesKernel (if @p biases is not equal to nullptr) @@ -165,7 +135,7 @@ class CLFullyConnectedHybridLayer : public IFunction bool retain_internal_weights); MemoryGroup _memory_group; - CLFullyConnectedHybridLayerReshapeWeights _reshape_weights_kernel; + CLTranspose _reshape_weights_kernel; CLScaleFactorSymm8Kernel _scale_factor_kernel; CLQuantizationSymmetricKernel _quant_input_kernel; CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp; diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h index c08da526aab..919f019aceb 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h @@ -50,45 +50,15 @@ #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" #include "arm_compute/runtime/IWeightsManager.h" #include "arm_compute/runtime/MemoryGroup.h" -#include "src/core/CL/kernels/CLTransposeKernel.h" +#include "arm_compute/runtime/CL/functions/CLTranspose.h" namespace arm_compute { -/** Basic function to reshape the weights of Fully Connected layer with OpenCL. This function calls - * the following kernels: - * - * -# @ref CLTransposeKernel - * - * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. - */ -class CLFullyConnectedLayerReshapeWeightsEx : public ICLSimpleFunction -{ -public: - /** Set the input and output tensors. - * - * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported: - * QASYMM8/F16/F32. - * @param[out] output Destination tensor which stores the transposed input tensor. Data type - * supported: Same as @p input. - */ - void configure(const ICLTensor *input, ICLTensor *output); - /** Static function to check if given info will lead to a valid configuration of @ref - * CLFullyConnectedLayerReshapeWeightsEx - * - * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported: - * QASYMM8/F16/F32. - * @param[in] output Destination tensor which stores the transposed input tensor. Data type - * supported: Same as @p input. 
- * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output); -}; namespace weights_transformations { /** Basic function to manage the reshape weights generated from @ref - * CLFullyConnectedLayerReshapeWeightsEx */ + * CLTranspose */ class CLFullyConnectedLayerReshapeWeightsExManaged : public ITransformWeights { public: @@ -118,7 +88,7 @@ class CLFullyConnectedLayerReshapeWeightsExManaged : public ITransformWeights private: static constexpr uint32_t _uid = 0x0; CLTensor _output{}; - CLFullyConnectedLayerReshapeWeightsEx _func{}; + CLTranspose _func{}; }; } // namespace weights_transformations @@ -209,7 +179,7 @@ class CLFullyConnectedLayerEx : public IFunction weights_transformations::CLFullyConnectedLayerReshapeWeightsExManaged _reshape_weights_managed_function; CLFlattenLayer _flatten_layer; - CLFullyConnectedLayerReshapeWeightsEx _reshape_weights_function; + CLTranspose _reshape_weights_function; CLGEMM _mm_gemm; CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp; CLTensor _flatten_output; diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h index ee1879aaa1c..f60565da041 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h @@ -43,8 +43,7 @@ #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/core/CL/kernels/CLPadLayerKernelEx.h" -#include "src/core/gpu/cl/kernels/ClCopyKernel.h" -// #include "arm_compute/runtime/CL/functions/CLCopy.h" +#include "arm_compute/runtime/CL/functions/CLCopy.h" #include <memory> namespace arm_compute @@ -123,7 +122,7 @@ class CLPadLayerEx : public IFunction void configure_reflect_mode(ICLTensor *input, ICLTensor *output); std::unique_ptr<CLPadLayerKernelEx> _pad_kernel; - std::unique_ptr<opencl::kernels::ClCopyKernel> _copy_kernel; + std::unique_ptr<CLCopy> _copy_kernel; bool _perform_pad; }; } // namespace arm_compute diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h index 21459271020..13b224167fa 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h @@ -48,43 +48,15 @@ #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" #include "arm_compute/runtime/Tensor.h" -#include "src/core/NEON/kernels/NETransposeKernel.h" +#include "arm_compute/runtime/NEON/functions/NETranspose.h" namespace arm_compute { -/** Basic function to reshape the weights of Fully Connected layer with NEON. This function calls - * the following kernels: - * - * -# @ref NETransposeKernel - * - * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. - */ -class NEFullyConnectedHybridLayerReshapeWeights : public INESimpleFunctionNoBorder -{ -public: - /** Set the input and output tensors. - * - * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported: - * QASYMM8/F16/F32. - * @param[out] output Destination tensor. Data type supported: Same as @p input.
- */ - void configure(const ITensor *input, ITensor *output); - /** Static function to check if given info will lead to a valid configuration of @ref - * NEFullyConnectedHybridLayerReshapeWeights - * - * @param[in] input Weights tensor info. The weights must be 2 dimensional. Data types supported: - * QASYMM8/F16/F32. - * @param[in] output Destination tensor info. Data type supported: Same as @p input. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output); -}; /** Basic function to compute a Fully Connected layer on NEON. This function calls the following * NEON kernels: * -# @ref NEIm2ColKernel (called when the input comes from a convolutional layer) - * -# @ref NEFullyConnectedHybridLayerReshapeWeights (if @p are_weights_reshaped is set to false + * -# @ref NETranspose (if @p are_weights_reshaped is set to false * and transpose_weights is set to true ) (called once) * -# @ref NEGEMMMatrixMultiplyKernel or @ref NEGEMMLowpMatrixMultiplyCore (if quantized * asymmetric) @@ -162,7 +134,7 @@ class NEFullyConnectedHybridLayer : public IFunction void configure_mm(const ITensor *input, const ITensor *weights, ITensor *output); MemoryGroup _memory_group; - NEFullyConnectedHybridLayerReshapeWeights _reshape_weights_function; + NETranspose _reshape_weights_function; NEQuantizationSymmetricKernel _quant_input_kernel; NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp; NEMultiplyScaleFactorKernel _multiply_scale_kernel; diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h index 2bbb1fea126..aaceeaa99d1 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h @@ -51,21 +51,17 @@ #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/Tensor.h" -#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h" -#include "src/core/NEON/kernels/NETransposeKernel.h" +#include "arm_compute/runtime/NEON/functions/NETranspose.h" namespace arm_compute { /** Basic function to compute a Fully Connected layer on NEON. This function calls the following * NEON kernels: * -# @ref NEIm2ColKernel (called when the input comes from a convolutional layer) - * -# @ref NEFullyConnectedLayerReshapeWeights (if @p are_weights_reshaped is set to false and + * -# @ref NETranspose (if @p are_weights_reshaped is set to false and * transpose_weights is set to true ) (called once) * -# @ref NEGEMMMatrixMultiplyKernel or @ref NEGEMMLowpMatrixMultiplyCore (if quantized * asymmetric) - * -# @ref NEGEMMMatrixAccumulateBiasesKernel or @ref - * NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if quantized asymmetric) (if @p biases is - * not equal to nullptr) * * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. 
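 * @note A typical call sequence, sketched under the usual IFunction workflow (the tensor
 *       and fc_info names below are illustrative, not part of this header):
 * @code
 * NEFullyConnectedLayerEx fc;
 * fc.configure(&input, &weights, &biases, &output, fc_info); // configure once
 * fc.run();                                                  // then run per inference
 * @endcode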
* @note The difference from NEFullyConnectedLayer is that this class supports weights as input @@ -136,29 +132,28 @@ class NEFullyConnectedLayerEx : public IFunction void prepare() override; private: - void configure_fc_fc(const ITensor *input, const ITensor *weights, ITensor *output); - void configure_conv_fc(const ITensor *input, const ITensor *weights, ITensor *output); - void configure_mm(const ITensor *input, const ITensor *weights, ITensor *output); + void configure_fc_fc(const ITensor *input, const ITensor *weights, const ITensor *bias, + ITensor *output, const FullyConnectedLayerInfo &fc_info); + void configure_conv_fc(const ITensor *input, const ITensor *weights, const ITensor *bias, + ITensor *output, const FullyConnectedLayerInfo &fc_info); + void configure_mm(const ITensor *input, const ITensor *weights, const ITensor *bias, + ITensor *output, const FullyConnectedLayerInfo &fc_info); MemoryGroup _memory_group; - NEFlattenLayer _flatten_kernel; NEConvertFullyConnectedWeights _convert_weights; - NEFullyConnectedLayerReshapeWeights _reshape_weights_function; + NEFlattenLayer _flatten_kernel; + NETranspose _reshape_weights_function; NEGEMM _mm_gemm; NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp; - NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint _gemmlowp_output_stage; - NEGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel; Tensor _flatten_output; - Tensor _gemmlowp_output; Tensor _converted_weights_output; Tensor _reshape_weights_output; - const ITensor *_original_weights; bool _are_weights_converted; bool _are_weights_reshaped; bool _is_fc_after_conv; - bool _accumulate_biases; bool _is_quantized; bool _is_prepared; + const ITensor *_original_weights; }; } // namespace arm_compute #endif /* __ARM_COMPUTE_NEFULLYCONNECTEDLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp index 464f60deec8..290343ae446 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp @@ -44,6 +44,7 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "src/core/helpers/AutoConfiguration.h" @@ -164,7 +165,7 @@ void CLDirectTransposeConvLayer::configure(const CLCompileContext &compile_conte _original_weights = weights; _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32)); _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout)); - _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis); + _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis, false); auto out_dims = transposeconv_output_dimensions( input->info()->dimension(idx_w), input->info()->dimension(idx_h), diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp index af936e873e4..b07555ee7d7 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp @@ -65,19 +65,6 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I } } // namespace -void 
CLFullyConnectedHybridLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output) -{ - auto k = std::make_unique<CLTransposeKernel>(); - k->configure(input, output); - _kernel = std::move(k); -} - -Status CLFullyConnectedHybridLayerReshapeWeights::validate(const ITensorInfo *input, - const ITensorInfo *output) -{ - return CLTransposeKernel::validate(input, output); -} - CLFullyConnectedHybridLayer::CLFullyConnectedHybridLayer( std::shared_ptr<IMemoryManager> memory_manager) : _memory_group(memory_manager), _reshape_weights_kernel(), _quant_input_kernel(), @@ -245,8 +232,7 @@ Status CLFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe if (!weights_reshaped) { // Validate reshape weights kernel - ARM_COMPUTE_RETURN_ON_ERROR( - CLFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights)); + ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(weights_to_use, &reshaped_weights)); weights_to_use = &reshaped_weights; } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp index c6a88d3409f..9f8c3390041 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp @@ -45,6 +45,7 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/function_info/GEMMInfo.h" #include "support/Cast.h" @@ -109,8 +110,13 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I fc_info.retain_internal_weights, // retain_internal_weights gemmlowp_output_stage, // gemmlowp_output_stage fc_info.fp_mixed_precision, // fp_mixed_precision + false, // fast_math true, // broadcast_bias - ActivationLayerInfo()); // activation_info + ActivationLayerInfo(), // activation_info + false, // fixed_format + WeightFormat::OHWI, // weight_format + false, // pretranspose_B + bias != nullptr); // accumulate if (is_data_type_quantized_asymmetric(input.data_type())) { @@ -139,19 +145,6 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I } } // namespace -void CLFullyConnectedLayerReshapeWeightsEx::configure(const ICLTensor *input, ICLTensor *output) -{ - auto k = std::make_unique<CLTransposeKernel>(); - k->configure(input, output); - _kernel = std::move(k); -} - -Status CLFullyConnectedLayerReshapeWeightsEx::validate(const ITensorInfo *input, - const ITensorInfo *output) -{ - return CLTransposeKernel::validate(input, output); -} - CLFullyConnectedLayerEx::CLFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager) : _memory_group(memory_manager), _weights_manager(weights_manager), _convert_weights(), @@ -178,8 +171,13 @@ void CLFullyConnectedLayerEx::configure_mm(const ICLTensor *input, const ICLTens fc_info.retain_internal_weights, // retain_internal_weights gemmlowp_output_stage, // gemmlowp_output_stage fc_info.fp_mixed_precision, // fp_mixed_precision + false, // fast_math true, // broadcast_bias - ActivationLayerInfo()); // activation_info + ActivationLayerInfo(), // activation_info + false, // fixed_format + WeightFormat::OHWI, // weight_format + false, // pretranspose_B + bias != nullptr); // accumulate if (_is_quantized) { @@ -358,11 +356,9 @@ Status CLFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor bool weights_reshaped = fc_info.transpose_weights ?
fc_info.are_weights_reshaped : true; bool is_fc_after_conv = true; - const ITensorInfo &flatten_input = TensorInfo(input->clone() - ->set_is_resizable(true) - .reset_padding() - .set_tensor_shape(compute_flatten_shape(input)) - .set_data_layout(DataLayout::NCHW)); + const ITensorInfo &flatten_input = + TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_flatten_shape(input))); const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( compute_transposed_shape(*weights))); @@ -395,8 +391,7 @@ Status CLFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor if (!weights_reshaped) { // Validate reshape weights kernel - ARM_COMPUTE_RETURN_ON_ERROR( - CLFullyConnectedLayerReshapeWeightsEx::validate(weights, &reshaped_weights)); + ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(weights, &reshaped_weights)); weights_to_use = &reshaped_weights; } @@ -507,77 +502,6 @@ void CLFullyConnectedLayerEx::run() void CLFullyConnectedLayerEx::prepare() { -#if 0 // TODO Remove this block - if(!_is_prepared) - { - if(!_weights_manager) - { - ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - } - - auto release_unused = [](CLTensor * w) - { - if(!w->is_used()) - { - CLScheduler::get().queue().finish(); - w->allocator()->free(); - } - }; - - // Pointer to current weights - const ICLTensor *cur_weights = _original_weights; - - // Reshape of the weights if needed (happens only once) - if(!_are_weights_reshaped) - { - if(_weights_manager && _weights_manager->are_weights_managed(_original_weights)) - { - cur_weights = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->run(cur_weights, &_reshape_weights_managed_function)); - } - else - { - // Run reshape weights kernel and mark weights as unused - _reshape_weights_output.allocator()->allocate(); - _reshape_weights_function.run(); - - cur_weights->mark_as_unused(); - cur_weights = &_reshape_weights_output; - } - _are_weights_reshaped = true; - } - - // Convert weights if needed (happens only once) - if(!_are_weights_converted) - { - if(_weights_manager && _weights_manager->are_weights_managed(cur_weights)) - { - _weights_manager->run(cur_weights, &_convert_weights_managed); - } - else - { - _converted_weights_output.allocator()->allocate(); - _convert_weights.run(); - cur_weights->mark_as_unused(); - } - - _are_weights_converted = true; - } - - // Release reshaped weights if unused - release_unused(&_reshape_weights_output); - - // Prepare GEMM prepare and release unused weights - if(!_is_quantized) - { - _mm_gemm.prepare(); - } - - // Release converted weights if unused - release_unused(&_reshape_weights_output); - release_unused(&_converted_weights_output); - - _is_prepared = true; - } -#endif + // DO NOTHING } } // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp index 4d940e96632..e67bb1ce6ea 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp @@ -43,8 +43,8 @@ namespace arm_compute { CLPadLayerEx::CLPadLayerEx() - : _pad_kernel(std::make_unique<CLPadLayerKernelEx>()), - _copy_kernel(std::make_unique<opencl::kernels::ClCopyKernel>()), _perform_pad(false) + : _pad_kernel(std::make_unique<CLPadLayerKernelEx>()), _copy_kernel(std::make_unique<CLCopy>()), + _perform_pad(false) { } @@ -74,7 +74,7 @@ void CLPadLayerEx::configure(const CLCompileContext &compile_context, ICLTensor Window copy_window = Window();
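// A minimal usage sketch of the function-level CLCopy configured below
// (assuming ICLTensor *input / *output as in this method; a standalone
// copy would look the same outside this class):
//   CLCopy copy;
//   copy.configure(compile_context, input, output, &copy_window);
//   copy.run();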
copy_window.use_tensor_dimensions(output->info()->tensor_shape()); // Copy the input to the whole output if no padding is applied - _copy_kernel->configure(compile_context, input->info(), output->info(), &copy_window); + _copy_kernel->configure(compile_context, input, output, &copy_window); } } Status CLPadLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output, @@ -92,7 +92,7 @@ Status CLPadLayerEx::validate(const ITensorInfo *input, const ITensorInfo *outpu } else { - ARM_COMPUTE_RETURN_ON_ERROR(opencl::kernels::ClCopyKernel::validate(input, output)); + ARM_COMPUTE_RETURN_ON_ERROR(CLCopy::validate(input, output)); } return Status{}; } @@ -104,7 +104,7 @@ void CLPadLayerEx::run() } else { - CLScheduler::get().enqueue(*_copy_kernel); + _copy_kernel->run(); } } } // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp index f3f093c188d..af0bc49e168 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp @@ -44,6 +44,7 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include <cmath> #include <memory> diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp index fbd88fff0a9..4505122dc75 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp @@ -64,19 +64,6 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I } } // namespace -void NEFullyConnectedHybridLayerReshapeWeights::configure(const ITensor *input, ITensor *output) -{ - auto k = std::make_unique<NETransposeKernel>(); - k->configure(input, output); - _kernel = std::move(k); -} - -Status NEFullyConnectedHybridLayerReshapeWeights::validate(const ITensorInfo *input, - const ITensorInfo *output) -{ - return NETransposeKernel::validate(input, output); -} - NEFullyConnectedHybridLayer::NEFullyConnectedHybridLayer( std::shared_ptr<IMemoryManager> memory_manager) : _memory_group(std::move(memory_manager)), _reshape_weights_function(), _quant_input_kernel(), @@ -108,6 +95,7 @@ void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor _are_weights_reshaped = fc_info.transpose_weights ?
fc_info.are_weights_reshaped : true; _accumulate_biases = false; + _is_prepared = fc_info.retain_internal_weights; _original_weights = weights; // Configure accumulate biases kernel for non quantized asymmetric types @@ -129,7 +117,7 @@ void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor // Check if we have a fully connected layer with batches const bool is_batched_fc_layer = output->info()->dimension(1) > 1; - bool _is_fc_after_conv; + bool _is_fc_after_conv = false; if (is_batched_fc_layer) { _is_fc_after_conv = @@ -143,7 +131,7 @@ void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor } ARM_COMPUTE_ERROR_ON_MSG(_is_fc_after_conv, "NEFullyConnectedHybridLayer does not support after conv"); - (void)_is_fc_after_conv; + ARM_COMPUTE_UNUSED(_is_fc_after_conv); // Reshape weights if needed if (!_are_weights_reshaped) @@ -216,8 +204,7 @@ Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe if (!weights_reshaped) { // Validate reshape weights kernel - ARM_COMPUTE_RETURN_ON_ERROR( - NEFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights)); + ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(weights_to_use, &reshaped_weights)); weights_to_use = &reshaped_weights; } diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp index 758f7dc59cb..36adc045d11 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp @@ -56,8 +56,66 @@ using namespace arm_compute::misc::shape_calculator; namespace { -Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output) +Status construct_gemmlowp_output_stage(const ITensorInfo &input, const ITensorInfo &weights, + const ITensorInfo &output, + GEMMLowpOutputStageInfo &gemmlowp_output_stage) { + gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + gemmlowp_output_stage.gemmlowp_offset = 0; + gemmlowp_output_stage.gemmlowp_multiplier = 0; + gemmlowp_output_stage.gemmlowp_shift = 0; + + // Configure output stage for quantized case + if (is_data_type_quantized_asymmetric(input.data_type())) + { + const UniformQuantizationInfo iq_info = input.quantization_info().uniform(); + const UniformQuantizationInfo wq_info = weights.quantization_info().uniform(); + const UniformQuantizationInfo oq_info = output.quantization_info().uniform(); + + const auto output_quant_info = (output.total_size() == 0) ? 
iq_info : oq_info; + + const float multiplier = (iq_info.scale * wq_info.scale) / output_quant_info.scale; + int output_multiplier = 0; + int output_shift = 0; + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one( + multiplier, &output_multiplier, &output_shift)); + + // Set the GEMMLowp output stage info + gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset; + gemmlowp_output_stage.gemmlowp_multiplier = output_multiplier; + gemmlowp_output_stage.gemmlowp_shift = output_shift; + gemmlowp_output_stage.gemmlowp_min_bound = 0; + gemmlowp_output_stage.gemmlowp_max_bound = 255; + gemmlowp_output_stage.gemmlowp_multipliers.push_back(output_multiplier); + gemmlowp_output_stage.gemmlowp_shifts.push_back(output_shift); + } + + return Status{}; +} + +Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo *bias, + const ITensorInfo &output, const FullyConnectedLayerInfo &fc_info) +{ + GEMMLowpOutputStageInfo gemmlowp_output_stage; + ARM_COMPUTE_RETURN_ON_ERROR( + construct_gemmlowp_output_stage(input, weights, output, gemmlowp_output_stage)); + + const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped + false, // is_b_reshaped + true, // reshape_b_only_on_first_run + 0, // depth_output_gemm3d + false, // reinterpret_input_as_3d + fc_info.retain_internal_weights, // retain_internal_weights + gemmlowp_output_stage, // gemmlowp_output_stage + fc_info.fp_mixed_precision, // fp_mixed_precision + false, // fast_math + true, // broadcast_bias + ActivationLayerInfo(), // activation_info + false, // fixed_format + WeightFormat::OHWI, // weight_format + false, // pretranspose_B + bias != nullptr); // accumulate + if (is_data_type_quantized_asymmetric(input.data_type())) { // Since we need negative offsets for computing convolution, we need to change @@ -71,13 +129,13 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I // Validate gemmlowp function ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate( &input.clone()->set_quantization_info(input_quantization_info), - &weights.clone()->set_quantization_info(weights_quantization_info), nullptr, &output)); + &weights.clone()->set_quantization_info(weights_quantization_info), bias, &output, + gemm_info)); } else { ARM_COMPUTE_RETURN_ON_ERROR( - NEGEMM::validate(&input, &weights, nullptr, &output, 1.f, 0.0f, - GEMMInfo(false, false, false /* Reshape weights only for the first run */))); + NEGEMM::validate(&input, &weights, bias, &output, 1.f, 0.0f, gemm_info)); } return Status{}; @@ -85,18 +143,38 @@ } // namespace NEFullyConnectedLayerEx::NEFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _flatten_kernel(), _convert_weights(), - _reshape_weights_function(), _mm_gemm(), _mm_gemmlowp(), _gemmlowp_output_stage(), - _accumulate_biases_kernel(), _flatten_output(), _gemmlowp_output(), _converted_weights_output(), - _reshape_weights_output(), _original_weights(nullptr), _are_weights_converted(true), - _are_weights_reshaped(false), _is_fc_after_conv(false), _accumulate_biases(false), - _is_quantized(false), _is_prepared(false) + : _memory_group(std::move(memory_manager)), _convert_weights(), _flatten_kernel(), + _reshape_weights_function(), _mm_gemm(), _mm_gemmlowp(), _flatten_output(), + _converted_weights_output(), _reshape_weights_output(), _are_weights_converted(true), + _are_weights_reshaped(false),
_is_fc_after_conv(false), _is_quantized(false), + _is_prepared(false), _original_weights(nullptr) { } void NEFullyConnectedLayerEx::configure_mm(const ITensor *input, const ITensor *weights, - ITensor *output) + const ITensor *bias, ITensor *output, + const FullyConnectedLayerInfo &fc_info) { + GEMMLowpOutputStageInfo gemmlowp_output_stage; + construct_gemmlowp_output_stage(*input->info(), *weights->info(), *output->info(), + gemmlowp_output_stage); + + const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped + false, // is_b_reshaped + true, // reshape_b_only_on_first_run + 0, // depth_output_gemm3d + false, // reinterpret_input_as_3d + fc_info.retain_internal_weights, // retain_internal_weights + gemmlowp_output_stage, // gemmlowp_output_stage + fc_info.fp_mixed_precision, // fp_mixed_precision + false, // fast_math + true, // broadcast_bias + ActivationLayerInfo(), // activation_info + false, // fixed_format + WeightFormat::OHWI, // weight_format + false, // pretranspose_B + bias != nullptr); // accumulate + if (_is_quantized) { // Since we need negative offsets for computing convolution, we need to change @@ -111,7 +189,7 @@ void NEFullyConnectedLayerEx::configure_mm(const ITensor *input, const ITensor * weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); // Configure gemmlowp function - _mm_gemmlowp.configure(input, weights, nullptr, output); + _mm_gemmlowp.configure(input, weights, bias, output, gemm_info); // Revert back QuantizatioInfo as input and weights could be used in other fully connected // layers @@ -121,13 +199,13 @@ void NEFullyConnectedLayerEx::configure_mm(const ITensor *input, const ITensor * else { // Configure matrix multiply kernel - _mm_gemm.configure(input, weights, nullptr, output, 1.f, 0.0f, - GEMMInfo(false, false, false /* Reshape weights only for the first run */)); + _mm_gemm.configure(input, weights, bias, output, 1.f, 1.0f, gemm_info); } } void NEFullyConnectedLayerEx::configure_conv_fc(const ITensor *input, const ITensor *weights, - ITensor *output) + const ITensor *bias, ITensor *output, + const FullyConnectedLayerInfo &fc_info) { ARM_COMPUTE_ERROR_ON( (weights->info()->dimension(1) != @@ -146,19 +224,20 @@ void NEFullyConnectedLayerEx::configure_conv_fc(const ITensor *input, const ITen _flatten_kernel.configure(input, &_flatten_output); // Configure matrix multiply kernel - configure_mm(&_flatten_output, weights, output); + configure_mm(&_flatten_output, weights, bias, output, fc_info); // Allocate the output tensor for flatten once all the configure methods have been called _flatten_output.allocator()->allocate(); } void NEFullyConnectedLayerEx::configure_fc_fc(const ITensor *input, const ITensor *weights, - ITensor *output) + const ITensor *bias, ITensor *output, + const FullyConnectedLayerInfo &fc_info) { ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1)); // Configure matrix multiply kernel - configure_mm(input, weights, output); + configure_mm(input, weights, bias, output, fc_info); } void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *weights, @@ -174,26 +253,9 @@ void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *wei _are_weights_converted = true; _are_weights_reshaped = fc_info.transpose_weights ?
fc_info.are_weights_reshaped : true; _is_fc_after_conv = true; - _accumulate_biases = false; _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); _original_weights = weights; - // Configure gemmlowp output - if (_is_quantized) - { - _gemmlowp_output.allocator()->init( - output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); - } - - // Configure accumulate biases kernel for non quantized asymmetric types - if (biases != nullptr && !_is_quantized) - { - _accumulate_biases = true; - - // Configure accumulate biases kernel - _accumulate_biases_kernel.configure(output, biases); - } - // With the Fully Connected layer we can have 4 different cases: // 1) Convolution layer -> Fully Connected layer without batches // 2) Fully Connected layer -> Fully Connected layer without batches @@ -235,32 +297,15 @@ void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *wei _are_weights_converted = false; } - ITensor *tmp_output = (_is_quantized) ? &_gemmlowp_output : output; if (_is_fc_after_conv) { // Fully Connected layer after a Convolution Layer without batches - configure_conv_fc(input, weights_to_use, tmp_output); + configure_conv_fc(input, weights_to_use, biases, output, fc_info); } else { // Fully Connected layer after a Fully Connected Layer without batches - configure_fc_fc(input, weights_to_use, tmp_output); - } - - // Configure output stage for asymmetric quantized types - if (_is_quantized) - { - float multiplier = input->info()->quantization_info().uniform().scale * - weights->info()->quantization_info().uniform().scale / - output->info()->quantization_info().uniform().scale; - int output_multiplier; - int output_shift; - quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, - &output_shift); - _gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier, - output_shift, - output->info()->quantization_info().uniform().offset); - _gemmlowp_output.allocator()->allocate(); + configure_fc_fc(input, weights_to_use, biases, output, fc_info); } _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights; @@ -279,7 +324,6 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; bool is_fc_after_conv = true; - bool is_quantized = is_data_type_quantized_asymmetric(input->data_type()); const ITensorInfo &flatten_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( @@ -290,15 +334,6 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor const ITensorInfo &converted_weights = weights_reshaped ? 
TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone()); - const ITensorInfo &gemmlowp_output = TensorInfo( - output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); - - // Configure accumulate biases kernel for non quantized asymmetric types - if (biases != nullptr && !is_quantized) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAccumulateBiasesKernel::validate(output, biases)); - } // With the Fully Connected layer we can have 4 different cases: // 1) Convolution layer -> Fully Connected layer without batches @@ -308,7 +343,6 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor const ITensorInfo *input_to_use = input; const ITensorInfo *weights_to_use = weights; - const ITensorInfo *tmp_output = (is_quantized) ? &gemmlowp_output : output; // Check if we have a fully connected layer with batches const bool is_batched_fc_layer = output->dimension(1) > 1; @@ -327,8 +361,7 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor if (!weights_reshaped) { // Validate reshape weights kernel - ARM_COMPUTE_RETURN_ON_ERROR( - NEFullyConnectedLayerReshapeWeights::validate(weights, &reshaped_weights)); + ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(weights, &reshaped_weights)); weights_to_use = &reshaped_weights; } @@ -357,14 +390,8 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1)); } // Validate matrix multiply kernel - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*input_to_use, *weights_to_use, *tmp_output)); - - // Validate output stage for asymmetric quantized types - if (is_quantized) - { - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate( - &gemmlowp_output, biases, output)); - } + ARM_COMPUTE_RETURN_ON_ERROR( + validate_mm(*input_to_use, *weights_to_use, biases, *output, fc_info)); return Status{}; } @@ -374,13 +401,9 @@ void NEFullyConnectedLayerEx::run() if (!_is_prepared) { if (!_are_weights_reshaped) - { _reshape_weights_output.allocator()->allocate(); - } if (!_are_weights_converted) - { _converted_weights_output.allocator()->allocate(); - } _is_prepared = true; } @@ -423,75 +446,10 @@ void NEFullyConnectedLayerEx::run() { _mm_gemm.run(); } - - // Accumulate biases if provided - if (_is_quantized) - { - _gemmlowp_output_stage.run(); - } - else - { - if (_accumulate_biases) - { - NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY); - } - } } void NEFullyConnectedLayerEx::prepare() { -#if 0 // TODO Remove this block - if (!_is_prepared) - { - ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - - auto release_unused = [](Tensor *w) { - if (!w->is_used()) - { - w->allocator()->free(); - } - }; - - // Pointer to current weights - const ITensor *cur_weights = _original_weights; - - // Reshape of the weights (happens only once) - if (!_are_weights_reshaped) - { - // Run reshape weights kernel and mark weights as unused - _reshape_weights_output.allocator()->allocate(); - _reshape_weights_function.run(); - - cur_weights->mark_as_unused(); - cur_weights = &_reshape_weights_output; - _are_weights_reshaped = true; - } - - // Convert weights if needed (happens only once) - if (!_are_weights_converted) - { - _converted_weights_output.allocator()->allocate(); - _convert_weights.run(); - - cur_weights->mark_as_unused(); - 
_are_weights_converted = true; - } - - // Release reshaped weights if unused - release_unused(&_reshape_weights_output); - - // Prepare GEMM prepare and release unused weights - if (!_is_quantized) - { - _mm_gemm.prepare(); - } - - // Release converted weights if unused - release_unused(&_reshape_weights_output); - release_unused(&_converted_weights_output); - - _is_prepared = true; - } -#endif + // DO NOTHING } } // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp index 2199839fb86..a525214d349 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp @@ -41,7 +41,9 @@ void NEFullyConnectedReshapingLayer::configure(const arm_compute::ITensor *input if (_needs_reshape) { // reshape - auto_init_if_empty(*_neon_buffer.info(), _input->info()->clone()->set_tensor_shape(reshape)); + auto_init_if_empty(*_neon_buffer.info(), + _input->info()->clone()->set_tensor_shape(reshape).set_data_layout( + _input->info()->data_layout())); _neon_reshape.configure(_input, &_neon_buffer); input_to_use = &_neon_buffer; } @@ -53,11 +55,10 @@ void NEFullyConnectedReshapingLayer::configure(const arm_compute::ITensor *input fc->configure(input_to_use, _weights, _biases, _output); return std::unique_ptr<arm_compute::IFunction>(fc); } - else + else if (kernel_type == KernelType::PREPROCESSED_WEIGHTS) { - assert(kernel_type == KernelType::PREPROCESSED_WEIGHTS); - - bool is_hybrid = input->info()->data_type() == DataType::F32 && + bool is_hybrid = (input->info()->data_type() == DataType::F32 || + input->info()->data_type() == DataType::F16) && (weights->info()->data_type() == DataType::QSYMM8 || weights->info()->data_type() == DataType::QASYMM8_SIGNED); @@ -78,6 +79,10 @@ void NEFullyConnectedReshapingLayer::configure(const arm_compute::ITensor *input return std::unique_ptr<arm_compute::IFunction>(fc); } } + else + { + throw std::runtime_error("NEFullyConnectedReshapingLayer: Unsupported kernel type"); + } }(); // NOTE _neon_buffer is inaccessible from outside, and thus it is safe to invoke allocate here.
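Editor's note on the two hunks above: the net effect is that bias handling moves out of the removed accumulate-biases kernel and into the GEMM call itself (beta = 1.0f plus the broadcast_bias/accumulate flags in GEMMInfo). Below is a minimal sketch of that pattern against the plain ACL v24.07 API; the function name and the three-argument GEMMInfo are illustrative assumptions, not code from this patch.

#include <arm_compute/runtime/NEON/functions/NEGEMM.h>

// Sketch: the bias tensor rides through NEGEMM as the third operand instead of
// being added by a separate accumulate-biases kernel run afterwards.
void fc_with_fused_bias(const arm_compute::ITensor *input, const arm_compute::ITensor *weights,
                        const arm_compute::ITensor *bias, arm_compute::ITensor *output)
{
  arm_compute::NEGEMM gemm;
  // beta = 1.0f keeps the bias operand in the accumulation, mirroring
  // _mm_gemm.configure(input, weights, bias, output, 1.f, 1.0f, gemm_info) above.
  gemm.configure(input, weights, bias, output, 1.f, 1.0f,
                 arm_compute::GEMMInfo(false /* is_a_reshaped */, false /* is_b_reshaped */,
                                       true /* reshape_b_only_on_first_run */));
  gemm.run();
}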
diff --git a/infra/cmake/packages/ARMComputeSourceConfig.cmake b/infra/cmake/packages/ARMComputeSourceConfig.cmake index 16e12bbcaba..c2ad30b67ef 100644 --- a/infra/cmake/packages/ARMComputeSourceConfig.cmake +++ b/infra/cmake/packages/ARMComputeSourceConfig.cmake @@ -8,7 +8,7 @@ function(_ARMComputeSource_import) nnas_include(OptionTools) envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com") - set(ARMCOMPUTE_URL ${EXTERNAL_DOWNLOAD_SERVER}/ARM-software/ComputeLibrary/archive/v21.02.tar.gz) + set(ARMCOMPUTE_URL ${EXTERNAL_DOWNLOAD_SERVER}/ARM-software/ComputeLibrary/archive/v24.07.tar.gz) ExternalSource_Download(ARMCOMPUTE ${ARMCOMPUTE_URL}) set(ARMComputeSource_DIR ${ARMCOMPUTE_SOURCE_DIR} PARENT_SCOPE) diff --git a/infra/nnfw/cmake/packages/ARMComputeConfig.cmake b/infra/nnfw/cmake/packages/ARMComputeConfig.cmake index e55a0f4aebd..6f49496f033 100644 --- a/infra/nnfw/cmake/packages/ARMComputeConfig.cmake +++ b/infra/nnfw/cmake/packages/ARMComputeConfig.cmake @@ -29,12 +29,6 @@ function(_ARMCompute_Import) list(APPEND INCLUDE_DIR ${ARMComputeSource_DIR} ${ARMComputeSource_DIR}/include) endif(NOT INCLUDE_DIR OR NOT HEADER_SRC_DIR) - if(NOT CORE_LIBRARY) - set(ARMCompute_FOUND FALSE PARENT_SCOPE) - message(STATUS "Cannot find libarm_compute_core.so") - return() - endif() - if(NOT RUNTIME_LIBRARY) message(STATUS "Cannot find libarm_compute.so") set(ARMCompute_FOUND FALSE PARENT_SCOPE) @@ -47,18 +41,10 @@ function(_ARMCompute_Import) return() endif() - if(NOT TARGET arm_compute_core) - add_library(arm_compute_core INTERFACE) - target_include_directories(arm_compute_core SYSTEM INTERFACE ${INCLUDE_DIR}) - target_link_libraries(arm_compute_core INTERFACE dl ${LIB_PTHREAD}) - target_link_libraries(arm_compute_core INTERFACE ${CORE_LIBRARY}) - endif(NOT TARGET arm_compute_core) - if(NOT TARGET arm_compute) add_library(arm_compute INTERFACE) target_include_directories(arm_compute SYSTEM INTERFACE ${INCLUDE_DIR}) target_link_libraries(arm_compute INTERFACE ${RUNTIME_LIBRARY}) - target_link_libraries(arm_compute INTERFACE arm_compute_core) endif(NOT TARGET arm_compute) if(NOT TARGET arm_compute_graph) diff --git a/nnpackage/schema/circle_schema.fbs b/nnpackage/schema/circle_schema.fbs index 0498318bfce..e13bd3842cb 100644 --- a/nnpackage/schema/circle_schema.fbs +++ b/nnpackage/schema/circle_schema.fbs @@ -33,7 +33,8 @@ // Version 0.6: Base up to TensorFlow Lite v2.13.0 schema. // Version 0.7: Base up to TensorFlow Lite v2.15.0 schema, deprecate data_format in Subgraph table // Version 0.8: GRU op is added. UINT4 is added. -// Version 0.9: GGML_Q{X}_{Y} types are added. Weight compression option is added +// Version 0.9: GGML_Q{X}_{Y} types are added. Weight compression option is added. +// ROPE op is added. namespace circle; @@ -286,6 +287,7 @@ table Tensor { // set of acceptable options. // LINT.IfChange enum BuiltinOperator : int32 { + ROPE = -7, RMS_NORM = -6, GRU = -5, BCQ_GATHER = -4, @@ -636,6 +638,7 @@ union BuiltinOptions { BitcastOptions, BitwiseXorOptions, RightShiftOptions, + RoPEOptions = 249, RmsNormOptions = 250, GRUOptions = 251, BCQGatherOptions = 252, @@ -1525,6 +1528,15 @@ table RmsNormOptions { epsilon:float; } +enum RoPEMode : int { + GPT_NEOX, + GPT_J, +} + +table RoPEOptions { + mode: RoPEMode; +} + // An OperatorCode can be an enum value (BuiltinOperator) if the operator is a // builtin, or a string if the operator is custom. 
table OperatorCode { diff --git a/res/CircleRecipes/RmsNorm_000/test.recipe b/res/CircleRecipes/RmsNorm_000/test.recipe new file mode 100644 index 00000000000..e5e0c30df14 --- /dev/null +++ b/res/CircleRecipes/RmsNorm_000/test.recipe @@ -0,0 +1,46 @@ +operand { + name: "ifm" + type: FLOAT32 + shape { dim: 1 dim: 3 dim: 3 dim: 4 } +} +operand { + name: "gamma" + type: FLOAT32 + shape { dim: 4 } + filler { + tag: "explicit" + arg: "1.0" + arg: "1.0" + arg: "1.0" + arg: "1.0" + } +} +operand { + name: "beta" + type: FLOAT32 + shape { dim: 4 } + filler { + tag: "explicit" + arg: "0.0" + arg: "0.0" + arg: "0.0" + arg: "0.0" + } +} +operand { + name: "ofm" + type: FLOAT32 + shape { dim: 1 dim: 3 dim: 3 dim: 4 } +} +operation { + type: "RmsNorm" + input: "ifm" + input: "gamma" + input: "beta" + output: "ofm" + rms_norm_options { + epsilon: 0.0001 + } +} +input: "ifm" +output: "ofm" diff --git a/res/CircleRecipes/RmsNorm_000/test.reverse b/res/CircleRecipes/RmsNorm_000/test.reverse new file mode 100644 index 00000000000..e69de29bb2d diff --git a/res/CircleSchema/0.9/circle_schema.fbs b/res/CircleSchema/0.9/circle_schema.fbs index 0498318bfce..e13bd3842cb 100644 --- a/res/CircleSchema/0.9/circle_schema.fbs +++ b/res/CircleSchema/0.9/circle_schema.fbs @@ -33,7 +33,8 @@ // Version 0.6: Base up to TensorFlow Lite v2.13.0 schema. // Version 0.7: Base up to TensorFlow Lite v2.15.0 schema, deprecate data_format in Subgraph table // Version 0.8: GRU op is added. UINT4 is added. -// Version 0.9: GGML_Q{X}_{Y} types are added. Weight compression option is added +// Version 0.9: GGML_Q{X}_{Y} types are added. Weight compression option is added. +// ROPE op is added. namespace circle; @@ -286,6 +287,7 @@ table Tensor { // set of acceptable options. // LINT.IfChange enum BuiltinOperator : int32 { + ROPE = -7, RMS_NORM = -6, GRU = -5, BCQ_GATHER = -4, @@ -636,6 +638,7 @@ union BuiltinOptions { BitcastOptions, BitwiseXorOptions, RightShiftOptions, + RoPEOptions = 249, RmsNormOptions = 250, GRUOptions = 251, BCQGatherOptions = 252, @@ -1525,6 +1528,15 @@ table RmsNormOptions { epsilon:float; } +enum RoPEMode : int { + GPT_NEOX, + GPT_J, +} + +table RoPEOptions { + mode: RoPEMode; +} + // An OperatorCode can be an enum value (BuiltinOperator) if the operator is a // builtin, or a string if the operator is custom. 
table OperatorCode { diff --git a/res/TensorFlowLiteRecipes/Net_RmsNorm_000/test.recipe b/res/TensorFlowLiteRecipes/Net_RmsNorm_000/test.recipe new file mode 100644 index 00000000000..b89984abfee --- /dev/null +++ b/res/TensorFlowLiteRecipes/Net_RmsNorm_000/test.recipe @@ -0,0 +1,117 @@ +operand { + name: "Input" + type: FLOAT32 + shape { + dim: 1 + dim: 1 + dim: 4 + } +} +operand { + name: "RmsNorm/Mul/Square" + type: FLOAT32 + shape { + dim: 1 + dim: 1 + dim: 4 + } +} +operand { + name: "RmsNorm/Mean/Axis" + type: INT32 + shape { + } + filler { + tag: "explicit" + arg: "-1" + } +} +operand { + name: "RmsNorm/Mean/MeanSquare" + type: FLOAT32 + shape { + dim: 1 + dim: 1 + dim: 1 + } +} +operand { + name: "RmsNorm/Add/Epsilon" + type: FLOAT32 + shape { + } + filler { + tag: "explicit" + arg: "1e-06" + } +} +operand { + name: "RmsNorm/Add/MeanSquare_plus_eps" + type: FLOAT32 + shape { + dim: 1 + dim: 1 + dim: 1 + } +} +operand { + name: "RmsNorm/Sqrt/RMS" + type: FLOAT32 + shape { + dim: 1 + dim: 1 + dim: 1 + } +} +operand { + name: "RmsNorm/Mul/RmsNorm" + type: FLOAT32 + shape { + dim: 1 + dim: 1 + dim: 4 + } +} +operation { + type: "Mul" + input: "Input" + input: "Input" + output: "RmsNorm/Mul/Square" + mul_options { + activation: NONE + } +} +operation { + type: "Mean" + input: "RmsNorm/Mul/Square" + input: "RmsNorm/Mean/Axis" + output: "RmsNorm/Mean/MeanSquare" + mean_options { + keep_dims: true + } +} +operation { + type: "Add" + input: "RmsNorm/Mean/MeanSquare" + input: "RmsNorm/Add/Epsilon" + output: "RmsNorm/Add/MeanSquare_plus_eps" + add_options { + activation: NONE + } +} +operation { + type: "Rsqrt" + input: "RmsNorm/Add/MeanSquare_plus_eps" + output: "RmsNorm/Sqrt/RMS" +} +operation { + type: "Mul" + input: "Input" + input: "RmsNorm/Sqrt/RMS" + output: "RmsNorm/Mul/RmsNorm" + mul_options { + activation: NONE + } +} +input: "Input" +output: "RmsNorm/Mul/RmsNorm" diff --git a/res/TensorFlowLiteRecipes/Net_RmsNorm_000/test.rule b/res/TensorFlowLiteRecipes/Net_RmsNorm_000/test.rule new file mode 100644 index 00000000000..1586fc89482 --- /dev/null +++ b/res/TensorFlowLiteRecipes/Net_RmsNorm_000/test.rule @@ -0,0 +1,7 @@ +# To check if this network is converted to circle RmsNorm op + +RULE "VERIFY_FILE_FORMAT" $(verify_file_format) '=' 1 + +RULE "RMS_NORM_EXIST" $(op_count RMS_NORM) '=' 1 +RULE "NO_ADD" $(op_count ADD) '=' 0 +RULE "NO_MUL" $(op_count MUL) '=' 0 diff --git a/res/TensorFlowLiteRecipes/REGRESS_Issue_13863/test.recipe b/res/TensorFlowLiteRecipes/REGRESS_Issue_13863/test.recipe new file mode 100644 index 00000000000..1407f63b353 --- /dev/null +++ b/res/TensorFlowLiteRecipes/REGRESS_Issue_13863/test.recipe @@ -0,0 +1,258 @@ +# This recipe was created using tflchef-reverse with badDead.zip +# from the How_to_reproduce section of Issue_13863. +# In the model, the dim value was changed to a single digit value, +# and the shape_signature was removed. 
+# https://github.com/Samsung/ONE/issues/13863 + +operand { + name: "serving_default_input:0" + type: FLOAT32 + shape { + dim: 1 + dim: 4 + dim: 4 + dim: 3 + } + quant { + quantized_dimension: 0 + } + is_variable: false +} +operand { + name: "Const" + type: FLOAT32 + shape { + } + filler { + tag: "explicit" + arg: "2" + } + quant { + quantized_dimension: 0 + } + is_variable: false +} +operand { + name: "Const_1" + type: FLOAT32 + shape { + } + filler { + tag: "explicit" + arg: "4" + } + quant { + quantized_dimension: 0 + } + is_variable: false +} +operand { + name: "model/tf.split/split/split_dim" + type: INT32 + shape { + } + filler { + tag: "explicit" + arg: "1" + } + quant { + quantized_dimension: 0 + } + is_variable: false +} +operand { + name: "model/flatten/Const" + type: INT32 + shape { + dim: 2 + } + filler { + tag: "explicit" + arg: "-1" + arg: "48" + } + quant { + quantized_dimension: 0 + } + is_variable: false +} +operand { + name: "PartitionedCall:3" + type: FLOAT32 + shape { + dim: 1 + dim: 1 + dim: 4 + dim: 4 + dim: 3 + } + quant { + quantized_dimension: 0 + } + is_variable: false +} +operand { + name: "model/flatten/Reshape" + type: FLOAT32 + shape { + dim: 1 + dim: 48 + } + quant { + quantized_dimension: 0 + } + is_variable: false +} +operand { + name: "PartitionedCall:0" + type: FLOAT32 + shape { + dim: 1 + dim: 16 + } + quant { + quantized_dimension: 0 + } + is_variable: false +} +operand { + name: "model/tf.split/split" + type: FLOAT32 + shape { + dim: 1 + dim: 16 + } + quant { + quantized_dimension: 0 + } + is_variable: false +} +operand { + name: "model/tf.split/split1" + type: FLOAT32 + shape { + dim: 1 + dim: 16 + } + quant { + quantized_dimension: 0 + } + is_variable: false +} +operand { + name: "model/tf.compat.v1.math.scalar_mul_1/Mul" + type: FLOAT32 + shape { + dim: 1 + dim: 48 + } + quant { + quantized_dimension: 0 + } + is_variable: false +} +operand { + name: "PartitionedCall:2" + type: FLOAT32 + shape { + dim: 1 + dim: 1 + dim: 48 + } + quant { + quantized_dimension: 0 + } + is_variable: false +} +operand { + name: "model/tf.compat.v1.math.scalar_mul/Mul" + type: FLOAT32 + shape { + dim: 1 + dim: 48 + } + quant { + quantized_dimension: 0 + } + is_variable: false +} +operand { + name: "PartitionedCall:1" + type: FLOAT32 + shape { + dim: 1 + dim: 1 + dim: 48 + } + quant { + quantized_dimension: 0 + } + is_variable: false +} +operation { + type: "Pack" + input: "serving_default_input:0" + output: "PartitionedCall:3" + pack_options { + values_count: 1 + axis: 0 + } +} +operation { + type: "Reshape" + input: "serving_default_input:0" + input: "model/flatten/Const" + output: "model/flatten/Reshape" +} +operation { + type: "Split" + input: "model/tf.split/split/split_dim" + input: "model/flatten/Reshape" + output: "PartitionedCall:0" + output: "model/tf.split/split" + output: "model/tf.split/split1" + split_options { + num_splits: 3 + } +} +operation { + type: "Mul" + input: "model/flatten/Reshape" + input: "Const_1" + output: "model/tf.compat.v1.math.scalar_mul_1/Mul" + mul_options { + activation: NONE + } +} +operation { + type: "Pack" + input: "model/tf.compat.v1.math.scalar_mul_1/Mul" + output: "PartitionedCall:2" + pack_options { + values_count: 1 + axis: 0 + } +} +operation { + type: "Mul" + input: "model/flatten/Reshape" + input: "Const" + output: "model/tf.compat.v1.math.scalar_mul/Mul" + mul_options { + activation: NONE + } +} +operation { + type: "Pack" + input: "model/tf.compat.v1.math.scalar_mul/Mul" + output: "PartitionedCall:1" + pack_options { + 
values_count: 1 + axis: 0 + } +} +input: "serving_default_input:0" +output: "PartitionedCall:2" +output: "PartitionedCall:1" +output: "PartitionedCall:3" +output: "PartitionedCall:0" diff --git a/res/TensorFlowLiteRecipes/REGRESS_Issue_13863/test.rule b/res/TensorFlowLiteRecipes/REGRESS_Issue_13863/test.rule new file mode 100644 index 00000000000..e9aa6eddeb6 --- /dev/null +++ b/res/TensorFlowLiteRecipes/REGRESS_Issue_13863/test.rule @@ -0,0 +1,7 @@ +# Verify that the pack operation has been successfully removed +# Check that the reshape operation exists (substitute_pack_to_reshape pass applied) + +RULE "VERIFY_FILE_FORMAT" $(verify_file_format) '=' 1 + +RULE "NO_PACK" $(op_count PACK) '=' 0 +RULE "RESHAPE_EXIST" $(op_count RESHAPE) '=' 4 diff --git a/runtime/onert/backend/acl_cl/KernelGenerator.cc b/runtime/onert/backend/acl_cl/KernelGenerator.cc index c32c298a11e..bd4f209c6be 100644 --- a/runtime/onert/backend/acl_cl/KernelGenerator.cc +++ b/runtime/onert/backend/acl_cl/KernelGenerator.cc @@ -92,12 +92,16 @@ void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node) auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); - auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index); - assert(_ctx.at(block_size_index).data()); + if (!_ctx.at(block_size_index).data()) + throw std::runtime_error("ACL CL does not support dynamic block size for BatchToSpaceND"); + + auto block = _ctx.at(block_size_index).asVector<int32_t>(); + int32_t height = block[0]; + int32_t width = block[1]; auto fn = acl_common::generateLayer<arm_compute::CLBatchToSpaceLayer>( - ifm_tensor->handle(), block_size_tensor->handle(), ofm_tensor->handle()); + ifm_tensor->handle(), width, height, ofm_tensor->handle()); _return_fn = asAclFunction(std::move(fn)); } @@ -121,6 +125,10 @@ void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node) { case ir::operation::BinaryArithmetic::ArithmeticType::ADD: { + arm_compute::CLArithmeticAddition::validate(lhs_tensor->info(), rhs_tensor->info(), + ofm_tensor->info(), + arm_compute::ConvertPolicy::SATURATE, act_info) + .throw_if_error(); fn = acl_common::generateLayer<arm_compute::CLArithmeticAddition>( lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), arm_compute::ConvertPolicy::SATURATE, act_info); @@ -128,6 +136,10 @@ void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node) } case ir::operation::BinaryArithmetic::ArithmeticType::SUB: { + arm_compute::CLArithmeticSubtraction::validate(lhs_tensor->info(), rhs_tensor->info(), + ofm_tensor->info(), + arm_compute::ConvertPolicy::SATURATE, act_info) + .throw_if_error(); fn = acl_common::generateLayer<arm_compute::CLArithmeticSubtraction>( lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), arm_compute::ConvertPolicy::SATURATE, act_info); @@ -135,6 +147,11 @@ void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node) } case ir::operation::BinaryArithmetic::ArithmeticType::MUL: { + arm_compute::CLPixelWiseMultiplication::validate( + lhs_tensor->info(), rhs_tensor->info(), ofm_tensor->info(), 1.0, + arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_NEAREST_EVEN, + act_info) + .throw_if_error(); fn = acl_common::generateLayer<arm_compute::CLPixelWiseMultiplication>( lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_NEAREST_EVEN, @@ -143,6 +160,9 @@ void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node) } case ir::operation::BinaryArithmetic::ArithmeticType::DIV: { + 
arm_compute::CLArithmeticDivision::validate(lhs_tensor->info(), rhs_tensor->info(), + ofm_tensor->info(), act_info) + .throw_if_error(); fn = acl_common::generateLayer<arm_compute::CLArithmeticDivision>( lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), act_info); break; @@ -1529,7 +1549,7 @@ void KernelGenerator::visit(const ir::operation::Reverse &node) } auto fn = acl_common::generateLayer<arm_compute::CLReverse>( - ifm_tensor->handle(), ofm_tensor->handle(), axis_tensor->handle()); + ifm_tensor->handle(), ofm_tensor->handle(), axis_tensor->handle(), false); _return_fn = asAclFunction(std::move(fn)); } diff --git a/runtime/onert/backend/acl_common/Convert.cc b/runtime/onert/backend/acl_common/Convert.cc index eb11fcf2385..f0904a685d9 100644 --- a/runtime/onert/backend/acl_common/Convert.cc +++ b/runtime/onert/backend/acl_common/Convert.cc @@ -252,19 +252,6 @@ std::unique_ptr<AclFunction> asAclFunction(std::unique_ptr<::arm_compute::IFunct return std::make_unique<AclFunction>(std::move(layer)); } -ir::Layout asRuntimeLayout(::arm_compute::DataLayout data_layout) -{ - switch (data_layout) - { - case ::arm_compute::DataLayout::NHWC: - return ir::Layout::NHWC; - case ::arm_compute::DataLayout::NCHW: - return ir::Layout::NCHW; - default: - return ir::Layout::UNKNOWN; - } -} - ir::DataType asRuntimeDataType(::arm_compute::DataType data_type) { switch (data_type) diff --git a/runtime/onert/backend/acl_common/Convert.h b/runtime/onert/backend/acl_common/Convert.h index dd6ce59183f..6dd8d01ab06 100644 --- a/runtime/onert/backend/acl_common/Convert.h +++ b/runtime/onert/backend/acl_common/Convert.h @@ -73,7 +73,6 @@ std::unique_ptr<T_Function> asFunction(std::unique_ptr<::arm_compute::IFunction> fn) { return std::make_unique<T_Function>(std::move(fn)); } -ir::Layout asRuntimeLayout(::arm_compute::DataLayout data_layout); ir::DataType asRuntimeDataType(::arm_compute::DataType data_type); arm_compute::PoolingType convertPoolType(ir::operation::Pool2D::PoolType pool_type_ir); diff --git a/runtime/onert/backend/acl_neon/KernelGenerator.cc b/runtime/onert/backend/acl_neon/KernelGenerator.cc index f0b10399613..4712cf468bd 100644 --- a/runtime/onert/backend/acl_neon/KernelGenerator.cc +++ b/runtime/onert/backend/acl_neon/KernelGenerator.cc @@ -118,12 +118,16 @@ void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node) auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); - auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index); - assert(_ctx.at(block_size_index).data()); + if (!_ctx.at(block_size_index).data()) + throw std::runtime_error("ACL NEON does not support dynamic block size for BatchToSpaceND"); + + auto block = _ctx.at(block_size_index).asVector<int32_t>(); + int32_t height = block[0]; + int32_t width = block[1]; auto fn = acl_common::generateLayer<arm_compute::NEBatchToSpaceLayer>( - ifm_tensor->handle(), block_size_tensor->handle(), ofm_tensor->handle()); + ifm_tensor->handle(), width, height, ofm_tensor->handle()); _return_fn = asAclFunction(std::move(fn)); } @@ -145,6 +149,10 @@ void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node) { case ir::operation::BinaryArithmetic::ArithmeticType::ADD: { + arm_compute::NEArithmeticAddition::validate(lhs_tensor->info(), rhs_tensor->info(), + ofm_tensor->info(), + arm_compute::ConvertPolicy::SATURATE) + .throw_if_error(); fn = acl_common::generateLayer<arm_compute::NEArithmeticAddition>( lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), arm_compute::ConvertPolicy::SATURATE); @@ -152,6 +160,10 @@ void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node) } case 
ir::operation::BinaryArithmetic::ArithmeticType::SUB: { + arm_compute::NEArithmeticSubtraction::validate(lhs_tensor->info(), rhs_tensor->info(), + ofm_tensor->info(), + arm_compute::ConvertPolicy::SATURATE) + .throw_if_error(); fn = acl_common::generateLayer<arm_compute::NEArithmeticSubtraction>( lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), arm_compute::ConvertPolicy::SATURATE); @@ -159,6 +171,10 @@ void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node) } case ir::operation::BinaryArithmetic::ArithmeticType::MUL: { + arm_compute::NEPixelWiseMultiplication::validate( + lhs_tensor->info(), rhs_tensor->info(), ofm_tensor->info(), 1.0, + arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_ZERO) + .throw_if_error(); // RoundingPolicy for scale:1.0 is only allowed RoundingPolicy::TO_ZERO fn = acl_common::generateLayer<arm_compute::NEPixelWiseMultiplication>( lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale @@ -167,6 +183,9 @@ void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node) } case ir::operation::BinaryArithmetic::ArithmeticType::DIV: { + arm_compute::NEElementwiseDivision::validate(lhs_tensor->info(), rhs_tensor->info(), + ofm_tensor->info()) + .throw_if_error(); fn = acl_common::generateLayer<arm_compute::NEElementwiseDivision>( lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); break; diff --git a/runtime/onert/backend/cpu/CMakeLists.txt b/runtime/onert/backend/cpu/CMakeLists.txt index 12777a2173d..92cdf3c18d1 100644 --- a/runtime/onert/backend/cpu/CMakeLists.txt +++ b/runtime/onert/backend/cpu/CMakeLists.txt @@ -12,6 +12,8 @@ target_link_libraries(${LIB_ONERT_BACKEND_CPU} PRIVATE nnfw_coverage) target_link_libraries(${LIB_ONERT_BACKEND_CPU} PRIVATE ruy) target_link_libraries(${LIB_ONERT_BACKEND_CPU} INTERFACE ruy_instrumentation) target_link_libraries(${LIB_ONERT_BACKEND_CPU} PRIVATE ndarray) +# Set public: ExternalContext is used in train backend +target_link_libraries(${LIB_ONERT_BACKEND_CPU} PUBLIC ggml) set_target_properties(${LIB_ONERT_BACKEND_CPU} PROPERTIES OUTPUT_NAME backend_cpu diff --git a/runtime/onert/backend/cpu/ExternalContext.h b/runtime/onert/backend/cpu/ExternalContext.h index 8c1f4ccf16c..9c7e9368bc2 100644 --- a/runtime/onert/backend/cpu/ExternalContext.h +++ b/runtime/onert/backend/cpu/ExternalContext.h @@ -19,6 +19,7 @@ #include #include +#include <ggml.h> #include @@ -47,10 +48,18 @@ class ExternalContext _ruy_context->set_max_num_threads(target_num_threads); } + void initGgmlContext() + { + if (_ggml_context == nullptr) + _ggml_context = std::unique_ptr<struct ggml_context, decltype(&ggml_free)>( + ggml_init({.mem_size = 0, .mem_buffer = nullptr, .no_alloc = true}), &ggml_free); + } + ruy::Context *ruy_context() const { return _ruy_context.get(); } private: const std::unique_ptr<ruy::Context> _ruy_context; + std::unique_ptr<struct ggml_context, decltype(&ggml_free)> _ggml_context{nullptr, &ggml_free}; }; } // namespace cpu diff --git a/runtime/onert/backend/train/MemoryManager.cc b/runtime/onert/backend/train/MemoryManager.cc index fd156fea231..64a665dd620 100644 --- a/runtime/onert/backend/train/MemoryManager.cc +++ b/runtime/onert/backend/train/MemoryManager.cc @@ -93,6 +93,42 @@ uint8_t *DisposableMemoryManager::getBuffer(const DisposableTensorIndex &ind) co return _mem_alloc->base() + mem_blk.offset; } +LayerScopeMemoryManager::LayerScopeMemoryManager() : _mem_planner{createMemoryPlanner()} +{ + // DO NOTHING +} + +basic::IMemoryPlanner<LayerScopeTensorIndex> *LayerScopeMemoryManager::createMemoryPlanner() +{ + auto planner_id = util::getConfigString(util::config::CPU_MEMORY_PLANNER); + return MemoryPlannerFactory<LayerScopeTensorIndex>::get().create(planner_id); +} + +void 
LayerScopeMemoryManager::allocate(void) +{ + _mem_alloc = std::make_shared<basic::Allocator>(_mem_planner->capacity()); + assert(_mem_alloc->base()); +} + +uint8_t *LayerScopeMemoryManager::getBuffer(const LayerScopeTensorIndex &ind) const +{ + assert(_mem_planner->memory_plans().find(ind) != _mem_planner->memory_plans().end()); + const auto &mem_blk = _mem_planner->memory_plans().at(ind); + return _mem_alloc->base() + mem_blk.offset; +} + +void LayerScopeMemoryManager::deallocate(void) { _mem_alloc->release(); } + +void LayerScopeMemoryManager::claimPlan(const LayerScopeTensorIndex &ind, uint32_t size) +{ + _mem_planner->claim(ind, size); +} + +void LayerScopeMemoryManager::releasePlan(const LayerScopeTensorIndex &ind) +{ + _mem_planner->release(ind); +} + } // namespace train } // namespace backend } // namespace onert diff --git a/runtime/onert/backend/train/MemoryManager.h b/runtime/onert/backend/train/MemoryManager.h index 98e840bf7f7..8333c838bce 100644 --- a/runtime/onert/backend/train/MemoryManager.h +++ b/runtime/onert/backend/train/MemoryManager.h @@ -20,6 +20,7 @@ #include #include "DisposableTensorIndex.h" +#include "LayerScopeTensorIndex.h" namespace onert { @@ -67,7 +68,25 @@ class DisposableMemoryManager std::shared_ptr<basic::Allocator> _mem_alloc; }; -// TODO: Add LayerScopeMemoryManager using MemoryPlannerFactory +class LayerScopeMemoryManager +{ +public: + LayerScopeMemoryManager(); + + void allocate(void); + uint8_t *getBuffer(const LayerScopeTensorIndex &ind) const; + void deallocate(void); + + void claimPlan(const LayerScopeTensorIndex &ind, uint32_t size); + void releasePlan(const LayerScopeTensorIndex &ind); + +private: + basic::IMemoryPlanner<LayerScopeTensorIndex> *createMemoryPlanner(); + +private: + std::shared_ptr<basic::IMemoryPlanner<LayerScopeTensorIndex>> _mem_planner; + std::shared_ptr<basic::Allocator> _mem_alloc; +}; } // namespace train } // namespace backend diff --git a/runtime/onert/backend/train/MemoryPlanner.test.cc b/runtime/onert/backend/train/MemoryPlanner.test.cc index f030ecb1bff..7a908b5df87 100644 --- a/runtime/onert/backend/train/MemoryPlanner.test.cc +++ b/runtime/onert/backend/train/MemoryPlanner.test.cc @@ -25,7 +25,6 @@ using namespace onert::backend::train; using onert::ir::OperandIndex; using onert::ir::OperationIndex; -// TODO: Add test testcase for {Bump, FirstFit, WIC}Planner namespace { @@ -178,7 +177,7 @@ TEST(FirstFitPlanner, disposable_claim_release_test) }); } -TEST(FirstFitPlanner, disposable_neg_release_non_existing_index) +TEST(FirstFitPlanner, neg_disposable_release_non_existing_index) { PlannerVerifier<DisposableTensorIndex> p; @@ -203,7 +202,7 @@ TEST(FirstFitPlanner, disposable_neg_release_non_existing_index) }); } -TEST(FirstFitPlanner, disposable_neg_release_twice) +TEST(FirstFitPlanner, neg_disposable_release_twice) { PlannerVerifier<DisposableTensorIndex> p; @@ -276,4 +275,189 @@ TEST(WICPlanner, disposable_claim_release_test) }); } -// Add Testcase for LayerScopeTensorIndex, using PlannerVerifier +TEST(BumpPlanner, layerscope_claim_test) +{ + PlannerVerifier<LayerScopeTensorIndex> p; + + ASSERT_NO_FATAL_FAILURE({ + p.claim(0, 0, 10, 0); + p.claim(1, 0, 20, 10); + p.claim(2, 2, 30, 30); + p.release(0, 0); + p.capacity(60); + }); +} + +TEST(FirstFitPlanner, layerscope_claim_release_test) +{ + PlannerVerifier<LayerScopeTensorIndex> p; + + ASSERT_NO_FATAL_FAILURE({ + // 0 CLAIM - 10 + p.claim(0, 0, 10, 0); + + // 1 CLAIM - 20 + p.claim(1, 0, 20, 10); + + // 2 CLAIM - 30 + p.claim(2, 2, 30, 30); + + // 0 RELEASE - 10 + p.release(0, 0); + + // 3 CLAIM - 20 + p.claim(3, 1, 20, 60); + + // 4 CLAIM - 5 + p.claim(4, 1, 5, 0); + + // 5 CLAIM - 10 + p.claim(5, 1, 10, 80); + + // 6 CLAIM - 5 + p.claim(6, 1, 5, 5); + + // 2 RELEASE - 30 + 
p.release(2, 2); + + // 7 CLAIM - 35 + p.claim(7, 1, 35, 90); + + // 8 CLAIM - 10 + p.claim(8, 1, 10, 30); + + // 4 RELEASE - 5 + p.release(4, 1); + + // 9 CLAIM - 10 + p.claim(9, 0, 10, 40); + + // 10 CLAIM - 10 + p.claim(10, 0, 10, 50); + + // 6 RELEASE + p.release(6, 1); + + // 1 RELEASE + p.release(1, 0); + + // 8 RELEASE + p.release(8, 1); + + // 9 RELEASE + p.release(9, 0); + + // 10 RELEASE + p.release(10, 0); + + // 3 RELEASE + p.release(3, 1); + + // 5 RELEASE + p.release(5, 1); + + // 7 RELEASE + p.release(7, 1); + + // CAPACITY - 125 + p.capacity(125); + }); +} + +TEST(FirstFitPlanner, neg_layerscope_release_non_existing_index) +{ + PlannerVerifier<LayerScopeTensorIndex> p; + + auto on_only_debug_mode = [&p]() { + EXPECT_DEATH({ p.release(0, 1); }, + "Cannot release for given index. It has been not claimed or released already."); + return true; + }; + + ASSERT_NO_FATAL_FAILURE({ + // 0 CLAIM - 10 + p.claim(0, 0, 10, 0); + + // 1 CLAIM - 20 + p.claim(1, 0, 20, 10); + + // 2 CLAIM - 30 + p.claim(2, 2, 30, 30); + + // RELEASE non-existing index + assert(on_only_debug_mode()); + }); +} + +TEST(FirstFitPlanner, neg_layerscope_release_twice) +{ + PlannerVerifier<LayerScopeTensorIndex> p; + + auto on_only_debug_mode = [&p]() { + EXPECT_EXIT({ p.release(0, 0); }, ::testing::KilledBySignal(SIGABRT), + "Cannot release for given index. It has been not claimed or released already."); + return true; + }; + + ASSERT_NO_FATAL_FAILURE({ + // 0 CLAIM - 10 + p.claim(0, 0, 10, 0); + + // 1 CLAIM - 20 + p.claim(1, 0, 20, 10); + + // 2 CLAIM - 30 + p.claim(2, 2, 30, 30); + + // 0 RELEASE - 10 + p.release(0, 0); + + // 0 RELEASE again + assert(on_only_debug_mode()); + }); +} + +TEST(WICPlanner, layerscope_claim_release_test) +{ + PlannerVerifier<LayerScopeTensorIndex> p; + + ASSERT_NO_FATAL_FAILURE({ + p.claim(0, 0, 20); + p.claim(1, 0, 5); + p.release(0, 0); + p.claim(2, 2, 10); + p.release(1, 0); + p.claim(3, 1, 10); + p.release(2, 2); + p.claim(4, 1, 10); + p.release(3, 1); + p.claim(5, 1, 20); + p.release(4, 1); + p.claim(6, 1, 20); + p.release(5, 1); + + // VERIFY 0 - 0 + p.verify(0, 0, 20, 0); + + // VERIFY 1 - 20 + p.verify(1, 0, 5, 20); + + // VERIFY 2 - 0 + p.verify(2, 2, 10, 0); + + // VERIFY 3 - 10 + p.verify(3, 1, 10, 10); + + // VERIFY 4 - 20 + p.verify(4, 1, 10, 20); + + // VERIFY 5 - 0 + p.verify(5, 1, 20, 0); + + // VERIFY 6 - 20 + p.verify(6, 1, 20, 20); + + // CAPACITY - 40 + p.capacity(40); + }); +} diff --git a/runtime/onert/backend/train/TensorManager.h b/runtime/onert/backend/train/TensorManager.h index 6e0910e182d..c9553c3913e 100644 --- a/runtime/onert/backend/train/TensorManager.h +++ b/runtime/onert/backend/train/TensorManager.h @@ -61,6 +61,7 @@ class TensorManager void releaseGradientPlan(const ir::OperandIndex &ind); void claimDisposableBackPropPlan(const DisposableTensorIndex &ind); void releaseDisposableBackPropPlan(const DisposableTensorIndex &ind); + // TODO Add member functions related to LayerScopeMemoryManager private: std::unique_ptr _nonconst_mgr; @@ -68,6 +69,8 @@ class TensorManager std::unique_ptr _back_prop_mgr; std::unique_ptr _gradient_mgr; std::unique_ptr<DisposableMemoryManager> _disposable_back_prop_mgr; + // TODO: enable _layer_scope_mgr + // std::unique_ptr<LayerScopeMemoryManager> _layer_scope_mgr; const std::shared_ptr _tensors; }; diff --git a/runtime/onert/backend/train/ops/LossLayer.cc b/runtime/onert/backend/train/ops/LossLayer.cc index 6f5f8705bba..e5a026ba863 100644 --- a/runtime/onert/backend/train/ops/LossLayer.cc +++ b/runtime/onert/backend/train/ops/LossLayer.cc @@ -26,7 +26,8 @@ namespace ops { LossLayer::LossLayer() - : _y_pred(nullptr), _y_true(nullptr), 
_output(nullptr), _back_prop_y_pred(nullptr) + : _y_pred(nullptr), _y_true(nullptr), _output(nullptr), _back_prop_y_pred(nullptr), + _reduction_type(ir::train::LossReductionType::Undefined) { // DO NOTHING } diff --git a/runtime/onert/backend/trix/Convert.cc b/runtime/onert/backend/trix/Convert.cc index fe003e7ead5..684dc80dd53 100644 --- a/runtime/onert/backend/trix/Convert.cc +++ b/runtime/onert/backend/trix/Convert.cc @@ -23,19 +23,6 @@ namespace backend namespace trix { -data_layout convertDataLayout(const ir::Layout layout) -{ - switch (layout) - { - case ir::Layout::NCHW: - return DATA_LAYOUT_NCHW; - case ir::Layout::NHWC: - return DATA_LAYOUT_NHWC; - default: - throw std::runtime_error("Unknown Layout"); - } -} - data_type convertDataType(const ir::DataType type) { switch (type) diff --git a/runtime/onert/backend/trix/Convert.h b/runtime/onert/backend/trix/Convert.h index 9359f0a5084..12d7eea1943 100644 --- a/runtime/onert/backend/trix/Convert.h +++ b/runtime/onert/backend/trix/Convert.h @@ -31,14 +31,6 @@ namespace backend namespace trix { -/** - * @brief Convert type of layout from onert type to npu type - * - * @param layout Layout type in onert - * @return data_layout Layout type in npu - */ -data_layout convertDataLayout(const ir::Layout layout); - /** * @brief Convert type of data from onert type to npu type * diff --git a/tests/nnapi/nnapi_gtest.skip.aarch64-android.acl_neon b/tests/nnapi/nnapi_gtest.skip.aarch64-android.acl_neon index 03bdf091679..9e337bc7eba 100644 --- a/tests/nnapi/nnapi_gtest.skip.aarch64-android.acl_neon +++ b/tests/nnapi/nnapi_gtest.skip.aarch64-android.acl_neon @@ -76,6 +76,8 @@ GeneratedTests.fill_ex_dynamic_nnfw GeneratedTests.fully_connected_dynamic_nnfw GeneratedTests.fully_connected_float_2_weights_as_inputs GeneratedTests.fully_connected_hybrid_1_nnfw +GeneratedTests.fully_connected_quant8_large_weights_as_inputs +GeneratedTests.fully_connected_quant8_weights_as_inputs GeneratedTests.fusedbatchnorm_ex_dynamic_nnfw GeneratedTests.fusedbatchnorm_ex_float_fusedbatchnorm_1141 GeneratedTests.gather_dynamic_nnfw diff --git a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon index 03bdf091679..9e337bc7eba 100644 --- a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon +++ b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon @@ -76,6 +76,8 @@ GeneratedTests.fill_ex_dynamic_nnfw GeneratedTests.fully_connected_dynamic_nnfw GeneratedTests.fully_connected_float_2_weights_as_inputs GeneratedTests.fully_connected_hybrid_1_nnfw +GeneratedTests.fully_connected_quant8_large_weights_as_inputs +GeneratedTests.fully_connected_quant8_weights_as_inputs GeneratedTests.fusedbatchnorm_ex_dynamic_nnfw GeneratedTests.fusedbatchnorm_ex_float_fusedbatchnorm_1141 GeneratedTests.gather_dynamic_nnfw diff --git a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon index a3320998ab3..f636f551009 100644 --- a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon +++ b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon @@ -75,6 +75,9 @@ GeneratedTests.fill_ex_4D_float GeneratedTests.fill_ex_dynamic_nnfw GeneratedTests.fully_connected_dynamic_nnfw GeneratedTests.fully_connected_float_2_weights_as_inputs +GeneratedTests.fully_connected_hybrid_1_nnfw +GeneratedTests.fully_connected_quant8_large_weights_as_inputs +GeneratedTests.fully_connected_quant8_weights_as_inputs GeneratedTests.fusedbatchnorm_ex_dynamic_nnfw GeneratedTests.fusedbatchnorm_ex_float_fusedbatchnorm_1141 
GeneratedTests.gather_dynamic_nnfw diff --git a/tests/nnfw_api/CMakeLists.txt b/tests/nnfw_api/CMakeLists.txt index 887ee9b1b74..1214290576b 100644 --- a/tests/nnfw_api/CMakeLists.txt +++ b/tests/nnfw_api/CMakeLists.txt @@ -37,6 +37,7 @@ target_link_libraries(${RUNTIME_NNFW_API_TEST} nnfw-dev jsoncpp) target_link_libraries(${RUNTIME_NNFW_API_TEST} gtest gmock) target_link_libraries(${RUNTIME_NNFW_API_TEST} ${LIB_PTHREAD} dl) target_link_libraries(${RUNTIME_NNFW_API_TEST} circle_schema) +target_link_libraries(${RUNTIME_NNFW_API_TEST} ggml) install(TARGETS ${RUNTIME_NNFW_API_TEST} DESTINATION unittest) diff --git a/tests/nnfw_api/lib/common.cc b/tests/nnfw_api/lib/common.cc index 3c3bc68d093..2ccf712837e 100644 --- a/tests/nnfw_api/lib/common.cc +++ b/tests/nnfw_api/lib/common.cc @@ -17,6 +17,8 @@ #include "common.h" +#include <ggml.h> + bool tensorInfoEqual(const nnfw_tensorinfo &info1, const nnfw_tensorinfo &info2) { if (info1.dtype != info2.dtype) @@ -38,3 +40,23 @@ uint64_t tensorInfoNumElements(const nnfw_tensorinfo &ti) } return n; } + +std::vector<uint8_t> quantData(const std::vector<float> &buf_val, const circle::TensorType type) +{ + switch (type) + { + case circle::TensorType::TensorType_GGML_Q4_0: + { + size_t num_elems = buf_val.size(); + const size_t block_size = ggml_blck_size(GGML_TYPE_Q4_0); + const int64_t num_block = num_elems / block_size; + const size_t block_struct_size = ggml_type_size(GGML_TYPE_Q4_0); + + auto buf = std::vector<uint8_t>(num_block * block_struct_size); + ggml_quantize_chunk(GGML_TYPE_Q4_0, buf_val.data(), buf.data(), 0, 1, num_elems, nullptr); + return buf; + } + default: + throw std::runtime_error("Unsupported tensor type"); + } +} diff --git a/tests/nnfw_api/lib/common.h b/tests/nnfw_api/lib/common.h index aec49792c56..5d72ea8cabe 100644 --- a/tests/nnfw_api/lib/common.h +++ b/tests/nnfw_api/lib/common.h @@ -19,8 +19,10 @@ #include #include +#include bool tensorInfoEqual(const nnfw_tensorinfo &info1, const nnfw_tensorinfo &info2); uint64_t tensorInfoNumElements(const nnfw_tensorinfo &info); +std::vector<uint8_t> quantData(const std::vector<float> &buf_val, const circle::TensorType type); #endif // __NNFW_API_TEST_COMMON_H__ diff --git a/tests/nnfw_api/src/GenModelTests/one_op_tests/Add.test.cc b/tests/nnfw_api/src/GenModelTests/one_op_tests/Add.test.cc index 9fc0e86b6b5..3365c071906 100644 --- a/tests/nnfw_api/src/GenModelTests/one_op_tests/Add.test.cc +++ b/tests/nnfw_api/src/GenModelTests/one_op_tests/Add.test.cc @@ -293,7 +293,7 @@ TEST_F(GenModelTest, neg_OneOp_Add_VarToVarInt16) cgen.setInputsAndOutputs({lhs, rhs}, {out}); _context = std::make_unique<GenModelTestContext>(cgen.finish()); - // _context->addTestCase(uniformTCD<int16_t>({{1, 3, 2, 4}, {5, -4, -7, 4}}, {{0, -32, -46, 2}})); + _context->addTestCase(uniformTCD<int16_t>({{1, 3, 2, 4}, {5, -4, -7, 4}}, {{0, -32, -46, 2}})); _context->setBackends({"acl_cl", "acl_neon", "cpu"}); _context->expectFailCompile();
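Editor's note on the quantData helper added in tests/nnfw_api/lib/common.cc: it packs a float buffer into GGML Q4_0 blocks so tests can feed pre-quantized weights. Below is a self-contained sketch of the same call sequence, assuming only the ggml API already used above (ggml_blck_size, ggml_type_size, ggml_quantize_chunk); the sample values are made up.

#include <cstdint>
#include <vector>
#include <ggml.h>

int main()
{
  // Made-up data: one Q4_0 block holds ggml_blck_size(GGML_TYPE_Q4_0) = 32 floats.
  std::vector<float> values(32, 0.5f);
  const size_t block_size = ggml_blck_size(GGML_TYPE_Q4_0);
  const size_t block_bytes = ggml_type_size(GGML_TYPE_Q4_0); // packed bytes per block
  std::vector<uint8_t> packed(values.size() / block_size * block_bytes);
  // Quantize a single row, exactly the path quantData takes for TensorType_GGML_Q4_0.
  ggml_quantize_chunk(GGML_TYPE_Q4_0, values.data(), packed.data(), /*start=*/0,
                      /*nrows=*/1, /*n_per_row=*/values.size(), /*imatrix=*/nullptr);
  return packed.empty() ? 1 : 0;
}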