diff --git a/.github/workflows/check-format.yml b/.github/workflows/check-format.yml
index 48cf30aa635..52867d7b414 100644
--- a/.github/workflows/check-format.yml
+++ b/.github/workflows/check-format.yml
@@ -17,7 +17,7 @@ defaults:
 jobs:
   check-format:
     name: Check format
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-24.04
     if: github.repository_owner == 'Samsung'
 
     steps:
@@ -29,15 +29,10 @@ jobs:
       with:
         python-version: '3.x'
 
-    # C format: clang-format-16
+    # C format: clang-format-16 (already installed)
     # Python format: yapf==0.40.2
     - name: Install packages
       run: |
-        sudo apt-get install -y gnupg2 software-properties-common
-        wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
-        sudo add-apt-repository "deb http://apt.llvm.org/focal/ llvm-toolchain-focal-16 main"
-        sudo apt-get update && sudo apt-get install -qqy clang-format-16
-        python -m pip install --upgrade pip
         pip install yapf==0.40.2
 
     - name: Check
diff --git a/Makefile.template b/Makefile.template
index 6e0c29590c7..4a93c1acf3d 100644
--- a/Makefile.template
+++ b/Makefile.template
@@ -202,7 +202,7 @@ runtime_tar_internal:
 	tar -zcf $(WORKSPACE)/onert-test-package.tar.gz -C $(INSTALL_PATH) $(shell ls $(INSTALL_PATH) -I lib -I include)
 
 acl_tar_internal:
-	tar -zcf $(WORKSPACE)/onert-acl.tar.gz -C ${OVERLAY_FOLDER} lib/libarm_compute.so lib/libarm_compute_core.so lib/libarm_compute_graph.so
+	tar -zcf $(WORKSPACE)/onert-acl.tar.gz -C ${OVERLAY_FOLDER} lib/libarm_compute.so lib/libarm_compute_graph.so
 
 install_acl_internal:
 # Workaround to install acl for test (ignore error when there is no file to copy)
diff --git a/compiler/circledump/src/OpPrinter.cpp b/compiler/circledump/src/OpPrinter.cpp
index 61a0941ea67..e14db02d0eb 100644
--- a/compiler/circledump/src/OpPrinter.cpp
+++ b/compiler/circledump/src/OpPrinter.cpp
@@ -825,6 +825,20 @@ class InstanceNormPrinter : public OpPrinter
   }
 };
 
+class RmsNormPrinter : public OpPrinter
+{
+public:
+  void options(const circle::Operator *op, std::ostream &os) const override
+  {
+    if (auto *params = op->builtin_options_as_RmsNormOptions())
+    {
+      os << "    ";
+      os << "epsilon(" << params->epsilon() << ") ";
+      os << std::endl;
+    }
+  }
+};
+
 OpPrinterRegistry::OpPrinterRegistry()
 {
   _op_map[circle::BuiltinOperator_ADD] = make_unique<AddPrinter>();
@@ -912,6 +926,7 @@ OpPrinterRegistry::OpPrinterRegistry()
   _op_map[circle::BuiltinOperator_BCQ_GATHER] = make_unique<BCQGatherPrinter>();
   _op_map[circle::BuiltinOperator_GRU] = make_unique<GRUPrinter>();
   _op_map[circle::BuiltinOperator_INSTANCE_NORM] = make_unique<InstanceNormPrinter>();
+  _op_map[circle::BuiltinOperator_RMS_NORM] = make_unique<RmsNormPrinter>();
 }
 
 } // namespace circledump
diff --git a/compiler/common-artifacts/exclude.lst b/compiler/common-artifacts/exclude.lst
index 4358bc02cdd..fcb5caa48a4 100644
--- a/compiler/common-artifacts/exclude.lst
+++ b/compiler/common-artifacts/exclude.lst
@@ -9,6 +9,7 @@ optimize(Add_STR_000) # STRING is not supported
 optimize(Add_STR_001) # STRING is not supported
 
 ## CircleRecipes
+optimize(RmsNorm_000)
 
 #[[ tcgenerate : Exclude from test data generation(TestDataGenerator) ]]
 ## TensorFlowLiteRecipes
@@ -178,3 +179,4 @@ tcgenerate(CircleFullyConnected_U4_002)
 tcgenerate(GRU_000) # luci-interpreter does not support custom GRU
 tcgenerate(InstanceNorm_000)
 tcgenerate(InstanceNorm_001)
+tcgenerate(RmsNorm_000)
diff --git a/compiler/logo/src/Passes/RemoveDeadNodeWithQueryPass.cpp b/compiler/logo/src/Passes/RemoveDeadNodeWithQueryPass.cpp
index 5c745212a29..00a14e70928 100644
--- a/compiler/logo/src/Passes/RemoveDeadNodeWithQueryPass.cpp
+++ b/compiler/logo/src/Passes/RemoveDeadNodeWithQueryPass.cpp
@@ -43,15 +43,17 @@ bool RemoveDeadNodeWithQueryPass::run(loco::Graph *g)
   }
 
   // Find the nodes that should not be dead node in candidates
-  for (auto node : candidates)
+  for (auto it = candidates.begin(); it != candidates.end();)
   {
-    if (auto service = node->dialect()->service<DeadNodeQueryService>())
+    if (auto service = (*it)->dialect()->service<DeadNodeQueryService>())
     {
-      if (!service->isDeadNode(node))
+      if (!service->isDeadNode(*it))
       {
-        candidates.erase(node);
+        it = candidates.erase(it);
+        continue;
       }
     }
+    ++it;
   }
 
   for (auto node : candidates)
diff --git a/compiler/luci/export/src/CircleBuiltinTypesExtractor.h b/compiler/luci/export/src/CircleBuiltinTypesExtractor.h
index efc2a510649..1e1adfca5f5 100644
--- a/compiler/luci/export/src/CircleBuiltinTypesExtractor.h
+++ b/compiler/luci/export/src/CircleBuiltinTypesExtractor.h
@@ -548,6 +548,10 @@ class BuiltinOptionsExtractor final
                                               to_circle_actfunc(node->fusedActivationFunction()))
       .Union();
   }
+  flatbuffers::Offset<void> visit(luci::CircleRmsNorm *node)
+  {
+    return circle::CreateRmsNormOptions(_builder, node->epsilon()).Union();
+  }
 
 protected:
   flatbuffers::FlatBufferBuilder &_builder;
diff --git a/compiler/luci/export/src/CircleExporterUtils.h b/compiler/luci/export/src/CircleExporterUtils.h
index 6d0ebd6cb29..49822d5d775 100644
--- a/compiler/luci/export/src/CircleExporterUtils.h
+++ b/compiler/luci/export/src/CircleExporterUtils.h
@@ -66,7 +66,7 @@ CircleTensorIndex get_tensor_index(loco::Node *node);
 // check if Flatbuffer builder can no longer hold the given amount of the data
 inline bool check_size_limit(const flatbuffers::FlatBufferBuilder &fb, const uint64_t data_size)
 {
-  return data_size > FLATBUFFERS_SIZE_MAX - fb.GetSize();
+  return FLATBUFFERS_SIZE_MAX < data_size + fb.GetSize();
 }
 
 } // namespace luci
diff --git a/compiler/luci/export/src/CircleOps.lst b/compiler/luci/export/src/CircleOps.lst
index 8c693baca23..91b079ac91a 100644
--- a/compiler/luci/export/src/CircleOps.lst
+++ b/compiler/luci/export/src/CircleOps.lst
@@ -141,6 +141,7 @@ CIRCLE_NODE(CircleBCQFullyConnected, BuiltinOperator_BCQ_FULLY_CONNECTED, Builti
 CIRCLE_NODE(CircleBCQGather, BuiltinOperator_BCQ_GATHER, BuiltinOptions_BCQGatherOptions)
 CIRCLE_NODE(CircleGRU, BuiltinOperator_GRU, BuiltinOptions_GRUOptions)
 CIRCLE_NODE(CircleInstanceNorm, BuiltinOperator_INSTANCE_NORM, BuiltinOptions_InstanceNormOptions)
+CIRCLE_NODE(CircleRmsNorm, BuiltinOperator_RMS_NORM, BuiltinOptions_RmsNormOptions)
 // Virtual node(s)
 CIRCLE_VNODE(CircleBidirectionalSequenceLSTMOut)
 CIRCLE_VNODE(CircleConst)
diff --git a/compiler/luci/import/include/luci/Import/Nodes.h b/compiler/luci/import/include/luci/Import/Nodes.h
index f3f4871b469..6fcd5d975d0 100644
--- a/compiler/luci/import/include/luci/Import/Nodes.h
+++ b/compiler/luci/import/include/luci/Import/Nodes.h
@@ -107,6 +107,7 @@
 #include "Nodes/CircleResizeNearestNeighbor.h"
 #include "Nodes/CircleReverseSequence.h"
 #include "Nodes/CircleReverseV2.h"
+#include "Nodes/CircleRmsNorm.h"
 #include "Nodes/CircleRound.h"
 #include "Nodes/CircleRsqrt.h"
 #include "Nodes/CircleScatterNd.h"
diff --git a/compiler/luci/import/include/luci/Import/Nodes/CircleRmsNorm.h b/compiler/luci/import/include/luci/Import/Nodes/CircleRmsNorm.h
new file mode 100644
index 00000000000..a2ebcdf657b
--- /dev/null
+++ b/compiler/luci/import/include/luci/Import/Nodes/CircleRmsNorm.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_IMPORT_OP_CIRCLE_RMS_NORM_H__
+#define __LUCI_IMPORT_OP_CIRCLE_RMS_NORM_H__
+
+#include "luci/Import/GraphBuilder.h"
+
+namespace luci
+{
+
+class CircleRmsNormGraphBuilder : public GraphBuilder
+{
+public:
+  bool validate(const ValidateArgs &args) const final;
+
+private:
+  CircleNode *build_node(const circle::OperatorT &op, const std::vector<CircleNode *> &inputs,
+                         loco::Graph *graph) const final;
+};
+
+} // namespace luci
+
+#endif // __LUCI_IMPORT_OP_CIRCLE_RMS_NORM_H__
diff --git a/compiler/luci/import/src/GraphBuilderRegistry.cpp b/compiler/luci/import/src/GraphBuilderRegistry.cpp
index 29edf8348f3..1e2e8837029 100644
--- a/compiler/luci/import/src/GraphBuilderRegistry.cpp
+++ b/compiler/luci/import/src/GraphBuilderRegistry.cpp
@@ -116,6 +116,7 @@ GraphBuilderRegistry::GraphBuilderRegistry()
   CIRCLE_NODE(RESIZE_NEAREST_NEIGHBOR, CircleResizeNearestNeighborGraphBuilder); // 97
   CIRCLE_NODE(REVERSE_SEQUENCE, CircleReverseSequenceGraphBuilder); // 112
   CIRCLE_NODE(REVERSE_V2, CircleReverseV2GraphBuilder); // 105
+  CIRCLE_NODE(RMS_NORM, CircleRmsNormGraphBuilder); // 255
   CIRCLE_NODE(ROUND, CircleRoundGraphBuilder); // 116
   CIRCLE_NODE(RSQRT, CircleRsqrtGraphBuilder); // 76
   CIRCLE_NODE(SCATTER_ND, CircleScatterNdGraphBuilder); // 122
diff --git a/compiler/luci/import/src/Nodes/CircleRmsNorm.cpp b/compiler/luci/import/src/Nodes/CircleRmsNorm.cpp
new file mode 100644
index 00000000000..28fef764a65
--- /dev/null
+++ b/compiler/luci/import/src/Nodes/CircleRmsNorm.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Import/Nodes/CircleRmsNorm.h"
+
+#include <luci/IR/Nodes/CircleRmsNorm.h>
+
+#include <loco.h>
+
+namespace luci
+{
+
+bool CircleRmsNormGraphBuilder::validate(const ValidateArgs &args) const
+{
+  // TODO check dtypes
+  return GraphBuilder::validate(args, 3);
+}
+
+CircleNode *CircleRmsNormGraphBuilder::build_node(const circle::OperatorT &op,
+                                                  const std::vector<CircleNode *> &inputs,
+                                                  loco::Graph *graph) const
+{
+  auto *node = graph->nodes()->create<CircleRmsNorm>();
+  node->input(inputs.at(0));
+  node->gamma(inputs.at(1));
+  node->beta(inputs.at(2));
+
+  const auto *options = op.builtin_options.AsRmsNormOptions();
+  node->epsilon(options->epsilon);
+
+  return node;
+}
+
+} // namespace luci
diff --git a/compiler/luci/logex/src/CircleNodeSummaryBuilder.cpp b/compiler/luci/logex/src/CircleNodeSummaryBuilder.cpp
index 2ff37afe165..8f27737e969 100644
--- a/compiler/luci/logex/src/CircleNodeSummaryBuilder.cpp
+++ b/compiler/luci/logex/src/CircleNodeSummaryBuilder.cpp
@@ -223,6 +223,7 @@ CircleNodeSummaryBuilder::create_builder(const luci::CircleNode *node)
     CIRCLE_NODE(RESIZE_NEAREST_NEIGHBOR, CircleResizeNearestNeighborSummaryBuilder)
     CIRCLE_NODE(REVERSE_SEQUENCE, CircleReverseSequenceSummaryBuilder)
     CIRCLE_NODE(REVERSE_V2, CircleReverseV2SummaryBuilder)
+    CIRCLE_NODE(RMS_NORM, CircleRmsNormSummaryBuilder)
     CIRCLE_NODE(ROUND, CircleRoundSummaryBuilder)
     CIRCLE_NODE(RSQRT, CircleRsqrtSummaryBuilder)
     CIRCLE_NODE(SCATTER_ND, CircleScatterNdSummaryBuilder)
diff --git a/compiler/luci/logex/src/CircleNodeSummaryBuilders.cpp b/compiler/luci/logex/src/CircleNodeSummaryBuilders.cpp
index f0a92ef91d1..1d605d3946c 100644
--- a/compiler/luci/logex/src/CircleNodeSummaryBuilders.cpp
+++ b/compiler/luci/logex/src/CircleNodeSummaryBuilders.cpp
@@ -890,6 +890,18 @@ std::vector<std::string> CircleReverseV2SummaryBuilder::get_input_names(const lu
   return {"tensor", "axis"};
 }
 
+std::vector<std::string> CircleRmsNormSummaryBuilder::get_input_names(const luci::CircleNode *)
+{
+  return {"input", "gamma", "beta"};
+}
+
+void CircleRmsNormSummaryBuilder::build_attributes(const luci::CircleNode *node,
+                                                   locop::NodeSummary &s)
+{
+  auto rmsnorm = loco::must_cast<const luci::CircleRmsNorm *>(node);
+  s.args().append("epsilon", std::to_string(rmsnorm->epsilon()));
+}
+
 std::vector<std::string> CircleScatterNdSummaryBuilder::get_input_names(const luci::CircleNode *)
 {
   return {"indices", "updates", "shape"};
diff --git a/compiler/luci/logex/src/CircleNodeSummaryBuilders.h b/compiler/luci/logex/src/CircleNodeSummaryBuilders.h
index f489e9b6eb6..9ca64d49064 100644
--- a/compiler/luci/logex/src/CircleNodeSummaryBuilders.h
+++ b/compiler/luci/logex/src/CircleNodeSummaryBuilders.h
@@ -583,6 +583,13 @@ class CircleReverseV2SummaryBuilder final : public CircleNodeSummaryBuilder
   std::vector<std::string> get_input_names(const luci::CircleNode *);
 };
 
+class CircleRmsNormSummaryBuilder final : public CircleNodeSummaryBuilder
+{
+private:
+  std::vector<std::string> get_input_names(const luci::CircleNode *);
+  void build_attributes(const luci::CircleNode *node, locop::NodeSummary &s);
+};
+
 class CircleRoundSummaryBuilder final : public CircleNodeWithXSummaryBuilder
 {
 };
diff --git a/compiler/luci/partition/include/luci/ConnectNode.h b/compiler/luci/partition/include/luci/ConnectNode.h
index 7539aaf6bee..592dd3b4d29 100644
--- a/compiler/luci/partition/include/luci/ConnectNode.h
+++ b/compiler/luci/partition/include/luci/ConnectNode.h
@@ -187,6 +187,7 @@ class ConnectNode final : public luci::CircleNodeVisitor<void>
   void visit(const luci::CircleBCQGather *) final;
   void visit(const luci::CircleGRU *) final;
   void visit(const luci::CircleInstanceNorm *) final;
+  void visit(const luci::CircleRmsNorm *) final;
 
   // NOTE CircleInput and CircleOutput are not handled here as these need
   // link with graph I/O
diff --git a/compiler/luci/partition/src/Nodes/CircleRmsNorm.cpp b/compiler/luci/partition/src/Nodes/CircleRmsNorm.cpp
new file mode 100644
index 00000000000..fa7f58af357
--- /dev/null
+++ b/compiler/luci/partition/src/Nodes/CircleRmsNorm.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/ConnectNode.h"
+
+namespace
+{
+
+void connect(luci::ConnectNode *cn, const luci::CircleRmsNorm *node)
+{
+  auto *cloned = loco::must_cast<luci::CircleRmsNorm *>(cn->find_clone(node));
+
+  luci::CircleNode *input = loco::must_cast<luci::CircleNode *>(node->input());
+  luci::CircleNode *gamma = loco::must_cast<luci::CircleNode *>(node->gamma());
+  luci::CircleNode *beta = loco::must_cast<luci::CircleNode *>(node->beta());
+
+  cloned->input(cn->find_clone(input));
+  cloned->gamma(cn->find_clone(gamma));
+  cloned->beta(cn->find_clone(beta));
+}
+
+} // namespace
+
+namespace luci
+{
+
+void ConnectNode::visit(const luci::CircleRmsNorm *node) { connect(this, node); }
+
+} // namespace luci
diff --git a/compiler/luci/partition/src/Nodes/CircleRmsNorm.test.cpp b/compiler/luci/partition/src/Nodes/CircleRmsNorm.test.cpp
new file mode 100644
index 00000000000..625e66c2a14
--- /dev/null
+++ b/compiler/luci/partition/src/Nodes/CircleRmsNorm.test.cpp
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/ConnectNode.h"
+
+#include "ConnectNode.test.h"
+
+#include <luci/Service/CircleNodeClone.h>
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+using namespace luci::test;
+
+class NodeGraphlet : public NodeGraphletT<luci::CircleRmsNorm>
+{
+public:
+  NodeGraphlet() = default;
+
+public:
+  void init(loco::Graph *g) override { NodeGraphletT<luci::CircleRmsNorm>::init(g); }
+};
+
+class TestNodeGraph : public TestIsOGraph<3>, public NodeGraphlet
+{
+public:
+  TestNodeGraph() = default;
+
+public:
+  void init(const ShapeU32 shape)
+  {
+    TestIsOGraph<3>::init({shape, shape, shape}, shape);
+    NodeGraphlet::init(g());
+
+    node()->input(input(0));
+    node()->gamma(input(1));
+    node()->beta(input(2));
+
+    output()->from(node());
+  }
+};
+
+} // namespace
+
+TEST(ConnectNodeTest, connect_RmsNorm)
+{
+  TestNodeGraph tng;
+  tng.init({2, 3});
+
+  ConnectionTestHelper cth;
+  cth.prepare_inputs(&tng);
+
+  auto *node = tng.node();
+  ASSERT_NO_THROW(loco::must_cast<luci::CircleRmsNorm *>(node));
+
+  auto *clone = luci::clone_node(node, cth.graph_clone());
+  ASSERT_NO_THROW(loco::must_cast<luci::CircleRmsNorm *>(clone));
+
+  cth.clone_connect(node, clone);
+
+  ASSERT_EQ(3, clone->arity());
+  ASSERT_EQ(cth.inputs(0), clone->arg(0));
+  ASSERT_EQ(cth.inputs(1), clone->arg(1));
+  ASSERT_EQ(cth.inputs(2), clone->arg(2));
+}
+
+TEST(ConnectNodeTest, connect_RmsNorm_NEG)
+{
+  TestNodeGraph tng;
+  tng.init({2, 3});
+
+  ConnectionTestHelper cth;
+  cth.prepare_inputs_miss(&tng);
+
+  auto *node = tng.node();
+  ASSERT_NO_THROW(loco::must_cast<luci::CircleRmsNorm *>(node));
+
+  auto *clone = luci::clone_node(node, cth.graph_clone());
+  ASSERT_NO_THROW(loco::must_cast<luci::CircleRmsNorm *>(clone));
+
+  EXPECT_ANY_THROW(cth.clone_connect(node, clone));
+}
diff --git a/compiler/luci/pass/include/luci/CircleOptimizer.h b/compiler/luci/pass/include/luci/CircleOptimizer.h
index d4f675f36fe..a9bf652e323 100644
--- a/compiler/luci/pass/include/luci/CircleOptimizer.h
+++ b/compiler/luci/pass/include/luci/CircleOptimizer.h
@@ -78,6 +78,7 @@ class CircleOptimizer final
     FusePRelu,
     FuseGelu,
     FuseRsqrt,
+    FuseRmsNorm,
     ShuffleWeightTo16x1Float32,
     RemoveRedundantTranspose,
     ReplaceMulAddWithDepthwiseConv,
diff --git a/compiler/luci/pass/include/luci/Pass/FuseRmsNormPass.h b/compiler/luci/pass/include/luci/Pass/FuseRmsNormPass.h
new file mode 100644
index 00000000000..54acc1a26ef
--- /dev/null
+++ b/compiler/luci/pass/include/luci/Pass/FuseRmsNormPass.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_FUSE_RMSNORM_PASS_H__
+#define __LUCI_FUSE_RMSNORM_PASS_H__
+
+#include <logo/Pass.h>
+
+namespace luci
+{
+
+/**
+ * @brief  Class to fuse a certain subgraph pattern into CircleRmsNorm
+ */
+struct FuseRmsNormPass final : public logo::Pass
+{
+  const char *name(void) const final { return "luci::FuseRmsNormPass"; }
+
+  bool run(loco::Graph *g) final;
+};
+
+} // namespace luci
+
+#endif // __LUCI_FUSE_RMSNORM_PASS_H__
diff --git a/compiler/luci/pass/src/CircleOptimizer.cpp b/compiler/luci/pass/src/CircleOptimizer.cpp
index bf18b973d6d..154b1f75963 100644
--- a/compiler/luci/pass/src/CircleOptimizer.cpp
+++ b/compiler/luci/pass/src/CircleOptimizer.cpp
@@ -56,6 +56,7 @@
 #include "luci/Pass/FuseSliceWithTConvPass.h"
 #include "luci/Pass/FuseHorizontalFullyConnectedPass.h"
 #include "luci/Pass/FuseTransposeWithMeanPass.h"
+#include "luci/Pass/FuseRmsNormPass.h"
 #include "luci/Pass/MakeBatchNormGammaPositivePass.h"
 #include "luci/Pass/RemoveDuplicateConstPass.h"
 #include "luci/Pass/RemoveFakeQuantPass.h"
@@ -344,6 +345,7 @@ void CircleOptimizer::optimize(loco::Graph *g) const
   option_to_pass[Options::Algorithm::FuseRsqrt] = &createPassInstance<luci::FuseRsqrtPass>;
   option_to_pass[Options::Algorithm::FuseHorizontalFullyConnected] = &createPassInstance<luci::FuseHorizontalFullyConnectedPass>;
   option_to_pass[Options::Algorithm::FuseTransposeWithMean] = &createPassInstance<luci::FuseTransposeWithMeanPass>;
+  option_to_pass[Options::Algorithm::FuseRmsNorm] = &createPassInstance<luci::FuseRmsNormPass>;
   option_to_pass[Options::Algorithm::FoldAddV2] = &createPassInstance<luci::FoldAddV2Pass>;
   option_to_pass[Options::Algorithm::FoldCast] = &createPassInstance<luci::FoldCastPass>;
   option_to_pass[Options::Algorithm::FoldDensify] = &createPassInstance<luci::FoldDensifyPass>;
diff --git a/compiler/luci/pass/src/ConvertNCHWToNHWCPass.cpp b/compiler/luci/pass/src/ConvertNCHWToNHWCPass.cpp
index 74abb7e343d..2f6e2552437 100644
--- a/compiler/luci/pass/src/ConvertNCHWToNHWCPass.cpp
+++ b/compiler/luci/pass/src/ConvertNCHWToNHWCPass.cpp
@@ -431,38 +431,6 @@ luci::CircleConst *create_NHWC_from_NCHW(luci::CircleConst *constant)
   return nhwc_const;
 }
 
-// NOTE Following conditions can be extended later
-//
-// Find PAD with an NCHW pattern described below
-//   - Paddings shape : [4, 2]
-//   - Paddings value : [[0, 0], [0, 0], [h_t, h_b], [w_t, w_b]]]
-bool is_NCHW(const luci::CirclePad *node)
-{
-  const auto paddings = dynamic_cast<luci::CircleConst *>(node->paddings());
-  // Non-const paddings is not supported
-  if (paddings == nullptr)
-    return false;
-
-  if (paddings->rank() != 2)
-    return false;
-
-  if (paddings->dim(0).value() != 4 || paddings->dim(1).value() != 2)
-    return false;
-
-  // Only check the first two dimensions
-  for (uint32_t dim = 0; dim < 2; dim++)
-  {
-    for (uint32_t i = 0; i < 2; i++)
-    {
-      auto data = paddings->at<loco::DataType::S32>(dim * 2 + i);
-      if (data != 0)
-        return false;
-    }
-  }
-
-  return true;
-}
-
 template <loco::DataType T> bool check_NC_padding_zero(const luci::CircleConst *node)
 {
   assert(node->dtype() == T); // FIX_CALLER_UNLESS
@@ -480,8 +448,12 @@ template <loco::DataType T> bool check_NC_padding_zero(const luci::CircleConst *
   return true;
 }
 
-// NOTE Copied from is_NCHW(CirclePad)
-bool is_NCHW(const luci::CirclePadV2 *node)
+// NOTE Following conditions can be extended later
+//
+// Find PAD with an NCHW pattern described below
+//   - Paddings shape : [4, 2]
+//   - Paddings value : [[0, 0], [0, 0], [h_t, h_b], [w_t, w_b]]
+template <class T> bool is_NCHW_pad_op(const T *node)
 {
   const auto paddings = dynamic_cast<luci::CircleConst *>(node->paddings());
   // Non-const paddings is not supported
   if (paddings == nullptr)
     return false;
 
   if (paddings->rank() != 2)
     return false;
 
   if (paddings->dim(0).value() != 4 || paddings->dim(1).value() != 2)
     return false;
 
@@ -513,34 +485,6 @@ bool is_NCHW(const luci::CirclePadV2 *node)
   return true;
 }
 
-// NOTE Copied from is_NCHW(CirclePad)
-bool is_NCHW(const luci::CircleMirrorPad *node)
-{
-  const auto paddings = dynamic_cast<luci::CircleConst *>(node->paddings());
-  // Non-const paddings is not supported
-  if (paddings == nullptr)
-    return false;
-
-  if (paddings->rank() != 2)
-    return false;
-
-  if (paddings->dim(0).value() != 4 || paddings->dim(1).value() != 2)
-    return false;
-
-  // Only check the first two dimensions
-  for (uint32_t dim = 0; dim < 2; dim++)
-  {
-    for (uint32_t i = 0; i < 2; i++)
-    {
-      auto data = paddings->at<loco::DataType::S32>(dim * 2 + i);
-      if (data != 0)
-        return false;
-    }
-  }
-
-  return true;
-}
-
 bool is_const(const loco::Node *node)
 {
   if (not dynamic_cast<const luci::CircleConst *>(node))
@@ -715,6 +659,106 @@ template <class T> bool convert_eltwise_binary(T *node)
   return true;
 }
 
+template <class T> bool convert_reduction(T *node)
+{
+  auto input = loco::must_cast<luci::CircleNode *>(node->input());
+  if (input->rank() != 4)
+    return false;
+
+  auto rindices = dynamic_cast<luci::CircleConst *>(node->reduction_indices());
+  if (not rindices)
+    return false;
+
+  auto nhwc_rindices = create_NHWC_rindices(rindices);
+  if (not nhwc_rindices)
+    return false;
+
+  auto pre_trans = create_pre_transpose(node);
+  pre_trans->a(input);
+  node->input(pre_trans);
+
+  // Do shape inference for this node again.
+  node->shape_status(luci::ShapeStatus::UNDEFINED);
+
+  node->reduction_indices(nhwc_rindices);
+
+  if (node->keep_dims())
+  {
+    auto post_trans = create_post_transpose(node);
+    loco::replace(node).with(post_trans);
+
+    post_trans->a(node);
+
+    return true;
+  }
+
+  // node->keep_dims() == false
+  // 1D output never needs a transpose
+  if (node->rank() <= 1)
+    return true;
+
+  std::vector<bool> reduced_dims_nhwc(4, false);
+  uint32_t num_reduced_indices = nhwc_rindices->size<loco::DataType::S32>();
+
+  for (uint32_t ri = 0; ri < num_reduced_indices; ++ri)
+  {
+    reduced_dims_nhwc[nhwc_rindices->at<loco::DataType::S32>(ri)] = true;
+  }
+
+  // if channel dimension has been reduced, we don't need a transpose
+  if (reduced_dims_nhwc[3])
+    return true;
+
+  // likewise, if both space dimensions are reduced, no transpose is needed
+  if (reduced_dims_nhwc[1] && reduced_dims_nhwc[2])
+    return true;
+
+  std::vector<int32_t> post_trans_ind;
+  // case 1: only N is reduced
+  if (num_reduced_indices == 1 && reduced_dims_nhwc[0])
+    post_trans_ind = {2, 0, 1};
+
+  // case 2: only H or W is reduced
+  if (num_reduced_indices == 1 && (reduced_dims_nhwc[1] || reduced_dims_nhwc[2]))
+    post_trans_ind = {0, 2, 1};
+
+  // case 3: N and either H or W are reduced
+  if (num_reduced_indices == 2)
+    post_trans_ind = {1, 0};
+
+  auto post_trans = create_Nd_transpose(node, post_trans_ind);
+  loco::replace(node).with(post_trans);
+
+  post_trans->a(node);
+
+  return true;
+}
+
+template <class T> bool convert_pad(T *node)
+{
+  if (!is_NCHW_pad_op(node))
+    return false;
+
+  const auto pred_node = loco::must_cast<luci::CircleNode *>(node->input());
+  auto pre_trans = create_pre_transpose(node);
+  pre_trans->a(pred_node);
+  node->input(pre_trans);
+
+  auto nchw_paddings = loco::must_cast<luci::CircleConst *>(node->paddings());
+  const auto nhwc_paddings = create_NHWC_paddings(nchw_paddings);
+  node->paddings(nhwc_paddings);
+
+  // Do shape inference for this node again.
+ node->shape_status(luci::ShapeStatus::UNDEFINED); + + auto post_trans = create_post_transpose(node); + loco::replace(node).with(post_trans); + + post_trans->a(node); + + return true; +} + class ConvertNCHWToNHWC final : public luci::CircleNodeMutableVisitor { // Default @@ -854,80 +898,7 @@ class ConvertNCHWToNHWC final : public luci::CircleNodeMutableVisitor return true; } - bool visit(luci::CircleMean *node) - { - auto input = loco::must_cast(node->input()); - if (input->rank() != 4) - return false; - - auto rindices = dynamic_cast(node->reduction_indices()); - if (not rindices) - return false; - - auto nhwc_rindices = create_NHWC_rindices(rindices); - if (not nhwc_rindices) - return false; - - auto pre_trans = create_pre_transpose(node); - pre_trans->a(input); - node->input(pre_trans); - - // Do shape inference for this node again. - node->shape_status(luci::ShapeStatus::UNDEFINED); - - node->reduction_indices(nhwc_rindices); - - if (node->keep_dims()) - { - auto post_trans = create_post_transpose(node); - loco::replace(node).with(post_trans); - - post_trans->a(node); - - return true; - } - - // node->keep_dims() == false - // 1D output never needs a transpose - if (node->rank() <= 1) - return true; - - std::vector reduced_dims_nhwc(4, false); - uint32_t num_reduced_indices = nhwc_rindices->size(); - - for (uint32_t ri = 0; ri < num_reduced_indices; ++ri) - { - reduced_dims_nhwc[nhwc_rindices->at(ri)] = true; - } - - // if channel dimension has been reduced, we don't need a transpose - if (reduced_dims_nhwc[3]) - return true; - - // likewise, if both space dimensions are reduced, no transpose is needed - if (reduced_dims_nhwc[1] && reduced_dims_nhwc[2]) - return true; - - std::vector post_trans_ind; - // case 1: only N is reduced - if (num_reduced_indices == 1 && reduced_dims_nhwc[0]) - post_trans_ind = {2, 0, 1}; - - // case 2: only H or W is reduced - if (num_reduced_indices == 1 && (reduced_dims_nhwc[1] || reduced_dims_nhwc[2])) - post_trans_ind = {0, 2, 1}; - - // case 3: N and either H or W are reduced - if (num_reduced_indices == 2) - post_trans_ind = {1, 0}; - - auto post_trans = create_Nd_transpose(node, post_trans_ind); - loco::replace(node).with(post_trans); - - post_trans->a(node); - - return true; - } + bool visit(luci::CircleMean *node) { return convert_reduction(node); } bool visit(luci::CircleMinimum *node) { @@ -959,236 +930,19 @@ class ConvertNCHWToNHWC final : public luci::CircleNodeMutableVisitor return true; } - bool visit(luci::CircleMirrorPad *node) - { - if (!is_NCHW(node)) - return false; - - const auto pred_node = loco::must_cast(node->input()); - auto pre_trans = create_pre_transpose(node); - pre_trans->a(pred_node); - node->input(pre_trans); - - auto nchw_paddings = loco::must_cast(node->paddings()); - const auto nhwc_paddings = create_NHWC_paddings(nchw_paddings); - node->paddings(nhwc_paddings); - - // Do shape inference for this node again. 
- node->shape_status(luci::ShapeStatus::UNDEFINED); - - auto post_trans = create_post_transpose(node); - loco::replace(node).with(post_trans); - - post_trans->a(node); - - return true; - } + bool visit(luci::CircleMirrorPad *node) { return convert_pad(node); } bool visit(luci::CircleMul *node) { return convert_eltwise_binary(node); } bool visit(luci::CircleNeg *node) { return convert_unary_x(node); } - bool visit(luci::CirclePad *node) - { - if (!is_NCHW(node)) - return false; - - const auto pred_node = loco::must_cast(node->input()); - auto pre_trans = create_pre_transpose(node); - pre_trans->a(pred_node); - node->input(pre_trans); - - auto nchw_paddings = loco::must_cast(node->paddings()); - const auto nhwc_paddings = create_NHWC_paddings(nchw_paddings); - node->paddings(nhwc_paddings); - - // Do shape inference for this node again. - node->shape_status(luci::ShapeStatus::UNDEFINED); - - auto post_trans = create_post_transpose(node); - loco::replace(node).with(post_trans); - - post_trans->a(node); - - return true; - } - - bool visit(luci::CirclePadV2 *node) - { - if (!is_NCHW(node)) - return false; - - const auto pred_node = loco::must_cast(node->input()); - auto pre_trans = create_pre_transpose(node); - pre_trans->a(pred_node); - node->input(pre_trans); - - auto nchw_paddings = loco::must_cast(node->paddings()); - const auto nhwc_paddings = create_NHWC_paddings(nchw_paddings); - node->paddings(nhwc_paddings); - - // Do shape inference for this node again. - node->shape_status(luci::ShapeStatus::UNDEFINED); - - auto post_trans = create_post_transpose(node); - loco::replace(node).with(post_trans); - - post_trans->a(node); - - return true; - } - - // TODO Reduce duplicate code with CircleMean - bool visit(luci::CircleReduceMax *node) - { - auto input = loco::must_cast(node->input()); - if (input->rank() != 4) - return false; - - auto rindices = dynamic_cast(node->reduction_indices()); - if (not rindices) - return false; - - auto nhwc_rindices = create_NHWC_rindices(rindices); - if (not nhwc_rindices) - return false; - - auto pre_trans = create_pre_transpose(node); - pre_trans->a(input); - node->input(pre_trans); - - // Do shape inference for this node again. 
- node->shape_status(luci::ShapeStatus::UNDEFINED); - - node->reduction_indices(nhwc_rindices); - - if (node->keep_dims()) - { - auto post_trans = create_post_transpose(node); - loco::replace(node).with(post_trans); - - post_trans->a(node); - - return true; - } - - // The below codes handle the cases where node->keep_dims() == false - // 1D output never needs a transpose - if (node->rank() <= 1) - return true; - - std::vector reduced_dims_nhwc(4, false); - uint32_t num_reduced_indices = nhwc_rindices->size(); - - for (uint32_t ri = 0; ri < num_reduced_indices; ++ri) - { - reduced_dims_nhwc[nhwc_rindices->at(ri)] = true; - } - - // if channel dimension has been reduced, we don't need a transpose - if (reduced_dims_nhwc[3]) - return true; - - // likewise, if both space dimensions are reduced, no transpose is needed - if (reduced_dims_nhwc[1] && reduced_dims_nhwc[2]) - return true; - - std::vector post_trans_ind; - // case 1: only N is reduced - if (num_reduced_indices == 1 && reduced_dims_nhwc[0]) - post_trans_ind = {2, 0, 1}; - - // case 2: only H or W is reduced - if (num_reduced_indices == 1 && (reduced_dims_nhwc[1] || reduced_dims_nhwc[2])) - post_trans_ind = {0, 2, 1}; - - // case 3: N and either H or W are reduced - if (num_reduced_indices == 2) - post_trans_ind = {1, 0}; - - auto post_trans = create_Nd_transpose(node, post_trans_ind); - loco::replace(node).with(post_trans); - - post_trans->a(node); - - return true; - } - - // TODO Reduce duplicate codes with CircleReduceMax - bool visit(luci::CircleReduceMin *node) - { - auto input = loco::must_cast(node->input()); - if (input->rank() != 4) - return false; - - auto rindices = dynamic_cast(node->reduction_indices()); - if (not rindices) - return false; - - auto nhwc_rindices = create_NHWC_rindices(rindices); - if (not nhwc_rindices) - return false; + bool visit(luci::CirclePad *node) { return convert_pad(node); } - auto pre_trans = create_pre_transpose(node); - pre_trans->a(input); - node->input(pre_trans); - - // Do shape inference for this node again. 
- node->shape_status(luci::ShapeStatus::UNDEFINED); + bool visit(luci::CirclePadV2 *node) { return convert_pad(node); } - node->reduction_indices(nhwc_rindices); + bool visit(luci::CircleReduceMax *node) { return convert_reduction(node); } - if (node->keep_dims()) - { - auto post_trans = create_post_transpose(node); - loco::replace(node).with(post_trans); - - post_trans->a(node); - - return true; - } - - // The below codes handle the cases where node->keep_dims() == false - // 1D output never needs a transpose - if (node->rank() <= 1) - return true; - - std::vector reduced_dims_nhwc(4, false); - uint32_t num_reduced_indices = nhwc_rindices->size(); - - for (uint32_t ri = 0; ri < num_reduced_indices; ++ri) - { - reduced_dims_nhwc[nhwc_rindices->at(ri)] = true; - } - - // if channel dimension has been reduced, we don't need a transpose - if (reduced_dims_nhwc[3]) - return true; - - // likewise, if both space dimensions are reduced, no transpose is needed - if (reduced_dims_nhwc[1] && reduced_dims_nhwc[2]) - return true; - - std::vector post_trans_ind; - // case 1: only N is reduced - if (num_reduced_indices == 1 && reduced_dims_nhwc[0]) - post_trans_ind = {2, 0, 1}; - - // case 2: only H or W is reduced - if (num_reduced_indices == 1 && (reduced_dims_nhwc[1] || reduced_dims_nhwc[2])) - post_trans_ind = {0, 2, 1}; - - // case 3: N and either H or W are reduced - if (num_reduced_indices == 2) - post_trans_ind = {1, 0}; - - auto post_trans = create_Nd_transpose(node, post_trans_ind); - loco::replace(node).with(post_trans); - - post_trans->a(node); - - return true; - } + bool visit(luci::CircleReduceMin *node) { return convert_reduction(node); } bool visit(luci::CircleRelu *node) { return convert_unary_features(node); } diff --git a/compiler/luci/pass/src/ConvertToFakeQuantizedModelPass.cpp b/compiler/luci/pass/src/ConvertToFakeQuantizedModelPass.cpp index 4a0bc663369..6f40891feb3 100644 --- a/compiler/luci/pass/src/ConvertToFakeQuantizedModelPass.cpp +++ b/compiler/luci/pass/src/ConvertToFakeQuantizedModelPass.cpp @@ -219,6 +219,7 @@ struct FakeQuantize final : public luci::CircleNodeMutableVisitor void visit(luci::CircleRelu6 *node) { fq_activation(node); } void visit(luci::CircleResizeBilinear *node) { fq_activation(node); } void visit(luci::CircleResizeNearestNeighbor *node) { fq_activation(node); } + void visit(luci::CircleRmsNorm *node) { fq_activation(node); } void visit(luci::CircleRsqrt *node) { fq_activation(node); } void visit(luci::CircleSoftmax *node) { fq_activation(node); } void visit(luci::CircleSqrt *node) { fq_activation(node); } diff --git a/compiler/luci/pass/src/FuseRmsNormPass.cpp b/compiler/luci/pass/src/FuseRmsNormPass.cpp new file mode 100644 index 00000000000..f2ecbffa77f --- /dev/null +++ b/compiler/luci/pass/src/FuseRmsNormPass.cpp @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "luci/Pass/FuseRmsNormPass.h"
+#include "helpers/NodeFiller.h"
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/Profile/CircleNodeOrigin.h>
+#include <loco.h>
+
+#include <cassert>
+#include <vector>
+
+namespace
+{
+
+/**
+ * The diagram below shows the RMS normalization pattern to fuse.
+ * - this pattern will be replaced with one RmsNorm
+ *
+ *           [In]
+ *            |
+ *            V
+ *     +---- ifm ----+
+ *     |      |      |
+ *     |      V      |
+ *     |     mul <---+
+ *     |      |
+ *     |      V
+ *     |     mean
+ *     |      |
+ *     |      V
+ *     |  add_epsilon
+ *     |      |
+ *     |      V
+ *     |    rsqrt
+ *     |      |
+ *     |      V
+ *     +---> mul_input
+ *            |
+ *            V
+ *          [Out]
+ */
+
+class RmsNormPattern final
+{
+public:
+  RmsNormPattern(luci::CircleMul *candidate)
+  {
+    assert(candidate); // FIX_CALLER_UNLESS
+    _mul_input = candidate;
+  }
+
+public:
+  bool matched();
+
+public:
+  luci::CircleNode *_ifm = nullptr;
+  luci::CircleMul *_mul_pow = nullptr;
+  luci::CircleMean *_mean = nullptr;
+  luci::CircleAdd *_add_epsilon = nullptr;
+  luci::CircleRsqrt *_rsqrt = nullptr;
+  luci::CircleMul *_mul_input = nullptr;
+  luci::CircleConst *_const_epsilon = nullptr;
+  luci::CircleConst *_const_gamma = nullptr;
+  luci::CircleConst *_const_beta = nullptr;
+};
+
+#define CHECK_OR_FALSE(condition) \
+  if (not(condition))             \
+    return false;
+
+luci::CircleConst *make_const_one(loco::Graph *graph, float value)
+{
+  auto const_one = graph->nodes()->create<luci::CircleConst>();
+  const_one->dtype(loco::DataType::FLOAT32);
+  const_one->rank(1);
+  const_one->dim(0) = 1;
+  const_one->shape_status(luci::ShapeStatus::VALID);
+  const_one->size<loco::DataType::FLOAT32>(1);
+  const_one->at<loco::DataType::FLOAT32>(0) = value;
+  return const_one;
+}
+
+bool RmsNormPattern::matched()
+{
+  CHECK_OR_FALSE(luci::fill(&_ifm, &_rsqrt).with_commutative_args_of(_mul_input));
+  _add_epsilon = dynamic_cast<luci::CircleAdd *>(_rsqrt->x());
+  CHECK_OR_FALSE(_add_epsilon);
+  CHECK_OR_FALSE(luci::fill(&_mean, &_const_epsilon).with_commutative_args_of(_add_epsilon));
+  CHECK_OR_FALSE(_const_epsilon->dtype() == loco::DataType::FLOAT32);
+  _mul_pow = dynamic_cast<luci::CircleMul *>(_mean->input());
+  CHECK_OR_FALSE(_mul_pow);
+  CHECK_OR_FALSE(_mul_pow->x() == _ifm);
+  CHECK_OR_FALSE(_mul_pow->y() == _ifm);
+
+  assert(_const_gamma == nullptr);
+  assert(_const_beta == nullptr);
+
+  /*
+    NOTE: Current FuseRmsNormPass assumes no gamma(scale) and beta(bias),
+    but the RmsNorm kernel expects gamma and beta.
+    So, default gamma(1.0) and beta(0.0) are created here.
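+
+    For reference, the fused kernel is expected to compute
+      y = gamma * x / sqrt(mean(x^2) + epsilon) + beta
+    which, with the default gamma(1.0) and beta(0.0), reduces to the
+    matched subgraph: x * rsqrt(mean(x^2) + epsilon).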
+  */
+  auto graph = _mul_input->graph();
+  _const_gamma = make_const_one(graph, 1.0f);
+  _const_beta = make_const_one(graph, 0.0f);
+  _const_gamma->name(_mul_input->name() + "/gamma");
+  _const_beta->name(_mul_input->name() + "/beta");
+
+  return true;
+}
+#undef CHECK_OR_FALSE
+
+class FuseRmsNorm final
+{
+public:
+  FuseRmsNorm(const RmsNormPattern *p) : _p(p) {}
+
+public:
+  void apply(void);
+
+private:
+  luci::CircleRmsNorm *create_rms_norm(loco::Graph *graph);
+
+private:
+  const RmsNormPattern *_p = nullptr;
+};
+
+luci::CircleRmsNorm *FuseRmsNorm::create_rms_norm(loco::Graph *graph)
+{
+  assert(graph);
+
+  auto rms_norm = graph->nodes()->create<luci::CircleRmsNorm>();
+  rms_norm->input(_p->_ifm);
+  rms_norm->gamma(_p->_const_gamma);
+  rms_norm->beta(_p->_const_beta);
+  float epsilon = _p->_const_epsilon->at<loco::DataType::FLOAT32>(0);
+  rms_norm->epsilon(epsilon);
+
+  rms_norm->name("FusedRmsNorm/" + _p->_mul_input->name());
+
+  return rms_norm;
+}
+
+void FuseRmsNorm::apply()
+{
+  auto graph = _p->_mul_input->graph();
+
+  auto rms_norm = create_rms_norm(graph);
+
+  // set origin
+  std::vector<std::shared_ptr<luci::CircleNodeOrigin>> origin_vec{
+    luci::get_origin(_p->_mul_pow), luci::get_origin(_p->_mean),
+    luci::get_origin(_p->_add_epsilon), luci::get_origin(_p->_rsqrt),
+    luci::get_origin(_p->_mul_input),
+  };
+
+  luci::add_origin(rms_norm, luci::composite_origin(origin_vec));
+
+  replace(_p->_mul_input).with(rms_norm);
+}
+
+} // namespace
+
+namespace
+{
+
+bool fuse_rms_norm(luci::CircleMul *mul)
+{
+  assert(mul);
+
+  RmsNormPattern pattern(mul);
+  if (pattern.matched())
+  {
+    FuseRmsNorm fuse(&pattern);
+    fuse.apply();
+    return true;
+  }
+
+  return false;
+}
+
+} // namespace
+
+namespace luci
+{
+
+bool FuseRmsNormPass::run(loco::Graph *g)
+{
+  bool changed = false;
+
+  for (auto node : loco::active_nodes(loco::output_nodes(g)))
+  {
+    auto mul = dynamic_cast<luci::CircleMul *>(node);
+    if (not mul)
+      continue;
+
+    if (fuse_rms_norm(mul))
+      changed = true;
+  }
+  return changed;
+}
+
+} // namespace luci
diff --git a/compiler/luci/pass/src/FuseRmsNormPass.test.cpp b/compiler/luci/pass/src/FuseRmsNormPass.test.cpp
new file mode 100644
index 00000000000..07bb97fe01a
--- /dev/null
+++ b/compiler/luci/pass/src/FuseRmsNormPass.test.cpp
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/FuseRmsNormPass.h"
+
+#include <gtest/gtest.h>
+
+TEST(FuseRmsNormPassTest, name)
+{
+  luci::FuseRmsNormPass pass;
+  auto const name = pass.name();
+  ASSERT_NE(nullptr, name);
+}
diff --git a/compiler/luci/pass/src/QuantizeActivation.h b/compiler/luci/pass/src/QuantizeActivation.h
index 162ec2c66ae..bd8bd2a8167 100644
--- a/compiler/luci/pass/src/QuantizeActivation.h
+++ b/compiler/luci/pass/src/QuantizeActivation.h
@@ -75,6 +75,7 @@ struct QuantizeConstInputActivation final : public luci::CircleNodeMutableVisitor<void>
   SKIP(luci::CircleFullyConnected)
   SKIP(luci::CircleInstanceNorm)
   SKIP(luci::CirclePRelu)
+  SKIP(luci::CircleRmsNorm)
   SKIP(luci::CircleTransposeConv)
 
   // Handled in PropagateQParamBackwardPass
diff --git a/compiler/luci/pass/src/QuantizePreCheckerPass.cpp b/compiler/luci/pass/src/QuantizePreCheckerPass.cpp
index 4b3b7e33095..1eea4f66d5d 100644
--- a/compiler/luci/pass/src/QuantizePreCheckerPass.cpp
+++ b/compiler/luci/pass/src/QuantizePreCheckerPass.cpp
@@ -84,6 +84,7 @@ struct ConstInputChecker final : public luci::CircleNodeMutableVisitor<void>
   CHECK_NODE_WITH_TWO_INPUT_CONST(luci::CircleDepthwiseConv2D, filter, bias)
   CHECK_NODE_WITH_TWO_INPUT_CONST(luci::CircleFullyConnected, weights, bias)
   CHECK_NODE_WITH_TWO_INPUT_CONST(luci::CircleInstanceNorm, gamma, beta)
+  CHECK_NODE_WITH_TWO_INPUT_CONST(luci::CircleRmsNorm, gamma, beta)
 
   // Ops that receive three const nodes as an inputs
   CHECK_NODE_WITH_THREE_INPUT_CONST(luci::CircleTransposeConv, inputSizes, filter, bias)
diff --git a/compiler/luci/pass/src/QuantizePreCheckerPass.test.cpp b/compiler/luci/pass/src/QuantizePreCheckerPass.test.cpp
index 8f6a96f3330..3f6295f4a2e 100644
--- a/compiler/luci/pass/src/QuantizePreCheckerPass.test.cpp
+++ b/compiler/luci/pass/src/QuantizePreCheckerPass.test.cpp
@@ -192,6 +192,49 @@ class SimpleInstanceNormGraph
   luci::CircleOutput *output = nullptr;
 };
 
+class SimpleRmsNormGraph
+{
+public:
+  SimpleRmsNormGraph(bool make_valid)
+  {
+    rms_norm_node = g.nodes()->create<luci::CircleRmsNorm>();
+    input_1 = g.nodes()->create<luci::CircleInput>();
+    gamma = g.nodes()->create<luci::CircleConst>();
+
+    rms_norm_node->input(input_1);
+    rms_norm_node->gamma(gamma);
+
+    if (make_valid)
+    {
+      beta = g.nodes()->create<luci::CircleConst>();
+      rms_norm_node->beta(beta);
+    }
+    else
+    {
+      input_2 = g.nodes()->create<luci::CircleInput>();
+      rms_norm_node->beta(input_2);
+    }
+
+    output = g.nodes()->create<luci::CircleOutput>();
+
+    auto graph_output = g.outputs()->create();
+    output->index(graph_output->index());
+
+    output->from(rms_norm_node);
+  }
+
+public:
+  loco::Graph g;
+
+private:
+  luci::CircleRmsNorm *rms_norm_node = nullptr;
+  luci::CircleInput *input_1 = nullptr;
+  luci::CircleInput *input_2 = nullptr;
+  luci::CircleConst *gamma = nullptr;
+  luci::CircleConst *beta = nullptr;
+  luci::CircleOutput *output = nullptr;
+};
+
 class SimpleTransposeConvGraph
 {
 public:
@@ -363,6 +406,25 @@ TEST(QuantizePreCheckerPassTest, instance_norm_NEG)
   EXPECT_ANY_THROW(checker.run(&invalid_graph.g));
 }
 
+// Test RmsNorm
+TEST(QuantizePreCheckerPassTest, rms_norm)
+{
+  SimpleRmsNormGraph valid_graph(true);
+
+  luci::QuantizePreCheckerPass checker{};
+
+  EXPECT_NO_THROW(checker.run(&valid_graph.g));
+}
+
+TEST(QuantizePreCheckerPassTest, rms_norm_NEG)
+{
+  SimpleRmsNormGraph invalid_graph(false);
+
+  luci::QuantizePreCheckerPass checker{};
+
+  EXPECT_ANY_THROW(checker.run(&invalid_graph.g));
+}
+
 // Test TransposeConv
 TEST(QuantizePreCheckerPassTest, transpose_conv)
 {
diff --git a/compiler/luci/pass/src/QuantizeWeights.cpp b/compiler/luci/pass/src/QuantizeWeights.cpp
index 17a887cfa4f..5350e21a4ca 100644
--- a/compiler/luci/pass/src/QuantizeWeights.cpp
+++ b/compiler/luci/pass/src/QuantizeWeights.cpp
@@ -507,6 +507,36 @@ void QuantizeWeights::visit(luci::CircleInstanceNorm *node)
   }
 }
 
+void QuantizeWeights::visit(luci::CircleRmsNorm *node)
+{
+  LOGGER(l);
+  INFO(l) << "QuantizeWeights QuantizeWeights::visit node: " << node->name() << std::endl;
+
+  auto gamma = loco::must_cast<luci::CircleConst *>(node->gamma());
+  auto beta = loco::must_cast<luci::CircleConst *>(node->beta());
+
+  if (!is_quantized(gamma))
+  {
+    assert(gamma->dtype() == loco::DataType::FLOAT32);
+    auto new_gamma = luci::clone(gamma);
+    if (granularity == QuantizationGranularity::LayerWise)
+      quant_const(new_gamma, output_type);
+    else if (granularity == QuantizationGranularity::ChannelWise)
+      quant_const_per_channel(new_gamma, output_type);
+    node->gamma(new_gamma);
+  }
+  if (!is_quantized(beta))
+  {
+    assert(beta->dtype() == loco::DataType::FLOAT32);
+    auto new_beta = luci::clone(beta);
+    if (granularity == QuantizationGranularity::LayerWise)
+      quant_const(new_beta, output_type);
+    else if (granularity == QuantizationGranularity::ChannelWise)
+      quant_const_per_channel(new_beta, output_type);
+    node->beta(new_beta);
+  }
+}
+
 void QuantizeWeights::visit(luci::CirclePRelu *node)
 {
   LOGGER(l);
diff --git a/compiler/luci/pass/src/QuantizeWeights.h b/compiler/luci/pass/src/QuantizeWeights.h
index f62cd40f3cb..b3913f2e809 100644
--- a/compiler/luci/pass/src/QuantizeWeights.h
+++ b/compiler/luci/pass/src/QuantizeWeights.h
@@ -44,6 +44,7 @@ struct QuantizeWeights final : public luci::CircleNodeMutableVisitor<void>
   void visit(luci::CircleConv2D *node);
   void visit(luci::CircleDepthwiseConv2D *node);
   void visit(luci::CircleInstanceNorm *node);
+  void visit(luci::CircleRmsNorm *node);
   void visit(luci::CirclePRelu *node);
   void visit(luci::CircleTransposeConv *node);
   void visit(luci::CircleFullyConnected *node);
diff --git a/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp b/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp
index bdb50d67a87..695e8b1eeeb 100644
--- a/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp
+++ b/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp
@@ -301,6 +301,7 @@ struct InsertQuantizeOp final : public luci::CircleNodeMutableVisitor<void>
   INSERT_QUANTIZE_TO_UNARY_OP(luci::CircleResizeBilinear, input)
   INSERT_QUANTIZE_TO_UNARY_OP(luci::CircleResizeNearestNeighbor, input)
   INSERT_QUANTIZE_TO_UNARY_OP(luci::CircleReverseSequence, input)
+  INSERT_QUANTIZE_TO_UNARY_OP(luci::CircleRmsNorm, input)
   INSERT_QUANTIZE_TO_UNARY_OP(luci::CircleRsqrt, x)
   INSERT_QUANTIZE_TO_UNARY_OP(luci::CircleSlice, input)
   INSERT_QUANTIZE_TO_UNARY_OP(luci::CircleSoftmax, logits)
@@ -548,12 +549,14 @@ void QuantizeWithMinMaxPass::set_output_type(loco::Graph *g) const
 *
 * Why quantization sequence was determined as above?
 * - Activation and weights should be quantized before bias (1->2->3). Input/Output
-*   dtype can be updated at the end (4->5).
+*   dtype is updated after all the other nodes are quantized (4->5).
 * - During activation quantization,
 *   - Backward propagation is performed earlier than forward propagation. This allows
-*     backward-propagated qpram to be overwritten during forward propagation.
-*     We made this decision as Ops for forward propagation (reshape, transpose, ..)
-*     are more common than backward propagation. TODO Check this decision is safe.
+*     backward-propagated qparam to be overwritten during forward propagation.
+*     We made the decision because it's more common to propagate qparam forward (reshape,
+*     transpose) than backward (concat, pad_v2, ..).
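+*     For example, forward propagation copies a Reshape's input qparam to its output,
+*     while backward propagation copies a Concat's output qparam to its inputs.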
*   - QuantizeSpecialActivation is called before forward propagation to make sure that
*     the pre-defined qparam values are propagated.
*/
diff --git a/compiler/luci/pass/src/VerifyQuantizedNodeGranularity.h b/compiler/luci/pass/src/VerifyQuantizedNodeGranularity.h
index cc618bf0e2f..6fc6a26ba46 100644
--- a/compiler/luci/pass/src/VerifyQuantizedNodeGranularity.h
+++ b/compiler/luci/pass/src/VerifyQuantizedNodeGranularity.h
@@ -93,6 +93,8 @@ class VerifyQuantizedNodeGranularity : public luci::CircleNodeVisitor<bool>
 
   virtual bool visit(const luci::CircleInstanceNorm *node) = 0;
 
+  virtual bool visit(const luci::CircleRmsNorm *node) = 0;
+
   bool visit(const luci::CirclePack *node)
   {
     RETURN_FALSE_UNLESS(is_lwq(node))
@@ -511,6 +513,15 @@ class VerifyQuantizedNodeChannelWiseGranularity final : public VerifyQuantizedNo
     return true;
   }
 
+  bool visit(const luci::CircleRmsNorm *node)
+  {
+    RETURN_FALSE_UNLESS(is_lwq(node))
+    RETURN_FALSE_UNLESS(is_lwq(node->input()))
+    RETURN_FALSE_UNLESS(is_cwq_const(node->gamma(), rank(node->gamma()) - 1))
+    RETURN_FALSE_UNLESS(is_cwq_const(node->beta(), rank(node->beta()) - 1))
+    return true;
+  }
+
   bool visit(const luci::CirclePRelu *node)
   {
     RETURN_FALSE_UNLESS(is_lwq(node))
@@ -595,6 +606,15 @@ class VerifyQuantizedNodeLayerWiseGranularity final : public VerifyQuantizedNode
     return true;
   }
 
+  bool visit(const luci::CircleRmsNorm *node)
+  {
+    RETURN_FALSE_UNLESS(is_lwq(node))
+    RETURN_FALSE_UNLESS(is_lwq(node->input()))
+    RETURN_FALSE_UNLESS(is_lwq_const(node->gamma()))
+    RETURN_FALSE_UNLESS(is_lwq_const(node->beta()))
+    return true;
+  }
+
   bool visit(const luci::CirclePRelu *node)
   {
     RETURN_FALSE_UNLESS(is_lwq(node))
diff --git a/compiler/luci/pass/src/VerifyQuantizedNodeType.cpp b/compiler/luci/pass/src/VerifyQuantizedNodeType.cpp
index 4bad9522b85..1f0ff43b779 100644
--- a/compiler/luci/pass/src/VerifyQuantizedNodeType.cpp
+++ b/compiler/luci/pass/src/VerifyQuantizedNodeType.cpp
@@ -364,6 +364,12 @@ bool VerifyQuantizedNodeTypeBase<Qtype, Btype>::visit(const luci::CircleResizeNe
   return true;
 }
 
+template <loco::DataType Qtype, loco::DataType Btype>
+bool VerifyQuantizedNodeTypeBase<Qtype, Btype>::visit(const luci::CircleRmsNorm *node)
+{
+  return group_has_type(node, Qtype);
+}
+
 template <loco::DataType Qtype, loco::DataType Btype>
 bool VerifyQuantizedNodeTypeBase<Qtype, Btype>::visit(const luci::CircleRsqrt *node)
 {
diff --git a/compiler/luci/pass/src/VerifyQuantizedNodeType.h b/compiler/luci/pass/src/VerifyQuantizedNodeType.h
index 03f1e1d8640..15ec384413c 100644
--- a/compiler/luci/pass/src/VerifyQuantizedNodeType.h
+++ b/compiler/luci/pass/src/VerifyQuantizedNodeType.h
@@ -110,6 +110,7 @@ class VerifyQuantizedNodeTypeBase : public luci::CircleNodeVisitor<bool>,
   bool visit(const luci::CircleReshape *node);
   bool visit(const luci::CircleResizeBilinear *node);
   bool visit(const luci::CircleResizeNearestNeighbor *node);
+  bool visit(const luci::CircleRmsNorm *node);
   bool visit(const luci::CircleRsqrt *node);
   bool visit(const luci::CircleSlice *node);
   bool visit(const luci::CircleSpaceToBatchND *node);
diff --git a/compiler/luci/service/src/CircleCloneNode.h b/compiler/luci/service/src/CircleCloneNode.h
index e2f61e1eb0e..64c9e4f486f 100644
--- a/compiler/luci/service/src/CircleCloneNode.h
+++ b/compiler/luci/service/src/CircleCloneNode.h
@@ -259,6 +259,7 @@ class CloneNode final : public luci::CircleNodeVisitor<luci::CircleNode *>
   luci::CircleNode *visit(const luci::CircleBCQGather *) final;
   luci::CircleNode *visit(const luci::CircleInstanceNorm *) final;
   luci::CircleNode *visit(const luci::CircleGRU *) final;
+  luci::CircleNode *visit(const luci::CircleRmsNorm *) final;
 
   // NOTE CircleInput and CircleOutput are not handled here as these need
  // link with graph I/O
diff --git a/compiler/luci/service/src/CircleShapeInferenceHelper.cpp b/compiler/luci/service/src/CircleShapeInferenceHelper.cpp
index 3d78a31a12e..c10746b86cc 100644
--- a/compiler/luci/service/src/CircleShapeInferenceHelper.cpp
+++ b/compiler/luci/service/src/CircleShapeInferenceHelper.cpp
@@ -168,14 +168,17 @@ loco::TensorShape pad_shape(const loco::TensorShape &input_shape, const luci::Ci
   // TODO support other data type
   LUCI_ASSERT(paddings->dtype() == S32 || paddings->dtype() == S64, "Support int 32/64 for now");
-  LUCI_ASSERT(paddings->rank() == 2, "paddings should be rank 2");
+  if (paddings->rank() != 2)
+    INTERNAL_EXN("paddings should be rank 2");
 
   int32_t n = paddings->dim(0).value();
   int32_t v = paddings->dim(1).value();
 
-  LUCI_ASSERT(v == 2, "paddings should be [n, 2]");
-  LUCI_ASSERT(n == int32_t(input_shape.rank()),
-              "paddings [n, 2] should have same value of input rank");
+  if (v != 2)
+    INTERNAL_EXN("paddings should be [n, 2]");
+
+  if (n != int32_t(input_shape.rank()))
+    INTERNAL_EXN("paddings [n, 2] should have n equal to the input rank");
 
   loco::TensorShape output_shape;
 
diff --git a/compiler/luci/service/src/CircleShapeInferenceRule.cpp b/compiler/luci/service/src/CircleShapeInferenceRule.cpp
index 42c45353361..a094b681d0c 100644
--- a/compiler/luci/service/src/CircleShapeInferenceRule.cpp
+++ b/compiler/luci/service/src/CircleShapeInferenceRule.cpp
@@ -2198,6 +2198,13 @@ class ShapeInferenceAlgorithm final : public luci::CircleNodeVisitor<loco::NodeShape>
 
+  loco::NodeShape visit(const luci::CircleRmsNorm *node) final
+  {
+    const auto input_shape = luci::shape_get(node->input()).as<loco::TensorShape>();
+
+    return loco::NodeShape{input_shape};
+  }
+
   // Virtual
   loco::NodeShape visit(const luci::CircleInput *node) final { return infer_input(node); }
diff --git a/compiler/luci/service/src/CircleTypeInferenceRule.cpp b/compiler/luci/service/src/CircleTypeInferenceRule.cpp
index 78dde1004b5..6b656567071 100644
--- a/compiler/luci/service/src/CircleTypeInferenceRule.cpp
+++ b/compiler/luci/service/src/CircleTypeInferenceRule.cpp
@@ -579,6 +579,11 @@ struct TypeInferenceAlgorithm final : public luci::CircleNodeVisitor<loco::DataType>
     return luci::dtype_get(node->input());
   }
 
+  loco::DataType visit(const luci::CircleRmsNorm *node) final
+  {
+    return luci::dtype_get(node->input());
+  }
+
   // Virtual
   loco::DataType visit(const luci::CircleInput *node) final { return node->dtype(); }
diff --git a/compiler/luci/service/src/Nodes/CirclePad.test.cpp b/compiler/luci/service/src/Nodes/CirclePad.test.cpp
index 070b9b31075..5b221b55861 100644
--- a/compiler/luci/service/src/Nodes/CirclePad.test.cpp
+++ b/compiler/luci/service/src/Nodes/CirclePad.test.cpp
@@ -124,3 +124,87 @@ TEST(ShapeRuleTest, pad_non_const_paddings)
   ASSERT_EQ(0, shape.dim(2).value());
   ASSERT_EQ(0, shape.dim(3).value());
 }
+
+TEST(ShapeRuleTest, paddings_invalid_rank_NEG)
+{
+  auto g = loco::make_graph();
+  auto node_pad = g->nodes()->create<luci::CirclePad>();
+
+  auto node_paddings = g->nodes()->create<luci::CircleConst>();
+  auto node_input = g->nodes()->create<luci::CircleInput>();
+
+  loco::TensorShape shape;
+  luci::sinf::Rule shape_inf_rule;
+
+  node_input->shape({1, 2, 3, 4});
+  node_input->shape_status(luci::ShapeStatus::VALID);
+
+  node_paddings->dtype(loco::DataType::S64);
+  node_paddings->shape({4, 2, 3});
+  node_paddings->shape_status(luci::ShapeStatus::VALID);
+
+  const loco::DataType S64 = loco::DataType::S64;
+  uint32_t t = 64 * 8;
+  node_paddings->size<S64>(t);
+
+  node_pad->input(node_input);
+  node_pad->paddings(node_paddings);
+
+  ASSERT_ANY_THROW(shape_inf_rule.infer(node_pad, shape));
+}
+
+TEST(ShapeRuleTest, paddings_invalid_shape_1_NEG)
+{
+  auto g = loco::make_graph();
+  auto node_pad = g->nodes()->create<luci::CirclePad>();
+
+  auto node_paddings = g->nodes()->create<luci::CircleConst>();
+  auto node_input = g->nodes()->create<luci::CircleInput>();
+
+  loco::TensorShape shape;
+  luci::sinf::Rule shape_inf_rule;
+
+  node_input->shape({1, 2, 3, 4});
+  node_input->shape_status(luci::ShapeStatus::VALID);
+
+  node_paddings->dtype(loco::DataType::S64);
+  node_paddings->shape({4, 4});
+  node_paddings->shape_status(luci::ShapeStatus::VALID);
+
+  const loco::DataType S64 = loco::DataType::S64;
+  uint32_t t = 64 * 8;
+  node_paddings->size<S64>(t);
+
+  node_pad->input(node_input);
+  node_pad->paddings(node_paddings);
+
+  ASSERT_ANY_THROW(shape_inf_rule.infer(node_pad, shape));
+}
+
+TEST(ShapeRuleTest, paddings_invalid_shape_2_NEG)
+{
+  auto g = loco::make_graph();
+  auto node_pad = g->nodes()->create<luci::CirclePad>();
+
+  auto node_paddings = g->nodes()->create<luci::CircleConst>();
+  auto node_input = g->nodes()->create<luci::CircleInput>();
+
+  loco::TensorShape shape;
+  luci::sinf::Rule shape_inf_rule;
+
+  node_input->shape({1, 2, 3, 4});
+  node_input->shape_status(luci::ShapeStatus::VALID);
+
+  node_paddings->dtype(loco::DataType::S64);
+  node_paddings->shape({5, 2});
+  node_paddings->shape_status(luci::ShapeStatus::VALID);
+
+  const loco::DataType S64 = loco::DataType::S64;
+  uint32_t t = 64 * 8;
+  node_paddings->size<S64>(t);
+
+  node_pad->input(node_input);
+  node_pad->paddings(node_paddings);
+
+  ASSERT_ANY_THROW(shape_inf_rule.infer(node_pad, shape));
+}
diff --git a/compiler/luci/service/src/Nodes/CircleReshape.cpp b/compiler/luci/service/src/Nodes/CircleReshape.cpp
index 0de10960b51..778f8d45762 100644
--- a/compiler/luci/service/src/Nodes/CircleReshape.cpp
+++ b/compiler/luci/service/src/Nodes/CircleReshape.cpp
@@ -87,6 +87,10 @@ loco::TensorShape Algorithm::visit(const luci::CircleReshape *node)
     for (uint32_t axis = 0; axis < shape_by_input.rank(); ++axis)
     {
       shape_by_input.dim(axis) = const_shape_node->at<S32>(axis);
+      if (const_shape_node->at<S32>(axis) < 0)
+      {
+        shape_by_input.dim(axis).unset();
+      }
     }
   }
   else
@@ -139,7 +143,7 @@ loco::TensorShape Algorithm::visit(const luci::CircleReshape *node)
   for (uint32_t dim_index = 0; dim_index < output_shape.rank(); ++dim_index)
   {
     const uint32_t dim_value = output_shape.dim(dim_index).value();
-    if (static_cast<int>(dim_value) == -1)
+    if (not output_shape.dim(dim_index).known())
     {
       LUCI_ASSERT(unknown_dim_index == UINT32_MAX, "More than one unknown dimension");
       unknown_dim_index = dim_index;
diff --git a/compiler/luci/service/src/Nodes/CircleRmsNorm.cpp b/compiler/luci/service/src/Nodes/CircleRmsNorm.cpp
new file mode 100644
index 00000000000..0fdf2bdf3d8
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleRmsNorm.cpp
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "CircleCloneNode.h" + +namespace luci +{ + +luci::CircleNode *CloneNode::visit(const luci::CircleRmsNorm *node) +{ + auto *cloned = _graph->nodes()->create<luci::CircleRmsNorm>(); + if (cloned != nullptr) + { + cloned->epsilon(node->epsilon()); + } + return cloned; +} + +} // namespace luci diff --git a/compiler/luci/service/src/Nodes/CircleRmsNorm.test.cpp b/compiler/luci/service/src/Nodes/CircleRmsNorm.test.cpp new file mode 100644 index 00000000000..9bd0bc891da --- /dev/null +++ b/compiler/luci/service/src/Nodes/CircleRmsNorm.test.cpp @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "luci/Service/CircleNodeClone.h" + +#include <gtest/gtest.h> + +TEST(CloneNodeTest, clone_RmsNorm) +{ + auto g = loco::make_graph(); + auto node_fc = g->nodes()->create<luci::CircleRmsNorm>(); + node_fc->epsilon(3); + + auto gc = loco::make_graph(); + auto cloned = luci::clone_node(node_fc, gc.get()); + ASSERT_NE(nullptr, cloned); + ASSERT_EQ(gc.get(), cloned->graph()); + + auto cloned_fc = dynamic_cast<luci::CircleRmsNorm *>(cloned); + ASSERT_NE(nullptr, cloned_fc); + ASSERT_EQ(node_fc->epsilon(), cloned_fc->epsilon()); +} diff --git a/compiler/one-cmds/one-codegen b/compiler/one-cmds/one-codegen index 8551056f6a8..84e7b31d906 100644 --- a/compiler/one-cmds/one-codegen +++ b/compiler/one-cmds/one-codegen @@ -35,6 +35,9 @@ import onelib.utils as oneutils # TODO Find better way to suppress trackback on error sys.tracebacklimit = 0 +COMMAND_KEYS = ['__command', 'command'] +BACKEND_KEY = 'BACKEND' + def _get_parser(backends_list): codegen_usage = 'one-codegen [-h] [-v] [-C CONFIG] [-b BACKEND | -T TARGET] [--] [COMMANDS FOR BACKEND]' @@ -81,7 +84,8 @@ def _verify_arg(parser, args, cfg_args, cfg_target_args, backend_args, unknown_a # overwrite the value if it exists as command line option has higher priority. if oneutils.is_valid_attr(args, 'target'): target_to_run = args.target - given_backend = backends.get_backend_from_target_conf(target_to_run) + given_backend = backends.get_value_from_target_conf( + target_to_run, BACKEND_KEY) if not given_backend: parser.error(f'Not found {target_to_run} target.') else: @@ -213,8 +217,11 @@ def main(): assert (oneutils.is_valid_attr(cfg_args, 'command')) setattr(cfg_args, args.backend, cfg_args.command) else: + given_backend = None # get backend information - given_backend = backends.get_backend_from_target_conf(target_to_run) + if target_to_run: + given_backend = backends.get_value_from_target_conf( + target_to_run, BACKEND_KEY) # check if command schema for the backend exists # 1. if it exists, run the command according to the schema. # 2. if it doesn't exist, insert "--target ${TARGET}" at the beginning of the given command. @@ -251,7 +258,9 @@ def main(): # [15], [16] else: assert oneutils.is_valid_attr(args, 'target') - given_backends = [backends.get_backend_from_target_conf(target_to_run)] + given_backends = [ + backends.get_value_from_target_conf(target_to_run, BACKEND_KEY) + ] # make commands # 1.
if command schema exists diff --git a/compiler/one-cmds/one-profile b/compiler/one-cmds/one-profile index 2477a350bf2..585517f7c4c 100644 --- a/compiler/one-cmds/one-profile +++ b/compiler/one-cmds/one-profile @@ -35,6 +35,9 @@ import onelib.utils as oneutils # TODO Find better way to suppress trackback on error sys.tracebacklimit = 0 +COMMAND_KEYS = ['__command', 'command'] +BACKEND_KEY = 'BACKEND' + def _get_backends_list(): """ @@ -120,7 +123,8 @@ def _verify_arg(parser, args, cfg_args, cfg_target_args, backend_args, unknown_a # overwrite the value if it exists as command line option has higher priority. if oneutils.is_valid_attr(args, 'target'): target_to_run = args.target - given_backend = backends.get_backend_from_target_conf(target_to_run) + given_backend = backends.get_value_from_target_conf( + target_to_run, BACKEND_KEY) if not given_backend: parser.error(f'Not found {target_to_run} target.') else: @@ -248,8 +252,11 @@ def main(): assert (oneutils.is_valid_attr(cfg_args, 'command')) setattr(cfg_args, args.backend, cfg_args.command) else: + given_backend = None # get backend information - given_backend = backends.get_backend_from_target_conf(target_to_run) + if target_to_run: + given_backend = backends.get_value_from_target_conf( + target_to_run, BACKEND_KEY) # check if command schema exists # 1. if it exists, run the command according to the schema. # 2. if it doesn't exist, insert "--target ${TARGET}" at the beginning of the given command. @@ -286,7 +293,9 @@ def main(): # [15], [16] else: assert oneutils.is_valid_attr(args, 'target') - given_backends = [backends.get_backend_from_target_conf(target_to_run)] + given_backends = [ + backends.get_value_from_target_conf(target_to_run, BACKEND_KEY) + ] # make commands # 1. if command schema exists diff --git a/compiler/one-cmds/onelib/argumentparse.py b/compiler/one-cmds/onelib/argumentparse.py index 7b266cf5955..bc2d626a996 100644 --- a/compiler/one-cmds/onelib/argumentparse.py +++ b/compiler/one-cmds/onelib/argumentparse.py @@ -139,7 +139,46 @@ def print_help(self): oneutils.run([driver_path, '-h'], err_prefix=self.driver) + def get_option_names(self, *, flatten=False, without_dash=False): + """ + Get registered option names. + + :param flatten: a single option can have multiple names. + If it is True, such options are returned flattened. + :param without_dash: optional arguments have a leading dash on their names. + If it is True, option names are returned without such dashes. + + For example, say there are options like these.
+ + parser.add_argument("--verbose", action=NormalOption, dtype=bool) + parser.add_argument("--output", "--output_path", action=NormalOption) + + [EXAMPLES] + get_option_names() + [[--verbose], [--output, --output_path]] + get_option_names(without_dash=True) + [[verbose], [output, output_path]] + get_option_names(flatten=True) + [--verbose, --output, --output_path] + get_option_names(flatten=True, without_dash=True) + [verbose, output, output_path] + """ + names = [] + for action in self._actions: + names.append(action[0]) + + if flatten: + names = [name for name_l in names for name in name_l] + if without_dash: + names = [name.lstrip('-') for name in names] + + return names + def check_if_valid_option_name(self, *args, **kwargs): + existing_options = self.get_option_names(flatten=True, without_dash=True) + args_without_dash = [arg.lstrip('-') for arg in args] + if any(arg in existing_options for arg in args_without_dash): + raise RuntimeError('Duplicate option names') if not 'action' in kwargs: raise RuntimeError('"action" keyword argument is required') diff --git a/compiler/one-cmds/onelib/backends.py b/compiler/one-cmds/onelib/backends.py index 4403a07cbb6..f7336bde233 100644 --- a/compiler/one-cmds/onelib/backends.py +++ b/compiler/one-cmds/onelib/backends.py @@ -28,6 +28,7 @@ ├── include ├── lib ├── optimization +├── target └── test The list where `one-XXXX` finds its backends @@ -36,6 +37,21 @@ NOTE If there are backends of the same name in different places, the closer to the top in the list, the higher the priority. + +[About TARGET and BACKEND] + "Target" refers to a concrete instance of the system and + "Backend" refers to an architecture. Say there is an NPU that has + multiple cores. Its cores may have different global buffer + sizes, DSPM sizes, clock rates, etc., which are described in + each "Target" configuration file. Even though they + are different targets, they may follow the same architecture, which means + they have the same "Backend". + +[Path for TARGET configuration] + - /usr/share/one/target/${TARGET}.ini + +[Path for BACKEND tools] + - /usr/share/one/backends/${BACKEND} """ @@ -62,11 +78,11 @@ def get_list(cmdname): return backends_list -def get_backend_from_target_conf(target: str): +def get_value_from_target_conf(target: str, key: str): dir_path = os.path.dirname(os.path.realpath(__file__)) target_conf_path = dir_path + f'/../../target/{target}.ini' if not os.path.isfile(target_conf_path): - return None + raise FileNotFoundError(f"Not found given target configuration: {target}") # target config doesn't have section. # but, configparser needs configs to have one or more sections.
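A minimal standalone sketch of the section-less INI trick used here (the helper name read_target_conf is hypothetical, and overriding optionxform is an assumption made so the upper-case TARGET/BACKEND keys are not lowercased by configparser's defaults):

import configparser

DUMMY_SECTION = 'dummy_section'

def read_target_conf(target_conf_path: str) -> dict:
    # ${TARGET}.ini files carry no section header, but configparser requires
    # at least one section, so prepend a dummy one before parsing.
    with open(target_conf_path) as f:
        config_str = f'[{DUMMY_SECTION}]\n' + f.read()
    parser = configparser.ConfigParser()
    parser.optionxform = str  # keep TARGET/BACKEND keys as written
    parser.read_string(config_str)
    return dict(parser[DUMMY_SECTION])

# e.g. read_target_conf('/usr/share/one/target/sample.ini') could yield
# {'TARGET': 'sample', 'BACKEND': 'dummy'} for a hypothetical sample.ini.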
@@ -77,11 +93,16 @@ def get_backend_from_target_conf(target: str): parser.read_string(config_str) assert parser.has_section(DUMMY_SECTION) - BACKEND_KEY = 'BACKEND' - if BACKEND_KEY in parser[DUMMY_SECTION]: - return parser[DUMMY_SECTION][BACKEND_KEY] + # Check if target file is valid + TARGET_KEY = 'TARGET' + assert TARGET_KEY in parser[DUMMY_SECTION] + if target != parser[DUMMY_SECTION][TARGET_KEY]: + raise RuntimeError("Invalid target file.") - return None + if key in parser[DUMMY_SECTION]: + return parser[DUMMY_SECTION][key] + + raise RuntimeError(f"Not found '{key}' key in target configuration.") def search_driver(driver): diff --git a/compiler/one-cmds/tests/one-codegen_006.test b/compiler/one-cmds/tests/one-codegen_006.test index d5e3dc86843..6797a3a6a4d 100644 --- a/compiler/one-cmds/tests/one-codegen_006.test +++ b/compiler/one-cmds/tests/one-codegen_006.test @@ -52,7 +52,7 @@ clean_envir() trap_err_onexit() { echo "${filename_ext} FAILED" - rm -rf ../bin/dummy-compile + clean_envir exit 255 } diff --git a/compiler/one-cmds/tests/one-codegen_010.test b/compiler/one-cmds/tests/one-codegen_010.test index a81310e2899..2f942eb4f8b 100644 --- a/compiler/one-cmds/tests/one-codegen_010.test +++ b/compiler/one-cmds/tests/one-codegen_010.test @@ -52,7 +52,7 @@ clean_envir() trap_err_onexit() { echo "${filename_ext} FAILED" - rm -rf ../bin/dummy-compile + clean_envir exit 255 } diff --git a/compiler/one-cmds/tests/one-codegen_011.test b/compiler/one-cmds/tests/one-codegen_011.test index 0b7f9174fdb..5d78d5bc562 100644 --- a/compiler/one-cmds/tests/one-codegen_011.test +++ b/compiler/one-cmds/tests/one-codegen_011.test @@ -53,7 +53,7 @@ clean_envir() trap_err_onexit() { echo "${filename_ext} FAILED" - rm -rf ../bin/dummy-compile + clean_envir exit 255 } diff --git a/compiler/one-cmds/tests/one-codegen_neg_006.cfg b/compiler/one-cmds/tests/one-codegen_neg_006.cfg new file mode 100644 index 00000000000..afa3051d34f --- /dev/null +++ b/compiler/one-cmds/tests/one-codegen_neg_006.cfg @@ -0,0 +1,9 @@ +[onecc] +one-codegen=True + +[backend] +target=one-codegen_neg_006 + +[one-codegen] +o=one-codegen_neg_006.tvn +input=one-codegen_neg_006.circle diff --git a/compiler/one-cmds/tests/one-codegen_neg_006.ini b/compiler/one-cmds/tests/one-codegen_neg_006.ini new file mode 100644 index 00000000000..c128e39f277 --- /dev/null +++ b/compiler/one-cmds/tests/one-codegen_neg_006.ini @@ -0,0 +1,2 @@ +TARGET=one-codegen_neg_006 +BACKEND=dummy diff --git a/compiler/one-cmds/tests/one-codegen_neg_006.py b/compiler/one-cmds/tests/one-codegen_neg_006.py new file mode 100644 index 00000000000..71aba159b01 --- /dev/null +++ b/compiler/one-cmds/tests/one-codegen_neg_006.py @@ -0,0 +1,14 @@ +from onelib import argumentparse +from onelib.argumentparse import DriverName, NormalOption, TargetOption + + +def command_schema(): + parser = argumentparse.ArgumentParser() + parser.add_argument("dummy-compile", action=DriverName) + parser.add_argument("--target", action=TargetOption) + parser.add_argument("--DSP-quota", action=NormalOption) + parser.add_argument("-o", action=NormalOption) + parser.add_argument("--op", "-o", action=NormalOption) # duplicate names + parser.add_argument("input", action=NormalOption) + + return parser diff --git a/compiler/one-cmds/tests/one-codegen_neg_006.test b/compiler/one-cmds/tests/one-codegen_neg_006.test new file mode 100644 index 00000000000..526f9799465 --- /dev/null +++ b/compiler/one-cmds/tests/one-codegen_neg_006.test @@ -0,0 +1,116 @@ +#!/bin/bash + +# Copyright (c) 2024 Samsung 
Electronics Co., Ltd. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# command schema has duplicate names. + +: ' +This test assumes below directories. + +[one hierarchy] + one + ├── backends + │   └── command + │   └── dummy + │   └── codegen.py + ├── bin + ├── doc + ├── include + ├── lib + ├── optimization + ├── target + └── test # pwd +' + +BACKENDS_ALREADY_EXIST=true +CMD_ALREADY_EXIST=true +DUMMY_ALREADY_EXIST=true +TARGET_ALREADY_EXIST=true + +BACKEND_NAME="dummy" + +filename_ext="$(basename -- $0)" +filename="${filename_ext%.*}" + +configfile="one-codegen_neg_006.cfg" +outputfile="one-codegen_neg_006.tvn" +targetfile="one-codegen_neg_006.ini" +commandschema="one-codegen_neg_006.py" + +clean_envir() +{ + rm -rf ../bin/dummy-compile + rm -rf ../target/${targetfile} + rm -rf "../backends/command/${BACKEND_NAME}/codegen.py" + if [ "$TARGET_ALREADY_EXIST" = false ]; then + rm -rf ../target/ + fi + if [ "$DUMMY_ALREADY_EXIST" = false ]; then + rm -rf "../backends/command/${BACKEND_NAME}/" + fi + if [ "$CMD_ALREADY_EXIST" = false ]; then + rm -rf ../backends/command/ + fi + if [ "$BACKENDS_ALREADY_EXIST" = false ]; then + rm -rf ../backends/ + fi +} + +trap_err_onexit() +{ + if grep -q "Duplicate option names" "${filename}.log"; then + echo "${filename_ext} SUCCESS" + clean_envir + exit 0 + fi + + echo "${filename_ext} FAILED" + clean_envir + exit 255 +} + +trap trap_err_onexit ERR + +rm -f ${filename}.log +rm -rf ${outputfile} + +if [ ! -d "../target/" ]; then + mkdir -p ../target/ + TARGET_ALREADY_EXIST=false +fi +if [ ! -d "../backends/" ]; then + mkdir -p ../backends/ + BACKENDS_ALREADY_EXIST=false +fi +if [ ! -d "../backends/command/" ]; then + mkdir -p ../backends/command/ + CMD_ALREADY_EXIST=false +fi +if [ ! 
-d "../backends/command/${BACKEND_NAME}/" ]; then + mkdir -p ../backends/command/${BACKEND_NAME}/ + DUMMY_ALREADY_EXIST=false +fi + +# copy dummy tools to bin folder +cp dummy-compile ../bin/dummy-compile +cp ${targetfile} ../target/ +cp ${commandschema} "../backends/command/${BACKEND_NAME}/codegen.py" + +# run test +onecc -C ${configfile} > ${filename}.log 2>&1 + +clean_envir +echo "${filename_ext} FAILED" +exit 255 diff --git a/compiler/one-cmds/tests/one-profile_006.test b/compiler/one-cmds/tests/one-profile_006.test index 3afb575a238..03a54d52e0e 100644 --- a/compiler/one-cmds/tests/one-profile_006.test +++ b/compiler/one-cmds/tests/one-profile_006.test @@ -52,7 +52,7 @@ clean_envir() trap_err_onexit() { echo "${filename_ext} FAILED" - rm -rf ../bin/dummy-profile + clean_envir exit 255 } diff --git a/compiler/one-cmds/tests/one-profile_010.test b/compiler/one-cmds/tests/one-profile_010.test index 681b9d6dc75..b0c4953bfd2 100644 --- a/compiler/one-cmds/tests/one-profile_010.test +++ b/compiler/one-cmds/tests/one-profile_010.test @@ -52,7 +52,7 @@ clean_envir() trap_err_onexit() { echo "${filename_ext} FAILED" - rm -rf ../bin/dummy-profile + clean_envir exit 255 } diff --git a/compiler/one-cmds/tests/one-profile_011.test b/compiler/one-cmds/tests/one-profile_011.test index db2e1c81196..4c156d76c2a 100644 --- a/compiler/one-cmds/tests/one-profile_011.test +++ b/compiler/one-cmds/tests/one-profile_011.test @@ -52,7 +52,7 @@ clean_envir() trap_err_onexit() { echo "${filename_ext} FAILED" - rm -rf ../bin/dummy-profile + clean_envir exit 255 } diff --git a/compiler/one-cmds/tests/onecc_057.test b/compiler/one-cmds/tests/onecc_057.test index f0076093b2e..83eea6fbf9a 100644 --- a/compiler/one-cmds/tests/onecc_057.test +++ b/compiler/one-cmds/tests/onecc_057.test @@ -19,11 +19,16 @@ filename_ext="$(basename -- $0)" filename="${filename_ext%.*}" -trap_err_onexit() +clean_envir() { - echo "${filename_ext} FAILED" rm -rf ../bin/dummyV2-profile rm -rf ../bin/dummyV3-profile +} + +trap_err_onexit() +{ + echo "${filename_ext} FAILED" + clean_envir exit 255 } @@ -45,7 +50,6 @@ if ! 
grep -q "dummyV3-profile with onecc_057_overwrite" "${filename}.log"; then trap_err_onexit fi -rm -rf ../bin/dummyV2-profile -rm -rf ../bin/dummyV3-profile +clean_envir echo "${filename_ext} SUCCESS" diff --git a/compiler/one-cmds/tests/onecc_060.ini b/compiler/one-cmds/tests/onecc_060.ini index 6d3a9ac3849..23f2a32b75a 100644 --- a/compiler/one-cmds/tests/onecc_060.ini +++ b/compiler/one-cmds/tests/onecc_060.ini @@ -1,2 +1,2 @@ -TARGET=rose +TARGET=onecc_060 BACKEND=dummy diff --git a/compiler/one-cmds/tests/onecc_neg_038.test b/compiler/one-cmds/tests/onecc_neg_038.test index 13c7d75f406..f629fe0c82d 100644 --- a/compiler/one-cmds/tests/onecc_neg_038.test +++ b/compiler/one-cmds/tests/onecc_neg_038.test @@ -69,6 +69,8 @@ clean_envir() trap_err_onexit() { + clean_envir + if grep -q "Only either of option type is allowed: positional or optional" "${filename}.log"; then echo "${filename_ext} SUCCESS" exit 0 @@ -108,5 +110,7 @@ cp onecc_neg_038.py "../backends/command/${BACKEND_NAME}/codegen.py" # run test onecc -C ${configfile} > ${filename}.log 2>&1 +clean_envir + echo "${filename_ext} FAILED" exit 255 diff --git a/compiler/one-cmds/tests/onecc_neg_039.cfg b/compiler/one-cmds/tests/onecc_neg_039.cfg new file mode 100644 index 00000000000..18275d484f8 --- /dev/null +++ b/compiler/one-cmds/tests/onecc_neg_039.cfg @@ -0,0 +1,9 @@ +[onecc] +one-codegen=True + +[backend] +target=onecc_neg_039 + +[one-codegen] +backend=dummy +command=-o onecc_neg_039.tvn onecc_neg_039.circle diff --git a/compiler/one-cmds/tests/onecc_neg_039.ini b/compiler/one-cmds/tests/onecc_neg_039.ini new file mode 100644 index 00000000000..e2bc54b04e1 --- /dev/null +++ b/compiler/one-cmds/tests/onecc_neg_039.ini @@ -0,0 +1,2 @@ +TARGET=rose # invalid name +BACKEND=dummy diff --git a/compiler/one-cmds/tests/onecc_neg_039.test b/compiler/one-cmds/tests/onecc_neg_039.test new file mode 100644 index 00000000000..75a365dea99 --- /dev/null +++ b/compiler/one-cmds/tests/onecc_neg_039.test @@ -0,0 +1,84 @@ +#!/bin/bash + +# Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Invalid target file + +: ' +This test assumes below directories. + +[one hierarchy] + one + ├── backends + ├── bin + ├── doc + ├── include + ├── lib + ├── optimization + ├── target + └── test # pwd +' + +TARGET_ALREADY_EXIST=true + +filename_ext="$(basename -- $0)" +filename="${filename_ext%.*}" + +configfile="onecc_neg_039.cfg" +outputfile="onecc_neg_039.tvn" +targetfile="onecc_neg_039.ini" + +clean_envir() +{ + rm -rf ../bin/dummy-compile + rm -rf ../target/${targetfile} + if [ "$TARGET_ALREADY_EXIST" = false ]; then + rm -rf ../target/ + fi +} + +trap_err_onexit() +{ + if grep -q "Invalid target file" "${filename}.log"; then + echo "${filename_ext} SUCCESS" + clean_envir + exit 0 + fi + + echo "${filename_ext} FAILED" + clean_envir + exit 255 +} + +trap trap_err_onexit ERR + +rm -f ${filename}.log +rm -rf ${outputfile} + +if [ ! 
-d "../target/" ]; then + mkdir -p ../target/ + TARGET_ALREADY_EXIST=false +fi + +# copy dummy tools to bin folder +cp dummy-compile ../bin/dummy-compile +cp ${targetfile} ../target/ + +# run test +onecc -C ${configfile} > ${filename}.log 2>&1 + +echo "${filename_ext} FAILED" +clean_envir +exit 255 diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h index c75ae9a5086..119a5e5d1b8 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h @@ -50,46 +50,16 @@ #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" #include "arm_compute/runtime/MemoryGroup.h" -#include "src/core/CL/kernels/CLTransposeKernel.h" +#include "arm_compute/runtime/CL/functions/CLTranspose.h" namespace arm_compute { -/** Basic function to reshape the weights of Fully Connected layer with OpenCL. This function calls - * the following kernels: - * - * -# @ref CLTransposeKernel - * - * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. - */ -class CLFullyConnectedHybridLayerReshapeWeights : public ICLSimpleFunction -{ -public: - /** Set the input and output tensors. - * - * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported: - * S8. - * @param[out] output Destination tensor which stores the transposed input tensor. Data type - * supported: Same as @p input. - */ - void configure(const ICLTensor *input, ICLTensor *output); - /** Static function to check if given info will lead to a valid configuration of @ref - * CLFullyConnectedHybridLayerReshapeWeights - * - * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported: - * S8. - * @param[in] output Destination tensor which stores the transposed input tensor. Data type - * supported: Same as @p input. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output); -}; /** Basic function to compute a Fully Connected layer on OpenCL. 
This function calls the following * OpenCL kernels: * * -# @ref CLIm2ColKernel (called when the input comes from a convolutional layer) - * -# @ref CLFullyConnectedHybridLayerReshapeWeights (if @p are_weights_reshaped is set to false + * -# @ref CLTranspose (if @p are_weights_reshaped is set to false * and transpose_weights is set to true ) (called once) * -# @ref CLGEMMLowpMatrixMultiplyCore (if quantized symmetric) * -# @ref CLGEMMMatrixAccumulateBiasesKernel (if @p biases is not equal to nullptr) @@ -165,7 +135,7 @@ class CLFullyConnectedHybridLayer : public IFunction bool retain_internal_weights); MemoryGroup _memory_group; - CLFullyConnectedHybridLayerReshapeWeights _reshape_weights_kernel; + CLTranspose _reshape_weights_kernel; CLScaleFactorSymm8Kernel _scale_factor_kernel; CLQuantizationSymmetricKernel _quant_input_kernel; CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp; diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h index c08da526aab..919f019aceb 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h @@ -50,45 +50,15 @@ #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" #include "arm_compute/runtime/IWeightsManager.h" #include "arm_compute/runtime/MemoryGroup.h" -#include "src/core/CL/kernels/CLTransposeKernel.h" +#include "arm_compute/runtime/CL/functions/CLTranspose.h" namespace arm_compute { -/** Basic function to reshape the weights of Fully Connected layer with OpenCL. This function calls - * the following kernels: - * - * -# @ref CLTransposeKernel - * - * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. - */ -class CLFullyConnectedLayerReshapeWeightsEx : public ICLSimpleFunction -{ -public: - /** Set the input and output tensors. - * - * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported: - * QASYMM8/F16/F32. - * @param[out] output Destination tensor which stores the transposed input tensor. Data type - * supported: Same as @p input. - */ - void configure(const ICLTensor *input, ICLTensor *output); - /** Static function to check if given info will lead to a valid configuration of @ref - * CLFullyConnectedLayerReshapeWeightsEx - * - * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported: - * QASYMM8/F16/F32. - * @param[in] output Destination tensor which stores the transposed input tensor. Data type - * supported: Same as @p input. 
- * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output); -}; namespace weights_transformations { /** Basic function to manage the reshape weights generated from @ref - * CLFullyConnectedLayerReshapeWeightsEx */ + * CLTranspose */ class CLFullyConnectedLayerReshapeWeightsExManaged : public ITransformWeights { public: @@ -118,7 +88,7 @@ class CLFullyConnectedLayerReshapeWeightsExManaged : public ITransformWeights private: static constexpr uint32_t _uid = 0x0; CLTensor _output{}; - CLFullyConnectedLayerReshapeWeightsEx _func{}; + CLTranspose _func{}; }; } // namespace weights_transformations @@ -209,7 +179,7 @@ class CLFullyConnectedLayerEx : public IFunction weights_transformations::CLFullyConnectedLayerReshapeWeightsExManaged _reshape_weights_managed_function; CLFlattenLayer _flatten_layer; - CLFullyConnectedLayerReshapeWeightsEx _reshape_weights_function; + CLTranspose _reshape_weights_function; CLGEMM _mm_gemm; CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp; CLTensor _flatten_output; diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h index ee1879aaa1c..f60565da041 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h @@ -43,8 +43,7 @@ #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/core/CL/kernels/CLPadLayerKernelEx.h" -#include "src/core/gpu/cl/kernels/ClCopyKernel.h" -// #include "arm_compute/runtime/CL/functions/CLCopy.h" +#include "arm_compute/runtime/CL/functions/CLCopy.h" #include <memory> namespace arm_compute @@ -123,7 +122,7 @@ class CLPadLayerEx : public IFunction void configure_reflect_mode(ICLTensor *input, ICLTensor *output); std::unique_ptr<CLPadLayerKernelEx> _pad_kernel; - std::unique_ptr<opencl::kernels::ClCopyKernel> _copy_kernel; + std::unique_ptr<CLCopy> _copy_kernel; bool _perform_pad; }; } // namespace arm_compute diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h index 21459271020..13b224167fa 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h @@ -48,43 +48,15 @@ #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" #include "arm_compute/runtime/Tensor.h" -#include "src/core/NEON/kernels/NETransposeKernel.h" +#include "arm_compute/runtime/NEON/functions/NETranspose.h" namespace arm_compute { -/** Basic function to reshape the weights of Fully Connected layer with NEON. This function calls - * the following kernels: - * - * -# @ref NETransposeKernel - * - * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. - */ -class NEFullyConnectedHybridLayerReshapeWeights : public INESimpleFunctionNoBorder -{ -public: - /** Set the input and output tensors. - * - * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported: - * QASYMM8/F16/F32. - * @param[out] output Destination tensor. Data type supported: Same as @p input.
- */ - void configure(const ITensor *input, ITensor *output); - /** Static function to check if given info will lead to a valid configuration of @ref - * NEFullyConnectedHybridLayerReshapeWeights - * - * @param[in] input Weights tensor info. The weights must be 2 dimensional. Data types supported: - * QASYMM8/F16/F32. - * @param[in] output Destination tensor info. Data type supported: Same as @p input. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output); -}; /** Basic function to compute a Fully Connected layer on NEON. This function calls the following * NEON kernels: * -# @ref NEIm2ColKernel (called when the input comes from a convolutional layer) - * -# @ref NEFullyConnectedHybridLayerReshapeWeights (if @p are_weights_reshaped is set to false + * -# @ref NETranspose (if @p are_weights_reshaped is set to false * and transpose_weights is set to true ) (called once) * -# @ref NEGEMMMatrixMultiplyKernel or @ref NEGEMMLowpMatrixMultiplyCore (if quantized * asymmetric) @@ -162,7 +134,7 @@ class NEFullyConnectedHybridLayer : public IFunction void configure_mm(const ITensor *input, const ITensor *weights, ITensor *output); MemoryGroup _memory_group; - NEFullyConnectedHybridLayerReshapeWeights _reshape_weights_function; + NETranspose _reshape_weights_function; NEQuantizationSymmetricKernel _quant_input_kernel; NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp; NEMultiplyScaleFactorKernel _multiply_scale_kernel; diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h index 2bbb1fea126..aaceeaa99d1 100644 --- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h +++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h @@ -51,21 +51,17 @@ #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/Tensor.h" -#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h" -#include "src/core/NEON/kernels/NETransposeKernel.h" +#include "arm_compute/runtime/NEON/functions/NETranspose.h" namespace arm_compute { /** Basic function to compute a Fully Connected layer on NEON. This function calls the following * NEON kernels: * -# @ref NEIm2ColKernel (called when the input comes from a convolutional layer) - * -# @ref NEFullyConnectedLayerReshapeWeights (if @p are_weights_reshaped is set to false and + * -# @ref NETranspose (if @p are_weights_reshaped is set to false and * transpose_weights is set to true ) (called once) * -# @ref NEGEMMMatrixMultiplyKernel or @ref NEGEMMLowpMatrixMultiplyCore (if quantized * asymmetric) - * -# @ref NEGEMMMatrixAccumulateBiasesKernel or @ref - * NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if quantized asymmetric) (if @p biases is - * not equal to nullptr) * * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. 
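 * @note A typical call sequence, sketched under the usual IFunction workflow (the tensor
 *       and fc_info names below are illustrative, not part of this header):
 * @code
 * NEFullyConnectedLayerEx fc;
 * fc.configure(&input, &weights, &biases, &output, fc_info); // configure once
 * fc.run();                                                  // then run per inference
 * @endcode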
* @note The difference from NEFullyConnectedLayer is that this class supports weights as input @@ -136,29 +132,28 @@ class NEFullyConnectedLayerEx : public IFunction void prepare() override; private: - void configure_fc_fc(const ITensor *input, const ITensor *weights, ITensor *output); - void configure_conv_fc(const ITensor *input, const ITensor *weights, ITensor *output); - void configure_mm(const ITensor *input, const ITensor *weights, ITensor *output); + void configure_fc_fc(const ITensor *input, const ITensor *weights, const ITensor *bias, + ITensor *output, const FullyConnectedLayerInfo &fc_info); + void configure_conv_fc(const ITensor *input, const ITensor *weights, const ITensor *bias, + ITensor *output, const FullyConnectedLayerInfo &fc_info); + void configure_mm(const ITensor *input, const ITensor *weights, const ITensor *bias, + ITensor *output, const FullyConnectedLayerInfo &fc_info); MemoryGroup _memory_group; - NEFlattenLayer _flatten_kernel; NEConvertFullyConnectedWeights _convert_weights; - NEFullyConnectedLayerReshapeWeights _reshape_weights_function; + NEFlattenLayer _flatten_kernel; + NETranspose _reshape_weights_function; NEGEMM _mm_gemm; NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp; - NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint _gemmlowp_output_stage; - NEGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel; Tensor _flatten_output; - Tensor _gemmlowp_output; Tensor _converted_weights_output; Tensor _reshape_weights_output; - const ITensor *_original_weights; bool _are_weights_converted; bool _are_weights_reshaped; bool _is_fc_after_conv; - bool _accumulate_biases; bool _is_quantized; bool _is_prepared; + const ITensor *_original_weights; }; } // namespace arm_compute #endif /* __ARM_COMPUTE_NEFULLYCONNECTEDLAYEREX_H__ */ diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp index 464f60deec8..290343ae446 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp @@ -44,6 +44,7 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculatorEx.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include "src/core/helpers/AutoConfiguration.h" @@ -164,7 +165,7 @@ void CLDirectTransposeConvLayer::configure(const CLCompileContext &compile_conte _original_weights = weights; _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32)); _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout)); - _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis); + _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis, false); auto out_dims = transposeconv_output_dimensions( input->info()->dimension(idx_w), input->info()->dimension(idx_h), diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp index af936e873e4..b07555ee7d7 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp @@ -65,19 +65,6 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I } } // namespace -void 
CLFullyConnectedHybridLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output) -{ - auto k = std::make_unique<CLTransposeKernel>(); - k->configure(input, output); - _kernel = std::move(k); -} - -Status CLFullyConnectedHybridLayerReshapeWeights::validate(const ITensorInfo *input, - const ITensorInfo *output) -{ - return CLTransposeKernel::validate(input, output); -} - CLFullyConnectedHybridLayer::CLFullyConnectedHybridLayer( std::shared_ptr<IMemoryManager> memory_manager) : _memory_group(memory_manager), _reshape_weights_kernel(), _quant_input_kernel(), @@ -245,8 +232,7 @@ Status CLFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe if (!weights_reshaped) { // Validate reshape weights kernel - ARM_COMPUTE_RETURN_ON_ERROR( - CLFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights)); + ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(weights_to_use, &reshaped_weights)); weights_to_use = &reshaped_weights; } diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp index c6a88d3409f..9f8c3390041 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp @@ -45,6 +45,7 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/function_info/GEMMInfo.h" #include "support/Cast.h" @@ -109,8 +110,13 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I fc_info.retain_internal_weights, // retain_internal_weights gemmlowp_output_stage, // gemmlowp_output_stage fc_info.fp_mixed_precision, // fp_mixed_precision + false, // fast_math true, // broadcast_bias - ActivationLayerInfo()); // activation_info + ActivationLayerInfo(), // activation_info + false, // fixed_format + WeightFormat::OHWI, // weight_format + false, // pretranspose_B + bias != nullptr); // accumulate if (is_data_type_quantized_asymmetric(input.data_type())) { @@ -139,19 +145,6 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I } } // namespace -void CLFullyConnectedLayerReshapeWeightsEx::configure(const ICLTensor *input, ICLTensor *output) -{ - auto k = std::make_unique<CLTransposeKernel>(); - k->configure(input, output); - _kernel = std::move(k); -} - -Status CLFullyConnectedLayerReshapeWeightsEx::validate(const ITensorInfo *input, - const ITensorInfo *output) -{ - return CLTransposeKernel::validate(input, output); -} - CLFullyConnectedLayerEx::CLFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager) : _memory_group(memory_manager), _weights_manager(weights_manager), _convert_weights(), @@ -178,8 +171,13 @@ void CLFullyConnectedLayerEx::configure_mm(const ICLTensor *input, const ICLTens fc_info.retain_internal_weights, // retain_internal_weights gemmlowp_output_stage, // gemmlowp_output_stage fc_info.fp_mixed_precision, // fp_mixed_precision + false, // fast_math true, // broadcast_bias - ActivationLayerInfo()); // activation_info + ActivationLayerInfo(), // activation_info + false, // fixed_format + WeightFormat::OHWI, // weight_format + false, // pretranspose_B + bias != nullptr); // accumulate if (_is_quantized) { @@ -358,11 +356,9 @@ Status CLFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor bool weights_reshaped = fc_info.transpose_weights ?
fc_info.are_weights_reshaped : true; bool is_fc_after_conv = true; - const ITensorInfo &flatten_input = TensorInfo(input->clone() - ->set_is_resizable(true) - .reset_padding() - .set_tensor_shape(compute_flatten_shape(input)) - .set_data_layout(DataLayout::NCHW)); + const ITensorInfo &flatten_input = + TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( + compute_flatten_shape(input))); const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( compute_transposed_shape(*weights))); @@ -395,8 +391,7 @@ Status CLFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor if (!weights_reshaped) { // Validate reshape weights kernel - ARM_COMPUTE_RETURN_ON_ERROR( - CLFullyConnectedLayerReshapeWeightsEx::validate(weights, &reshaped_weights)); + ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(weights, &reshaped_weights)); weights_to_use = &reshaped_weights; } @@ -507,77 +502,6 @@ void CLFullyConnectedLayerEx::run() void CLFullyConnectedLayerEx::prepare() { -#if 0 // TODO Remove this block - if(!_is_prepared) - { - if(!_weights_manager) - { - ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - } - - auto release_unused = [](CLTensor * w) - { - if(!w->is_used()) - { - CLScheduler::get().queue().finish(); - w->allocator()->free(); - } - }; - - // Pointer to current weights - const ICLTensor *cur_weights = _original_weights; - - // Reshape of the weights if needed (happens only once) - if(!_are_weights_reshaped) - { - if(_weights_manager && _weights_manager->are_weights_managed(_original_weights)) - { - cur_weights = utils::cast::polymorphic_downcast<ICLTensor *>(_weights_manager->run(cur_weights, &_reshape_weights_managed_function)); - } - else - { - // Run reshape weights kernel and mark weights as unused - _reshape_weights_output.allocator()->allocate(); - _reshape_weights_function.run(); - - cur_weights->mark_as_unused(); - cur_weights = &_reshape_weights_output; - } - _are_weights_reshaped = true; - } - - // Convert weights if needed (happens only once) - if(!_are_weights_converted) - { - if(_weights_manager && _weights_manager->are_weights_managed(cur_weights)) - { - _weights_manager->run(cur_weights, &_convert_weights_managed); - } - else - { - _converted_weights_output.allocator()->allocate(); - _convert_weights.run(); - cur_weights->mark_as_unused(); - } - - _are_weights_converted = true; - } - - // Release reshaped weights if unused - release_unused(&_reshape_weights_output); - - // Prepare GEMM prepare and release unused weights - if(!_is_quantized) - { - _mm_gemm.prepare(); - } - - // Release converted weights if unused - release_unused(&_reshape_weights_output); - release_unused(&_converted_weights_output); - - _is_prepared = true; - } -#endif + // DO NOTHING } } // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp index 4d940e96632..e67bb1ce6ea 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp @@ -43,8 +43,8 @@ namespace arm_compute { CLPadLayerEx::CLPadLayerEx() - : _pad_kernel(std::make_unique<CLPadLayerKernelEx>()), - _copy_kernel(std::make_unique<opencl::kernels::ClCopyKernel>()), _perform_pad(false) + : _pad_kernel(std::make_unique<CLPadLayerKernelEx>()), _copy_kernel(std::make_unique<CLCopy>()), + _perform_pad(false) { } @@ -74,7 +74,7 @@ void CLPadLayerEx::configure(const CLCompileContext &compile_context, ICLTensor Window copy_window = Window();
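// A minimal usage sketch of the function-level CLCopy configured below
// (assuming ICLTensor *input / *output as in this method; a standalone
// copy would look the same outside this class):
//   CLCopy copy;
//   copy.configure(compile_context, input, output, &copy_window);
//   copy.run();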
copy_window.use_tensor_dimensions(output->info()->tensor_shape()); // Copy the input to the whole output if no padding is applied - _copy_kernel->configure(compile_context, input->info(), output->info(), &copy_window); + _copy_kernel->configure(compile_context, input, output, &copy_window); } } Status CLPadLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output, @@ -92,7 +92,7 @@ Status CLPadLayerEx::validate(const ITensorInfo *input, const ITensorInfo *outpu } else { - ARM_COMPUTE_RETURN_ON_ERROR(opencl::kernels::ClCopyKernel::validate(input, output)); + ARM_COMPUTE_RETURN_ON_ERROR(CLCopy::validate(input, output)); } return Status{}; } @@ -104,7 +104,7 @@ void CLPadLayerEx::run() } else { - CLScheduler::get().enqueue(*_copy_kernel); + _copy_kernel->run(); } } } // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp index f3f093c188d..af0bc49e168 100644 --- a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp @@ -44,6 +44,7 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" #include <cmath> #include <memory> diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp index fbd88fff0a9..4505122dc75 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp @@ -64,19 +64,6 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I } } // namespace -void NEFullyConnectedHybridLayerReshapeWeights::configure(const ITensor *input, ITensor *output) -{ - auto k = std::make_unique<NETransposeKernel>(); - k->configure(input, output); - _kernel = std::move(k); -} - -Status NEFullyConnectedHybridLayerReshapeWeights::validate(const ITensorInfo *input, - const ITensorInfo *output) -{ - return NETransposeKernel::validate(input, output); -} - NEFullyConnectedHybridLayer::NEFullyConnectedHybridLayer( std::shared_ptr<IMemoryManager> memory_manager) : _memory_group(std::move(memory_manager)), _reshape_weights_function(), _quant_input_kernel(), @@ -108,6 +95,7 @@ void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor _are_weights_reshaped = fc_info.transpose_weights ?
fc_info.are_weights_reshaped : true; _accumulate_biases = false; + _is_prepared = fc_info.retain_internal_weights; _original_weights = weights; // Configure accumulate biases kernel for non quantized asymmetric types @@ -129,7 +117,7 @@ void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor // Check if we have a fully connected layer with batches const bool is_batched_fc_layer = output->info()->dimension(1) > 1; - bool _is_fc_after_conv; + bool _is_fc_after_conv = false; if (is_batched_fc_layer) { _is_fc_after_conv = @@ -143,7 +131,7 @@ void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor } ARM_COMPUTE_ERROR_ON_MSG(_is_fc_after_conv, "NEFullyConnectedHybridLayer does not support after conv"); - (void)_is_fc_after_conv; + ARM_COMPUTE_UNUSED(_is_fc_after_conv); // Reshape weights if needed if (!_are_weights_reshaped) @@ -216,8 +204,7 @@ Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe if (!weights_reshaped) { // Validate reshape weights kernel - ARM_COMPUTE_RETURN_ON_ERROR( - NEFullyConnectedHybridLayerReshapeWeights::validate(weights_to_use, &reshaped_weights)); + ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(weights_to_use, &reshaped_weights)); weights_to_use = &reshaped_weights; } diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp index 758f7dc59cb..36adc045d11 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp @@ -56,8 +56,66 @@ using namespace arm_compute::misc::shape_calculator; namespace { -Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output) +Status construct_gemmlowp_output_stage(const ITensorInfo &input, const ITensorInfo &weights, + const ITensorInfo &output, + GEMMLowpOutputStageInfo &gemmlowp_output_stage) { + gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + gemmlowp_output_stage.gemmlowp_offset = 0; + gemmlowp_output_stage.gemmlowp_multiplier = 0; + gemmlowp_output_stage.gemmlowp_shift = 0; + + // Configure output stage for quantized case + if (is_data_type_quantized_asymmetric(input.data_type())) + { + const UniformQuantizationInfo iq_info = input.quantization_info().uniform(); + const UniformQuantizationInfo wq_info = weights.quantization_info().uniform(); + const UniformQuantizationInfo oq_info = output.quantization_info().uniform(); + + const auto output_quant_info = (output.total_size() == 0) ? 
iq_info : oq_info; + + const float multiplier = (iq_info.scale * wq_info.scale) / output_quant_info.scale; + int output_multiplier = 0; + int output_shift = 0; + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one( + multiplier, &output_multiplier, &output_shift)); + + // Set the GEMMLowp output stage info + gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset; + gemmlowp_output_stage.gemmlowp_multiplier = output_multiplier; + gemmlowp_output_stage.gemmlowp_shift = output_shift; + gemmlowp_output_stage.gemmlowp_min_bound = 0; + gemmlowp_output_stage.gemmlowp_max_bound = 255; + gemmlowp_output_stage.gemmlowp_multipliers.push_back(output_multiplier); + gemmlowp_output_stage.gemmlowp_shifts.push_back(output_shift); + } + + return Status{}; +} + +Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo *bias, + const ITensorInfo &output, const FullyConnectedLayerInfo &fc_info) +{ + GEMMLowpOutputStageInfo gemmlowp_output_stage; + ARM_COMPUTE_RETURN_ON_ERROR( + construct_gemmlowp_output_stage(input, weights, output, gemmlowp_output_stage)); + + const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped + false, // is_b_reshaped + true, // reshape_b_only_on_first_run + 0, // depth_output_gemm3d + false, // reinterpret_input_as_3d + fc_info.retain_internal_weights, // retain_internal_weights + gemmlowp_output_stage, // gemmlowp_output_stage + fc_info.fp_mixed_precision, // fp_mixed_precision + false, // fast_math + true, // broadcast_bias + ActivationLayerInfo(), // activation_info + false, // fixed_format + WeightFormat::OHWI, // weight_format + false, // pretranspose_B + bias != nullptr); // accumulate + if (is_data_type_quantized_asymmetric(input.data_type())) { // Since we need negative offsets for computing convolution, we need to change @@ -71,13 +129,13 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I // Validate gemmlowp function ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate( &input.clone()->set_quantization_info(input_quantization_info), - &weights.clone()->set_quantization_info(weights_quantization_info), nullptr, &output)); + &weights.clone()->set_quantization_info(weights_quantization_info), bias, &output, + gemm_info)); } else { ARM_COMPUTE_RETURN_ON_ERROR( - NEGEMM::validate(&input, &weights, nullptr, &output, 1.f, 0.0f, - GEMMInfo(false, false, false /* Reshape weights only for the first run */))); + NEGEMM::validate(&input, &weights, bias, &output, 1.f, 0.0f, gemm_info)); } return Status{}; @@ -85,18 +143,38 @@ } // namespace NEFullyConnectedLayerEx::NEFullyConnectedLayerEx(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), _flatten_kernel(), _convert_weights(), - _reshape_weights_function(), _mm_gemm(), _mm_gemmlowp(), _gemmlowp_output_stage(), - _accumulate_biases_kernel(), _flatten_output(), _gemmlowp_output(), _converted_weights_output(), - _reshape_weights_output(), _original_weights(nullptr), _are_weights_converted(true), - _are_weights_reshaped(false), _is_fc_after_conv(false), _accumulate_biases(false), - _is_quantized(false), _is_prepared(false) + : _memory_group(std::move(memory_manager)), _convert_weights(), _flatten_kernel(), + _reshape_weights_function(), _mm_gemm(), _mm_gemmlowp(), _flatten_output(), + _converted_weights_output(), _reshape_weights_output(), _are_weights_converted(true), + _are_weights_reshaped(false),
_is_fc_after_conv(false), _is_quantized(false), + _is_prepared(false), _original_weights(nullptr) { } void NEFullyConnectedLayerEx::configure_mm(const ITensor *input, const ITensor *weights, - ITensor *output) + const ITensor *bias, ITensor *output, + const FullyConnectedLayerInfo &fc_info) { + GEMMLowpOutputStageInfo gemmlowp_output_stage; + construct_gemmlowp_output_stage(*input->info(), *weights->info(), *output->info(), + gemmlowp_output_stage); + + const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped + false, // is_b_reshaped + true, // reshape_b_only_on_first_run + 0, // depth_output_gemm3d + false, // reinterpret_input_as_3d + fc_info.retain_internal_weights, // retain_internal_weights + gemmlowp_output_stage, // gemmlowp_output_stage + fc_info.fp_mixed_precision, // fp_mixed_precision + false, // fast_math + true, // broadcast_bias + ActivationLayerInfo(), // activation_info + false, // fixed_format + WeightFormat::OHWI, // weight_format + false, // pretranspose_B + bias != nullptr); // accumulate + if (_is_quantized) { // Since we need negative offsets for computing convolution, we need to change @@ -111,7 +189,7 @@ void NEFullyConnectedLayerEx::configure_mm(const ITensor *input, const ITensor * weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); // Configure gemmlowp function - _mm_gemmlowp.configure(input, weights, nullptr, output); + _mm_gemmlowp.configure(input, weights, bias, output, gemm_info); // Revert back QuantizatioInfo as input and weights could be used in other fully connected // layers @@ -121,13 +199,13 @@ void NEFullyConnectedLayerEx::configure_mm(const ITensor *input, const ITensor * else { // Configure matrix multiply kernel - _mm_gemm.configure(input, weights, nullptr, output, 1.f, 0.0f, - GEMMInfo(false, false, false /* Reshape weights only for the first run */)); + _mm_gemm.configure(input, weights, bias, output, 1.f, 1.0f, gemm_info); } } void NEFullyConnectedLayerEx::configure_conv_fc(const ITensor *input, const ITensor *weights, - ITensor *output) + const ITensor *bias, ITensor *output, + const FullyConnectedLayerInfo &fc_info) { ARM_COMPUTE_ERROR_ON( (weights->info()->dimension(1) != @@ -146,19 +224,20 @@ void NEFullyConnectedLayerEx::configure_conv_fc(const ITensor *input, const ITen _flatten_kernel.configure(input, &_flatten_output); // Configure matrix multiply kernel - configure_mm(&_flatten_output, weights, output); + configure_mm(&_flatten_output, weights, bias, output, fc_info); // Allocate the output tensor for flatten once all the configure methods have been called _flatten_output.allocator()->allocate(); } void NEFullyConnectedLayerEx::configure_fc_fc(const ITensor *input, const ITensor *weights, - ITensor *output) + const ITensor *bias, ITensor *output, + const FullyConnectedLayerInfo &fc_info) { ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1)); // Configure matrix multiply kernel - configure_mm(input, weights, output); + configure_mm(input, weights, bias, output, fc_info); } void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *weights, @@ -174,26 +253,9 @@ void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *wei _are_weights_converted = true; _are_weights_reshaped = fc_info.transpose_weights ?
fc_info.are_weights_reshaped : true; _is_fc_after_conv = true; - _accumulate_biases = false; _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type()); _original_weights = weights; - // Configure gemmlowp output - if (_is_quantized) - { - _gemmlowp_output.allocator()->init( - output->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); - } - - // Configure accumulate biases kernel for non quantized asymmetric types - if (biases != nullptr && !_is_quantized) - { - _accumulate_biases = true; - - // Configure accumulate biases kernel - _accumulate_biases_kernel.configure(output, biases); - } - // With the Fully Connected layer we can have 4 different cases: // 1) Convolution layer -> Fully Connected layer without batches // 2) Fully Connected layer -> Fully Connected layer without batches @@ -235,32 +297,15 @@ void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *wei _are_weights_converted = false; } - ITensor *tmp_output = (_is_quantized) ? &_gemmlowp_output : output; if (_is_fc_after_conv) { // Fully Connected layer after a Convolution Layer without batches - configure_conv_fc(input, weights_to_use, tmp_output); + configure_conv_fc(input, weights_to_use, biases, output, fc_info); } else { // Fully Connected layer after a Fully Connected Layer without batches - configure_fc_fc(input, weights_to_use, tmp_output); - } - - // Configure output stage for asymmetric quantized types - if (_is_quantized) - { - float multiplier = input->info()->quantization_info().uniform().scale * - weights->info()->quantization_info().uniform().scale / - output->info()->quantization_info().uniform().scale; - int output_multiplier; - int output_shift; - quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, - &output_shift); - _gemmlowp_output_stage.configure(&_gemmlowp_output, biases, output, output_multiplier, - output_shift, - output->info()->quantization_info().uniform().offset); - _gemmlowp_output.allocator()->allocate(); + configure_fc_fc(input, weights_to_use, biases, output, fc_info); } _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights; @@ -279,7 +324,6 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; bool is_fc_after_conv = true; - bool is_quantized = is_data_type_quantized_asymmetric(input->data_type()); const ITensorInfo &flatten_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape( @@ -290,15 +334,6 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor const ITensorInfo &converted_weights = weights_reshaped ? 
TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone()); - const ITensorInfo &gemmlowp_output = TensorInfo( - output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32)); - - // Configure accumulate biases kernel for non quantized asymmetric types - if (biases != nullptr && !is_quantized) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAccumulateBiasesKernel::validate(output, biases)); - } // With the Fully Connected layer we can have 4 different cases: // 1) Convolution layer -> Fully Connected layer without batches @@ -308,7 +343,6 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor const ITensorInfo *input_to_use = input; const ITensorInfo *weights_to_use = weights; - const ITensorInfo *tmp_output = (is_quantized) ? &gemmlowp_output : output; // Check if we have a fully connected layer with batches const bool is_batched_fc_layer = output->dimension(1) > 1; @@ -327,8 +361,7 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor if (!weights_reshaped) { // Validate reshape weights kernel - ARM_COMPUTE_RETURN_ON_ERROR( - NEFullyConnectedLayerReshapeWeights::validate(weights, &reshaped_weights)); + ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(weights, &reshaped_weights)); weights_to_use = &reshaped_weights; } @@ -357,14 +390,8 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1)); } // Validate matrix multiply kernel - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*input_to_use, *weights_to_use, *tmp_output)); - - // Validate output stage for asymmetric quantized types - if (is_quantized) - { - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate( - &gemmlowp_output, biases, output)); - } + ARM_COMPUTE_RETURN_ON_ERROR( + validate_mm(*input_to_use, *weights_to_use, biases, *output, fc_info)); return Status{}; } @@ -374,13 +401,9 @@ void NEFullyConnectedLayerEx::run() if (!_is_prepared) { if (!_are_weights_reshaped) - { _reshape_weights_output.allocator()->allocate(); - } if (!_are_weights_converted) - { _converted_weights_output.allocator()->allocate(); - } _is_prepared = true; } @@ -423,75 +446,10 @@ void NEFullyConnectedLayerEx::run() { _mm_gemm.run(); } - - // Accumulate biases if provided - if (_is_quantized) - { - _gemmlowp_output_stage.run(); - } - else - { - if (_accumulate_biases) - { - NEScheduler::get().schedule(&_accumulate_biases_kernel, Window::DimY); - } - } } void NEFullyConnectedLayerEx::prepare() { -#if 0 // TODO Remove this block - if (!_is_prepared) - { - ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); - - auto release_unused = [](Tensor *w) { - if (!w->is_used()) - { - w->allocator()->free(); - } - }; - - // Pointer to current weights - const ITensor *cur_weights = _original_weights; - - // Reshape of the weights (happens only once) - if (!_are_weights_reshaped) - { - // Run reshape weights kernel and mark weights as unused - _reshape_weights_output.allocator()->allocate(); - _reshape_weights_function.run(); - - cur_weights->mark_as_unused(); - cur_weights = &_reshape_weights_output; - _are_weights_reshaped = true; - } - - // Convert weights if needed (happens only once) - if (!_are_weights_converted) - { - _converted_weights_output.allocator()->allocate(); - _convert_weights.run(); - - cur_weights->mark_as_unused(); - 
_are_weights_converted = true; - } - - // Release reshaped weights if unused - release_unused(&_reshape_weights_output); - - // Prepare GEMM prepare and release unused weights - if (!_is_quantized) - { - _mm_gemm.prepare(); - } - - // Release converted weights if unused - release_unused(&_reshape_weights_output); - release_unused(&_converted_weights_output); - - _is_prepared = true; - } -#endif + // DO NOTHING } } // namespace arm_compute diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp index 2199839fb86..a525214d349 100644 --- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp +++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp @@ -41,7 +41,9 @@ void NEFullyConnectedReshapingLayer::configure(const arm_compute::ITensor *input if (_needs_reshape) { // reshape - auto_init_if_empty(*_neon_buffer.info(), _input->info()->clone()->set_tensor_shape(reshape)); + auto_init_if_empty(*_neon_buffer.info(), + _input->info()->clone()->set_tensor_shape(reshape).set_data_layout( + _input->info()->data_layout())); _neon_reshape.configure(_input, &_neon_buffer); input_to_use = &_neon_buffer; } @@ -53,11 +55,10 @@ void NEFullyConnectedReshapingLayer::configure(const arm_compute::ITensor *input fc->configure(input_to_use, _weights, _biases, _output); return std::unique_ptr<arm_compute::IFunction>(fc); } - else + else if (kernel_type == KernelType::PREPROCESSED_WEIGHTS) { - assert(kernel_type == KernelType::PREPROCESSED_WEIGHTS); - - bool is_hybrid = input->info()->data_type() == DataType::F32 && + bool is_hybrid = (input->info()->data_type() == DataType::F32 || + input->info()->data_type() == DataType::F16) && (weights->info()->data_type() == DataType::QSYMM8 || weights->info()->data_type() == DataType::QASYMM8_SIGNED); @@ -78,6 +79,10 @@ void NEFullyConnectedReshapingLayer::configure(const arm_compute::ITensor *input return std::unique_ptr<arm_compute::IFunction>(fc); } } + else + { + throw std::runtime_error("NEFullyConnectedReshapingLayer: Unsupported kernel type"); + } }(); // NOTE _neon_buffer is inaccessible from outside, and thus it is safe to invoke allocate here.
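Editor's note on the two hunks above: the net effect is that bias handling moves out of the removed accumulate-biases kernel and into the GEMM call itself (beta = 1.0f plus the broadcast_bias/accumulate flags in GEMMInfo). Below is a minimal sketch of that pattern against the plain ACL v24.07 API; the function name and the three-argument GEMMInfo are illustrative assumptions, not code from this patch.

#include <arm_compute/runtime/NEON/functions/NEGEMM.h>

// Sketch: the bias tensor rides through NEGEMM as the third operand instead of
// being added by a separate accumulate-biases kernel run afterwards.
void fc_with_fused_bias(const arm_compute::ITensor *input, const arm_compute::ITensor *weights,
                        const arm_compute::ITensor *bias, arm_compute::ITensor *output)
{
  arm_compute::NEGEMM gemm;
  // beta = 1.0f keeps the bias operand in the accumulation, mirroring
  // _mm_gemm.configure(input, weights, bias, output, 1.f, 1.0f, gemm_info) above.
  gemm.configure(input, weights, bias, output, 1.f, 1.0f,
                 arm_compute::GEMMInfo(false /* is_a_reshaped */, false /* is_b_reshaped */,
                                       true /* reshape_b_only_on_first_run */));
  gemm.run();
}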
diff --git a/infra/cmake/packages/ARMComputeSourceConfig.cmake b/infra/cmake/packages/ARMComputeSourceConfig.cmake index 16e12bbcaba..c2ad30b67ef 100644 --- a/infra/cmake/packages/ARMComputeSourceConfig.cmake +++ b/infra/cmake/packages/ARMComputeSourceConfig.cmake @@ -8,7 +8,7 @@ function(_ARMComputeSource_import) nnas_include(OptionTools) envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com") - set(ARMCOMPUTE_URL ${EXTERNAL_DOWNLOAD_SERVER}/ARM-software/ComputeLibrary/archive/v21.02.tar.gz) + set(ARMCOMPUTE_URL ${EXTERNAL_DOWNLOAD_SERVER}/ARM-software/ComputeLibrary/archive/v24.07.tar.gz) ExternalSource_Download(ARMCOMPUTE ${ARMCOMPUTE_URL}) set(ARMComputeSource_DIR ${ARMCOMPUTE_SOURCE_DIR} PARENT_SCOPE) diff --git a/infra/nnfw/cmake/packages/ARMComputeConfig.cmake b/infra/nnfw/cmake/packages/ARMComputeConfig.cmake index e55a0f4aebd..6f49496f033 100644 --- a/infra/nnfw/cmake/packages/ARMComputeConfig.cmake +++ b/infra/nnfw/cmake/packages/ARMComputeConfig.cmake @@ -29,12 +29,6 @@ function(_ARMCompute_Import) list(APPEND INCLUDE_DIR ${ARMComputeSource_DIR} ${ARMComputeSource_DIR}/include) endif(NOT INCLUDE_DIR OR NOT HEADER_SRC_DIR) - if(NOT CORE_LIBRARY) - set(ARMCompute_FOUND FALSE PARENT_SCOPE) - message(STATUS "Cannot find libarm_compute_core.so") - return() - endif() - if(NOT RUNTIME_LIBRARY) message(STATUS "Cannot find libarm_compute.so") set(ARMCompute_FOUND FALSE PARENT_SCOPE) @@ -47,18 +41,10 @@ function(_ARMCompute_Import) return() endif() - if(NOT TARGET arm_compute_core) - add_library(arm_compute_core INTERFACE) - target_include_directories(arm_compute_core SYSTEM INTERFACE ${INCLUDE_DIR}) - target_link_libraries(arm_compute_core INTERFACE dl ${LIB_PTHREAD}) - target_link_libraries(arm_compute_core INTERFACE ${CORE_LIBRARY}) - endif(NOT TARGET arm_compute_core) - if(NOT TARGET arm_compute) add_library(arm_compute INTERFACE) target_include_directories(arm_compute SYSTEM INTERFACE ${INCLUDE_DIR}) target_link_libraries(arm_compute INTERFACE ${RUNTIME_LIBRARY}) - target_link_libraries(arm_compute INTERFACE arm_compute_core) endif(NOT TARGET arm_compute) if(NOT TARGET arm_compute_graph) diff --git a/nnpackage/schema/circle_schema.fbs b/nnpackage/schema/circle_schema.fbs index 0498318bfce..e13bd3842cb 100644 --- a/nnpackage/schema/circle_schema.fbs +++ b/nnpackage/schema/circle_schema.fbs @@ -33,7 +33,8 @@ // Version 0.6: Base up to TensorFlow Lite v2.13.0 schema. // Version 0.7: Base up to TensorFlow Lite v2.15.0 schema, deprecate data_format in Subgraph table // Version 0.8: GRU op is added. UINT4 is added. -// Version 0.9: GGML_Q{X}_{Y} types are added. Weight compression option is added +// Version 0.9: GGML_Q{X}_{Y} types are added. Weight compression option is added. +// ROPE op is added. namespace circle; @@ -286,6 +287,7 @@ table Tensor { // set of acceptable options. // LINT.IfChange enum BuiltinOperator : int32 { + ROPE = -7, RMS_NORM = -6, GRU = -5, BCQ_GATHER = -4, @@ -636,6 +638,7 @@ union BuiltinOptions { BitcastOptions, BitwiseXorOptions, RightShiftOptions, + RoPEOptions = 249, RmsNormOptions = 250, GRUOptions = 251, BCQGatherOptions = 252, @@ -1525,6 +1528,15 @@ table RmsNormOptions { epsilon:float; } +enum RoPEMode : int { + GPT_NEOX, + GPT_J, +} + +table RoPEOptions { + mode: RoPEMode; +} + // An OperatorCode can be an enum value (BuiltinOperator) if the operator is a // builtin, or a string if the operator is custom. 
table OperatorCode { diff --git a/res/CircleRecipes/RmsNorm_000/test.recipe b/res/CircleRecipes/RmsNorm_000/test.recipe new file mode 100644 index 00000000000..e5e0c30df14 --- /dev/null +++ b/res/CircleRecipes/RmsNorm_000/test.recipe @@ -0,0 +1,46 @@ +operand { + name: "ifm" + type: FLOAT32 + shape { dim: 1 dim: 3 dim: 3 dim: 4 } +} +operand { + name: "gamma" + type: FLOAT32 + shape { dim: 4 } + filler { + tag: "explicit" + arg: "1.0" + arg: "1.0" + arg: "1.0" + arg: "1.0" + } +} +operand { + name: "beta" + type: FLOAT32 + shape { dim: 4 } + filler { + tag: "explicit" + arg: "0.0" + arg: "0.0" + arg: "0.0" + arg: "0.0" + } +} +operand { + name: "ofm" + type: FLOAT32 + shape { dim: 1 dim: 3 dim: 3 dim: 4 } +} +operation { + type: "RmsNorm" + input: "ifm" + input: "gamma" + input: "beta" + output: "ofm" + rms_norm_options { + epsilon: 0.0001 + } +} +input: "ifm" +output: "ofm" diff --git a/res/CircleRecipes/RmsNorm_000/test.reverse b/res/CircleRecipes/RmsNorm_000/test.reverse new file mode 100644 index 00000000000..e69de29bb2d diff --git a/res/CircleSchema/0.9/circle_schema.fbs b/res/CircleSchema/0.9/circle_schema.fbs index 0498318bfce..e13bd3842cb 100644 --- a/res/CircleSchema/0.9/circle_schema.fbs +++ b/res/CircleSchema/0.9/circle_schema.fbs @@ -33,7 +33,8 @@ // Version 0.6: Base up to TensorFlow Lite v2.13.0 schema. // Version 0.7: Base up to TensorFlow Lite v2.15.0 schema, deprecate data_format in Subgraph table // Version 0.8: GRU op is added. UINT4 is added. -// Version 0.9: GGML_Q{X}_{Y} types are added. Weight compression option is added +// Version 0.9: GGML_Q{X}_{Y} types are added. Weight compression option is added. +// ROPE op is added. namespace circle; @@ -286,6 +287,7 @@ table Tensor { // set of acceptable options. // LINT.IfChange enum BuiltinOperator : int32 { + ROPE = -7, RMS_NORM = -6, GRU = -5, BCQ_GATHER = -4, @@ -636,6 +638,7 @@ union BuiltinOptions { BitcastOptions, BitwiseXorOptions, RightShiftOptions, + RoPEOptions = 249, RmsNormOptions = 250, GRUOptions = 251, BCQGatherOptions = 252, @@ -1525,6 +1528,15 @@ table RmsNormOptions { epsilon:float; } +enum RoPEMode : int { + GPT_NEOX, + GPT_J, +} + +table RoPEOptions { + mode: RoPEMode; +} + // An OperatorCode can be an enum value (BuiltinOperator) if the operator is a // builtin, or a string if the operator is custom. 
table OperatorCode { diff --git a/res/TensorFlowLiteRecipes/Net_RmsNorm_000/test.recipe b/res/TensorFlowLiteRecipes/Net_RmsNorm_000/test.recipe new file mode 100644 index 00000000000..b89984abfee --- /dev/null +++ b/res/TensorFlowLiteRecipes/Net_RmsNorm_000/test.recipe @@ -0,0 +1,117 @@ +operand { + name: "Input" + type: FLOAT32 + shape { + dim: 1 + dim: 1 + dim: 4 + } +} +operand { + name: "RmsNorm/Mul/Square" + type: FLOAT32 + shape { + dim: 1 + dim: 1 + dim: 4 + } +} +operand { + name: "RmsNorm/Mean/Axis" + type: INT32 + shape { + } + filler { + tag: "explicit" + arg: "-1" + } +} +operand { + name: "RmsNorm/Mean/MeanSquare" + type: FLOAT32 + shape { + dim: 1 + dim: 1 + dim: 1 + } +} +operand { + name: "RmsNorm/Add/Epsilon" + type: FLOAT32 + shape { + } + filler { + tag: "explicit" + arg: "1e-06" + } +} +operand { + name: "RmsNorm/Add/MeanSquare_plus_eps" + type: FLOAT32 + shape { + dim: 1 + dim: 1 + dim: 1 + } +} +operand { + name: "RmsNorm/Sqrt/RMS" + type: FLOAT32 + shape { + dim: 1 + dim: 1 + dim: 1 + } +} +operand { + name: "RmsNorm/Mul/RmsNorm" + type: FLOAT32 + shape { + dim: 1 + dim: 1 + dim: 4 + } +} +operation { + type: "Mul" + input: "Input" + input: "Input" + output: "RmsNorm/Mul/Square" + mul_options { + activation: NONE + } +} +operation { + type: "Mean" + input: "RmsNorm/Mul/Square" + input: "RmsNorm/Mean/Axis" + output: "RmsNorm/Mean/MeanSquare" + mean_options { + keep_dims: true + } +} +operation { + type: "Add" + input: "RmsNorm/Mean/MeanSquare" + input: "RmsNorm/Add/Epsilon" + output: "RmsNorm/Add/MeanSquare_plus_eps" + add_options { + activation: NONE + } +} +operation { + type: "Rsqrt" + input: "RmsNorm/Add/MeanSquare_plus_eps" + output: "RmsNorm/Sqrt/RMS" +} +operation { + type: "Mul" + input: "Input" + input: "RmsNorm/Sqrt/RMS" + output: "RmsNorm/Mul/RmsNorm" + mul_options { + activation: NONE + } +} +input: "Input" +output: "RmsNorm/Mul/RmsNorm" diff --git a/res/TensorFlowLiteRecipes/Net_RmsNorm_000/test.rule b/res/TensorFlowLiteRecipes/Net_RmsNorm_000/test.rule new file mode 100644 index 00000000000..1586fc89482 --- /dev/null +++ b/res/TensorFlowLiteRecipes/Net_RmsNorm_000/test.rule @@ -0,0 +1,7 @@ +# To check if this network is converted to circle RmsNorm op + +RULE "VERIFY_FILE_FORMAT" $(verify_file_format) '=' 1 + +RULE "RMS_NORM_EXIST" $(op_count RMS_NORM) '=' 1 +RULE "NO_ADD" $(op_count ADD) '=' 0 +RULE "NO_MUL" $(op_count MUL) '=' 0 diff --git a/res/TensorFlowLiteRecipes/REGRESS_Issue_13863/test.recipe b/res/TensorFlowLiteRecipes/REGRESS_Issue_13863/test.recipe new file mode 100644 index 00000000000..1407f63b353 --- /dev/null +++ b/res/TensorFlowLiteRecipes/REGRESS_Issue_13863/test.recipe @@ -0,0 +1,258 @@ +# This recipe was created using tflchef-reverse with badDead.zip +# from the How_to_reproduce section of Issue_13863. +# In the model, the dim value was changed to a single digit value, +# and the shape_signature was removed. 
+# https://github.com/Samsung/ONE/issues/13863 + +operand { + name: "serving_default_input:0" + type: FLOAT32 + shape { + dim: 1 + dim: 4 + dim: 4 + dim: 3 + } + quant { + quantized_dimension: 0 + } + is_variable: false +} +operand { + name: "Const" + type: FLOAT32 + shape { + } + filler { + tag: "explicit" + arg: "2" + } + quant { + quantized_dimension: 0 + } + is_variable: false +} +operand { + name: "Const_1" + type: FLOAT32 + shape { + } + filler { + tag: "explicit" + arg: "4" + } + quant { + quantized_dimension: 0 + } + is_variable: false +} +operand { + name: "model/tf.split/split/split_dim" + type: INT32 + shape { + } + filler { + tag: "explicit" + arg: "1" + } + quant { + quantized_dimension: 0 + } + is_variable: false +} +operand { + name: "model/flatten/Const" + type: INT32 + shape { + dim: 2 + } + filler { + tag: "explicit" + arg: "-1" + arg: "48" + } + quant { + quantized_dimension: 0 + } + is_variable: false +} +operand { + name: "PartitionedCall:3" + type: FLOAT32 + shape { + dim: 1 + dim: 1 + dim: 4 + dim: 4 + dim: 3 + } + quant { + quantized_dimension: 0 + } + is_variable: false +} +operand { + name: "model/flatten/Reshape" + type: FLOAT32 + shape { + dim: 1 + dim: 48 + } + quant { + quantized_dimension: 0 + } + is_variable: false +} +operand { + name: "PartitionedCall:0" + type: FLOAT32 + shape { + dim: 1 + dim: 16 + } + quant { + quantized_dimension: 0 + } + is_variable: false +} +operand { + name: "model/tf.split/split" + type: FLOAT32 + shape { + dim: 1 + dim: 16 + } + quant { + quantized_dimension: 0 + } + is_variable: false +} +operand { + name: "model/tf.split/split1" + type: FLOAT32 + shape { + dim: 1 + dim: 16 + } + quant { + quantized_dimension: 0 + } + is_variable: false +} +operand { + name: "model/tf.compat.v1.math.scalar_mul_1/Mul" + type: FLOAT32 + shape { + dim: 1 + dim: 48 + } + quant { + quantized_dimension: 0 + } + is_variable: false +} +operand { + name: "PartitionedCall:2" + type: FLOAT32 + shape { + dim: 1 + dim: 1 + dim: 48 + } + quant { + quantized_dimension: 0 + } + is_variable: false +} +operand { + name: "model/tf.compat.v1.math.scalar_mul/Mul" + type: FLOAT32 + shape { + dim: 1 + dim: 48 + } + quant { + quantized_dimension: 0 + } + is_variable: false +} +operand { + name: "PartitionedCall:1" + type: FLOAT32 + shape { + dim: 1 + dim: 1 + dim: 48 + } + quant { + quantized_dimension: 0 + } + is_variable: false +} +operation { + type: "Pack" + input: "serving_default_input:0" + output: "PartitionedCall:3" + pack_options { + values_count: 1 + axis: 0 + } +} +operation { + type: "Reshape" + input: "serving_default_input:0" + input: "model/flatten/Const" + output: "model/flatten/Reshape" +} +operation { + type: "Split" + input: "model/tf.split/split/split_dim" + input: "model/flatten/Reshape" + output: "PartitionedCall:0" + output: "model/tf.split/split" + output: "model/tf.split/split1" + split_options { + num_splits: 3 + } +} +operation { + type: "Mul" + input: "model/flatten/Reshape" + input: "Const_1" + output: "model/tf.compat.v1.math.scalar_mul_1/Mul" + mul_options { + activation: NONE + } +} +operation { + type: "Pack" + input: "model/tf.compat.v1.math.scalar_mul_1/Mul" + output: "PartitionedCall:2" + pack_options { + values_count: 1 + axis: 0 + } +} +operation { + type: "Mul" + input: "model/flatten/Reshape" + input: "Const" + output: "model/tf.compat.v1.math.scalar_mul/Mul" + mul_options { + activation: NONE + } +} +operation { + type: "Pack" + input: "model/tf.compat.v1.math.scalar_mul/Mul" + output: "PartitionedCall:1" + pack_options { + 
values_count: 1 + axis: 0 + } +} +input: "serving_default_input:0" +output: "PartitionedCall:2" +output: "PartitionedCall:1" +output: "PartitionedCall:3" +output: "PartitionedCall:0" diff --git a/res/TensorFlowLiteRecipes/REGRESS_Issue_13863/test.rule b/res/TensorFlowLiteRecipes/REGRESS_Issue_13863/test.rule new file mode 100644 index 00000000000..e9aa6eddeb6 --- /dev/null +++ b/res/TensorFlowLiteRecipes/REGRESS_Issue_13863/test.rule @@ -0,0 +1,7 @@ +# Verify that the pack operation has been successfully removed +# Check that the reshape operation exists (substitute_pack_to_reshape pass applied) + +RULE "VERIFY_FILE_FORMAT" $(verify_file_format) '=' 1 + +RULE "NO_PACK" $(op_count PACK) '=' 0 +RULE "RESHAPE_EXIST" $(op_count RESHAPE) '=' 4 diff --git a/runtime/onert/backend/acl_cl/KernelGenerator.cc b/runtime/onert/backend/acl_cl/KernelGenerator.cc index c32c298a11e..bd4f209c6be 100644 --- a/runtime/onert/backend/acl_cl/KernelGenerator.cc +++ b/runtime/onert/backend/acl_cl/KernelGenerator.cc @@ -92,12 +92,16 @@ void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node) auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); - auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index); - assert(_ctx.at(block_size_index).data()); + if (!_ctx.at(block_size_index).data()) + throw std::runtime_error("ACL CL does not support dynamic block size for BatchToSpaceND"); + + auto block = _ctx.at(block_size_index).asVector<int32_t>(); + int32_t height = block[0]; + int32_t width = block[1]; auto fn = acl_common::generateLayer<arm_compute::CLBatchToSpaceLayer>( - ifm_tensor->handle(), block_size_tensor->handle(), ofm_tensor->handle()); + ifm_tensor->handle(), width, height, ofm_tensor->handle()); _return_fn = asAclFunction(std::move(fn)); } @@ -121,6 +125,10 @@ void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node) { case ir::operation::BinaryArithmetic::ArithmeticType::ADD: { + arm_compute::CLArithmeticAddition::validate(lhs_tensor->info(), rhs_tensor->info(), + ofm_tensor->info(), + arm_compute::ConvertPolicy::SATURATE, act_info) + .throw_if_error(); fn = acl_common::generateLayer<arm_compute::CLArithmeticAddition>( lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), arm_compute::ConvertPolicy::SATURATE, act_info); @@ -128,6 +136,10 @@ void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node) } case ir::operation::BinaryArithmetic::ArithmeticType::SUB: { + arm_compute::CLArithmeticSubtraction::validate(lhs_tensor->info(), rhs_tensor->info(), + ofm_tensor->info(), + arm_compute::ConvertPolicy::SATURATE, act_info) + .throw_if_error(); fn = acl_common::generateLayer<arm_compute::CLArithmeticSubtraction>( lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), arm_compute::ConvertPolicy::SATURATE, act_info); @@ -135,6 +147,11 @@ void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node) } case ir::operation::BinaryArithmetic::ArithmeticType::MUL: { + arm_compute::CLPixelWiseMultiplication::validate( + lhs_tensor->info(), rhs_tensor->info(), ofm_tensor->info(), 1.0, + arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_NEAREST_EVEN, + act_info) + .throw_if_error(); fn = acl_common::generateLayer<arm_compute::CLPixelWiseMultiplication>( lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_NEAREST_EVEN, @@ -143,6 +160,9 @@ void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node) } case ir::operation::BinaryArithmetic::ArithmeticType::DIV: { + 
arm_compute::CLArithmeticDivision::validate(lhs_tensor->info(), rhs_tensor->info(), + ofm_tensor->info(), act_info) + .throw_if_error(); fn = acl_common::generateLayer<arm_compute::CLArithmeticDivision>( lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), act_info); break; @@ -1529,7 +1549,7 @@ void KernelGenerator::visit(const ir::operation::Reverse &node) } auto fn = acl_common::generateLayer<arm_compute::CLReverse>( - ifm_tensor->handle(), ofm_tensor->handle(), axis_tensor->handle()); + ifm_tensor->handle(), ofm_tensor->handle(), axis_tensor->handle(), false); _return_fn = asAclFunction(std::move(fn)); } diff --git a/runtime/onert/backend/acl_common/Convert.cc b/runtime/onert/backend/acl_common/Convert.cc index eb11fcf2385..f0904a685d9 100644 --- a/runtime/onert/backend/acl_common/Convert.cc +++ b/runtime/onert/backend/acl_common/Convert.cc @@ -252,19 +252,6 @@ std::unique_ptr<AclFunction> asAclFunction(std::unique_ptr<::arm_compute::IFunct return std::make_unique<AclFunction>(std::move(layer)); } -ir::Layout asRuntimeLayout(::arm_compute::DataLayout data_layout) -{ - switch (data_layout) - { - case ::arm_compute::DataLayout::NHWC: - return ir::Layout::NHWC; - case ::arm_compute::DataLayout::NCHW: - return ir::Layout::NCHW; - default: - return ir::Layout::UNKNOWN; - } -} - ir::DataType asRuntimeDataType(::arm_compute::DataType data_type) { switch (data_type) diff --git a/runtime/onert/backend/acl_common/Convert.h b/runtime/onert/backend/acl_common/Convert.h index dd6ce59183f..6dd8d01ab06 100644 --- a/runtime/onert/backend/acl_common/Convert.h +++ b/runtime/onert/backend/acl_common/Convert.h @@ -73,7 +73,6 @@ std::unique_ptr<T_Function> asFunction(std::unique_ptr<::arm_compute::IFunction> fn) { return std::make_unique<T_Function>(std::move(fn)); } -ir::Layout asRuntimeLayout(::arm_compute::DataLayout data_layout); ir::DataType asRuntimeDataType(::arm_compute::DataType data_type); arm_compute::PoolingType convertPoolType(ir::operation::Pool2D::PoolType pool_type_ir); diff --git a/runtime/onert/backend/acl_neon/KernelGenerator.cc b/runtime/onert/backend/acl_neon/KernelGenerator.cc index f0b10399613..4712cf468bd 100644 --- a/runtime/onert/backend/acl_neon/KernelGenerator.cc +++ b/runtime/onert/backend/acl_neon/KernelGenerator.cc @@ -118,12 +118,16 @@ void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node) auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); - auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index); - assert(_ctx.at(block_size_index).data()); + if (!_ctx.at(block_size_index).data()) + throw std::runtime_error("ACL NEON does not support dynamic block size for BatchToSpaceND"); + + auto block = _ctx.at(block_size_index).asVector<int32_t>(); + int32_t height = block[0]; + int32_t width = block[1]; auto fn = acl_common::generateLayer<arm_compute::NEBatchToSpaceLayer>( - ifm_tensor->handle(), block_size_tensor->handle(), ofm_tensor->handle()); + ifm_tensor->handle(), width, height, ofm_tensor->handle()); _return_fn = asAclFunction(std::move(fn)); } @@ -145,6 +149,10 @@ void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node) { case ir::operation::BinaryArithmetic::ArithmeticType::ADD: { + arm_compute::NEArithmeticAddition::validate(lhs_tensor->info(), rhs_tensor->info(), + ofm_tensor->info(), + arm_compute::ConvertPolicy::SATURATE) + .throw_if_error(); fn = acl_common::generateLayer<arm_compute::NEArithmeticAddition>( lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), arm_compute::ConvertPolicy::SATURATE); @@ -152,6 +160,10 @@ void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node) } case 
ir::operation::BinaryArithmetic::ArithmeticType::SUB: { + arm_compute::NEArithmeticSubtraction::validate(lhs_tensor->info(), rhs_tensor->info(), + ofm_tensor->info(), + arm_compute::ConvertPolicy::SATURATE) + .throw_if_error(); fn = acl_common::generateLayer<arm_compute::NEArithmeticSubtraction>( lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), arm_compute::ConvertPolicy::SATURATE); @@ -159,6 +171,10 @@ void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node) } case ir::operation::BinaryArithmetic::ArithmeticType::MUL: { + arm_compute::NEPixelWiseMultiplication::validate( + lhs_tensor->info(), rhs_tensor->info(), ofm_tensor->info(), 1.0, + arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_ZERO) + .throw_if_error(); // RoundingPolicy for scale:1.0 is only allowed RoundingPolicy::TO_ZERO fn = acl_common::generateLayer<arm_compute::NEPixelWiseMultiplication>( lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale @@ -167,6 +183,9 @@ void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node) } case ir::operation::BinaryArithmetic::ArithmeticType::DIV: { + arm_compute::NEElementwiseDivision::validate(lhs_tensor->info(), rhs_tensor->info(), + ofm_tensor->info()) + .throw_if_error(); fn = acl_common::generateLayer<arm_compute::NEElementwiseDivision>( lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); break; diff --git a/runtime/onert/backend/cpu/CMakeLists.txt b/runtime/onert/backend/cpu/CMakeLists.txt index 12777a2173d..92cdf3c18d1 100644 --- a/runtime/onert/backend/cpu/CMakeLists.txt +++ b/runtime/onert/backend/cpu/CMakeLists.txt @@ -12,6 +12,8 @@ target_link_libraries(${LIB_ONERT_BACKEND_CPU} PRIVATE nnfw_coverage) target_link_libraries(${LIB_ONERT_BACKEND_CPU} PRIVATE ruy) target_link_libraries(${LIB_ONERT_BACKEND_CPU} INTERFACE ruy_instrumentation) target_link_libraries(${LIB_ONERT_BACKEND_CPU} PRIVATE ndarray) +# Set public: ExternalContext is used in train backend +target_link_libraries(${LIB_ONERT_BACKEND_CPU} PUBLIC ggml) set_target_properties(${LIB_ONERT_BACKEND_CPU} PROPERTIES OUTPUT_NAME backend_cpu diff --git a/runtime/onert/backend/cpu/ExternalContext.h b/runtime/onert/backend/cpu/ExternalContext.h index 8c1f4ccf16c..9c7e9368bc2 100644 --- a/runtime/onert/backend/cpu/ExternalContext.h +++ b/runtime/onert/backend/cpu/ExternalContext.h @@ -19,6 +19,7 @@ #include #include +#include <ggml.h> #include @@ -47,10 +48,18 @@ class ExternalContext _ruy_context->set_max_num_threads(target_num_threads); } + void initGgmlContext() + { + if (_ggml_context == nullptr) + _ggml_context = std::unique_ptr<struct ggml_context, decltype(&ggml_free)>( + ggml_init({.mem_size = 0, .mem_buffer = nullptr, .no_alloc = true}), &ggml_free); + } + ruy::Context *ruy_context() const { return _ruy_context.get(); } private: const std::unique_ptr<ruy::Context> _ruy_context; + std::unique_ptr<struct ggml_context, decltype(&ggml_free)> _ggml_context{nullptr, &ggml_free}; }; } // namespace cpu diff --git a/runtime/onert/backend/train/MemoryManager.cc b/runtime/onert/backend/train/MemoryManager.cc index fd156fea231..64a665dd620 100644 --- a/runtime/onert/backend/train/MemoryManager.cc +++ b/runtime/onert/backend/train/MemoryManager.cc @@ -93,6 +93,42 @@ uint8_t *DisposableMemoryManager::getBuffer(const DisposableTensorIndex &ind) co return _mem_alloc->base() + mem_blk.offset; } +LayerScopeMemoryManager::LayerScopeMemoryManager() : _mem_planner{createMemoryPlanner()} +{ + // DO NOTHING +} + +basic::IMemoryPlanner<LayerScopeTensorIndex> *LayerScopeMemoryManager::createMemoryPlanner() +{ + auto planner_id = util::getConfigString(util::config::CPU_MEMORY_PLANNER); + return MemoryPlannerFactory<LayerScopeTensorIndex>::get().create(planner_id); +} + +void 
LayerScopeMemoryManager::allocate(void) +{ + _mem_alloc = std::make_shared<basic::Allocator>(_mem_planner->capacity()); + assert(_mem_alloc->base()); +} + +uint8_t *LayerScopeMemoryManager::getBuffer(const LayerScopeTensorIndex &ind) const +{ + assert(_mem_planner->memory_plans().find(ind) != _mem_planner->memory_plans().end()); + const auto &mem_blk = _mem_planner->memory_plans().at(ind); + return _mem_alloc->base() + mem_blk.offset; +} + +void LayerScopeMemoryManager::deallocate(void) { _mem_alloc->release(); } + +void LayerScopeMemoryManager::claimPlan(const LayerScopeTensorIndex &ind, uint32_t size) +{ + _mem_planner->claim(ind, size); +} + +void LayerScopeMemoryManager::releasePlan(const LayerScopeTensorIndex &ind) +{ + _mem_planner->release(ind); +} + } // namespace train } // namespace backend } // namespace onert diff --git a/runtime/onert/backend/train/MemoryManager.h b/runtime/onert/backend/train/MemoryManager.h index 98e840bf7f7..8333c838bce 100644 --- a/runtime/onert/backend/train/MemoryManager.h +++ b/runtime/onert/backend/train/MemoryManager.h @@ -20,6 +20,7 @@ #include #include "DisposableTensorIndex.h" +#include "LayerScopeTensorIndex.h" namespace onert { @@ -67,7 +68,25 @@ class DisposableMemoryManager std::shared_ptr<basic::Allocator> _mem_alloc; }; -// TODO: Add LayerScopeMemoryManager using MemoryPlannerFactory +class LayerScopeMemoryManager +{ +public: + LayerScopeMemoryManager(); + + void allocate(void); + uint8_t *getBuffer(const LayerScopeTensorIndex &ind) const; + void deallocate(void); + + void claimPlan(const LayerScopeTensorIndex &ind, uint32_t size); + void releasePlan(const LayerScopeTensorIndex &ind); + +private: + basic::IMemoryPlanner<LayerScopeTensorIndex> *createMemoryPlanner(); + +private: + std::shared_ptr<basic::IMemoryPlanner<LayerScopeTensorIndex>> _mem_planner; + std::shared_ptr<basic::Allocator> _mem_alloc; +}; } // namespace train } // namespace backend diff --git a/runtime/onert/backend/train/MemoryPlanner.test.cc b/runtime/onert/backend/train/MemoryPlanner.test.cc index f030ecb1bff..7a908b5df87 100644 --- a/runtime/onert/backend/train/MemoryPlanner.test.cc +++ b/runtime/onert/backend/train/MemoryPlanner.test.cc @@ -25,7 +25,6 @@ using namespace onert::backend::train; using onert::ir::OperandIndex; using onert::ir::OperationIndex; -// TODO: Add test testcase for {Bump, FirstFit, WIC}Planner namespace { @@ -178,7 +177,7 @@ TEST(FirstFitPlanner, disposable_claim_release_test) }); } -TEST(FirstFitPlanner, disposable_neg_release_non_existing_index) +TEST(FirstFitPlanner, neg_disposable_release_non_existing_index) { PlannerVerifier<DisposableTensorIndex> p; @@ -203,7 +202,7 @@ TEST(FirstFitPlanner, disposable_neg_release_non_existing_index) }); } -TEST(FirstFitPlanner, disposable_neg_release_twice) +TEST(FirstFitPlanner, neg_disposable_release_twice) { PlannerVerifier<DisposableTensorIndex> p; @@ -276,4 +275,189 @@ TEST(WICPlanner, disposable_claim_release_test) }); } -// Add Testcase for LayerScopeTensorIndex, using PlannerVerifier +TEST(BumpPlanner, layerscope_claim_test) +{ + PlannerVerifier<LayerScopeTensorIndex> p; + + ASSERT_NO_FATAL_FAILURE({ + p.claim(0, 0, 10, 0); + p.claim(1, 0, 20, 10); + p.claim(2, 2, 30, 30); + p.release(0, 0); + p.capacity(60); + }); +} + +TEST(FirstFitPlanner, layerscope_claim_release_test) +{ + PlannerVerifier<LayerScopeTensorIndex> p; + + ASSERT_NO_FATAL_FAILURE({ + // 0 CLAIM - 10 + p.claim(0, 0, 10, 0); + + // 1 CLAIM - 20 + p.claim(1, 0, 20, 10); + + // 2 CLAIM - 30 + p.claim(2, 2, 30, 30); + + // 0 RELEASE - 10 + p.release(0, 0); + + // 3 CLAIM - 20 + p.claim(3, 1, 20, 60); + + // 4 CLAIM - 5 + p.claim(4, 1, 5, 0); + + // 5 CLAIM - 10 + p.claim(5, 1, 10, 80); + + // 6 CLAIM - 5 + p.claim(6, 1, 5, 5); + + // 2 RELEASE - 30 + 
p.release(2, 2); + + // 7 CLAIM - 35 + p.claim(7, 1, 35, 90); + + // 8 CLAIM - 10 + p.claim(8, 1, 10, 30); + + // 4 RELEASE - 5 + p.release(4, 1); + + // 9 CLAIM - 10 + p.claim(9, 0, 10, 40); + + // 10 CLAIM - 10 + p.claim(10, 0, 10, 50); + + // 6 RELEASE + p.release(6, 1); + + // 1 RELEASE + p.release(1, 0); + + // 8 RELEASE + p.release(8, 1); + + // 9 RELEASE + p.release(9, 0); + + // 10 RELEASE + p.release(10, 0); + + // 3 RELEASE + p.release(3, 1); + + // 5 RELEASE + p.release(5, 1); + + // 7 RELEASE + p.release(7, 1); + + // CAPACITY - 125 + p.capacity(125); + }); +} + +TEST(FirstFitPlanner, neg_layerscope_release_non_existing_index) +{ + PlannerVerifier<LayerScopeTensorIndex> p; + + auto on_only_debug_mode = [&p]() { + EXPECT_DEATH({ p.release(0, 1); }, + "Cannot release for given index. It has been not claimed or released already."); + return true; + }; + + ASSERT_NO_FATAL_FAILURE({ + // 0 CLAIM - 10 + p.claim(0, 0, 10, 0); + + // 1 CLAIM - 20 + p.claim(1, 0, 20, 10); + + // 2 CLAIM - 30 + p.claim(2, 2, 30, 30); + + // RELEASE non-existing index + assert(on_only_debug_mode()); + }); +} + +TEST(FirstFitPlanner, neg_layerscope_release_twice) +{ + PlannerVerifier<LayerScopeTensorIndex> p; + + auto on_only_debug_mode = [&p]() { + EXPECT_EXIT({ p.release(0, 0); }, ::testing::KilledBySignal(SIGABRT), + "Cannot release for given index. It has been not claimed or released already."); + return true; + }; + + ASSERT_NO_FATAL_FAILURE({ + // 0 CLAIM - 10 + p.claim(0, 0, 10, 0); + + // 1 CLAIM - 20 + p.claim(1, 0, 20, 10); + + // 2 CLAIM - 30 + p.claim(2, 2, 30, 30); + + // 0 RELEASE - 10 + p.release(0, 0); + + // 0 RELEASE again + assert(on_only_debug_mode()); + }); +} + +TEST(WICPlanner, layerscope_claim_release_test) +{ + PlannerVerifier<LayerScopeTensorIndex> p; + + ASSERT_NO_FATAL_FAILURE({ + p.claim(0, 0, 20); + p.claim(1, 0, 5); + p.release(0, 0); + p.claim(2, 2, 10); + p.release(1, 0); + p.claim(3, 1, 10); + p.release(2, 2); + p.claim(4, 1, 10); + p.release(3, 1); + p.claim(5, 1, 20); + p.release(4, 1); + p.claim(6, 1, 20); + p.release(5, 1); + + // VERIFY 0 - 0 + p.verify(0, 0, 20, 0); + + // VERIFY 1 - 20 + p.verify(1, 0, 5, 20); + + // VERIFY 2 - 0 + p.verify(2, 2, 10, 0); + + // VERIFY 3 - 10 + p.verify(3, 1, 10, 10); + + // VERIFY 4 - 20 + p.verify(4, 1, 10, 20); + + // VERIFY 5 - 0 + p.verify(5, 1, 20, 0); + + // VERIFY 6 - 20 + p.verify(6, 1, 20, 20); + + // CAPACITY - 40 + p.capacity(40); + }); +} diff --git a/runtime/onert/backend/train/TensorManager.h b/runtime/onert/backend/train/TensorManager.h index 6e0910e182d..c9553c3913e 100644 --- a/runtime/onert/backend/train/TensorManager.h +++ b/runtime/onert/backend/train/TensorManager.h @@ -61,6 +61,7 @@ class TensorManager void releaseGradientPlan(const ir::OperandIndex &ind); void claimDisposableBackPropPlan(const DisposableTensorIndex &ind); void releaseDisposableBackPropPlan(const DisposableTensorIndex &ind); + // TODO Add member functions related to LayerScopeMemoryManager private: std::unique_ptr _nonconst_mgr; @@ -68,6 +69,8 @@ class TensorManager std::unique_ptr _back_prop_mgr; std::unique_ptr _gradient_mgr; std::unique_ptr<DisposableMemoryManager> _disposable_back_prop_mgr; + // TODO: enable _layer_scope_mgr + // std::unique_ptr<LayerScopeMemoryManager> _layer_scope_mgr; const std::shared_ptr _tensors; }; diff --git a/runtime/onert/backend/train/ops/LossLayer.cc b/runtime/onert/backend/train/ops/LossLayer.cc index 6f5f8705bba..e5a026ba863 100644 --- a/runtime/onert/backend/train/ops/LossLayer.cc +++ b/runtime/onert/backend/train/ops/LossLayer.cc @@ -26,7 +26,8 @@ namespace ops { LossLayer::LossLayer() - : _y_pred(nullptr), _y_true(nullptr), 
_output(nullptr), _back_prop_y_pred(nullptr) + : _y_pred(nullptr), _y_true(nullptr), _output(nullptr), _back_prop_y_pred(nullptr), + _reduction_type(ir::train::LossReductionType::Undefined) { // DO NOTHING } diff --git a/runtime/onert/backend/trix/Convert.cc b/runtime/onert/backend/trix/Convert.cc index fe003e7ead5..684dc80dd53 100644 --- a/runtime/onert/backend/trix/Convert.cc +++ b/runtime/onert/backend/trix/Convert.cc @@ -23,19 +23,6 @@ namespace backend namespace trix { -data_layout convertDataLayout(const ir::Layout layout) -{ - switch (layout) - { - case ir::Layout::NCHW: - return DATA_LAYOUT_NCHW; - case ir::Layout::NHWC: - return DATA_LAYOUT_NHWC; - default: - throw std::runtime_error("Unknown Layout"); - } -} - data_type convertDataType(const ir::DataType type) { switch (type) diff --git a/runtime/onert/backend/trix/Convert.h b/runtime/onert/backend/trix/Convert.h index 9359f0a5084..12d7eea1943 100644 --- a/runtime/onert/backend/trix/Convert.h +++ b/runtime/onert/backend/trix/Convert.h @@ -31,14 +31,6 @@ namespace backend namespace trix { -/** - * @brief Convert type of layout from onert type to npu type - * - * @param layout Layout type in onert - * @return data_layout Layout type in npu - */ -data_layout convertDataLayout(const ir::Layout layout); - /** * @brief Convert type of data from onert type to npu type * diff --git a/tests/nnapi/nnapi_gtest.skip.aarch64-android.acl_neon b/tests/nnapi/nnapi_gtest.skip.aarch64-android.acl_neon index 03bdf091679..9e337bc7eba 100644 --- a/tests/nnapi/nnapi_gtest.skip.aarch64-android.acl_neon +++ b/tests/nnapi/nnapi_gtest.skip.aarch64-android.acl_neon @@ -76,6 +76,8 @@ GeneratedTests.fill_ex_dynamic_nnfw GeneratedTests.fully_connected_dynamic_nnfw GeneratedTests.fully_connected_float_2_weights_as_inputs GeneratedTests.fully_connected_hybrid_1_nnfw +GeneratedTests.fully_connected_quant8_large_weights_as_inputs +GeneratedTests.fully_connected_quant8_weights_as_inputs GeneratedTests.fusedbatchnorm_ex_dynamic_nnfw GeneratedTests.fusedbatchnorm_ex_float_fusedbatchnorm_1141 GeneratedTests.gather_dynamic_nnfw diff --git a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon index 03bdf091679..9e337bc7eba 100644 --- a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon +++ b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon @@ -76,6 +76,8 @@ GeneratedTests.fill_ex_dynamic_nnfw GeneratedTests.fully_connected_dynamic_nnfw GeneratedTests.fully_connected_float_2_weights_as_inputs GeneratedTests.fully_connected_hybrid_1_nnfw +GeneratedTests.fully_connected_quant8_large_weights_as_inputs +GeneratedTests.fully_connected_quant8_weights_as_inputs GeneratedTests.fusedbatchnorm_ex_dynamic_nnfw GeneratedTests.fusedbatchnorm_ex_float_fusedbatchnorm_1141 GeneratedTests.gather_dynamic_nnfw diff --git a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon index a3320998ab3..f636f551009 100644 --- a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon +++ b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon @@ -75,6 +75,9 @@ GeneratedTests.fill_ex_4D_float GeneratedTests.fill_ex_dynamic_nnfw GeneratedTests.fully_connected_dynamic_nnfw GeneratedTests.fully_connected_float_2_weights_as_inputs +GeneratedTests.fully_connected_hybrid_1_nnfw +GeneratedTests.fully_connected_quant8_large_weights_as_inputs +GeneratedTests.fully_connected_quant8_weights_as_inputs GeneratedTests.fusedbatchnorm_ex_dynamic_nnfw GeneratedTests.fusedbatchnorm_ex_float_fusedbatchnorm_1141 
GeneratedTests.gather_dynamic_nnfw diff --git a/tests/nnfw_api/CMakeLists.txt b/tests/nnfw_api/CMakeLists.txt index 887ee9b1b74..1214290576b 100644 --- a/tests/nnfw_api/CMakeLists.txt +++ b/tests/nnfw_api/CMakeLists.txt @@ -37,6 +37,7 @@ target_link_libraries(${RUNTIME_NNFW_API_TEST} nnfw-dev jsoncpp) target_link_libraries(${RUNTIME_NNFW_API_TEST} gtest gmock) target_link_libraries(${RUNTIME_NNFW_API_TEST} ${LIB_PTHREAD} dl) target_link_libraries(${RUNTIME_NNFW_API_TEST} circle_schema) +target_link_libraries(${RUNTIME_NNFW_API_TEST} ggml) install(TARGETS ${RUNTIME_NNFW_API_TEST} DESTINATION unittest) diff --git a/tests/nnfw_api/lib/common.cc b/tests/nnfw_api/lib/common.cc index 3c3bc68d093..2ccf712837e 100644 --- a/tests/nnfw_api/lib/common.cc +++ b/tests/nnfw_api/lib/common.cc @@ -17,6 +17,8 @@ #include "common.h" +#include <ggml.h> + bool tensorInfoEqual(const nnfw_tensorinfo &info1, const nnfw_tensorinfo &info2) { if (info1.dtype != info2.dtype) @@ -38,3 +40,23 @@ uint64_t tensorInfoNumElements(const nnfw_tensorinfo &ti) } return n; } + +std::vector<uint8_t> quantData(const std::vector<float> &buf_val, const circle::TensorType type) +{ + switch (type) + { + case circle::TensorType::TensorType_GGML_Q4_0: + { + size_t num_elems = buf_val.size(); + const size_t block_size = ggml_blck_size(GGML_TYPE_Q4_0); + const int64_t num_block = num_elems / block_size; + const size_t block_struct_size = ggml_type_size(GGML_TYPE_Q4_0); + + auto buf = std::vector<uint8_t>(num_block * block_struct_size); + ggml_quantize_chunk(GGML_TYPE_Q4_0, buf_val.data(), buf.data(), 0, 1, num_elems, nullptr); + return buf; + } + default: + throw std::runtime_error("Unsupported tensor type"); + } +} diff --git a/tests/nnfw_api/lib/common.h b/tests/nnfw_api/lib/common.h index aec49792c56..5d72ea8cabe 100644 --- a/tests/nnfw_api/lib/common.h +++ b/tests/nnfw_api/lib/common.h @@ -19,8 +19,10 @@ #include #include +#include bool tensorInfoEqual(const nnfw_tensorinfo &info1, const nnfw_tensorinfo &info2); uint64_t tensorInfoNumElements(const nnfw_tensorinfo &info); +std::vector<uint8_t> quantData(const std::vector<float> &buf_val, const circle::TensorType type); #endif // __NNFW_API_TEST_COMMON_H__ diff --git a/tests/nnfw_api/src/GenModelTests/one_op_tests/Add.test.cc b/tests/nnfw_api/src/GenModelTests/one_op_tests/Add.test.cc index 9fc0e86b6b5..3365c071906 100644 --- a/tests/nnfw_api/src/GenModelTests/one_op_tests/Add.test.cc +++ b/tests/nnfw_api/src/GenModelTests/one_op_tests/Add.test.cc @@ -293,7 +293,7 @@ TEST_F(GenModelTest, neg_OneOp_Add_VarToVarInt16) cgen.setInputsAndOutputs({lhs, rhs}, {out}); _context = std::make_unique<GenModelTestContext>(cgen.finish()); - // _context->addTestCase(uniformTCD<int16_t>({{1, 3, 2, 4}, {5, -4, -7, 4}}, {{0, -32, -46, 2}})); + _context->addTestCase(uniformTCD<int16_t>({{1, 3, 2, 4}, {5, -4, -7, 4}}, {{0, -32, -46, 2}})); _context->setBackends({"acl_cl", "acl_neon", "cpu"}); _context->expectFailCompile();
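Editor's note on the quantData helper added in tests/nnfw_api/lib/common.cc: it packs a float buffer into GGML Q4_0 blocks so tests can feed pre-quantized weights. Below is a self-contained sketch of the same call sequence, assuming only the ggml API already used above (ggml_blck_size, ggml_type_size, ggml_quantize_chunk); the sample values are made up.

#include <cstdint>
#include <vector>
#include <ggml.h>

int main()
{
  // Made-up data: one Q4_0 block holds ggml_blck_size(GGML_TYPE_Q4_0) = 32 floats.
  std::vector<float> values(32, 0.5f);
  const size_t block_size = ggml_blck_size(GGML_TYPE_Q4_0);
  const size_t block_bytes = ggml_type_size(GGML_TYPE_Q4_0); // packed bytes per block
  std::vector<uint8_t> packed(values.size() / block_size * block_bytes);
  // Quantize a single row, exactly the path quantData takes for TensorType_GGML_Q4_0.
  ggml_quantize_chunk(GGML_TYPE_Q4_0, values.data(), packed.data(), /*start=*/0,
                      /*nrows=*/1, /*n_per_row=*/values.size(), /*imatrix=*/nullptr);
  return packed.empty() ? 1 : 0;
}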