diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 4358af5707b077..e75cc2a1867e55 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -687,7 +687,7 @@ jobs: Overall_Status: name: ci/gha_overall_status needs: [Smart_CI, Build, Debian_Packages, Samples, Conformance, ONNX_Runtime, CXX_Unit_Tests, Python_Unit_Tests, TensorFlow_Layer_Tests, - CPU_Functional_Tests, TensorFlow_Models_Tests_Precommit, PyTorch_Models_Tests, NVIDIA_Plugin, Openvino_tokenizers, iGPU] + CPU_Functional_Tests, TensorFlow_Models_Tests_Precommit, PyTorch_Models_Tests, NVIDIA_Plugin, Openvino_tokenizers] if: ${{ always() }} runs-on: ubuntu-latest steps: diff --git a/src/common/transformations/src/transformations/common_optimizations/reshape_sequence_fusion.cpp b/src/common/transformations/src/transformations/common_optimizations/reshape_sequence_fusion.cpp index de616d8996f720..e51715a34a9cdf 100644 --- a/src/common/transformations/src/transformations/common_optimizations/reshape_sequence_fusion.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/reshape_sequence_fusion.cpp @@ -20,7 +20,7 @@ bool has_valid_pattern(const ov::Output& node_out) { const auto const_node = std::dynamic_pointer_cast(node_out.get_node_shared_ptr()); if (!const_node) { // Lower bound of the value - auto lb = ov::evaluate_lower_bound(node_out); + auto lb = ov::util::evaluate_lower_bound(node_out); if (!lb) return false; const auto lb_const_node = @@ -36,7 +36,7 @@ bool has_valid_pattern(const ov::Output& node_out) { return true; // Upper bound of the value - auto ub = ov::evaluate_upper_bound(node_out); + auto ub = ov::util::evaluate_upper_bound(node_out); if (!ub) return false; diff --git a/src/common/transformations/src/transformations/common_optimizations/simplify_shape_of_sub_graph.cpp b/src/common/transformations/src/transformations/common_optimizations/simplify_shape_of_sub_graph.cpp index 9089474dd42bd3..d4bb02227c56ac 100644 --- a/src/common/transformations/src/transformations/common_optimizations/simplify_shape_of_sub_graph.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/simplify_shape_of_sub_graph.cpp @@ -164,7 +164,7 @@ pass::AbsSinking::AbsSinking() { graph_got_changed = true; } for (const auto& abs : abs_ops) { - auto bounds = ov::evaluate_both_bounds(abs->input_value(0)); + auto bounds = ov::util::evaluate_both_bounds(abs->input_value(0)); if (ov::util::reduce_and(ov::util::greater_equal(bounds.first, 0))) { replace_output_update_name(abs->output(0), abs->input_value(0)); graph_got_changed = true; diff --git a/src/common/transformations/src/transformations/symbolic_transformations/symbol_optimization.cpp b/src/common/transformations/src/transformations/symbolic_transformations/symbol_optimization.cpp index 469150c02a857c..27790904f4360b 100644 --- a/src/common/transformations/src/transformations/symbolic_transformations/symbol_optimization.cpp +++ b/src/common/transformations/src/transformations/symbolic_transformations/symbol_optimization.cpp @@ -215,7 +215,7 @@ void optimize_value_usage(ov::Output& output, STS_map& symbol_shape_so get_alternative_source_from_value_or_shape_source(symbol_shape_source, symbol, output, symbol_value_source); if (alternative_source.get_node_shared_ptr() != nullptr) { - evaluate_both_bounds(alternative_source); + ov::util::evaluate_both_bounds(alternative_source); output.replace(alternative_source); } else { // in case we can not optimize it -- it is symbol which appeared just now on the value path 
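The hunks above (and the call-site updates that follow) relocate the bound-evaluation helpers evaluate_lower_bound, evaluate_upper_bound and evaluate_both_bounds from the ov namespace into ov::util. A minimal caller sketch against the relocated declarations; the helper name bounds_are_known is illustrative only and not part of the patch:

    #include "openvino/core/bound_evaluation_util.hpp"

    // Returns true when both the lower and the upper bound of `out` could be deduced.
    static bool bounds_are_known(const ov::Output<ov::Node>& out) {
        const auto bounds = ov::util::evaluate_both_bounds(out);  // now lives in ov::util
        // Empty tensors mean bound propagation failed for this output.
        return static_cast<bool>(bounds.first) && static_cast<bool>(bounds.second);
    }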
diff --git a/src/core/dev_api/openvino/core/bound_evaluation_util.hpp b/src/core/dev_api/openvino/core/bound_evaluation_util.hpp index 86c8e9fa81e694..936a2fe891e182 100644 --- a/src/core/dev_api/openvino/core/bound_evaluation_util.hpp +++ b/src/core/dev_api/openvino/core/bound_evaluation_util.hpp @@ -14,6 +14,8 @@ namespace ov { /// \return True if bounds can be propagated for output and order vector has valid data, otherwise false. OPENVINO_API bool could_propagate(const Output& output, std::vector& order); +namespace util { + /// \brief Evaluates lower value estimation of the output tensor. Traverses graph up to deduce /// estimation through it. /// \param Node output pointing to the tensor for estimation. @@ -31,4 +33,5 @@ OPENVINO_API Tensor evaluate_upper_bound(const Output& output); /// \param output Node output pointing to the tensor for estimation. /// \return pair with Tensors for lower and upper value estimation. OPENVINO_API std::pair evaluate_both_bounds(const Output& output); +} // namespace util } // namespace ov diff --git a/src/core/shape_inference/include/utils.hpp b/src/core/shape_inference/include/utils.hpp index 11702c8f4e1b42..9298498d00bdc5 100644 --- a/src/core/shape_inference/include/utils.hpp +++ b/src/core/shape_inference/include/utils.hpp @@ -366,7 +366,7 @@ ov::optional get_input_bounds(const ov::Node* op, size_t port, const IT out->reserve(lowers.size()); std::transform(lowers.cbegin(), lowers.cend(), lowers.cbegin(), std::back_inserter(*out), make_bound(et)); } else if (port < op->get_input_size()) { - auto bounds = ov::evaluate_both_bounds(op->get_input_source_output(port)); + auto bounds = ov::util::evaluate_both_bounds(op->get_input_source_output(port)); if (bounds.first && bounds.second) { const auto& et = bounds.first.get_element_type(); diff --git a/src/core/src/bound_evaluate.cpp b/src/core/src/bound_evaluate.cpp index d86b01c81165cb..55c31538e91b86 100644 --- a/src/core/src/bound_evaluate.cpp +++ b/src/core/src/bound_evaluate.cpp @@ -299,15 +299,15 @@ bool ov::could_propagate(const Output& output, std::vector& result) return status; } -ov::Tensor ov::evaluate_lower_bound(const Output& output) { +ov::Tensor ov::util::evaluate_lower_bound(const Output& output) { return evaluate_bound(output, false); } -ov::Tensor ov::evaluate_upper_bound(const Output& output) { +ov::Tensor ov::util::evaluate_upper_bound(const Output& output) { return evaluate_bound(output, true); } -std::pair ov::evaluate_both_bounds(const Output& output) { +std::pair ov::util::evaluate_both_bounds(const Output& output) { const auto& output_tensor = output.get_tensor(); if (output_tensor.get_lower_value() && output_tensor.get_upper_value()) return {output_tensor.get_lower_value(), output_tensor.get_upper_value()}; @@ -381,10 +381,10 @@ bool ov::interval_bound_evaluator(const Node* node, OPENVINO_ASSERT(node->get_input_size() == 2); const auto num_of_outputs = node->get_output_size(); - auto low_0 = ov::evaluate_lower_bound(node->get_input_source_output(0)); - auto low_1 = ov::evaluate_lower_bound(node->get_input_source_output(1)); - auto up_0 = ov::evaluate_upper_bound(node->get_input_source_output(0)); - auto up_1 = ov::evaluate_upper_bound(node->get_input_source_output(1)); + auto low_0 = ov::util::evaluate_lower_bound(node->get_input_source_output(0)); + auto low_1 = ov::util::evaluate_lower_bound(node->get_input_source_output(1)); + auto up_0 = ov::util::evaluate_upper_bound(node->get_input_source_output(0)); + auto up_1 = 
ov::util::evaluate_upper_bound(node->get_input_source_output(1)); if (!low_0 || !low_1 || !up_0 || !up_1) return false; @@ -534,7 +534,7 @@ bool ov::has_and_set_equal_bounds(const Output& source) { if (op::util::is_constant(source.get_node_shared_ptr())) return true; - auto bounds = ov::evaluate_both_bounds(source); + auto bounds = ov::util::evaluate_both_bounds(source); return are_same_tensor(bounds.first, bounds.second); } diff --git a/src/core/src/op/divide.cpp b/src/core/src/op/divide.cpp index b00b731b296351..dfbc25c0bd0553 100644 --- a/src/core/src/op/divide.cpp +++ b/src/core/src/op/divide.cpp @@ -80,16 +80,16 @@ bool evaluate_bound(const Node* node, TensorVector& output_values, bool is_upper OPENVINO_ASSERT(PartialShape::broadcast_merge_into(input_shape, input2.get_partial_shape(), node->get_autob()), "Argument shapes in divide operation are inconsistent."); - const auto input1_low = evaluate_lower_bound(input1); + const auto input1_low = ov::util::evaluate_lower_bound(input1); if (!input1_low) return false; - const auto input1_up = evaluate_upper_bound(input1); + const auto input1_up = ov::util::evaluate_upper_bound(input1); if (!input1_up) return false; - const auto input2_low = evaluate_lower_bound(input2); + const auto input2_low = ov::util::evaluate_lower_bound(input2); if (!input2_low) return false; - const auto input2_up = evaluate_upper_bound(input2); + const auto input2_up = ov::util::evaluate_upper_bound(input2); if (!input2_up) return false; diff --git a/src/core/src/op/mod.cpp b/src/core/src/op/mod.cpp index 1d15ffb60d4971..57306aa0373863 100644 --- a/src/core/src/op/mod.cpp +++ b/src/core/src/op/mod.cpp @@ -78,8 +78,8 @@ namespace { * @return Vector with inputs bounds tensors. */ TensorVector get_bounds(const Node* const op) { - auto&& v_bounds = ov::evaluate_both_bounds(op->input_value(0)); - auto&& m_bounds = ov::evaluate_both_bounds(op->input_value(1)); + auto&& v_bounds = ov::util::evaluate_both_bounds(op->input_value(0)); + auto&& m_bounds = ov::util::evaluate_both_bounds(op->input_value(1)); return {std::move(v_bounds.first), std::move(v_bounds.second), std::move(m_bounds.first), diff --git a/src/core/tests/bound_evaluate.cpp b/src/core/tests/bound_evaluate.cpp index 038c1bae444e9d..4c5c7681a45a0d 100644 --- a/src/core/tests/bound_evaluate.cpp +++ b/src/core/tests/bound_evaluate.cpp @@ -42,7 +42,7 @@ TEST_F(EvaluateBoundTest, no_exception_when_node_has_output_with_dynamic_rank) { fn_op->set_output_type(1, element::i32, PartialShape{{1, 4}}); fn_op->validate_and_infer_types(); - EXPECT_NO_THROW(evaluate_both_bounds(fn_op)); + EXPECT_NO_THROW(ov::util::evaluate_both_bounds(fn_op)); } TEST_F(EvaluateBoundTest, no_exception_when_node_has_output_with_dynamic_element_type) { @@ -50,7 +50,7 @@ TEST_F(EvaluateBoundTest, no_exception_when_node_has_output_with_dynamic_element fn_op->set_output_type(1, element::dynamic, PartialShape{4}); fn_op->validate_and_infer_types(); - EXPECT_NO_THROW(evaluate_both_bounds(fn_op)); + EXPECT_NO_THROW(ov::util::evaluate_both_bounds(fn_op)); } using BoundEvaluatorTest = ::testing::Test; diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl index f22c1ee136c004..3dae1669ccc23f 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl @@ -116,25 +116,13 @@ KERNEL(quantize_input)( #if 
!REALIGN_FP16_OFFSET -# if OUTPUT_3D -# define MAIN_LOOP_ELEMENTS_COUNT INPUT0_SIZE_Y -# else -# define MAIN_LOOP_ELEMENTS_COUNT INPUT0_ELEMENTS_COUNT -# endif + #define MAIN_LOOP_ELEMENTS_COUNT IFM_SIZE #else -// For REALIGN_FP16_OFFSET one feature is processed separately before entering main loop to correct alignment. -# if OUTPUT_3D -# define MAIN_LOOP_ELEMENTS_COUNT (INPUT0_SIZE_Y - 1) -# else -# define MAIN_LOOP_ELEMENTS_COUNT (INPUT0_ELEMENTS_COUNT - 1) -# endif + // For REALIGN_FP16_OFFSET one feature is processed separately before entering main loop to correct alignment. + #define MAIN_LOOP_ELEMENTS_COUNT (IFM_SIZE - 1) #endif -#if OUTPUT_3D -# define INPUT_ELEMENTS_COUNT INPUT0_SIZE_Y -#else -# define INPUT_ELEMENTS_COUNT INPUT0_ELEMENTS_COUNT -#endif +#define INPUT_ELEMENTS_COUNT IFM_SIZE #if IS_DYNAMIC && COMPRESSED_WEIGHTS_INT4 #pragma disable_includes_optimization @@ -316,9 +304,7 @@ inline void FUNC(fc_bf_tiled_kernel_default)( // NOTE: Manually unrolling multiplication loop leads to lower register pressure and allows for bigger block sizes, // but significantly degrades readability and generality of code. // It doesn't also show noticable performance improvement on tested configurations. - #if DECOMPRESSION_SCALE_POST_OP - ACCUMULATOR_VEC_TYPE acc_tmp[TILE_B] = { }; - #endif + ACCUMULATOR_VEC_TYPE acc_tmp[TILE_B] = { }; #if USE_SLM && COMPRESSED_WEIGHTS_INT4 #if TILE_OFM != 2 @@ -481,9 +467,9 @@ inline void FUNC(fc_bf_tiled_kernel_default)( #endif #else #if TILE_OFM > 1 - ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += in_val * ((ACCUMULATOR_TYPE*)(&wei))[W_IDX]; + ((ACCUMULATOR_TYPE*)(&acc_tmp[bi]))[fi] += in_val * ((ACCUMULATOR_TYPE*)(&wei))[W_IDX]; #else - acc[bi] += in_val * ((ACCUMULATOR_TYPE*)(&wei))[W_IDX]; + acc_tmp[bi] += in_val * ((ACCUMULATOR_TYPE*)(&wei))[W_IDX]; #endif #endif } @@ -539,6 +525,18 @@ inline void FUNC(fc_bf_tiled_kernel_default)( } } #endif + +#if !DECOMPRESSION_SCALE_POST_OP + unroll_for (uint bi = 0; bi < TILE_B; ++bi) { + unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) { + #if TILE_OFM > 1 + ((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += ((ACCUMULATOR_TYPE*)(&acc_tmp[bi]))[fi]; + #else + acc[bi] += acc_tmp[bi]; + #endif + } + } +#endif } // ===================================================================================================================================== // Leftovers diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp index c6b0acda06c56a..07d81dce5e3f23 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp @@ -15,14 +15,20 @@ static constexpr size_t min_slm_size = 256; namespace kernel_selector { static std::pair get_input_bf_size(const fully_connected_params& params) { - size_t input_f = params.inputs[0].Feature().v; - size_t input_batch = params.inputs[0].Batch().v; + auto& input = params.inputs[0]; + size_t input_f = input.Feature().v; + size_t input_batch = input.Batch().v; + // 3D input if (params.outputs[0].GetLayout() == DataLayout::bfyx) { - input_f = params.inputs[0].Y().v; - input_batch = params.inputs[0].Batch().v * params.inputs[0].Feature().v; + input_f = input.Y().v; + input_batch = input.Batch().v * input.Feature().v; } + // In Some model, input_f could be dynamic in input0. 
In that case, the IFM value of the weights is used instead. + if (input.is_dynamic() && input_f == 0 && params.weights.IFM().v != 0) + input_f = params.weights.IFM().v; + return {input_batch, input_f}; } @@ -153,8 +159,7 @@ bool FullyConnected_bf_tiled::Validate(const Params& params) const { // Dynamic kernel doesn't support dynamic weights yet if (fc_params.is_shape_agnostic && input.is_dynamic()) { - if ((output.GetLayout() == DataLayout::bfyx && input.Y().v == 0) || - (output.GetLayout() == DataLayout::bf && input.Feature().v == 0)) + if (get_input_bf_size(fc_params).second == 0) return false; } @@ -509,6 +514,7 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para jit.AddConstant(MakeJitConstant("DYNAMIC_QUANTIZE", 0)); } + jit.AddConstant(MakeJitConstant("IFM_SIZE", get_input_bf_size(params).second)); jit.AddConstant(MakeJitConstant("SIMD", simd)); jit.AddConstant(MakeJitConstant("TILE_B", dispatchData.tile_m)); jit.AddConstant(MakeJitConstant("HALF_TILE_B", dispatchData.tile_m/2)); @@ -539,16 +545,18 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para // for 3d output we are treating spatial as features if (params.outputs[0].GetLayout() == DataLayout::bfyx) { + auto tile_in_b_pitch = (params.inputs[0].Feature().pitch == 0) ? get_input_bf_size(params).second : params.inputs[0].Feature().pitch; jit.AddConstant(MakeJitConstant("TILE_OUT_F_NUM", params.outputs[0].Y().v)); jit.AddConstant(MakeJitConstant("TILE_OUT_F_PITCH", params.outputs[0].Y().pitch)); - jit.AddConstant(MakeJitConstant("TILE_IN_B_PITCH", params.inputs[0].Feature().pitch)); + jit.AddConstant(MakeJitConstant("TILE_IN_B_PITCH", tile_in_b_pitch)); jit.AddConstant(MakeJitConstant("TILE_OUT_B_PITCH", params.outputs[0].Feature().pitch)); jit.AddConstant(MakeJitConstant("OUTPUT_3D", true)); jit.AddConstant(MakeJitConstant("BATCH_SIZE", "(OUTPUT_BATCH_NUM * OUTPUT_FEATURE_NUM)")); } else { + auto tile_in_b_pitch = (params.inputs[0].Batch().pitch == 0) ? get_input_bf_size(params).second : params.inputs[0].Batch().pitch; jit.AddConstant(MakeJitConstant("TILE_OUT_F_NUM", params.outputs[0].Feature().v)); jit.AddConstant(MakeJitConstant("TILE_OUT_F_PITCH", params.outputs[0].Feature().pitch)); - jit.AddConstant(MakeJitConstant("TILE_IN_B_PITCH", params.inputs[0].Batch().pitch)); + jit.AddConstant(MakeJitConstant("TILE_IN_B_PITCH", tile_in_b_pitch)); jit.AddConstant(MakeJitConstant("TILE_OUT_B_PITCH", params.outputs[0].Batch().pitch)); jit.AddConstant(MakeJitConstant("BATCH_SIZE", "(OUTPUT_BATCH_NUM)")); } @@ -614,6 +622,12 @@ void FullyConnected_bf_tiled::GetUpdateDispatchDataFunc(KernelData& kd) const { kd.kernels[execute_kernel_idx].params.workGroups.local = dispatchData.lws; kd.kernels[execute_kernel_idx].skip_execution = KernelData::SkipKernelExecution(prim_params); + auto& input = prim_params.inputs[0]; + if (prim_params.outputs[0].GetLayout() == DataLayout::bfyx) + OPENVINO_ASSERT(input.X().pad.Total() == 0 && input.Y().pad.Total() == 0, "[GPU] Invalid padding in spatial axes observed in FC bf tiled."); + else + OPENVINO_ASSERT(input.Feature().pad.Total() == 0, "[GPU] Invalid padding in f axis observed in FC bf tiled."); + if (!kd.internalBufferSizes.empty()) { // Pre-quantizing kernel was generated. Update the kernel and intermediate buffers or disable it.
if (execute_type == KernelType::DEFAULT) { @@ -784,7 +798,8 @@ KernelsData FullyConnected_bf_tiled::GetMultiKernelsData(const Params &params, { auto& quan_kernel = kd.kernels[0]; DispatchData dyn_quan_dispatch = dispatchData; - dyn_quan_dispatch.gws = {std::max((fc_params.inputs[0].PhysicalSize() / quantize_grp_size), (size_t)1), 1, 1}; + auto input_size = std::max(fc_params.inputs[0].PhysicalSize(), get_input_bf_size(fc_params).second); + dyn_quan_dispatch.gws = {input_size / quantize_grp_size, 1, 1}; dyn_quan_dispatch.lws = {16, 1, 1}; quan_kernel.params.workGroups.global = dyn_quan_dispatch.gws; quan_kernel.params.workGroups.local = dyn_quan_dispatch.lws; @@ -814,8 +829,8 @@ KernelsData FullyConnected_bf_tiled::GetMultiKernelsData(const Params &params, quan_kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 0}); quan_kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0}); quan_kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1}); - kd.internalBufferSizes.push_back(fc_params.inputs[0].PhysicalSize()); - kd.internalBufferSizes.push_back(fc_params.inputs[0].PhysicalSize() / quantize_grp_size * 2); + kd.internalBufferSizes.push_back(input_size); + kd.internalBufferSizes.push_back(input_size / quantize_grp_size * 2); kernel_number++; } kd.internalBufferDataType = Datatype::F16; diff --git a/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/matmul.cpp b/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/matmul.cpp index c88a3826fe0f8f..36b6370a85c2f4 100644 --- a/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/matmul.cpp +++ b/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/matmul.cpp @@ -266,9 +266,10 @@ const std::vector IS3D_smoke = { }, {ov::test::static_shapes_to_test_representation({{1, 429}, {1, 429, 1}}), {true, true}}, + { { - {{-1, -1}, {{1, 129}, {2, 129}, {1, 129}, {2, 129}}}, + {{-1, -1, -1}, {{1, 1, 129}, {1, 2, 129}, {1, 1, 129}, {1, 2, 129}}}, {{1, 129, 1}, {{1, 129, 1}, {1, 129, 1}, {1, 129, 1}, {1, 129, 1}}} }, {true, true} diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp index 13607316545d78..3e4c1635229c86 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp @@ -1255,7 +1255,7 @@ class fully_connected_gpu_tests: public ::testing::Test { } } - void test_compressed_int4_scale_dyn_quan(bool is_caching_test, bool is_dynamic, int batch = 1) { + void test_compressed_int4_scale_dyn_quan(bool is_caching_test, bool is_dynamic, int batch = 1, bool is_wei_dyn = false) { tests::random_generator rg(GET_SUITE_NAME); auto& engine = get_test_engine(); @@ -1285,6 +1285,11 @@ class fully_connected_gpu_tests: public ::testing::Test { auto scale_data = rg.generate_random_1d(ofm_num * ifm_num / scales_group_size, -4.0f, 4.0f); set_values(scale_mem, scale_data); + if (is_wei_dyn) { + // ifm_num is dynamic + dyn_input_ps = is_3d ? ov::PartialShape{ -1, -1, -1 } : ov::PartialShape{ -1, -1}; + } + auto in_layout = is_dynamic ?
layout{ dyn_input_ps, data_types::f16, format::bfyx } : layout{ input_ps, data_types::f16, format::bfyx }; @@ -1302,7 +1307,8 @@ class fully_connected_gpu_tests: public ::testing::Test { auto config = get_test_default_config(engine); config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); - config.set_property(ov::intel_gpu::optimize_data(true)); + ov::intel_gpu::ImplementationDesc fc_impl_desc = { format::bfyx, "fully_connected_gpu_bfyx_ref", impl_types::ocl }; + config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"fc_prim", fc_impl_desc} })); network network(engine, topology, config); network.set_input_data("input", input_mem); @@ -1365,13 +1371,13 @@ class fully_connected_gpu_tests: public ::testing::Test { } - void test_compressed_int4_scale(bool is_caching_test, bool is_dynamic, long int batch_num, long int scales_group_size = 128) { + void test_compressed_int4_scale(bool is_caching_test, bool is_dynamic, long int batch_num, long int scales_group_size = 128, bool is_wei_dyn = false) { tests::random_generator rg(GET_SUITE_NAME); auto& engine = get_test_engine(); auto supports_immad = engine.get_device_info().supports_immad; long int ifm_num = 256; - long int ofm_num = 256; + long int ofm_num = 512; auto input_mem = engine.allocate_memory({ { batch_num, ifm_num}, data_types::f16, format::bfyx }); auto weights_mem = engine.allocate_memory({ {ofm_num, ifm_num}, data_types::u4, format::bfyx }); @@ -1392,6 +1398,11 @@ class fully_connected_gpu_tests: public ::testing::Test { auto in_layout = is_dynamic ? layout{ {-1, ifm_num}, data_types::f16, format::bfyx } : layout{ {batch_num, ifm_num}, data_types::f16, format::bfyx }; + if (is_dynamic && is_wei_dyn) { + // ifm_num is dynamic + in_layout = layout{ {-1, -1}, data_types::f16, format::bfyx }; + } + auto dcomp_zp_name = supports_immad ? 
"dcomp_zp" : ""; auto fc_prim = fully_connected("fc_prim", input_info("input"), "weights", "", "scale", dcomp_zp_name, data_types::f16, 2, 2); @@ -1409,6 +1420,8 @@ class fully_connected_gpu_tests: public ::testing::Test { auto config = get_test_default_config(engine); config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + ov::intel_gpu::ImplementationDesc fc_impl_desc = { format::bfyx, "fully_connected_gpu_bfyx_ref", impl_types::ocl }; + config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"fc_prim", fc_impl_desc} })); network network(engine, topology, config); network.set_input_data("input", input_mem); @@ -1461,6 +1474,66 @@ class fully_connected_gpu_tests: public ::testing::Test { ASSERT_NEAR(output_ptr_ref[i], output_ptr[i], 9.0) << "i = " << i; } + void test_compressed_int4_accumulation(bool is_caching_test, bool is_dynamic, long int batch_num) { + auto& engine = get_test_engine(); + + long int ifm_num = 4096; + long int ofm_num = 4; + + auto input_mem = engine.allocate_memory({ { batch_num, ifm_num}, data_types::f16, format::bfyx }); + auto weights_mem = engine.allocate_memory({ {ofm_num, ifm_num}, data_types::u4, format::bfyx }); + auto scale_mem = engine.allocate_memory({ {ofm_num, ifm_num}, data_types::f16, format::bfyx }); + + auto input_data = std::vector(input_mem->count()); + std::fill(input_data.begin(), input_data.end(), 1); + set_values(input_mem, input_data); + + auto weigths_data = std::vector(weights_mem->count() / 2); + std::fill(weigths_data.begin(), weigths_data.end(), 0x11); + set_values(weights_mem, weigths_data); + + auto scale_data = std::vector(scale_mem->count()); + std::fill(scale_data.begin(), scale_data.end(), 1); + set_values(scale_mem, scale_data); + + auto in_layout = is_dynamic ? 
layout{ {-1, ifm_num}, data_types::f16, format::bfyx } + : layout{ {batch_num, ifm_num}, data_types::f16, format::bfyx }; + primitive_id empty_id = ""; + + auto fc_prim = fully_connected("fc_prim", input_info("input"), "weights", empty_id, "scale", empty_id, data_types::f16); + + topology topology( + input_layout("input", in_layout), + data("weights", weights_mem), + data("scale", scale_mem), + fc_prim + ); + + auto config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + config.set_property(ov::intel_gpu::optimize_data(true)); + network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test); + + // Impl is selected only when it is running from cldnn + if (is_dynamic && !engine.get_device_info().supports_immad) { + auto inst = network->get_primitive("fc_prim"); + auto impl = inst->get_impl(); + ASSERT_TRUE(impl != NULL); + ASSERT_EQ(impl->get_kernels().size(), 1); + } + + network->set_input_data("input", input_mem); + + auto outputs = network->execute(); + ASSERT_EQ(outputs.size(), size_t(1)); + ASSERT_EQ(outputs.begin()->first, "fc_prim"); + + auto output_mem = outputs.begin()->second.get_memory(); + cldnn::mem_lock output_ptr (output_mem, get_test_stream()); + for (size_t i = 0; i < output_ptr.size(); i++) + ASSERT_NEAR(ov::float16(ifm_num), output_ptr[i], 9.0) << "i = " << i; + } + void test_compressed_int4_scale_reuse(bool is_caching_test, bool is_dynamic, long int batch_num, long int scales_group_size = 128) { tests::random_generator rg(GET_SUITE_NAME); auto& engine = get_test_engine(); @@ -3259,6 +3332,10 @@ TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic) { this->test_compressed_int4_scale(false, true, 260); } +TEST_F(fully_connected_gpu_tests, compressed_int4_dynamic_acc) { + this->test_compressed_int4_accumulation(false, true, 512); +} + TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_cached) { this->test_compressed_int4_scale(true, true, 260); } @@ -3323,6 +3400,32 @@ TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dyn_cache_dynamic) { this->test_compressed_int4_scale_dyn_quan(true, true, 512); } +TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_f_input) { + this->test_compressed_int4_scale(false, true, 256, true); +} + +TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_f_input_cached) { + this->test_compressed_int4_scale(true, true, 260, true); +} +TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_f_input_b1g64) { + this->test_compressed_int4_scale(false, true, 1, 64, true); +} + +TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_f_input_b1g128) { + this->test_compressed_int4_scale(false, true, 1, 128, true); +} + +TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dyn_quan_dynamic_f_input_single_batch) { + this->test_compressed_int4_scale_dyn_quan(false, true, 1, true); +} + +TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dyn_quan_dynamic_f_input) { + this->test_compressed_int4_scale_dyn_quan(false, true, 512, true); +} + +TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dyn_quan_dynamic_f_input_unaligned) { + this->test_compressed_int4_scale_dyn_quan(false, true, 511, true); +} TEST_F(fully_connected_gpu_tests, compressed_scale_bias) { diff --git a/src/plugins/intel_npu/src/al/include/npu.hpp b/src/plugins/intel_npu/src/al/include/npu.hpp index 925b80ca7734fe..5d46ae3ae2a4ac 100644 --- a/src/plugins/intel_npu/src/al/include/npu.hpp +++ 
b/src/plugins/intel_npu/src/al/include/npu.hpp @@ -92,6 +92,11 @@ class IDevice : public std::enable_shared_from_this { ov::intel_npu::MemType mem_type = ov::intel_npu::MemType::L0_INTERNAL_BUF, void* mem = nullptr); + virtual ov::SoPtr createHostTensor(std::shared_ptr context, + const ov::element::Type& element_type, + const ov::Shape& shape, + const Config& config); + protected: virtual ~IDevice() = default; }; diff --git a/src/plugins/intel_npu/src/al/src/npu.cpp b/src/plugins/intel_npu/src/al/src/npu.cpp index 3b8c670ffd3404..8da55475e9b4f7 100644 --- a/src/plugins/intel_npu/src/al/src/npu.cpp +++ b/src/plugins/intel_npu/src/al/src/npu.cpp @@ -81,4 +81,11 @@ ov::SoPtr IDevice::createRemoteTensor(std::shared_ptr IDevice::createHostTensor(std::shared_ptr, + const ov::element::Type&, + const ov::Shape&, + const Config&) { + OPENVINO_THROW("Create Host Tensor is not supported"); +} + } // namespace intel_npu diff --git a/src/plugins/intel_npu/src/backend/include/zero_device.hpp b/src/plugins/intel_npu/src/backend/include/zero_device.hpp index f198453b932d83..fc4ac58f7643c5 100644 --- a/src/plugins/intel_npu/src/backend/include/zero_device.hpp +++ b/src/plugins/intel_npu/src/backend/include/zero_device.hpp @@ -47,6 +47,11 @@ class ZeroDevice : public IDevice { ov::intel_npu::MemType mem_type = ov::intel_npu::MemType::L0_INTERNAL_BUF, void* mem = nullptr) override; + ov::SoPtr createHostTensor(std::shared_ptr context, + const ov::element::Type& element_type, + const ov::Shape& shape, + const Config& config) override; + ZeroDevice& operator=(const ZeroDevice&) = delete; ZeroDevice(const ZeroDevice&) = delete; diff --git a/src/plugins/intel_npu/src/backend/include/zero_host_tensor.hpp b/src/plugins/intel_npu/src/backend/include/zero_host_tensor.hpp new file mode 100644 index 00000000000000..ce28bf572541bc --- /dev/null +++ b/src/plugins/intel_npu/src/backend/include/zero_host_tensor.hpp @@ -0,0 +1,39 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "intel_npu/al/config/config.hpp" +#include "openvino/runtime/itensor.hpp" +#include "zero_init.hpp" +#include "zero_remote_tensor.hpp" + +namespace intel_npu { + +class ZeroHostTensor : public ov::ITensor { +public: + ZeroHostTensor(std::shared_ptr context, + std::shared_ptr init_structs, + const ov::element::Type element_type, + const ov::Shape& shape, + const Config& config); + + ~ZeroHostTensor() override = default; + + void* data(const ov::element::Type& element_type) const override; + const ov::element::Type& get_element_type() const override; + + const ov::Shape& get_shape() const override; + + const ov::Strides& get_strides() const override; + + void set_shape(ov::Shape new_shape) override; + + std::shared_ptr get_impl() const; + +private: + std::shared_ptr m_impl; +}; + +} // namespace intel_npu diff --git a/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp b/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp index 725a0e96c76f6e..cbf3a9466364be 100644 --- a/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp +++ b/src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp @@ -53,8 +53,9 @@ class ZeroInferRequest final : public SyncInferRequest { * @brief Check the received remote tensor and copy it to the Level Zero tensor * @param tensor Reference to a tensor. * @param name Friendly name of the tensor. + * @param isParameter True if tensor is a parameter. 
*/ - void set_remote_tensor_data(std::shared_ptr tensor, const std::string& name); + void set_remote_tensor_data(std::shared_ptr tensor, const std::string& name, bool isParameter); void check_network_precision(const ov::element::Type_t precision) const override; void create_pipeline(); @@ -77,8 +78,7 @@ class ZeroInferRequest final : public SyncInferRequest { // specific operations on the plugin in this case. size_t _batchSize = DEFAULT_BATCH_SIZE; - bool _createPipeline = true; - bool _updateCommandList = false; + bool _pipelineIsCreated = false; }; } // namespace intel_npu diff --git a/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp b/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp index 78bca3718711e3..b8724dcdd53f73 100644 --- a/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp +++ b/src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp @@ -16,7 +16,6 @@ struct TensorData { void* mem; size_t size; bool levelZeroTensorCreatedLocally = true; - bool changed = false; }; struct Pipeline { @@ -32,7 +31,7 @@ struct Pipeline { virtual void pull(size_t batch_index) = 0; virtual void reset(size_t batch_index) const = 0; - virtual void updateCommandList(std::unordered_map& tensors_data, size_t batch_size) = 0; + virtual void updateCommandList(const TensorData& tensors_data, uint32_t index, size_t batch_size) = 0; protected: zeroMemory::MemoryManagementUnit _deviceInputs; diff --git a/src/plugins/intel_npu/src/backend/include/zero_wrappers.hpp b/src/plugins/intel_npu/src/backend/include/zero_wrappers.hpp index 2b432619fff4f8..76cfce8fecfa26 100644 --- a/src/plugins/intel_npu/src/backend/include/zero_wrappers.hpp +++ b/src/plugins/intel_npu/src/backend/include/zero_wrappers.hpp @@ -87,7 +87,7 @@ class CommandList { void appendGraphInitialize(const ze_graph_handle_t& graph_handle) const; void appendGraphExecute(const ze_graph_handle_t& graph_handle, const ze_graph_profiling_query_handle_t& profiling_query_handle) const; - void updateMutableCommandList(const void* pNext = nullptr) const; + void updateMutableCommandList(uint32_t arg_index, const void* arg_value) const; void appendNpuTimestamp(uint64_t* timestamp_buff) const; void appendBarrier() const; void close() const; @@ -96,9 +96,6 @@ class CommandList { inline ze_command_list_handle_t handle() const { return _handle; } - uint64_t getCommandListId() const { - return _command_id; - } private: ze_command_list_handle_t _handle = nullptr; diff --git a/src/plugins/intel_npu/src/backend/src/zero_device.cpp b/src/plugins/intel_npu/src/backend/src/zero_device.cpp index 56ee453b7d77c2..595ce734b533e9 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_device.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_device.cpp @@ -9,6 +9,7 @@ #include "intel_npu/al/itt.hpp" #include "intel_npu/utils/zero/zero_api.hpp" #include "zero_executor.hpp" +#include "zero_host_tensor.hpp" #include "zero_infer_request.hpp" #include "zero_remote_tensor.hpp" #include "zero_utils.hpp" @@ -193,3 +194,10 @@ ov::SoPtr ZeroDevice::createRemoteTensor(std::shared_ptr(context, _initStructs, element_type, shape, config, tensor_type, mem_type, mem)}; }; + +ov::SoPtr ZeroDevice::createHostTensor(std::shared_ptr context, + const ov::element::Type& element_type, + const ov::Shape& shape, + const Config& config) { + return {std::make_shared(context, _initStructs, element_type, shape, config)}; +}; diff --git a/src/plugins/intel_npu/src/backend/src/zero_host_tensor.cpp b/src/plugins/intel_npu/src/backend/src/zero_host_tensor.cpp new file mode 
100644 index 00000000000000..e4ebe2c1d5a8ba --- /dev/null +++ b/src/plugins/intel_npu/src/backend/src/zero_host_tensor.cpp @@ -0,0 +1,48 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "zero_host_tensor.hpp" + +#include "openvino/runtime/intel_npu/remote_properties.hpp" + +namespace intel_npu { + +ZeroHostTensor::ZeroHostTensor(std::shared_ptr context, + std::shared_ptr init_structs, + const ov::element::Type element_type, + const ov::Shape& shape, + const Config& config) + : m_impl(std::make_shared(context, + init_structs, + element_type, + shape, + config, + ov::intel_npu::TensorType::BINDED, + ov::intel_npu::MemType::L0_INTERNAL_BUF)) {} + +void* ZeroHostTensor::data(const ov::element::Type&) const { + return m_impl->get_properties().find(ov::intel_npu::mem_handle.name())->second.as(); +} + +const ov::element::Type& ZeroHostTensor::get_element_type() const { + return m_impl->get_element_type(); +} + +const ov::Shape& ZeroHostTensor::get_shape() const { + return m_impl->get_shape(); +} + +const ov::Strides& ZeroHostTensor::get_strides() const { + return m_impl->get_strides(); +} + +void ZeroHostTensor::set_shape(ov::Shape new_shape) { + m_impl->set_shape(new_shape); +} + +std::shared_ptr ZeroHostTensor::get_impl() const { + return m_impl; +} + +} // namespace intel_npu diff --git a/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp b/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp index 36738f32e9f6c3..773827a4864724 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp @@ -402,13 +402,26 @@ void ZeroInferRequest::set_tensor_data(std::shared_ptr tensor, cons if (setTensorData) { _tensorsData[name] = TensorData{_copyAllTensors.at(name)->data(), _copyAllTensors.at(name)->get_byte_size(), - levelZeroTensorCreatedLocally, - !_createPipeline}; - _updateCommandList = true; + levelZeroTensorCreatedLocally}; + + if (_pipelineIsCreated) { + _logger.debug("ZeroInferRequest::infer_async - update command list"); + + intel_npu::ZeroExecutor::ArgumentDescriptor desc; + if (isParameter) { + desc = _executor->inputs_desc_map().at(name); + } else { + desc = _executor->outputs_desc_map().at(name); + } + + _pipeline->updateCommandList(_tensorsData[name], desc.idx, _batchSize); + } } } -void ZeroInferRequest::set_remote_tensor_data(std::shared_ptr tensor, const std::string& name) { +void ZeroInferRequest::set_remote_tensor_data(std::shared_ptr tensor, + const std::string& name, + bool isParameter) { auto l0_context = reinterpret_cast( extract_object(tensor->get_context()->get_property(), ov::intel_npu::l0_context)); if (_initStructs->getContext() != l0_context) { @@ -421,8 +434,20 @@ void ZeroInferRequest::set_remote_tensor_data(std::shared_ptr } _copyAllTensors[name] = tensor; - _tensorsData[name] = TensorData{data, tensor->get_byte_size(), false, !_createPipeline}; - _updateCommandList = true; + _tensorsData[name] = TensorData{data, tensor->get_byte_size(), false}; + + if (_pipelineIsCreated) { + _logger.debug("ZeroInferRequest::infer_async - update command list"); + + intel_npu::ZeroExecutor::ArgumentDescriptor desc; + if (isParameter) { + desc = _executor->inputs_desc_map().at(name); + } else { + desc = _executor->outputs_desc_map().at(name); + } + + _pipeline->updateCommandList(_tensorsData[name], desc.idx, _batchSize); + } } void ZeroInferRequest::set_tensor(const ov::Output& port, const ov::SoPtr& tensor) { @@ -444,7 +469,9 @@ void 
ZeroInferRequest::set_tensor(const ov::Output& port, const ov::op::util::is_parameter(port.get_node())); } else { _logger.debug("ZeroInferRequest::set_tensor - set new remote tensor"); - set_remote_tensor_data(remoteTensor, port.get_node()->get_friendly_name()); + set_remote_tensor_data(remoteTensor, + port.get_node()->get_friendly_name(), + ov::op::util::is_parameter(port.get_node())); } } } @@ -489,23 +516,11 @@ void ZeroInferRequest::infer_async() { OV_ITT_SCOPED_TASK(itt::domains::LevelZeroBackend, "infer_async"); _executor->mutexLock(); - - if (_createPipeline) { + if (!_pipelineIsCreated) { create_pipeline(); - _createPipeline = false; - _updateCommandList = false; + _pipelineIsCreated = true; } - - if (_initStructs->getMutableCommandListVersion()) { - if (_updateCommandList) { - _logger.debug("ZeroInferRequest::infer_async - update command list"); - _pipeline->updateCommandList(_tensorsData, _batchSize); - - _updateCommandList = false; - } - } - _executor->mutexUnlock(); for (const std::string& name : _inputAndStateInputNames) { diff --git a/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp b/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp index 3a4ea554d157ec..f98e84a34a0a46 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp @@ -143,7 +143,7 @@ struct DiscretePipeline final : public Pipeline { } }; - void updateCommandList(std::unordered_map&, size_t) override{}; + void updateCommandList(const TensorData&, uint32_t, size_t) override {} private: const Config _config; @@ -274,60 +274,11 @@ struct IntegratedPipeline final : public Pipeline { _logger.debug("IntegratedPipeline - rest() completed"); }; - void updateCommandList(std::unordered_map& tensors_data, size_t batch_size) override { - std::vector mutable_argument_desc; - int32_t changed_tensors = 0; - - for (const auto& desc : tensors_data) { - if (desc.second.changed == true) { - changed_tensors++; - } - } - - mutable_argument_desc.reserve(changed_tensors); - - auto set_mutable_desc = - [&](int32_t mutable_desc_index, uint64_t command_list_id, uint32_t arg_index, const void* arg_value) { - mutable_argument_desc.emplace_back(ze_mutable_graph_argument_exp_desc_t{ - ZE_STRUCTURE_TYPE_MUTABLE_GRAPH_ARGUMENT_EXP_DESC, - mutable_desc_index ? 
&mutable_argument_desc.at(mutable_desc_index - 1) : nullptr, - command_list_id, - arg_index, - arg_value}); - }; - + void updateCommandList(const TensorData& tensors_data, uint32_t index, size_t batch_size) override { for (size_t i = 0; i < batch_size; i++) { - int32_t mutable_argument_desc_index = -1; - - for (const auto& desc : _executor->inputs_desc_map()) { - TensorData& inputTensorData = tensors_data.at(desc.first); - - if (inputTensorData.changed == true) { - set_mutable_desc( - ++mutable_argument_desc_index, - _command_lists.at(i)->getCommandListId(), - desc.second.idx, - static_cast(inputTensorData.mem) + (i * inputTensorData.size) / batch_size); - - inputTensorData.changed = false; - } - } - - for (const auto& desc : _executor->outputs_desc_map()) { - TensorData& outputTensorData = tensors_data.at(desc.first); - - if (outputTensorData.changed == true) { - set_mutable_desc( - ++mutable_argument_desc_index, - _command_lists.at(i)->getCommandListId(), - desc.second.idx, - static_cast(outputTensorData.mem) + (i * outputTensorData.size) / batch_size); - - outputTensorData.changed = false; - } - } - - _command_lists.at(i)->updateMutableCommandList(&mutable_argument_desc.at(mutable_argument_desc_index)); + _command_lists.at(i)->updateMutableCommandList( + index, + static_cast(tensors_data.mem) + (i * tensors_data.size) / batch_size); _command_lists.at(i)->close(); } }; diff --git a/src/plugins/intel_npu/src/backend/src/zero_wrappers.cpp b/src/plugins/intel_npu/src/backend/src/zero_wrappers.cpp index 77ebd858cc3e07..2cd249aad19a92 100644 --- a/src/plugins/intel_npu/src/backend/src/zero_wrappers.cpp +++ b/src/plugins/intel_npu/src/backend/src/zero_wrappers.cpp @@ -114,11 +114,16 @@ CommandList::~CommandList() { _log.error("zeCommandListDestroy failed %#X", uint64_t(result)); } } -void CommandList::updateMutableCommandList(const void* pNext) const { - ze_mutable_commands_exp_desc_t mutable_commands_exp_desc_t = { - static_cast(ZE_MUTABLE_COMMAND_EXP_FLAG_GRAPH_ARGUMENT), - pNext, - 0}; +void CommandList::updateMutableCommandList(uint32_t arg_index, const void* arg_value) const { + ze_mutable_graph_argument_exp_desc_t desc = {ZE_STRUCTURE_TYPE_MUTABLE_GRAPH_ARGUMENT_EXP_DESC, + nullptr, + _command_id, + arg_index, + arg_value}; + + ze_mutable_commands_exp_desc_t mutable_commands_exp_desc_t = {ZE_STRUCTURE_TYPE_MUTABLE_COMMANDS_EXP_DESC, + &desc, + 0}; zeroUtils::throwOnFail("zeCommandListUpdateMutableCommandsExp", zeCommandListUpdateMutableCommandsExp(_handle, &mutable_commands_exp_desc_t)); diff --git a/src/plugins/intel_npu/src/plugin/include/remote_context.hpp b/src/plugins/intel_npu/src/plugin/include/remote_context.hpp index 398884dcb673ac..2fce44526c358e 100644 --- a/src/plugins/intel_npu/src/plugin/include/remote_context.hpp +++ b/src/plugins/intel_npu/src/plugin/include/remote_context.hpp @@ -43,6 +43,14 @@ class RemoteContextImpl : public ov::IRemoteContext { const ov::Shape& shape, const ov::AnyMap& params) override; + /** + * @brief This method is used to create a host tensor object friendly for the device in current context. + * @param type Tensor element type. + * @param shape Tensor shape. + * @return A tensor instance with device friendly memory. 
+ */ + ov::SoPtr create_host_tensor(const ov::element::Type type, const ov::Shape& shape) override; + private: std::shared_ptr get_this_shared_ptr(); diff --git a/src/plugins/intel_npu/src/plugin/src/remote_context.cpp b/src/plugins/intel_npu/src/plugin/src/remote_context.cpp index 25683be31fe9e4..9539826f985147 100644 --- a/src/plugins/intel_npu/src/plugin/src/remote_context.cpp +++ b/src/plugins/intel_npu/src/plugin/src/remote_context.cpp @@ -84,6 +84,15 @@ ov::SoPtr RemoteContextImpl::create_tensor(const ov::element: mem_handle_object); } +ov::SoPtr RemoteContextImpl::create_host_tensor(const ov::element::Type type, const ov::Shape& shape) { + auto device = _backends->getDevice(_config.get()); + if (device == nullptr) { + OPENVINO_THROW("Device is not available"); + } + + return device->createHostTensor(get_this_shared_ptr(), type, shape, _config); +} + const std::string& RemoteContextImpl::get_device_name() const { return _device_name; } diff --git a/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_api.hpp b/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_api.hpp index 3de0dedd8d6878..6cb9e23d203c11 100644 --- a/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_api.hpp +++ b/src/plugins/intel_npu/src/utils/include/intel_npu/utils/zero/zero_api.hpp @@ -8,6 +8,8 @@ #include +#include "openvino/core/except.hpp" + #ifndef _WIN32 # define LIB_ZE_LOADER_SUFFIX ".1" #endif @@ -26,8 +28,6 @@ namespace intel_npu { symbol_statement(zeCommandListCreate) \ symbol_statement(zeCommandListDestroy) \ symbol_statement(zeCommandListReset) \ - symbol_statement(zeCommandListGetNextCommandIdExp) \ - symbol_statement(zeCommandListUpdateMutableCommandsExp) \ symbol_statement(zeCommandQueueCreate) \ symbol_statement(zeCommandQueueDestroy) \ symbol_statement(zeCommandQueueExecuteCommandLists) \ @@ -58,6 +58,11 @@ namespace intel_npu { symbol_statement(zeMemAllocHost) \ symbol_statement(zeMemFree) \ symbol_statement(zeMemGetAllocProperties) + +//unsupported symbols with older ze_loader versions +#define weak_symbols_list() \ + symbol_statement(zeCommandListGetNextCommandIdExp) \ + symbol_statement(zeCommandListUpdateMutableCommandsExp) // clang-format on class ZeroApi { @@ -73,6 +78,7 @@ class ZeroApi { } #define symbol_statement(symbol) decltype(&::symbol) symbol; symbols_list(); + weak_symbols_list(); #undef symbol_statement private: @@ -84,11 +90,17 @@ class ZeroApi { #define symbol_statement(symbol) \ template \ inline typename std::invoke_result::type wrapped_##symbol(Args... 
args) { \ - return ZeroApi::getInstance().symbol(std::forward(args)...); \ + auto& ref = ZeroApi::getInstance(); \ + if (ref.symbol == nullptr) { \ + OPENVINO_THROW("Unsupported symbol " #symbol); \ + } \ + return ref.symbol(std::forward(args)...); \ } symbols_list(); +weak_symbols_list(); #undef symbol_statement #define symbol_statement(symbol) inline decltype(&::symbol) symbol = wrapped_##symbol; symbols_list(); +weak_symbols_list(); #undef symbol_statement } // namespace intel_npu diff --git a/src/plugins/intel_npu/src/utils/src/zero/zero_api.cpp b/src/plugins/intel_npu/src/utils/src/zero/zero_api.cpp index fd3e128b3afc94..991e8d5f9f9e65 100644 --- a/src/plugins/intel_npu/src/utils/src/zero/zero_api.cpp +++ b/src/plugins/intel_npu/src/utils/src/zero/zero_api.cpp @@ -4,7 +4,6 @@ #include "intel_npu/utils/zero/zero_api.hpp" -#include "openvino/core/except.hpp" #include "openvino/util/file_util.hpp" #include "openvino/util/shared_object.hpp" @@ -29,14 +28,24 @@ ZeroApi::ZeroApi() { try { #define symbol_statement(symbol) \ this->symbol = reinterpret_cast(ov::util::get_symbol(lib, #symbol)); - symbols_list() + symbols_list(); #undef symbol_statement } catch (const std::runtime_error& error) { OPENVINO_THROW(error.what()); } +#define symbol_statement(symbol) \ + try { \ + this->symbol = reinterpret_cast(ov::util::get_symbol(lib, #symbol)); \ + } catch (const std::runtime_error&) { \ + this->symbol = nullptr; \ + } + weak_symbols_list(); +#undef symbol_statement + #define symbol_statement(symbol) symbol = this->symbol; symbols_list(); + weak_symbols_list(); #undef symbol_statement } diff --git a/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.hpp b/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.hpp index 660eb875f72d38..6b7372223c6bea 100644 --- a/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.hpp +++ b/src/plugins/intel_npu/tests/functional/behavior/infer_request_run.hpp @@ -473,6 +473,55 @@ TEST_P(BatchingRunTests, SetInputTensorInfer_Caching) { for (size_t i = 0; i < shape_size; ++i) { EXPECT_NEAR(actual[i], 6.f, 1e-5) << "Expected=6, actual=" << actual[i] << " for index " << i; } + + delete[] buffer; +} + +TEST_P(BatchingRunTests, CheckTwoRunsInfer) { + auto batch_shape = Shape{4, 2, 2, 2}; + auto shape_size = ov::shape_size(batch_shape); + auto model = createBatchingModel(element::f32, batch_shape, "N..."); + float* buffer = new float[shape_size]; + + auto context = core->get_default_context(target_device); + + compiled_model = core->compile_model(model, target_device, configuration); + ov::InferRequest inference_request; + inference_request = compiled_model.create_infer_request(); + + ov::Tensor tensor{element::f32, batch_shape, buffer}; + + inference_request.set_input_tensor(tensor); + auto actual_tensor = inference_request.get_output_tensor(0); + auto* actual = actual_tensor.data(); + auto* input_data = tensor.data(); + for (size_t i = 0; i < shape_size; ++i) { + input_data[i] = 5.f; + } + inference_request.infer(); // Adds '1' to each element + for (size_t i = 0; i < shape_size; ++i) { + EXPECT_NEAR(actual[i], 6.f, 1e-5) << "Expected=6, actual=" << actual[i] << " for index " << i; + } + + auto l0_host_input_tensor = context.create_host_tensor(ov::element::f32, batch_shape); + auto l0_host_output_tensor = context.create_host_tensor(ov::element::f32, actual_tensor.get_shape()); + + auto* input_data_host_tensor = l0_host_input_tensor.data(); + input_data = reinterpret_cast(input_data_host_tensor); + for (size_t i = 0; i < 
shape_size; ++i) { + input_data[i] = 5.f; + } + inference_request.set_input_tensor(l0_host_input_tensor); + inference_request.set_output_tensor(l0_host_output_tensor); + inference_request.infer(); + + auto* actual_host_tensor = l0_host_output_tensor.data(); + actual = reinterpret_cast(actual_host_tensor); + for (size_t i = 0; i < shape_size; ++i) { + EXPECT_NEAR(actual[i], 6.f, 1e-5) << "Expected=6, actual=" << actual[i] << " for index " << i; + } + + delete[] buffer; } } // namespace behavior diff --git a/src/plugins/intel_npu/tests/functional/behavior/remote_tensor_tests/remote_run.hpp b/src/plugins/intel_npu/tests/functional/behavior/remote_tensor_tests/remote_run.hpp index a6023e6e678d3d..a58da0253a9d74 100644 --- a/src/plugins/intel_npu/tests/functional/behavior/remote_tensor_tests/remote_run.hpp +++ b/src/plugins/intel_npu/tests/functional/behavior/remote_tensor_tests/remote_run.hpp @@ -128,7 +128,7 @@ TEST_P(RemoteRunTests, CheckRemoteTensorInternalBufChangingTensors) { // set output remote tensor auto remote_output_tensor = inference_request.get_output_tensor(); - auto output_remote_tensor = context.create_l0_host_tensor(ov::element::f32, remote_output_tensor.get_shape()); + auto output_remote_tensor = context.create_tensor(ov::element::f32, remote_output_tensor.get_shape()); remote_output_tensor = {}; OV_ASSERT_NO_THROW(inference_request.set_output_tensor(output_remote_tensor)); @@ -202,8 +202,7 @@ TEST_P(RemoteRunTests, CheckOutputDataFromTwoRunsInOutRemoteTensors1) { auto remote_input_tensor = context.create_l0_host_tensor(ov::element::f32, input_shape, ov::intel_npu::TensorType::INPUT); - remote_output_tensor = context.create_l0_host_tensor(ov::element::f32, output_shape) - .as(); + remote_output_tensor = context.create_l0_host_tensor(ov::element::f32, output_shape); memset(remote_input_tensor.get(), 99, byte_size); OV_ASSERT_NO_THROW(inference_request.set_input_tensor(remote_input_tensor)); @@ -305,8 +304,7 @@ TEST_P(RemoteRunTests, CheckOutputDataFromTwoRunsInOutRemoteTensors3) { auto remote_input_tensor = context.create_l0_host_tensor(ov::element::f32, input_shape, ov::intel_npu::TensorType::INPUT); - auto remote_output_tensor = - context.create_l0_host_tensor(ov::element::f32, output_shape).as(); + auto remote_output_tensor = context.create_l0_host_tensor(ov::element::f32, output_shape); memset(remote_input_tensor.get(), 99, byte_size); OV_ASSERT_NO_THROW(inference_request.set_input_tensor(remote_input_tensor)); @@ -318,6 +316,74 @@ TEST_P(RemoteRunTests, CheckOutputDataFromTwoRunsInOutRemoteTensors3) { EXPECT_EQ(memcmp(first_output.data(), second_output, first_output.get_byte_size()), 0); } +TEST_P(RemoteRunTests, CheckOutputDataFromTwoRunsInOutRemoteTensorsHostTensor1) { + // Skip test according to plugin specific disabledTestPatterns() (if any) + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + ov::InferRequest inference_request; + ov::Tensor first_output; + + auto context = core->get_default_context(target_device).as(); + + OV_ASSERT_NO_THROW(compiled_model = core->compile_model(ov_model, target_device, configuration)); + OV_ASSERT_NO_THROW(inference_request = compiled_model.create_infer_request()); + auto tensor = inference_request.get_input_tensor(); + memset(tensor.data(), 99, tensor.get_byte_size()); + OV_ASSERT_NO_THROW(inference_request.infer()); + first_output = inference_request.get_output_tensor(); + + auto l0_host_input_tensor = context.create_host_tensor(ov::element::f32, tensor.get_shape()); + auto l0_host_output_tensor = context.create_host_tensor(ov::element::f32, 
first_output.get_shape()); + + memset(l0_host_input_tensor.data(), 99, tensor.get_byte_size()); + OV_ASSERT_NO_THROW(inference_request.set_input_tensor(l0_host_input_tensor)); + OV_ASSERT_NO_THROW(inference_request.set_output_tensor(l0_host_output_tensor)); + OV_ASSERT_NO_THROW(inference_request.infer()); + + EXPECT_NE(first_output.data(), l0_host_output_tensor.data()); + EXPECT_EQ(memcmp(first_output.data(), l0_host_output_tensor.data(), first_output.get_byte_size()), 0); +} + +TEST_P(RemoteRunTests, CheckOutputDataFromTwoRunsInOutRemoteTensorsHostTensor2) { + // Skip test according to plugin specific disabledTestPatterns() (if any) + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + ov::InferRequest inference_request; + + auto context = core->get_default_context(target_device).as(); + + OV_ASSERT_NO_THROW(compiled_model = core->compile_model(ov_model, target_device, configuration)); + OV_ASSERT_NO_THROW(inference_request = compiled_model.create_infer_request()); + auto input_tensor = inference_request.get_input_tensor(); + auto output_tensor = inference_request.get_output_tensor(); + const auto byte_size = input_tensor.get_byte_size(); + auto input_shape = input_tensor.get_shape(); + auto output_shape = output_tensor.get_shape(); + input_tensor = {}; + output_tensor = {}; + + auto remote_input_tensor = + context.create_l0_host_tensor(ov::element::f32, input_shape, ov::intel_npu::TensorType::INPUT); + auto remote_output_tensor = + context.create_l0_host_tensor(ov::element::f32, output_shape, ov::intel_npu::TensorType::INPUT); + memset(remote_input_tensor.get(), 1, byte_size); + OV_ASSERT_NO_THROW(inference_request.set_input_tensor(remote_input_tensor)); + OV_ASSERT_NO_THROW(inference_request.set_output_tensor(remote_output_tensor)); + OV_ASSERT_NO_THROW(inference_request.infer()); + + auto l0_host_input_tensor = context.create_host_tensor(ov::element::f32, input_shape); + auto l0_host_output_tensor = context.create_host_tensor(ov::element::f32, output_shape); + + memset(l0_host_input_tensor.data(), 99, byte_size); + OV_ASSERT_NO_THROW(inference_request.set_input_tensor(l0_host_input_tensor)); + OV_ASSERT_NO_THROW(inference_request.set_output_tensor(l0_host_output_tensor)); + OV_ASSERT_NO_THROW(inference_request.infer()); + + EXPECT_NE(remote_output_tensor.get(), l0_host_output_tensor.data()); + EXPECT_NE(memcmp(remote_output_tensor.get(), l0_host_output_tensor.data(), remote_output_tensor.get_byte_size()), + 0); +} + } // namespace behavior } // namespace test } // namespace ov
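A condensed sketch of the host-tensor flow exercised by the new NPU tests above. It assumes only ov::RemoteContext::create_host_tensor and the standard infer-request calls already used in those tests; the helper name, its arguments, and the static-shape assumption are placeholders for illustration:

    #include <algorithm>
    #include "openvino/openvino.hpp"

    // Runs one inference using Level Zero host tensors obtained from the remote context.
    static void infer_with_host_tensors(ov::CompiledModel& compiled, ov::RemoteContext& context) {
        ov::InferRequest request = compiled.create_infer_request();
        const auto input_shape = compiled.input().get_shape();    // assumes a static, single-input model
        const auto output_shape = compiled.output().get_shape();

        // create_host_tensor routes to the new createHostTensor / ZeroHostTensor path on NPU.
        ov::Tensor host_input = context.create_host_tensor(ov::element::f32, input_shape);
        ov::Tensor host_output = context.create_host_tensor(ov::element::f32, output_shape);

        std::fill_n(host_input.data<float>(), ov::shape_size(input_shape), 5.f);
        request.set_input_tensor(host_input);
        request.set_output_tensor(host_output);
        request.infer();  // results are written directly into host_output's device-friendly memory
    }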