From 6ba55bf22293ca91481d2c39e297e18b5a503e3c Mon Sep 17 00:00:00 2001 From: nihuini Date: Wed, 3 Jan 2024 14:30:40 +0800 Subject: [PATCH 01/19] create layer decoupled --- cmake/ncnn_add_layer.cmake | 56 +--- cmake/ncnn_add_shader.cmake | 4 +- cmake/ncnn_generate_shader_comp_header.cmake | 4 +- src/gpu.cpp | 2 +- src/layer.cpp | 293 ++++++++++++++++++- src/layer.h | 8 + src/layer/convolution.cpp | 13 +- src/layer/convolution.h | 2 - src/layer/convolution1d.cpp | 8 - src/layer/convolution1d.h | 2 - src/layer/convolutiondepthwise.cpp | 12 +- src/layer/convolutiondepthwise.h | 2 - src/layer/convolutiondepthwise1d.cpp | 5 - src/layer/convolutiondepthwise1d.h | 2 - src/layer/innerproduct.cpp | 12 +- src/layer/innerproduct.h | 2 - src/layer/noop.cpp | 14 - src/layer/noop.h | 5 - src/layer/split.cpp | 26 -- src/layer/split.h | 7 - src/layer/vulkan/noop_vulkan.cpp | 35 +++ src/layer/vulkan/noop_vulkan.h | 34 +++ src/layer/vulkan/split_vulkan.cpp | 47 +++ src/layer/vulkan/split_vulkan.h | 34 +++ src/layer_registry.h.in | 8 + 25 files changed, 472 insertions(+), 165 deletions(-) create mode 100644 src/layer/vulkan/noop_vulkan.cpp create mode 100644 src/layer/vulkan/noop_vulkan.h create mode 100644 src/layer/vulkan/split_vulkan.cpp create mode 100644 src/layer/vulkan/split_vulkan.h diff --git a/cmake/ncnn_add_layer.cmake b/cmake/ncnn_add_layer.cmake index e6c74fec5eb..079bdd9f506 100644 --- a/cmake/ncnn_add_layer.cmake +++ b/cmake/ncnn_add_layer.cmake @@ -31,35 +31,14 @@ macro(ncnn_add_arch_opt_layer class NCNN_TARGET_ARCH_OPT NCNN_TARGET_ARCH_OPT_CF list(APPEND ncnn_SRCS ${NCNN_${NCNN_TARGET_ARCH_OPT}_HEADER} ${NCNN_${NCNN_TARGET_ARCH_OPT}_SOURCE}) # generate layer_declaration and layer_registry file - set(layer_declaration "${layer_declaration}#include \"layer/${name}.h\"\n") - set(layer_declaration_class "class ${class}_final_${NCNN_TARGET_ARCH_OPT} : virtual public ${class}") - set(create_pipeline_content " { int ret = ${class}::create_pipeline(opt); if (ret) return ret; }\n") - set(destroy_pipeline_content " { int ret = ${class}::destroy_pipeline(opt); if (ret) return ret; }\n") - - if(WITH_LAYER_${name}_vulkan) - set(layer_declaration "${layer_declaration}#include \"layer/vulkan/${name}_vulkan.h\"\n") - set(layer_declaration_class "${layer_declaration_class}, virtual public ${class}_vulkan") - set(create_pipeline_content "${create_pipeline_content} if (vkdev) { int ret = ${class}_vulkan::create_pipeline(opt); if (ret) return ret; }\n") - set(destroy_pipeline_content " if (vkdev) { int ret = ${class}_vulkan::destroy_pipeline(opt); if (ret) return ret; }\n${destroy_pipeline_content}") - endif() - set(layer_declaration "${layer_declaration}#include \"layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT}.h\"\n") - set(layer_declaration_class "${layer_declaration_class}, virtual public ${class}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT}") - set(create_pipeline_content "${create_pipeline_content} { int ret = ${class}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT}::create_pipeline(opt); if (ret) return ret; }\n") - set(destroy_pipeline_content " { int ret = ${class}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT}::destroy_pipeline(opt); if (ret) return ret; }\n${destroy_pipeline_content}") - - set(layer_declaration "${layer_declaration}namespace ncnn {\n${layer_declaration_class}\n{\n") - set(layer_declaration "${layer_declaration}public:\n") - set(layer_declaration "${layer_declaration} virtual int create_pipeline(const Option& opt) {\n${create_pipeline_content} return 0;\n }\n") - 
set(layer_declaration "${layer_declaration} virtual int destroy_pipeline(const Option& opt) {\n${destroy_pipeline_content} return 0;\n }\n") - set(layer_declaration "${layer_declaration}};\n") - set(layer_declaration "${layer_declaration}DEFINE_LAYER_CREATOR(${class}_final_${NCNN_TARGET_ARCH_OPT})\n} // namespace ncnn\n\n") - - set(layer_registry_${NCNN_TARGET_ARCH_OPT} "${layer_registry_${NCNN_TARGET_ARCH_OPT}}#if NCNN_STRING\n{\"${class}\", ${class}_final_${NCNN_TARGET_ARCH_OPT}_layer_creator},\n#else\n{${class}_final_${NCNN_TARGET_ARCH_OPT}_layer_creator},\n#endif\n") + set(layer_declaration "${layer_declaration}namespace ncnn { DEFINE_LAYER_CREATOR(${class}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT}) }\n") + + set(layer_registry_${NCNN_TARGET_ARCH_OPT} "${layer_registry_${NCNN_TARGET_ARCH_OPT}}#if NCNN_STRING\n{\"${class}\", ${class}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT}_layer_creator},\n#else\n{${class}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT}_layer_creator},\n#endif\n") else() # no isa optimized version if(WITH_LAYER_${name}) - set(layer_registry_${NCNN_TARGET_ARCH_OPT} "${layer_registry_${NCNN_TARGET_ARCH_OPT}}#if NCNN_STRING\n{\"${class}\", ${class}_final_layer_creator},\n#else\n{${class}_final_layer_creator},\n#endif\n") + set(layer_registry_${NCNN_TARGET_ARCH_OPT} "${layer_registry_${NCNN_TARGET_ARCH_OPT}}#if NCNN_STRING\n{\"${class}\", ${class}_layer_creator},\n#else\n{${class}_layer_creator},\n#endif\n") else() set(layer_registry_${NCNN_TARGET_ARCH_OPT} "${layer_registry_${NCNN_TARGET_ARCH_OPT}}#if NCNN_STRING\n{\"${class}\", 0},\n#else\n{0},\n#endif\n") endif() @@ -110,18 +89,14 @@ macro(ncnn_add_layer class) # generate layer_declaration and layer_registry file if(WITH_LAYER_${name}) set(layer_declaration "${layer_declaration}#include \"layer/${name}.h\"\n") - set(layer_declaration_class "class ${class}_final : virtual public ${class}") - set(create_pipeline_content " { int ret = ${class}::create_pipeline(opt); if (ret) return ret; }\n") - set(destroy_pipeline_content " { int ret = ${class}::destroy_pipeline(opt); if (ret) return ret; }\n") + set(layer_declaration "${layer_declaration}namespace ncnn { DEFINE_LAYER_CREATOR(${class}) }\n") source_group ("sources\\\\layers" FILES "${CMAKE_CURRENT_SOURCE_DIR}/layer/${name}.cpp") endif() if(WITH_LAYER_${name}_vulkan) set(layer_declaration "${layer_declaration}#include \"layer/vulkan/${name}_vulkan.h\"\n") - set(layer_declaration_class "${layer_declaration_class}, virtual public ${class}_vulkan") - set(create_pipeline_content "${create_pipeline_content} if (vkdev) { int ret = ${class}_vulkan::create_pipeline(opt); if (ret) return ret; }\n") - set(destroy_pipeline_content " if (vkdev) { int ret = ${class}_vulkan::destroy_pipeline(opt); if (ret) return ret; }\n${destroy_pipeline_content}") + set(layer_declaration "${layer_declaration}namespace ncnn { DEFINE_LAYER_CREATOR(${class}_vulkan) }\n") file(GLOB_RECURSE NCNN_SHADER_SRCS "layer/vulkan/shader/${name}.comp") file(GLOB_RECURSE NCNN_SHADER_SUBSRCS "layer/vulkan/shader/${name}_*.comp") @@ -135,26 +110,21 @@ macro(ncnn_add_layer class) if(WITH_LAYER_${name}_${NCNN_TARGET_ARCH}) set(layer_declaration "${layer_declaration}#include \"layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}.h\"\n") - set(layer_declaration_class "${layer_declaration_class}, virtual public ${class}_${NCNN_TARGET_ARCH}") - set(create_pipeline_content "${create_pipeline_content} { int ret = ${class}_${NCNN_TARGET_ARCH}::create_pipeline(opt); if (ret) return ret; }\n") - set(destroy_pipeline_content " 
{ int ret = ${class}_${NCNN_TARGET_ARCH}::destroy_pipeline(opt); if (ret) return ret; }\n${destroy_pipeline_content}") + set(layer_declaration "${layer_declaration}namespace ncnn { DEFINE_LAYER_CREATOR(${class}_${NCNN_TARGET_ARCH}) }\n") source_group ("sources\\\\layers\\\\${NCNN_TARGET_ARCH}" FILES "${CMAKE_CURRENT_SOURCE_DIR}/layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}.cpp") endif() if(WITH_LAYER_${name}) - set(layer_declaration "${layer_declaration}namespace ncnn {\n${layer_declaration_class}\n{\n") - set(layer_declaration "${layer_declaration}public:\n") - set(layer_declaration "${layer_declaration} virtual int create_pipeline(const Option& opt) {\n${create_pipeline_content} return 0;\n }\n") - set(layer_declaration "${layer_declaration} virtual int destroy_pipeline(const Option& opt) {\n${destroy_pipeline_content} return 0;\n }\n") - set(layer_declaration "${layer_declaration}};\n") - set(layer_declaration "${layer_declaration}DEFINE_LAYER_CREATOR(${class}_final)\n} // namespace ncnn\n\n") + set(layer_registry "${layer_registry}#if NCNN_STRING\n{\"${class}\", ${class}_layer_creator},\n#else\n{${class}_layer_creator},\n#endif\n") + else() + set(layer_registry "${layer_registry}#if NCNN_STRING\n{\"${class}\", 0},\n#else\n{0},\n#endif\n") endif() - if(WITH_LAYER_${name}) - set(layer_registry "${layer_registry}#if NCNN_STRING\n{\"${class}\", ${class}_final_layer_creator},\n#else\n{${class}_final_layer_creator},\n#endif\n") + if(WITH_LAYER_${name}_vulkan) + set(layer_registry_vulkan "${layer_registry_vulkan}#if NCNN_STRING\n{\"${class}\", ${class}_vulkan_layer_creator},\n#else\n{${class}_vulkan_layer_creator},\n#endif\n") else() - set(layer_registry "${layer_registry}#if NCNN_STRING\n{\"${class}\", 0},\n#else\n{0},\n#endif\n") + set(layer_registry_vulkan "${layer_registry_vulkan}#if NCNN_STRING\n{\"${class}\", 0},\n#else\n{0},\n#endif\n") endif() if(NCNN_TARGET_ARCH STREQUAL "x86") diff --git a/cmake/ncnn_add_shader.cmake b/cmake/ncnn_add_shader.cmake index 8006241bc05..76680f4ca81 100644 --- a/cmake/ncnn_add_shader.cmake +++ b/cmake/ncnn_add_shader.cmake @@ -1,7 +1,7 @@ macro(ncnn_add_shader NCNN_SHADER_SRC) get_filename_component(NCNN_SHADER_SRC_NAME_WE ${NCNN_SHADER_SRC} NAME_WE) - set(NCNN_SHADER_COMP_HEADER ${CMAKE_CURRENT_BINARY_DIR}/${NCNN_SHADER_SRC_NAME_WE}.comp.hex.h) + set(NCNN_SHADER_COMP_HEADER ${CMAKE_CURRENT_BINARY_DIR}/layer/vulkan/shader/${NCNN_SHADER_SRC_NAME_WE}.comp.hex.h) add_custom_command( OUTPUT ${NCNN_SHADER_COMP_HEADER} @@ -13,7 +13,7 @@ macro(ncnn_add_shader NCNN_SHADER_SRC) set_source_files_properties(${NCNN_SHADER_COMP_HEADER} PROPERTIES GENERATED TRUE) get_filename_component(NCNN_SHADER_COMP_HEADER_NAME ${NCNN_SHADER_COMP_HEADER} NAME) - string(APPEND layer_shader_spv_data "#include \"${NCNN_SHADER_COMP_HEADER_NAME}\"\n") + string(APPEND layer_shader_spv_data "#include \"layer/vulkan/shader/${NCNN_SHADER_COMP_HEADER_NAME}\"\n") get_filename_component(NCNN_SHADER_SRC_NAME_WE ${NCNN_SHADER_SRC} NAME_WE) string(APPEND layer_shader_registry "{${NCNN_SHADER_SRC_NAME_WE}_comp_data,sizeof(${NCNN_SHADER_SRC_NAME_WE}_comp_data)},\n") diff --git a/cmake/ncnn_generate_shader_comp_header.cmake b/cmake/ncnn_generate_shader_comp_header.cmake index a41b6328d8d..79f7c1eff3b 100644 --- a/cmake/ncnn_generate_shader_comp_header.cmake +++ b/cmake/ncnn_generate_shader_comp_header.cmake @@ -18,8 +18,8 @@ string(REGEX REPLACE "\n\n" "\n" comp_data "${comp_data}") get_filename_component(SHADER_SRC_NAME_WE ${SHADER_SRC} NAME_WE) # text to hex -file(WRITE 
${CMAKE_CURRENT_BINARY_DIR}/${SHADER_SRC_NAME_WE}.text2hex.txt "${comp_data}") -file(READ ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_SRC_NAME_WE}.text2hex.txt comp_data_hex HEX) +file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/layer/vulkan/shader/${SHADER_SRC_NAME_WE}.text2hex.txt "${comp_data}") +file(READ ${CMAKE_CURRENT_BINARY_DIR}/layer/vulkan/shader/${SHADER_SRC_NAME_WE}.text2hex.txt comp_data_hex HEX) string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1," comp_data_hex ${comp_data_hex}) string(FIND "${comp_data_hex}" "," tail_comma REVERSE) string(SUBSTRING "${comp_data_hex}" 0 ${tail_comma} comp_data_hex) diff --git a/src/gpu.cpp b/src/gpu.cpp index 447f66bae44..adba869e1e9 100644 --- a/src/gpu.cpp +++ b/src/gpu.cpp @@ -26,7 +26,7 @@ #include "glslang/glslang/Public/ShaderLang.h" #endif -#include "vulkan_activation.comp.hex.h" +#include "layer/vulkan/shader/vulkan_activation.comp.hex.h" #include "command.h" #include "layer.h" diff --git a/src/layer.cpp b/src/layer.cpp index 562576a5493..cf895b4169d 100644 --- a/src/layer.cpp +++ b/src/layer.cpp @@ -18,21 +18,7 @@ #include -#ifdef _MSC_VER -#pragma warning(push) -#pragma warning(disable : 4250) -#endif -#ifdef __clang__ -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Woverloaded-virtual" -#endif #include "layer_declaration.h" -#ifdef __clang__ -#pragma clang diagnostic pop -#endif -#ifdef _MSC_VER -#pragma warning(pop) -#endif namespace ncnn { @@ -221,9 +207,272 @@ Layer* create_layer(const char* type) return create_layer(index); } + +Layer* create_layer_cpu(const char* type) +{ + int index = layer_to_index(type); + if (index == -1) + return 0; + + return create_layer_cpu(index); +} + +#if NCNN_VULKAN +Layer* create_layer_vulkan(const char* type) +{ + int index = layer_to_index(type); + if (index == -1) + return 0; + + return create_layer_vulkan(index); +} +#endif // NCNN_VULKAN #endif // NCNN_STRING +// internal wrapper +class Layer_final : public Layer +{ +public: + Layer* layer_cpu; +#if NCNN_VULKAN + Layer* layer_vulkan; +#endif + + // utility functions for transfer layer properties + void set_layer_properties() + { + layer_cpu->userdata = userdata; + + layer_cpu->bottoms = bottoms; + layer_cpu->tops = tops; + layer_cpu->bottom_shapes = bottom_shapes; + layer_cpu->top_shapes = top_shapes; + layer_cpu->featmask = featmask; + +#if NCNN_VULKAN + if (layer_vulkan) + { + layer_vulkan->vkdev = vkdev; + + layer_vulkan->userdata = userdata; + + layer_vulkan->bottoms = bottoms; + layer_vulkan->tops = tops; + layer_vulkan->bottom_shapes = bottom_shapes; + layer_vulkan->top_shapes = top_shapes; + layer_vulkan->featmask = featmask; + } +#endif + } + + void get_layer_properties() + { + one_blob_only = layer_cpu->one_blob_only; + support_inplace = layer_cpu->support_inplace; + support_packing = layer_cpu->support_packing; + support_bf16_storage = layer_cpu->support_bf16_storage; + support_fp16_storage = layer_cpu->support_fp16_storage; + support_int8_storage = layer_cpu->support_int8_storage; + + support_vulkan = 0; + support_image_storage = 0; + support_tensor_storage = 0; + +#if NCNN_VULKAN + if (layer_vulkan && vkdev) + { + support_vulkan = layer_vulkan->support_vulkan; + support_image_storage = layer_vulkan->support_image_storage; + support_tensor_storage = layer_vulkan->support_tensor_storage; + } +#endif + } + +public: + Layer_final() + { + layer_cpu = 0; +#if NCNN_VULKAN + layer_vulkan = 0; +#endif + } + + ~Layer_final() + { + delete layer_cpu; +#if NCNN_VULKAN + delete layer_vulkan; +#endif + } + + virtual int load_param(const 
ParamDict& pd) + { + set_layer_properties(); + { + int ret = layer_cpu->load_param(pd); + if (ret) + return ret; + } +#if NCNN_VULKAN + if (layer_vulkan && vkdev) + { + int ret = layer_vulkan->load_param(pd); + if (ret) + return ret; + } +#endif // NCNN_VULKAN + get_layer_properties(); + return 0; + } + + virtual int load_model(const ModelBin& mb) + { + { + int ret = layer_cpu->load_model(mb); + if (ret) + return ret; + } +#if NCNN_VULKAN + if (layer_vulkan && vkdev) + { + int ret = layer_vulkan->load_model(mb); + if (ret) + return ret; + } +#endif // NCNN_VULKAN + get_layer_properties(); + return 0; + } + + virtual int create_pipeline(const Option& opt) + { + set_layer_properties(); + { + int ret = layer_cpu->create_pipeline(opt); + if (ret) + return ret; + } +#if NCNN_VULKAN + if (layer_vulkan && vkdev) + { + int ret = layer_vulkan->create_pipeline(opt); + if (ret) + return ret; + } +#endif // NCNN_VULKAN + get_layer_properties(); + return 0; + } + + virtual int destroy_pipeline(const Option& opt) + { + { + int ret = layer_cpu->destroy_pipeline(opt); + if (ret) + return ret; + } +#if NCNN_VULKAN + if (layer_vulkan && vkdev) + { + int ret = layer_vulkan->destroy_pipeline(opt); + if (ret) + return ret; + } +#endif // NCNN_VULKAN + return 0; + } + +public: + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const + { + return layer_cpu->forward(bottom_blobs, top_blobs, opt); + } + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const + { + return layer_cpu->forward(bottom_blob, top_blob, opt); + } + + virtual int forward_inplace(std::vector& bottom_top_blobs, const Option& opt) const + { + return layer_cpu->forward_inplace(bottom_top_blobs, opt); + } + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const + { + return layer_cpu->forward_inplace(bottom_top_blob, opt); + } + +#if NCNN_VULKAN +public: + virtual int upload_model(VkTransfer& cmd, const Option& opt) + { + return layer_vulkan->upload_model(cmd, opt); + } + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const + { + return layer_vulkan->forward(bottom_blobs, top_blobs, cmd, opt); + } + + virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const + { + return layer_vulkan->forward(bottom_blob, top_blob, cmd, opt); + } + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const + { + return layer_vulkan->forward(bottom_blobs, top_blobs, cmd, opt); + } + + virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const + { + return layer_vulkan->forward(bottom_blob, top_blob, cmd, opt); + } + + virtual int forward_inplace(std::vector& bottom_top_blobs, VkCompute& cmd, const Option& opt) const + { + return layer_vulkan->forward_inplace(bottom_top_blobs, cmd, opt); + } + + virtual int forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const + { + return layer_vulkan->forward_inplace(bottom_top_blob, cmd, opt); + } + + virtual int forward_inplace(std::vector& bottom_top_blobs, VkCompute& cmd, const Option& opt) const + { + return layer_vulkan->forward_inplace(bottom_top_blobs, cmd, opt); + } + + virtual int forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const + { + return layer_vulkan->forward_inplace(bottom_top_blob, cmd, opt); + } +#endif // NCNN_VULKAN +}; + 
Layer* create_layer(int index) +{ + Layer* layer_cpu = create_layer_cpu(index); + if (!layer_cpu) + return 0; + + Layer_final* layer_final = new Layer_final; + layer_final->layer_cpu = layer_cpu; + +#if NCNN_VULKAN + layer_final->layer_vulkan = create_layer_vulkan(index); +#endif + + layer_final->typeindex = index; + layer_final->set_layer_properties(); + layer_final->get_layer_properties(); + + return layer_final; +} + +Layer* create_layer_cpu(int index) { if (index < 0 || index >= layer_registry_entry_count) return 0; @@ -293,4 +542,20 @@ Layer* create_layer(int index) return layer; } +#if NCNN_VULKAN +Layer* create_layer_vulkan(int index) +{ + if (index < 0 || index >= layer_registry_entry_count) + return 0; + + layer_creator_func layer_creator = layer_registry_vulkan[index].creator; + if (!layer_creator) + return 0; + + Layer* layer = layer_creator(0); + layer->typeindex = index; + return layer; +} +#endif // NCNN_VULKAN + } // namespace ncnn diff --git a/src/layer.h b/src/layer.h index 573f58cf94a..e04f606145b 100644 --- a/src/layer.h +++ b/src/layer.h @@ -199,9 +199,17 @@ struct overwrite_builtin_layer_registry_entry NCNN_EXPORT int layer_to_index(const char* type); // create layer from type name NCNN_EXPORT Layer* create_layer(const char* type); +NCNN_EXPORT Layer* create_layer_cpu(const char* type); +#if NCNN_VULKAN +NCNN_EXPORT Layer* create_layer_vulkan(const char* type); +#endif // NCNN_VULKAN #endif // NCNN_STRING // create layer from layer type NCNN_EXPORT Layer* create_layer(int index); +NCNN_EXPORT Layer* create_layer_cpu(int index); +#if NCNN_VULKAN +NCNN_EXPORT Layer* create_layer_vulkan(int index); +#endif // NCNN_VULKAN #define DEFINE_LAYER_CREATOR(name) \ ::ncnn::Layer* name##_layer_creator(void* /*userdata*/) \ diff --git a/src/layer/convolution.cpp b/src/layer/convolution.cpp index 4acf91869ae..bf83a69a100 100644 --- a/src/layer/convolution.cpp +++ b/src/layer/convolution.cpp @@ -95,17 +95,9 @@ int Convolution::load_model(const ModelBin& mb) } #endif // NCNN_INT8 - return 0; -} - -int Convolution::create_pipeline(const Option& opt) -{ - if (dynamic_weight) - return 0; - #if NCNN_INT8 // runtime quantize the weight data - if (opt.use_int8_inference && weight_data.elemsize == (size_t)4u && int8_scale_term) + if (weight_data.elemsize == (size_t)4u && int8_scale_term) { const int maxk = kernel_w * kernel_h; const int num_input = weight_data_size / num_output / maxk; @@ -114,7 +106,8 @@ int Convolution::create_pipeline(const Option& opt) Mat weight_data_int8; - Option opt_q = opt; + Option opt_q; + opt_q.num_threads = 1; opt_q.blob_allocator = weight_data.allocator; opt_q.use_packing_layout = false; quantize_to_int8(weight_data_r2, weight_data_int8, weight_data_int8_scales, opt_q); diff --git a/src/layer/convolution.h b/src/layer/convolution.h index 476a7aaf67b..7af0735fd30 100644 --- a/src/layer/convolution.h +++ b/src/layer/convolution.h @@ -28,8 +28,6 @@ class Convolution : public Layer virtual int load_model(const ModelBin& mb); - virtual int create_pipeline(const Option& opt); - virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; diff --git a/src/layer/convolution1d.cpp b/src/layer/convolution1d.cpp index 184b2bdb60d..7d6be1e111e 100644 --- a/src/layer/convolution1d.cpp +++ b/src/layer/convolution1d.cpp @@ -67,14 +67,6 @@ int Convolution1D::load_model(const ModelBin& mb) return 0; } -int Convolution1D::create_pipeline(const Option&) 
-{ - if (dynamic_weight) - return 0; - - return 0; -} - static int convolution1d(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data, const Mat& bias_data, int kernel_w, int stride_w, int dilation_w, int activation_type, const Mat& activation_params, const Option& opt) { const int h = bottom_blob.h; diff --git a/src/layer/convolution1d.h b/src/layer/convolution1d.h index e30807e5c9b..d87099e25f2 100644 --- a/src/layer/convolution1d.h +++ b/src/layer/convolution1d.h @@ -28,8 +28,6 @@ class Convolution1D : public Layer virtual int load_model(const ModelBin& mb); - virtual int create_pipeline(const Option& opt); - virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; diff --git a/src/layer/convolutiondepthwise.cpp b/src/layer/convolutiondepthwise.cpp index e820a192cb3..fb8e1e5c0b2 100644 --- a/src/layer/convolutiondepthwise.cpp +++ b/src/layer/convolutiondepthwise.cpp @@ -124,14 +124,9 @@ int ConvolutionDepthWise::load_model(const ModelBin& mb) } #endif // NCNN_INT8 - return 0; -} - -int ConvolutionDepthWise::create_pipeline(const Option& opt) -{ #if NCNN_INT8 // runtime quantize the weight data - if (opt.use_int8_inference && weight_data.elemsize == (size_t)4u && int8_scale_term) + if (weight_data.elemsize == (size_t)4u && int8_scale_term) { Mat int8_weight_data(weight_data_size, (size_t)1u); if (int8_weight_data.empty()) @@ -141,7 +136,8 @@ int ConvolutionDepthWise::create_pipeline(const Option& opt) for (int g = 0; g < group; g++) { - Option opt_q = opt; + Option opt_q; + opt_q.num_threads = 1; opt_q.blob_allocator = int8_weight_data.allocator; opt_q.use_packing_layout = false; @@ -153,8 +149,6 @@ int ConvolutionDepthWise::create_pipeline(const Option& opt) weight_data = int8_weight_data; } -#else - (void)(opt); #endif // NCNN_INT8 return 0; diff --git a/src/layer/convolutiondepthwise.h b/src/layer/convolutiondepthwise.h index e893aa07fc9..8a955dbd23b 100644 --- a/src/layer/convolutiondepthwise.h +++ b/src/layer/convolutiondepthwise.h @@ -28,8 +28,6 @@ class ConvolutionDepthWise : public Layer virtual int load_model(const ModelBin& mb); - virtual int create_pipeline(const Option& opt); - virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; diff --git a/src/layer/convolutiondepthwise1d.cpp b/src/layer/convolutiondepthwise1d.cpp index 79c83168051..61fddf881f7 100644 --- a/src/layer/convolutiondepthwise1d.cpp +++ b/src/layer/convolutiondepthwise1d.cpp @@ -73,11 +73,6 @@ int ConvolutionDepthWise1D::load_model(const ModelBin& mb) return 0; } -int ConvolutionDepthWise1D::create_pipeline(const Option&) -{ - return 0; -} - static int convolutiondepthwise1d(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data, const Mat& bias_data, int kernel_w, int stride_w, int dilation_w, int group, int activation_type, const Mat& activation_params, const Option& opt) { const int h = bottom_blob.h; diff --git a/src/layer/convolutiondepthwise1d.h b/src/layer/convolutiondepthwise1d.h index e2c195dc489..6026f04981d 100644 --- a/src/layer/convolutiondepthwise1d.h +++ b/src/layer/convolutiondepthwise1d.h @@ -28,8 +28,6 @@ class ConvolutionDepthWise1D : public Layer virtual int load_model(const ModelBin& mb); - virtual int create_pipeline(const Option& opt); - virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& 
opt) const; virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; diff --git a/src/layer/innerproduct.cpp b/src/layer/innerproduct.cpp index 4cc22981c34..9cb422d21b6 100644 --- a/src/layer/innerproduct.cpp +++ b/src/layer/innerproduct.cpp @@ -69,21 +69,17 @@ int InnerProduct::load_model(const ModelBin& mb) } #endif // NCNN_INT8 - return 0; -} - -int InnerProduct::create_pipeline(const Option& opt) -{ #if NCNN_INT8 // runtime quantize the weight data - if (opt.use_int8_inference && weight_data.elemsize == (size_t)4u && int8_scale_term) + if (weight_data.elemsize == (size_t)4u && int8_scale_term) { const int num_input = weight_data_size / num_output; Mat weight_data_r2 = weight_data.reshape(num_input, num_output); Mat weight_data_int8; - Option opt_q = opt; + Option opt_q; + opt_q.num_threads = 1; opt_q.use_packing_layout = false; quantize_to_int8(weight_data_r2, weight_data_int8, weight_data_int8_scales, opt_q); if (weight_data_int8.empty()) @@ -91,8 +87,6 @@ int InnerProduct::create_pipeline(const Option& opt) weight_data = weight_data_int8.reshape(weight_data_size); } -#else - (void)(opt); #endif // NCNN_INT8 return 0; diff --git a/src/layer/innerproduct.h b/src/layer/innerproduct.h index 1f9b3fdc0a5..becf7b1d01a 100644 --- a/src/layer/innerproduct.h +++ b/src/layer/innerproduct.h @@ -28,8 +28,6 @@ class InnerProduct : public Layer virtual int load_model(const ModelBin& mb); - virtual int create_pipeline(const Option& opt); - virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; protected: diff --git a/src/layer/noop.cpp b/src/layer/noop.cpp index 68572b0ba28..a8b42f70e83 100644 --- a/src/layer/noop.cpp +++ b/src/layer/noop.cpp @@ -20,11 +20,9 @@ namespace ncnn { Noop::Noop() { support_inplace = true; - support_vulkan = true; support_packing = true; support_fp16_storage = cpu_support_arm_asimdhp() || cpu_support_riscv_zfh(); support_bf16_storage = true; - support_image_storage = true; } int Noop::forward_inplace(std::vector& /*bottom_top_blobs*/, const Option& /*opt*/) const @@ -32,16 +30,4 @@ int Noop::forward_inplace(std::vector& /*bottom_top_blobs*/, const Option& return 0; } -#if NCNN_VULKAN -int Noop::forward_inplace(std::vector& /*bottom_top_blobs*/, VkCompute& /*cmd*/, const Option& /*opt*/) const -{ - return 0; -} - -int Noop::forward_inplace(std::vector& /*bottom_top_blobs*/, VkCompute& /*cmd*/, const Option& /*opt*/) const -{ - return 0; -} -#endif // NCNN_VULKAN - } // namespace ncnn diff --git a/src/layer/noop.h b/src/layer/noop.h index 1fb7af35c08..75bbdd1a308 100644 --- a/src/layer/noop.h +++ b/src/layer/noop.h @@ -25,11 +25,6 @@ class Noop : public Layer Noop(); virtual int forward_inplace(std::vector& bottom_top_blobs, const Option& opt) const; - -#if NCNN_VULKAN - virtual int forward_inplace(std::vector& bottom_top_blobs, VkCompute& cmd, const Option& opt) const; - virtual int forward_inplace(std::vector& bottom_top_blobs, VkCompute& cmd, const Option& opt) const; -#endif // NCNN_VULKAN }; } // namespace ncnn diff --git a/src/layer/split.cpp b/src/layer/split.cpp index f79fce0f15c..996624dfe7a 100644 --- a/src/layer/split.cpp +++ b/src/layer/split.cpp @@ -21,11 +21,9 @@ Split::Split() { one_blob_only = false; support_inplace = false; - support_vulkan = true; support_packing = true; support_fp16_storage = cpu_support_arm_asimdhp() || cpu_support_riscv_zfh(); support_bf16_storage = true; - support_image_storage = true; } int Split::forward(const std::vector& bottom_blobs, std::vector& top_blobs, 
const Option& /*opt*/) const @@ -39,28 +37,4 @@ int Split::forward(const std::vector& bottom_blobs, std::vector& top_b return 0; } -#if NCNN_VULKAN -int Split::forward(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& /*cmd*/, const Option& /*opt*/) const -{ - const VkMat& bottom_blob = bottom_blobs[0]; - for (size_t i = 0; i < top_blobs.size(); i++) - { - top_blobs[i] = bottom_blob; - } - - return 0; -} - -int Split::forward(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& /*cmd*/, const Option& /*opt*/) const -{ - const VkImageMat& bottom_blob = bottom_blobs[0]; - for (size_t i = 0; i < top_blobs.size(); i++) - { - top_blobs[i] = bottom_blob; - } - - return 0; -} -#endif // NCNN_VULKAN - } // namespace ncnn diff --git a/src/layer/split.h b/src/layer/split.h index 7437866cfc5..53686f82be3 100644 --- a/src/layer/split.h +++ b/src/layer/split.h @@ -25,13 +25,6 @@ class Split : public Layer Split(); virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; - -#if NCNN_VULKAN - virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const; - virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const; -#endif // NCNN_VULKAN - -public: }; } // namespace ncnn diff --git a/src/layer/vulkan/noop_vulkan.cpp b/src/layer/vulkan/noop_vulkan.cpp new file mode 100644 index 00000000000..3a59d2613a3 --- /dev/null +++ b/src/layer/vulkan/noop_vulkan.cpp @@ -0,0 +1,35 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "noop_vulkan.h" + +namespace ncnn { + +Noop_vulkan::Noop_vulkan() +{ + support_vulkan = true; + support_image_storage = true; +} + +int Noop_vulkan::forward_inplace(std::vector& /*bottom_top_blobs*/, VkCompute& /*cmd*/, const Option& /*opt*/) const +{ + return 0; +} + +int Noop_vulkan::forward_inplace(std::vector& /*bottom_top_blobs*/, VkCompute& /*cmd*/, const Option& /*opt*/) const +{ + return 0; +} + +} // namespace ncnn diff --git a/src/layer/vulkan/noop_vulkan.h b/src/layer/vulkan/noop_vulkan.h new file mode 100644 index 00000000000..a26cf626ab6 --- /dev/null +++ b/src/layer/vulkan/noop_vulkan.h @@ -0,0 +1,34 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_NOOP_VULKAN_H +#define LAYER_NOOP_VULKAN_H + +#include "noop.h" + +namespace ncnn { + +class Noop_vulkan : virtual public Noop +{ +public: + Noop_vulkan(); + + using Noop::forward; + virtual int forward_inplace(std::vector& bottom_top_blobs, VkCompute& cmd, const Option& opt) const; + virtual int forward_inplace(std::vector& bottom_top_blobs, VkCompute& cmd, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_NOOP_VULKAN_H diff --git a/src/layer/vulkan/split_vulkan.cpp b/src/layer/vulkan/split_vulkan.cpp new file mode 100644 index 00000000000..791069cc7d7 --- /dev/null +++ b/src/layer/vulkan/split_vulkan.cpp @@ -0,0 +1,47 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "split_vulkan.h" + +namespace ncnn { + +Split_vulkan::Split_vulkan() +{ + support_vulkan = true; + support_image_storage = true; +} + +int Split_vulkan::forward(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& /*cmd*/, const Option& /*opt*/) const +{ + const VkMat& bottom_blob = bottom_blobs[0]; + for (size_t i = 0; i < top_blobs.size(); i++) + { + top_blobs[i] = bottom_blob; + } + + return 0; +} + +int Split_vulkan::forward(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& /*cmd*/, const Option& /*opt*/) const +{ + const VkImageMat& bottom_blob = bottom_blobs[0]; + for (size_t i = 0; i < top_blobs.size(); i++) + { + top_blobs[i] = bottom_blob; + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/vulkan/split_vulkan.h b/src/layer/vulkan/split_vulkan.h new file mode 100644 index 00000000000..b5ace0cb2ce --- /dev/null +++ b/src/layer/vulkan/split_vulkan.h @@ -0,0 +1,34 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#ifndef LAYER_SPLIT_VULKAN_H +#define LAYER_SPLIT_VULKAN_H + +#include "split.h" + +namespace ncnn { + +class Split_vulkan : virtual public Split +{ +public: + Split_vulkan(); + + using Split::forward; + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const; + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_SPLIT_VULKAN_H diff --git a/src/layer_registry.h.in b/src/layer_registry.h.in index 52393b498e6..4b6398d0ced 100644 --- a/src/layer_registry.h.in +++ b/src/layer_registry.h.in @@ -11,11 +11,13 @@ static const layer_registry_entry layer_registry_avx512[] = { @layer_registry_avx512@ }; #endif // NCNN_RUNTIME_CPU && NCNN_AVX512 + #if NCNN_RUNTIME_CPU && NCNN_FMA static const layer_registry_entry layer_registry_fma[] = { @layer_registry_fma@ }; #endif // NCNN_RUNTIME_CPU && NCNN_FMA + #if NCNN_RUNTIME_CPU && NCNN_AVX static const layer_registry_entry layer_registry_avx[] = { @layer_registry_avx@ @@ -45,3 +47,9 @@ static const layer_registry_entry layer_registry_rvv[] = { @layer_registry_rvv@ }; #endif // NCNN_RUNTIME_CPU && NCNN_RVV + +#if NCNN_VULKAN +static const layer_registry_entry layer_registry_vulkan[] = { +@layer_registry_vulkan@ +}; +#endif // NCNN_VULKAN From fdc2226006912c7aea2634a73df02ffe85bc90f0 Mon Sep 17 00:00:00 2001 From: nihuini Date: Wed, 3 Jan 2024 15:01:14 +0800 Subject: [PATCH 02/19] no more virtual public --- cmake/ncnn_add_layer.cmake | 20 ++++++++----- src/layer.cpp | 28 +++++++++++++++++++ src/layer.h | 2 ++ src/layer/arm/absval_arm.h | 2 +- src/layer/arm/batchnorm_arm.h | 2 +- src/layer/arm/bias_arm.h | 2 +- src/layer/arm/binaryop_arm.h | 2 +- src/layer/arm/cast_arm.h | 2 +- src/layer/arm/clip_arm.h | 2 +- src/layer/arm/concat_arm.h | 2 +- src/layer/arm/convolution1d_arm.h | 2 +- src/layer/arm/convolution_arm.h | 2 +- src/layer/arm/convolutiondepthwise_arm.h | 2 +- src/layer/arm/crop_arm.h | 2 +- src/layer/arm/deconvolution_arm.h | 2 +- src/layer/arm/deconvolutiondepthwise_arm.h | 2 +- src/layer/arm/dequantize_arm.h | 2 +- src/layer/arm/dropout_arm.h | 2 +- src/layer/arm/eltwise_arm.h | 2 +- src/layer/arm/flatten_arm.h | 2 +- src/layer/arm/gelu_arm.h | 2 +- src/layer/arm/gemm_arm.h | 2 +- src/layer/arm/gru_arm.h | 2 +- src/layer/arm/hardsigmoid_arm.h | 2 +- src/layer/arm/hardswish_arm.h | 2 +- src/layer/arm/innerproduct_arm.h | 2 +- src/layer/arm/instancenorm_arm.h | 2 +- src/layer/arm/interp_arm.h | 2 +- src/layer/arm/lrn_arm.h | 2 +- src/layer/arm/lstm_arm.h | 2 +- src/layer/arm/matmul_arm.h | 2 +- src/layer/arm/mish_arm.h | 2 +- src/layer/arm/multiheadattention_arm.h | 2 +- src/layer/arm/packing_arm.h | 2 +- src/layer/arm/padding_arm.h | 2 +- src/layer/arm/pixelshuffle_arm.h | 2 +- src/layer/arm/pooling_arm.h | 2 +- src/layer/arm/prelu_arm.h | 2 +- src/layer/arm/quantize_arm.h | 2 +- src/layer/arm/relu_arm.h | 2 +- src/layer/arm/requantize_arm.h | 2 +- src/layer/arm/reshape_arm.h | 2 +- src/layer/arm/rnn_arm.h | 2 +- src/layer/arm/scale_arm.h | 2 +- src/layer/arm/selu_arm.h | 2 +- src/layer/arm/shufflechannel_arm.h | 2 +- src/layer/arm/sigmoid_arm.h | 2 +- src/layer/arm/slice_arm.h | 2 +- src/layer/arm/softmax_arm.h | 2 +- src/layer/arm/swish_arm.h | 2 +- src/layer/arm/tanh_arm.h | 2 +- src/layer/arm/unaryop_arm.h | 2 +- src/layer/loongarch/absval_loongarch.h | 2 +- src/layer/loongarch/batchnorm_loongarch.h | 2 +- src/layer/loongarch/bias_loongarch.h | 2 +- 
src/layer/loongarch/binaryop_loongarch.h | 2 +- src/layer/loongarch/cast_loongarch.h | 2 +- src/layer/loongarch/clip_loongarch.h | 2 +- src/layer/loongarch/concat_loongarch.h | 2 +- src/layer/loongarch/convolution1d_loongarch.h | 2 +- src/layer/loongarch/convolution_loongarch.h | 2 +- .../convolutiondepthwise_loongarch.h | 2 +- src/layer/loongarch/crop_loongarch.h | 2 +- src/layer/loongarch/deconvolution_loongarch.h | 2 +- .../deconvolutiondepthwise_loongarch.h | 2 +- src/layer/loongarch/dequantize_loongarch.h | 2 +- src/layer/loongarch/dropout_loongarch.h | 2 +- src/layer/loongarch/eltwise_loongarch.h | 2 +- src/layer/loongarch/flatten_loongarch.h | 2 +- src/layer/loongarch/hardsigmoid_loongarch.h | 2 +- src/layer/loongarch/hardswish_loongarch.h | 2 +- src/layer/loongarch/innerproduct_loongarch.h | 2 +- src/layer/loongarch/interp_loongarch.h | 2 +- src/layer/loongarch/mish_loongarch.h | 2 +- src/layer/loongarch/packing_loongarch.h | 2 +- src/layer/loongarch/padding_loongarch.h | 2 +- src/layer/loongarch/pooling_loongarch.h | 2 +- src/layer/loongarch/prelu_loongarch.h | 2 +- src/layer/loongarch/quantize_loongarch.h | 2 +- src/layer/loongarch/relu_loongarch.h | 2 +- src/layer/loongarch/requantize_loongarch.h | 2 +- src/layer/loongarch/sigmoid_loongarch.h | 2 +- src/layer/loongarch/slice_loongarch.h | 2 +- src/layer/loongarch/softmax_loongarch.h | 2 +- src/layer/loongarch/swish_loongarch.h | 2 +- src/layer/loongarch/tanh_loongarch.h | 2 +- src/layer/loongarch/unaryop_loongarch.h | 2 +- src/layer/mips/absval_mips.h | 2 +- src/layer/mips/batchnorm_mips.h | 2 +- src/layer/mips/bias_mips.h | 2 +- src/layer/mips/binaryop_mips.h | 2 +- src/layer/mips/cast_mips.h | 2 +- src/layer/mips/clip_mips.h | 2 +- src/layer/mips/concat_mips.h | 2 +- src/layer/mips/convolution1d_mips.h | 2 +- src/layer/mips/convolution_mips.h | 2 +- src/layer/mips/convolutiondepthwise_mips.h | 2 +- src/layer/mips/crop_mips.h | 2 +- src/layer/mips/deconvolution_mips.h | 2 +- src/layer/mips/deconvolutiondepthwise_mips.h | 2 +- src/layer/mips/dequantize_mips.h | 2 +- src/layer/mips/dropout_mips.h | 2 +- src/layer/mips/eltwise_mips.h | 2 +- src/layer/mips/flatten_mips.h | 2 +- src/layer/mips/hardsigmoid_mips.h | 2 +- src/layer/mips/hardswish_mips.h | 2 +- src/layer/mips/innerproduct_mips.h | 2 +- src/layer/mips/interp_mips.h | 2 +- src/layer/mips/mish_mips.h | 2 +- src/layer/mips/packing_mips.h | 2 +- src/layer/mips/padding_mips.h | 2 +- src/layer/mips/pooling_mips.h | 2 +- src/layer/mips/prelu_mips.h | 2 +- src/layer/mips/quantize_mips.h | 2 +- src/layer/mips/relu_mips.h | 2 +- src/layer/mips/requantize_mips.h | 2 +- src/layer/mips/sigmoid_mips.h | 2 +- src/layer/mips/slice_mips.h | 2 +- src/layer/mips/softmax_mips.h | 2 +- src/layer/mips/swish_mips.h | 2 +- src/layer/mips/tanh_mips.h | 2 +- src/layer/mips/unaryop_mips.h | 2 +- src/layer/riscv/absval_riscv.h | 2 +- src/layer/riscv/batchnorm_riscv.h | 2 +- src/layer/riscv/binaryop_riscv.h | 2 +- src/layer/riscv/cast_riscv.h | 2 +- src/layer/riscv/clip_riscv.h | 2 +- src/layer/riscv/concat_riscv.h | 2 +- src/layer/riscv/convolution1d_riscv.h | 2 +- src/layer/riscv/convolution_riscv.h | 2 +- src/layer/riscv/convolutiondepthwise_riscv.h | 2 +- src/layer/riscv/crop_riscv.h | 2 +- src/layer/riscv/deconvolution_riscv.h | 2 +- .../riscv/deconvolutiondepthwise_riscv.h | 2 +- src/layer/riscv/dropout_riscv.h | 2 +- src/layer/riscv/flatten_riscv.h | 2 +- src/layer/riscv/gelu_riscv.h | 2 +- src/layer/riscv/gemm_riscv.h | 2 +- src/layer/riscv/gru_riscv.h | 2 +- 
src/layer/riscv/hardsigmoid_riscv.h | 2 +- src/layer/riscv/hardswish_riscv.h | 2 +- src/layer/riscv/innerproduct_riscv.h | 2 +- src/layer/riscv/instancenorm_riscv.h | 2 +- src/layer/riscv/interp_riscv.h | 2 +- src/layer/riscv/mish_riscv.h | 2 +- src/layer/riscv/packing_riscv.h | 2 +- src/layer/riscv/padding_riscv.h | 2 +- src/layer/riscv/pooling_riscv.h | 2 +- src/layer/riscv/prelu_riscv.h | 2 +- src/layer/riscv/relu_riscv.h | 2 +- src/layer/riscv/selu_riscv.h | 2 +- src/layer/riscv/sigmoid_riscv.h | 2 +- src/layer/riscv/softmax_riscv.h | 2 +- src/layer/riscv/swish_riscv.h | 2 +- src/layer/riscv/tanh_riscv.h | 2 +- src/layer/riscv/unaryop_riscv.h | 2 +- src/layer/vulkan/absval_vulkan.h | 2 +- src/layer/vulkan/batchnorm_vulkan.h | 2 +- src/layer/vulkan/binaryop_vulkan.h | 2 +- src/layer/vulkan/cast_vulkan.h | 2 +- src/layer/vulkan/celu_vulkan.h | 2 +- src/layer/vulkan/clip_vulkan.h | 2 +- src/layer/vulkan/concat_vulkan.h | 2 +- src/layer/vulkan/convolution1d_vulkan.h | 2 +- src/layer/vulkan/convolution_vulkan.h | 2 +- .../vulkan/convolutiondepthwise_vulkan.h | 2 +- src/layer/vulkan/crop_vulkan.h | 2 +- src/layer/vulkan/deconvolution_vulkan.h | 2 +- .../vulkan/deconvolutiondepthwise_vulkan.h | 2 +- src/layer/vulkan/deepcopy_vulkan.h | 2 +- src/layer/vulkan/dropout_vulkan.h | 2 +- src/layer/vulkan/eltwise_vulkan.h | 2 +- src/layer/vulkan/elu_vulkan.h | 2 +- src/layer/vulkan/erf_vulkan.h | 2 +- src/layer/vulkan/flatten_vulkan.h | 2 +- src/layer/vulkan/gelu_vulkan.h | 2 +- src/layer/vulkan/gemm_vulkan.h | 2 +- src/layer/vulkan/hardsigmoid_vulkan.h | 2 +- src/layer/vulkan/hardswish_vulkan.h | 2 +- src/layer/vulkan/innerproduct_vulkan.h | 2 +- src/layer/vulkan/instancenorm_vulkan.h | 2 +- src/layer/vulkan/interp_vulkan.h | 2 +- src/layer/vulkan/lrn_vulkan.h | 2 +- src/layer/vulkan/memorydata_vulkan.h | 2 +- src/layer/vulkan/mish_vulkan.h | 2 +- src/layer/vulkan/multiheadattention_vulkan.h | 2 +- src/layer/vulkan/noop_vulkan.h | 2 +- src/layer/vulkan/normalize_vulkan.h | 2 +- src/layer/vulkan/packing_vulkan.h | 2 +- src/layer/vulkan/padding_vulkan.h | 2 +- src/layer/vulkan/permute_vulkan.h | 2 +- src/layer/vulkan/pixelshuffle_vulkan.h | 2 +- src/layer/vulkan/pooling_vulkan.h | 2 +- src/layer/vulkan/prelu_vulkan.h | 2 +- src/layer/vulkan/priorbox_vulkan.h | 2 +- src/layer/vulkan/relu_vulkan.h | 2 +- src/layer/vulkan/reorg_vulkan.h | 2 +- src/layer/vulkan/reshape_vulkan.h | 2 +- src/layer/vulkan/scale_vulkan.h | 2 +- src/layer/vulkan/shufflechannel_vulkan.h | 2 +- src/layer/vulkan/sigmoid_vulkan.h | 2 +- src/layer/vulkan/slice_vulkan.h | 2 +- src/layer/vulkan/softmax_vulkan.h | 2 +- src/layer/vulkan/split_vulkan.h | 2 +- src/layer/vulkan/swish_vulkan.h | 2 +- src/layer/vulkan/tanh_vulkan.h | 2 +- src/layer/vulkan/unaryop_vulkan.h | 2 +- src/layer/x86/batchnorm_x86.h | 2 +- src/layer/x86/bias_x86.h | 2 +- src/layer/x86/binaryop_x86.h | 2 +- src/layer/x86/bnll_x86.h | 2 +- src/layer/x86/cast_x86.h | 2 +- src/layer/x86/clip_x86.h | 2 +- src/layer/x86/concat_x86.h | 2 +- src/layer/x86/convolution1d_x86.h | 2 +- src/layer/x86/convolution_x86.h | 2 +- src/layer/x86/convolutiondepthwise_x86.h | 2 +- src/layer/x86/crop_x86.h | 2 +- src/layer/x86/deconvolution_x86.h | 2 +- src/layer/x86/deconvolutiondepthwise_x86.h | 2 +- src/layer/x86/deformableconv2d_x86.h | 2 +- src/layer/x86/dequantize_x86.h | 2 +- src/layer/x86/dropout_x86.h | 2 +- src/layer/x86/eltwise_x86.h | 2 +- src/layer/x86/elu_x86.h | 2 +- src/layer/x86/flatten_x86.h | 2 +- src/layer/x86/gelu_x86.h | 2 +- src/layer/x86/gemm_x86.h | 2 +- 
src/layer/x86/gridsample_x86.h | 2 +- src/layer/x86/groupnorm_x86.h | 2 +- src/layer/x86/hardsigmoid_x86.h | 2 +- src/layer/x86/hardswish_x86.h | 2 +- src/layer/x86/innerproduct_x86.h | 2 +- src/layer/x86/interp_x86.h | 2 +- src/layer/x86/layernorm_x86.h | 2 +- src/layer/x86/lrn_x86.h | 2 +- src/layer/x86/lstm_x86.h | 2 +- src/layer/x86/matmul_x86.h | 2 +- src/layer/x86/mish_x86.h | 2 +- src/layer/x86/multiheadattention_x86.h | 2 +- src/layer/x86/packing_x86.h | 2 +- src/layer/x86/padding_x86.h | 2 +- src/layer/x86/pooling_x86.h | 2 +- src/layer/x86/prelu_x86.h | 2 +- src/layer/x86/quantize_x86.h | 2 +- src/layer/x86/relu_x86.h | 2 +- src/layer/x86/requantize_x86.h | 2 +- src/layer/x86/reshape_x86.h | 2 +- src/layer/x86/roialign_x86.h | 2 +- src/layer/x86/scale_x86.h | 2 +- src/layer/x86/selu_x86.h | 2 +- src/layer/x86/shufflechannel_x86.h | 2 +- src/layer/x86/sigmoid_x86.h | 2 +- src/layer/x86/slice_x86.h | 2 +- src/layer/x86/softmax_x86.h | 2 +- src/layer/x86/swish_x86.h | 2 +- src/layer/x86/tanh_x86.h | 2 +- src/layer/x86/unaryop_x86.h | 2 +- src/layer/x86/yolov3detectionoutput_x86.h | 2 +- src/layer_registry.h.in | 4 +++ tests/testutil.h | 12 ++++---- 261 files changed, 309 insertions(+), 269 deletions(-) diff --git a/cmake/ncnn_add_layer.cmake b/cmake/ncnn_add_layer.cmake index 079bdd9f506..4eeedb010c7 100644 --- a/cmake/ncnn_add_layer.cmake +++ b/cmake/ncnn_add_layer.cmake @@ -94,6 +94,13 @@ macro(ncnn_add_layer class) source_group ("sources\\\\layers" FILES "${CMAKE_CURRENT_SOURCE_DIR}/layer/${name}.cpp") endif() + if(WITH_LAYER_${name}_${NCNN_TARGET_ARCH}) + set(layer_declaration "${layer_declaration}#include \"layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}.h\"\n") + set(layer_declaration "${layer_declaration}namespace ncnn { DEFINE_LAYER_CREATOR(${class}_${NCNN_TARGET_ARCH}) }\n") + + source_group ("sources\\\\layers\\\\${NCNN_TARGET_ARCH}" FILES "${CMAKE_CURRENT_SOURCE_DIR}/layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}.cpp") + endif() + if(WITH_LAYER_${name}_vulkan) set(layer_declaration "${layer_declaration}#include \"layer/vulkan/${name}_vulkan.h\"\n") set(layer_declaration "${layer_declaration}namespace ncnn { DEFINE_LAYER_CREATOR(${class}_vulkan) }\n") @@ -108,19 +115,18 @@ macro(ncnn_add_layer class) source_group ("sources\\\\layers\\\\vulkan" FILES "${CMAKE_CURRENT_SOURCE_DIR}/layer/vulkan/${name}_vulkan.cpp") endif() - if(WITH_LAYER_${name}_${NCNN_TARGET_ARCH}) - set(layer_declaration "${layer_declaration}#include \"layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}.h\"\n") - set(layer_declaration "${layer_declaration}namespace ncnn { DEFINE_LAYER_CREATOR(${class}_${NCNN_TARGET_ARCH}) }\n") - - source_group ("sources\\\\layers\\\\${NCNN_TARGET_ARCH}" FILES "${CMAKE_CURRENT_SOURCE_DIR}/layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}.cpp") - endif() - if(WITH_LAYER_${name}) set(layer_registry "${layer_registry}#if NCNN_STRING\n{\"${class}\", ${class}_layer_creator},\n#else\n{${class}_layer_creator},\n#endif\n") else() set(layer_registry "${layer_registry}#if NCNN_STRING\n{\"${class}\", 0},\n#else\n{0},\n#endif\n") endif() + if(WITH_LAYER_${name}_${NCNN_TARGET_ARCH}) + set(layer_registry_arch "${layer_registry_arch}#if NCNN_STRING\n{\"${class}\", ${class}_${NCNN_TARGET_ARCH}_layer_creator},\n#else\n{${class}_${NCNN_TARGET_ARCH}_layer_creator},\n#endif\n") + else() + set(layer_registry_arch "${layer_registry_arch}#if NCNN_STRING\n{\"${class}\", 0},\n#else\n{0},\n#endif\n") + endif() + if(WITH_LAYER_${name}_vulkan) set(layer_registry_vulkan 
"${layer_registry_vulkan}#if NCNN_STRING\n{\"${class}\", ${class}_vulkan_layer_creator},\n#else\n{${class}_vulkan_layer_creator},\n#endif\n") else() diff --git a/src/layer.cpp b/src/layer.cpp index cf895b4169d..ed28091fafa 100644 --- a/src/layer.cpp +++ b/src/layer.cpp @@ -208,6 +208,15 @@ Layer* create_layer(const char* type) return create_layer(index); } +Layer* create_layer_naive(const char* type) +{ + int index = layer_to_index(type); + if (index == -1) + return 0; + + return create_layer_naive(index); +} + Layer* create_layer_cpu(const char* type) { int index = layer_to_index(type); @@ -472,6 +481,20 @@ Layer* create_layer(int index) return layer_final; } +Layer* create_layer_naive(int index) +{ + if (index < 0 || index >= layer_registry_entry_count) + return 0; + + layer_creator_func layer_creator = layer_registry[index].creator; + if (!layer_creator) + return 0; + + Layer* layer = layer_creator(0); + layer->typeindex = index; + return layer; +} + Layer* create_layer_cpu(int index) { if (index < 0 || index >= layer_registry_entry_count) @@ -529,6 +552,11 @@ Layer* create_layer_cpu(int index) } else #endif // NCNN_RUNTIME_CPU && NCNN_RVV + { + layer_creator = layer_registry_arch[index].creator; + } + + if (!layer_creator) { layer_creator = layer_registry[index].creator; } diff --git a/src/layer.h b/src/layer.h index e04f606145b..d44713de451 100644 --- a/src/layer.h +++ b/src/layer.h @@ -199,6 +199,7 @@ struct overwrite_builtin_layer_registry_entry NCNN_EXPORT int layer_to_index(const char* type); // create layer from type name NCNN_EXPORT Layer* create_layer(const char* type); +NCNN_EXPORT Layer* create_layer_naive(const char* type); NCNN_EXPORT Layer* create_layer_cpu(const char* type); #if NCNN_VULKAN NCNN_EXPORT Layer* create_layer_vulkan(const char* type); @@ -206,6 +207,7 @@ NCNN_EXPORT Layer* create_layer_vulkan(const char* type); #endif // NCNN_STRING // create layer from layer type NCNN_EXPORT Layer* create_layer(int index); +NCNN_EXPORT Layer* create_layer_naive(int index); NCNN_EXPORT Layer* create_layer_cpu(int index); #if NCNN_VULKAN NCNN_EXPORT Layer* create_layer_vulkan(int index); diff --git a/src/layer/arm/absval_arm.h b/src/layer/arm/absval_arm.h index 7e6f150e7f1..c1cea1dfb9e 100644 --- a/src/layer/arm/absval_arm.h +++ b/src/layer/arm/absval_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class AbsVal_arm : virtual public AbsVal +class AbsVal_arm : public AbsVal { public: AbsVal_arm(); diff --git a/src/layer/arm/batchnorm_arm.h b/src/layer/arm/batchnorm_arm.h index 9be82439cb4..1393bb30e12 100644 --- a/src/layer/arm/batchnorm_arm.h +++ b/src/layer/arm/batchnorm_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class BatchNorm_arm : virtual public BatchNorm +class BatchNorm_arm : public BatchNorm { public: BatchNorm_arm(); diff --git a/src/layer/arm/bias_arm.h b/src/layer/arm/bias_arm.h index a3b61cd300d..5f08facf17e 100644 --- a/src/layer/arm/bias_arm.h +++ b/src/layer/arm/bias_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class Bias_arm : virtual public Bias +class Bias_arm : public Bias { public: virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; diff --git a/src/layer/arm/binaryop_arm.h b/src/layer/arm/binaryop_arm.h index 6bb950495ce..1337065eb40 100644 --- a/src/layer/arm/binaryop_arm.h +++ b/src/layer/arm/binaryop_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class BinaryOp_arm : virtual public BinaryOp +class BinaryOp_arm : public BinaryOp { public: BinaryOp_arm(); diff --git a/src/layer/arm/cast_arm.h b/src/layer/arm/cast_arm.h index 
190090a859a..fc32c70d3dd 100644 --- a/src/layer/arm/cast_arm.h +++ b/src/layer/arm/cast_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class Cast_arm : virtual public Cast +class Cast_arm : public Cast { public: Cast_arm(); diff --git a/src/layer/arm/clip_arm.h b/src/layer/arm/clip_arm.h index 8af695172e1..ef281d249e7 100644 --- a/src/layer/arm/clip_arm.h +++ b/src/layer/arm/clip_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class Clip_arm : virtual public Clip +class Clip_arm : public Clip { public: Clip_arm(); diff --git a/src/layer/arm/concat_arm.h b/src/layer/arm/concat_arm.h index c09dfa27568..9491a280110 100644 --- a/src/layer/arm/concat_arm.h +++ b/src/layer/arm/concat_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class Concat_arm : virtual public Concat +class Concat_arm : public Concat { public: Concat_arm(); diff --git a/src/layer/arm/convolution1d_arm.h b/src/layer/arm/convolution1d_arm.h index 83e0ea83809..48babb914d2 100644 --- a/src/layer/arm/convolution1d_arm.h +++ b/src/layer/arm/convolution1d_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class Convolution1D_arm : virtual public Convolution1D +class Convolution1D_arm : public Convolution1D { public: Convolution1D_arm(); diff --git a/src/layer/arm/convolution_arm.h b/src/layer/arm/convolution_arm.h index b70b339f046..e25f509c711 100644 --- a/src/layer/arm/convolution_arm.h +++ b/src/layer/arm/convolution_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class Convolution_arm : virtual public Convolution +class Convolution_arm : public Convolution { public: Convolution_arm(); diff --git a/src/layer/arm/convolutiondepthwise_arm.h b/src/layer/arm/convolutiondepthwise_arm.h index 412590f101e..8536c081320 100644 --- a/src/layer/arm/convolutiondepthwise_arm.h +++ b/src/layer/arm/convolutiondepthwise_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class ConvolutionDepthWise_arm : virtual public ConvolutionDepthWise +class ConvolutionDepthWise_arm : public ConvolutionDepthWise { public: ConvolutionDepthWise_arm(); diff --git a/src/layer/arm/crop_arm.h b/src/layer/arm/crop_arm.h index e3f6d5109a3..9f2bea6e1bd 100644 --- a/src/layer/arm/crop_arm.h +++ b/src/layer/arm/crop_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class Crop_arm : virtual public Crop +class Crop_arm : public Crop { public: Crop_arm(); diff --git a/src/layer/arm/deconvolution_arm.h b/src/layer/arm/deconvolution_arm.h index 3c7979687cb..b4cdcbe0ee9 100644 --- a/src/layer/arm/deconvolution_arm.h +++ b/src/layer/arm/deconvolution_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class Deconvolution_arm : virtual public Deconvolution +class Deconvolution_arm : public Deconvolution { public: Deconvolution_arm(); diff --git a/src/layer/arm/deconvolutiondepthwise_arm.h b/src/layer/arm/deconvolutiondepthwise_arm.h index 6eff45ede3a..a7ef393dd25 100644 --- a/src/layer/arm/deconvolutiondepthwise_arm.h +++ b/src/layer/arm/deconvolutiondepthwise_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class DeconvolutionDepthWise_arm : virtual public DeconvolutionDepthWise +class DeconvolutionDepthWise_arm : public DeconvolutionDepthWise { public: DeconvolutionDepthWise_arm(); diff --git a/src/layer/arm/dequantize_arm.h b/src/layer/arm/dequantize_arm.h index 5bba8de7fdd..677c731db69 100644 --- a/src/layer/arm/dequantize_arm.h +++ b/src/layer/arm/dequantize_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class Dequantize_arm : virtual public Dequantize +class Dequantize_arm : public Dequantize { public: Dequantize_arm(); diff --git a/src/layer/arm/dropout_arm.h b/src/layer/arm/dropout_arm.h index 395c5a9d02c..9a970525aae 100644 --- 
a/src/layer/arm/dropout_arm.h +++ b/src/layer/arm/dropout_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class Dropout_arm : virtual public Dropout +class Dropout_arm : public Dropout { public: Dropout_arm(); diff --git a/src/layer/arm/eltwise_arm.h b/src/layer/arm/eltwise_arm.h index 5480f2293ce..6bd91f5dab5 100644 --- a/src/layer/arm/eltwise_arm.h +++ b/src/layer/arm/eltwise_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class Eltwise_arm : virtual public Eltwise +class Eltwise_arm : public Eltwise { public: Eltwise_arm(); diff --git a/src/layer/arm/flatten_arm.h b/src/layer/arm/flatten_arm.h index 92932ba7744..9bc9a0d1b99 100644 --- a/src/layer/arm/flatten_arm.h +++ b/src/layer/arm/flatten_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class Flatten_arm : virtual public Flatten +class Flatten_arm : public Flatten { public: Flatten_arm(); diff --git a/src/layer/arm/gelu_arm.h b/src/layer/arm/gelu_arm.h index 283f063bb69..5be9fc4d6d5 100644 --- a/src/layer/arm/gelu_arm.h +++ b/src/layer/arm/gelu_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class GELU_arm : virtual public GELU +class GELU_arm : public GELU { public: GELU_arm(); diff --git a/src/layer/arm/gemm_arm.h b/src/layer/arm/gemm_arm.h index e4e4b81f2ee..0c1eab108ba 100644 --- a/src/layer/arm/gemm_arm.h +++ b/src/layer/arm/gemm_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class Gemm_arm : virtual public Gemm +class Gemm_arm : public Gemm { public: Gemm_arm(); diff --git a/src/layer/arm/gru_arm.h b/src/layer/arm/gru_arm.h index e1e8fbb08fd..6eae1656b01 100644 --- a/src/layer/arm/gru_arm.h +++ b/src/layer/arm/gru_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class GRU_arm : virtual public GRU +class GRU_arm : public GRU { public: GRU_arm(); diff --git a/src/layer/arm/hardsigmoid_arm.h b/src/layer/arm/hardsigmoid_arm.h index bfa04828ac6..13783ff1690 100644 --- a/src/layer/arm/hardsigmoid_arm.h +++ b/src/layer/arm/hardsigmoid_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class HardSigmoid_arm : virtual public HardSigmoid +class HardSigmoid_arm : public HardSigmoid { public: HardSigmoid_arm(); diff --git a/src/layer/arm/hardswish_arm.h b/src/layer/arm/hardswish_arm.h index 7309ba6c71f..a534ceb1677 100644 --- a/src/layer/arm/hardswish_arm.h +++ b/src/layer/arm/hardswish_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class HardSwish_arm : virtual public HardSwish +class HardSwish_arm : public HardSwish { public: HardSwish_arm(); diff --git a/src/layer/arm/innerproduct_arm.h b/src/layer/arm/innerproduct_arm.h index f1eee178f9c..70a54533151 100644 --- a/src/layer/arm/innerproduct_arm.h +++ b/src/layer/arm/innerproduct_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class InnerProduct_arm : virtual public InnerProduct +class InnerProduct_arm : public InnerProduct { public: InnerProduct_arm(); diff --git a/src/layer/arm/instancenorm_arm.h b/src/layer/arm/instancenorm_arm.h index 102c49fe2b0..98dec71ac48 100644 --- a/src/layer/arm/instancenorm_arm.h +++ b/src/layer/arm/instancenorm_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class InstanceNorm_arm : virtual public InstanceNorm +class InstanceNorm_arm : public InstanceNorm { public: InstanceNorm_arm(); diff --git a/src/layer/arm/interp_arm.h b/src/layer/arm/interp_arm.h index 5ea9873ae78..6c15c73801b 100644 --- a/src/layer/arm/interp_arm.h +++ b/src/layer/arm/interp_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class Interp_arm : virtual public Interp +class Interp_arm : public Interp { public: Interp_arm(); diff --git a/src/layer/arm/lrn_arm.h b/src/layer/arm/lrn_arm.h index db9a04e0adb..f2c43ba08f2 100644 --- a/src/layer/arm/lrn_arm.h +++ 
b/src/layer/arm/lrn_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class LRN_arm : virtual public LRN +class LRN_arm : public LRN { public: virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; diff --git a/src/layer/arm/lstm_arm.h b/src/layer/arm/lstm_arm.h index a42dff28823..b5ee1092a52 100644 --- a/src/layer/arm/lstm_arm.h +++ b/src/layer/arm/lstm_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class LSTM_arm : virtual public LSTM +class LSTM_arm : public LSTM { public: LSTM_arm(); diff --git a/src/layer/arm/matmul_arm.h b/src/layer/arm/matmul_arm.h index 4d4784ce50d..a4537300d5a 100644 --- a/src/layer/arm/matmul_arm.h +++ b/src/layer/arm/matmul_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class MatMul_arm : virtual public MatMul +class MatMul_arm : public MatMul { public: MatMul_arm(); diff --git a/src/layer/arm/mish_arm.h b/src/layer/arm/mish_arm.h index 708611589f4..9f99a7a1200 100644 --- a/src/layer/arm/mish_arm.h +++ b/src/layer/arm/mish_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class Mish_arm : virtual public Mish +class Mish_arm : public Mish { public: Mish_arm(); diff --git a/src/layer/arm/multiheadattention_arm.h b/src/layer/arm/multiheadattention_arm.h index fb1010b1b01..f1b721f22ea 100644 --- a/src/layer/arm/multiheadattention_arm.h +++ b/src/layer/arm/multiheadattention_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class MultiHeadAttention_arm : virtual public MultiHeadAttention +class MultiHeadAttention_arm : public MultiHeadAttention { public: MultiHeadAttention_arm(); diff --git a/src/layer/arm/packing_arm.h b/src/layer/arm/packing_arm.h index 20cb04ac5f3..17c64854058 100644 --- a/src/layer/arm/packing_arm.h +++ b/src/layer/arm/packing_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class Packing_arm : virtual public Packing +class Packing_arm : public Packing { public: Packing_arm(); diff --git a/src/layer/arm/padding_arm.h b/src/layer/arm/padding_arm.h index 81156fcd831..164cfe4c33a 100644 --- a/src/layer/arm/padding_arm.h +++ b/src/layer/arm/padding_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class Padding_arm : virtual public Padding +class Padding_arm : public Padding { public: Padding_arm(); diff --git a/src/layer/arm/pixelshuffle_arm.h b/src/layer/arm/pixelshuffle_arm.h index c40d67ddec8..a2d714c9ebb 100644 --- a/src/layer/arm/pixelshuffle_arm.h +++ b/src/layer/arm/pixelshuffle_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class PixelShuffle_arm : virtual public PixelShuffle +class PixelShuffle_arm : public PixelShuffle { public: PixelShuffle_arm(); diff --git a/src/layer/arm/pooling_arm.h b/src/layer/arm/pooling_arm.h index 0193faa6a87..ead9270c717 100644 --- a/src/layer/arm/pooling_arm.h +++ b/src/layer/arm/pooling_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class Pooling_arm : virtual public Pooling +class Pooling_arm : public Pooling { public: Pooling_arm(); diff --git a/src/layer/arm/prelu_arm.h b/src/layer/arm/prelu_arm.h index e65801a3be0..9354be7440b 100644 --- a/src/layer/arm/prelu_arm.h +++ b/src/layer/arm/prelu_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class PReLU_arm : virtual public PReLU +class PReLU_arm : public PReLU { public: PReLU_arm(); diff --git a/src/layer/arm/quantize_arm.h b/src/layer/arm/quantize_arm.h index 3ed271ca7fe..60a716198cb 100644 --- a/src/layer/arm/quantize_arm.h +++ b/src/layer/arm/quantize_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class Quantize_arm : virtual public Quantize +class Quantize_arm : public Quantize { public: Quantize_arm(); diff --git a/src/layer/arm/relu_arm.h b/src/layer/arm/relu_arm.h index 77bda6ac5b5..c2212513a42 
100644 --- a/src/layer/arm/relu_arm.h +++ b/src/layer/arm/relu_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class ReLU_arm : virtual public ReLU +class ReLU_arm : public ReLU { public: ReLU_arm(); diff --git a/src/layer/arm/requantize_arm.h b/src/layer/arm/requantize_arm.h index e7093a7e4c1..c6fc184a018 100644 --- a/src/layer/arm/requantize_arm.h +++ b/src/layer/arm/requantize_arm.h @@ -20,7 +20,7 @@ namespace ncnn { -class Requantize_arm : virtual public Requantize +class Requantize_arm : public Requantize { public: Requantize_arm(); diff --git a/src/layer/arm/reshape_arm.h b/src/layer/arm/reshape_arm.h index 7a2474b7cb5..85466ecfd68 100644 --- a/src/layer/arm/reshape_arm.h +++ b/src/layer/arm/reshape_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class Reshape_arm : virtual public Reshape +class Reshape_arm : public Reshape { public: Reshape_arm(); diff --git a/src/layer/arm/rnn_arm.h b/src/layer/arm/rnn_arm.h index 5defad4cf08..18e75642b9e 100644 --- a/src/layer/arm/rnn_arm.h +++ b/src/layer/arm/rnn_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class RNN_arm : virtual public RNN +class RNN_arm : public RNN { public: RNN_arm(); diff --git a/src/layer/arm/scale_arm.h b/src/layer/arm/scale_arm.h index c327376d17e..c540cdd62ed 100644 --- a/src/layer/arm/scale_arm.h +++ b/src/layer/arm/scale_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class Scale_arm : virtual public Scale +class Scale_arm : public Scale { public: Scale_arm(); diff --git a/src/layer/arm/selu_arm.h b/src/layer/arm/selu_arm.h index ad0bdf2f955..d951804db68 100644 --- a/src/layer/arm/selu_arm.h +++ b/src/layer/arm/selu_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class SELU_arm : virtual public SELU +class SELU_arm : public SELU { public: virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; diff --git a/src/layer/arm/shufflechannel_arm.h b/src/layer/arm/shufflechannel_arm.h index f7a32ac4ab7..dcdbf760bb3 100644 --- a/src/layer/arm/shufflechannel_arm.h +++ b/src/layer/arm/shufflechannel_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class ShuffleChannel_arm : virtual public ShuffleChannel +class ShuffleChannel_arm : public ShuffleChannel { public: ShuffleChannel_arm(); diff --git a/src/layer/arm/sigmoid_arm.h b/src/layer/arm/sigmoid_arm.h index f532a44d6f5..4c3901abbe9 100644 --- a/src/layer/arm/sigmoid_arm.h +++ b/src/layer/arm/sigmoid_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class Sigmoid_arm : virtual public Sigmoid +class Sigmoid_arm : public Sigmoid { public: Sigmoid_arm(); diff --git a/src/layer/arm/slice_arm.h b/src/layer/arm/slice_arm.h index 50da56743b2..c3b558b9e1d 100644 --- a/src/layer/arm/slice_arm.h +++ b/src/layer/arm/slice_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class Slice_arm : virtual public Slice +class Slice_arm : public Slice { public: Slice_arm(); diff --git a/src/layer/arm/softmax_arm.h b/src/layer/arm/softmax_arm.h index fced6398c54..78c540845b0 100644 --- a/src/layer/arm/softmax_arm.h +++ b/src/layer/arm/softmax_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class Softmax_arm : virtual public Softmax +class Softmax_arm : public Softmax { public: Softmax_arm(); diff --git a/src/layer/arm/swish_arm.h b/src/layer/arm/swish_arm.h index ac24757c397..907d79708ab 100644 --- a/src/layer/arm/swish_arm.h +++ b/src/layer/arm/swish_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class Swish_arm : virtual public Swish +class Swish_arm : public Swish { public: Swish_arm(); diff --git a/src/layer/arm/tanh_arm.h b/src/layer/arm/tanh_arm.h index e019b32ec4f..db62f117a56 100644 --- a/src/layer/arm/tanh_arm.h +++ 
b/src/layer/arm/tanh_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class TanH_arm : virtual public TanH +class TanH_arm : public TanH { public: TanH_arm(); diff --git a/src/layer/arm/unaryop_arm.h b/src/layer/arm/unaryop_arm.h index 66994eb2103..ab4b23c05f1 100644 --- a/src/layer/arm/unaryop_arm.h +++ b/src/layer/arm/unaryop_arm.h @@ -19,7 +19,7 @@ namespace ncnn { -class UnaryOp_arm : virtual public UnaryOp +class UnaryOp_arm : public UnaryOp { public: UnaryOp_arm(); diff --git a/src/layer/loongarch/absval_loongarch.h b/src/layer/loongarch/absval_loongarch.h index 0a3143cea43..855f959cf00 100644 --- a/src/layer/loongarch/absval_loongarch.h +++ b/src/layer/loongarch/absval_loongarch.h @@ -19,7 +19,7 @@ namespace ncnn { -class AbsVal_loongarch : virtual public AbsVal +class AbsVal_loongarch : public AbsVal { public: AbsVal_loongarch(); diff --git a/src/layer/loongarch/batchnorm_loongarch.h b/src/layer/loongarch/batchnorm_loongarch.h index 8b38d5e1f66..fb477a9aedb 100644 --- a/src/layer/loongarch/batchnorm_loongarch.h +++ b/src/layer/loongarch/batchnorm_loongarch.h @@ -19,7 +19,7 @@ namespace ncnn { -class BatchNorm_loongarch : virtual public BatchNorm +class BatchNorm_loongarch : public BatchNorm { public: BatchNorm_loongarch(); diff --git a/src/layer/loongarch/bias_loongarch.h b/src/layer/loongarch/bias_loongarch.h index f122ffa0dd9..35824997487 100644 --- a/src/layer/loongarch/bias_loongarch.h +++ b/src/layer/loongarch/bias_loongarch.h @@ -19,7 +19,7 @@ namespace ncnn { -class Bias_loongarch : virtual public Bias +class Bias_loongarch : public Bias { public: virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; diff --git a/src/layer/loongarch/binaryop_loongarch.h b/src/layer/loongarch/binaryop_loongarch.h index bcf9ef5442f..2fc401ad610 100644 --- a/src/layer/loongarch/binaryop_loongarch.h +++ b/src/layer/loongarch/binaryop_loongarch.h @@ -19,7 +19,7 @@ namespace ncnn { -class BinaryOp_loongarch : virtual public BinaryOp +class BinaryOp_loongarch : public BinaryOp { public: BinaryOp_loongarch(); diff --git a/src/layer/loongarch/cast_loongarch.h b/src/layer/loongarch/cast_loongarch.h index 1fe75c687d8..8925f242ed5 100644 --- a/src/layer/loongarch/cast_loongarch.h +++ b/src/layer/loongarch/cast_loongarch.h @@ -19,7 +19,7 @@ namespace ncnn { -class Cast_loongarch : virtual public Cast +class Cast_loongarch : public Cast { public: Cast_loongarch(); diff --git a/src/layer/loongarch/clip_loongarch.h b/src/layer/loongarch/clip_loongarch.h index 43df62035ff..1ebeee2aeac 100644 --- a/src/layer/loongarch/clip_loongarch.h +++ b/src/layer/loongarch/clip_loongarch.h @@ -19,7 +19,7 @@ namespace ncnn { -class Clip_loongarch : virtual public Clip +class Clip_loongarch : public Clip { public: Clip_loongarch(); diff --git a/src/layer/loongarch/concat_loongarch.h b/src/layer/loongarch/concat_loongarch.h index 934c85244df..91b32ef2faf 100644 --- a/src/layer/loongarch/concat_loongarch.h +++ b/src/layer/loongarch/concat_loongarch.h @@ -19,7 +19,7 @@ namespace ncnn { -class Concat_loongarch : virtual public Concat +class Concat_loongarch : public Concat { public: Concat_loongarch(); diff --git a/src/layer/loongarch/convolution1d_loongarch.h b/src/layer/loongarch/convolution1d_loongarch.h index 36393df4568..922fae598f4 100644 --- a/src/layer/loongarch/convolution1d_loongarch.h +++ b/src/layer/loongarch/convolution1d_loongarch.h @@ -19,7 +19,7 @@ namespace ncnn { -class Convolution1D_loongarch : virtual public Convolution1D +class Convolution1D_loongarch : public Convolution1D { public: 
Convolution1D_loongarch(); diff --git a/src/layer/loongarch/convolution_loongarch.h b/src/layer/loongarch/convolution_loongarch.h index a84281bf713..7807f43f9f1 100644 --- a/src/layer/loongarch/convolution_loongarch.h +++ b/src/layer/loongarch/convolution_loongarch.h @@ -19,7 +19,7 @@ namespace ncnn { -class Convolution_loongarch : virtual public Convolution +class Convolution_loongarch : public Convolution { public: Convolution_loongarch(); diff --git a/src/layer/loongarch/convolutiondepthwise_loongarch.h b/src/layer/loongarch/convolutiondepthwise_loongarch.h index 554fe764304..35cdd8f008d 100644 --- a/src/layer/loongarch/convolutiondepthwise_loongarch.h +++ b/src/layer/loongarch/convolutiondepthwise_loongarch.h @@ -19,7 +19,7 @@ namespace ncnn { -class ConvolutionDepthWise_loongarch : virtual public ConvolutionDepthWise +class ConvolutionDepthWise_loongarch : public ConvolutionDepthWise { public: ConvolutionDepthWise_loongarch(); diff --git a/src/layer/loongarch/crop_loongarch.h b/src/layer/loongarch/crop_loongarch.h index 0ba460256d6..cfb4ff352ba 100644 --- a/src/layer/loongarch/crop_loongarch.h +++ b/src/layer/loongarch/crop_loongarch.h @@ -19,7 +19,7 @@ namespace ncnn { -class Crop_loongarch : virtual public Crop +class Crop_loongarch : public Crop { public: Crop_loongarch(); diff --git a/src/layer/loongarch/deconvolution_loongarch.h b/src/layer/loongarch/deconvolution_loongarch.h index f67b5d7e4e1..00ddf67e05b 100644 --- a/src/layer/loongarch/deconvolution_loongarch.h +++ b/src/layer/loongarch/deconvolution_loongarch.h @@ -19,7 +19,7 @@ namespace ncnn { -class Deconvolution_loongarch : virtual public Deconvolution +class Deconvolution_loongarch : public Deconvolution { public: Deconvolution_loongarch(); diff --git a/src/layer/loongarch/deconvolutiondepthwise_loongarch.h b/src/layer/loongarch/deconvolutiondepthwise_loongarch.h index b710f07ecf3..87c5351fab4 100644 --- a/src/layer/loongarch/deconvolutiondepthwise_loongarch.h +++ b/src/layer/loongarch/deconvolutiondepthwise_loongarch.h @@ -19,7 +19,7 @@ namespace ncnn { -class DeconvolutionDepthWise_loongarch : virtual public DeconvolutionDepthWise +class DeconvolutionDepthWise_loongarch : public DeconvolutionDepthWise { public: DeconvolutionDepthWise_loongarch(); diff --git a/src/layer/loongarch/dequantize_loongarch.h b/src/layer/loongarch/dequantize_loongarch.h index 61a408d5c50..ae7d3fe6479 100644 --- a/src/layer/loongarch/dequantize_loongarch.h +++ b/src/layer/loongarch/dequantize_loongarch.h @@ -19,7 +19,7 @@ namespace ncnn { -class Dequantize_loongarch : virtual public Dequantize +class Dequantize_loongarch : public Dequantize { public: Dequantize_loongarch(); diff --git a/src/layer/loongarch/dropout_loongarch.h b/src/layer/loongarch/dropout_loongarch.h index 42810050677..f9beff05034 100644 --- a/src/layer/loongarch/dropout_loongarch.h +++ b/src/layer/loongarch/dropout_loongarch.h @@ -19,7 +19,7 @@ namespace ncnn { -class Dropout_loongarch : virtual public Dropout +class Dropout_loongarch : public Dropout { public: Dropout_loongarch(); diff --git a/src/layer/loongarch/eltwise_loongarch.h b/src/layer/loongarch/eltwise_loongarch.h index f9715b20cad..f523132bb5f 100644 --- a/src/layer/loongarch/eltwise_loongarch.h +++ b/src/layer/loongarch/eltwise_loongarch.h @@ -19,7 +19,7 @@ namespace ncnn { -class Eltwise_loongarch : virtual public Eltwise +class Eltwise_loongarch : public Eltwise { public: Eltwise_loongarch(); diff --git a/src/layer/loongarch/flatten_loongarch.h b/src/layer/loongarch/flatten_loongarch.h index 
afd35c701f5..da75fd12f3f 100644 --- a/src/layer/loongarch/flatten_loongarch.h +++ b/src/layer/loongarch/flatten_loongarch.h @@ -19,7 +19,7 @@ namespace ncnn { -class Flatten_loongarch : virtual public Flatten +class Flatten_loongarch : public Flatten { public: Flatten_loongarch(); diff --git a/src/layer/loongarch/hardsigmoid_loongarch.h b/src/layer/loongarch/hardsigmoid_loongarch.h index 755ae89ff03..519a4ba9594 100644 --- a/src/layer/loongarch/hardsigmoid_loongarch.h +++ b/src/layer/loongarch/hardsigmoid_loongarch.h @@ -19,7 +19,7 @@ namespace ncnn { -class HardSigmoid_loongarch : virtual public HardSigmoid +class HardSigmoid_loongarch : public HardSigmoid { public: HardSigmoid_loongarch(); diff --git a/src/layer/loongarch/hardswish_loongarch.h b/src/layer/loongarch/hardswish_loongarch.h index e9b0821245c..ef69cb05417 100644 --- a/src/layer/loongarch/hardswish_loongarch.h +++ b/src/layer/loongarch/hardswish_loongarch.h @@ -19,7 +19,7 @@ namespace ncnn { -class HardSwish_loongarch : virtual public HardSwish +class HardSwish_loongarch : public HardSwish { public: HardSwish_loongarch(); diff --git a/src/layer/loongarch/innerproduct_loongarch.h b/src/layer/loongarch/innerproduct_loongarch.h index 4d9574ce919..2ae1a1e57e0 100644 --- a/src/layer/loongarch/innerproduct_loongarch.h +++ b/src/layer/loongarch/innerproduct_loongarch.h @@ -19,7 +19,7 @@ namespace ncnn { -class InnerProduct_loongarch : virtual public InnerProduct +class InnerProduct_loongarch : public InnerProduct { public: InnerProduct_loongarch(); diff --git a/src/layer/loongarch/interp_loongarch.h b/src/layer/loongarch/interp_loongarch.h index 4c0e0f3dc86..f1fa80705d5 100644 --- a/src/layer/loongarch/interp_loongarch.h +++ b/src/layer/loongarch/interp_loongarch.h @@ -19,7 +19,7 @@ namespace ncnn { -class Interp_loongarch : virtual public Interp +class Interp_loongarch : public Interp { public: Interp_loongarch(); diff --git a/src/layer/loongarch/mish_loongarch.h b/src/layer/loongarch/mish_loongarch.h index 97c6f0520f5..0c796758064 100644 --- a/src/layer/loongarch/mish_loongarch.h +++ b/src/layer/loongarch/mish_loongarch.h @@ -19,7 +19,7 @@ namespace ncnn { -class Mish_loongarch : virtual public Mish +class Mish_loongarch : public Mish { public: Mish_loongarch(); diff --git a/src/layer/loongarch/packing_loongarch.h b/src/layer/loongarch/packing_loongarch.h index 1db215cfee7..476ebd33a87 100644 --- a/src/layer/loongarch/packing_loongarch.h +++ b/src/layer/loongarch/packing_loongarch.h @@ -19,7 +19,7 @@ namespace ncnn { -class Packing_loongarch : virtual public Packing +class Packing_loongarch : public Packing { public: Packing_loongarch(); diff --git a/src/layer/loongarch/padding_loongarch.h b/src/layer/loongarch/padding_loongarch.h index 137fbc4459e..de416464783 100644 --- a/src/layer/loongarch/padding_loongarch.h +++ b/src/layer/loongarch/padding_loongarch.h @@ -19,7 +19,7 @@ namespace ncnn { -class Padding_loongarch : virtual public Padding +class Padding_loongarch : public Padding { public: Padding_loongarch(); diff --git a/src/layer/loongarch/pooling_loongarch.h b/src/layer/loongarch/pooling_loongarch.h index 97e0c9ff2f7..646b10947b3 100644 --- a/src/layer/loongarch/pooling_loongarch.h +++ b/src/layer/loongarch/pooling_loongarch.h @@ -19,7 +19,7 @@ namespace ncnn { -class Pooling_loongarch : virtual public Pooling +class Pooling_loongarch : public Pooling { public: Pooling_loongarch(); diff --git a/src/layer/loongarch/prelu_loongarch.h b/src/layer/loongarch/prelu_loongarch.h index 97031bb0601..bafd7ac4c68 100644 --- 
a/src/layer/loongarch/prelu_loongarch.h +++ b/src/layer/loongarch/prelu_loongarch.h @@ -19,7 +19,7 @@ namespace ncnn { -class PReLU_loongarch : virtual public PReLU +class PReLU_loongarch : public PReLU { public: PReLU_loongarch(); diff --git a/src/layer/loongarch/quantize_loongarch.h b/src/layer/loongarch/quantize_loongarch.h index cae04aab171..dcc0d8e097e 100644 --- a/src/layer/loongarch/quantize_loongarch.h +++ b/src/layer/loongarch/quantize_loongarch.h @@ -19,7 +19,7 @@ namespace ncnn { -class Quantize_loongarch : virtual public Quantize +class Quantize_loongarch : public Quantize { public: Quantize_loongarch(); diff --git a/src/layer/loongarch/relu_loongarch.h b/src/layer/loongarch/relu_loongarch.h index 445c6e8febc..6ee6684fdb7 100644 --- a/src/layer/loongarch/relu_loongarch.h +++ b/src/layer/loongarch/relu_loongarch.h @@ -19,7 +19,7 @@ namespace ncnn { -class ReLU_loongarch : virtual public ReLU +class ReLU_loongarch : public ReLU { public: ReLU_loongarch(); diff --git a/src/layer/loongarch/requantize_loongarch.h b/src/layer/loongarch/requantize_loongarch.h index 8175989959e..4afaf9df3d3 100644 --- a/src/layer/loongarch/requantize_loongarch.h +++ b/src/layer/loongarch/requantize_loongarch.h @@ -19,7 +19,7 @@ namespace ncnn { -class Requantize_loongarch : virtual public Requantize +class Requantize_loongarch : public Requantize { public: Requantize_loongarch(); diff --git a/src/layer/loongarch/sigmoid_loongarch.h b/src/layer/loongarch/sigmoid_loongarch.h index b15aad235db..02354d2a5a4 100644 --- a/src/layer/loongarch/sigmoid_loongarch.h +++ b/src/layer/loongarch/sigmoid_loongarch.h @@ -19,7 +19,7 @@ namespace ncnn { -class Sigmoid_loongarch : virtual public Sigmoid +class Sigmoid_loongarch : public Sigmoid { public: Sigmoid_loongarch(); diff --git a/src/layer/loongarch/slice_loongarch.h b/src/layer/loongarch/slice_loongarch.h index b42138ba418..2f5faed8cbf 100644 --- a/src/layer/loongarch/slice_loongarch.h +++ b/src/layer/loongarch/slice_loongarch.h @@ -19,7 +19,7 @@ namespace ncnn { -class Slice_loongarch : virtual public Slice +class Slice_loongarch : public Slice { public: Slice_loongarch(); diff --git a/src/layer/loongarch/softmax_loongarch.h b/src/layer/loongarch/softmax_loongarch.h index 3c8272a6412..baf930fcbd2 100644 --- a/src/layer/loongarch/softmax_loongarch.h +++ b/src/layer/loongarch/softmax_loongarch.h @@ -19,7 +19,7 @@ namespace ncnn { -class Softmax_loongarch : virtual public Softmax +class Softmax_loongarch : public Softmax { public: virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; diff --git a/src/layer/loongarch/swish_loongarch.h b/src/layer/loongarch/swish_loongarch.h index b8d0b80f01e..9b7d2ac851f 100644 --- a/src/layer/loongarch/swish_loongarch.h +++ b/src/layer/loongarch/swish_loongarch.h @@ -19,7 +19,7 @@ namespace ncnn { -class Swish_loongarch : virtual public Swish +class Swish_loongarch : public Swish { public: Swish_loongarch(); diff --git a/src/layer/loongarch/tanh_loongarch.h b/src/layer/loongarch/tanh_loongarch.h index ecbab01ec8f..74231eb56b6 100644 --- a/src/layer/loongarch/tanh_loongarch.h +++ b/src/layer/loongarch/tanh_loongarch.h @@ -19,7 +19,7 @@ namespace ncnn { -class TanH_loongarch : virtual public TanH +class TanH_loongarch : public TanH { public: TanH_loongarch(); diff --git a/src/layer/loongarch/unaryop_loongarch.h b/src/layer/loongarch/unaryop_loongarch.h index 8170bec50cf..f4210aeab57 100644 --- a/src/layer/loongarch/unaryop_loongarch.h +++ b/src/layer/loongarch/unaryop_loongarch.h @@ -19,7 +19,7 @@ namespace ncnn 
{ -class UnaryOp_loongarch : virtual public UnaryOp +class UnaryOp_loongarch : public UnaryOp { public: UnaryOp_loongarch(); diff --git a/src/layer/mips/absval_mips.h b/src/layer/mips/absval_mips.h index c028c312f35..95dca4d596a 100644 --- a/src/layer/mips/absval_mips.h +++ b/src/layer/mips/absval_mips.h @@ -19,7 +19,7 @@ namespace ncnn { -class AbsVal_mips : virtual public AbsVal +class AbsVal_mips : public AbsVal { public: AbsVal_mips(); diff --git a/src/layer/mips/batchnorm_mips.h b/src/layer/mips/batchnorm_mips.h index c18902ebad7..6df49407a0e 100644 --- a/src/layer/mips/batchnorm_mips.h +++ b/src/layer/mips/batchnorm_mips.h @@ -19,7 +19,7 @@ namespace ncnn { -class BatchNorm_mips : virtual public BatchNorm +class BatchNorm_mips : public BatchNorm { public: BatchNorm_mips(); diff --git a/src/layer/mips/bias_mips.h b/src/layer/mips/bias_mips.h index 3757c0b421e..dfef2159b4d 100644 --- a/src/layer/mips/bias_mips.h +++ b/src/layer/mips/bias_mips.h @@ -19,7 +19,7 @@ namespace ncnn { -class Bias_mips : virtual public Bias +class Bias_mips : public Bias { public: virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; diff --git a/src/layer/mips/binaryop_mips.h b/src/layer/mips/binaryop_mips.h index 55d0f2cf363..e682373ba56 100644 --- a/src/layer/mips/binaryop_mips.h +++ b/src/layer/mips/binaryop_mips.h @@ -19,7 +19,7 @@ namespace ncnn { -class BinaryOp_mips : virtual public BinaryOp +class BinaryOp_mips : public BinaryOp { public: BinaryOp_mips(); diff --git a/src/layer/mips/cast_mips.h b/src/layer/mips/cast_mips.h index e37374bda6c..adabee5f888 100644 --- a/src/layer/mips/cast_mips.h +++ b/src/layer/mips/cast_mips.h @@ -19,7 +19,7 @@ namespace ncnn { -class Cast_mips : virtual public Cast +class Cast_mips : public Cast { public: Cast_mips(); diff --git a/src/layer/mips/clip_mips.h b/src/layer/mips/clip_mips.h index 951888e0562..5db94bc5454 100644 --- a/src/layer/mips/clip_mips.h +++ b/src/layer/mips/clip_mips.h @@ -19,7 +19,7 @@ namespace ncnn { -class Clip_mips : virtual public Clip +class Clip_mips : public Clip { public: Clip_mips(); diff --git a/src/layer/mips/concat_mips.h b/src/layer/mips/concat_mips.h index 994ca85cf3b..c4ab84f3037 100644 --- a/src/layer/mips/concat_mips.h +++ b/src/layer/mips/concat_mips.h @@ -19,7 +19,7 @@ namespace ncnn { -class Concat_mips : virtual public Concat +class Concat_mips : public Concat { public: Concat_mips(); diff --git a/src/layer/mips/convolution1d_mips.h b/src/layer/mips/convolution1d_mips.h index 13e66e4f36c..dcc9bd4de4a 100644 --- a/src/layer/mips/convolution1d_mips.h +++ b/src/layer/mips/convolution1d_mips.h @@ -19,7 +19,7 @@ namespace ncnn { -class Convolution1D_mips : virtual public Convolution1D +class Convolution1D_mips : public Convolution1D { public: Convolution1D_mips(); diff --git a/src/layer/mips/convolution_mips.h b/src/layer/mips/convolution_mips.h index e8fe54f87a2..8401c6dfd51 100644 --- a/src/layer/mips/convolution_mips.h +++ b/src/layer/mips/convolution_mips.h @@ -19,7 +19,7 @@ namespace ncnn { -class Convolution_mips : virtual public Convolution +class Convolution_mips : public Convolution { public: Convolution_mips(); diff --git a/src/layer/mips/convolutiondepthwise_mips.h b/src/layer/mips/convolutiondepthwise_mips.h index 9d28009b8a1..24d1650b0c0 100644 --- a/src/layer/mips/convolutiondepthwise_mips.h +++ b/src/layer/mips/convolutiondepthwise_mips.h @@ -19,7 +19,7 @@ namespace ncnn { -class ConvolutionDepthWise_mips : virtual public ConvolutionDepthWise +class ConvolutionDepthWise_mips : public 
ConvolutionDepthWise { public: ConvolutionDepthWise_mips(); diff --git a/src/layer/mips/crop_mips.h b/src/layer/mips/crop_mips.h index e61c73a44d1..77c077e7153 100644 --- a/src/layer/mips/crop_mips.h +++ b/src/layer/mips/crop_mips.h @@ -19,7 +19,7 @@ namespace ncnn { -class Crop_mips : virtual public Crop +class Crop_mips : public Crop { public: Crop_mips(); diff --git a/src/layer/mips/deconvolution_mips.h b/src/layer/mips/deconvolution_mips.h index 218bd812672..b7c0d2e7578 100644 --- a/src/layer/mips/deconvolution_mips.h +++ b/src/layer/mips/deconvolution_mips.h @@ -19,7 +19,7 @@ namespace ncnn { -class Deconvolution_mips : virtual public Deconvolution +class Deconvolution_mips : public Deconvolution { public: Deconvolution_mips(); diff --git a/src/layer/mips/deconvolutiondepthwise_mips.h b/src/layer/mips/deconvolutiondepthwise_mips.h index a033d7c11c3..24e7a481edf 100644 --- a/src/layer/mips/deconvolutiondepthwise_mips.h +++ b/src/layer/mips/deconvolutiondepthwise_mips.h @@ -19,7 +19,7 @@ namespace ncnn { -class DeconvolutionDepthWise_mips : virtual public DeconvolutionDepthWise +class DeconvolutionDepthWise_mips : public DeconvolutionDepthWise { public: DeconvolutionDepthWise_mips(); diff --git a/src/layer/mips/dequantize_mips.h b/src/layer/mips/dequantize_mips.h index 09623e20d4f..8ae7e542c12 100644 --- a/src/layer/mips/dequantize_mips.h +++ b/src/layer/mips/dequantize_mips.h @@ -19,7 +19,7 @@ namespace ncnn { -class Dequantize_mips : virtual public Dequantize +class Dequantize_mips : public Dequantize { public: Dequantize_mips(); diff --git a/src/layer/mips/dropout_mips.h b/src/layer/mips/dropout_mips.h index a5a4dbebb90..05fa38463d7 100644 --- a/src/layer/mips/dropout_mips.h +++ b/src/layer/mips/dropout_mips.h @@ -19,7 +19,7 @@ namespace ncnn { -class Dropout_mips : virtual public Dropout +class Dropout_mips : public Dropout { public: Dropout_mips(); diff --git a/src/layer/mips/eltwise_mips.h b/src/layer/mips/eltwise_mips.h index 55252ec661d..9b4ac77319f 100644 --- a/src/layer/mips/eltwise_mips.h +++ b/src/layer/mips/eltwise_mips.h @@ -19,7 +19,7 @@ namespace ncnn { -class Eltwise_mips : virtual public Eltwise +class Eltwise_mips : public Eltwise { public: Eltwise_mips(); diff --git a/src/layer/mips/flatten_mips.h b/src/layer/mips/flatten_mips.h index 725ceda6431..c9f33225f98 100644 --- a/src/layer/mips/flatten_mips.h +++ b/src/layer/mips/flatten_mips.h @@ -19,7 +19,7 @@ namespace ncnn { -class Flatten_mips : virtual public Flatten +class Flatten_mips : public Flatten { public: Flatten_mips(); diff --git a/src/layer/mips/hardsigmoid_mips.h b/src/layer/mips/hardsigmoid_mips.h index a1ce9986eca..51cab82627f 100644 --- a/src/layer/mips/hardsigmoid_mips.h +++ b/src/layer/mips/hardsigmoid_mips.h @@ -19,7 +19,7 @@ namespace ncnn { -class HardSigmoid_mips : virtual public HardSigmoid +class HardSigmoid_mips : public HardSigmoid { public: HardSigmoid_mips(); diff --git a/src/layer/mips/hardswish_mips.h b/src/layer/mips/hardswish_mips.h index 692cf22eac2..8ace7fe79f5 100644 --- a/src/layer/mips/hardswish_mips.h +++ b/src/layer/mips/hardswish_mips.h @@ -19,7 +19,7 @@ namespace ncnn { -class HardSwish_mips : virtual public HardSwish +class HardSwish_mips : public HardSwish { public: HardSwish_mips(); diff --git a/src/layer/mips/innerproduct_mips.h b/src/layer/mips/innerproduct_mips.h index 59b26c53627..c96db3f93d1 100644 --- a/src/layer/mips/innerproduct_mips.h +++ b/src/layer/mips/innerproduct_mips.h @@ -19,7 +19,7 @@ namespace ncnn { -class InnerProduct_mips : virtual public InnerProduct 
+class InnerProduct_mips : public InnerProduct { public: InnerProduct_mips(); diff --git a/src/layer/mips/interp_mips.h b/src/layer/mips/interp_mips.h index c15b4990cde..baff10b4e38 100644 --- a/src/layer/mips/interp_mips.h +++ b/src/layer/mips/interp_mips.h @@ -19,7 +19,7 @@ namespace ncnn { -class Interp_mips : virtual public Interp +class Interp_mips : public Interp { public: Interp_mips(); diff --git a/src/layer/mips/mish_mips.h b/src/layer/mips/mish_mips.h index 68cc9ff6f0f..33342a4f5d3 100644 --- a/src/layer/mips/mish_mips.h +++ b/src/layer/mips/mish_mips.h @@ -19,7 +19,7 @@ namespace ncnn { -class Mish_mips : virtual public Mish +class Mish_mips : public Mish { public: Mish_mips(); diff --git a/src/layer/mips/packing_mips.h b/src/layer/mips/packing_mips.h index e90536f4908..ccc57f8af7b 100644 --- a/src/layer/mips/packing_mips.h +++ b/src/layer/mips/packing_mips.h @@ -19,7 +19,7 @@ namespace ncnn { -class Packing_mips : virtual public Packing +class Packing_mips : public Packing { public: Packing_mips(); diff --git a/src/layer/mips/padding_mips.h b/src/layer/mips/padding_mips.h index 3153f3e2b35..6d4ae8c2f70 100644 --- a/src/layer/mips/padding_mips.h +++ b/src/layer/mips/padding_mips.h @@ -19,7 +19,7 @@ namespace ncnn { -class Padding_mips : virtual public Padding +class Padding_mips : public Padding { public: Padding_mips(); diff --git a/src/layer/mips/pooling_mips.h b/src/layer/mips/pooling_mips.h index dab4038ecca..ec17a06a99c 100644 --- a/src/layer/mips/pooling_mips.h +++ b/src/layer/mips/pooling_mips.h @@ -19,7 +19,7 @@ namespace ncnn { -class Pooling_mips : virtual public Pooling +class Pooling_mips : public Pooling { public: Pooling_mips(); diff --git a/src/layer/mips/prelu_mips.h b/src/layer/mips/prelu_mips.h index 9ef259ce833..6174c2570c3 100644 --- a/src/layer/mips/prelu_mips.h +++ b/src/layer/mips/prelu_mips.h @@ -19,7 +19,7 @@ namespace ncnn { -class PReLU_mips : virtual public PReLU +class PReLU_mips : public PReLU { public: PReLU_mips(); diff --git a/src/layer/mips/quantize_mips.h b/src/layer/mips/quantize_mips.h index 2607e573f5d..220d73af106 100644 --- a/src/layer/mips/quantize_mips.h +++ b/src/layer/mips/quantize_mips.h @@ -19,7 +19,7 @@ namespace ncnn { -class Quantize_mips : virtual public Quantize +class Quantize_mips : public Quantize { public: Quantize_mips(); diff --git a/src/layer/mips/relu_mips.h b/src/layer/mips/relu_mips.h index 7fdeae828ef..74e55a6be10 100644 --- a/src/layer/mips/relu_mips.h +++ b/src/layer/mips/relu_mips.h @@ -19,7 +19,7 @@ namespace ncnn { -class ReLU_mips : virtual public ReLU +class ReLU_mips : public ReLU { public: ReLU_mips(); diff --git a/src/layer/mips/requantize_mips.h b/src/layer/mips/requantize_mips.h index a9138b9ea72..6ba740895d2 100644 --- a/src/layer/mips/requantize_mips.h +++ b/src/layer/mips/requantize_mips.h @@ -19,7 +19,7 @@ namespace ncnn { -class Requantize_mips : virtual public Requantize +class Requantize_mips : public Requantize { public: Requantize_mips(); diff --git a/src/layer/mips/sigmoid_mips.h b/src/layer/mips/sigmoid_mips.h index 7ba089b3b4c..2bf166e954d 100644 --- a/src/layer/mips/sigmoid_mips.h +++ b/src/layer/mips/sigmoid_mips.h @@ -19,7 +19,7 @@ namespace ncnn { -class Sigmoid_mips : virtual public Sigmoid +class Sigmoid_mips : public Sigmoid { public: Sigmoid_mips(); diff --git a/src/layer/mips/slice_mips.h b/src/layer/mips/slice_mips.h index 648233f8e6c..73274d867a0 100644 --- a/src/layer/mips/slice_mips.h +++ b/src/layer/mips/slice_mips.h @@ -19,7 +19,7 @@ namespace ncnn { -class Slice_mips : virtual 
public Slice +class Slice_mips : public Slice { public: Slice_mips(); diff --git a/src/layer/mips/softmax_mips.h b/src/layer/mips/softmax_mips.h index 06ce5e16284..91437c13f56 100644 --- a/src/layer/mips/softmax_mips.h +++ b/src/layer/mips/softmax_mips.h @@ -19,7 +19,7 @@ namespace ncnn { -class Softmax_mips : virtual public Softmax +class Softmax_mips : public Softmax { public: virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; diff --git a/src/layer/mips/swish_mips.h b/src/layer/mips/swish_mips.h index 706106d9269..1dc6753a381 100644 --- a/src/layer/mips/swish_mips.h +++ b/src/layer/mips/swish_mips.h @@ -19,7 +19,7 @@ namespace ncnn { -class Swish_mips : virtual public Swish +class Swish_mips : public Swish { public: Swish_mips(); diff --git a/src/layer/mips/tanh_mips.h b/src/layer/mips/tanh_mips.h index d1310f18310..12e38d07f71 100644 --- a/src/layer/mips/tanh_mips.h +++ b/src/layer/mips/tanh_mips.h @@ -19,7 +19,7 @@ namespace ncnn { -class TanH_mips : virtual public TanH +class TanH_mips : public TanH { public: TanH_mips(); diff --git a/src/layer/mips/unaryop_mips.h b/src/layer/mips/unaryop_mips.h index 0a6f12bc3e5..800d028bb21 100644 --- a/src/layer/mips/unaryop_mips.h +++ b/src/layer/mips/unaryop_mips.h @@ -19,7 +19,7 @@ namespace ncnn { -class UnaryOp_mips : virtual public UnaryOp +class UnaryOp_mips : public UnaryOp { public: UnaryOp_mips(); diff --git a/src/layer/riscv/absval_riscv.h b/src/layer/riscv/absval_riscv.h index 66d33c834a8..0d35c6b61a0 100644 --- a/src/layer/riscv/absval_riscv.h +++ b/src/layer/riscv/absval_riscv.h @@ -19,7 +19,7 @@ namespace ncnn { -class AbsVal_riscv : virtual public AbsVal +class AbsVal_riscv : public AbsVal { public: AbsVal_riscv(); diff --git a/src/layer/riscv/batchnorm_riscv.h b/src/layer/riscv/batchnorm_riscv.h index e2365fa5fcf..1ed4dc63d0d 100644 --- a/src/layer/riscv/batchnorm_riscv.h +++ b/src/layer/riscv/batchnorm_riscv.h @@ -18,7 +18,7 @@ #include "batchnorm.h" namespace ncnn { -class BatchNorm_riscv : virtual public BatchNorm +class BatchNorm_riscv : public BatchNorm { public: BatchNorm_riscv(); diff --git a/src/layer/riscv/binaryop_riscv.h b/src/layer/riscv/binaryop_riscv.h index 0ecd34d685c..afc728b6e68 100644 --- a/src/layer/riscv/binaryop_riscv.h +++ b/src/layer/riscv/binaryop_riscv.h @@ -21,7 +21,7 @@ namespace ncnn { -class BinaryOp_riscv : virtual public BinaryOp +class BinaryOp_riscv : public BinaryOp { public: BinaryOp_riscv(); diff --git a/src/layer/riscv/cast_riscv.h b/src/layer/riscv/cast_riscv.h index 4b55159d819..7c6fbb6d4ce 100644 --- a/src/layer/riscv/cast_riscv.h +++ b/src/layer/riscv/cast_riscv.h @@ -19,7 +19,7 @@ namespace ncnn { -class Cast_riscv : virtual public Cast +class Cast_riscv : public Cast { public: Cast_riscv(); diff --git a/src/layer/riscv/clip_riscv.h b/src/layer/riscv/clip_riscv.h index 16a9eb963f9..051995e18d6 100644 --- a/src/layer/riscv/clip_riscv.h +++ b/src/layer/riscv/clip_riscv.h @@ -19,7 +19,7 @@ namespace ncnn { -class Clip_riscv : virtual public Clip +class Clip_riscv : public Clip { public: Clip_riscv(); diff --git a/src/layer/riscv/concat_riscv.h b/src/layer/riscv/concat_riscv.h index eb85d47819d..23029340350 100644 --- a/src/layer/riscv/concat_riscv.h +++ b/src/layer/riscv/concat_riscv.h @@ -19,7 +19,7 @@ namespace ncnn { -class Concat_riscv : virtual public Concat +class Concat_riscv : public Concat { public: Concat_riscv(); diff --git a/src/layer/riscv/convolution1d_riscv.h b/src/layer/riscv/convolution1d_riscv.h index 2aa4bbe0f41..f0e7f881801 100644 --- 
a/src/layer/riscv/convolution1d_riscv.h +++ b/src/layer/riscv/convolution1d_riscv.h @@ -19,7 +19,7 @@ namespace ncnn { -class Convolution1D_riscv : virtual public Convolution1D +class Convolution1D_riscv : public Convolution1D { public: Convolution1D_riscv(); diff --git a/src/layer/riscv/convolution_riscv.h b/src/layer/riscv/convolution_riscv.h index 17bb43ca0e5..a4e008c9dd1 100644 --- a/src/layer/riscv/convolution_riscv.h +++ b/src/layer/riscv/convolution_riscv.h @@ -19,7 +19,7 @@ namespace ncnn { -class Convolution_riscv : virtual public Convolution +class Convolution_riscv : public Convolution { public: Convolution_riscv(); diff --git a/src/layer/riscv/convolutiondepthwise_riscv.h b/src/layer/riscv/convolutiondepthwise_riscv.h index b0152e0b207..f9503975296 100644 --- a/src/layer/riscv/convolutiondepthwise_riscv.h +++ b/src/layer/riscv/convolutiondepthwise_riscv.h @@ -19,7 +19,7 @@ namespace ncnn { -class ConvolutionDepthWise_riscv : virtual public ConvolutionDepthWise +class ConvolutionDepthWise_riscv : public ConvolutionDepthWise { public: ConvolutionDepthWise_riscv(); diff --git a/src/layer/riscv/crop_riscv.h b/src/layer/riscv/crop_riscv.h index 86d2c8064e3..404022fafb2 100644 --- a/src/layer/riscv/crop_riscv.h +++ b/src/layer/riscv/crop_riscv.h @@ -19,7 +19,7 @@ namespace ncnn { -class Crop_riscv : virtual public Crop +class Crop_riscv : public Crop { public: Crop_riscv(); diff --git a/src/layer/riscv/deconvolution_riscv.h b/src/layer/riscv/deconvolution_riscv.h index 903a420427a..57d30349aad 100644 --- a/src/layer/riscv/deconvolution_riscv.h +++ b/src/layer/riscv/deconvolution_riscv.h @@ -19,7 +19,7 @@ namespace ncnn { -class Deconvolution_riscv : virtual public Deconvolution +class Deconvolution_riscv : public Deconvolution { public: Deconvolution_riscv(); diff --git a/src/layer/riscv/deconvolutiondepthwise_riscv.h b/src/layer/riscv/deconvolutiondepthwise_riscv.h index 5cdbd0d0676..b0c8f7b0119 100644 --- a/src/layer/riscv/deconvolutiondepthwise_riscv.h +++ b/src/layer/riscv/deconvolutiondepthwise_riscv.h @@ -19,7 +19,7 @@ namespace ncnn { -class DeconvolutionDepthWise_riscv : virtual public DeconvolutionDepthWise +class DeconvolutionDepthWise_riscv : public DeconvolutionDepthWise { public: DeconvolutionDepthWise_riscv(); diff --git a/src/layer/riscv/dropout_riscv.h b/src/layer/riscv/dropout_riscv.h index d685c0ee3b4..9c28d867251 100644 --- a/src/layer/riscv/dropout_riscv.h +++ b/src/layer/riscv/dropout_riscv.h @@ -22,7 +22,7 @@ namespace ncnn { -class Dropout_riscv : virtual public Dropout +class Dropout_riscv : public Dropout { public: Dropout_riscv(); diff --git a/src/layer/riscv/flatten_riscv.h b/src/layer/riscv/flatten_riscv.h index 52a290ca678..31860340213 100644 --- a/src/layer/riscv/flatten_riscv.h +++ b/src/layer/riscv/flatten_riscv.h @@ -19,7 +19,7 @@ namespace ncnn { -class Flatten_riscv : virtual public Flatten +class Flatten_riscv : public Flatten { public: Flatten_riscv(); diff --git a/src/layer/riscv/gelu_riscv.h b/src/layer/riscv/gelu_riscv.h index fbe522694d1..8a2e9492cc9 100644 --- a/src/layer/riscv/gelu_riscv.h +++ b/src/layer/riscv/gelu_riscv.h @@ -19,7 +19,7 @@ namespace ncnn { -class GELU_riscv : virtual public GELU +class GELU_riscv : public GELU { public: GELU_riscv(); diff --git a/src/layer/riscv/gemm_riscv.h b/src/layer/riscv/gemm_riscv.h index b92add63891..6bca092fb1f 100644 --- a/src/layer/riscv/gemm_riscv.h +++ b/src/layer/riscv/gemm_riscv.h @@ -19,7 +19,7 @@ namespace ncnn { -class Gemm_riscv : virtual public Gemm +class Gemm_riscv : public Gemm { 
public: Gemm_riscv(); diff --git a/src/layer/riscv/gru_riscv.h b/src/layer/riscv/gru_riscv.h index 18c69ab594b..46bb624519f 100644 --- a/src/layer/riscv/gru_riscv.h +++ b/src/layer/riscv/gru_riscv.h @@ -19,7 +19,7 @@ namespace ncnn { -class GRU_riscv : virtual public GRU +class GRU_riscv : public GRU { public: GRU_riscv(); diff --git a/src/layer/riscv/hardsigmoid_riscv.h b/src/layer/riscv/hardsigmoid_riscv.h index b876c485b62..3c264b3188e 100644 --- a/src/layer/riscv/hardsigmoid_riscv.h +++ b/src/layer/riscv/hardsigmoid_riscv.h @@ -19,7 +19,7 @@ namespace ncnn { -class HardSigmoid_riscv : virtual public HardSigmoid +class HardSigmoid_riscv : public HardSigmoid { public: HardSigmoid_riscv(); diff --git a/src/layer/riscv/hardswish_riscv.h b/src/layer/riscv/hardswish_riscv.h index 662cd067024..cfec7916f59 100644 --- a/src/layer/riscv/hardswish_riscv.h +++ b/src/layer/riscv/hardswish_riscv.h @@ -22,7 +22,7 @@ namespace ncnn { -class HardSwish_riscv : virtual public HardSwish +class HardSwish_riscv : public HardSwish { public: HardSwish_riscv(); diff --git a/src/layer/riscv/innerproduct_riscv.h b/src/layer/riscv/innerproduct_riscv.h index 0503ea3d4fa..d3056d5801d 100644 --- a/src/layer/riscv/innerproduct_riscv.h +++ b/src/layer/riscv/innerproduct_riscv.h @@ -19,7 +19,7 @@ namespace ncnn { -class InnerProduct_riscv : virtual public InnerProduct +class InnerProduct_riscv : public InnerProduct { public: InnerProduct_riscv(); diff --git a/src/layer/riscv/instancenorm_riscv.h b/src/layer/riscv/instancenorm_riscv.h index 80583cc2c89..b0d2e9004ac 100644 --- a/src/layer/riscv/instancenorm_riscv.h +++ b/src/layer/riscv/instancenorm_riscv.h @@ -18,7 +18,7 @@ #include "instancenorm.h" namespace ncnn { -class InstanceNorm_riscv : virtual public InstanceNorm +class InstanceNorm_riscv : public InstanceNorm { public: InstanceNorm_riscv(); diff --git a/src/layer/riscv/interp_riscv.h b/src/layer/riscv/interp_riscv.h index 2f6ca89da34..f479223519b 100644 --- a/src/layer/riscv/interp_riscv.h +++ b/src/layer/riscv/interp_riscv.h @@ -19,7 +19,7 @@ namespace ncnn { -class Interp_riscv : virtual public Interp +class Interp_riscv : public Interp { public: Interp_riscv(); diff --git a/src/layer/riscv/mish_riscv.h b/src/layer/riscv/mish_riscv.h index 5421ebb2791..2e2be1a2b44 100644 --- a/src/layer/riscv/mish_riscv.h +++ b/src/layer/riscv/mish_riscv.h @@ -19,7 +19,7 @@ namespace ncnn { -class Mish_riscv : virtual public Mish +class Mish_riscv : public Mish { public: Mish_riscv(); diff --git a/src/layer/riscv/packing_riscv.h b/src/layer/riscv/packing_riscv.h index 4d556890f3f..097d774993c 100644 --- a/src/layer/riscv/packing_riscv.h +++ b/src/layer/riscv/packing_riscv.h @@ -19,7 +19,7 @@ namespace ncnn { -class Packing_riscv : virtual public Packing +class Packing_riscv : public Packing { public: Packing_riscv(); diff --git a/src/layer/riscv/padding_riscv.h b/src/layer/riscv/padding_riscv.h index c591806fa3e..7642dccae5f 100644 --- a/src/layer/riscv/padding_riscv.h +++ b/src/layer/riscv/padding_riscv.h @@ -19,7 +19,7 @@ namespace ncnn { -class Padding_riscv : virtual public Padding +class Padding_riscv : public Padding { public: Padding_riscv(); diff --git a/src/layer/riscv/pooling_riscv.h b/src/layer/riscv/pooling_riscv.h index 48d8feb8233..e285b58eb19 100644 --- a/src/layer/riscv/pooling_riscv.h +++ b/src/layer/riscv/pooling_riscv.h @@ -19,7 +19,7 @@ namespace ncnn { -class Pooling_riscv : virtual public Pooling +class Pooling_riscv : public Pooling { public: Pooling_riscv(); diff --git a/src/layer/riscv/prelu_riscv.h 
b/src/layer/riscv/prelu_riscv.h index 23e5b7ee998..70acbc5d250 100644 --- a/src/layer/riscv/prelu_riscv.h +++ b/src/layer/riscv/prelu_riscv.h @@ -19,7 +19,7 @@ namespace ncnn { -class PReLU_riscv : virtual public PReLU +class PReLU_riscv : public PReLU { public: PReLU_riscv(); diff --git a/src/layer/riscv/relu_riscv.h b/src/layer/riscv/relu_riscv.h index 516f90d3d76..58181b533b8 100644 --- a/src/layer/riscv/relu_riscv.h +++ b/src/layer/riscv/relu_riscv.h @@ -19,7 +19,7 @@ namespace ncnn { -class ReLU_riscv : virtual public ReLU +class ReLU_riscv : public ReLU { public: ReLU_riscv(); diff --git a/src/layer/riscv/selu_riscv.h b/src/layer/riscv/selu_riscv.h index 2cd552fb9b8..185b7f5b2c8 100644 --- a/src/layer/riscv/selu_riscv.h +++ b/src/layer/riscv/selu_riscv.h @@ -19,7 +19,7 @@ namespace ncnn { -class SELU_riscv : virtual public SELU +class SELU_riscv : public SELU { public: virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; diff --git a/src/layer/riscv/sigmoid_riscv.h b/src/layer/riscv/sigmoid_riscv.h index 2b4b33b7cbe..8f014e6c4f2 100644 --- a/src/layer/riscv/sigmoid_riscv.h +++ b/src/layer/riscv/sigmoid_riscv.h @@ -19,7 +19,7 @@ namespace ncnn { -class Sigmoid_riscv : virtual public Sigmoid +class Sigmoid_riscv : public Sigmoid { public: Sigmoid_riscv(); diff --git a/src/layer/riscv/softmax_riscv.h b/src/layer/riscv/softmax_riscv.h index bb39b5e3ba8..f93dc3022e1 100644 --- a/src/layer/riscv/softmax_riscv.h +++ b/src/layer/riscv/softmax_riscv.h @@ -19,7 +19,7 @@ namespace ncnn { -class Softmax_riscv : virtual public Softmax +class Softmax_riscv : public Softmax { public: Softmax_riscv(); diff --git a/src/layer/riscv/swish_riscv.h b/src/layer/riscv/swish_riscv.h index 00de62fce4c..05d5cbe1cfd 100644 --- a/src/layer/riscv/swish_riscv.h +++ b/src/layer/riscv/swish_riscv.h @@ -19,7 +19,7 @@ namespace ncnn { -class Swish_riscv : virtual public Swish +class Swish_riscv : public Swish { public: Swish_riscv(); diff --git a/src/layer/riscv/tanh_riscv.h b/src/layer/riscv/tanh_riscv.h index c7038ef4f3e..6fb22ce91f3 100644 --- a/src/layer/riscv/tanh_riscv.h +++ b/src/layer/riscv/tanh_riscv.h @@ -19,7 +19,7 @@ namespace ncnn { -class TanH_riscv : virtual public TanH +class TanH_riscv : public TanH { public: TanH_riscv(); diff --git a/src/layer/riscv/unaryop_riscv.h b/src/layer/riscv/unaryop_riscv.h index 7e4e4fa8bfe..215ad3426a4 100644 --- a/src/layer/riscv/unaryop_riscv.h +++ b/src/layer/riscv/unaryop_riscv.h @@ -19,7 +19,7 @@ namespace ncnn { -class UnaryOp_riscv : virtual public UnaryOp +class UnaryOp_riscv : public UnaryOp { public: UnaryOp_riscv(); diff --git a/src/layer/vulkan/absval_vulkan.h b/src/layer/vulkan/absval_vulkan.h index d14c2ac5388..9652aac9b16 100644 --- a/src/layer/vulkan/absval_vulkan.h +++ b/src/layer/vulkan/absval_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class AbsVal_vulkan : virtual public AbsVal +class AbsVal_vulkan : public AbsVal { public: AbsVal_vulkan(); diff --git a/src/layer/vulkan/batchnorm_vulkan.h b/src/layer/vulkan/batchnorm_vulkan.h index 783b84b6efb..eedf049167d 100644 --- a/src/layer/vulkan/batchnorm_vulkan.h +++ b/src/layer/vulkan/batchnorm_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class BatchNorm_vulkan : virtual public BatchNorm +class BatchNorm_vulkan : public BatchNorm { public: BatchNorm_vulkan(); diff --git a/src/layer/vulkan/binaryop_vulkan.h b/src/layer/vulkan/binaryop_vulkan.h index 97ebcacc9f6..1c66186a0c3 100644 --- a/src/layer/vulkan/binaryop_vulkan.h +++ b/src/layer/vulkan/binaryop_vulkan.h @@ -19,7 +19,7 @@ 
namespace ncnn { -class BinaryOp_vulkan : virtual public BinaryOp +class BinaryOp_vulkan : public BinaryOp { public: BinaryOp_vulkan(); diff --git a/src/layer/vulkan/cast_vulkan.h b/src/layer/vulkan/cast_vulkan.h index c184c7439ac..47ce3b27920 100644 --- a/src/layer/vulkan/cast_vulkan.h +++ b/src/layer/vulkan/cast_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class Cast_vulkan : virtual public Cast +class Cast_vulkan : public Cast { public: Cast_vulkan(); diff --git a/src/layer/vulkan/celu_vulkan.h b/src/layer/vulkan/celu_vulkan.h index b5e25e19b4d..2c03a4b9c98 100644 --- a/src/layer/vulkan/celu_vulkan.h +++ b/src/layer/vulkan/celu_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class CELU_vulkan : virtual public CELU +class CELU_vulkan : public CELU { public: CELU_vulkan(); diff --git a/src/layer/vulkan/clip_vulkan.h b/src/layer/vulkan/clip_vulkan.h index ea73eacd050..79e7745f0c4 100644 --- a/src/layer/vulkan/clip_vulkan.h +++ b/src/layer/vulkan/clip_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class Clip_vulkan : virtual public Clip +class Clip_vulkan : public Clip { public: Clip_vulkan(); diff --git a/src/layer/vulkan/concat_vulkan.h b/src/layer/vulkan/concat_vulkan.h index 3db05044ea9..109750f3d8d 100644 --- a/src/layer/vulkan/concat_vulkan.h +++ b/src/layer/vulkan/concat_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class Concat_vulkan : virtual public Concat +class Concat_vulkan : public Concat { public: Concat_vulkan(); diff --git a/src/layer/vulkan/convolution1d_vulkan.h b/src/layer/vulkan/convolution1d_vulkan.h index 4fb22040daa..f01e1523161 100644 --- a/src/layer/vulkan/convolution1d_vulkan.h +++ b/src/layer/vulkan/convolution1d_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class Convolution1D_vulkan : virtual public Convolution1D +class Convolution1D_vulkan : public Convolution1D { public: Convolution1D_vulkan(); diff --git a/src/layer/vulkan/convolution_vulkan.h b/src/layer/vulkan/convolution_vulkan.h index 0efa76fec5c..90d6471d58b 100644 --- a/src/layer/vulkan/convolution_vulkan.h +++ b/src/layer/vulkan/convolution_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class Convolution_vulkan : virtual public Convolution +class Convolution_vulkan : public Convolution { public: Convolution_vulkan(); diff --git a/src/layer/vulkan/convolutiondepthwise_vulkan.h b/src/layer/vulkan/convolutiondepthwise_vulkan.h index 3689e369c2b..47785b707e2 100644 --- a/src/layer/vulkan/convolutiondepthwise_vulkan.h +++ b/src/layer/vulkan/convolutiondepthwise_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class ConvolutionDepthWise_vulkan : virtual public ConvolutionDepthWise +class ConvolutionDepthWise_vulkan : public ConvolutionDepthWise { public: ConvolutionDepthWise_vulkan(); diff --git a/src/layer/vulkan/crop_vulkan.h b/src/layer/vulkan/crop_vulkan.h index e60b77f5e7c..4480268849a 100644 --- a/src/layer/vulkan/crop_vulkan.h +++ b/src/layer/vulkan/crop_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class Crop_vulkan : virtual public Crop +class Crop_vulkan : public Crop { public: Crop_vulkan(); diff --git a/src/layer/vulkan/deconvolution_vulkan.h b/src/layer/vulkan/deconvolution_vulkan.h index 578bdc96747..a4bee03c5e4 100644 --- a/src/layer/vulkan/deconvolution_vulkan.h +++ b/src/layer/vulkan/deconvolution_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class Deconvolution_vulkan : virtual public Deconvolution +class Deconvolution_vulkan : public Deconvolution { public: Deconvolution_vulkan(); diff --git a/src/layer/vulkan/deconvolutiondepthwise_vulkan.h b/src/layer/vulkan/deconvolutiondepthwise_vulkan.h index 
bf38f254eb5..6ea7931e32a 100644 --- a/src/layer/vulkan/deconvolutiondepthwise_vulkan.h +++ b/src/layer/vulkan/deconvolutiondepthwise_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class DeconvolutionDepthWise_vulkan : virtual public DeconvolutionDepthWise +class DeconvolutionDepthWise_vulkan : public DeconvolutionDepthWise { public: DeconvolutionDepthWise_vulkan(); diff --git a/src/layer/vulkan/deepcopy_vulkan.h b/src/layer/vulkan/deepcopy_vulkan.h index a7a89d17a67..867ff1af454 100644 --- a/src/layer/vulkan/deepcopy_vulkan.h +++ b/src/layer/vulkan/deepcopy_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class DeepCopy_vulkan : virtual public DeepCopy +class DeepCopy_vulkan : public DeepCopy { public: DeepCopy_vulkan(); diff --git a/src/layer/vulkan/dropout_vulkan.h b/src/layer/vulkan/dropout_vulkan.h index da2e9ad6051..e45159b7659 100644 --- a/src/layer/vulkan/dropout_vulkan.h +++ b/src/layer/vulkan/dropout_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class Dropout_vulkan : virtual public Dropout +class Dropout_vulkan : public Dropout { public: Dropout_vulkan(); diff --git a/src/layer/vulkan/eltwise_vulkan.h b/src/layer/vulkan/eltwise_vulkan.h index 2516db55dd2..09418657186 100644 --- a/src/layer/vulkan/eltwise_vulkan.h +++ b/src/layer/vulkan/eltwise_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class Eltwise_vulkan : virtual public Eltwise +class Eltwise_vulkan : public Eltwise { public: Eltwise_vulkan(); diff --git a/src/layer/vulkan/elu_vulkan.h b/src/layer/vulkan/elu_vulkan.h index 62da80a00c5..c616c3be1b9 100644 --- a/src/layer/vulkan/elu_vulkan.h +++ b/src/layer/vulkan/elu_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class ELU_vulkan : virtual public ELU +class ELU_vulkan : public ELU { public: ELU_vulkan(); diff --git a/src/layer/vulkan/erf_vulkan.h b/src/layer/vulkan/erf_vulkan.h index c793c558687..3f2ae5ace64 100644 --- a/src/layer/vulkan/erf_vulkan.h +++ b/src/layer/vulkan/erf_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class Erf_vulkan : virtual public Erf +class Erf_vulkan : public Erf { public: Erf_vulkan(); diff --git a/src/layer/vulkan/flatten_vulkan.h b/src/layer/vulkan/flatten_vulkan.h index 510cab1285f..1068ce547c3 100644 --- a/src/layer/vulkan/flatten_vulkan.h +++ b/src/layer/vulkan/flatten_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class Flatten_vulkan : virtual public Flatten +class Flatten_vulkan : public Flatten { public: Flatten_vulkan(); diff --git a/src/layer/vulkan/gelu_vulkan.h b/src/layer/vulkan/gelu_vulkan.h index 2c04bc40ba1..ced6f07af4d 100644 --- a/src/layer/vulkan/gelu_vulkan.h +++ b/src/layer/vulkan/gelu_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class GELU_vulkan : virtual public GELU +class GELU_vulkan : public GELU { public: GELU_vulkan(); diff --git a/src/layer/vulkan/gemm_vulkan.h b/src/layer/vulkan/gemm_vulkan.h index 4edbc2f5472..d9fa92018e4 100644 --- a/src/layer/vulkan/gemm_vulkan.h +++ b/src/layer/vulkan/gemm_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class Gemm_vulkan : virtual public Gemm +class Gemm_vulkan : public Gemm { public: Gemm_vulkan(); diff --git a/src/layer/vulkan/hardsigmoid_vulkan.h b/src/layer/vulkan/hardsigmoid_vulkan.h index 23ea48e2959..b0902948c7b 100644 --- a/src/layer/vulkan/hardsigmoid_vulkan.h +++ b/src/layer/vulkan/hardsigmoid_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class HardSigmoid_vulkan : virtual public HardSigmoid +class HardSigmoid_vulkan : public HardSigmoid { public: HardSigmoid_vulkan(); diff --git a/src/layer/vulkan/hardswish_vulkan.h b/src/layer/vulkan/hardswish_vulkan.h index cd5f93f1d76..ab4726877ef 
100644 --- a/src/layer/vulkan/hardswish_vulkan.h +++ b/src/layer/vulkan/hardswish_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class HardSwish_vulkan : virtual public HardSwish +class HardSwish_vulkan : public HardSwish { public: HardSwish_vulkan(); diff --git a/src/layer/vulkan/innerproduct_vulkan.h b/src/layer/vulkan/innerproduct_vulkan.h index 4fe138d480f..9002c581c92 100644 --- a/src/layer/vulkan/innerproduct_vulkan.h +++ b/src/layer/vulkan/innerproduct_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class InnerProduct_vulkan : virtual public InnerProduct +class InnerProduct_vulkan : public InnerProduct { public: InnerProduct_vulkan(); diff --git a/src/layer/vulkan/instancenorm_vulkan.h b/src/layer/vulkan/instancenorm_vulkan.h index 6ff269d9fab..943fff65aee 100644 --- a/src/layer/vulkan/instancenorm_vulkan.h +++ b/src/layer/vulkan/instancenorm_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class InstanceNorm_vulkan : virtual public InstanceNorm +class InstanceNorm_vulkan : public InstanceNorm { public: InstanceNorm_vulkan(); diff --git a/src/layer/vulkan/interp_vulkan.h b/src/layer/vulkan/interp_vulkan.h index 94724a78689..5f1752341fe 100644 --- a/src/layer/vulkan/interp_vulkan.h +++ b/src/layer/vulkan/interp_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class Interp_vulkan : virtual public Interp +class Interp_vulkan : public Interp { public: Interp_vulkan(); diff --git a/src/layer/vulkan/lrn_vulkan.h b/src/layer/vulkan/lrn_vulkan.h index 30b3f0cee80..ad8cc99348d 100644 --- a/src/layer/vulkan/lrn_vulkan.h +++ b/src/layer/vulkan/lrn_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class LRN_vulkan : virtual public LRN +class LRN_vulkan : public LRN { public: LRN_vulkan(); diff --git a/src/layer/vulkan/memorydata_vulkan.h b/src/layer/vulkan/memorydata_vulkan.h index 7ba21283b75..32655abdcae 100644 --- a/src/layer/vulkan/memorydata_vulkan.h +++ b/src/layer/vulkan/memorydata_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class MemoryData_vulkan : virtual public MemoryData +class MemoryData_vulkan : public MemoryData { public: MemoryData_vulkan(); diff --git a/src/layer/vulkan/mish_vulkan.h b/src/layer/vulkan/mish_vulkan.h index 762e331bfc6..864884382de 100644 --- a/src/layer/vulkan/mish_vulkan.h +++ b/src/layer/vulkan/mish_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class Mish_vulkan : virtual public Mish +class Mish_vulkan : public Mish { public: Mish_vulkan(); diff --git a/src/layer/vulkan/multiheadattention_vulkan.h b/src/layer/vulkan/multiheadattention_vulkan.h index 49662db47a2..3b77d96db48 100644 --- a/src/layer/vulkan/multiheadattention_vulkan.h +++ b/src/layer/vulkan/multiheadattention_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class MultiHeadAttention_vulkan : virtual public MultiHeadAttention +class MultiHeadAttention_vulkan : public MultiHeadAttention { public: MultiHeadAttention_vulkan(); diff --git a/src/layer/vulkan/noop_vulkan.h b/src/layer/vulkan/noop_vulkan.h index a26cf626ab6..84d05d07a80 100644 --- a/src/layer/vulkan/noop_vulkan.h +++ b/src/layer/vulkan/noop_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class Noop_vulkan : virtual public Noop +class Noop_vulkan : public Noop { public: Noop_vulkan(); diff --git a/src/layer/vulkan/normalize_vulkan.h b/src/layer/vulkan/normalize_vulkan.h index ca44828df1a..4ad20cc457f 100644 --- a/src/layer/vulkan/normalize_vulkan.h +++ b/src/layer/vulkan/normalize_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class Normalize_vulkan : virtual public Normalize +class Normalize_vulkan : public Normalize { public: Normalize_vulkan(); diff --git 
a/src/layer/vulkan/packing_vulkan.h b/src/layer/vulkan/packing_vulkan.h index 954698f98dd..fb9d1cd154f 100644 --- a/src/layer/vulkan/packing_vulkan.h +++ b/src/layer/vulkan/packing_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class Packing_vulkan : virtual public Packing +class Packing_vulkan : public Packing { public: Packing_vulkan(); diff --git a/src/layer/vulkan/padding_vulkan.h b/src/layer/vulkan/padding_vulkan.h index faea7bd9266..bc6a235ea1c 100644 --- a/src/layer/vulkan/padding_vulkan.h +++ b/src/layer/vulkan/padding_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class Padding_vulkan : virtual public Padding +class Padding_vulkan : public Padding { public: Padding_vulkan(); diff --git a/src/layer/vulkan/permute_vulkan.h b/src/layer/vulkan/permute_vulkan.h index c9fc6cfdef1..fd073bec245 100644 --- a/src/layer/vulkan/permute_vulkan.h +++ b/src/layer/vulkan/permute_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class Permute_vulkan : virtual public Permute +class Permute_vulkan : public Permute { public: Permute_vulkan(); diff --git a/src/layer/vulkan/pixelshuffle_vulkan.h b/src/layer/vulkan/pixelshuffle_vulkan.h index f24e2dd53b1..d0b812f2bb5 100644 --- a/src/layer/vulkan/pixelshuffle_vulkan.h +++ b/src/layer/vulkan/pixelshuffle_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class PixelShuffle_vulkan : virtual public PixelShuffle +class PixelShuffle_vulkan : public PixelShuffle { public: PixelShuffle_vulkan(); diff --git a/src/layer/vulkan/pooling_vulkan.h b/src/layer/vulkan/pooling_vulkan.h index a3529b2708c..a336908d5d7 100644 --- a/src/layer/vulkan/pooling_vulkan.h +++ b/src/layer/vulkan/pooling_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class Pooling_vulkan : virtual public Pooling +class Pooling_vulkan : public Pooling { public: Pooling_vulkan(); diff --git a/src/layer/vulkan/prelu_vulkan.h b/src/layer/vulkan/prelu_vulkan.h index a58f7ce00b3..d2bae5eaac6 100644 --- a/src/layer/vulkan/prelu_vulkan.h +++ b/src/layer/vulkan/prelu_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class PReLU_vulkan : virtual public PReLU +class PReLU_vulkan : public PReLU { public: PReLU_vulkan(); diff --git a/src/layer/vulkan/priorbox_vulkan.h b/src/layer/vulkan/priorbox_vulkan.h index 5b11387e0f5..394b12d0fa9 100644 --- a/src/layer/vulkan/priorbox_vulkan.h +++ b/src/layer/vulkan/priorbox_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class PriorBox_vulkan : virtual public PriorBox +class PriorBox_vulkan : public PriorBox { public: PriorBox_vulkan(); diff --git a/src/layer/vulkan/relu_vulkan.h b/src/layer/vulkan/relu_vulkan.h index 7ac8fa76ae0..287781fdaa6 100644 --- a/src/layer/vulkan/relu_vulkan.h +++ b/src/layer/vulkan/relu_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class ReLU_vulkan : virtual public ReLU +class ReLU_vulkan : public ReLU { public: ReLU_vulkan(); diff --git a/src/layer/vulkan/reorg_vulkan.h b/src/layer/vulkan/reorg_vulkan.h index 1be2ade3601..f1565486996 100644 --- a/src/layer/vulkan/reorg_vulkan.h +++ b/src/layer/vulkan/reorg_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class Reorg_vulkan : virtual public Reorg +class Reorg_vulkan : public Reorg { public: Reorg_vulkan(); diff --git a/src/layer/vulkan/reshape_vulkan.h b/src/layer/vulkan/reshape_vulkan.h index 134ae1b9ece..6b408f79940 100644 --- a/src/layer/vulkan/reshape_vulkan.h +++ b/src/layer/vulkan/reshape_vulkan.h @@ -19,7 +19,7 @@ namespace ncnn { -class Reshape_vulkan : virtual public Reshape +class Reshape_vulkan : public Reshape { public: Reshape_vulkan(); diff --git a/src/layer/vulkan/scale_vulkan.h 
diff --git a/src/layer/vulkan/scale_vulkan.h b/src/layer/vulkan/scale_vulkan.h
index 867667e3da3..72851030d2d 100644
--- a/src/layer/vulkan/scale_vulkan.h
+++ b/src/layer/vulkan/scale_vulkan.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class Scale_vulkan : virtual public Scale
+class Scale_vulkan : public Scale
 {
 public:
     Scale_vulkan();
diff --git a/src/layer/vulkan/shufflechannel_vulkan.h b/src/layer/vulkan/shufflechannel_vulkan.h
index 183e45ddaf7..1cbc706ba02 100644
--- a/src/layer/vulkan/shufflechannel_vulkan.h
+++ b/src/layer/vulkan/shufflechannel_vulkan.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class ShuffleChannel_vulkan : virtual public ShuffleChannel
+class ShuffleChannel_vulkan : public ShuffleChannel
 {
 public:
     ShuffleChannel_vulkan();
diff --git a/src/layer/vulkan/sigmoid_vulkan.h b/src/layer/vulkan/sigmoid_vulkan.h
index 2d244506f4e..1350f6a47d4 100644
--- a/src/layer/vulkan/sigmoid_vulkan.h
+++ b/src/layer/vulkan/sigmoid_vulkan.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class Sigmoid_vulkan : virtual public Sigmoid
+class Sigmoid_vulkan : public Sigmoid
 {
 public:
     Sigmoid_vulkan();
diff --git a/src/layer/vulkan/slice_vulkan.h b/src/layer/vulkan/slice_vulkan.h
index 53793752baa..92f9ad154b1 100644
--- a/src/layer/vulkan/slice_vulkan.h
+++ b/src/layer/vulkan/slice_vulkan.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class Slice_vulkan : virtual public Slice
+class Slice_vulkan : public Slice
 {
 public:
     Slice_vulkan();
diff --git a/src/layer/vulkan/softmax_vulkan.h b/src/layer/vulkan/softmax_vulkan.h
index 35478d2da24..aeff8d40be3 100644
--- a/src/layer/vulkan/softmax_vulkan.h
+++ b/src/layer/vulkan/softmax_vulkan.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class Softmax_vulkan : virtual public Softmax
+class Softmax_vulkan : public Softmax
 {
 public:
     Softmax_vulkan();
diff --git a/src/layer/vulkan/split_vulkan.h b/src/layer/vulkan/split_vulkan.h
index b5ace0cb2ce..8e1998a3a93 100644
--- a/src/layer/vulkan/split_vulkan.h
+++ b/src/layer/vulkan/split_vulkan.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class Split_vulkan : virtual public Split
+class Split_vulkan : public Split
 {
 public:
     Split_vulkan();
diff --git a/src/layer/vulkan/swish_vulkan.h b/src/layer/vulkan/swish_vulkan.h
index f8d7c9f7707..a562767cbba 100644
--- a/src/layer/vulkan/swish_vulkan.h
+++ b/src/layer/vulkan/swish_vulkan.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class Swish_vulkan : virtual public Swish
+class Swish_vulkan : public Swish
 {
 public:
     Swish_vulkan();
diff --git a/src/layer/vulkan/tanh_vulkan.h b/src/layer/vulkan/tanh_vulkan.h
index cccb2701483..1926363a0f8 100644
--- a/src/layer/vulkan/tanh_vulkan.h
+++ b/src/layer/vulkan/tanh_vulkan.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class TanH_vulkan : virtual public TanH
+class TanH_vulkan : public TanH
 {
 public:
     TanH_vulkan();
diff --git a/src/layer/vulkan/unaryop_vulkan.h b/src/layer/vulkan/unaryop_vulkan.h
index c1d99873889..bad5377f9b3 100644
--- a/src/layer/vulkan/unaryop_vulkan.h
+++ b/src/layer/vulkan/unaryop_vulkan.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class UnaryOp_vulkan : virtual public UnaryOp
+class UnaryOp_vulkan : public UnaryOp
 {
 public:
     UnaryOp_vulkan();
diff --git a/src/layer/x86/batchnorm_x86.h b/src/layer/x86/batchnorm_x86.h
index b991e313c3e..7168332a1b3 100644
--- a/src/layer/x86/batchnorm_x86.h
+++ b/src/layer/x86/batchnorm_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class BatchNorm_x86 : virtual public BatchNorm
+class BatchNorm_x86 : public BatchNorm
 {
 public:
     BatchNorm_x86();
diff --git a/src/layer/x86/bias_x86.h b/src/layer/x86/bias_x86.h
index 39d1bcef492..ab8e30de56d 100644
--- a/src/layer/x86/bias_x86.h
+++ b/src/layer/x86/bias_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class Bias_x86 : virtual public Bias
+class Bias_x86 : public Bias
 {
 public:
     virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
diff --git a/src/layer/x86/binaryop_x86.h b/src/layer/x86/binaryop_x86.h
index 9f3ebb3cac9..cd3ff12a989 100644
--- a/src/layer/x86/binaryop_x86.h
+++ b/src/layer/x86/binaryop_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class BinaryOp_x86 : virtual public BinaryOp
+class BinaryOp_x86 : public BinaryOp
 {
 public:
     BinaryOp_x86();
diff --git a/src/layer/x86/bnll_x86.h b/src/layer/x86/bnll_x86.h
index ac7536b75bf..b3fad45ca7d 100644
--- a/src/layer/x86/bnll_x86.h
+++ b/src/layer/x86/bnll_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class BNLL_x86 : virtual public BNLL
+class BNLL_x86 : public BNLL
 {
 public:
     BNLL_x86();
diff --git a/src/layer/x86/cast_x86.h b/src/layer/x86/cast_x86.h
index bd1ec503382..45b27a8c6ce 100644
--- a/src/layer/x86/cast_x86.h
+++ b/src/layer/x86/cast_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class Cast_x86 : virtual public Cast
+class Cast_x86 : public Cast
 {
 public:
     Cast_x86();
diff --git a/src/layer/x86/clip_x86.h b/src/layer/x86/clip_x86.h
index be026777f08..45a4058e90e 100644
--- a/src/layer/x86/clip_x86.h
+++ b/src/layer/x86/clip_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class Clip_x86 : virtual public Clip
+class Clip_x86 : public Clip
 {
 public:
     Clip_x86();
diff --git a/src/layer/x86/concat_x86.h b/src/layer/x86/concat_x86.h
index 054d4b784d9..28ff162dbdc 100644
--- a/src/layer/x86/concat_x86.h
+++ b/src/layer/x86/concat_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class Concat_x86 : virtual public Concat
+class Concat_x86 : public Concat
 {
 public:
     Concat_x86();
diff --git a/src/layer/x86/convolution1d_x86.h b/src/layer/x86/convolution1d_x86.h
index ec1782b7063..497b34e5962 100644
--- a/src/layer/x86/convolution1d_x86.h
+++ b/src/layer/x86/convolution1d_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class Convolution1D_x86 : virtual public Convolution1D
+class Convolution1D_x86 : public Convolution1D
 {
 public:
     Convolution1D_x86();
diff --git a/src/layer/x86/convolution_x86.h b/src/layer/x86/convolution_x86.h
index 44889ef5a3c..fdfa88f7374 100644
--- a/src/layer/x86/convolution_x86.h
+++ b/src/layer/x86/convolution_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class Convolution_x86 : virtual public Convolution
+class Convolution_x86 : public Convolution
 {
 public:
     Convolution_x86();
diff --git a/src/layer/x86/convolutiondepthwise_x86.h b/src/layer/x86/convolutiondepthwise_x86.h
index 6fe066e5bed..1fedb119bd3 100644
--- a/src/layer/x86/convolutiondepthwise_x86.h
+++ b/src/layer/x86/convolutiondepthwise_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class ConvolutionDepthWise_x86 : virtual public ConvolutionDepthWise
+class ConvolutionDepthWise_x86 : public ConvolutionDepthWise
 {
 public:
     ConvolutionDepthWise_x86();
diff --git a/src/layer/x86/crop_x86.h b/src/layer/x86/crop_x86.h
index e7e3d140fc5..ba0fc1b607e 100644
--- a/src/layer/x86/crop_x86.h
+++ b/src/layer/x86/crop_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class Crop_x86 : virtual public Crop
+class Crop_x86 : public Crop
 {
 public:
     Crop_x86();
diff --git a/src/layer/x86/deconvolution_x86.h b/src/layer/x86/deconvolution_x86.h
index 4951870bcd0..66c23eef3f3 100644
--- a/src/layer/x86/deconvolution_x86.h
+++ b/src/layer/x86/deconvolution_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class Deconvolution_x86 : virtual public Deconvolution
+class Deconvolution_x86 : public Deconvolution
 {
 public:
     Deconvolution_x86();
diff --git a/src/layer/x86/deconvolutiondepthwise_x86.h b/src/layer/x86/deconvolutiondepthwise_x86.h
index 07fb5e54f9b..9c9e54cccf4 100644
--- a/src/layer/x86/deconvolutiondepthwise_x86.h
+++ b/src/layer/x86/deconvolutiondepthwise_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class DeconvolutionDepthWise_x86 : virtual public DeconvolutionDepthWise
+class DeconvolutionDepthWise_x86 : public DeconvolutionDepthWise
 {
 public:
     DeconvolutionDepthWise_x86();
diff --git a/src/layer/x86/deformableconv2d_x86.h b/src/layer/x86/deformableconv2d_x86.h
index e5ab4e08c99..66cce21ab49 100644
--- a/src/layer/x86/deformableconv2d_x86.h
+++ b/src/layer/x86/deformableconv2d_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class DeformableConv2D_x86 : virtual public DeformableConv2D
+class DeformableConv2D_x86 : public DeformableConv2D
 {
 public:
     DeformableConv2D_x86();
diff --git a/src/layer/x86/dequantize_x86.h b/src/layer/x86/dequantize_x86.h
index 2d8a6a22b0a..52bfcaed22e 100644
--- a/src/layer/x86/dequantize_x86.h
+++ b/src/layer/x86/dequantize_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class Dequantize_x86 : virtual public Dequantize
+class Dequantize_x86 : public Dequantize
 {
 public:
     Dequantize_x86();
diff --git a/src/layer/x86/dropout_x86.h b/src/layer/x86/dropout_x86.h
index 959c9889e34..d44a8987162 100644
--- a/src/layer/x86/dropout_x86.h
+++ b/src/layer/x86/dropout_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class Dropout_x86 : virtual public Dropout
+class Dropout_x86 : public Dropout
 {
 public:
     Dropout_x86();
diff --git a/src/layer/x86/eltwise_x86.h b/src/layer/x86/eltwise_x86.h
index 0f4eac064e0..e941817a303 100644
--- a/src/layer/x86/eltwise_x86.h
+++ b/src/layer/x86/eltwise_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class Eltwise_x86 : virtual public Eltwise
+class Eltwise_x86 : public Eltwise
 {
 public:
     Eltwise_x86();
diff --git a/src/layer/x86/elu_x86.h b/src/layer/x86/elu_x86.h
index cd49c4f7d5a..6da00490d21 100644
--- a/src/layer/x86/elu_x86.h
+++ b/src/layer/x86/elu_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class ELU_x86 : virtual public ELU
+class ELU_x86 : public ELU
 {
 public:
     ELU_x86();
diff --git a/src/layer/x86/flatten_x86.h b/src/layer/x86/flatten_x86.h
index fcd512ae194..29820121695 100644
--- a/src/layer/x86/flatten_x86.h
+++ b/src/layer/x86/flatten_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class Flatten_x86 : virtual public Flatten
+class Flatten_x86 : public Flatten
 {
 public:
     Flatten_x86();
diff --git a/src/layer/x86/gelu_x86.h b/src/layer/x86/gelu_x86.h
index 75d821bfd45..ba4b43e65ec 100644
--- a/src/layer/x86/gelu_x86.h
+++ b/src/layer/x86/gelu_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class GELU_x86 : virtual public GELU
+class GELU_x86 : public GELU
 {
 public:
     GELU_x86();
diff --git a/src/layer/x86/gemm_x86.h b/src/layer/x86/gemm_x86.h
index ef14872d76e..6f8eb4a82bf 100644
--- a/src/layer/x86/gemm_x86.h
+++ b/src/layer/x86/gemm_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class Gemm_x86 : virtual public Gemm
+class Gemm_x86 : public Gemm
 {
 public:
     Gemm_x86();
diff --git a/src/layer/x86/gridsample_x86.h b/src/layer/x86/gridsample_x86.h
index 826414eefc9..caf7c7c50c3 100644
--- a/src/layer/x86/gridsample_x86.h
+++ b/src/layer/x86/gridsample_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class GridSample_x86 : virtual public GridSample
+class GridSample_x86 : public GridSample
 {
 public:
     GridSample_x86();
diff --git a/src/layer/x86/groupnorm_x86.h b/src/layer/x86/groupnorm_x86.h
index c3085e3622e..151884e5455 100644
--- a/src/layer/x86/groupnorm_x86.h
+++ b/src/layer/x86/groupnorm_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class GroupNorm_x86 : virtual public GroupNorm
+class GroupNorm_x86 : public GroupNorm
 {
 public:
     GroupNorm_x86();
diff --git a/src/layer/x86/hardsigmoid_x86.h b/src/layer/x86/hardsigmoid_x86.h
index b111608bb87..418a8dc941f 100644
--- a/src/layer/x86/hardsigmoid_x86.h
+++ b/src/layer/x86/hardsigmoid_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class HardSigmoid_x86 : virtual public HardSigmoid
+class HardSigmoid_x86 : public HardSigmoid
 {
 public:
     HardSigmoid_x86();
diff --git a/src/layer/x86/hardswish_x86.h b/src/layer/x86/hardswish_x86.h
index 37fd42a513c..4fe521ea47d 100644
--- a/src/layer/x86/hardswish_x86.h
+++ b/src/layer/x86/hardswish_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class HardSwish_x86 : virtual public HardSwish
+class HardSwish_x86 : public HardSwish
 {
 public:
     HardSwish_x86();
diff --git a/src/layer/x86/innerproduct_x86.h b/src/layer/x86/innerproduct_x86.h
index 211131e6132..19da245f32f 100644
--- a/src/layer/x86/innerproduct_x86.h
+++ b/src/layer/x86/innerproduct_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class InnerProduct_x86 : virtual public InnerProduct
+class InnerProduct_x86 : public InnerProduct
 {
 public:
     InnerProduct_x86();
diff --git a/src/layer/x86/interp_x86.h b/src/layer/x86/interp_x86.h
index 6f91b950ef5..46fcde6f221 100644
--- a/src/layer/x86/interp_x86.h
+++ b/src/layer/x86/interp_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class Interp_x86 : virtual public Interp
+class Interp_x86 : public Interp
 {
 public:
     Interp_x86();
diff --git a/src/layer/x86/layernorm_x86.h b/src/layer/x86/layernorm_x86.h
index 42eb551ed95..7e8ec05894c 100644
--- a/src/layer/x86/layernorm_x86.h
+++ b/src/layer/x86/layernorm_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class LayerNorm_x86 : virtual public LayerNorm
+class LayerNorm_x86 : public LayerNorm
 {
 public:
     LayerNorm_x86();
diff --git a/src/layer/x86/lrn_x86.h b/src/layer/x86/lrn_x86.h
index 3fe791872c6..9aa85367cda 100644
--- a/src/layer/x86/lrn_x86.h
+++ b/src/layer/x86/lrn_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class LRN_x86 : virtual public LRN
+class LRN_x86 : public LRN
 {
 public:
     virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
diff --git a/src/layer/x86/lstm_x86.h b/src/layer/x86/lstm_x86.h
index cab7d7e32fa..1dc56d45e03 100644
--- a/src/layer/x86/lstm_x86.h
+++ b/src/layer/x86/lstm_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class LSTM_x86 : virtual public LSTM
+class LSTM_x86 : public LSTM
 {
 public:
     LSTM_x86();
diff --git a/src/layer/x86/matmul_x86.h b/src/layer/x86/matmul_x86.h
index 12311e7a94d..afbb85a7883 100644
--- a/src/layer/x86/matmul_x86.h
+++ b/src/layer/x86/matmul_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class MatMul_x86 : virtual public MatMul
+class MatMul_x86 : public MatMul
 {
 public:
     MatMul_x86();
diff --git a/src/layer/x86/mish_x86.h b/src/layer/x86/mish_x86.h
index fe625e2ca37..dce8823c6f5 100644
--- a/src/layer/x86/mish_x86.h
+++ b/src/layer/x86/mish_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class Mish_x86 : virtual public Mish
+class Mish_x86 : public Mish
 {
 public:
     Mish_x86();
diff --git a/src/layer/x86/multiheadattention_x86.h b/src/layer/x86/multiheadattention_x86.h
index a19a18001f5..55ea41780dd 100644
--- a/src/layer/x86/multiheadattention_x86.h
+++ b/src/layer/x86/multiheadattention_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class MultiHeadAttention_x86 : virtual public MultiHeadAttention
+class MultiHeadAttention_x86 : public MultiHeadAttention
 {
 public:
     MultiHeadAttention_x86();
diff --git a/src/layer/x86/packing_x86.h b/src/layer/x86/packing_x86.h
index a00e74a4411..9f8f368039d 100644
--- a/src/layer/x86/packing_x86.h
+++ b/src/layer/x86/packing_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class Packing_x86 : virtual public Packing
+class Packing_x86 : public Packing
 {
 public:
     Packing_x86();
diff --git a/src/layer/x86/padding_x86.h b/src/layer/x86/padding_x86.h
index f01a4a19757..8772fe30eed 100644
--- a/src/layer/x86/padding_x86.h
+++ b/src/layer/x86/padding_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class Padding_x86 : virtual public Padding
+class Padding_x86 : public Padding
 {
 public:
     Padding_x86();
diff --git a/src/layer/x86/pooling_x86.h b/src/layer/x86/pooling_x86.h
index b79685c1840..030964fcb4d 100644
--- a/src/layer/x86/pooling_x86.h
+++ b/src/layer/x86/pooling_x86.h
@@ -22,7 +22,7 @@
 
 namespace ncnn {
 
-class Pooling_x86 : virtual public Pooling
+class Pooling_x86 : public Pooling
 {
 public:
     Pooling_x86();
diff --git a/src/layer/x86/prelu_x86.h b/src/layer/x86/prelu_x86.h
index 6bbfeae0f0d..17d60d4b297 100644
--- a/src/layer/x86/prelu_x86.h
+++ b/src/layer/x86/prelu_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class PReLU_x86 : virtual public PReLU
+class PReLU_x86 : public PReLU
 {
 public:
     PReLU_x86();
diff --git a/src/layer/x86/quantize_x86.h b/src/layer/x86/quantize_x86.h
index 6fb2d41d662..5c743fe4cff 100644
--- a/src/layer/x86/quantize_x86.h
+++ b/src/layer/x86/quantize_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class Quantize_x86 : virtual public Quantize
+class Quantize_x86 : public Quantize
 {
 public:
     Quantize_x86();
diff --git a/src/layer/x86/relu_x86.h b/src/layer/x86/relu_x86.h
index 6d3cce1c5d8..9d0b5966f53 100644
--- a/src/layer/x86/relu_x86.h
+++ b/src/layer/x86/relu_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class ReLU_x86 : virtual public ReLU
+class ReLU_x86 : public ReLU
 {
 public:
     ReLU_x86();
diff --git a/src/layer/x86/requantize_x86.h b/src/layer/x86/requantize_x86.h
index 02b6880f0e9..febc418654f 100644
--- a/src/layer/x86/requantize_x86.h
+++ b/src/layer/x86/requantize_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class Requantize_x86 : virtual public Requantize
+class Requantize_x86 : public Requantize
 {
 public:
     Requantize_x86();
diff --git a/src/layer/x86/reshape_x86.h b/src/layer/x86/reshape_x86.h
index a29b91c1b50..56c8ddfb357 100644
--- a/src/layer/x86/reshape_x86.h
+++ b/src/layer/x86/reshape_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class Reshape_x86 : virtual public Reshape
+class Reshape_x86 : public Reshape
 {
 public:
     Reshape_x86();
diff --git a/src/layer/x86/roialign_x86.h b/src/layer/x86/roialign_x86.h
index f1c4ff912b3..1b91c1a8cbe 100644
--- a/src/layer/x86/roialign_x86.h
+++ b/src/layer/x86/roialign_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class ROIAlign_x86 : virtual public ROIAlign
+class ROIAlign_x86 : public ROIAlign
 {
 public:
     ROIAlign_x86();
diff --git a/src/layer/x86/scale_x86.h b/src/layer/x86/scale_x86.h
index 840e6903c33..f06cf414688 100644
--- a/src/layer/x86/scale_x86.h
+++ b/src/layer/x86/scale_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class Scale_x86 : virtual public Scale
+class Scale_x86 : public Scale
 {
 public:
     Scale_x86();
diff --git a/src/layer/x86/selu_x86.h b/src/layer/x86/selu_x86.h
index d7b5bf8a87e..7f4a78f80ed 100644
--- a/src/layer/x86/selu_x86.h
+++ b/src/layer/x86/selu_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class SELU_x86 : virtual public SELU
+class SELU_x86 : public SELU
 {
 public:
     SELU_x86();
diff --git a/src/layer/x86/shufflechannel_x86.h b/src/layer/x86/shufflechannel_x86.h
index 6adca483c17..1e4328a2560 100644
--- a/src/layer/x86/shufflechannel_x86.h
+++ b/src/layer/x86/shufflechannel_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class ShuffleChannel_x86 : virtual public ShuffleChannel
+class ShuffleChannel_x86 : public ShuffleChannel
 {
 public:
     ShuffleChannel_x86();
diff --git a/src/layer/x86/sigmoid_x86.h b/src/layer/x86/sigmoid_x86.h
index 05ea2c40f11..52bf85d9eaf 100644
--- a/src/layer/x86/sigmoid_x86.h
+++ b/src/layer/x86/sigmoid_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class Sigmoid_x86 : virtual public Sigmoid
+class Sigmoid_x86 : public Sigmoid
 {
 public:
     Sigmoid_x86();
diff --git a/src/layer/x86/slice_x86.h b/src/layer/x86/slice_x86.h
index fd6fbf9a1b7..0c9b266f84d 100644
--- a/src/layer/x86/slice_x86.h
+++ b/src/layer/x86/slice_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class Slice_x86 : virtual public Slice
+class Slice_x86 : public Slice
 {
 public:
     Slice_x86();
diff --git a/src/layer/x86/softmax_x86.h b/src/layer/x86/softmax_x86.h
index c899dcd1cc8..3d1b733a9ec 100644
--- a/src/layer/x86/softmax_x86.h
+++ b/src/layer/x86/softmax_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class Softmax_x86 : virtual public Softmax
+class Softmax_x86 : public Softmax
 {
 public:
     Softmax_x86();
diff --git a/src/layer/x86/swish_x86.h b/src/layer/x86/swish_x86.h
index 03c6d5e4b30..76b7c3d83f6 100644
--- a/src/layer/x86/swish_x86.h
+++ b/src/layer/x86/swish_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class Swish_x86 : virtual public Swish
+class Swish_x86 : public Swish
 {
 public:
     Swish_x86();
diff --git a/src/layer/x86/tanh_x86.h b/src/layer/x86/tanh_x86.h
index 60913d49c7b..e4c4477bc56 100644
--- a/src/layer/x86/tanh_x86.h
+++ b/src/layer/x86/tanh_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class TanH_x86 : virtual public TanH
+class TanH_x86 : public TanH
 {
 public:
     TanH_x86();
diff --git a/src/layer/x86/unaryop_x86.h b/src/layer/x86/unaryop_x86.h
index 8e8f6c4d2de..0e4a7ff59e1 100644
--- a/src/layer/x86/unaryop_x86.h
+++ b/src/layer/x86/unaryop_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class UnaryOp_x86 : virtual public UnaryOp
+class UnaryOp_x86 : public UnaryOp
 {
 public:
     UnaryOp_x86();
diff --git a/src/layer/x86/yolov3detectionoutput_x86.h b/src/layer/x86/yolov3detectionoutput_x86.h
index ef93d4647f8..c378b5827b7 100644
--- a/src/layer/x86/yolov3detectionoutput_x86.h
+++ b/src/layer/x86/yolov3detectionoutput_x86.h
@@ -19,7 +19,7 @@
 
 namespace ncnn {
 
-class Yolov3DetectionOutput_x86 : virtual public Yolov3DetectionOutput
+class Yolov3DetectionOutput_x86 : public Yolov3DetectionOutput
 {
 public:
     Yolov3DetectionOutput_x86();
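Note on the repeated `virtual public` -> `public` edit above: it only becomes safe with the decoupling this series introduces. Previously, each optimized backend class was mixed into one generated most-derived `*_final` class alongside other backends deriving from the same base layer, a diamond that requires virtual inheritance to keep a single shared base subobject. Once each backend class is registered and created directly, no diamond remains, so plain single inheritance suffices and the vbase indirection goes away. A minimal sketch of the two shapes (illustrative names, not code from the patch):

    // Before: several backends over one base, combined into one object.
    // 'virtual' makes CpuPath and GpuPath share a single Layer subobject.
    struct Layer { int one_blob_only; };
    struct CpuPath : virtual Layer { };
    struct GpuPath : virtual Layer { };
    struct Combined : CpuPath, GpuPath { }; // exactly one Layer inside

    // After: each backend is instantiated on its own, no diamond remains,
    // so ordinary non-virtual inheritance is enough.
    struct GpuOnly : Layer { };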
diff --git a/src/layer_registry.h.in b/src/layer_registry.h.in
index 4b6398d0ced..dfe8e73ce79 100644
--- a/src/layer_registry.h.in
+++ b/src/layer_registry.h.in
@@ -6,6 +6,10 @@ static const layer_registry_entry layer_registry[] = {
 @layer_registry@
 };
 
+static const layer_registry_entry layer_registry_arch[] = {
+@layer_registry_arch@
+};
+
 #if NCNN_RUNTIME_CPU && NCNN_AVX512
 static const layer_registry_entry layer_registry_avx512[] = {
 @layer_registry_avx512@
diff --git a/tests/testutil.h b/tests/testutil.h
index c5d1ca05d80..1215b59142c 100644
--- a/tests/testutil.h
+++ b/tests/testutil.h
@@ -325,7 +325,7 @@ static int CompareMat(const std::vector<ncnn::Mat>& a, const std::vector<ncnn::Mat>& b, float epsilon = 0.001)
 template<typename T>
 int test_layer_naive(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const std::vector<ncnn::Mat>& a, int top_blob_count, std::vector<ncnn::Mat>& b, void (*func)(T*), int flag)
 {
-    ncnn::Layer* op = ncnn::create_layer(typeindex);
+    ncnn::Layer* op = ncnn::create_layer_naive(typeindex);
 
     if (func)
     {
@@ -385,7 +385,7 @@ int test_layer_naive(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const std::vector<ncnn::Mat>& a, int top_blob_count, std::vector<ncnn::Mat>& b, void (*func)(T*), int flag)
 template<typename T>
 int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& _opt, const std::vector<ncnn::Mat>& a, int top_blob_count, std::vector<ncnn::Mat>& c, const std::vector<ncnn::Mat>& top_shapes, void (*func)(T*), int flag)
 {
-    ncnn::Layer* op = ncnn::create_layer(typeindex);
+    ncnn::Layer* op = ncnn::create_layer_cpu(typeindex);
 
     if (!op->support_packing && _opt.use_packing_layout)
     {
@@ -626,7 +626,7 @@ int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& _opt, const std::vector<ncnn::Mat>& a, int top_blob_count, std::vector<ncnn::Mat>& d, const std::vector<ncnn::Mat>& top_shapes, void (*func)(T*), int flag)
 {
-    ncnn::Layer* op = ncnn::create_layer(typeindex);
+    ncnn::Layer* op = ncnn::create_layer_vulkan(typeindex);
 
     if (!op->support_vulkan)
     {
@@ -855,7 +855,7 @@ int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const std::vector<ncnn::Mat>& a, int top_blob_count, float epsilon, void (*func)(T*), int flag)
 template<typename T>
 int test_layer_naive(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Mat& a, ncnn::Mat& b, void (*func)(T*), int flag)
 {
-    ncnn::Layer* op = ncnn::create_layer(typeindex);
+    ncnn::Layer* op = ncnn::create_layer_naive(typeindex);
 
     if (func)
     {
@@ -902,7 +902,7 @@ int test_layer_naive(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Mat& a, ncnn::Mat& b, void (*func)(T*), int flag)
 template<typename T>
 int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& _opt, const ncnn::Mat& a, ncnn::Mat& c, const ncnn::Mat& top_shape, void (*func)(T*), int flag)
 {
-    ncnn::Layer* op = ncnn::create_layer(typeindex);
+    ncnn::Layer* op = ncnn::create_layer_cpu(typeindex);
 
     if (!op->support_packing && _opt.use_packing_layout)
     {
@@ -1126,7 +1126,7 @@ int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& _opt, const ncnn::Mat& a, ncnn::Mat& d, const ncnn::Mat& top_shape, void (*func)(T*), int flag)
 {
-    ncnn::Layer* op = ncnn::create_layer(typeindex);
+    ncnn::Layer* op = ncnn::create_layer_vulkan(typeindex);
 
     if (!op->support_vulkan)
     {
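The testutil.h hunks above replace the single `create_layer(typeindex)` factory with per-path factories, so each test stage constructs exactly the implementation it means to exercise: `create_layer_naive` for the reference path, `create_layer_cpu` for the arch-optimized path, and (by the same pattern, partly reconstructed here from a garbled span) a vulkan factory for the gpu path. A rough sketch of the resulting shape, with param/model loading, pipeline setup and the CompareMat call elided (this is an illustration, not code from the patch):

    // Sketch: compare the reference path against the optimized cpu path.
    // 'typeindex' is whatever layer the test targets.
    ncnn::Layer* op_naive = ncnn::create_layer_naive(typeindex); // plain reference class
    ncnn::Layer* op_cpu = ncnn::create_layer_cpu(typeindex);     // arch-optimized class
    // ... run both forwards on the same input and compare the outputs ...
    delete op_naive;
    delete op_cpu;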
From cd3af0129109b934b7ff4ee4f4ec43cd166b5784 Mon Sep 17 00:00:00 2001
From: nihuini
Date: Wed, 3 Jan 2024 15:24:14 +0800
Subject: [PATCH 03/19] wip

---
 tests/CMakeLists.txt | 4 +-
 tests/test_absval.cpp | 3 +-
 tests/test_batchnorm.cpp | 3 +-
 tests/test_bias.cpp | 3 +-
 tests/test_binaryop.cpp | 5 +-
 tests/test_binaryop_1.cpp | 5 +-
 tests/test_binaryop_2.cpp | 5 +-
 tests/test_binaryop_3.cpp | 5 +-
 tests/test_bnll.cpp | 3 +-
 tests/test_cast.cpp | 5 +-
 tests/test_celu.cpp | 3 +-
 tests/test_clip.cpp | 3 +-
 tests/test_concat.cpp | 3 +-
 tests/test_convolution.cpp | 9 ++--
 tests/test_convolution1d.cpp | 5 +-
 tests/test_convolution3d.cpp | 3 +-
 tests/test_convolution_1.cpp | 9 ++--
 tests/test_convolution_2.cpp | 9 ++--
 tests/test_convolution_3.cpp | 15 +++---
 tests/test_convolutiondepthwise.cpp | 3 +-
 tests/test_convolutiondepthwise1d.cpp | 5 +-
 tests/test_convolutiondepthwise3d.cpp | 3 +-
 tests/test_convolutiondepthwise_1.cpp | 5 +-
 tests/test_copyto.cpp | 3 +-
 tests/test_copyto_1.cpp | 3 +-
 tests/test_crop.cpp | 3 +-
 tests/test_crop_1.cpp | 3 +-
 tests/test_crop_2.cpp | 3 +-
 tests/test_cumulativesum.cpp | 3 +-
 tests/test_deconvolution.cpp | 9 ++--
 tests/test_deconvolution1d.cpp | 5 +-
 tests/test_deconvolution3d.cpp | 3 +-
 tests/test_deconvolutiondepthwise.cpp | 3 +-
 tests/test_deconvolutiondepthwise1d.cpp | 5 +-
 tests/test_deconvolutiondepthwise3d.cpp | 3 +-
 tests/test_deconvolutiondepthwise_1.cpp | 3 +-
 tests/test_deepcopy.cpp | 3 +-
 tests/test_deformableconv2d.cpp | 7 ++-
 tests/test_deformableconv2d_1.cpp | 7 ++-
 tests/test_deformableconv2d_2.cpp | 7 ++-
 tests/test_deformableconv2d_3.cpp | 7 ++-
 tests/test_deformableconv2d_4.cpp | 3 +-
 tests/test_dequantize.cpp | 5 +-
 tests/test_diag.cpp | 3 +-
 tests/test_dropout.cpp | 3 +-
 tests/test_einsum.cpp | 3 +-
 tests/test_eltwise.cpp | 3 +-
 tests/test_elu.cpp | 3 +-
 tests/test_erf.cpp | 3 +-
 tests/test_expanddims.cpp | 5 +-
 tests/test_flatten.cpp | 5 +-
 tests/test_fold.cpp | 3 +-
 tests/test_gelu.cpp | 3 +-
 tests/test_gemm.cpp | 5 +-
 tests/test_gemm_1.cpp | 3 +-
 tests/test_glu.cpp | 3 +-
 tests/test_gridsample.cpp | 3 +-
 tests/test_groupnorm.cpp | 3 +-
 tests/test_gru.cpp | 9 ++--
 tests/test_hardsigmoid.cpp | 3 +-
 tests/test_hardswish.cpp | 3 +-
 tests/test_innerproduct.cpp | 9 ++--
 tests/test_instancenorm.cpp | 3 +-
 tests/test_interp.cpp | 13 +++--
 tests/test_layernorm.cpp | 3 +-
 tests/test_lrn.cpp | 3 +-
 tests/test_lstm.cpp | 9 ++--
 tests/test_matmul.cpp | 5 +-
 tests/test_memorydata.cpp | 3 +-
 tests/test_mish.cpp | 3 +-
 tests/test_multiheadattention.cpp | 7 ++-
 tests/test_noop.cpp | 3 +-
 tests/test_normalize.cpp | 3 +-
 tests/test_packing.cpp | 2 -
 tests/test_padding.cpp | 5 +-
 tests/test_permute.cpp | 3 +-
 tests/test_pixelshuffle.cpp | 3 +-
 tests/test_pooling.cpp | 3 +-
 tests/test_pooling1d.cpp | 3 +-
 tests/test_pooling3d.cpp | 3 +-
 tests/test_power.cpp | 3 +-
 tests/test_prelu.cpp | 3 +-
 tests/test_priorbox.cpp | 5 +-
 tests/test_quantize.cpp | 3 +-
 tests/test_reduction.cpp | 5 +-
 tests/test_relu.cpp | 3 +-
 tests/test_reorg.cpp | 3 +-
 tests/test_requantize.cpp | 5 +-
 tests/test_reshape.cpp | 3 +-
 tests/test_reshape_1.cpp | 3 +-
 tests/test_rnn.cpp | 9 ++--
 tests/test_roialign.cpp | 3 +-
 tests/test_roipooling.cpp | 3 +-
 tests/test_scale.cpp | 5 +-
 tests/test_selu.cpp | 3 +-
 tests/test_shrink.cpp | 3 +-
 tests/test_shufflechannel.cpp | 3 +-
 tests/test_sigmoid.cpp | 3 +-
 tests/test_slice.cpp | 5 +-
 tests/test_softmax.cpp | 3 +-
 tests/test_softplus.cpp | 3 +-
 tests/test_squeeze.cpp | 5 +-
 tests/test_swish.cpp | 3 +-
 tests/test_tanh.cpp | 3 +-
 tests/test_tile.cpp | 5 +-
 tests/test_unaryop.cpp | 3 +-
 tests/test_unfold.cpp | 3 +-
 tests/test_yolov3detectionoutput.cpp | 3 +-
 tests/testutil.h | 64 ++++++++++---------
 109 files changed, 202 insertions(+), 322 deletions(-)

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index bef56d44a58..2dff6c38692 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -71,7 +71,7 @@ ncnn_add_layer_test(BatchNorm)
 ncnn_add_layer_test(Bias)
 ncnn_add_layer_test(BinaryOp)
 ncnn_add_layer_test(BNLL)
-ncnn_add_layer_test(Cast)
+# ncnn_add_layer_test(Cast)
 ncnn_add_layer_test(CELU)
 ncnn_add_layer_test(Clip)
 ncnn_add_layer_test(Concat)
@@ -122,7 +122,7 @@ ncnn_add_layer_test(Mish)
 ncnn_add_layer_test(MultiHeadAttention)
 ncnn_add_layer_test(Noop)
 ncnn_add_layer_test(Normalize)
-ncnn_add_layer_test(Packing)
+# ncnn_add_layer_test(Packing)
 ncnn_add_layer_test(Padding)
 ncnn_add_layer_test(Permute)
 ncnn_add_layer_test(PixelShuffle)
diff --git a/tests/test_absval.cpp b/tests/test_absval.cpp
index 6312a3b01be..e931606be13 100644
--- a/tests/test_absval.cpp
+++ b/tests/test_absval.cpp
@@ -12,7 +12,6 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-#include "layer/absval.h"
 #include "testutil.h"
 
 static int test_absval(const ncnn::Mat& a)
@@ -21,7 +20,7 @@
 
     std::vector<ncnn::Mat> weights(0);
 
-    int ret = test_layer<ncnn::AbsVal>("AbsVal", pd, weights, a);
+    int ret = test_layer("AbsVal", pd, weights, a);
     if (ret != 0)
     {
         fprintf(stderr, "test_absval failed a.dims=%d a=(%d %d %d)\n", a.dims, a.w, a.h, a.c);
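From here on, every test file gets the same two-part treatment shown for test_absval.cpp: the concrete `#include "layer/xxx.h"` goes away, and the explicit template instantiation of test_layer is dropped, so tests resolve layers purely by their registered name instead of linking against the concrete class. Condensed before/after, taken from the diff above:

    // Before: the test named the concrete class, pinning the binary to it.
    // int ret = test_layer<ncnn::AbsVal>("AbsVal", pd, weights, a);

    // After: resolved at runtime through the layer registry, by name only.
    int ret = test_layer("AbsVal", pd, weights, a);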
diff --git a/tests/test_batchnorm.cpp b/tests/test_batchnorm.cpp
index 8e86e5b48da..a977a33640d 100644
--- a/tests/test_batchnorm.cpp
+++ b/tests/test_batchnorm.cpp
@@ -12,7 +12,6 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-#include "layer/batchnorm.h"
 #include "testutil.h"
 
 static int test_batchnorm(const ncnn::Mat& a, float eps)
@@ -35,7 +34,7 @@
     // var must be positive
     Randomize(weights[2], 0.001f, 2.f);
 
-    int ret = test_layer<ncnn::BatchNorm>("BatchNorm", pd, weights, a);
+    int ret = test_layer("BatchNorm", pd, weights, a);
     if (ret != 0)
     {
         fprintf(stderr, "test_batchnorm failed a.dims=%d a=(%d %d %d %d) eps=%f\n", a.dims, a.w, a.h, a.d, a.c, eps);
diff --git a/tests/test_bias.cpp b/tests/test_bias.cpp
index fe71820968f..d522a950bf7 100644
--- a/tests/test_bias.cpp
+++ b/tests/test_bias.cpp
@@ -12,7 +12,6 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-#include "layer/bias.h"
 #include "testutil.h"
 
 static int test_bias(const ncnn::Mat& a)
@@ -25,7 +24,7 @@
 
     std::vector<ncnn::Mat> weights(1);
     weights[0] = RandomMat(channels);
 
-    int ret = test_layer<ncnn::Bias>("Bias", pd, weights, a);
+    int ret = test_layer("Bias", pd, weights, a);
     if (ret != 0)
     {
         fprintf(stderr, "test_bias failed a.dims=%d a=(%d %d %d %d)\n", a.dims, a.w, a.h, a.d, a.c);
diff --git a/tests/test_binaryop.cpp b/tests/test_binaryop.cpp
index 89f953eaccb..4a0552d8b7a 100644
--- a/tests/test_binaryop.cpp
+++ b/tests/test_binaryop.cpp
@@ -12,7 +12,6 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-#include "layer/binaryop.h"
 #include "testutil.h"
 
 #define OP_TYPE_MAX 12
@@ -67,7 +66,7 @@ static int test_binaryop(const ncnn::Mat& _a, const ncnn::Mat& _b, int flag)
     ab[0] = a;
     ab[1] = b;
 
-    int ret = test_layer<ncnn::BinaryOp>("BinaryOp", pd, weights, ab, 1, 0.001, 0, flag);
+    int ret = test_layer("BinaryOp", pd, weights, ab, 1, 0.001, 0, flag);
     if (ret != 0)
     {
         fprintf(stderr, "test_binaryop failed a.dims=%d a=(%d %d %d %d) b.dims=%d b=(%d %d %d %d) op_type=%d\n", a.dims, a.w, a.h, a.d, a.c, b.dims, b.w, b.h, b.d, b.c, op_type);
@@ -109,7 +108,7 @@ static int test_binaryop(const ncnn::Mat& _a, float b, int flag)
 
     std::vector<ncnn::Mat> weights(0);
 
-    int ret = test_layer<ncnn::BinaryOp>("BinaryOp", pd, weights, a, 0.001, 0, flag);
+    int ret = test_layer("BinaryOp", pd, weights, a, 0.001, 0, flag);
     if (ret != 0)
     {
         fprintf(stderr, "test_binaryop failed a.dims=%d a=(%d %d %d %d) b=%f op_type=%d\n", a.dims, a.w, a.h, a.d, a.c, b, op_type);
diff --git a/tests/test_binaryop_1.cpp b/tests/test_binaryop_1.cpp
index d6b20ede1a8..d899932291f 100644
--- a/tests/test_binaryop_1.cpp
+++ b/tests/test_binaryop_1.cpp
@@ -12,7 +12,6 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-#include "layer/binaryop.h"
 #include "testutil.h"
 
 #define OP_TYPE_MAX 12
@@ -67,7 +66,7 @@ static int test_binaryop(const ncnn::Mat& _a, const ncnn::Mat& _b, int flag)
     ab[0] = a;
     ab[1] = b;
 
-    int ret = test_layer<ncnn::BinaryOp>("BinaryOp", pd, weights, ab, 1, 0.001, 0, flag);
+    int ret = test_layer("BinaryOp", pd, weights, ab, 1, 0.001, 0, flag);
     if (ret != 0)
     {
         fprintf(stderr, "test_binaryop failed a.dims=%d a=(%d %d %d %d) b.dims=%d b=(%d %d %d %d) op_type=%d\n", a.dims, a.w, a.h, a.d, a.c, b.dims, b.w, b.h, b.d, b.c, op_type);
@@ -109,7 +108,7 @@ static int test_binaryop(const ncnn::Mat& _a, float b, int flag)
 
     std::vector<ncnn::Mat> weights(0);
 
-    int ret = test_layer<ncnn::BinaryOp>("BinaryOp", pd, weights, a, 0.001, 0, flag);
+    int ret = test_layer("BinaryOp", pd, weights, a, 0.001, 0, flag);
     if (ret != 0)
     {
         fprintf(stderr, "test_binaryop failed a.dims=%d a=(%d %d %d %d) b=%f op_type=%d\n", a.dims, a.w, a.h, a.d, a.c, b, op_type);
diff --git a/tests/test_binaryop_2.cpp b/tests/test_binaryop_2.cpp
index 14c5e7d3dac..3427f7af248 100644
--- a/tests/test_binaryop_2.cpp
+++ b/tests/test_binaryop_2.cpp
@@ -12,7 +12,6 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-#include "layer/binaryop.h"
 #include "testutil.h"
 
 #define OP_TYPE_MAX 12
@@ -67,7 +66,7 @@ static int test_binaryop(const ncnn::Mat& _a, const ncnn::Mat& _b, int flag)
     ab[0] = a;
     ab[1] = b;
 
-    int ret = test_layer<ncnn::BinaryOp>("BinaryOp", pd, weights, ab, 1, 0.001, 0, flag);
+    int ret = test_layer("BinaryOp", pd, weights, ab, 1, 0.001, 0, flag);
     if (ret != 0)
     {
         fprintf(stderr, "test_binaryop failed a.dims=%d a=(%d %d %d %d) b.dims=%d b=(%d %d %d %d) op_type=%d\n", a.dims, a.w, a.h, a.d, a.c, b.dims, b.w, b.h, b.d, b.c, op_type);
@@ -109,7 +108,7 @@ static int test_binaryop(const ncnn::Mat& _a, float b, int flag)
 
     std::vector<ncnn::Mat> weights(0);
 
-    int ret = test_layer<ncnn::BinaryOp>("BinaryOp", pd, weights, a, 0.001, 0, flag);
+    int ret = test_layer("BinaryOp", pd, weights, a, 0.001, 0, flag);
     if (ret != 0)
     {
         fprintf(stderr, "test_binaryop failed a.dims=%d a=(%d %d %d %d) b=%f op_type=%d\n", a.dims, a.w, a.h, a.d, a.c, b, op_type);
diff --git a/tests/test_binaryop_3.cpp b/tests/test_binaryop_3.cpp
index 655c2a3ce91..7509fc6aae2 100644
--- a/tests/test_binaryop_3.cpp
+++ b/tests/test_binaryop_3.cpp
@@ -12,7 +12,6 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-#include "layer/binaryop.h"
 #include "testutil.h"
 
 #define OP_TYPE_MAX 12
@@ -67,7 +66,7 @@ static int test_binaryop(const ncnn::Mat& _a, const ncnn::Mat& _b, int flag)
     ab[0] = a;
     ab[1] = b;
 
-    int ret = test_layer<ncnn::BinaryOp>("BinaryOp", pd, weights, ab, 1, 0.001, 0, flag);
+    int ret = test_layer("BinaryOp", pd, weights, ab, 1, 0.001, 0, flag);
     if (ret != 0)
     {
         fprintf(stderr, "test_binaryop failed a.dims=%d a=(%d %d %d %d) b.dims=%d b=(%d %d %d %d) op_type=%d\n", a.dims, a.w, a.h, a.d, a.c, b.dims, b.w, b.h, b.d, b.c, op_type);
@@ -109,7 +108,7 @@ static int test_binaryop(const ncnn::Mat& _a, float b, int flag)
 
     std::vector<ncnn::Mat> weights(0);
 
-    int ret = test_layer<ncnn::BinaryOp>("BinaryOp", pd, weights, a, 0.001, 0, flag);
+    int ret = test_layer("BinaryOp", pd, weights, a, 0.001, 0, flag);
     if (ret != 0)
     {
         fprintf(stderr, "test_binaryop failed a.dims=%d a=(%d %d %d %d) b=%f op_type=%d\n", a.dims, a.w, a.h, a.d, a.c, b, op_type);
diff --git a/tests/test_bnll.cpp b/tests/test_bnll.cpp
index 2bb35376200..0b22fda679a 100644
--- a/tests/test_bnll.cpp
+++ b/tests/test_bnll.cpp
@@ -12,7 +12,6 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-#include "layer/bnll.h"
 #include "testutil.h"
 
 static int test_bnll(const ncnn::Mat& a)
@@ -21,7 +20,7 @@
 
     std::vector<ncnn::Mat> weights(0);
 
-    int ret = test_layer<ncnn::BNLL>("BNLL", pd, weights, a);
+    int ret = test_layer("BNLL", pd, weights, a);
     if (ret != 0)
     {
         fprintf(stderr, "test_bnll failed a.dims=%d a=(%d %d %d)\n", a.dims, a.w, a.h, a.c);
diff --git a/tests/test_cast.cpp b/tests/test_cast.cpp
index 0470c4e1ab0..fb1f9399bd4 100644
--- a/tests/test_cast.cpp
+++ b/tests/test_cast.cpp
@@ -12,7 +12,6 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-#include "layer/cast.h"
 #include "testutil.h"
 
 static int cast_cpu_naive(const ncnn::Mat& a, ncnn::Mat& b, int type_from, int type_to)
@@ -29,7 +28,7 @@
     opt.use_int8_inference = false;
     opt.use_packing_layout = false;
 
-    ncnn::Layer* op = ncnn::create_layer("Cast");
+    ncnn::Layer* op = ncnn::create_layer_naive("Cast");
 
     op->load_param(pd);
@@ -39,7 +38,7 @@
 
     op->create_pipeline(opt);
 
-    ((ncnn::Cast*)op)->ncnn::Cast::forward(a, b, opt);
+    op->forward(a, b, opt);
 
     op->destroy_pipeline(opt);
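The test_cast.cpp hunk above also retires a qualified-call trick: `((ncnn::Cast*)op)->ncnn::Cast::forward(...)` bypassed virtual dispatch to guarantee the reference implementation ran even if the factory handed back an optimized subclass. Since `create_layer_naive("Cast")` now returns the plain reference class, an ordinary virtual call lands in the same code. Condensed from the diff:

    // Old: force Cast::forward regardless of the object's dynamic type.
    // ((ncnn::Cast*)op)->ncnn::Cast::forward(a, b, opt);

    // New: op is already the reference ncnn::Cast, so normal dispatch works.
    op->forward(a, b, opt);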
diff --git a/tests/test_celu.cpp b/tests/test_celu.cpp
index 703864eabdc..36a11ccd021 100644
--- a/tests/test_celu.cpp
+++ b/tests/test_celu.cpp
@@ -12,7 +12,6 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-#include "layer/celu.h"
 #include "testutil.h"
 
 static int test_celu(const ncnn::Mat& a, float alpha)
@@ -22,7 +21,7 @@
 
     std::vector<ncnn::Mat> weights(0);
 
-    int ret = test_layer<ncnn::CELU>("CELU", pd, weights, a);
+    int ret = test_layer("CELU", pd, weights, a);
     if (ret != 0)
    {
         fprintf(stderr, "test_celu failed a.dims=%d a=(%d %d %d %d) alpha=%f\n", a.dims, a.w, a.h, a.d, a.c, alpha);
diff --git a/tests/test_clip.cpp b/tests/test_clip.cpp
index 553085e2d63..72a35fcf2cc 100644
--- a/tests/test_clip.cpp
+++ b/tests/test_clip.cpp
@@ -12,7 +12,6 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-#include "layer/clip.h"
 #include "testutil.h"
 
 static int test_clip(const ncnn::Mat& a, float min, float max)
@@ -23,7 +22,7 @@
 
     std::vector<ncnn::Mat> weights(0);
 
-    int ret = test_layer<ncnn::Clip>("Clip", pd, weights, a);
+    int ret = test_layer("Clip", pd, weights, a);
     if (ret != 0)
     {
         fprintf(stderr, "test_clip failed a.dims=%d a=(%d %d %d %d) min=%f max=%f\n", a.dims, a.w, a.h, a.d, a.c, min, max);
diff --git a/tests/test_concat.cpp b/tests/test_concat.cpp
index 3ba621c110f..c4931383b4c 100644
--- a/tests/test_concat.cpp
+++ b/tests/test_concat.cpp
@@ -12,7 +12,6 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-#include "layer/concat.h"
 #include "testutil.h"
 
 static int test_concat(const std::vector<ncnn::Mat>& a, int axis)
@@ -22,7 +21,7 @@
 
     std::vector<ncnn::Mat> weights(0);
 
-    int ret = test_layer<ncnn::Concat>("Concat", pd, weights, a);
+    int ret = test_layer("Concat", pd, weights, a);
     if (ret != 0)
     {
         fprintf(stderr, "test_concat failed a[0].dims=%d a[0]=(%d %d %d %d) axis=%d\n", a[0].dims, a[0].w, a[0].h, a[0].d, a[0].c, axis);
diff --git a/tests/test_convolution.cpp b/tests/test_convolution.cpp
index 394ca0e8d57..b37634fc333 100644
--- a/tests/test_convolution.cpp
+++ b/tests/test_convolution.cpp
@@ -12,7 +12,6 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-#include "layer/convolution.h"
 #include "testutil.h"
 
 static int test_convolution(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias)
@@ -42,7 +41,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias)
 
     float epsilon = 0.001;
 
-    int ret = test_layer<ncnn::Convolution>("Convolution", pd, weights, a, epsilon);
+    int ret = test_layer("Convolution", pd, weights, a, epsilon);
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
@@ -62,7 +61,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias)
     opt.use_sgemm_convolution = false;
     opt.use_winograd_convolution = false;
 
-    ret = test_layer_opt<ncnn::Convolution>("Convolution", pd, weights, opt, a, epsilon);
+    ret = test_layer_opt("Convolution", pd, weights, opt, a, epsilon);
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
@@ -83,7 +82,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias)
     opt.use_sgemm_convolution = false;
     opt.use_winograd_convolution = false;
 
-    ret = test_layer_opt<ncnn::Convolution>("Convolution", pd, weights, opt, a, epsilon);
+    ret = test_layer_opt("Convolution", pd, weights, opt, a, epsilon);
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
@@ -97,7 +96,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias)
     opt.num_threads = 1;
     opt.use_a53_a55_optimized_kernel = true;
 
-    ret = test_layer_opt<ncnn::Convolution>("Convolution", pd, weights, opt, a, epsilon);
+    ret = test_layer_opt("Convolution", pd, weights, opt, a, epsilon);
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
diff --git a/tests/test_convolution1d.cpp b/tests/test_convolution1d.cpp
index bea75da301c..1b194b56236 100644
--- a/tests/test_convolution1d.cpp
+++ b/tests/test_convolution1d.cpp
@@ -12,7 +12,6 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-#include "layer/convolution1d.h"
 #include "testutil.h"
 
 static int test_convolution1d(int w, int h, int outh, int kernel, int dilation, int stride, int pad, int bias)
@@ -40,7 +39,7 @@ static int test_convolution1d(int w, int h, int outh, int kernel, int dilation, int stride, int pad, int bias)
     if (bias)
         weights[1] = RandomMat(outh);
 
-    int ret = test_layer<ncnn::Convolution1D>("Convolution1D", pd, weights, a);
+    int ret = test_layer("Convolution1D", pd, weights, a);
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution1d failed w=%d h=%d outh=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, outh, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
@@ -167,7 +166,7 @@ static int test_convolution1d_dynamic(int w, int h, int outh, int kernel, int dilation, int stride, int pad, int bias)
 
     std::vector<ncnn::Mat> weights(0);
 
-    int ret = test_layer<ncnn::Convolution1D>("Convolution1D", pd, weights, as);
+    int ret = test_layer("Convolution1D", pd, weights, as);
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution1d_dynamic failed w=%d h=%d outh=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, outh, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
diff --git a/tests/test_convolution3d.cpp b/tests/test_convolution3d.cpp
index 2cd752982e7..ba1ca745478 100644
--- a/tests/test_convolution3d.cpp
+++ b/tests/test_convolution3d.cpp
@@ -12,7 +12,6 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-#include "layer/convolution3d.h"
 #include "testutil.h"
 
 static int test_convolution3d(int w, int h, int d, int c, int outch, int kernel, int dilation, int stride, int pad, int bias)
@@ -40,7 +39,7 @@ static int test_convolution3d(int w, int h, int d, int c, int outch, int kernel, int dilation, int stride, int pad, int bias)
     if (bias)
         weights[1] = RandomMat(outch);
 
-    int ret = test_layer<ncnn::Convolution3D>("Convolution3D", pd, weights, a);
+    int ret = test_layer("Convolution3D", pd, weights, a);
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution3d failed w=%d h=%d d=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, d, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
diff --git a/tests/test_convolution_1.cpp b/tests/test_convolution_1.cpp
index 22f634247a3..77dd6dab1eb 100644
--- a/tests/test_convolution_1.cpp
+++ b/tests/test_convolution_1.cpp
@@ -12,7 +12,6 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-#include "layer/convolution.h"
 #include "testutil.h"
 
 static int test_convolution(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias)
@@ -42,7 +41,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias)
 
     float epsilon = 0.001;
 
-    int ret = test_layer<ncnn::Convolution>("Convolution", pd, weights, a, epsilon);
+    int ret = test_layer("Convolution", pd, weights, a, epsilon);
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
@@ -62,7 +61,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias)
     opt.use_sgemm_convolution = false;
     opt.use_winograd_convolution = false;
 
-    ret = test_layer_opt<ncnn::Convolution>("Convolution", pd, weights, opt, a, epsilon);
+    ret = test_layer_opt("Convolution", pd, weights, opt, a, epsilon);
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
@@ -83,7 +82,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias)
     opt.use_sgemm_convolution = false;
     opt.use_winograd_convolution = false;
 
-    ret = test_layer_opt<ncnn::Convolution>("Convolution", pd, weights, opt, a, epsilon);
+    ret = test_layer_opt("Convolution", pd, weights, opt, a, epsilon);
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
@@ -97,7 +96,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias)
     opt.num_threads = 1;
     opt.use_a53_a55_optimized_kernel = true;
 
-    ret = test_layer_opt<ncnn::Convolution>("Convolution", pd, weights, opt, a, epsilon);
+    ret = test_layer_opt("Convolution", pd, weights, opt, a, epsilon);
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
diff --git a/tests/test_convolution_2.cpp b/tests/test_convolution_2.cpp
index cff52d77fbd..5135f5bd780 100644
--- a/tests/test_convolution_2.cpp
+++ b/tests/test_convolution_2.cpp
@@ -12,7 +12,6 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-#include "layer/convolution.h"
 #include "testutil.h"
 
 static int test_convolution(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias)
@@ -44,7 +43,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias)
     Randomize(weights[0], -0.6, 0.6);
 
     float epsilon = 0.001;
-    int ret = test_layer<ncnn::Convolution>("Convolution", pd, weights, a, epsilon);
+    int ret = test_layer("Convolution", pd, weights, a, epsilon);
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
@@ -64,7 +63,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias)
     opt.use_sgemm_convolution = false;
     opt.use_winograd_convolution = false;
 
-    ret = test_layer_opt<ncnn::Convolution>("Convolution", pd, weights, opt, a, epsilon);
+    ret = test_layer_opt("Convolution", pd, weights, opt, a, epsilon);
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
@@ -85,7 +84,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias)
     opt.use_sgemm_convolution = false;
     opt.use_winograd_convolution = false;
 
-    ret = test_layer_opt<ncnn::Convolution>("Convolution", pd, weights, opt, a, epsilon);
+    ret = test_layer_opt("Convolution", pd, weights, opt, a, epsilon);
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
@@ -98,7 +97,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias)
     opt.num_threads = 1;
     opt.use_a53_a55_optimized_kernel = true;
 
-    ret = test_layer_opt<ncnn::Convolution>("Convolution", pd, weights, opt, a, epsilon);
+    ret = test_layer_opt("Convolution", pd, weights, opt, a, epsilon);
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
diff --git a/tests/test_convolution_3.cpp b/tests/test_convolution_3.cpp
index b013380b512..5e40cb59c11 100644
--- a/tests/test_convolution_3.cpp
+++ b/tests/test_convolution_3.cpp
@@ -12,7 +12,6 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-#include "layer/convolution.h"
 #include "testutil.h"
 
 static int test_convolution_vec(int w, int outch, int kernel, int dilation, int stride, int pad, int bias)
@@ -40,7 +39,7 @@
     if (bias)
         weights[1] = RandomMat(outch);
 
-    int ret = test_layer<ncnn::Convolution>("Convolution", pd, weights, a);
+    int ret = test_layer("Convolution", pd, weights, a);
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution_vec failed w=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
@@ -93,7 +92,7 @@ static int test_convolution_dynamic(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias)
 
     std::vector<ncnn::Mat> weights(0);
 
-    int ret = test_layer<ncnn::Convolution>("Convolution", pd, weights, as);
+    int ret = test_layer("Convolution", pd, weights, as);
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution_dynamic failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
@@ -183,7 +182,7 @@
     }
 
     int flag = TEST_LAYER_DISABLE_GPU_TESTING;
-    int ret = test_layer<ncnn::Convolution>("Convolution", pd, weights, a, requant ? 1.0f : 0.001f, 0, flag);
+    int ret = test_layer("Convolution", pd, weights, a, requant ? 1.0f : 0.001f, 0, flag);
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d requant=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, requant, activation_type, activation_params[0], activation_params[1]);
@@ -206,7 +205,7 @@
     opt.use_winograd23_convolution = true;
     opt.use_winograd43_convolution = false;
 
-    ret = test_layer_opt<ncnn::Convolution>("Convolution", pd, weights, opt, a, requant ? 1.0f : 0.001f, 0, flag);
+    ret = test_layer_opt("Convolution", pd, weights, opt, a, requant ? 1.0f : 0.001f, 0, flag);
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d requant=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, requant, activation_type, activation_params[0], activation_params[1]);
@@ -227,7 +226,7 @@
     opt.use_sgemm_convolution = false;
     opt.use_winograd_convolution = false;
 
-    ret = test_layer_opt<ncnn::Convolution>("Convolution", pd, weights, opt, a, requant ? 1.0f : 0.001f, 0, flag);
+    ret = test_layer_opt("Convolution", pd, weights, opt, a, requant ? 1.0f : 0.001f, 0, flag);
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d requant=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, requant, activation_type, activation_params[0], activation_params[1]);
@@ -248,7 +247,7 @@
     opt.use_sgemm_convolution = false;
     opt.use_winograd_convolution = false;
 
-    ret = test_layer_opt<ncnn::Convolution>("Convolution", pd, weights, opt, a, requant ? 1.0f : 0.001f, 0, flag);
+    ret = test_layer_opt("Convolution", pd, weights, opt, a, requant ? 1.0f : 0.001f, 0, flag);
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d requant=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, requant, activation_type, activation_params[0], activation_params[1]);
@@ -269,7 +268,7 @@
     opt.use_sgemm_convolution = false;
     opt.use_winograd_convolution = false;
 
-    ret = test_layer_opt<ncnn::Convolution>("Convolution", pd, weights, opt, a, requant ? 1.0f : 0.001f, 0, flag);
+    ret = test_layer_opt("Convolution", pd, weights, opt, a, requant ? 1.0f : 0.001f, 0, flag);
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d requant=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, requant, activation_type, activation_params[0], activation_params[1]);
diff --git a/tests/test_convolutiondepthwise.cpp b/tests/test_convolutiondepthwise.cpp
index 715fc73662c..dcded4f98da 100644
--- a/tests/test_convolutiondepthwise.cpp
+++ b/tests/test_convolutiondepthwise.cpp
@@ -12,7 +12,6 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-#include "layer/convolutiondepthwise.h"
 #include "testutil.h"
 
 static int test_convolutiondepthwise(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias, int group)
@@ -40,7 +39,7 @@ static int test_convolutiondepthwise(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias, int group)
     weights[0] = RandomMat(outch / group * c / group * kernel * kernel * group);
     weights[1] = RandomMat(outch);
 
-    int ret = test_layer<ncnn::ConvolutionDepthWise>("ConvolutionDepthWise", pd, weights, a);
+    int ret = test_layer("ConvolutionDepthWise", pd, weights, a);
     if (ret != 0)
     {
         fprintf(stderr, "test_convolutiondepthwise failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d group=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, group, activation_type, activation_params[0], activation_params[1]);
diff --git a/tests/test_convolutiondepthwise1d.cpp b/tests/test_convolutiondepthwise1d.cpp
index bb80e80d985..fe7fc254312 100644
--- a/tests/test_convolutiondepthwise1d.cpp
+++ b/tests/test_convolutiondepthwise1d.cpp
@@ -12,7 +12,6 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-#include "layer/convolutiondepthwise1d.h"
 #include "testutil.h"
 
 static int test_convolutiondepthwise1d(int w, int h, int outh, int kernel, int dilation, int stride, int pad, int bias, int group)
@@ -40,7 +39,7 @@ static int test_convolutiondepthwise1d(int w, int h, int outh, int kernel, int dilation, int stride, int pad, int bias, int group)
     weights[0] = RandomMat(outh / group * h / group * kernel * group);
     weights[1] = RandomMat(outh);
 
-    int ret = test_layer<ncnn::ConvolutionDepthWise1D>("ConvolutionDepthWise1D", pd, weights, a);
+    int ret = test_layer("ConvolutionDepthWise1D", pd, weights, a);
     if (ret != 0)
     {
         fprintf(stderr, "test_convolutiondepthwise1d failed w=%d h=%d outh=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d group=%d act=%d actparams=[%f,%f]\n", w, h, outh, kernel, dilation, stride, pad, bias, group, activation_type, activation_params[0], activation_params[1]);
@@ -155,7 +154,7 @@ static int test_convolutiondepthwise1d_dynamic(int w, int h, int outh, int kernel, int dilation, int stride, int pad, int bias, int group)
 
     std::vector<ncnn::Mat> weights(0);
 
-    int ret = test_layer<ncnn::ConvolutionDepthWise1D>("ConvolutionDepthWise1D", pd, weights, as);
+    int ret = test_layer("ConvolutionDepthWise1D", pd, weights, as);
     if (ret != 0)
     {
         fprintf(stderr, "test_convolutiondepthwise1d_dynamic failed w=%d h=%d outh=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d group=%d act=%d actparams=[%f,%f]\n", w, h, outh, kernel, dilation, stride, pad, bias, group, activation_type, activation_params[0], activation_params[1]);
diff --git a/tests/test_convolutiondepthwise3d.cpp b/tests/test_convolutiondepthwise3d.cpp
index 7ecb51447b3..ad392532acf 100644
--- a/tests/test_convolutiondepthwise3d.cpp
+++ b/tests/test_convolutiondepthwise3d.cpp
@@ -12,7 +12,6 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-#include "layer/convolutiondepthwise3d.h"
 #include "testutil.h"
 
 static int test_convolutiondepthwise3d(int w, int h, int d, int c, int outch, int kernel, int dilation, int stride, int pad, int bias, int group)
@@ -40,7 +39,7 @@ static int test_convolutiondepthwise3d(int w, int h, int d, int c, int outch, int kernel, int dilation, int stride, int pad, int bias, int group)
     weights[0] = RandomMat(outch / group * c / group * kernel * kernel * kernel * group);
     weights[1] = RandomMat(outch);
 
-    int ret = test_layer<ncnn::ConvolutionDepthWise3D>("ConvolutionDepthWise3D", pd, weights, a);
+    int ret = test_layer("ConvolutionDepthWise3D", pd, weights, a);
     if (ret != 0)
     {
         fprintf(stderr, "test_convolutiondepthwise3d failed w=%d h=%d d=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d group=%d act=%d actparams=[%f,%f]\n", w, h, d, c, outch, kernel, dilation, stride, pad, bias, group, activation_type, activation_params[0], activation_params[1]);
-#include "layer/convolutiondepthwise.h" #include "testutil.h" static int test_convolutiondepthwise_dynamic(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias, int group) @@ -45,7 +44,7 @@ static int test_convolutiondepthwise_dynamic(int w, int h, int c, int outch, int std::vector weights(0); - int ret = test_layer("ConvolutionDepthWise", pd, weights, as); + int ret = test_layer("ConvolutionDepthWise", pd, weights, as); if (ret != 0) { fprintf(stderr, "test_convolutiondepthwise_dynamic failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d group=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, group, activation_type, activation_params[0], activation_params[1]); @@ -138,7 +137,7 @@ static int test_convolutiondepthwise_int8(int w, int h, int c, int outch, int ke } int flag = TEST_LAYER_DISABLE_GPU_TESTING; - int ret = test_layer("ConvolutionDepthWise", pd, weights, a, requant ? 1.0f : 0.001f, 0, flag); + int ret = test_layer("ConvolutionDepthWise", pd, weights, a, requant ? 1.0f : 0.001f, 0, flag); if (ret != 0) { fprintf(stderr, "test_convolutiondepthwise_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d group=%d requant=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, group, requant, activation_type, activation_params[0], activation_params[1]); diff --git a/tests/test_copyto.cpp b/tests/test_copyto.cpp index 93a42fd4231..78288663035 100644 --- a/tests/test_copyto.cpp +++ b/tests/test_copyto.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/copyto.h" #include "testutil.h" static int test_copyto(const ncnn::Mat& self, const ncnn::Mat& src, int woffset, int hoffset, int doffset, int coffset) @@ -29,7 +28,7 @@ static int test_copyto(const ncnn::Mat& self, const ncnn::Mat& src, int woffset, as[0] = self; as[1] = src; - int ret = test_layer("CopyTo", pd, weights, as, 1); + int ret = test_layer("CopyTo", pd, weights, as, 1); if (ret != 0) { fprintf(stderr, "test_copyto failed self.dims=%d self=(%d %d %d %d) src.dims=%d src=(%d %d %d %d) woffset=%d hoffset=%d doffset=%d coffset=%d\n", self.dims, self.w, self.h, self.d, self.c, src.dims, src.w, src.h, src.d, src.c, woffset, hoffset, doffset, coffset); diff --git a/tests/test_copyto_1.cpp b/tests/test_copyto_1.cpp index 6d5cc220fe2..a381cdabf51 100644 --- a/tests/test_copyto_1.cpp +++ b/tests/test_copyto_1.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/copyto.h" #include "testutil.h" static ncnn::Mat IntArrayMat(int a0) @@ -77,7 +76,7 @@ static int test_copyto(const ncnn::Mat& self, const ncnn::Mat& src, const ncnn:: as[0] = self; as[1] = src; - int ret = test_layer("CopyTo", pd, weights, as, 1); + int ret = test_layer("CopyTo", pd, weights, as, 1); if (ret != 0) { fprintf(stderr, "test_copyto failed self.dims=%d self=(%d %d %d %d) src.dims=%d src=(%d %d %d %d)", self.dims, self.w, self.h, self.d, self.c, src.dims, src.w, src.h, src.d, src.c); diff --git a/tests/test_crop.cpp b/tests/test_crop.cpp index b2a29778fec..d2a03eb538d 100644 --- a/tests/test_crop.cpp +++ b/tests/test_crop.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. 
 // specific language governing permissions and limitations under the License.
 
-#include "layer/crop.h"
 #include "testutil.h"
 
 static int test_crop(const ncnn::Mat& a, int woffset, int hoffset, int doffset, int coffset, int outw, int outh, int outd, int outc, int woffset2, int hoffset2, int doffset2, int coffset2)
@@ -33,7 +32,7 @@ static int test_crop(const ncnn::Mat& a, int woffset, int hoffset, int doffset,
 
     std::vector<ncnn::Mat> weights(0);
 
-    int ret = test_layer<ncnn::Crop>("Crop", pd, weights, a);
+    int ret = test_layer("Crop", pd, weights, a);
     if (ret != 0)
     {
         fprintf(stderr, "test_crop failed a.dims=%d a=(%d %d %d %d) woffset=%d hoffset=%d doffset=%d coffset=%d outw=%d outh=%d outd=%d outc=%d woffset2=%d hoffset2=%d doffset2=%d coffset2=%d\n", a.dims, a.w, a.h, a.d, a.c, woffset, hoffset, doffset, coffset, outw, outh, outd, outc, woffset2, hoffset2, doffset2, coffset2);
diff --git a/tests/test_crop_1.cpp b/tests/test_crop_1.cpp
index c875a51c7fa..3064dc1de69 100644
--- a/tests/test_crop_1.cpp
+++ b/tests/test_crop_1.cpp
@@ -12,7 +12,6 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-#include "layer/crop.h"
 #include "testutil.h"
 
 static ncnn::Mat IntArrayMat(int a0)
@@ -74,7 +73,7 @@ static int test_crop(const ncnn::Mat& a, const ncnn::Mat& starts, const ncnn::Ma
 
     std::vector<ncnn::Mat> weights(0);
 
-    int ret = test_layer<ncnn::Crop>("Crop", pd, weights, a);
+    int ret = test_layer("Crop", pd, weights, a);
     if (ret != 0)
     {
         fprintf(stderr, "test_crop failed a.dims=%d a=(%d %d %d %d)", a.dims, a.w, a.h, a.d, a.c);
diff --git a/tests/test_crop_2.cpp b/tests/test_crop_2.cpp
index 287634b973e..b896caa2e8e 100644
--- a/tests/test_crop_2.cpp
+++ b/tests/test_crop_2.cpp
@@ -12,7 +12,6 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-#include "layer/crop.h"
 #include "testutil.h"
 
 static int test_crop(const ncnn::Mat& a, int woffset, int hoffset, int doffset, int coffset, const ncnn::Mat& ref)
@@ -37,7 +36,7 @@ static int test_crop(const ncnn::Mat& a, int woffset, int hoffset, int doffset,
     ab[0] = a;
     ab[1] = ref;
 
-    int ret = test_layer<ncnn::Crop>("Crop", pd, weights, ab);
+    int ret = test_layer("Crop", pd, weights, ab);
     if (ret != 0)
     {
         fprintf(stderr, "test_crop failed a.dims=%d a=(%d %d %d %d) woffset=%d hoffset=%d doffset=%d coffset=%d ref.dims=%d ref=(%d %d %d %d)\n", a.dims, a.w, a.h, a.d, a.c, woffset, hoffset, doffset, coffset, ref.dims, ref.w, ref.h, ref.d, ref.c);
diff --git a/tests/test_cumulativesum.cpp b/tests/test_cumulativesum.cpp
index de38ab44d98..666a5e04074 100644
--- a/tests/test_cumulativesum.cpp
+++ b/tests/test_cumulativesum.cpp
@@ -10,7 +10,6 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-#include "layer/cumulativesum.h" #include "testutil.h" static int test_cumulativesum(const ncnn::Mat& a, int axis) @@ -20,7 +19,7 @@ static int test_cumulativesum(const ncnn::Mat& a, int axis) std::vector weights(0); - int ret = test_layer("CumulativeSum", pd, weights, a); + int ret = test_layer("CumulativeSum", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_cumulativesum failed a.dims=%d a=(%d %d %d) axis=%d\n", a.dims, a.w, a.h, a.c, axis); diff --git a/tests/test_deconvolution.cpp b/tests/test_deconvolution.cpp index 4a0027b68c4..5da32924644 100644 --- a/tests/test_deconvolution.cpp +++ b/tests/test_deconvolution.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/deconvolution.h" #include "testutil.h" static int test_deconvolution(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias, int output_pad_right, int output_pad_bottom, int output_w, int output_h) @@ -49,7 +48,7 @@ static int test_deconvolution(int w, int h, int c, int outch, int kernel, int di weights[0] = RandomMat(outch * c * kernel * kernel); weights[1] = RandomMat(outch); - int ret = test_layer("Deconvolution", pd, weights, a); + int ret = test_layer("Deconvolution", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_deconvolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f] output_pad_right=%d output_pad_bottom=%d output_w=%d output_h=%d\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1], output_pad_right, output_pad_bottom, output_w, output_h); @@ -68,7 +67,7 @@ static int test_deconvolution(int w, int h, int c, int outch, int kernel, int di opt.use_sgemm_convolution = false; opt.use_winograd_convolution = false; - ret = test_layer_opt("Deconvolution", pd, weights, opt, a); + ret = test_layer_opt("Deconvolution", pd, weights, opt, a); if (ret != 0) { fprintf(stderr, "test_deconvolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f] output_pad_right=%d output_pad_bottom=%d output_w=%d output_h=%d\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1], output_pad_right, output_pad_bottom, output_w, output_h); @@ -88,7 +87,7 @@ static int test_deconvolution(int w, int h, int c, int outch, int kernel, int di opt.use_sgemm_convolution = false; opt.use_winograd_convolution = false; - ret = test_layer_opt("Deconvolution", pd, weights, opt, a); + ret = test_layer_opt("Deconvolution", pd, weights, opt, a); if (ret != 0) { fprintf(stderr, "test_deconvolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f] output_pad_right=%d output_pad_bottom=%d output_w=%d output_h=%d\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1], output_pad_right, output_pad_bottom, output_w, output_h); @@ -190,7 +189,7 @@ static int test_deconvolution_dynamic(int w, int h, int c, int outch, int kernel std::vector weights(0); - int ret = test_layer("Deconvolution", pd, weights, as); + int ret = test_layer("Deconvolution", pd, weights, as); if (ret != 0) { fprintf(stderr, "test_deconvolution_dynamic failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f] 
diff --git a/tests/test_deconvolution1d.cpp b/tests/test_deconvolution1d.cpp
index b1b24ee6af9..4836173ab4a 100644
--- a/tests/test_deconvolution1d.cpp
+++ b/tests/test_deconvolution1d.cpp
@@ -12,7 +12,6 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-#include "layer/deconvolution1d.h"
 #include "testutil.h"
 
 static int test_deconvolution1d(int w, int h, int outh, int kernel, int dilation, int stride, int pad, int bias, int output_pad_right, int output_w)
@@ -47,7 +46,7 @@ static int test_deconvolution1d(int w, int h, int outh, int kernel, int dilation
     weights[0] = RandomMat(outh * h * kernel);
     weights[1] = RandomMat(outh);
 
-    int ret = test_layer<ncnn::Deconvolution1D>("Deconvolution1D", pd, weights, a);
+    int ret = test_layer("Deconvolution1D", pd, weights, a);
     if (ret != 0)
     {
         fprintf(stderr, "test_deconvolution1d failed w=%d h=%d outh=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f] output_pad_right=%d output_w=%d\n", w, h, outh, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1], output_pad_right, output_w);
@@ -138,7 +137,7 @@ static int test_deconvolution1d_dynamic(int w, int h, int outh, int kernel, int
 
     std::vector<ncnn::Mat> weights(0);
 
-    int ret = test_layer<ncnn::Deconvolution1D>("Deconvolution1D", pd, weights, as);
+    int ret = test_layer("Deconvolution1D", pd, weights, as);
     if (ret != 0)
     {
         fprintf(stderr, "test_deconvolution1d_dynamic failed w=%d h=%d outh=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f] output_pad_right=%d output_w=%d\n", w, h, outh, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1], output_pad_right, output_w);
diff --git a/tests/test_deconvolution3d.cpp b/tests/test_deconvolution3d.cpp
index d9e8dcea268..4d1ad2f204b 100644
--- a/tests/test_deconvolution3d.cpp
+++ b/tests/test_deconvolution3d.cpp
@@ -12,7 +12,6 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-#include "layer/deconvolution3d.h" #include "testutil.h" static int test_deconvolution3d(int w, int h, int d, int c, int outch, int kernel, int dilation, int stride, int pad, int bias, int output_pad_right, int output_pad_bottom, int output_pad_behind, int output_w, int output_h, int output_d) @@ -51,7 +50,7 @@ static int test_deconvolution3d(int w, int h, int d, int c, int outch, int kerne weights[0] = RandomMat(outch * c * kernel * kernel * kernel); weights[1] = RandomMat(outch); - int ret = test_layer("Deconvolution3D", pd, weights, a); + int ret = test_layer("Deconvolution3D", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_deconvolution3d failed w=%d h=%d d=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f] output_pad_right=%d output_pad_bottom=%d output_pad_behind=%d output_w=%d output_h=%d output_d=%d\n", w, h, d, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1], output_pad_right, output_pad_bottom, output_pad_behind, output_w, output_h, output_d); diff --git a/tests/test_deconvolutiondepthwise.cpp b/tests/test_deconvolutiondepthwise.cpp index 3c9c703002b..80b9f052d11 100644 --- a/tests/test_deconvolutiondepthwise.cpp +++ b/tests/test_deconvolutiondepthwise.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/deconvolutiondepthwise.h" #include "testutil.h" static int test_deconvolutiondepthwise(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias, int group, int output_pad_right, int output_pad_bottom, int output_w, int output_h) @@ -50,7 +49,7 @@ static int test_deconvolutiondepthwise(int w, int h, int c, int outch, int kerne weights[0] = RandomMat(outch / group * c / group * kernel * kernel * group); weights[1] = RandomMat(outch); - int ret = test_layer("DeconvolutionDepthWise", pd, weights, a); + int ret = test_layer("DeconvolutionDepthWise", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_deconvolutiondepthwise failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d group=%d act=%d actparams=[%f,%f] output_pad_right=%d output_pad_bottom=%d output_w=%d output_h=%d\n", w, h, c, outch, kernel, dilation, stride, pad, bias, group, activation_type, activation_params[0], activation_params[1], output_pad_right, output_pad_bottom, output_w, output_h); diff --git a/tests/test_deconvolutiondepthwise1d.cpp b/tests/test_deconvolutiondepthwise1d.cpp index cadd149ff59..f02d021185c 100644 --- a/tests/test_deconvolutiondepthwise1d.cpp +++ b/tests/test_deconvolutiondepthwise1d.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-#include "layer/deconvolutiondepthwise1d.h" #include "testutil.h" static int test_deconvolutiondepthwise1d(int w, int h, int outh, int kernel, int dilation, int stride, int pad, int bias, int group, int output_pad_right, int output_w) @@ -48,7 +47,7 @@ static int test_deconvolutiondepthwise1d(int w, int h, int outh, int kernel, int weights[0] = RandomMat(outh / group * h / group * kernel * group); weights[1] = RandomMat(outh); - int ret = test_layer("DeconvolutionDepthWise1D", pd, weights, a); + int ret = test_layer("DeconvolutionDepthWise1D", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_deconvolutiondepthwise1d failed w=%d h=%d outh=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d group=%d act=%d actparams=[%f,%f] output_pad_right=%d output_w=%d\n", w, h, outh, kernel, dilation, stride, pad, bias, group, activation_type, activation_params[0], activation_params[1], output_pad_right, output_w); @@ -145,7 +144,7 @@ static int test_deconvolutiondepthwise1d_dynamic(int w, int h, int outh, int ker std::vector weights(0); - int ret = test_layer("DeconvolutionDepthWise1D", pd, weights, as); + int ret = test_layer("DeconvolutionDepthWise1D", pd, weights, as); if (ret != 0) { fprintf(stderr, "test_deconvolutiondepthwise1d_dynamic failed w=%d h=%d outh=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d group=%d act=%d actparams=[%f,%f] output_pad_right=%d output_w=%d\n", w, h, outh, kernel, dilation, stride, pad, bias, group, activation_type, activation_params[0], activation_params[1], output_pad_right, output_w); diff --git a/tests/test_deconvolutiondepthwise3d.cpp b/tests/test_deconvolutiondepthwise3d.cpp index b57ce0add11..ff2f47193d2 100644 --- a/tests/test_deconvolutiondepthwise3d.cpp +++ b/tests/test_deconvolutiondepthwise3d.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/deconvolutiondepthwise3d.h" #include "testutil.h" static int test_deconvolutiondepthwise3d(int w, int h, int d, int c, int outch, int kernel, int dilation, int stride, int pad, int bias, int group, int output_pad_right, int output_pad_bottom, int output_pad_behind, int output_w, int output_h, int output_d) @@ -52,7 +51,7 @@ static int test_deconvolutiondepthwise3d(int w, int h, int d, int c, int outch, weights[0] = RandomMat(outch / group * c / group * kernel * kernel * kernel * group); weights[1] = RandomMat(outch); - int ret = test_layer("DeconvolutionDepthWise3D", pd, weights, a); + int ret = test_layer("DeconvolutionDepthWise3D", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_deconvolutiondepthwise3d failed w=%d h=%d d=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d group=%d act=%d actparams=[%f,%f] output_pad_right=%d output_pad_bottom=%d output_pad_behind=%d output_w=%d output_h=%d output_d=%d\n", w, h, d, c, outch, kernel, dilation, stride, pad, bias, group, activation_type, activation_params[0], activation_params[1], output_pad_right, output_pad_bottom, output_pad_behind, output_w, output_h, output_d); diff --git a/tests/test_deconvolutiondepthwise_1.cpp b/tests/test_deconvolutiondepthwise_1.cpp index 02fc8f97351..adbd8155660 100644 --- a/tests/test_deconvolutiondepthwise_1.cpp +++ b/tests/test_deconvolutiondepthwise_1.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-#include "layer/deconvolutiondepthwise.h" #include "testutil.h" static int test_deconvolutiondepthwise_dynamic(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias, int group, int output_pad_right, int output_pad_bottom, int output_w, int output_h) @@ -55,7 +54,7 @@ static int test_deconvolutiondepthwise_dynamic(int w, int h, int c, int outch, i std::vector weights(0); - int ret = test_layer("DeconvolutionDepthWise", pd, weights, as); + int ret = test_layer("DeconvolutionDepthWise", pd, weights, as); if (ret != 0) { fprintf(stderr, "test_deconvolutiondepthwise_dynamic failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d group=%d act=%d actparams=[%f,%f] output_pad_right=%d output_pad_bottom=%d output_w=%d output_h=%d\n", w, h, c, outch, kernel, dilation, stride, pad, bias, group, activation_type, activation_params[0], activation_params[1], output_pad_right, output_pad_bottom, output_w, output_h); diff --git a/tests/test_deepcopy.cpp b/tests/test_deepcopy.cpp index 1b04733618d..ad21a007d90 100644 --- a/tests/test_deepcopy.cpp +++ b/tests/test_deepcopy.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/deepcopy.h" #include "testutil.h" static int test_deepcopy(const ncnn::Mat& a) @@ -21,7 +20,7 @@ static int test_deepcopy(const ncnn::Mat& a) std::vector weights(0); - int ret = test_layer("DeepCopy", pd, weights, a); + int ret = test_layer("DeepCopy", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_deepcopy failed a.dims=%d a=(%d %d %d)\n", a.dims, a.w, a.h, a.c); diff --git a/tests/test_deformableconv2d.cpp b/tests/test_deformableconv2d.cpp index c6f3443ef1b..2274978c255 100644 --- a/tests/test_deformableconv2d.cpp +++ b/tests/test_deformableconv2d.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-#include "layer/deformableconv2d.h" #include "testutil.h" static int test_deformableconv2d(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias) @@ -48,7 +47,7 @@ static int test_deformableconv2d(int w, int h, int c, int outch, int kernel, int weights[1] = RandomMat(outch); float epsilon = 0.001; - int ret = test_layer("DeformableConv2D", pd, weights, a, 1, epsilon); + int ret = test_layer("DeformableConv2D", pd, weights, a, 1, epsilon); if (ret != 0) { fprintf(stderr, "test_deformableconv2d failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]); @@ -67,7 +66,7 @@ static int test_deformableconv2d(int w, int h, int c, int outch, int kernel, int opt.use_sgemm_convolution = false; opt.use_winograd_convolution = false; - ret = test_layer_opt("DeformableConv2D", pd, weights, opt, a, 1, epsilon); + ret = test_layer_opt("DeformableConv2D", pd, weights, opt, a, 1, epsilon); if (ret != 0) { fprintf(stderr, "test_deformableconv2d failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]); @@ -87,7 +86,7 @@ static int test_deformableconv2d(int w, int h, int c, int outch, int kernel, int opt.use_sgemm_convolution = false; opt.use_winograd_convolution = false; - ret = test_layer_opt("DeformableConv2D", pd, weights, opt, a, 1, epsilon); + ret = test_layer_opt("DeformableConv2D", pd, weights, opt, a, 1, epsilon); if (ret != 0) { fprintf(stderr, "test_deformableconv2d failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]); diff --git a/tests/test_deformableconv2d_1.cpp b/tests/test_deformableconv2d_1.cpp index 4a97034b47b..134c4e9b64f 100644 --- a/tests/test_deformableconv2d_1.cpp +++ b/tests/test_deformableconv2d_1.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-#include "layer/deformableconv2d.h" #include "testutil.h" static int test_deformableconv2d(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias) @@ -48,7 +47,7 @@ static int test_deformableconv2d(int w, int h, int c, int outch, int kernel, int weights[1] = RandomMat(outch); float epsilon = 0.001; - int ret = test_layer("DeformableConv2D", pd, weights, a, 1, epsilon); + int ret = test_layer("DeformableConv2D", pd, weights, a, 1, epsilon); if (ret != 0) { fprintf(stderr, "test_deformableconv2d failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]); @@ -67,7 +66,7 @@ static int test_deformableconv2d(int w, int h, int c, int outch, int kernel, int opt.use_sgemm_convolution = false; opt.use_winograd_convolution = false; - ret = test_layer_opt("DeformableConv2D", pd, weights, opt, a, 1, epsilon); + ret = test_layer_opt("DeformableConv2D", pd, weights, opt, a, 1, epsilon); if (ret != 0) { fprintf(stderr, "test_deformableconv2d failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]); @@ -87,7 +86,7 @@ static int test_deformableconv2d(int w, int h, int c, int outch, int kernel, int opt.use_sgemm_convolution = false; opt.use_winograd_convolution = false; - ret = test_layer_opt("DeformableConv2D", pd, weights, opt, a, 1, epsilon); + ret = test_layer_opt("DeformableConv2D", pd, weights, opt, a, 1, epsilon); if (ret != 0) { fprintf(stderr, "test_deformableconv2d failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]); diff --git a/tests/test_deformableconv2d_2.cpp b/tests/test_deformableconv2d_2.cpp index 70b059d2b93..42ca21765f4 100644 --- a/tests/test_deformableconv2d_2.cpp +++ b/tests/test_deformableconv2d_2.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-#include "layer/deformableconv2d.h" #include "testutil.h" static int test_deformableconv2d(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias) @@ -48,7 +47,7 @@ static int test_deformableconv2d(int w, int h, int c, int outch, int kernel, int weights[1] = RandomMat(outch); float epsilon = 0.001; - int ret = test_layer("DeformableConv2D", pd, weights, a, 1, epsilon); + int ret = test_layer("DeformableConv2D", pd, weights, a, 1, epsilon); if (ret != 0) { fprintf(stderr, "test_deformableconv2d failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]); @@ -67,7 +66,7 @@ static int test_deformableconv2d(int w, int h, int c, int outch, int kernel, int opt.use_sgemm_convolution = false; opt.use_winograd_convolution = false; - ret = test_layer_opt("DeformableConv2D", pd, weights, opt, a, 1, epsilon); + ret = test_layer_opt("DeformableConv2D", pd, weights, opt, a, 1, epsilon); if (ret != 0) { fprintf(stderr, "test_deformableconv2d failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]); @@ -87,7 +86,7 @@ static int test_deformableconv2d(int w, int h, int c, int outch, int kernel, int opt.use_sgemm_convolution = false; opt.use_winograd_convolution = false; - ret = test_layer_opt("DeformableConv2D", pd, weights, opt, a, 1, epsilon); + ret = test_layer_opt("DeformableConv2D", pd, weights, opt, a, 1, epsilon); if (ret != 0) { fprintf(stderr, "test_deformableconv2d failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]); diff --git a/tests/test_deformableconv2d_3.cpp b/tests/test_deformableconv2d_3.cpp index 26b726b2b2e..e16301fd805 100644 --- a/tests/test_deformableconv2d_3.cpp +++ b/tests/test_deformableconv2d_3.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-#include "layer/deformableconv2d.h" #include "testutil.h" static int test_deformableconv2d(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias) @@ -48,7 +47,7 @@ static int test_deformableconv2d(int w, int h, int c, int outch, int kernel, int weights[1] = RandomMat(outch); float epsilon = 0.001; - int ret = test_layer("DeformableConv2D", pd, weights, a, 1, epsilon); + int ret = test_layer("DeformableConv2D", pd, weights, a, 1, epsilon); if (ret != 0) { fprintf(stderr, "test_deformableconv2d failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]); @@ -67,7 +66,7 @@ static int test_deformableconv2d(int w, int h, int c, int outch, int kernel, int opt.use_sgemm_convolution = false; opt.use_winograd_convolution = false; - ret = test_layer_opt("DeformableConv2D", pd, weights, opt, a, 1, epsilon); + ret = test_layer_opt("DeformableConv2D", pd, weights, opt, a, 1, epsilon); if (ret != 0) { fprintf(stderr, "test_deformableconv2d failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]); @@ -87,7 +86,7 @@ static int test_deformableconv2d(int w, int h, int c, int outch, int kernel, int opt.use_sgemm_convolution = false; opt.use_winograd_convolution = false; - ret = test_layer_opt("DeformableConv2D", pd, weights, opt, a, 1, epsilon); + ret = test_layer_opt("DeformableConv2D", pd, weights, opt, a, 1, epsilon); if (ret != 0) { fprintf(stderr, "test_deformableconv2d failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]); diff --git a/tests/test_deformableconv2d_4.cpp b/tests/test_deformableconv2d_4.cpp index eca9f289dec..1981c762b9d 100644 --- a/tests/test_deformableconv2d_4.cpp +++ b/tests/test_deformableconv2d_4.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/deformableconv2d.h" #include "testutil.h" static int test_deformableconv2d(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias) @@ -48,7 +47,7 @@ static int test_deformableconv2d(int w, int h, int c, int outch, int kernel, int weights[1] = RandomMat(outch); float epsilon = 0.001; - int ret = test_layer("DeformableConv2D", pd, weights, a, 1, epsilon); + int ret = test_layer("DeformableConv2D", pd, weights, a, 1, epsilon); if (ret != 0) { fprintf(stderr, "test_deformableconv2d failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]); diff --git a/tests/test_dequantize.cpp b/tests/test_dequantize.cpp index 803fbd70c37..ca05059fa45 100644 --- a/tests/test_dequantize.cpp +++ b/tests/test_dequantize.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-#include "layer/dequantize.h" #include "testutil.h" static int test_dequantize(const ncnn::Mat& a, int scale_data_size, int bias_data_size) @@ -27,7 +26,7 @@ static int test_dequantize(const ncnn::Mat& a, int scale_data_size, int bias_dat weights[1] = RandomMat(bias_data_size); int flag = TEST_LAYER_DISABLE_AUTO_INPUT_CASTING; - int ret = test_layer("Dequantize", pd, weights, a, 0.001, 0, flag); + int ret = test_layer("Dequantize", pd, weights, a, 0.001, 0, flag); if (ret != 0) { fprintf(stderr, "test_dequantize failed a.dims=%d a=(%d %d %d) scale_data_size=%d bias_data_size=%d\n", a.dims, a.w, a.h, a.c, scale_data_size, bias_data_size); @@ -48,7 +47,7 @@ static int test_dequantize_pack8(const ncnn::Mat& a, int scale_data_size, int bi weights[1] = RandomMat(bias_data_size); int flag = TEST_LAYER_DISABLE_AUTO_INPUT_CASTING | TEST_LAYER_ENABLE_FORCE_INPUT_PACK8; - int ret = test_layer("Dequantize", pd, weights, a, 0.001, 0, flag); + int ret = test_layer("Dequantize", pd, weights, a, 0.001, 0, flag); if (ret != 0) { fprintf(stderr, "test_dequantize_pack8 failed a.dims=%d a=(%d %d %d) scale_data_size=%d bias_data_size=%d\n", a.dims, a.w, a.h, a.c, scale_data_size, bias_data_size); diff --git a/tests/test_diag.cpp b/tests/test_diag.cpp index bb192d78ccc..53eefe31e4a 100644 --- a/tests/test_diag.cpp +++ b/tests/test_diag.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/diag.h" #include "testutil.h" static int test_diag(const ncnn::Mat& a, int diagonal) @@ -22,7 +21,7 @@ static int test_diag(const ncnn::Mat& a, int diagonal) std::vector weights(0); - int ret = test_layer("Diag", pd, weights, a); + int ret = test_layer("Diag", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_diag failed a.dims=%d a=(%d %d %d %d)\n", a.dims, a.w, a.h, a.d, a.c); diff --git a/tests/test_dropout.cpp b/tests/test_dropout.cpp index e15d10bcebc..964dd8a39dd 100644 --- a/tests/test_dropout.cpp +++ b/tests/test_dropout.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/dropout.h" #include "testutil.h" static int test_dropout(const ncnn::Mat& a, float scale) @@ -22,7 +21,7 @@ static int test_dropout(const ncnn::Mat& a, float scale) std::vector weights(0); - int ret = test_layer("Dropout", pd, weights, a); + int ret = test_layer("Dropout", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_dropout failed a.dims=%d a=(%d %d %d) scale=%f\n", a.dims, a.w, a.h, a.c, scale); diff --git a/tests/test_einsum.cpp b/tests/test_einsum.cpp index c1df4747142..a189061dfbc 100644 --- a/tests/test_einsum.cpp +++ b/tests/test_einsum.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-#include "layer/einsum.h" #include "testutil.h" static int test_einsum(const std::vector& a, const std::string& equation) @@ -28,7 +27,7 @@ static int test_einsum(const std::vector& a, const std::string& equat std::vector weights(0); - int ret = test_layer("Einsum", pd, weights, a); + int ret = test_layer("Einsum", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_einsum failed a[0].dims=%d a[0]=(%d %d %d) equation=%s\n", a[0].dims, a[0].w, a[0].h, a[0].c, equation.c_str()); diff --git a/tests/test_eltwise.cpp b/tests/test_eltwise.cpp index 25da0196cd0..84ddfcf1359 100644 --- a/tests/test_eltwise.cpp +++ b/tests/test_eltwise.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/eltwise.h" #include "testutil.h" static void print_float_array(const ncnn::Mat& a) @@ -33,7 +32,7 @@ static int test_eltwise(const std::vector& a, int op_type, const ncnn std::vector weights(0); - int ret = test_layer("Eltwise", pd, weights, a); + int ret = test_layer("Eltwise", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_eltwise failed a[0].dims=%d a[0]=(%d %d %d %d) op_type=%d", a[0].dims, a[0].w, a[0].h, a[0].d, a[0].c, op_type); diff --git a/tests/test_elu.cpp b/tests/test_elu.cpp index a8736a3efad..cd78f846d25 100644 --- a/tests/test_elu.cpp +++ b/tests/test_elu.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/elu.h" #include "testutil.h" static int test_elu(const ncnn::Mat& a) @@ -23,7 +22,7 @@ static int test_elu(const ncnn::Mat& a) std::vector weights(0); - int ret = test_layer("ELU", pd, weights, a); + int ret = test_layer("ELU", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_elu failed a.dims=%d a=(%d %d %d %d) alpha=%f\n", a.dims, a.w, a.h, a.d, a.c, alpha); diff --git a/tests/test_erf.cpp b/tests/test_erf.cpp index cc1102c8b97..454d13c1a37 100644 --- a/tests/test_erf.cpp +++ b/tests/test_erf.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/erf.h" #include "testutil.h" static int test_erf(const ncnn::Mat& a) @@ -21,7 +20,7 @@ static int test_erf(const ncnn::Mat& a) std::vector weights(0); - int ret = test_layer("Erf", pd, weights, a); + int ret = test_layer("Erf", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_erf failed a.dims=%d a=(%d %d %d)\n", a.dims, a.w, a.h, a.c); diff --git a/tests/test_expanddims.cpp b/tests/test_expanddims.cpp index d05d84a9d3b..129f9f261b1 100644 --- a/tests/test_expanddims.cpp +++ b/tests/test_expanddims.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-#include "layer/expanddims.h" #include "testutil.h" static int test_expanddims(const ncnn::Mat& a, int expand_w, int expand_h, int expand_d, int expand_c) @@ -25,7 +24,7 @@ static int test_expanddims(const ncnn::Mat& a, int expand_w, int expand_h, int e std::vector weights(0); - int ret = test_layer("ExpandDims", pd, weights, a); + int ret = test_layer("ExpandDims", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_expanddims failed a.dims=%d a=(%d %d %d %d) expand_w=%d expand_h=%d expand_d=%d expand_c=%d\n", a.dims, a.w, a.h, a.d, a.c, expand_w, expand_h, expand_d, expand_c); @@ -91,7 +90,7 @@ static int test_expanddims_axes(const ncnn::Mat& a, const ncnn::Mat& axes) std::vector weights(0); - int ret = test_layer("ExpandDims", pd, weights, a); + int ret = test_layer("ExpandDims", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_expanddims_axes failed a.dims=%d a=(%d %d %d %d)\n", a.dims, a.w, a.h, a.d, a.c); diff --git a/tests/test_flatten.cpp b/tests/test_flatten.cpp index 654347b1e7e..c4fb6e35a53 100644 --- a/tests/test_flatten.cpp +++ b/tests/test_flatten.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/flatten.h" #include "testutil.h" static int test_flatten(const ncnn::Mat& a) @@ -21,7 +20,7 @@ static int test_flatten(const ncnn::Mat& a) std::vector weights(0); - int ret = test_layer("Flatten", pd, weights, a); + int ret = test_layer("Flatten", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_flatten failed a.dims=%d a=(%d %d %d %d)\n", a.dims, a.w, a.h, a.d, a.c); @@ -60,7 +59,7 @@ static int test_flatten_int8(const ncnn::Mat& a) std::vector weights(0); int flag = TEST_LAYER_DISABLE_AUTO_INPUT_CASTING | TEST_LAYER_DISABLE_GPU_TESTING; - int ret = test_layer("Flatten", pd, weights, a, 0.001, 0, flag); + int ret = test_layer("Flatten", pd, weights, a, 0.001, 0, flag); if (ret != 0) { fprintf(stderr, "test_flatten_int8 failed a.dims=%d a=(%d %d %d %d)\n", a.dims, a.w, a.h, a.d, a.c); diff --git a/tests/test_fold.cpp b/tests/test_fold.cpp index 11a38428fdd..cdb6dc88ce1 100644 --- a/tests/test_fold.cpp +++ b/tests/test_fold.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/fold.h" #include "testutil.h" static int test_fold(int w, int h, int outw, int outh, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_w, int pad_h) @@ -33,7 +32,7 @@ static int test_fold(int w, int h, int outw, int outh, int kernel_w, int kernel_ std::vector weights(0); - int ret = test_layer("Fold", pd, weights, a); + int ret = test_layer("Fold", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_fold failed w=%d h=%d outw=%d outh=%d kernel=%d,%d dilation=%d,%d stride=%d,%d pad=%d,%d\n", w, h, outw, outh, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_w, pad_h); diff --git a/tests/test_gelu.cpp b/tests/test_gelu.cpp index fa8f176b3cd..4bcdd6a93ce 100644 --- a/tests/test_gelu.cpp +++ b/tests/test_gelu.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-#include "layer/gelu.h" #include "testutil.h" static int test_gelu(const ncnn::Mat& a, bool fast_gelu) @@ -22,7 +21,7 @@ static int test_gelu(const ncnn::Mat& a, bool fast_gelu) std::vector weights(0); - int ret = test_layer("GELU", pd, weights, a); + int ret = test_layer("GELU", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_gelu failed a.dims=%d a=(%d %d %d %d) fast_gelu=%s\n", a.dims, a.w, a.h, a.d, a.c, fast_gelu ? "true" : "false"); diff --git a/tests/test_gemm.cpp b/tests/test_gemm.cpp index 09152057cb2..c2900e9ac61 100644 --- a/tests/test_gemm.cpp +++ b/tests/test_gemm.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/gemm.h" #include "testutil.h" static int test_gemm(int M, int N, int K, float alpha, int transA, int transB, int output_transpose, int constantA, int constantB, int output_N1M = 0) @@ -50,7 +49,7 @@ static int test_gemm(int M, int N, int K, float alpha, int transA, int transB, i Randomize(a[i]); } - int ret = test_layer("Gemm", pd, weights, a); + int ret = test_layer("Gemm", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_gemm failed M=%d N=%d K=%d alpha=%f transA=%d transB=%d output_transpose=%d constantA=%d constantB=%d output_N1M=%d\n", M, N, K, alpha, transA, transB, output_transpose, constantA, constantB, output_N1M); @@ -128,7 +127,7 @@ static int test_gemm_bias(int M, int N, int K, const ncnn::Mat& C, float alpha, Randomize(a[i]); } - int ret = test_layer("Gemm", pd, weights, a); + int ret = test_layer("Gemm", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_gemm_bias failed M=%d N=%d K=%d C.dims=%d C=(%d %d %d) alpha=%f beta=%f transA=%d transB=%d output_transpose=%d constantA=%d constantB=%d constantC=%d\n", M, N, K, C.dims, C.w, C.h, C.c, alpha, beta, transA, transB, output_transpose, constantA, constantB, constantC); diff --git a/tests/test_gemm_1.cpp b/tests/test_gemm_1.cpp index 5d5fcb3bff0..59a0c825627 100644 --- a/tests/test_gemm_1.cpp +++ b/tests/test_gemm_1.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/gemm.h" #include "testutil.h" static int test_gemm(int M, int N, int K, int TILE_M, int TILE_N, int TILE_K, float alpha, int transA, int transB, int output_transpose) @@ -37,7 +36,7 @@ static int test_gemm(int M, int N, int K, int TILE_M, int TILE_N, int TILE_K, fl Randomize(a[0]); Randomize(a[1]); - int ret = test_layer("Gemm", pd, weights, a); + int ret = test_layer("Gemm", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_gemm failed M=%d N=%d K=%d TILE_M=%d TILE_N=%d TILE_K=%d alpha=%f transA=%d transB=%d output_transpose=%d\n", M, N, K, TILE_M, TILE_N, TILE_K, alpha, transA, transB, output_transpose); diff --git a/tests/test_glu.cpp b/tests/test_glu.cpp index 58555aa5357..3313b4f534c 100644 --- a/tests/test_glu.cpp +++ b/tests/test_glu.cpp @@ -10,7 +10,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-#include "layer/glu.h" #include "testutil.h" static int test_glu(const ncnn::Mat& a, int axis) @@ -20,7 +19,7 @@ static int test_glu(const ncnn::Mat& a, int axis) std::vector weights(0); - int ret = test_layer("GLU", pd, weights, a); + int ret = test_layer("GLU", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_glu failed a.dims=%d a=(%d %d %d) axis=%d\n", a.dims, a.w, a.h, a.c, axis); diff --git a/tests/test_gridsample.cpp b/tests/test_gridsample.cpp index 0e384115352..438b6218d8f 100644 --- a/tests/test_gridsample.cpp +++ b/tests/test_gridsample.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/gridsample.h" #include "testutil.h" static int test_gridsample(const ncnn::Mat& a, const ncnn::Mat& grid, int sample_type, int padding_mode, int align_corner, int permute_fusion) @@ -29,7 +28,7 @@ static int test_gridsample(const ncnn::Mat& a, const ncnn::Mat& grid, int sample as[0] = a; as[1] = grid; - int ret = test_layer("GridSample", pd, weights, as); + int ret = test_layer("GridSample", pd, weights, as); if (ret != 0) { fprintf(stderr, "test_gridsample failed a.dims=%d a=(%d %d %d %d) grid.dims=%d grid=(%d %d %d %d) sample_type=%d padding_mode=%d align_corner=%d permute_fusion=%d", diff --git a/tests/test_groupnorm.cpp b/tests/test_groupnorm.cpp index cebb85617a5..0fea0988e62 100644 --- a/tests/test_groupnorm.cpp +++ b/tests/test_groupnorm.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/groupnorm.h" #include "testutil.h" static int test_groupnorm(const ncnn::Mat& a, int group, float eps, int affine) @@ -37,7 +36,7 @@ static int test_groupnorm(const ncnn::Mat& a, int group, float eps, int affine) weights[0] = RandomMat(channels); weights[1] = RandomMat(channels); - int ret = test_layer("GroupNorm", pd, weights, a); + int ret = test_layer("GroupNorm", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_groupnorm failed a.dims=%d a=(%d %d %d) group=%d eps=%f\n", a.dims, a.w, a.h, a.c, group, eps); diff --git a/tests/test_gru.cpp b/tests/test_gru.cpp index 006e544873b..487daeb3a27 100644 --- a/tests/test_gru.cpp +++ b/tests/test_gru.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-#include "layer/gru.h" #include "testutil.h" static int test_gru(const ncnn::Mat& a, int outch, int direction) @@ -30,7 +29,7 @@ static int test_gru(const ncnn::Mat& a, int outch, int direction) weights[1] = RandomMat(outch * 4 * num_directions); weights[2] = RandomMat(outch * outch * 3 * num_directions); - int ret = test_layer("GRU", pd, weights, a); + int ret = test_layer("GRU", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_gru failed a.dims=%d a=(%d %d %d) outch=%d, direction = %d \n", a.dims, a.w, a.h, a.c, outch, direction); @@ -61,7 +60,7 @@ int test_gru_layer_with_hidden(const ncnn::Mat& a, int outch, int direction) as[0] = a; as[1] = hidden; - int ret = test_layer("GRU", pd, weights, as, 2); + int ret = test_layer("GRU", pd, weights, as, 2); if (ret != 0) { fprintf(stderr, "test_gru_layer_with_hidden failed a.dims=%d a=(%d %d %d) outch=%d, direction = %d \n", a.dims, a.w, a.h, a.c, outch, direction); @@ -92,7 +91,7 @@ int test_gru_layer_with_hidden_input(const ncnn::Mat& a, int outch, int directio as[0] = a; as[1] = hidden; - int ret = test_layer("GRU", pd, weights, as, 1); + int ret = test_layer("GRU", pd, weights, as, 1); if (ret != 0) { fprintf(stderr, "test_gru_layer_with_hidden_input failed a.dims=%d a=(%d %d %d) outch=%d, direction = %d \n", a.dims, a.w, a.h, a.c, outch, direction); @@ -119,7 +118,7 @@ int test_gru_layer_with_hidden_output(const ncnn::Mat& a, int outch, int directi std::vector as(1); as[0] = a; - int ret = test_layer("GRU", pd, weights, as, 2); + int ret = test_layer("GRU", pd, weights, as, 2); if (ret != 0) { fprintf(stderr, "test_gru_layer_with_hidden_output failed a.dims=%d a=(%d %d %d) outch=%d, direction = %d \n", a.dims, a.w, a.h, a.c, outch, direction); diff --git a/tests/test_hardsigmoid.cpp b/tests/test_hardsigmoid.cpp index 8941b661788..d1318ef4962 100644 --- a/tests/test_hardsigmoid.cpp +++ b/tests/test_hardsigmoid.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/hardsigmoid.h" #include "testutil.h" static int test_hardsigmoid(const ncnn::Mat& a, float alpha, float beta) @@ -23,7 +22,7 @@ static int test_hardsigmoid(const ncnn::Mat& a, float alpha, float beta) std::vector weights(0); - int ret = test_layer("HardSigmoid", pd, weights, a); + int ret = test_layer("HardSigmoid", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_hardsigmoid failed a.dims=%d a=(%d %d %d) alpha=%f beta=%f\n", a.dims, a.w, a.h, a.c, alpha, beta); diff --git a/tests/test_hardswish.cpp b/tests/test_hardswish.cpp index a6ca76da8ec..0f3352652de 100644 --- a/tests/test_hardswish.cpp +++ b/tests/test_hardswish.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-#include "layer/hardswish.h" #include "testutil.h" static int test_hardswish(const ncnn::Mat& a, float alpha, float beta) @@ -23,7 +22,7 @@ static int test_hardswish(const ncnn::Mat& a, float alpha, float beta) std::vector weights(0); - int ret = test_layer("HardSwish", pd, weights, a); + int ret = test_layer("HardSwish", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_hardswish failed a.dims=%d a=(%d %d %d) alpha=%f beta=%f\n", a.dims, a.w, a.h, a.c, alpha, beta); diff --git a/tests/test_innerproduct.cpp b/tests/test_innerproduct.cpp index a9ec260db68..298ad6aa078 100644 --- a/tests/test_innerproduct.cpp +++ b/tests/test_innerproduct.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/innerproduct.h" #include "testutil.h" static int test_innerproduct(const ncnn::Mat& a, int outch, int bias) @@ -34,7 +33,7 @@ static int test_innerproduct(const ncnn::Mat& a, int outch, int bias) if (bias) weights[1] = RandomMat(outch); - int ret = test_layer("InnerProduct", pd, weights, a); + int ret = test_layer("InnerProduct", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_innerproduct failed a.dims=%d a=(%d %d %d) outch=%d bias=%d act=%d actparams=[%f,%f]\n", a.dims, a.w, a.h, a.c, outch, bias, activation_type, activation_params[0], activation_params[1]); @@ -122,7 +121,7 @@ static int test_innerproduct_int8(const ncnn::Mat& a, int outch, int bias) } int flag = TEST_LAYER_DISABLE_GPU_TESTING; - int ret = test_layer("InnerProduct", pd, weights, a, 0.001f, 0, flag); + int ret = test_layer("InnerProduct", pd, weights, a, 0.001f, 0, flag); if (ret != 0) { fprintf(stderr, "test_innerproduct_int8 failed a.dims=%d a=(%d %d %d) outch=%d bias=%d act=%d actparams=[%f,%f]\n", a.dims, a.w, a.h, a.c, outch, bias, activation_type, activation_params[0], activation_params[1]); @@ -167,7 +166,7 @@ static int test_innerproduct_gemm(const ncnn::Mat& a, int outch, int bias) if (bias) weights[1] = RandomMat(outch); - int ret = test_layer("InnerProduct", pd, weights, a); + int ret = test_layer("InnerProduct", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_innerproduct_gemm failed a.dims=%d a=(%d %d %d) outch=%d bias=%d act=%d actparams=[%f,%f]\n", a.dims, a.w, a.h, a.c, outch, bias, activation_type, activation_params[0], activation_params[1]); @@ -234,7 +233,7 @@ static int test_innerproduct_gemm_int8(const ncnn::Mat& a, int outch, int bias) } int flag = TEST_LAYER_DISABLE_GPU_TESTING; - int ret = test_layer("InnerProduct", pd, weights, a, 0.001f, 0, flag); + int ret = test_layer("InnerProduct", pd, weights, a, 0.001f, 0, flag); if (ret != 0) { fprintf(stderr, "test_innerproduct_gemm_int8 failed a.dims=%d a=(%d %d %d) outch=%d bias=%d\n", a.dims, a.w, a.h, a.c, outch, bias); diff --git a/tests/test_instancenorm.cpp b/tests/test_instancenorm.cpp index 900c351e10f..1c28dc3582c 100644 --- a/tests/test_instancenorm.cpp +++ b/tests/test_instancenorm.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-#include "layer/instancenorm.h" #include "testutil.h" static int test_instancenorm(const ncnn::Mat& a, float eps, int affine) @@ -28,7 +27,7 @@ static int test_instancenorm(const ncnn::Mat& a, float eps, int affine) weights[0] = RandomMat(channels); weights[1] = RandomMat(channels); - int ret = test_layer("InstanceNorm", pd, weights, a); + int ret = test_layer("InstanceNorm", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_instancenorm failed a.dims=%d a=(%d %d %d) eps=%f affine=%d\n", a.dims, a.w, a.h, a.c, eps, affine); diff --git a/tests/test_interp.cpp b/tests/test_interp.cpp index a2b0620f869..b2c646cf84e 100644 --- a/tests/test_interp.cpp +++ b/tests/test_interp.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/interp.h" #include "testutil.h" static int test_interp(const ncnn::Mat& a, int resize_type, float height_scale, float width_scale, int output_height, int output_width) @@ -26,7 +25,7 @@ static int test_interp(const ncnn::Mat& a, int resize_type, float height_scale, std::vector weights(0); - int ret = test_layer("Interp", pd, weights, a); + int ret = test_layer("Interp", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_interp failed a.dims=%d a=(%d %d %d) resize_type=%d height_scale=%f width_scale=%f output_height=%d output_width=%d\n", a.dims, a.w, a.h, a.c, resize_type, height_scale, width_scale, output_height, output_width); @@ -47,7 +46,7 @@ static int test_interp_ref(const ncnn::Mat& a, int resize_type, int output_heigh std::vector weights(0); - int ret = test_layer("Interp", pd, weights, as); + int ret = test_layer("Interp", pd, weights, as); if (ret != 0) { fprintf(stderr, "test_interp_ref failed a.dims=%d a=(%d %d %d) resize_type=%d output_height=%d output_width=%d\n", a.dims, a.w, a.h, a.c, resize_type, output_height, output_width); @@ -68,7 +67,7 @@ static int test_interp_align_corner(const ncnn::Mat& a, int resize_type, float h std::vector weights(0); - int ret = test_layer("Interp", pd, weights, a); + int ret = test_layer("Interp", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_interp failed a.dims=%d a=(%d %d %d) resize_type=%d height_scale=%f width_scale=%f output_height=%d output_width=%d align_corner=%d\n", a.dims, a.w, a.h, a.c, resize_type, height_scale, width_scale, output_height, output_width, align_corner); @@ -88,7 +87,7 @@ static int test_interp(const ncnn::Mat& a, int resize_type, float width_scale, i std::vector weights(0); - int ret = test_layer("Interp", pd, weights, a); + int ret = test_layer("Interp", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_interp failed a.dims=%d a=(%d %d %d) resize_type=%d width_scale=%f output_width=%d\n", a.dims, a.w, a.h, a.c, resize_type, width_scale, output_width); @@ -109,7 +108,7 @@ static int test_interp_ref(const ncnn::Mat& a, int resize_type, int output_width std::vector weights(0); - int ret = test_layer("Interp", pd, weights, as); + int ret = test_layer("Interp", pd, weights, as); if (ret != 0) { fprintf(stderr, "test_interp_ref failed a.dims=%d a=(%d %d %d) resize_type=%d output_width=%d\n", a.dims, a.w, a.h, a.c, resize_type, output_width); @@ -130,7 +129,7 @@ static int test_interp_align_corner(const ncnn::Mat& a, int resize_type, float w std::vector weights(0); - int ret = test_layer("Interp", pd, weights, a); + int ret = test_layer("Interp", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_interp failed a.dims=%d a=(%d %d %d) 
diff --git a/tests/test_layernorm.cpp b/tests/test_layernorm.cpp
index fefb37c8a4c..b6d6684ab4c 100644
--- a/tests/test_layernorm.cpp
+++ b/tests/test_layernorm.cpp
@@ -12,7 +12,6 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-#include "layer/layernorm.h"
 #include "testutil.h"
 
 static int test_layernorm(const ncnn::Mat& a, int affine_size, float eps, int affine)
@@ -26,7 +25,7 @@ static int test_layernorm(const ncnn::Mat& a, int affine_size, float eps, int af
     weights[0] = RandomMat(affine_size);
     weights[1] = RandomMat(affine_size);
 
-    int ret = test_layer<ncnn::LayerNorm>("LayerNorm", pd, weights, a);
+    int ret = test_layer("LayerNorm", pd, weights, a);
     if (ret != 0)
     {
         fprintf(stderr, "test_layernorm failed a.dims=%d a=(%d %d %d) affine_size=%d eps=%f affine=%d\n", a.dims, a.w, a.h, a.c, affine_size, eps, affine);
diff --git a/tests/test_lrn.cpp b/tests/test_lrn.cpp
index 7f0122d0651..6fe5d0e6a1f 100644
--- a/tests/test_lrn.cpp
+++ b/tests/test_lrn.cpp
@@ -12,7 +12,6 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-#include "layer/lrn.h"
 #include "testutil.h"
 
 static int test_lrn(const ncnn::Mat& a, int region_type, int local_size, float alpha, float beta, float bias)
@@ -26,7 +25,7 @@ static int test_lrn(const ncnn::Mat& a, int region_type, int local_size, float a
 
     std::vector<ncnn::Mat> weights(0);
 
-    int ret = test_layer<ncnn::LRN>("LRN", pd, weights, a);
+    int ret = test_layer("LRN", pd, weights, a);
     if (ret != 0)
     {
         fprintf(stderr, "test_lrn failed a.dims=%d a=(%d %d %d) region_type=%d local_size=%d alpha=%f beta=%f bias=%f\n", a.dims, a.w, a.h, a.c, region_type, local_size, alpha, beta, bias);
diff --git a/tests/test_lstm.cpp b/tests/test_lstm.cpp
index fb76ad0fbd7..8b5788a86dc 100644
--- a/tests/test_lstm.cpp
+++ b/tests/test_lstm.cpp
@@ -12,7 +12,6 @@
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 
-#include "layer/lstm.h" #include "testutil.h" static int test_lstm(const ncnn::Mat& a, int outch, int direction, int hidden_size = 0) @@ -37,7 +36,7 @@ static int test_lstm(const ncnn::Mat& a, int outch, int direction, int hidden_si weights[3] = RandomMat(hidden_size * outch * num_directions); } - int ret = test_layer("LSTM", pd, weights, a); + int ret = test_layer("LSTM", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_lstm failed a.dims=%d a=(%d %d %d) outch=%d direction=%d hidden_size=%d\n", a.dims, a.w, a.h, a.c, outch, direction, hidden_size); @@ -79,7 +78,7 @@ int test_lstm_layer_with_hidden(const ncnn::Mat& a, int outch, int direction, in as[1] = hidden; as[2] = cell; - int ret = test_layer("LSTM", pd, weights, as, 3); + int ret = test_layer("LSTM", pd, weights, as, 3); if (ret != 0) { fprintf(stderr, "test_lstm_layer_with_hidden failed a.dims=%d a=(%d %d %d) outch=%d direction=%d hidden_size=%d\n", a.dims, a.w, a.h, a.c, outch, direction, hidden_size); @@ -121,7 +120,7 @@ int test_lstm_layer_with_hidden_input(const ncnn::Mat& a, int outch, int directi as[1] = hidden; as[2] = cell; - int ret = test_layer("LSTM", pd, weights, as, 1); + int ret = test_layer("LSTM", pd, weights, as, 1); if (ret != 0) { fprintf(stderr, "test_lstm_layer_with_hidden_input failed a.dims=%d a=(%d %d %d) outch=%d direction=%d hidden_size=%d\n", a.dims, a.w, a.h, a.c, outch, direction, hidden_size); @@ -155,7 +154,7 @@ int test_lstm_layer_with_hidden_output(const ncnn::Mat& a, int outch, int direct std::vector as(1); as[0] = a; - int ret = test_layer("LSTM", pd, weights, as, 3); + int ret = test_layer("LSTM", pd, weights, as, 3); if (ret != 0) { fprintf(stderr, "test_lstm_layer_with_hidden_output failed a.dims=%d a=(%d %d %d) outch=%d direction=%d hidden_size=%d\n", a.dims, a.w, a.h, a.c, outch, direction, hidden_size); diff --git a/tests/test_matmul.cpp b/tests/test_matmul.cpp index 34b4aad0354..0ca17d10825 100644 --- a/tests/test_matmul.cpp +++ b/tests/test_matmul.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/matmul.h" #include "testutil.h" static int test_matmul(const ncnn::Mat& a, const ncnn::Mat& b) @@ -26,7 +25,7 @@ static int test_matmul(const ncnn::Mat& a, const ncnn::Mat& b) as[0] = a; as[1] = b; - int ret = test_layer("MatMul", pd, weights, as); + int ret = test_layer("MatMul", pd, weights, as); if (ret != 0) { fprintf(stderr, "test_matmul failed a.dims=%d a=(%d %d %d %d) b.dims=%d b=(%d %d %d %d)\n", a.dims, a.w, a.h, a.d, a.c, b.dims, b.w, b.h, b.d, b.c); @@ -46,7 +45,7 @@ static int test_matmul_transb(const ncnn::Mat& a, const ncnn::Mat& b) as[0] = a; as[1] = b; - int ret = test_layer("MatMul", pd, weights, as); + int ret = test_layer("MatMul", pd, weights, as); if (ret != 0) { fprintf(stderr, "test_matmul_transb failed a.dims=%d a=(%d %d %d %d) b.dims=%d b=(%d %d %d %d)\n", a.dims, a.w, a.h, a.d, a.c, b.dims, b.w, b.h, b.d, b.c); diff --git a/tests/test_memorydata.cpp b/tests/test_memorydata.cpp index 25b2bd0b1fb..ff15fab3582 100644 --- a/tests/test_memorydata.cpp +++ b/tests/test_memorydata.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-#include "layer/memorydata.h" #include "testutil.h" static int test_memorydata(const ncnn::Mat& a) @@ -27,7 +26,7 @@ static int test_memorydata(const ncnn::Mat& a) std::vector as(0); - int ret = test_layer("MemoryData", pd, weights, as, 1); + int ret = test_layer("MemoryData", pd, weights, as, 1); if (ret != 0) { fprintf(stderr, "test_memorydata failed a.dims=%d a=(%d %d %d)\n", a.dims, a.w, a.h, a.c); diff --git a/tests/test_mish.cpp b/tests/test_mish.cpp index ffbc923f65c..7bf03e19f63 100644 --- a/tests/test_mish.cpp +++ b/tests/test_mish.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/mish.h" #include "testutil.h" static int test_mish(const ncnn::Mat& a) @@ -21,7 +20,7 @@ static int test_mish(const ncnn::Mat& a) std::vector weights(0); - int ret = test_layer("Mish", pd, weights, a); + int ret = test_layer("Mish", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_mish failed a.dims=%d a=(%d %d %d %d)\n", a.dims, a.w, a.h, a.d, a.c); diff --git a/tests/test_multiheadattention.cpp b/tests/test_multiheadattention.cpp index e243fb910a9..ad29c6b98b0 100644 --- a/tests/test_multiheadattention.cpp +++ b/tests/test_multiheadattention.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/multiheadattention.h" #include "testutil.h" static int test_multiheadattention(const ncnn::Mat& q, const ncnn::Mat& k, const ncnn::Mat& v, int num_heads, int kdim, int vdim, int attn_mask) @@ -49,7 +48,7 @@ static int test_multiheadattention(const ncnn::Mat& q, const ncnn::Mat& k, const float epsilon = 0.005; - int ret = test_layer("MultiHeadAttention", pd, weights, as, 1, epsilon); + int ret = test_layer("MultiHeadAttention", pd, weights, as, 1, epsilon); if (ret != 0) { fprintf(stderr, "test_multiheadattention failed q=(%d %d) k=(%d %d) v=(%d %d) num_heads=%d kdim=%d vdim=%d attn_mask=%d\n", q.w, q.h, k.w, k.h, v.w, v.h, num_heads, kdim, vdim, attn_mask); @@ -85,7 +84,7 @@ static int test_multiheadattention_samekv(const ncnn::Mat& q, const ncnn::Mat& k float epsilon = 0.005; - int ret = test_layer("MultiHeadAttention", pd, weights, as, 1, epsilon); + int ret = test_layer("MultiHeadAttention", pd, weights, as, 1, epsilon); if (ret != 0) { fprintf(stderr, "test_multiheadattention_samekv failed q=(%d %d) kv=(%d %d) num_heads=%d kvdim=%d\n", q.w, q.h, kv.w, kv.h, num_heads, kvdim); @@ -118,7 +117,7 @@ static int test_multiheadattention_sameqkv(const ncnn::Mat& a, int num_heads) float epsilon = 0.005; - int ret = test_layer("MultiHeadAttention", pd, weights, as, 1, epsilon); + int ret = test_layer("MultiHeadAttention", pd, weights, as, 1, epsilon); if (ret != 0) { fprintf(stderr, "test_multiheadattention_sameqkv failed a=(%d %d) num_heads=%d\n", a.w, a.h, num_heads); diff --git a/tests/test_noop.cpp b/tests/test_noop.cpp index ad7e9552ad3..8484a151e79 100644 --- a/tests/test_noop.cpp +++ b/tests/test_noop.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-#include "layer/noop.h" #include "testutil.h" static int test_noop(const ncnn::Mat& a) @@ -24,7 +23,7 @@ static int test_noop(const ncnn::Mat& a) std::vector as(1); as[0] = a; - int ret = test_layer("Noop", pd, weights, as, 1); + int ret = test_layer("Noop", pd, weights, as, 1); if (ret != 0) { fprintf(stderr, "test_noop failed a.dims=%d a=(%d %d %d)\n", a.dims, a.w, a.h, a.c); diff --git a/tests/test_normalize.cpp b/tests/test_normalize.cpp index f1e56b659a5..d20d19ef69c 100644 --- a/tests/test_normalize.cpp +++ b/tests/test_normalize.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/normalize.h" #include "testutil.h" static int test_normalize(const ncnn::Mat& a, int across_spatial, int across_channel, int channel_shared, float eps, int eps_mode) @@ -30,7 +29,7 @@ static int test_normalize(const ncnn::Mat& a, int across_spatial, int across_cha std::vector weights(1); weights[0] = RandomMat(scale_data_size); - int ret = test_layer("Normalize", pd, weights, a); + int ret = test_layer("Normalize", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_normalize failed a.dims=%d a=(%d %d %d) across_spatial=%d across_channel=%d channel_shared=%d eps=%f eps_mode=%d\n", a.dims, a.w, a.h, a.c, across_spatial, across_channel, channel_shared, eps, eps_mode); diff --git a/tests/test_packing.cpp b/tests/test_packing.cpp index 5de21f7bbb5..84652d9e0b7 100644 --- a/tests/test_packing.cpp +++ b/tests/test_packing.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/packing.h" #include "testutil.h" static int test_packing_cpu_fp32(const ncnn::Mat& a, int in_elempack, int out_elempack) @@ -189,7 +188,6 @@ static int test_packing_cpu(const ncnn::Mat& a, int in_elempack, int out_elempac } #if NCNN_VULKAN -#include "layer/vulkan/packing_vulkan.h" static int test_packing_gpu_buffer(const ncnn::Mat& a, int in_elempack, int out_elempack) { diff --git a/tests/test_padding.cpp b/tests/test_padding.cpp index be192069ff7..f8018781e72 100644 --- a/tests/test_padding.cpp +++ b/tests/test_padding.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-#include "layer/padding.h" #include "testutil.h" static int test_padding(const ncnn::Mat& a, int top, int bottom, int left, int right, int front, int behind, int type, float value, int per_channel_pad_data_size) @@ -32,7 +31,7 @@ static int test_padding(const ncnn::Mat& a, int top, int bottom, int left, int r if (per_channel_pad_data_size) weights[0] = RandomMat(per_channel_pad_data_size); - int ret = test_layer("Padding", pd, weights, a); + int ret = test_layer("Padding", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_padding failed a.dims=%d a=(%d %d %d %d) top=%d bottom=%d left=%d right=%d front=%d behind=%d type=%d value=%f per_channel_pad_data_size=%d\n", a.dims, a.w, a.h, a.d, a.c, top, bottom, left, right, front, behind, type, value, per_channel_pad_data_size); @@ -242,7 +241,7 @@ static int test_padding_int8(const ncnn::Mat& a, int top, int bottom, int left, weights[0] = RandomMat(per_channel_pad_data_size); int flag = TEST_LAYER_DISABLE_AUTO_INPUT_CASTING | TEST_LAYER_DISABLE_GPU_TESTING; - int ret = test_layer("Padding", pd, weights, a, 0.001, 0, flag); + int ret = test_layer("Padding", pd, weights, a, 0.001, 0, flag); if (ret != 0) { fprintf(stderr, "test_padding_int8 failed a.dims=%d a=(%d %d %d %d) top=%d bottom=%d left=%d right=%d front=%d behind=%d type=%d value=%f per_channel_pad_data_size=%d\n", a.dims, a.w, a.h, a.d, a.c, top, bottom, left, right, front, behind, type, value, per_channel_pad_data_size); diff --git a/tests/test_permute.cpp b/tests/test_permute.cpp index e6f9c7d9d54..2793185c935 100644 --- a/tests/test_permute.cpp +++ b/tests/test_permute.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/permute.h" #include "testutil.h" static int test_permute(const ncnn::Mat& a, int order_type) @@ -22,7 +21,7 @@ static int test_permute(const ncnn::Mat& a, int order_type) std::vector weights(0); - int ret = test_layer("Permute", pd, weights, a); + int ret = test_layer("Permute", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_permute failed a.dims=%d a=(%d %d %d %d) order_type=%d\n", a.dims, a.w, a.h, a.d, a.c, order_type); diff --git a/tests/test_pixelshuffle.cpp b/tests/test_pixelshuffle.cpp index 1fa04b6161e..f55c81a9f1c 100644 --- a/tests/test_pixelshuffle.cpp +++ b/tests/test_pixelshuffle.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/pixelshuffle.h" #include "testutil.h" static int test_pixelshuffle(const ncnn::Mat& a, int upscale_factor, int mode) @@ -23,7 +22,7 @@ static int test_pixelshuffle(const ncnn::Mat& a, int upscale_factor, int mode) std::vector weights(0); - int ret = test_layer("PixelShuffle", pd, weights, a); + int ret = test_layer("PixelShuffle", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_pixelshuffle failed a.dims=%d a=(%d %d %d) upscale_factor=%d mode=%d\n", a.dims, a.w, a.h, a.c, upscale_factor, mode); diff --git a/tests/test_pooling.cpp b/tests/test_pooling.cpp index df0e69e922d..01d85b80e07 100644 --- a/tests/test_pooling.cpp +++ b/tests/test_pooling.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-#include "layer/pooling.h" #include "testutil.h" static int test_pooling(int w, int h, int c, int pooling_type, int kernel, int stride, int pad, int global_pooling, int pad_mode, int avgpool_count_include_pad, int adaptive_pooling, int out_w) @@ -32,7 +31,7 @@ static int test_pooling(int w, int h, int c, int pooling_type, int kernel, int s std::vector weights(0); - int ret = test_layer("Pooling", pd, weights, a); + int ret = test_layer("Pooling", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_pooling failed w=%d h=%d c=%d pooling_type=%d kernel=%d stride=%d pad=%d global_pooling=%d pad_mode=%d avgpool_count_include_pad=%d adaptive_pooling=%d out_w=%d\n", w, h, c, pooling_type, kernel, stride, pad, global_pooling, pad_mode, avgpool_count_include_pad, adaptive_pooling, out_w); diff --git a/tests/test_pooling1d.cpp b/tests/test_pooling1d.cpp index 05a7cb83327..b73c4fdb6d1 100644 --- a/tests/test_pooling1d.cpp +++ b/tests/test_pooling1d.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/pooling1d.h" #include "testutil.h" static int test_pooling1d(int w, int h, int pooling_type, int kernel, int stride, int pad, int global_pooling, int pad_mode, int avgpool_count_include_pad, int adaptive_pooling, int out_w) @@ -32,7 +31,7 @@ static int test_pooling1d(int w, int h, int pooling_type, int kernel, int stride std::vector weights(0); - int ret = test_layer("Pooling1D", pd, weights, a); + int ret = test_layer("Pooling1D", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_pooling1d failed w=%d h=%d pooling_type=%d kernel=%d stride=%d pad=%d global_pooling=%d pad_mode=%d avgpool_count_include_pad=%d adaptive_pooling=%d out_w=%d\n", w, h, pooling_type, kernel, stride, pad, global_pooling, pad_mode, avgpool_count_include_pad, adaptive_pooling, out_w); diff --git a/tests/test_pooling3d.cpp b/tests/test_pooling3d.cpp index 870fd79a939..4296bd446c5 100644 --- a/tests/test_pooling3d.cpp +++ b/tests/test_pooling3d.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/pooling3d.h" #include "testutil.h" static int test_pooling3d(int w, int h, int d, int c, int pooling_type, int kernel, int stride, int pad, int global_pooling, int pad_mode, int avgpool_count_include_pad, int adaptive_pooling, int out_w) @@ -32,7 +31,7 @@ static int test_pooling3d(int w, int h, int d, int c, int pooling_type, int kern std::vector weights(0); - int ret = test_layer("Pooling3D", pd, weights, a); + int ret = test_layer("Pooling3D", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_pooling3d failed w=%d h=%d d=%d c=%d pooling_type=%d kernel=%d stride=%d pad=%d global_pooling=%d pad_mode=%d avgpool_count_include_pad=%d adaptive_pooling=%d out_w=%d\n", w, h, d, c, pooling_type, kernel, stride, pad, global_pooling, pad_mode, avgpool_count_include_pad, adaptive_pooling, out_w); diff --git a/tests/test_power.cpp b/tests/test_power.cpp index da399cedaf1..550cda42f9f 100644 --- a/tests/test_power.cpp +++ b/tests/test_power.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-#include "layer/power.h" #include "testutil.h" static int test_power(const ncnn::Mat& a) @@ -24,7 +23,7 @@ static int test_power(const ncnn::Mat& a) std::vector weights(0); - int ret = test_layer("Power", pd, weights, a); + int ret = test_layer("Power", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_power failed a.dims=%d a=(%d %d %d)\n", a.dims, a.w, a.h, a.c); diff --git a/tests/test_prelu.cpp b/tests/test_prelu.cpp index 4184a288ada..02887de8162 100644 --- a/tests/test_prelu.cpp +++ b/tests/test_prelu.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/prelu.h" #include "testutil.h" static int test_prelu(const ncnn::Mat& a, int num_slope) @@ -23,7 +22,7 @@ static int test_prelu(const ncnn::Mat& a, int num_slope) std::vector weights(1); weights[0] = RandomMat(num_slope); - int ret = test_layer("PReLU", pd, weights, a); + int ret = test_layer("PReLU", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_prelu failed a.dims=%d a=(%d %d %d) num_slope=%d\n", a.dims, a.w, a.h, a.c, num_slope); diff --git a/tests/test_priorbox.cpp b/tests/test_priorbox.cpp index c1c53ecaee2..ae224320966 100644 --- a/tests/test_priorbox.cpp +++ b/tests/test_priorbox.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/priorbox.h" #include "testutil.h" static int test_priorbox_caffe() @@ -51,7 +50,7 @@ static int test_priorbox_caffe() as[0] = RandomMat(72, 72, 1); as[1] = RandomMat(512, 512, 1); - int ret = test_layer("PriorBox", pd, weights, as, 1); + int ret = test_layer("PriorBox", pd, weights, as, 1); if (ret != 0) { fprintf(stderr, "test_priorbox_caffe failed\n"); @@ -98,7 +97,7 @@ static int test_priorbox_mxnet() std::vector as(1); as[0] = RandomMat(72, 72, 1); - int ret = test_layer("PriorBox", pd, weights, as, 1); + int ret = test_layer("PriorBox", pd, weights, as, 1); if (ret != 0) { fprintf(stderr, "test_priorbox_mxnet failed\n"); diff --git a/tests/test_quantize.cpp b/tests/test_quantize.cpp index afc21e3f9ee..a6e67b23d46 100644 --- a/tests/test_quantize.cpp +++ b/tests/test_quantize.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/quantize.h" #include "testutil.h" static int test_quantize(const ncnn::Mat& a, float scale_low, float scale_high) @@ -37,7 +36,7 @@ static int test_quantize(const ncnn::Mat& a, float scale_low, float scale_high) std::vector weights(1); weights[0] = scale_data; - int ret = test_layer("Quantize", pd, weights, a); + int ret = test_layer("Quantize", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_quantize failed a.dims=%d a=(%d %d %d) scale_low=%f scale_high=%f\n", a.dims, a.w, a.h, a.c, scale_low, scale_high); diff --git a/tests/test_reduction.cpp b/tests/test_reduction.cpp index 01b808c4fc6..3895d353333 100644 --- a/tests/test_reduction.cpp +++ b/tests/test_reduction.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-#include "layer/reduction.h" #include "testutil.h" #define OP_TYPE_MAX 11 @@ -86,7 +85,7 @@ static int test_reduction(const ncnn::Mat& _a, float coeff, int keepdims) std::vector weights(0); - int ret = test_layer("Reduction", pd, weights, a); + int ret = test_layer("Reduction", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_reduction failed a.dims=%d a=(%d %d %d %d) op_type=%d coeff=%f keepdims=%d reduce_all=1\n", a.dims, a.w, a.h, a.d, a.c, op_type, coeff, keepdims); @@ -113,7 +112,7 @@ static int test_reduction(const ncnn::Mat& _a, float coeff, int keepdims, const std::vector weights(0); - int ret = test_layer("Reduction", pd, weights, a); + int ret = test_layer("Reduction", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_reduction failed a.dims=%d a=(%d %d %d %d) op_type=%d coeff=%f keepdims=%d", a.dims, a.w, a.h, a.d, a.c, op_type, coeff, keepdims); diff --git a/tests/test_relu.cpp b/tests/test_relu.cpp index 26cbf00314b..8ecf293adce 100644 --- a/tests/test_relu.cpp +++ b/tests/test_relu.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/relu.h" #include "testutil.h" static int test_relu(const ncnn::Mat& a, float slope) @@ -22,7 +21,7 @@ static int test_relu(const ncnn::Mat& a, float slope) std::vector weights(0); - int ret = test_layer("ReLU", pd, weights, a); + int ret = test_layer("ReLU", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_relu failed a.dims=%d a=(%d %d %d %d) slope=%f\n", a.dims, a.w, a.h, a.d, a.c, slope); diff --git a/tests/test_reorg.cpp b/tests/test_reorg.cpp index c6308cdc6ce..c1c363072ed 100644 --- a/tests/test_reorg.cpp +++ b/tests/test_reorg.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/reorg.h" #include "testutil.h" static int test_reorg(const ncnn::Mat& a, int stride, int mode) @@ -23,7 +22,7 @@ static int test_reorg(const ncnn::Mat& a, int stride, int mode) std::vector weights(0); - int ret = test_layer("Reorg", pd, weights, a); + int ret = test_layer("Reorg", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_reorg failed a.dims=%d a=(%d %d %d) stride=%d mode=%d\n", a.dims, a.w, a.h, a.c, stride, mode); diff --git a/tests/test_requantize.cpp b/tests/test_requantize.cpp index 63f99c7153a..1032d529ea6 100644 --- a/tests/test_requantize.cpp +++ b/tests/test_requantize.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-#include "layer/requantize.h" #include "testutil.h" static int test_requantize(const ncnn::Mat& a, int scale_in_data_size, int scale_out_data_size, int bias_data_size, int activation_type, float alpha, float beta) @@ -38,7 +37,7 @@ static int test_requantize(const ncnn::Mat& a, int scale_in_data_size, int scale Randomize(weights[1], 10, 100); int flag = TEST_LAYER_DISABLE_AUTO_INPUT_CASTING; - int ret = test_layer("Requantize", pd, weights, a, 1, 0, flag); + int ret = test_layer("Requantize", pd, weights, a, 1, 0, flag); if (ret != 0) { fprintf(stderr, "test_requantize failed a.dims=%d a=(%d %d %d) scale_in_data_size=%d scale_out_data_size=%d bias_data_size=%d act=%d actparams=[%f,%f]\n", a.dims, a.w, a.h, a.c, scale_in_data_size, scale_out_data_size, bias_data_size, activation_type, activation_params[0], activation_params[1]); @@ -81,7 +80,7 @@ static int test_requantize_pack8(const ncnn::Mat& a, int scale_in_data_size, int Randomize(weights[1], 10, 100); int flag = TEST_LAYER_DISABLE_AUTO_INPUT_CASTING | TEST_LAYER_ENABLE_FORCE_INPUT_PACK8; - int ret = test_layer("Requantize", pd, weights, a, 1, 0, flag); + int ret = test_layer("Requantize", pd, weights, a, 1, 0, flag); if (ret != 0) { fprintf(stderr, "test_requantize_pack8 failed a.dims=%d a=(%d %d %d) scale_in_data_size=%d scale_out_data_size=%d bias_data_size=%d act=%d actparams=[%f,%f]\n", a.dims, a.w, a.h, a.c, scale_in_data_size, scale_out_data_size, bias_data_size, activation_type, activation_params[0], activation_params[1]); diff --git a/tests/test_reshape.cpp b/tests/test_reshape.cpp index fd15dda4e0a..b908e941b57 100644 --- a/tests/test_reshape.cpp +++ b/tests/test_reshape.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/reshape.h" #include "testutil.h" static int test_reshape(const ncnn::Mat& a, int outw, int outh, int outd, int outc) @@ -25,7 +24,7 @@ static int test_reshape(const ncnn::Mat& a, int outw, int outh, int outd, int ou std::vector weights(0); - int ret = test_layer("Reshape", pd, weights, a); + int ret = test_layer("Reshape", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_reshape failed a.dims=%d a=(%d %d %d %d) outw=%d outh=%d outd=%d outc=%d\n", a.dims, a.w, a.h, a.d, a.c, outw, outh, outd, outc); diff --git a/tests/test_reshape_1.cpp b/tests/test_reshape_1.cpp index 63cbbf1baf0..4f8ef8e263f 100644 --- a/tests/test_reshape_1.cpp +++ b/tests/test_reshape_1.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/reshape.h" #include "testutil.h" static int test_reshape_permute(const ncnn::Mat& a, int outw, int outh, int outd, int outc) @@ -26,7 +25,7 @@ static int test_reshape_permute(const ncnn::Mat& a, int outw, int outh, int outd std::vector weights(0); - int ret = test_layer("Reshape", pd, weights, a); + int ret = test_layer("Reshape", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_reshape_permute failed a.dims=%d a=(%d %d %d %d) outw=%d outh=%d outd=%d outc=%d\n", a.dims, a.w, a.h, a.d, a.c, outw, outh, outd, outc); diff --git a/tests/test_rnn.cpp b/tests/test_rnn.cpp index 31b89a22a20..f9cb9a5d752 100644 --- a/tests/test_rnn.cpp +++ b/tests/test_rnn.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. 
See the License for the // specific language governing permissions and limitations under the License. -#include "layer/rnn.h" #include "testutil.h" static int test_rnn(const ncnn::Mat& a, int outch, int direction) @@ -30,7 +29,7 @@ static int test_rnn(const ncnn::Mat& a, int outch, int direction) weights[1] = RandomMat(outch * num_directions); weights[2] = RandomMat(outch * outch * num_directions); - int ret = test_layer("RNN", pd, weights, a); + int ret = test_layer("RNN", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_rnn failed a.dims=%d a=(%d %d %d) outch=%d, direction = %d \n", a.dims, a.w, a.h, a.c, outch, direction); @@ -61,7 +60,7 @@ int test_rnn_layer_with_hidden(const ncnn::Mat& a, int outch, int direction) as[0] = a; as[1] = hidden; - int ret = test_layer("RNN", pd, weights, as, 2); + int ret = test_layer("RNN", pd, weights, as, 2); if (ret != 0) { fprintf(stderr, "test_rnn_layer_with_hidden failed a.dims=%d a=(%d %d %d) outch=%d, direction = %d \n", a.dims, a.w, a.h, a.c, outch, direction); @@ -92,7 +91,7 @@ int test_rnn_layer_with_hidden_input(const ncnn::Mat& a, int outch, int directio as[0] = a; as[1] = hidden; - int ret = test_layer("RNN", pd, weights, as, 1); + int ret = test_layer("RNN", pd, weights, as, 1); if (ret != 0) { fprintf(stderr, "test_rnn_layer_with_hidden_input failed a.dims=%d a=(%d %d %d) outch=%d, direction = %d \n", a.dims, a.w, a.h, a.c, outch, direction); @@ -119,7 +118,7 @@ int test_rnn_layer_with_hidden_output(const ncnn::Mat& a, int outch, int directi std::vector as(1); as[0] = a; - int ret = test_layer("RNN", pd, weights, as, 2); + int ret = test_layer("RNN", pd, weights, as, 2); if (ret != 0) { fprintf(stderr, "test_rnn_layer_with_hidden_output failed a.dims=%d a=(%d %d %d) outch=%d, direction = %d \n", a.dims, a.w, a.h, a.c, outch, direction); diff --git a/tests/test_roialign.cpp b/tests/test_roialign.cpp index 709d9af76ee..1f7e99230a3 100644 --- a/tests/test_roialign.cpp +++ b/tests/test_roialign.cpp @@ -13,7 +13,6 @@ // specific language governing permissions and limitations under the License. #include "layer.h" -#include "layer/roialign.h" #include "testutil.h" static int test_roialign(int w, int h, int c, int pooled_width, int pooled_height, float spatial_scale, int sampling_ratio, bool aligned, int version) @@ -37,7 +36,7 @@ static int test_roialign(int w, int h, int c, int pooled_width, int pooled_heigh std::vector weights(0); - int ret = test_layer("ROIAlign", pd, weights, a); + int ret = test_layer("ROIAlign", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_roialign failed base_w=%d base_h=%d base_c=%d pooled_width=%d pooled_height=%d spatial_scale=%4f.3\n", w, h, c, pooled_width, pooled_height, spatial_scale); diff --git a/tests/test_roipooling.cpp b/tests/test_roipooling.cpp index 7eb838ca2c9..0abab3428ec 100644 --- a/tests/test_roipooling.cpp +++ b/tests/test_roipooling.cpp @@ -13,7 +13,6 @@ // specific language governing permissions and limitations under the License. 
#include "layer.h" -#include "layer/roipooling.h" #include "testutil.h" static int test_roipooling(int w, int h, int c, int pooled_width, int pooled_height, float spatial_scale) @@ -34,7 +33,7 @@ static int test_roipooling(int w, int h, int c, int pooled_width, int pooled_hei std::vector weights(0); - int ret = test_layer("ROIPooling", pd, weights, a); + int ret = test_layer("ROIPooling", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_roipooling failed base_w=%d base_h=%d base_c=%d pooled_width=%d pooled_height=%d spatial_scale=%4f.3\n", w, h, c, pooled_width, pooled_height, spatial_scale); diff --git a/tests/test_scale.cpp b/tests/test_scale.cpp index e4045a0e9e2..7b02370242d 100644 --- a/tests/test_scale.cpp +++ b/tests/test_scale.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/scale.h" #include "testutil.h" static int test_scale(const ncnn::Mat& a, int bias) @@ -31,7 +30,7 @@ static int test_scale(const ncnn::Mat& a, int bias) if (bias) weights[1] = RandomMat(scale_data_size); - int ret = test_layer("Scale", pd, weights, a); + int ret = test_layer("Scale", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_scale failed a.dims=%d a=(%d %d %d) bias=%d\n", a.dims, a.w, a.h, a.c, bias); @@ -56,7 +55,7 @@ static int test_scale_attention(const ncnn::Mat& a) ab[0] = a; ab[1] = RandomMat(scale_data_size); - int ret = test_layer("Scale", pd, weights, ab, 2); + int ret = test_layer("Scale", pd, weights, ab, 2); if (ret != 0) { fprintf(stderr, "test_scale_attention failed a.dims=%d a=(%d %d %d)\n", a.dims, a.w, a.h, a.c); diff --git a/tests/test_selu.cpp b/tests/test_selu.cpp index 3844c94ccf1..ac55c8b4299 100644 --- a/tests/test_selu.cpp +++ b/tests/test_selu.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/selu.h" #include "testutil.h" static int test_selu(const ncnn::Mat& a, float alpha, float lambda) @@ -23,7 +22,7 @@ static int test_selu(const ncnn::Mat& a, float alpha, float lambda) std::vector weights(0); - int ret = test_layer("SELU", pd, weights, a); + int ret = test_layer("SELU", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_selu failed a.dims=%d a=(%d %d %d %d) alpha=%f lambda=%f\n", a.dims, a.w, a.h, a.d, a.c, alpha, lambda); diff --git a/tests/test_shrink.cpp b/tests/test_shrink.cpp index 2eef8dd0976..f1e9040980b 100644 --- a/tests/test_shrink.cpp +++ b/tests/test_shrink.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/shrink.h" #include "testutil.h" static int test_shrink(const ncnn::Mat& a, float lambd, float bias) @@ -23,7 +22,7 @@ static int test_shrink(const ncnn::Mat& a, float lambd, float bias) std::vector weights(0); - int ret = test_layer("Shrink", pd, weights, a); + int ret = test_layer("Shrink", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_shrink failed a.dims=%d a=(%d %d %d %d)\n", a.dims, a.w, a.h, a.d, a.c); diff --git a/tests/test_shufflechannel.cpp b/tests/test_shufflechannel.cpp index ad21a184e89..ea528747d50 100644 --- a/tests/test_shufflechannel.cpp +++ b/tests/test_shufflechannel.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. 
See the License for the // specific language governing permissions and limitations under the License. -#include "layer/shufflechannel.h" #include "testutil.h" static int test_shufflechannel(int w, int h, int c, int group, int reverse) @@ -25,7 +24,7 @@ static int test_shufflechannel(int w, int h, int c, int group, int reverse) std::vector weights(0); - int ret = test_layer("ShuffleChannel", pd, weights, a); + int ret = test_layer("ShuffleChannel", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_shufflechannel failed w=%d h=%d c=%d group=%d reverse=%d\n", w, h, c, group, reverse); diff --git a/tests/test_sigmoid.cpp b/tests/test_sigmoid.cpp index ba03a8d5a3f..83e98e89dd7 100644 --- a/tests/test_sigmoid.cpp +++ b/tests/test_sigmoid.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/sigmoid.h" #include "testutil.h" static int test_sigmoid(const ncnn::Mat& a) @@ -21,7 +20,7 @@ static int test_sigmoid(const ncnn::Mat& a) std::vector weights(0); - int ret = test_layer("Sigmoid", pd, weights, a); + int ret = test_layer("Sigmoid", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_sigmoid failed a.dims=%d a=(%d %d %d %d)\n", a.dims, a.w, a.h, a.d, a.c); diff --git a/tests/test_slice.cpp b/tests/test_slice.cpp index 59cf10e8d68..dd7c8d0e23b 100644 --- a/tests/test_slice.cpp +++ b/tests/test_slice.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/slice.h" #include "testutil.h" static ncnn::Mat IntArrayMat(int a0) @@ -76,7 +75,7 @@ static int test_slice(const ncnn::Mat& a, const ncnn::Mat& slices, int axis) std::vector a0(1); a0[0] = a; - int ret = test_layer("Slice", pd, weights, a0, slices.w); + int ret = test_layer("Slice", pd, weights, a0, slices.w); if (ret != 0) { fprintf(stderr, "test_slice failed a.dims=%d a=(%d %d %d %d)", a.dims, a.w, a.h, a.d, a.c); @@ -99,7 +98,7 @@ static int test_slice_indices(const ncnn::Mat& a, const ncnn::Mat& indices, int std::vector a0(1); a0[0] = a; - int ret = test_layer("Slice", pd, weights, a0, indices.w); + int ret = test_layer("Slice", pd, weights, a0, indices.w); if (ret != 0) { fprintf(stderr, "test_slice_indices failed a.dims=%d a=(%d %d %d %d)", a.dims, a.w, a.h, a.d, a.c); diff --git a/tests/test_softmax.cpp b/tests/test_softmax.cpp index cbbfd932849..c26dfce7158 100644 --- a/tests/test_softmax.cpp +++ b/tests/test_softmax.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/softmax.h" #include "testutil.h" static int test_softmax(const ncnn::Mat& a, int axis) @@ -23,7 +22,7 @@ static int test_softmax(const ncnn::Mat& a, int axis) std::vector weights(0); - int ret = test_layer("Softmax", pd, weights, a); + int ret = test_layer("Softmax", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_softmax failed a.dims=%d a=(%d %d %d) axis=%d\n", a.dims, a.w, a.h, a.c, axis); diff --git a/tests/test_softplus.cpp b/tests/test_softplus.cpp index 2bd37567a0b..1aa6c3aff98 100644 --- a/tests/test_softplus.cpp +++ b/tests/test_softplus.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-#include "layer/softplus.h" #include "testutil.h" static int test_softplus(const ncnn::Mat& a) @@ -21,7 +20,7 @@ static int test_softplus(const ncnn::Mat& a) std::vector weights(0); - int ret = test_layer("Softplus", pd, weights, a); + int ret = test_layer("Softplus", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_softplus failed a.dims=%d a=(%d %d %d)\n", a.dims, a.w, a.h, a.c); diff --git a/tests/test_squeeze.cpp b/tests/test_squeeze.cpp index 403f95bdf9b..02f772c8581 100644 --- a/tests/test_squeeze.cpp +++ b/tests/test_squeeze.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/squeeze.h" #include "testutil.h" static int test_squeeze(const ncnn::Mat& a, int squeeze_w, int squeeze_h, int squeeze_d, int squeeze_c) @@ -25,7 +24,7 @@ static int test_squeeze(const ncnn::Mat& a, int squeeze_w, int squeeze_h, int sq std::vector weights(0); - int ret = test_layer("Squeeze", pd, weights, a); + int ret = test_layer("Squeeze", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_squeeze failed a.dims=%d a=(%d %d %d %d) squeeze_w=%d squeeze_h=%d squeeze_d=%d squeeze_c=%d\n", a.dims, a.w, a.h, a.d, a.c, squeeze_w, squeeze_h, squeeze_d, squeeze_c); @@ -91,7 +90,7 @@ static int test_squeeze_axes(const ncnn::Mat& a, const ncnn::Mat& axes) std::vector weights(0); - int ret = test_layer("Squeeze", pd, weights, a); + int ret = test_layer("Squeeze", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_squeeze_axes failed a.dims=%d a=(%d %d %d %d)\n", a.dims, a.w, a.h, a.d, a.c); diff --git a/tests/test_swish.cpp b/tests/test_swish.cpp index 3a1fbfa4d14..b67c5d4bd52 100644 --- a/tests/test_swish.cpp +++ b/tests/test_swish.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/swish.h" #include "testutil.h" static int test_swish(const ncnn::Mat& a) @@ -21,7 +20,7 @@ static int test_swish(const ncnn::Mat& a) std::vector weights(0); - int ret = test_layer("Swish", pd, weights, a); + int ret = test_layer("Swish", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_swish failed a.dims=%d a=(%d %d %d %d)\n", a.dims, a.w, a.h, a.d, a.c); diff --git a/tests/test_tanh.cpp b/tests/test_tanh.cpp index 0cbfe6bfcac..141a5cf37d7 100644 --- a/tests/test_tanh.cpp +++ b/tests/test_tanh.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/tanh.h" #include "testutil.h" static int test_tanh(const ncnn::Mat& a) @@ -21,7 +20,7 @@ static int test_tanh(const ncnn::Mat& a) std::vector weights(0); - int ret = test_layer("TanH", pd, weights, a); + int ret = test_layer("TanH", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_tanh failed a.dims=%d a=(%d %d %d %d)\n", a.dims, a.w, a.h, a.d, a.c); diff --git a/tests/test_tile.cpp b/tests/test_tile.cpp index 2b3595071f3..ffc238eb10c 100644 --- a/tests/test_tile.cpp +++ b/tests/test_tile.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-#include "layer/tile.h" #include "testutil.h" static int test_tile(const ncnn::Mat& a, int axis, int tiles) @@ -23,7 +22,7 @@ static int test_tile(const ncnn::Mat& a, int axis, int tiles) std::vector weights(0); - int ret = test_layer("Tile", pd, weights, a); + int ret = test_layer("Tile", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_tile failed a.dims=%d a=(%d %d %d %d) axis=%d tiles=%d\n", a.dims, a.w, a.h, a.d, a.c, axis, tiles); @@ -89,7 +88,7 @@ static int test_tile(const ncnn::Mat& a, const ncnn::Mat& repeats) std::vector weights(0); - int ret = test_layer("Tile", pd, weights, a); + int ret = test_layer("Tile", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_tile failed a.dims=%d a=(%d %d %d %d) repeats=", a.dims, a.w, a.h, a.d, a.c); diff --git a/tests/test_unaryop.cpp b/tests/test_unaryop.cpp index 44274fd071f..3ff6ad8de57 100644 --- a/tests/test_unaryop.cpp +++ b/tests/test_unaryop.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/unaryop.h" #include "testutil.h" #define OP_TYPE_MAX 20 @@ -46,7 +45,7 @@ static int test_unaryop(const ncnn::Mat& _a) std::vector weights(0); - int ret = test_layer("UnaryOp", pd, weights, a); + int ret = test_layer("UnaryOp", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_unaryop failed a.dims=%d a=(%d %d %d %d) op_type=%d\n", a.dims, a.w, a.h, a.d, a.c, op_type); diff --git a/tests/test_unfold.cpp b/tests/test_unfold.cpp index 4eea1d020ea..12b4066d341 100644 --- a/tests/test_unfold.cpp +++ b/tests/test_unfold.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "layer/unfold.h" #include "testutil.h" static int test_unfold(int w, int h, int c, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_w, int pad_h, float pad_value) @@ -32,7 +31,7 @@ static int test_unfold(int w, int h, int c, int kernel_w, int kernel_h, int dila std::vector weights(0); - int ret = test_layer("Unfold", pd, weights, a); + int ret = test_layer("Unfold", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_unfold failed w=%d h=%d c=%d kernel=%d,%d dilation=%d,%d stride=%d,%d pad=%d,%d pad_value=%f\n", w, h, c, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_w, pad_h, pad_value); diff --git a/tests/test_yolov3detectionoutput.cpp b/tests/test_yolov3detectionoutput.cpp index 1ad931a4719..c4b1d32b10a 100644 --- a/tests/test_yolov3detectionoutput.cpp +++ b/tests/test_yolov3detectionoutput.cpp @@ -12,7 +12,6 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-#include "layer/yolov3detectionoutput.h" #include "testutil.h" static int test_yolov3detectionoutput(const std::vector& a, int num_class, @@ -30,7 +29,7 @@ static int test_yolov3detectionoutput(const std::vector& a, int num_c std::vector weights(0); - int ret = test_layer("Yolov3DetectionOutput", pd, weights, a); + int ret = test_layer("Yolov3DetectionOutput", pd, weights, a); if (ret != 0) { fprintf(stderr, "test_yolov3detectionoutput failed a.dims=%d a=(%d %d %d) ", a[0].dims, a[0].w, a[0].h, a[0].c); diff --git a/tests/testutil.h b/tests/testutil.h index 1215b59142c..2b740b9c48c 100644 --- a/tests/testutil.h +++ b/tests/testutil.h @@ -322,14 +322,13 @@ static int CompareMat(const std::vector& a, const std::vector -int test_layer_naive(int typeindex, const ncnn::ParamDict& pd, const std::vector& weights, const std::vector& a, int top_blob_count, std::vector& b, void (*func)(T*), int flag) +int test_layer_naive(int typeindex, const ncnn::ParamDict& pd, const std::vector& weights, const std::vector& a, int top_blob_count, std::vector& b, void (*func)(ncnn::Layer*), int flag) { ncnn::Layer* op = ncnn::create_layer_naive(typeindex); if (func) { - (*func)((T*)op); + (*func)((ncnn::Layer*)op); } op->load_param(pd); @@ -368,11 +367,11 @@ int test_layer_naive(int typeindex, const ncnn::ParamDict& pd, const std::vector b[i] = a[i].clone(); } - ((T*)op)->T::forward_inplace(b, opt); + op->forward_inplace(b, opt); } else { - ((T*)op)->T::forward(a, b, opt); + op->forward(a, b, opt); } op->destroy_pipeline(opt); @@ -382,8 +381,7 @@ int test_layer_naive(int typeindex, const ncnn::ParamDict& pd, const std::vector return 0; } -template -int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Option& _opt, const std::vector& a, int top_blob_count, std::vector& c, const std::vector& top_shapes, void (*func)(T*), int flag) +int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Option& _opt, const std::vector& a, int top_blob_count, std::vector& c, const std::vector& top_shapes, void (*func)(ncnn::Layer*), int flag) { ncnn::Layer* op = ncnn::create_layer_cpu(typeindex); @@ -400,7 +398,7 @@ int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector -int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Option& _opt, const std::vector& a, int top_blob_count, std::vector& d, const std::vector& top_shapes, void (*func)(T*), int flag) +int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Option& _opt, const std::vector& a, int top_blob_count, std::vector& d, const std::vector& top_shapes, void (*func)(ncnn::Layer*), int flag) { if (!_opt.use_packing_layout) { @@ -640,7 +637,7 @@ int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vector -int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Option& _opt, const std::vector& a, int top_blob_count, const std::vector& top_shapes = std::vector(), float epsilon = 0.001, void (*func)(T*) = 0, int flag = 0) +int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Option& _opt, const std::vector& a, int top_blob_count, const std::vector& top_shapes = std::vector(), float epsilon = 0.001, void (*func)(ncnn::Layer*) = 0, int flag = 0) { // naive std::vector b; @@ -852,14 +848,13 @@ int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vector -int 
test_layer_naive(int typeindex, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Mat& a, ncnn::Mat& b, void (*func)(T*), int flag) +int test_layer_naive(int typeindex, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Mat& a, ncnn::Mat& b, void (*func)(ncnn::Layer*), int flag) { ncnn::Layer* op = ncnn::create_layer_naive(typeindex); if (func) { - (*func)((T*)op); + (*func)((ncnn::Layer*)op); } op->load_param(pd); @@ -885,11 +880,11 @@ int test_layer_naive(int typeindex, const ncnn::ParamDict& pd, const std::vector if (op->support_inplace) { b = a.clone(); - ((T*)op)->T::forward_inplace(b, opt); + op->forward_inplace(b, opt); } else { - ((T*)op)->T::forward(a, b, opt); + op->forward(a, b, opt); } op->destroy_pipeline(opt); @@ -899,8 +894,7 @@ int test_layer_naive(int typeindex, const ncnn::ParamDict& pd, const std::vector return 0; } -template -int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Option& _opt, const ncnn::Mat& a, ncnn::Mat& c, const ncnn::Mat& top_shape, void (*func)(T*), int flag) +int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Option& _opt, const ncnn::Mat& a, ncnn::Mat& c, const ncnn::Mat& top_shape, void (*func)(ncnn::Layer*), int flag) { ncnn::Layer* op = ncnn::create_layer_cpu(typeindex); @@ -917,7 +911,7 @@ int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector -int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Option& _opt, const ncnn::Mat& a, ncnn::Mat& d, const ncnn::Mat& top_shape, void (*func)(T*), int flag) +int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Option& _opt, const ncnn::Mat& a, ncnn::Mat& d, const ncnn::Mat& top_shape, void (*func)(ncnn::Layer*), int flag) { if (!_opt.use_packing_layout) { @@ -1140,7 +1133,7 @@ int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vector -int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Option& _opt, const ncnn::Mat& a, const ncnn::Mat& top_shape = ncnn::Mat(), float epsilon = 0.001, void (*func)(T*) = 0, int flag = 0) +int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Option& _opt, const ncnn::Mat& a, const ncnn::Mat& top_shape = ncnn::Mat(), float epsilon = 0.001, void (*func)(ncnn::Layer*) = 0, int flag = 0) { // naive ncnn::Mat b; @@ -1333,8 +1325,7 @@ int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vector -int test_layer_opt(const char* layer_type, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Option& opt, const std::vector& a, int top_blob_count = 1, float epsilon = 0.001, void (*func)(T*) = 0, int flag = 0) +int test_layer_opt(const char* layer_type, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Option& opt, const std::vector& a, int top_blob_count = 1, float epsilon = 0.001, void (*func)(ncnn::Layer*) = 0, int flag = 0) { // fp16 representation std::vector a_fp16; @@ -1399,7 +1390,7 @@ int test_layer_opt(const char* layer_type, const ncnn::ParamDict& pd, const std: } std::vector top_shapes; - int ret = test_layer(ncnn::layer_to_index(layer_type), pd, weights_fp16, opt, a_fp16, top_blob_count, top_shapes, epsilon_fp16, func, flag); + int ret = test_layer(ncnn::layer_to_index(layer_type), pd, weights_fp16, opt, a_fp16, top_blob_count, top_shapes, epsilon_fp16, func, flag); if 
(ret != 0) { fprintf(stderr, "test_layer %s failed use_packing_layout=%d use_fp16_packed=%d use_fp16_storage=%d use_fp16_arithmetic=%d use_shader_pack8=%d use_bf16_storage=%d use_image_storage=%d use_sgemm_convolution=%d use_winograd_convolution=%d\n", layer_type, opt.use_packing_layout, opt.use_fp16_packed, opt.use_fp16_storage, opt.use_fp16_arithmetic, opt.use_shader_pack8, opt.use_bf16_storage, opt.use_image_storage, opt.use_sgemm_convolution, opt.use_winograd_convolution); @@ -1409,8 +1400,7 @@ int test_layer_opt(const char* layer_type, const ncnn::ParamDict& pd, const std: return 0; } -template -int test_layer_opt(const char* layer_type, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Option& opt, const ncnn::Mat& a, float epsilon = 0.001, void (*func)(T*) = 0, int flag = 0) +int test_layer_opt(const char* layer_type, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Option& opt, const ncnn::Mat& a, float epsilon = 0.001, void (*func)(ncnn::Layer*) = 0, int flag = 0) { // fp16 representation ncnn::Mat a_fp16; @@ -1467,7 +1457,7 @@ int test_layer_opt(const char* layer_type, const ncnn::ParamDict& pd, const std: } ncnn::Mat top_shape; - int ret = test_layer(ncnn::layer_to_index(layer_type), pd, weights_fp16, opt, a_fp16, top_shape, epsilon_fp16, func, flag); + int ret = test_layer(ncnn::layer_to_index(layer_type), pd, weights_fp16, opt, a_fp16, top_shape, epsilon_fp16, func, flag); if (ret != 0) { fprintf(stderr, "test_layer %s failed use_packing_layout=%d use_fp16_packed=%d use_fp16_storage=%d use_fp16_arithmetic=%d use_shader_pack8=%d use_bf16_storage=%d use_image_storage=%d use_sgemm_convolution=%d use_winograd_convolution=%d\n", layer_type, opt.use_packing_layout, opt.use_fp16_packed, opt.use_fp16_storage, opt.use_fp16_arithmetic, opt.use_shader_pack8, opt.use_bf16_storage, opt.use_image_storage, opt.use_sgemm_convolution, opt.use_winograd_convolution); @@ -1477,8 +1467,7 @@ int test_layer_opt(const char* layer_type, const ncnn::ParamDict& pd, const std: return 0; } -template -int test_layer(const char* layer_type, const ncnn::ParamDict& pd, const std::vector& weights, const std::vector& a, int top_blob_count = 1, float epsilon = 0.001, void (*func)(T*) = 0, int flag = 0) +int test_layer(const char* layer_type, const ncnn::ParamDict& pd, const std::vector& weights, const std::vector& a, int top_blob_count = 1, float epsilon = 0.001, void (*func)(ncnn::Layer*) = 0, int flag = 0) { // pack fp16p fp16s fp16a bf16s shader8 image const int options[][7] = { @@ -1506,7 +1495,7 @@ int test_layer(const char* layer_type, const ncnn::ParamDict& pd, const std::vec opt.use_shader_pack8 = options[i][5]; opt.use_image_storage = options[i][6]; - int ret = test_layer_opt(layer_type, pd, weights, opt, a, top_blob_count, epsilon, func, flag); + int ret = test_layer_opt(layer_type, pd, weights, opt, a, top_blob_count, epsilon, func, flag); if (ret != 0) return ret; } @@ -1514,8 +1503,7 @@ int test_layer(const char* layer_type, const ncnn::ParamDict& pd, const std::vec return 0; } -template -int test_layer(const char* layer_type, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Mat& a, float epsilon = 0.001, void (*func)(T*) = 0, int flag = 0) +int test_layer(const char* layer_type, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Mat& a, float epsilon = 0.001, void (*func)(ncnn::Layer*) = 0, int flag = 0) { // pack fp16p fp16s fp16a bf16s shader8 image const int options[][7] = { @@ -1543,7 +1531,7 @@ int test_layer(const char* 
layer_type, const ncnn::ParamDict& pd, const std::vec opt.use_shader_pack8 = options[i][5]; opt.use_image_storage = options[i][6]; - int ret = test_layer_opt(layer_type, pd, weights, opt, a, epsilon, func, flag); + int ret = test_layer_opt(layer_type, pd, weights, opt, a, epsilon, func, flag); if (ret != 0) return ret; } From 5432872a9681fd391ac983b50b55b5a92448f396 Mon Sep 17 00:00:00 2001 From: nihuini Date: Wed, 3 Jan 2024 15:40:40 +0800 Subject: [PATCH 04/19] allow build test with shared library --- CMakeLists.txt | 5 - tests/CMakeLists.txt | 7 +- tests/testutil.cpp | 1528 ++++++++++++++++++++++++++++++++++++++++++ tests/testutil.h | 1502 +---------------------------------------- 4 files changed, 1568 insertions(+), 1474 deletions(-) create mode 100644 tests/testutil.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 4ffd677bb33..785e2cd3926 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -97,11 +97,6 @@ else() endif() if(NCNN_SHARED_LIB) - if(NCNN_BUILD_TESTS) - message(WARNING "NCNN_SHARED_LIB must be OFF to build tests! NCNN_BUILD_TESTS will be turned off.") - set(NCNN_BUILD_TESTS OFF) - endif() - if(NCNN_ENABLE_LTO) # enable global link time optimization cmake_policy(SET CMP0069 NEW) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 2dff6c38692..4ce231fbd2a 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -4,9 +4,12 @@ if(MSVC) add_definitions(/wd4996) endif() +add_library(ncnntestutil STATIC testutil.cpp) +target_link_libraries(ncnntestutil PUBLIC ncnn) + macro(ncnn_add_test name) add_executable(test_${name} test_${name}.cpp) - target_link_libraries(test_${name} PRIVATE ncnn) + target_link_libraries(test_${name} PRIVATE ncnntestutil ncnn) add_test(NAME test_${name} COMMAND ${CMAKE_COMMAND} -DTEST_EXECUTABLE=$ -P ${CMAKE_CURRENT_SOURCE_DIR}/../cmake/run_test.cmake) @@ -24,7 +27,7 @@ macro(ncnn_add_layer_test class) foreach(test_file ${test_${name}_SRCS}) get_filename_component(test_filename ${test_file} NAME_WE) add_executable(${test_filename} ${test_file}) - target_link_libraries(${test_filename} PRIVATE ncnn) + target_link_libraries(${test_filename} PRIVATE ncnntestutil ncnn) add_test(NAME ${test_filename} COMMAND ${CMAKE_COMMAND} -DTEST_EXECUTABLE=$ -P ${CMAKE_CURRENT_SOURCE_DIR}/../cmake/run_test.cmake) diff --git a/tests/testutil.cpp b/tests/testutil.cpp new file mode 100644 index 00000000000..eb4d4fb1ca9 --- /dev/null +++ b/tests/testutil.cpp @@ -0,0 +1,1528 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "testutil.h" + +#include "cpu.h" +#include "layer.h" +#include "mat.h" + +#include +#include + +#if NCNN_VULKAN +#include "command.h" +#include "gpu.h" +#endif // NCNN_VULKAN + +float RandomFloat(float a, float b) +{ + float random = ((float)RAND()) / (float)uint64_t(-1); //RAND_MAX; + float diff = b - a; + float r = random * diff; + float v = a + r; + // generate denormal as zero + if (v < 0.0001 && v > -0.0001) + v = 0.f; + return v; +} + +int RandomInt(int a, int b) +{ + float random = ((float)RAND()) / (float)uint64_t(-1); //RAND_MAX; + int diff = b - a; + float r = random * diff; + return a + (int)r; +} + +signed char RandomS8() +{ + return (signed char)RandomInt(-127, 127); +} + +void Randomize(ncnn::Mat& m, float a, float b) +{ + for (size_t i = 0; i < m.total(); i++) + { + m[i] = RandomFloat(a, b); + } +} + +void RandomizeInt(ncnn::Mat& m, int a, int b) +{ + for (size_t i = 0; i < m.total(); i++) + { + ((int*)m)[i] = RandomInt(a, b); + } +} + +void RandomizeS8(ncnn::Mat& m) +{ + for (size_t i = 0; i < m.total(); i++) + { + ((signed char*)m)[i] = RandomS8(); + } +} + +ncnn::Mat RandomMat(int w, float a, float b) +{ + ncnn::Mat m(w); + Randomize(m, a, b); + return m; +} + +ncnn::Mat RandomMat(int w, int h, float a, float b) +{ + ncnn::Mat m(w, h); + Randomize(m, a, b); + return m; +} + +ncnn::Mat RandomMat(int w, int h, int c, float a, float b) +{ + ncnn::Mat m(w, h, c); + Randomize(m, a, b); + return m; +} + +ncnn::Mat RandomMat(int w, int h, int d, int c, float a, float b) +{ + ncnn::Mat m(w, h, d, c); + Randomize(m, a, b); + return m; +} + +ncnn::Mat RandomIntMat(int w) +{ + ncnn::Mat m(w); + RandomizeInt(m); + return m; +} + +ncnn::Mat RandomIntMat(int w, int h) +{ + ncnn::Mat m(w, h); + RandomizeInt(m); + return m; +} + +ncnn::Mat RandomIntMat(int w, int h, int c) +{ + ncnn::Mat m(w, h, c); + RandomizeInt(m); + return m; +} + +ncnn::Mat RandomIntMat(int w, int h, int d, int c) +{ + ncnn::Mat m(w, h, d, c); + RandomizeInt(m); + return m; +} + +ncnn::Mat RandomS8Mat(int w) +{ + ncnn::Mat m(w, (size_t)1u); + RandomizeS8(m); + return m; +} + +ncnn::Mat RandomS8Mat(int w, int h) +{ + ncnn::Mat m(w, h, (size_t)1u); + RandomizeS8(m); + return m; +} + +ncnn::Mat RandomS8Mat(int w, int h, int c) +{ + ncnn::Mat m(w, h, c, (size_t)1u); + RandomizeS8(m); + return m; +} + +ncnn::Mat RandomS8Mat(int w, int h, int d, int c) +{ + ncnn::Mat m(w, h, d, c, (size_t)1u); + RandomizeS8(m); + return m; +} + +ncnn::Mat scales_mat(const ncnn::Mat& mat, int m, int k, int ldx) +{ + ncnn::Mat weight_scales(m); + for (int i = 0; i < m; ++i) + { + float min = mat[0], _max = mat[0]; + const float* ptr = (const float*)(mat.data) + i * ldx; + for (int j = 0; j < k; ++j) + { + if (min > ptr[j]) + { + min = ptr[j]; + } + if (_max < ptr[j]) + { + _max = ptr[j]; + } + } + const float abs_min = abs(min), abs_max = abs(_max); + weight_scales[i] = 127.f / (abs_min > abs_max ? 
+ncnn::Mat scales_mat(const ncnn::Mat& mat, int m, int k, int ldx)
+{
+    ncnn::Mat weight_scales(m);
+    for (int i = 0; i < m; ++i)
+    {
+        float min = mat[0], _max = mat[0];
+        const float* ptr = (const float*)(mat.data) + i * ldx;
+        for (int j = 0; j < k; ++j)
+        {
+            if (min > ptr[j])
+            {
+                min = ptr[j];
+            }
+            if (_max < ptr[j])
+            {
+                _max = ptr[j];
+            }
+        }
+        const float abs_min = abs(min), abs_max = abs(_max);
+        weight_scales[i] = 127.f / (abs_min > abs_max ? abs_min : abs_max);
+    }
+    return weight_scales;
+}
+
+bool NearlyEqual(float a, float b, float epsilon)
+{
+    if (a == b)
+        return true;
+
+    float diff = (float)fabs(a - b);
+    if (diff <= epsilon)
+        return true;
+
+    // relative error
+    return diff < epsilon * std::max(fabs(a), fabs(b));
+}
+
+int Compare(const ncnn::Mat& a, const ncnn::Mat& b, float epsilon)
+{
+#define CHECK_MEMBER(m)                                                        \
+    if (a.m != b.m)                                                            \
+    {                                                                          \
+        fprintf(stderr, #m " not match expect %d but got %d\n", (int)a.m, (int)b.m); \
+        return -1;                                                             \
+    }
+
+    CHECK_MEMBER(dims)
+    CHECK_MEMBER(w)
+    CHECK_MEMBER(h)
+    CHECK_MEMBER(d)
+    CHECK_MEMBER(c)
+    CHECK_MEMBER(elemsize)
+    CHECK_MEMBER(elempack)
+
+#undef CHECK_MEMBER
+
+    for (int q = 0; q < a.c; q++)
+    {
+        const ncnn::Mat ma = a.channel(q);
+        const ncnn::Mat mb = b.channel(q);
+        for (int z = 0; z < a.d; z++)
+        {
+            const ncnn::Mat da = ma.depth(z);
+            const ncnn::Mat db = mb.depth(z);
+            for (int i = 0; i < a.h; i++)
+            {
+                const float* pa = da.row(i);
+                const float* pb = db.row(i);
+                for (int j = 0; j < a.w; j++)
+                {
+                    if (!NearlyEqual(pa[j], pb[j], epsilon))
+                    {
+                        fprintf(stderr, "value not match at c:%d d:%d h:%d w:%d expect %f but got %f\n", q, z, i, j, pa[j], pb[j]);
+                        return -1;
+                    }
+                }
+            }
+        }
+    }
+
+    return 0;
+}
+
+int CompareMat(const ncnn::Mat& a, const ncnn::Mat& b, float epsilon)
+{
+    ncnn::Option opt;
+    opt.num_threads = 1;
+
+    if (a.elempack != 1)
+    {
+        ncnn::Mat a1;
+        ncnn::convert_packing(a, a1, 1, opt);
+        return CompareMat(a1, b, epsilon);
+    }
+
+    if (b.elempack != 1)
+    {
+        ncnn::Mat b1;
+        ncnn::convert_packing(b, b1, 1, opt);
+        return CompareMat(a, b1, epsilon);
+    }
+
+    if (a.elemsize == 2u)
+    {
+        ncnn::Mat a32;
+        cast_float16_to_float32(a, a32, opt);
+        return CompareMat(a32, b, epsilon);
+    }
+    if (a.elemsize == 1u)
+    {
+        ncnn::Mat a32;
+        cast_int8_to_float32(a, a32, opt);
+        return CompareMat(a32, b, epsilon);
+    }
+
+    if (b.elemsize == 2u)
+    {
+        ncnn::Mat b32;
+        cast_float16_to_float32(b, b32, opt);
+        return CompareMat(a, b32, epsilon);
+    }
+    if (b.elemsize == 1u)
+    {
+        ncnn::Mat b32;
+        cast_int8_to_float32(b, b32, opt);
+        return CompareMat(a, b32, epsilon);
+    }
+
+    return Compare(a, b, epsilon);
+}
+
+int CompareMat(const std::vector<ncnn::Mat>& a, const std::vector<ncnn::Mat>& b, float epsilon)
+{
+    if (a.size() != b.size())
+    {
+        fprintf(stderr, "output blob count not match %zu %zu\n", a.size(), b.size());
+        return -1;
+    }
+
+    for (size_t i = 0; i < a.size(); i++)
+    {
+        if (CompareMat(a[i], b[i], epsilon))
+        {
+            fprintf(stderr, "output blob %zu not match\n", i);
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+int test_layer_naive(int typeindex, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const std::vector<ncnn::Mat>& a, int top_blob_count, std::vector<ncnn::Mat>& b, void (*func)(ncnn::Layer*), int flag)
+{
+    ncnn::Layer* op = ncnn::create_layer_naive(typeindex);
+
+    if (func)
+    {
+        (*func)((ncnn::Layer*)op);
+    }
+
+    op->load_param(pd);
+
+    if (op->one_blob_only && a.size() != 1)
+    {
+        fprintf(stderr, "layer with one_blob_only but consume multiple inputs\n");
+        delete op;
+        return -1;
+    }
+
+    ncnn::ModelBinFromMatArray mb(weights.data());
+
+    op->load_model(mb);
+
+    ncnn::Option opt;
+    opt.num_threads = 1;
+    opt.lightmode = false;
+    opt.use_packing_layout = false;
+    opt.use_fp16_packed = false;
+    opt.use_fp16_storage = false;
+    opt.use_fp16_arithmetic = false;
+    opt.use_shader_pack8 = false;
+    opt.use_image_storage = false;
+    opt.use_bf16_storage = false;
+    opt.use_vulkan_compute = false;
+
+    op->create_pipeline(opt);
+
+    b.resize(top_blob_count);
+
+    if (op->support_inplace)
+    {
+        for (size_t i = 0; i <
a.size(); i++) + { + b[i] = a[i].clone(); + } + + op->forward_inplace(b, opt); + } + else + { + op->forward(a, b, opt); + } + + op->destroy_pipeline(opt); + + delete op; + + return 0; +} + +int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Option& _opt, const std::vector& a, int top_blob_count, std::vector& c, const std::vector& top_shapes, void (*func)(ncnn::Layer*), int flag) +{ + ncnn::Layer* op = ncnn::create_layer_cpu(typeindex); + + if (!op->support_packing && _opt.use_packing_layout) + { + delete op; + return 233; + } + if (!op->support_bf16_storage && !op->support_fp16_storage && (_opt.use_bf16_storage || _opt.use_fp16_arithmetic)) + { + delete op; + return 233; + } + + if (func) + { + (*func)((ncnn::Layer*)op); + } + + if (!top_shapes.empty()) + { + op->bottom_shapes = a; + op->top_shapes = top_shapes; + } + + op->load_param(pd); + + if (op->one_blob_only && a.size() != 1) + { + fprintf(stderr, "layer with one_blob_only but consume multiple inputs\n"); + delete op; + return -1; + } + + ncnn::ModelBinFromMatArray mb(weights.data()); + + op->load_model(mb); + + ncnn::Option opt = _opt; + opt.num_threads = 1; + opt.use_vulkan_compute = false; + + op->create_pipeline(opt); + + if (!op->support_packing && _opt.use_packing_layout) + { + op->destroy_pipeline(opt); + delete op; + return 233; + } + if (!op->support_bf16_storage && !op->support_fp16_storage && (_opt.use_bf16_storage || _opt.use_fp16_arithmetic)) + { + op->destroy_pipeline(opt); + delete op; + return 233; + } + + std::vector a4(a.size()); + + for (size_t i = 0; i < a4.size(); i++) + { + // clang-format off + // *INDENT-OFF* +#if NCNN_ARM82 + if (opt.use_fp16_storage && ncnn::cpu_support_arm_asimdhp() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING)) + { + ncnn::cast_float32_to_float16(a[i], a4[i], opt); + } + else +#endif // NCNN_ARM82 +#if NCNN_RVV + if (opt.use_fp16_storage && ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING)) + { + ncnn::cast_float32_to_float16(a[i], a4[i], opt); + } + else +#endif // NCNN_RVV +#if NCNN_BF16 + if (opt.use_bf16_storage && op->support_bf16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING)) + { + ncnn::cast_float32_to_bfloat16(a[i], a4[i], opt); + } + else +#endif // NCNN_BF16 + if (opt.use_fp16_storage && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING)) + { + ncnn::cast_float32_to_float16(a[i], a4[i], opt); + } + else + { + a4[i] = a[i]; + } + // *INDENT-ON* + // clang-format on + + if (opt.use_packing_layout && op->support_packing && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_PACKING)) + { + // resolve dst_elempack + int dims = a4[i].dims; + int elemcount = 0; + if (dims == 1) elemcount = a4[i].elempack * a4[i].w; + if (dims == 2) elemcount = a4[i].elempack * a4[i].h; + if (dims == 3 || dims == 4) elemcount = a4[i].elempack * a4[i].c; + + int elembits = a4[i].elembits(); + + int dst_elempack = 1; + + if (elembits == 32) + { +#if NCNN_AVX512 + if (elemcount % 16 == 0 && ncnn::cpu_support_x86_avx512()) + dst_elempack = 16; + else if (elemcount % 8 == 0 && ncnn::cpu_support_x86_avx()) + dst_elempack = 8; + else if (elemcount % 4 == 0) + dst_elempack = 4; +#elif NCNN_AVX + if (elemcount % 8 == 0 && ncnn::cpu_support_x86_avx()) + dst_elempack = 8; + else if (elemcount % 4 == 0) + dst_elempack = 4; +#elif NCNN_RVV + const int packn = ncnn::cpu_riscv_vlenb() / (elembits / 8); + if (elemcount % 
packn == 0) + dst_elempack = packn; +#else + if (elemcount % 4 == 0) + dst_elempack = 4; +#endif + } + if (elembits == 16) + { +#if NCNN_ARM82 + if (elemcount % 8 == 0 && ncnn::cpu_support_arm_asimdhp() && opt.use_fp16_arithmetic) + dst_elempack = 8; + else if (elemcount % 4 == 0) + dst_elempack = 4; +#elif NCNN_RVV + const int packn = ncnn::cpu_riscv_vlenb() / 2; + if (elemcount % packn == 0) + dst_elempack = packn; +#else + if (elemcount % 4 == 0) + dst_elempack = 4; +#endif + } + if (elembits == 8) + { +#if NCNN_RVV + const int packn = ncnn::cpu_riscv_vlenb() / 1; + if (elemcount % packn == 0) + dst_elempack = packn; +#else + if (elemcount % 8 == 0) + dst_elempack = 8; +#endif + } + + if (flag & TEST_LAYER_ENABLE_FORCE_INPUT_PACK8) + dst_elempack = 8; + + ncnn::Mat a4_packed; + ncnn::convert_packing(a4[i], a4_packed, dst_elempack, opt); + a4[i] = a4_packed; + } + } + + c.resize(top_blob_count); + + if (op->support_inplace) + { + for (size_t i = 0; i < a4.size(); i++) + { + c[i] = a4[i].clone(); + } + + op->forward_inplace(c, opt); + } + else + { + op->forward(a4, c, opt); + } + + for (size_t i = 0; i < c.size(); i++) + { + // clang-format off + // *INDENT-OFF* +#if NCNN_ARM82 + if (opt.use_fp16_storage && ncnn::cpu_support_arm_asimdhp() && op->support_fp16_storage && c[i].elembits() == 16) + { + ncnn::Mat c_fp32; + ncnn::cast_float16_to_float32(c[i], c_fp32, opt); + c[i] = c_fp32; + } + else +#endif // NCNN_ARM82 +#if NCNN_RVV + if (opt.use_fp16_storage && ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh() && op->support_fp16_storage && c[i].elembits() == 16) + { + ncnn::Mat c_fp32; + ncnn::cast_float16_to_float32(c[i], c_fp32, opt); + c[i] = c_fp32; + } + else +#endif // NCNN_RVV +#if NCNN_BF16 + if (opt.use_bf16_storage && op->support_bf16_storage && c[i].elembits() == 16) + { + ncnn::Mat c_fp32; + ncnn::cast_bfloat16_to_float32(c[i], c_fp32, opt); + c[i] = c_fp32; + } + else +#endif // NCNN_BF16 + if (opt.use_fp16_storage && op->support_fp16_storage && c[i].elembits() == 16) + { + ncnn::Mat c_fp32; + ncnn::cast_float16_to_float32(c[i], c_fp32, opt); + c[i] = c_fp32; + } + // *INDENT-ON* + // clang-format on + } + + op->destroy_pipeline(opt); + + delete op; + + return 0; +} + +#if NCNN_VULKAN +int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Option& _opt, const std::vector& a, int top_blob_count, std::vector& d, const std::vector& top_shapes, void (*func)(ncnn::Layer*), int flag) +{ + if (!_opt.use_packing_layout) + { + // pack1 test is useless for gpu + return 233; + } + + ncnn::Layer* op = ncnn::create_layer_vulkan(typeindex); + + if (!op->support_vulkan) + { + delete op; + return 233; + } + + ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(); + + op->vkdev = vkdev; + + if (func) + { + (*func)((ncnn::Layer*)op); + } + + if (!top_shapes.empty()) + { + op->bottom_shapes = a; + op->top_shapes = top_shapes; + } + + op->load_param(pd); + + if (op->one_blob_only && a.size() != 1) + { + fprintf(stderr, "layer with one_blob_only but consume multiple inputs\n"); + delete op; + return -1; + } + + ncnn::ModelBinFromMatArray mb(weights.data()); + + op->load_model(mb); + + ncnn::VkWeightAllocator g_weight_vkallocator(vkdev); + ncnn::VkWeightStagingAllocator g_weight_staging_vkallocator(vkdev); + + ncnn::VkAllocator* blob_vkallocator = vkdev->acquire_blob_allocator(); + ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator(); + + ncnn::Option opt = _opt; + opt.num_threads = 1; + opt.use_vulkan_compute = true; + 
+#if __APPLE__ + opt.use_image_storage = false; +#endif + + opt.blob_vkallocator = blob_vkallocator; + opt.workspace_vkallocator = blob_vkallocator; + opt.staging_vkallocator = staging_vkallocator; + + if (!vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false; + if (!vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false; + if (!vkdev->info.support_fp16_arithmetic()) opt.use_fp16_arithmetic = false; + if (!vkdev->info.support_cooperative_matrix()) opt.use_cooperative_matrix = false; + + // FIXME fp16a may produce large error + opt.use_fp16_arithmetic = false; + + op->create_pipeline(opt); + + if (!op->support_vulkan) + { + op->destroy_pipeline(opt); + delete op; + return 233; + } + + { + ncnn::VkTransfer cmd(vkdev); + + ncnn::Option opt_upload = opt; + opt_upload.blob_vkallocator = &g_weight_vkallocator; + opt_upload.workspace_vkallocator = &g_weight_vkallocator; + opt_upload.staging_vkallocator = &g_weight_staging_vkallocator; + + op->upload_model(cmd, opt_upload); + + cmd.submit_and_wait(); + } + + d.resize(top_blob_count); + + { + // forward + ncnn::VkCompute cmd(vkdev); + + if (op->support_image_storage && opt.use_image_storage) + { + // upload + std::vector a_gpu(a.size()); + for (size_t i = 0; i < a_gpu.size(); i++) + { + cmd.record_upload(a[i], a_gpu[i], opt); + } + + std::vector d_gpu(top_blob_count); + if (op->support_inplace) + { + op->forward_inplace(a_gpu, cmd, opt); + + d_gpu = a_gpu; + } + else + { + op->forward(a_gpu, d_gpu, cmd, opt); + } + + // download + for (size_t i = 0; i < d_gpu.size(); i++) + { + cmd.record_download(d_gpu[i], d[i], opt); + } + } + else + { + // upload + std::vector a_gpu(a.size()); + for (size_t i = 0; i < a_gpu.size(); i++) + { + cmd.record_upload(a[i], a_gpu[i], opt); + } + + std::vector d_gpu(top_blob_count); + if (op->support_inplace) + { + op->forward_inplace(a_gpu, cmd, opt); + + d_gpu = a_gpu; + } + else + { + op->forward(a_gpu, d_gpu, cmd, opt); + } + + // download + for (size_t i = 0; i < d_gpu.size(); i++) + { + cmd.record_download(d_gpu[i], d[i], opt); + } + } + + cmd.submit_and_wait(); + } + + op->destroy_pipeline(opt); + + delete op; + + vkdev->reclaim_blob_allocator(blob_vkallocator); + vkdev->reclaim_staging_allocator(staging_vkallocator); + g_weight_vkallocator.clear(); + g_weight_staging_vkallocator.clear(); + + return 0; +} +#endif // NCNN_VULKAN + +int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Option& _opt, const std::vector& a, int top_blob_count, const std::vector& top_shapes, float epsilon, void (*func)(ncnn::Layer*), int flag) +{ + // naive + std::vector b; + { + int ret = test_layer_naive(typeindex, pd, weights, a, top_blob_count, b, func, flag); + if (ret != 233 && ret != 0) + { + fprintf(stderr, "test_layer_naive failed\n"); + return -1; + } + } + + // cpu + { + std::vector c; + int ret = test_layer_cpu(typeindex, pd, weights, _opt, a, top_blob_count, c, std::vector(), func, flag); + if (ret != 233 && (ret != 0 || CompareMat(b, c, epsilon) != 0)) + { + fprintf(stderr, "test_layer_cpu failed\n"); + return -1; + } + } + + // cpu shape hint + { + std::vector c; + int ret = test_layer_cpu(typeindex, pd, weights, _opt, a, top_blob_count, c, b, func, flag); + if (ret != 233 && (ret != 0 || CompareMat(b, c, epsilon) != 0)) + { + fprintf(stderr, "test_layer_cpu failed with shape hint\n"); + return -1; + } + } + +#if NCNN_VULKAN + // gpu + if (!(flag & TEST_LAYER_DISABLE_GPU_TESTING)) + { + std::vector d; + int ret = test_layer_gpu(typeindex, pd, weights, _opt, a, 
top_blob_count, d, std::vector(), func, flag); + if (ret != 233 && (ret != 0 || CompareMat(b, d, epsilon) != 0)) + { + fprintf(stderr, "test_layer_gpu failed\n"); + return -1; + } + } + + // gpu shape hint + if (!(flag & TEST_LAYER_DISABLE_GPU_TESTING)) + { + std::vector d; + int ret = test_layer_gpu(typeindex, pd, weights, _opt, a, top_blob_count, d, b, func, flag); + if (ret != 233 && (ret != 0 || CompareMat(b, d, epsilon) != 0)) + { + fprintf(stderr, "test_layer_gpu failed with shape hint\n"); + return -1; + } + } +#endif // NCNN_VULKAN + + return 0; +} + +int test_layer_naive(int typeindex, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Mat& a, ncnn::Mat& b, void (*func)(ncnn::Layer*), int flag) +{ + ncnn::Layer* op = ncnn::create_layer_naive(typeindex); + + if (func) + { + (*func)((ncnn::Layer*)op); + } + + op->load_param(pd); + + ncnn::ModelBinFromMatArray mb(weights.data()); + + op->load_model(mb); + + ncnn::Option opt; + opt.num_threads = 1; + opt.lightmode = false; + opt.use_packing_layout = false; + opt.use_fp16_packed = false; + opt.use_fp16_storage = false; + opt.use_fp16_arithmetic = false; + opt.use_shader_pack8 = false; + opt.use_image_storage = false; + opt.use_bf16_storage = false; + opt.use_vulkan_compute = false; + + op->create_pipeline(opt); + + if (op->support_inplace) + { + b = a.clone(); + op->forward_inplace(b, opt); + } + else + { + op->forward(a, b, opt); + } + + op->destroy_pipeline(opt); + + delete op; + + return 0; +} + +int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Option& _opt, const ncnn::Mat& a, ncnn::Mat& c, const ncnn::Mat& top_shape, void (*func)(ncnn::Layer*), int flag) +{ + ncnn::Layer* op = ncnn::create_layer_cpu(typeindex); + + if (!op->support_packing && _opt.use_packing_layout) + { + delete op; + return 233; + } + if (!op->support_bf16_storage && !op->support_fp16_storage && (_opt.use_bf16_storage || _opt.use_fp16_arithmetic)) + { + delete op; + return 233; + } + + if (func) + { + (*func)((ncnn::Layer*)op); + } + + if (top_shape.dims) + { + op->bottom_shapes.resize(1); + op->top_shapes.resize(1); + op->bottom_shapes[0] = a; + op->top_shapes[0] = top_shape; + } + + op->load_param(pd); + + ncnn::ModelBinFromMatArray mb(weights.data()); + + op->load_model(mb); + + ncnn::Option opt = _opt; + opt.num_threads = 1; + opt.use_vulkan_compute = false; + + op->create_pipeline(opt); + + if (!op->support_packing && _opt.use_packing_layout) + { + op->destroy_pipeline(opt); + delete op; + return 233; + } + if (!op->support_bf16_storage && !op->support_fp16_storage && (_opt.use_bf16_storage || _opt.use_fp16_arithmetic)) + { + op->destroy_pipeline(opt); + delete op; + return 233; + } + + ncnn::Mat a4; + + // clang-format off + // *INDENT-OFF* +#if NCNN_ARM82 + if (opt.use_fp16_storage && ncnn::cpu_support_arm_asimdhp() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING)) + { + ncnn::cast_float32_to_float16(a, a4, opt); + } + else +#endif // NCNN_ARM82 +#if NCNN_RVV + if (opt.use_fp16_storage && ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING)) + { + ncnn::cast_float32_to_float16(a, a4, opt); + } + else +#endif // NCNN_RVV +#if NCNN_BF16 + if (opt.use_bf16_storage && op->support_bf16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING)) + { + ncnn::cast_float32_to_bfloat16(a, a4, opt); + } + else +#endif // NCNN_BF16 + if (opt.use_fp16_storage && op->support_fp16_storage 
&& !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING)) + { + ncnn::cast_float32_to_float16(a, a4, opt); + } + else + { + a4 = a; + } + // *INDENT-ON* + // clang-format on + + if (opt.use_packing_layout && op->support_packing && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_PACKING)) + { + // resolve dst_elempack + int dims = a4.dims; + int elemcount = 0; + if (dims == 1) elemcount = a4.elempack * a4.w; + if (dims == 2) elemcount = a4.elempack * a4.h; + if (dims == 3 || dims == 4) elemcount = a4.elempack * a4.c; + + int elembits = a4.elembits(); + + int dst_elempack = 1; + + if (elembits == 32) + { +#if NCNN_AVX512 + if (elemcount % 16 == 0 && ncnn::cpu_support_x86_avx512()) + dst_elempack = 16; + else if (elemcount % 8 == 0 && ncnn::cpu_support_x86_avx()) + dst_elempack = 8; + else if (elemcount % 4 == 0) + dst_elempack = 4; +#elif NCNN_AVX + if (elemcount % 8 == 0 && ncnn::cpu_support_x86_avx()) + dst_elempack = 8; + else if (elemcount % 4 == 0) + dst_elempack = 4; +#elif NCNN_RVV + const int packn = ncnn::cpu_riscv_vlenb() / (elembits / 8); + if (elemcount % packn == 0) + dst_elempack = packn; +#else + if (elemcount % 4 == 0) + dst_elempack = 4; +#endif + } + if (elembits == 16) + { +#if NCNN_ARM82 + if (elemcount % 8 == 0 && ncnn::cpu_support_arm_asimdhp() && opt.use_fp16_arithmetic) + dst_elempack = 8; + else if (elemcount % 4 == 0) + dst_elempack = 4; +#elif NCNN_RVV + const int packn = ncnn::cpu_riscv_vlenb() / 2; + if (elemcount % packn == 0) + dst_elempack = packn; +#else + if (elemcount % 4 == 0) + dst_elempack = 4; +#endif + } + if (elembits == 8) + { +#if NCNN_RVV + const int packn = ncnn::cpu_riscv_vlenb() / 1; + if (elemcount % packn == 0) + dst_elempack = packn; +#else + if (elemcount % 8 == 0) + dst_elempack = 8; +#endif + } + + if (flag & TEST_LAYER_ENABLE_FORCE_INPUT_PACK8) + dst_elempack = 8; + + ncnn::Mat a4_packed; + ncnn::convert_packing(a4, a4_packed, dst_elempack, opt); + a4 = a4_packed; + } + + if (op->support_inplace) + { + c = a4.clone(); + op->forward_inplace(c, opt); + } + else + { + op->forward(a4, c, opt); + } + + // clang-format off + // *INDENT-OFF* +#if NCNN_ARM82 + if (opt.use_fp16_storage && ncnn::cpu_support_arm_asimdhp() && op->support_fp16_storage && c.elembits() == 16) + { + ncnn::Mat c_fp32; + ncnn::cast_float16_to_float32(c, c_fp32, opt); + c = c_fp32; + } + else +#endif // NCNN_ARM82 +#if NCNN_RVV + if (opt.use_fp16_storage && ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh() && op->support_fp16_storage && c.elembits() == 16) + { + ncnn::Mat c_fp32; + ncnn::cast_float16_to_float32(c, c_fp32, opt); + c = c_fp32; + } + else +#endif // NCNN_RVV +#if NCNN_BF16 + if (opt.use_bf16_storage && op->support_bf16_storage && c.elembits() == 16) + { + ncnn::Mat c_fp32; + ncnn::cast_bfloat16_to_float32(c, c_fp32, opt); + c = c_fp32; + } + else +#endif // NCNN_BF16 + if (opt.use_fp16_storage && op->support_fp16_storage && c.elembits() == 16) + { + ncnn::Mat c_fp32; + ncnn::cast_float16_to_float32(c, c_fp32, opt); + c = c_fp32; + } + // *INDENT-ON* + // clang-format on + + op->destroy_pipeline(opt); + + delete op; + + return 0; +} + +#if NCNN_VULKAN +int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Option& _opt, const ncnn::Mat& a, ncnn::Mat& d, const ncnn::Mat& top_shape, void (*func)(ncnn::Layer*), int flag) +{ + if (!_opt.use_packing_layout) + { + // pack1 test is useless for gpu + return 233; + } + + ncnn::Layer* op = ncnn::create_layer_vulkan(typeindex); + + if (!op->support_vulkan) + { + delete op; + 
return 233; + } + + ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(); + + op->vkdev = vkdev; + + if (func) + { + (*func)((ncnn::Layer*)op); + } + + if (top_shape.dims) + { + op->bottom_shapes.resize(1); + op->top_shapes.resize(1); + op->bottom_shapes[0] = a; + op->top_shapes[0] = top_shape; + } + + op->load_param(pd); + + ncnn::ModelBinFromMatArray mb(weights.data()); + + op->load_model(mb); + + ncnn::VkWeightAllocator g_weight_vkallocator(vkdev); + ncnn::VkWeightStagingAllocator g_weight_staging_vkallocator(vkdev); + + ncnn::VkAllocator* blob_vkallocator = vkdev->acquire_blob_allocator(); + ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator(); + + ncnn::Option opt = _opt; + opt.num_threads = 1; + opt.use_vulkan_compute = true; + +#if __APPLE__ + opt.use_image_storage = false; +#endif + + opt.blob_vkallocator = blob_vkallocator; + opt.workspace_vkallocator = blob_vkallocator; + opt.staging_vkallocator = staging_vkallocator; + + if (!vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false; + if (!vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false; + if (!vkdev->info.support_fp16_arithmetic()) opt.use_fp16_arithmetic = false; + if (!vkdev->info.support_cooperative_matrix()) opt.use_cooperative_matrix = false; + + // FIXME fp16a may produce large error + opt.use_fp16_arithmetic = false; + + op->create_pipeline(opt); + + if (!op->support_vulkan) + { + op->destroy_pipeline(opt); + delete op; + return 233; + } + + { + ncnn::VkTransfer cmd(vkdev); + + ncnn::Option opt_upload = opt; + opt_upload.blob_vkallocator = &g_weight_vkallocator; + opt_upload.workspace_vkallocator = &g_weight_vkallocator; + opt_upload.staging_vkallocator = &g_weight_staging_vkallocator; + + op->upload_model(cmd, opt_upload); + + cmd.submit_and_wait(); + } + + { + // forward + ncnn::VkCompute cmd(vkdev); + + if (op->support_image_storage && opt.use_image_storage) + { + // upload + ncnn::VkImageMat a_gpu; + cmd.record_upload(a, a_gpu, opt); + + ncnn::VkImageMat d_gpu; + if (op->support_inplace) + { + op->forward_inplace(a_gpu, cmd, opt); + + d_gpu = a_gpu; + } + else + { + op->forward(a_gpu, d_gpu, cmd, opt); + } + + // download + cmd.record_download(d_gpu, d, opt); + } + else + { + // upload + ncnn::VkMat a_gpu; + cmd.record_upload(a, a_gpu, opt); + + ncnn::VkMat d_gpu; + if (op->support_inplace) + { + op->forward_inplace(a_gpu, cmd, opt); + + d_gpu = a_gpu; + } + else + { + op->forward(a_gpu, d_gpu, cmd, opt); + } + + // download + cmd.record_download(d_gpu, d, opt); + } + + cmd.submit_and_wait(); + } + + op->destroy_pipeline(opt); + + delete op; + + vkdev->reclaim_blob_allocator(blob_vkallocator); + vkdev->reclaim_staging_allocator(staging_vkallocator); + g_weight_vkallocator.clear(); + g_weight_staging_vkallocator.clear(); + + return 0; +} +#endif // NCNN_VULKAN + +int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Option& _opt, const ncnn::Mat& a, const ncnn::Mat& top_shape, float epsilon, void (*func)(ncnn::Layer*), int flag) +{ + // naive + ncnn::Mat b; + { + int ret = test_layer_naive(typeindex, pd, weights, a, b, func, flag); + if (ret != 233 && ret != 0) + { + fprintf(stderr, "test_layer_naive failed\n"); + return -1; + } + } + + // cpu + { + ncnn::Mat c; + int ret = test_layer_cpu(typeindex, pd, weights, _opt, a, c, ncnn::Mat(), func, flag); + if (ret != 233 && (ret != 0 || CompareMat(b, c, epsilon) != 0)) + { + fprintf(stderr, "test_layer_cpu failed\n"); + return -1; + } + } + + // cpu shape hint + { + ncnn::Mat c; + int 
ret = test_layer_cpu(typeindex, pd, weights, _opt, a, c, b, func, flag);
+        if (ret != 233 && (ret != 0 || CompareMat(b, c, epsilon) != 0))
+        {
+            fprintf(stderr, "test_layer_cpu failed with shape hint\n");
+            return -1;
+        }
+    }
+
+#if NCNN_VULKAN
+    // gpu
+    if (!(flag & TEST_LAYER_DISABLE_GPU_TESTING))
+    {
+        ncnn::Mat d;
+        int ret = test_layer_gpu(typeindex, pd, weights, _opt, a, d, ncnn::Mat(), func, flag);
+        if (ret != 233 && (ret != 0 || CompareMat(b, d, epsilon) != 0))
+        {
+            fprintf(stderr, "test_layer_gpu failed\n");
+            return -1;
+        }
+    }
+
+    // gpu shape hint
+    if (!(flag & TEST_LAYER_DISABLE_GPU_TESTING))
+    {
+        ncnn::Mat d;
+        int ret = test_layer_gpu(typeindex, pd, weights, _opt, a, d, b, func, flag);
+        if (ret != 233 && (ret != 0 || CompareMat(b, d, epsilon) != 0))
+        {
+            fprintf(stderr, "test_layer_gpu failed with shape hint\n");
+            return -1;
+        }
+    }
+#endif // NCNN_VULKAN
+
+    return 0;
+}
+
+int test_layer_opt(const char* layer_type, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& opt, const std::vector<ncnn::Mat>& a, int top_blob_count, float epsilon, void (*func)(ncnn::Layer*), int flag)
+{
+    // fp16 representation
+    std::vector<ncnn::Mat> a_fp16;
+    if (opt.use_bf16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
+    {
+        a_fp16.resize(a.size());
+        for (size_t j = 0; j < a.size(); j++)
+        {
+            ncnn::Mat tmp;
+            ncnn::cast_float32_to_bfloat16(a[j], tmp, opt);
+            ncnn::cast_bfloat16_to_float32(tmp, a_fp16[j], opt);
+        }
+    }
+    else if ((opt.use_fp16_packed || opt.use_fp16_storage) && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
+    {
+        a_fp16.resize(a.size());
+        for (size_t j = 0; j < a.size(); j++)
+        {
+            ncnn::Mat tmp;
+            ncnn::cast_float32_to_float16(a[j], tmp, opt);
+            ncnn::cast_float16_to_float32(tmp, a_fp16[j], opt);
+        }
+    }
+    else
+    {
+        a_fp16 = a;
+    }
+
+    std::vector<ncnn::Mat> weights_fp16;
+    float epsilon_fp16;
+    if (opt.use_bf16_storage)
+    {
+        weights_fp16.resize(weights.size());
+        for (size_t j = 0; j < weights.size(); j++)
+        {
+            ncnn::Mat tmp;
+            ncnn::cast_float32_to_bfloat16(weights[j], tmp, opt);
+            ncnn::cast_bfloat16_to_float32(tmp, weights_fp16[j], opt);
+        }
+        epsilon_fp16 = epsilon * 100; // 0.1
+    }
+    else if (opt.use_fp16_packed || opt.use_fp16_storage)
+    {
+        weights_fp16.resize(weights.size());
+        for (size_t j = 0; j < weights.size(); j++)
+        {
+            ncnn::Mat tmp;
+            ncnn::cast_float32_to_float16(weights[j], tmp, opt);
+            ncnn::cast_float16_to_float32(tmp, weights_fp16[j], opt);
+        }
+        epsilon_fp16 = epsilon * 100; // 0.1
+    }
+    else
+    {
+        weights_fp16 = weights;
+        epsilon_fp16 = epsilon;
+    }
+
+    if (opt.use_fp16_arithmetic)
+    {
+        epsilon_fp16 = epsilon * 1000; // 1.0
+    }
+
+    std::vector<ncnn::Mat> top_shapes;
+    int ret = test_layer(ncnn::layer_to_index(layer_type), pd, weights_fp16, opt, a_fp16, top_blob_count, top_shapes, epsilon_fp16, func, flag);
+    if (ret != 0)
+    {
+        fprintf(stderr, "test_layer %s failed use_packing_layout=%d use_fp16_packed=%d use_fp16_storage=%d use_fp16_arithmetic=%d use_shader_pack8=%d use_bf16_storage=%d use_image_storage=%d use_sgemm_convolution=%d use_winograd_convolution=%d\n", layer_type, opt.use_packing_layout, opt.use_fp16_packed, opt.use_fp16_storage, opt.use_fp16_arithmetic, opt.use_shader_pack8, opt.use_bf16_storage, opt.use_image_storage, opt.use_sgemm_convolution, opt.use_winograd_convolution);
+        return ret;
+    }
+
+    return 0;
+}
+
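// Editor's sketch, not part of this patch: test_layer_opt can be called
// directly to pin a single option combination, instead of the eight-row
// matrix swept by test_layer further below. Noop is chosen because it takes
// no ParamDict entries and no weights; the input shape is an arbitrary
// example, and the trailing func/flag arguments rely on the header defaults.
static int example_single_option_combination()
{
    ncnn::Option opt;
    opt.num_threads = 1;
    opt.use_packing_layout = true;
    opt.use_bf16_storage = true; // weights get bf16-roundtripped, epsilon relaxes to 0.1

    ncnn::ParamDict pd;
    std::vector<ncnn::Mat> weights;
    std::vector<ncnn::Mat> inputs(1, RandomMat(5, 7, 24));

    return test_layer_opt("Noop", pd, weights, opt, inputs, 1, 0.001f);
}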
+int test_layer_opt(const char* layer_type, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Option& opt, const ncnn::Mat& a, float epsilon, void (*func)(ncnn::Layer*), int flag)
+{
+    // fp16 representation
+    ncnn::Mat a_fp16;
+    if (opt.use_bf16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
+    {
+        ncnn::Mat tmp;
+        ncnn::cast_float32_to_bfloat16(a, tmp, opt);
+        ncnn::cast_bfloat16_to_float32(tmp, a_fp16, opt);
+    }
+    else if ((opt.use_fp16_packed || opt.use_fp16_storage) && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
+    {
+        ncnn::Mat tmp;
+        ncnn::cast_float32_to_float16(a, tmp, opt);
+        ncnn::cast_float16_to_float32(tmp, a_fp16, opt);
+    }
+    else
+    {
+        a_fp16 = a;
+    }
+
+    std::vector<ncnn::Mat> weights_fp16;
+    float epsilon_fp16;
+    if (opt.use_bf16_storage)
+    {
+        weights_fp16.resize(weights.size());
+        for (size_t j = 0; j < weights.size(); j++)
+        {
+            ncnn::Mat tmp;
+            ncnn::cast_float32_to_bfloat16(weights[j], tmp, opt);
+            ncnn::cast_bfloat16_to_float32(tmp, weights_fp16[j], opt);
+        }
+        epsilon_fp16 = epsilon * 100; // 0.1
+    }
+    else if (opt.use_fp16_packed || opt.use_fp16_storage)
+    {
+        weights_fp16.resize(weights.size());
+        for (size_t j = 0; j < weights.size(); j++)
+        {
+            ncnn::Mat tmp;
+            ncnn::cast_float32_to_float16(weights[j], tmp, opt);
+            ncnn::cast_float16_to_float32(tmp, weights_fp16[j], opt);
+        }
+        epsilon_fp16 = epsilon * 100; // 0.1
+    }
+    else
+    {
+        weights_fp16 = weights;
+        epsilon_fp16 = epsilon;
+    }
+
+    if (opt.use_fp16_arithmetic)
+    {
+        epsilon_fp16 = epsilon * 1000; // 1.0
+    }
+
+    ncnn::Mat top_shape;
+    int ret = test_layer(ncnn::layer_to_index(layer_type), pd, weights_fp16, opt, a_fp16, top_shape, epsilon_fp16, func, flag);
+    if (ret != 0)
+    {
+        fprintf(stderr, "test_layer %s failed use_packing_layout=%d use_fp16_packed=%d use_fp16_storage=%d use_fp16_arithmetic=%d use_shader_pack8=%d use_bf16_storage=%d use_image_storage=%d use_sgemm_convolution=%d use_winograd_convolution=%d\n", layer_type, opt.use_packing_layout, opt.use_fp16_packed, opt.use_fp16_storage, opt.use_fp16_arithmetic, opt.use_shader_pack8, opt.use_bf16_storage, opt.use_image_storage, opt.use_sgemm_convolution, opt.use_winograd_convolution);
+        return ret;
+    }
+
+    return 0;
+}
+
+int test_layer(const char* layer_type, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const std::vector<ncnn::Mat>& a, int top_blob_count, float epsilon, void (*func)(ncnn::Layer*), int flag)
+{
+    // pack fp16p fp16s fp16a bf16s shader8 image
+    const int options[][7] = {
+        {0, 0, 0, 0, 0, 0, 0},
+        {0, 0, 1, 0, 0, 0, 0},
+        {0, 0, 1, 1, 1, 0, 0},
+        {1, 0, 0, 0, 0, 0, 0},
+        {1, 1, 0, 0, 1, 0, 0},
+        {1, 0, 1, 0, 0, 1, 0},
+        {1, 1, 1, 1, 0, 0, 0},
+        {1, 1, 1, 1, 1, 1, 1},
+    };
+
+    const int opt_count = sizeof(options) / sizeof(options[0]);
+
+    for (int i = 0; i < opt_count; i++)
+    {
+        ncnn::Option opt;
+        opt.num_threads = 1;
+        opt.use_packing_layout = options[i][0];
+        opt.use_fp16_packed = options[i][1];
+        opt.use_fp16_storage = options[i][2];
+        opt.use_fp16_arithmetic = options[i][3];
+        opt.use_bf16_storage = options[i][4];
+        opt.use_shader_pack8 = options[i][5];
+        opt.use_image_storage = options[i][6];
+
+        int ret = test_layer_opt(layer_type, pd, weights, opt, a, top_blob_count, epsilon, func, flag);
+        if (ret != 0)
+            return ret;
+    }
+
+    return 0;
+}
+
+int test_layer(const char* layer_type, const ncnn::ParamDict& pd, const std::vector<ncnn::Mat>& weights, const ncnn::Mat& a, float epsilon, void (*func)(ncnn::Layer*), int flag)
+{
+    // pack fp16p fp16s fp16a bf16s shader8 image
+    const int options[][7] = {
+        {0, 0, 0, 0, 0, 0, 0},
+        {0, 0, 1, 0, 0, 0, 0},
+        {0, 0, 1, 1, 1, 0, 0},
+        {1, 0, 0, 0, 0, 0, 0},
+        {1, 1, 0, 0, 1, 0, 0},
+        {1, 0, 1, 0, 0, 1, 0},
+        {1, 1, 1, 1, 0, 0, 0},
+        {1, 1, 1, 1, 1, 1, 1},
+    };
+
+    const int opt_count = 
sizeof(options) / sizeof(options[0]); + + for (int i = 0; i < opt_count; i++) + { + ncnn::Option opt; + opt.num_threads = 1; + opt.use_packing_layout = options[i][0]; + opt.use_fp16_packed = options[i][1]; + opt.use_fp16_storage = options[i][2]; + opt.use_fp16_arithmetic = options[i][3]; + opt.use_bf16_storage = options[i][4]; + opt.use_shader_pack8 = options[i][5]; + opt.use_image_storage = options[i][6]; + + int ret = test_layer_opt(layer_type, pd, weights, opt, a, epsilon, func, flag); + if (ret != 0) + return ret; + } + + return 0; +} diff --git a/tests/testutil.h b/tests/testutil.h index 2b740b9c48c..88e1ce2e160 100644 --- a/tests/testutil.h +++ b/tests/testutil.h @@ -23,11 +23,6 @@ #include #include -#if NCNN_VULKAN -#include "command.h" -#include "gpu.h" -#endif // NCNN_VULKAN - static struct prng_rand_t g_prng_rand_state; #define SRAND(seed) prng_srand(seed, &g_prng_rand_state) @@ -38,1505 +33,78 @@ static struct prng_rand_t g_prng_rand_state; #define TEST_LAYER_DISABLE_GPU_TESTING (1 << 2) #define TEST_LAYER_ENABLE_FORCE_INPUT_PACK8 (1 << 3) -static float RandomFloat(float a = -1.2f, float b = 1.2f) -{ - float random = ((float)RAND()) / (float)uint64_t(-1); //RAND_MAX; - float diff = b - a; - float r = random * diff; - float v = a + r; - // generate denormal as zero - if (v < 0.0001 && v > -0.0001) - v = 0.f; - return v; -} - -static int RandomInt(int a = -10000, int b = 10000) -{ - float random = ((float)RAND()) / (float)uint64_t(-1); //RAND_MAX; - int diff = b - a; - float r = random * diff; - return a + (int)r; -} - -static signed char RandomS8() -{ - return (signed char)RandomInt(-127, 127); -} - -static void Randomize(ncnn::Mat& m, float a = -1.2f, float b = 1.2f) -{ - for (size_t i = 0; i < m.total(); i++) - { - m[i] = RandomFloat(a, b); - } -} - -static void RandomizeInt(ncnn::Mat& m, int a = -10000, int b = 10000) -{ - for (size_t i = 0; i < m.total(); i++) - { - ((int*)m)[i] = RandomInt(a, b); - } -} - -static void RandomizeS8(ncnn::Mat& m) -{ - for (size_t i = 0; i < m.total(); i++) - { - ((signed char*)m)[i] = RandomS8(); - } -} - -static ncnn::Mat RandomMat(int w, float a = -1.2f, float b = 1.2f) -{ - ncnn::Mat m(w); - Randomize(m, a, b); - return m; -} - -static ncnn::Mat RandomMat(int w, int h, float a = -1.2f, float b = 1.2f) -{ - ncnn::Mat m(w, h); - Randomize(m, a, b); - return m; -} - -static ncnn::Mat RandomMat(int w, int h, int c, float a = -1.2f, float b = 1.2f) -{ - ncnn::Mat m(w, h, c); - Randomize(m, a, b); - return m; -} - -static ncnn::Mat RandomMat(int w, int h, int d, int c, float a = -1.2f, float b = 1.2f) -{ - ncnn::Mat m(w, h, d, c); - Randomize(m, a, b); - return m; -} - -static ncnn::Mat RandomIntMat(int w) -{ - ncnn::Mat m(w); - RandomizeInt(m); - return m; -} - -static ncnn::Mat RandomIntMat(int w, int h) -{ - ncnn::Mat m(w, h); - RandomizeInt(m); - return m; -} - -static ncnn::Mat RandomIntMat(int w, int h, int c) -{ - ncnn::Mat m(w, h, c); - RandomizeInt(m); - return m; -} - -static ncnn::Mat RandomIntMat(int w, int h, int d, int c) -{ - ncnn::Mat m(w, h, d, c); - RandomizeInt(m); - return m; -} - -static ncnn::Mat RandomS8Mat(int w) -{ - ncnn::Mat m(w, (size_t)1u); - RandomizeS8(m); - return m; -} - -static ncnn::Mat RandomS8Mat(int w, int h) -{ - ncnn::Mat m(w, h, (size_t)1u); - RandomizeS8(m); - return m; -} - -static ncnn::Mat RandomS8Mat(int w, int h, int c) -{ - ncnn::Mat m(w, h, c, (size_t)1u); - RandomizeS8(m); - return m; -} - -static ncnn::Mat RandomS8Mat(int w, int h, int d, int c) -{ - ncnn::Mat m(w, h, d, c, (size_t)1u); - 
RandomizeS8(m); - return m; -} - -static ncnn::Mat scales_mat(const ncnn::Mat& mat, int m, int k, int ldx) -{ - ncnn::Mat weight_scales(m); - for (int i = 0; i < m; ++i) - { - float min = mat[0], _max = mat[0]; - const float* ptr = (const float*)(mat.data) + i * ldx; - for (int j = 0; j < k; ++j) - { - if (min > ptr[j]) - { - min = ptr[j]; - } - if (_max < ptr[j]) - { - _max = ptr[j]; - } - } - const float abs_min = abs(min), abs_max = abs(_max); - weight_scales[i] = 127.f / (abs_min > abs_max ? abs_min : abs_max); - } - return weight_scales; -} - -static bool NearlyEqual(float a, float b, float epsilon) -{ - if (a == b) - return true; - - float diff = (float)fabs(a - b); - if (diff <= epsilon) - return true; - - // relative error - return diff < epsilon * std::max(fabs(a), fabs(b)); -} - -static int Compare(const ncnn::Mat& a, const ncnn::Mat& b, float epsilon = 0.001) -{ -#define CHECK_MEMBER(m) \ - if (a.m != b.m) \ - { \ - fprintf(stderr, #m " not match expect %d but got %d\n", (int)a.m, (int)b.m); \ - return -1; \ - } - - CHECK_MEMBER(dims) - CHECK_MEMBER(w) - CHECK_MEMBER(h) - CHECK_MEMBER(d) - CHECK_MEMBER(c) - CHECK_MEMBER(elemsize) - CHECK_MEMBER(elempack) - -#undef CHECK_MEMBER - - for (int q = 0; q < a.c; q++) - { - const ncnn::Mat ma = a.channel(q); - const ncnn::Mat mb = b.channel(q); - for (int z = 0; z < a.d; z++) - { - const ncnn::Mat da = ma.depth(z); - const ncnn::Mat db = mb.depth(z); - for (int i = 0; i < a.h; i++) - { - const float* pa = da.row(i); - const float* pb = db.row(i); - for (int j = 0; j < a.w; j++) - { - if (!NearlyEqual(pa[j], pb[j], epsilon)) - { - fprintf(stderr, "value not match at c:%d d:%d h:%d w:%d expect %f but got %f\n", q, z, i, j, pa[j], pb[j]); - return -1; - } - } - } - } - } - - return 0; -} - -static int CompareMat(const ncnn::Mat& a, const ncnn::Mat& b, float epsilon = 0.001) -{ - ncnn::Option opt; - opt.num_threads = 1; - - if (a.elempack != 1) - { - ncnn::Mat a1; - ncnn::convert_packing(a, a1, 1, opt); - return CompareMat(a1, b, epsilon); - } - - if (b.elempack != 1) - { - ncnn::Mat b1; - ncnn::convert_packing(b, b1, 1, opt); - return CompareMat(a, b1, epsilon); - } - - if (a.elemsize == 2u) - { - ncnn::Mat a32; - cast_float16_to_float32(a, a32, opt); - return CompareMat(a32, b, epsilon); - } - if (a.elemsize == 1u) - { - ncnn::Mat a32; - cast_int8_to_float32(a, a32, opt); - return CompareMat(a32, b, epsilon); - } - - if (b.elemsize == 2u) - { - ncnn::Mat b32; - cast_float16_to_float32(b, b32, opt); - return CompareMat(a, b32, epsilon); - } - if (b.elemsize == 1u) - { - ncnn::Mat b32; - cast_int8_to_float32(b, b32, opt); - return CompareMat(a, b32, epsilon); - } - - return Compare(a, b, epsilon); -} - -static int CompareMat(const std::vector& a, const std::vector& b, float epsilon = 0.001) -{ - if (a.size() != b.size()) - { - fprintf(stderr, "output blob count not match %zu %zu\n", a.size(), b.size()); - return -1; - } - - for (size_t i = 0; i < a.size(); i++) - { - if (CompareMat(a[i], b[i], epsilon)) - { - fprintf(stderr, "output blob %zu not match\n", i); - return -1; - } - } - - return 0; -} - -int test_layer_naive(int typeindex, const ncnn::ParamDict& pd, const std::vector& weights, const std::vector& a, int top_blob_count, std::vector& b, void (*func)(ncnn::Layer*), int flag) -{ - ncnn::Layer* op = ncnn::create_layer_naive(typeindex); - - if (func) - { - (*func)((ncnn::Layer*)op); - } - - op->load_param(pd); - - if (op->one_blob_only && a.size() != 1) - { - fprintf(stderr, "layer with one_blob_only but consume multiple inputs\n"); - 
delete op; - return -1; - } - - ncnn::ModelBinFromMatArray mb(weights.data()); - - op->load_model(mb); - - ncnn::Option opt; - opt.num_threads = 1; - opt.lightmode = false; - opt.use_packing_layout = false; - opt.use_fp16_packed = false; - opt.use_fp16_storage = false; - opt.use_fp16_arithmetic = false; - opt.use_shader_pack8 = false; - opt.use_image_storage = false; - opt.use_bf16_storage = false; - opt.use_vulkan_compute = false; - - op->create_pipeline(opt); - - b.resize(top_blob_count); - - if (op->support_inplace) - { - for (size_t i = 0; i < a.size(); i++) - { - b[i] = a[i].clone(); - } - - op->forward_inplace(b, opt); - } - else - { - op->forward(a, b, opt); - } - - op->destroy_pipeline(opt); - - delete op; - - return 0; -} - -int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Option& _opt, const std::vector& a, int top_blob_count, std::vector& c, const std::vector& top_shapes, void (*func)(ncnn::Layer*), int flag) -{ - ncnn::Layer* op = ncnn::create_layer_cpu(typeindex); - - if (!op->support_packing && _opt.use_packing_layout) - { - delete op; - return 233; - } - if (!op->support_bf16_storage && !op->support_fp16_storage && (_opt.use_bf16_storage || _opt.use_fp16_arithmetic)) - { - delete op; - return 233; - } - - if (func) - { - (*func)((ncnn::Layer*)op); - } - - if (!top_shapes.empty()) - { - op->bottom_shapes = a; - op->top_shapes = top_shapes; - } - - op->load_param(pd); - - if (op->one_blob_only && a.size() != 1) - { - fprintf(stderr, "layer with one_blob_only but consume multiple inputs\n"); - delete op; - return -1; - } - - ncnn::ModelBinFromMatArray mb(weights.data()); - - op->load_model(mb); - - ncnn::Option opt = _opt; - opt.num_threads = 1; - opt.use_vulkan_compute = false; - - op->create_pipeline(opt); - - if (!op->support_packing && _opt.use_packing_layout) - { - op->destroy_pipeline(opt); - delete op; - return 233; - } - if (!op->support_bf16_storage && !op->support_fp16_storage && (_opt.use_bf16_storage || _opt.use_fp16_arithmetic)) - { - op->destroy_pipeline(opt); - delete op; - return 233; - } - - std::vector a4(a.size()); - - for (size_t i = 0; i < a4.size(); i++) - { - // clang-format off - // *INDENT-OFF* -#if NCNN_ARM82 - if (opt.use_fp16_storage && ncnn::cpu_support_arm_asimdhp() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING)) - { - ncnn::cast_float32_to_float16(a[i], a4[i], opt); - } - else -#endif // NCNN_ARM82 -#if NCNN_RVV - if (opt.use_fp16_storage && ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING)) - { - ncnn::cast_float32_to_float16(a[i], a4[i], opt); - } - else -#endif // NCNN_RVV -#if NCNN_BF16 - if (opt.use_bf16_storage && op->support_bf16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING)) - { - ncnn::cast_float32_to_bfloat16(a[i], a4[i], opt); - } - else -#endif // NCNN_BF16 - if (opt.use_fp16_storage && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING)) - { - ncnn::cast_float32_to_float16(a[i], a4[i], opt); - } - else - { - a4[i] = a[i]; - } - // *INDENT-ON* - // clang-format on - - if (opt.use_packing_layout && op->support_packing && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_PACKING)) - { - // resolve dst_elempack - int dims = a4[i].dims; - int elemcount = 0; - if (dims == 1) elemcount = a4[i].elempack * a4[i].w; - if (dims == 2) elemcount = a4[i].elempack * a4[i].h; - if (dims == 3 || dims == 4) elemcount = a4[i].elempack * a4[i].c; - - int 
elembits = a4[i].elembits(); - - int dst_elempack = 1; - - if (elembits == 32) - { -#if NCNN_AVX512 - if (elemcount % 16 == 0 && ncnn::cpu_support_x86_avx512()) - dst_elempack = 16; - else if (elemcount % 8 == 0 && ncnn::cpu_support_x86_avx()) - dst_elempack = 8; - else if (elemcount % 4 == 0) - dst_elempack = 4; -#elif NCNN_AVX - if (elemcount % 8 == 0 && ncnn::cpu_support_x86_avx()) - dst_elempack = 8; - else if (elemcount % 4 == 0) - dst_elempack = 4; -#elif NCNN_RVV - const int packn = ncnn::cpu_riscv_vlenb() / (elembits / 8); - if (elemcount % packn == 0) - dst_elempack = packn; -#else - if (elemcount % 4 == 0) - dst_elempack = 4; -#endif - } - if (elembits == 16) - { -#if NCNN_ARM82 - if (elemcount % 8 == 0 && ncnn::cpu_support_arm_asimdhp() && opt.use_fp16_arithmetic) - dst_elempack = 8; - else if (elemcount % 4 == 0) - dst_elempack = 4; -#elif NCNN_RVV - const int packn = ncnn::cpu_riscv_vlenb() / 2; - if (elemcount % packn == 0) - dst_elempack = packn; -#else - if (elemcount % 4 == 0) - dst_elempack = 4; -#endif - } - if (elembits == 8) - { -#if NCNN_RVV - const int packn = ncnn::cpu_riscv_vlenb() / 1; - if (elemcount % packn == 0) - dst_elempack = packn; -#else - if (elemcount % 8 == 0) - dst_elempack = 8; -#endif - } - - if (flag & TEST_LAYER_ENABLE_FORCE_INPUT_PACK8) - dst_elempack = 8; - - ncnn::Mat a4_packed; - ncnn::convert_packing(a4[i], a4_packed, dst_elempack, opt); - a4[i] = a4_packed; - } - } - - c.resize(top_blob_count); - - if (op->support_inplace) - { - for (size_t i = 0; i < a4.size(); i++) - { - c[i] = a4[i].clone(); - } - - op->forward_inplace(c, opt); - } - else - { - op->forward(a4, c, opt); - } - - for (size_t i = 0; i < c.size(); i++) - { - // clang-format off - // *INDENT-OFF* -#if NCNN_ARM82 - if (opt.use_fp16_storage && ncnn::cpu_support_arm_asimdhp() && op->support_fp16_storage && c[i].elembits() == 16) - { - ncnn::Mat c_fp32; - ncnn::cast_float16_to_float32(c[i], c_fp32, opt); - c[i] = c_fp32; - } - else -#endif // NCNN_ARM82 -#if NCNN_RVV - if (opt.use_fp16_storage && ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh() && op->support_fp16_storage && c[i].elembits() == 16) - { - ncnn::Mat c_fp32; - ncnn::cast_float16_to_float32(c[i], c_fp32, opt); - c[i] = c_fp32; - } - else -#endif // NCNN_RVV -#if NCNN_BF16 - if (opt.use_bf16_storage && op->support_bf16_storage && c[i].elembits() == 16) - { - ncnn::Mat c_fp32; - ncnn::cast_bfloat16_to_float32(c[i], c_fp32, opt); - c[i] = c_fp32; - } - else -#endif // NCNN_BF16 - if (opt.use_fp16_storage && op->support_fp16_storage && c[i].elembits() == 16) - { - ncnn::Mat c_fp32; - ncnn::cast_float16_to_float32(c[i], c_fp32, opt); - c[i] = c_fp32; - } - // *INDENT-ON* - // clang-format on - } - - op->destroy_pipeline(opt); - - delete op; - - return 0; -} - -#if NCNN_VULKAN -int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Option& _opt, const std::vector& a, int top_blob_count, std::vector& d, const std::vector& top_shapes, void (*func)(ncnn::Layer*), int flag) -{ - if (!_opt.use_packing_layout) - { - // pack1 test is useless for gpu - return 233; - } - - ncnn::Layer* op = ncnn::create_layer_vulkan(typeindex); - - if (!op->support_vulkan) - { - delete op; - return 233; - } - - ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(); - - op->vkdev = vkdev; - - if (func) - { - (*func)((ncnn::Layer*)op); - } - - if (!top_shapes.empty()) - { - op->bottom_shapes = a; - op->top_shapes = top_shapes; - } - - op->load_param(pd); - - if (op->one_blob_only && a.size() != 1) - 
{ - fprintf(stderr, "layer with one_blob_only but consume multiple inputs\n"); - delete op; - return -1; - } - - ncnn::ModelBinFromMatArray mb(weights.data()); - - op->load_model(mb); - - ncnn::VkWeightAllocator g_weight_vkallocator(vkdev); - ncnn::VkWeightStagingAllocator g_weight_staging_vkallocator(vkdev); - - ncnn::VkAllocator* blob_vkallocator = vkdev->acquire_blob_allocator(); - ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator(); - - ncnn::Option opt = _opt; - opt.num_threads = 1; - opt.use_vulkan_compute = true; - -#if __APPLE__ - opt.use_image_storage = false; -#endif +float RandomFloat(float a = -1.2f, float b = 1.2f); - opt.blob_vkallocator = blob_vkallocator; - opt.workspace_vkallocator = blob_vkallocator; - opt.staging_vkallocator = staging_vkallocator; +int RandomInt(int a = -10000, int b = 10000); - if (!vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false; - if (!vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false; - if (!vkdev->info.support_fp16_arithmetic()) opt.use_fp16_arithmetic = false; - if (!vkdev->info.support_cooperative_matrix()) opt.use_cooperative_matrix = false; +signed char RandomS8(); - // FIXME fp16a may produce large error - opt.use_fp16_arithmetic = false; +void Randomize(ncnn::Mat& m, float a = -1.2f, float b = 1.2f); - op->create_pipeline(opt); +void RandomizeInt(ncnn::Mat& m, int a = -10000, int b = 10000); - if (!op->support_vulkan) - { - op->destroy_pipeline(opt); - delete op; - return 233; - } +void RandomizeS8(ncnn::Mat& m); - { - ncnn::VkTransfer cmd(vkdev); +ncnn::Mat RandomMat(int w, float a = -1.2f, float b = 1.2f); - ncnn::Option opt_upload = opt; - opt_upload.blob_vkallocator = &g_weight_vkallocator; - opt_upload.workspace_vkallocator = &g_weight_vkallocator; - opt_upload.staging_vkallocator = &g_weight_staging_vkallocator; +ncnn::Mat RandomMat(int w, int h, float a = -1.2f, float b = 1.2f); - op->upload_model(cmd, opt_upload); +ncnn::Mat RandomMat(int w, int h, int c, float a = -1.2f, float b = 1.2f); - cmd.submit_and_wait(); - } +ncnn::Mat RandomMat(int w, int h, int d, int c, float a = -1.2f, float b = 1.2f); - d.resize(top_blob_count); +ncnn::Mat RandomIntMat(int w); - { - // forward - ncnn::VkCompute cmd(vkdev); +ncnn::Mat RandomIntMat(int w, int h); - if (op->support_image_storage && opt.use_image_storage) - { - // upload - std::vector a_gpu(a.size()); - for (size_t i = 0; i < a_gpu.size(); i++) - { - cmd.record_upload(a[i], a_gpu[i], opt); - } +ncnn::Mat RandomIntMat(int w, int h, int c); - std::vector d_gpu(top_blob_count); - if (op->support_inplace) - { - op->forward_inplace(a_gpu, cmd, opt); +ncnn::Mat RandomIntMat(int w, int h, int d, int c); - d_gpu = a_gpu; - } - else - { - op->forward(a_gpu, d_gpu, cmd, opt); - } +ncnn::Mat RandomS8Mat(int w); - // download - for (size_t i = 0; i < d_gpu.size(); i++) - { - cmd.record_download(d_gpu[i], d[i], opt); - } - } - else - { - // upload - std::vector a_gpu(a.size()); - for (size_t i = 0; i < a_gpu.size(); i++) - { - cmd.record_upload(a[i], a_gpu[i], opt); - } +ncnn::Mat RandomS8Mat(int w, int h); - std::vector d_gpu(top_blob_count); - if (op->support_inplace) - { - op->forward_inplace(a_gpu, cmd, opt); +ncnn::Mat RandomS8Mat(int w, int h, int c); - d_gpu = a_gpu; - } - else - { - op->forward(a_gpu, d_gpu, cmd, opt); - } +ncnn::Mat RandomS8Mat(int w, int h, int d, int c); - // download - for (size_t i = 0; i < d_gpu.size(); i++) - { - cmd.record_download(d_gpu[i], d[i], opt); - } - } +ncnn::Mat scales_mat(const ncnn::Mat& mat, int m, int k, 
int ldx); - cmd.submit_and_wait(); - } +bool NearlyEqual(float a, float b, float epsilon); - op->destroy_pipeline(opt); +int Compare(const ncnn::Mat& a, const ncnn::Mat& b, float epsilon = 0.001); - delete op; +int CompareMat(const ncnn::Mat& a, const ncnn::Mat& b, float epsilon = 0.001); - vkdev->reclaim_blob_allocator(blob_vkallocator); - vkdev->reclaim_staging_allocator(staging_vkallocator); - g_weight_vkallocator.clear(); - g_weight_staging_vkallocator.clear(); +int CompareMat(const std::vector& a, const std::vector& b, float epsilon = 0.001); - return 0; -} -#endif // NCNN_VULKAN - -int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Option& _opt, const std::vector& a, int top_blob_count, const std::vector& top_shapes = std::vector(), float epsilon = 0.001, void (*func)(ncnn::Layer*) = 0, int flag = 0) -{ - // naive - std::vector b; - { - int ret = test_layer_naive(typeindex, pd, weights, a, top_blob_count, b, func, flag); - if (ret != 233 && ret != 0) - { - fprintf(stderr, "test_layer_naive failed\n"); - return -1; - } - } - - // cpu - { - std::vector c; - int ret = test_layer_cpu(typeindex, pd, weights, _opt, a, top_blob_count, c, std::vector(), func, flag); - if (ret != 233 && (ret != 0 || CompareMat(b, c, epsilon) != 0)) - { - fprintf(stderr, "test_layer_cpu failed\n"); - return -1; - } - } +int test_layer_naive(int typeindex, const ncnn::ParamDict& pd, const std::vector& weights, const std::vector& a, int top_blob_count, std::vector& b, void (*func)(ncnn::Layer*), int flag); - // cpu shape hint - { - std::vector c; - int ret = test_layer_cpu(typeindex, pd, weights, _opt, a, top_blob_count, c, b, func, flag); - if (ret != 233 && (ret != 0 || CompareMat(b, c, epsilon) != 0)) - { - fprintf(stderr, "test_layer_cpu failed with shape hint\n"); - return -1; - } - } +int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Option& _opt, const std::vector& a, int top_blob_count, std::vector& c, const std::vector& top_shapes, void (*func)(ncnn::Layer*), int flag); #if NCNN_VULKAN - // gpu - if (!(flag & TEST_LAYER_DISABLE_GPU_TESTING)) - { - std::vector d; - int ret = test_layer_gpu(typeindex, pd, weights, _opt, a, top_blob_count, d, std::vector(), func, flag); - if (ret != 233 && (ret != 0 || CompareMat(b, d, epsilon) != 0)) - { - fprintf(stderr, "test_layer_gpu failed\n"); - return -1; - } - } - - // gpu shape hint - if (!(flag & TEST_LAYER_DISABLE_GPU_TESTING)) - { - std::vector d; - int ret = test_layer_gpu(typeindex, pd, weights, _opt, a, top_blob_count, d, b, func, flag); - if (ret != 233 && (ret != 0 || CompareMat(b, d, epsilon) != 0)) - { - fprintf(stderr, "test_layer_gpu failed with shape hint\n"); - return -1; - } - } -#endif // NCNN_VULKAN - - return 0; -} - -int test_layer_naive(int typeindex, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Mat& a, ncnn::Mat& b, void (*func)(ncnn::Layer*), int flag) -{ - ncnn::Layer* op = ncnn::create_layer_naive(typeindex); - - if (func) - { - (*func)((ncnn::Layer*)op); - } - - op->load_param(pd); - - ncnn::ModelBinFromMatArray mb(weights.data()); - - op->load_model(mb); - - ncnn::Option opt; - opt.num_threads = 1; - opt.lightmode = false; - opt.use_packing_layout = false; - opt.use_fp16_packed = false; - opt.use_fp16_storage = false; - opt.use_fp16_arithmetic = false; - opt.use_shader_pack8 = false; - opt.use_image_storage = false; - opt.use_bf16_storage = false; - opt.use_vulkan_compute = false; - - op->create_pipeline(opt); - - if 
(op->support_inplace) - { - b = a.clone(); - op->forward_inplace(b, opt); - } - else - { - op->forward(a, b, opt); - } - - op->destroy_pipeline(opt); - - delete op; - - return 0; -} - -int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Option& _opt, const ncnn::Mat& a, ncnn::Mat& c, const ncnn::Mat& top_shape, void (*func)(ncnn::Layer*), int flag) -{ - ncnn::Layer* op = ncnn::create_layer_cpu(typeindex); - - if (!op->support_packing && _opt.use_packing_layout) - { - delete op; - return 233; - } - if (!op->support_bf16_storage && !op->support_fp16_storage && (_opt.use_bf16_storage || _opt.use_fp16_arithmetic)) - { - delete op; - return 233; - } - - if (func) - { - (*func)((ncnn::Layer*)op); - } - - if (top_shape.dims) - { - op->bottom_shapes.resize(1); - op->top_shapes.resize(1); - op->bottom_shapes[0] = a; - op->top_shapes[0] = top_shape; - } - - op->load_param(pd); - - ncnn::ModelBinFromMatArray mb(weights.data()); - - op->load_model(mb); - - ncnn::Option opt = _opt; - opt.num_threads = 1; - opt.use_vulkan_compute = false; - - op->create_pipeline(opt); - - if (!op->support_packing && _opt.use_packing_layout) - { - op->destroy_pipeline(opt); - delete op; - return 233; - } - if (!op->support_bf16_storage && !op->support_fp16_storage && (_opt.use_bf16_storage || _opt.use_fp16_arithmetic)) - { - op->destroy_pipeline(opt); - delete op; - return 233; - } - - ncnn::Mat a4; - - // clang-format off - // *INDENT-OFF* -#if NCNN_ARM82 - if (opt.use_fp16_storage && ncnn::cpu_support_arm_asimdhp() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING)) - { - ncnn::cast_float32_to_float16(a, a4, opt); - } - else -#endif // NCNN_ARM82 -#if NCNN_RVV - if (opt.use_fp16_storage && ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING)) - { - ncnn::cast_float32_to_float16(a, a4, opt); - } - else -#endif // NCNN_RVV -#if NCNN_BF16 - if (opt.use_bf16_storage && op->support_bf16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING)) - { - ncnn::cast_float32_to_bfloat16(a, a4, opt); - } - else -#endif // NCNN_BF16 - if (opt.use_fp16_storage && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING)) - { - ncnn::cast_float32_to_float16(a, a4, opt); - } - else - { - a4 = a; - } - // *INDENT-ON* - // clang-format on - - if (opt.use_packing_layout && op->support_packing && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_PACKING)) - { - // resolve dst_elempack - int dims = a4.dims; - int elemcount = 0; - if (dims == 1) elemcount = a4.elempack * a4.w; - if (dims == 2) elemcount = a4.elempack * a4.h; - if (dims == 3 || dims == 4) elemcount = a4.elempack * a4.c; - - int elembits = a4.elembits(); - - int dst_elempack = 1; - - if (elembits == 32) - { -#if NCNN_AVX512 - if (elemcount % 16 == 0 && ncnn::cpu_support_x86_avx512()) - dst_elempack = 16; - else if (elemcount % 8 == 0 && ncnn::cpu_support_x86_avx()) - dst_elempack = 8; - else if (elemcount % 4 == 0) - dst_elempack = 4; -#elif NCNN_AVX - if (elemcount % 8 == 0 && ncnn::cpu_support_x86_avx()) - dst_elempack = 8; - else if (elemcount % 4 == 0) - dst_elempack = 4; -#elif NCNN_RVV - const int packn = ncnn::cpu_riscv_vlenb() / (elembits / 8); - if (elemcount % packn == 0) - dst_elempack = packn; -#else - if (elemcount % 4 == 0) - dst_elempack = 4; -#endif - } - if (elembits == 16) - { -#if NCNN_ARM82 - if (elemcount % 8 == 0 && ncnn::cpu_support_arm_asimdhp() && opt.use_fp16_arithmetic) - 
dst_elempack = 8; - else if (elemcount % 4 == 0) - dst_elempack = 4; -#elif NCNN_RVV - const int packn = ncnn::cpu_riscv_vlenb() / 2; - if (elemcount % packn == 0) - dst_elempack = packn; -#else - if (elemcount % 4 == 0) - dst_elempack = 4; -#endif - } - if (elembits == 8) - { -#if NCNN_RVV - const int packn = ncnn::cpu_riscv_vlenb() / 1; - if (elemcount % packn == 0) - dst_elempack = packn; -#else - if (elemcount % 8 == 0) - dst_elempack = 8; -#endif - } - - if (flag & TEST_LAYER_ENABLE_FORCE_INPUT_PACK8) - dst_elempack = 8; - - ncnn::Mat a4_packed; - ncnn::convert_packing(a4, a4_packed, dst_elempack, opt); - a4 = a4_packed; - } - - if (op->support_inplace) - { - c = a4.clone(); - op->forward_inplace(c, opt); - } - else - { - op->forward(a4, c, opt); - } - - // clang-format off - // *INDENT-OFF* -#if NCNN_ARM82 - if (opt.use_fp16_storage && ncnn::cpu_support_arm_asimdhp() && op->support_fp16_storage && c.elembits() == 16) - { - ncnn::Mat c_fp32; - ncnn::cast_float16_to_float32(c, c_fp32, opt); - c = c_fp32; - } - else -#endif // NCNN_ARM82 -#if NCNN_RVV - if (opt.use_fp16_storage && ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh() && op->support_fp16_storage && c.elembits() == 16) - { - ncnn::Mat c_fp32; - ncnn::cast_float16_to_float32(c, c_fp32, opt); - c = c_fp32; - } - else -#endif // NCNN_RVV -#if NCNN_BF16 - if (opt.use_bf16_storage && op->support_bf16_storage && c.elembits() == 16) - { - ncnn::Mat c_fp32; - ncnn::cast_bfloat16_to_float32(c, c_fp32, opt); - c = c_fp32; - } - else -#endif // NCNN_BF16 - if (opt.use_fp16_storage && op->support_fp16_storage && c.elembits() == 16) - { - ncnn::Mat c_fp32; - ncnn::cast_float16_to_float32(c, c_fp32, opt); - c = c_fp32; - } - // *INDENT-ON* - // clang-format on - - op->destroy_pipeline(opt); - - delete op; - - return 0; -} - -#if NCNN_VULKAN -int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Option& _opt, const ncnn::Mat& a, ncnn::Mat& d, const ncnn::Mat& top_shape, void (*func)(ncnn::Layer*), int flag) -{ - if (!_opt.use_packing_layout) - { - // pack1 test is useless for gpu - return 233; - } - - ncnn::Layer* op = ncnn::create_layer_vulkan(typeindex); - - if (!op->support_vulkan) - { - delete op; - return 233; - } - - ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(); - - op->vkdev = vkdev; - - if (func) - { - (*func)((ncnn::Layer*)op); - } - - if (top_shape.dims) - { - op->bottom_shapes.resize(1); - op->top_shapes.resize(1); - op->bottom_shapes[0] = a; - op->top_shapes[0] = top_shape; - } - - op->load_param(pd); - - ncnn::ModelBinFromMatArray mb(weights.data()); - - op->load_model(mb); - - ncnn::VkWeightAllocator g_weight_vkallocator(vkdev); - ncnn::VkWeightStagingAllocator g_weight_staging_vkallocator(vkdev); - - ncnn::VkAllocator* blob_vkallocator = vkdev->acquire_blob_allocator(); - ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator(); - - ncnn::Option opt = _opt; - opt.num_threads = 1; - opt.use_vulkan_compute = true; - -#if __APPLE__ - opt.use_image_storage = false; -#endif - - opt.blob_vkallocator = blob_vkallocator; - opt.workspace_vkallocator = blob_vkallocator; - opt.staging_vkallocator = staging_vkallocator; - - if (!vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false; - if (!vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false; - if (!vkdev->info.support_fp16_arithmetic()) opt.use_fp16_arithmetic = false; - if (!vkdev->info.support_cooperative_matrix()) opt.use_cooperative_matrix = false; - - // FIXME fp16a may produce 
large error - opt.use_fp16_arithmetic = false; - - op->create_pipeline(opt); - - if (!op->support_vulkan) - { - op->destroy_pipeline(opt); - delete op; - return 233; - } - - { - ncnn::VkTransfer cmd(vkdev); - - ncnn::Option opt_upload = opt; - opt_upload.blob_vkallocator = &g_weight_vkallocator; - opt_upload.workspace_vkallocator = &g_weight_vkallocator; - opt_upload.staging_vkallocator = &g_weight_staging_vkallocator; - - op->upload_model(cmd, opt_upload); - - cmd.submit_and_wait(); - } - - { - // forward - ncnn::VkCompute cmd(vkdev); - - if (op->support_image_storage && opt.use_image_storage) - { - // upload - ncnn::VkImageMat a_gpu; - cmd.record_upload(a, a_gpu, opt); - - ncnn::VkImageMat d_gpu; - if (op->support_inplace) - { - op->forward_inplace(a_gpu, cmd, opt); - - d_gpu = a_gpu; - } - else - { - op->forward(a_gpu, d_gpu, cmd, opt); - } - - // download - cmd.record_download(d_gpu, d, opt); - } - else - { - // upload - ncnn::VkMat a_gpu; - cmd.record_upload(a, a_gpu, opt); - - ncnn::VkMat d_gpu; - if (op->support_inplace) - { - op->forward_inplace(a_gpu, cmd, opt); - - d_gpu = a_gpu; - } - else - { - op->forward(a_gpu, d_gpu, cmd, opt); - } - - // download - cmd.record_download(d_gpu, d, opt); - } - - cmd.submit_and_wait(); - } - - op->destroy_pipeline(opt); - - delete op; - - vkdev->reclaim_blob_allocator(blob_vkallocator); - vkdev->reclaim_staging_allocator(staging_vkallocator); - g_weight_vkallocator.clear(); - g_weight_staging_vkallocator.clear(); - - return 0; -} +int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Option& _opt, const std::vector& a, int top_blob_count, std::vector& d, const std::vector& top_shapes, void (*func)(ncnn::Layer*), int flag); #endif // NCNN_VULKAN -int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Option& _opt, const ncnn::Mat& a, const ncnn::Mat& top_shape = ncnn::Mat(), float epsilon = 0.001, void (*func)(ncnn::Layer*) = 0, int flag = 0) -{ - // naive - ncnn::Mat b; - { - int ret = test_layer_naive(typeindex, pd, weights, a, b, func, flag); - if (ret != 233 && ret != 0) - { - fprintf(stderr, "test_layer_naive failed\n"); - return -1; - } - } +int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Option& _opt, const std::vector& a, int top_blob_count, const std::vector& top_shapes = std::vector(), float epsilon = 0.001, void (*func)(ncnn::Layer*) = 0, int flag = 0); - // cpu - { - ncnn::Mat c; - int ret = test_layer_cpu(typeindex, pd, weights, _opt, a, c, ncnn::Mat(), func, flag); - if (ret != 233 && (ret != 0 || CompareMat(b, c, epsilon) != 0)) - { - fprintf(stderr, "test_layer_cpu failed\n"); - return -1; - } - } +int test_layer_naive(int typeindex, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Mat& a, ncnn::Mat& b, void (*func)(ncnn::Layer*), int flag); - // cpu shape hint - { - ncnn::Mat c; - int ret = test_layer_cpu(typeindex, pd, weights, _opt, a, c, b, func, flag); - if (ret != 233 && (ret != 0 || CompareMat(b, c, epsilon) != 0)) - { - fprintf(stderr, "test_layer_cpu failed with shape hint\n"); - return -1; - } - } +int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Option& _opt, const ncnn::Mat& a, ncnn::Mat& c, const ncnn::Mat& top_shape, void (*func)(ncnn::Layer*), int flag); #if NCNN_VULKAN - // gpu - if (!(flag & TEST_LAYER_DISABLE_GPU_TESTING)) - { - ncnn::Mat d; - int ret = test_layer_gpu(typeindex, pd, weights, _opt, a, d, 
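// the empty ncnn::Mat() argument above means "no top_shape hint";
// the second gpu pass below repeats the run with the reference
// output b as the hint, so both pipeline paths get exercised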
ncnn::Mat(), func, flag); - if (ret != 233 && (ret != 0 || CompareMat(b, d, epsilon) != 0)) - { - fprintf(stderr, "test_layer_gpu failed\n"); - return -1; - } - } - - // gpu shape hint - if (!(flag & TEST_LAYER_DISABLE_GPU_TESTING)) - { - ncnn::Mat d; - int ret = test_layer_gpu(typeindex, pd, weights, _opt, a, d, b, func, flag); - if (ret != 233 && (ret != 0 || CompareMat(b, d, epsilon) != 0)) - { - fprintf(stderr, "test_layer_gpu failed with shape hint\n"); - return -1; - } - } +int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Option& _opt, const ncnn::Mat& a, ncnn::Mat& d, const ncnn::Mat& top_shape, void (*func)(ncnn::Layer*), int flag); #endif // NCNN_VULKAN - return 0; -} - -int test_layer_opt(const char* layer_type, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Option& opt, const std::vector& a, int top_blob_count = 1, float epsilon = 0.001, void (*func)(ncnn::Layer*) = 0, int flag = 0) -{ - // fp16 representation - std::vector a_fp16; - if (opt.use_bf16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING)) - { - a_fp16.resize(a.size()); - for (size_t j = 0; j < a.size(); j++) - { - ncnn::Mat tmp; - ncnn::cast_float32_to_bfloat16(a[j], tmp, opt); - ncnn::cast_bfloat16_to_float32(tmp, a_fp16[j], opt); - } - } - else if ((opt.use_fp16_packed || opt.use_fp16_storage) && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING)) - { - a_fp16.resize(a.size()); - for (size_t j = 0; j < a.size(); j++) - { - ncnn::Mat tmp; - ncnn::cast_float32_to_float16(a[j], tmp, opt); - ncnn::cast_float16_to_float32(tmp, a_fp16[j], opt); - } - } - else - { - a_fp16 = a; - } - - std::vector weights_fp16; - float epsilon_fp16; - if (opt.use_bf16_storage) - { - weights_fp16.resize(weights.size()); - for (size_t j = 0; j < weights.size(); j++) - { - ncnn::Mat tmp; - ncnn::cast_float32_to_bfloat16(weights[j], tmp, opt); - ncnn::cast_bfloat16_to_float32(tmp, weights_fp16[j], opt); - } - epsilon_fp16 = epsilon * 100; // 0.1 - } - else if (opt.use_fp16_packed || opt.use_fp16_storage) - { - weights_fp16.resize(weights.size()); - for (size_t j = 0; j < weights.size(); j++) - { - ncnn::Mat tmp; - ncnn::cast_float32_to_float16(weights[j], tmp, opt); - ncnn::cast_float16_to_float32(tmp, weights_fp16[j], opt); - } - epsilon_fp16 = epsilon * 100; // 0.1 - } - else - { - weights_fp16 = weights; - epsilon_fp16 = epsilon; - } - - if (opt.use_fp16_arithmetic) - { - epsilon_fp16 = epsilon * 1000; // 1.0 - } - - std::vector top_shapes; - int ret = test_layer(ncnn::layer_to_index(layer_type), pd, weights_fp16, opt, a_fp16, top_blob_count, top_shapes, epsilon_fp16, func, flag); - if (ret != 0) - { - fprintf(stderr, "test_layer %s failed use_packing_layout=%d use_fp16_packed=%d use_fp16_storage=%d use_fp16_arithmetic=%d use_shader_pack8=%d use_bf16_storage=%d use_image_storage=%d use_sgemm_convolution=%d use_winograd_convolution=%d\n", layer_type, opt.use_packing_layout, opt.use_fp16_packed, opt.use_fp16_storage, opt.use_fp16_arithmetic, opt.use_shader_pack8, opt.use_bf16_storage, opt.use_image_storage, opt.use_sgemm_convolution, opt.use_winograd_convolution); - return ret; - } - - return 0; -} - -int test_layer_opt(const char* layer_type, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Option& opt, const ncnn::Mat& a, float epsilon = 0.001, void (*func)(ncnn::Layer*) = 0, int flag = 0) -{ - // fp16 representation - ncnn::Mat a_fp16; - if (opt.use_bf16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING)) - { - ncnn::Mat tmp; - 
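// round-trip the input once through the reduced precision
// (fp32 -> bf16 -> fp32) so the fp32 reference path and the bf16
// path start from identical representable values, and CompareMat
// measures only the layer's own error, not input quantization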
ncnn::cast_float32_to_bfloat16(a, tmp, opt); - ncnn::cast_bfloat16_to_float32(tmp, a_fp16, opt); - } - else if ((opt.use_fp16_packed || opt.use_fp16_storage) && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING)) - { - ncnn::Mat tmp; - ncnn::cast_float32_to_float16(a, tmp, opt); - ncnn::cast_float16_to_float32(tmp, a_fp16, opt); - } - else - { - a_fp16 = a; - } - - std::vector weights_fp16; - float epsilon_fp16; - if (opt.use_bf16_storage) - { - weights_fp16.resize(weights.size()); - for (size_t j = 0; j < weights.size(); j++) - { - ncnn::Mat tmp; - ncnn::cast_float32_to_bfloat16(weights[j], tmp, opt); - ncnn::cast_bfloat16_to_float32(tmp, weights_fp16[j], opt); - } - epsilon_fp16 = epsilon * 100; // 0.1 - } - else if (opt.use_fp16_packed || opt.use_fp16_storage) - { - weights_fp16.resize(weights.size()); - for (size_t j = 0; j < weights.size(); j++) - { - ncnn::Mat tmp; - ncnn::cast_float32_to_float16(weights[j], tmp, opt); - ncnn::cast_float16_to_float32(tmp, weights_fp16[j], opt); - } - epsilon_fp16 = epsilon * 100; // 0.1 - } - else - { - weights_fp16 = weights; - epsilon_fp16 = epsilon; - } - - if (opt.use_fp16_arithmetic) - { - epsilon_fp16 = epsilon * 1000; // 1.0 - } - - ncnn::Mat top_shape; - int ret = test_layer(ncnn::layer_to_index(layer_type), pd, weights_fp16, opt, a_fp16, top_shape, epsilon_fp16, func, flag); - if (ret != 0) - { - fprintf(stderr, "test_layer %s failed use_packing_layout=%d use_fp16_packed=%d use_fp16_storage=%d use_fp16_arithmetic=%d use_shader_pack8=%d use_bf16_storage=%d use_image_storage=%d use_sgemm_convolution=%d use_winograd_convolution=%d\n", layer_type, opt.use_packing_layout, opt.use_fp16_packed, opt.use_fp16_storage, opt.use_fp16_arithmetic, opt.use_shader_pack8, opt.use_bf16_storage, opt.use_image_storage, opt.use_sgemm_convolution, opt.use_winograd_convolution); - return ret; - } - - return 0; -} - -int test_layer(const char* layer_type, const ncnn::ParamDict& pd, const std::vector& weights, const std::vector& a, int top_blob_count = 1, float epsilon = 0.001, void (*func)(ncnn::Layer*) = 0, int flag = 0) -{ - // pack fp16p fp16s fp16a bf16s shader8 image - const int options[][7] = { - {0, 0, 0, 0, 0, 0, 0}, - {0, 0, 1, 0, 0, 0, 0}, - {0, 0, 1, 1, 1, 0, 0}, - {1, 0, 0, 0, 0, 0, 0}, - {1, 1, 0, 0, 1, 0, 0}, - {1, 0, 1, 0, 0, 1, 0}, - {1, 1, 1, 1, 0, 0, 0}, - {1, 1, 1, 1, 1, 1, 1}, - }; - - const int opt_count = sizeof(options) / sizeof(options[0]); - - for (int i = 0; i < opt_count; i++) - { - ncnn::Option opt; - opt.num_threads = 1; - opt.use_packing_layout = options[i][0]; - opt.use_fp16_packed = options[i][1]; - opt.use_fp16_storage = options[i][2]; - opt.use_fp16_arithmetic = options[i][3]; - opt.use_bf16_storage = options[i][4]; - opt.use_shader_pack8 = options[i][5]; - opt.use_image_storage = options[i][6]; - - int ret = test_layer_opt(layer_type, pd, weights, opt, a, top_blob_count, epsilon, func, flag); - if (ret != 0) - return ret; - } - - return 0; -} - -int test_layer(const char* layer_type, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Mat& a, float epsilon = 0.001, void (*func)(ncnn::Layer*) = 0, int flag = 0) -{ - // pack fp16p fp16s fp16a bf16s shader8 image - const int options[][7] = { - {0, 0, 0, 0, 0, 0, 0}, - {0, 0, 1, 0, 0, 0, 0}, - {0, 0, 1, 1, 1, 0, 0}, - {1, 0, 0, 0, 0, 0, 0}, - {1, 1, 0, 0, 1, 0, 0}, - {1, 0, 1, 0, 0, 1, 0}, - {1, 1, 1, 1, 0, 0, 0}, - {1, 1, 1, 1, 1, 1, 1}, - }; +int test_layer(int typeindex, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Option& _opt, const ncnn::Mat& a, 
const ncnn::Mat& top_shape = ncnn::Mat(), float epsilon = 0.001, void (*func)(ncnn::Layer*) = 0, int flag = 0); - const int opt_count = sizeof(options) / sizeof(options[0]); +int test_layer_opt(const char* layer_type, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Option& opt, const std::vector& a, int top_blob_count = 1, float epsilon = 0.001, void (*func)(ncnn::Layer*) = 0, int flag = 0); - for (int i = 0; i < opt_count; i++) - { - ncnn::Option opt; - opt.num_threads = 1; - opt.use_packing_layout = options[i][0]; - opt.use_fp16_packed = options[i][1]; - opt.use_fp16_storage = options[i][2]; - opt.use_fp16_arithmetic = options[i][3]; - opt.use_bf16_storage = options[i][4]; - opt.use_shader_pack8 = options[i][5]; - opt.use_image_storage = options[i][6]; +int test_layer_opt(const char* layer_type, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Option& opt, const ncnn::Mat& a, float epsilon = 0.001, void (*func)(ncnn::Layer*) = 0, int flag = 0); - int ret = test_layer_opt(layer_type, pd, weights, opt, a, epsilon, func, flag); - if (ret != 0) - return ret; - } +int test_layer(const char* layer_type, const ncnn::ParamDict& pd, const std::vector& weights, const std::vector& a, int top_blob_count = 1, float epsilon = 0.001, void (*func)(ncnn::Layer*) = 0, int flag = 0); - return 0; -} +int test_layer(const char* layer_type, const ncnn::ParamDict& pd, const std::vector& weights, const ncnn::Mat& a, float epsilon = 0.001, void (*func)(ncnn::Layer*) = 0, int flag = 0); #endif // TESTUTIL_H From 0d50b4967bef4ccf7ce4206282c7b5038b6cf620 Mon Sep 17 00:00:00 2001 From: nihuini Date: Wed, 3 Jan 2024 15:44:50 +0800 Subject: [PATCH 05/19] fix build --- src/layer/convolution.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/layer/convolution.cpp b/src/layer/convolution.cpp index bf83a69a100..ba9eaef8752 100644 --- a/src/layer/convolution.cpp +++ b/src/layer/convolution.cpp @@ -116,8 +116,6 @@ int Convolution::load_model(const ModelBin& mb) weight_data = weight_data_int8.reshape(weight_data_size); } -#else - (void)(opt); #endif // NCNN_INT8 return 0; From 624d46d2c3e16f3fa2b7d17ce76ed7f1402c9f3e Mon Sep 17 00:00:00 2001 From: nihuini Date: Wed, 3 Jan 2024 16:16:54 +0800 Subject: [PATCH 06/19] bring test cast packing --- tests/CMakeLists.txt | 4 +- tests/test_cast.cpp | 27 +++++------ tests/test_packing.cpp | 107 +++++++++++++++++------------------------ 3 files changed, 57 insertions(+), 81 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 4ce231fbd2a..d30229b870c 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -74,7 +74,7 @@ ncnn_add_layer_test(BatchNorm) ncnn_add_layer_test(Bias) ncnn_add_layer_test(BinaryOp) ncnn_add_layer_test(BNLL) -# ncnn_add_layer_test(Cast) +ncnn_add_layer_test(Cast) ncnn_add_layer_test(CELU) ncnn_add_layer_test(Clip) ncnn_add_layer_test(Concat) @@ -125,7 +125,7 @@ ncnn_add_layer_test(Mish) ncnn_add_layer_test(MultiHeadAttention) ncnn_add_layer_test(Noop) ncnn_add_layer_test(Normalize) -# ncnn_add_layer_test(Packing) +ncnn_add_layer_test(Packing) ncnn_add_layer_test(Padding) ncnn_add_layer_test(Permute) ncnn_add_layer_test(PixelShuffle) diff --git a/tests/test_cast.cpp b/tests/test_cast.cpp index fb1f9399bd4..fa44f06bc8c 100644 --- a/tests/test_cast.cpp +++ b/tests/test_cast.cpp @@ -24,9 +24,6 @@ static int cast_cpu_naive(const ncnn::Mat& a, ncnn::Mat& b, int type_from, int t ncnn::Option opt; opt.num_threads = 1; - opt.use_vulkan_compute = false; - opt.use_int8_inference = false; - 
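// presumably dead code once the op comes from create_layer_naive:
// the plain reference Cast never consults the vulkan/int8/packing
// toggles, so only num_threads is kept for the reference run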
opt.use_packing_layout = false; ncnn::Layer* op = ncnn::create_layer_naive("Cast"); @@ -61,7 +58,7 @@ static int test_cast_cpu(const ncnn::Mat& a, int type_from, int type_to) opt.use_int8_inference = false; opt.use_packing_layout = false; - ncnn::Layer* op = ncnn::create_layer("Cast"); + ncnn::Layer* op = ncnn::create_layer_cpu("Cast"); op->load_param(pd); @@ -75,7 +72,7 @@ static int test_cast_cpu(const ncnn::Mat& a, int type_from, int type_to) cast_cpu_naive(a, a_fp16, 1, type_from); ncnn::Mat b; - ((ncnn::Cast*)op)->ncnn::Cast::forward(a_fp16, b, opt); + cast_cpu_naive(a_fp16, b, type_from, type_to); ncnn::Mat c; op->forward(a_fp16, c, opt); @@ -106,7 +103,7 @@ static int test_cast_cpu_packed(const ncnn::Mat& a, int type_from, int type_to) opt.use_vulkan_compute = false; opt.use_packing_layout = false; - ncnn::Layer* op = ncnn::create_layer("Cast"); + ncnn::Layer* op = ncnn::create_layer_cpu("Cast"); op->load_param(pd); @@ -120,7 +117,7 @@ static int test_cast_cpu_packed(const ncnn::Mat& a, int type_from, int type_to) cast_cpu_naive(a, a_fp16, 1, type_from); ncnn::Mat b; - ((ncnn::Cast*)op)->ncnn::Cast::forward(a_fp16, b, opt); + cast_cpu_naive(a_fp16, b, type_from, type_to); ncnn::Mat a4; ncnn::convert_packing(a, a4, 4, opt); @@ -179,7 +176,7 @@ static int test_cast_gpu_fp16p(const ncnn::Mat& a, int type_from, int type_to) if (!vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false; if (!vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false; - ncnn::Layer* op = ncnn::create_layer("Cast"); + ncnn::Layer* op = ncnn::create_layer_vulkan("Cast"); op->vkdev = vkdev; @@ -202,7 +199,7 @@ static int test_cast_gpu_fp16p(const ncnn::Mat& a, int type_from, int type_to) } ncnn::Mat b; - ((ncnn::Cast*)op)->ncnn::Cast::forward(a_fp16, b, opt); + cast_cpu_naive(a_fp16, b, type_from, type_to); ncnn::Mat d; @@ -295,7 +292,7 @@ static int test_cast_gpu_fp16p_pack8(const ncnn::Mat& a, int type_from, int type if (!vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false; if (!vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false; - ncnn::Layer* op = ncnn::create_layer("Cast"); + ncnn::Layer* op = ncnn::create_layer_vulkan("Cast"); op->vkdev = vkdev; @@ -318,7 +315,7 @@ static int test_cast_gpu_fp16p_pack8(const ncnn::Mat& a, int type_from, int type } ncnn::Mat b; - ((ncnn::Cast*)op)->ncnn::Cast::forward(a_fp16, b, opt); + cast_cpu_naive(a_fp16, b, type_from, type_to); ncnn::Mat d; @@ -412,7 +409,7 @@ static int test_cast_gpu_image_fp16p(const ncnn::Mat& a, int type_from, int type if (!vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false; if (!vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false; - ncnn::Layer* op = ncnn::create_layer("Cast"); + ncnn::Layer* op = ncnn::create_layer_vulkan("Cast"); op->vkdev = vkdev; @@ -435,7 +432,7 @@ static int test_cast_gpu_image_fp16p(const ncnn::Mat& a, int type_from, int type } ncnn::Mat b; - ((ncnn::Cast*)op)->ncnn::Cast::forward(a_fp16, b, opt); + cast_cpu_naive(a_fp16, b, type_from, type_to); ncnn::Mat d; @@ -528,7 +525,7 @@ static int test_cast_gpu_image_fp16p_pack8(const ncnn::Mat& a, int type_from, in if (!vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false; if (!vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false; - ncnn::Layer* op = ncnn::create_layer("Cast"); + ncnn::Layer* op = ncnn::create_layer_vulkan("Cast"); op->vkdev = vkdev; @@ -551,7 +548,7 @@ static int test_cast_gpu_image_fp16p_pack8(const ncnn::Mat& a, int type_from, in } ncnn::Mat b; - 
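// same substitution as in the earlier cast tests: instead of casting
// the device op back to its base class and forcing the reference
// ncnn::Cast::forward through it, the expected output b now comes
// from an independently created naive layer, which keeps the
// reference path decoupled from how the final classes are composed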
((ncnn::Cast*)op)->ncnn::Cast::forward(a_fp16, b, opt); + cast_cpu_naive(a_fp16, b, type_from, type_to); ncnn::Mat d; diff --git a/tests/test_packing.cpp b/tests/test_packing.cpp index 84652d9e0b7..da921dc50ff 100644 --- a/tests/test_packing.cpp +++ b/tests/test_packing.cpp @@ -14,6 +14,35 @@ #include "testutil.h" +static int packing_cpu_naive(const ncnn::Mat& a, ncnn::Mat& b, int out_elempack) +{ + ncnn::ParamDict pd; + pd.set(0, out_elempack); + + std::vector weights(0); + + ncnn::Option opt; + opt.num_threads = 1; + + ncnn::Layer* op = ncnn::create_layer_naive("Packing"); + + op->load_param(pd); + + ncnn::ModelBinFromMatArray mb(weights.data()); + + op->load_model(mb); + + op->create_pipeline(opt); + + op->forward(a, b, opt); + + op->destroy_pipeline(opt); + + delete op; + + return 0; +} + static int test_packing_cpu_fp32(const ncnn::Mat& a, int in_elempack, int out_elempack) { ncnn::ParamDict pd; @@ -29,7 +58,7 @@ static int test_packing_cpu_fp32(const ncnn::Mat& a, int in_elempack, int out_el opt.use_fp16_arithmetic = false; opt.use_packing_layout = false; - ncnn::Layer* op = ncnn::create_layer("Packing"); + ncnn::Layer* op = ncnn::create_layer_cpu("Packing"); op->load_param(pd); @@ -43,7 +72,7 @@ static int test_packing_cpu_fp32(const ncnn::Mat& a, int in_elempack, int out_el ncnn::convert_packing(a, ap, in_elempack, opt); ncnn::Mat b; - ((ncnn::Packing*)op)->ncnn::Packing::forward(ap, b, opt); + packing_cpu_naive(ap, b, out_elempack); ncnn::Mat c; op->forward(ap, c, opt); @@ -76,7 +105,7 @@ static int test_packing_cpu_fp16(const ncnn::Mat& a, int in_elempack, int out_el opt.use_fp16_arithmetic = true; opt.use_packing_layout = false; - ncnn::Layer* op = ncnn::create_layer("Packing"); + ncnn::Layer* op = ncnn::create_layer_cpu("Packing"); if (!op->support_fp16_storage) { @@ -99,7 +128,7 @@ static int test_packing_cpu_fp16(const ncnn::Mat& a, int in_elempack, int out_el ncnn::convert_packing(a16, ap, in_elempack, opt); ncnn::Mat b; - ((ncnn::Packing*)op)->ncnn::Packing::forward(ap, b, opt); + packing_cpu_naive(ap, b, out_elempack); ncnn::Mat c; op->forward(ap, c, opt); @@ -135,7 +164,7 @@ static int test_packing_cpu_int8(const ncnn::Mat& a, int in_elempack, int out_el opt.use_fp16_arithmetic = false; opt.use_packing_layout = false; - ncnn::Layer* op = ncnn::create_layer("Packing"); + ncnn::Layer* op = ncnn::create_layer_cpu("Packing"); op->load_param(pd); @@ -155,7 +184,7 @@ static int test_packing_cpu_int8(const ncnn::Mat& a, int in_elempack, int out_el ncnn::convert_packing(a8, ap, in_elempack, opt); ncnn::Mat b; - ((ncnn::Packing*)op)->ncnn::Packing::forward(ap, b, opt); + packing_cpu_naive(ap, b, out_elempack); ncnn::Mat c; op->forward(ap, c, opt); @@ -225,7 +254,7 @@ static int test_packing_gpu_buffer(const ncnn::Mat& a, int in_elempack, int out_ if (!vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false; if (!vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false; - ncnn::Layer* op = ncnn::create_layer("Packing"); + ncnn::Layer* op = ncnn::create_layer_vulkan("Packing"); op->vkdev = vkdev; @@ -241,7 +270,7 @@ static int test_packing_gpu_buffer(const ncnn::Mat& a, int in_elempack, int out_ ncnn::convert_packing(a, ap, in_elempack, opt); ncnn::Mat b; - ((ncnn::Packing*)op)->ncnn::Packing::forward(ap, b, opt); + packing_cpu_naive(ap, b, out_elempack); ncnn::Mat d; @@ -312,7 +341,7 @@ static int test_packing_gpu_image(const ncnn::Mat& a, int in_elempack, int out_e if (!vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false; if 
(!vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false; - ncnn::Layer* op = ncnn::create_layer("Packing"); + ncnn::Layer* op = ncnn::create_layer_vulkan("Packing"); op->vkdev = vkdev; @@ -328,7 +357,7 @@ static int test_packing_gpu_image(const ncnn::Mat& a, int in_elempack, int out_e ncnn::convert_packing(a, ap, in_elempack, opt); ncnn::Mat b; - ((ncnn::Packing*)op)->ncnn::Packing::forward(ap, b, opt); + packing_cpu_naive(ap, b, out_elempack); ncnn::Mat d; @@ -365,15 +394,6 @@ static int test_packing_gpu_image(const ncnn::Mat& a, int in_elempack, int out_e static int test_packing_gpu_buffer2image(const ncnn::Mat& a, int in_elempack, int out_elempack) { - ncnn::ParamDict pd; - pd.set(0, out_elempack); - pd.set(2, 1); // cast_type_from - pd.set(3, 1); // cast_type_to - pd.set(4, 0); // storage_type_from - pd.set(5, 1); // storage_type_to - - std::vector weights(0); - ncnn::Option opt; opt.num_threads = 1; opt.use_vulkan_compute = true; @@ -399,23 +419,11 @@ static int test_packing_gpu_buffer2image(const ncnn::Mat& a, int in_elempack, in if (!vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false; if (!vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false; - ncnn::Packing_vulkan* op = new ncnn::Packing_vulkan; - - op->vkdev = vkdev; - - op->load_param(pd); - - ncnn::ModelBinFromMatArray mb(weights.data()); - - op->load_model(mb); - - op->create_pipeline(opt); - ncnn::Mat ap; ncnn::convert_packing(a, ap, in_elempack, opt); ncnn::Mat b; - ((ncnn::Packing*)op)->ncnn::Packing::forward(ap, b, opt); + packing_cpu_naive(ap, b, out_elempack); ncnn::Mat d; @@ -427,17 +435,13 @@ static int test_packing_gpu_buffer2image(const ncnn::Mat& a, int in_elempack, in cmd.record_clone(ap, a_gpu, opt); ncnn::VkImageMat d_gpu; - op->forward(a_gpu, d_gpu, cmd, opt); + vkdev->convert_packing(a_gpu, d_gpu, out_elempack, cmd, opt); // download cmd.record_clone(d_gpu, d, opt); cmd.submit_and_wait(); - op->destroy_pipeline(opt); - - delete op; - vkdev->reclaim_blob_allocator(blob_vkallocator); vkdev->reclaim_staging_allocator(staging_vkallocator); @@ -452,15 +456,6 @@ static int test_packing_gpu_buffer2image(const ncnn::Mat& a, int in_elempack, in static int test_packing_gpu_image2buffer(const ncnn::Mat& a, int in_elempack, int out_elempack) { - ncnn::ParamDict pd; - pd.set(0, out_elempack); - pd.set(2, 1); // cast_type_from - pd.set(3, 1); // cast_type_to - pd.set(4, 1); // storage_type_from - pd.set(5, 0); // storage_type_to - - std::vector weights(0); - ncnn::Option opt; opt.num_threads = 1; opt.use_vulkan_compute = true; @@ -486,23 +481,11 @@ static int test_packing_gpu_image2buffer(const ncnn::Mat& a, int in_elempack, in if (!vkdev->info.support_fp16_packed()) opt.use_fp16_packed = false; if (!vkdev->info.support_fp16_storage()) opt.use_fp16_storage = false; - ncnn::Packing_vulkan* op = new ncnn::Packing_vulkan; - - op->vkdev = vkdev; - - op->load_param(pd); - - ncnn::ModelBinFromMatArray mb(weights.data()); - - op->load_model(mb); - - op->create_pipeline(opt); - ncnn::Mat ap; ncnn::convert_packing(a, ap, in_elempack, opt); ncnn::Mat b; - ((ncnn::Packing*)op)->ncnn::Packing::forward(ap, b, opt); + packing_cpu_naive(ap, b, out_elempack); ncnn::Mat d; @@ -514,17 +497,13 @@ static int test_packing_gpu_image2buffer(const ncnn::Mat& a, int in_elempack, in cmd.record_clone(ap, a_gpu, opt); ncnn::VkMat d_gpu; - op->forward(a_gpu, d_gpu, cmd, opt); + vkdev->convert_packing(a_gpu, d_gpu, out_elempack, cmd, opt); // download cmd.record_clone(d_gpu, d, opt); cmd.submit_and_wait(); - 
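// with the explicit Packing_vulkan object gone from this test,
// vkdev->convert_packing() records the conversion directly and there
// is no pipeline left to tear down, hence the removals just below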
op->destroy_pipeline(opt); - - delete op; - vkdev->reclaim_blob_allocator(blob_vkallocator); vkdev->reclaim_staging_allocator(staging_vkallocator); From 3851f79db7b790312be3a18a46d6c792f610ca43 Mon Sep 17 00:00:00 2001 From: nihuini Date: Wed, 3 Jan 2024 17:06:56 +0800 Subject: [PATCH 07/19] fix test int8 winograd43 --- tests/test_convolution_3.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/test_convolution_3.cpp b/tests/test_convolution_3.cpp index 5e40cb59c11..d8d34818688 100644 --- a/tests/test_convolution_3.cpp +++ b/tests/test_convolution_3.cpp @@ -167,6 +167,14 @@ static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int ncnn::Mat weight_scales = scales_mat(weights[0], outch, c * kernel * kernel, c * kernel * kernel); ncnn::Mat input_scales = scales_mat(a, 1, w * h * c, a.cstep); ncnn::Mat top_scales = requant ? scales_mat(a, 1, w * h * c, a.cstep) : ncnn::Mat(); + + if (kernel == 3 && dilation == 1 && stride == 1) + { + // test for 6bit quant + for (int i = 0; i < weight_scales.w; i++) + weight_scales[i] = weight_scales[i] / 4.f; + } + if (bias) { weights[1] = RandomMat(outch); From 7ea4abe98ca6588607768c9ebc84475e8fb0b576 Mon Sep 17 00:00:00 2001 From: nihuini Date: Wed, 3 Jan 2024 17:45:00 +0800 Subject: [PATCH 08/19] fix rand --- tests/testutil.cpp | 13 +++++++++++++ tests/testutil.h | 11 +++++------ 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/tests/testutil.cpp b/tests/testutil.cpp index eb4d4fb1ca9..86599bebd51 100644 --- a/tests/testutil.cpp +++ b/tests/testutil.cpp @@ -17,6 +17,7 @@ #include "cpu.h" #include "layer.h" #include "mat.h" +#include "prng.h" #include #include @@ -26,6 +27,18 @@ #include "gpu.h" #endif // NCNN_VULKAN +static struct prng_rand_t g_prng_rand_state; + +void SRAND(int seed) +{ + prng_srand(seed, &g_prng_rand_state); +} + +uint64_t RAND() +{ + return prng_rand(&g_prng_rand_state); +} + float RandomFloat(float a, float b) { float random = ((float)RAND()) / (float)uint64_t(-1); //RAND_MAX; diff --git a/tests/testutil.h b/tests/testutil.h index 88e1ce2e160..12f9d0daa65 100644 --- a/tests/testutil.h +++ b/tests/testutil.h @@ -18,21 +18,20 @@ #include "cpu.h" #include "layer.h" #include "mat.h" -#include "prng.h" #include +#include #include -static struct prng_rand_t g_prng_rand_state; - -#define SRAND(seed) prng_srand(seed, &g_prng_rand_state) -#define RAND() prng_rand(&g_prng_rand_state) - #define TEST_LAYER_DISABLE_AUTO_INPUT_PACKING (1 << 0) #define TEST_LAYER_DISABLE_AUTO_INPUT_CASTING (1 << 1) #define TEST_LAYER_DISABLE_GPU_TESTING (1 << 2) #define TEST_LAYER_ENABLE_FORCE_INPUT_PACK8 (1 << 3) +void SRAND(int seed); + +uint64_t RAND(); + float RandomFloat(float a = -1.2f, float b = 1.2f); int RandomInt(int a = -10000, int b = 10000); From 6c046ba5a814dd448dc2227eaed931cec58cee08 Mon Sep 17 00:00:00 2001 From: nihuini Date: Wed, 3 Jan 2024 19:17:42 +0800 Subject: [PATCH 09/19] fix --- tests/testutil.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/testutil.cpp b/tests/testutil.cpp index 86599bebd51..b5f2fd34148 100644 --- a/tests/testutil.cpp +++ b/tests/testutil.cpp @@ -625,6 +625,10 @@ int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vectorsupport_vulkan) { @@ -1121,6 +1125,10 @@ int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vectorsupport_vulkan) { From 8a29df6b986997baf1e5cefc5002e628cf222594 Mon Sep 17 00:00:00 2001 From: nihuini Date: Thu, 4 Jan 2024 11:14:07 +0800 Subject: [PATCH 10/19] decouple cpu vulkan --- 
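The whole patch is one mechanical substitution: every helper layer that a host layer builds internally now pins its backend explicitly instead of going through the generic create_layer(). A minimal sketch of the two patterns, assuming the create_layer_cpu/create_layer_vulkan factories from patch 01 and a vulkan-enabled build; make_helper_gemm_* are illustrative names, not ncnn API:

    #include "layer.h"
    #include "layer_type.h"

    static ncnn::Layer* make_helper_gemm_cpu()
    {
        // cpu host layers pin internal helpers to the cpu factory
        ncnn::Layer* gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
        ncnn::ParamDict pd;
        pd.set(2, 0); // transA
        pd.set(3, 1); // transB
        gemm->load_param(pd);
        return gemm;
    }

    #if NCNN_VULKAN
    static ncnn::Layer* make_helper_gemm_vulkan(const ncnn::VulkanDevice* vkdev)
    {
        // vulkan host layers pin theirs to the vulkan factory and must
        // hand over the device before load_param()/create_pipeline()
        ncnn::Layer* gemm = ncnn::create_layer_vulkan(ncnn::LayerType::Gemm);
        gemm->vkdev = vkdev;
        return gemm;
    }
    #endif // NCNN_VULKAN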
src/layer/arm/convolution1d_arm.cpp | 2 +- src/layer/arm/convolution_arm.cpp | 4 ++-- src/layer/arm/convolutiondepthwise_arm.cpp | 4 ++-- src/layer/arm/deconvolution_arm.cpp | 4 ++-- src/layer/arm/deconvolution_arm_asimdhp.cpp | 2 +- src/layer/arm/deconvolutiondepthwise_arm.cpp | 4 ++-- .../arm/deconvolutiondepthwise_arm_asimdhp.cpp | 2 +- src/layer/arm/innerproduct_arm.cpp | 2 +- src/layer/arm/matmul_arm.cpp | 2 +- src/layer/arm/multiheadattention_arm.cpp | 14 +++++++------- src/layer/convolution.cpp | 2 +- src/layer/fused_activation.h | 12 ++++++------ src/layer/loongarch/convolution1d_loongarch.cpp | 2 +- src/layer/loongarch/convolution_loongarch.cpp | 2 +- .../loongarch/convolutiondepthwise_loongarch.cpp | 4 ++-- src/layer/loongarch/deconvolution_loongarch.cpp | 2 +- .../loongarch/deconvolutiondepthwise_loongarch.cpp | 4 ++-- src/layer/loongarch/innerproduct_loongarch.cpp | 2 +- src/layer/mips/convolution1d_mips.cpp | 2 +- src/layer/mips/convolution_mips.cpp | 2 +- src/layer/mips/convolutiondepthwise_mips.cpp | 4 ++-- src/layer/mips/deconvolution_mips.cpp | 2 +- src/layer/mips/deconvolutiondepthwise_mips.cpp | 4 ++-- src/layer/mips/innerproduct_mips.cpp | 2 +- src/layer/riscv/convolution1d_riscv.cpp | 2 +- src/layer/riscv/convolution_riscv.cpp | 2 +- src/layer/riscv/convolutiondepthwise_riscv.cpp | 4 ++-- src/layer/riscv/deconvolution_riscv.cpp | 2 +- src/layer/riscv/deconvolutiondepthwise_riscv.cpp | 4 ++-- src/layer/riscv/innerproduct_riscv.cpp | 2 +- src/layer/vulkan/convolution1d_vulkan.cpp | 2 +- src/layer/vulkan/convolution_vulkan.cpp | 6 +++--- src/layer/vulkan/convolutiondepthwise_vulkan.cpp | 2 +- src/layer/vulkan/deconvolution_vulkan.cpp | 4 ++-- src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp | 4 ++-- src/layer/vulkan/innerproduct_vulkan.cpp | 2 +- src/layer/vulkan/multiheadattention_vulkan.cpp | 10 +++++----- src/layer/vulkan/pooling_vulkan.cpp | 2 +- src/layer/vulkan/reshape_vulkan.cpp | 12 ++++++------ src/layer/x86/convolution1d_x86.cpp | 2 +- src/layer/x86/convolution_x86.cpp | 6 +++--- src/layer/x86/convolutiondepthwise_x86.cpp | 4 ++-- src/layer/x86/deconvolution_x86.cpp | 4 ++-- src/layer/x86/deconvolutiondepthwise_x86.cpp | 4 ++-- src/layer/x86/deformableconv2d_x86.cpp | 2 +- src/layer/x86/innerproduct_x86.cpp | 2 +- src/layer/x86/matmul_x86.cpp | 2 +- src/layer/x86/multiheadattention_x86.cpp | 14 +++++++------- src/layer/yolodetectionoutput.cpp | 2 +- src/layer/yolov3detectionoutput.cpp | 2 +- 50 files changed, 95 insertions(+), 95 deletions(-) diff --git a/src/layer/arm/convolution1d_arm.cpp b/src/layer/arm/convolution1d_arm.cpp index 48368fb9cc6..1bfb375e188 100644 --- a/src/layer/arm/convolution1d_arm.cpp +++ b/src/layer/arm/convolution1d_arm.cpp @@ -196,7 +196,7 @@ int Convolution1D_arm::forward(const std::vector& bottom_blobs, std::vector bias_data_flattened.elempack = 1; } - ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution1D); + ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution1D); ncnn::ParamDict pd; pd.set(0, _num_output); diff --git a/src/layer/arm/convolution_arm.cpp b/src/layer/arm/convolution_arm.cpp index cde8c216873..cfcd28c09f4 100644 --- a/src/layer/arm/convolution_arm.cpp +++ b/src/layer/arm/convolution_arm.cpp @@ -157,7 +157,7 @@ int Convolution_arm::create_pipeline(const Option& opt) if ((!support_packing || !opt.use_packing_layout) && !opt.use_bf16_storage && kernel_w == kernel_h && dilation_w != 1 && dilation_h == dilation_w && stride_w == 1 && stride_h == 1) { - convolution_dilation1 = 
ncnn::create_layer(ncnn::LayerType::Convolution); + convolution_dilation1 = ncnn::create_layer_cpu(ncnn::LayerType::Convolution); // set param ncnn::ParamDict pd; @@ -807,7 +807,7 @@ int Convolution_arm::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std: bias_data_flattened.elempack = 1; } - ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::ConvolutionDepthWise); + ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::ConvolutionDepthWise); ncnn::ParamDict pd; pd.set(0, _num_output); diff --git a/src/layer/arm/deconvolution_arm.cpp b/src/layer/arm/deconvolution_arm.cpp index ef8a56f9931..9061d59eafe 100644 --- a/src/layer/arm/deconvolution_arm.cpp +++ b/src/layer/arm/deconvolution_arm.cpp @@ -85,7 +85,7 @@ int Deconvolution_arm::create_pipeline(const Option& opt) { const int maxk = kernel_w * kernel_h; - gemm = ncnn::create_layer(ncnn::LayerType::Gemm); + gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm); ncnn::ParamDict pd; pd.set(2, 1); // transA @@ -851,7 +851,7 @@ int Deconvolution_arm::forward(const std::vector& bottom_blobs, std::vector bias_data_flattened.elempack = 1; } - ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution); + ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Deconvolution); ncnn::ParamDict pd; pd.set(0, _num_output); diff --git a/src/layer/arm/deconvolution_arm_asimdhp.cpp b/src/layer/arm/deconvolution_arm_asimdhp.cpp index c98ba40309b..a12614b4d97 100644 --- a/src/layer/arm/deconvolution_arm_asimdhp.cpp +++ b/src/layer/arm/deconvolution_arm_asimdhp.cpp @@ -45,7 +45,7 @@ int Deconvolution_arm::create_pipeline_fp16s(const Option& opt) { const int maxk = kernel_w * kernel_h; - gemm = ncnn::create_layer(ncnn::LayerType::Gemm); + gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm); ncnn::ParamDict pd; pd.set(2, 1); // transA diff --git a/src/layer/arm/deconvolutiondepthwise_arm.cpp b/src/layer/arm/deconvolutiondepthwise_arm.cpp index 478bd1740dc..6edd735cb97 100644 --- a/src/layer/arm/deconvolutiondepthwise_arm.cpp +++ b/src/layer/arm/deconvolutiondepthwise_arm.cpp @@ -148,7 +148,7 @@ int DeconvolutionDepthWise_arm::create_pipeline(const Option& opt) if (bias_term) bias_data_g = bias_data.range(num_output_g * g, num_output_g); - ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution); + ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Deconvolution); // set param ncnn::ParamDict pd; @@ -562,7 +562,7 @@ int DeconvolutionDepthWise_arm::forward(const std::vector& bottom_blobs, st bias_data_flattened.elempack = 1; } - ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::DeconvolutionDepthWise); + ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::DeconvolutionDepthWise); ncnn::ParamDict pd; pd.set(0, _num_output); diff --git a/src/layer/arm/deconvolutiondepthwise_arm_asimdhp.cpp b/src/layer/arm/deconvolutiondepthwise_arm_asimdhp.cpp index 09e0fca4356..e6c636525a4 100644 --- a/src/layer/arm/deconvolutiondepthwise_arm_asimdhp.cpp +++ b/src/layer/arm/deconvolutiondepthwise_arm_asimdhp.cpp @@ -100,7 +100,7 @@ int DeconvolutionDepthWise_arm::create_pipeline_fp16s(const Option& opt) if (bias_term) bias_data_g = bias_data.range(num_output_g * g, num_output_g); - ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution); + ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Deconvolution); // set param ncnn::ParamDict pd; diff --git a/src/layer/arm/innerproduct_arm.cpp b/src/layer/arm/innerproduct_arm.cpp index 98eda2d171b..31689008e79 100644 --- 
a/src/layer/arm/innerproduct_arm.cpp +++ b/src/layer/arm/innerproduct_arm.cpp @@ -46,7 +46,7 @@ InnerProduct_arm::InnerProduct_arm() int InnerProduct_arm::create_pipeline(const Option& opt) { { - flatten = ncnn::create_layer(ncnn::LayerType::Flatten); + flatten = ncnn::create_layer_cpu(ncnn::LayerType::Flatten); ncnn::ParamDict pd; diff --git a/src/layer/arm/matmul_arm.cpp b/src/layer/arm/matmul_arm.cpp index 7117ce49511..363ab4490bb 100644 --- a/src/layer/arm/matmul_arm.cpp +++ b/src/layer/arm/matmul_arm.cpp @@ -37,7 +37,7 @@ MatMul_arm::MatMul_arm() int MatMul_arm::create_pipeline(const Option& opt) { - gemm = ncnn::create_layer(ncnn::LayerType::Gemm); + gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm); ncnn::ParamDict pd; pd.set(2, 0); // transA diff --git a/src/layer/arm/multiheadattention_arm.cpp b/src/layer/arm/multiheadattention_arm.cpp index 15eca715699..37323a2255f 100644 --- a/src/layer/arm/multiheadattention_arm.cpp +++ b/src/layer/arm/multiheadattention_arm.cpp @@ -48,7 +48,7 @@ int MultiHeadAttention_arm::create_pipeline(const Option& _opt) opt.use_bf16_storage &= support_bf16_storage; { - qk_softmax = ncnn::create_layer(ncnn::LayerType::Softmax); + qk_softmax = ncnn::create_layer_cpu(ncnn::LayerType::Softmax); ncnn::ParamDict pd; pd.set(0, -1); pd.set(1, 1); @@ -61,7 +61,7 @@ int MultiHeadAttention_arm::create_pipeline(const Option& _opt) const int embed_dim_per_head = embed_dim / num_heads; const float inv_sqrt_embed_dim_per_head = 1.f / sqrtf(embed_dim_per_head); - q_gemm = ncnn::create_layer(ncnn::LayerType::Gemm); + q_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm); ncnn::ParamDict pd; pd.set(0, inv_sqrt_embed_dim_per_head); pd.set(1, 1.f); @@ -92,7 +92,7 @@ int MultiHeadAttention_arm::create_pipeline(const Option& _opt) } { - k_gemm = ncnn::create_layer(ncnn::LayerType::Gemm); + k_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm); ncnn::ParamDict pd; pd.set(2, 0); // transA pd.set(3, 1); // transB @@ -121,7 +121,7 @@ int MultiHeadAttention_arm::create_pipeline(const Option& _opt) } { - v_gemm = ncnn::create_layer(ncnn::LayerType::Gemm); + v_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm); ncnn::ParamDict pd; pd.set(2, 0); // transA pd.set(3, 1); // transB @@ -150,7 +150,7 @@ int MultiHeadAttention_arm::create_pipeline(const Option& _opt) } { - o_gemm = ncnn::create_layer(ncnn::LayerType::Gemm); + o_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm); ncnn::ParamDict pd; pd.set(2, 1); // transA pd.set(3, 1); // transB @@ -177,7 +177,7 @@ int MultiHeadAttention_arm::create_pipeline(const Option& _opt) } { - qk_gemm = ncnn::create_layer(ncnn::LayerType::Gemm); + qk_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm); ncnn::ParamDict pd; pd.set(2, 1); // transA pd.set(3, 0); // transB @@ -198,7 +198,7 @@ int MultiHeadAttention_arm::create_pipeline(const Option& _opt) } { - qkv_gemm = ncnn::create_layer(ncnn::LayerType::Gemm); + qkv_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm); ncnn::ParamDict pd; pd.set(2, 0); // transA pd.set(3, 1); // transB diff --git a/src/layer/convolution.cpp b/src/layer/convolution.cpp index ba9eaef8752..fe025456f48 100644 --- a/src/layer/convolution.cpp +++ b/src/layer/convolution.cpp @@ -210,7 +210,7 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op if (bottom_blob.w * bottom_blob.elempack == num_input) { // call InnerProduct - ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::InnerProduct); + ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::InnerProduct); // set param 
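// context for this hunk: when the flattened input width times
// elempack equals num_input, the convolution degenerates into a
// fully connected op, so a throwaway InnerProduct computes it; it is
// created cpu-pinned here because this is the reference cpu forward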
ncnn::ParamDict pd; diff --git a/src/layer/fused_activation.h b/src/layer/fused_activation.h index 275fd9e2f9a..d5919257792 100644 --- a/src/layer/fused_activation.h +++ b/src/layer/fused_activation.h @@ -80,14 +80,14 @@ static ncnn::Layer* create_activation_layer(int activation_type, const ncnn::Mat if (activation_type == 1) { - activation = ncnn::create_layer(ncnn::LayerType::ReLU); + activation = ncnn::create_layer_cpu(ncnn::LayerType::ReLU); ncnn::ParamDict pd; activation->load_param(pd); } else if (activation_type == 2) { - activation = ncnn::create_layer(ncnn::LayerType::ReLU); + activation = ncnn::create_layer_cpu(ncnn::LayerType::ReLU); ncnn::ParamDict pd; pd.set(0, activation_params[0]); // slope @@ -95,7 +95,7 @@ static ncnn::Layer* create_activation_layer(int activation_type, const ncnn::Mat } else if (activation_type == 3) { - activation = ncnn::create_layer(ncnn::LayerType::Clip); + activation = ncnn::create_layer_cpu(ncnn::LayerType::Clip); ncnn::ParamDict pd; pd.set(0, activation_params[0]); // min @@ -105,21 +105,21 @@ static ncnn::Layer* create_activation_layer(int activation_type, const ncnn::Mat } else if (activation_type == 4) { - activation = ncnn::create_layer(ncnn::LayerType::Sigmoid); + activation = ncnn::create_layer_cpu(ncnn::LayerType::Sigmoid); ncnn::ParamDict pd; activation->load_param(pd); } else if (activation_type == 5) { - activation = ncnn::create_layer(ncnn::LayerType::Mish); + activation = ncnn::create_layer_cpu(ncnn::LayerType::Mish); ncnn::ParamDict pd; activation->load_param(pd); } else if (activation_type == 6) { - activation = ncnn::create_layer(ncnn::LayerType::HardSwish); + activation = ncnn::create_layer_cpu(ncnn::LayerType::HardSwish); ncnn::ParamDict pd; pd.set(0, activation_params[0]); // alpha diff --git a/src/layer/loongarch/convolution1d_loongarch.cpp b/src/layer/loongarch/convolution1d_loongarch.cpp index 0b1a11c868f..0917a79f62e 100644 --- a/src/layer/loongarch/convolution1d_loongarch.cpp +++ b/src/layer/loongarch/convolution1d_loongarch.cpp @@ -342,7 +342,7 @@ int Convolution1D_loongarch::forward(const std::vector& bottom_blobs, std:: bias_data_flattened.elempack = 1; } - ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution1D); + ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution1D); ncnn::ParamDict pd; pd.set(0, _num_output); diff --git a/src/layer/loongarch/convolution_loongarch.cpp b/src/layer/loongarch/convolution_loongarch.cpp index 7816d1c66d2..c38254c0035 100644 --- a/src/layer/loongarch/convolution_loongarch.cpp +++ b/src/layer/loongarch/convolution_loongarch.cpp @@ -593,7 +593,7 @@ int Convolution_loongarch::forward(const std::vector& bottom_blobs, std::ve bias_data_flattened.elempack = 1; } - ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution); + ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution); ncnn::ParamDict pd; pd.set(0, _num_output); diff --git a/src/layer/loongarch/convolutiondepthwise_loongarch.cpp b/src/layer/loongarch/convolutiondepthwise_loongarch.cpp index 4d134cc4a39..2546c19bfd0 100644 --- a/src/layer/loongarch/convolutiondepthwise_loongarch.cpp +++ b/src/layer/loongarch/convolutiondepthwise_loongarch.cpp @@ -125,7 +125,7 @@ int ConvolutionDepthWise_loongarch::create_group_ops(const Option& opt) if (bias_term) bias_data_g = bias_data.range(num_output_g * g, num_output_g); - ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution); + ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution); // set param ncnn::ParamDict pd; 
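The hunk above sits inside create_group_ops, which instantiates one standalone cpu Convolution per group and hands it a zero-copy slice of the shared weights via Mat::range, along the lines of this sketch (assuming the surrounding locals maxk, channels_g, num_output_g):

    ncnn::Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g,
                                                maxk * channels_g * num_output_g);

The second hunk below applies the same factory swap to the fallback path in forward().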
@@ -537,7 +537,7 @@ int ConvolutionDepthWise_loongarch::forward(const std::vector& bottom_blobs bias_data_flattened.elempack = 1; } - ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::ConvolutionDepthWise); + ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::ConvolutionDepthWise); ncnn::ParamDict pd; pd.set(0, _num_output); diff --git a/src/layer/loongarch/deconvolution_loongarch.cpp b/src/layer/loongarch/deconvolution_loongarch.cpp index 2d934bccb06..cdf7c0a2638 100644 --- a/src/layer/loongarch/deconvolution_loongarch.cpp +++ b/src/layer/loongarch/deconvolution_loongarch.cpp @@ -348,7 +348,7 @@ int Deconvolution_loongarch::forward(const std::vector& bottom_blobs, std:: bias_data_flattened.elempack = 1; } - ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution); + ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Deconvolution); ncnn::ParamDict pd; pd.set(0, _num_output); diff --git a/src/layer/loongarch/deconvolutiondepthwise_loongarch.cpp b/src/layer/loongarch/deconvolutiondepthwise_loongarch.cpp index f4f4d37bf7f..cc9d24a506c 100644 --- a/src/layer/loongarch/deconvolutiondepthwise_loongarch.cpp +++ b/src/layer/loongarch/deconvolutiondepthwise_loongarch.cpp @@ -119,7 +119,7 @@ int DeconvolutionDepthWise_loongarch::create_group_ops(const Option& opt) if (bias_term) bias_data_g = bias_data.range(num_output_g * g, num_output_g); - ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution); + ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Deconvolution); // set param ncnn::ParamDict pd; @@ -476,7 +476,7 @@ int DeconvolutionDepthWise_loongarch::forward(const std::vector& bottom_blo bias_data_flattened.elempack = 1; } - ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::DeconvolutionDepthWise); + ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::DeconvolutionDepthWise); ncnn::ParamDict pd; pd.set(0, _num_output); diff --git a/src/layer/loongarch/innerproduct_loongarch.cpp b/src/layer/loongarch/innerproduct_loongarch.cpp index 34e908fc11a..b17d3f830c2 100644 --- a/src/layer/loongarch/innerproduct_loongarch.cpp +++ b/src/layer/loongarch/innerproduct_loongarch.cpp @@ -37,7 +37,7 @@ InnerProduct_loongarch::InnerProduct_loongarch() int InnerProduct_loongarch::create_pipeline(const Option& opt) { { - flatten = ncnn::create_layer(ncnn::LayerType::Flatten); + flatten = ncnn::create_layer_cpu(ncnn::LayerType::Flatten); ncnn::ParamDict pd; diff --git a/src/layer/mips/convolution1d_mips.cpp b/src/layer/mips/convolution1d_mips.cpp index fc61c940687..e9cf211e49b 100644 --- a/src/layer/mips/convolution1d_mips.cpp +++ b/src/layer/mips/convolution1d_mips.cpp @@ -342,7 +342,7 @@ int Convolution1D_mips::forward(const std::vector& bottom_blobs, std::vecto bias_data_flattened.elempack = 1; } - ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution1D); + ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution1D); ncnn::ParamDict pd; pd.set(0, _num_output); diff --git a/src/layer/mips/convolution_mips.cpp b/src/layer/mips/convolution_mips.cpp index bc547e4a667..8f566f43a6c 100644 --- a/src/layer/mips/convolution_mips.cpp +++ b/src/layer/mips/convolution_mips.cpp @@ -593,7 +593,7 @@ int Convolution_mips::forward(const std::vector& bottom_blobs, std::vector< bias_data_flattened.elempack = 1; } - ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution); + ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution); ncnn::ParamDict pd; pd.set(0, _num_output); diff --git 
a/src/layer/mips/convolutiondepthwise_mips.cpp b/src/layer/mips/convolutiondepthwise_mips.cpp index 991cb07872d..17bb2e012e6 100644 --- a/src/layer/mips/convolutiondepthwise_mips.cpp +++ b/src/layer/mips/convolutiondepthwise_mips.cpp @@ -125,7 +125,7 @@ int ConvolutionDepthWise_mips::create_group_ops(const Option& opt) if (bias_term) bias_data_g = bias_data.range(num_output_g * g, num_output_g); - ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution); + ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution); // set param ncnn::ParamDict pd; @@ -537,7 +537,7 @@ int ConvolutionDepthWise_mips::forward(const std::vector& bottom_blobs, std bias_data_flattened.elempack = 1; } - ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::ConvolutionDepthWise); + ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::ConvolutionDepthWise); ncnn::ParamDict pd; pd.set(0, _num_output); diff --git a/src/layer/mips/deconvolution_mips.cpp b/src/layer/mips/deconvolution_mips.cpp index 506d3072096..607313614c0 100644 --- a/src/layer/mips/deconvolution_mips.cpp +++ b/src/layer/mips/deconvolution_mips.cpp @@ -348,7 +348,7 @@ int Deconvolution_mips::forward(const std::vector& bottom_blobs, std::vecto bias_data_flattened.elempack = 1; } - ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution); + ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Deconvolution); ncnn::ParamDict pd; pd.set(0, _num_output); diff --git a/src/layer/mips/deconvolutiondepthwise_mips.cpp b/src/layer/mips/deconvolutiondepthwise_mips.cpp index 533bf522ad9..404335b0efe 100644 --- a/src/layer/mips/deconvolutiondepthwise_mips.cpp +++ b/src/layer/mips/deconvolutiondepthwise_mips.cpp @@ -119,7 +119,7 @@ int DeconvolutionDepthWise_mips::create_group_ops(const Option& opt) if (bias_term) bias_data_g = bias_data.range(num_output_g * g, num_output_g); - ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution); + ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Deconvolution); // set param ncnn::ParamDict pd; @@ -476,7 +476,7 @@ int DeconvolutionDepthWise_mips::forward(const std::vector& bottom_blobs, s bias_data_flattened.elempack = 1; } - ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::DeconvolutionDepthWise); + ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::DeconvolutionDepthWise); ncnn::ParamDict pd; pd.set(0, _num_output); diff --git a/src/layer/mips/innerproduct_mips.cpp b/src/layer/mips/innerproduct_mips.cpp index b064a20e522..87a32b86cfe 100644 --- a/src/layer/mips/innerproduct_mips.cpp +++ b/src/layer/mips/innerproduct_mips.cpp @@ -37,7 +37,7 @@ InnerProduct_mips::InnerProduct_mips() int InnerProduct_mips::create_pipeline(const Option& opt) { { - flatten = ncnn::create_layer(ncnn::LayerType::Flatten); + flatten = ncnn::create_layer_cpu(ncnn::LayerType::Flatten); ncnn::ParamDict pd; diff --git a/src/layer/riscv/convolution1d_riscv.cpp b/src/layer/riscv/convolution1d_riscv.cpp index d3d17861d89..ff02d6bc24f 100644 --- a/src/layer/riscv/convolution1d_riscv.cpp +++ b/src/layer/riscv/convolution1d_riscv.cpp @@ -387,7 +387,7 @@ int Convolution1D_riscv::forward(const std::vector& bottom_blobs, std::vect bias_data_flattened.elempack = 1; } - ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution1D); + ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution1D); ncnn::ParamDict pd; pd.set(0, _num_output); diff --git a/src/layer/riscv/convolution_riscv.cpp b/src/layer/riscv/convolution_riscv.cpp index 4c4d57c6a57..a4c73986bc4 100644 --- 
a/src/layer/riscv/convolution_riscv.cpp +++ b/src/layer/riscv/convolution_riscv.cpp @@ -677,7 +677,7 @@ int Convolution_riscv::forward(const std::vector& bottom_blobs, std::vector bias_data_flattened.elempack = 1; } - ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution); + ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution); ncnn::ParamDict pd; pd.set(0, _num_output); diff --git a/src/layer/riscv/convolutiondepthwise_riscv.cpp b/src/layer/riscv/convolutiondepthwise_riscv.cpp index eb39ac0baa7..7d772d75ef9 100644 --- a/src/layer/riscv/convolutiondepthwise_riscv.cpp +++ b/src/layer/riscv/convolutiondepthwise_riscv.cpp @@ -146,7 +146,7 @@ int ConvolutionDepthWise_riscv::create_group_ops(const Option& opt) if (bias_term) bias_data_g = bias_data.range(num_output_g * g, num_output_g); - ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution); + ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution); // set param ncnn::ParamDict pd; @@ -613,7 +613,7 @@ int ConvolutionDepthWise_riscv::forward(const std::vector& bottom_blobs, st bias_data_flattened.elempack = 1; } - ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::ConvolutionDepthWise); + ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::ConvolutionDepthWise); ncnn::ParamDict pd; pd.set(0, _num_output); diff --git a/src/layer/riscv/deconvolution_riscv.cpp b/src/layer/riscv/deconvolution_riscv.cpp index 9202d367f93..9483a2f8af3 100644 --- a/src/layer/riscv/deconvolution_riscv.cpp +++ b/src/layer/riscv/deconvolution_riscv.cpp @@ -404,7 +404,7 @@ int Deconvolution_riscv::forward(const std::vector& bottom_blobs, std::vect bias_data_flattened.elempack = 1; } - ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution); + ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Deconvolution); ncnn::ParamDict pd; pd.set(0, _num_output); diff --git a/src/layer/riscv/deconvolutiondepthwise_riscv.cpp b/src/layer/riscv/deconvolutiondepthwise_riscv.cpp index eee765c4ea6..9f127b1160c 100644 --- a/src/layer/riscv/deconvolutiondepthwise_riscv.cpp +++ b/src/layer/riscv/deconvolutiondepthwise_riscv.cpp @@ -139,7 +139,7 @@ int DeconvolutionDepthWise_riscv::create_group_ops(const Option& opt) if (bias_term) bias_data_g = bias_data.range(num_output_g * g, num_output_g); - ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution); + ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Deconvolution); // set param ncnn::ParamDict pd; @@ -531,7 +531,7 @@ int DeconvolutionDepthWise_riscv::forward(const std::vector& bottom_blobs, bias_data_flattened.elempack = 1; } - ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::DeconvolutionDepthWise); + ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::DeconvolutionDepthWise); ncnn::ParamDict pd; pd.set(0, _num_output); diff --git a/src/layer/riscv/innerproduct_riscv.cpp b/src/layer/riscv/innerproduct_riscv.cpp index ac7b3169708..1f0d698a979 100644 --- a/src/layer/riscv/innerproduct_riscv.cpp +++ b/src/layer/riscv/innerproduct_riscv.cpp @@ -40,7 +40,7 @@ InnerProduct_riscv::InnerProduct_riscv() int InnerProduct_riscv::create_pipeline(const Option& opt) { { - flatten = ncnn::create_layer(ncnn::LayerType::Flatten); + flatten = ncnn::create_layer_cpu(ncnn::LayerType::Flatten); ncnn::ParamDict pd; diff --git a/src/layer/vulkan/convolution1d_vulkan.cpp b/src/layer/vulkan/convolution1d_vulkan.cpp index 53dff49262b..a69a48d822e 100644 --- a/src/layer/vulkan/convolution1d_vulkan.cpp +++ b/src/layer/vulkan/convolution1d_vulkan.cpp 
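From this point the substitution moves into the vulkan host layers. Besides swapping the factory, each embedded helper inherits the device and the blob shape hints before its pipeline is built, roughly as follows (a sketch; shape_hint is an illustrative placeholder, not a variable from the hunks):

    padding->vkdev = vkdev;
    padding->bottom_shapes.resize(1);
    padding->bottom_shapes[0] = shape_hint; // lets create_pipeline specialize
    padding->create_pipeline(opt);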
@@ -47,7 +47,7 @@ int Convolution1D_vulkan::create_pipeline(const Option& _opt) int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; { - padding = ncnn::create_layer(ncnn::LayerType::Padding); + padding = ncnn::create_layer_vulkan(ncnn::LayerType::Padding); padding->vkdev = vkdev; ncnn::ParamDict pd; diff --git a/src/layer/vulkan/convolution_vulkan.cpp b/src/layer/vulkan/convolution_vulkan.cpp index beb0bccb9bf..4b93baefb1a 100644 --- a/src/layer/vulkan/convolution_vulkan.cpp +++ b/src/layer/vulkan/convolution_vulkan.cpp @@ -117,7 +117,7 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) if (kernel_w == 1 && kernel_h == 1) { { - reshape_1x1xw = ncnn::create_layer(ncnn::LayerType::Reshape); + reshape_1x1xw = ncnn::create_layer_vulkan(ncnn::LayerType::Reshape); reshape_1x1xw->vkdev = vkdev; reshape_1x1xw->bottom_shapes.resize(1); @@ -136,7 +136,7 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) } { - reshape_w = ncnn::create_layer(ncnn::LayerType::Reshape); + reshape_w = ncnn::create_layer_vulkan(ncnn::LayerType::Reshape); reshape_w->vkdev = vkdev; reshape_w->bottom_shapes.resize(1); @@ -157,7 +157,7 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) bool is_conv3x3s1d1 = kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1; { - padding = ncnn::create_layer(ncnn::LayerType::Padding); + padding = ncnn::create_layer_vulkan(ncnn::LayerType::Padding); padding->vkdev = vkdev; padding->bottom_shapes.resize(1); diff --git a/src/layer/vulkan/convolutiondepthwise_vulkan.cpp b/src/layer/vulkan/convolutiondepthwise_vulkan.cpp index 57069074c96..0fbae334184 100644 --- a/src/layer/vulkan/convolutiondepthwise_vulkan.cpp +++ b/src/layer/vulkan/convolutiondepthwise_vulkan.cpp @@ -177,7 +177,7 @@ int ConvolutionDepthWise_vulkan::create_pipeline(const Option& _opt) } { - padding = ncnn::create_layer(ncnn::LayerType::Padding); + padding = ncnn::create_layer_vulkan(ncnn::LayerType::Padding); padding->vkdev = vkdev; padding->bottom_shapes.resize(1); diff --git a/src/layer/vulkan/deconvolution_vulkan.cpp b/src/layer/vulkan/deconvolution_vulkan.cpp index c53aedefc84..de530e54621 100644 --- a/src/layer/vulkan/deconvolution_vulkan.cpp +++ b/src/layer/vulkan/deconvolution_vulkan.cpp @@ -109,7 +109,7 @@ int Deconvolution_vulkan::create_pipeline(const Option& _opt) } { - crop = ncnn::create_layer(ncnn::LayerType::Crop); + crop = ncnn::create_layer_vulkan(ncnn::LayerType::Crop); crop->vkdev = vkdev; crop->bottom_shapes.resize(1); @@ -128,7 +128,7 @@ int Deconvolution_vulkan::create_pipeline(const Option& _opt) } { - output_crop = ncnn::create_layer(ncnn::LayerType::Crop); + output_crop = ncnn::create_layer_vulkan(ncnn::LayerType::Crop); output_crop->vkdev = vkdev; output_crop->bottom_shapes.resize(1); diff --git a/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp b/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp index b24418fa428..cf874439619 100644 --- a/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp +++ b/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp @@ -168,7 +168,7 @@ int DeconvolutionDepthWise_vulkan::create_pipeline(const Option& _opt) } { - crop = ncnn::create_layer(ncnn::LayerType::Crop); + crop = ncnn::create_layer_vulkan(ncnn::LayerType::Crop); crop->vkdev = vkdev; crop->bottom_shapes.resize(1); @@ -187,7 +187,7 @@ int DeconvolutionDepthWise_vulkan::create_pipeline(const Option& _opt) } { - output_crop = ncnn::create_layer(ncnn::LayerType::Crop); + output_crop = 
ncnn::create_layer_vulkan(ncnn::LayerType::Crop); output_crop->vkdev = vkdev; output_crop->bottom_shapes.resize(1); diff --git a/src/layer/vulkan/innerproduct_vulkan.cpp b/src/layer/vulkan/innerproduct_vulkan.cpp index 06bf7b56943..df87b3de258 100644 --- a/src/layer/vulkan/innerproduct_vulkan.cpp +++ b/src/layer/vulkan/innerproduct_vulkan.cpp @@ -214,7 +214,7 @@ int InnerProduct_vulkan::create_pipeline(const Option& _opt) } { - flatten = ncnn::create_layer(ncnn::LayerType::Flatten); + flatten = ncnn::create_layer_vulkan(ncnn::LayerType::Flatten); flatten->vkdev = vkdev; flatten->bottom_shapes.resize(1); diff --git a/src/layer/vulkan/multiheadattention_vulkan.cpp b/src/layer/vulkan/multiheadattention_vulkan.cpp index acb28869382..142ccf4f8b6 100644 --- a/src/layer/vulkan/multiheadattention_vulkan.cpp +++ b/src/layer/vulkan/multiheadattention_vulkan.cpp @@ -49,7 +49,7 @@ int MultiHeadAttention_vulkan::create_pipeline(const Option& opt) { const float inv_sqrt_embed_dim_per_head = 1.f / sqrtf(embed_dim_per_head); - q_gemm = ncnn::create_layer(ncnn::LayerType::Gemm); + q_gemm = ncnn::create_layer_vulkan(ncnn::LayerType::Gemm); q_gemm->vkdev = vkdev; ncnn::ParamDict pd; pd.set(0, inv_sqrt_embed_dim_per_head); @@ -75,7 +75,7 @@ int MultiHeadAttention_vulkan::create_pipeline(const Option& opt) } { - k_gemm = ncnn::create_layer(ncnn::LayerType::Gemm); + k_gemm = ncnn::create_layer_vulkan(ncnn::LayerType::Gemm); k_gemm->vkdev = vkdev; ncnn::ParamDict pd; pd.set(2, 0); // transA @@ -99,7 +99,7 @@ int MultiHeadAttention_vulkan::create_pipeline(const Option& opt) } { - v_gemm = ncnn::create_layer(ncnn::LayerType::Gemm); + v_gemm = ncnn::create_layer_vulkan(ncnn::LayerType::Gemm); v_gemm->vkdev = vkdev; ncnn::ParamDict pd; pd.set(2, 0); // transA @@ -182,7 +182,7 @@ int MultiHeadAttention_vulkan::create_pipeline(const Option& opt) } { - qk_softmax = ncnn::create_layer(ncnn::LayerType::Softmax); + qk_softmax = ncnn::create_layer_vulkan(ncnn::LayerType::Softmax); qk_softmax->vkdev = vkdev; ncnn::ParamDict pd; pd.set(0, -1); @@ -193,7 +193,7 @@ int MultiHeadAttention_vulkan::create_pipeline(const Option& opt) } { - o_gemm = ncnn::create_layer(ncnn::LayerType::Gemm); + o_gemm = ncnn::create_layer_vulkan(ncnn::LayerType::Gemm); o_gemm->vkdev = vkdev; ncnn::ParamDict pd; pd.set(2, 1); // transA diff --git a/src/layer/vulkan/pooling_vulkan.cpp b/src/layer/vulkan/pooling_vulkan.cpp index eeba214ccac..ee7a9093301 100644 --- a/src/layer/vulkan/pooling_vulkan.cpp +++ b/src/layer/vulkan/pooling_vulkan.cpp @@ -128,7 +128,7 @@ int Pooling_vulkan::create_pipeline(const Option& _opt) } { - padding = ncnn::create_layer(ncnn::LayerType::Padding); + padding = ncnn::create_layer_vulkan(ncnn::LayerType::Padding); padding->vkdev = vkdev; padding->bottom_shapes.resize(1); diff --git a/src/layer/vulkan/reshape_vulkan.cpp b/src/layer/vulkan/reshape_vulkan.cpp index 567acc6651d..e33efca47cc 100644 --- a/src/layer/vulkan/reshape_vulkan.cpp +++ b/src/layer/vulkan/reshape_vulkan.cpp @@ -121,7 +121,7 @@ int Reshape_vulkan::create_pipeline(const Option& _opt) if (need_permute) { { - permute_wh = ncnn::create_layer(ncnn::LayerType::Permute); + permute_wh = ncnn::create_layer_vulkan(ncnn::LayerType::Permute); permute_wh->vkdev = vkdev; permute_wh->bottom_shapes.resize(1); @@ -137,7 +137,7 @@ int Reshape_vulkan::create_pipeline(const Option& _opt) permute_wh->create_pipeline(opt); } { - permute_hwc = ncnn::create_layer(ncnn::LayerType::Permute); + permute_hwc = ncnn::create_layer_vulkan(ncnn::LayerType::Permute); permute_hwc->vkdev = 
vkdev; permute_hwc->bottom_shapes.resize(1); @@ -153,7 +153,7 @@ int Reshape_vulkan::create_pipeline(const Option& _opt) permute_hwc->create_pipeline(opt); } { - permute_dhwc = ncnn::create_layer(ncnn::LayerType::Permute); + permute_dhwc = ncnn::create_layer_vulkan(ncnn::LayerType::Permute); permute_dhwc->vkdev = vkdev; permute_dhwc->bottom_shapes.resize(1); @@ -171,7 +171,7 @@ int Reshape_vulkan::create_pipeline(const Option& _opt) if (ndim == 2) { - permute_hw = ncnn::create_layer(ncnn::LayerType::Permute); + permute_hw = ncnn::create_layer_vulkan(ncnn::LayerType::Permute); permute_hw->vkdev = vkdev; permute_hw->bottom_shapes.resize(1); @@ -188,7 +188,7 @@ int Reshape_vulkan::create_pipeline(const Option& _opt) } if (ndim == 3) { - permute_chw = ncnn::create_layer(ncnn::LayerType::Permute); + permute_chw = ncnn::create_layer_vulkan(ncnn::LayerType::Permute); permute_chw->vkdev = vkdev; permute_chw->bottom_shapes.resize(1); @@ -205,7 +205,7 @@ int Reshape_vulkan::create_pipeline(const Option& _opt) } if (ndim == 4) { - permute_cdhw = ncnn::create_layer(ncnn::LayerType::Permute); + permute_cdhw = ncnn::create_layer_vulkan(ncnn::LayerType::Permute); permute_cdhw->vkdev = vkdev; permute_cdhw->bottom_shapes.resize(1); diff --git a/src/layer/x86/convolution1d_x86.cpp b/src/layer/x86/convolution1d_x86.cpp index e7df16b8316..26c72678b67 100644 --- a/src/layer/x86/convolution1d_x86.cpp +++ b/src/layer/x86/convolution1d_x86.cpp @@ -126,7 +126,7 @@ int Convolution1D_x86::forward(const std::vector& bottom_blobs, std::vector bias_data_flattened.elempack = 1; } - ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution1D); + ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution1D); ncnn::ParamDict pd; pd.set(0, _num_output); diff --git a/src/layer/x86/convolution_x86.cpp b/src/layer/x86/convolution_x86.cpp index 6e828ff0d21..5c97b02eebd 100644 --- a/src/layer/x86/convolution_x86.cpp +++ b/src/layer/x86/convolution_x86.cpp @@ -297,7 +297,7 @@ int Convolution_x86::create_pipeline(const Option& opt) if (!opt.use_packing_layout && kernel_w == kernel_h && dilation_w != 1 && dilation_h == dilation_w && stride_w == 1 && stride_h == 1) { - convolution_dilation1 = ncnn::create_layer(ncnn::LayerType::Convolution); + convolution_dilation1 = ncnn::create_layer_cpu(ncnn::LayerType::Convolution); // set param ncnn::ParamDict pd; @@ -469,7 +469,7 @@ int Convolution_x86::create_pipeline(const Option& opt) { const int maxk = kernel_w * kernel_h; - gemm = ncnn::create_layer(ncnn::LayerType::Gemm); + gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm); ncnn::ParamDict pd; pd.set(2, 0); // transA @@ -1182,7 +1182,7 @@ int Convolution_x86::forward(const std::vector& bottom_blobs, std::vector& bottom_blobs, std: bias_data_flattened.elempack = 1; } - ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::ConvolutionDepthWise); + ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::ConvolutionDepthWise); ncnn::ParamDict pd; pd.set(0, _num_output); diff --git a/src/layer/x86/deconvolution_x86.cpp b/src/layer/x86/deconvolution_x86.cpp index 46bdca2a397..09d13616d58 100644 --- a/src/layer/x86/deconvolution_x86.cpp +++ b/src/layer/x86/deconvolution_x86.cpp @@ -94,7 +94,7 @@ int Deconvolution_x86::create_pipeline(const Option& opt) { const int maxk = kernel_w * kernel_h; - gemm = ncnn::create_layer(ncnn::LayerType::Gemm); + gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm); ncnn::ParamDict pd; pd.set(2, 1); // transA @@ -694,7 +694,7 @@ int Deconvolution_x86::forward(const std::vector& 
bottom_blobs, std::vector bias_data_flattened.elempack = 1; } - ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution); + ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Deconvolution); ncnn::ParamDict pd; pd.set(0, _num_output); diff --git a/src/layer/x86/deconvolutiondepthwise_x86.cpp b/src/layer/x86/deconvolutiondepthwise_x86.cpp index 43a573a64ef..7a790701857 100644 --- a/src/layer/x86/deconvolutiondepthwise_x86.cpp +++ b/src/layer/x86/deconvolutiondepthwise_x86.cpp @@ -146,7 +146,7 @@ int DeconvolutionDepthWise_x86::create_group_ops(const Option& opt) if (bias_term) bias_data_g = bias_data.range(num_output_g * g, num_output_g); - ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution); + ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Deconvolution); // set param ncnn::ParamDict pd; @@ -641,7 +641,7 @@ int DeconvolutionDepthWise_x86::forward(const std::vector& bottom_blobs, st bias_data_flattened.elempack = 1; } - ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::DeconvolutionDepthWise); + ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::DeconvolutionDepthWise); ncnn::ParamDict pd; pd.set(0, _num_output); diff --git a/src/layer/x86/deformableconv2d_x86.cpp b/src/layer/x86/deformableconv2d_x86.cpp index 076e56f7e64..b3bf9887898 100644 --- a/src/layer/x86/deformableconv2d_x86.cpp +++ b/src/layer/x86/deformableconv2d_x86.cpp @@ -134,7 +134,7 @@ int DeformableConv2D_x86::create_pipeline(const Option& opt) { const int maxk = kernel_w * kernel_h; - gemm = ncnn::create_layer(ncnn::LayerType::Gemm); + gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm); ncnn::ParamDict pd; pd.set(2, 0); // transA diff --git a/src/layer/x86/innerproduct_x86.cpp b/src/layer/x86/innerproduct_x86.cpp index 67bf0cca548..0ca253ebd1d 100644 --- a/src/layer/x86/innerproduct_x86.cpp +++ b/src/layer/x86/innerproduct_x86.cpp @@ -53,7 +53,7 @@ int InnerProduct_x86::create_pipeline(const Option& opt) { // if (opt.use_packing_layout) { - flatten = ncnn::create_layer(ncnn::LayerType::Flatten); + flatten = ncnn::create_layer_cpu(ncnn::LayerType::Flatten); ncnn::ParamDict pd; diff --git a/src/layer/x86/matmul_x86.cpp b/src/layer/x86/matmul_x86.cpp index 2c829ea1848..d0afe81f76b 100644 --- a/src/layer/x86/matmul_x86.cpp +++ b/src/layer/x86/matmul_x86.cpp @@ -25,7 +25,7 @@ MatMul_x86::MatMul_x86() int MatMul_x86::create_pipeline(const Option& opt) { - gemm = ncnn::create_layer(ncnn::LayerType::Gemm); + gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm); ncnn::ParamDict pd; pd.set(2, 0); // transA diff --git a/src/layer/x86/multiheadattention_x86.cpp b/src/layer/x86/multiheadattention_x86.cpp index 98397437c9d..a7ff58288c2 100644 --- a/src/layer/x86/multiheadattention_x86.cpp +++ b/src/layer/x86/multiheadattention_x86.cpp @@ -42,7 +42,7 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt) const int embed_dim_per_head = embed_dim / num_heads; const float inv_sqrt_embed_dim_per_head = 1.f / sqrtf(embed_dim_per_head); - q_gemm = ncnn::create_layer(ncnn::LayerType::Gemm); + q_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm); ncnn::ParamDict pd; pd.set(0, inv_sqrt_embed_dim_per_head); pd.set(1, 1.f); @@ -73,7 +73,7 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt) } { - k_gemm = ncnn::create_layer(ncnn::LayerType::Gemm); + k_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm); ncnn::ParamDict pd; pd.set(2, 0); // transA pd.set(3, 1); // transB @@ -102,7 +102,7 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt) } { - v_gemm 
= ncnn::create_layer(ncnn::LayerType::Gemm); + v_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm); ncnn::ParamDict pd; pd.set(2, 0); // transA pd.set(3, 1); // transB @@ -131,7 +131,7 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt) } { - qk_gemm = ncnn::create_layer(ncnn::LayerType::Gemm); + qk_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm); ncnn::ParamDict pd; pd.set(2, 1); // transA pd.set(3, 0); // transB @@ -151,7 +151,7 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt) qk_gemm->create_pipeline(opt1); } { - qkv_gemm = ncnn::create_layer(ncnn::LayerType::Gemm); + qkv_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm); ncnn::ParamDict pd; pd.set(2, 0); // transA pd.set(3, 1); // transB @@ -173,7 +173,7 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt) } { - qk_softmax = ncnn::create_layer(ncnn::LayerType::Softmax); + qk_softmax = ncnn::create_layer_cpu(ncnn::LayerType::Softmax); ncnn::ParamDict pd; pd.set(0, -1); pd.set(1, 1); @@ -183,7 +183,7 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt) } { - o_gemm = ncnn::create_layer(ncnn::LayerType::Gemm); + o_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm); ncnn::ParamDict pd; pd.set(2, 1); // transA pd.set(3, 1); // transB diff --git a/src/layer/yolodetectionoutput.cpp b/src/layer/yolodetectionoutput.cpp index 9b9ba7dc289..1e0d86d73a4 100644 --- a/src/layer/yolodetectionoutput.cpp +++ b/src/layer/yolodetectionoutput.cpp @@ -38,7 +38,7 @@ int YoloDetectionOutput::load_param(const ParamDict& pd) int YoloDetectionOutput::create_pipeline(const Option& opt) { { - softmax = ncnn::create_layer(ncnn::LayerType::Softmax); + softmax = ncnn::create_layer_cpu(ncnn::LayerType::Softmax); ncnn::ParamDict pd; pd.set(0, 0); // axis diff --git a/src/layer/yolov3detectionoutput.cpp b/src/layer/yolov3detectionoutput.cpp index 494fb6d186a..7528f5033cd 100644 --- a/src/layer/yolov3detectionoutput.cpp +++ b/src/layer/yolov3detectionoutput.cpp @@ -25,7 +25,7 @@ Yolov3DetectionOutput::Yolov3DetectionOutput() one_blob_only = false; support_inplace = false; - //softmax = ncnn::create_layer(ncnn::LayerType::Softmax); + //softmax = ncnn::create_layer_cpu(ncnn::LayerType::Softmax); // set param ncnn::ParamDict pd; From 4637fb08b4726e96f55f106f501705693e2d8663 Mon Sep 17 00:00:00 2001 From: nihuini Date: Thu, 4 Jan 2024 16:10:53 +0800 Subject: [PATCH 11/19] wip --- src/layer.cpp | 44 +++-- src/layer/vulkan/convolution1d_vulkan.cpp | 12 +- src/layer/vulkan/convolution1d_vulkan.h | 2 + src/layer/vulkan/convolution_vulkan.cpp | 12 +- src/layer/vulkan/convolution_vulkan.h | 2 + .../vulkan/convolutiondepthwise_vulkan.cpp | 12 +- .../vulkan/convolutiondepthwise_vulkan.h | 2 + src/layer/vulkan/deconvolution_vulkan.cpp | 12 +- src/layer/vulkan/deconvolution_vulkan.h | 2 + .../vulkan/deconvolutiondepthwise_vulkan.cpp | 12 +- .../vulkan/deconvolutiondepthwise_vulkan.h | 2 + src/net.cpp | 179 +++++++++++++----- tests/testutil.cpp | 8 +- 13 files changed, 224 insertions(+), 77 deletions(-) diff --git a/src/layer.cpp b/src/layer.cpp index ed28091fafa..253430a1974 100644 --- a/src/layer.cpp +++ b/src/layer.cpp @@ -317,11 +317,6 @@ class Layer_final : public Layer virtual int load_param(const ParamDict& pd) { set_layer_properties(); - { - int ret = layer_cpu->load_param(pd); - if (ret) - return ret; - } #if NCNN_VULKAN if (layer_vulkan && vkdev) { @@ -329,18 +324,19 @@ class Layer_final : public Layer if (ret) return ret; } + else #endif // NCNN_VULKAN + { + int ret = layer_cpu->load_param(pd); + if (ret) 
+ return ret; + } get_layer_properties(); return 0; } virtual int load_model(const ModelBin& mb) { - { - int ret = layer_cpu->load_model(mb); - if (ret) - return ret; - } #if NCNN_VULKAN if (layer_vulkan && vkdev) { @@ -348,7 +344,13 @@ class Layer_final : public Layer if (ret) return ret; } + else #endif // NCNN_VULKAN + { + int ret = layer_cpu->load_model(mb); + if (ret) + return ret; + } get_layer_properties(); return 0; } @@ -356,11 +358,6 @@ class Layer_final : public Layer virtual int create_pipeline(const Option& opt) { set_layer_properties(); - { - int ret = layer_cpu->create_pipeline(opt); - if (ret) - return ret; - } #if NCNN_VULKAN if (layer_vulkan && vkdev) { @@ -368,18 +365,19 @@ class Layer_final : public Layer if (ret) return ret; } + else #endif // NCNN_VULKAN + { + int ret = layer_cpu->create_pipeline(opt); + if (ret) + return ret; + } get_layer_properties(); return 0; } virtual int destroy_pipeline(const Option& opt) { - { - int ret = layer_cpu->destroy_pipeline(opt); - if (ret) - return ret; - } #if NCNN_VULKAN if (layer_vulkan && vkdev) { @@ -387,7 +385,13 @@ class Layer_final : public Layer if (ret) return ret; } + else #endif // NCNN_VULKAN + { + int ret = layer_cpu->destroy_pipeline(opt); + if (ret) + return ret; + } return 0; } diff --git a/src/layer/vulkan/convolution1d_vulkan.cpp b/src/layer/vulkan/convolution1d_vulkan.cpp index a69a48d822e..12162c5b7fc 100644 --- a/src/layer/vulkan/convolution1d_vulkan.cpp +++ b/src/layer/vulkan/convolution1d_vulkan.cpp @@ -29,15 +29,23 @@ Convolution1D_vulkan::Convolution1D_vulkan() pipeline_convolution1d = 0; } -int Convolution1D_vulkan::create_pipeline(const Option& _opt) +int Convolution1D_vulkan::load_param(const ParamDict& pd) { + int ret = Convolution1D::load_param(pd); + if (ret) + return ret; + if (dynamic_weight) { support_vulkan = false; support_image_storage = false; - return 0; } + return 0; +} + +int Convolution1D_vulkan::create_pipeline(const Option& _opt) +{ Option opt = _opt; const int maxk = kernel_w; diff --git a/src/layer/vulkan/convolution1d_vulkan.h b/src/layer/vulkan/convolution1d_vulkan.h index f01e1523161..28d692ae618 100644 --- a/src/layer/vulkan/convolution1d_vulkan.h +++ b/src/layer/vulkan/convolution1d_vulkan.h @@ -24,6 +24,8 @@ class Convolution1D_vulkan : public Convolution1D public: Convolution1D_vulkan(); + virtual int load_param(const ParamDict& pd); + virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); diff --git a/src/layer/vulkan/convolution_vulkan.cpp b/src/layer/vulkan/convolution_vulkan.cpp index 4b93baefb1a..5e3bffca51b 100644 --- a/src/layer/vulkan/convolution_vulkan.cpp +++ b/src/layer/vulkan/convolution_vulkan.cpp @@ -43,15 +43,23 @@ Convolution_vulkan::Convolution_vulkan() reshape_w = 0; } -int Convolution_vulkan::create_pipeline(const Option& _opt) +int Convolution_vulkan::load_param(const ParamDict& pd) { + int ret = Convolution::load_param(pd); + if (ret) + return ret; + if (dynamic_weight) { support_vulkan = false; support_image_storage = false; - return 0; } + return 0; +} + +int Convolution_vulkan::create_pipeline(const Option& _opt) +{ Option opt = _opt; const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0]; const Mat& out_shape = top_shapes.empty() ? 
Mat() : top_shapes[0]; diff --git a/src/layer/vulkan/convolution_vulkan.h b/src/layer/vulkan/convolution_vulkan.h index 90d6471d58b..fa4bdbc5350 100644 --- a/src/layer/vulkan/convolution_vulkan.h +++ b/src/layer/vulkan/convolution_vulkan.h @@ -24,6 +24,8 @@ class Convolution_vulkan : public Convolution public: Convolution_vulkan(); + virtual int load_param(const ParamDict& pd); + virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); diff --git a/src/layer/vulkan/convolutiondepthwise_vulkan.cpp b/src/layer/vulkan/convolutiondepthwise_vulkan.cpp index 0fbae334184..23405a6269d 100644 --- a/src/layer/vulkan/convolutiondepthwise_vulkan.cpp +++ b/src/layer/vulkan/convolutiondepthwise_vulkan.cpp @@ -41,15 +41,23 @@ ConvolutionDepthWise_vulkan::ConvolutionDepthWise_vulkan() pipeline_convolutiondepthwise_group_pack8to1 = 0; } -int ConvolutionDepthWise_vulkan::create_pipeline(const Option& _opt) +int ConvolutionDepthWise_vulkan::load_param(const ParamDict& pd) { + int ret = ConvolutionDepthWise::load_param(pd); + if (ret) + return ret; + if (dynamic_weight) { support_vulkan = false; support_image_storage = false; - return 0; } + return 0; +} + +int ConvolutionDepthWise_vulkan::create_pipeline(const Option& _opt) +{ Option opt = _opt; const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0]; const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0]; diff --git a/src/layer/vulkan/convolutiondepthwise_vulkan.h b/src/layer/vulkan/convolutiondepthwise_vulkan.h index 47785b707e2..7a6cfe1f640 100644 --- a/src/layer/vulkan/convolutiondepthwise_vulkan.h +++ b/src/layer/vulkan/convolutiondepthwise_vulkan.h @@ -24,6 +24,8 @@ class ConvolutionDepthWise_vulkan : public ConvolutionDepthWise public: ConvolutionDepthWise_vulkan(); + virtual int load_param(const ParamDict& pd); + virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); diff --git a/src/layer/vulkan/deconvolution_vulkan.cpp b/src/layer/vulkan/deconvolution_vulkan.cpp index de530e54621..33d56e34ada 100644 --- a/src/layer/vulkan/deconvolution_vulkan.cpp +++ b/src/layer/vulkan/deconvolution_vulkan.cpp @@ -33,15 +33,23 @@ Deconvolution_vulkan::Deconvolution_vulkan() pipeline_deconvolution_col2im = 0; } -int Deconvolution_vulkan::create_pipeline(const Option& _opt) +int Deconvolution_vulkan::load_param(const ParamDict& pd) { + int ret = Deconvolution::load_param(pd); + if (ret) + return ret; + if (dynamic_weight) { support_vulkan = false; support_image_storage = false; - return 0; } + return 0; +} + +int Deconvolution_vulkan::create_pipeline(const Option& _opt) +{ Option opt = _opt; const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0]; const Mat& out_shape = top_shapes.empty() ? 
Mat() : top_shapes[0]; diff --git a/src/layer/vulkan/deconvolution_vulkan.h b/src/layer/vulkan/deconvolution_vulkan.h index a4bee03c5e4..6e18c38d681 100644 --- a/src/layer/vulkan/deconvolution_vulkan.h +++ b/src/layer/vulkan/deconvolution_vulkan.h @@ -24,6 +24,8 @@ class Deconvolution_vulkan : public Deconvolution public: Deconvolution_vulkan(); + virtual int load_param(const ParamDict& pd); + virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); diff --git a/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp b/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp index cf874439619..08e6b1a23c0 100644 --- a/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp +++ b/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp @@ -42,15 +42,23 @@ DeconvolutionDepthWise_vulkan::DeconvolutionDepthWise_vulkan() pipeline_deconvolutiondepthwise_group_pack8to1 = 0; } -int DeconvolutionDepthWise_vulkan::create_pipeline(const Option& _opt) +int DeconvolutionDepthWise_vulkan::load_param(const ParamDict& pd) { + int ret = DeconvolutionDepthWise::load_param(pd); + if (ret) + return ret; + if (dynamic_weight) { support_vulkan = false; support_image_storage = false; - return 0; } + return 0; +} + +int DeconvolutionDepthWise_vulkan::create_pipeline(const Option& _opt) +{ Option opt = _opt; const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0]; const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0]; diff --git a/src/layer/vulkan/deconvolutiondepthwise_vulkan.h b/src/layer/vulkan/deconvolutiondepthwise_vulkan.h index 6ea7931e32a..5346de8e628 100644 --- a/src/layer/vulkan/deconvolutiondepthwise_vulkan.h +++ b/src/layer/vulkan/deconvolutiondepthwise_vulkan.h @@ -24,6 +24,8 @@ class DeconvolutionDepthWise_vulkan : public DeconvolutionDepthWise public: DeconvolutionDepthWise_vulkan(); + virtual int load_param(const ParamDict& pd); + virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); diff --git a/src/net.cpp b/src/net.cpp index a7198d0a16e..0d3c1a1272b 100644 --- a/src/net.cpp +++ b/src/net.cpp @@ -1377,9 +1377,15 @@ int Net::load_param(const DataReader& dr) SCAN_VALUE("%d", top_count) Layer* layer = create_overwrite_builtin_layer(layer_type); +#if NCNN_VULKAN + if (!layer && opt.use_vulkan_compute && d->vkdev) + { + layer = create_layer_vulkan(layer_type); + } +#endif // NCNN_VULKAN if (!layer) { - layer = create_layer(layer_type); + layer = create_layer_cpu(layer_type); } if (!layer) { @@ -1402,7 +1408,6 @@ int Net::load_param(const DataReader& dr) // NCNN_LOGE("new layer %d %s", i, layer_name); layer->bottoms.resize(bottom_count); - for (int j = 0; j < bottom_count; j++) { char bottom_name[256]; @@ -1446,20 +1451,16 @@ int Net::load_param(const DataReader& dr) blob_index++; } + int layer_support_vulkan = layer->support_vulkan; + // layer specific params int pdlr = pd.load_param(dr); if (pdlr != 0) { - NCNN_LOGE("ParamDict load_param %d %s failed", i, layer->name.c_str()); + NCNN_LOGE("ParamDict load_param %d %s failed", i, layer_name); continue; } - if (layer->support_int8_storage) - { - // no int8 gpu support yet - opt.use_vulkan_compute = false; - } - // pull out top shape hints Mat shape_hints = pd.get(30, Mat()); if (!shape_hints.empty()) @@ -1503,13 +1504,70 @@ int Net::load_param(const DataReader& dr) // pull out layer specific feature disabled set layer->featmask = pd.get(31, 0); + if (layer->support_int8_storage) + { + // no int8 gpu support yet + opt.use_vulkan_compute = false; + } + + Option opt1 = 
get_masked_option(opt, layer->featmask); +#if NCNN_VULKAN + if (opt1.use_vulkan_compute) + { + if (!layer->support_image_storage) opt1.use_image_storage = false; + } + else + { + layer->vkdev = 0; + layer->support_vulkan = false; + } +#endif // NCNN_VULKAN + int lr = layer->load_param(pd); if (lr != 0) { - NCNN_LOGE("layer load_param %d %s failed", i, layer->name.c_str()); + NCNN_LOGE("layer load_param %d %s failed", i, layer_name); continue; } + if (layer_support_vulkan && (!layer->support_vulkan || !opt1.use_vulkan_compute)) + { + // vulkan layer cannot handle these params, recreate cpu layer + Layer* layer_cpu = create_overwrite_builtin_layer(layer_type); + if (!layer_cpu) + { + layer_cpu = create_layer_cpu(layer_type); + } + if (!layer_cpu) + { + layer_cpu = create_custom_layer(layer_type); + } + if (!layer_cpu) + { + NCNN_LOGE("layer %s not exists or registered", layer_type); + clear(); + return -1; + } + + layer_cpu->type = layer->type; + layer_cpu->name = layer->name; + layer_cpu->bottoms = layer->bottoms; + layer_cpu->tops = layer->tops; + layer_cpu->bottom_shapes = layer->bottom_shapes; + layer_cpu->top_shapes = layer->top_shapes; + layer_cpu->featmask = layer->featmask; + + int lr = layer_cpu->load_param(pd); + if (lr != 0) + { + NCNN_LOGE("layer load_param %d %s failed", i, layer_name); + continue; + } + + delete layer; + layer = layer_cpu; + } + d->layers[i] = layer; } @@ -1611,9 +1669,15 @@ int Net::load_param_bin(const DataReader& dr) READ_VALUE(top_count) Layer* layer = create_overwrite_builtin_layer(typeindex); +#if NCNN_VULKAN + if (!layer && opt.use_vulkan_compute && d->vkdev) + { + layer = create_layer_vulkan(typeindex); + } +#endif // NCNN_VULKAN if (!layer) { - layer = create_layer(typeindex); + layer = create_layer_cpu(typeindex); } if (!layer) { @@ -1665,24 +1729,16 @@ int Net::load_param_bin(const DataReader& dr) layer->tops[j] = top_blob_index; } + int layer_support_vulkan = layer->support_vulkan; + // layer specific params int pdlr = pd.load_param_bin(dr); if (pdlr != 0) { -#if NCNN_STRING - NCNN_LOGE("ParamDict load_param %d %s failed", i, layer->name.c_str()); -#else - NCNN_LOGE("ParamDict load_param %d failed", i); -#endif + NCNN_LOGE("ParamDict load_param_bin %d failed", i); continue; } - if (layer->support_int8_storage) - { - // no int8 gpu support yet - opt.use_vulkan_compute = false; - } - // pull out top blob shape hints Mat shape_hints = pd.get(30, Mat()); if (!shape_hints.empty()) @@ -1729,14 +1785,68 @@ int Net::load_param_bin(const DataReader& dr) int lr = layer->load_param(pd); if (lr != 0) { -#if NCNN_STRING - NCNN_LOGE("layer load_param %d %s failed", i, layer->name.c_str()); -#else NCNN_LOGE("layer load_param %d failed", i); -#endif continue; } + if (layer->support_int8_storage) + { + // no int8 gpu support yet + opt.use_vulkan_compute = false; + } + + Option opt1 = get_masked_option(opt, layer->featmask); +#if NCNN_VULKAN + if (opt1.use_vulkan_compute) + { + if (!layer->support_image_storage) opt1.use_image_storage = false; + } + else + { + layer->vkdev = 0; + layer->support_vulkan = false; + } +#endif // NCNN_VULKAN + + if (layer_support_vulkan && (!layer->support_vulkan || !opt1.use_vulkan_compute)) + { + // vulkan layer cannot handle these params, recreate cpu layer + Layer* layer_cpu = create_overwrite_builtin_layer(typeindex); + if (!layer_cpu) + { + layer_cpu = create_layer_cpu(typeindex); + } + if (!layer_cpu) + { + int custom_index = typeindex & ~LayerType::CustomBit; + layer_cpu = create_custom_layer(custom_index); + } + if (!layer_cpu) + {
+ NCNN_LOGE("layer %d not exists or registered", typeindex); + clear(); + return -1; + } + + layer_cpu->type = layer->type; + layer_cpu->name = layer->name; + layer_cpu->bottoms = layer->bottoms; + layer_cpu->tops = layer->tops; + layer_cpu->bottom_shapes = layer->bottom_shapes; + layer_cpu->top_shapes = layer->top_shapes; + layer_cpu->featmask = layer->featmask; + + int lr = layer_cpu->load_param(pd); + if (lr != 0) + { + NCNN_LOGE("layer load_param %d failed", i); + continue; + } + + delete layer; + layer = layer_cpu; + } + d->layers[i] = layer; } @@ -1796,24 +1906,7 @@ int Net::load_model(const DataReader& dr) break; } - if (layer->support_int8_storage) - { - // no int8 gpu support yet - opt.use_vulkan_compute = false; - } - Option opt1 = get_masked_option(opt, layer->featmask); -#if NCNN_VULKAN - if (opt1.use_vulkan_compute) - { - if (!layer->support_image_storage) opt1.use_image_storage = false; - } - else - { - layer->vkdev = 0; - layer->support_vulkan = false; - } -#endif // NCNN_VULKAN int cret = layer->create_pipeline(opt1); if (cret != 0) diff --git a/tests/testutil.cpp b/tests/testutil.cpp index b5f2fd34148..1bcc6ff5237 100644 --- a/tests/testutil.cpp +++ b/tests/testutil.cpp @@ -630,6 +630,8 @@ int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vectorload_param(pd); + if (!op->support_vulkan) { delete op; @@ -651,8 +653,6 @@ int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vectortop_shapes = top_shapes; } - op->load_param(pd); - if (op->one_blob_only && a.size() != 1) { fprintf(stderr, "layer with one_blob_only but consume multiple inputs\n"); @@ -1130,6 +1130,8 @@ int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vectorload_param(pd); + if (!op->support_vulkan) { delete op; @@ -1153,8 +1155,6 @@ int test_layer_gpu(int typeindex, const ncnn::ParamDict& pd, const std::vectortop_shapes[0] = top_shape; } - op->load_param(pd); - ncnn::ModelBinFromMatArray mb(weights.data()); op->load_model(mb); From 90a4ec464264cb3350dab59f6949a8e3e8de293b Mon Sep 17 00:00:00 2001 From: nihuini Date: Thu, 4 Jan 2024 16:21:08 +0800 Subject: [PATCH 12/19] fix build --- src/net.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/net.cpp b/src/net.cpp index 0d3c1a1272b..9738e55bc02 100644 --- a/src/net.cpp +++ b/src/net.cpp @@ -1828,8 +1828,6 @@ int Net::load_param_bin(const DataReader& dr) return -1; } - layer_cpu->type = layer->type; - layer_cpu->name = layer->name; layer_cpu->bottoms = layer->bottoms; layer_cpu->tops = layer->tops; layer_cpu->bottom_shapes = layer->bottom_shapes; From 661e3471ff5b6607d33dfa496ef696e463f47edb Mon Sep 17 00:00:00 2001 From: nihuini Date: Thu, 4 Jan 2024 17:33:11 +0800 Subject: [PATCH 13/19] wip --- src/layer/arm/convolution1d_arm.cpp | 4 ++ src/layer/arm/convolution1d_arm_asimdhp.cpp | 2 + src/layer/arm/convolution_arm.cpp | 37 +++++-------------- src/layer/arm/convolution_arm_asimdhp.cpp | 15 ++------ src/layer/arm/convolutiondepthwise_arm.cpp | 25 +++---------- .../arm/convolutiondepthwise_arm_asimdhp.cpp | 10 +---- src/layer/arm/deconvolution_arm.cpp | 10 +---- src/layer/arm/deconvolution_arm_asimdhp.cpp | 5 +-- src/layer/arm/deconvolutiondepthwise_arm.cpp | 10 +---- .../deconvolutiondepthwise_arm_asimdhp.cpp | 5 +-- src/layer/arm/gemm_arm.cpp | 30 +++------------ src/layer/arm/gemm_arm_asimdhp.cpp | 15 ++------ src/layer/arm/gemm_arm_vfpv4.cpp | 15 ++------ src/layer/arm/gru_arm.cpp | 8 ++++ src/layer/arm/gru_arm_asimdhp.cpp | 4 ++ src/layer/arm/innerproduct_arm.cpp | 15 ++------ 
src/layer/arm/innerproduct_arm_vfpv4.cpp | 5 +-- src/layer/arm/lstm_arm.cpp | 18 +++------ src/layer/arm/lstm_arm_asimdhp.cpp | 9 ++--- src/layer/arm/multiheadattention_arm.cpp | 28 ++++---------- src/layer/arm/rnn_arm.cpp | 8 ++++ src/layer/arm/rnn_arm_asimdhp.cpp | 4 ++ .../loongarch/convolution1d_loongarch.cpp | 2 + src/layer/loongarch/convolution_loongarch.cpp | 10 +---- .../convolutiondepthwise_loongarch.cpp | 17 +++------ .../loongarch/deconvolution_loongarch.cpp | 5 +-- .../deconvolutiondepthwise_loongarch.cpp | 7 ++-- .../loongarch/innerproduct_loongarch.cpp | 15 ++------ src/layer/mips/convolution1d_mips.cpp | 2 + src/layer/mips/convolution_mips.cpp | 10 +---- src/layer/mips/convolutiondepthwise_mips.cpp | 17 +++------ src/layer/mips/deconvolution_mips.cpp | 5 +-- .../mips/deconvolutiondepthwise_mips.cpp | 7 ++-- src/layer/mips/innerproduct_mips.cpp | 15 ++------ src/layer/riscv/convolution1d_riscv.cpp | 4 ++ src/layer/riscv/convolution_riscv.cpp | 10 +---- .../riscv/convolutiondepthwise_riscv.cpp | 20 ++-------- src/layer/riscv/deconvolution_riscv.cpp | 10 +---- .../riscv/deconvolutiondepthwise_riscv.cpp | 20 ++-------- src/layer/riscv/gemm_riscv.cpp | 15 ++------ src/layer/riscv/gru_riscv.cpp | 4 ++ src/layer/riscv/innerproduct_riscv.cpp | 10 +---- src/layer/vulkan/convolution1d_vulkan.cpp | 3 ++ src/layer/vulkan/convolution_vulkan.cpp | 3 ++ .../vulkan/convolutiondepthwise_vulkan.cpp | 3 ++ src/layer/vulkan/deconvolution_vulkan.cpp | 3 ++ .../vulkan/deconvolutiondepthwise_vulkan.cpp | 3 ++ src/layer/vulkan/gemm_vulkan.cpp | 4 ++ src/layer/vulkan/innerproduct_vulkan.cpp | 3 ++ .../vulkan/multiheadattention_vulkan.cpp | 12 ++++++ src/layer/x86/convolution1d_x86.cpp | 2 + src/layer/x86/convolution_x86.cpp | 20 ++-------- src/layer/x86/convolutiondepthwise_x86.cpp | 17 +++------ src/layer/x86/deconvolution_x86.cpp | 5 +-- src/layer/x86/deconvolutiondepthwise_x86.cpp | 7 ++-- src/layer/x86/deformableconv2d_x86.cpp | 5 +-- src/layer/x86/gemm_x86.cpp | 15 ++------ src/layer/x86/innerproduct_x86.cpp | 15 ++------ src/layer/x86/lstm_x86.cpp | 9 ++--- src/layer/x86/multiheadattention_x86.cpp | 28 ++++---------- 60 files changed, 212 insertions(+), 432 deletions(-) diff --git a/src/layer/arm/convolution1d_arm.cpp b/src/layer/arm/convolution1d_arm.cpp index 1bfb375e188..26389279b18 100644 --- a/src/layer/arm/convolution1d_arm.cpp +++ b/src/layer/arm/convolution1d_arm.cpp @@ -68,6 +68,8 @@ int Convolution1D_arm::create_pipeline(const Option& opt) convolution1d_transform_kernel_packed(weight_data, weight_data_tm, num_input, num_output, kernel_w); + weight_data.release(); + return 0; } @@ -237,6 +239,8 @@ int Convolution1D_arm::create_pipeline_bf16s(const Option& /*opt*/) convolution1d_transform_kernel_packed_bf16s(weight_data, weight_data_tm, num_input, num_output, kernel_w); + weight_data.release(); + return 0; } diff --git a/src/layer/arm/convolution1d_arm_asimdhp.cpp b/src/layer/arm/convolution1d_arm_asimdhp.cpp index bbbd5883027..2e194eabf21 100644 --- a/src/layer/arm/convolution1d_arm_asimdhp.cpp +++ b/src/layer/arm/convolution1d_arm_asimdhp.cpp @@ -36,6 +36,8 @@ int Convolution1D_arm::create_pipeline_fp16s(const Option& opt) ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt); + weight_data.release(); + return 0; } diff --git a/src/layer/arm/convolution_arm.cpp b/src/layer/arm/convolution_arm.cpp index cfcd28c09f4..f7f04619e9e 100644 --- a/src/layer/arm/convolution_arm.cpp +++ b/src/layer/arm/convolution_arm.cpp @@ -194,6 +194,8 @@ int Convolution_arm::create_pipeline(const 
Option& opt) convolution_dilation1->create_pipeline(opt); + weight_data.release(); + return 0; } @@ -222,10 +224,7 @@ int Convolution_arm::create_pipeline(const Option& opt) else conv3x3s1_winograd23_transform_kernel(weight_data, weight_winograd23_data, num_input, num_output, opt); - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -271,10 +270,7 @@ int Convolution_arm::create_pipeline(const Option& opt) { convolution_im2col_gemm_transform_kernel(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h, opt); - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -309,10 +305,7 @@ int Convolution_arm::create_pipeline(const Option& opt) convolution_transform_kernel_packed(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -911,10 +904,7 @@ int Convolution_arm::create_pipeline_bf16s(const Option& opt) else conv3x3s1_winograd23_transform_kernel(weight_data, weight_winograd23_data, num_input, num_output, opt); - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -960,10 +950,7 @@ int Convolution_arm::create_pipeline_bf16s(const Option& opt) { convolution_im2col_gemm_transform_kernel_bf16s(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h, opt); - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -984,10 +971,7 @@ int Convolution_arm::create_pipeline_bf16s(const Option& opt) convolution_transform_kernel_packed_bf16s(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -1300,10 +1284,7 @@ int Convolution_arm::create_pipeline_int8_arm(const Option& opt) scale_in_data[p] = scale_in; } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } diff --git a/src/layer/arm/convolution_arm_asimdhp.cpp b/src/layer/arm/convolution_arm_asimdhp.cpp index 6480aa2e78a..51ec51675a8 100644 --- a/src/layer/arm/convolution_arm_asimdhp.cpp +++ b/src/layer/arm/convolution_arm_asimdhp.cpp @@ -108,10 +108,7 @@ int Convolution_arm::create_pipeline_fp16s(const Option& opt) else conv3x3s1_winograd23_transform_kernel_fp16sa(weight_data, weight_winograd23_data, num_input, num_output, opt); - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); if (opt.use_fp16_arithmetic) { @@ -192,10 +189,7 @@ int Convolution_arm::create_pipeline_fp16s(const Option& opt) ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt); - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -225,10 +219,7 @@ int Convolution_arm::create_pipeline_fp16s(const Option& opt) ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt); } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } diff --git a/src/layer/arm/convolutiondepthwise_arm.cpp b/src/layer/arm/convolutiondepthwise_arm.cpp index 2ae661650e7..f9f4a1fdc2d 100644 --- a/src/layer/arm/convolutiondepthwise_arm.cpp +++ b/src/layer/arm/convolutiondepthwise_arm.cpp @@ -119,10 +119,7 @@ int ConvolutionDepthWise_arm::create_pipeline(const Option& opt) ncnn::cast_float32_to_bfloat16(weight_data, weight_data_tm, opt); } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -164,10 +161,7 @@ 
int ConvolutionDepthWise_arm::create_pipeline(const Option& opt) } } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -175,10 +169,7 @@ int ConvolutionDepthWise_arm::create_pipeline(const Option& opt) // group convolution create_group_ops(opt); - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -1031,10 +1022,7 @@ int ConvolutionDepthWise_arm::create_pipeline_int8_arm(const Option& opt) weight_data_tm = weight_data; } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -1042,10 +1030,7 @@ int ConvolutionDepthWise_arm::create_pipeline_int8_arm(const Option& opt) // group convolution create_group_ops(opt); - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } diff --git a/src/layer/arm/convolutiondepthwise_arm_asimdhp.cpp b/src/layer/arm/convolutiondepthwise_arm_asimdhp.cpp index f7d2cfee84c..1d5f2782cc1 100644 --- a/src/layer/arm/convolutiondepthwise_arm_asimdhp.cpp +++ b/src/layer/arm/convolutiondepthwise_arm_asimdhp.cpp @@ -76,10 +76,7 @@ int ConvolutionDepthWise_arm::create_pipeline_fp16s(const Option& opt) ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt); - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -87,10 +84,7 @@ int ConvolutionDepthWise_arm::create_pipeline_fp16s(const Option& opt) // group convolution create_group_ops(opt); - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } diff --git a/src/layer/arm/deconvolution_arm.cpp b/src/layer/arm/deconvolution_arm.cpp index 9061d59eafe..24c825ae266 100644 --- a/src/layer/arm/deconvolution_arm.cpp +++ b/src/layer/arm/deconvolution_arm.cpp @@ -211,10 +211,7 @@ int Deconvolution_arm::create_pipeline(const Option& opt) } } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -957,10 +954,7 @@ int Deconvolution_arm::create_pipeline_bf16s(const Option& opt) } } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } diff --git a/src/layer/arm/deconvolution_arm_asimdhp.cpp b/src/layer/arm/deconvolution_arm_asimdhp.cpp index a12614b4d97..b5498d815f3 100644 --- a/src/layer/arm/deconvolution_arm_asimdhp.cpp +++ b/src/layer/arm/deconvolution_arm_asimdhp.cpp @@ -154,10 +154,7 @@ int Deconvolution_arm::create_pipeline_fp16s(const Option& opt) ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt); - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } diff --git a/src/layer/arm/deconvolutiondepthwise_arm.cpp b/src/layer/arm/deconvolutiondepthwise_arm.cpp index 6edd735cb97..4eac426d9de 100644 --- a/src/layer/arm/deconvolutiondepthwise_arm.cpp +++ b/src/layer/arm/deconvolutiondepthwise_arm.cpp @@ -104,10 +104,7 @@ int DeconvolutionDepthWise_arm::create_pipeline(const Option& opt) ncnn::cast_float32_to_bfloat16(weight_data_transposed, weight_data_tm, opt); } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -193,10 +190,7 @@ int DeconvolutionDepthWise_arm::create_pipeline(const Option& opt) } } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } diff --git a/src/layer/arm/deconvolutiondepthwise_arm_asimdhp.cpp b/src/layer/arm/deconvolutiondepthwise_arm_asimdhp.cpp index e6c636525a4..5fa42d07490 100644 --- a/src/layer/arm/deconvolutiondepthwise_arm_asimdhp.cpp +++ 
b/src/layer/arm/deconvolutiondepthwise_arm_asimdhp.cpp @@ -145,10 +145,7 @@ int DeconvolutionDepthWise_arm::create_pipeline_fp16s(const Option& opt) } } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } diff --git a/src/layer/arm/gemm_arm.cpp b/src/layer/arm/gemm_arm.cpp index 2d4ff8734f8..3463550d3d4 100644 --- a/src/layer/arm/gemm_arm.cpp +++ b/src/layer/arm/gemm_arm.cpp @@ -4201,10 +4201,7 @@ int Gemm_arm::create_pipeline(const Option& opt) } } - if (opt.lightmode) - { - A_data.release(); - } + A_data.release(); } if (constantB) @@ -4244,10 +4241,7 @@ int Gemm_arm::create_pipeline(const Option& opt) } } - if (opt.lightmode) - { - B_data.release(); - } + B_data.release(); } if (constantC && constant_broadcast_type_C != -1) @@ -4277,10 +4271,7 @@ int Gemm_arm::create_pipeline(const Option& opt) CT_data = C2; } - if (opt.lightmode) - { - C_data.release(); - } + C_data.release(); } if (constantA || constantB || constantC) @@ -4898,10 +4889,7 @@ int Gemm_arm::create_pipeline_bf16s(const Option& opt) } } - if (opt.lightmode) - { - A_data.release(); - } + A_data.release(); } if (constantB) @@ -4941,10 +4929,7 @@ int Gemm_arm::create_pipeline_bf16s(const Option& opt) } } - if (opt.lightmode) - { - B_data.release(); - } + B_data.release(); } if (constantC && constant_broadcast_type_C != -1) @@ -4974,10 +4959,7 @@ int Gemm_arm::create_pipeline_bf16s(const Option& opt) CT_data = C2; } - if (opt.lightmode) - { - C_data.release(); - } + C_data.release(); } if (constantA || constantB || constantC) diff --git a/src/layer/arm/gemm_arm_asimdhp.cpp b/src/layer/arm/gemm_arm_asimdhp.cpp index ff840df3b50..cfe6ce8ce60 100644 --- a/src/layer/arm/gemm_arm_asimdhp.cpp +++ b/src/layer/arm/gemm_arm_asimdhp.cpp @@ -2736,10 +2736,7 @@ int Gemm_arm::create_pipeline_fp16sa(const Option& opt) } } - if (opt.lightmode) - { - A_data.release(); - } + A_data.release(); } if (constantB) @@ -2779,10 +2776,7 @@ int Gemm_arm::create_pipeline_fp16sa(const Option& opt) } } - if (opt.lightmode) - { - B_data.release(); - } + B_data.release(); } if (constantC && constant_broadcast_type_C != -1) @@ -2808,10 +2802,7 @@ int Gemm_arm::create_pipeline_fp16sa(const Option& opt) } } - if (opt.lightmode) - { - C_data.release(); - } + C_data.release(); } if (constantA || constantB || constantC) diff --git a/src/layer/arm/gemm_arm_vfpv4.cpp b/src/layer/arm/gemm_arm_vfpv4.cpp index 3d29af41860..5792e47e980 100644 --- a/src/layer/arm/gemm_arm_vfpv4.cpp +++ b/src/layer/arm/gemm_arm_vfpv4.cpp @@ -427,10 +427,7 @@ int Gemm_arm::create_pipeline_fp16s(const Option& opt) } } - if (opt.lightmode) - { - A_data.release(); - } + A_data.release(); } if (constantB) @@ -470,10 +467,7 @@ int Gemm_arm::create_pipeline_fp16s(const Option& opt) } } - if (opt.lightmode) - { - B_data.release(); - } + B_data.release(); } if (constantC && constant_broadcast_type_C != -1) @@ -503,10 +497,7 @@ int Gemm_arm::create_pipeline_fp16s(const Option& opt) CT_data = C2; } - if (opt.lightmode) - { - C_data.release(); - } + C_data.release(); } if (constantA || constantB || constantC) diff --git a/src/layer/arm/gru_arm.cpp b/src/layer/arm/gru_arm.cpp index 70df351a555..58df8275ad5 100644 --- a/src/layer/arm/gru_arm.cpp +++ b/src/layer/arm/gru_arm.cpp @@ -250,6 +250,10 @@ int GRU_arm::create_pipeline(const Option& opt) } } + weight_xc_data.release(); + bias_c_data.release(); + weight_hc_data.release(); + return 0; } @@ -1368,6 +1372,10 @@ int GRU_arm::create_pipeline_bf16s(const Option& opt) } } + weight_xc_data.release(); + 
bias_c_data.release(); + weight_hc_data.release(); + return 0; } diff --git a/src/layer/arm/gru_arm_asimdhp.cpp b/src/layer/arm/gru_arm_asimdhp.cpp index c38458176af..fcdce2d8e18 100644 --- a/src/layer/arm/gru_arm_asimdhp.cpp +++ b/src/layer/arm/gru_arm_asimdhp.cpp @@ -914,6 +914,10 @@ int GRU_arm::create_pipeline_fp16s(const Option& opt) } } + weight_xc_data.release(); + bias_c_data.release(); + weight_hc_data.release(); + return 0; } diff --git a/src/layer/arm/innerproduct_arm.cpp b/src/layer/arm/innerproduct_arm.cpp index 31689008e79..0cbc78525eb 100644 --- a/src/layer/arm/innerproduct_arm.cpp +++ b/src/layer/arm/innerproduct_arm.cpp @@ -122,10 +122,7 @@ int InnerProduct_arm::create_pipeline(const Option& opt) weight_data_tm = weight_data; } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -868,10 +865,7 @@ int InnerProduct_arm::create_pipeline_bf16s(const Option& opt) } } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -1264,10 +1258,7 @@ int InnerProduct_arm::create_pipeline_int8_arm(const Option& opt) scale_in_data[p] = scale_in; } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } diff --git a/src/layer/arm/innerproduct_arm_vfpv4.cpp b/src/layer/arm/innerproduct_arm_vfpv4.cpp index 435fb883e50..6a6eab84fba 100644 --- a/src/layer/arm/innerproduct_arm_vfpv4.cpp +++ b/src/layer/arm/innerproduct_arm_vfpv4.cpp @@ -41,10 +41,7 @@ int InnerProduct_arm::create_pipeline_fp16s(const Option& opt) } #endif - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } diff --git a/src/layer/arm/lstm_arm.cpp b/src/layer/arm/lstm_arm.cpp index 04d7277547e..b8d5afe93dc 100644 --- a/src/layer/arm/lstm_arm.cpp +++ b/src/layer/arm/lstm_arm.cpp @@ -124,12 +124,9 @@ int LSTM_arm::create_pipeline(const Option& opt) } } - if (opt.lightmode) - { - weight_xc_data.release(); - bias_c_data.release(); - weight_hc_data.release(); - } + weight_xc_data.release(); + bias_c_data.release(); + weight_hc_data.release(); return 0; } @@ -931,12 +928,9 @@ int LSTM_arm::create_pipeline_bf16s(const Option& opt) } } - if (opt.lightmode) - { - weight_xc_data.release(); - bias_c_data.release(); - weight_hc_data.release(); - } + weight_xc_data.release(); + bias_c_data.release(); + weight_hc_data.release(); return 0; } diff --git a/src/layer/arm/lstm_arm_asimdhp.cpp b/src/layer/arm/lstm_arm_asimdhp.cpp index 1d3fc71cdfc..593af33ccd4 100644 --- a/src/layer/arm/lstm_arm_asimdhp.cpp +++ b/src/layer/arm/lstm_arm_asimdhp.cpp @@ -835,12 +835,9 @@ int LSTM_arm::create_pipeline_fp16s(const Option& opt) } } - if (opt.lightmode) - { - weight_xc_data.release(); - bias_c_data.release(); - weight_hc_data.release(); - } + weight_xc_data.release(); + bias_c_data.release(); + weight_hc_data.release(); return 0; } diff --git a/src/layer/arm/multiheadattention_arm.cpp b/src/layer/arm/multiheadattention_arm.cpp index 37323a2255f..b3f3d7aa8e7 100644 --- a/src/layer/arm/multiheadattention_arm.cpp +++ b/src/layer/arm/multiheadattention_arm.cpp @@ -84,11 +84,8 @@ int MultiHeadAttention_arm::create_pipeline(const Option& _opt) q_gemm->load_model(ModelBinFromMatArray(weights)); q_gemm->create_pipeline(opt); - if (opt.lightmode) - { - q_weight_data.release(); - q_bias_data.release(); - } + q_weight_data.release(); + q_bias_data.release(); } { @@ -113,11 +110,8 @@ int MultiHeadAttention_arm::create_pipeline(const Option& _opt) k_gemm->load_model(ModelBinFromMatArray(weights)); 
k_gemm->create_pipeline(opt); - if (opt.lightmode) - { - k_weight_data.release(); - k_bias_data.release(); - } + k_weight_data.release(); + k_bias_data.release(); } { @@ -142,11 +136,8 @@ int MultiHeadAttention_arm::create_pipeline(const Option& _opt) v_gemm->load_model(ModelBinFromMatArray(weights)); v_gemm->create_pipeline(opt); - if (opt.lightmode) - { - v_weight_data.release(); - v_bias_data.release(); - } + v_weight_data.release(); + v_bias_data.release(); } { @@ -169,11 +160,8 @@ int MultiHeadAttention_arm::create_pipeline(const Option& _opt) o_gemm->load_model(ModelBinFromMatArray(weights)); o_gemm->create_pipeline(opt); - if (opt.lightmode) - { - out_weight_data.release(); - out_bias_data.release(); - } + out_weight_data.release(); + out_bias_data.release(); } { diff --git a/src/layer/arm/rnn_arm.cpp b/src/layer/arm/rnn_arm.cpp index 19f439ea2d5..15b9f0b8a0d 100644 --- a/src/layer/arm/rnn_arm.cpp +++ b/src/layer/arm/rnn_arm.cpp @@ -139,6 +139,10 @@ int RNN_arm::create_pipeline(const Option& opt) bias_c_data_packed = bias_c_data; + weight_xc_data.release(); + bias_c_data.release(); + weight_hc_data.release(); + return 0; } @@ -732,6 +736,10 @@ int RNN_arm::create_pipeline_bf16s(const Option& opt) cast_float32_to_bfloat16(bias_c_data, bias_c_data_packed, opt); + weight_xc_data.release(); + bias_c_data.release(); + weight_hc_data.release(); + return 0; } diff --git a/src/layer/arm/rnn_arm_asimdhp.cpp b/src/layer/arm/rnn_arm_asimdhp.cpp index c34b3e8bb48..467dba614f8 100644 --- a/src/layer/arm/rnn_arm_asimdhp.cpp +++ b/src/layer/arm/rnn_arm_asimdhp.cpp @@ -517,6 +517,10 @@ int RNN_arm::create_pipeline_fp16s(const Option& opt) cast_float32_to_float16(bias_c_data, bias_c_data_packed, opt); + weight_xc_data.release(); + bias_c_data.release(); + weight_hc_data.release(); + return 0; } diff --git a/src/layer/loongarch/convolution1d_loongarch.cpp b/src/layer/loongarch/convolution1d_loongarch.cpp index 0917a79f62e..1f804861e7a 100644 --- a/src/layer/loongarch/convolution1d_loongarch.cpp +++ b/src/layer/loongarch/convolution1d_loongarch.cpp @@ -78,6 +78,8 @@ int Convolution1D_loongarch::create_pipeline(const Option& opt) } } + weight_data.release(); + return 0; } diff --git a/src/layer/loongarch/convolution_loongarch.cpp b/src/layer/loongarch/convolution_loongarch.cpp index c38254c0035..3c5d0c1a424 100644 --- a/src/layer/loongarch/convolution_loongarch.cpp +++ b/src/layer/loongarch/convolution_loongarch.cpp @@ -225,10 +225,7 @@ int Convolution_loongarch::create_pipeline(const Option& opt) } } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -792,10 +789,7 @@ int Convolution_loongarch::create_pipeline_int8_loongarch(const Option& opt) scale_in_data[p] = scale_in; } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } diff --git a/src/layer/loongarch/convolutiondepthwise_loongarch.cpp b/src/layer/loongarch/convolutiondepthwise_loongarch.cpp index 2546c19bfd0..0c5050dbce0 100644 --- a/src/layer/loongarch/convolutiondepthwise_loongarch.cpp +++ b/src/layer/loongarch/convolutiondepthwise_loongarch.cpp @@ -83,10 +83,7 @@ int ConvolutionDepthWise_loongarch::create_pipeline(const Option& opt) weight_data_tm = weight_data; } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -94,10 +91,7 @@ int ConvolutionDepthWise_loongarch::create_pipeline(const Option& opt) // group convolution create_group_ops(opt); - if (opt.lightmode) - { - weight_data.release(); - } + 
weight_data.release(); return 0; } @@ -606,16 +600,15 @@ int ConvolutionDepthWise_loongarch::create_pipeline_int8_loongarch(const Option& weight_data_tm = weight_data; } + weight_data.release(); + return 0; } // group convolution create_group_ops(opt); - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } diff --git a/src/layer/loongarch/deconvolution_loongarch.cpp b/src/layer/loongarch/deconvolution_loongarch.cpp index cdf7c0a2638..62b9d872b60 100644 --- a/src/layer/loongarch/deconvolution_loongarch.cpp +++ b/src/layer/loongarch/deconvolution_loongarch.cpp @@ -126,10 +126,7 @@ int Deconvolution_loongarch::create_pipeline(const Option& opt) { } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } diff --git a/src/layer/loongarch/deconvolutiondepthwise_loongarch.cpp b/src/layer/loongarch/deconvolutiondepthwise_loongarch.cpp index cc9d24a506c..9495a99aae0 100644 --- a/src/layer/loongarch/deconvolutiondepthwise_loongarch.cpp +++ b/src/layer/loongarch/deconvolutiondepthwise_loongarch.cpp @@ -82,16 +82,15 @@ int DeconvolutionDepthWise_loongarch::create_pipeline(const Option& opt) weight_data_tm = weight_data_transposed; } + weight_data.release(); + return 0; } // group convolution create_group_ops(opt); - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } diff --git a/src/layer/loongarch/innerproduct_loongarch.cpp b/src/layer/loongarch/innerproduct_loongarch.cpp index b17d3f830c2..e6b8eb0936b 100644 --- a/src/layer/loongarch/innerproduct_loongarch.cpp +++ b/src/layer/loongarch/innerproduct_loongarch.cpp @@ -99,10 +99,7 @@ int InnerProduct_loongarch::create_pipeline(const Option& opt) weight_data_tm = weight_data; } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -655,10 +652,7 @@ int InnerProduct_loongarch::create_pipeline_fp16s(const Option& opt) ncnn::cast_float32_to_float16(weight_data_r2, weight_data_tm, opt); } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -1146,10 +1140,7 @@ int InnerProduct_loongarch::create_pipeline_int8_loongarch(const Option& opt) scale_in_data[p] = scale_in; } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } diff --git a/src/layer/mips/convolution1d_mips.cpp b/src/layer/mips/convolution1d_mips.cpp index e9cf211e49b..5db88c1935f 100644 --- a/src/layer/mips/convolution1d_mips.cpp +++ b/src/layer/mips/convolution1d_mips.cpp @@ -78,6 +78,8 @@ int Convolution1D_mips::create_pipeline(const Option& opt) } } + weight_data.release(); + return 0; } diff --git a/src/layer/mips/convolution_mips.cpp b/src/layer/mips/convolution_mips.cpp index 8f566f43a6c..af420e61a9a 100644 --- a/src/layer/mips/convolution_mips.cpp +++ b/src/layer/mips/convolution_mips.cpp @@ -225,10 +225,7 @@ int Convolution_mips::create_pipeline(const Option& opt) } } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -792,10 +789,7 @@ int Convolution_mips::create_pipeline_int8_mips(const Option& opt) scale_in_data[p] = scale_in; } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } diff --git a/src/layer/mips/convolutiondepthwise_mips.cpp b/src/layer/mips/convolutiondepthwise_mips.cpp index 17bb2e012e6..0c9bdca30ce 100644 --- a/src/layer/mips/convolutiondepthwise_mips.cpp +++ b/src/layer/mips/convolutiondepthwise_mips.cpp @@ -83,10 +83,7 @@ int 
ConvolutionDepthWise_mips::create_pipeline(const Option& opt) weight_data_tm = weight_data; } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -94,10 +91,7 @@ int ConvolutionDepthWise_mips::create_pipeline(const Option& opt) // group convolution create_group_ops(opt); - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -606,16 +600,15 @@ int ConvolutionDepthWise_mips::create_pipeline_int8_mips(const Option& opt) weight_data_tm = weight_data; } + weight_data.release(); + return 0; } // group convolution create_group_ops(opt); - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } diff --git a/src/layer/mips/deconvolution_mips.cpp b/src/layer/mips/deconvolution_mips.cpp index 607313614c0..208400f532e 100644 --- a/src/layer/mips/deconvolution_mips.cpp +++ b/src/layer/mips/deconvolution_mips.cpp @@ -126,10 +126,7 @@ int Deconvolution_mips::create_pipeline(const Option& opt) { } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } diff --git a/src/layer/mips/deconvolutiondepthwise_mips.cpp b/src/layer/mips/deconvolutiondepthwise_mips.cpp index 404335b0efe..e6f5dd43478 100644 --- a/src/layer/mips/deconvolutiondepthwise_mips.cpp +++ b/src/layer/mips/deconvolutiondepthwise_mips.cpp @@ -82,16 +82,15 @@ int DeconvolutionDepthWise_mips::create_pipeline(const Option& opt) weight_data_tm = weight_data_transposed; } + weight_data.release(); + return 0; } // group convolution create_group_ops(opt); - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } diff --git a/src/layer/mips/innerproduct_mips.cpp b/src/layer/mips/innerproduct_mips.cpp index 87a32b86cfe..9d926bfd08d 100644 --- a/src/layer/mips/innerproduct_mips.cpp +++ b/src/layer/mips/innerproduct_mips.cpp @@ -99,10 +99,7 @@ int InnerProduct_mips::create_pipeline(const Option& opt) weight_data_tm = weight_data; } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -655,10 +652,7 @@ int InnerProduct_mips::create_pipeline_fp16s(const Option& opt) ncnn::cast_float32_to_float16(weight_data_r2, weight_data_tm, opt); } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -1146,10 +1140,7 @@ int InnerProduct_mips::create_pipeline_int8_mips(const Option& opt) scale_in_data[p] = scale_in; } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } diff --git a/src/layer/riscv/convolution1d_riscv.cpp b/src/layer/riscv/convolution1d_riscv.cpp index ff02d6bc24f..fbf4190ca94 100644 --- a/src/layer/riscv/convolution1d_riscv.cpp +++ b/src/layer/riscv/convolution1d_riscv.cpp @@ -95,6 +95,8 @@ int Convolution1D_riscv::create_pipeline(const Option& opt) } } + weight_data.release(); + return 0; } @@ -470,6 +472,8 @@ int Convolution1D_riscv::create_pipeline_fp16s(const Option& opt) ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt); + weight_data.release(); + return 0; } diff --git a/src/layer/riscv/convolution_riscv.cpp b/src/layer/riscv/convolution_riscv.cpp index a4c73986bc4..be413e5be25 100644 --- a/src/layer/riscv/convolution_riscv.cpp +++ b/src/layer/riscv/convolution_riscv.cpp @@ -237,10 +237,7 @@ int Convolution_riscv::create_pipeline(const Option& opt) } } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -837,10 +834,7 @@ int Convolution_riscv::create_pipeline_fp16s(const Option& opt) 
ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt); } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } diff --git a/src/layer/riscv/convolutiondepthwise_riscv.cpp b/src/layer/riscv/convolutiondepthwise_riscv.cpp index 7d772d75ef9..d913fe7e1d5 100644 --- a/src/layer/riscv/convolutiondepthwise_riscv.cpp +++ b/src/layer/riscv/convolutiondepthwise_riscv.cpp @@ -104,10 +104,7 @@ int ConvolutionDepthWise_riscv::create_pipeline(const Option& opt) weight_data_tm = weight_data; } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -115,10 +112,7 @@ int ConvolutionDepthWise_riscv::create_pipeline(const Option& opt) // group convolution create_group_ops(opt); - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -688,10 +682,7 @@ int ConvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt) ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt); - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -699,10 +690,7 @@ int ConvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt) // group convolution create_group_ops(opt); - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } diff --git a/src/layer/riscv/deconvolution_riscv.cpp b/src/layer/riscv/deconvolution_riscv.cpp index 9483a2f8af3..6b395282908 100644 --- a/src/layer/riscv/deconvolution_riscv.cpp +++ b/src/layer/riscv/deconvolution_riscv.cpp @@ -148,10 +148,7 @@ int Deconvolution_riscv::create_pipeline(const Option& opt) { } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -533,10 +530,7 @@ int Deconvolution_riscv::create_pipeline_fp16s(const Option& opt) ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt); - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } diff --git a/src/layer/riscv/deconvolutiondepthwise_riscv.cpp b/src/layer/riscv/deconvolutiondepthwise_riscv.cpp index 9f127b1160c..7b567cf63e0 100644 --- a/src/layer/riscv/deconvolutiondepthwise_riscv.cpp +++ b/src/layer/riscv/deconvolutiondepthwise_riscv.cpp @@ -97,10 +97,7 @@ int DeconvolutionDepthWise_riscv::create_pipeline(const Option& opt) weight_data_tm = weight_data_transposed; } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -108,10 +105,7 @@ int DeconvolutionDepthWise_riscv::create_pipeline(const Option& opt) // group convolution create_group_ops(opt); - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -625,10 +619,7 @@ int DeconvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt) ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt); - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -636,10 +627,7 @@ int DeconvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt) // group convolution create_group_ops(opt); - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } diff --git a/src/layer/riscv/gemm_riscv.cpp b/src/layer/riscv/gemm_riscv.cpp index ec5a5cdac41..33f8913bd1b 100644 --- a/src/layer/riscv/gemm_riscv.cpp +++ b/src/layer/riscv/gemm_riscv.cpp @@ -4006,10 +4006,7 @@ int Gemm_riscv::create_pipeline(const Option& opt) } } - if (opt.lightmode) - { - A_data.release(); - } + A_data.release(); } if (constantB) @@ -4049,10 +4046,7 @@ int 
Gemm_riscv::create_pipeline(const Option& opt) } } - if (opt.lightmode) - { - B_data.release(); - } + B_data.release(); } if (constantC && constant_broadcast_type_C != -1) @@ -4082,10 +4076,7 @@ int Gemm_riscv::create_pipeline(const Option& opt) CT_data = C2; } - if (opt.lightmode) - { - C_data.release(); - } + C_data.release(); } if (constantA || constantB || constantC) diff --git a/src/layer/riscv/gru_riscv.cpp b/src/layer/riscv/gru_riscv.cpp index 28afa5081d0..c7e36c1c0fc 100644 --- a/src/layer/riscv/gru_riscv.cpp +++ b/src/layer/riscv/gru_riscv.cpp @@ -714,6 +714,10 @@ int GRU_riscv::create_pipeline_fp16sa(const Option& opt) cast_float32_to_float16(weight_hc_data, weight_hc_data_fp16sa, opt); cast_float32_to_float16(bias_c_data, bias_c_data_fp16sa, opt); + weight_xc_data.release(); + bias_c_data.release(); + weight_hc_data.release(); + return 0; } diff --git a/src/layer/riscv/innerproduct_riscv.cpp b/src/layer/riscv/innerproduct_riscv.cpp index 1f0d698a979..accfc683584 100644 --- a/src/layer/riscv/innerproduct_riscv.cpp +++ b/src/layer/riscv/innerproduct_riscv.cpp @@ -106,10 +106,7 @@ int InnerProduct_riscv::create_pipeline(const Option& opt) weight_data_tm = weight_data; } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -563,10 +560,7 @@ int InnerProduct_riscv::create_pipeline_fp16s(const Option& opt) ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt); - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } diff --git a/src/layer/vulkan/convolution1d_vulkan.cpp b/src/layer/vulkan/convolution1d_vulkan.cpp index 12162c5b7fc..fdef247bdf9 100644 --- a/src/layer/vulkan/convolution1d_vulkan.cpp +++ b/src/layer/vulkan/convolution1d_vulkan.cpp @@ -135,6 +135,9 @@ int Convolution1D_vulkan::create_pipeline(const Option& _opt) pipeline_convolution1d->create(shader_type_index, opt, specializations); } + weight_data.release(); + bias_data.release(); + return 0; } diff --git a/src/layer/vulkan/convolution_vulkan.cpp b/src/layer/vulkan/convolution_vulkan.cpp index 5e3bffca51b..4ba7c279e3c 100644 --- a/src/layer/vulkan/convolution_vulkan.cpp +++ b/src/layer/vulkan/convolution_vulkan.cpp @@ -1150,6 +1150,9 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) pipeline_convolution->create(shader_type_index, opt, specializations); } + weight_data.release(); + bias_data.release(); + return 0; } diff --git a/src/layer/vulkan/convolutiondepthwise_vulkan.cpp b/src/layer/vulkan/convolutiondepthwise_vulkan.cpp index 23405a6269d..9b54d136ccf 100644 --- a/src/layer/vulkan/convolutiondepthwise_vulkan.cpp +++ b/src/layer/vulkan/convolutiondepthwise_vulkan.cpp @@ -412,6 +412,9 @@ int ConvolutionDepthWise_vulkan::create_pipeline(const Option& _opt) pipeline_convolutiondepthwise_group_pack8to1->create(LayerShaderType::convolutiondepthwise_group_pack8to1, opt, specializations); } + weight_data.release(); + bias_data.release(); + return 0; } diff --git a/src/layer/vulkan/deconvolution_vulkan.cpp b/src/layer/vulkan/deconvolution_vulkan.cpp index 33d56e34ada..83f71f84172 100644 --- a/src/layer/vulkan/deconvolution_vulkan.cpp +++ b/src/layer/vulkan/deconvolution_vulkan.cpp @@ -464,6 +464,9 @@ int Deconvolution_vulkan::create_pipeline(const Option& _opt) pipeline_deconvolution->set_optimal_local_size_xyz(local_size_xyz); pipeline_deconvolution->create(shader_type_index, opt, specializations); + weight_data.release(); + bias_data.release(); + return 0; } diff --git a/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp 
b/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp index 08e6b1a23c0..af23d229547 100644 --- a/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp +++ b/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp @@ -436,6 +436,9 @@ int DeconvolutionDepthWise_vulkan::create_pipeline(const Option& _opt) pipeline_deconvolutiondepthwise_group_pack8to1->create(LayerShaderType::deconvolutiondepthwise_group_pack8to1, opt, specializations); } + weight_data.release(); + bias_data.release(); + return 0; } diff --git a/src/layer/vulkan/gemm_vulkan.cpp b/src/layer/vulkan/gemm_vulkan.cpp index ad768c63dd2..f30fa552f11 100644 --- a/src/layer/vulkan/gemm_vulkan.cpp +++ b/src/layer/vulkan/gemm_vulkan.cpp @@ -100,6 +100,10 @@ int Gemm_vulkan::create_pipeline(const Option& opt) pipeline_gemm->create(LayerShaderType::gemm, opt, specializations); } + A_data.release(); + B_data.release(); + C_data.release(); + return 0; } diff --git a/src/layer/vulkan/innerproduct_vulkan.cpp b/src/layer/vulkan/innerproduct_vulkan.cpp index df87b3de258..de23feef70e 100644 --- a/src/layer/vulkan/innerproduct_vulkan.cpp +++ b/src/layer/vulkan/innerproduct_vulkan.cpp @@ -364,6 +364,9 @@ int InnerProduct_vulkan::create_pipeline(const Option& _opt) return 0; } + weight_data.release(); + bias_data.release(); + return 0; } diff --git a/src/layer/vulkan/multiheadattention_vulkan.cpp b/src/layer/vulkan/multiheadattention_vulkan.cpp index 142ccf4f8b6..411b81b05e9 100644 --- a/src/layer/vulkan/multiheadattention_vulkan.cpp +++ b/src/layer/vulkan/multiheadattention_vulkan.cpp @@ -72,6 +72,9 @@ int MultiHeadAttention_vulkan::create_pipeline(const Option& opt) weights[1] = q_bias_data; q_gemm->load_model(ModelBinFromMatArray(weights)); q_gemm->create_pipeline(opt); + + q_weight_data.release(); + q_bias_data.release(); } { @@ -96,6 +99,9 @@ int MultiHeadAttention_vulkan::create_pipeline(const Option& opt) weights[1] = k_bias_data; k_gemm->load_model(ModelBinFromMatArray(weights)); k_gemm->create_pipeline(opt); + + k_weight_data.release(); + k_bias_data.release(); } { @@ -120,6 +126,9 @@ int MultiHeadAttention_vulkan::create_pipeline(const Option& opt) weights[1] = v_bias_data; v_gemm->load_model(ModelBinFromMatArray(weights)); v_gemm->create_pipeline(opt); + + v_weight_data.release(); + v_bias_data.release(); } { @@ -212,6 +221,9 @@ int MultiHeadAttention_vulkan::create_pipeline(const Option& opt) weights[1] = out_bias_data; o_gemm->load_model(ModelBinFromMatArray(weights)); o_gemm->create_pipeline(opt); + + out_weight_data.release(); + out_bias_data.release(); } return 0; diff --git a/src/layer/x86/convolution1d_x86.cpp b/src/layer/x86/convolution1d_x86.cpp index 26c72678b67..905db18b728 100644 --- a/src/layer/x86/convolution1d_x86.cpp +++ b/src/layer/x86/convolution1d_x86.cpp @@ -43,6 +43,8 @@ int Convolution1D_x86::create_pipeline(const Option& /*opt*/) convolution1d_transform_kernel_packed(weight_data, weight_data_tm, num_input, num_output, kernel_w); + weight_data.release(); + return 0; } diff --git a/src/layer/x86/convolution_x86.cpp b/src/layer/x86/convolution_x86.cpp index 5c97b02eebd..b7c78de5e4d 100644 --- a/src/layer/x86/convolution_x86.cpp +++ b/src/layer/x86/convolution_x86.cpp @@ -334,10 +334,7 @@ int Convolution_x86::create_pipeline(const Option& opt) convolution_dilation1->create_pipeline(opt); - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -454,10 +451,7 @@ int Convolution_x86::create_pipeline(const Option& opt) } } - if (opt.lightmode) - { - weight_data.release(); - } + 
weight_data.release(); return 0; } @@ -548,10 +542,7 @@ int Convolution_x86::create_pipeline(const Option& opt) } } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -1259,10 +1250,7 @@ int Convolution_x86::create_pipeline_int8_x86(const Option& opt) scale_in_data[p] = scale_in; } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } diff --git a/src/layer/x86/convolutiondepthwise_x86.cpp b/src/layer/x86/convolutiondepthwise_x86.cpp index 48be4ab0e06..6a9fb7fb4c2 100644 --- a/src/layer/x86/convolutiondepthwise_x86.cpp +++ b/src/layer/x86/convolutiondepthwise_x86.cpp @@ -132,10 +132,7 @@ int ConvolutionDepthWise_x86::create_pipeline(const Option& opt) } } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -143,10 +140,7 @@ int ConvolutionDepthWise_x86::create_pipeline(const Option& opt) // group convolution create_group_ops(opt); - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -849,16 +843,15 @@ int ConvolutionDepthWise_x86::create_pipeline_int8_x86(const Option& opt) weight_data_tm = weight_data; } + weight_data.release(); + return 0; } // group convolution create_group_ops(opt); - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } diff --git a/src/layer/x86/deconvolution_x86.cpp b/src/layer/x86/deconvolution_x86.cpp index 09d13616d58..6a94104a43d 100644 --- a/src/layer/x86/deconvolution_x86.cpp +++ b/src/layer/x86/deconvolution_x86.cpp @@ -193,10 +193,7 @@ int Deconvolution_x86::create_pipeline(const Option& opt) } } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } diff --git a/src/layer/x86/deconvolutiondepthwise_x86.cpp b/src/layer/x86/deconvolutiondepthwise_x86.cpp index 7a790701857..4a1e89d26a8 100644 --- a/src/layer/x86/deconvolutiondepthwise_x86.cpp +++ b/src/layer/x86/deconvolutiondepthwise_x86.cpp @@ -109,16 +109,15 @@ int DeconvolutionDepthWise_x86::create_pipeline(const Option& opt) weight_data_tm = weight_data_transposed; } + weight_data.release(); + return 0; } // group convolution create_group_ops(opt); - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } diff --git a/src/layer/x86/deformableconv2d_x86.cpp b/src/layer/x86/deformableconv2d_x86.cpp index b3bf9887898..8fc7bdf2855 100644 --- a/src/layer/x86/deformableconv2d_x86.cpp +++ b/src/layer/x86/deformableconv2d_x86.cpp @@ -203,10 +203,7 @@ int DeformableConv2D_x86::create_pipeline(const Option& opt) deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } diff --git a/src/layer/x86/gemm_x86.cpp b/src/layer/x86/gemm_x86.cpp index 19cd7ebc09a..4ab37836a43 100644 --- a/src/layer/x86/gemm_x86.cpp +++ b/src/layer/x86/gemm_x86.cpp @@ -7235,10 +7235,7 @@ int Gemm_x86::create_pipeline(const Option& opt) } } - if (opt.lightmode) - { - A_data.release(); - } + A_data.release(); } if (constantB) @@ -7282,10 +7279,7 @@ int Gemm_x86::create_pipeline(const Option& opt) } } - if (opt.lightmode) - { - B_data.release(); - } + B_data.release(); } if (constantC && constant_broadcast_type_C != -1) @@ -7321,10 +7315,7 @@ int Gemm_x86::create_pipeline(const Option& opt) CT_data = C2; } - if (opt.lightmode) - { - C_data.release(); - } + C_data.release(); } if (constantA || constantB || 
constantC) diff --git a/src/layer/x86/innerproduct_x86.cpp b/src/layer/x86/innerproduct_x86.cpp index 0ca253ebd1d..dee07d1de64 100644 --- a/src/layer/x86/innerproduct_x86.cpp +++ b/src/layer/x86/innerproduct_x86.cpp @@ -80,10 +80,7 @@ int InnerProduct_x86::create_pipeline(const Option& opt) innerproduct_transform_kernel_sse(weight_data, weight_data_tm, num_input, num_output, opt); - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -178,10 +175,7 @@ int InnerProduct_x86::create_pipeline_fp16s(const Option& opt) innerproduct_transform_kernel_fp16s_sse(weight_data, weight_data_tm, num_input, num_output, opt); - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } @@ -287,10 +281,7 @@ int InnerProduct_x86::create_pipeline_int8_x86(const Option& opt) scale_in_data[p] = scale_in; } - if (opt.lightmode) - { - weight_data.release(); - } + weight_data.release(); return 0; } diff --git a/src/layer/x86/lstm_x86.cpp b/src/layer/x86/lstm_x86.cpp index 6ba218e53d3..5d693648f44 100644 --- a/src/layer/x86/lstm_x86.cpp +++ b/src/layer/x86/lstm_x86.cpp @@ -182,12 +182,9 @@ int LSTM_x86::create_pipeline(const Option& opt) } } - if (opt.lightmode) - { - weight_xc_data.release(); - bias_c_data.release(); - weight_hc_data.release(); - } + weight_xc_data.release(); + bias_c_data.release(); + weight_hc_data.release(); return 0; } diff --git a/src/layer/x86/multiheadattention_x86.cpp b/src/layer/x86/multiheadattention_x86.cpp index a7ff58288c2..2bddad5582d 100644 --- a/src/layer/x86/multiheadattention_x86.cpp +++ b/src/layer/x86/multiheadattention_x86.cpp @@ -65,11 +65,8 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt) q_gemm->load_model(ModelBinFromMatArray(weights)); q_gemm->create_pipeline(opt); - if (opt.lightmode) - { - q_weight_data.release(); - q_bias_data.release(); - } + q_weight_data.release(); + q_bias_data.release(); } { @@ -94,11 +91,8 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt) k_gemm->load_model(ModelBinFromMatArray(weights)); k_gemm->create_pipeline(opt); - if (opt.lightmode) - { - k_weight_data.release(); - k_bias_data.release(); - } + k_weight_data.release(); + k_bias_data.release(); } { @@ -123,11 +117,8 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt) v_gemm->load_model(ModelBinFromMatArray(weights)); v_gemm->create_pipeline(opt); - if (opt.lightmode) - { - v_weight_data.release(); - v_bias_data.release(); - } + v_weight_data.release(); + v_bias_data.release(); } { @@ -202,11 +193,8 @@ int MultiHeadAttention_x86::create_pipeline(const Option& opt) o_gemm->load_model(ModelBinFromMatArray(weights)); o_gemm->create_pipeline(opt); - if (opt.lightmode) - { - out_weight_data.release(); - out_bias_data.release(); - } + out_weight_data.release(); + out_bias_data.release(); } return 0; From 69be7f0d2bb400821b22ac8887ec8838fe372a93 Mon Sep 17 00:00:00 2001 From: nihuini Date: Thu, 4 Jan 2024 17:49:15 +0800 Subject: [PATCH 14/19] fix --- src/layer/loongarch/convolution1d_loongarch.cpp | 2 -- src/layer/mips/convolution1d_mips.cpp | 2 -- src/layer/riscv/convolution1d_riscv.cpp | 2 -- 3 files changed, 6 deletions(-) diff --git a/src/layer/loongarch/convolution1d_loongarch.cpp b/src/layer/loongarch/convolution1d_loongarch.cpp index 1f804861e7a..0917a79f62e 100644 --- a/src/layer/loongarch/convolution1d_loongarch.cpp +++ b/src/layer/loongarch/convolution1d_loongarch.cpp @@ -78,8 +78,6 @@ int Convolution1D_loongarch::create_pipeline(const Option& opt) } } - 
weight_data.release(); - return 0; } diff --git a/src/layer/mips/convolution1d_mips.cpp b/src/layer/mips/convolution1d_mips.cpp index 5db88c1935f..e9cf211e49b 100644 --- a/src/layer/mips/convolution1d_mips.cpp +++ b/src/layer/mips/convolution1d_mips.cpp @@ -78,8 +78,6 @@ int Convolution1D_mips::create_pipeline(const Option& opt) } } - weight_data.release(); - return 0; } diff --git a/src/layer/riscv/convolution1d_riscv.cpp b/src/layer/riscv/convolution1d_riscv.cpp index fbf4190ca94..6c581a0edeb 100644 --- a/src/layer/riscv/convolution1d_riscv.cpp +++ b/src/layer/riscv/convolution1d_riscv.cpp @@ -95,8 +95,6 @@ int Convolution1D_riscv::create_pipeline(const Option& opt) } } - weight_data.release(); - return 0; } From 35e1178595fe66f305d63f543734a2dbdb25f2b6 Mon Sep 17 00:00:00 2001 From: nihuini Date: Thu, 4 Jan 2024 19:43:05 +0800 Subject: [PATCH 15/19] fix int8 --- src/net.cpp | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/src/net.cpp b/src/net.cpp index 9738e55bc02..dcd83a1548e 100644 --- a/src/net.cpp +++ b/src/net.cpp @@ -1504,6 +1504,13 @@ int Net::load_param(const DataReader& dr) // pull out layer specific feature disabled set layer->featmask = pd.get(31, 0); + int lr = layer->load_param(pd); + if (lr != 0) + { + NCNN_LOGE("layer load_param %d %s failed", i, layer_name); + continue; + } + if (layer->support_int8_storage) { // no int8 gpu support yet @@ -1516,20 +1523,8 @@ int Net::load_param(const DataReader& dr) { if (!layer->support_image_storage) opt1.use_image_storage = false; } - else - { - layer->vkdev = 0; - layer->support_vulkan = false; - } #endif // NCNN_VULKAN - int lr = layer->load_param(pd); - if (lr != 0) - { - NCNN_LOGE("layer load_param %d %s failed", i, layer_name); - continue; - } - if (layer_support_vulkan && (!layer->support_vulkan || !opt1.use_vulkan_compute)) { // vulkan layer cannot handle these param, recreate cpu layer @@ -1801,11 +1796,6 @@ int Net::load_param_bin(const DataReader& dr) { if (!layer->support_image_storage) opt1.use_image_storage = false; } - else - { - layer->vkdev = 0; - layer->support_vulkan = false; - } #endif // NCNN_VULKAN if (layer_support_vulkan && (!layer->support_vulkan || !opt1.use_vulkan_compute)) From 4c0412ef7524d2c5f902a461f0197ee63f6d2c89 Mon Sep 17 00:00:00 2001 From: nihuini Date: Thu, 4 Jan 2024 19:46:56 +0800 Subject: [PATCH 16/19] drop old scripts --- cmake/ncnn_generate_shader_spv_header.cmake | 581 -------------------- src/CMakeLists.txt | 2 - 2 files changed, 583 deletions(-) delete mode 100644 cmake/ncnn_generate_shader_spv_header.cmake diff --git a/cmake/ncnn_generate_shader_spv_header.cmake b/cmake/ncnn_generate_shader_spv_header.cmake deleted file mode 100644 index 93649daed92..00000000000 --- a/cmake/ncnn_generate_shader_spv_header.cmake +++ /dev/null @@ -1,581 +0,0 @@ - -function(ncnn_generate_shader_spv_header SHADER_SPV_HEADER SHADER_SPV_HEX_HEADERS SHADER_SRC) - - # fp32 - get_filename_component(SHADER_SRC_NAME_WE ${SHADER_SRC} NAME_WE) - - set(SHADER_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_SRC_NAME_WE}.spv.hex.h) - add_custom_command( - OUTPUT ${SHADER_SPV_HEX_FILE} - COMMAND ${GLSLANGVALIDATOR_EXECUTABLE} - ARGS -Dsfp=float -Dsfpvec2=vec2 -Dsfpvec4=vec4 -Dsfpvec8=mat2x4 -Dsfpmat4=mat4 - -Dafp=float -Dafpvec2=vec2 -Dafpvec4=vec4 -Dafpvec8=mat2x4 -Dafpmat4=mat4 - "-D buffer_ld1(buf,i)=buf[i]" - "-D buffer_st1(buf,i,v)={buf[i]=v;}" - "-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}" - "-D 
buffer_cp1to4(buf,i,sbuf,si4)={buf[i]=vec4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a]);}" - "-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i]=mat2x4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a],sbuf[sii4.r],sbuf[sii4.g],sbuf[sii4.b],sbuf[sii4.a]);}" - "-D buffer_ld2(buf,i)=buf[i]" - "-D buffer_st2(buf,i,v)={buf[i]=v;}" - "-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}" - "-D buffer_ld4(buf,i)=buf[i]" - "-D buffer_st4(buf,i,v)={buf[i]=v;}" - "-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}" - "-D buffer_cp4to1(buf,i4,sbuf,si)={vec4 _v=sbuf[si]; buf[i4.r]=_v.r;buf[i4.g]=_v.g;buf[i4.b]=_v.b;buf[i4.a]=_v.a;}" - "-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i]=mat2x4(sbuf[si2.r],sbuf[si2.g]);}" - "-D buffer_ld8(buf,i)=buf[i]" - "-D buffer_st8(buf,i,v)={buf[i]=v;}" - "-D buffer_cp8(buf,i,sbuf,si)={buf[i]=sbuf[si];}" - "-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={mat2x4 _v=sbuf[si]; buf[i4.r]=_v[0].r;buf[i4.g]=_v[0].g;buf[i4.b]=_v[0].b;buf[i4.a]=_v[0].a; buf[ii4.r]=_v[1].r;buf[ii4.g]=_v[1].g;buf[ii4.b]=_v[1].b;buf[ii4.a]=_v[1].a;}" - "-D buffer_cp8to4(buf,i2,sbuf,si)={mat2x4 _v=sbuf[si]; buf[i2.r]=_v[0];buf[i2.g]=_v[1];}" - "-D sfp2afpmat4(v)=v" - "-D afp2sfpmat4(v)=v" - "-D psc(x)=(x==0?p.x:x)" - -V -s -x -o ${SHADER_SPV_HEX_FILE} ${SHADER_SRC} - DEPENDS ${SHADER_SRC} - COMMENT "Building SPIR-V module ${SHADER_SRC_NAME_WE}.spv" - VERBATIM - ) - set_source_files_properties(${SHADER_SPV_HEX_FILE} PROPERTIES GENERATED TRUE) - - # fp16 packed - set(SHADER_fp16p_SRC_NAME_WE "${SHADER_SRC_NAME_WE}_fp16p") - - set(SHADER_fp16p_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_fp16p_SRC_NAME_WE}.spv.hex.h) - add_custom_command( - OUTPUT ${SHADER_fp16p_SPV_HEX_FILE} - COMMAND ${GLSLANGVALIDATOR_EXECUTABLE} - ARGS -Dsfp=float -Dsfpvec2=uint -Dsfpvec4=uvec2 -Dsfpvec8=uvec4 - -Dafp=float -Dafpvec2=vec2 -Dafpvec4=vec4 -Dafpvec8=mat2x4 -Dafpmat4=mat4 - "-D buffer_ld1(buf,i)=buf[i]" - "-D buffer_st1(buf,i,v)={buf[i]=v;}" - "-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}" - "-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i]=uvec2(packHalf2x16(vec2(sbuf[si4.r],sbuf[si4.g])),packHalf2x16(vec2(sbuf[si4.b],sbuf[si4.a])));}" - "-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i]=uvec4(packHalf2x16(vec2(sbuf[si4.r],sbuf[si4.g])),packHalf2x16(vec2(sbuf[si4.b],sbuf[si4.a])),packHalf2x16(vec2(sbuf[sii4.r],sbuf[sii4.g])),packHalf2x16(vec2(sbuf[sii4.b],sbuf[sii4.a])));}" - "-D buffer_ld2(buf,i)=unpackHalf2x16(buf[i])" - "-D buffer_st2(buf,i,v)={buf[i]=packHalf2x16(v)}" - "-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}" - "-D buffer_ld4(buf,i)=vec4(unpackHalf2x16(buf[i].x),unpackHalf2x16(buf[i].y))" - "-D buffer_st4(buf,i,v)={buf[i]=uvec2(packHalf2x16(v.rg),packHalf2x16(v.ba));}" - "-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}" - "-D buffer_cp4to1(buf,i4,sbuf,si)={uvec2 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.x);vec2 _v1=unpackHalf2x16(_v.y); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g;}" - "-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i]=uvec4(sbuf[si2.r],sbuf[si2.g]);}" - "-D buffer_ld8(buf,i)=mat2x4(vec4(unpackHalf2x16(buf[i].r),unpackHalf2x16(buf[i].g)),vec4(unpackHalf2x16(buf[i].b),unpackHalf2x16(buf[i].a)))" - "-D buffer_st8(buf,i,v)={buf[i]=uvec4(uvec2(packHalf2x16(v[0].rg),packHalf2x16(v[0].ba)),uvec2(packHalf2x16(v[1].rg),packHalf2x16(v[1].ba)));}" - "-D buffer_cp8(buf,i,sbuf,si)={buf[i]=sbuf[si];}" - "-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={uvec4 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.r);vec2 _v1=unpackHalf2x16(_v.g);vec2 _v2=unpackHalf2x16(_v.b);vec2 _v3=unpackHalf2x16(_v.a); 
buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g; buf[ii4.r]=_v2.r;buf[ii4.g]=_v2.g;buf[ii4.b]=_v3.r;buf[ii4.a]=_v3.g;}" - "-D buffer_cp8to4(buf,i2,sbuf,si)={uvec4 _v=sbuf[si]; buf[i2.r]=_v.rg;buf[i2.g]=_v.ba;}" - "-D psc(x)=(x==0?p.x:x)" - -DNCNN_fp16_packed=1 - -V -s -x -o ${SHADER_fp16p_SPV_HEX_FILE} ${SHADER_SRC} - DEPENDS ${SHADER_SRC} - COMMENT "Building SPIR-V module ${SHADER_fp16p_SRC_NAME_WE}.spv" - VERBATIM - ) - set_source_files_properties(${SHADER_fp16p_SPV_HEX_FILE} PROPERTIES GENERATED TRUE) - - # fp16 packed + fp16 arithmetic - set(SHADER_fp16pa_SRC_NAME_WE "${SHADER_SRC_NAME_WE}_fp16pa") - - set(SHADER_fp16pa_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_fp16pa_SRC_NAME_WE}.spv.hex.h) - add_custom_command( - OUTPUT ${SHADER_fp16pa_SPV_HEX_FILE} - COMMAND ${GLSLANGVALIDATOR_EXECUTABLE} - ARGS -Dsfp=float -Dsfpvec2=uint -Dsfpvec4=uvec2 -Dsfpvec8=uvec4 - -Dafp=float16_t -Dafpvec2=f16vec2 -Dafpvec4=f16vec4 -Dafpvec8=f16mat2x4 -Dafpmat4=f16mat4 - "-D buffer_ld1(buf,i)=float16_t(buf[i])" - "-D buffer_st1(buf,i,v)={buf[i]=float(v);}" - "-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}" - "-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i]=uvec2(packHalf2x16(vec2(f16vec2(sbuf[si4.r],sbuf[si4.g]))),packHalf2x16(vec2(f16vec2(sbuf[si4.b],sbuf[si4.a]))));}" - "-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i]=uvec4(packHalf2x16(vec2(f16vec2(sbuf[si4.r],sbuf[si4.g]))),packHalf2x16(vec2(f16vec2(sbuf[si4.b],sbuf[si4.a]))),packHalf2x16(vec2(f16vec2(sbuf[sii4.r],sbuf[sii4.g]))),packHalf2x16(vec2(f16vec2(sbuf[sii4.b],sbuf[sii4.a]))));}" - "-D buffer_ld2(buf,i)=f16vec2(unpackHalf2x16(buf[i]))" - "-D buffer_st2(buf,i,v)={buf[i]=packHalf2x16(vec2(v))}" - "-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}" - "-D buffer_ld4(buf,i)=f16vec4(vec4(unpackHalf2x16(buf[i].x),unpackHalf2x16(buf[i].y)))" - "-D buffer_st4(buf,i,v)={buf[i]=uvec2(packHalf2x16(vec2(v.rg)),packHalf2x16(vec2(v.ba)));}" - "-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}" - "-D buffer_cp4to1(buf,i4,sbuf,si)={uvec2 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.x);vec2 _v1=unpackHalf2x16(_v.y); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g;}" - "-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i]=uvec4(sbuf[si2.r],sbuf[si2.g]);}" - "-D buffer_ld8(buf,i)=f16mat2x4(f16vec4(vec4(unpackHalf2x16(buf[i].r),unpackHalf2x16(buf[i].g))),f16vec4(vec4(unpackHalf2x16(buf[i].b),unpackHalf2x16(buf[i].a))))" - "-D buffer_st8(buf,i,v)={buf[i]=uvec4(uvec2(packHalf2x16(vec2(v[0].rg)),packHalf2x16(vec2(v[0].ba))),uvec2(packHalf2x16(vec2(v[1].rg)),packHalf2x16(vec2(v[1].ba))));}" - "-D buffer_cp8(buf,i,sbuf,si)={buf[i]=sbuf[si];}" - "-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={uvec4 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.r);vec2 _v1=unpackHalf2x16(_v.g);vec2 _v2=unpackHalf2x16(_v.b);vec2 _v3=unpackHalf2x16(_v.a); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g; buf[ii4.r]=_v2.r;buf[ii4.g]=_v2.g;buf[ii4.b]=_v3.r;buf[ii4.a]=_v3.g;}" - "-D buffer_cp8to4(buf,i2,sbuf,si)={uvec4 _v=sbuf[si]; buf[i2.r]=_v.rg;buf[i2.g]=_v.ba;}" - "-D psc(x)=(x==0?p.x:x)" - -DNCNN_fp16_packed=1 -DNCNN_fp16_arithmetic=1 - -V -s -x -o ${SHADER_fp16pa_SPV_HEX_FILE} ${SHADER_SRC} - DEPENDS ${SHADER_SRC} - COMMENT "Building SPIR-V module ${SHADER_fp16pa_SRC_NAME_WE}.spv" - VERBATIM - ) - set_source_files_properties(${SHADER_fp16pa_SPV_HEX_FILE} PROPERTIES GENERATED TRUE) - - # fp16 storage - set(SHADER_fp16s_SRC_NAME_WE "${SHADER_SRC_NAME_WE}_fp16s") - - set(SHADER_fp16s_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_fp16s_SRC_NAME_WE}.spv.hex.h) - 
add_custom_command( - OUTPUT ${SHADER_fp16s_SPV_HEX_FILE} - COMMAND ${GLSLANGVALIDATOR_EXECUTABLE} - ARGS -Dsfp=float16_t -Dsfpvec2=f16vec2 -Dsfpvec4=f16vec4 - -Dafp=float -Dafpvec2=vec2 -Dafpvec4=vec4 -Dafpvec8=mat2x4 -Dafpmat4=mat4 - "-D buffer_ld1(buf,i)=float(buf[i])" - "-D buffer_st1(buf,i,v)={buf[i]=float16_t(v);}" - "-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}" - "-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i].r=sbuf[si4.r];buf[i].g=sbuf[si4.g];buf[i].b=sbuf[si4.b];buf[i].a=sbuf[si4.a];}" - "-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i].abcd.r=sbuf[si4.r];buf[i].abcd.g=sbuf[si4.g];buf[i].abcd.b=sbuf[si4.b];buf[i].abcd.a=sbuf[si4.a];buf[i].efgh.r=sbuf[sii4.r];buf[i].efgh.g=sbuf[sii4.g];buf[i].efgh.b=sbuf[sii4.b];buf[i].efgh.a=sbuf[sii4.a];}" - "-D buffer_ld2(buf,i)=vec2(buf[i])" - "-D buffer_st2(buf,i,v)={buf[i]=f16vec2(v);}" - "-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}" - "-D buffer_ld4(buf,i)=vec4(buf[i])" - "-D buffer_st4(buf,i,v)={buf[i]=f16vec4(v);}" - "-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}" - "-D buffer_cp4to1(buf,i4,sbuf,si)={buf[i4.r]=sbuf[si].r;buf[i4.g]=sbuf[si].g;buf[i4.b]=sbuf[si].b;buf[i4.a]=sbuf[si].a;}" - "-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i].abcd=sbuf[si2.r];buf[i].efgh=sbuf[si2.g];}" - "-D buffer_ld8(buf,i)=mat2x4(vec4(buf[i].abcd),vec4(buf[i].efgh))" - "-D buffer_st8(buf,i,v)={buf[i].abcd=f16vec4(v[0]);buf[i].efgh=f16vec4(v[1]);}" - "-D buffer_cp8(buf,i,sbuf,si)={buf[i].abcd=sbuf[si].abcd;buf[i].efgh=sbuf[si].efgh;}" - "-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={buf[i4.r]=sbuf[si].abcd.r;buf[i4.g]=sbuf[si].abcd.g;buf[i4.b]=sbuf[si].abcd.b;buf[i4.a]=sbuf[si].abcd.a; buf[ii4.r]=sbuf[si].efgh.r;buf[ii4.g]=sbuf[si].efgh.g;buf[ii4.b]=sbuf[si].efgh.b;buf[ii4.a]=sbuf[si].efgh.a;}" - "-D buffer_cp8to4(buf,i2,sbuf,si)={buf[i2.r]=sbuf[si].abcd;buf[i2.g]=sbuf[si].efgh;}" - "-D psc(x)=(x==0?p.x:x)" - -DNCNN_fp16_storage=1 - -V -s -x -o ${SHADER_fp16s_SPV_HEX_FILE} ${SHADER_SRC} - DEPENDS ${SHADER_SRC} - COMMENT "Building SPIR-V module ${SHADER_fp16s_SRC_NAME_WE}.spv" - VERBATIM - ) - set_source_files_properties(${SHADER_fp16s_SPV_HEX_FILE} PROPERTIES GENERATED TRUE) - - # fp16 storage + fp16 arithmetic - set(SHADER_fp16sa_SRC_NAME_WE "${SHADER_SRC_NAME_WE}_fp16sa") - - set(SHADER_fp16sa_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_fp16sa_SRC_NAME_WE}.spv.hex.h) - add_custom_command( - OUTPUT ${SHADER_fp16sa_SPV_HEX_FILE} - COMMAND ${GLSLANGVALIDATOR_EXECUTABLE} - ARGS -Dsfp=float16_t -Dsfpvec2=f16vec2 -Dsfpvec4=f16vec4 -Dsfpvec8=f16mat2x4 -Dsfpmat4=f16mat4 - -Dafp=float16_t -Dafpvec2=f16vec2 -Dafpvec4=f16vec4 -Dafpvec8=f16mat2x4 -Dafpmat4=f16mat4 - "-D buffer_ld1(buf,i)=buf[i]" - "-D buffer_st1(buf,i,v)={buf[i]=v;}" - "-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}" - "-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i]=f16vec4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a]);}" - "-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i]=f16mat2x4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a],sbuf[sii4.r],sbuf[sii4.g],sbuf[sii4.b],sbuf[sii4.a]);}" - "-D buffer_ld2(buf,i)=buf[i]" - "-D buffer_st2(buf,i,v)={buf[i]=v;}" - "-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}" - "-D buffer_ld4(buf,i)=buf[i]" - "-D buffer_st4(buf,i,v)={buf[i]=v;}" - "-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}" - "-D buffer_cp4to1(buf,i4,sbuf,si)={buf[i4.r]=sbuf[si].r;buf[i4.g]=sbuf[si].g;buf[i4.b]=sbuf[si].b;buf[i4.a]=sbuf[si].a;}" - "-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i]=f16mat2x4(sbuf[si2.r],sbuf[si2.g]);}" - "-D buffer_ld8(buf,i)=buf[i]" - "-D buffer_st8(buf,i,v)={buf[i]=v;}" - "-D 
buffer_cp8(buf,i,sbuf,si)={buf[i]=sbuf[si];}" - "-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={f16mat2x4 _v=sbuf[si]; buf[i4.r]=_v[0].r;buf[i4.g]=_v[0].g;buf[i4.b]=_v[0].b;buf[i4.a]=_v[0].a; buf[ii4.r]=_v[1].r;buf[ii4.g]=_v[1].g;buf[ii4.b]=_v[1].b;buf[ii4.a]=_v[1].a;}" - "-D buffer_cp8to4(buf,i2,sbuf,si)={f16mat2x4 _v=sbuf[si]; buf[i2.r]=_v[0];buf[i2.g]=_v[1];}" - "-D sfp2afpmat4(v)=v" - "-D afp2sfpmat4(v)=v" - "-D psc(x)=(x==0?p.x:x)" - -DNCNN_fp16_storage=1 -DNCNN_fp16_arithmetic=1 - -V -s -x -o ${SHADER_fp16sa_SPV_HEX_FILE} ${SHADER_SRC} - DEPENDS ${SHADER_SRC} - COMMENT "Building SPIR-V module ${SHADER_fp16sa_SRC_NAME_WE}.spv" - VERBATIM - ) - set_source_files_properties(${SHADER_fp16sa_SPV_HEX_FILE} PROPERTIES GENERATED TRUE) - - # image + fp32 - set(SHADER_image_SRC_NAME_WE "${SHADER_SRC_NAME_WE}_image") - - set(SHADER_image_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_image_SRC_NAME_WE}.spv.hex.h) - add_custom_command( - OUTPUT ${SHADER_image_SPV_HEX_FILE} - COMMAND ${GLSLANGVALIDATOR_EXECUTABLE} - ARGS -Dsfp=float -Dsfpvec2=vec2 -Dsfpvec4=vec4 -Dsfpvec8=mat2x4 -Dsfpmat4=mat4 - -Dafp=float -Dafpvec2=vec2 -Dafpvec4=vec4 -Dafpvec8=mat2x4 -Dafpmat4=mat4 - - -Dimfmtc1=r32f -Dimfmtc4=rgba32f - -Dunfp=highp - - "-D image1d_ld1(tex,p)=texelFetch(tex,p,0).r" - "-D image2d_ld1(tex,p)=texelFetch(tex,p,0).r" - "-D image3d_ld1(tex,p)=texelFetch(tex,p,0).r" - "-D image1d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}" - "-D image2d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}" - "-D image3d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}" - "-D image1d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" - "-D image2d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" - "-D image3d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" - - "-D image1d_ld4(tex,p)=texelFetch(tex,p,0)" - "-D image2d_ld4(tex,p)=texelFetch(tex,p,0)" - "-D image3d_ld4(tex,p)=texelFetch(tex,p,0)" - "-D image1d_st4(img,p,v)={imageStore(img,p,v);}" - "-D image2d_st4(img,p,v)={imageStore(img,p,v);}" - "-D image3d_st4(img,p,v)={imageStore(img,p,v);}" - "-D image1d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" - "-D image2d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" - "-D image3d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" - - "-D image1d_ld8(tex,p)=mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))" - "-D image2d_ld8(tex,p)=mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))" - "-D image3d_ld8(tex,p)=mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))" - "-D image1d_st8(img,p,v)={imageStore(img,(p)*2,v[0]);imageStore(img,(p)*2+1,v[1]);}" - "-D image2d_st8(img,p,v)={imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}" - "-D image3d_st8(img,p,v)={imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}" - "-D image1d_cp8(img,p,tex,sp)={imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}" - "-D image2d_cp8(img,p,tex,sp)={imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}" - "-D image3d_cp8(img,p,tex,sp)={imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}" - - "-D buffer_ld1(buf,i)=buf[i]" - "-D buffer_st1(buf,i,v)={buf[i]=v;}" - "-D 
buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}" - "-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i]=vec4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a]);}" - "-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i]=mat2x4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a],sbuf[sii4.r],sbuf[sii4.g],sbuf[sii4.b],sbuf[sii4.a]);}" - "-D buffer_ld2(buf,i)=buf[i]" - "-D buffer_st2(buf,i,v)={buf[i]=v;}" - "-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}" - "-D buffer_ld4(buf,i)=buf[i]" - "-D buffer_st4(buf,i,v)={buf[i]=v;}" - "-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}" - "-D buffer_cp4to1(buf,i4,sbuf,si)={vec4 _v=sbuf[si]; buf[i4.r]=_v.r;buf[i4.g]=_v.g;buf[i4.b]=_v.b;buf[i4.a]=_v.a;}" - "-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i]=mat2x4(sbuf[si2.r],sbuf[si2.g]);}" - "-D buffer_ld8(buf,i)=buf[i]" - "-D buffer_st8(buf,i,v)={buf[i]=v;}" - "-D buffer_cp8(buf,i,sbuf,si)={buf[i]=sbuf[si];}" - "-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={mat2x4 _v=sbuf[si]; buf[i4.r]=_v[0].r;buf[i4.g]=_v[0].g;buf[i4.b]=_v[0].b;buf[i4.a]=_v[0].a; buf[ii4.r]=_v[1].r;buf[ii4.g]=_v[1].g;buf[ii4.b]=_v[1].b;buf[ii4.a]=_v[1].a;}" - "-D buffer_cp8to4(buf,i2,sbuf,si)={mat2x4 _v=sbuf[si]; buf[i2.r]=_v[0];buf[i2.g]=_v[1];}" - - "-D sfp2afpmat4(v)=v" - "-D afp2sfpmat4(v)=v" - "-D psc(x)=(x==0?p.x:x)" - -DNCNN_image_shader=1 - -V -s -x -o ${SHADER_image_SPV_HEX_FILE} ${SHADER_SRC} - DEPENDS ${SHADER_SRC} - COMMENT "Building SPIR-V module ${SHADER_image_SRC_NAME_WE}.spv" - VERBATIM - ) - set_source_files_properties(${SHADER_image_SPV_HEX_FILE} PROPERTIES GENERATED TRUE) - - # image + fp16p - set(SHADER_image_fp16p_SRC_NAME_WE "${SHADER_SRC_NAME_WE}_image_fp16p") - - set(SHADER_image_fp16p_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_image_fp16p_SRC_NAME_WE}.spv.hex.h) - add_custom_command( - OUTPUT ${SHADER_image_fp16p_SPV_HEX_FILE} - COMMAND ${GLSLANGVALIDATOR_EXECUTABLE} - ARGS -Dsfp=float -Dsfpvec2=uint -Dsfpvec4=uvec2 -Dsfpvec8=uvec4 - -Dafp=float -Dafpvec2=vec2 -Dafpvec4=vec4 -Dafpvec8=mat2x4 -Dafpmat4=mat4 - - -Dimfmtc1=r32f -Dimfmtc4=rgba16f - -Dunfp=mediump - - "-D image1d_ld1(tex,p)=texelFetch(tex,p,0).r" - "-D image2d_ld1(tex,p)=texelFetch(tex,p,0).r" - "-D image3d_ld1(tex,p)=texelFetch(tex,p,0).r" - "-D image1d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}" - "-D image2d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}" - "-D image3d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}" - "-D image1d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" - "-D image2d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" - "-D image3d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" - - "-D image1d_ld4(tex,p)=texelFetch(tex,p,0)" - "-D image2d_ld4(tex,p)=texelFetch(tex,p,0)" - "-D image3d_ld4(tex,p)=texelFetch(tex,p,0)" - "-D image1d_st4(img,p,v)={imageStore(img,p,v);}" - "-D image2d_st4(img,p,v)={imageStore(img,p,v);}" - "-D image3d_st4(img,p,v)={imageStore(img,p,v);}" - "-D image1d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" - "-D image2d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" - "-D image3d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" - - "-D image1d_ld8(tex,p)=mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))" - "-D image2d_ld8(tex,p)=mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))" - "-D image3d_ld8(tex,p)=mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))" - "-D image1d_st8(img,p,v)={imageStore(img,(p)*2,v[0]);imageStore(img,(p)*2+1,v[1]);}" - "-D 
image2d_st8(img,p,v)={imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}" - "-D image3d_st8(img,p,v)={imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}" - "-D image1d_cp8(img,p,tex,sp)={imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}" - "-D image2d_cp8(img,p,tex,sp)={imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}" - "-D image3d_cp8(img,p,tex,sp)={imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}" - - "-D buffer_ld1(buf,i)=buf[i]" - "-D buffer_st1(buf,i,v)={buf[i]=v;}" - "-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}" - "-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i]=uvec2(packHalf2x16(vec2(sbuf[si4.r],sbuf[si4.g])),packHalf2x16(vec2(sbuf[si4.b],sbuf[si4.a])));}" - "-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i]=uvec4(packHalf2x16(vec2(sbuf[si4.r],sbuf[si4.g])),packHalf2x16(vec2(sbuf[si4.b],sbuf[si4.a])),packHalf2x16(vec2(sbuf[sii4.r],sbuf[sii4.g])),packHalf2x16(vec2(sbuf[sii4.b],sbuf[sii4.a])));}" - "-D buffer_ld2(buf,i)=unpackHalf2x16(buf[i])" - "-D buffer_st2(buf,i,v)={buf[i]=packHalf2x16(v)}" - "-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}" - "-D buffer_ld4(buf,i)=vec4(unpackHalf2x16(buf[i].x),unpackHalf2x16(buf[i].y))" - "-D buffer_st4(buf,i,v)={buf[i]=uvec2(packHalf2x16(v.rg),packHalf2x16(v.ba));}" - "-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}" - "-D buffer_cp4to1(buf,i4,sbuf,si)={uvec2 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.x);vec2 _v1=unpackHalf2x16(_v.y); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g;}" - "-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i]=uvec4(sbuf[si2.r],sbuf[si2.g]);}" - "-D buffer_ld8(buf,i)=mat2x4(vec4(unpackHalf2x16(buf[i].r),unpackHalf2x16(buf[i].g)),vec4(unpackHalf2x16(buf[i].b),unpackHalf2x16(buf[i].a)))" - "-D buffer_st8(buf,i,v)={buf[i]=uvec4(uvec2(packHalf2x16(v[0].rg),packHalf2x16(v[0].ba)),uvec2(packHalf2x16(v[1].rg),packHalf2x16(v[1].ba)));}" - "-D buffer_cp8(buf,i,sbuf,si)={buf[i]=sbuf[si];}" - "-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={uvec4 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.r);vec2 _v1=unpackHalf2x16(_v.g);vec2 _v2=unpackHalf2x16(_v.b);vec2 _v3=unpackHalf2x16(_v.a); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g; buf[ii4.r]=_v2.r;buf[ii4.g]=_v2.g;buf[ii4.b]=_v3.r;buf[ii4.a]=_v3.g;}" - "-D buffer_cp8to4(buf,i2,sbuf,si)={uvec4 _v=sbuf[si]; buf[i2.r]=_v.rg;buf[i2.g]=_v.ba;}" - - "-D psc(x)=(x==0?p.x:x)" - -DNCNN_image_shader=1 -DNCNN_fp16_packed=1 - -V -s -x -o ${SHADER_image_fp16p_SPV_HEX_FILE} ${SHADER_SRC} - DEPENDS ${SHADER_SRC} - COMMENT "Building SPIR-V module ${SHADER_image_fp16p_SRC_NAME_WE}.spv" - VERBATIM - ) - set_source_files_properties(${SHADER_image_fp16p_SPV_HEX_FILE} PROPERTIES GENERATED TRUE) - - # image + fp16p + fp16a - set(SHADER_image_fp16pa_SRC_NAME_WE "${SHADER_SRC_NAME_WE}_image_fp16pa") - - set(SHADER_image_fp16pa_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_image_fp16pa_SRC_NAME_WE}.spv.hex.h) - add_custom_command( - OUTPUT ${SHADER_image_fp16pa_SPV_HEX_FILE} - COMMAND ${GLSLANGVALIDATOR_EXECUTABLE} - ARGS -Dsfp=float -Dsfpvec2=uint -Dsfpvec4=uvec2 -Dsfpvec8=uvec4 - -Dafp=float16_t -Dafpvec2=f16vec2 -Dafpvec4=f16vec4 -Dafpvec8=f16mat2x4 -Dafpmat4=f16mat4 - - -Dimfmtc1=r32f -Dimfmtc4=rgba16f - -Dunfp=mediump - - "-D image1d_ld1(tex,p)=float16_t(texelFetch(tex,p,0).r)" 
- "-D image2d_ld1(tex,p)=float16_t(texelFetch(tex,p,0).r)" - "-D image3d_ld1(tex,p)=float16_t(texelFetch(tex,p,0).r)" - "-D image1d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}" - "-D image2d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}" - "-D image3d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}" - "-D image1d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" - "-D image2d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" - "-D image3d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" - - "-D image1d_ld4(tex,p)=f16vec4(texelFetch(tex,p,0))" - "-D image2d_ld4(tex,p)=f16vec4(texelFetch(tex,p,0))" - "-D image3d_ld4(tex,p)=f16vec4(texelFetch(tex,p,0))" - "-D image1d_st4(img,p,v)={imageStore(img,p,v);}" - "-D image2d_st4(img,p,v)={imageStore(img,p,v);}" - "-D image3d_st4(img,p,v)={imageStore(img,p,v);}" - "-D image1d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" - "-D image2d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" - "-D image3d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" - - "-D image1d_ld8(tex,p)=f16mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))" - "-D image2d_ld8(tex,p)=f16mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))" - "-D image3d_ld8(tex,p)=f16mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))" - "-D image1d_st8(img,p,v)={imageStore(img,(p)*2,v[0]);imageStore(img,(p)*2+1,v[1]);}" - "-D image2d_st8(img,p,v)={imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}" - "-D image3d_st8(img,p,v)={imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}" - "-D image1d_cp8(img,p,tex,sp)={imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}" - "-D image2d_cp8(img,p,tex,sp)={imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}" - "-D image3d_cp8(img,p,tex,sp)={imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}" - - "-D buffer_ld1(buf,i)=float16_t(buf[i])" - "-D buffer_st1(buf,i,v)={buf[i]=float(v);}" - "-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}" - "-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i]=uvec2(packHalf2x16(vec2(f16vec2(sbuf[si4.r],sbuf[si4.g]))),packHalf2x16(vec2(f16vec2(sbuf[si4.b],sbuf[si4.a]))));}" - "-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i]=uvec4(packHalf2x16(vec2(f16vec2(sbuf[si4.r],sbuf[si4.g]))),packHalf2x16(vec2(f16vec2(sbuf[si4.b],sbuf[si4.a]))),packHalf2x16(vec2(f16vec2(sbuf[sii4.r],sbuf[sii4.g]))),packHalf2x16(vec2(f16vec2(sbuf[sii4.b],sbuf[sii4.a]))));}" - "-D buffer_ld2(buf,i)=f16vec2(unpackHalf2x16(buf[i]))" - "-D buffer_st2(buf,i,v)={buf[i]=packHalf2x16(vec2(v))}" - "-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}" - "-D buffer_ld4(buf,i)=f16vec4(vec4(unpackHalf2x16(buf[i].x),unpackHalf2x16(buf[i].y)))" - "-D buffer_st4(buf,i,v)={buf[i]=uvec2(packHalf2x16(vec2(v.rg)),packHalf2x16(vec2(v.ba)));}" - "-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}" - "-D buffer_cp4to1(buf,i4,sbuf,si)={uvec2 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.x);vec2 _v1=unpackHalf2x16(_v.y); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g;}" - "-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i]=uvec4(sbuf[si2.r],sbuf[si2.g]);}" - "-D 
buffer_ld8(buf,i)=f16mat2x4(f16vec4(vec4(unpackHalf2x16(buf[i].r),unpackHalf2x16(buf[i].g))),f16vec4(vec4(unpackHalf2x16(buf[i].b),unpackHalf2x16(buf[i].a))))" - "-D buffer_st8(buf,i,v)={buf[i]=uvec4(uvec2(packHalf2x16(vec2(v[0].rg)),packHalf2x16(vec2(v[0].ba))),uvec2(packHalf2x16(vec2(v[1].rg)),packHalf2x16(vec2(v[1].ba))));}" - "-D buffer_cp8(buf,i,sbuf,si)={buf[i]=sbuf[si];}" - "-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={uvec4 _v=sbuf[si]; vec2 _v0=unpackHalf2x16(_v.r);vec2 _v1=unpackHalf2x16(_v.g);vec2 _v2=unpackHalf2x16(_v.b);vec2 _v3=unpackHalf2x16(_v.a); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g; buf[ii4.r]=_v2.r;buf[ii4.g]=_v2.g;buf[ii4.b]=_v3.r;buf[ii4.a]=_v3.g;}" - "-D buffer_cp8to4(buf,i2,sbuf,si)={uvec4 _v=sbuf[si]; buf[i2.r]=_v.rg;buf[i2.g]=_v.ba;}" - - "-D psc(x)=(x==0?p.x:x)" - -DNCNN_image_shader=1 -DNCNN_fp16_packed=1 -DNCNN_fp16_arithmetic=1 - -V -s -x -o ${SHADER_image_fp16pa_SPV_HEX_FILE} ${SHADER_SRC} - DEPENDS ${SHADER_SRC} - COMMENT "Building SPIR-V module ${SHADER_image_fp16pa_SRC_NAME_WE}.spv" - VERBATIM - ) - set_source_files_properties(${SHADER_image_fp16pa_SPV_HEX_FILE} PROPERTIES GENERATED TRUE) - - # image + fp16s - set(SHADER_image_fp16s_SRC_NAME_WE "${SHADER_SRC_NAME_WE}_image_fp16s") - - set(SHADER_image_fp16s_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_image_fp16s_SRC_NAME_WE}.spv.hex.h) - add_custom_command( - OUTPUT ${SHADER_image_fp16s_SPV_HEX_FILE} - COMMAND ${GLSLANGVALIDATOR_EXECUTABLE} - ARGS -Dsfp=float16_t -Dsfpvec2=f16vec2 -Dsfpvec4=f16vec4 - -Dafp=float -Dafpvec2=vec2 -Dafpvec4=vec4 -Dafpvec8=mat2x4 -Dafpmat4=mat4 - - -Dimfmtc1=r16f -Dimfmtc4=rgba16f - -Dunfp=mediump - - "-D image1d_ld1(tex,p)=texelFetch(tex,p,0).r" - "-D image2d_ld1(tex,p)=texelFetch(tex,p,0).r" - "-D image3d_ld1(tex,p)=texelFetch(tex,p,0).r" - "-D image1d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}" - "-D image2d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}" - "-D image3d_st1(img,p,v)={vec4 _v;_v.r=v;imageStore(img,p,_v);}" - "-D image1d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" - "-D image2d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" - "-D image3d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" - - "-D image1d_ld4(tex,p)=texelFetch(tex,p,0)" - "-D image2d_ld4(tex,p)=texelFetch(tex,p,0)" - "-D image3d_ld4(tex,p)=texelFetch(tex,p,0)" - "-D image1d_st4(img,p,v)={imageStore(img,p,v);}" - "-D image2d_st4(img,p,v)={imageStore(img,p,v);}" - "-D image3d_st4(img,p,v)={imageStore(img,p,v);}" - "-D image1d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" - "-D image2d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" - "-D image3d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" - - "-D image1d_ld8(tex,p)=mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))" - "-D image2d_ld8(tex,p)=mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))" - "-D image3d_ld8(tex,p)=mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))" - "-D image1d_st8(img,p,v)={imageStore(img,(p)*2,v[0]);imageStore(img,(p)*2+1,v[1]);}" - "-D image2d_st8(img,p,v)={imageStore(img,ivec2(p.x*2,p.y),v[0]);imageStore(img,ivec2(p.x*2+1,p.y),v[1]);}" - "-D image3d_st8(img,p,v)={imageStore(img,ivec3(p.x*2,p.y,p.z),v[0]);imageStore(img,ivec3(p.x*2+1,p.y,p.z),v[1]);}" - "-D image1d_cp8(img,p,tex,sp)={imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}" - "-D 
image2d_cp8(img,p,tex,sp)={imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}" - "-D image3d_cp8(img,p,tex,sp)={imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}" - - "-D buffer_ld1(buf,i)=float(buf[i])" - "-D buffer_st1(buf,i,v)={buf[i]=float16_t(v);}" - "-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}" - "-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i].r=sbuf[si4.r];buf[i].g=sbuf[si4.g];buf[i].b=sbuf[si4.b];buf[i].a=sbuf[si4.a];}" - "-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i].abcd.r=sbuf[si4.r];buf[i].abcd.g=sbuf[si4.g];buf[i].abcd.b=sbuf[si4.b];buf[i].abcd.a=sbuf[si4.a];buf[i].efgh.r=sbuf[sii4.r];buf[i].efgh.g=sbuf[sii4.g];buf[i].efgh.b=sbuf[sii4.b];buf[i].efgh.a=sbuf[sii4.a];}" - "-D buffer_ld2(buf,i)=vec2(buf[i])" - "-D buffer_st2(buf,i,v)={buf[i]=f16vec2(v);}" - "-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}" - "-D buffer_ld4(buf,i)=vec4(buf[i])" - "-D buffer_st4(buf,i,v)={buf[i]=f16vec4(v);}" - "-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}" - "-D buffer_cp4to1(buf,i4,sbuf,si)={buf[i4.r]=sbuf[si].r;buf[i4.g]=sbuf[si].g;buf[i4.b]=sbuf[si].b;buf[i4.a]=sbuf[si].a;}" - "-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i].abcd=sbuf[si2.r];buf[i].efgh=sbuf[si2.g];}" - "-D buffer_ld8(buf,i)=mat2x4(vec4(buf[i].abcd),vec4(buf[i].efgh))" - "-D buffer_st8(buf,i,v)={buf[i].abcd=f16vec4(v[0]);buf[i].efgh=f16vec4(v[1]);}" - "-D buffer_cp8(buf,i,sbuf,si)={buf[i].abcd=sbuf[si].abcd;buf[i].efgh=sbuf[si].efgh;}" - "-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={buf[i4.r]=sbuf[si].abcd.r;buf[i4.g]=sbuf[si].abcd.g;buf[i4.b]=sbuf[si].abcd.b;buf[i4.a]=sbuf[si].abcd.a; buf[ii4.r]=sbuf[si].efgh.r;buf[ii4.g]=sbuf[si].efgh.g;buf[ii4.b]=sbuf[si].efgh.b;buf[ii4.a]=sbuf[si].efgh.a;}" - "-D buffer_cp8to4(buf,i2,sbuf,si)={buf[i2.r]=sbuf[si].abcd;buf[i2.g]=sbuf[si].efgh;}" - - "-D sfp2afpmat4(v)=v" - "-D afp2sfpmat4(v)=v" - "-D psc(x)=(x==0?p.x:x)" - -DNCNN_image_shader=1 -DNCNN_fp16_storage=1 - -V -s -x -o ${SHADER_image_fp16s_SPV_HEX_FILE} ${SHADER_SRC} - DEPENDS ${SHADER_SRC} - COMMENT "Building SPIR-V module ${SHADER_image_fp16s_SRC_NAME_WE}.spv" - VERBATIM - ) - set_source_files_properties(${SHADER_image_fp16s_SPV_HEX_FILE} PROPERTIES GENERATED TRUE) - - # image + fp16s + fp16a - set(SHADER_image_fp16sa_SRC_NAME_WE "${SHADER_SRC_NAME_WE}_image_fp16sa") - - set(SHADER_image_fp16sa_SPV_HEX_FILE ${CMAKE_CURRENT_BINARY_DIR}/${SHADER_image_fp16sa_SRC_NAME_WE}.spv.hex.h) - add_custom_command( - OUTPUT ${SHADER_image_fp16sa_SPV_HEX_FILE} - COMMAND ${GLSLANGVALIDATOR_EXECUTABLE} - ARGS -Dsfp=float16_t -Dsfpvec2=f16vec2 -Dsfpvec4=f16vec4 -Dsfpvec8=f16mat2x4 -Dsfpmat4=f16mat4 - -Dafp=float16_t -Dafpvec2=f16vec2 -Dafpvec4=f16vec4 -Dafpvec8=f16mat2x4 -Dafpmat4=f16mat4 - - -Dimfmtc1=r16f -Dimfmtc4=rgba16f - -Dunfp=mediump - - "-D image1d_ld1(tex,p)=float16_t(texelFetch(tex,p,0).r)" - "-D image2d_ld1(tex,p)=float16_t(texelFetch(tex,p,0).r)" - "-D image3d_ld1(tex,p)=float16_t(texelFetch(tex,p,0).r)" - "-D image1d_st1(img,p,v)={f16vec4 _v;_v.r=float16_t(v);imageStore(img,p,_v);}" - "-D image2d_st1(img,p,v)={f16vec4 _v;_v.r=float16_t(v);imageStore(img,p,_v);}" - "-D image3d_st1(img,p,v)={f16vec4 _v;_v.r=float16_t(v);imageStore(img,p,_v);}" - "-D image1d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" - "-D image2d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" - "-D 
image3d_cp1(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" - - "-D image1d_ld4(tex,p)=f16vec4(texelFetch(tex,p,0))" - "-D image2d_ld4(tex,p)=f16vec4(texelFetch(tex,p,0))" - "-D image3d_ld4(tex,p)=f16vec4(texelFetch(tex,p,0))" - "-D image1d_st4(img,p,v)={imageStore(img,p,vec4(v));}" - "-D image2d_st4(img,p,v)={imageStore(img,p,vec4(v));}" - "-D image3d_st4(img,p,v)={imageStore(img,p,vec4(v));}" - "-D image1d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" - "-D image2d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" - "-D image3d_cp4(img,p,tex,sp)={imageStore(img,p,texelFetch(tex,sp,0));}" - - "-D image1d_ld8(tex,p)=f16mat2x4(texelFetch(tex,(p)*2,0),texelFetch(tex,(p)*2+1,0))" - "-D image2d_ld8(tex,p)=f16mat2x4(texelFetch(tex,ivec2(p.x*2,p.y),0),texelFetch(tex,ivec2(p.x*2+1,p.y),0))" - "-D image3d_ld8(tex,p)=f16mat2x4(texelFetch(tex,ivec3(p.x*2,p.y,p.z),0),texelFetch(tex,ivec3(p.x*2+1,p.y,p.z),0))" - "-D image1d_st8(img,p,v)={imageStore(img,(p)*2,vec4(v[0]));imageStore(img,(p)*2+1,vec4(v[1]));}" - "-D image2d_st8(img,p,v)={imageStore(img,ivec2(p.x*2,p.y),vec4(v[0]));imageStore(img,ivec2(p.x*2+1,p.y),vec4(v[1]));}" - "-D image3d_st8(img,p,v)={imageStore(img,ivec3(p.x*2,p.y,p.z),vec4(v[0]));imageStore(img,ivec3(p.x*2+1,p.y,p.z),vec4(v[1]));}" - "-D image1d_cp8(img,p,tex,sp)={imageStore(img,(p)*2,texelFetch(tex,sp*2,0));imageStore(img,(p)*2+1,texelFetch(tex,sp*2+1,0));}" - "-D image2d_cp8(img,p,tex,sp)={imageStore(img,ivec2(p.x*2,p.y),texelFetch(tex,ivec2(sp.x*2,sp.y),0));imageStore(img,ivec2(p.x*2+1,p.y),texelFetch(tex,ivec2(sp.x*2+1,sp.y),0));}" - "-D image3d_cp8(img,p,tex,sp)={imageStore(img,ivec3(p.x*2,p.y,p.z),texelFetch(tex,ivec3(sp.x*2,sp.y,sp.z),0));imageStore(img,ivec3(p.x*2+1,p.y,p.z),texelFetch(tex,ivec3(sp.x*2+1,sp.y,sp.z),0));}" - - "-D buffer_ld1(buf,i)=buf[i]" - "-D buffer_st1(buf,i,v)={buf[i]=v;}" - "-D buffer_cp1(buf,i,sbuf,si)={buf[i]=sbuf[si];}" - "-D buffer_cp1to4(buf,i,sbuf,si4)={buf[i]=f16vec4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a]);}" - "-D buffer_cp1to8(buf,i,sbuf,si4,sii4)={buf[i]=f16mat2x4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a],sbuf[sii4.r],sbuf[sii4.g],sbuf[sii4.b],sbuf[sii4.a]);}" - "-D buffer_ld2(buf,i)=buf[i]" - "-D buffer_st2(buf,i,v)={buf[i]=v;}" - "-D buffer_cp2(buf,i,sbuf,si)={buf[i]=sbuf[si];}" - "-D buffer_ld4(buf,i)=buf[i]" - "-D buffer_st4(buf,i,v)={buf[i]=v;}" - "-D buffer_cp4(buf,i,sbuf,si)={buf[i]=sbuf[si];}" - "-D buffer_cp4to1(buf,i4,sbuf,si)={buf[i4.r]=sbuf[si].r;buf[i4.g]=sbuf[si].g;buf[i4.b]=sbuf[si].b;buf[i4.a]=sbuf[si].a;}" - "-D buffer_cp4to8(buf,i,sbuf,si2)={buf[i]=f16mat2x4(sbuf[si2.r],sbuf[si2.g]);}" - "-D buffer_ld8(buf,i)=buf[i]" - "-D buffer_st8(buf,i,v)={buf[i]=v;}" - "-D buffer_cp8(buf,i,sbuf,si)={buf[i]=sbuf[si];}" - "-D buffer_cp8to1(buf,i4,ii4,sbuf,si)={f16mat2x4 _v=sbuf[si]; buf[i4.r]=_v[0].r;buf[i4.g]=_v[0].g;buf[i4.b]=_v[0].b;buf[i4.a]=_v[0].a; buf[ii4.r]=_v[1].r;buf[ii4.g]=_v[1].g;buf[ii4.b]=_v[1].b;buf[ii4.a]=_v[1].a;}" - "-D buffer_cp8to4(buf,i2,sbuf,si)={f16mat2x4 _v=sbuf[si]; buf[i2.r]=_v[0];buf[i2.g]=_v[1];}" - "-D sfp2afpmat4(v)=v" - "-D afp2sfpmat4(v)=v" - - "-D psc(x)=(x==0?p.x:x)" - -DNCNN_image_shader=1 -DNCNN_fp16_storage=1 -DNCNN_fp16_arithmetic=1 - -V -s -x -o ${SHADER_image_fp16sa_SPV_HEX_FILE} ${SHADER_SRC} - DEPENDS ${SHADER_SRC} - COMMENT "Building SPIR-V module ${SHADER_image_fp16sa_SRC_NAME_WE}.spv" - VERBATIM - ) - set_source_files_properties(${SHADER_image_fp16sa_SPV_HEX_FILE} PROPERTIES GENERATED TRUE) - - set(LOCAL_SHADER_SPV_HEADER 
${CMAKE_CURRENT_BINARY_DIR}/${SHADER_SRC_NAME_WE}.spv.h) - - file(WRITE ${LOCAL_SHADER_SPV_HEADER} - "static const uint32_t ${SHADER_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_SRC_NAME_WE}.spv.hex.h\"\n};\n" - "static const uint32_t ${SHADER_fp16p_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_fp16p_SRC_NAME_WE}.spv.hex.h\"\n};\n" - "static const uint32_t ${SHADER_fp16pa_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_fp16pa_SRC_NAME_WE}.spv.hex.h\"\n};\n" - "static const uint32_t ${SHADER_fp16s_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_fp16s_SRC_NAME_WE}.spv.hex.h\"\n};\n" - "static const uint32_t ${SHADER_fp16sa_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_fp16sa_SRC_NAME_WE}.spv.hex.h\"\n};\n" - "static const uint32_t ${SHADER_image_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_image_SRC_NAME_WE}.spv.hex.h\"\n};\n" - "static const uint32_t ${SHADER_image_fp16p_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_image_fp16p_SRC_NAME_WE}.spv.hex.h\"\n};\n" - "static const uint32_t ${SHADER_image_fp16pa_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_image_fp16pa_SRC_NAME_WE}.spv.hex.h\"\n};\n" - "static const uint32_t ${SHADER_image_fp16s_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_image_fp16s_SRC_NAME_WE}.spv.hex.h\"\n};\n" - "static const uint32_t ${SHADER_image_fp16sa_SRC_NAME_WE}_spv_data[] = {\n#include \"${SHADER_image_fp16sa_SRC_NAME_WE}.spv.hex.h\"\n};\n" - ) - - set_source_files_properties(${LOCAL_SHADER_SPV_HEADER} PROPERTIES GENERATED TRUE) - - set(LOCAL_SHADER_SPV_HEX_HEADERS - ${SHADER_SPV_HEX_FILE} - ${SHADER_fp16p_SPV_HEX_FILE} - ${SHADER_fp16pa_SPV_HEX_FILE} - ${SHADER_fp16s_SPV_HEX_FILE} - ${SHADER_fp16sa_SPV_HEX_FILE} - ${SHADER_image_SPV_HEX_FILE} - ${SHADER_image_fp16p_SPV_HEX_FILE} - ${SHADER_image_fp16pa_SPV_HEX_FILE} - ${SHADER_image_fp16s_SPV_HEX_FILE} - ${SHADER_image_fp16sa_SPV_HEX_FILE} - ) - - set(${SHADER_SPV_HEADER} ${LOCAL_SHADER_SPV_HEADER} PARENT_SCOPE) - set(${SHADER_SPV_HEX_HEADERS} ${LOCAL_SHADER_SPV_HEX_HEADERS} PARENT_SCOPE) - -endfunction() diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d53bb1099e0..9ba035b9422 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -51,8 +51,6 @@ ncnn_src_group(ncnn_SRCS "sources") include_directories("${CMAKE_CURRENT_SOURCE_DIR}/layer/${NCNN_TARGET_ARCH}") -include(${CMAKE_CURRENT_SOURCE_DIR}/../cmake/ncnn_generate_shader_spv_header.cmake) - # ncnn macro include(${CMAKE_CURRENT_SOURCE_DIR}/../cmake/ncnn_add_shader.cmake) include(${CMAKE_CURRENT_SOURCE_DIR}/../cmake/ncnn_add_layer.cmake) From d0a930576486b510d860c2722393ae5b73e41267 Mon Sep 17 00:00:00 2001 From: nihuini Date: Fri, 5 Jan 2024 10:49:49 +0800 Subject: [PATCH 17/19] wip --- src/layer.cpp | 94 +++++++++++++++++++++++---------------------------- 1 file changed, 42 insertions(+), 52 deletions(-) diff --git a/src/layer.cpp b/src/layer.cpp index 253430a1974..cca3e77bf1f 100644 --- a/src/layer.cpp +++ b/src/layer.cpp @@ -288,7 +288,7 @@ class Layer_final : public Layer support_tensor_storage = 0; #if NCNN_VULKAN - if (layer_vulkan && vkdev) + if (layer_vulkan) { support_vulkan = layer_vulkan->support_vulkan; support_image_storage = layer_vulkan->support_image_storage; @@ -318,81 +318,71 @@ class Layer_final : public Layer { set_layer_properties(); #if NCNN_VULKAN - if (layer_vulkan && vkdev) + if (layer_vulkan) { - int ret = layer_vulkan->load_param(pd); - if (ret) - return ret; + if (vkdev) + { + int ret = layer_vulkan->load_param(pd); + get_layer_properties(); + + if (layer_vulkan->support_vulkan) + return ret; + } 
+ + // fallback to cpu layer + delete layer_vulkan; + layer_vulkan = 0; } - else #endif // NCNN_VULKAN - { - int ret = layer_cpu->load_param(pd); - if (ret) - return ret; - } + + int ret = layer_cpu->load_param(pd); get_layer_properties(); - return 0; + return ret; } virtual int load_model(const ModelBin& mb) { #if NCNN_VULKAN - if (layer_vulkan && vkdev) + if (layer_vulkan) { int ret = layer_vulkan->load_model(mb); - if (ret) - return ret; + get_layer_properties(); + return ret; } - else #endif // NCNN_VULKAN - { - int ret = layer_cpu->load_model(mb); - if (ret) - return ret; - } + + int ret = layer_cpu->load_model(mb); get_layer_properties(); - return 0; + return ret; } virtual int create_pipeline(const Option& opt) { set_layer_properties(); #if NCNN_VULKAN - if (layer_vulkan && vkdev) + if (layer_vulkan) { int ret = layer_vulkan->create_pipeline(opt); - if (ret) - return ret; + get_layer_properties(); + return ret; } - else #endif // NCNN_VULKAN - { - int ret = layer_cpu->create_pipeline(opt); - if (ret) - return ret; - } + + int ret = layer_cpu->create_pipeline(opt); get_layer_properties(); - return 0; + return ret; } virtual int destroy_pipeline(const Option& opt) { #if NCNN_VULKAN - if (layer_vulkan && vkdev) + if (layer_vulkan) { - int ret = layer_vulkan->destroy_pipeline(opt); - if (ret) - return ret; + return layer_vulkan->destroy_pipeline(opt); } - else #endif // NCNN_VULKAN - { - int ret = layer_cpu->destroy_pipeline(opt); - if (ret) - return ret; - } - return 0; + + return layer_cpu->destroy_pipeline(opt); } public: @@ -420,47 +410,47 @@ class Layer_final : public Layer public: virtual int upload_model(VkTransfer& cmd, const Option& opt) { - return layer_vulkan->upload_model(cmd, opt); + return layer_vulkan ? layer_vulkan->upload_model(cmd, opt) : -1; } virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const { - return layer_vulkan->forward(bottom_blobs, top_blobs, cmd, opt); + return layer_vulkan ? layer_vulkan->forward(bottom_blobs, top_blobs, cmd, opt) : -1; } virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const { - return layer_vulkan->forward(bottom_blob, top_blob, cmd, opt); + return layer_vulkan ? layer_vulkan->forward(bottom_blob, top_blob, cmd, opt) : -1; } virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const { - return layer_vulkan->forward(bottom_blobs, top_blobs, cmd, opt); + return layer_vulkan ? layer_vulkan->forward(bottom_blobs, top_blobs, cmd, opt) : -1; } virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const { - return layer_vulkan->forward(bottom_blob, top_blob, cmd, opt); + return layer_vulkan ? layer_vulkan->forward(bottom_blob, top_blob, cmd, opt) : -1; } virtual int forward_inplace(std::vector& bottom_top_blobs, VkCompute& cmd, const Option& opt) const { - return layer_vulkan->forward_inplace(bottom_top_blobs, cmd, opt); + return layer_vulkan ? layer_vulkan->forward_inplace(bottom_top_blobs, cmd, opt) : -1; } virtual int forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const { - return layer_vulkan->forward_inplace(bottom_top_blob, cmd, opt); + return layer_vulkan ? 
From 8813cb74f8805fcb9be15bd7ab04df52ec3c1dde Mon Sep 17 00:00:00 2001
From: nihuini
Date: Fri, 5 Jan 2024 10:53:57 +0800
Subject: [PATCH 18/19] clean

---
 src/layer/vulkan/convolution1d_vulkan.cpp          | 4 +---
 src/layer/vulkan/convolution_vulkan.cpp            | 4 +---
 src/layer/vulkan/convolutiondepthwise_vulkan.cpp   | 4 +---
 src/layer/vulkan/deconvolution_vulkan.cpp          | 4 +---
 src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp | 4 +---
 5 files changed, 5 insertions(+), 15 deletions(-)

diff --git a/src/layer/vulkan/convolution1d_vulkan.cpp b/src/layer/vulkan/convolution1d_vulkan.cpp
index fdef247bdf9..2747012addc 100644
--- a/src/layer/vulkan/convolution1d_vulkan.cpp
+++ b/src/layer/vulkan/convolution1d_vulkan.cpp
@@ -32,8 +32,6 @@ Convolution1D_vulkan::Convolution1D_vulkan()
 int Convolution1D_vulkan::load_param(const ParamDict& pd)
 {
     int ret = Convolution1D::load_param(pd);
-    if (ret)
-        return ret;
 
     if (dynamic_weight)
     {
@@ -41,7 +39,7 @@ int Convolution1D_vulkan::load_param(const ParamDict& pd)
         support_image_storage = false;
     }
 
-    return 0;
+    return ret;
 }
 
 int Convolution1D_vulkan::create_pipeline(const Option& _opt)
diff --git a/src/layer/vulkan/convolution_vulkan.cpp b/src/layer/vulkan/convolution_vulkan.cpp
index 4ba7c279e3c..302ab9085c5 100644
--- a/src/layer/vulkan/convolution_vulkan.cpp
+++ b/src/layer/vulkan/convolution_vulkan.cpp
@@ -46,8 +46,6 @@ Convolution_vulkan::Convolution_vulkan()
 int Convolution_vulkan::load_param(const ParamDict& pd)
 {
     int ret = Convolution::load_param(pd);
-    if (ret)
-        return ret;
 
     if (dynamic_weight)
     {
@@ -55,7 +53,7 @@ int Convolution_vulkan::load_param(const ParamDict& pd)
         support_image_storage = false;
     }
 
-    return 0;
+    return ret;
 }
 
 int Convolution_vulkan::create_pipeline(const Option& _opt)
diff --git a/src/layer/vulkan/convolutiondepthwise_vulkan.cpp b/src/layer/vulkan/convolutiondepthwise_vulkan.cpp
index 9b54d136ccf..54f73ea0695 100644
--- a/src/layer/vulkan/convolutiondepthwise_vulkan.cpp
+++ b/src/layer/vulkan/convolutiondepthwise_vulkan.cpp
@@ -44,8 +44,6 @@ ConvolutionDepthWise_vulkan::ConvolutionDepthWise_vulkan()
 int ConvolutionDepthWise_vulkan::load_param(const ParamDict& pd)
 {
     int ret = ConvolutionDepthWise::load_param(pd);
-    if (ret)
-        return ret;
 
     if (dynamic_weight)
     {
@@ -53,7 +51,7 @@ int ConvolutionDepthWise_vulkan::load_param(const ParamDict& pd)
         support_image_storage = false;
     }
 
-    return 0;
+    return ret;
 }
 
 int ConvolutionDepthWise_vulkan::create_pipeline(const Option& _opt)
diff --git a/src/layer/vulkan/deconvolution_vulkan.cpp b/src/layer/vulkan/deconvolution_vulkan.cpp
index 83f71f84172..66e57db57bf 100644
--- a/src/layer/vulkan/deconvolution_vulkan.cpp
+++ b/src/layer/vulkan/deconvolution_vulkan.cpp
@@ -36,8 +36,6 @@ Deconvolution_vulkan::Deconvolution_vulkan()
 int Deconvolution_vulkan::load_param(const ParamDict& pd)
 {
     int ret = Deconvolution::load_param(pd);
-    if (ret)
-        return ret;
 
     if (dynamic_weight)
     {
@@ -45,7 +43,7 @@ int Deconvolution_vulkan::load_param(const ParamDict& pd)
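
Note: patch 18 drops the early "if (ret) return ret;" after each base-class load_param and returns the saved code at the end instead of a hard-coded 0, so the dynamic_weight flag overrides always run and the error code still reaches the caller, which the Layer_final logic from patch 17 now relies on. A tiny sketch of the effect, with stand-in Base/Derived classes rather than ncnn's layer types:

    #include <cstdio>

    struct Base
    {
        bool support_image_storage;
        Base() : support_image_storage(true) {}
        virtual ~Base() {}
        virtual int load_param(int v) { return v < 0 ? -1 : 0; }
    };

    struct Derived : public Base
    {
        bool dynamic_weight;
        Derived() : dynamic_weight(true) {}
        virtual int load_param(int v)
        {
            int ret = Base::load_param(v);
            if (dynamic_weight)
                support_image_storage = false; // applied even when ret != 0 now
            return ret;                        // error code still propagates
        }
    };

    int main()
    {
        Derived d;
        int ret = d.load_param(-1);
        std::printf("ret=%d image_storage=%d\n", ret, (int)d.support_image_storage);
        return 0;
    }
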
        support_image_storage = false;
     }
 
-    return 0;
+    return ret;
 }
 
 int Deconvolution_vulkan::create_pipeline(const Option& _opt)
diff --git a/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp b/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp
index af23d229547..dcc3100be29 100644
--- a/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp
+++ b/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp
@@ -45,8 +45,6 @@ DeconvolutionDepthWise_vulkan::DeconvolutionDepthWise_vulkan()
 int DeconvolutionDepthWise_vulkan::load_param(const ParamDict& pd)
 {
     int ret = DeconvolutionDepthWise::load_param(pd);
-    if (ret)
-        return ret;
 
     if (dynamic_weight)
     {
@@ -54,7 +52,7 @@ int DeconvolutionDepthWise_vulkan::load_param(const ParamDict& pd)
         support_image_storage = false;
     }
 
-    return 0;
+    return ret;
 }
 
 int DeconvolutionDepthWise_vulkan::create_pipeline(const Option& _opt)

From 8d7aa028e47d68c67fe7fcfa169b0988a85766d4 Mon Sep 17 00:00:00 2001
From: nihuini
Date: Fri, 5 Jan 2024 11:47:24 +0800
Subject: [PATCH 19/19] less

---
 src/layer/vulkan/convolutiondepthwise_vulkan.cpp   | 3 +++
 src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp | 3 +++
 src/layer/vulkan/innerproduct_vulkan.cpp           | 6 ++++++
 3 files changed, 12 insertions(+)

diff --git a/src/layer/vulkan/convolutiondepthwise_vulkan.cpp b/src/layer/vulkan/convolutiondepthwise_vulkan.cpp
index 54f73ea0695..59eca6a55c6 100644
--- a/src/layer/vulkan/convolutiondepthwise_vulkan.cpp
+++ b/src/layer/vulkan/convolutiondepthwise_vulkan.cpp
@@ -271,6 +271,9 @@ int ConvolutionDepthWise_vulkan::create_pipeline(const Option& _opt)
             pipeline_convolutiondepthwise_pack8->create(LayerShaderType::convolutiondepthwise_pack8, opt, specializations);
         }
 
+        weight_data.release();
+        bias_data.release();
+
         return 0;
     }
 
diff --git a/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp b/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp
index dcc3100be29..a715a4782f4 100644
--- a/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp
+++ b/src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp
@@ -295,6 +295,9 @@ int DeconvolutionDepthWise_vulkan::create_pipeline(const Option& _opt)
             pipeline_deconvolutiondepthwise_pack8->create(LayerShaderType::deconvolutiondepthwise_pack8, opt, specializations);
         }
 
+        weight_data.release();
+        bias_data.release();
+
         return 0;
     }
 
diff --git a/src/layer/vulkan/innerproduct_vulkan.cpp b/src/layer/vulkan/innerproduct_vulkan.cpp
index de23feef70e..ee73d4bb4ac 100644
--- a/src/layer/vulkan/innerproduct_vulkan.cpp
+++ b/src/layer/vulkan/innerproduct_vulkan.cpp
@@ -154,6 +154,9 @@ int InnerProduct_vulkan::create_pipeline(const Option& _opt)
         pipeline_innerproduct_gemm->set_optimal_local_size_xyz(local_size_xyz);
         pipeline_innerproduct_gemm->create(shader_type_index, opt, specializations);
 
+        weight_data.release();
+        bias_data.release();
+
         return 0;
     }
 
@@ -361,6 +364,9 @@ int InnerProduct_vulkan::create_pipeline(const Option& _opt)
     pipeline_innerproduct_gemm->set_optimal_local_size_xyz(local_size_xyz);
     pipeline_innerproduct_gemm->create(shader_type_index, opt, specializations);
 
+    weight_data.release();
+    bias_data.release();
+
     return 0;
 }
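
Note: patch 19 releases the host-side weight_data and bias_data Mats at the end of create_pipeline, presumably because once the weights have been repacked for the pipelines the original host copies only add to peak memory. A standalone sketch of the idea, using a shared_ptr-backed stand-in for ncnn's refcounted Mat rather than the real type:

    #include <cstdio>
    #include <memory>
    #include <vector>

    struct Mat
    {
        std::shared_ptr<std::vector<float> > data;
        void release() { data.reset(); }
        bool empty() const { return !data; }
    };

    struct Pipeline
    {
        std::vector<float> packed; // stands in for the packed/device-side copy
        void create(const Mat& w)
        {
            if (w.data)
                packed = *w.data; // the real create_pipeline repacks weights here
        }
    };

    int main()
    {
        Mat weight_data;
        weight_data.data = std::make_shared<std::vector<float> >(64, 1.f);

        Pipeline pipeline;
        pipeline.create(weight_data);

        // once the pipeline owns its copy, the host-side weights can be dropped
        weight_data.release();
        std::printf("host empty=%d packed=%zu\n", (int)weight_data.empty(), pipeline.packed.size());
        return 0;
    }
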