From e7cae68a227f7bab2e085a9e1f24437d6749ac23 Mon Sep 17 00:00:00 2001
From: nihui
Date: Sat, 13 Jul 2024 23:56:29 +0800
Subject: [PATCH 01/38] pnnx convert onnx logsoftmax/logsigmoid/mish/selu/sigmoid/silu/softmin/softplus/softshrink/softsign/tanh/tanhshrink (#5581)

---
 tools/pnnx/src/pass_level2/F_log_softmax.cpp | 73 +++++++++++++++++++
 tools/pnnx/src/pass_level2/F_logsigmoid.cpp  | 22 ++++++
 tools/pnnx/src/pass_level2/F_mish.cpp        | 23 ++++++
 tools/pnnx/src/pass_level2/F_selu.cpp        | 21 ++++++
 tools/pnnx/src/pass_level2/F_softmin.cpp     | 22 ++++++
 tools/pnnx/src/pass_level2/F_softplus.cpp    | 58 +++++++++++++++
 tools/pnnx/src/pass_level2/F_softshrink.cpp  | 58 +++++++++++++++
 tools/pnnx/src/pass_level2/F_softsign.cpp    | 24 +++++++
 tools/pnnx/src/pass_level2/F_tanhshrink.cpp  | 22 ++++++
 tools/pnnx/tests/onnx/CMakeLists.txt         | 22 ++++++
 tools/pnnx/tests/onnx/test_F_log_softmax.py  | 66 +++++++++++++++++
 tools/pnnx/tests/onnx/test_F_logsigmoid.py   | 66 +++++++++++++++++
 tools/pnnx/tests/onnx/test_F_mish.py         | 76 ++++++++++++++++++++
 tools/pnnx/tests/onnx/test_F_selu.py         | 66 +++++++++++++++++
 tools/pnnx/tests/onnx/test_F_sigmoid.py      |  9 ++-
 tools/pnnx/tests/onnx/test_F_silu.py         | 69 ++++++++++++++++++
 tools/pnnx/tests/onnx/test_F_softmin.py      | 66 +++++++++++++++++
 tools/pnnx/tests/onnx/test_F_softplus.py     | 70 ++++++++++++++++++
 tools/pnnx/tests/onnx/test_F_softshrink.py   | 70 ++++++++++++++++++
 tools/pnnx/tests/onnx/test_F_softsign.py     | 66 +++++++++++++++++
 tools/pnnx/tests/onnx/test_F_tanh.py         | 66 +++++++++++++++++
 tools/pnnx/tests/onnx/test_F_tanhshrink.py   | 66 +++++++++++++++++
 tools/pnnx/tests/onnx/test_nn_LogSigmoid.py  | 68 ++++++++++++++++++
 tools/pnnx/tests/onnx/test_nn_LogSoftmax.py  | 71 ++++++++++++++++++
 tools/pnnx/tests/onnx/test_nn_Mish.py        | 72 +++++++++++++++++++
 tools/pnnx/tests/onnx/test_nn_SELU.py        | 68 ++++++++++++++++++
 tools/pnnx/tests/onnx/test_nn_SiLU.py        | 68 ++++++++++++++++++
 tools/pnnx/tests/onnx/test_nn_Sigmoid.py     |  9 ++-
 tools/pnnx/tests/onnx/test_nn_Softmin.py     | 71 ++++++++++++++++++
 tools/pnnx/tests/onnx/test_nn_Softplus.py    | 73 +++++++++++++++++++
 tools/pnnx/tests/onnx/test_nn_Softshrink.py  | 73 +++++++++++++++++++
 tools/pnnx/tests/onnx/test_nn_Softsign.py    | 68 ++++++++++++++++++
 tools/pnnx/tests/onnx/test_nn_Tanh.py        | 68 ++++++++++++++++++
 tools/pnnx/tests/onnx/test_nn_Tanhshrink.py  | 68 ++++++++++++++++++
 34 files changed, 1872 insertions(+), 6 deletions(-)
 create mode 100644 tools/pnnx/tests/onnx/test_F_log_softmax.py
 create mode 100644 tools/pnnx/tests/onnx/test_F_logsigmoid.py
 create mode 100644 tools/pnnx/tests/onnx/test_F_mish.py
 create mode 100644 tools/pnnx/tests/onnx/test_F_selu.py
 create mode 100644 tools/pnnx/tests/onnx/test_F_silu.py
 create mode 100644 tools/pnnx/tests/onnx/test_F_softmin.py
 create mode 100644 tools/pnnx/tests/onnx/test_F_softplus.py
 create mode 100644 tools/pnnx/tests/onnx/test_F_softshrink.py
 create mode 100644 tools/pnnx/tests/onnx/test_F_softsign.py
 create mode 100644 tools/pnnx/tests/onnx/test_F_tanh.py
 create mode 100644 tools/pnnx/tests/onnx/test_F_tanhshrink.py
 create mode 100644 tools/pnnx/tests/onnx/test_nn_LogSigmoid.py
 create mode 100644 tools/pnnx/tests/onnx/test_nn_LogSoftmax.py
 create mode 100644 tools/pnnx/tests/onnx/test_nn_Mish.py
 create mode 100644 tools/pnnx/tests/onnx/test_nn_SELU.py
 create mode 100644 tools/pnnx/tests/onnx/test_nn_SiLU.py
 create mode 100644 tools/pnnx/tests/onnx/test_nn_Softmin.py
 create mode 100644 tools/pnnx/tests/onnx/test_nn_Softplus.py
 create mode 100644 tools/pnnx/tests/onnx/test_nn_Softshrink.py
 create mode 100644 tools/pnnx/tests/onnx/test_nn_Softsign.py
 create mode 100644 tools/pnnx/tests/onnx/test_nn_Tanh.py
 create mode 100644 tools/pnnx/tests/onnx/test_nn_Tanhshrink.py

diff --git a/tools/pnnx/src/pass_level2/F_log_softmax.cpp b/tools/pnnx/src/pass_level2/F_log_softmax.cpp
index 0264973783b..ad9eba30d1c 100644
--- a/tools/pnnx/src/pass_level2/F_log_softmax.cpp
+++ b/tools/pnnx/src/pass_level2/F_log_softmax.cpp
@@ -39,4 +39,77 @@ pnnx.Output output 1 0 out
 
 REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_log_softmax, 10)
 
+class F_log_softmax_onnx : public GraphRewriterPass
+{
+public:
+    const char* match_pattern_graph() const
+    {
+        return R"PNNXIR(7767517
+3 2
+pnnx.Input input_0 0 1 input
+LogSoftmax op_0 1 1 input out axis=%dim
+pnnx.Output output 1 0 out
+)PNNXIR";
+    }
+
+    const char* type_str() const
+    {
+        return "F.log_softmax";
+    }
+};
+
+REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_log_softmax_onnx, 10)
+
+class F_log_softmax_onnx_1 : public GraphRewriterPass
+{
+public:
+    const char* match_pattern_graph() const
+    {
+        return R"PNNXIR(7767517
+5 4
+pnnx.Input input_0 0 1 input
+Transpose op_0 1 1 input a perm=%perm
+LogSoftmax op_1 1 1 a b axis=%axis
+Transpose op_2 1 1 b out perm=%perm
+pnnx.Output output 1 0 out
+)PNNXIR";
+    }
+
+    const char* type_str() const
+    {
+        return "F.log_softmax";
+    }
+
+    bool match(const std::map<std::string, Parameter>& captured_params) const
+    {
+        const std::vector<int>& perm = captured_params.at("perm").ai;
+        const int axis = captured_params.at("axis").i;
+
+        if (axis >= (int)perm.size())
+            return false;
+
+        int excount = 0;
+        for (int i = 0; i < (int)perm.size(); i++)
+        {
+            if (perm[i] != i)
+                excount++;
+        }
+
+        if (excount != 2)
+            return false;
+
+        return true;
+    }
+
+    void write(Operator* op, const std::map<std::string, Parameter>& captured_params) const
+    {
+        const std::vector<int>& perm = captured_params.at("perm").ai;
+        const int axis = captured_params.at("axis").i;
+
+        op->params["dim"] = perm[axis];
+    }
+};
+
+REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_log_softmax_onnx_1, 9)
+
 } // namespace pnnx
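Note on F_log_softmax_onnx_1: when log_softmax runs on an axis that the exporter cannot express directly, the ONNX graph carries a Transpose -> LogSoftmax -> Transpose sandwich with the same perm on both sides; the match() guard requires perm to swap exactly two axes, and write() then recovers the original dim as perm[axis]. A minimal PyTorch sketch of the equivalence being matched (illustrative only, not part of the patch):

    import torch
    import torch.nn.functional as F

    x = torch.rand(1, 3, 12, 16)
    perm = [0, 3, 2, 1]   # swaps exactly two axes, so the excount == 2 guard holds
    axis = 3              # LogSoftmax axis in the transposed layout
    a = x.permute(perm).log_softmax(axis).permute(perm)
    b = F.log_softmax(x, dim=perm[axis])  # dim = perm[3] = 1
    assert torch.allclose(a, b)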
diff --git a/tools/pnnx/src/pass_level2/F_logsigmoid.cpp b/tools/pnnx/src/pass_level2/F_logsigmoid.cpp
index e35670686a0..e0d4df607f2 100644
--- a/tools/pnnx/src/pass_level2/F_logsigmoid.cpp
+++ b/tools/pnnx/src/pass_level2/F_logsigmoid.cpp
@@ -37,4 +37,26 @@ pnnx.Output output 1 0 out
 
 REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_logsigmoid, 10)
 
+class F_logsigmoid_onnx : public GraphRewriterPass
+{
+public:
+    const char* match_pattern_graph() const
+    {
+        return R"PNNXIR(7767517
+4 3
+pnnx.Input input 0 1 input
+aten::sigmoid op_0 1 1 input a
+aten::log op_1 1 1 a out
+pnnx.Output output 1 0 out
+)PNNXIR";
+    }
+
+    const char* type_str() const
+    {
+        return "F.logsigmoid";
+    }
+};
+
+REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_logsigmoid_onnx, 9)
+
 } // namespace pnnx
diff --git a/tools/pnnx/src/pass_level2/F_mish.cpp b/tools/pnnx/src/pass_level2/F_mish.cpp
index 1a083ba85d9..485a7e3b0b5 100644
--- a/tools/pnnx/src/pass_level2/F_mish.cpp
+++ b/tools/pnnx/src/pass_level2/F_mish.cpp
@@ -62,4 +62,27 @@ pnnx.Output output 1 0 out
 
 REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_mish_1, 9)
 
+class F_mish_onnx : public GraphRewriterPass
+{
+public:
+    const char* match_pattern_graph() const
+    {
+        return R"PNNXIR(7767517
+5 4
+pnnx.Input input 0 1 input
+Softplus op_0 1 1 input a
+aten::tanh op_1 1 1 a b
+aten::mul op_2 2 1 input b out
+pnnx.Output output 1 0 out
+)PNNXIR";
+    }
+
+    const char* type_str() const
+    {
+        return "F.mish";
+    }
+};
+
+REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_mish_onnx, 9)
+
 } // namespace pnnx
diff --git a/tools/pnnx/src/pass_level2/F_selu.cpp b/tools/pnnx/src/pass_level2/F_selu.cpp
index 592c3dd8ed7..9df970b1bbc 100644
--- a/tools/pnnx/src/pass_level2/F_selu.cpp
+++ b/tools/pnnx/src/pass_level2/F_selu.cpp
@@ -37,4 +37,25 @@ pnnx.Output output 1 0 out
 
 REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_selu, 10)
 
+class F_selu_onnx : public GraphRewriterPass
+{
+public:
+    const char* match_pattern_graph() const
+    {
+        return R"PNNXIR(7767517
+3 2
+pnnx.Input input 0 1 input
+Selu op_0 1 1 input out
+pnnx.Output output 1 0 out
+)PNNXIR";
+    }
+
+    const char* type_str() const
+    {
+        return "F.selu";
+    }
+};
+
+REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_selu_onnx, 10)
+
 } // namespace pnnx
diff --git a/tools/pnnx/src/pass_level2/F_softmin.cpp b/tools/pnnx/src/pass_level2/F_softmin.cpp
index bb0768663c5..89e5d9aeaf8 100644
--- a/tools/pnnx/src/pass_level2/F_softmin.cpp
+++ b/tools/pnnx/src/pass_level2/F_softmin.cpp
@@ -40,4 +40,26 @@ pnnx.Output output 1 0 out
 
 REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_softmin, 9)
 
+class F_softmin_onnx : public GraphRewriterPass
+{
+public:
+    const char* match_pattern_graph() const
+    {
+        return R"PNNXIR(7767517
+4 3
+pnnx.Input input 0 1 input
+aten::neg op_0 1 1 input 6
+Softmax op_1 1 1 6 out axis=%dim
+pnnx.Output output 1 0 out
+)PNNXIR";
+    }
+
+    const char* type_str() const
+    {
+        return "F.softmin";
+    }
+};
+
+REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_softmin_onnx, 9)
+
 } // namespace pnnx
diff --git a/tools/pnnx/src/pass_level2/F_softplus.cpp b/tools/pnnx/src/pass_level2/F_softplus.cpp
index c6a5279b414..8d346eb76ed 100644
--- a/tools/pnnx/src/pass_level2/F_softplus.cpp
+++ b/tools/pnnx/src/pass_level2/F_softplus.cpp
@@ -39,4 +39,62 @@ pnnx.Output output 1 0 out
 
 REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_softplus, 10)
 
+class F_softplus_onnx : public GraphRewriterPass
+{
+public:
+    const char* match_pattern_graph() const
+    {
+        return R"PNNXIR(7767517
+3 2
+pnnx.Input input_0 0 1 input
+Softplus op_0 1 1 input out
+pnnx.Output output 1 0 out
+)PNNXIR";
+    }
+
+    const char* type_str() const
+    {
+        return "F.softplus";
+    }
+
+    void write(Operator* op, const std::map<std::string, Parameter>& /*captured_params*/) const
+    {
+        op->params["beta"] = 1.f;
+        op->params["threshold"] = 20.f;
+    }
+};
+
+REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_softplus_onnx, 10)
+
+class F_softplus_onnx_1 : public GraphRewriterPass
+{
+public:
+    const char* match_pattern_graph() const
+    {
+        return R"PNNXIR(7767517
+7 6
+pnnx.Input input_0 0 1 input
+prim::Constant op_0 0 1 beta value=%beta
+aten::mul op_1 2 1 input beta a
+Softplus op_2 1 1 a b
+prim::Constant op_3 0 1 beta2 value=%beta
+aten::div op_4 2 1 b beta2 out
+pnnx.Output output 1 0 out
+)PNNXIR";
+    }
+
+    const char* type_str() const
+    {
+        return "F.softplus";
+    }
+
+    void write(Operator* op, const std::map<std::string, Parameter>& captured_params) const
+    {
+        op->params["beta"] = captured_params.at("beta");
+        op->params["threshold"] = 20.f;
+    }
+};
+
+REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_softplus_onnx_1, 9)
+
 } // namespace pnnx
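Note on F_softplus_onnx_1: ONNX Softplus has no beta attribute, so a non-default beta surfaces in the exported graph as mul -> Softplus -> div with the same constant on both sides, relying on the identity softplus(x, beta) == softplus(x * beta) / beta. A quick numeric check of that identity (illustrative only, not part of the patch):

    import torch
    import torch.nn.functional as F

    x, beta = torch.randn(8), 2.0
    a = F.softplus(x, beta=beta)
    b = F.softplus(x * beta) / beta   # the mul/Softplus/div chain matched above
    assert torch.allclose(a, b)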
diff --git a/tools/pnnx/src/pass_level2/F_softshrink.cpp b/tools/pnnx/src/pass_level2/F_softshrink.cpp
index 286990bf2c5..8d14a8a644b 100644
--- a/tools/pnnx/src/pass_level2/F_softshrink.cpp
+++ b/tools/pnnx/src/pass_level2/F_softshrink.cpp
@@ -38,4 +38,62 @@ pnnx.Output output 1 0 out
 
 REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_softshrink, 10)
 
+static bool NearlyEqual(float a, float b, float epsilon)
+{
+    if (a == b)
+        return true;
+
+    float diff = (float)fabs(a - b);
+    if (diff <= epsilon)
+        return true;
+
+    // relative error
+    return diff < epsilon * std::max(fabs(a), fabs(b));
+}
+
+class F_softshrink_onnx : public GraphRewriterPass
+{
+public:
+    const char* match_pattern_graph() const
+    {
+        return R"PNNXIR(7767517
+15 14
+pnnx.Input input 0 1 input
+prim::Constant op_0 0 1 lambd value=%lambd
+aten::gt op_1 2 1 input lambd 8
+prim::Constant op_2 0 1 lambd2 value=%lambd
+aten::sub op_3 2 1 input lambd2 9
+prim::Constant op_4 0 1 zero value=0
+aten::where op_5 3 1 8 9 zero a
+prim::Constant op_6 0 1 mlambd value=%lambd2
+aten::lt op_7 2 1 input mlambd 11
+prim::Constant op_8 0 1 lambd3 value=%lambd
+aten::add op_9 2 1 input lambd3 12
+prim::Constant op_10 0 1 zero2 value=0
+aten::where op_11 3 1 11 12 zero2 b
+aten::add op_12 2 1 a b out
+pnnx.Output output 1 0 out
+)PNNXIR";
+    }
+
+    const char* type_str() const
+    {
+        return "F.softshrink";
+    }
+
+    bool match(const std::map<std::string, Parameter>& captured_params) const
+    {
+        float lambd = captured_params.at("lambd").f;
+        float lambd2 = captured_params.at("lambd2").f;
+        return NearlyEqual(lambd, -lambd2, 0.001);
+    }
+
+    void write(Operator* op, const std::map<std::string, Parameter>& captured_params) const
+    {
+        op->params["lambd"] = captured_params.at("lambd");
+    }
+};
+
+REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_softshrink_onnx, 10)
+
 } // namespace pnnx
diff --git a/tools/pnnx/src/pass_level2/F_softsign.cpp b/tools/pnnx/src/pass_level2/F_softsign.cpp
index 4ec8ae9e520..ae6005d6337 100644
--- a/tools/pnnx/src/pass_level2/F_softsign.cpp
+++ b/tools/pnnx/src/pass_level2/F_softsign.cpp
@@ -41,4 +41,28 @@ pnnx.Output output 1 0 out
 
 REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_softsign, 10)
 
+class F_softsign_onnx : public GraphRewriterPass
+{
+public:
+    const char* match_pattern_graph() const
+    {
+        return R"PNNXIR(7767517
+6 5
+pnnx.Input input 0 1 input
+aten::abs op_0 1 1 input 6
+prim::Constant op_1 0 1 8 value=1
+aten::add op_2 2 1 6 8 9
+aten::div op_3 2 1 input 9 out
+pnnx.Output output 1 0 out
+)PNNXIR";
+    }
+
+    const char* type_str() const
+    {
+        return "F.softsign";
+    }
+};
+
+REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_softsign_onnx, 10)
+
 } // namespace pnnx
diff --git a/tools/pnnx/src/pass_level2/F_tanhshrink.cpp b/tools/pnnx/src/pass_level2/F_tanhshrink.cpp
index d8d6c311fcd..01e578bf8ad 100644
--- a/tools/pnnx/src/pass_level2/F_tanhshrink.cpp
+++ b/tools/pnnx/src/pass_level2/F_tanhshrink.cpp
@@ -39,4 +39,26 @@ pnnx.Output output 1 0 out
 
 REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_tanhshrink, 9)
 
+class F_tanhshrink_onnx : public GraphRewriterPass
+{
+public:
+    const char* match_pattern_graph() const
+    {
+        return R"PNNXIR(7767517
+4 3
+pnnx.Input input 0 1 input
+aten::tanh op_0 1 1 input 7
+aten::sub op_1 2 1 input 7 out
+pnnx.Output output 1 0 out
+)PNNXIR";
+    }
+
+    const char* type_str() const
+    {
+        return "F.tanhshrink";
+    }
+};
+
+REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_tanhshrink_onnx, 9)
+
 } // namespace pnnx
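The F_softshrink_onnx pass above undoes the where/where/add expansion of softshrink; its match() uses NearlyEqual so the pattern is accepted only when the two captured thresholds are negatives of each other up to float tolerance. The decomposition being folded back, sketched in plain PyTorch (illustrative only, not part of the patch):

    import torch
    import torch.nn.functional as F

    x, lambd = torch.randn(8), 0.5
    pos = torch.where(x > lambd, x - lambd, torch.zeros_like(x))
    neg = torch.where(x < -lambd, x + lambd, torch.zeros_like(x))
    assert torch.allclose(pos + neg, F.softshrink(x, lambd))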
diff --git a/tools/pnnx/tests/onnx/CMakeLists.txt b/tools/pnnx/tests/onnx/CMakeLists.txt
index 0c0a136fbaf..0e283e77d48 100644
--- a/tools/pnnx/tests/onnx/CMakeLists.txt
+++ b/tools/pnnx/tests/onnx/CMakeLists.txt
@@ -29,16 +29,27 @@ pnnx_onnx_add_test(F_layer_norm)
 pnnx_onnx_add_test(F_leaky_relu)
 pnnx_onnx_add_test(F_linear)
 pnnx_onnx_add_test(F_local_response_norm)
+pnnx_onnx_add_test(F_logsigmoid)
+pnnx_onnx_add_test(F_log_softmax)
 pnnx_onnx_add_test(F_max_pool1d)
 pnnx_onnx_add_test(F_max_pool2d)
 pnnx_onnx_add_test(F_max_pool3d)
+pnnx_onnx_add_test(F_mish)
 pnnx_onnx_add_test(F_pad)
 pnnx_onnx_add_test(F_prelu)
 pnnx_onnx_add_test(F_relu)
 pnnx_onnx_add_test(F_relu6)
 pnnx_onnx_add_test(F_scaled_dot_product_attention)
+pnnx_onnx_add_test(F_selu)
 pnnx_onnx_add_test(F_sigmoid)
+pnnx_onnx_add_test(F_silu)
 pnnx_onnx_add_test(F_softmax)
+pnnx_onnx_add_test(F_softmin)
+pnnx_onnx_add_test(F_softplus)
+pnnx_onnx_add_test(F_softshrink)
+pnnx_onnx_add_test(F_softsign)
+pnnx_onnx_add_test(F_tanh)
+pnnx_onnx_add_test(F_tanhshrink)
 pnnx_onnx_add_test(F_upsample_bilinear)
 pnnx_onnx_add_test(F_upsample_nearest)
 pnnx_onnx_add_test(F_upsample)
@@ -74,10 +85,13 @@ pnnx_onnx_add_test(nn_LayerNorm)
 pnnx_onnx_add_test(nn_LeakyReLU)
 pnnx_onnx_add_test(nn_Linear)
 pnnx_onnx_add_test(nn_LocalResponseNorm)
+pnnx_onnx_add_test(nn_LogSigmoid)
+pnnx_onnx_add_test(nn_LogSoftmax)
 pnnx_onnx_add_test(nn_LSTM)
 pnnx_onnx_add_test(nn_MaxPool1d)
 pnnx_onnx_add_test(nn_MaxPool2d)
 pnnx_onnx_add_test(nn_MaxPool3d)
+pnnx_onnx_add_test(nn_Mish)
 pnnx_onnx_add_test(nn_MultiheadAttention)
 pnnx_onnx_add_test(nn_PReLU)
 pnnx_onnx_add_test(nn_ReflectionPad1d)
@@ -88,8 +102,16 @@ pnnx_onnx_add_test(nn_ReplicationPad1d)
 pnnx_onnx_add_test(nn_ReplicationPad2d)
 pnnx_onnx_add_test(nn_ReplicationPad3d)
 pnnx_onnx_add_test(nn_RNN)
+pnnx_onnx_add_test(nn_SELU)
 pnnx_onnx_add_test(nn_Sigmoid)
+pnnx_onnx_add_test(nn_SiLU)
 pnnx_onnx_add_test(nn_Softmax)
+pnnx_onnx_add_test(nn_Softmin)
+pnnx_onnx_add_test(nn_Softplus)
+pnnx_onnx_add_test(nn_Softshrink)
+pnnx_onnx_add_test(nn_Softsign)
+pnnx_onnx_add_test(nn_Tanh)
+pnnx_onnx_add_test(nn_Tanhshrink)
 pnnx_onnx_add_test(nn_Upsample)
 pnnx_onnx_add_test(nn_UpsamplingBilinear2d)
 pnnx_onnx_add_test(nn_UpsamplingNearest2d)
diff --git a/tools/pnnx/tests/onnx/test_F_log_softmax.py b/tools/pnnx/tests/onnx/test_F_log_softmax.py
new file mode 100644
index 00000000000..8bc657c6778
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_F_log_softmax.py
@@ -0,0 +1,66 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = F.log_softmax(x, 1)
+        y = F.log_softmax(y, 0)
+        z = F.log_softmax(z, 2)
+        w = F.log_softmax(w, 3)
+        return x, y, z, w
+
+def test():
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 16)
+    y = torch.rand(12, 2, 16)
+    z = torch.rand(1, 3, 12, 16)
+    w = torch.rand(1, 5, 7, 9, 11)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_F_log_softmax.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_F_log_softmax.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]")
+
+    # pnnx inference
+    import test_F_log_softmax_pnnx
+    b = test_F_log_softmax_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.equal(a0, b0):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
diff --git a/tools/pnnx/tests/onnx/test_F_logsigmoid.py b/tools/pnnx/tests/onnx/test_F_logsigmoid.py
new file mode 100644
index 00000000000..a731936a109
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_F_logsigmoid.py
@@ -0,0 +1,66 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = F.logsigmoid(x)
+        y = F.logsigmoid(y)
+        z = F.logsigmoid(z)
+        w = F.logsigmoid(w)
+        return x, y, z, w
+
+def test():
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 16)
+    y = torch.rand(12, 2, 16)
+    z = torch.rand(1, 3, 12, 16)
+    w = torch.rand(1, 5, 7, 9, 11)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_F_logsigmoid.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_F_logsigmoid.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]")
+
+    # pnnx inference
+    import test_F_logsigmoid_pnnx
+    b = test_F_logsigmoid_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.equal(a0, b0):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
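test_F_logsigmoid.py exercises the aten::sigmoid -> aten::log chain that F_logsigmoid_onnx matches; the underlying identity is simply log(sigmoid(x)) == logsigmoid(x). For reference (illustrative only, not part of the patch):

    import torch
    import torch.nn.functional as F

    x = torch.randn(8)
    assert torch.allclose(torch.log(torch.sigmoid(x)), F.logsigmoid(x))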
diff --git a/tools/pnnx/tests/onnx/test_F_mish.py b/tools/pnnx/tests/onnx/test_F_mish.py
new file mode 100644
index 00000000000..69026d38b2b
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_F_mish.py
@@ -0,0 +1,76 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from packaging import version
+
+def mish_forward_0(x):
+    return x * F.softplus(x).tanh()
+
+def mish_forward_1(x):
+    return x.mul(torch.tanh(F.softplus(x)))
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = F.mish(x)
+        y = F.mish(y)
+        z = mish_forward_0(z)
+        w = mish_forward_1(w)
+        return x, y, z, w
+
+def test():
+    if version.parse(torch.__version__) < version.parse('1.9'):
+        return True
+
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 16)
+    y = torch.rand(12, 2, 16)
+    z = torch.rand(1, 3, 12, 16)
+    w = torch.rand(1, 5, 7, 9, 11)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_F_mish.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_F_mish.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]")
+
+    # pnnx inference
+    import test_F_mish_pnnx
+    b = test_F_mish_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.allclose(a0, b0, 1e-4, 1e-4):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
diff --git a/tools/pnnx/tests/onnx/test_F_selu.py b/tools/pnnx/tests/onnx/test_F_selu.py
new file mode 100644
index 00000000000..e70f9344191
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_F_selu.py
@@ -0,0 +1,66 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = F.selu(x)
+        y = F.selu(y)
+        z = F.selu(z)
+        w = F.selu(w)
+        return x, y, z, w
+
+def test():
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 16)
+    y = torch.rand(12, 2, 16)
+    z = torch.rand(1, 3, 12, 16)
+    w = torch.rand(1, 5, 7, 9, 11)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_F_selu.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_F_selu.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]")
+
+    # pnnx inference
+    import test_F_selu_pnnx
+    b = test_F_selu_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.equal(a0, b0):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
diff --git a/tools/pnnx/tests/onnx/test_F_sigmoid.py b/tools/pnnx/tests/onnx/test_F_sigmoid.py
index 684a7ab48d9..c90e570e005 100644
--- a/tools/pnnx/tests/onnx/test_F_sigmoid.py
+++ b/tools/pnnx/tests/onnx/test_F_sigmoid.py
@@ -41,7 +41,7 @@ def test():
     z = torch.rand(1, 3, 12, 16)
     w = torch.rand(1, 5, 7, 9, 11)
 
-    a0, a1, a2, a3 = net(x, y, z, w)
+    a = net(x, y, z, w)
 
     # export onnx
     torch.onnx.export(net, (x, y, z, w), "test_F_sigmoid.onnx")
@@ -52,9 +52,12 @@ def test():
 
     # pnnx inference
     import test_F_sigmoid_pnnx
-    b0, b1, b2, b3 = test_F_sigmoid_pnnx.test_inference()
+    b = test_F_sigmoid_pnnx.test_inference()
 
-    return torch.equal(a0, b0) and torch.equal(a1, b1) and torch.equal(a2, b2) and torch.equal(a3, b3)
+    for a0, b0 in zip(a, b):
+        if not torch.equal(a0, b0):
+            return False
+    return True
 
 if __name__ == "__main__":
     if test():
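The test_F_sigmoid.py change above replaces tuple unpacking with a zip loop so the comparison no longer hard-codes four outputs; the same loop recurs in every new test in this patch. A shared helper could express it once — a hypothetical sketch (outputs_equal is not part of the patch):

    import torch

    def outputs_equal(a, b, exact=True, tol=1e-4):
        # pairwise-compare two sequences of tensors
        if len(a) != len(b):
            return False
        for a0, b0 in zip(a, b):
            ok = torch.equal(a0, b0) if exact else torch.allclose(a0, b0, tol, tol)
            if not ok:
                return False
        return True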
diff --git a/tools/pnnx/tests/onnx/test_F_silu.py b/tools/pnnx/tests/onnx/test_F_silu.py
new file mode 100644
index 00000000000..d6cc987262e
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_F_silu.py
@@ -0,0 +1,69 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+def silu_forward_0(x):
+    return x * torch.sigmoid(x)
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = F.silu(x)
+        y = F.silu(y)
+        z = F.silu(z)
+        w = silu_forward_0(w)
+        return x, y, z, w
+
+def test():
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 16)
+    y = torch.rand(12, 2, 16)
+    z = torch.rand(1, 3, 12, 16)
+    w = torch.rand(1, 5, 7, 9, 11)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_F_silu.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_F_silu.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]")
+
+    # pnnx inference
+    import test_F_silu_pnnx
+    b = test_F_silu_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.allclose(a0, b0, 1e-4, 1e-4):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
diff --git a/tools/pnnx/tests/onnx/test_F_softmin.py b/tools/pnnx/tests/onnx/test_F_softmin.py
new file mode 100644
index 00000000000..88a82fea00a
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_F_softmin.py
@@ -0,0 +1,66 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = F.softmin(x, 1)
+        y = F.softmin(y, 0)
+        z = F.softmin(z, 2)
+        w = F.softmin(w, 3)
+        return x, y, z, w
+
+def test():
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 16)
+    y = torch.rand(12, 2, 16)
+    z = torch.rand(1, 3, 12, 16)
+    w = torch.rand(1, 5, 7, 9, 11)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_F_softmin.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_F_softmin.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]")
+
+    # pnnx inference
+    import test_F_softmin_pnnx
+    b = test_F_softmin_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.equal(a0, b0):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
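test_F_softmin.py covers the F_softmin_onnx pass: softmin has no dedicated ONNX operator, so the exported graph carries a Neg -> Softmax chain, and the identity softmin(x, dim) == softmax(-x, dim) lets pnnx restore the original call. A quick check (illustrative only, not part of the patch):

    import torch
    import torch.nn.functional as F

    x = torch.randn(2, 3, 4)
    assert torch.allclose(F.softmin(x, dim=1), F.softmax(-x, dim=1))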
diff --git a/tools/pnnx/tests/onnx/test_F_softplus.py b/tools/pnnx/tests/onnx/test_F_softplus.py
new file mode 100644
index 00000000000..c261f58d67c
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_F_softplus.py
@@ -0,0 +1,70 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from packaging import version
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = F.softplus(x)
+        y = F.softplus(y, 2, 5.2)
+        z = F.softplus(z, -0.7, 15)
+        w = F.softplus(w, 0.1, 0.3)
+        return x, y, z, w
+
+def test():
+    if version.parse(torch.__version__) < version.parse('1.11'):
+        return True
+
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 16)
+    y = torch.rand(12, 2, 16)
+    z = torch.rand(1, 3, 12, 16)
+    w = torch.rand(1, 5, 7, 9, 11)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_F_softplus.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_F_softplus.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]")
+
+    # pnnx inference
+    import test_F_softplus_pnnx
+    b = test_F_softplus_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.allclose(a0, b0, 1e-4, 1e-4):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
diff --git a/tools/pnnx/tests/onnx/test_F_softshrink.py b/tools/pnnx/tests/onnx/test_F_softshrink.py
new file mode 100644
index 00000000000..7f1fb883807
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_F_softshrink.py
@@ -0,0 +1,70 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from packaging import version
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = F.softshrink(x)
+        y = F.softshrink(y, 0.1)
+        z = F.softshrink(z, 0.22)
+        w = F.softshrink(w, 0)
+        return x, y, z, w
+
+def test():
+    if version.parse(torch.__version__) < version.parse('1.11'):
+        return True
+
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 16)
+    y = torch.rand(12, 2, 16)
+    z = torch.rand(1, 3, 12, 16)
+    w = torch.rand(1, 5, 7, 9, 11)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_F_softshrink.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_F_softshrink.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]")
+
+    # pnnx inference
+    import test_F_softshrink_pnnx
+    b = test_F_softshrink_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.equal(a0, b0):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
diff --git a/tools/pnnx/tests/onnx/test_F_softsign.py b/tools/pnnx/tests/onnx/test_F_softsign.py
new file mode 100644
index 00000000000..27164f3dfc1
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_F_softsign.py
@@ -0,0 +1,66 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = F.softsign(x)
+        y = F.softsign(y)
+        z = F.softsign(z)
+        w = F.softsign(w)
+        return x, y, z, w
+
+def test():
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 16)
+    y = torch.rand(12, 2, 16)
+    z = torch.rand(1, 3, 12, 16)
+    w = torch.rand(1, 5, 7, 9, 11)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_F_softsign.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_F_softsign.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]")
+
+    # pnnx inference
+    import test_F_softsign_pnnx
+    b = test_F_softsign_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.equal(a0, b0):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
diff --git a/tools/pnnx/tests/onnx/test_F_tanh.py b/tools/pnnx/tests/onnx/test_F_tanh.py
new file mode 100644
index 00000000000..b56d513f655
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_F_tanh.py
@@ -0,0 +1,66 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = F.tanh(x)
+        y = F.tanh(y)
+        z = F.tanh(z)
+        w = F.tanh(w)
+        return x, y, z, w
+
+def test():
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 16)
+    y = torch.rand(12, 2, 16)
+    z = torch.rand(1, 3, 12, 16)
+    w = torch.rand(1, 5, 7, 9, 11)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_F_tanh.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_F_tanh.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]")
+
+    # pnnx inference
+    import test_F_tanh_pnnx
+    b = test_F_tanh_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.allclose(a0, b0, 1e-4, 1e-4):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
diff --git a/tools/pnnx/tests/onnx/test_F_tanhshrink.py b/tools/pnnx/tests/onnx/test_F_tanhshrink.py
new file mode 100644
index 00000000000..7be2bf57cb1
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_F_tanhshrink.py
@@ -0,0 +1,66 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = F.tanhshrink(x)
+        y = F.tanhshrink(y)
+        z = F.tanhshrink(z)
+        w = F.tanhshrink(w)
+        return x, y, z, w
+
+def test():
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 16)
+    y = torch.rand(12, 2, 16)
+    z = torch.rand(1, 3, 12, 16)
+    w = torch.rand(1, 5, 7, 9, 11)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_F_tanhshrink.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_F_tanhshrink.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]")
+
+    # pnnx inference
+    import test_F_tanhshrink_pnnx
+    b = test_F_tanhshrink_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.equal(a0, b0):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
diff --git a/tools/pnnx/tests/onnx/test_nn_LogSigmoid.py b/tools/pnnx/tests/onnx/test_nn_LogSigmoid.py
new file mode 100644
index 00000000000..ddb44cbf442
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_nn_LogSigmoid.py
@@ -0,0 +1,68 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+        self.act_0 = nn.LogSigmoid()
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = self.act_0(x)
+        y = self.act_0(y)
+        z = self.act_0(z)
+        w = self.act_0(w)
+        return x, y, z, w
+
+def test():
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 12)
+    y = torch.rand(1, 12, 64)
+    z = torch.rand(1, 12, 24, 64)
+    w = torch.rand(1, 12, 24, 32, 64)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_nn_LogSigmoid.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_nn_LogSigmoid.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]")
+
+    # pnnx inference
+    import test_nn_LogSigmoid_pnnx
+    b = test_nn_LogSigmoid_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.equal(a0, b0):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
diff --git a/tools/pnnx/tests/onnx/test_nn_LogSoftmax.py b/tools/pnnx/tests/onnx/test_nn_LogSoftmax.py
new file mode 100644
index 00000000000..dbe8dc96d82
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_nn_LogSoftmax.py
@@ -0,0 +1,71 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+        self.act_0 = nn.LogSoftmax(dim=1)
+        self.act_1 = nn.LogSoftmax(dim=1)
+        self.act_2 = nn.LogSoftmax(dim=0)
+        self.act_3 = nn.LogSoftmax(dim=2)
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = self.act_0(x)
+        y = self.act_1(y)
+        z = self.act_2(z)
+        w = self.act_3(w)
+        return x, y, z, w
+
+def test():
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 12)
+    y = torch.rand(1, 12, 64)
+    z = torch.rand(1, 12, 24, 64)
+    w = torch.rand(1, 12, 24, 32, 64)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_nn_LogSoftmax.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_nn_LogSoftmax.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]")
+
+    # pnnx inference
+    import test_nn_LogSoftmax_pnnx
+    b = test_nn_LogSoftmax_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.equal(a0, b0):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
diff --git a/tools/pnnx/tests/onnx/test_nn_Mish.py b/tools/pnnx/tests/onnx/test_nn_Mish.py
new file mode 100644
index 00000000000..481ba718111
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_nn_Mish.py
@@ -0,0 +1,72 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from packaging import version
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+        self.act_0 = nn.Mish()
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = self.act_0(x)
+        y = self.act_0(y)
+        z = self.act_0(z)
+        w = self.act_0(w)
+        return x, y, z, w
+
+def test():
+    if version.parse(torch.__version__) < version.parse('1.9'):
+        return True
+
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 12)
+    y = torch.rand(1, 12, 64)
+    z = torch.rand(1, 12, 24, 64)
+    w = torch.rand(1, 12, 24, 32, 64)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_nn_Mish.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_nn_Mish.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]")
+
+    # pnnx inference
+    import test_nn_Mish_pnnx
+    b = test_nn_Mish_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.allclose(a0, b0, 1e-4, 1e-4):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
diff --git a/tools/pnnx/tests/onnx/test_nn_SELU.py b/tools/pnnx/tests/onnx/test_nn_SELU.py
new file mode 100644
index 00000000000..a78c9e2336f
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_nn_SELU.py
@@ -0,0 +1,68 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+        self.act_0 = nn.SELU()
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = self.act_0(x)
+        y = self.act_0(y)
+        z = self.act_0(z)
+        w = self.act_0(w)
+        return x, y, z, w
+
+def test():
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 12)
+    y = torch.rand(1, 12, 64)
+    z = torch.rand(1, 12, 24, 64)
+    w = torch.rand(1, 12, 24, 32, 64)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_nn_SELU.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_nn_SELU.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]")
+
+    # pnnx inference
+    import test_nn_SELU_pnnx
+    b = test_nn_SELU_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.equal(a0, b0):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
diff --git a/tools/pnnx/tests/onnx/test_nn_SiLU.py b/tools/pnnx/tests/onnx/test_nn_SiLU.py
new file mode 100644
index 00000000000..e509ddb6754
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_nn_SiLU.py
@@ -0,0 +1,68 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+        self.act_0 = nn.SiLU()
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = self.act_0(x)
+        y = self.act_0(y)
+        z = self.act_0(z)
+        w = self.act_0(w)
+        return x, y, z, w
+
+def test():
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 12)
+    y = torch.rand(1, 12, 64)
+    z = torch.rand(1, 12, 24, 64)
+    w = torch.rand(1, 12, 24, 32, 64)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_nn_SiLU.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_nn_SiLU.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]")
+
+    # pnnx inference
+    import test_nn_SiLU_pnnx
+    b = test_nn_SiLU_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.allclose(a0, b0, 1e-4, 1e-4):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
diff --git a/tools/pnnx/tests/onnx/test_nn_Sigmoid.py b/tools/pnnx/tests/onnx/test_nn_Sigmoid.py
index 5b9cfc9a2be..72d5d798ef4 100644
--- a/tools/pnnx/tests/onnx/test_nn_Sigmoid.py
+++ b/tools/pnnx/tests/onnx/test_nn_Sigmoid.py
@@ -43,7 +43,7 @@ def test():
     z = torch.rand(1, 12, 24, 64)
     w = torch.rand(1, 12, 24, 32, 64)
 
-    a0, a1, a2, a3 = net(x, y, z, w)
+    a = net(x, y, z, w)
 
     # export onnx
    torch.onnx.export(net, (x, y, z, w), "test_nn_Sigmoid.onnx")
@@ -54,9 +54,12 @@ def test():
 
     # pnnx inference
     import test_nn_Sigmoid_pnnx
-    b0, b1, b2, b3 = test_nn_Sigmoid_pnnx.test_inference()
+    b = test_nn_Sigmoid_pnnx.test_inference()
 
-    return torch.equal(a0, b0) and torch.equal(a1, b1) and torch.equal(a2, b2) and torch.equal(a3, b3)
+    for a0, b0 in zip(a, b):
+        if not torch.equal(a0, b0):
+            return False
+    return True
 
 if __name__ == "__main__":
     if test():
diff --git a/tools/pnnx/tests/onnx/test_nn_Softmin.py b/tools/pnnx/tests/onnx/test_nn_Softmin.py
new file mode 100644
index 00000000000..9cb8417f2f6
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_nn_Softmin.py
@@ -0,0 +1,71 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+        self.act_0 = nn.Softmin(dim=1)
+        self.act_1 = nn.Softmin(dim=1)
+        self.act_2 = nn.Softmin(dim=0)
+        self.act_3 = nn.Softmin(dim=2)
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = self.act_0(x)
+        y = self.act_1(y)
+        z = self.act_2(z)
+        w = self.act_3(w)
+        return x, y, z, w
+
+def test():
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 12)
+    y = torch.rand(1, 12, 64)
+    z = torch.rand(1, 12, 24, 64)
+    w = torch.rand(1, 12, 24, 32, 64)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_nn_Softmin.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_nn_Softmin.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]")
+
+    # pnnx inference
+    import test_nn_Softmin_pnnx
+    b = test_nn_Softmin_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.equal(a0, b0):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
diff --git a/tools/pnnx/tests/onnx/test_nn_Softplus.py b/tools/pnnx/tests/onnx/test_nn_Softplus.py
new file mode 100644
index 00000000000..445c6341b29
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_nn_Softplus.py
@@ -0,0 +1,73 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from packaging import version
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+        self.act_0 = nn.Softplus()
+        self.act_1 = nn.Softplus(beta=0.7, threshold=15)
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = self.act_0(x)
+        y = self.act_0(y)
+        z = self.act_1(z)
+        w = self.act_1(w)
+        return x, y, z, w
+
+def test():
+    if version.parse(torch.__version__) < version.parse('1.11'):
+        return True
+
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 12)
+    y = torch.rand(1, 12, 64)
+    z = torch.rand(1, 12, 24, 64)
+    w = torch.rand(1, 12, 24, 32, 64)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_nn_Softplus.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_nn_Softplus.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]")
+
+    # pnnx inference
+    import test_nn_Softplus_pnnx
+    b = test_nn_Softplus_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.allclose(a0, b0, 1e-4, 1e-4):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
diff --git a/tools/pnnx/tests/onnx/test_nn_Softshrink.py b/tools/pnnx/tests/onnx/test_nn_Softshrink.py
new file mode 100644
index 00000000000..b86e9239c16
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_nn_Softshrink.py
@@ -0,0 +1,73 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from packaging import version
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+        self.act_0 = nn.Softshrink()
+        self.act_1 = nn.Softshrink(lambd=1.3)
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = self.act_0(x)
+        y = self.act_0(y)
+        z = self.act_1(z)
+        w = self.act_1(w)
+        return x, y, z, w
+
+def test():
+    if version.parse(torch.__version__) < version.parse('1.11'):
+        return True
+
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 12)
+    y = torch.rand(1, 12, 64)
+    z = torch.rand(1, 12, 24, 64)
+    w = torch.rand(1, 12, 24, 32, 64)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_nn_Softshrink.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_nn_Softshrink.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]")
+
+    # pnnx inference
+    import test_nn_Softshrink_pnnx
+    b = test_nn_Softshrink_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.equal(a0, b0):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
diff --git a/tools/pnnx/tests/onnx/test_nn_Softsign.py b/tools/pnnx/tests/onnx/test_nn_Softsign.py
new file mode 100644
index 00000000000..da86752ca67
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_nn_Softsign.py
@@ -0,0 +1,68 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+        self.act_0 = nn.Softsign()
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = self.act_0(x)
+        y = self.act_0(y)
+        z = self.act_0(z)
+        w = self.act_0(w)
+        return x, y, z, w
+
+def test():
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 12)
+    y = torch.rand(1, 12, 64)
+    z = torch.rand(1, 12, 24, 64)
+    w = torch.rand(1, 12, 24, 32, 64)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_nn_Softsign.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_nn_Softsign.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]")
+
+    # pnnx inference
+    import test_nn_Softsign_pnnx
+    b = test_nn_Softsign_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.equal(a0, b0):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
+# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.act_0 = nn.Tanh() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = self.act_0(x) + y = self.act_0(y) + z = self.act_0(z) + w = self.act_0(w) + return x, y, z, w + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12) + y = torch.rand(1, 12, 64) + z = torch.rand(1, 12, 24, 64) + w = torch.rand(1, 12, 24, 32, 64) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_nn_Tanh.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_nn_Tanh.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]") + + # pnnx inference + import test_nn_Tanh_pnnx + b = test_nn_Tanh_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_nn_Tanhshrink.py b/tools/pnnx/tests/onnx/test_nn_Tanhshrink.py new file mode 100644 index 00000000000..20cabe2559a --- /dev/null +++ b/tools/pnnx/tests/onnx/test_nn_Tanhshrink.py @@ -0,0 +1,68 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
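+# nn.Tanhshrink computes x - tanh(x). ONNX has no dedicated operator for it,
+# so the exported graph is expected to contain a Tanh node followed by a Sub,
+# which pnnx then pattern-matches back into a single op. The identity itself,
+# as an illustrative one-liner:
+#
+#     def tanhshrink_ref(x):
+#         return x - torch.tanh(x)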
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.act_0 = nn.Tanhshrink() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = self.act_0(x) + y = self.act_0(y) + z = self.act_0(z) + w = self.act_0(w) + return x, y, z, w + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12) + y = torch.rand(1, 12, 64) + z = torch.rand(1, 12, 24, 64) + w = torch.rand(1, 12, 24, 32, 64) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_nn_Tanhshrink.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_nn_Tanhshrink.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]") + + # pnnx inference + import test_nn_Tanhshrink_pnnx + b = test_nn_Tanhshrink_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) From 569617f212b2878137813b6cb16a5bd6a0076fc7 Mon Sep 17 00:00:00 2001 From: nihui Date: Mon, 15 Jul 2024 16:00:11 +0800 Subject: [PATCH 02/38] pnnx convert onnx expand/permute/repeat/reshape/select/slice/cat/ceil/chunk/flatten/floor/maximum/minimum/split/squeeze/stack/transpose/unbind/unsqueeze (#5583) --- tools/pnnx/src/pass_level2/Tensor_expand.cpp | 48 +++++++++++ tools/pnnx/src/pass_level2/Tensor_reshape.cpp | 77 ++++++------------ tools/pnnx/src/pass_level2/torch_squeeze.cpp | 17 ++-- tools/pnnx/src/pass_level2/torch_tile.cpp | 41 ++++++++++ .../pass_onnx/fuse_constant_as_attribute.cpp | 2 + tools/pnnx/tests/ncnn/test_torch_unbind.py | 3 +- tools/pnnx/tests/onnx/CMakeLists.txt | 21 +++++ tools/pnnx/tests/onnx/test_Tensor_expand.py | 60 ++++++++++++++ tools/pnnx/tests/onnx/test_Tensor_permute.py | 64 +++++++++++++++ tools/pnnx/tests/onnx/test_Tensor_repeat.py | 63 +++++++++++++++ tools/pnnx/tests/onnx/test_Tensor_reshape.py | 63 +++++++++++++++ tools/pnnx/tests/onnx/test_Tensor_select.py | 60 ++++++++++++++ tools/pnnx/tests/onnx/test_Tensor_slice.py | 79 +++++++++++++++++++ tools/pnnx/tests/onnx/test_Tensor_view.py | 63 +++++++++++++++ tools/pnnx/tests/onnx/test_torch_cat.py | 61 ++++++++++++++ tools/pnnx/tests/onnx/test_torch_ceil.py | 60 ++++++++++++++ tools/pnnx/tests/onnx/test_torch_chunk.py | 60 ++++++++++++++ tools/pnnx/tests/onnx/test_torch_flatten.py | 63 +++++++++++++++ tools/pnnx/tests/onnx/test_torch_floor.py | 60 ++++++++++++++ tools/pnnx/tests/onnx/test_torch_maximum.py | 64 +++++++++++++++ tools/pnnx/tests/onnx/test_torch_minimum.py | 64 +++++++++++++++ tools/pnnx/tests/onnx/test_torch_split.py | 60 ++++++++++++++ tools/pnnx/tests/onnx/test_torch_squeeze.py | 60 ++++++++++++++ tools/pnnx/tests/onnx/test_torch_stack.py | 62 +++++++++++++++ tools/pnnx/tests/onnx/test_torch_transpose.py | 60 ++++++++++++++ tools/pnnx/tests/onnx/test_torch_unbind.py | 60 ++++++++++++++ tools/pnnx/tests/onnx/test_torch_unsqueeze.py | 63 +++++++++++++++ tools/pnnx/tests/test_torch_unbind.py | 2 +- 28 files changed, 1399 insertions(+), 61 deletions(-) create mode 100644 tools/pnnx/tests/onnx/test_Tensor_expand.py create mode 100644 tools/pnnx/tests/onnx/test_Tensor_permute.py create mode 100644 tools/pnnx/tests/onnx/test_Tensor_repeat.py create mode 100644 tools/pnnx/tests/onnx/test_Tensor_reshape.py create mode 100644 tools/pnnx/tests/onnx/test_Tensor_select.py create mode 100644 tools/pnnx/tests/onnx/test_Tensor_slice.py create 
mode 100644 tools/pnnx/tests/onnx/test_Tensor_view.py create mode 100644 tools/pnnx/tests/onnx/test_torch_cat.py create mode 100644 tools/pnnx/tests/onnx/test_torch_ceil.py create mode 100644 tools/pnnx/tests/onnx/test_torch_chunk.py create mode 100644 tools/pnnx/tests/onnx/test_torch_flatten.py create mode 100644 tools/pnnx/tests/onnx/test_torch_floor.py create mode 100644 tools/pnnx/tests/onnx/test_torch_maximum.py create mode 100644 tools/pnnx/tests/onnx/test_torch_minimum.py create mode 100644 tools/pnnx/tests/onnx/test_torch_split.py create mode 100644 tools/pnnx/tests/onnx/test_torch_squeeze.py create mode 100644 tools/pnnx/tests/onnx/test_torch_stack.py create mode 100644 tools/pnnx/tests/onnx/test_torch_transpose.py create mode 100644 tools/pnnx/tests/onnx/test_torch_unbind.py create mode 100644 tools/pnnx/tests/onnx/test_torch_unsqueeze.py diff --git a/tools/pnnx/src/pass_level2/Tensor_expand.cpp b/tools/pnnx/src/pass_level2/Tensor_expand.cpp index 23c1af6a863..4c94d7b8e04 100644 --- a/tools/pnnx/src/pass_level2/Tensor_expand.cpp +++ b/tools/pnnx/src/pass_level2/Tensor_expand.cpp @@ -61,4 +61,52 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_expand_1, 20) +class Tensor_expand_onnx : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +Expand op_0 1 1 input out %*=%* +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "Tensor.expand"; + } + + bool match(const std::map& captured_params) const + { + if (captured_params.find("op_0.shape") == captured_params.end()) + return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params) const + { + if (captured_params.at("op_0.shape").type == 5) + { + op->params["shape"] = captured_params.at("op_0.shape"); + } + else // if (captured_params.at("op_0.shape").type == 2) + { + op->params["shape"] = std::vector{captured_params.at("op_0.shape").i}; + } + + // onnx set expand shape 1 for not changing the size of that dimension while torch uses -1 + for (size_t i = 0; i < op->params["shape"].ai.size(); i++) + { + if (op->params["shape"].ai[i] == 1) + op->params["shape"].ai[i] = -1; + } + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_expand_onnx, 20) + } // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/Tensor_reshape.cpp b/tools/pnnx/src/pass_level2/Tensor_reshape.cpp index 1c578a8d633..412e609cc40 100644 --- a/tools/pnnx/src/pass_level2/Tensor_reshape.cpp +++ b/tools/pnnx/src/pass_level2/Tensor_reshape.cpp @@ -48,7 +48,7 @@ class Tensor_reshape_onnx : public GraphRewriterPass pnnx.Input input_0 0 1 input pnnx.Input input_1 0 1 shape aten::cat op_0 1 1 shape cat dim=0 -Reshape op_1 2 1 input cat out allowzero=* +Reshape op_1 2 1 input cat out %*=%* pnnx.Output output 1 0 out )PNNXIR"; } @@ -57,46 +57,15 @@ pnnx.Output output 1 0 out { return "Tensor.reshape"; } -}; - -REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_reshape_onnx, 19) - -class Tensor_reshape_onnx_1 : public Tensor_reshape_onnx -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -5 4 -pnnx.Input input_0 0 1 input -pnnx.Input input_1 0 1 shape -aten::cat op_0 1 1 shape cat dim=0 -Reshape op_1 2 1 input cat out -pnnx.Output output 1 0 out -)PNNXIR"; - } -}; - -REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_reshape_onnx_1, 19) -class Tensor_reshape_onnx_2 : public Tensor_reshape_onnx -{ -public: - const char* match_pattern_graph() const + void 
write(Operator* /*op*/, const std::map& /*captured_params*/) const { - return R"PNNXIR(7767517 -4 3 -pnnx.Input input_0 0 1 input -pnnx.Input input_1 0 1 shape -Reshape op_1 2 1 input shape out allowzero=* -pnnx.Output output 1 0 out -)PNNXIR"; } }; -REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_reshape_onnx_2, 20) +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_reshape_onnx, 19) -class Tensor_reshape_onnx_3 : public Tensor_reshape_onnx +class Tensor_reshape_onnx_1 : public Tensor_reshape_onnx { public: const char* match_pattern_graph() const @@ -105,15 +74,15 @@ class Tensor_reshape_onnx_3 : public Tensor_reshape_onnx 4 3 pnnx.Input input_0 0 1 input pnnx.Input input_1 0 1 shape -Reshape op_1 2 1 input shape out +Reshape op_0 2 1 input shape out %*=%* pnnx.Output output 1 0 out )PNNXIR"; } }; -REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_reshape_onnx_3, 20) +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_reshape_onnx_1, 20) -class Tensor_reshape_onnx_4 : public GraphRewriterPass +class Tensor_reshape_onnx_2 : public GraphRewriterPass { public: const char* match_pattern_graph() const @@ -121,7 +90,7 @@ class Tensor_reshape_onnx_4 : public GraphRewriterPass return R"PNNXIR(7767517 3 2 pnnx.Input input 0 1 input -Reshape op_1 1 1 input out shape=%shape allowzero=* +Reshape op_0 1 1 input out %*=%* pnnx.Output output 1 0 out )PNNXIR"; } @@ -130,24 +99,28 @@ pnnx.Output output 1 0 out { return "Tensor.reshape"; } -}; -REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_reshape_onnx_4, 20) + bool match(const std::map& captured_params) const + { + if (captured_params.find("op_0.shape") == captured_params.end()) + return false; + + return true; + } -class Tensor_reshape_onnx_5 : public Tensor_reshape_onnx_4 -{ -public: - const char* match_pattern_graph() const + void write(Operator* op, const std::map& captured_params) const { - return R"PNNXIR(7767517 -3 2 -pnnx.Input input 0 1 input -Reshape op_1 1 1 input out shape=%shape -pnnx.Output output 1 0 out -)PNNXIR"; + if (captured_params.at("op_0.shape").type == 5) + { + op->params["shape"] = captured_params.at("op_0.shape"); + } + else // if (captured_params.at("op_0.shape").type == 2) + { + op->params["shape"] = std::vector{captured_params.at("op_0.shape").i}; + } } }; -REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_reshape_onnx_5, 20) +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_reshape_onnx_2, 20) } // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/torch_squeeze.cpp b/tools/pnnx/src/pass_level2/torch_squeeze.cpp index d7e157d94b1..dabffebc126 100644 --- a/tools/pnnx/src/pass_level2/torch_squeeze.cpp +++ b/tools/pnnx/src/pass_level2/torch_squeeze.cpp @@ -110,20 +110,23 @@ class torch_squeeze_onnx_1 : public torch_squeeze_onnx return R"PNNXIR(7767517 3 2 pnnx.Input input 0 1 input -Squeeze op_0 1 1 input out axes=%axes +Squeeze op_0 1 1 input out %*=%* pnnx.Output output 1 0 out )PNNXIR"; } void write(Operator* op, const std::map& captured_params) const { - if (captured_params.at("axes").type == 5 && captured_params.at("axes").ai.size() == 1) + if (captured_params.find("op_0.axes") != captured_params.end()) { - op->params["dim"] = captured_params.at("axes").ai[0]; - } - else - { - op->params["dim"] = captured_params.at("axes"); + if (captured_params.at("op_0.axes").type == 5 && captured_params.at("op_0.axes").ai.size() == 1) + { + op->params["dim"] = captured_params.at("op_0.axes").ai[0]; + } + else + { + op->params["dim"] = captured_params.at("op_0.axes"); + } } } }; diff --git a/tools/pnnx/src/pass_level2/torch_tile.cpp 
b/tools/pnnx/src/pass_level2/torch_tile.cpp index d1504bacda8..a2f2780116c 100644 --- a/tools/pnnx/src/pass_level2/torch_tile.cpp +++ b/tools/pnnx/src/pass_level2/torch_tile.cpp @@ -60,4 +60,45 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(torch_tile_onnx, 20) +class torch_tile_onnx_1 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +Tile op_0 1 1 input out %*=%* +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "torch.tile"; + } + + bool match(const std::map& captured_params) const + { + if (captured_params.find("op_0.repeats") == captured_params.end()) + return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params) const + { + if (captured_params.at("op_0.repeats").type == 5) + { + op->params["dims"] = captured_params.at("op_0.repeats"); + } + else // if (captured_params.at("op_0.repeats").type == 2) + { + op->params["dims"] = std::vector{captured_params.at("op_0.repeats").i}; + } + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(torch_tile_onnx_1, 20) + } // namespace pnnx diff --git a/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.cpp b/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.cpp index a3021d33c90..aba88976233 100644 --- a/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.cpp +++ b/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.cpp @@ -32,6 +32,7 @@ struct constant_as_attribute }; static constant_as_attribute caas[] = { + {"Expand", 1, "shape"}, {"Gather", 1, "indices"}, {"If", 0, "cond"}, {"Pad", 1, "pads"}, @@ -49,6 +50,7 @@ static constant_as_attribute caas[] = { {"Slice", 3, "axes"}, {"Slice", 4, "steps"}, {"Squeeze", 1, "axes"}, + {"Tile", 1, "repeats"}, {"Unsqueeze", 1, "axes"}, {"Upsample", 1, "scales"}, }; diff --git a/tools/pnnx/tests/ncnn/test_torch_unbind.py b/tools/pnnx/tests/ncnn/test_torch_unbind.py index 3b8e427010c..8e224612d7e 100644 --- a/tools/pnnx/tests/ncnn/test_torch_unbind.py +++ b/tools/pnnx/tests/ncnn/test_torch_unbind.py @@ -26,6 +26,7 @@ def forward(self, x, y): x0 = F.relu(x0) x1 = F.relu(x1) + x2 = F.relu(x2) y0 = F.relu(y0) y1 = F.relu(y1) y2 = F.relu(y2) @@ -35,7 +36,7 @@ def forward(self, x, y): y6 = F.relu(y6) y7 = F.relu(y7) y8 = F.relu(y8) - return x0, x1, y0, y1, y2, y3, y4, y5, y6, y7, y8 + return x0, x1, x2, y0, y1, y2, y3, y4, y5, y6, y7, y8 def test(): net = Model() diff --git a/tools/pnnx/tests/onnx/CMakeLists.txt b/tools/pnnx/tests/onnx/CMakeLists.txt index 0e283e77d48..f4756740a79 100644 --- a/tools/pnnx/tests/onnx/CMakeLists.txt +++ b/tools/pnnx/tests/onnx/CMakeLists.txt @@ -126,8 +126,29 @@ pnnx_onnx_add_test(squeezenet1_1) pnnx_onnx_add_test(swin_t) pnnx_onnx_add_test(vit_b_32) +pnnx_onnx_add_test(Tensor_expand) +pnnx_onnx_add_test(Tensor_permute) +pnnx_onnx_add_test(Tensor_repeat) +pnnx_onnx_add_test(Tensor_reshape) +pnnx_onnx_add_test(Tensor_select) +pnnx_onnx_add_test(Tensor_slice) +pnnx_onnx_add_test(Tensor_view) + +pnnx_onnx_add_test(torch_cat) +pnnx_onnx_add_test(torch_ceil) +pnnx_onnx_add_test(torch_chunk) +pnnx_onnx_add_test(torch_flatten) +pnnx_onnx_add_test(torch_floor) pnnx_onnx_add_test(torch_max) +pnnx_onnx_add_test(torch_maximum) pnnx_onnx_add_test(torch_mean) pnnx_onnx_add_test(torch_min) +pnnx_onnx_add_test(torch_minimum) pnnx_onnx_add_test(torch_prod) +pnnx_onnx_add_test(torch_split) +pnnx_onnx_add_test(torch_squeeze) +pnnx_onnx_add_test(torch_stack) pnnx_onnx_add_test(torch_sum) 
+pnnx_onnx_add_test(torch_transpose) +pnnx_onnx_add_test(torch_unbind) +pnnx_onnx_add_test(torch_unsqueeze) diff --git a/tools/pnnx/tests/onnx/test_Tensor_expand.py b/tools/pnnx/tests/onnx/test_Tensor_expand.py new file mode 100644 index 00000000000..ceb01dac4c8 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_Tensor_expand.py @@ -0,0 +1,60 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = x.expand(24) + y = y.expand(-1, 11, -1) + z = z.expand(2, 8, 3, -1, 4) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1) + y = torch.rand(3, 1, 1) + z = torch.rand(1, 8, 1, 9, 1) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_Tensor_expand.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_Tensor_expand.onnx inputshape=[1],[3,1,1],[1,8,1,9,1]") + + # pnnx inference + import test_Tensor_expand_pnnx + b = test_Tensor_expand_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_Tensor_permute.py b/tools/pnnx/tests/onnx/test_Tensor_permute.py new file mode 100644 index 00000000000..a36de4c251c --- /dev/null +++ b/tools/pnnx/tests/onnx/test_Tensor_permute.py @@ -0,0 +1,64 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
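+# Tensor.permute reorders dimensions without touching the data and exports to
+# a single ONNX Transpose node whose `perm` attribute lists the new dimension
+# order. For example, assuming an NCHW activation layout:
+#
+#     y = x.permute(0, 2, 3, 1)   # NCHW -> NHWC
+#
+# Pure data movement is exactly representable, so torch.equal is a safe check.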
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = x.permute(1, 0, 2) + x = x.permute(0, 1, 2) + y = y.permute(2, 3, 1, 0) + y = y.permute(3, 1, 0, 2) + z = z.permute(1, 3, 0, 4, 2) + z = z.permute(0, 2, 4, 3, 1) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_Tensor_permute.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_Tensor_permute.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_Tensor_permute_pnnx + b = test_Tensor_permute_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_Tensor_repeat.py b/tools/pnnx/tests/onnx/test_Tensor_repeat.py new file mode 100644 index 00000000000..569ad548bea --- /dev/null +++ b/tools/pnnx/tests/onnx/test_Tensor_repeat.py @@ -0,0 +1,63 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = x.repeat(1, 2, 3) + x = x.repeat(2, 3, 4) + y = y.repeat(1, 2, 1, 4) + y = y.repeat(3, 4, 5, 1) + z = z.repeat(1, 2, 3, 1, 5) + z = z.repeat(2, 3, 3, 1, 1) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_Tensor_repeat.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_Tensor_repeat.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_Tensor_repeat_pnnx + b = test_Tensor_repeat_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_Tensor_reshape.py b/tools/pnnx/tests/onnx/test_Tensor_reshape.py new file mode 100644 index 00000000000..027fb40a07d --- /dev/null +++ b/tools/pnnx/tests/onnx/test_Tensor_reshape.py @@ -0,0 +1,63 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. 
+# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = x.reshape(1, 2, 24) + x = x.reshape(48) + y = y.reshape(1, 11, 5, 9) + y = y.reshape(99, 5) + z = z.reshape(4, 3, 30, 10, 14) + z = z.reshape(15, 2, 10, 7, 8, 3) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_Tensor_reshape.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_Tensor_reshape.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_Tensor_reshape_pnnx + b = test_Tensor_reshape_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_Tensor_select.py b/tools/pnnx/tests/onnx/test_Tensor_select.py new file mode 100644 index 00000000000..4f7488b55a5 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_Tensor_select.py @@ -0,0 +1,60 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
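+# Tensor.select(dim, index) takes one slice along dim and drops that
+# dimension, so it is the method form of integer indexing:
+#
+#     x.select(1, 1)   # same result as x[:, 1, :] on a 3-d tensor
+#
+# On a (1, 3, 16) input this yields shape (1, 16). In ONNX the operation
+# typically lowers to Gather on the chosen axis; the exact exporter pattern
+# varies by torch version, which is what this test pins down.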
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = x.select(1, 1) + y = y.select(2, 4) + z = z.select(0, 10) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_Tensor_select.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_Tensor_select.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_Tensor_select_pnnx + b = test_Tensor_select_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_Tensor_slice.py b/tools/pnnx/tests/onnx/test_Tensor_slice.py new file mode 100644 index 00000000000..7fe32b4af61 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_Tensor_slice.py @@ -0,0 +1,79 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
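+# Python slicing exports as ONNX Slice, whose starts/ends/axes/steps arrive
+# as constant inputs; the fuse_constant_as_attribute table earlier in this
+# patch already lists Slice, so those constants become attributes that the
+# level-2 matcher can capture. Roughly, for one of the cases below:
+#
+#     x[:, :12, 1:14:2]   # -> Slice(starts=[0, 1], ends=[12, 14],
+#                         #          axes=[1, 2], steps=[1, 2]), give or take
+#                         #    how the exporter canonicalizes the axes
+#
+# The version checks keep older exporters on step=1 slices and only use
+# size()-based bounds on torch >= 1.10.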
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + if version.parse(torch.__version__) < version.parse('1.12'): + x = x[:,:12,1:14:1] + else: + x = x[:,:12,1:14:2] + x = x[...,1:] + if version.parse(torch.__version__) >= version.parse('1.10'): + x = x[:,:,:x.size(2)-1] + y = y[0:,1:,5:,3:] + if version.parse(torch.__version__) < version.parse('1.12'): + y = y[:,:,1:13:1,:14] + else: + y = y[:,:,1:13:2,:14] + if version.parse(torch.__version__) >= version.parse('1.10'): + y = y[:1,:y.size(1):,:,:] + z = z[4:] + if version.parse(torch.__version__) < version.parse('1.12'): + z = z[:2,:,:,:,2:-2:1] + else: + z = z[:2,:,:,:,2:-2:3] + if version.parse(torch.__version__) >= version.parse('1.10'): + z = z[:,:,:,z.size(3)-3:,:] + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 13, 26) + y = torch.rand(1, 15, 19, 21) + z = torch.rand(14, 18, 15, 19, 20) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_Tensor_slice.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_Tensor_slice.onnx inputshape=[1,13,26],[1,15,19,21],[14,18,15,19,20]") + + # pnnx inference + import test_Tensor_slice_pnnx + b = test_Tensor_slice_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_Tensor_view.py b/tools/pnnx/tests/onnx/test_Tensor_view.py new file mode 100644 index 00000000000..40df090a07b --- /dev/null +++ b/tools/pnnx/tests/onnx/test_Tensor_view.py @@ -0,0 +1,63 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
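+# Tensor.view is reshape plus a contiguity requirement; by export time both
+# trace to the same ONNX Reshape node, so this test mirrors
+# test_Tensor_reshape.py and checks that the Reshape -> Tensor.view/reshape
+# rewrite keeps the captured shapes intact, e.g.:
+#
+#     x.view(1, 2, 24)   # (1, 3, 16) -> (1, 2, 24), same 48 elements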
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = x.view(1, 2, 24) + x = x.view(48) + y = y.view(1, 11, 5, 9) + y = y.view(99, 5) + z = z.view(4, 3, 30, 10, 14) + z = z.view(15, 2, 10, 7, 8, 3) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_Tensor_view.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_Tensor_view.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_Tensor_view_pnnx + b = test_Tensor_view_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_cat.py b/tools/pnnx/tests/onnx/test_torch_cat.py new file mode 100644 index 00000000000..0d944434d28 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_cat.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z, w): + out0 = torch.cat((x, y), dim=1) + out1 = torch.cat((z, w), dim=3) + out2 = torch.cat((w, w), dim=2) + return out0, out1, out2 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 2, 16) + z = torch.rand(1, 5, 9, 11) + w = torch.rand(1, 5, 9, 3) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_torch_cat.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_cat.onnx inputshape=[1,3,16],[1,2,16],[1,5,9,11],[1,5,9,3]") + + # pnnx inference + import test_torch_cat_pnnx + b = test_torch_cat_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_ceil.py b/tools/pnnx/tests/onnx/test_torch_ceil.py new file mode 100644 index 00000000000..1ff59b37a48 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_ceil.py @@ -0,0 +1,60 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.ceil(x * 10) + y = torch.ceil(y * 10) + z = torch.ceil(z * 10) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_ceil.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_ceil.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_ceil_pnnx + b = test_torch_ceil_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_chunk.py b/tools/pnnx/tests/onnx/test_torch_chunk.py new file mode 100644 index 00000000000..2d1400103b9 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_chunk.py @@ -0,0 +1,60 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
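+# torch.chunk(input, chunks, dim) cuts a tensor into `chunks` pieces along
+# dim; ONNX expresses this with Split, so pnnx has to recover the chunk/split
+# form from the Split node. When the dimension does not divide evenly the
+# last piece is smaller, as with z below:
+#
+#     torch.chunk(torch.rand(14, 8, 5, 9, 10), chunks=5, dim=0)
+#     # -> sizes 3, 3, 3, 3, 2 along dim 0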
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x0, x1 = torch.chunk(x, chunks=2, dim=1) + y0, y1, y2 = torch.chunk(y, chunks=3, dim=2) + z0, z1, z2, z3, z4 = torch.chunk(z, chunks=5, dim=0) + return x0, x1, y0, y1, y2, z0, z1, z2, z3, z4 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_chunk.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_chunk.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_chunk_pnnx + b = test_torch_chunk_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_flatten.py b/tools/pnnx/tests/onnx/test_torch_flatten.py new file mode 100644 index 00000000000..6105b106804 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_flatten.py @@ -0,0 +1,63 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.flatten(x) + y = torch.flatten(y, start_dim=1, end_dim=-1) + z = torch.flatten(z, start_dim=3, end_dim=4) + x = x.relu() + y = y.relu() + z = z.relu() + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_flatten.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_flatten.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_flatten_pnnx + b = test_torch_flatten_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_floor.py b/tools/pnnx/tests/onnx/test_torch_floor.py new file mode 100644 index 00000000000..a046e4c241a --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_floor.py @@ -0,0 +1,60 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.floor(x * 10) + y = torch.floor(y * 10) + z = torch.floor(z * 10) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_floor.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_floor.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_floor_pnnx + b = test_torch_floor_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_maximum.py b/tools/pnnx/tests/onnx/test_torch_maximum.py new file mode 100644 index 00000000000..5e17d5cb2d2 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_maximum.py @@ -0,0 +1,64 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
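+# torch.maximum is the elementwise, broadcasting max of two tensors and maps
+# onto ONNX Max. The torch >= 1.12 gate below matches the guard used by the
+# other binary-op tests in this patch. A quick numeric example:
+#
+#     torch.maximum(torch.tensor([1., 5.]), torch.tensor([3., 2.]))
+#     # -> tensor([3., 5.])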
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + out0 = torch.maximum(x, y) + out1 = torch.maximum(y, y) + out2 = torch.maximum(z, torch.ones_like(z) + 0.1) + return out0, out1, out2 + +def test(): + if version.parse(torch.__version__) < version.parse('1.12'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(3, 16) + z = torch.rand(5, 9, 3) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_maximum.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_maximum.onnx inputshape=[3,16],[3,16],[5,9,3]") + + # pnnx inference + import test_torch_maximum_pnnx + b = test_torch_maximum_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_minimum.py b/tools/pnnx/tests/onnx/test_torch_minimum.py new file mode 100644 index 00000000000..0d8e9a87e50 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_minimum.py @@ -0,0 +1,64 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + out0 = torch.minimum(x, y) + out1 = torch.minimum(y, y) + out2 = torch.minimum(z, torch.ones_like(z) + 0.1) + return out0, out1, out2 + +def test(): + if version.parse(torch.__version__) < version.parse('1.12'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(3, 16) + z = torch.rand(5, 9, 3) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_minimum.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_minimum.onnx inputshape=[3,16],[3,16],[5,9,3]") + + # pnnx inference + import test_torch_minimum_pnnx + b = test_torch_minimum_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_split.py b/tools/pnnx/tests/onnx/test_torch_split.py new file mode 100644 index 00000000000..b13b041cd96 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_split.py @@ -0,0 +1,60 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. 
+# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x0, x1 = torch.split(x, split_size_or_sections=2, dim=1) + y0, y1, y2 = torch.split(y, split_size_or_sections=[1,3,5], dim=2) + z0, z1, z2, z3, z4 = torch.split(z, split_size_or_sections=3, dim=0) + return x0, x1, y0, y1, y2, z0, z1, z2, z3, z4 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_split.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_split.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_split_pnnx + b = test_torch_split_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_squeeze.py b/tools/pnnx/tests/onnx/test_torch_squeeze.py new file mode 100644 index 00000000000..b29e4ba2f9d --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_squeeze.py @@ -0,0 +1,60 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
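+# torch.squeeze drops size-1 dimensions: all of them when called without a
+# dim, exactly one otherwise. That is why the torch_squeeze_onnx_1 pass
+# earlier in this patch treats the ONNX `axes` attribute as optional -- a
+# Squeeze node without axes means "remove every 1-sized dim". The shapes
+# exercised below:
+#
+#     (1, 1, 16)       squeeze(dim=1)  -> (1, 16)
+#     (1, 5, 1, 11)    squeeze()       -> (5, 11)
+#     (14, 8, 5, 9, 1) squeeze(dim=4)  -> (14, 8, 5, 9)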
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.squeeze(x, 1) + y = torch.squeeze(y) + z = torch.squeeze(z, 4) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 1, 16) + y = torch.rand(1, 5, 1, 11) + z = torch.rand(14, 8, 5, 9, 1) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_squeeze.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_squeeze.onnx inputshape=[1,1,16],[1,5,1,11],[14,8,5,9,1]") + + # pnnx inference + import test_torch_squeeze_pnnx + b = test_torch_squeeze_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_stack.py b/tools/pnnx/tests/onnx/test_torch_stack.py new file mode 100644 index 00000000000..7b04ddd307f --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_stack.py @@ -0,0 +1,62 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z, w): + out0 = torch.stack((x, y), dim=0) + out1 = torch.stack((x, y), dim=2) + out2 = torch.stack((z, w), dim=2) + out3 = torch.stack((z, w), dim=-1) + return out0, out1, out2, out3 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(3, 16) + z = torch.rand(5, 9, 3) + w = torch.rand(5, 9, 3) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_torch_stack.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_stack.onnx inputshape=[3,16],[3,16],[5,9,3],[5,9,3]") + + # pnnx inference + import test_torch_stack_pnnx + b = test_torch_stack_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_transpose.py b/tools/pnnx/tests/onnx/test_torch_transpose.py new file mode 100644 index 00000000000..e6a25c44101 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_transpose.py @@ -0,0 +1,60 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.transpose(x, 1, 2) + y = torch.transpose(y, 2, 3) + z = torch.transpose(z, 1, 3) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_transpose.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_transpose.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_transpose_pnnx + b = test_torch_transpose_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_unbind.py b/tools/pnnx/tests/onnx/test_torch_unbind.py new file mode 100644 index 00000000000..a98fa25c51c --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_unbind.py @@ -0,0 +1,60 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
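+# torch.unbind(dim) returns size(dim) tensors, each with dim removed, so
+# unbinding a (1, 3, 16) input along dim=1 yields three (1, 16) tensors.
+# Unlike split/chunk, every output loses the unbound axis, which in the
+# exported graph presumably shows up as Split plus a per-output Squeeze:
+#
+#     x0, x1, x2 = torch.unbind(torch.rand(1, 3, 16), dim=1)  # each (1, 16)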
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x0, x1, x2 = torch.unbind(x, dim=1) + y0, y1, y2, y3, y4, y5, y6, y7, y8 = torch.unbind(y, dim=2) + z0, z1, z2, z3 = torch.unbind(z, dim=0) + return x0, x1, x2, y0, y1, y2, y3, y4, y5, y6, y7, y8, z0, z1, z2, z3 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(4, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_unbind.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_unbind.onnx inputshape=[1,3,16],[1,5,9,11],[4,8,5,9,10]") + + # pnnx inference + import test_torch_unbind_pnnx + b = test_torch_unbind_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_unsqueeze.py b/tools/pnnx/tests/onnx/test_torch_unsqueeze.py new file mode 100644 index 00000000000..01bf84076cf --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_unsqueeze.py @@ -0,0 +1,63 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
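+# torch.unsqueeze inserts a size-1 dimension at the given index; a negative
+# index counts from the end of the *result*, so unsqueeze(-1) on a
+# (1, 5, 9, 11) input appends a trailing axis -> (1, 5, 9, 11, 1). The ONNX
+# Unsqueeze `axes` constant is folded into an attribute by the existing
+# fuse_constant_as_attribute entry, keeping the level-2 match simple:
+#
+#     torch.unsqueeze(torch.rand(3), 0).shape   # torch.Size([1, 3])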
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.unsqueeze(x, 0) + x = torch.unsqueeze(x, 1) + y = torch.unsqueeze(y, 2) + y = torch.unsqueeze(y, -1) + z = torch.unsqueeze(z, -2) + z = torch.unsqueeze(z, 3) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_unsqueeze.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_unsqueeze.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_unsqueeze_pnnx + b = test_torch_unsqueeze_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_unbind.py b/tools/pnnx/tests/test_torch_unbind.py index c92c87b7435..b232f289dab 100644 --- a/tools/pnnx/tests/test_torch_unbind.py +++ b/tools/pnnx/tests/test_torch_unbind.py @@ -24,7 +24,7 @@ def forward(self, x, y, z): x0, x1, x2 = torch.unbind(x, dim=1) y0, y1, y2, y3, y4, y5, y6, y7, y8 = torch.unbind(y, dim=2) z0, z1, z2, z3 = torch.unbind(z, dim=0) - return x0, x1, y0, y1, y2, y3, y4, y5, y6, y7, y8, z0, z1, z2, z3 + return x0, x1, x2, y0, y1, y2, y3, y4, y5, y6, y7, y8, z0, z1, z2, z3 def test(): net = Model() From 081a9c39c8a6d4486f67f43699d97e6a6e4c89c2 Mon Sep 17 00:00:00 2001 From: zhangyang2057 Date: Thu, 18 Jul 2024 14:19:52 +0800 Subject: [PATCH 03/38] Fix tanh typo for rvv. (#5584) * Fix tanh typo for rvv. * Fix tanh for rvv fp16. --- src/layer/riscv/rvv_mathfun.h | 2 +- src/layer/riscv/rvv_mathfun_fp16s.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/layer/riscv/rvv_mathfun.h b/src/layer/riscv/rvv_mathfun.h index 980261a1496..2ec10bae48a 100644 --- a/src/layer/riscv/rvv_mathfun.h +++ b/src/layer/riscv/rvv_mathfun.h @@ -308,7 +308,7 @@ _RVV_FLOAT32_COS_OP(8, 4) \ /* clamp the inputs to the range [-9, 9] since anything outside */ \ /* this range is -/+1.0f in single-precision. */ \ - x2 = vfmin_vf_f32m##LMUL(x, c_tanh_hi, vl); \ + x2 = vfmin_vf_f32m##LMUL(x2, c_tanh_hi, vl); \ \ /* since the polynomials are odd/even, we need x**2. */ \ vfloat32m##LMUL##_t z = vfmul_vv_f32m##LMUL(x2, x2, vl); \ diff --git a/src/layer/riscv/rvv_mathfun_fp16s.h b/src/layer/riscv/rvv_mathfun_fp16s.h index ee5ffe4a304..2cf5d08f4f0 100644 --- a/src/layer/riscv/rvv_mathfun_fp16s.h +++ b/src/layer/riscv/rvv_mathfun_fp16s.h @@ -308,7 +308,7 @@ _RVV_FLOAT16_COS_OP(8, 2) \ /* clamp the inputs to the range [-9, 9] since anything outside */ \ /* this range is -/+1.0f in single-precision. */ \ - x2 = vfmin_vf_f16m##LMUL(x, c_tanh_hi, vl); \ + x2 = vfmin_vf_f16m##LMUL(x2, c_tanh_hi, vl); \ \ /* since the polynomials are odd/even, we need x**2. 
*/ \ vfloat16m##LMUL##_t z = vfmul_vv_f16m##LMUL(x2, x2, vl); \ From 997c8926d706db5f9e4098aec8ed51c49ab9417c Mon Sep 17 00:00:00 2001 From: nihui Date: Thu, 18 Jul 2024 14:20:09 +0800 Subject: [PATCH 04/38] use ruapu detection only on windows arm, enable cpu powerinfo with mingw compiler (#5593) --- src/cpu.cpp | 41 +++++++++++++++++++++-------------------- src/cpu.h | 4 ++-- src/platform.h.in | 8 ++++---- 3 files changed, 27 insertions(+), 26 deletions(-) diff --git a/src/cpu.cpp b/src/cpu.cpp index b1afbba3f65..f9e64a1cc75 100644 --- a/src/cpu.cpp +++ b/src/cpu.cpp @@ -46,10 +46,9 @@ #include #endif -#if defined _WIN32 && !(defined __MINGW32__) +#if defined _WIN32 #define WIN32_LEAN_AND_MEAN #include -#include #endif #if defined __ANDROID__ || defined __linux__ @@ -129,8 +128,10 @@ #include #endif +#if (defined _WIN32 && (__aarch64__ || __arm__)) #define RUAPU_IMPLEMENTATION #include "ruapu.h" +#endif // topology info static int g_cpucount; @@ -596,9 +597,6 @@ static int get_cpu_support_x86_avx2() static int get_cpu_support_x86_avx_vnni() { -#if __APPLE__ - return ruapu_supports("avxvnni"); -#else unsigned int cpu_info[4] = {0}; x86_cpuid(0, cpu_info); @@ -617,13 +615,16 @@ static int get_cpu_support_x86_avx_vnni() x86_cpuid_sublevel(7, 1, cpu_info); return cpu_info[0] & (1u << 4); -#endif } static int get_cpu_support_x86_avx512() { #if __APPLE__ - return ruapu_supports("avx512f") && ruapu_supports("avx512bw") && ruapu_supports("avx512cd") && ruapu_supports("avx512dq") && ruapu_supports("avx512vl"); + return get_hw_capability("hw.optional.avx512f") + && get_hw_capability("hw.optional.avx512bw") + && get_hw_capability("hw.optional.avx512cd") + && get_hw_capability("hw.optional.avx512dq") + && get_hw_capability("hw.optional.avx512vl"); #else unsigned int cpu_info[4] = {0}; x86_cpuid(0, cpu_info); @@ -653,7 +654,7 @@ static int get_cpu_support_x86_avx512() static int get_cpu_support_x86_avx512_vnni() { #if __APPLE__ - return ruapu_supports("avx512vnni"); + return get_hw_capability("hw.optional.avx512vnni"); #else unsigned int cpu_info[4] = {0}; x86_cpuid(0, cpu_info); @@ -683,7 +684,7 @@ static int get_cpu_support_x86_avx512_vnni() static int get_cpu_support_x86_avx512_bf16() { #if __APPLE__ - return ruapu_supports("avx512bf16"); + return get_hw_capability("hw.optional.avx512bf16"); #else unsigned int cpu_info[4] = {0}; x86_cpuid(0, cpu_info); @@ -709,7 +710,7 @@ static int get_cpu_support_x86_avx512_bf16() static int get_cpu_support_x86_avx512_fp16() { #if __APPLE__ - return ruapu_supports("avx512fp16"); + return get_hw_capability("hw.optional.avx512fp16"); #else unsigned int cpu_info[4] = {0}; x86_cpuid(0, cpu_info); @@ -745,7 +746,7 @@ static int get_cpucount() count = emscripten_num_logical_cores(); else count = 1; -#elif (defined _WIN32 && !(defined __MINGW32__)) +#elif defined _WIN32 SYSTEM_INFO system_info; GetSystemInfo(&system_info); count = system_info.dwNumberOfProcessors; @@ -812,7 +813,7 @@ static int get_thread_siblings(int cpuid) static int get_physical_cpucount() { int count = 0; -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation"); if (glpi == NULL) @@ -1050,7 +1051,7 @@ static int get_big_cpu_data_cache_size(int level) static int get_cpu_level2_cachesize() { int size = 0; -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 typedef BOOL(WINAPI * 
LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation"); if (glpi != NULL) @@ -1120,7 +1121,7 @@ static int get_cpu_level2_cachesize() static int get_cpu_level3_cachesize() { int size = 0; -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation"); if (glpi != NULL) @@ -1167,7 +1168,7 @@ static int get_cpu_level3_cachesize() return size; } -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 static ncnn::CpuSet get_smt_cpu_mask() { ncnn::CpuSet smt_cpu_mask; @@ -1261,7 +1262,7 @@ static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask) return 0; } -#endif // (defined _WIN32 && !(defined __MINGW32__)) +#endif // defined _WIN32 #if defined __ANDROID__ || defined __linux__ static int get_max_freq_khz(int cpuid) @@ -1435,7 +1436,7 @@ static void initialize_cpu_thread_affinity_mask(ncnn::CpuSet& mask_all, ncnn::Cp mask_all.enable(i); } -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 // get max freq mhz for all cores int max_freq_mhz_min = INT_MAX; int max_freq_mhz_max = 0; @@ -1867,7 +1868,7 @@ static void initialize_global_cpu_info() g_powersave = 0; initialize_cpu_thread_affinity_mask(g_cpu_affinity_mask_all, g_cpu_affinity_mask_little, g_cpu_affinity_mask_big); -#if (defined _WIN32 && (__aarch64__ || __arm__)) || __APPLE__ +#if (defined _WIN32 && (__aarch64__ || __arm__)) if (!is_being_debugged()) { ruapu_init(); @@ -1944,7 +1945,7 @@ static inline void try_initialize_global_cpu_info() namespace ncnn { -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 CpuSet::CpuSet() { disable_all(); @@ -2685,7 +2686,7 @@ const CpuSet& get_cpu_thread_affinity_mask(int powersave) int set_cpu_thread_affinity(const CpuSet& thread_affinity_mask) { try_initialize_global_cpu_info(); -#if defined __ANDROID__ || defined __linux__ || (defined _WIN32 && !(defined __MINGW32__)) +#if defined __ANDROID__ || defined __linux__ || defined _WIN32 #ifdef _OPENMP int num_threads = thread_affinity_mask.num_enabled(); diff --git a/src/cpu.h b/src/cpu.h index 7d6bfce1108..2ae6b8c3ffe 100644 --- a/src/cpu.h +++ b/src/cpu.h @@ -17,7 +17,7 @@ #include -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 #define WIN32_LEAN_AND_MEAN #include #endif @@ -40,7 +40,7 @@ class NCNN_EXPORT CpuSet int num_enabled() const; public: -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 ULONG_PTR mask; #endif #if defined __ANDROID__ || defined __linux__ diff --git a/src/platform.h.in b/src/platform.h.in index a0f17f39e31..50a9454b7da 100644 --- a/src/platform.h.in +++ b/src/platform.h.in @@ -70,7 +70,7 @@ #ifdef __cplusplus #if NCNN_THREADS -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 #define WIN32_LEAN_AND_MEAN #include #include @@ -86,7 +86,7 @@ namespace ncnn { #if NCNN_THREADS -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 class NCNN_EXPORT Mutex { public: @@ -141,7 +141,7 @@ public: private: DWORD key; }; -#else // (defined _WIN32 && !(defined __MINGW32__)) +#else // defined _WIN32 class NCNN_EXPORT Mutex { public: @@ -186,7 +186,7 @@ public: private: pthread_key_t key; }; -#endif // (defined _WIN32 && !(defined __MINGW32__)) +#endif // defined _WIN32 #else // NCNN_THREADS class NCNN_EXPORT 
Mutex { From f825d3a23c77cfd51b42ddfcd10343627c2d536d Mon Sep 17 00:00:00 2001 From: nihui Date: Thu, 18 Jul 2024 14:20:30 +0800 Subject: [PATCH 05/38] pnnx fuse onnx sdpa pattern and ncnn qdim mha fusion (#5589) --- tools/pnnx/src/CMakeLists.txt | 1 + .../fuse_scaled_dot_product_attention.cpp | 84 ++++++- .../F_scaled_dot_product_attention.cpp | 223 ++++++++++++++++++ 3 files changed, 306 insertions(+), 2 deletions(-) create mode 100644 tools/pnnx/src/pass_ncnn/F_scaled_dot_product_attention.cpp diff --git a/tools/pnnx/src/CMakeLists.txt b/tools/pnnx/src/CMakeLists.txt index e2fc28da9a9..986f6ebe81e 100644 --- a/tools/pnnx/src/CMakeLists.txt +++ b/tools/pnnx/src/CMakeLists.txt @@ -472,6 +472,7 @@ set(pnnx_pass_ncnn_SRCS pass_ncnn/F_prelu.cpp pass_ncnn/F_relu.cpp pass_ncnn/F_relu6.cpp + pass_ncnn/F_scaled_dot_product_attention.cpp pass_ncnn/F_selu.cpp pass_ncnn/F_sigmoid.cpp pass_ncnn/F_silu.cpp diff --git a/tools/pnnx/src/pass_level5/fuse_scaled_dot_product_attention.cpp b/tools/pnnx/src/pass_level5/fuse_scaled_dot_product_attention.cpp index 8f265f374dc..a6dcbc86db7 100644 --- a/tools/pnnx/src/pass_level5/fuse_scaled_dot_product_attention.cpp +++ b/tools/pnnx/src/pass_level5/fuse_scaled_dot_product_attention.cpp @@ -62,7 +62,7 @@ pnnx.Output output 1 0 out pnnx.Input input_0 0 1 query pnnx.Input input_1 0 1 key pnnx.Input input_2 0 1 value -F.scaled_dot_product_attention op_0 3 1 query key value out attn_mask=None dropout_p=0.0 is_causal=False +F.scaled_dot_product_attention sdpa 3 1 query key value out attn_mask=None dropout_p=0.0 is_causal=False pnnx.Output output 1 0 out )PNNXIR"; } @@ -114,7 +114,7 @@ pnnx.Input input_Rh 0 1 Rh pnnx.Input input_Rw 0 1 Rw pnnx.Expression RhRw 2 1 Rh Rw RhRw expr=add(@0,@1) #RhRw=(%batch,%h,%w,%h,%w)f32 Tensor.reshape attn_mask 1 1 RhRw attn_mask shape=(%batch,%qsize,%qsize) #attn_mask=(%batch,%qsize,%qsize)f32 -F.scaled_dot_product_attention op_0 4 1 query key value attn_mask out dropout_p=0.0 is_causal=False $attn_mask=attn_mask +F.scaled_dot_product_attention sdpa 4 1 query key value attn_mask out dropout_p=0.0 is_causal=False $attn_mask=attn_mask pnnx.Output output 1 0 out )PNNXIR"; } @@ -137,15 +137,95 @@ pnnx.Output output 1 0 out } }; +class fuse_scaled_dot_product_attention_pass_onnx : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +12 11 +pnnx.Input input_0 0 1 query +pnnx.Input input_1 0 1 key +pnnx.Input input_2 0 1 value +pnnx.Input input_3 0 1 attn_mask +Tensor.permute op_0 1 1 query 13 dims=(0,2,1,3) +Tensor.permute op_1 1 1 key 20 dims=(0,2,3,1) +Tensor.permute op_2 1 1 value 19 dims=(0,2,1,3) +torch.matmul op_3 2 1 13 20 21 +pnnx.Expression op_4 2 1 21 attn_mask 23 expr=add(@0,@1) +F.softmax softmax 1 1 23 24 dim=%softmax_dim +torch.matmul op_6 2 1 24 19 out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* replace_pattern_graph() const + { + return R"PNNXIR(7767517 +9 8 +pnnx.Input input_0 0 1 query +pnnx.Input input_1 0 1 key +pnnx.Input input_2 0 1 value +pnnx.Input input_3 0 1 attn_mask +Tensor.permute op_0 1 1 query q dims=(0,2,1,3) +Tensor.permute op_1 1 1 key k dims=(0,2,1,3) +Tensor.permute op_2 1 1 value v dims=(0,2,1,3) +F.scaled_dot_product_attention sdpa 4 1 q k v attn_mask out dropout_p=0.0 is_causal=False $attn_mask=attn_mask +pnnx.Output output 1 0 out +)PNNXIR"; + } + + bool match(const std::map& matched_operators, const std::map& captured_params, const std::map& /*captured_attrs*/) const + { + const int softmax_dim = 
captured_params.at("softmax_dim").i; + + int softmax_input_rank = (int)matched_operators.at("softmax")->inputs[0]->shape.size(); + if (softmax_dim != -1 && softmax_dim != softmax_input_rank - 1) + return false; + + return true; + } + + void write(const std::map& ops, const std::map& /*captured_params*/, const std::map& /*captured_attrs*/) const + { + Operator* op = ops.at("sdpa"); + + op->params["scale"] = 1.f; + + // rewrite qkv shape + { + std::vector q_shape = ops.at("op_0")->inputs[0]->shape; + std::vector k_shape = ops.at("op_1")->inputs[0]->shape; + std::vector v_shape = ops.at("op_2")->inputs[0]->shape; + + if (!q_shape.empty()) + std::swap(q_shape[1], q_shape[2]); + if (!k_shape.empty()) + std::swap(k_shape[1], k_shape[2]); + if (!v_shape.empty()) + std::swap(v_shape[1], v_shape[2]); + + ops.at("op_0")->outputs[0]->shape = q_shape; + ops.at("op_0")->outputs[0]->type = ops.at("op_0")->inputs[0]->type; + ops.at("op_1")->outputs[0]->shape = k_shape; + ops.at("op_1")->outputs[0]->type = ops.at("op_1")->inputs[0]->type; + ops.at("op_2")->outputs[0]->shape = v_shape; + ops.at("op_2")->outputs[0]->type = ops.at("op_2")->inputs[0]->type; + } + } +}; + void fuse_scaled_dot_product_attention(Graph& graph) { #if TORCH_VERSION_MAJOR >= 2 fuse_scaled_dot_product_attention_pass a; fuse_scaled_dot_product_attention_pass_1 b; + fuse_scaled_dot_product_attention_pass_onnx onnx0; int opindex = 0; pnnx_graph_rewrite(graph, &a, opindex); pnnx_graph_rewrite(graph, &b, opindex); + pnnx_graph_rewrite(graph, &onnx0, opindex); #endif } diff --git a/tools/pnnx/src/pass_ncnn/F_scaled_dot_product_attention.cpp b/tools/pnnx/src/pass_ncnn/F_scaled_dot_product_attention.cpp new file mode 100644 index 00000000000..af9f06b3f52 --- /dev/null +++ b/tools/pnnx/src/pass_ncnn/F_scaled_dot_product_attention.cpp @@ -0,0 +1,223 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
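+
+// Editorial summary of the rewrites registered below: each pass matches the
+// q/k/v nn.Linear -> Tensor.reshape -> Tensor.permute chain feeding
+// F.scaled_dot_product_attention, plus the inverse permute/reshape and the
+// out_proj nn.Linear, and collapses the whole pattern into a single ncnn
+// MultiHeadAttention layer; param 5 records whether an attn_mask input is
+// present and param 6 carries the sdpa scale.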
+ +#include "pass_ncnn.h" + +namespace pnnx { + +namespace ncnn { + +class F_scaled_dot_product_attention : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +16 15 +pnnx.Input input_0 0 1 input +pnnx.Input input_1 0 1 attn_mask +nn.Linear op_0 1 1 input q bias=%qbias in_features=%qdim out_features=%embed_dim @bias @weight +nn.Linear op_1 1 1 input k bias=%kbias in_features=%kdim out_features=%embed_dim @bias @weight +nn.Linear op_2 1 1 input v bias=%vbias in_features=%vdim out_features=%embed_dim @bias @weight +Tensor.reshape op_3 1 1 q 10 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.reshape op_4 1 1 k 12 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.reshape op_5 1 1 v 14 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.permute op_6 1 1 10 16 dims=(0,2,1,3) +Tensor.permute op_7 1 1 12 17 dims=(0,2,1,3) +Tensor.permute op_8 1 1 14 18 dims=(0,2,1,3) +F.scaled_dot_product_attention op_9 4 1 16 17 18 attn_mask 19 dropout_p=0.0 is_causal=False scale=%scale +Tensor.permute op_10 1 1 19 20 dims=(0,2,1,3) +Tensor.reshape op_11 1 1 20 21 shape=(%batch,%size,%embed_dim) +nn.Linear out_proj 1 1 21 out bias=%outbias in_features=%embed_dim out_features=%qdim @bias @weight +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "MultiHeadAttention"; + } + + const char* name_str() const + { + return "sdpa_attention"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + op->params["0"] = captured_params.at("embed_dim"); + op->params["1"] = captured_params.at("num_heads"); + + const int embed_dim = captured_params.at("embed_dim").i; + const int qdim = captured_params.at("qdim").i; + const int kdim = captured_params.at("kdim").i; + const int vdim = captured_params.at("vdim").i; + + op->params["2"] = embed_dim * qdim; + op->params["3"] = kdim; + op->params["4"] = vdim; + op->params["5"] = 1; + op->params["6"] = captured_params.at("scale"); + + op->attrs["0"] = Attribute(); + op->attrs["0"].data = {0, 0, 0, 0}; + op->attrs["1"] = captured_attrs.at("op_0.weight"); + if (captured_params.at("qbias").b) + { + op->attrs["2"] = captured_attrs.at("op_0.bias"); + } + else + { + op->attrs["2"] = Attribute({embed_dim}, std::vector(embed_dim, 0.f)); + } + op->attrs["3"] = Attribute(); + op->attrs["3"].data = {0, 0, 0, 0}; + op->attrs["4"] = captured_attrs.at("op_1.weight"); + if (captured_params.at("kbias").b) + { + op->attrs["5"] = captured_attrs.at("op_1.bias"); + } + else + { + op->attrs["5"] = Attribute({embed_dim}, std::vector(embed_dim, 0.f)); + } + op->attrs["6"] = Attribute(); + op->attrs["6"].data = {0, 0, 0, 0}; + op->attrs["7"] = captured_attrs.at("op_2.weight"); + if (captured_params.at("vbias").b) + { + op->attrs["8"] = captured_attrs.at("op_2.bias"); + } + else + { + op->attrs["8"] = Attribute({embed_dim}, std::vector(embed_dim, 0.f)); + } + op->attrs["9"] = Attribute(); + op->attrs["9"].data = {0, 0, 0, 0}; + op->attrs["a"] = captured_attrs.at("out_proj.weight"); + if (captured_params.at("outbias").b) + { + op->attrs["b"] = captured_attrs.at("out_proj.bias"); + } + else + { + op->attrs["b"] = Attribute({qdim}, std::vector(qdim, 0.f)); + } + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_scaled_dot_product_attention, 10) + +class F_scaled_dot_product_attention_1 : public F_scaled_dot_product_attention +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +17 16 +pnnx.Input input_0 0 1 input 
+pnnx.Input input_1 0 1 kv +pnnx.Input input_2 0 1 attn_mask +nn.Linear op_0 1 1 input q bias=%qbias in_features=%qdim out_features=%embed_dim @bias @weight +nn.Linear op_1 1 1 kv k bias=%kbias in_features=%kdim out_features=%embed_dim @bias @weight +nn.Linear op_2 1 1 kv v bias=%vbias in_features=%vdim out_features=%embed_dim @bias @weight +Tensor.reshape op_3 1 1 q 10 shape=(%batch,%qsize,%num_heads,%feat_per_head) +Tensor.reshape op_4 1 1 k 12 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.reshape op_5 1 1 v 14 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.permute op_6 1 1 10 16 dims=(0,2,1,3) +Tensor.permute op_7 1 1 12 17 dims=(0,2,1,3) +Tensor.permute op_8 1 1 14 18 dims=(0,2,1,3) +F.scaled_dot_product_attention op_9 4 1 16 17 18 attn_mask 19 dropout_p=0.0 is_causal=False scale=%scale +Tensor.permute op_10 1 1 19 20 dims=(0,2,1,3) +Tensor.reshape op_11 1 1 20 21 shape=(%batch,%qsize,%embed_dim) +nn.Linear out_proj 1 1 21 out bias=%outbias in_features=%embed_dim out_features=%qdim @bias @weight +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_scaled_dot_product_attention_1, 10) + +class F_scaled_dot_product_attention_2 : public F_scaled_dot_product_attention +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +15 14 +pnnx.Input input 0 1 input +nn.Linear op_0 1 1 input q bias=%qbias in_features=%qdim out_features=%embed_dim @bias @weight +nn.Linear op_1 1 1 input k bias=%kbias in_features=%kdim out_features=%embed_dim @bias @weight +nn.Linear op_2 1 1 input v bias=%vbias in_features=%vdim out_features=%embed_dim @bias @weight +Tensor.reshape op_3 1 1 q 10 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.reshape op_4 1 1 k 12 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.reshape op_5 1 1 v 14 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.permute op_6 1 1 10 16 dims=(0,2,1,3) +Tensor.permute op_7 1 1 12 17 dims=(0,2,1,3) +Tensor.permute op_8 1 1 14 18 dims=(0,2,1,3) +F.scaled_dot_product_attention op_9 3 1 16 17 18 19 dropout_p=0.0 is_causal=False attn_mask=None scale=%scale +Tensor.permute op_10 1 1 19 20 dims=(0,2,1,3) +Tensor.reshape op_11 1 1 20 21 shape=(%batch,%size,%embed_dim) +nn.Linear out_proj 1 1 21 out bias=%outbias in_features=%embed_dim out_features=%qdim @bias @weight +pnnx.Output output 1 0 out +)PNNXIR"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + F_scaled_dot_product_attention::write(op, captured_params, captured_attrs); + op->params["5"] = 0; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_scaled_dot_product_attention_2, 10) + +class F_scaled_dot_product_attention_3 : public F_scaled_dot_product_attention +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +16 15 +pnnx.Input input_0 0 1 input +pnnx.Input input_1 0 1 kv +nn.Linear op_0 1 1 input q bias=%qbias in_features=%qdim out_features=%embed_dim @bias @weight +nn.Linear op_1 1 1 kv k bias=%kbias in_features=%kdim out_features=%embed_dim @bias @weight +nn.Linear op_2 1 1 kv v bias=%vbias in_features=%vdim out_features=%embed_dim @bias @weight +Tensor.reshape op_3 1 1 q 10 shape=(%batch,%qsize,%num_heads,%feat_per_head) +Tensor.reshape op_4 1 1 k 12 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.reshape op_5 1 1 v 14 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.permute op_6 1 1 10 16 dims=(0,2,1,3) +Tensor.permute op_7 1 1 12 17 dims=(0,2,1,3) +Tensor.permute op_8 1 1 14 
18 dims=(0,2,1,3) +F.scaled_dot_product_attention op_9 3 1 16 17 18 19 dropout_p=0.0 is_causal=False attn_mask=None scale=%scale +Tensor.permute op_10 1 1 19 20 dims=(0,2,1,3) +Tensor.reshape op_11 1 1 20 21 shape=(%batch,%qsize,%embed_dim) +nn.Linear out_proj 1 1 21 out bias=%outbias in_features=%embed_dim out_features=%qdim @bias @weight +pnnx.Output output 1 0 out +)PNNXIR"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + F_scaled_dot_product_attention::write(op, captured_params, captured_attrs); + op->params["5"] = 0; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_scaled_dot_product_attention_3, 10) + +} // namespace ncnn + +} // namespace pnnx From e82015878c5d9d67ee4a6d85f769cdc14a6d561f Mon Sep 17 00:00:00 2001 From: nihui Date: Thu, 18 Jul 2024 15:58:28 +0800 Subject: [PATCH 06/38] Update modelwriter.h for mha scale param --- tools/modelwriter.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/modelwriter.h b/tools/modelwriter.h index 88ccb948a9c..4f445cfe2a4 100644 --- a/tools/modelwriter.h +++ b/tools/modelwriter.h @@ -2007,6 +2007,7 @@ int ModelWriter::save(const char* parampath, const char* binpath) fprintf_param_value(" 3=%d", kdim) fprintf_param_value(" 4=%d", vdim) fprintf_param_value(" 5=%d", attn_mask) + fprintf_param_value(" 6=%e", scale) fwrite_weight_tag_data(op->q_weight_data, bp); fwrite_weight_data(op->q_bias_data, bp); From 3ee5c18f84963542ead978afa3c027ebf5526260 Mon Sep 17 00:00:00 2001 From: nihui Date: Sat, 20 Jul 2024 00:16:38 +0800 Subject: [PATCH 07/38] pnnx logaddexp (#5598) --- tools/pnnx/src/ir.cpp | 4 +- .../pnnx/src/pass_level3/fuse_expression.cpp | 3 + .../pnnx/src/pass_level5/eval_expression.cpp | 8 ++- .../pnnx/src/pass_ncnn/expand_expression.cpp | 2 + tools/pnnx/tests/CMakeLists.txt | 1 + tools/pnnx/tests/test_torch_logaddexp.py | 61 +++++++++++++++++++ 6 files changed, 77 insertions(+), 2 deletions(-) create mode 100644 tools/pnnx/tests/test_torch_logaddexp.py diff --git a/tools/pnnx/src/ir.cpp b/tools/pnnx/src/ir.cpp index 07d2bbefefd..cacd84fde79 100644 --- a/tools/pnnx/src/ir.cpp +++ b/tools/pnnx/src/ir.cpp @@ -1091,7 +1091,8 @@ static std::string expand_expression(const Operator* op) || t == "maximum" || t == "min" || t == "minimum" - || t == "pow") + || t == "pow" + || t == "logaddexp") { std::string binaryop; if (t == "atan2") binaryop = "torch.atan2"; @@ -1101,6 +1102,7 @@ static std::string expand_expression(const Operator* op) if (t == "min") binaryop = "torch.min"; if (t == "minimum") binaryop = "torch.minimum"; if (t == "pow") binaryop = "torch.pow"; + if (t == "logaddexp") binaryop = "torch.logaddexp"; std::string a = exprstack.top(); exprstack.pop(); diff --git a/tools/pnnx/src/pass_level3/fuse_expression.cpp b/tools/pnnx/src/pass_level3/fuse_expression.cpp index 708d1a548df..8fc918fed9d 100644 --- a/tools/pnnx/src/pass_level3/fuse_expression.cpp +++ b/tools/pnnx/src/pass_level3/fuse_expression.cpp @@ -154,6 +154,7 @@ static bool operand_maybe_tensor(const Operand* operand) || op->type == "aten::div" || op->type == "aten::floor_divide" || op->type == "aten::fmod" + || op->type == "aten::logaddexp" || op->type == "aten::max" || op->type == "aten::maximum" || op->type == "aten::min" @@ -653,6 +654,7 @@ static void fuse_expression(Graph& graph, Operand* operand, std::string& expr, s else if (op->type == "aten::atan2" || op->type == "aten::floor_divide" || op->type == "aten::fmod" + || op->type == "aten::logaddexp" || op->type == "aten::max" || op->type == 
"aten::maximum" || op->type == "aten::min" @@ -867,6 +869,7 @@ void fuse_expression(Graph& graph, const std::set& foldable_constan || op->type == "aten::fmod" || op->type == "aten::log" || op->type == "aten::log10" + || op->type == "aten::logaddexp" || op->type == "aten::max" || op->type == "aten::maximum" || op->type == "aten::min" diff --git a/tools/pnnx/src/pass_level5/eval_expression.cpp b/tools/pnnx/src/pass_level5/eval_expression.cpp index 44e1f7e3691..c7d5d5d0226 100644 --- a/tools/pnnx/src/pass_level5/eval_expression.cpp +++ b/tools/pnnx/src/pass_level5/eval_expression.cpp @@ -390,7 +390,8 @@ static std::string eval_expression(const Operator* op) || t == "floor_divide" || t == "fmod" || t == "pow" - || t == "remainder") + || t == "remainder" + || t == "logaddexp") { std::string a = exprstack.top(); exprstack.pop(); @@ -459,6 +460,11 @@ static std::string eval_expression(const Operator* op) r += bf; exprstack.push(std::to_string(r)); } + if (t == "logaddexp") + { + float r = log(exp(af) + exp(bf)); + exprstack.push(std::to_string(r)); + } } else { diff --git a/tools/pnnx/src/pass_ncnn/expand_expression.cpp b/tools/pnnx/src/pass_ncnn/expand_expression.cpp index f8f97baa55c..2fdc6d77d62 100644 --- a/tools/pnnx/src/pass_ncnn/expand_expression.cpp +++ b/tools/pnnx/src/pass_ncnn/expand_expression.cpp @@ -185,6 +185,7 @@ static std::string expand_expression(Graph& graph, const Operator* op, int& pnnx || t == "div" || t == "floor_divide" || t == "fmod" + || t == "logaddexp" || t == "max" || t == "maximum" || t == "min" @@ -211,6 +212,7 @@ static std::string expand_expression(Graph& graph, const Operator* op, int& pnnx if (t == "sub") op_binary->params["0"] = 1; if (t == "mul") op_binary->params["0"] = 2; if (t == "div") op_binary->params["0"] = 3; + if (t == "logaddexp") fprintf(stderr, "BinaryOp logaddexp not supported yet\n"); // TODO if (t == "max" || t == "maximum") op_binary->params["0"] = 4; if (t == "min" || t == "minimum") op_binary->params["0"] = 5; if (t == "floor_divide") fprintf(stderr, "BinaryOp floor_divide not supported yet\n"); // TODO diff --git a/tools/pnnx/tests/CMakeLists.txt b/tools/pnnx/tests/CMakeLists.txt index 2046a639256..7bbf1c6ea9c 100644 --- a/tools/pnnx/tests/CMakeLists.txt +++ b/tools/pnnx/tests/CMakeLists.txt @@ -295,6 +295,7 @@ pnnx_add_test(torch_floor) pnnx_add_test(torch_imag) pnnx_add_test(torch_log) pnnx_add_test(torch_log10) +pnnx_add_test(torch_logaddexp) pnnx_add_test(torch_maximum) pnnx_add_test(torch_minimum) pnnx_add_test(torch_neg) diff --git a/tools/pnnx/tests/test_torch_logaddexp.py b/tools/pnnx/tests/test_torch_logaddexp.py new file mode 100644 index 00000000000..6914dbd6213 --- /dev/null +++ b/tools/pnnx/tests/test_torch_logaddexp.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + out0 = torch.logaddexp(x, y) + out1 = torch.logaddexp(y, y) + out2 = torch.logaddexp(z, torch.ones_like(z) + 0.5) + return out0, out1, out2 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(3, 16) + z = torch.rand(5, 9, 3) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_logaddexp.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_logaddexp.pt inputshape=[3,16],[3,16],[5,9,3]") + + # pnnx inference + import test_torch_logaddexp_pnnx + b = test_torch_logaddexp_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) From d355b6dc5bf6daf2fcd00caa24875f6f3fcb862e Mon Sep 17 00:00:00 2001 From: lll143653 <58139948+lll143653@users.noreply.github.com> Date: Tue, 23 Jul 2024 16:54:35 +0800 Subject: [PATCH 08/38] Add warning and recommend to use pnnx (#5588) --- tools/onnx/onnx2ncnn.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/onnx/onnx2ncnn.cpp b/tools/onnx/onnx2ncnn.cpp index e443a28edf1..1b29e34c128 100644 --- a/tools/onnx/onnx2ncnn.cpp +++ b/tools/onnx/onnx2ncnn.cpp @@ -2956,6 +2956,15 @@ static std::string trunc_name(std::string name) int main(int argc, char** argv) { + fprintf(stderr, "onnx2ncnn may not fully meet your needs. For more accurate and elegant\n\ +conversion results, please use PNNX. PyTorch Neural Network eXchange (PNNX) is\n\ +an open standard for PyTorch model interoperability. PNNX provides an open model\n\ +format for PyTorch. It defines computation graph as well as high level operators\n\ +strictly matches PyTorch. You can obtain pnnx through the following ways:\n\ +1. Install via python\n\ + pip3 install pnnx\n\ +2. Get the executable from https://github.com/pnnx/pnnx\n\ +For more information, please refer to https://github.com/pnnx/pnnx\n"); if (!(argc == 2 || argc == 4)) { fprintf(stderr, "Usage: %s [onnxpb] [ncnnparam] [ncnnbin]\n", argv[0]); From 051b04ffb48e2d887bab758252eac55fc92bc028 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=B5=E5=B0=8F=E5=87=A1?= <2672931+whyb@users.noreply.github.com> Date: Wed, 24 Jul 2024 10:40:17 +0800 Subject: [PATCH 09/38] Updated use-ncnn-with-pytorch-or-onnx document (#5557) --- .../use-ncnn-with-pytorch-or-onnx.md | 148 ++++++++++++++++-- 1 file changed, 136 insertions(+), 12 deletions(-) diff --git a/docs/how-to-use-and-FAQ/use-ncnn-with-pytorch-or-onnx.md b/docs/how-to-use-and-FAQ/use-ncnn-with-pytorch-or-onnx.md index 9b0559a8eb8..e0195aa1403 100644 --- a/docs/how-to-use-and-FAQ/use-ncnn-with-pytorch-or-onnx.md +++ b/docs/how-to-use-and-FAQ/use-ncnn-with-pytorch-or-onnx.md @@ -2,8 +2,114 @@ Here is a practical guide for converting pytorch model to ncnn resnet18 is used as the example -## pytorch to onnx - +## pytorch to ncnn, onnx to ncnn + +### What's the pnnx? +PyTorch Neural Network eXchange(PNNX) is an open standard for PyTorch model interoperability. PNNX provides an open model format for PyTorch. It defines computation graph as well as high level operators strictly matches PyTorch. +It is recommended to use the `pnnx` tool to convert your `onnx` or `pytorch` model into a ncnn model now. + +### How to install pnnx? +* A. 
python pip (recommended) + * Windows/Linux/macOS 64bit + * python 3.7 or later + + ```shell + pip3 install pnnx + ``` + +* B. portable binary package (recommended if you hate python) + * Windows/Linux/macOS 64bit + * For Linux, glibc 2.17+ + + Download portable pnnx binary package from https://github.com/pnnx/pnnx/releases and extract it. + +* C. build from source + 1. install pytorch + 2. (optional) install torchvision for pnnx torchvision operator support + 3. (optional) install protobuf for pnnx onnx-zero support + 4. clone https://github.com/Tencent/ncnn.git + 5. build pnnx in ncnn/tools/pnnx with cmake + + You can refer to https://github.com/pnnx/pnnx/blob/main/.github/workflows/release.yml for detailed steps + + ```shell + git clone https://github.com/Tencent/ncnn.git + mkdir ncnn/tools/pnnx/build + cd ncnn/tools/pnnx/build + cmake -DCMAKE_INSTALL_PREFIX=install -DTorch_INSTALL_DIR= -DTorchVision_INSTALL_DIR= .. + cmake --build . --config Release -j 4 + cmake --build . --config Release --target install + ``` + +### How to use pnnx? +* A. python + 1. optimize and export your torch model with pnnx.export() + ```python + import torch + import torchvision.models as models + import pnnx + + model = models.resnet18(pretrained=True) + + x = torch.rand(1, 3, 224, 224) + + opt_model = pnnx.export(model, "resnet18.pt", x) + + # use tuple for model with multiple inputs + # opt_model = pnnx.export(model, "resnet18.pt", (x, y, z)) + ``` + 2. use the optimized module just like the normal one + ```python + result = opt_model(x) + ``` + 3. pick resnet18_pnnx.py for the pnnx-optimized torch model + 4. pick resnet18.ncnn.param and resnet18.ncnn.bin for ncnn inference + +* B. command line + 1. export your torch model to torchscript / onnx + ```python + import torch + import torchvision.models as models + + net = models.resnet18(pretrained=True) + net = net.eval() + + x = torch.rand(1, 3, 224, 224) + + # You could try disabling checking when tracing raises an error + # mod = torch.jit.trace(net, x, check_trace=False) + mod = torch.jit.trace(net, x) + + mod.save("resnet18.pt") + + # You could also try exporting to the good-old onnx + torch.onnx.export(net, x, 'resnet18.onnx') + ``` + + 2. use pnnx to convert the torchscript / onnx model to optimized pnnx and ncnn model files + ```shell + ./pnnx resnet18.pt inputshape=[1,3,224,224] + ./pnnx resnet18.onnx inputshape=[1,3,224,224] + ``` + macOS zsh users may need double quotes to prevent ambiguity + ```shell + ./pnnx resnet18.pt "inputshape=[1,3,224,224]" + ``` + For a model with multiple inputs, use a list + ```shell + ./pnnx resnet18.pt inputshape=[1,3,224,224],[1,32] + ``` + For a model with a non-fp32 input data type, add a type suffix + ```shell + ./pnnx resnet18.pt inputshape=[1,3,224,224]f32,[1,32]i64 + ``` + 3. pick resnet18_pnnx.py for the pnnx-optimized torch model + 4. pick resnet18.ncnn.param and resnet18.ncnn.bin for ncnn inference + +see more pnnx information: https://github.com/pnnx/pnnx + +## pytorch to onnx (deprecated) +
pytorch to onnx The official pytorch tutorial for exporting onnx model https://pytorch.org/tutorials/advanced/super_resolution_with_caffe2.html @@ -22,9 +128,10 @@ x = torch.rand(1, 3, 224, 224) # Export the model torch_out = torch.onnx._export(model, x, "resnet18.onnx", export_params=True) ``` +
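+
+Note that `torch.onnx._export` is a private API. A minimal sketch using the public `torch.onnx.export` entry point instead (the opset number and the input/output names below are illustrative choices, not part of the original guide):
+
+```python
+import torch
+import torchvision.models as models
+
+model = models.resnet18(pretrained=True)
+model = model.eval()
+
+x = torch.rand(1, 3, 224, 224)
+
+# the public exporter; pick an opset supported by your downstream tools
+torch.onnx.export(model, x, "resnet18.onnx",
+                  export_params=True,
+                  opset_version=13,
+                  input_names=["input"],
+                  output_names=["output"])
+```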
-## simplify onnx model - +## simplify onnx model (deprecated) +
simplify onnx model The exported resnet18.onnx model may contain many redundant operators such as Shape, Gather and Unsqueeze that are not supported in ncnn ``` @@ -37,19 +144,36 @@ Unsqueeze not supported yet! # axes 7 ``` -Fortunately, daquexian developed a handy tool to eliminate them. cheers! +### onnxsim -https://github.com/daquexian/onnx-simplifier +Fortunately, [@daquexian](https://github.com/daquexian) developed a handy tool to eliminate them. cheers! +#### How to use onnxsim? +```shell +pip install onnxsim +python -m onnxsim resnet18.onnx resnet18-sim.onnx ``` -python3 -m onnxsim resnet18.onnx resnet18-sim.onnx -``` +more information: https://github.com/daquexian/onnx-simplifier +### onnxslim +Or you can use another powerful model simplification tool developed in pure Python by [@inisis](https://github.com/inisis): +#### How to use onnxslim? +```shell +pip install onnxslim +python -m onnxslim resnet18.onnx resnet18-slim.onnx ``` +more information: https://github.com/inisis/OnnxSlim +
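+
+Both simplifiers can also be driven from Python. A minimal sketch with the onnxsim API (the file names are just examples; see the onnx-simplifier README for the authoritative usage):
+
+```python
+import onnx
+from onnxsim import simplify
+
+model = onnx.load("resnet18.onnx")
+
+# returns the simplified model and a flag telling whether the result validated
+model_simp, check = simplify(model)
+assert check, "simplified ONNX model could not be validated"
+
+onnx.save(model_simp, "resnet18-sim.onnx")
+```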
+ +## onnx2ncnn (deprecated) + +~~The onnx2ncnn tool is no longer maintained. It is recommended to use the PNNX tool instead~~ + +
onnx2ncnn tool + +~~Finally, you can convert the model to ncnn using tools/onnx2ncnn~~ +~~onnx2ncnn resnet18-sim.onnx resnet18.param resnet18.bin~~ +
\ No newline at end of file From 92e0b8253bc9d16b0d77bd17693fe9a72fb64b64 Mon Sep 17 00:00:00 2001 From: quink Date: Tue, 30 Jul 2024 10:47:00 +0800 Subject: [PATCH 10/38] arm/convolution_3x3_pack1to8_fp16s: prefer ldr/str over ld1/st1 (#5603) Depending on the arch, ldr/str can be faster than ld1/st1, especially for loading to one lane form. For example, on Cortex A75, 1. execution latency of 'ldr q0' and 'ldr h0' are 5 2. execution latency of 'ld1 {v0.16b}' is 6 3. execution latency of 'ld1 {v0.h}[0]' is 8 On Cortex X3, 1. execution latency of 'ldr q0' and 'ldr h0' are 6 2. execution latency of 'ld1 {v0.16b}' is 6 3. execution latency of 'ld1 {v0.h}[0]' is 8 Signed-off-by: Zhao Zhili --- .../arm/convolution_3x3_pack1to8_fp16s.h | 68 +++++++++---------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/src/layer/arm/convolution_3x3_pack1to8_fp16s.h b/src/layer/arm/convolution_3x3_pack1to8_fp16s.h index bd03d450b2e..40e276cdedf 100644 --- a/src/layer/arm/convolution_3x3_pack1to8_fp16s.h +++ b/src/layer/arm/convolution_3x3_pack1to8_fp16s.h @@ -68,8 +68,8 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "sub %0, %0, #64 \n" "prfm pldl1keep, [%1, #128] \n" - "ld1 {v0.8h}, [%1], #16 \n" // r0 - "ld1 {v1.4h}, [%1] \n" + "ldr q0, [%1], #16 \n" // r0 + "ldr s1, [%1] \n" "fmla v24.8h, %8.8h, v0.h[0] \n" "fmla v25.8h, %8.8h, v0.h[1] \n" @@ -99,8 +99,8 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %10.8h, v1.h[1] \n" "prfm pldl1keep, [%2, #128] \n" - "ld1 {v2.8h}, [%2], #16 \n" // r1 - "ld1 {v3.4h}, [%2] \n" + "ldr q2, [%2], #16 \n" // r1 + "ldr s3, [%2] \n" "fmla v24.8h, %11.8h, v2.h[0] \n" "fmla v25.8h, %11.8h, v2.h[1] \n" @@ -130,8 +130,8 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %13.8h, v3.h[1] \n" "prfm pldl1keep, [%3, #128] \n" - "ld1 {v4.8h}, [%3], #16 \n" // r2 - "ld1 {v5.4h}, [%3] \n" + "ldr q4, [%3], #16 \n" // r2 + "ldr s5, [%3] \n" "fmla v24.8h, %14.8h, v4.h[0] \n" "fmla v25.8h, %14.8h, v4.h[1] \n" @@ -189,7 +189,7 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [%0] \n" // sum0 sum1 sum2 sum3 "prfm pldl1keep, [%1, #128] \n" - "ld1 {v0.8h}, [%1] \n" // r0 + "ldr q0, [%1] \n" // r0 "fmla v28.8h, %8.8h, v0.h[0] \n" "fmla v29.8h, %8.8h, v0.h[1] \n" @@ -207,7 +207,7 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %10.8h, v0.h[5] \n" "prfm pldl1keep, [%2, #128] \n" - "ld1 {v1.8h}, [%2] \n" // r1 + "ldr q1, [%2] \n" // r1 "fmla v28.8h, %11.8h, v1.h[0] \n" "fmla v29.8h, %11.8h, v1.h[1] \n" @@ -225,7 +225,7 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %13.8h, v1.h[5] \n" "prfm pldl1keep, [%3, #128] \n" - "ld1 {v2.8h}, [%3] \n" // r2 + "ldr q2, [%3] \n" // r2 "fmla v28.8h, %14.8h, v2.h[0] \n" "fmla v29.8h, %14.8h, v2.h[1] \n" @@ -274,7 +274,7 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "ld1 {v30.8h, v31.8h}, [%0] \n" // sum0 sum1 "prfm pldl1keep, [%1, #64] \n" - "ld1 {v0.4h}, [%1] \n" // r0 + "ldr d0, [%1] \n" // r0 "fmla v30.8h, %8.8h, v0.h[0] \n" "fmla v31.8h, %8.8h, v0.h[1] \n" @@ -284,7 +284,7 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %10.8h, v0.h[3] \n" "prfm pldl1keep, [%2, #64] \n" - "ld1 {v1.4h}, [%2] \n" // r1 + "ldr d1, [%2] \n" // r1 "fmla v30.8h, %11.8h, v1.h[0] 
\n" "fmla v31.8h, %11.8h, v1.h[1] \n" @@ -294,7 +294,7 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %13.8h, v1.h[3] \n" "prfm pldl1keep, [%3, #64] \n" - "ld1 {v2.4h}, [%3] \n" // r2 + "ldr d2, [%3] \n" // r2 "fmla v30.8h, %14.8h, v2.h[0] \n" "fmla v31.8h, %14.8h, v2.h[1] \n" @@ -332,24 +332,24 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob { asm volatile( "prfm pldl1keep, [%0, #128] \n" - "ld1 {v30.8h}, [%0] \n" // sum0 + "ldr q30, [%0] \n" // sum0 "prfm pldl1keep, [%1, #64] \n" - "ld1 {v0.4h}, [%1] \n" // r0 + "ldr d0, [%1] \n" // r0 "fmla v30.8h, %8.8h, v0.h[0] \n" "fmla v30.8h, %9.8h, v0.h[1] \n" "fmla v30.8h, %10.8h, v0.h[2] \n" "prfm pldl1keep, [%2, #64] \n" - "ld1 {v1.4h}, [%2] \n" // r1 + "ldr d1, [%2] \n" // r1 "fmla v30.8h, %11.8h, v1.h[0] \n" "fmla v30.8h, %12.8h, v1.h[1] \n" "fmla v30.8h, %13.8h, v1.h[2] \n" "prfm pldl1keep, [%3, #64] \n" - "ld1 {v2.4h}, [%3] \n" // r2 + "ldr d2, [%3] \n" // r2 "fmla v30.8h, %14.8h, v2.h[0] \n" "fmla v30.8h, %15.8h, v2.h[1] \n" @@ -359,7 +359,7 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "add %2, %2, #2 \n" "add %3, %3, #2 \n" - "st1 {v30.8h}, [%0], #16 \n" + "str q30, [%0], #16 \n" : "=r"(outptr0), // %0 "=r"(r0), // %1 @@ -445,8 +445,8 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [%0] \n" // sum0 sum1 sum2 sum3 "prfm pldl1keep, [%1, #128] \n" - "ld1 {v0.8h}, [%1], #16 \n" // r0 - "ld1 {v1.h}[0], [%1] \n" + "ldr q0, [%1], #16 \n" // r0 + "ldr h1, [%1] \n" "fmla v28.8h, %8.8h, v0.h[0] \n" "fmla v29.8h, %8.8h, v0.h[2] \n" @@ -464,8 +464,8 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %10.8h, v1.h[0] \n" "prfm pldl1keep, [%2, #128] \n" - "ld1 {v2.8h}, [%2], #16 \n" // r1 - "ld1 {v3.h}[0], [%2] \n" + "ldr q2, [%2], #16 \n" // r1 + "ldr h3, [%2] \n" "fmla v28.8h, %11.8h, v2.h[0] \n" "fmla v29.8h, %11.8h, v2.h[2] \n" @@ -483,8 +483,8 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %13.8h, v3.h[0] \n" "prfm pldl1keep, [%3, #128] \n" - "ld1 {v4.8h}, [%3], #16 \n" // r2 - "ld1 {v5.h}[0], [%3] \n" + "ldr q4, [%3], #16 \n" // r2 + "ldr h5, [%3] \n" "fmla v28.8h, %14.8h, v4.h[0] \n" "fmla v29.8h, %14.8h, v4.h[2] \n" @@ -529,8 +529,8 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "ld1 {v30.8h, v31.8h}, [%0] \n" // sum0 sum1 "prfm pldl1keep, [%1, #64] \n" - "ld1 {v0.4h}, [%1], #8 \n" // r0 - "ld1 {v1.h}[0], [%1] \n" + "ldr d0, [%1], #8 \n" // r0 + "ldr h1, [%1] \n" "fmla v30.8h, %8.8h, v0.h[0] \n" "fmla v31.8h, %8.8h, v0.h[2] \n" @@ -540,8 +540,8 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %10.8h, v1.h[0] \n" "prfm pldl1keep, [%2, #64] \n" - "ld1 {v2.4h}, [%2], #8 \n" // r1 - "ld1 {v3.h}[0], [%2] \n" + "ldr d2, [%2], #8 \n" // r1 + "ldr h3, [%2] \n" "fmla v30.8h, %11.8h, v2.h[0] \n" "fmla v31.8h, %11.8h, v2.h[2] \n" @@ -551,8 +551,8 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %13.8h, v3.h[0] \n" "prfm pldl1keep, [%3, #64] \n" - "ld1 {v4.4h}, [%3], #8 \n" // r2 - "ld1 {v5.h}[0], [%3] \n" + "ldr d4, [%3], #8 \n" // r2 + "ldr h5, [%3] \n" "fmla v30.8h, %14.8h, v4.h[0] \n" "fmla v31.8h, %14.8h, v4.h[2] \n" @@ -586,24 +586,24 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob { asm 
volatile( "prfm pldl1keep, [%0, #128] \n" - "ld1 {v30.8h}, [%0] \n" // sum0 + "ldr q30, [%0] \n" // sum0 "prfm pldl1keep, [%1, #64] \n" - "ld1 {v0.4h}, [%1] \n" // r0 + "ldr d0, [%1] \n" // r0 "fmla v30.8h, %8.8h, v0.h[0] \n" "fmla v30.8h, %9.8h, v0.h[1] \n" "fmla v30.8h, %10.8h, v0.h[2] \n" "prfm pldl1keep, [%2, #64] \n" - "ld1 {v1.4h}, [%2] \n" // r1 + "ldr d1, [%2] \n" // r1 "fmla v30.8h, %11.8h, v1.h[0] \n" "fmla v30.8h, %12.8h, v1.h[1] \n" "fmla v30.8h, %13.8h, v1.h[2] \n" "prfm pldl1keep, [%3, #64] \n" - "ld1 {v2.4h}, [%3] \n" // r2 + "ldr d2, [%3] \n" // r2 "fmla v30.8h, %14.8h, v2.h[0] \n" "fmla v30.8h, %15.8h, v2.h[1] \n" @@ -613,7 +613,7 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "add %2, %2, #4 \n" "add %3, %3, #4 \n" - "st1 {v30.8h}, [%0], #16 \n" + "str q30, [%0], #16 \n" : "=r"(outptr0), // %0 "=r"(r0), // %1 From 391152f500cf20bef50da3a0617900acca34c770 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=B0=E9=98=85?= <43716063+Baiyuetribe@users.noreply.github.com> Date: Thu, 1 Aug 2024 21:06:38 +0800 Subject: [PATCH 11/38] c_api surpport set_vulkan_device (#5610) --- src/c_api.cpp | 7 +++++++ src/c_api.h | 4 ++++ 2 files changed, 11 insertions(+) diff --git a/src/c_api.cpp b/src/c_api.cpp index 5662d1b5155..f8146e054c2 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -1240,6 +1240,13 @@ void ncnn_net_set_option(ncnn_net_t net, ncnn_option_t opt) ((Net*)net->pthis)->opt = *((Option*)opt); } +#if NCNN_VULKAN +void ncnn_net_set_vulkan_device(ncnn_net_t net, int device_index) +{ + ((Net*)net->pthis)->set_vulkan_device(device_index); +} +#endif + static ::ncnn::Layer* __Layer_c_api_layer_creator(void* userdata) { ncnn_net_custom_layer_factory_t ud = (ncnn_net_custom_layer_factory_t)userdata; diff --git a/src/c_api.h b/src/c_api.h index d153b2a4ef0..f752bfed663 100644 --- a/src/c_api.h +++ b/src/c_api.h @@ -275,6 +275,10 @@ NCNN_EXPORT void ncnn_net_destroy(ncnn_net_t net); NCNN_EXPORT ncnn_option_t ncnn_net_get_option(ncnn_net_t net); NCNN_EXPORT void ncnn_net_set_option(ncnn_net_t net, ncnn_option_t opt); +#if NCNN_VULKAN +NCNN_EXPORT void ncnn_net_set_vulkan_device(ncnn_net_t net, int device_index); +#endif + #if NCNN_STRING NCNN_EXPORT void ncnn_net_register_custom_layer_by_type(ncnn_net_t net, const char* type, ncnn_layer_creator_t creator, ncnn_layer_destroyer_t destroyer, void* userdata); #endif /* NCNN_STRING */ From 5b5c1fdb8fb80b52bbe63e9d5c8a5fca15ffda7f Mon Sep 17 00:00:00 2001 From: Galasnow <854932917@qq.com> Date: Thu, 8 Aug 2024 11:00:23 +0800 Subject: [PATCH 12/38] Fix build error with NDK r27 (#5615) Enable policy CMP0057 for cmake version >= 3.3 --- CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 309e3b8fbd0..0f32a80c86e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,6 +37,11 @@ if(POLICY CMP0025) cmake_policy(SET CMP0025 NEW) endif() +if(POLICY CMP0057) + # reference from https://cmake.org/cmake/help/latest/policy/CMP0057.html + cmake_policy(SET CMP0057 NEW) +endif() + project(ncnn) if(MSVC AND NOT CMAKE_VERSION VERSION_LESS "3.15") From 03cf161dbd28a24e57adb0cbbc693f98adec8e6a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 8 Aug 2024 11:01:17 +0800 Subject: [PATCH 13/38] Bump pypa/cibuildwheel from 2.17.0 to 2.20.0 (#5613) Bumps [pypa/cibuildwheel](https://github.com/pypa/cibuildwheel) from 2.17.0 to 2.20.0. 
- [Release notes](https://github.com/pypa/cibuildwheel/releases) - [Changelog](https://github.com/pypa/cibuildwheel/blob/main/docs/changelog.md) - [Commits](https://github.com/pypa/cibuildwheel/compare/v2.17.0...v2.20.0) --- updated-dependencies: - dependency-name: pypa/cibuildwheel dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/release-python.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/release-python.yml b/.github/workflows/release-python.yml index d8304c0e33c..82bd5551fcd 100644 --- a/.github/workflows/release-python.yml +++ b/.github/workflows/release-python.yml @@ -87,7 +87,7 @@ jobs: # build wheels for ubuntu-20.04 - name: Build wheels for ubuntu if: matrix.os == 'ubuntu-20.04' - uses: pypa/cibuildwheel@v2.17.0 + uses: pypa/cibuildwheel@v2.20.0 env: CIBW_ARCHS_LINUX: ${{ matrix.arch }} CIBW_BUILD: ${{ matrix.build }} @@ -99,7 +99,7 @@ jobs: # build wheels for windows-2019 - name: Build wheels for windows if: matrix.os == 'windows-2019' && (matrix.arch == 'AMD64' || matrix.arch == 'x86') - uses: pypa/cibuildwheel@v2.17.0 + uses: pypa/cibuildwheel@v2.20.0 env: CIBW_ARCHS_WINDOWS: ${{ matrix.arch }} CIBW_BUILD: ${{ matrix.build }} @@ -112,7 +112,7 @@ jobs: - name: Build wheels for windows ARM64 if: matrix.os == 'windows-2019' && matrix.arch == 'ARM64' - uses: pypa/cibuildwheel@v2.17.0 + uses: pypa/cibuildwheel@v2.20.0 env: CIBW_ARCHS_WINDOWS: ${{ matrix.arch }} CIBW_BUILD: ${{ matrix.build }} @@ -190,7 +190,7 @@ jobs: - name: Build wheels for macos x86_64 if: matrix.os == 'macos-13' && matrix.arch == 'x86_64' - uses: pypa/cibuildwheel@v2.17.0 + uses: pypa/cibuildwheel@v2.20.0 env: CIBW_ARCHS_MACOS: ${{ matrix.arch }} CIBW_BUILD: ${{ matrix.build }} @@ -207,7 +207,7 @@ jobs: - name: Build wheels for macos arm64 if: matrix.os == 'macos-13' && matrix.arch == 'arm64' - uses: pypa/cibuildwheel@v2.17.0 + uses: pypa/cibuildwheel@v2.20.0 env: CIBW_ARCHS_MACOS: ${{ matrix.arch }} CIBW_BUILD: ${{ matrix.build }} @@ -262,7 +262,7 @@ jobs: platforms: all - name: Build wheels for manylinux with qemu - uses: pypa/cibuildwheel@v2.17.0 + uses: pypa/cibuildwheel@v2.20.0 env: CIBW_ARCHS_LINUX: ${{ matrix.arch }} CIBW_BUILD: ${{ matrix.build_cp }}-${{ matrix.build_sub }}* @@ -310,7 +310,7 @@ jobs: platforms: all - name: Build wheels for manylinux with qemu - uses: pypa/cibuildwheel@v2.17.0 + uses: pypa/cibuildwheel@v2.20.0 env: CIBW_ARCHS_LINUX: ${{ matrix.arch }} CIBW_BUILD: ${{ matrix.build_pp }}-* From 60823a8de3defa2e7d642d981ed0af13b5da58f0 Mon Sep 17 00:00:00 2001 From: nihui Date: Thu, 8 Aug 2024 14:07:20 +0800 Subject: [PATCH 14/38] pnnx handles sdpa batch index (#5617) --- tools/pnnx/src/pass_ncnn/solve_batch_index.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp b/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp index 6e53f7aa841..4b1100789fc 100644 --- a/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp +++ b/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp @@ -56,6 +56,7 @@ static bool is_known_operator_with_batch_index_0(const Operator* op) "F.pixel_shuffle", "F.pixel_unshuffle", "F.prelu", + "F.scaled_dot_product_attention", "F.unfold", "F.upsample_bilinear", "F.upsample_nearest", From b9debee8fb92263cd3a087208d3657081a2e4f37 Mon Sep 17 00:00:00 2001 From: nihui Date: Sat, 10 Aug 2024 11:39:32 +0800 Subject: [PATCH 15/38] 
pnnx ci for torch 2.4 (#5618) * update onnx proto --- .ci/pnnx.yml | 13 +- tools/pnnx/src/CMakeLists.txt | 4 +- tools/pnnx/src/load_onnx.cpp | 2 +- tools/pnnx/src/onnx-data.proto | 155 ++++++++++++++++++ tools/pnnx/src/{onnx.proto => onnx-ml.proto} | 72 ++++++-- tools/pnnx/src/onnx-operators-ml.proto | 136 +++++++++++++++ .../pass_level5/fuse_multiheadattention.cpp | 60 +++++++ tools/pnnx/src/pass_onnx.cpp | 2 +- tools/pnnx/src/pass_onnx/canonicalize.h | 2 +- .../src/pass_onnx/dead_code_elimination.h | 2 +- tools/pnnx/src/pass_onnx/eliminate_noop.h | 2 +- tools/pnnx/src/pass_onnx/fold_constants.h | 2 +- .../pass_onnx/fuse_constant_as_attribute.h | 2 +- tools/pnnx/src/pass_onnx/inline_containers.h | 2 +- tools/pnnx/src/pass_onnx/inline_if_graph.h | 2 +- tools/pnnx/src/pass_onnx/model_stat.h | 2 +- .../src/pass_onnx/nn_AdaptiveAvgPool2d.cpp | 2 +- .../src/pass_onnx/nn_AdaptiveAvgPool3d.cpp | 2 +- tools/pnnx/src/pass_onnx/nn_AvgPool2d.cpp | 2 +- tools/pnnx/src/pass_onnx/nn_AvgPool3d.cpp | 2 +- tools/pnnx/src/pass_onnx/nn_BatchNorm2d.cpp | 2 +- tools/pnnx/src/pass_onnx/nn_BatchNorm3d.cpp | 2 +- tools/pnnx/src/pass_onnx/nn_Conv2d.cpp | 2 +- tools/pnnx/src/pass_onnx/nn_Conv3d.cpp | 2 +- tools/pnnx/src/pass_onnx/nn_GELU.cpp | 2 +- tools/pnnx/src/pass_onnx/nn_LayerNorm.cpp | 2 +- tools/pnnx/src/pass_onnx/nn_Linear.cpp | 2 +- tools/pnnx/src/pass_onnx/nn_MaxPool2d.cpp | 2 +- tools/pnnx/src/pass_onnx/nn_MaxPool3d.cpp | 2 +- .../src/pass_onnx/nn_MultiheadAttention.cpp | 2 +- tools/pnnx/src/pass_onnx/shape_inference.h | 2 +- tools/pnnx/src/save_onnx.cpp | 2 +- tools/pnnx/tests/onnx/test_nn_ReLU.py | 2 +- tools/pnnx/tests/onnx/test_squeezenet1_1.py | 2 +- tools/pnnx/tests/onnx/test_swin_t.py | 2 +- tools/pnnx/tests/onnx/test_vit_b_32.py | 2 +- 36 files changed, 453 insertions(+), 47 deletions(-) create mode 100644 tools/pnnx/src/onnx-data.proto rename tools/pnnx/src/{onnx.proto => onnx-ml.proto} (92%) create mode 100644 tools/pnnx/src/onnx-operators-ml.proto diff --git a/.ci/pnnx.yml b/.ci/pnnx.yml index 5c44354aaaa..990690e0c5b 100644 --- a/.ci/pnnx.yml +++ b/.ci/pnnx.yml @@ -17,10 +17,10 @@ concurrency: variables: protobuf_version: 21.12 - libtorch_version: 2.3.0 - libtorchvision_version: 0.18.0 - onnxruntime_version: 1.17.3 - cache_date: 20240504 + libtorch_version: 2.4.0 + libtorchvision_version: 0.19.0 + onnxruntime_version: 1.18.1 + cache_date: 20240804 jobs: ubuntu: @@ -57,6 +57,9 @@ jobs: - torch-version: 2.3.0 torchvision-version: 0.18.0 + - torch-version: 2.4.0 + torchvision-version: 0.19.0 + runs-on: pool-name: docker container: @@ -160,7 +163,7 @@ jobs: - name: setup-pytorch run: | export PYTHONUSERBASE=${{ci.workspace}}/torch-${{matrix.torch-version}} - pip3 install --user torch==${{matrix.torch-version}}+cpu torchvision==${{matrix.torchvision-version}}+cpu -f https://download.pytorch.org/whl/torch_stable.html + pip3 install --user torch==${{matrix.torch-version}}+cpu torchvision==${{matrix.torchvision-version}}+cpu --index-url https://download.pytorch.org/whl/cpu pip3 install --user onnx pip3 install --user onnxscript diff --git a/tools/pnnx/src/CMakeLists.txt b/tools/pnnx/src/CMakeLists.txt index 986f6ebe81e..27dfdef52f8 100644 --- a/tools/pnnx/src/CMakeLists.txt +++ b/tools/pnnx/src/CMakeLists.txt @@ -587,12 +587,12 @@ if(PROTOBUF_FOUND) endif() if(Protobuf_FOUND OR protobuf_MODULE_COMPATIBLE) - protobuf_generate_cpp(ONNX_PROTO_SRCS ONNX_PROTO_HDRS onnx.proto) + protobuf_generate_cpp(ONNX_PROTO_SRCS ONNX_PROTO_HDRS onnx-data.proto onnx-ml.proto onnx-operators-ml.proto) 
add_library(onnxproto STATIC ${ONNX_PROTO_SRCS} ${ONNX_PROTO_HDRS}) target_include_directories(onnxproto PUBLIC ${PROTOBUF_INCLUDE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) target_link_libraries(onnxproto PUBLIC ${PROTOBUF_LIBRARIES}) else() - add_library(onnxproto STATIC onnx.proto) + add_library(onnxproto STATIC onnx-data.proto onnx-ml.proto onnx-operators-ml.proto) target_include_directories(onnxproto PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) protobuf_generate(TARGET onnxproto) target_link_libraries(onnxproto PUBLIC protobuf::libprotobuf) diff --git a/tools/pnnx/src/load_onnx.cpp b/tools/pnnx/src/load_onnx.cpp index 36624d916bd..9adf2b47088 100644 --- a/tools/pnnx/src/load_onnx.cpp +++ b/tools/pnnx/src/load_onnx.cpp @@ -14,7 +14,7 @@ #include "load_onnx.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" #include #include diff --git a/tools/pnnx/src/onnx-data.proto b/tools/pnnx/src/onnx-data.proto new file mode 100644 index 00000000000..d7d925d45d0 --- /dev/null +++ b/tools/pnnx/src/onnx-data.proto @@ -0,0 +1,155 @@ +// +// WARNING: This file is automatically generated! Please edit onnx.in.proto. +// + + +// SPDX-License-Identifier: Apache-2.0 + + +syntax = "proto2"; + +package onnx; +import "onnx-ml.proto"; + +// This file contains the proto definitions for MapProto and +// SequenceProto. These protos are used to represent the data structures +// of maps and sequence for use in test data or ModelProto. + +// Sequences +// +// Defines a dense, ordered, collection of elements that are of homogeneous types. +// Sequences can be made out of tensors, maps, or sequences. +// +// If a sequence is made out of tensors, the tensors must have the same element +// type (i.e. int32). In some cases, the tensors in a sequence can have different +// shapes. Whether the tensors can have different shapes or not depends on the +// type/shape associated with the corresponding "ValueInfo". For example, +// "Sequence" means that all tensors have same shape. However, +// "Sequence" means they can have different +// shapes (all of rank 2), where "omitted" means the corresponding dimension has +// no symbolic/constant value. Finally, "Sequence>" means +// that the different tensors can have different ranks, when the "shape" itself +// is omitted from the tensor-type. For a more complete description, refer to +// https://github.com/onnx/onnx/blob/main/docs/IR.md#static-tensor-shapes. +// +message SequenceProto { + + optional string name = 1; + + enum DataType { + UNDEFINED = 0; + TENSOR = 1; + SPARSE_TENSOR = 2; + SEQUENCE = 3; + MAP = 4; + OPTIONAL = 5; + } + + // The data type of the element. + // This field MUST have a valid SequenceProto.DataType value + optional int32 elem_type = 2; + + // For TensorProto values. + // When this field is present, the elem_type field MUST be TENSOR. + repeated TensorProto tensor_values = 3; + + // For SparseTensorProto values. + // When this field is present, the elem_type field MUST be SPARSE_TENSOR. + repeated SparseTensorProto sparse_tensor_values = 4; + + // For SequenceProto values, allowing sequences to be of themselves. + // When this field is present, the elem_type field MUST be SEQUENCE. + repeated SequenceProto sequence_values = 5; + + // For MapProto values. + // When this field is present, the elem_type field MUST be MAP. + repeated MapProto map_values = 6; + + // For OptionalProto values. + // When this field is present, the elem_type field MUST be Optional. 
+ repeated OptionalProto optional_values = 7; + +} + + +// Maps +// +// Specifies an associative table, defined by keys and values. +// MapProto is formed with a repeated field of keys (of type INT8, INT16, INT32, +// INT64, UINT8, UINT16, UINT32, UINT64, or STRING) and values (of type TENSOR, +// SPARSE_TENSOR, SEQUENCE, or MAP). Key types and value types have to remain +// the same throughout the instantiation of the MapProto. +// +message MapProto { + + optional string name = 1; + + // All MapProto data types must have the same length of keys and values. + + // The data type of the key. + // This field MUST have a valid TensorProto.DataType value of + // INT8, INT16, INT32, INT64, UINT8, UINT16, UINT32, UINT64, or STRING + optional int32 key_type = 2; + + // Every element of keys has to be one of the following data types + // INT8, INT16, INT32, INT64, UINT8, UINT16, UINT32, UINT64, or STRING. + // The integer cases are represented by the repeated int64 field keys below. + repeated int64 keys = 3; + + // If keys are strings, they are represented by the repeated bytes field + // string_keys below. + repeated bytes string_keys = 4; + + // MapProto values are represented in a SequenceProto of the same length as the + // repeated keys field and have to be one of the following data types + // TENSOR, SPARSE_TENSOR, MAP, SEQUENCE. + optional SequenceProto values = 5; +} + +// Optional +// +// +message OptionalProto { + + optional string name = 1; + + enum DataType { + UNDEFINED = 0; + TENSOR = 1; + SPARSE_TENSOR = 2; + SEQUENCE = 3; + MAP = 4; + OPTIONAL = 5; + } + + // The data type of the element, identifies if the OptionalProto value + // is Tensor, Sparse Tensor, Sequence, Map, or Optional. + // The type of the optional value MUST match the elem_type specified. + // This field MUST have a valid OptionalProto.DataType value. + optional int32 elem_type = 2; + + // For TensorProto value. + // When this field is present, the elem_type field MUST be TENSOR. + optional TensorProto tensor_value = 3; + + // For SparseTensorProto value. + // When this field is present, the elem_type field MUST be SPARSE_TENSOR. + optional SparseTensorProto sparse_tensor_value = 4; + + // For SequenceProto value. + // When this field is present, the elem_type field MUST be SEQUENCE. + optional SequenceProto sequence_value = 5; + + // For MapProto value. + // When this field is present, the elem_type field MUST be MAP. + optional MapProto map_value = 6; + + // For OptionalProto value, allowing optional to be of itself (completeness) + // When this field is present, the elem_type field MUST be OPTIONAL. + optional OptionalProto optional_value = 7; + +} + +// For using protobuf-lite +option optimize_for = LITE_RUNTIME; + diff --git a/tools/pnnx/src/onnx.proto b/tools/pnnx/src/onnx-ml.proto similarity index 92% rename from tools/pnnx/src/onnx.proto rename to tools/pnnx/src/onnx-ml.proto index 15012ce65c3..5f4c0f4a4e2 100644 --- a/tools/pnnx/src/onnx.proto +++ b/tools/pnnx/src/onnx-ml.proto @@ -24,6 +24,8 @@ package onnx; // // The normative semantic specification of the ONNX IR is found in docs/IR.md. // Definitions of the built-in neural network operators may be found in docs/Operators.md. +// Definitions of the built-in classical machine learning operators may be found in +// docs/Operators-ml.md. // Notes // @@ -106,7 +108,11 @@ enum Version { // IR VERSION 9 published on May 5, 2023 // Added AttributeProto to FunctionProto so that default attribute values can be set. 
// Added FLOAT8E4M3FN, FLOAT8E4M3FNUZ, FLOAT8E5M2, FLOAT8E5M2FNUZ. - IR_VERSION = 0x0000000000000009; + IR_VERSION_2023_5_5 = 0x0000000000000009; + + // IR VERSION 10 published on TBD + // Added UINT4, INT4. + IR_VERSION = 0x000000000000000A; } // Attributes @@ -190,6 +196,8 @@ message ValueInfoProto { optional TypeProto type = 2; // A human-readable documentation for this value. Markdown is allowed. optional string doc_string = 3; + // Named metadata values; keys should be distinct. + repeated StringStringEntryProto metadata_props = 4; } // Nodes @@ -211,12 +219,17 @@ message NodeProto { optional string op_type = 4; // namespace Operator // The domain of the OperatorSet that specifies the operator named by op_type. optional string domain = 7; // namespace Domain + // Overload identifier, used only to map this to a model-local function. + optional string overload = 8; // Additional named attributes. repeated AttributeProto attribute = 5; // A human-readable documentation for this node. Markdown is allowed. optional string doc_string = 6; + + // Named metadata values; keys should be distinct. + repeated StringStringEntryProto metadata_props = 9; } // Training information @@ -401,7 +414,7 @@ message ModelProto { // A list of function protos local to the model. // - // Name of the function "FunctionProto.name" should be unique within the domain "FunctionProto.domain". + // The (domain, name, overload) tuple must be unique across the function protos in this list. // In case of any conflicts the behavior (whether the model local functions are given higher priority, // or standard operator sets are given higher priotity or this is treated as error) is defined by // the runtimes. @@ -475,6 +488,9 @@ message GraphProto { // which means, tensor 'a_scale' and tensor 'a_zero_point' are scale and zero point of tensor 'a' in the model. repeated TensorAnnotation quantization_annotation = 14; + // Named metadata values; keys should be distinct. + repeated StringStringEntryProto metadata_props = 16; + reserved 3, 4, 6 to 9; reserved "ir_version", "producer_version", "producer_tag", "domain"; } @@ -520,7 +536,11 @@ message TensorProto { FLOAT8E4M3FN = 17; // float 8, mostly used for coefficients, supports nan, not inf FLOAT8E4M3FNUZ = 18; // float 8, mostly used for coefficients, supports nan, not inf, no negative zero FLOAT8E5M2 = 19; // follows IEEE 754, supports nan, inf, mostly used for gradients - FLOAT8E5M2FNUZ = 20; // follows IEEE 754, supports nan, inf, mostly used for gradients, no negative zero + FLOAT8E5M2FNUZ = 20; // follows IEEE 754, supports nan, not inf, mostly used for gradients, no negative zero + + // 4-bit data-types + UINT4 = 21; // Unsigned integer in range [0, 15] + INT4 = 22; // Signed integer in range [-8, 7], using two's-complement representation // Future extensions go here. } @@ -555,11 +575,13 @@ message TensorProto { // When this field is present, the data_type field MUST be FLOAT or COMPLEX64. repeated float float_data = 4 [packed = true]; - // For int32, uint8, int8, uint16, int16, bool, float8, and float16 values + // For int32, uint8, int8, uint16, int16, uint4, int4, bool, float8 and float16 values // float16 and float8 values must be bit-wise converted to an uint16_t prior // to writing to the buffer. + // uint4 and int4 values must be packed to 4bitx2 prior to writing to the buffer, the first element is stored in + // the 4 LSB and the second element is stored in the 4 MSB. 
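The 4bitx2 packing rule just described (first element in the 4 LSB, second in the 4 MSB, two's complement for INT4) is easy to get wrong, so a small numpy sketch of the layout may help; pack_int4 is an illustrative helper name, not code from this patch.

    import numpy as np

    def pack_int4(values):
        # Keep the low nibble of each value (two's complement for negatives),
        # then pair them up: element 0 -> 4 LSB, element 1 -> 4 MSB.
        v = np.asarray(values, dtype=np.int64) & 0xF
        if v.size % 2:
            v = np.append(v, 0)  # pad an odd-length tail with zero
        return (v[0::2] | (v[1::2] << 4)).astype(np.uint8)

    packed = pack_int4([-8, 7, 3, -1])
    assert list(packed) == [0x78, 0xF3]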
// When this field is present, the data_type field MUST be - // INT32, INT16, INT8, UINT16, UINT8, BOOL, FLOAT16, BFLOAT16, FLOAT8E4M3FN, FLOAT8E4M3FNUZ, FLOAT8E5M2, FLOAT8E5M2FNUZ + // INT32, INT16, INT8, INT4, UINT16, UINT8, UINT4, BOOL, FLOAT16, BFLOAT16, FLOAT8E4M3FN, FLOAT8E4M3FNUZ, FLOAT8E5M2, FLOAT8E5M2FNUZ repeated int32 int32_data = 5 [packed = true]; // For strings. @@ -589,6 +611,7 @@ message TensorProto { // Complex64 elements must be written as two consecutive FLOAT values, real component first. // Complex128 elements must be written as two consecutive DOUBLE values, real component first. // Boolean type MUST be written one byte per tensor element (00000001 for true, 00000000 for false). + // uint4 and int4 values must be packed to 4bitx2, the first element is stored in the 4 LSB and the second element is stored in the 4 MSB. // // Note: the advantage of specific field rather than the raw_data field is // that in some cases (e.g. int data), protobuf does a better packing via @@ -631,6 +654,9 @@ message TensorProto { // When this field is present, the data_type field MUST be // UINT32 or UINT64 repeated uint64 uint64_data = 11 [packed = true]; + + // Named metadata values; keys should be distinct. + repeated StringStringEntryProto metadata_props = 16; } // A serialized sparse-tensor value @@ -724,6 +750,17 @@ message TypeProto { } + message Opaque { + // When missing, the domain is the same as the model's. + optional string domain = 1; + // The name is optional but significant when provided. + optional string name = 2; + // parameters that help defining the type + // DEPRECATED do not use. + // repeated TypeProto parameters = 3; + } + + oneof value { // The type of a tensor. Tensor tensor_type = 1; @@ -746,6 +783,9 @@ message TypeProto { // Type of the sparse tensor SparseTensor sparse_tensor_type = 8; + + Opaque opaque_type = 7; + } // An optional denotation can be used to denote the whole @@ -777,9 +817,8 @@ enum OperatorStatus { } message FunctionProto { - // The name of the function, similar usage of op_type in OperatorProto. - // Combined with FunctionProto.domain, this forms the unique identity of - // the FunctionProto. + // The name of the function, similar to op_type in NodeProto. + // This is part of the unique-id (domain, name, overload) of FunctionProtos in a model. optional string name = 1; // Deprecated since IR Version 8 @@ -826,9 +865,22 @@ message FunctionProto { repeated OperatorSetIdProto opset_import = 9; - // The domain which this function belongs to. Combined with FunctionProto.name, this forms the unique identity of - // the FunctionProto. + // The domain which this function belongs to. + // This is part of the unique-id (domain, name, overload) of FunctionProtos in a model. optional string domain = 10; + + // The overload identifier of the function. + // This is part of the unique-id (domain, name, overload) of FunctionProtos in a model. + optional string overload = 13; + + // Information for the values in the function. The ValueInfoProto.name's + // must be distinct and refer to names in the function (including inputs, + // outputs, and intermediate values). It is optional for a value to appear + // in value_info list. + repeated ValueInfoProto value_info = 12; + + // Named metadata values; keys should be distinct. 
+ repeated StringStringEntryProto metadata_props = 14; } // For using protobuf-lite diff --git a/tools/pnnx/src/onnx-operators-ml.proto b/tools/pnnx/src/onnx-operators-ml.proto new file mode 100644 index 00000000000..de62706f5cb --- /dev/null +++ b/tools/pnnx/src/onnx-operators-ml.proto @@ -0,0 +1,136 @@ +// +// WARNING: This file is automatically generated! Please edit onnx.in.proto. +// + + +// Copyright (c) ONNX Project Contributors. +// Licensed under the Apache-2.0 license. + +syntax = "proto2"; + +package onnx; +import "onnx-ml.proto"; + +// +// This file contains the proto definitions for OperatorSetProto and +// OperatorProto. OperatorSetProtos are used to describe a versioned +// set of operators that can be used by a ModelProto. +// +// Like ModelProto, OperatorSetProto is defined as a top-level file/wire +// format, however their usage is different. +// +// ModelProto files are used to describe executable graphs that can be +// executed directly by a framework, runtime, or engine. +// +// OperatorSetProto files are used to describe a set of operators that are +// available in a given environment. The file TBD.TBD is the OperatorSetProto +// that describes the ONNX standard operators. +// + +// An OperatorProto represents the immutable specification of the signature +// and semantics of an operator. +// +// Operators are declared as part of an OperatorSet, which also defines the +// domain name for the set. +// +// Operators are uniquely identified by a three part identifier +// (domain, op_type, since_version) +// where +// *domain* is the domain of an operator set that +// contains this operator specification. +// +// *op_type* is the name of the operator as referenced by a +// NodeProto.op_type +// +// *since_version* is the version of the operator set that +// this operator was initially declared in. +// +message OperatorProto { + // The name of the operator within a domain. + // This field MUST be present in this version of the IR. + optional string op_type = 1; + + // The version of the operator set that first introduced this + // operator. This value MUST be the same value as the + // opset_version of the operator set that first published this operator. + // Subsequent versions of the operator set MUST NOT alter the signature + // or semantics of the operator once published as STABLE. + // This field MUST be present in this version of the IR. + optional int64 since_version = 2; + + // This field indicates whether the syntax, semantics, or presence + // of this operator is in an experimental or stable stage. Once an + // operator is published as STABLE, it's syntax and semantics MUST NOT + // change in subsequent versions of the operator set. + // When an operator is published as EXPERIMENTAL, the syntax and semantics + // of the operator MAY change across operator set versions. + // Operators "become" stable by deprecating the experimental version and + // introducing a new stable operator with the same op_type. + optional OperatorStatus status = 3; + + // Eventually we will declare the signature of the operator here + + // A human-readable documentation for this operator. Markdown is allowed. + optional string doc_string = 10; +} + +// An OperatorSetProto represents an immutable set of immutable operator +// specifications. +// +// The domain of the set (OperatorSetProto.domain) is a reverse-DNS name +// that disambiguates operator sets defined by independent entities. 
+// +// The version of the set (opset_version) is a monotonically increasing +// integer that indicates changes to the membership of the operator set. +// +// +// Operator sets are uniquely identified by a two part identifier (domain, opset_version) +// +// Like ModelProto, OperatorSetProto is intended as a top-level file/wire format, +// and thus has the standard format headers in addition to the operator set information. +// +message OperatorSetProto { + // All OperatorSetProtos start with a distingushed byte sequence to disambiguate + // protobuf files containing OperatorSets from other content. + // This field MUST be "ONNXOPSET" + // This field MUST be present in this version of the IR + optional string magic = 1; + + // All OperatorSetProtos indicate the version of the IR syntax and semantics + // they adhere to. It is always IR_VERSION. + // This field MUST be present in this version of the IR + optional int64 ir_version = 2; + + // The prerelease component of the SemVer of the IR. + // This field MAY be absent in this version of the IR + optional string ir_version_prerelease = 3; + + // The build metadata component of the SemVer of the IR. + // This field MAY be absent in this version of the IR + optional string ir_build_metadata = 7; + + // Domain name of the operator set, in reverse DNS form (e.g., com.acme.dnnops). + optional string domain = 4; + + // The version of the set of operators. This is a simple int value + // that is monotonically increasing as new versions of the operator set + // are published. All operators in this set MUST have since_version + // <= opset_version. + optional int64 opset_version = 5; + + // A human-readable documentation for this set of operators. Markdown is allowed. + optional string doc_string = 6; + + // The operators specified by this operator set. + // The (name, version) MUST be unique across all OperatorProtos in operator + repeated OperatorProto operator = 8; + + // The functions specified by this operator set. 
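For orientation, the two-part (domain, opset_version) identifier described above is the same pair a ModelProto records in its opset_import list to pin the operator sets it depends on. A hedged sketch with the onnx Python helpers; the domain string and version numbers are arbitrary examples.

    import onnx
    from onnx import helper

    # An empty graph is enough to show how a model references operator sets.
    graph = helper.make_graph(nodes=[], name="empty", inputs=[], outputs=[])
    model = helper.make_model(graph, opset_imports=[
        helper.make_opsetid("", 19),                # default ONNX domain
        helper.make_opsetid("com.acme.dnnops", 1),  # custom reverse-DNS domain
    ])
    print(model.opset_import)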
+ // The (name, version) MUST be unique across all OperatorProtos/FunctionProtos in operator/functions + repeated FunctionProto functions = 9; +} + + +// For using protobuf-lite +option optimize_for = LITE_RUNTIME; + diff --git a/tools/pnnx/src/pass_level5/fuse_multiheadattention.cpp b/tools/pnnx/src/pass_level5/fuse_multiheadattention.cpp index 2a9f3b837b1..b6297eb8a92 100644 --- a/tools/pnnx/src/pass_level5/fuse_multiheadattention.cpp +++ b/tools/pnnx/src/pass_level5/fuse_multiheadattention.cpp @@ -1734,6 +1734,64 @@ pnnx.Output output 1 0 out } }; +class fuse_multiheadattention_pass_onnx_1_2 : public fuse_multiheadattention_pass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +21 20 +pnnx.Input input_q 0 1 input +nn.Linear op_0 1 1 input 14 bias=%qkvbias in_features=%embed_dim out_features=%qkv_out_features @bias @weight +Tensor.reshape op_1 1 1 14 15 shape=(%batch,%size,1,3,%embed_dim) +Tensor.permute op_2 1 1 15 16 dims=(3,1,2,0,4) +torch.squeeze op_3 1 1 16 17 dim=3 +torch.unbind op_4 1 3 17 18 19 20 dim=0 +Tensor.reshape op_5 1 1 18 21 shape=(%size,%num_heads,%feat_per_head) +Tensor.reshape op_6 1 1 19 23 shape=(%size,%num_heads,%feat_per_head) +Tensor.reshape op_7 1 1 20 25 shape=(%size,%num_heads,%feat_per_head) +Tensor.permute op_8 1 1 21 22 dims=(1,0,2) +Tensor.permute op_9 1 1 23 24 dims=(1,0,2) +Tensor.permute op_10 1 1 25 26 dims=(1,0,2) +Tensor.reshape op_11 1 1 22 27 shape=(%batch,%num_heads,%size,%feat_per_head) +Tensor.reshape op_12 1 1 24 28 shape=(%batch,%num_heads,%size,%feat_per_head) +Tensor.reshape op_13 1 1 26 29 shape=(%batch,%num_heads,%size,%feat_per_head) +F.scaled_dot_product_attention op_14 3 1 27 28 29 35 dropout_p=0.000000e+00 is_causal=False +Tensor.permute op_15 1 1 35 36 dims=(2,0,1,3) +Tensor.reshape op_16 1 1 36 37 shape=(%size,%embed_dim) +nn.Linear out_proj 1 1 37 38 bias=%outbias in_features=%embed_dim out_features=%embed_dim @bias @weight +Tensor.reshape op_18 1 1 38 out shape=(%size,%batch,%embed_dim) +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* replace_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +nn.MultiheadAttention attention 1 1 input out embed_dim=%embed_dim kdim=%embed_dim vdim=%embed_dim num_heads=%num_heads batch_first=False add_zero_attn=False add_bias_kv=False +pnnx.Output output 1 0 out +)PNNXIR"; + } + + bool match(const std::map<std::string, const Operator*>& matched_operators, const std::map<std::string, Parameter>& captured_params, const std::map<std::string, Attribute>& /*captured_attrs*/) const + { + const int embed_dim = captured_params.at("embed_dim").i; + const int qkv_out_features = captured_params.at("qkv_out_features").i; + const int num_heads = captured_params.at("num_heads").i; + const int feat_per_head = captured_params.at("feat_per_head").i; + + if (qkv_out_features != embed_dim * 3) + return false; + + if (embed_dim != num_heads * feat_per_head) + return false; + + return true; + } +}; + class fuse_multiheadattention_pass_onnx_2 : public fuse_multiheadattention_pass { public: @@ -2048,6 +2106,7 @@ void fuse_multiheadattention(Graph& graph) fuse_multiheadattention_pass_onnx onnx0; fuse_multiheadattention_pass_onnx_1 onnx1; fuse_multiheadattention_pass_onnx_1_1 onnx1a; + fuse_multiheadattention_pass_onnx_1_2 onnx1b; fuse_multiheadattention_pass_onnx_2 onnx2; fuse_multiheadattention_pass_onnx_3 onnx3; fuse_multiheadattention_pass_onnx_4 onnx4; @@ -2087,6 +2146,7 @@ pnnx_graph_rewrite(graph, &onnx0, opindex); pnnx_graph_rewrite(graph, &onnx1, opindex);
pnnx_graph_rewrite(graph, &onnx1a, opindex); + pnnx_graph_rewrite(graph, &onnx1b, opindex); pnnx_graph_rewrite(graph, &onnx2, opindex); pnnx_graph_rewrite(graph, &onnx3, opindex); pnnx_graph_rewrite(graph, &onnx4, opindex); diff --git a/tools/pnnx/src/pass_onnx.cpp b/tools/pnnx/src/pass_onnx.cpp index dd9194111fc..6318dacba25 100644 --- a/tools/pnnx/src/pass_onnx.cpp +++ b/tools/pnnx/src/pass_onnx.cpp @@ -14,7 +14,7 @@ #include "pass_onnx.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" #include #include diff --git a/tools/pnnx/src/pass_onnx/canonicalize.h b/tools/pnnx/src/pass_onnx/canonicalize.h index a24ad86a9fd..6ec55f2d140 100644 --- a/tools/pnnx/src/pass_onnx/canonicalize.h +++ b/tools/pnnx/src/pass_onnx/canonicalize.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/dead_code_elimination.h b/tools/pnnx/src/pass_onnx/dead_code_elimination.h index b890b6a7d7c..7d8b7e0d25d 100644 --- a/tools/pnnx/src/pass_onnx/dead_code_elimination.h +++ b/tools/pnnx/src/pass_onnx/dead_code_elimination.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/eliminate_noop.h b/tools/pnnx/src/pass_onnx/eliminate_noop.h index e465e398c0a..3325ae9cf10 100644 --- a/tools/pnnx/src/pass_onnx/eliminate_noop.h +++ b/tools/pnnx/src/pass_onnx/eliminate_noop.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/fold_constants.h b/tools/pnnx/src/pass_onnx/fold_constants.h index 98d6ef717ab..f165a96e177 100644 --- a/tools/pnnx/src/pass_onnx/fold_constants.h +++ b/tools/pnnx/src/pass_onnx/fold_constants.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.h b/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.h index ad6cf80007c..a90c089fee6 100644 --- a/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.h +++ b/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/inline_containers.h b/tools/pnnx/src/pass_onnx/inline_containers.h index 56b21f47b37..e3051c5e333 100644 --- a/tools/pnnx/src/pass_onnx/inline_containers.h +++ b/tools/pnnx/src/pass_onnx/inline_containers.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/inline_if_graph.h b/tools/pnnx/src/pass_onnx/inline_if_graph.h index c84b5761ac5..e9c1c2f0ee8 100644 --- a/tools/pnnx/src/pass_onnx/inline_if_graph.h +++ b/tools/pnnx/src/pass_onnx/inline_if_graph.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/model_stat.h b/tools/pnnx/src/pass_onnx/model_stat.h index dd62e67a1bc..993630b1b4b 100644 --- a/tools/pnnx/src/pass_onnx/model_stat.h +++ b/tools/pnnx/src/pass_onnx/model_stat.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_AdaptiveAvgPool2d.cpp b/tools/pnnx/src/pass_onnx/nn_AdaptiveAvgPool2d.cpp index 0e8851f05f2..21cf6076d2d 100644 --- a/tools/pnnx/src/pass_onnx/nn_AdaptiveAvgPool2d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_AdaptiveAvgPool2d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_AdaptiveAvgPool3d.cpp b/tools/pnnx/src/pass_onnx/nn_AdaptiveAvgPool3d.cpp index 070981e1d64..a8e3e96be6b 100644 --- a/tools/pnnx/src/pass_onnx/nn_AdaptiveAvgPool3d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_AdaptiveAvgPool3d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_AvgPool2d.cpp b/tools/pnnx/src/pass_onnx/nn_AvgPool2d.cpp index 5a006fe3709..6f5be930e64 100644 --- a/tools/pnnx/src/pass_onnx/nn_AvgPool2d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_AvgPool2d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_AvgPool3d.cpp b/tools/pnnx/src/pass_onnx/nn_AvgPool3d.cpp index ff2a5dd8aad..9fdcfdd72d6 100644 --- a/tools/pnnx/src/pass_onnx/nn_AvgPool3d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_AvgPool3d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_BatchNorm2d.cpp b/tools/pnnx/src/pass_onnx/nn_BatchNorm2d.cpp index c3639904d47..96448c0f25c 100644 --- a/tools/pnnx/src/pass_onnx/nn_BatchNorm2d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_BatchNorm2d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_BatchNorm3d.cpp b/tools/pnnx/src/pass_onnx/nn_BatchNorm3d.cpp index 0f9405f160a..afac686a22a 100644 --- a/tools/pnnx/src/pass_onnx/nn_BatchNorm3d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_BatchNorm3d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_Conv2d.cpp b/tools/pnnx/src/pass_onnx/nn_Conv2d.cpp index c9aeac561ac..2cd6b7dd750 100644 --- a/tools/pnnx/src/pass_onnx/nn_Conv2d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_Conv2d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git 
a/tools/pnnx/src/pass_onnx/nn_Conv3d.cpp b/tools/pnnx/src/pass_onnx/nn_Conv3d.cpp index 6413685fcb5..f90c23cbb6a 100644 --- a/tools/pnnx/src/pass_onnx/nn_Conv3d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_Conv3d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_GELU.cpp b/tools/pnnx/src/pass_onnx/nn_GELU.cpp index f5b7000e017..22d2823673a 100644 --- a/tools/pnnx/src/pass_onnx/nn_GELU.cpp +++ b/tools/pnnx/src/pass_onnx/nn_GELU.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_LayerNorm.cpp b/tools/pnnx/src/pass_onnx/nn_LayerNorm.cpp index f4ecf289557..fece12e2bce 100644 --- a/tools/pnnx/src/pass_onnx/nn_LayerNorm.cpp +++ b/tools/pnnx/src/pass_onnx/nn_LayerNorm.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_Linear.cpp b/tools/pnnx/src/pass_onnx/nn_Linear.cpp index 4dce81908b2..0515a8ea454 100644 --- a/tools/pnnx/src/pass_onnx/nn_Linear.cpp +++ b/tools/pnnx/src/pass_onnx/nn_Linear.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_MaxPool2d.cpp b/tools/pnnx/src/pass_onnx/nn_MaxPool2d.cpp index 47924bd33fc..518abd434b0 100644 --- a/tools/pnnx/src/pass_onnx/nn_MaxPool2d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_MaxPool2d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_MaxPool3d.cpp b/tools/pnnx/src/pass_onnx/nn_MaxPool3d.cpp index c8c467f5ba2..04de8bd104a 100644 --- a/tools/pnnx/src/pass_onnx/nn_MaxPool3d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_MaxPool3d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_MultiheadAttention.cpp b/tools/pnnx/src/pass_onnx/nn_MultiheadAttention.cpp index a29ec9d9306..df1bd092273 100644 --- a/tools/pnnx/src/pass_onnx/nn_MultiheadAttention.cpp +++ b/tools/pnnx/src/pass_onnx/nn_MultiheadAttention.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/shape_inference.h b/tools/pnnx/src/pass_onnx/shape_inference.h index b4cd657bb81..b484d5265ca 100644 --- a/tools/pnnx/src/pass_onnx/shape_inference.h +++ b/tools/pnnx/src/pass_onnx/shape_inference.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/save_onnx.cpp b/tools/pnnx/src/save_onnx.cpp index 3406c730b2d..3ef3a772a2f 100644 --- a/tools/pnnx/src/save_onnx.cpp +++ b/tools/pnnx/src/save_onnx.cpp @@ -14,7 +14,7 @@ #include "save_onnx.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" #include #include diff --git a/tools/pnnx/tests/onnx/test_nn_ReLU.py b/tools/pnnx/tests/onnx/test_nn_ReLU.py index d381fb5bc0e..8230e3f4827 100644 --- a/tools/pnnx/tests/onnx/test_nn_ReLU.py +++ b/tools/pnnx/tests/onnx/test_nn_ReLU.py @@ -61,7 +61,7 @@ def test(): if not torch.allclose(a0, b0, 1e-4, 1e-4): return False - if version.parse(torch.__version__) < version.parse('2.4'): + if version.parse(torch.__version__) < version.parse('2.5'): return True # export dynamo onnx diff --git a/tools/pnnx/tests/onnx/test_squeezenet1_1.py b/tools/pnnx/tests/onnx/test_squeezenet1_1.py index f5f5f4a668a..28c7df8fb81 100644 --- a/tools/pnnx/tests/onnx/test_squeezenet1_1.py +++ b/tools/pnnx/tests/onnx/test_squeezenet1_1.py @@ -39,7 +39,7 @@ def test(): if not torch.allclose(a, b, 1e-4, 1e-4): return False - if version.parse(torch.__version__) < version.parse('2.4'): + if version.parse(torch.__version__) < version.parse('2.5'): return True # export dynamo onnx diff --git a/tools/pnnx/tests/onnx/test_swin_t.py b/tools/pnnx/tests/onnx/test_swin_t.py index be25520d0bc..6361d20c911 100644 --- a/tools/pnnx/tests/onnx/test_swin_t.py +++ b/tools/pnnx/tests/onnx/test_swin_t.py @@ -43,7 +43,7 @@ def test(): if not torch.allclose(a, b, 1e-4, 1e-4): return False - if version.parse(torch.__version__) < version.parse('2.4'): + if version.parse(torch.__version__) < version.parse('2.5'): return True # export dynamo onnx diff --git a/tools/pnnx/tests/onnx/test_vit_b_32.py b/tools/pnnx/tests/onnx/test_vit_b_32.py index ecb0bd350f6..3c92a119406 100644 --- a/tools/pnnx/tests/onnx/test_vit_b_32.py +++ b/tools/pnnx/tests/onnx/test_vit_b_32.py @@ -46,7 +46,7 @@ def test(): if not torch.allclose(a, b, 1e-4, 1e-4): return False - if version.parse(torch.__version__) < version.parse('2.4'): + if version.parse(torch.__version__) < version.parse('2.5'): return True # export dynamo onnx From f3cd4c2e917ad264f73c600af9e5c6801af08608 Mon Sep 17 00:00:00 2001 From: nihui Date: Mon, 12 Aug 2024 15:53:19 +0800 Subject: [PATCH 16/38] pnnx2ncnn handle F.maxpool without dilation param (#5622) --- tools/pnnx/src/pass_ncnn/F_max_pool1d.cpp | 16 ++++++++++++++++ tools/pnnx/src/pass_ncnn/F_max_pool2d.cpp | 16 ++++++++++++++++ tools/pnnx/src/pass_ncnn/F_max_pool3d.cpp | 16 ++++++++++++++++ 3 files changed, 48 insertions(+) diff --git a/tools/pnnx/src/pass_ncnn/F_max_pool1d.cpp b/tools/pnnx/src/pass_ncnn/F_max_pool1d.cpp index 1d9ca98e03d..aaef7db2d74 100644 --- a/tools/pnnx/src/pass_ncnn/F_max_pool1d.cpp +++ b/tools/pnnx/src/pass_ncnn/F_max_pool1d.cpp @@ -63,6 +63,22 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_max_pool1d, 20) +class F_max_pool1d_1 : public F_max_pool1d +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +F.max_pool1d op_0 1 1 input out kernel_size=%kernel_size stride=%stride padding=%padding ceil_mode=%ceil_mode return_indices=False +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_max_pool1d_1, 20) + } // namespace ncnn } // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/F_max_pool2d.cpp b/tools/pnnx/src/pass_ncnn/F_max_pool2d.cpp index 
ba5a52f4f7d..3519c8a022b 100644 --- a/tools/pnnx/src/pass_ncnn/F_max_pool2d.cpp +++ b/tools/pnnx/src/pass_ncnn/F_max_pool2d.cpp @@ -66,6 +66,22 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_max_pool2d, 20) +class F_max_pool2d_1 : public F_max_pool2d +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +F.max_pool2d op_0 1 1 input out kernel_size=%kernel_size stride=%stride padding=%padding ceil_mode=%ceil_mode return_indices=False +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_max_pool2d_1, 20) + } // namespace ncnn } // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/F_max_pool3d.cpp b/tools/pnnx/src/pass_ncnn/F_max_pool3d.cpp index 5476907fa88..2caede16a29 100644 --- a/tools/pnnx/src/pass_ncnn/F_max_pool3d.cpp +++ b/tools/pnnx/src/pass_ncnn/F_max_pool3d.cpp @@ -69,6 +69,22 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_max_pool3d, 20) +class F_max_pool3d_1 : public F_max_pool3d +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +F.max_pool3d op_0 1 1 input out kernel_size=%kernel_size stride=%stride padding=%padding ceil_mode=%ceil_mode return_indices=False +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_max_pool3d_1, 20) + } // namespace ncnn } // namespace pnnx From ecfd88a11bdf6480c0496564c6392463997429fc Mon Sep 17 00:00:00 2001 From: nihui Date: Mon, 12 Aug 2024 19:33:39 +0800 Subject: [PATCH 17/38] pnnx2ncnn convert torch.roll with one or two shifts (#5623) --- tools/pnnx/src/CMakeLists.txt | 1 + tools/pnnx/src/pass_ncnn/torch_roll.cpp | 193 +++++++++++++++++++++++ tools/pnnx/tests/CMakeLists.txt | 1 + tools/pnnx/tests/ncnn/CMakeLists.txt | 1 + tools/pnnx/tests/ncnn/test_torch_roll.py | 64 ++++++++ tools/pnnx/tests/onnx/CMakeLists.txt | 1 + tools/pnnx/tests/onnx/test_torch_roll.py | 64 ++++++++ tools/pnnx/tests/test_torch_roll.py | 61 +++++++ 8 files changed, 386 insertions(+) create mode 100644 tools/pnnx/src/pass_ncnn/torch_roll.cpp create mode 100644 tools/pnnx/tests/ncnn/test_torch_roll.py create mode 100644 tools/pnnx/tests/onnx/test_torch_roll.py create mode 100644 tools/pnnx/tests/test_torch_roll.py diff --git a/tools/pnnx/src/CMakeLists.txt b/tools/pnnx/src/CMakeLists.txt index 27dfdef52f8..c5c6228dee7 100644 --- a/tools/pnnx/src/CMakeLists.txt +++ b/tools/pnnx/src/CMakeLists.txt @@ -572,6 +572,7 @@ set(pnnx_pass_ncnn_SRCS pass_ncnn/torch_mm.cpp pass_ncnn/torch_norm.cpp pass_ncnn/torch_prod.cpp + pass_ncnn/torch_roll.cpp pass_ncnn/torch_slice_scatter.cpp pass_ncnn/torch_squeeze.cpp pass_ncnn/torch_sum.cpp diff --git a/tools/pnnx/src/pass_ncnn/torch_roll.cpp b/tools/pnnx/src/pass_ncnn/torch_roll.cpp new file mode 100644 index 00000000000..c7c29593333 --- /dev/null +++ b/tools/pnnx/src/pass_ncnn/torch_roll.cpp @@ -0,0 +1,193 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "pass_ncnn.h" + +namespace pnnx { + +namespace ncnn { + +class torch_roll : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +torch.roll op_0 1 1 input out dims=%dims shifts=%shifts +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* replace_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +Slice slice 1 2 input a b +Concat concat 2 1 b a out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + bool match(const std::map<std::string, Parameter>& captured_params) const + { + if (captured_params.at("dims").type != 5) + return false; + + if (captured_params.at("dims").ai.size() != 1) + return false; + + if (captured_params.at("shifts").type != 5) + return false; + + if (captured_params.at("shifts").ai.size() != 1) + return false; + + return true; + } + + void write(const std::map<std::string, Operator*>& ops, const std::map<std::string, Parameter>& captured_params, const std::map<std::string, Attribute>& captured_attrs) const + { + GraphRewriterPass::write(ops, captured_params, captured_attrs); + + const Operand* in = ops.at("slice")->inputs[0]; + + const int batch_index = in->params.at("__batch_index").i; + + int axis = captured_params.at("dims").ai[0]; + if (axis == batch_index) + { + fprintf(stderr, "roll along batch axis %d is not supported\n", batch_index); + } + + if (axis < 0) + { + int input_rank = in->shape.size(); + axis = input_rank + axis; + } + + if (axis > batch_index) + axis -= 1; + + ops.at("slice")->params["1"] = axis; + + ops.at("concat")->params["0"] = axis; + + const int shift = captured_params.at("shifts").ai[0]; + ops.at("slice")->params["2"] = std::vector<int>{-shift}; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(torch_roll, 20) + +class torch_roll_1 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +torch.roll op_0 1 1 input out dims=%dims shifts=%shifts +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* replace_pattern_graph() const + { + return R"PNNXIR(7767517 +8 7 +pnnx.Input input 0 1 input +Slice slice 1 2 input a b +Slice slice_a 1 2 a a0 a1 +Slice slice_b 1 2 b b0 b1 +Concat concat_a 2 1 a1 a0 a10 +Concat concat_b 2 1 b1 b0 b10 +Concat concat 2 1 b10 a10 out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + bool match(const std::map<std::string, Parameter>& captured_params) const + { + if (captured_params.at("dims").type != 5) + return false; + + if (captured_params.at("dims").ai.size() != 2) + return false; + + if (captured_params.at("shifts").type != 5) + return false; + + if (captured_params.at("shifts").ai.size() != 2) + return false; + + return true; + } + + void write(const std::map<std::string, Operator*>& ops, const std::map<std::string, Parameter>& captured_params, const std::map<std::string, Attribute>& captured_attrs) const + { + GraphRewriterPass::write(ops, captured_params, captured_attrs); + + const Operand* in = ops.at("slice")->inputs[0]; + + const int batch_index = in->params.at("__batch_index").i; + + int axis0 = captured_params.at("dims").ai[0]; + int axis1 = captured_params.at("dims").ai[1]; + if (axis0 == batch_index || axis1 == batch_index) + { + fprintf(stderr, "roll along
batch axis %d is not supported\n", batch_index); + } + + if (axis0 < 0) + { + int input_rank = in->shape.size(); + axis0 = input_rank + axis0; + } + + if (axis0 > batch_index) + axis0 -= 1; + + if (axis1 < 0) + { + int input_rank = in->shape.size(); + axis1 = input_rank + axis1; + } + if (axis1 > batch_index) + axis1 -= 1; + + ops.at("slice")->params["1"] = axis0; + ops.at("slice_a")->params["1"] = axis1; + ops.at("slice_b")->params["1"] = axis1; + + ops.at("concat_a")->params["0"] = axis1; + ops.at("concat_b")->params["0"] = axis1; + ops.at("concat")->params["0"] = axis0; + + const int shift0 = captured_params.at("shifts").ai[0]; + const int shift1 = captured_params.at("shifts").ai[1]; + ops.at("slice")->params["2"] = std::vector<int>{-shift0}; + ops.at("slice_a")->params["2"] = std::vector<int>{-shift1}; + ops.at("slice_b")->params["2"] = std::vector<int>{-shift1}; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(torch_roll_1, 20) + +} // namespace ncnn + +} // namespace pnnx diff --git a/tools/pnnx/tests/CMakeLists.txt b/tools/pnnx/tests/CMakeLists.txt index 7bbf1c6ea9c..a5522a70bb2 100644 --- a/tools/pnnx/tests/CMakeLists.txt +++ b/tools/pnnx/tests/CMakeLists.txt @@ -234,6 +234,7 @@ pnnx_add_test(torch_ones_like) pnnx_add_test(torch_positive) pnnx_add_test(torch_prod) pnnx_add_test(torch_repeat_interleave) +pnnx_add_test(torch_roll) pnnx_add_test(torch_scatter_add) pnnx_add_test(torch_slice_scatter) pnnx_add_test(torch_sum) diff --git a/tools/pnnx/tests/ncnn/CMakeLists.txt b/tools/pnnx/tests/ncnn/CMakeLists.txt index a682e42835b..a60e63eb54b 100644 --- a/tools/pnnx/tests/ncnn/CMakeLists.txt +++ b/tools/pnnx/tests/ncnn/CMakeLists.txt @@ -162,6 +162,7 @@ pnnx_ncnn_add_test(torch_min) pnnx_ncnn_add_test(torch_mm) pnnx_ncnn_add_test(torch_norm) pnnx_ncnn_add_test(torch_prod) +pnnx_ncnn_add_test(torch_roll) pnnx_ncnn_add_test(torch_slice_scatter) pnnx_ncnn_add_test(torch_sum) pnnx_ncnn_add_test(torch_squeeze) diff --git a/tools/pnnx/tests/ncnn/test_torch_roll.py b/tools/pnnx/tests/ncnn/test_torch_roll.py new file mode 100644 index 00000000000..6412ee6ba60 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_roll.py @@ -0,0 +1,64 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License.
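The two torch_roll passes above rewrite torch.roll as an ncnn Slice that splits the tensor at -shift, followed by a Concat of the two pieces in swapped order, applied once per rolled axis. The equivalence is easy to check in PyTorch; this is a verification sketch, not part of the patch, and roll_as_slice_concat is an illustrative name.

    import torch

    def roll_as_slice_concat(x, shift, dim):
        # Split x along dim at n - shift, then concatenate the two pieces in
        # swapped order; this mirrors the Slice(ends=[-shift]) + Concat rewrite.
        n = x.size(dim)
        shift = shift % n
        a = x.narrow(dim, 0, n - shift)
        b = x.narrow(dim, n - shift, shift)
        return torch.cat([b, a], dim=dim)

    x = torch.rand(5, 9, 11)
    assert torch.equal(roll_as_slice_concat(x, 3, 1), torch.roll(x, 3, 1))
    assert torch.equal(roll_as_slice_concat(x, -2, -1), torch.roll(x, -2, -1))

    # A two-axis roll, as handled by torch_roll_1, is just one roll per axis.
    z = torch.rand(8, 5, 9, 10)
    rolled = roll_as_slice_concat(roll_as_slice_concat(z, 2, 0), 1, 1)
    assert torch.equal(rolled, torch.roll(z, shifts=(2, 1), dims=(0, 1)))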
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.roll(x, 3, 1) + y = torch.roll(y, -2, -1) + z = torch.roll(z, shifts=(2,1), dims=(0,1)) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(5, 9, 11) + z = torch.rand(8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_roll.pt") + + # torchscript to ncnn + import os + os.system("../../src/pnnx test_torch_roll.pt inputshape=[3,16],[5,9,11],[8,5,9,10]") + + # ncnn inference + import test_torch_roll_ncnn + b = test_torch_roll_ncnn.test_inference() + + print(x) + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + print(a0) + print(b0) + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/CMakeLists.txt b/tools/pnnx/tests/onnx/CMakeLists.txt index f4756740a79..673fa0434d9 100644 --- a/tools/pnnx/tests/onnx/CMakeLists.txt +++ b/tools/pnnx/tests/onnx/CMakeLists.txt @@ -145,6 +145,7 @@ pnnx_onnx_add_test(torch_mean) pnnx_onnx_add_test(torch_min) pnnx_onnx_add_test(torch_minimum) pnnx_onnx_add_test(torch_prod) +pnnx_onnx_add_test(torch_roll) pnnx_onnx_add_test(torch_split) pnnx_onnx_add_test(torch_squeeze) pnnx_onnx_add_test(torch_stack) diff --git a/tools/pnnx/tests/onnx/test_torch_roll.py b/tools/pnnx/tests/onnx/test_torch_roll.py new file mode 100644 index 00000000000..06b8d579649 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_roll.py @@ -0,0 +1,64 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.roll(x, 3, -1) + y = torch.roll(y, -2, -1) + z = torch.roll(z, shifts=(2,1), dims=(0,1)) + return x, y, z + +def test(): + if version.parse(torch.__version__) < version.parse('1.10'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_roll.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_roll.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_roll_pnnx + b = test_torch_roll_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_roll.py b/tools/pnnx/tests/test_torch_roll.py new file mode 100644 index 00000000000..32e3bde38e1 --- /dev/null +++ b/tools/pnnx/tests/test_torch_roll.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.roll(x, 3) + y = torch.roll(y, -2, -1) + z = torch.roll(z, shifts=(2,1), dims=(0,1)) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_roll.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_roll.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_roll_pnnx + b = test_torch_roll_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) From ae17e5e177d92138c30daf8bf0b0f3345df49d2f Mon Sep 17 00:00:00 2001 From: nihui Date: Wed, 14 Aug 2024 10:43:36 +0800 Subject: [PATCH 18/38] ci release ubuntu2404, major release yml refactor (#5624) * release ubuntu 24.04 package, major release yml refactor * update macos vulkan sdk * set MACOSX_DEPLOYMENT_TARGET --- .github/workflows/release-python.yml | 18 +- .github/workflows/release.yml | 1762 ++++++-------------------- 2 files changed, 420 insertions(+), 1360 deletions(-) diff --git a/.github/workflows/release-python.yml b/.github/workflows/release-python.yml index 82bd5551fcd..6b6db4f0d2e 100644 --- a/.github/workflows/release-python.yml +++ b/.github/workflows/release-python.yml @@ -184,9 +184,9 @@ jobs: - name: vulkansdk for macos if: matrix.os == 'macos-13' run: | - wget https://sdk.lunarg.com/sdk/download/1.3.236.0/mac/vulkansdk-macos-1.3.236.0.dmg?Human=true -O vulkansdk-macos-1.3.236.0.dmg - hdiutil attach vulkansdk-macos-1.3.236.0.dmg - sudo /Volumes/vulkansdk-macos-1.3.236.0/InstallVulkan.app/Contents/MacOS/InstallVulkan --root $GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0 --accept-licenses --default-answer --confirm-command install + wget https://sdk.lunarg.com/sdk/download/1.3.290.0/mac/vulkansdk-macos-1.3.290.0.dmg?Human=true -O vulkansdk-macos-1.3.290.0.dmg + hdiutil attach vulkansdk-macos-1.3.290.0.dmg + sudo /Volumes/vulkansdk-macos-1.3.290.0/InstallVulkan.app/Contents/MacOS/InstallVulkan --root $GITHUB_WORKSPACE/vulkansdk-macos-1.3.290.0 --accept-licenses --default-answer --confirm-command install - name: Build wheels for macos x86_64 if: matrix.os == 'macos-13' && matrix.arch == 'x86_64' @@ -197,11 +197,12 @@ jobs: CIBW_BUILD_VERBOSITY: 1 CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=3 CMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/toolchains/ios.toolchain.cmake PLATFORM=MAC ARCHS="x86_64" - DEPLOYMENT_TARGET="10.9" ENABLE_BITCODE=OFF ENABLE_ARC=OFF ENABLE_VISIBILITY=OFF + DEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET ENABLE_BITCODE=OFF ENABLE_ARC=OFF ENABLE_VISIBILITY=OFF OpenMP_C_FLAGS="-Xclang -fopenmp" OpenMP_CXX_FLAGS="-Xclang -fopenmp" OpenMP_C_LIB_NAMES="libomp" OpenMP_CXX_LIB_NAMES="libomp" OpenMP_libomp_LIBRARY="libomp.a" - Vulkan_LIBRARY=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/dylib/macOS/libMoltenVK.dylib + Vulkan_LIBRARY=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.290.0/macOS/lib/libMoltenVK.dylib + MACOSX_DEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET with: output-dir: wheelhouse @@ -214,11 +215,12 @@ jobs: CIBW_BUILD_VERBOSITY: 1 CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=3 
CMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/toolchains/ios.toolchain.cmake PLATFORM=MAC_ARM64 ARCHS="arm64" - DEPLOYMENT_TARGET="11.0" ENABLE_BITCODE=OFF ENABLE_ARC=OFF ENABLE_VISIBILITY=OFF + DEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET ENABLE_BITCODE=OFF ENABLE_ARC=OFF ENABLE_VISIBILITY=OFF OpenMP_C_FLAGS="-Xclang -fopenmp" OpenMP_CXX_FLAGS="-Xclang -fopenmp" OpenMP_C_LIB_NAMES="libomp" OpenMP_CXX_LIB_NAMES="libomp" OpenMP_libomp_LIBRARY="libomp.a" - Vulkan_LIBRARY=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/dylib/macOS/libMoltenVK.dylib + Vulkan_LIBRARY=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.290.0/macOS/lib/libMoltenVK.dylib + MACOSX_DEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET with: output-dir: wheelhouse @@ -244,7 +246,7 @@ jobs: fail-fast: false matrix: arch: [aarch64, ppc64le, s390x] - build_cp: [cp36, cp37, cp38, cp39, cp310, cp311, cp312] + build_cp: [cp36, cp37, cp38, cp39, cp310, cp311, cp312, cp313] build_sub: [manylinux, musllinux] steps: diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 6309214e08f..2e875fc51e7 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -53,11 +53,20 @@ jobs: name: ${{ env.PACKAGENAME }} path: /tmp/${{ env.PACKAGENAME }}.zip - ubuntu-2004: + ubuntu: needs: [setup] - runs-on: ubuntu-20.04 + strategy: + matrix: + opt: + - { shared-lib: OFF, os: ubuntu-20.04, id: ubuntu-2004 } + - { shared-lib: OFF, os: ubuntu-22.04, id: ubuntu-2204 } + - { shared-lib: OFF, os: ubuntu-24.04, id: ubuntu-2404 } + - { shared-lib: ON, os: ubuntu-20.04, id: ubuntu-2004-shared } + - { shared-lib: ON, os: ubuntu-22.04, id: ubuntu-2204-shared } + - { shared-lib: ON, os: ubuntu-24.04, id: ubuntu-2404-shared } + runs-on: ${{ matrix.opt.os }} env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ubuntu-2004 + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} steps: - uses: actions/checkout@v4 with: @@ -69,71 +78,7 @@ jobs: run: | mkdir build && cd build cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j 2 - cmake --build . --target install/strip - - name: package - run: | - rm -rf ${{ env.PACKAGENAME }} - mkdir -p ${{ env.PACKAGENAME }} - cp -a build/install/* ${{ env.PACKAGENAME }} - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - ubuntu-2004-shared: - needs: [setup] - runs-on: ubuntu-20.04 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ubuntu-2004-shared - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: apt - run: | - sudo apt-get install -y libprotobuf-dev protobuf-compiler - - name: build - run: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . -j $(nproc) - cmake --build . 
--target install/strip - - name: package - run: | - rm -rf ${{ env.PACKAGENAME }} - mkdir -p ${{ env.PACKAGENAME }} - cp -a -P build/install/* ${{ env.PACKAGENAME }} - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - ubuntu-2204: - needs: [setup] - runs-on: ubuntu-22.04 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ubuntu-2204 - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: apt - run: | - sudo apt-get install -y libprotobuf-dev protobuf-compiler - - name: build - run: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. + -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=${{ matrix.opt.shared-lib }} .. cmake --build . -j $(nproc) cmake --build . --target install/strip - name: package @@ -149,38 +94,6 @@ jobs: name: ${{ env.PACKAGENAME }} path: ${{ env.PACKAGENAME }}.zip - ubuntu-2204-shared: - needs: [setup] - runs-on: ubuntu-22.04 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ubuntu-2204-shared - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: apt - run: | - sudo apt-get install -y libprotobuf-dev protobuf-compiler - - name: build - run: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . -j $(nproc) - cmake --build . 
--target install/strip - - name: package - run: | - rm -rf ${{ env.PACKAGENAME }} - mkdir -p ${{ env.PACKAGENAME }} - cp -a -P build/install/* ${{ env.PACKAGENAME }} - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - openmp-macos: runs-on: macos-13 env: @@ -255,85 +168,14 @@ jobs: macos: needs: [setup, openmp-macos] + strategy: + matrix: + opt: + - { vulkan: OFF, id: macos } + - { vulkan: ON, id: macos-vulkan } runs-on: macos-13 env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-macos - NCNN_CMAKE_OPTIONS: | - -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ - -DDEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET \ - -DENABLE_BITCODE=$ENABLE_BITCODE \ - -DENABLE_ARC=$ENABLE_ARC \ - -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ - -DCMAKE_INSTALL_PREFIX=install \ - -DCMAKE_BUILD_TYPE=Release \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="libomp.a" \ - -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_BUILD_TOOLS=OFF \ - -DNCNN_BUILD_EXAMPLES=OFF \ - -DNCNN_BUILD_BENCHMARK=OFF \ - - steps: - - uses: actions/checkout@v4 - - name: download-openmp-macos - uses: actions/download-artifact@v4 - with: - name: openmp-macos - path: openmp-macos - - name: install-openmp - run: | - sudo cp openmp-macos/include/* $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include - sudo cp openmp-macos/lib/libomp.a $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib - - name: build-x86_64 - run: | - mkdir build-x86_64 && cd build-x86_64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=MAC -DARCHS="x86_64" .. - cmake --build . -j 4 - cmake --build . --target install/strip - - name: build-arm64 - run: | - mkdir build-arm64 && cd build-arm64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=MAC_ARM64 -DARCHS="arm64" .. - cmake --build . -j 4 - cmake --build . 
--target install/strip - - name: package-openmp - run: | - rm -rf openmp.framework - mkdir -p openmp.framework/Versions/A/Headers - mkdir -p openmp.framework/Versions/A/Resources - ln -s A openmp.framework/Versions/Current - ln -s Versions/Current/Headers openmp.framework/Headers - ln -s Versions/Current/Resources openmp.framework/Resources - ln -s Versions/Current/openmp openmp.framework/openmp - cp openmp-macos/lib/libomp.a openmp.framework/Versions/A/openmp - cp -a openmp-macos/include/* openmp.framework/Versions/A/Headers/ - sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - - name: package - run: | - rm -rf ncnn.framework - mkdir -p ncnn.framework/Versions/A/Headers - mkdir -p ncnn.framework/Versions/A/Resources - ln -s A ncnn.framework/Versions/Current - ln -s Versions/Current/Headers ncnn.framework/Headers - ln -s Versions/Current/Resources ncnn.framework/Resources - ln -s Versions/Current/ncnn ncnn.framework/ncnn - lipo -create build-x86_64/install/lib/libncnn.a build-arm64/install/lib/libncnn.a -o ncnn.framework/Versions/A/ncnn - cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ - sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - macos-gpu: - needs: [setup, openmp-macos] - runs-on: macos-13 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-macos-vulkan + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET \ @@ -346,10 +188,10 @@ jobs: -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON \ -DNCNN_BUILD_TOOLS=OFF \ -DNCNN_BUILD_EXAMPLES=OFF \ -DNCNN_BUILD_BENCHMARK=OFF \ + -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ steps: - uses: actions/checkout@v4 @@ -389,6 +231,7 @@ jobs: cp -a openmp-macos/include/* openmp.framework/Versions/A/Headers/ sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - name: package-glslang + if: matrix.opt.vulkan == 'ON' run: | rm -rf glslang.framework mkdir -p glslang.framework/Versions/A/Headers @@ -397,12 +240,26 @@ jobs: ln -s Versions/Current/Headers glslang.framework/Headers ln -s Versions/Current/Resources glslang.framework/Resources ln -s Versions/Current/glslang glslang.framework/glslang - libtool -static build-x86_64/install/lib/libglslang.a build-x86_64/install/lib/libMachineIndependent.a build-x86_64/install/lib/libGenericCodeGen.a build-x86_64/install/lib/libSPIRV.a build-x86_64/install/lib/libOGLCompiler.a build-x86_64/install/lib/libOSDependent.a -o build-x86_64/install/lib/libglslang_combined.a - libtool -static build-arm64/install/lib/libglslang.a build-arm64/install/lib/libMachineIndependent.a build-arm64/install/lib/libGenericCodeGen.a build-arm64/install/lib/libSPIRV.a build-arm64/install/lib/libOGLCompiler.a build-arm64/install/lib/libOSDependent.a -o 
build-arm64/install/lib/libglslang_combined.a + libtool -static \ + build-x86_64/install/lib/libglslang.a \ + build-x86_64/install/lib/libMachineIndependent.a \ + build-x86_64/install/lib/libGenericCodeGen.a \ + build-x86_64/install/lib/libSPIRV.a \ + build-x86_64/install/lib/libOGLCompiler.a \ + build-x86_64/install/lib/libOSDependent.a \ + -o build-x86_64/install/lib/libglslang_combined.a + libtool -static \ + build-arm64/install/lib/libglslang.a \ + build-arm64/install/lib/libMachineIndependent.a \ + build-arm64/install/lib/libGenericCodeGen.a \ + build-arm64/install/lib/libSPIRV.a \ + build-arm64/install/lib/libOGLCompiler.a \ + build-arm64/install/lib/libOSDependent.a \ + -o build-arm64/install/lib/libglslang_combined.a lipo -create build-x86_64/install/lib/libglslang_combined.a build-arm64/install/lib/libglslang_combined.a -o glslang.framework/Versions/A/glslang cp -a build-x86_64/install/include/glslang glslang.framework/Versions/A/Headers/ sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist - - name: package + - name: package-ncnn run: | rm -rf ncnn.framework mkdir -p ncnn.framework/Versions/A/Headers @@ -412,8 +269,16 @@ jobs: ln -s Versions/Current/Resources ncnn.framework/Resources ln -s Versions/Current/ncnn ncnn.framework/ncnn lipo -create build-x86_64/install/lib/libncnn.a build-arm64/install/lib/libncnn.a -o ncnn.framework/Versions/A/ncnn - cp -a build-x86_64/install/include/ncnn ncnn.framework/Versions/A/Headers/ + cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist + - name: package + if: matrix.opt.vulkan == 'OFF' + run: | + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework + - name: package + if: matrix.opt.vulkan == 'ON' + run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework - name: upload-zip @@ -485,77 +350,14 @@ jobs: ios: needs: [setup, openmp-ios] + strategy: + matrix: + opt: + - { vulkan: OFF, id: ios } + - { vulkan: ON, id: ios-vulkan } runs-on: macos-13 env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ios - NCNN_CMAKE_OPTIONS: | - -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ - -DDEPLOYMENT_TARGET=$IOS_DEPLOYMENT_TARGET \ - -DENABLE_BITCODE=$ENABLE_BITCODE \ - -DENABLE_ARC=$ENABLE_ARC \ - -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ - -DCMAKE_INSTALL_PREFIX=install \ - -DCMAKE_BUILD_TYPE=Release \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="libomp.a" \ - -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_BUILD_BENCHMARK=OFF \ - - steps: - - uses: actions/checkout@v4 - - name: download-openmp-ios - uses: actions/download-artifact@v4 - with: - name: openmp-ios - path: openmp-ios - - name: install-openmp - run: | - sudo cp openmp-ios/include/* $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/include - sudo cp openmp-ios/lib/libomp.a $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib - - name: build-arm64 - run: | - mkdir build-arm64 && cd build-arm64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=OS64 -DARCHS="arm64" .. - cmake --build . 
-j 4 - cmake --build . --target install/strip - - name: package-openmp - run: | - rm -rf openmp.framework - mkdir -p openmp.framework/Versions/A/Headers - mkdir -p openmp.framework/Versions/A/Resources - ln -s A openmp.framework/Versions/Current - ln -s Versions/Current/Headers openmp.framework/Headers - ln -s Versions/Current/Resources openmp.framework/Resources - ln -s Versions/Current/openmp openmp.framework/openmp - cp openmp-ios/lib/libomp.a openmp.framework/Versions/A/openmp - cp -a openmp-ios/include/* openmp.framework/Versions/A/Headers/ - sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - - name: package - run: | - rm -rf ncnn.framework - mkdir -p ncnn.framework/Versions/A/Headers - mkdir -p ncnn.framework/Versions/A/Resources - ln -s A ncnn.framework/Versions/Current - ln -s Versions/Current/Headers ncnn.framework/Headers - ln -s Versions/Current/Resources ncnn.framework/Resources - ln -s Versions/Current/ncnn ncnn.framework/ncnn - cp build-arm64/install/lib/libncnn.a ncnn.framework/Versions/A/ncnn - cp -a build-arm64/install/include/* ncnn.framework/Versions/A/Headers/ - sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - ios-gpu: - needs: [setup, openmp-ios] - runs-on: macos-13 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ios-vulkan + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$IOS_DEPLOYMENT_TARGET \ @@ -568,8 +370,8 @@ jobs: -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON \ -DNCNN_BUILD_BENCHMARK=OFF \ + -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ steps: - uses: actions/checkout@v4 @@ -603,6 +405,7 @@ jobs: cp -a openmp-ios/include/* openmp.framework/Versions/A/Headers/ sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - name: package-glslang + if: matrix.opt.vulkan == 'ON' run: | rm -rf glslang.framework mkdir -p glslang.framework/Versions/A/Headers @@ -622,7 +425,7 @@ jobs: cp build-arm64/install/lib/libglslang_combined.a glslang.framework/Versions/A/glslang cp -a build-arm64/install/include/glslang glslang.framework/Versions/A/Headers/ sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist - - name: package + - name: package-ncnn run: | rm -rf ncnn.framework mkdir -p ncnn.framework/Versions/A/Headers @@ -632,8 +435,16 @@ jobs: ln -s Versions/Current/Resources ncnn.framework/Resources ln -s Versions/Current/ncnn ncnn.framework/ncnn cp build-arm64/install/lib/libncnn.a ncnn.framework/Versions/A/ncnn - cp -a build-arm64/install/include/ncnn ncnn.framework/Versions/A/Headers/ + cp -a build-arm64/install/include/* ncnn.framework/Versions/A/Headers/ sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist 
> ncnn.framework/Versions/A/Resources/Info.plist + - name: package + if: matrix.opt.vulkan == 'OFF' + run: | + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework + - name: package + if: matrix.opt.vulkan == 'ON' + run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework - name: upload-zip @@ -716,9 +527,14 @@ jobs: ios-simulator: needs: [setup, openmp-ios-simulator] + strategy: + matrix: + opt: + - { vulkan: OFF, id: ios-simulator } + - { vulkan: ON, id: ios-simulator-vulkan } runs-on: macos-13 env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ios-simulator + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$IOS_DEPLOYMENT_TARGET \ @@ -732,89 +548,12 @@ jobs: -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ -DNCNN_BUILD_BENCHMARK=OFF \ + -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ steps: - uses: actions/checkout@v4 - - name: download-openmp-ios-simulator - uses: actions/download-artifact@v4 - with: - name: openmp-ios-simulator - path: openmp-ios-simulator - - name: install-openmp - run: | - sudo cp openmp-ios-simulator/include/* $DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/include - sudo cp openmp-ios-simulator/lib/libomp.a $DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/lib - - name: build-x86_64 - run: | - mkdir build-x86_64 && cd build-x86_64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR64 -DARCHS="x86_64" .. - cmake --build . -j 4 - cmake --build . --target install/strip - - name: build-arm64 - run: | - mkdir build-arm64 && cd build-arm64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATORARM64 -DARCHS="arm64" .. - cmake --build . -j 4 - cmake --build . 
--target install/strip - - name: package-openmp - run: | - rm -rf openmp.framework - mkdir -p openmp.framework/Versions/A/Headers - mkdir -p openmp.framework/Versions/A/Resources - ln -s A openmp.framework/Versions/Current - ln -s Versions/Current/Headers openmp.framework/Headers - ln -s Versions/Current/Resources openmp.framework/Resources - ln -s Versions/Current/openmp openmp.framework/openmp - cp openmp-ios-simulator/lib/libomp.a openmp.framework/Versions/A/openmp - cp -a openmp-ios-simulator/include/* openmp.framework/Versions/A/Headers/ - sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - - name: package - run: | - rm -rf ncnn.framework - mkdir -p ncnn.framework/Versions/A/Headers - mkdir -p ncnn.framework/Versions/A/Resources - ln -s A ncnn.framework/Versions/Current - ln -s Versions/Current/Headers ncnn.framework/Headers - ln -s Versions/Current/Resources ncnn.framework/Resources - ln -s Versions/Current/ncnn ncnn.framework/ncnn - lipo -create \ - build-x86_64/install/lib/libncnn.a \ - build-arm64/install/lib/libncnn.a \ - -o ncnn.framework/Versions/A/ncnn - cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ - sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - ios-simulator-gpu: - needs: [setup, openmp-ios-simulator] - runs-on: macos-13 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ios-simulator-vulkan - NCNN_CMAKE_OPTIONS: | - -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ - -DDEPLOYMENT_TARGET=$IOS_DEPLOYMENT_TARGET \ - -DENABLE_BITCODE=$ENABLE_BITCODE \ - -DENABLE_ARC=$ENABLE_ARC \ - -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ - -DCMAKE_INSTALL_PREFIX=install \ - -DCMAKE_BUILD_TYPE=Release \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="libomp.a" \ - -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON \ - -DNCNN_BUILD_BENCHMARK=OFF \ - - steps: - - uses: actions/checkout@v4 - with: - submodules: true + with: + submodules: true - name: download-openmp-ios-simulator uses: actions/download-artifact@v4 with: @@ -849,6 +588,7 @@ jobs: cp -a openmp-ios-simulator/include/* openmp.framework/Versions/A/Headers/ sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - name: package-glslang + if: matrix.opt.vulkan == 'ON' run: | rm -rf glslang.framework mkdir -p glslang.framework/Versions/A/Headers @@ -879,7 +619,7 @@ jobs: -o glslang.framework/Versions/A/glslang cp -a build-x86_64/install/include/glslang glslang.framework/Versions/A/Headers/ sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist - - name: package + - name: package-ncnn run: | rm -rf ncnn.framework mkdir -p ncnn.framework/Versions/A/Headers @@ -892,8 +632,16 @@ jobs: build-x86_64/install/lib/libncnn.a \ build-arm64/install/lib/libncnn.a \ -o ncnn.framework/Versions/A/ncnn - cp 
-a build-x86_64/install/include/ncnn ncnn.framework/Versions/A/Headers/ + cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist + - name: package + if: matrix.opt.vulkan == 'OFF' + run: | + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework + - name: package + if: matrix.opt.vulkan == 'ON' + run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework - name: upload-zip @@ -976,86 +724,14 @@ jobs: mac-catalyst: needs: [setup, openmp-mac-catalyst] + strategy: + matrix: + opt: + - { vulkan: OFF, id: mac-catalyst } + - { vulkan: ON, id: mac-catalyst-vulkan } runs-on: macos-13 env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-mac-catalyst - NCNN_CMAKE_OPTIONS: | - -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ - -DDEPLOYMENT_TARGET=$MAC_CATALYST_DEPLOYMENT_TARGET \ - -DENABLE_BITCODE=$ENABLE_BITCODE \ - -DENABLE_ARC=$ENABLE_ARC \ - -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ - -DCMAKE_INSTALL_PREFIX=install \ - -DCMAKE_BUILD_TYPE=Release \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="libomp.a" \ - -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_BUILD_BENCHMARK=OFF \ - - steps: - - uses: actions/checkout@v4 - - name: download-openmp-mac-catalyst - uses: actions/download-artifact@v4 - with: - name: openmp-mac-catalyst - path: openmp-mac-catalyst - - name: install-openmp - run: | - sudo cp openmp-mac-catalyst/include/* $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include - sudo cp openmp-mac-catalyst/lib/libomp.a $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib - - name: build-x86_64 - run: | - mkdir build-x86_64 && cd build-x86_64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=MAC_CATALYST -DARCHS="x86_64" .. - cmake --build . -j 4 - cmake --build . --target install/strip - - name: build-arm64 - run: | - mkdir build-arm64 && cd build-arm64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=MAC_CATALYST -DARCHS="arm64" .. - cmake --build . -j 4 - cmake --build . 
--target install/strip - - name: package-openmp - run: | - rm -rf openmp.framework - mkdir -p openmp.framework/Versions/A/Headers - mkdir -p openmp.framework/Versions/A/Resources - ln -s A openmp.framework/Versions/Current - ln -s Versions/Current/Headers openmp.framework/Headers - ln -s Versions/Current/Resources openmp.framework/Resources - ln -s Versions/Current/openmp openmp.framework/openmp - cp openmp-mac-catalyst/lib/libomp.a openmp.framework/Versions/A/openmp - cp -a openmp-mac-catalyst/include/* openmp.framework/Versions/A/Headers/ - sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - - name: package - run: | - rm -rf ncnn.framework - mkdir -p ncnn.framework/Versions/A/Headers - mkdir -p ncnn.framework/Versions/A/Resources - ln -s A ncnn.framework/Versions/Current - ln -s Versions/Current/Headers ncnn.framework/Headers - ln -s Versions/Current/Resources ncnn.framework/Resources - ln -s Versions/Current/ncnn ncnn.framework/ncnn - lipo -create \ - build-x86_64/install/lib/libncnn.a \ - build-arm64/install/lib/libncnn.a \ - -o ncnn.framework/Versions/A/ncnn - cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ - sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - mac-catalyst-gpu: - needs: [setup, openmp-mac-catalyst] - runs-on: macos-13 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-mac-catalyst-vulkan + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$MAC_CATALYST_DEPLOYMENT_TARGET \ @@ -1068,8 +744,8 @@ jobs: -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON \ -DNCNN_BUILD_BENCHMARK=OFF \ + -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ steps: - uses: actions/checkout@v4 @@ -1109,6 +785,7 @@ jobs: cp -a openmp-mac-catalyst/include/* openmp.framework/Versions/A/Headers/ sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - name: package-glslang + if: matrix.opt.vulkan == 'ON' run: | rm -rf glslang.framework mkdir -p glslang.framework/Versions/A/Headers @@ -1139,7 +816,7 @@ jobs: -o glslang.framework/Versions/A/glslang cp -a build-x86_64/install/include/glslang glslang.framework/Versions/A/Headers/ sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist - - name: package + - name: package-ncnn run: | rm -rf ncnn.framework mkdir -p ncnn.framework/Versions/A/Headers @@ -1152,8 +829,16 @@ jobs: build-x86_64/install/lib/libncnn.a \ build-arm64/install/lib/libncnn.a \ -o ncnn.framework/Versions/A/ncnn - cp -a build-x86_64/install/include/ncnn ncnn.framework/Versions/A/Headers/ + cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > 
ncnn.framework/Versions/A/Resources/Info.plist + - name: package + if: matrix.opt.vulkan == 'OFF' + run: | + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework + - name: package + if: matrix.opt.vulkan == 'ON' + run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework - name: upload-zip @@ -1534,86 +1219,14 @@ jobs: tvos: needs: [setup, openmp-tvos] + strategy: + matrix: + opt: + - { vulkan: OFF, id: tvos } + - { vulkan: ON, id: tvos-vulkan } runs-on: macos-13 env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-tvos - NCNN_CMAKE_OPTIONS: | - -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ - -DDEPLOYMENT_TARGET=$TVOS_DEPLOYMENT_TARGET \ - -DENABLE_BITCODE=$ENABLE_BITCODE \ - -DENABLE_ARC=$ENABLE_ARC \ - -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ - -DCMAKE_INSTALL_PREFIX=install \ - -DCMAKE_BUILD_TYPE=Release \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="libomp.a" \ - -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_BUILD_BENCHMARK=OFF \ - - steps: - - uses: actions/checkout@v4 - - name: download-openmp-tvos - uses: actions/download-artifact@v4 - with: - name: openmp-tvos - path: openmp-tvos - - name: install-openmp - run: | - sudo cp openmp-tvos/include/* $DEVELOPER_DIR/Platforms/AppleTVOS.platform/Developer/SDKs/AppleTVOS.sdk/usr/include - sudo cp openmp-tvos/lib/libomp.a $DEVELOPER_DIR/Platforms/AppleTVOS.platform/Developer/SDKs/AppleTVOS.sdk/usr/lib - - name: build-arm64 - run: | - mkdir build-arm64 && cd build-arm64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=TVOS -DARCHS="arm64" .. - cmake --build . -j 4 - cmake --build . --target install/strip - - name: build-arm64e - run: | - mkdir build-arm64e && cd build-arm64e - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=TVOS -DARCHS="arm64e" .. - cmake --build . -j 4 - cmake --build . 
--target install/strip - - name: package-openmp - run: | - rm -rf openmp.framework - mkdir -p openmp.framework/Versions/A/Headers - mkdir -p openmp.framework/Versions/A/Resources - ln -s A openmp.framework/Versions/Current - ln -s Versions/Current/Headers openmp.framework/Headers - ln -s Versions/Current/Resources openmp.framework/Resources - ln -s Versions/Current/openmp openmp.framework/openmp - cp openmp-tvos/lib/libomp.a openmp.framework/Versions/A/openmp - cp -a openmp-tvos/include/* openmp.framework/Versions/A/Headers/ - sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - - name: package - run: | - rm -rf ncnn.framework - mkdir -p ncnn.framework/Versions/A/Headers - mkdir -p ncnn.framework/Versions/A/Resources - ln -s A ncnn.framework/Versions/Current - ln -s Versions/Current/Headers ncnn.framework/Headers - ln -s Versions/Current/Resources ncnn.framework/Resources - ln -s Versions/Current/ncnn ncnn.framework/ncnn - lipo -create \ - build-arm64/install/lib/libncnn.a \ - build-arm64e/install/lib/libncnn.a \ - -o ncnn.framework/Versions/A/ncnn - cp -a build-arm64/install/include/* ncnn.framework/Versions/A/Headers/ - sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - tvos-gpu: - needs: [setup, openmp-tvos] - runs-on: macos-13 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-tvos-vulkan + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$TVOS_DEPLOYMENT_TARGET \ @@ -1626,8 +1239,8 @@ jobs: -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON \ -DNCNN_BUILD_BENCHMARK=OFF \ + -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ steps: - uses: actions/checkout@v4 @@ -1667,6 +1280,7 @@ jobs: cp -a openmp-tvos/include/* openmp.framework/Versions/A/Headers/ sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - name: package-glslang + if: matrix.opt.vulkan == 'ON' run: | rm -rf glslang.framework mkdir -p glslang.framework/Versions/A/Headers @@ -1697,7 +1311,7 @@ jobs: -o glslang.framework/Versions/A/glslang cp -a build-arm64/install/include/glslang glslang.framework/Versions/A/Headers/ sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist - - name: package + - name: package-ncnn run: | rm -rf ncnn.framework mkdir -p ncnn.framework/Versions/A/Headers @@ -1712,6 +1326,14 @@ jobs: -o ncnn.framework/Versions/A/ncnn cp -a build-arm64/install/include/* ncnn.framework/Versions/A/Headers/ sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist + - name: package + if: matrix.opt.vulkan == 'OFF' + run: | + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework 
ncnn.framework + - name: package + if: matrix.opt.vulkan == 'ON' + run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework - name: upload-zip @@ -1794,9 +1416,14 @@ jobs: tvos-simulator: needs: [setup, openmp-tvos-simulator] + strategy: + matrix: + opt: + - { vulkan: OFF, id: tvos-simulator } + - { vulkan: ON, id: tvos-simulator-vulkan } runs-on: macos-13 env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-tvos-simulator + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$TVOS_DEPLOYMENT_TARGET \ @@ -1810,9 +1437,12 @@ jobs: -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ -DNCNN_BUILD_BENCHMARK=OFF \ + -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ steps: - uses: actions/checkout@v4 + with: + submodules: true - name: download-openmp-tvos-simulator uses: actions/download-artifact@v4 with: @@ -1846,87 +1476,8 @@ jobs: cp openmp-tvos-simulator/lib/libomp.a openmp.framework/Versions/A/openmp cp -a openmp-tvos-simulator/include/* openmp.framework/Versions/A/Headers/ sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - - name: package - run: | - rm -rf ncnn.framework - mkdir -p ncnn.framework/Versions/A/Headers - mkdir -p ncnn.framework/Versions/A/Resources - ln -s A ncnn.framework/Versions/Current - ln -s Versions/Current/Headers ncnn.framework/Headers - ln -s Versions/Current/Resources ncnn.framework/Resources - ln -s Versions/Current/ncnn ncnn.framework/ncnn - lipo -create \ - build-x86_64/install/lib/libncnn.a \ - build-arm64/install/lib/libncnn.a \ - -o ncnn.framework/Versions/A/ncnn - cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ - sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - tvos-simulator-gpu: - needs: [setup, openmp-tvos-simulator] - runs-on: macos-13 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-tvos-simulator-vulkan - NCNN_CMAKE_OPTIONS: | - -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ - -DDEPLOYMENT_TARGET=$TVOS_DEPLOYMENT_TARGET \ - -DENABLE_BITCODE=$ENABLE_BITCODE \ - -DENABLE_ARC=$ENABLE_ARC \ - -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ - -DCMAKE_INSTALL_PREFIX=install \ - -DCMAKE_BUILD_TYPE=Release \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="libomp.a" \ - -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON \ - -DNCNN_BUILD_BENCHMARK=OFF \ - - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: download-openmp-tvos-simulator - uses: actions/download-artifact@v4 - with: - name: openmp-tvos-simulator - path: openmp-tvos-simulator - - name: install-openmp - run: | - sudo cp openmp-tvos-simulator/include/* $DEVELOPER_DIR/Platforms/AppleTVSimulator.platform/Developer/SDKs/AppleTVSimulator.sdk/usr/include - sudo cp openmp-tvos-simulator/lib/libomp.a 
$DEVELOPER_DIR/Platforms/AppleTVSimulator.platform/Developer/SDKs/AppleTVSimulator.sdk/usr/lib - - name: build-x86_64 - run: | - mkdir build-x86_64 && cd build-x86_64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_TVOS -DARCHS="x86_64" .. - cmake --build . -j 4 - cmake --build . --target install/strip - - name: build-arm64 - run: | - mkdir build-arm64 && cd build-arm64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_TVOS -DARCHS="arm64" .. - cmake --build . -j 4 - cmake --build . --target install/strip - - name: package-openmp - run: | - rm -rf openmp.framework - mkdir -p openmp.framework/Versions/A/Headers - mkdir -p openmp.framework/Versions/A/Resources - ln -s A openmp.framework/Versions/Current - ln -s Versions/Current/Headers openmp.framework/Headers - ln -s Versions/Current/Resources openmp.framework/Resources - ln -s Versions/Current/openmp openmp.framework/openmp - cp openmp-tvos-simulator/lib/libomp.a openmp.framework/Versions/A/openmp - cp -a openmp-tvos-simulator/include/* openmp.framework/Versions/A/Headers/ - sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - - name: package-glslang + - name: package-glslang + if: matrix.opt.vulkan == 'ON' run: | rm -rf glslang.framework mkdir -p glslang.framework/Versions/A/Headers @@ -1957,7 +1508,7 @@ jobs: -o glslang.framework/Versions/A/glslang cp -a build-x86_64/install/include/glslang glslang.framework/Versions/A/Headers/ sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist - - name: package + - name: package-ncnn run: | rm -rf ncnn.framework mkdir -p ncnn.framework/Versions/A/Headers @@ -1972,6 +1523,14 @@ jobs: -o ncnn.framework/Versions/A/ncnn cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist + - name: package + if: matrix.opt.vulkan == 'OFF' + run: | + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework + - name: package + if: matrix.opt.vulkan == 'ON' + run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework - name: upload-zip @@ -2043,9 +1602,14 @@ jobs: visionos: needs: [setup, openmp-visionos] + strategy: + matrix: + opt: + - { vulkan: OFF, id: visionos } + - { vulkan: ON, id: visionos-vulkan } runs-on: macos-13 env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-visionos + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$VISIONOS_DEPLOYMENT_TARGET \ @@ -2059,9 +1623,12 @@ jobs: -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ -DNCNN_BUILD_BENCHMARK=OFF \ + -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ steps: - uses: actions/checkout@v4 + with: + submodules: true - name: download-openmp-visionos uses: actions/download-artifact@v4 with: @@ -2089,7 +1656,28 @@ jobs: cp openmp-visionos/lib/libomp.a openmp.framework/Versions/A/openmp cp -a openmp-visionos/include/* openmp.framework/Versions/A/Headers/ sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > 
openmp.framework/Versions/A/Resources/Info.plist - - name: package + - name: package-glslang + if: matrix.opt.vulkan == 'ON' + run: | + rm -rf glslang.framework + mkdir -p glslang.framework/Versions/A/Headers + mkdir -p glslang.framework/Versions/A/Resources + ln -s A glslang.framework/Versions/Current + ln -s Versions/Current/Headers glslang.framework/Headers + ln -s Versions/Current/Resources glslang.framework/Resources + ln -s Versions/Current/glslang glslang.framework/glslang + libtool -static \ + build-arm64/install/lib/libglslang.a \ + build-arm64/install/lib/libMachineIndependent.a \ + build-arm64/install/lib/libGenericCodeGen.a \ + build-arm64/install/lib/libSPIRV.a \ + build-arm64/install/lib/libOGLCompiler.a \ + build-arm64/install/lib/libOSDependent.a \ + -o build-arm64/install/lib/libglslang_combined.a + cp build-arm64/install/lib/libglslang_combined.a glslang.framework/Versions/A/glslang + cp -a build-arm64/install/include/glslang glslang.framework/Versions/A/Headers/ + sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist + - name: package-ncnn run: | rm -rf ncnn.framework mkdir -p ncnn.framework/Versions/A/Headers @@ -2101,8 +1689,16 @@ jobs: cp build-arm64/install/lib/libncnn.a ncnn.framework/Versions/A/ncnn cp -a build-arm64/install/include/* ncnn.framework/Versions/A/Headers/ sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist + - name: package + if: matrix.opt.vulkan == 'OFF' + run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework + - name: package + if: matrix.opt.vulkan == 'ON' + run: | + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework - name: upload-zip uses: actions/upload-artifact@v4 with: @@ -2183,9 +1779,14 @@ jobs: visionos-simulator: needs: [setup, openmp-visionos-simulator] + strategy: + matrix: + opt: + - { vulkan: OFF, id: visionos-simulator } + - { vulkan: ON, id: visionos-simulator-vulkan } runs-on: macos-13 env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-visionos-simulator + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$VISIONOS_DEPLOYMENT_TARGET \ @@ -2199,9 +1800,12 @@ jobs: -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ -DNCNN_BUILD_BENCHMARK=OFF \ + -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ steps: - uses: actions/checkout@v4 + with: + submodules: true - name: download-openmp-visionos-simulator uses: actions/download-artifact@v4 with: @@ -2235,7 +1839,39 @@ jobs: cp openmp-visionos-simulator/lib/libomp.a openmp.framework/Versions/A/openmp cp -a openmp-visionos-simulator/include/* openmp.framework/Versions/A/Headers/ sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - - name: package + - name: package-glslang + if: matrix.opt.vulkan == 'ON' + run: | + rm -rf glslang.framework + mkdir -p glslang.framework/Versions/A/Headers + mkdir -p glslang.framework/Versions/A/Resources + ln -s A glslang.framework/Versions/Current + ln -s Versions/Current/Headers glslang.framework/Headers + ln -s Versions/Current/Resources 
glslang.framework/Resources + ln -s Versions/Current/glslang glslang.framework/glslang + libtool -static \ + build-x86_64/install/lib/libglslang.a \ + build-x86_64/install/lib/libMachineIndependent.a \ + build-x86_64/install/lib/libGenericCodeGen.a \ + build-x86_64/install/lib/libSPIRV.a \ + build-x86_64/install/lib/libOGLCompiler.a \ + build-x86_64/install/lib/libOSDependent.a \ + -o build-x86_64/install/lib/libglslang_combined.a + libtool -static \ + build-arm64/install/lib/libglslang.a \ + build-arm64/install/lib/libMachineIndependent.a \ + build-arm64/install/lib/libGenericCodeGen.a \ + build-arm64/install/lib/libSPIRV.a \ + build-arm64/install/lib/libOGLCompiler.a \ + build-arm64/install/lib/libOSDependent.a \ + -o build-arm64/install/lib/libglslang_combined.a + lipo -create \ + build-x86_64/install/lib/libglslang_combined.a \ + build-arm64/install/lib/libglslang_combined.a \ + -o glslang.framework/Versions/A/glslang + cp -a build-x86_64/install/include/glslang glslang.framework/Versions/A/Headers/ + sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist + - name: package-ncnn run: | rm -rf ncnn.framework mkdir -p ncnn.framework/Versions/A/Headers @@ -2250,8 +1886,16 @@ jobs: -o ncnn.framework/Versions/A/ncnn cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist + - name: package + if: matrix.opt.vulkan == 'OFF' + run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework + - name: package + if: matrix.opt.vulkan == 'ON' + run: | + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework - name: upload-zip uses: actions/upload-artifact@v4 with: @@ -2260,51 +1904,63 @@ jobs: android: needs: [setup] + strategy: + matrix: + opt: + - { vulkan: OFF, shared-lib: OFF, id: android } + - { vulkan: OFF, shared-lib: ON, id: android-shared } + - { vulkan: ON, shared-lib: OFF, id: android-vulkan } + - { vulkan: ON, shared-lib: ON, id: android-vulkan-shared } runs-on: ubuntu-latest env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-android + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} + NCNN_CMAKE_OPTIONS: | + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake \ + -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX=install \ + -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DNCNN_BUILD_BENCHMARK=OFF \ + -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ + -DNCNN_SHARED_LIB=${{ matrix.opt.shared-lib }} \ + -DNCNN_AVX512BF16=OFF \ + steps: - uses: actions/checkout@v4 + with: + submodules: true - name: ndk-fix-debug run: sed -i -e '/^ -g$/d' $ANDROID_NDK_LATEST_HOME/build/cmake/android-legacy.toolchain.cmake - - name: build-armv7 + - name: build-armeabi-v7a run: | - mkdir build-armv7 && cd build-armv7 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 \ - -DNCNN_BUILD_BENCHMARK=OFF .. 
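The android hunk here folds the four hand-copied jobs (cpu/vulkan crossed with static/shared) into one matrix job, the same pattern the macos/ios/tvos/visionos conversions above apply with a two-entry matrix. A minimal sketch of that pattern, assuming the surrounding jobs context; the hard-coded version string, the echo bodies and the package-vulkan-extras step name are illustrative only:

  android:
    strategy:
      matrix:
        opt:
          - { vulkan: OFF, shared-lib: OFF, id: android }
          - { vulkan: OFF, shared-lib: ON,  id: android-shared }
          - { vulkan: ON,  shared-lib: OFF, id: android-vulkan }
          - { vulkan: ON,  shared-lib: ON,  id: android-vulkan-shared }
    runs-on: ubuntu-latest
    env:
      # every variant derives its artifact name from its matrix entry
      PACKAGENAME: ncnn-1.0.0-${{ matrix.opt.id }}
    steps:
      # the shared cmake flags are parameterized once instead of copied per job
      - name: build
        run: echo cmake -DNCNN_VULKAN=${{ matrix.opt.vulkan }} -DNCNN_SHARED_LIB=${{ matrix.opt.shared-lib }}
      # variant-specific steps are gated on the matrix values
      - name: package-vulkan-extras
        if: matrix.opt.vulkan == 'ON'
        run: echo zip the extra glslang framework

Each entry under opt expands to one independent job run, so the four previously duplicated job bodies now come from a single definition.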
+ mkdir build-armeabi-v7a && cd build-armeabi-v7a + cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 .. cmake --build . -j $(nproc) cmake --build . --target install/strip - - name: build-aarch64 + - name: build-arm64-v8a run: | - mkdir build-aarch64 && cd build-aarch64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 \ - -DNCNN_BUILD_BENCHMARK=OFF .. + mkdir build-arm64-v8a && cd build-arm64-v8a + cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 .. cmake --build . -j $(nproc) cmake --build . --target install/strip - name: build-x86 run: | mkdir build-x86 && cd build-x86 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 \ - -DNCNN_BUILD_BENCHMARK=OFF .. + cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 .. cmake --build . -j $(nproc) cmake --build . --target install/strip - name: build-x86_64 run: | mkdir build-x86_64 && cd build-x86_64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 \ - -DNCNN_BUILD_BENCHMARK=OFF .. + cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 .. cmake --build . -j $(nproc) cmake --build . 
--target install/strip - name: package run: | rm -rf ${{ env.PACKAGENAME }} mkdir -p ${{ env.PACKAGENAME }} - cp -a build-armv7/install ${{ env.PACKAGENAME }}/armeabi-v7a - cp -a build-aarch64/install ${{ env.PACKAGENAME }}/arm64-v8a + cp -a build-armeabi-v7a/install ${{ env.PACKAGENAME }}/armeabi-v7a + cp -a build-arm64-v8a/install ${{ env.PACKAGENAME }}/arm64-v8a cp -a build-x86/install ${{ env.PACKAGENAME }}/x86 cp -a build-x86_64/install ${{ env.PACKAGENAME }}/x86_64 rm -f ${{ env.PACKAGENAME }}.zip @@ -2315,55 +1971,63 @@ jobs: name: ${{ env.PACKAGENAME }} path: ${{ env.PACKAGENAME }}.zip - android-shared: + webassembly: needs: [setup] runs-on: ubuntu-latest env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-android-shared + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-webassembly steps: - uses: actions/checkout@v4 - - name: ndk-fix-debug - run: sed -i -e '/^ -g$/d' $ANDROID_NDK_LATEST_HOME/build/cmake/android-legacy.toolchain.cmake - - name: build-armv7 + - name: emsdk + run: | + git clone https://github.com/emscripten-core/emsdk.git + cd emsdk + ./emsdk install $EMSCRIPTEN_VERSION + ./emsdk activate $EMSCRIPTEN_VERSION + - name: build run: | - mkdir build-armv7 && cd build-armv7 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 \ - -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. + source emsdk/emsdk_env.sh + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ + -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. cmake --build . -j $(nproc) cmake --build . --target install/strip - - name: build-aarch64 + - name: build-simd run: | - mkdir build-aarch64 && cd build-aarch64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 \ - -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. + source emsdk/emsdk_env.sh + mkdir build-simd && cd build-simd + cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ + -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. cmake --build . -j $(nproc) cmake --build . 
--target install/strip - - name: build-x86 + - name: build-threads run: | - mkdir build-x86 && cd build-x86 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 \ - -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. + source emsdk/emsdk_env.sh + mkdir build-threads && cd build-threads + cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DNCNN_THREADS=ON -DNCNN_OPENMP=ON -DNCNN_SIMPLEOMP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ + -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. cmake --build . -j $(nproc) cmake --build . --target install/strip - - name: build-x86_64 + - name: build-simd-threads run: | - mkdir build-x86_64 && cd build-x86_64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 \ - -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. + source emsdk/emsdk_env.sh + mkdir build-simd-threads && cd build-simd-threads + cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DNCNN_THREADS=ON -DNCNN_OPENMP=ON -DNCNN_SIMPLEOMP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ + -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. cmake --build . -j $(nproc) cmake --build . 
--target install/strip - name: package run: | rm -rf ${{ env.PACKAGENAME }} mkdir -p ${{ env.PACKAGENAME }} - cp -a build-armv7/install ${{ env.PACKAGENAME }}/armeabi-v7a - cp -a build-aarch64/install ${{ env.PACKAGENAME }}/arm64-v8a - cp -a build-x86/install ${{ env.PACKAGENAME }}/x86 - cp -a build-x86_64/install ${{ env.PACKAGENAME }}/x86_64 + cp -a build/install ${{ env.PACKAGENAME }}/basic + cp -a build-simd/install ${{ env.PACKAGENAME }}/simd + cp -a build-threads/install ${{ env.PACKAGENAME }}/threads + cp -a build-simd-threads/install ${{ env.PACKAGENAME }}/simd-threads rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - name: upload-zip @@ -2372,692 +2036,96 @@ jobs: name: ${{ env.PACKAGENAME }} path: ${{ env.PACKAGENAME }}.zip - android-gpu: + windows: needs: [setup] - runs-on: ubuntu-latest + strategy: + matrix: + opt: + - { shared-lib: OFF, os: windows-2019, toolset-version: v140, id: vs2015 } + - { shared-lib: OFF, os: windows-2019, toolset-version: v141, id: vs2017 } + - { shared-lib: OFF, os: windows-2019, toolset-version: v142, id: vs2019 } + - { shared-lib: OFF, os: windows-2022, toolset-version: v143, id: vs2022 } + - { shared-lib: ON, os: windows-2019, toolset-version: v140, id: vs2015-shared } + - { shared-lib: ON, os: windows-2019, toolset-version: v141, id: vs2017-shared } + - { shared-lib: ON, os: windows-2019, toolset-version: v142, id: vs2019-shared } + - { shared-lib: ON, os: windows-2022, toolset-version: v143, id: vs2022-shared } + runs-on: ${{ matrix.opt.os }} env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-android-vulkan + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-${{ matrix.opt.id }} + UseMultiToolTask: true + NCNN_CMAKE_OPTIONS: | + -T ${{ matrix.opt.toolset-version }},host=x64 ` + -DCMAKE_BUILD_TYPE=Release ` + -DCMAKE_INSTALL_PREFIX=install ` + -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" ` + -DNCNN_BUILD_EXAMPLES=OFF ` + -DNCNN_BUILD_TOOLS=ON ` + -DNCNN_BUILD_BENCHMARK=OFF ` + -DNCNN_VULKAN=ON ` + -DNCNN_SHARED_LIB=${{ matrix.opt.shared-lib }} ` + steps: - uses: actions/checkout@v4 with: submodules: true - - name: ndk-fix-debug - run: sed -i -e '/^ -g$/d' $ANDROID_NDK_LATEST_HOME/build/cmake/android-legacy.toolchain.cmake - - name: build-armv7 - run: | - mkdir build-armv7 && cd build-armv7 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j $(nproc) - cmake --build . --target install/strip - - name: build-aarch64 + - name: cache-protobuf + id: cache-protobuf + uses: actions/cache@v4 + with: + path: "protobuf-install" + key: protobuf-${{ matrix.opt.toolset-version }}-x86-x64-install + - name: protobuf + if: steps.cache-protobuf.outputs.cache-hit != 'true' run: | - mkdir build-aarch64 && cd build-aarch64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j $(nproc) - cmake --build . 
--target install/strip + Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip + 7z x ./protobuf-3.11.2.zip + cd protobuf-3.11.2 + mkdir build-x86; cd build-x86; + cmake -T ${{ matrix.opt.toolset-version }},host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake + cmake --build . --config Release -j 4 + cmake --build . --config Release --target install + cd .. + mkdir build-x64; cd build-x64; + cmake -T ${{ matrix.opt.toolset-version }},host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake + cmake --build . --config Release -j 4 + cmake --build . --config Release --target install - name: build-x86 run: | - mkdir build-x86 && cd build-x86 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j $(nproc) - cmake --build . --target install/strip - - name: build-x86_64 + mkdir build-x86; cd build-x86 + cmake ${{ env.NCNN_CMAKE_OPTIONS }} -A Win32 -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" .. + cmake --build . --config Release -j 4 + cmake --build . --config Release --target install + - name: build-x64 run: | - mkdir build-x86_64 && cd build-x86_64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j $(nproc) - cmake --build . --target install/strip - - name: package - run: | - rm -rf ${{ env.PACKAGENAME }} - mkdir -p ${{ env.PACKAGENAME }} - cp -a build-armv7/install ${{ env.PACKAGENAME }}/armeabi-v7a - cp -a build-aarch64/install ${{ env.PACKAGENAME }}/arm64-v8a - cp -a build-x86/install ${{ env.PACKAGENAME }}/x86 - cp -a build-x86_64/install ${{ env.PACKAGENAME }}/x86_64 - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - android-gpu-shared: - needs: [setup] - runs-on: ubuntu-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-android-vulkan-shared - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: ndk-fix-debug - run: sed -i -e '/^ -g$/d' $ANDROID_NDK_LATEST_HOME/build/cmake/android-legacy.toolchain.cmake - - name: build-armv7 - run: | - mkdir build-armv7 && cd build-armv7 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . -j $(nproc) - cmake --build . 
--target install/strip - - name: build-aarch64 - run: | - mkdir build-aarch64 && cd build-aarch64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . -j $(nproc) - cmake --build . --target install/strip - - name: build-x86 - run: | - mkdir build-x86 && cd build-x86 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . -j $(nproc) - cmake --build . --target install/strip - - name: build-x86_64 - run: | - mkdir build-x86_64 && cd build-x86_64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . -j $(nproc) - cmake --build . --target install/strip - - name: package - run: | - rm -rf ${{ env.PACKAGENAME }} - mkdir -p ${{ env.PACKAGENAME }} - cp -a build-armv7/install ${{ env.PACKAGENAME }}/armeabi-v7a - cp -a build-aarch64/install ${{ env.PACKAGENAME }}/arm64-v8a - cp -a build-x86/install ${{ env.PACKAGENAME }}/x86 - cp -a build-x86_64/install ${{ env.PACKAGENAME }}/x86_64 - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - webassembly: - needs: [setup] - runs-on: ubuntu-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-webassembly - steps: - - uses: actions/checkout@v4 - - name: emsdk - run: | - git clone https://github.com/emscripten-core/emsdk.git - cd emsdk - ./emsdk install $EMSCRIPTEN_VERSION - ./emsdk activate $EMSCRIPTEN_VERSION - - name: build - run: | - source emsdk/emsdk_env.sh - mkdir build && cd build - cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ - -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j $(nproc) - cmake --build . 
--target install/strip - - name: build-simd - run: | - source emsdk/emsdk_env.sh - mkdir build-simd && cd build-simd - cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ - -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j $(nproc) - cmake --build . --target install/strip - - name: build-threads - run: | - source emsdk/emsdk_env.sh - mkdir build-threads && cd build-threads - cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_THREADS=ON -DNCNN_OPENMP=ON -DNCNN_SIMPLEOMP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ - -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j $(nproc) - cmake --build . --target install/strip - - name: build-simd-threads - run: | - source emsdk/emsdk_env.sh - mkdir build-simd-threads && cd build-simd-threads - cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_THREADS=ON -DNCNN_OPENMP=ON -DNCNN_SIMPLEOMP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ - -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j $(nproc) - cmake --build . --target install/strip - - name: package - run: | - rm -rf ${{ env.PACKAGENAME }} - mkdir -p ${{ env.PACKAGENAME }} - cp -a build/install ${{ env.PACKAGENAME }}/basic - cp -a build-simd/install ${{ env.PACKAGENAME }}/simd - cp -a build-threads/install ${{ env.PACKAGENAME }}/threads - cp -a build-simd-threads/install ${{ env.PACKAGENAME }}/simd-threads - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2015: - needs: [setup] - runs-on: windows-2019 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2015 - UseMultiToolTask: true - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v4 - with: - path: "protobuf-install" - key: protobuf-vs2015-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v140,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - cd .. 
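The windows-vs20xx jobs being removed around this point are replaced by the windows matrix job defined above, which additionally folds the MSVC toolset and runner image into the matrix; the protobuf cache key is derived from the toolset so every compiler keeps its own cached build. A short sketch under the same assumptions (two of the eight matrix entries shown, the echo body is illustrative, the cache step mirrors the diff):

  windows:
    strategy:
      matrix:
        opt:
          - { shared-lib: OFF, os: windows-2019, toolset-version: v140, id: vs2015 }
          - { shared-lib: ON,  os: windows-2022, toolset-version: v143, id: vs2022-shared }
    runs-on: ${{ matrix.opt.os }}
    steps:
      # one cache per toolset; a hit skips rebuilding protobuf entirely
      - name: cache-protobuf
        id: cache-protobuf
        uses: actions/cache@v4
        with:
          path: "protobuf-install"
          key: protobuf-${{ matrix.opt.toolset-version }}-x86-x64-install
      - name: protobuf
        if: steps.cache-protobuf.outputs.cache-hit != 'true'
        run: echo build protobuf with toolset ${{ matrix.opt.toolset-version }}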
- mkdir build-x64; cd build-x64; - cmake -T v140,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x86 - run: | - mkdir build-x86; cd build-x86 - cmake -T v140,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x64 - run: | - mkdir build-x64; cd build-x64 - cmake -T v140,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: package - run: | - mkdir ${{ env.PACKAGENAME }} - mkdir ${{ env.PACKAGENAME }}/x86 - mkdir ${{ env.PACKAGENAME }}/x64 - Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" - Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" - 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2015-shared: - needs: [setup] - runs-on: windows-2019 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2015-shared - UseMultiToolTask: true - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v4 - with: - path: "protobuf-install" - key: protobuf-vs2015-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v140,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - cd .. - mkdir build-x64; cd build-x64; - cmake -T v140,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x86 - run: | - mkdir build-x86; cd build-x86 - cmake -T v140,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . 
--config Release --target install - - name: build-x64 - run: | - mkdir build-x64; cd build-x64 - cmake -T v140,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: package - run: | - mkdir ${{ env.PACKAGENAME }} - mkdir ${{ env.PACKAGENAME }}/x86 - mkdir ${{ env.PACKAGENAME }}/x64 - Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" - Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" - 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2017: - needs: [setup] - runs-on: windows-2019 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2017 - UseMultiToolTask: true - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v4 - with: - path: "protobuf-install" - key: protobuf-vs2017-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v141,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - cd .. - mkdir build-x64; cd build-x64; - cmake -T v141,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x86 - run: | - mkdir build-x86; cd build-x86 - cmake -T v141,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x64 - run: | - mkdir build-x64; cd build-x64 - cmake -T v141,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . 
--config Release --target install - - name: package - run: | - mkdir ${{ env.PACKAGENAME }} - mkdir ${{ env.PACKAGENAME }}/x86 - mkdir ${{ env.PACKAGENAME }}/x64 - Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" - Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" - 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2017-shared: - needs: [setup] - runs-on: windows-2019 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2017-shared - UseMultiToolTask: true - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v4 - with: - path: "protobuf-install" - key: protobuf-vs2017-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v141,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - cd .. - mkdir build-x64; cd build-x64; - cmake -T v141,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x86 - run: | - mkdir build-x86; cd build-x86 - cmake -T v141,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x64 - run: | - mkdir build-x64; cd build-x64 - cmake -T v141,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . 
--config Release --target install - - name: package - run: | - mkdir ${{ env.PACKAGENAME }} - mkdir ${{ env.PACKAGENAME }}/x86 - mkdir ${{ env.PACKAGENAME }}/x64 - Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" - Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" - 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2019: - needs: [setup] - runs-on: windows-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2019 - UseMultiToolTask: true - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v4 - with: - path: "protobuf-install" - key: protobuf-vs2019-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v142,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - cd .. - mkdir build-x64; cd build-x64; - cmake -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x86 - run: | - mkdir build-x86; cd build-x86 - cmake -T v142,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x64 - run: | - mkdir build-x64; cd build-x64 - cmake -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-arm - run: | - mkdir build-arm; cd build-arm - cmake -T v142,host=x64 -A arm -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-arm64 - run: | - mkdir build-arm64; cd build-arm64 - cmake -T v142,host=x64 -A arm64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . 
--config Release --target install - - name: package - run: | - mkdir ${{ env.PACKAGENAME }} - mkdir ${{ env.PACKAGENAME }}/x86 - mkdir ${{ env.PACKAGENAME }}/x64 - mkdir ${{ env.PACKAGENAME }}/arm - mkdir ${{ env.PACKAGENAME }}/arm64 - Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" - Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" - Copy-Item -Verbose -Recurse -Path "build-arm\install\*" -Destination "${{ env.PACKAGENAME }}\arm" - Copy-Item -Verbose -Recurse -Path "build-arm64\install\*" -Destination "${{ env.PACKAGENAME }}\arm64" - 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2019-shared: - needs: [setup] - runs-on: windows-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2019-shared - UseMultiToolTask: true - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v4 - with: - path: "protobuf-install" - key: protobuf-vs2019-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v142,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - cd .. - mkdir build-x64; cd build-x64; - cmake -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x86 - run: | - mkdir build-x86; cd build-x86 - cmake -T v142,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x64 - run: | - mkdir build-x64; cd build-x64 - cmake -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-arm - run: | - mkdir build-arm; cd build-arm - cmake -T v142,host=x64 -A arm -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . 
--config Release --target install - - name: build-arm64 - run: | - mkdir build-arm64; cd build-arm64 - cmake -T v142,host=x64 -A arm64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: package - run: | - mkdir ${{ env.PACKAGENAME }} - mkdir ${{ env.PACKAGENAME }}/x86 - mkdir ${{ env.PACKAGENAME }}/x64 - mkdir ${{ env.PACKAGENAME }}/arm - mkdir ${{ env.PACKAGENAME }}/arm64 - Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" - Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" - Copy-Item -Verbose -Recurse -Path "build-arm\install\*" -Destination "${{ env.PACKAGENAME }}\arm" - Copy-Item -Verbose -Recurse -Path "build-arm64\install\*" -Destination "${{ env.PACKAGENAME }}\arm64" - 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2022: - needs: [setup] - runs-on: windows-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2022 - UseMultiToolTask: true - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v4 - with: - path: "protobuf-install" - key: protobuf-vs2022-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v143,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - cd .. - mkdir build-x64; cd build-x64; - cmake -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x86 - run: | - mkdir build-x86; cd build-x86 - cmake -T v143,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x64 - run: | - mkdir build-x64; cd build-x64 - cmake -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . 
--config Release --target install - - name: build-arm - run: | - mkdir build-arm; cd build-arm - cmake -T v143,host=x64 -A arm -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-arm64 - run: | - mkdir build-arm64; cd build-arm64 - cmake -T v143,host=x64 -A arm64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install + mkdir build-x64; cd build-x64 + cmake ${{ env.NCNN_CMAKE_OPTIONS }} -A x64 -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" .. + cmake --build . --config Release -j 4 + cmake --build . --config Release --target install + - name: build-arm + if: matrix.opt.toolset-version == 'v142' || matrix.opt.toolset-version == 'v143' + run: | + mkdir build-arm; cd build-arm + cmake ${{ env.NCNN_CMAKE_OPTIONS }} -A arm .. + cmake --build . --config Release -j 4 + cmake --build . --config Release --target install + - name: build-arm64 + if: matrix.opt.toolset-version == 'v142' || matrix.opt.toolset-version == 'v143' + run: | + mkdir build-arm64; cd build-arm64 + cmake ${{ env.NCNN_CMAKE_OPTIONS }} -A arm64 .. + cmake --build . --config Release -j 4 + cmake --build . --config Release --target install - name: package + if: matrix.opt.toolset-version == 'v140' || matrix.opt.toolset-version == 'v141' run: | mkdir ${{ env.PACKAGENAME }} mkdir ${{ env.PACKAGENAME }}/x86 mkdir ${{ env.PACKAGENAME }}/x64 - mkdir ${{ env.PACKAGENAME }}/arm - mkdir ${{ env.PACKAGENAME }}/arm64 Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" - Copy-Item -Verbose -Recurse -Path "build-arm\install\*" -Destination "${{ env.PACKAGENAME }}\arm" - Copy-Item -Verbose -Recurse -Path "build-arm64\install\*" -Destination "${{ env.PACKAGENAME }}\arm64" 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2022-shared: - needs: [setup] - runs-on: windows-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2022-shared - UseMultiToolTask: true - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v4 - with: - path: "protobuf-install" - key: protobuf-vs2022-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v143,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - cd .. 
- mkdir build-x64; cd build-x64; - cmake -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x86 - run: | - mkdir build-x86; cd build-x86 - cmake -T v143,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x64 - run: | - mkdir build-x64; cd build-x64 - cmake -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-arm - run: | - mkdir build-arm; cd build-arm - cmake -T v143,host=x64 -A arm -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-arm64 - run: | - mkdir build-arm64; cd build-arm64 - cmake -T v143,host=x64 -A arm64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . 
--config Release --target install - name: package + if: matrix.opt.toolset-version == 'v142' || matrix.opt.toolset-version == 'v143' run: | mkdir ${{ env.PACKAGENAME }} mkdir ${{ env.PACKAGENAME }}/x86 @@ -3087,30 +2155,49 @@ jobs: with: path: artifacts - - name: create-xcframwork + - name: unzip run: | - mkdir -p ncnn-macos mkdir -p ncnn-ios + mkdir -p ncnn-ios-vulkan mkdir -p ncnn-ios-simulator + mkdir -p ncnn-ios-simulator-vulkan mkdir -p ncnn-mac-catalyst - mkdir -p ncnn-watchos - mkdir -p ncnn-watchos-simulator + mkdir -p ncnn-mac-catalyst-vulkan + mkdir -p ncnn-macos + mkdir -p ncnn-macos-vulkan mkdir -p ncnn-tvos + mkdir -p ncnn-tvos-vulkan mkdir -p ncnn-tvos-simulator + mkdir -p ncnn-tvos-simulator-vulkan mkdir -p ncnn-visionos + mkdir -p ncnn-visionos-vulkan mkdir -p ncnn-visionos-simulator + mkdir -p ncnn-visionos-simulator-vulkan + mkdir -p ncnn-watchos + mkdir -p ncnn-watchos-simulator - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-macos/ncnn-${{ needs.setup.outputs.VERSION }}-macos.zip -d ncnn-macos unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-ios/ncnn-${{ needs.setup.outputs.VERSION }}-ios.zip -d ncnn-ios + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-ios-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-ios-vulkan.zip -d ncnn-ios-vulkan unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-ios-simulator/ncnn-${{ needs.setup.outputs.VERSION }}-ios-simulator.zip -d ncnn-ios-simulator + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-ios-simulator-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-ios-simulator-vulkan.zip -d ncnn-ios-simulator-vulkan unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-mac-catalyst/ncnn-${{ needs.setup.outputs.VERSION }}-mac-catalyst.zip -d ncnn-mac-catalyst - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-watchos/ncnn-${{ needs.setup.outputs.VERSION }}-watchos.zip -d ncnn-watchos - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-watchos-simulator/ncnn-${{ needs.setup.outputs.VERSION }}-watchos-simulator.zip -d ncnn-watchos-simulator + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-mac-catalyst-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-mac-catalyst-vulkan.zip -d ncnn-mac-catalyst-vulkan + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-macos/ncnn-${{ needs.setup.outputs.VERSION }}-macos.zip -d ncnn-macos + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-macos-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-macos-vulkan.zip -d ncnn-macos-vulkan unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-tvos/ncnn-${{ needs.setup.outputs.VERSION }}-tvos.zip -d ncnn-tvos + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-vulkan.zip -d ncnn-tvos-vulkan unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-simulator/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-simulator.zip -d ncnn-tvos-simulator + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-simulator-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-simulator-vulkan.zip -d ncnn-tvos-simulator-vulkan unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-visionos/ncnn-${{ needs.setup.outputs.VERSION }}-visionos.zip -d ncnn-visionos + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-visionos-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-visionos-vulkan.zip -d ncnn-visionos-vulkan unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-visionos-simulator/ncnn-${{ 
needs.setup.outputs.VERSION }}-visionos-simulator.zip -d ncnn-visionos-simulator + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-visionos-simulator-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-visionos-simulator-vulkan.zip -d ncnn-visionos-simulator-vulkan + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-watchos/ncnn-${{ needs.setup.outputs.VERSION }}-watchos.zip -d ncnn-watchos + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-watchos-simulator/ncnn-${{ needs.setup.outputs.VERSION }}-watchos-simulator.zip -d ncnn-watchos-simulator + - name: create-xcframwork + run: | + rm -rf openmp.xcframework xcodebuild -create-xcframework \ -framework ncnn-macos/openmp.framework \ -framework ncnn-ios/openmp.framework \ @@ -3124,6 +2211,7 @@ jobs: -framework ncnn-visionos-simulator/openmp.framework \ -output openmp.xcframework + rm -rf ncnn.xcframework xcodebuild -create-xcframework \ -framework ncnn-macos/ncnn.framework \ -framework ncnn-ios/ncnn.framework \ @@ -3139,48 +2227,9 @@ jobs: rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.xcframework ncnn.xcframework - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - apple-gpu: - needs: [setup, macos-gpu, ios-gpu, ios-simulator-gpu, mac-catalyst-gpu, watchos, watchos-simulator, tvos-gpu, tvos-simulator-gpu, visionos, visionos-simulator] - runs-on: macos-13 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-apple-vulkan - steps: - - run: sudo xcode-select --switch /Applications/Xcode_15.2.app - - name: download - uses: actions/download-artifact@v4 - with: - path: artifacts - - - name: create-xcframwork + - name: create-xcframwork-vulkan run: | - mkdir -p ncnn-macos-vulkan - mkdir -p ncnn-ios-vulkan - mkdir -p ncnn-ios-simulator-vulkan - mkdir -p ncnn-mac-catalyst-vulkan - mkdir -p ncnn-watchos - mkdir -p ncnn-watchos-simulator - mkdir -p ncnn-tvos-vulkan - mkdir -p ncnn-tvos-simulator-vulkan - mkdir -p ncnn-visionos - mkdir -p ncnn-visionos-simulator - - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-macos-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-macos-vulkan.zip -d ncnn-macos-vulkan - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-ios-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-ios-vulkan.zip -d ncnn-ios-vulkan - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-ios-simulator-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-ios-simulator-vulkan.zip -d ncnn-ios-simulator-vulkan - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-mac-catalyst-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-mac-catalyst-vulkan.zip -d ncnn-mac-catalyst-vulkan - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-watchos/ncnn-${{ needs.setup.outputs.VERSION }}-watchos.zip -d ncnn-watchos - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-watchos-simulator/ncnn-${{ needs.setup.outputs.VERSION }}-watchos-simulator.zip -d ncnn-watchos-simulator - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-vulkan.zip -d ncnn-tvos-vulkan - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-simulator-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-simulator-vulkan.zip -d ncnn-tvos-simulator-vulkan - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-visionos/ncnn-${{ needs.setup.outputs.VERSION }}-visionos.zip -d ncnn-visionos - unzip -q artifacts/ncnn-${{ 
needs.setup.outputs.VERSION }}-visionos-simulator/ncnn-${{ needs.setup.outputs.VERSION }}-visionos-simulator.zip -d ncnn-visionos-simulator - + rm -rf openmp.xcframework xcodebuild -create-xcframework \ -framework ncnn-macos-vulkan/openmp.framework \ -framework ncnn-ios-vulkan/openmp.framework \ @@ -3194,6 +2243,7 @@ jobs: -framework ncnn-visionos-simulator/openmp.framework \ -output openmp.xcframework + rm -rf glslang.xcframework xcodebuild -create-xcframework \ -framework ncnn-macos-vulkan/glslang.framework \ -framework ncnn-ios-vulkan/glslang.framework \ @@ -3201,8 +2251,11 @@ jobs: -framework ncnn-mac-catalyst-vulkan/glslang.framework \ -framework ncnn-tvos-vulkan/glslang.framework \ -framework ncnn-tvos-simulator-vulkan/glslang.framework \ + -framework ncnn-visionos-vulkan/glslang.framework \ + -framework ncnn-visionos-simulator-vulkan/glslang.framework \ -output glslang.xcframework + rm -rf ncnn.xcframework xcodebuild -create-xcframework \ -framework ncnn-macos-vulkan/ncnn.framework \ -framework ncnn-ios-vulkan/ncnn.framework \ @@ -3212,22 +2265,27 @@ jobs: -framework ncnn-watchos-simulator/ncnn.framework \ -framework ncnn-tvos-vulkan/ncnn.framework \ -framework ncnn-tvos-simulator-vulkan/ncnn.framework \ - -framework ncnn-visionos/ncnn.framework \ - -framework ncnn-visionos-simulator/ncnn.framework \ + -framework ncnn-visionos-vulkan/ncnn.framework \ + -framework ncnn-visionos-simulator-vulkan/ncnn.framework \ -output ncnn.xcframework - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.xcframework glslang.xcframework ncnn.xcframework + rm -f ${{ env.PACKAGENAME }}-vulkan.zip + zip -9 -y -r ${{ env.PACKAGENAME }}-vulkan.zip openmp.xcframework glslang.xcframework ncnn.xcframework - name: upload-zip uses: actions/upload-artifact@v4 with: name: ${{ env.PACKAGENAME }} path: ${{ env.PACKAGENAME }}.zip + - name: upload-zip-vulkan + uses: actions/upload-artifact@v4 + with: + name: ${{ env.PACKAGENAME }}-vulkan + path: ${{ env.PACKAGENAME }}-vulkan.zip release: permissions: contents: write # for softprops/action-gh-release to create a release - needs: [setup, full-source, ubuntu-2004, ubuntu-2004-shared, ubuntu-2204, ubuntu-2204-shared, macos, macos-gpu, ios, ios-gpu, ios-simulator, ios-simulator-gpu, mac-catalyst, mac-catalyst-gpu, watchos, watchos-simulator, tvos, tvos-simulator, android, android-shared, android-gpu, android-gpu-shared, webassembly, windows-vs2015, windows-vs2015-shared, windows-vs2017, windows-vs2017-shared, windows-vs2019, windows-vs2019-shared, windows-vs2022, windows-vs2022-shared, apple, apple-gpu] + needs: [setup, full-source, ubuntu, macos, ios, ios-simulator, mac-catalyst, watchos, watchos-simulator, tvos, tvos-simulator, android, webassembly, windows, apple] runs-on: ubuntu-latest steps: - name: download From c46278d0bb32914c438af3db86d0671402c87c67 Mon Sep 17 00:00:00 2001 From: nihui Date: Wed, 14 Aug 2024 11:51:39 +0800 Subject: [PATCH 19/38] pnnx convert onnx resize with roi, torch.max torch.min with dim returns tuple (#5627) * pnnx convert onnx resize with roi, torch.max torch.min with dim returns tuple * torch max min only support single dim --- tools/pnnx/src/ir.cpp | 9 + tools/pnnx/src/pass_level2/F_interpolate.cpp | 177 +++++++++--------- tools/pnnx/src/pass_level2/torch_max.cpp | 13 +- tools/pnnx/src/pass_level2/torch_min.cpp | 13 +- tools/pnnx/src/pass_ncnn/torch_max.cpp | 16 ++ tools/pnnx/src/pass_ncnn/torch_min.cpp | 16 ++ .../pass_onnx/fuse_constant_as_attribute.cpp | 1 + 7 files changed, 155 insertions(+), 90 
deletions(-) diff --git a/tools/pnnx/src/ir.cpp b/tools/pnnx/src/ir.cpp index cacd84fde79..8b2b6dfd2d7 100644 --- a/tools/pnnx/src/ir.cpp +++ b/tools/pnnx/src/ir.cpp @@ -2111,6 +2111,15 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath) fprintf(pyfp, ", "); } + if (op->type == "torch.max" || op->type == "torch.min") + { + if (op->has_param("dim") && op->outputs.size() == 1) + { + // torch.max and torch.min with dim returns tuple + fprintf(pyfp, ", _"); + } + } + if (op->type.substr(0, 7) == "Tensor.") { if (op->type == "Tensor.fill") diff --git a/tools/pnnx/src/pass_level2/F_interpolate.cpp b/tools/pnnx/src/pass_level2/F_interpolate.cpp index b93bd2df6c8..119842b1c78 100644 --- a/tools/pnnx/src/pass_level2/F_interpolate.cpp +++ b/tools/pnnx/src/pass_level2/F_interpolate.cpp @@ -1005,7 +1005,7 @@ class F_interpolate_onnx : public GraphRewriterPass return R"PNNXIR(7767517 3 2 pnnx.Input input 0 1 input -Resize op_0 1 1 input out sizes=%sizes coordinate_transformation_mode=%coordinate_transformation_mode mode=%mode nearest_mode=floor cubic_coeff_a=* +Resize op_0 1 1 input out %*=%* pnnx.Output output 1 0 out )PNNXIR"; } @@ -1017,104 +1017,69 @@ pnnx.Output output 1 0 out bool match(const std::map& matched_operators, const std::map& captured_params, const std::map& /*captured_attrs*/) const { - if (captured_params.at("sizes").type != 5) + if (captured_params.find("op_0.coordinate_transformation_mode") == captured_params.end()) return false; - const std::vector& sizes = captured_params.at("sizes").ai; - - if (sizes.size() < 3 || sizes.size() > 5) + if (captured_params.at("op_0.coordinate_transformation_mode").type != 4) return false; - const std::vector& input_shape = matched_operators.at("op_0")->inputs[0]->shape; - if (input_shape.size() < 3 || input_shape.size() > 5) + if (captured_params.find("op_0.mode") == captured_params.end()) return false; - if (input_shape[0] != sizes[0] || input_shape[1] != sizes[1]) + if (captured_params.at("op_0.mode").type != 4) return false; - return true; - } - - void write(Operator* op, const std::map& captured_params) const - { - const std::string& coordinate_transformation_mode = captured_params.at("coordinate_transformation_mode").s; - std::string mode = captured_params.at("mode").s; - const std::vector& sizes = captured_params.at("sizes").ai; - - if (mode == "linear") + if (captured_params.find("op_0.nearest_mode") != captured_params.end()) { - if (coordinate_transformation_mode == "half_pixel") - op->params["align_corners"] = false; - if (coordinate_transformation_mode == "align_corners") - op->params["align_corners"] = true; - - if (sizes.size() == 4) - mode = "bilinear"; - if (sizes.size() == 5) - mode = "trilinear"; + if (captured_params.at("op_0.nearest_mode").type != 4 || captured_params.at("op_0.nearest_mode").s != "floor") + return false; } - if (mode == "cubic") + if (captured_params.find("op_0.roi") != captured_params.end()) { - if (coordinate_transformation_mode == "half_pixel") - op->params["align_corners"] = false; - if (coordinate_transformation_mode == "align_corners") - op->params["align_corners"] = true; - - mode = "bicubic"; + if (captured_params.at("op_0.roi").type != 6 || !captured_params.at("op_0.roi").ai.empty()) + return false; } - op->params["mode"] = mode; - if (sizes.size() == 3) - op->params["size"] = {sizes[2]}; - if (sizes.size() == 4) - op->params["size"] = {sizes[2], sizes[3]}; - if (sizes.size() == 5) - op->params["size"] = {sizes[2], sizes[3], sizes[4]}; - } }; + if
(captured_params.find("op_0.sizes") == captured_params.end() && captured_params.find("op_0.scales") == captured_params.end()) + return false; -REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_interpolate_onnx, 10) + if (captured_params.find("op_0.sizes") != captured_params.end() && captured_params.at("op_0.sizes").type == 5 && !captured_params.at("op_0.sizes").ai.empty()) + { + const std::vector& sizes = captured_params.at("op_0.sizes").ai; -class F_interpolate_onnx_1 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -3 2 -pnnx.Input input 0 1 input -Resize op_0 1 1 input out scales=%scales coordinate_transformation_mode=%coordinate_transformation_mode mode=%mode nearest_mode=floor cubic_coeff_a=* -pnnx.Output output 1 0 out -)PNNXIR"; - } + if (sizes.size() < 3 || sizes.size() > 5) + return false; - const char* type_str() const - { - return "F.interpolate"; - } + const std::vector& input_shape = matched_operators.at("op_0")->inputs[0]->shape; + if (input_shape.size() < 3 || input_shape.size() > 5) + return false; - bool match(const std::map& captured_params) const - { - if (captured_params.at("scales").type != 6) - return false; - - const std::vector& scales = captured_params.at("scales").af; + if (input_shape[0] != sizes[0] || input_shape[1] != sizes[1]) + return false; + } + else if (captured_params.find("op_0.scales") != captured_params.end() && captured_params.at("op_0.scales").type == 6 && !captured_params.at("op_0.scales").af.empty()) + { + const std::vector& scales = captured_params.at("op_0.scales").af; - if (scales.size() < 3 || scales.size() > 5) - return false; + if (scales.size() < 3 || scales.size() > 5) + return false; - if (scales[0] != 1.f || scales[1] != 1.f) + if (scales[0] != 1.f || scales[1] != 1.f) + return false; + } + else + { return false; + } return true; } void write(Operator* op, const std::map& captured_params) const { - const std::string& coordinate_transformation_mode = captured_params.at("coordinate_transformation_mode").s; - std::string mode = captured_params.at("mode").s; - const std::vector& scales = captured_params.at("scales").af; + const std::string& coordinate_transformation_mode = captured_params.at("op_0.coordinate_transformation_mode").s; + std::string mode = captured_params.at("op_0.mode").s; if (mode == "linear") { @@ -1122,11 +1087,6 @@ pnnx.Output output 1 0 out op->params["align_corners"] = false; if (coordinate_transformation_mode == "align_corners") op->params["align_corners"] = true; - - if (scales.size() == 4) - mode = "bilinear"; - if (scales.size() == 5) - mode = "trilinear"; } if (mode == "cubic") @@ -1135,22 +1095,63 @@ pnnx.Output output 1 0 out op->params["align_corners"] = false; if (coordinate_transformation_mode == "align_corners") op->params["align_corners"] = true; - - mode = "bicubic"; } - op->params["mode"] = mode; - op->params["recompute_scale_factor"] = false; - if (scales.size() == 3) - op->params["scale_factor"] = {scales[2]}; - if (scales.size() == 4) - op->params["scale_factor"] = {scales[2], scales[3]}; - if (scales.size() == 5) - op->params["scale_factor"] = {scales[2], scales[3], scales[4]}; + if (captured_params.find("op_0.sizes") != captured_params.end() && captured_params.at("op_0.sizes").type == 5 && !captured_params.at("op_0.sizes").ai.empty()) + { + const std::vector& sizes = captured_params.at("op_0.sizes").ai; + + if (mode == "linear") + { + if (sizes.size() == 4) + mode = "bilinear"; + if (sizes.size() == 5) + mode = "trilinear"; + } + + if (mode == 
"cubic") + { + mode = "bicubic"; + } + + op->params["mode"] = mode; + if (sizes.size() == 3) + op->params["size"] = {sizes[2]}; + if (sizes.size() == 4) + op->params["size"] = {sizes[2], sizes[3]}; + if (sizes.size() == 5) + op->params["size"] = {sizes[2], sizes[3], sizes[4]}; + } + else if (captured_params.find("op_0.scales") != captured_params.end() && captured_params.at("op_0.scales").type == 6 && !captured_params.at("op_0.scales").af.empty()) + { + const std::vector& scales = captured_params.at("op_0.scales").af; + + if (mode == "linear") + { + if (scales.size() == 4) + mode = "bilinear"; + if (scales.size() == 5) + mode = "trilinear"; + } + + if (mode == "cubic") + { + mode = "bicubic"; + } + + op->params["mode"] = mode; + op->params["recompute_scale_factor"] = false; + if (scales.size() == 3) + op->params["scale_factor"] = {scales[2]}; + if (scales.size() == 4) + op->params["scale_factor"] = {scales[2], scales[3]}; + if (scales.size() == 5) + op->params["scale_factor"] = {scales[2], scales[3], scales[4]}; + } } }; -REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_interpolate_onnx_1, 10) +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_interpolate_onnx, 10) class F_interpolate_onnx_2 : public GraphRewriterPass { diff --git a/tools/pnnx/src/pass_level2/torch_max.cpp b/tools/pnnx/src/pass_level2/torch_max.cpp index b606fed066b..eef7f33b4d0 100644 --- a/tools/pnnx/src/pass_level2/torch_max.cpp +++ b/tools/pnnx/src/pass_level2/torch_max.cpp @@ -78,11 +78,22 @@ pnnx.Output output 1 0 out return "torch.max"; } + bool match(const std::map& captured_params) const + { + if (captured_params.find("op_0.axes") != captured_params.end()) + { + if (captured_params.at("op_0.axes").type != 5 || captured_params.at("op_0.axes").ai.size() != 1) + return false; + } + + return true; + } + void write(Operator* op, const std::map& captured_params) const { if (captured_params.find("op_0.axes") != captured_params.end()) { - op->params["dim"] = captured_params.at("op_0.axes"); + op->params["dim"] = captured_params.at("op_0.axes").ai[0]; if (captured_params.find("op_0.keepdims") != captured_params.end()) { diff --git a/tools/pnnx/src/pass_level2/torch_min.cpp b/tools/pnnx/src/pass_level2/torch_min.cpp index 35cc4988a19..509b858c1c1 100644 --- a/tools/pnnx/src/pass_level2/torch_min.cpp +++ b/tools/pnnx/src/pass_level2/torch_min.cpp @@ -78,11 +78,22 @@ pnnx.Output output 1 0 out return "torch.min"; } + bool match(const std::map& captured_params) const + { + if (captured_params.find("op_0.axes") != captured_params.end()) + { + if (captured_params.at("op_0.axes").type != 5 || captured_params.at("op_0.axes").ai.size() != 1) + return false; + } + + return true; + } + void write(Operator* op, const std::map& captured_params) const { if (captured_params.find("op_0.axes") != captured_params.end()) { - op->params["dim"] = captured_params.at("op_0.axes"); + op->params["dim"] = captured_params.at("op_0.axes").ai[0]; if (captured_params.find("op_0.keepdims") != captured_params.end()) { diff --git a/tools/pnnx/src/pass_ncnn/torch_max.cpp b/tools/pnnx/src/pass_ncnn/torch_max.cpp index 76cd33f239b..95987da5162 100644 --- a/tools/pnnx/src/pass_ncnn/torch_max.cpp +++ b/tools/pnnx/src/pass_ncnn/torch_max.cpp @@ -65,6 +65,22 @@ pnnx.Output output 2 0 out indices REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(torch_max, 20) +class torch_max_0 : public torch_max +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +torch.max op_0 1 1 input out dim=%dim keepdim=%keepdim +pnnx.Output 
output 1 0 out +)PNNXIR"; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(torch_max_0, 20) + class torch_max_1 : public GraphRewriterPass { public: diff --git a/tools/pnnx/src/pass_ncnn/torch_min.cpp b/tools/pnnx/src/pass_ncnn/torch_min.cpp index 49851b443db..3ef2ae47da0 100644 --- a/tools/pnnx/src/pass_ncnn/torch_min.cpp +++ b/tools/pnnx/src/pass_ncnn/torch_min.cpp @@ -65,6 +65,22 @@ pnnx.Output output 2 0 out indices REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(torch_min, 20) +class torch_min_0 : public torch_min +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +torch.min op_0 1 1 input out dim=%dim keepdim=%keepdim +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(torch_min_0, 20) + class torch_min_1 : public GraphRewriterPass { public: diff --git a/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.cpp b/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.cpp index aba88976233..39dc8d80882 100644 --- a/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.cpp +++ b/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.cpp @@ -43,6 +43,7 @@ static constant_as_attribute caas[] = { {"ReduceProd", 1, "axes"}, {"ReduceSum", 1, "axes"}, {"Reshape", 1, "shape"}, + {"Resize", 1, "roi"}, {"Resize", 2, "scales"}, {"Resize", 3, "sizes"}, {"Slice", 1, "starts"}, From eb6e084c2d6c036e7234d746035eb400c20a1756 Mon Sep 17 00:00:00 2001 From: nihui Date: Wed, 14 Aug 2024 15:51:57 +0800 Subject: [PATCH 20/38] pnnx convert nn.RMSNorm F.rms_norm (#5628) --- tools/pnnx/src/CMakeLists.txt | 3 + tools/pnnx/src/pass_level1/nn_RMSNorm.cpp | 51 ++++++++++++ tools/pnnx/src/pass_level2/F_rms_norm.cpp | 43 +++++++++++ tools/pnnx/src/pass_level5.cpp | 2 + .../src/pass_level5/fuse_static_rmsnorm.cpp | 57 ++++++++++++++ .../src/pass_level5/fuse_static_rmsnorm.h | 21 +++++ .../pnnx/src/pass_ncnn/solve_batch_index.cpp | 4 + tools/pnnx/tests/CMakeLists.txt | 2 + tools/pnnx/tests/test_F_rms_norm.py | 77 +++++++++++++++++++ tools/pnnx/tests/test_nn_RMSNorm.py | 71 +++++++++++++++++ 10 files changed, 331 insertions(+) create mode 100644 tools/pnnx/src/pass_level1/nn_RMSNorm.cpp create mode 100644 tools/pnnx/src/pass_level2/F_rms_norm.cpp create mode 100644 tools/pnnx/src/pass_level5/fuse_static_rmsnorm.cpp create mode 100644 tools/pnnx/src/pass_level5/fuse_static_rmsnorm.h create mode 100644 tools/pnnx/tests/test_F_rms_norm.py create mode 100644 tools/pnnx/tests/test_nn_RMSNorm.py diff --git a/tools/pnnx/src/CMakeLists.txt b/tools/pnnx/src/CMakeLists.txt index c5c6228dee7..9834fabe069 100644 --- a/tools/pnnx/src/CMakeLists.txt +++ b/tools/pnnx/src/CMakeLists.txt @@ -77,6 +77,7 @@ set(pnnx_pass_level1_SRCS pass_level1/nn_ReplicationPad1d.cpp pass_level1/nn_ReplicationPad2d.cpp pass_level1/nn_ReplicationPad3d.cpp + pass_level1/nn_RMSNorm.cpp pass_level1/nn_RNN.cpp pass_level1/nn_RReLU.cpp pass_level1/nn_SELU.cpp @@ -163,6 +164,7 @@ set(pnnx_pass_level2_SRCS pass_level2/F_prelu.cpp pass_level2/F_relu.cpp pass_level2/F_relu6.cpp + pass_level2/F_rms_norm.cpp pass_level2/F_rrelu.cpp pass_level2/F_scaled_dot_product_attention.cpp pass_level2/F_selu.cpp @@ -383,6 +385,7 @@ set(pnnx_pass_level5_SRCS pass_level5/fuse_static_layernorm.cpp pass_level5/fuse_static_linear.cpp pass_level5/fuse_static_prelu.cpp + pass_level5/fuse_static_rmsnorm.cpp pass_level5/normalize_einsum_equation.cpp pass_level5/unroll_rnn_op.cpp ) diff --git a/tools/pnnx/src/pass_level1/nn_RMSNorm.cpp b/tools/pnnx/src/pass_level1/nn_RMSNorm.cpp 
new file mode 100644 index 00000000000..4433f598935 --- /dev/null +++ b/tools/pnnx/src/pass_level1/nn_RMSNorm.cpp @@ -0,0 +1,51 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "pass_level1.h" + +#include "../utils.h" + +namespace pnnx { + +class RMSNorm : public FuseModulePass +{ +public: + const char* match_type_str() const + { + return "__torch__.torch.nn.modules.normalization.RMSNorm"; + } + + const char* type_str() const + { + return "nn.RMSNorm"; + } + + void write(Operator* op, const std::shared_ptr& graph, const torch::jit::Module& mod) const + { + const torch::jit::Node* rmsn = find_node_by_kind(graph, "aten::rms_norm"); + + op->params["normalized_shape"] = rmsn->namedInput("normalized_shape"); + op->params["eps"] = rmsn->namedInput("eps"); + op->params["elementwise_affine"] = mod.hasattr("weight") && mod.hasattr("bias"); + + if (mod.hasattr("weight")) + { + op->attrs["weight"] = mod.attr("weight").toTensor(); + } + } +}; + +REGISTER_GLOBAL_PNNX_FUSE_MODULE_PASS(RMSNorm) + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/F_rms_norm.cpp b/tools/pnnx/src/pass_level2/F_rms_norm.cpp new file mode 100644 index 00000000000..aaa1813c563 --- /dev/null +++ b/tools/pnnx/src/pass_level2/F_rms_norm.cpp @@ -0,0 +1,43 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
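+// Matches the aten::rms_norm subgraph produced by torch.jit tracing, with eps
+// captured from its prim::Constant input, and rewrites it to a single
+// F.rms_norm operator.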
+ +#include "pass_level2.h" + +namespace pnnx { + +class F_rms_norm : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +6 5 +pnnx.Input input_0 0 1 input +pnnx.Input input_1 0 1 weight +pnnx.Input input_2 0 1 normalized_shape +prim::Constant op_0 0 1 eps value=%eps +aten::rms_norm op_1 4 1 input normalized_shape weight eps out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "F.rms_norm"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_rms_norm, 10) + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5.cpp b/tools/pnnx/src/pass_level5.cpp index 4903f185117..8bb3270aa2c 100644 --- a/tools/pnnx/src/pass_level5.cpp +++ b/tools/pnnx/src/pass_level5.cpp @@ -60,6 +60,7 @@ #include "pass_level5/fuse_static_layernorm.h" #include "pass_level5/fuse_static_linear.h" #include "pass_level5/fuse_static_prelu.h" +#include "pass_level5/fuse_static_rmsnorm.h" #include "pass_level5/normalize_einsum_equation.h" #include "pass_level4/dead_code_elimination.h" #include "pass_level4/canonicalize.h" @@ -102,6 +103,7 @@ void pass_level5(Graph& g, const std::set& foldable_constants, cons fuse_static_groupnorm(g); fuse_static_instancenorm(g); fuse_static_layernorm(g); + fuse_static_rmsnorm(g); fuse_static_conv(g); fuse_static_convtranspose(g); diff --git a/tools/pnnx/src/pass_level5/fuse_static_rmsnorm.cpp b/tools/pnnx/src/pass_level5/fuse_static_rmsnorm.cpp new file mode 100644 index 00000000000..ed68c026d30 --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_static_rmsnorm.cpp @@ -0,0 +1,57 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
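+// Fuses F.rms_norm whose weight comes from a constant pnnx.Attribute into an
+// nn.RMSNorm module with elementwise_affine=True, carrying over
+// normalized_shape, eps and the captured weight data.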
+ +#include "fuse_static_rmsnorm.h" + +#include "pass_level2.h" + +#include +#include + +namespace pnnx { + +class fuse_static_Frmsnorm_pass : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +pnnx.Attribute op_weight 0 1 weight @data +F.rms_norm op_0 2 1 input weight out normalized_shape=%normalized_shape eps=%eps +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* replace_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +nn.RMSNorm rmsn 1 1 input out normalized_shape=%normalized_shape eps=%eps elementwise_affine=True @weight=%op_weight.data +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +void fuse_static_rmsnorm(Graph& graph) +{ + fuse_static_Frmsnorm_pass a; + int opindex = 0; + + pnnx_graph_rewrite(graph, &a, opindex); +} + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_static_rmsnorm.h b/tools/pnnx/src/pass_level5/fuse_static_rmsnorm.h new file mode 100644 index 00000000000..c88b703cb07 --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_static_rmsnorm.h @@ -0,0 +1,21 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
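+// Entry point of the static RMSNorm fusion, invoked from pass_level5.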
+ +#include "ir.h" + +namespace pnnx { + +void fuse_static_rmsnorm(Graph& graph); + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp b/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp index 4b1100789fc..d4532422b52 100644 --- a/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp +++ b/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp @@ -46,6 +46,7 @@ static bool is_known_operator_with_batch_index_0(const Operator* op) "F.group_norm", "F.instance_norm", "F.interpolate", + "F.layer_norm", "F.linear", "F.local_response_norm", "F.lp_pool1d", @@ -56,6 +57,7 @@ static bool is_known_operator_with_batch_index_0(const Operator* op) "F.pixel_shuffle", "F.pixel_unshuffle", "F.prelu", + "F.rms_norm", "F.scaled_dot_product_attention", "F.unfold", "F.upsample_bilinear", @@ -91,6 +93,7 @@ static bool is_known_operator_with_batch_index_0(const Operator* op) "nn.InstanceNorm2d", "nn.InstanceNorm3d", "nn.LocalResponseNorm", + "nn.LayerNorm", "nn.LPPool1d", "nn.LPPool2d", "nn.MaxPool1d", @@ -104,6 +107,7 @@ static bool is_known_operator_with_batch_index_0(const Operator* op) "nn.ReplicationPad1d", "nn.ReplicationPad2d", "nn.ReplicationPad3d", + "nn.RMSNorm", "nn.Softmax2d", "nn.Unfold", "nn.Upsample", diff --git a/tools/pnnx/tests/CMakeLists.txt b/tools/pnnx/tests/CMakeLists.txt index a5522a70bb2..daf5501e9d8 100644 --- a/tools/pnnx/tests/CMakeLists.txt +++ b/tools/pnnx/tests/CMakeLists.txt @@ -61,6 +61,7 @@ pnnx_add_test(F_pixel_unshuffle) pnnx_add_test(F_prelu) pnnx_add_test(F_relu) pnnx_add_test(F_relu6) +pnnx_add_test(F_rms_norm) pnnx_add_test(F_rrelu) pnnx_add_test(F_scaled_dot_product_attention) pnnx_add_test(F_selu) @@ -145,6 +146,7 @@ pnnx_add_test(nn_ReLU6) pnnx_add_test(nn_ReplicationPad1d) pnnx_add_test(nn_ReplicationPad2d) pnnx_add_test(nn_ReplicationPad3d) +pnnx_add_test(nn_RMSNorm) pnnx_add_test(nn_RNN) pnnx_add_test(nn_RReLU) pnnx_add_test(nn_SELU) diff --git a/tools/pnnx/tests/test_F_rms_norm.py b/tools/pnnx/tests/test_F_rms_norm.py new file mode 100644 index 00000000000..5dd9e699b23 --- /dev/null +++ b/tools/pnnx/tests/test_F_rms_norm.py @@ -0,0 +1,77 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
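+# Rough reference for what F.rms_norm computes over the trailing
+# normalized_shape dims (1-d case shown; a sketch, not executed by this test):
+#   rms = x.pow(2).mean(dim=-1, keepdim=True).add(eps).rsqrt()
+#   out = x * rms if weight is None else x * rms * weight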
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.w3 = nn.Parameter(torch.rand(24)) + self.w4 = nn.Parameter(torch.rand(12, 16)) + self.w5 = nn.Parameter(torch.rand(24)) + + def forward(self, x, y, z, w0, w1, w2): + x = F.rms_norm(x, (24,), w0) + x = F.rms_norm(x, (12,24), None) + x = F.rms_norm(x, (24,), self.w3) + + y = F.rms_norm(y, (16,), None, eps=1e-3) + y = F.rms_norm(y, (12,16), w1) + y = F.rms_norm(y, (12,16), self.w4) + + z = F.rms_norm(z, (24,), w2) + z = F.rms_norm(z, (12,16,24), None, eps=1e-2) + z = F.rms_norm(z, (24,), self.w5) + return x, y, z + +def test(): + if version.parse(torch.__version__) < version.parse('2.4'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12, 24) + y = torch.rand(2, 3, 12, 16) + z = torch.rand(1, 10, 12, 16, 24) + w0 = torch.rand(24) + w1 = torch.rand(12, 16) + w2 = torch.rand(24) + + a0, a1, a2 = net(x, y, z, w0, w1, w2) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z, w0, w1, w2)) + mod.save("test_F_rms_norm.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_F_rms_norm.pt inputshape=[1,12,24],[2,3,12,16],[1,10,12,16,24],[24],[12,16],[24]") + + # pnnx inference + import test_F_rms_norm_pnnx + b0, b1, b2 = test_F_rms_norm_pnnx.test_inference() + + return torch.equal(a0, b0) and torch.equal(a1, b1) and torch.equal(a2, b2) + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_nn_RMSNorm.py b/tools/pnnx/tests/test_nn_RMSNorm.py new file mode 100644 index 00000000000..a9b70cdb266 --- /dev/null +++ b/tools/pnnx/tests/test_nn_RMSNorm.py @@ -0,0 +1,71 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from packaging import version
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+        self.rmsn_0 = nn.RMSNorm(64)
+        self.rmsn_0.weight = nn.Parameter(torch.rand(64))
+        self.rmsn_1 = nn.RMSNorm(normalized_shape=(24,64), eps=1e-2, elementwise_affine=False)
+
+    def forward(self, x, y, z):
+        x = self.rmsn_0(x)
+        x = self.rmsn_1(x)
+
+        y = self.rmsn_0(y)
+        y = self.rmsn_1(y)
+
+        z = self.rmsn_0(z)
+        z = self.rmsn_1(z)
+        return x, y, z
+
+def test():
+    if version.parse(torch.__version__) < version.parse('2.4'):
+        return True
+
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 24, 64)
+    y = torch.rand(1, 12, 24, 64)
+    z = torch.rand(1, 12, 16, 24, 64)
+
+    a0, a1, a2 = net(x, y, z)
+
+    # export torchscript
+    mod = torch.jit.trace(net, (x, y, z))
+    mod.save("test_nn_RMSNorm.pt")
+
+    # torchscript to pnnx
+    import os
+    os.system("../src/pnnx test_nn_RMSNorm.pt inputshape=[1,24,64],[1,12,24,64],[1,12,16,24,64]")
+
+    # pnnx inference
+    import test_nn_RMSNorm_pnnx
+    b0, b1, b2 = test_nn_RMSNorm_pnnx.test_inference()
+
+    return torch.equal(a0, b0) and torch.equal(a1, b1) and torch.equal(a2, b2)
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
From abad90cb1cc9d2b5d47a22576bdcb391613ca209 Mon Sep 17 00:00:00 2001
From: nihui
Date: Wed, 14 Aug 2024 17:23:34 +0800
Subject: [PATCH 21/38] pnnx drop torch.max torch.min indice node if not used
 (#5629)

---
 tools/pnnx/src/pass_level2/torch_max.cpp | 12 ++++++++++++
 tools/pnnx/src/pass_level2/torch_min.cpp | 12 ++++++++++++
 2 files changed, 24 insertions(+)

diff --git a/tools/pnnx/src/pass_level2/torch_max.cpp b/tools/pnnx/src/pass_level2/torch_max.cpp
index eef7f33b4d0..5a993d6f55e 100644
--- a/tools/pnnx/src/pass_level2/torch_max.cpp
+++ b/tools/pnnx/src/pass_level2/torch_max.cpp
@@ -35,6 +35,18 @@ pnnx.Output output 2 0 out indices
     {
         return "torch.max";
     }
+
+    void write(Operator* op, const std::map<std::string, Parameter>& captured_params) const
+    {
+        GraphRewriterPass::write(op, captured_params);
+
+        // drop indices if not used
+        if (op->outputs[1]->consumers.empty())
+        {
+            op->outputs[1]->producer = 0;
+            op->outputs.resize(1);
+        }
+    }
 };
 
 REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(torch_max, 20)
diff --git a/tools/pnnx/src/pass_level2/torch_min.cpp b/tools/pnnx/src/pass_level2/torch_min.cpp
index 509b858c1c1..fa174614e01 100644
--- a/tools/pnnx/src/pass_level2/torch_min.cpp
+++ b/tools/pnnx/src/pass_level2/torch_min.cpp
@@ -35,6 +35,18 @@ pnnx.Output output 2 0 out indices
     {
         return "torch.min";
     }
+
+    void write(Operator* op, const std::map<std::string, Parameter>& captured_params) const
+    {
+        GraphRewriterPass::write(op, captured_params);
+
+        // drop indices if not used
+        if (op->outputs[1]->consumers.empty())
+        {
+            op->outputs[1]->producer = 0;
+            op->outputs.resize(1);
+        }
+    }
 };
 
 REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(torch_min, 20)
From fdf0df3079f779a19ffb0ef8b68e59916fbad1d8 Mon Sep 17 00:00:00 2001
From: nihui
Date: Thu, 15 Aug 2024 16:52:06 +0800
Subject: [PATCH 22/38] RMSNorm (#5630)

---
 docs/developer-guide/operators.md         |  21 +++
 src/CMakeLists.txt                        |   1 +
 src/layer/rmsnorm.cpp                     | 200 +++++++++++++++++++++
 src/layer/rmsnorm.h                       |  43 +++++
 tests/CMakeLists.txt                      |   1 +
 tests/test_rmsnorm.cpp                    | 121 +++++++++++++
 tools/pnnx/src/CMakeLists.txt             |   2 +
 tools/pnnx/src/pass_level1/nn_RMSNorm.cpp |   2 +-
 tools/pnnx/src/pass_ncnn/F_rms_norm.cpp   |  65 +++++++
 tools/pnnx/src/pass_ncnn/nn_RMSNorm.cpp   |  70 ++++++++
 tools/pnnx/tests/ncnn/CMakeLists.txt      |   2 +
 tools/pnnx/tests/ncnn/test_F_layer_norm.py |  6 +-
 tools/pnnx/tests/ncnn/test_F_rms_norm.py   | 68 +++++++
 tools/pnnx/tests/ncnn/test_nn_LayerNorm.py |  6 +-
 tools/pnnx/tests/ncnn/test_nn_RMSNorm.py   | 68 +++++++
 15 files changed, 669 insertions(+), 7 deletions(-)
 create mode 100644 src/layer/rmsnorm.cpp
 create mode 100644 src/layer/rmsnorm.h
 create mode 100644 tests/test_rmsnorm.cpp
 create mode 100644 tools/pnnx/src/pass_ncnn/F_rms_norm.cpp
 create mode 100644 tools/pnnx/src/pass_ncnn/nn_RMSNorm.cpp
 create mode 100644 tools/pnnx/tests/ncnn/test_F_rms_norm.py
 create mode 100644 tools/pnnx/tests/ncnn/test_nn_RMSNorm.py

diff --git a/docs/developer-guide/operators.md b/docs/developer-guide/operators.md
index 05996f8d735..7594c0843ac 100644
--- a/docs/developer-guide/operators.md
+++ b/docs/developer-guide/operators.md
@@ -71,6 +71,7 @@
 * [Reorg](#reorg)
 * [Requantize](#requantize)
 * [Reshape](#reshape)
+* [RMSNorm](#rmsnorm)
 * [RNN](#rnn)
 * [Scale](#scale)
 * [SELU](#selu)
@@ -1670,6 +1671,26 @@ Reshape flag:
 - -1 = remaining
 - -233 = drop this dim(default)
 
+# RMSNorm
+```
+split x along the outermost axis into parts x0, x1 ...
+root mean square normalize each part x0, x1 ...
+y = x * gamma elementwise
+```
+
+* one_blob_only
+* support_inplace
+
+| param id | name | type | default | description |
+| --------- | ------------- | ----- | --------- | ----------------- |
+| 0 | affine_size | int | 0 | |
+| 1 | eps | float | 0.001f | x = x / sqrt(mean(x^2) + eps) |
+| 2 | affine | int | 1 | |
+
+| weight | type | shape |
+| ------------- | ----- | --------------------- |
+| gamma_data | float | [affine_size] |
+
 # RNN
 Apply a single-layer RNN to a feature sequence of `T` timesteps. The input blob shape is `[w=input_size, h=T]` and the output blob shape is `[w=num_output, h=T]`.
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index d3f55ce7790..803c34a780d 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -166,6 +166,7 @@ ncnn_add_layer(Erf)
 ncnn_add_layer(Diag)
 ncnn_add_layer(CELU)
 ncnn_add_layer(Shrink)
+ncnn_add_layer(RMSNorm)
 
 if(NCNN_VULKAN)
     ncnn_add_shader(${CMAKE_CURRENT_SOURCE_DIR}/convert_ycbcr.comp)
diff --git a/src/layer/rmsnorm.cpp b/src/layer/rmsnorm.cpp
new file mode 100644
index 00000000000..77c74c6bccb
--- /dev/null
+++ b/src/layer/rmsnorm.cpp
@@ -0,0 +1,200 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
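+
+// A minimal scalar sketch of what forward_inplace computes for one
+// normalized group x[0..n-1] (gamma applies only when affine != 0):
+//
+//   rms  = sqrt(sum(x[i] * x[i]) / n + eps)
+//   y[i] = x[i] / rms * gamma[i]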
+ +#include "rmsnorm.h" + +namespace ncnn { + +RMSNorm::RMSNorm() +{ + one_blob_only = true; + support_inplace = true; +} + +int RMSNorm::load_param(const ParamDict& pd) +{ + affine_size = pd.get(0, 0); + eps = pd.get(1, 0.001f); + affine = pd.get(2, 1); + + return 0; +} + +int RMSNorm::load_model(const ModelBin& mb) +{ + if (affine == 0) + return 0; + + gamma_data = mb.load(affine_size, 1); + if (gamma_data.empty()) + return -100; + + return 0; +} + +int RMSNorm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + // x = x / sqrt(rms + eps) * gamma + + int dims = bottom_top_blob.dims; + + if (dims == 1) + { + int w = bottom_top_blob.w; + // assert affine_size == w + + float* ptr = bottom_top_blob; + + float sqsum = 0.f; + for (int i = 0; i < w; i++) + { + sqsum += ptr[i] * ptr[i]; + } + float rms = sqrtf(sqsum / w + eps); + + float a = 1.f / rms; + + if (affine) + { + for (int i = 0; i < w; i++) + { + ptr[i] = (ptr[i] * a) * gamma_data[i]; + } + } + else + { + for (int i = 0; i < w; i++) + { + ptr[i] = ptr[i] * a; + } + } + } + + if (dims == 2) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + // assert affine_size == w + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.row(i); + + float sqsum = 0.f; + for (int j = 0; j < w; j++) + { + sqsum += ptr[j] * ptr[j]; + } + float rms = sqrtf(sqsum / w + eps); + + float a = 1.f / rms; + + if (affine) + { + for (int j = 0; j < w; j++) + { + ptr[j] = (ptr[j] * a) * gamma_data[j]; + } + } + else + { + for (int j = 0; j < w; j++) + { + ptr[j] = ptr[j] * a; + } + } + } + } + + if (dims == 3) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int channels = bottom_top_blob.c; + int size = w * h; + + if (affine_size == w) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.channel(q).row(i); + + float sqsum = 0.f; + for (int j = 0; j < w; j++) + { + sqsum += ptr[j] * ptr[j]; + } + float rms = sqrtf(sqsum / w + eps); + + float a = 1.f / rms; + + if (affine) + { + for (int j = 0; j < w; j++) + { + ptr[j] = (ptr[j] * a) * gamma_data[j]; + } + } + else + { + for (int j = 0; j < w; j++) + { + ptr[j] = ptr[j] * a; + } + } + } + } + } + else // if (affine_size == size) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + float sqsum = 0.f; + for (int i = 0; i < size; i++) + { + sqsum += ptr[i] * ptr[i]; + } + float rms = sqrtf(sqsum / size + eps); + + float a = 1.f / rms; + + if (affine) + { + for (int i = 0; i < size; i++) + { + ptr[i] = (ptr[i] * a) * gamma_data[i]; + } + } + else + { + for (int i = 0; i < size; i++) + { + ptr[i] = ptr[i] * a; + } + } + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/rmsnorm.h b/src/layer/rmsnorm.h new file mode 100644 index 00000000000..4a09f2548bd --- /dev/null +++ b/src/layer/rmsnorm.h @@ -0,0 +1,43 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_RMSNORM_H +#define LAYER_RMSNORM_H + +#include "layer.h" + +namespace ncnn { + +class RMSNorm : public Layer +{ +public: + RMSNorm(); + + virtual int load_param(const ParamDict& pd); + + virtual int load_model(const ModelBin& mb); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; + +public: + int affine_size; + float eps; + int affine; + + Mat gamma_data; +}; + +} // namespace ncnn + +#endif // LAYER_RMSNORM_H diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index d30229b870c..6c8939fc7c7 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -141,6 +141,7 @@ ncnn_add_layer_test(ReLU) ncnn_add_layer_test(Reorg) ncnn_add_layer_test(Requantize) ncnn_add_layer_test(Reshape) +ncnn_add_layer_test(RMSNorm) ncnn_add_layer_test(RNN) ncnn_add_layer_test(ROIPooling) ncnn_add_layer_test(ROIAlign) diff --git a/tests/test_rmsnorm.cpp b/tests/test_rmsnorm.cpp new file mode 100644 index 00000000000..2d88c162d8b --- /dev/null +++ b/tests/test_rmsnorm.cpp @@ -0,0 +1,121 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
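+
+// The ParamDict ids below mirror RMSNorm::load_param: 0 = affine_size,
+// 1 = eps, 2 = affine. weights[0] always carries a gamma blob here;
+// when affine == 0 the layer simply skips loading it.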
+ +#include "testutil.h" + +static int test_rmsnorm(const ncnn::Mat& a, int affine_size, float eps, int affine) +{ + ncnn::ParamDict pd; + pd.set(0, affine_size); + pd.set(1, eps); + pd.set(2, affine); + + std::vector weights(1); + weights[0] = RandomMat(affine_size); + + int ret = test_layer("RMSNorm", pd, weights, a); + if (ret != 0) + { + fprintf(stderr, "test_rmsnorm failed a.dims=%d a=(%d %d %d) affine_size=%d eps=%f affine=%d\n", a.dims, a.w, a.h, a.c, affine_size, eps, affine); + } + + return ret; +} + +static int test_rmsnorm_0() +{ + return 0 + || test_rmsnorm(RandomMat(6, 4, 2), 6, 0.01f, 0) + || test_rmsnorm(RandomMat(4, 5, 6), 4, 0.01f, 0) + || test_rmsnorm(RandomMat(3, 3, 8), 3, 0.002f, 0) + || test_rmsnorm(RandomMat(5, 6, 12), 5, 0.02f, 0) + || test_rmsnorm(RandomMat(4, 7, 16), 4, 0.02f, 0) + || test_rmsnorm(RandomMat(6, 7, 24), 6, 0.001f, 0) + || test_rmsnorm(RandomMat(5, 8, 32), 5, 0.001f, 0) + || test_rmsnorm(RandomMat(6, 4, 2), 6, 0.01f, 1) + || test_rmsnorm(RandomMat(4, 5, 6), 4, 0.01f, 1) + || test_rmsnorm(RandomMat(3, 3, 8), 3, 0.002f, 1) + || test_rmsnorm(RandomMat(5, 6, 12), 5, 0.02f, 1) + || test_rmsnorm(RandomMat(4, 7, 16), 4, 0.02f, 1) + || test_rmsnorm(RandomMat(6, 7, 24), 6, 0.001f, 1) + || test_rmsnorm(RandomMat(5, 8, 32), 5, 0.001f, 1); +} + +static int test_rmsnorm_1() +{ + return 0 + || test_rmsnorm(RandomMat(6, 4, 2), 24, 0.01f, 0) + || test_rmsnorm(RandomMat(4, 5, 6), 20, 0.01f, 0) + || test_rmsnorm(RandomMat(3, 3, 8), 9, 0.002f, 0) + || test_rmsnorm(RandomMat(5, 6, 12), 30, 0.02f, 0) + || test_rmsnorm(RandomMat(4, 7, 16), 28, 0.02f, 0) + || test_rmsnorm(RandomMat(6, 7, 24), 42, 0.001f, 0) + || test_rmsnorm(RandomMat(5, 8, 32), 40, 0.001f, 0) + || test_rmsnorm(RandomMat(6, 4, 2), 24, 0.01f, 1) + || test_rmsnorm(RandomMat(4, 5, 6), 20, 0.01f, 1) + || test_rmsnorm(RandomMat(3, 3, 8), 9, 0.002f, 1) + || test_rmsnorm(RandomMat(5, 6, 12), 30, 0.02f, 1) + || test_rmsnorm(RandomMat(4, 7, 16), 28, 0.02f, 1) + || test_rmsnorm(RandomMat(6, 7, 24), 42, 0.001f, 1) + || test_rmsnorm(RandomMat(5, 8, 32), 40, 0.001f, 1); +} + +static int test_rmsnorm_2() +{ + return 0 + || test_rmsnorm(RandomMat(4, 2), 4, 0.01f, 0) + || test_rmsnorm(RandomMat(5, 6), 5, 0.01f, 0) + || test_rmsnorm(RandomMat(3, 8), 3, 0.002f, 0) + || test_rmsnorm(RandomMat(6, 12), 6, 0.02f, 0) + || test_rmsnorm(RandomMat(4, 16), 4, 0.02f, 0) + || test_rmsnorm(RandomMat(7, 24), 7, 0.001f, 0) + || test_rmsnorm(RandomMat(8, 32), 8, 0.001f, 0) + || test_rmsnorm(RandomMat(4, 2), 4, 0.01f, 1) + || test_rmsnorm(RandomMat(5, 6), 5, 0.01f, 1) + || test_rmsnorm(RandomMat(3, 8), 3, 0.002f, 1) + || test_rmsnorm(RandomMat(6, 12), 6, 0.02f, 1) + || test_rmsnorm(RandomMat(4, 16), 4, 0.02f, 1) + || test_rmsnorm(RandomMat(7, 24), 7, 0.001f, 1) + || test_rmsnorm(RandomMat(8, 32), 8, 0.001f, 1); +} + +static int test_rmsnorm_3() +{ + return 0 + || test_rmsnorm(RandomMat(2), 2, 0.01f, 0) + || test_rmsnorm(RandomMat(6), 6, 0.01f, 0) + || test_rmsnorm(RandomMat(8), 8, 0.002f, 0) + || test_rmsnorm(RandomMat(12), 12, 0.02f, 0) + || test_rmsnorm(RandomMat(16), 16, 0.02f, 0) + || test_rmsnorm(RandomMat(24), 24, 0.001f, 0) + || test_rmsnorm(RandomMat(32), 32, 0.001f, 0) + || test_rmsnorm(RandomMat(2), 2, 0.01f, 1) + || test_rmsnorm(RandomMat(6), 6, 0.01f, 1) + || test_rmsnorm(RandomMat(8), 8, 0.002f, 1) + || test_rmsnorm(RandomMat(12), 12, 0.02f, 1) + || test_rmsnorm(RandomMat(16), 16, 0.02f, 1) + || test_rmsnorm(RandomMat(24), 24, 0.001f, 1) + || test_rmsnorm(RandomMat(32), 32, 0.001f, 1); +} + +int main() +{ + SRAND(7767517); + + 
+    return 0
+           || test_rmsnorm_0()
+           || test_rmsnorm_1()
+           || test_rmsnorm_2()
+           || test_rmsnorm_3();
+}
diff --git a/tools/pnnx/src/CMakeLists.txt b/tools/pnnx/src/CMakeLists.txt
index 9834fabe069..2c814bd486c 100644
--- a/tools/pnnx/src/CMakeLists.txt
+++ b/tools/pnnx/src/CMakeLists.txt
@@ -475,6 +475,7 @@ set(pnnx_pass_ncnn_SRCS
     pass_ncnn/F_prelu.cpp
     pass_ncnn/F_relu.cpp
     pass_ncnn/F_relu6.cpp
+    pass_ncnn/F_rms_norm.cpp
     pass_ncnn/F_scaled_dot_product_attention.cpp
     pass_ncnn/F_selu.cpp
     pass_ncnn/F_sigmoid.cpp
@@ -541,6 +542,7 @@ set(pnnx_pass_ncnn_SRCS
     pass_ncnn/nn_ReplicationPad1d.cpp
     pass_ncnn/nn_ReplicationPad2d.cpp
     pass_ncnn/nn_ReplicationPad3d.cpp
+    pass_ncnn/nn_RMSNorm.cpp
     pass_ncnn/nn_RNN.cpp
     pass_ncnn/nn_SELU.cpp
     pass_ncnn/nn_Sigmoid.cpp
diff --git a/tools/pnnx/src/pass_level1/nn_RMSNorm.cpp b/tools/pnnx/src/pass_level1/nn_RMSNorm.cpp
index 4433f598935..498f0453c14 100644
--- a/tools/pnnx/src/pass_level1/nn_RMSNorm.cpp
+++ b/tools/pnnx/src/pass_level1/nn_RMSNorm.cpp
@@ -37,7 +37,7 @@ class RMSNorm : public FuseModulePass
         op->params["normalized_shape"] = rmsn->namedInput("normalized_shape");
         op->params["eps"] = rmsn->namedInput("eps");
-        op->params["elementwise_affine"] = mod.hasattr("weight") && mod.hasattr("bias");
+        op->params["elementwise_affine"] = mod.hasattr("weight");
 
         if (mod.hasattr("weight"))
         {
diff --git a/tools/pnnx/src/pass_ncnn/F_rms_norm.cpp b/tools/pnnx/src/pass_ncnn/F_rms_norm.cpp
new file mode 100644
index 00000000000..8230168312c
--- /dev/null
+++ b/tools/pnnx/src/pass_ncnn/F_rms_norm.cpp
@@ -0,0 +1,65 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "pass_ncnn.h"
+
+namespace pnnx {
+
+namespace ncnn {
+
+class F_rms_norm : public GraphRewriterPass
+{
+public:
+    const char* match_pattern_graph() const
+    {
+        return R"PNNXIR(7767517
+3 2
+pnnx.Input input 0 1 input
+F.rms_norm op_0 1 1 input out weight=None normalized_shape=%normalized_shape eps=%eps
+pnnx.Output output 1 0 out
+)PNNXIR";
+    }
+
+    const char* type_str() const
+    {
+        return "RMSNorm";
+    }
+
+    const char* name_str() const
+    {
+        return "rmsn";
+    }
+
+    void write(Operator* op, const std::map<std::string, Parameter>& captured_params) const
+    {
+        const std::vector<int>& normalized_shape = captured_params.at("normalized_shape").ai;
+        int affine_size = normalized_shape[0];
+        for (size_t i = 1; i < normalized_shape.size(); i++)
+        {
+            affine_size *= normalized_shape[i];
+        }
+
+        const float eps = captured_params.at("eps").type == 0 ? 0.f : captured_params.at("eps").f;
0.f : captured_params.at("eps").f; + + op->params["0"] = affine_size; + op->params["1"] = eps; + op->params["2"] = 0; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_rms_norm, 20) + +} // namespace ncnn + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/nn_RMSNorm.cpp b/tools/pnnx/src/pass_ncnn/nn_RMSNorm.cpp new file mode 100644 index 00000000000..7fda637c5ca --- /dev/null +++ b/tools/pnnx/src/pass_ncnn/nn_RMSNorm.cpp @@ -0,0 +1,70 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "pass_ncnn.h" + +namespace pnnx { + +namespace ncnn { + +class nn_RMSNorm : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +nn.RMSNorm op_0 1 1 input out normalized_shape=%normalized_shape eps=%eps elementwise_affine=%elementwise_affine @weight +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "RMSNorm"; + } + + const char* name_str() const + { + return "rmsn"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + const std::vector& normalized_shape = captured_params.at("normalized_shape").ai; + int affine_size = normalized_shape[0]; + for (size_t i = 1; i < normalized_shape.size(); i++) + { + affine_size *= normalized_shape[i]; + } + + const float eps = captured_params.at("eps").type == 0 ? 0.f : captured_params.at("eps").f; + + op->params["0"] = affine_size; + op->params["1"] = eps; + op->params["2"] = captured_params.at("elementwise_affine").b ? 
1 : 0; + + if (captured_params.at("elementwise_affine").b) + { + op->attrs["0"] = captured_attrs.at("op_0.weight"); + } + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_RMSNorm, 20) + +} // namespace ncnn + +} // namespace pnnx diff --git a/tools/pnnx/tests/ncnn/CMakeLists.txt b/tools/pnnx/tests/ncnn/CMakeLists.txt index a60e63eb54b..49cb063f335 100644 --- a/tools/pnnx/tests/ncnn/CMakeLists.txt +++ b/tools/pnnx/tests/ncnn/CMakeLists.txt @@ -53,6 +53,7 @@ pnnx_ncnn_add_test(F_pixel_unshuffle) pnnx_ncnn_add_test(F_prelu) pnnx_ncnn_add_test(F_relu) pnnx_ncnn_add_test(F_relu6) +pnnx_ncnn_add_test(F_rms_norm) pnnx_ncnn_add_test(F_selu) pnnx_ncnn_add_test(F_sigmoid) pnnx_ncnn_add_test(F_silu) @@ -123,6 +124,7 @@ pnnx_ncnn_add_test(nn_ReLU6) pnnx_ncnn_add_test(nn_ReplicationPad1d) pnnx_ncnn_add_test(nn_ReplicationPad2d) pnnx_ncnn_add_test(nn_ReplicationPad3d) +pnnx_ncnn_add_test(nn_RMSNorm) pnnx_ncnn_add_test(nn_RNN) pnnx_ncnn_add_test(nn_SELU) pnnx_ncnn_add_test(nn_Sigmoid) diff --git a/tools/pnnx/tests/ncnn/test_F_layer_norm.py b/tools/pnnx/tests/ncnn/test_F_layer_norm.py index 92244f17910..9d590aa76dd 100644 --- a/tools/pnnx/tests/ncnn/test_F_layer_norm.py +++ b/tools/pnnx/tests/ncnn/test_F_layer_norm.py @@ -37,8 +37,8 @@ def test(): net.eval() torch.manual_seed(0) - x = torch.rand(12, 24) - y = torch.rand(3, 12, 16) + x = torch.rand(1, 12, 24) + y = torch.rand(1, 3, 12, 16) a = net(x, y) @@ -48,7 +48,7 @@ def test(): # torchscript to pnnx import os - os.system("../../src/pnnx test_F_layer_norm.pt inputshape=[12,24],[3,12,16]") + os.system("../../src/pnnx test_F_layer_norm.pt inputshape=[1,12,24],[1,3,12,16]") # ncnn inference import test_F_layer_norm_ncnn diff --git a/tools/pnnx/tests/ncnn/test_F_rms_norm.py b/tools/pnnx/tests/ncnn/test_F_rms_norm.py new file mode 100644 index 00000000000..4e60d9314aa --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_F_rms_norm.py @@ -0,0 +1,68 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
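+
+# Note: the inputs below carry an explicit leading batch dim of 1 because
+# pass_ncnn's solve_batch_index now lists F.rms_norm as a batch-index-0
+# operator; the layer_norm tests in this patch were reshaped the same way.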
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.w3 = nn.Parameter(torch.rand(24)) + self.w4 = nn.Parameter(torch.rand(12, 16)) + + def forward(self, x, y): + x = F.rms_norm(x, (24,), self.w3) + + y = F.rms_norm(y, (16,), None) + z = F.rms_norm(y, (12,16), self.w4, eps=1e-3) + return x, y, z + +def test(): + if version.parse(torch.__version__) < version.parse('2.4'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12, 24) + y = torch.rand(1, 3, 12, 16) + + a = net(x, y) + + # export torchscript + mod = torch.jit.trace(net, (x, y)) + mod.save("test_F_rms_norm.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_F_rms_norm.pt inputshape=[1,12,24],[1,3,12,16]") + + # ncnn inference + import test_F_rms_norm_ncnn + b = test_F_rms_norm_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_nn_LayerNorm.py b/tools/pnnx/tests/ncnn/test_nn_LayerNorm.py index a45444060d0..d409bdfba3a 100644 --- a/tools/pnnx/tests/ncnn/test_nn_LayerNorm.py +++ b/tools/pnnx/tests/ncnn/test_nn_LayerNorm.py @@ -36,8 +36,8 @@ def test(): net.eval() torch.manual_seed(0) - x = torch.rand(24, 64) - y = torch.rand(12, 24, 64) + x = torch.rand(1, 24, 64) + y = torch.rand(1, 12, 24, 64) a = net(x, y) @@ -47,7 +47,7 @@ def test(): # torchscript to pnnx import os - os.system("../../src/pnnx test_nn_LayerNorm.pt inputshape=[24,64],[12,24,64]") + os.system("../../src/pnnx test_nn_LayerNorm.pt inputshape=[1,24,64],[1,12,24,64]") # ncnn inference import test_nn_LayerNorm_ncnn diff --git a/tools/pnnx/tests/ncnn/test_nn_RMSNorm.py b/tools/pnnx/tests/ncnn/test_nn_RMSNorm.py new file mode 100644 index 00000000000..0d5efa211e4 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_nn_RMSNorm.py @@ -0,0 +1,68 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from packaging import version
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+        self.rmsn_0 = nn.RMSNorm(64)
+        self.rmsn_0.weight = nn.Parameter(torch.rand(64))
+        self.rmsn_1 = nn.RMSNorm(normalized_shape=(24,64), eps=1e-2, elementwise_affine=False)
+
+    def forward(self, x, y):
+        x = self.rmsn_0(x)
+        y = self.rmsn_0(y)
+        z = self.rmsn_1(y)
+        return x, y, z
+
+def test():
+    if version.parse(torch.__version__) < version.parse('2.4'):
+        return True
+
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 24, 64)
+    y = torch.rand(1, 12, 24, 64)
+
+    a = net(x, y)
+
+    # export torchscript
+    mod = torch.jit.trace(net, (x, y))
+    mod.save("test_nn_RMSNorm.pt")
+
+    # torchscript to pnnx
+    import os
+    os.system("../../src/pnnx test_nn_RMSNorm.pt inputshape=[1,24,64],[1,12,24,64]")
+
+    # ncnn inference
+    import test_nn_RMSNorm_ncnn
+    b = test_nn_RMSNorm_ncnn.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.allclose(a0, b0, 1e-4, 1e-4):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
From e550419508c28cf3d2b7a918e45b952999d4f0fa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=B5=E5=B0=8F=E5=87=A1?= <2672931+whyb@users.noreply.github.com>
Date: Thu, 15 Aug 2024 16:52:33 +0800
Subject: [PATCH 23/38] Add yolov8 ncnn example (#5506)

---
 examples/CMakeLists.txt |   1 +
 examples/yolov8.cpp     | 410 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 411 insertions(+)
 create mode 100644 examples/yolov8.cpp

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index a7739be27e5..bf3017dbe68 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -69,6 +69,7 @@ if(NCNN_PIXEL)
         ncnn_add_example(yolov4)
         ncnn_add_example(rvm)
         ncnn_add_example(p2pnet)
+        ncnn_add_example(yolov8)
     endif()
 else()
     message(WARNING "OpenCV not found and NCNN_SIMPLEOCV disabled, examples won't be built")
 endif()
diff --git a/examples/yolov8.cpp b/examples/yolov8.cpp
new file mode 100644
index 00000000000..5b3926582c8
--- /dev/null
+++ b/examples/yolov8.cpp
@@ -0,0 +1,410 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Copyright (C) 2024 whyb(https://github.com/whyb). All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+// ReadMe
+// Workflow for converting a yolov8 model to an ncnn model:
+//
+// step 1:
+// If you don't want to train the model yourself, download a pretrained model file from the ultralytics website.
+// original pretrained models: https://docs.ultralytics.com/models/yolov8/#supported-tasks-and-modes
+//
+// step 2:
+// run these commands:
+// conda create --name yolov8 python=3.11
+// conda activate yolov8
+// pip install ultralytics onnx numpy protobuf
+//
+// step 3:
+// save the following source code to a file (export_model_to_ncnn.py):
+// from ultralytics import YOLO
+// detection_models = [
+//     ["./Detection-pt/yolov8n.pt", "./Detection-pt/"],
+//     ["./Detection-pt/yolov8s.pt", "./Detection-pt/"],
+//     ["./Detection-pt/yolov8m.pt", "./Detection-pt/"],
+//     ["./Detection-pt/yolov8l.pt", "./Detection-pt/"],
+//     ["./Detection-pt/yolov8x.pt", "./Detection-pt/"]
+// ]
+// for model_dict in detection_models:
+//     model = YOLO(model_dict[0])  # load an official pretrained model
+//     model.export(format="ncnn", dynamic=True, save_dir=model_dict[1], simplify=True)
+//
+// step 4:
+// run the command: python export_model_to_ncnn.py
+
+#include <algorithm>
+#include <math.h>
+#include <vector>
+#include "layer.h"
+#include "net.h"
+
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#include <float.h>
+#include <stdio.h>
+
+#define MAX_STRIDE 32
+
+struct Object
+{
+    cv::Rect_<float> rect;
+    int label;
+    float prob;
+};
+
+static inline float intersection_area(const Object& a, const Object& b)
+{
+    cv::Rect_<float> inter = a.rect & b.rect;
+    return inter.area();
+}
+
+static void qsort_descent_inplace(std::vector<Object>& objects, int left, int right)
+{
+    int i = left;
+    int j = right;
+    float p = objects[(left + right) / 2].prob;
+
+    while (i <= j)
+    {
+        while (objects[i].prob > p)
+            i++;
+
+        while (objects[j].prob < p)
+            j--;
+
+        if (i <= j)
+        {
+            // swap
+            std::swap(objects[i], objects[j]);
+
+            i++;
+            j--;
+        }
+    }
+
+    #pragma omp parallel sections
+    {
+        #pragma omp section
+        {
+            if (left < j) qsort_descent_inplace(objects, left, j);
+        }
+        #pragma omp section
+        {
+            if (i < right) qsort_descent_inplace(objects, i, right);
+        }
+    }
+}
+
+static void qsort_descent_inplace(std::vector<Object>& objects)
+{
+    if (objects.empty())
+        return;
+
+    qsort_descent_inplace(objects, 0, objects.size() - 1);
+}
+
+static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
+{
+    picked.clear();
+
+    const int n = faceobjects.size();
+
+    std::vector<float> areas(n);
+    for (int i = 0; i < n; i++)
+    {
+        areas[i] = faceobjects[i].rect.area();
+    }
+
+    for (int i = 0; i < n; i++)
+    {
+        const Object& a = faceobjects[i];
+
+        int keep = 1;
+        for (int j = 0; j < (int)picked.size(); j++)
+        {
+            const Object& b = faceobjects[picked[j]];
+
+            if (!agnostic && a.label != b.label)
+                continue;
+
+            // intersection over union
+            float inter_area = intersection_area(a, b);
+            float union_area = areas[i] + areas[picked[j]] - inter_area;
+            // float IoU = inter_area / union_area
+            if (inter_area / union_area > nms_threshold)
+                keep = 0;
+        }
+
+        if (keep)
+            picked.push_back(i);
+    }
+}
+
+static inline float sigmoid(float x)
+{
+    return static_cast<float>(1.f / (1.f + exp(-x)));
+}
+
+static inline float clampf(float d, float min, float max)
+{
+    const float t = d < min ? min : d;
+    return t > max ? max : t;
+}
+
+static void parse_yolov8_detections(
+    float* inputs, float confidence_threshold,
+    int num_channels, int num_anchors, int num_labels,
+    int infer_img_width, int infer_img_height,
+    std::vector<Object>& objects)
+{
+    std::vector<Object> detections;
+    cv::Mat output = cv::Mat((int)num_channels, (int)num_anchors, CV_32F, inputs).t();
+
+    for (int i = 0; i < num_anchors; i++)
+    {
+        auto row_ptr = output.row(i).ptr<float>();
+        auto bboxes_ptr = row_ptr;
+        auto scores_ptr = row_ptr + 4;
+        auto max_s_ptr = std::max_element(scores_ptr, scores_ptr + num_labels);
+        float score = *max_s_ptr;
+        if (score > confidence_threshold)
+        {
+            float x = *bboxes_ptr++;
+            float y = *bboxes_ptr++;
+            float w = *bboxes_ptr++;
+            float h = *bboxes_ptr;
+
+            float x0 = clampf((x - 0.5f * w), 0.f, (float)infer_img_width);
+            float y0 = clampf((y - 0.5f * h), 0.f, (float)infer_img_height);
+            float x1 = clampf((x + 0.5f * w), 0.f, (float)infer_img_width);
+            float y1 = clampf((y + 0.5f * h), 0.f, (float)infer_img_height);
+
+            cv::Rect_<float> bbox;
+            bbox.x = x0;
+            bbox.y = y0;
+            bbox.width = x1 - x0;
+            bbox.height = y1 - y0;
+            Object object;
+            object.label = max_s_ptr - scores_ptr;
+            object.prob = score;
+            object.rect = bbox;
+            detections.emplace_back(object);
+        }
+    }
+    objects = detections;
+}
+
+static int detect_yolov8(const cv::Mat& bgr, std::vector<Object>& objects)
+{
+    ncnn::Net yolov8;
+
+    yolov8.opt.use_vulkan_compute = true; // enable this if you want to run detection on GPU hardware
+
+    yolov8.load_param("yolov8n.param");
+    yolov8.load_model("yolov8n.bin");
+
+    const int target_size = 640;
+    const float prob_threshold = 0.25f;
+    const float nms_threshold = 0.45f;
+
+    int img_w = bgr.cols;
+    int img_h = bgr.rows;
+
+    // letterbox pad to multiple of MAX_STRIDE
+    int w = img_w;
+    int h = img_h;
+    float scale = 1.f;
+    if (w > h)
+    {
+        scale = (float)target_size / w;
+        w = target_size;
+        h = h * scale;
+    }
+    else
+    {
+        scale = (float)target_size / h;
+        h = target_size;
+        w = w * scale;
+    }
+
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h);
+
+    int wpad = (target_size + MAX_STRIDE - 1) / MAX_STRIDE * MAX_STRIDE - w;
+    int hpad = (target_size + MAX_STRIDE - 1) / MAX_STRIDE * MAX_STRIDE - h;
+    ncnn::Mat in_pad;
+    ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f);
+
+    const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
+    in_pad.substract_mean_normalize(0, norm_vals);
+
+    ncnn::Extractor ex = yolov8.create_extractor();
+
+    ex.input("in0", in_pad);
+
+    std::vector<Object> proposals;
+
+    // stride 32
+    {
+        ncnn::Mat out;
+        ex.extract("out0", out);
+
+        std::vector<Object> objects32;
+        const int num_labels = 80; // the COCO dataset has 80 object classes
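+        // out0 is laid out as [num_channels = 4 + num_labels, num_anchors]:
+        // four box terms (cx, cy, w, h) followed by per-class scores for each
+        // anchor column; parse_yolov8_detections transposes it before decoding.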
+        parse_yolov8_detections(
+            (float*)out.data, prob_threshold,
+            out.h, out.w, num_labels,
+            in_pad.w, in_pad.h,
+            objects32);
+        proposals.insert(proposals.end(), objects32.begin(), objects32.end());
+    }
+
+    // sort all proposals by score from highest to lowest
+    qsort_descent_inplace(proposals);
+
+    // apply nms with nms_threshold
+    std::vector<int> picked;
+    nms_sorted_bboxes(proposals, picked, nms_threshold);
+
+    int count = picked.size();
+
+    objects.resize(count);
+    for (int i = 0; i < count; i++)
+    {
+        objects[i] = proposals[picked[i]];
+
+        // adjust offset to original unpadded
+        float x0 = (objects[i].rect.x - (wpad / 2)) / scale;
+        float y0 = (objects[i].rect.y - (hpad / 2)) / scale;
+        float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale;
+        float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale;
+
+        // clip
+        x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
+        y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
+        x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
+        y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);
+
+        objects[i].rect.x = x0;
+        objects[i].rect.y = y0;
+        objects[i].rect.width = x1 - x0;
+        objects[i].rect.height = y1 - y0;
+    }
+
+    return 0;
+}
+
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
+{
+    static const char* class_names[] = {
+        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
+        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
+        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
+        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
+        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
+        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
+        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
+        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
+        "hair drier", "toothbrush"
+    };
+
+    static const unsigned char colors[19][3] = {
+        {54, 67, 244},
+        {99, 30, 233},
+        {176, 39, 156},
+        {183, 58, 103},
+        {181, 81, 63},
+        {243, 150, 33},
+        {244, 169, 3},
+        {212, 188, 0},
+        {136, 150, 0},
+        {80, 175, 76},
+        {74, 195, 139},
+        {57, 220, 205},
+        {59, 235, 255},
+        {7, 193, 255},
+        {0, 152, 255},
+        {34, 87, 255},
+        {72, 85, 121},
+        {158, 158, 158},
+        {139, 125, 96}
+    };
+
+    int color_index = 0;
+
+    cv::Mat image = bgr.clone();
+
+    for (size_t i = 0; i < objects.size(); i++)
+    {
+        const Object& obj = objects[i];
+
+        const unsigned char* color = colors[color_index % 19];
+        color_index++;
+
+        cv::Scalar cc(color[0], color[1], color[2]);
+
+        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
+                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
+
+        cv::rectangle(image, obj.rect, cc, 2);
+
+        char text[256];
+        sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100);
+
+        int baseLine = 0;
+        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+
+        int x = obj.rect.x;
+        int y = obj.rect.y - label_size.height - baseLine;
+        if (y < 0)
+            y = 0;
+        if (x + label_size.width > image.cols)
+            x = image.cols - label_size.width;
+
+        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
+                      cc, -1);
+
+        cv::putText(image, text, cv::Point(x, y + label_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(255, 255, 255));
+    }
+
+    cv::imshow("image", image);
+    cv::waitKey(0);
+}
+
+int main(int argc, char** argv)
+{
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* imagepath = argv[1];
+
+    cv::Mat m = cv::imread(imagepath, 1);
+    if (m.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", imagepath);
+        return -1;
+    }
+
+    std::vector<Object> objects;
+    detect_yolov8(m, objects);
+
+    draw_objects(m, objects);
+
+    return 0;
+}
From 07196eee2e48738ac58e5b1a551649d578e3e783 Mon Sep 17 00:00:00 2001
From: Kelun Lei
Date: Fri, 16 Aug 2024 10:50:26 +0800
Subject: [PATCH 24/38] benchmark: add Kunpeng 920 7260 (#5606)

---
 benchmark/README.md | 292 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 292 insertions(+)

diff --git a/benchmark/README.md b/benchmark/README.md
index 1927acf81cd..df9e55de4a8 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -5911,6 +5911,298 @@ cooling_down = 0
 FastestDet min = 5.13 max = 5.47 avg = 5.30
 ```
+### HUAWEI Kunpeng 920 7260 (x64 cores)
+test on Ubuntu 20.04 (gcc 9.4.0)
+```
+root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 1 0 -1 0
+loop_count = 300
+num_threads = 1
+powersave = 0
+gpu_device = -1
+cooling_down = 0
+ squeezenet min = 11.64 max = 12.11 avg = 11.71
+ squeezenet_int8 min = 12.22 max = 13.22 avg = 12.37
+ mobilenet min = 20.00 max = 20.79 avg = 20.08
+ mobilenet_int8 min = 17.44 max = 19.09 avg = 17.64
+ mobilenet_v2 min = 13.29 max = 14.25 avg = 13.39
+ mobilenet_v3 min = 11.06 max = 11.84 avg = 11.11
+ shufflenet min = 7.56 max = 7.74 avg = 7.59
+ shufflenet_v2 min = 7.84 max = 8.37 avg = 7.88
+ mnasnet min = 13.07 max = 13.78 avg = 13.14
+ proxylessnasnet min = 15.71 max = 16.31 avg = 15.77
+ efficientnet_b0 min = 34.79 max = 35.98 avg = 34.92
+ efficientnetv2_b0 min = 35.28 max = 36.36 avg = 35.41
+ regnety_400m min = 17.06 max = 17.74 avg = 17.16
+ blazeface min = 2.99 max = 3.04 avg = 3.01
+ googlenet min = 50.76 max = 51.74 avg = 51.00
+ googlenet_int8 min = 50.31 max = 52.27 avg = 50.65
+ resnet18 min = 34.97 max = 37.17 avg = 35.82
+ resnet18_int8 min = 40.47 max = 42.03 avg = 40.78
+ alexnet min = 39.19 max = 39.80 avg = 39.32
+ vgg16 min = 176.62 max = 181.29 avg = 177.07
+ vgg16_int8 min = 352.35 max = 358.38 avg = 355.15
+ resnet50 min = 96.76 max = 98.63 avg = 97.09
+ resnet50_int8 min = 90.00 max = 92.74 avg = 90.81
+ squeezenet_ssd min = 33.23 max = 33.99 avg = 33.39
+ squeezenet_ssd_int8 min = 38.50 max = 41.53 avg = 39.28
+ mobilenet_ssd min = 42.49 max = 44.78 avg = 42.72
+ mobilenet_ssd_int8 min = 37.06 max = 39.97 avg = 37.57
+ mobilenet_yolo min = 96.34 max = 98.91 avg = 96.73
+ mobilenetv2_yolov3 min = 50.88 max = 52.97 avg = 51.15
+ yolov4-tiny min = 65.56 max = 67.13 avg = 65.80
+ nanodet_m min = 19.94 max = 20.82 avg = 20.04
+ yolo-fastest-1.1 min = 7.66 max = 7.81 avg = 7.71
+ yolo-fastestv2 min = 6.82 max = 7.23 avg = 6.87
+ vision_transformer min = 1535.03 max = 1552.84 avg = 1543.73
+ FastestDet min = 7.17 max = 7.50 avg = 7.21
+root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 2 0 -1 0
+loop_count = 300
+num_threads = 2
+powersave = 0
+gpu_device = -1
+cooling_down = 0
+ squeezenet min = 6.35 max = 9.15 avg = 7.33
+ squeezenet_int8 min = 8.06 max = 8.60 avg = 8.14
+ mobilenet min = 10.30 max = 11.86 avg = 11.48
+ mobilenet_int8 min = 8.93 max =
11.87 avg = 10.47 + mobilenet_v2 min = 9.05 max = 11.50 avg = 9.19 + mobilenet_v3 min = 6.32 max = 6.42 avg = 6.36 + shufflenet min = 6.73 max = 8.55 avg = 6.81 + shufflenet_v2 min = 4.94 max = 6.65 avg = 6.32 + mnasnet min = 7.38 max = 10.77 avg = 8.82 + proxylessnasnet min = 8.57 max = 9.72 avg = 8.63 + efficientnet_b0 min = 18.61 max = 22.53 avg = 20.42 + efficientnetv2_b0 min = 18.75 max = 21.93 avg = 20.79 + regnety_400m min = 11.86 max = 15.09 avg = 14.60 + blazeface min = 1.95 max = 3.37 avg = 2.06 + googlenet min = 28.66 max = 32.24 avg = 28.94 + googlenet_int8 min = 27.64 max = 32.15 avg = 30.84 + resnet18 min = 20.33 max = 20.77 avg = 20.47 + resnet18_int8 min = 22.63 max = 23.72 avg = 22.88 + alexnet min = 20.41 max = 29.37 avg = 27.22 + vgg16 min = 101.72 max = 140.33 avg = 103.29 + vgg16_int8 min = 187.56 max = 211.44 avg = 189.92 + resnet50 min = 51.07 max = 59.25 avg = 58.35 + resnet50_int8 min = 46.50 max = 52.55 avg = 48.93 + squeezenet_ssd min = 22.48 max = 28.59 avg = 22.98 + squeezenet_ssd_int8 min = 25.56 max = 26.82 avg = 25.99 + mobilenet_ssd min = 22.81 max = 26.21 avg = 24.88 + mobilenet_ssd_int8 min = 19.31 max = 25.53 avg = 21.74 + mobilenet_yolo min = 59.58 max = 62.04 avg = 59.99 + mobilenetv2_yolov3 min = 33.26 max = 35.74 avg = 33.51 + yolov4-tiny min = 41.14 max = 45.34 avg = 42.46 + nanodet_m min = 12.10 max = 16.69 avg = 15.02 + yolo-fastest-1.1 min = 5.44 max = 7.78 avg = 7.24 + yolo-fastestv2 min = 5.03 max = 8.08 avg = 6.75 + vision_transformer min = 994.46 max = 1090.68 avg = 1045.50 + FastestDet min = 6.76 max = 6.91 avg = 6.83 +root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 4 0 -1 0 +loop_count = 300 +num_threads = 4 +powersave = 0 +gpu_device = -1 +cooling_down = 0 + squeezenet min = 3.79 max = 6.99 avg = 4.55 + squeezenet_int8 min = 5.13 max = 5.68 avg = 5.20 + mobilenet min = 6.25 max = 6.55 avg = 6.30 + mobilenet_int8 min = 5.96 max = 6.10 avg = 6.03 + mobilenet_v2 min = 5.34 max = 7.15 avg = 5.62 + mobilenet_v3 min = 4.05 max = 5.74 avg = 5.01 + shufflenet min = 3.69 max = 5.81 avg = 5.15 + shufflenet_v2 min = 4.31 max = 6.02 avg = 4.56 + mnasnet min = 4.48 max = 6.05 avg = 5.54 + proxylessnasnet min = 5.05 max = 8.08 avg = 6.03 + efficientnet_b0 min = 10.17 max = 12.21 avg = 11.58 + efficientnetv2_b0 min = 10.86 max = 15.78 avg = 12.70 + regnety_400m min = 9.24 max = 14.13 avg = 11.98 + blazeface min = 1.89 max = 1.97 avg = 1.93 + googlenet min = 15.19 max = 20.31 avg = 16.90 + googlenet_int8 min = 17.97 max = 19.40 avg = 18.11 + resnet18 min = 11.18 max = 11.48 avg = 11.29 + resnet18_int8 min = 12.26 max = 12.78 avg = 12.44 + alexnet min = 14.43 max = 16.94 avg = 14.68 + vgg16 min = 62.40 max = 78.42 avg = 64.96 + vgg16_int8 min = 101.52 max = 109.42 avg = 104.46 + resnet50 min = 29.19 max = 39.69 avg = 32.99 + resnet50_int8 min = 26.94 max = 28.82 avg = 27.16 + squeezenet_ssd min = 12.90 max = 16.52 avg = 15.20 + squeezenet_ssd_int8 min = 15.58 max = 18.40 avg = 16.28 + mobilenet_ssd min = 13.68 max = 14.45 avg = 13.87 + mobilenet_ssd_int8 min = 12.20 max = 14.58 avg = 12.84 + mobilenet_yolo min = 34.85 max = 36.54 avg = 35.05 + mobilenetv2_yolov3 min = 18.61 max = 20.93 avg = 19.92 + yolov4-tiny min = 26.09 max = 32.32 avg = 28.03 + nanodet_m min = 7.85 max = 12.48 avg = 11.00 + yolo-fastest-1.1 min = 6.19 max = 6.49 avg = 6.31 + yolo-fastestv2 min = 3.66 max = 6.83 avg = 5.11 + vision_transformer min = 605.95 max = 624.99 avg = 609.79 + FastestDet min = 4.32 max = 5.41 avg = 5.17 
+root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 8 0 -1 0 +loop_count = 300 +num_threads = 8 +powersave = 0 +gpu_device = -1 +cooling_down = 0 + squeezenet min = 2.72 max = 3.74 avg = 3.05 + squeezenet_int8 min = 3.80 max = 4.71 avg = 4.03 + mobilenet min = 3.94 max = 5.15 avg = 4.00 + mobilenet_int8 min = 3.73 max = 3.87 avg = 3.80 + mobilenet_v2 min = 4.51 max = 6.57 avg = 4.68 + mobilenet_v3 min = 4.12 max = 4.38 avg = 4.28 + shufflenet min = 4.60 max = 6.27 avg = 4.88 + shufflenet_v2 min = 4.07 max = 4.20 avg = 4.11 + mnasnet min = 4.26 max = 4.51 avg = 4.36 + proxylessnasnet min = 4.71 max = 7.40 avg = 4.80 + efficientnet_b0 min = 8.49 max = 8.74 avg = 8.56 + efficientnetv2_b0 min = 9.34 max = 9.68 avg = 9.41 + regnety_400m min = 8.00 max = 12.85 avg = 10.64 + blazeface min = 1.76 max = 1.84 avg = 1.80 + googlenet min = 10.89 max = 11.33 avg = 10.98 + googlenet_int8 min = 11.66 max = 14.07 avg = 11.83 + resnet18 min = 6.48 max = 6.61 avg = 6.54 + resnet18_int8 min = 7.30 max = 7.79 avg = 7.51 + alexnet min = 8.33 max = 8.95 avg = 8.62 + vgg16 min = 29.94 max = 47.54 avg = 31.95 + vgg16_int8 min = 54.67 max = 60.76 avg = 56.03 + resnet50 min = 16.13 max = 20.79 avg = 20.03 + resnet50_int8 min = 15.64 max = 20.13 avg = 16.11 + squeezenet_ssd min = 11.58 max = 12.02 avg = 11.77 + squeezenet_ssd_int8 min = 11.14 max = 13.72 avg = 12.10 + mobilenet_ssd min = 8.27 max = 10.77 avg = 8.76 + mobilenet_ssd_int8 min = 8.13 max = 9.09 avg = 8.29 + mobilenet_yolo min = 23.90 max = 24.69 avg = 24.17 + mobilenetv2_yolov3 min = 14.83 max = 15.72 avg = 15.19 + yolov4-tiny min = 19.78 max = 23.66 avg = 20.05 + nanodet_m min = 8.92 max = 10.76 avg = 9.09 + yolo-fastest-1.1 min = 5.49 max = 5.77 avg = 5.63 + yolo-fastestv2 min = 5.04 max = 5.21 avg = 5.10 + vision_transformer min = 318.42 max = 379.40 avg = 363.66 + FastestDet min = 4.18 max = 4.54 avg = 4.38 +root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 16 0 -1 0 +loop_count = 300 +num_threads = 16 +powersave = 0 +gpu_device = -1 +cooling_down = 0 + squeezenet min = 2.70 max = 3.14 avg = 2.81 + squeezenet_int8 min = 3.21 max = 4.22 avg = 3.39 + mobilenet min = 3.13 max = 3.26 avg = 3.20 + mobilenet_int8 min = 3.17 max = 5.05 avg = 3.30 + mobilenet_v2 min = 4.31 max = 6.24 avg = 4.62 + mobilenet_v3 min = 3.57 max = 3.77 avg = 3.68 + shufflenet min = 4.70 max = 6.45 avg = 4.80 + shufflenet_v2 min = 3.73 max = 4.27 avg = 3.87 + mnasnet min = 3.67 max = 3.87 avg = 3.75 + proxylessnasnet min = 4.28 max = 4.81 avg = 4.35 + efficientnet_b0 min = 7.31 max = 7.77 avg = 7.53 + efficientnetv2_b0 min = 9.87 max = 12.33 avg = 10.07 + regnety_400m min = 17.95 max = 18.53 avg = 18.26 + blazeface min = 2.26 max = 2.40 avg = 2.33 + googlenet min = 9.51 max = 9.99 avg = 9.68 + googlenet_int8 min = 10.98 max = 11.36 avg = 11.18 + resnet18 min = 5.59 max = 6.08 avg = 5.71 + resnet18_int8 min = 6.55 max = 7.28 avg = 6.77 + alexnet min = 6.26 max = 6.50 avg = 6.36 + vgg16 min = 23.98 max = 27.37 avg = 24.89 + vgg16_int8 min = 38.07 max = 39.66 avg = 39.02 + resnet50 min = 12.81 max = 14.19 avg = 13.76 + resnet50_int8 min = 12.42 max = 12.84 avg = 12.55 + squeezenet_ssd min = 10.80 max = 11.49 avg = 11.12 + squeezenet_ssd_int8 min = 11.57 max = 12.21 avg = 11.74 + mobilenet_ssd min = 7.46 max = 8.08 avg = 7.84 + mobilenet_ssd_int8 min = 7.47 max = 8.07 avg = 7.63 + mobilenet_yolo min = 21.70 max = 23.43 avg = 21.92 + mobilenetv2_yolov3 min = 12.55 max = 14.56 avg = 12.90 + yolov4-tiny min = 17.68 
max = 19.85 avg = 18.18 + nanodet_m min = 8.35 max = 8.70 avg = 8.45 + yolo-fastest-1.1 min = 5.70 max = 7.11 avg = 6.05 + yolo-fastestv2 min = 4.85 max = 5.70 avg = 5.37 + vision_transformer min = 214.36 max = 259.56 avg = 245.47 + FastestDet min = 5.01 max = 5.42 avg = 5.17 +root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 32 0 -1 0 +loop_count = 300 +num_threads = 32 +powersave = 0 +gpu_device = -1 +cooling_down = 0 + squeezenet min = 2.30 max = 2.94 avg = 2.46 + squeezenet_int8 min = 3.08 max = 4.88 avg = 4.03 + mobilenet min = 2.49 max = 2.76 avg = 2.53 + mobilenet_int8 min = 2.86 max = 3.73 avg = 2.95 + mobilenet_v2 min = 4.51 max = 5.20 avg = 4.74 + mobilenet_v3 min = 5.11 max = 6.91 avg = 6.10 + shufflenet min = 5.57 max = 6.51 avg = 5.78 + shufflenet_v2 min = 4.37 max = 4.66 avg = 4.48 + mnasnet min = 3.72 max = 4.08 avg = 3.90 + proxylessnasnet min = 4.19 max = 6.18 avg = 4.79 + efficientnet_b0 min = 6.80 max = 7.22 avg = 6.89 + efficientnetv2_b0 min = 13.98 max = 17.55 avg = 15.06 + regnety_400m min = 16.10 max = 16.72 avg = 16.26 + blazeface min = 2.12 max = 2.53 avg = 2.17 + googlenet min = 8.63 max = 9.89 avg = 8.77 + googlenet_int8 min = 9.90 max = 11.09 avg = 10.08 + resnet18 min = 6.54 max = 6.99 avg = 6.73 + resnet18_int8 min = 8.34 max = 9.00 avg = 8.67 + alexnet min = 6.64 max = 7.15 avg = 6.93 + vgg16 min = 22.79 max = 23.91 avg = 23.50 + vgg16_int8 min = 32.37 max = 37.51 avg = 33.13 + resnet50 min = 11.19 max = 16.40 avg = 11.47 + resnet50_int8 min = 11.92 max = 12.55 avg = 12.13 + squeezenet_ssd min = 10.75 max = 12.28 avg = 11.12 + squeezenet_ssd_int8 min = 11.31 max = 12.29 avg = 11.57 + mobilenet_ssd min = 10.25 max = 11.26 avg = 10.79 + mobilenet_ssd_int8 min = 11.39 max = 16.99 avg = 11.98 + mobilenet_yolo min = 52.11 max = 60.46 avg = 53.84 + mobilenetv2_yolov3 min = 12.07 max = 12.47 avg = 12.20 + yolov4-tiny min = 17.48 max = 17.79 avg = 17.58 + nanodet_m min = 13.06 max = 14.71 avg = 13.64 + yolo-fastest-1.1 min = 5.70 max = 5.89 avg = 5.79 + yolo-fastestv2 min = 8.89 max = 9.99 avg = 9.21 + vision_transformer min = 158.92 max = 187.40 avg = 168.21 + FastestDet min = 8.70 max = 9.43 avg = 9.00 +root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 64 0 -1 0 +loop_count = 300 +num_threads = 64 +powersave = 0 +gpu_device = -1 +cooling_down = 0 + squeezenet min = 6.85 max = 78.56 avg = 7.81 + squeezenet_int8 min = 8.06 max = 88.91 avg = 9.23 + mobilenet min = 3.02 max = 86.86 avg = 5.89 + mobilenet_int8 min = 3.58 max = 4.55 avg = 3.68 + mobilenet_v2 min = 5.05 max = 150.06 avg = 13.04 + mobilenet_v3 min = 4.85 max = 125.22 avg = 8.34 + shufflenet min = 17.80 max = 220.55 avg = 21.01 + shufflenet_v2 min = 11.23 max = 381.95 avg = 13.71 + mnasnet min = 9.83 max = 128.42 avg = 11.10 + proxylessnasnet min = 10.53 max = 68.52 avg = 12.03 + efficientnet_b0 min = 16.78 max = 968.87 avg = 23.94 + efficientnetv2_b0 min = 26.23 max = 551.18 avg = 31.34 + regnety_400m min = 70.14 max = 407.92 avg = 78.30 + blazeface min = 7.27 max = 191.44 avg = 9.37 + googlenet min = 16.69 max = 820.58 avg = 25.06 + googlenet_int8 min = 20.58 max = 849.09 avg = 29.87 + resnet18 min = 8.67 max = 349.00 avg = 11.33 + resnet18_int8 min = 10.40 max = 128.98 avg = 11.45 + alexnet min = 6.15 max = 196.01 avg = 10.24 + vgg16 min = 21.11 max = 288.66 avg = 29.37 + vgg16_int8 min = 30.72 max = 251.95 avg = 37.68 + resnet50 min = 19.10 max = 114.08 avg = 22.00 + resnet50_int8 min = 18.99 max = 436.89 avg = 24.36 + 
squeezenet_ssd min = 22.22 max = 510.52 avg = 28.76 + squeezenet_ssd_int8 min = 23.42 max = 614.70 avg = 30.82 + mobilenet_ssd min = 7.62 max = 202.66 avg = 14.59 + mobilenet_ssd_int8 min = 7.89 max = 109.82 avg = 8.80 + mobilenet_yolo min = 31.43 max = 742.10 avg = 45.52 + mobilenetv2_yolov3 min = 18.31 max = 273.05 avg = 20.78 + yolov4-tiny min = 21.03 max = 400.05 avg = 33.64 + nanodet_m min = 19.94 max = 114.18 avg = 21.89 + yolo-fastest-1.1 min = 7.20 max = 174.60 avg = 9.13 + yolo-fastestv2 min = 7.50 max = 170.55 avg = 9.01 + vision_transformer min = 126.90 max = 335.71 avg = 157.38 + FastestDet min = 6.59 max = 19.77 avg = 6.77 +``` + ### Intel Atom x5-Z8350 ``` nihui@nihui-ROCK-Pi-X:~/ncnn/build/benchmark$ ./benchncnn 20 4 0 -1 1 From 70310e951e1d863bc860141dab9506c7de2d118c Mon Sep 17 00:00:00 2001 From: nihui Date: Fri, 16 Aug 2024 16:20:10 +0800 Subject: [PATCH 25/38] fix out of range read in convolution im2col aarch64 (#5631) --- src/layer/arm/convolution_im2col_gemm.h | 2 +- src/layer/arm/convolution_im2col_gemm_bf16s.h | 36 +++++++++---------- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/src/layer/arm/convolution_im2col_gemm.h b/src/layer/arm/convolution_im2col_gemm.h index af501efa2f8..25a3e94d781 100644 --- a/src/layer/arm/convolution_im2col_gemm.h +++ b/src/layer/arm/convolution_im2col_gemm.h @@ -3377,7 +3377,7 @@ static void convolution_gemm_transB_packed_tile(const Mat& AT_tile, const Mat& B "cbz %w10, 0f \n" "ld1 {v30.4s, v31.4s}, [%0] \n" - "b 3f \n" + "b 2f \n" "0: \n" // if pC diff --git a/src/layer/arm/convolution_im2col_gemm_bf16s.h b/src/layer/arm/convolution_im2col_gemm_bf16s.h index 82319d05850..95819e2d679 100644 --- a/src/layer/arm/convolution_im2col_gemm_bf16s.h +++ b/src/layer/arm/convolution_im2col_gemm_bf16s.h @@ -3110,7 +3110,7 @@ static void convolution_gemm_transB_packed_tile_bf16s(const Mat& AT_tile, const "cbz %w10, 0f \n" "ld1 {v30.4s, v31.4s}, [%0] \n" - "b 3f \n" + "b 2f \n" "0: \n" // if pC @@ -3125,15 +3125,13 @@ static void convolution_gemm_transB_packed_tile_bf16s(const Mat& AT_tile, const "eor v31.16b, v31.16b, v31.16b \n" "2: \n" - - "3: \n" "lsr w4, %w9, #2 \n" // w4 = max_kk >> 2 "cmp w4, #0 \n" - "beq 5f \n" + "beq 4f \n" "eor v28.16b, v28.16b, v28.16b \n" "eor v29.16b, v29.16b, v29.16b \n" - "4: \n" + "3: \n" "prfm pldl1keep, [%2, #64] \n" "ld1 {v0.4h}, [%2], #8 \n" "shll v0.4s, v0.4h, #16 \n" @@ -3156,16 +3154,16 @@ static void convolution_gemm_transB_packed_tile_bf16s(const Mat& AT_tile, const "subs w4, w4, #1 \n" "fmla v30.4s, v10.4s, v0.s[3] \n" "fmla v31.4s, v11.4s, v0.s[3] \n" - "bne 4b \n" + "bne 3b \n" "fadd v30.4s, v30.4s, v28.4s \n" "fadd v31.4s, v31.4s, v29.4s \n" - "5: \n" + "4: \n" "and w4, %w9, #3 \n" // w4 = remain = max_kk & 3 "cmp w4, #0 \n" - "beq 7f \n" + "beq 6f \n" - "6: \n" + "5: \n" "ld1r {v0.4h}, [%2], #2 \n" "shll v0.4s, v0.4h, #16 \n" "ld1 {v3.8h}, [%1], #16 \n" @@ -3174,26 +3172,26 @@ static void convolution_gemm_transB_packed_tile_bf16s(const Mat& AT_tile, const "subs w4, w4, #1 \n" "fmla v30.4s, v4.4s, v0.4s \n" "fmla v31.4s, v5.4s, v0.4s \n" - "bne 6b \n" + "bne 5b \n" - "7: \n" + "6: \n" "shrn v30.4h, v30.4s, #16 \n" "shrn v31.4h, v31.4s, #16 \n" "tst %w11, #255 \n" - "beq 10f \n" + "beq 9f \n" // if out_elempack == 4 "cmp %w12, #4 \n" - "bne 8f \n" + "bne 7f \n" "lsl w4, %w13, #2 \n" "add x4, %3, w4, sxtw 1 \n" "st1 {v30.4h}, [%3], #8 \n" "st1 {v31.4h}, [x4] \n" - "b 9f \n" + "b 8f \n" // if out_elempack == 1 - "8: \n" + "7: \n" "add x4, %3, %w13, sxtw 1 \n" "st1 {v30.h}[0], [%3], #2 \n" 
"st1 {v30.h}[1], [x4] \n" @@ -3210,14 +3208,14 @@ static void convolution_gemm_transB_packed_tile_bf16s(const Mat& AT_tile, const "add x4, x4, %w13, sxtw 1 \n" "st1 {v31.h}[3], [x4] \n" - "9: \n" + "8: \n" "add %0, %0, #32 \n" - "b 11f \n" + "b 10f \n" - "10: \n" + "9: \n" "st1 {v30.4s, v31.4s}, [%0], #32 \n" - "11: \n" + "10: \n" : "=r"(outptr), // %0 "=r"(pA), // %1 From 789d8686c7fc270e5579f2fad680e2aa1af4e3b4 Mon Sep 17 00:00:00 2001 From: nihui Date: Fri, 16 Aug 2024 18:48:39 +0800 Subject: [PATCH 26/38] pnnx functionize do not create shadow op for identity consumers (#5632) --- tools/pnnx/src/pass_level2.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tools/pnnx/src/pass_level2.cpp b/tools/pnnx/src/pass_level2.cpp index bc7e51b8d5d..de44a355366 100644 --- a/tools/pnnx/src/pass_level2.cpp +++ b/tools/pnnx/src/pass_level2.cpp @@ -1166,6 +1166,18 @@ static void functionize(Graph& graph) if (out0->consumers.size() == 1) continue; + bool all_consumers_are_same = true; + for (size_t j = 1; j < out0->consumers.size(); j++) + { + if (out0->consumers[j] != out0->consumers[0]) + { + all_consumers_are_same = false; + break; + } + } + if (all_consumers_are_same) + continue; + for (int j = (int)out0->consumers.size() - 1; j > 0; j--) { Operator* op1 = out0->consumers[j]; From 4de536951ac618ce705bea519bf1b2afc43f21ab Mon Sep 17 00:00:00 2001 From: nihui Date: Sat, 17 Aug 2024 10:39:47 +0800 Subject: [PATCH 27/38] onnx2pnnx do not fold single constant for gemm weight (#5634) --- tools/pnnx/src/pass_onnx.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/pnnx/src/pass_onnx.cpp b/tools/pnnx/src/pass_onnx.cpp index 6318dacba25..87dd27d27cb 100644 --- a/tools/pnnx/src/pass_onnx.cpp +++ b/tools/pnnx/src/pass_onnx.cpp @@ -820,6 +820,8 @@ void pass_onnx(const onnx::ModelProto& model, Graph& pnnx_graph) is_attr_weight = true; if (sim_op_type == "Gather" && j == 0) is_attr_weight = true; + if (sim_op_type == "Gemm" && (j == 1 || j == 2)) + is_attr_weight = true; if (sim_op_type == "GroupNormalization" && (j == 1 || j == 2)) is_attr_weight = true; if (sim_op_type == "GRU" && (j == 1 || j == 2 || j == 3 || j == 5)) From a0c9e7783d221771457e1d71a8452475c2ba51f5 Mon Sep 17 00:00:00 2001 From: Joey Ballentine <34788790+joeyballentine@users.noreply.github.com> Date: Sat, 17 Aug 2024 00:36:17 -0500 Subject: [PATCH 28/38] Add python binding for loading bin from memory (#5164) --- python/src/main.cpp | 21 +++++++++++++++++++++ python/tests/test_net.py | 26 ++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/python/src/main.cpp b/python/src/main.cpp index a7ed0528c6a..e5b1264264c 100644 --- a/python/src/main.cpp +++ b/python/src/main.cpp @@ -34,6 +34,20 @@ using namespace ncnn; namespace py = pybind11; +class DataReaderFromMemoryCopy : public DataReaderFromMemory +{ +public: + explicit DataReaderFromMemoryCopy(const unsigned char*& mem) + : DataReaderFromMemory(mem) + { + } + + virtual size_t reference(size_t size, const void** buf) const + { + return 0; + } +}; + struct LayerFactory { std::string name; @@ -956,6 +970,13 @@ PYBIND11_MODULE(ncnn, m) #endif // NCNN_STRING .def("load_param_bin", (int (Net::*)(const char*)) & Net::load_param_bin, py::arg("protopath")) .def("load_model", (int (Net::*)(const char*)) & Net::load_model, py::arg("modelpath")) + .def( + "load_model_mem", [](Net& net, const char* mem) { + const unsigned char* _mem = (const unsigned char*)mem; + DataReaderFromMemoryCopy dr(_mem); + net.load_model(dr); + }, + py::arg("mem")) #endif // 
NCNN_STDIO .def("clear", &Net::clear) diff --git a/python/tests/test_net.py b/python/tests/test_net.py index 03271aff462..362cc4791fb 100644 --- a/python/tests/test_net.py +++ b/python/tests/test_net.py @@ -42,6 +42,32 @@ def test_net(): assert len(net.blobs()) == 0 and len(net.layers()) == 0 +def test_net_mem(): + modelbin = bytearray(303940) + modelbin[0:4] = 71,107,48,1 + modelbin[180:184] = 71,107,48,1 + + with ncnn.Net() as net: + ret = net.load_param("tests/test.param") + net.load_model_mem(bytes(modelbin)) + assert ret == 0 and len(net.blobs()) == 3 and len(net.layers()) == 3 + + input_names = net.input_names() + output_names = net.output_names() + assert len(input_names) > 0 and len(output_names) > 0 + + in_mat = ncnn.Mat((227, 227, 3)) + + with net.create_extractor() as ex: + ex.input("data", in_mat) + ret, out_mat = ex.extract("output") + + assert ret == 0 and out_mat.dims == 1 and out_mat.w == 1 + + net.clear() + assert len(net.blobs()) == 0 and len(net.layers()) == 0 + + def test_net_vulkan(): if not hasattr(ncnn, "get_gpu_count"): return From 27f64a1382e72d18e38577f3c922323c1a199ce4 Mon Sep 17 00:00:00 2001 From: nihui Date: Mon, 19 Aug 2024 11:17:53 +0800 Subject: [PATCH 29/38] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a9bb1c116fa..08166ab0766 100644 --- a/README.md +++ b/README.md @@ -560,7 +560,7 @@ https://github.com/Tencent/ncnn/releases/latest **[use netron for ncnn model visualization](https://netron.app)** -**[out-of-the-box web model conversion](https://convertmodel.com/#outputFormat=ncnn)** +**[use ncnn with pytorch or onnx](https://github.com/Tencent/ncnn/wiki/use-ncnn-with-pytorch-or-onnx)** [ncnn low-level operation api](https://github.com/Tencent/ncnn/wiki/low-level-operation-api) From a6d3ef5a0bb59fb496c553c3ef54d141642b4fc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=B5=E5=B0=8F=E5=87=A1?= <2672931+whyb@users.noreply.github.com> Date: Tue, 20 Aug 2024 08:23:56 +0800 Subject: [PATCH 30/38] Fixed bug #5637 (#5640) --- examples/yolov8.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/yolov8.cpp b/examples/yolov8.cpp index 5b3926582c8..e166e6c1d17 100644 --- a/examples/yolov8.cpp +++ b/examples/yolov8.cpp @@ -175,10 +175,10 @@ static void parse_yolov8_detections( for (int i = 0; i < num_anchors; i++) { - auto row_ptr = output.row(i).ptr(); - auto bboxes_ptr = row_ptr; - auto scores_ptr = row_ptr + 4; - auto max_s_ptr = std::max_element(scores_ptr, scores_ptr + num_labels); + const float* row_ptr = output.row(i).ptr(); + const float* bboxes_ptr = row_ptr; + const float* scores_ptr = row_ptr + 4; + const float* max_s_ptr = std::max_element(scores_ptr, scores_ptr + num_labels); float score = *max_s_ptr; if (score > confidence_threshold) { @@ -201,7 +201,7 @@ static void parse_yolov8_detections( object.label = max_s_ptr - scores_ptr; object.prob = score; object.rect = bbox; - detections.emplace_back(object); + detections.push_back(object); } } objects = detections; From 25a22e0c0c032b098153fb47c6199a48aa15ea92 Mon Sep 17 00:00:00 2001 From: nihui Date: Tue, 20 Aug 2024 16:59:17 +0800 Subject: [PATCH 31/38] update release download --- README.md | 89 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 50 insertions(+), 39 deletions(-) diff --git a/README.md b/README.md index 08166ab0766..146b04b1a4e 100644 --- a/README.md +++ b/README.md @@ -77,7 +77,7 @@ https://github.com/Tencent/ncnn/releases/latest Source - 
[](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-full-source.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-full-source.zip) @@ -97,8 +97,8 @@ https://github.com/Tencent/ncnn/releases/latest Android - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-android-vulkan.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-android.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-android-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-android.zip) @@ -111,8 +111,8 @@ https://github.com/Tencent/ncnn/releases/latest Android shared - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-android-vulkan-shared.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-android-shared.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-android-vulkan-shared.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-android-shared.zip) @@ -159,8 +159,8 @@ https://github.com/Tencent/ncnn/releases/latest iOS - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-ios-vulkan.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-ios.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ios-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ios.zip) @@ -173,8 +173,8 @@ https://github.com/Tencent/ncnn/releases/latest iOS-Simulator - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-ios-simulator-vulkan.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-ios-simulator.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ios-simulator-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ios-simulator.zip) @@ -193,8 +193,8 @@ https://github.com/Tencent/ncnn/releases/latest macOS - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-macos-vulkan.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-macos.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-macos-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-macos.zip) @@ -207,8 +207,8 @@ https://github.com/Tencent/ncnn/releases/latest Mac-Catalyst - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-mac-catalyst-vulkan.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-mac-catalyst.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-mac-catalyst-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-mac-catalyst.zip) @@ -221,7 +221,7 @@ https://github.com/Tencent/ncnn/releases/latest watchOS - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-watchos.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-watchos.zip) @@ -234,7 +234,7 @@ https://github.com/Tencent/ncnn/releases/latest watchOS-Simulator - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-watchos-simulator.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-watchos-simulator.zip) @@ -242,8 +242,8 @@ https://github.com/Tencent/ncnn/releases/latest tvOS - 
[](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-tvos-vulkan.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-tvos.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-tvos-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-tvos.zip) @@ -256,8 +256,8 @@ https://github.com/Tencent/ncnn/releases/latest tvOS-Simulator - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-tvos-simulator-vulkan.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-tvos-simulator.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-tvos-simulator-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-tvos-simulator.zip) @@ -265,7 +265,8 @@ https://github.com/Tencent/ncnn/releases/latest visionOS - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-visionos.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-visionos-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-visionos.zip) @@ -278,7 +279,8 @@ https://github.com/Tencent/ncnn/releases/latest visionOS-Simulator - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-visionos-simulator.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-visionos-simulator-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-visionos-simulator.zip) @@ -286,8 +288,8 @@ https://github.com/Tencent/ncnn/releases/latest Apple xcframework - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-apple-vulkan.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-apple.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-apple-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-apple.zip) @@ -296,10 +298,10 @@ https://github.com/Tencent/ncnn/releases/latest - + - + - [Build for Linux / NVIDIA Jetson / Raspberry Pi3, Pi4 / POWER](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-linux) @@ -309,11 +311,11 @@ https://github.com/Tencent/ncnn/releases/latest Ubuntu 20.04 - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-ubuntu-2004.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-ubuntu-2004-shared.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ubuntu-2004.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ubuntu-2004-shared.zip) - + [](https://github.com/Tencent/ncnn/actions?query=workflow%3Alinux-x64-gpu-gcc) @@ -323,8 +325,17 @@ https://github.com/Tencent/ncnn/releases/latest Ubuntu 22.04 - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-ubuntu-2204.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-ubuntu-2204-shared.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ubuntu-2204.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ubuntu-2204-shared.zip) + + + + +Ubuntu 24.04 + + + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ubuntu-2404.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ubuntu-2404-shared.zip) @@ -344,8 +355,8 @@ https://github.com/Tencent/ncnn/releases/latest 
VS2015 - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-windows-vs2015.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-windows-vs2015-shared.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-windows-vs2015.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-windows-vs2015-shared.zip) @@ -358,8 +369,8 @@ https://github.com/Tencent/ncnn/releases/latest VS2017 - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-windows-vs2017.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-windows-vs2017-shared.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-windows-vs2017.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-windows-vs2017-shared.zip) @@ -367,8 +378,8 @@ https://github.com/Tencent/ncnn/releases/latest VS2019 - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-windows-vs2019.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-windows-vs2019-shared.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-windows-vs2019.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-windows-vs2019-shared.zip) @@ -376,8 +387,8 @@ https://github.com/Tencent/ncnn/releases/latest VS2022 - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-windows-vs2022.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-windows-vs2022-shared.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-windows-vs2022.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-windows-vs2022-shared.zip) @@ -396,7 +407,7 @@ https://github.com/Tencent/ncnn/releases/latest WebAssembly - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-webassembly.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-webassembly.zip) From 5e2d56d025d1f40a2a26b5cc4733547cacd2dd8f Mon Sep 17 00:00:00 2001 From: nihui Date: Fri, 30 Aug 2024 12:27:54 +0800 Subject: [PATCH 32/38] pnnx fuse mobilevit style selfattention, onnx2pnnx handle more general gemm (#5659) --- tools/pnnx/src/pass_level2/F_hardswish.cpp | 26 ++++++++ tools/pnnx/src/pass_level2/F_linear.cpp | 66 ++++++++++++++++++- .../pass_level5/fuse_multiheadattention.cpp | 53 +++++++++++++++ 3 files changed, 143 insertions(+), 2 deletions(-) diff --git a/tools/pnnx/src/pass_level2/F_hardswish.cpp b/tools/pnnx/src/pass_level2/F_hardswish.cpp index caa724f55a7..2ce9e1b420b 100644 --- a/tools/pnnx/src/pass_level2/F_hardswish.cpp +++ b/tools/pnnx/src/pass_level2/F_hardswish.cpp @@ -343,4 +343,30 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_hardswish_onnx_2, 9) +class F_hardswish_onnx_3 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +8 7 +pnnx.Input input 0 1 input +prim::Constant op_0 0 1 v3 value=3 +aten::add op_1 2 1 input v3 a +aten::clamp op_2 1 1 a b max=6 min=0 +aten::mul op_3 2 1 input b c +prim::Constant op_4 0 1 v6 value=6 +aten::div op_5 2 1 c v6 out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "F.hardswish"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_hardswish_onnx_3, 9) + } // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/F_linear.cpp 
b/tools/pnnx/src/pass_level2/F_linear.cpp index 4c454581ec3..62f9d62e505 100644 --- a/tools/pnnx/src/pass_level2/F_linear.cpp +++ b/tools/pnnx/src/pass_level2/F_linear.cpp @@ -129,7 +129,7 @@ class F_linear_onnx : public GraphRewriterPass pnnx.Input input_0 0 1 input pnnx.Input input_1 0 1 weight pnnx.Input input_2 0 1 bias -Gemm op_0 3 1 input weight bias out alpha=1.000000e+00 beta=1.000000e+00 transB=1 +Gemm gemm 3 1 input weight bias out %*=%* pnnx.Output output 1 0 out )PNNXIR"; } @@ -138,6 +138,39 @@ pnnx.Output output 1 0 out { return "F.linear"; } + + bool match(const std::map& matched_operators, const std::map& captured_params, const std::map& /*captured_attrs*/) const + { + if (captured_params.find("gemm.alpha") != captured_params.end()) + { + if (captured_params.at("gemm.alpha").type != 3 || captured_params.at("gemm.alpha").f != 1.f) + return false; + } + + if (captured_params.find("gemm.beta") != captured_params.end()) + { + if (captured_params.at("gemm.beta").type != 3 || captured_params.at("gemm.beta").f != 1.f) + return false; + } + + if (captured_params.find("gemm.transA") != captured_params.end()) + { + if (captured_params.at("gemm.transA").type != 2 || captured_params.at("gemm.transA").i != 0) + return false; + } + + if (captured_params.find("gemm.transB") == captured_params.end()) + return false; + + if (captured_params.at("gemm.transB").type != 2 || captured_params.at("gemm.transB").i != 1) + return false; + + return true; + } + + void write(Operator* op, const std::map& /*captured_params*/) const + { + } }; REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_linear_onnx, 10) @@ -152,7 +185,7 @@ class F_linear_onnx_1 : public GraphRewriterPass pnnx.Input input_0 0 1 input pnnx.Input input_1 0 1 bias pnnx.Attribute weight 0 1 weight @data=(%in_features,%out_features)f32 -Gemm gemm 3 1 input weight bias out alpha=1.000000e+00 beta=1.000000e+00 +Gemm gemm 3 1 input weight bias out %*=%* pnnx.Output output 1 0 out )PNNXIR"; } @@ -169,6 +202,35 @@ pnnx.Output output 1 0 out )PNNXIR"; } + bool match(const std::map& matched_operators, const std::map& captured_params, const std::map& /*captured_attrs*/) const + { + if (captured_params.find("gemm.alpha") != captured_params.end()) + { + if (captured_params.at("gemm.alpha").type != 3 || captured_params.at("gemm.alpha").f != 1.f) + return false; + } + + if (captured_params.find("gemm.beta") != captured_params.end()) + { + if (captured_params.at("gemm.beta").type != 3 || captured_params.at("gemm.beta").f != 1.f) + return false; + } + + if (captured_params.find("gemm.transA") != captured_params.end()) + { + if (captured_params.at("gemm.transA").type != 2 || captured_params.at("gemm.transA").i != 0) + return false; + } + + if (captured_params.find("gemm.transB") != captured_params.end()) + { + if (captured_params.at("gemm.transB").type != 2 || captured_params.at("gemm.transB").i != 0) + return false; + } + + return true; + } + void write(const std::map& ops, const std::map& captured_params, const std::map& captured_attrs) const { const int in_features = captured_params.at("in_features").i; diff --git a/tools/pnnx/src/pass_level5/fuse_multiheadattention.cpp b/tools/pnnx/src/pass_level5/fuse_multiheadattention.cpp index b6297eb8a92..c178788f2a7 100644 --- a/tools/pnnx/src/pass_level5/fuse_multiheadattention.cpp +++ b/tools/pnnx/src/pass_level5/fuse_multiheadattention.cpp @@ -702,6 +702,57 @@ pnnx.Output output 1 0 out } }; +class fuse_multiheadattention_pass_1_1_1 : public fuse_multiheadattention_pass_sameqkv +{ +public: + const char* 
match_pattern_graph() const + { + return R"PNNXIR(7767517 +19 18 +pnnx.Input input 0 1 input +nn.Linear op_0 1 1 input 256 bias=%qbias in_features=%embed_dim out_features=%embed_dim @bias @weight +nn.Linear op_1 1 1 input 257 bias=%kbias in_features=%embed_dim out_features=%embed_dim @bias @weight +nn.Linear op_2 1 1 input 260 bias=%vbias in_features=%embed_dim out_features=%embed_dim @bias @weight +Tensor.view op_3 1 1 256 263 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.view op_4 1 1 257 258 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.view op_5 1 1 260 261 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.permute op_6 1 1 263 264 dims=(0,2,1,3) +Tensor.permute op_7 1 1 258 259 dims=(0,2,1,3) +Tensor.permute op_8 1 1 261 262 dims=(0,2,1,3) +torch.transpose op_9 1 1 259 265 dim0=-1 dim1=-2 +torch.matmul op_10 2 1 264 265 266 +pnnx.Expression op_11 1 1 266 267 expr=div(@0,%sqrt_feat_per_head) +F.softmax softmax 1 1 267 268 dim=%softmax_dim +torch.matmul op_13 2 1 268 262 269 +Tensor.permute op_14 1 1 269 270 dims=(0,2,1,3) +Tensor.reshape op_15 1 1 270 271 shape=(%batch,%size,%embed_dim) +nn.Linear out_proj 1 1 271 out bias=%outbias in_features=%embed_dim out_features=%embed_dim @bias @weight +pnnx.Output output 1 0 out +)PNNXIR"; + } + + bool match(const std::map& matched_operators, const std::map& captured_params, const std::map& /*captured_attrs*/) const + { + const int embed_dim = captured_params.at("embed_dim").i; + const int num_heads = captured_params.at("num_heads").i; + const int feat_per_head = captured_params.at("feat_per_head").i; + const float sqrt_feat_per_head = captured_params.at("sqrt_feat_per_head").f; + const int softmax_dim = captured_params.at("softmax_dim").i; + + if (embed_dim != num_heads * feat_per_head) + return false; + + if (!NearlyEqual(sqrt_feat_per_head, sqrt(feat_per_head), 0.001)) + return false; + + int softmax_input_rank = (int)matched_operators.at("softmax")->inputs[0]->shape.size(); + if (softmax_dim != -1 && softmax_dim != softmax_input_rank - 1) + return false; + + return true; + } +}; + class fuse_multiheadattention_pass_1_2 : public fuse_multiheadattention_pass_qkv { public: @@ -2082,6 +2133,7 @@ void fuse_multiheadattention(Graph& graph) fuse_multiheadattention_pass_q_samekv d; fuse_multiheadattention_pass_1 b1; fuse_multiheadattention_pass_1_1 b11; + fuse_multiheadattention_pass_1_1_1 b111; fuse_multiheadattention_pass_1_2 b12; fuse_multiheadattention_pass_2 c1; fuse_multiheadattention_pass_3 d1; @@ -2122,6 +2174,7 @@ void fuse_multiheadattention(Graph& graph) pnnx_graph_rewrite(graph, &d, opindex); pnnx_graph_rewrite(graph, &b1, opindex); pnnx_graph_rewrite(graph, &b11, opindex); + pnnx_graph_rewrite(graph, &b111, opindex); pnnx_graph_rewrite(graph, &b12, opindex); pnnx_graph_rewrite(graph, &c1, opindex); pnnx_graph_rewrite(graph, &d1, opindex); From 5df5413c81312b0106fe18066b47e2917afabd27 Mon Sep 17 00:00:00 2001 From: nihui Date: Mon, 2 Sep 2024 18:48:01 +0800 Subject: [PATCH 33/38] embed int8 quantization and add embed test (#5667) --- .ci/pnnx.yml | 2 + docs/developer-guide/operators.md | 2 + src/layer/embed.cpp | 88 +++++++++++++++++++++--- src/layer/embed.h | 6 ++ tests/CMakeLists.txt | 1 + tests/test_embed.cpp | 108 ++++++++++++++++++++++++++++++ tools/modelwriter.h | 11 +++ tools/quantize/ncnn2int8.cpp | 52 ++++++++++++++ 8 files changed, 261 insertions(+), 9 deletions(-) create mode 100644 tests/test_embed.cpp diff --git a/.ci/pnnx.yml b/.ci/pnnx.yml index 990690e0c5b..d49da39a0af 100644 --- a/.ci/pnnx.yml +++ 
b/.ci/pnnx.yml @@ -4,12 +4,14 @@ on: branches: [master] paths: - '.ci/pnnx.yml' + - 'src/layer/*' - 'tools/pnnx/**' - '!tools/pnnx/README.md' mr: target-branches: [master] paths: - '.ci/pnnx.yml' + - 'src/layer/*' - 'tools/pnnx/**' - '!tools/pnnx/README.md' concurrency: diff --git a/docs/developer-guide/operators.md b/docs/developer-guide/operators.md index 7594c0843ac..de4d6b428e9 100644 --- a/docs/developer-guide/operators.md +++ b/docs/developer-guide/operators.md @@ -837,11 +837,13 @@ y = embedding(x) | 1 | input_dim | int | 0 | | | 2 | bias_term | int | 0 | | | 3 | weight_data_size | int | 0 | | +| 18 | int8_scale_term| int | 0 | | | weight | type | shape | | ------------- | ----- | --------------------- | | weight_data | float | [weight_data_size] | | bias_term | float | [num_output] | +| weight_data_int8_scales| float | [1] | # Exp ``` diff --git a/src/layer/embed.cpp b/src/layer/embed.cpp index ddda6b8bf19..2b9f8a60042 100644 --- a/src/layer/embed.cpp +++ b/src/layer/embed.cpp @@ -30,6 +30,7 @@ int Embed::load_param(const ParamDict& pd) input_dim = pd.get(1, 0); bias_term = pd.get(2, 0); weight_data_size = pd.get(3, 0); + int8_scale_term = pd.get(18, 0); return 0; } @@ -47,18 +48,23 @@ int Embed::load_model(const ModelBin& mb) return -100; } +#if NCNN_INT8 + if (int8_scale_term) + { + weight_data_int8_scale = mb.load(1, 1)[0]; + } +#endif // NCNN_INT8 + return 0; } -int Embed::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +static void embed(const Mat& bottom_blob, const Mat& weight_data, const Mat& bias_data, Mat& top_blob, int input_dim, const Option& opt) { - int words = static_cast(bottom_blob.total()); + const int num_output = top_blob.w; + const int words = top_blob.h; - top_blob.create(num_output, words, 4u, opt.blob_allocator); - if (top_blob.empty()) - return -100; + const float* bias_ptr = bias_data; - // num_output #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < words; q++) { @@ -73,15 +79,79 @@ int Embed::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) con const float* em = (const float*)weight_data + num_output * word_index; - memcpy(outptr, em, num_output * sizeof(float)); + if (bias_ptr) + { + for (int p = 0; p < num_output; p++) + { + outptr[p] = em[p] + bias_ptr[p]; + } + } + else + { + memcpy(outptr, em, num_output * sizeof(float)); + } + } +} + +#if NCNN_INT8 +static void embed_int8(const Mat& bottom_blob, const Mat& weight_data, float weight_data_int8_scale, const Mat& bias_data, Mat& top_blob, int input_dim, const Option& opt) +{ + const int num_output = top_blob.w; + const int words = top_blob.h; + + const float* bias_ptr = bias_data; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < words; q++) + { + float* outptr = top_blob.row(q); + + int word_index = ((const int*)bottom_blob)[q]; - if (bias_term) + if (word_index < 0) + word_index = 0; + if (word_index >= input_dim) + word_index = input_dim - 1; + + const float descale_em = 1.f / weight_data_int8_scale; + + const signed char* em = (const signed char*)weight_data + num_output * word_index; + + if (bias_ptr) { for (int p = 0; p < num_output; p++) { - outptr[p] += bias_data[p]; + outptr[p] = em[p] * descale_em + bias_ptr[p]; } } + else + { + for (int p = 0; p < num_output; p++) + { + outptr[p] = em[p] * descale_em; + } + } + } +} +#endif // NCNN_INT8 + +int Embed::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int words = static_cast(bottom_blob.total()); + + 
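+    // Editorial sketch, not in the original patch: top_blob is always fp32
+    // (4u elemsize), one row of num_output values per input word index; the
+    // int8 branch below dequantizes each selected row with the stored scale.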
top_blob.create(num_output, words, 4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if NCNN_INT8 + if (int8_scale_term) + { + embed_int8(bottom_blob, weight_data, weight_data_int8_scale, bias_data, top_blob, input_dim, opt); + } + else +#endif // NCNN_INT8 + { + embed(bottom_blob, weight_data, bias_data, top_blob, input_dim, opt); } return 0; diff --git a/src/layer/embed.h b/src/layer/embed.h index 8e236656716..b94c2b17bee 100644 --- a/src/layer/embed.h +++ b/src/layer/embed.h @@ -38,9 +38,15 @@ class Embed : public Layer int weight_data_size; + int int8_scale_term; + // model Mat weight_data; Mat bias_data; + +#if NCNN_INT8 + float weight_data_int8_scale; +#endif }; } // namespace ncnn diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 6c8939fc7c7..e2ddc32a00d 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -101,6 +101,7 @@ ncnn_add_layer_test(Dropout) ncnn_add_layer_test(Einsum) ncnn_add_layer_test(Eltwise) ncnn_add_layer_test(ELU) +ncnn_add_layer_test(Embed) ncnn_add_layer_test(Erf) ncnn_add_layer_test(ExpandDims) ncnn_add_layer_test(Flatten) diff --git a/tests/test_embed.cpp b/tests/test_embed.cpp new file mode 100644 index 00000000000..9c007ee5d7e --- /dev/null +++ b/tests/test_embed.cpp @@ -0,0 +1,108 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "testutil.h" + +static int test_embed(int words, int num_output, int input_dim, int bias) +{ + ncnn::ParamDict pd; + pd.set(0, num_output); + pd.set(1, input_dim); + pd.set(2, bias); + pd.set(3, num_output * input_dim); + + std::vector weights(bias ? 2 : 1); + weights[0] = RandomMat(num_output * input_dim); + if (bias) + weights[1] = RandomMat(num_output); + + ncnn::Mat a(words); + RandomizeInt(a, 0, input_dim); + + int ret = test_layer("Embed", pd, weights, a); + if (ret != 0) + { + fprintf(stderr, "test_embed failed words=%d num_output=%d input_dim=%d bias=%d\n", words, num_output, input_dim, bias); + } + + return ret; +} + +static int test_embed_0() +{ + return 0 + || test_embed(128, 128, 128, 0) + || test_embed(128, 128, 128, 1) + || test_embed(127, 127, 127, 0) + || test_embed(127, 127, 127, 1) + || test_embed(124, 124, 124, 0) + || test_embed(124, 124, 124, 1); +} + +#if NCNN_INT8 +static int test_embed_int8(int words, int num_output, int input_dim, int bias) +{ + ncnn::ParamDict pd; + pd.set(0, num_output); + pd.set(1, input_dim); + pd.set(2, bias); + pd.set(3, num_output * input_dim); + pd.set(18, 2); + + std::vector weights(bias ? 
3 : 2); + weights[0] = RandomS8Mat(num_output * input_dim); + if (bias) + { + weights[1] = RandomMat(num_output); + weights[2] = RandomMat(1, 100.f, 200.f); + } + else + { + weights[1] = RandomMat(1, 100.f, 200.f); + } + + ncnn::Mat a(words); + RandomizeInt(a, 0, input_dim); + + int ret = test_layer("Embed", pd, weights, a); + if (ret != 0) + { + fprintf(stderr, "test_embed_int8 failed words=%d num_output=%d input_dim=%d bias=%d\n", words, num_output, input_dim, bias); + } + + return ret; +} + +static int test_embed_1() +{ + return 0 + || test_embed_int8(128, 128, 128, 0) + || test_embed_int8(128, 128, 128, 1) + || test_embed_int8(127, 127, 127, 0) + || test_embed_int8(127, 127, 127, 1) + || test_embed_int8(124, 124, 124, 0) + || test_embed_int8(124, 124, 124, 1); +} +#endif // NCNN_INT8 + +int main() +{ + SRAND(7767517); + +#if NCNN_INT8 + return test_embed_0() || test_embed_1(); +#else + return test_embed_0(); +#endif +} diff --git a/tools/modelwriter.h b/tools/modelwriter.h index 4f445cfe2a4..39157c453ec 100644 --- a/tools/modelwriter.h +++ b/tools/modelwriter.h @@ -1676,9 +1676,20 @@ int ModelWriter::save(const char* parampath, const char* binpath) fprintf_param_value(" 1=%d", input_dim) fprintf_param_value(" 2=%d", bias_term) fprintf_param_value(" 3=%d", weight_data_size) + fprintf_param_value(" 18=%d", int8_scale_term) fwrite_weight_tag_data(op->weight_data, bp); fwrite_weight_data(op->bias_data, bp); + +#if NCNN_INT8 + // write int8_scale data + if (op->int8_scale_term) + { + ncnn::Mat weight_data_int8_scales(1); + weight_data_int8_scales[0] = op->weight_data_int8_scale; + fwrite_weight_data(weight_data_int8_scales, bp, 90, 100); + } +#endif // NCNN_INT8 } else if (layer->type == "Exp") { diff --git a/tools/quantize/ncnn2int8.cpp b/tools/quantize/ncnn2int8.cpp index 4d19ceb6f16..5e92b333aa5 100644 --- a/tools/quantize/ncnn2int8.cpp +++ b/tools/quantize/ncnn2int8.cpp @@ -133,6 +133,8 @@ class NetQuantize : public ModelWriter int quantize_lstm(); int quantize_gru(); + int quantize_embed(); + int fuse_requantize(); }; @@ -562,6 +564,55 @@ int NetQuantize::quantize_gru() return 0; } +int NetQuantize::quantize_embed() +{ + for (size_t i = 0; i < layers.size(); i++) + { + if (layers[i]->type != "Embed") + continue; + + // Embed - quantize weight from fp32 to int8 + ncnn::Embed* embed = (ncnn::Embed*)layers[i]; + + fprintf(stderr, "quantize_embed %s\n", embed->name.c_str()); + + // TODO move to ncnn2table + + const int num_output = embed->num_output; + const int input_dim = embed->input_dim; + + ncnn::Mat weight_data_int8_scales(1); + { + const float* ptr = embed->weight_data; + float absmax = 0.f; + for (int i = 0; i < embed->weight_data.w; i++) + { + absmax = std::max(absmax, (float)fabs(ptr[i])); + } + + weight_data_int8_scales[0] = absmax == 0.f ? 
1.f : 127 / absmax;
+        }
+
+        {
+            ncnn::Mat weight_data_int8;
+
+            ncnn::Option opt_q = opt;
+            opt_q.blob_allocator = embed->weight_data.allocator;
+            opt_q.use_packing_layout = false;
+            ncnn::quantize_to_int8(embed->weight_data, weight_data_int8, weight_data_int8_scales, opt_q);
+            if (weight_data_int8.empty())
+                return -100;
+
+            embed->weight_data = weight_data_int8;
+        }
+
+        embed->int8_scale_term = 2;
+        embed->weight_data_int8_scale = weight_data_int8_scales[0];
+    }
+
+    return 0;
+}
+
 int NetQuantize::fuse_requantize()
 {
     const size_t layer_count = layers.size();

@@ -809,6 +860,7 @@ int main(int argc, char** argv)
     quantizer.quantize_rnn();
     quantizer.quantize_lstm();
     quantizer.quantize_gru();
+    quantizer.quantize_embed();

     quantizer.fuse_requantize();

From 8077d340a905ff4b15f7c266da85c811983e6291 Mon Sep 17 00:00:00 2001
From: nihui
Date: Tue, 3 Sep 2024 17:16:50 +0800
Subject: [PATCH 34/38] arm neon optimization for rmsnorm (#5668)

---
 src/layer/arm/rmsnorm_arm.cpp         | 417 ++++++++++++++++++++++++++
 src/layer/arm/rmsnorm_arm.h           |  40 +++
 src/layer/arm/rmsnorm_arm_asimdhp.cpp | 272 +++++++++++++++++
 3 files changed, 729 insertions(+)
 create mode 100644 src/layer/arm/rmsnorm_arm.cpp
 create mode 100644 src/layer/arm/rmsnorm_arm.h
 create mode 100644 src/layer/arm/rmsnorm_arm_asimdhp.cpp

diff --git a/src/layer/arm/rmsnorm_arm.cpp b/src/layer/arm/rmsnorm_arm.cpp
new file mode 100644
index 00000000000..e19136ca29d
--- /dev/null
+++ b/src/layer/arm/rmsnorm_arm.cpp
@@ -0,0 +1,417 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
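+
+// Editorial aside, not part of the original patch: a minimal scalar sketch of
+// what the NEON kernels in this file compute. RMSNorm rescales each vector x
+// of length n (here n == affine_size) as
+//
+//     float ss = 0.f;
+//     for (int i = 0; i < n; i++) ss += x[i] * x[i];
+//     const float scale = 1.f / sqrtf(ss / n + eps);
+//     for (int i = 0; i < n; i++) y[i] = x[i] * scale * (gamma ? gamma[i] : 1.f);
+//
+// i.e. LayerNorm without the mean subtraction; gamma is the optional affine
+// weight loaded by the base RMSNorm layer.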
+ +#include "rmsnorm_arm.h" + +#if __ARM_NEON +#include +#endif // __ARM_NEON + +#include "arm_usability.h" +#include "cpu.h" + +namespace ncnn { + +RMSNorm_arm::RMSNorm_arm() +{ +#if __ARM_NEON + support_packing = true; +#if NCNN_ARM82 + support_fp16_storage = cpu_support_arm_asimdhp(); +#endif +#endif // __ARM_NEON + +#if NCNN_BF16 + support_bf16_storage = true; +#endif +} + +static void rmsnorm(float* ptr, const float* gamma_ptr, float eps, int elemcount, int elempack) +{ + const int size = elemcount * elempack; + +#if __ARM_NEON + float32x4_t _rms = vdupq_n_f32(0.f); +#endif // __ARM_NEON + float rms = 0.f; + { + const float* ptr0 = ptr; + + int i = 0; +#if __ARM_NEON + for (; i + 3 < size; i += 4) + { + float32x4_t _p = vld1q_f32(ptr0); + _rms = vmlaq_f32(_rms, _p, _p); + ptr0 += 4; + } +#endif // __ARM_NEON + for (; i < size; i++) + { + rms += ptr0[0] * ptr0[0]; + ptr0++; + } + } + +#if __ARM_NEON + if (elempack == 4) + { + float32x4_t _elemcount = vdupq_n_f32(elemcount); + float32x4_t _eps = vdupq_n_f32(eps); + +#if __aarch64__ + _rms = vdivq_f32(_rms, _elemcount); + _rms = vaddq_f32(_rms, _eps); +#else + float32x4_t _inv_elemcount = vrecpeq_f32(_elemcount); + _inv_elemcount = vmulq_f32(vrecpsq_f32(_elemcount, _inv_elemcount), _inv_elemcount); + _inv_elemcount = vmulq_f32(vrecpsq_f32(_elemcount, _inv_elemcount), _inv_elemcount); + _rms = vmlaq_f32(_eps, _rms, _inv_elemcount); +#endif + + float32x4_t _rsqrt_rms = vrsqrteq_f32(_rms); + _rsqrt_rms = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms, _rsqrt_rms), _rsqrt_rms), _rsqrt_rms); + _rms = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms, _rsqrt_rms), _rsqrt_rms), _rsqrt_rms); + } +#endif // __ARM_NEON + if (elempack == 1) + { +#if __ARM_NEON +#if __aarch64__ + rms += vaddvq_f32(_rms); +#else + float32x2_t _s2 = vadd_f32(vget_low_f32(_rms), vget_high_f32(_rms)); + _s2 = vpadd_f32(_s2, _s2); + rms += vget_lane_f32(_s2, 0); +#endif +#endif // __ARM_NEON + + rms = 1.f / sqrtf(rms / elemcount + eps); +#if __ARM_NEON + _rms = vdupq_n_f32(rms); +#endif // __ARM_NEON + } + + if (gamma_ptr) + { + int i = 0; +#if __ARM_NEON + if (elempack == 4) + { + for (; i + 3 < size; i += 4) + { + float32x4_t _p = vld1q_f32(ptr); + float32x4_t _gamma = vdupq_n_f32(gamma_ptr[0]); + _p = vmulq_f32(_p, _rms); + _p = vmulq_f32(_p, _gamma); + vst1q_f32(ptr, _p); + ptr += 4; + gamma_ptr += 1; + } + } + if (elempack == 1) + { + for (; i + 3 < size; i += 4) + { + float32x4_t _p = vld1q_f32(ptr); + float32x4_t _gamma = vld1q_f32(gamma_ptr); + _p = vmulq_f32(_p, _rms); + _p = vmulq_f32(_p, _gamma); + vst1q_f32(ptr, _p); + ptr += 4; + gamma_ptr += 4; + } + } +#endif // __ARM_NEON + for (; i < size; i++) + { + ptr[0] = (ptr[0] * rms) * gamma_ptr[0]; + ptr++; + gamma_ptr++; + } + } + else + { + int i = 0; +#if __ARM_NEON + for (; i + 3 < size; i += 4) + { + float32x4_t _p = vld1q_f32(ptr); + _p = vmulq_f32(_p, _rms); + vst1q_f32(ptr, _p); + ptr += 4; + } +#endif // __ARM_NEON + for (; i < size; i++) + { + ptr[0] = ptr[0] * rms; + ptr++; + } + } +} + +int RMSNorm_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int elembits = bottom_top_blob.elembits(); + +#if NCNN_ARM82 + if (support_fp16_storage && opt.use_fp16_storage && elembits == 16) + return forward_inplace_fp16s(bottom_top_blob, opt); +#endif + +#if NCNN_BF16 + if (opt.use_bf16_storage && elembits == 16) + return forward_inplace_bf16s(bottom_top_blob, opt); +#endif + + const int dims = bottom_top_blob.dims; + const int w = bottom_top_blob.w; + const int h = bottom_top_blob.h; + const int channels = 
bottom_top_blob.c; + const int elempack = bottom_top_blob.elempack; + + if (dims == 1) + { + // assert affine_size == w + + float* ptr = bottom_top_blob; + rmsnorm(ptr, gamma_data, eps, w * elempack, 1); + } + + if (dims == 2) + { + // assert affine_size == w + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.row(i); + rmsnorm(ptr, gamma_data, eps, w, elempack); + } + } + + if (dims == 3) + { + if (affine_size == w) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.channel(q).row(i); + rmsnorm(ptr, gamma_data, eps, w, elempack); + } + } + } + else // if (affine_size == w * h) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + rmsnorm(ptr, gamma_data, eps, w * h, elempack); + } + } + } + + return 0; +} + +#if NCNN_BF16 +static void rmsnorm_bf16s(unsigned short* ptr, const float* gamma_ptr, float eps, int elemcount, int elempack) +{ + const int size = elemcount * elempack; + +#if __ARM_NEON + float32x4_t _rms = vdupq_n_f32(0.f); +#endif // __ARM_NEON + float rms = 0.f; + { + const unsigned short* ptr0 = ptr; + + int i = 0; +#if __ARM_NEON + for (; i + 3 < size; i += 4) + { + float32x4_t _p = bfloat2float(vld1_u16(ptr0)); + _rms = vmlaq_f32(_rms, _p, _p); + ptr0 += 4; + } +#endif // __ARM_NEON + for (; i < size; i++) + { + float v = bfloat16_to_float32(ptr0[0]); + rms += v * v; + ptr0++; + } + } + +#if __ARM_NEON + if (elempack == 4) + { + float32x4_t _elemcount = vdupq_n_f32(elemcount); + float32x4_t _eps = vdupq_n_f32(eps); + +#if __aarch64__ + _rms = vdivq_f32(_rms, _elemcount); + _rms = vaddq_f32(_rms, _eps); +#else + float32x4_t _inv_elemcount = vrecpeq_f32(_elemcount); + _inv_elemcount = vmulq_f32(vrecpsq_f32(_elemcount, _inv_elemcount), _inv_elemcount); + _inv_elemcount = vmulq_f32(vrecpsq_f32(_elemcount, _inv_elemcount), _inv_elemcount); + _rms = vmlaq_f32(_eps, _rms, _inv_elemcount); +#endif + + float32x4_t _rsqrt_rms = vrsqrteq_f32(_rms); + _rsqrt_rms = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms, _rsqrt_rms), _rsqrt_rms), _rsqrt_rms); + _rms = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms, _rsqrt_rms), _rsqrt_rms), _rsqrt_rms); + } +#endif // __ARM_NEON + if (elempack == 1) + { +#if __ARM_NEON +#if __aarch64__ + rms += vaddvq_f32(_rms); +#else + float32x2_t _s2 = vadd_f32(vget_low_f32(_rms), vget_high_f32(_rms)); + _s2 = vpadd_f32(_s2, _s2); + rms += vget_lane_f32(_s2, 0); +#endif +#endif // __ARM_NEON + + rms = 1.f / sqrtf(rms / elemcount + eps); +#if __ARM_NEON + _rms = vdupq_n_f32(rms); +#endif // __ARM_NEON + } + + if (gamma_ptr) + { + int i = 0; +#if __ARM_NEON + if (elempack == 4) + { + for (; i + 3 < size; i += 4) + { + float32x4_t _p = bfloat2float(vld1_u16(ptr)); + float32x4_t _gamma = vdupq_n_f32(gamma_ptr[0]); + _p = vmulq_f32(_p, _rms); + _p = vmulq_f32(_p, _gamma); + vst1_u16(ptr, float2bfloat(_p)); + ptr += 4; + gamma_ptr += 1; + } + } + if (elempack == 1) + { + for (; i + 3 < size; i += 4) + { + float32x4_t _p = bfloat2float(vld1_u16(ptr)); + float32x4_t _gamma = vld1q_f32(gamma_ptr); + _p = vmulq_f32(_p, _rms); + _p = vmulq_f32(_p, _gamma); + vst1_u16(ptr, float2bfloat(_p)); + ptr += 4; + gamma_ptr += 4; + } + } +#endif // __ARM_NEON + for (; i < size; i++) + { + float v = bfloat16_to_float32(ptr[0]); + ptr[0] = float32_to_bfloat16((v * rms) * gamma_ptr[0]); + ptr++; + gamma_ptr++; + } + } + else + { + 
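        // no gamma: only the reciprocal-rms scale is applied, with a
        // bf16 -> fp32 -> bf16 round trip per element (editorial comment,
        // not in the original patch)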
int i = 0; +#if __ARM_NEON + for (; i + 3 < size; i += 4) + { + float32x4_t _p = bfloat2float(vld1_u16(ptr)); + _p = vmulq_f32(_p, _rms); + vst1_u16(ptr, float2bfloat(_p)); + ptr += 4; + } +#endif // __ARM_NEON + for (; i < size; i++) + { + float v = bfloat16_to_float32(ptr[0]); + ptr[0] = float32_to_bfloat16(v * rms); + ptr++; + } + } +} + +int RMSNorm_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const +{ + const int dims = bottom_top_blob.dims; + const int w = bottom_top_blob.w; + const int h = bottom_top_blob.h; + const int channels = bottom_top_blob.c; + const int elempack = bottom_top_blob.elempack; + + if (dims == 1) + { + // assert affine_size == w + + unsigned short* ptr = bottom_top_blob; + rmsnorm_bf16s(ptr, gamma_data, eps, w * elempack, 1); + } + + if (dims == 2) + { + // assert affine_size == w + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + unsigned short* ptr = bottom_top_blob.row(i); + rmsnorm_bf16s(ptr, gamma_data, eps, w, elempack); + } + } + + if (dims == 3) + { + if (affine_size == w) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + for (int i = 0; i < h; i++) + { + unsigned short* ptr = bottom_top_blob.channel(q).row(i); + rmsnorm_bf16s(ptr, gamma_data, eps, w, elempack); + } + } + } + else // if (affine_size == w * h) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + unsigned short* ptr = bottom_top_blob.channel(q); + rmsnorm_bf16s(ptr, gamma_data, eps, w * h, elempack); + } + } + } + + return 0; +} +#endif // NCNN_BF16 + +} // namespace ncnn diff --git a/src/layer/arm/rmsnorm_arm.h b/src/layer/arm/rmsnorm_arm.h new file mode 100644 index 00000000000..44015333371 --- /dev/null +++ b/src/layer/arm/rmsnorm_arm.h @@ -0,0 +1,40 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_RMSNORM_ARM_H +#define LAYER_RMSNORM_ARM_H + +#include "rmsnorm.h" + +namespace ncnn { + +class RMSNorm_arm : public RMSNorm +{ +public: + RMSNorm_arm(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; + +protected: +#if NCNN_ARM82 + int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; +#endif +#if NCNN_BF16 + int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const; +#endif +}; + +} // namespace ncnn + +#endif // LAYER_RMSNORM_ARM_H diff --git a/src/layer/arm/rmsnorm_arm_asimdhp.cpp b/src/layer/arm/rmsnorm_arm_asimdhp.cpp new file mode 100644 index 00000000000..98d8e696487 --- /dev/null +++ b/src/layer/arm/rmsnorm_arm_asimdhp.cpp @@ -0,0 +1,272 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "rmsnorm_arm.h" + +#if __ARM_NEON +#include +#include "arm_usability.h" +#endif // __ARM_NEON + +namespace ncnn { + +#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +static void rmsnorm_fp16s(__fp16* ptr, const float* gamma_ptr, float eps, int elemcount, int elempack) +{ + const int size = elemcount * elempack; + + float32x4_t _rms0 = vdupq_n_f32(0.f); + float32x4_t _rms1 = vdupq_n_f32(0.f); + float rms = 0.f; + { + const __fp16* ptr0 = ptr; + + int i = 0; + for (; i + 7 < size; i += 8) + { + float16x8_t _p = vld1q_f16(ptr0); + float32x4_t _p0 = vcvt_f32_f16(vget_low_f16(_p)); + float32x4_t _p1 = vcvt_f32_f16(vget_high_f16(_p)); + _rms0 = vmlaq_f32(_rms0, _p0, _p0); + _rms1 = vmlaq_f32(_rms1, _p1, _p1); + ptr0 += 8; + } + for (; i + 3 < size; i += 4) + { + float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr0)); + _rms0 = vmlaq_f32(_rms0, _p, _p); + ptr0 += 4; + } + for (; i < size; i++) + { + rms += (float)ptr0[0] * (float)ptr0[0]; + ptr0++; + } + } + + if (elempack == 8) + { + float32x4_t _elemcount = vdupq_n_f32(elemcount); + float32x4_t _eps = vdupq_n_f32(eps); + + _rms0 = vdivq_f32(_rms0, _elemcount); + _rms1 = vdivq_f32(_rms1, _elemcount); + _rms0 = vaddq_f32(_rms0, _eps); + _rms1 = vaddq_f32(_rms1, _eps); + + float32x4_t _rsqrt_rms0 = vrsqrteq_f32(_rms0); + float32x4_t _rsqrt_rms1 = vrsqrteq_f32(_rms1); + _rsqrt_rms0 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms0, _rsqrt_rms0), _rsqrt_rms0), _rsqrt_rms0); + _rsqrt_rms1 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms1, _rsqrt_rms1), _rsqrt_rms1), _rsqrt_rms1); + _rms0 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms0, _rsqrt_rms0), _rsqrt_rms0), _rsqrt_rms0); + _rms1 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms1, _rsqrt_rms1), _rsqrt_rms1), _rsqrt_rms1); + } + if (elempack == 4) + { + _rms0 = vaddq_f32(_rms0, _rms1); + + float32x4_t _elemcount = vdupq_n_f32(elemcount); + float32x4_t _eps = vdupq_n_f32(eps); + + _rms0 = vdivq_f32(_rms0, _elemcount); + _rms0 = vaddq_f32(_rms0, _eps); + + float32x4_t _rsqrt_rms0 = vrsqrteq_f32(_rms0); + _rsqrt_rms0 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms0, _rsqrt_rms0), _rsqrt_rms0), _rsqrt_rms0); + _rms0 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms0, _rsqrt_rms0), _rsqrt_rms0), _rsqrt_rms0); + _rms1 = _rms0; + } + if (elempack == 1) + { + _rms0 = vaddq_f32(_rms0, _rms1); + rms += vaddvq_f32(_rms0); + + rms = 1.f / sqrtf(rms / elemcount + eps); + _rms0 = vdupq_n_f32(rms); + _rms1 = _rms0; + } + + if (gamma_ptr) + { + int i = 0; + if (elempack == 8) + { + for (; i + 7 < size; i += 8) + { + float16x8_t _p = vld1q_f16(ptr); + float32x4_t _p0 = vcvt_f32_f16(vget_low_f16(_p)); + float32x4_t _p1 = vcvt_f32_f16(vget_high_f16(_p)); + float32x4_t _gamma = vdupq_n_f32(gamma_ptr[0]); + _p0 = vmulq_f32(_p0, _rms0); + _p1 = vmulq_f32(_p1, _rms1); + _p0 = vmulq_f32(_p0, _gamma); + _p1 = vmulq_f32(_p1, _gamma); + _p = vcombine_f16(vcvt_f16_f32(_p0), vcvt_f16_f32(_p1)); + vst1q_f16(ptr, _p); + ptr += 8; + gamma_ptr += 1; + } + } + if (elempack == 4) + { + for (; i + 7 < size; i += 8) + { + float16x8_t _p = 
vld1q_f16(ptr); + float32x4_t _p0 = vcvt_f32_f16(vget_low_f16(_p)); + float32x4_t _p1 = vcvt_f32_f16(vget_high_f16(_p)); + float32x4_t _gamma0 = vdupq_n_f32(gamma_ptr[0]); + float32x4_t _gamma1 = vdupq_n_f32(gamma_ptr[1]); + _p0 = vmulq_f32(_p0, _rms0); + _p1 = vmulq_f32(_p1, _rms1); + _p0 = vmulq_f32(_p0, _gamma0); + _p1 = vmulq_f32(_p1, _gamma1); + _p = vcombine_f16(vcvt_f16_f32(_p0), vcvt_f16_f32(_p1)); + vst1q_f16(ptr, _p); + ptr += 8; + gamma_ptr += 2; + } + for (; i + 3 < size; i += 4) + { + float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr)); + float32x4_t _gamma = vdupq_n_f32(gamma_ptr[0]); + _p = vmulq_f32(_p, _rms0); + _p = vmulq_f32(_p, _gamma); + vst1_f16(ptr, vcvt_f16_f32(_p)); + ptr += 4; + gamma_ptr += 1; + } + } + if (elempack == 1) + { + for (; i + 7 < size; i += 8) + { + float16x8_t _p = vld1q_f16(ptr); + float32x4_t _p0 = vcvt_f32_f16(vget_low_f16(_p)); + float32x4_t _p1 = vcvt_f32_f16(vget_high_f16(_p)); + float32x4_t _gamma0 = vld1q_f32(gamma_ptr); + float32x4_t _gamma1 = vld1q_f32(gamma_ptr + 4); + _p0 = vmulq_f32(_p0, _rms0); + _p1 = vmulq_f32(_p1, _rms1); + _p0 = vmulq_f32(_p0, _gamma0); + _p1 = vmulq_f32(_p1, _gamma1); + _p = vcombine_f16(vcvt_f16_f32(_p0), vcvt_f16_f32(_p1)); + vst1q_f16(ptr, _p); + ptr += 8; + gamma_ptr += 8; + } + for (; i + 3 < size; i += 4) + { + float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr)); + float32x4_t _gamma = vld1q_f32(gamma_ptr); + _p = vmulq_f32(_p, _rms0); + _p = vmulq_f32(_p, _gamma); + vst1_f16(ptr, vcvt_f16_f32(_p)); + ptr += 4; + gamma_ptr += 4; + } + } + for (; i < size; i++) + { + ptr[0] = (__fp16)(((float)ptr[0] * rms) * gamma_ptr[0]); + ptr++; + gamma_ptr++; + } + } + else + { + int i = 0; + for (; i + 7 < size; i += 8) + { + float16x8_t _p = vld1q_f16(ptr); + float32x4_t _p0 = vcvt_f32_f16(vget_low_f16(_p)); + float32x4_t _p1 = vcvt_f32_f16(vget_high_f16(_p)); + _p0 = vmulq_f32(_p0, _rms0); + _p1 = vmulq_f32(_p1, _rms1); + _p = vcombine_f16(vcvt_f16_f32(_p0), vcvt_f16_f32(_p1)); + vst1q_f16(ptr, _p); + ptr += 8; + } + for (; i + 3 < size; i += 4) + { + float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr)); + _p = vmulq_f32(_p, _rms0); + vst1_f16(ptr, vcvt_f16_f32(_p)); + ptr += 4; + } + for (; i < size; i++) + { + ptr[0] = (__fp16)((float)ptr[0] * rms); + ptr++; + } + } +} + +int RMSNorm_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const +{ + const int dims = bottom_top_blob.dims; + const int w = bottom_top_blob.w; + const int h = bottom_top_blob.h; + const int channels = bottom_top_blob.c; + const int elempack = bottom_top_blob.elempack; + + if (dims == 1) + { + // assert affine_size == w + + __fp16* ptr = bottom_top_blob; + rmsnorm_fp16s(ptr, gamma_data, eps, w * elempack, 1); + } + + if (dims == 2) + { + // assert affine_size == w + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + __fp16* ptr = bottom_top_blob.row<__fp16>(i); + rmsnorm_fp16s(ptr, gamma_data, eps, w, elempack); + } + } + + if (dims == 3) + { + if (affine_size == w) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + for (int i = 0; i < h; i++) + { + __fp16* ptr = bottom_top_blob.channel(q).row<__fp16>(i); + rmsnorm_fp16s(ptr, gamma_data, eps, w, elempack); + } + } + } + else // if (affine_size == w * h) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + rmsnorm_fp16s(ptr, gamma_data, eps, w * h, elempack); + } + } + } + + return 0; +} +#endif // 
__ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +} // namespace ncnn From 204583ba52cbc1e4b39b4e77ee1b050eeb1734b7 Mon Sep 17 00:00:00 2001 From: nihui Date: Tue, 3 Sep 2024 17:17:03 +0800 Subject: [PATCH 35/38] x86 sse2/avx/avx512 optimization for rmsnorm (#5672) --- src/layer/x86/rmsnorm_x86.cpp | 413 ++++++++++++++++++++++++++++++++++ src/layer/x86/rmsnorm_x86.h | 32 +++ 2 files changed, 445 insertions(+) create mode 100644 src/layer/x86/rmsnorm_x86.cpp create mode 100644 src/layer/x86/rmsnorm_x86.h diff --git a/src/layer/x86/rmsnorm_x86.cpp b/src/layer/x86/rmsnorm_x86.cpp new file mode 100644 index 00000000000..db592c3e381 --- /dev/null +++ b/src/layer/x86/rmsnorm_x86.cpp @@ -0,0 +1,413 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "rmsnorm_x86.h" + +#if __SSE2__ +#include +#if __AVX__ +#include +#endif // __AVX__ +#endif // __SSE2__ + +#include "x86_usability.h" + +namespace ncnn { + +RMSNorm_x86::RMSNorm_x86() +{ +#if __SSE2__ + support_packing = true; +#endif // __SSE2__ +} + +static void rmsnorm(float* ptr, const float* gamma_ptr, float eps, int elemcount, int elempack) +{ + const int size = elemcount * elempack; + +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + __m512 _rms_avx512 = _mm512_set1_ps(0.f); +#endif // __AVX512F__ + __m256 _rms_avx = _mm256_set1_ps(0.f); +#endif // __AVX__ + __m128 _rms = _mm_set1_ps(0.f); +#endif // __SSE2__ + float rms = 0.f; + { + const float* ptr0 = ptr; + + int i = 0; +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + for (; i + 15 < size; i += 16) + { + __m512 _p = _mm512_loadu_ps(ptr0); + _rms_avx512 = _mm512_fmadd_ps(_p, _p, _rms_avx512); + ptr0 += 16; + } +#endif // __AVX512F__ + for (; i + 7 < size; i += 8) + { + __m256 _p = _mm256_loadu_ps(ptr0); + _rms_avx = _mm256_comp_fmadd_ps(_p, _p, _rms_avx); + ptr0 += 8; + } +#endif // __AVX__ + for (; i + 3 < size; i += 4) + { + __m128 _p = _mm_loadu_ps(ptr0); + _rms = _mm_comp_fmadd_ps(_p, _p, _rms); + ptr0 += 4; + } +#endif // __SSE2__ + for (; i < size; i++) + { + rms += ptr0[0] * ptr0[0]; + ptr0++; + } + } + +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + if (elempack == 16) + { + __m512 _elemcount = _mm512_set1_ps((float)elemcount); + __m512 _eps = _mm512_set1_ps(eps); + + _rms_avx512 = _mm512_div_ps(_rms_avx512, _elemcount); + _rms_avx512 = _mm512_add_ps(_rms_avx512, _eps); + + __m256 _rms0 = _mm256_rsqrt_ps(_mm512_extractf32x8_ps(_rms_avx512, 0)); + __m256 _rms1 = _mm256_rsqrt_ps(_mm512_extractf32x8_ps(_rms_avx512, 1)); + _rms_avx512 = _mm512_insertf32x8(_mm512_castps256_ps512(_rms0), _rms1, 1); + } +#endif // __AVX512F__ + if (elempack == 8) + { +#if __AVX512F__ + { + __m256 _rms0 = _mm512_castps512_ps256(_rms_avx512); + __m256 _rms1 = _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(_rms_avx512), 1)); + _rms_avx = _mm256_add_ps(_rms_avx, _rms0); + _rms_avx = _mm256_add_ps(_rms_avx, _rms1); + } +#endif // __AVX512F__ + + 
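+        // Editorial note, not in the original patch: this x86 path applies the
+        // raw _mm256_rsqrt_ps approximation (roughly 12-bit relative accuracy)
+        // without the extra Newton-Raphson refinement the NEON version performs
+        // via vrsqrts, trading a little precision for fewer instructions.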
__m256 _elemcount = _mm256_set1_ps((float)elemcount); + __m256 _eps = _mm256_set1_ps(eps); + + _rms_avx = _mm256_div_ps(_rms_avx, _elemcount); + _rms_avx = _mm256_add_ps(_rms_avx, _eps); + + _rms_avx = _mm256_rsqrt_ps(_rms_avx); +#if __AVX512F__ + _rms_avx512 = _mm512_insertf32x8(_mm512_castps256_ps512(_rms_avx), _rms_avx, 1); +#endif // __AVX512F__ + } +#endif // __AVX__ + if (elempack == 4) + { +#if __AVX__ +#if __AVX512F__ + { + __m256 _rms0 = _mm512_castps512_ps256(_rms_avx512); + __m256 _rms1 = _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(_rms_avx512), 1)); + _rms_avx = _mm256_add_ps(_rms_avx, _rms0); + _rms_avx = _mm256_add_ps(_rms_avx, _rms1); + } +#endif // __AVX512F__ + { + __m128 _rms0 = _mm256_castps256_ps128(_rms_avx); + __m128 _rms1 = _mm256_extractf128_ps(_rms_avx, 1); + _rms = _mm_add_ps(_rms, _rms0); + _rms = _mm_add_ps(_rms, _rms1); + } +#endif // __AVX__ + + __m128 _elemcount = _mm_set1_ps((float)elemcount); + __m128 _eps = _mm_set1_ps(eps); + + _rms = _mm_div_ps(_rms, _elemcount); + _rms = _mm_add_ps(_rms, _eps); + + _rms = _mm_rsqrt_ps(_rms); +#if __AVX__ + _rms_avx = _mm256_insertf128_ps(_mm256_castps128_ps256(_rms), _rms, 1); +#if __AVX512F__ + _rms_avx512 = _mm512_insertf32x8(_mm512_castps256_ps512(_rms_avx), _rms_avx, 1); +#endif // __AVX512F__ +#endif // __AVX__ + } +#endif // __SSE2__ + if (elempack == 1) + { +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + rms += _mm512_comp_reduce_add_ps(_rms_avx512); +#endif // __AVX512F__ + rms += _mm256_reduce_add_ps(_rms_avx); +#endif // __AVX__ + rms += _mm_reduce_add_ps(_rms); +#endif // __SSE2__ + + rms = 1.f / sqrtf(rms / elemcount + eps); +#if __SSE2__ + _rms = _mm_set1_ps(rms); +#if __AVX__ + _rms_avx = _mm256_insertf128_ps(_mm256_castps128_ps256(_rms), _rms, 1); +#if __AVX512F__ + _rms_avx512 = _mm512_insertf32x8(_mm512_castps256_ps512(_rms_avx), _rms_avx, 1); +#endif // __AVX512F__ +#endif // __AVX__ +#endif // __SSE2__ + } + + if (gamma_ptr) + { + int i = 0; +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + if (elempack == 16) + { + for (; i + 15 < size; i += 16) + { + __m512 _p = _mm512_loadu_ps(ptr); + __m512 _gamma = _mm512_set1_ps(gamma_ptr[0]); + _p = _mm512_mul_ps(_p, _rms_avx512); + _p = _mm512_mul_ps(_p, _gamma); + _mm512_storeu_ps(ptr, _p); + ptr += 16; + gamma_ptr += 1; + } + } +#endif // __AVX512F__ + if (elempack == 8) + { +#if __AVX512F__ + for (; i + 15 < size; i += 16) + { + __m512 _p = _mm512_loadu_ps(ptr); + __m256 _gamma0 = _mm256_set1_ps(gamma_ptr[0]); + __m256 _gamma1 = _mm256_set1_ps(gamma_ptr[1]); + __m512 _gamma = _mm512_insertf32x8(_mm512_castps256_ps512(_gamma0), _gamma1, 1); + _p = _mm512_mul_ps(_p, _rms_avx512); + _p = _mm512_mul_ps(_p, _gamma); + _mm512_storeu_ps(ptr, _p); + ptr += 16; + gamma_ptr += 2; + } +#endif // __AVX512F__ + for (; i + 7 < size; i += 8) + { + __m256 _p = _mm256_loadu_ps(ptr); + __m256 _gamma = _mm256_set1_ps(gamma_ptr[0]); + _p = _mm256_mul_ps(_p, _rms_avx); + _p = _mm256_mul_ps(_p, _gamma); + _mm256_storeu_ps(ptr, _p); + ptr += 8; + gamma_ptr += 1; + } + } +#endif // __AVX__ + if (elempack == 4) + { +#if __AVX__ +#if __AVX512F__ + for (; i + 15 < size; i += 16) + { + __m512 _p = _mm512_loadu_ps(ptr); + __m128 _gamma0 = _mm_set1_ps(gamma_ptr[0]); + __m128 _gamma1 = _mm_set1_ps(gamma_ptr[1]); + __m128 _gamma2 = _mm_set1_ps(gamma_ptr[2]); + __m128 _gamma3 = _mm_set1_ps(gamma_ptr[3]); + __m256 _gamma01 = _mm256_insertf128_ps(_mm256_castps128_ps256(_gamma0), _gamma1, 1); + __m256 _gamma23 = _mm256_insertf128_ps(_mm256_castps128_ps256(_gamma2), _gamma3, 1); + 
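// gather the four broadcast per-group gamma values into one 512-bit multiplier + 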
__m512 _gamma = _mm512_insertf32x8(_mm512_castps256_ps512(_gamma01), _gamma23, 1); + _p = _mm512_mul_ps(_p, _rms_avx512); + _p = _mm512_mul_ps(_p, _gamma); + _mm512_storeu_ps(ptr, _p); + ptr += 16; + gamma_ptr += 4; + } +#endif // __AVX512F__ + for (; i + 7 < size; i += 8) + { + __m256 _p = _mm256_loadu_ps(ptr); + __m128 _gamma0 = _mm_set1_ps(gamma_ptr[0]); + __m128 _gamma1 = _mm_set1_ps(gamma_ptr[1]); + __m256 _gamma = _mm256_insertf128_ps(_mm256_castps128_ps256(_gamma0), _gamma1, 1); + _p = _mm256_mul_ps(_p, _rms_avx); + _p = _mm256_mul_ps(_p, _gamma); + _mm256_storeu_ps(ptr, _p); + ptr += 8; + gamma_ptr += 2; + } +#endif // __AVX__ + for (; i + 3 < size; i += 4) + { + __m128 _p = _mm_loadu_ps(ptr); + __m128 _gamma = _mm_set1_ps(gamma_ptr[0]); + _p = _mm_mul_ps(_p, _rms); + _p = _mm_mul_ps(_p, _gamma); + _mm_storeu_ps(ptr, _p); + ptr += 4; + gamma_ptr += 1; + } + } + if (elempack == 1) + { +#if __AVX__ +#if __AVX512F__ + for (; i + 15 < size; i += 16) + { + __m512 _p = _mm512_loadu_ps(ptr); + __m512 _gamma = _mm512_loadu_ps(gamma_ptr); + _p = _mm512_mul_ps(_p, _rms_avx512); + _p = _mm512_mul_ps(_p, _gamma); + _mm512_storeu_ps(ptr, _p); + ptr += 16; + gamma_ptr += 16; + } +#endif // __AVX512F__ + for (; i + 7 < size; i += 8) + { + __m256 _p = _mm256_loadu_ps(ptr); + __m256 _gamma = _mm256_loadu_ps(gamma_ptr); + _p = _mm256_mul_ps(_p, _rms_avx); + _p = _mm256_mul_ps(_p, _gamma); + _mm256_storeu_ps(ptr, _p); + ptr += 8; + gamma_ptr += 8; + } +#endif // __AVX__ + for (; i + 3 < size; i += 4) + { + __m128 _p = _mm_loadu_ps(ptr); + __m128 _gamma = _mm_loadu_ps(gamma_ptr); + _p = _mm_mul_ps(_p, _rms); + _p = _mm_mul_ps(_p, _gamma); + _mm_storeu_ps(ptr, _p); + ptr += 4; + gamma_ptr += 4; + } + } +#endif // __SSE2__ + for (; i < size; i++) + { + ptr[0] = (ptr[0] * rms) * gamma_ptr[0]; + ptr++; + gamma_ptr++; + } + } + else + { + int i = 0; +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + for (; i + 15 < size; i += 16) + { + __m512 _p = _mm512_loadu_ps(ptr); + _p = _mm512_mul_ps(_p, _rms_avx512); + _mm512_storeu_ps(ptr, _p); + ptr += 16; + } +#endif // __AVX512F__ + for (; i + 7 < size; i += 8) + { + __m256 _p = _mm256_loadu_ps(ptr); + _p = _mm256_mul_ps(_p, _rms_avx); + _mm256_storeu_ps(ptr, _p); + ptr += 8; + } +#endif // __AVX__ + for (; i + 3 < size; i += 4) + { + __m128 _p = _mm_loadu_ps(ptr); + _p = _mm_mul_ps(_p, _rms); + _mm_storeu_ps(ptr, _p); + ptr += 4; + } +#endif // __SSE2__ + for (; i < size; i++) + { + ptr[0] = ptr[0] * rms; + ptr++; + } + } +} + +int RMSNorm_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + const int dims = bottom_top_blob.dims; + const int w = bottom_top_blob.w; + const int h = bottom_top_blob.h; + const int channels = bottom_top_blob.c; + const int elempack = bottom_top_blob.elempack; + + if (dims == 1) + { + // assert affine_size == w + + float* ptr = bottom_top_blob; + rmsnorm(ptr, gamma_data, eps, w * elempack, 1); + } + + if (dims == 2) + { + // assert affine_size == w + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.row(i); + rmsnorm(ptr, gamma_data, eps, w, elempack); + } + } + + if (dims == 3) + { + if (affine_size == w) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.channel(q).row(i); + rmsnorm(ptr, gamma_data, eps, w, elempack); + } + } + } + else // if (affine_size == w * h) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int 
q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + rmsnorm(ptr, gamma_data, eps, w * h, elempack); + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/x86/rmsnorm_x86.h b/src/layer/x86/rmsnorm_x86.h new file mode 100644 index 00000000000..2e6296db1c3 --- /dev/null +++ b/src/layer/x86/rmsnorm_x86.h @@ -0,0 +1,32 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_RMSNORM_X86_H +#define LAYER_RMSNORM_X86_H + +#include "rmsnorm.h" + +namespace ncnn { + +class RMSNorm_x86 : public RMSNorm +{ +public: + RMSNorm_x86(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_RMSNORM_X86_H From 21e54d8c7a789884d1c17dc1b40701bede343975 Mon Sep 17 00:00:00 2001 From: nihui Date: Wed, 4 Sep 2024 08:01:53 +0800 Subject: [PATCH 36/38] update modelwriter for rmsnorm (#5676) --- tools/modelwriter.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tools/modelwriter.h b/tools/modelwriter.h index 39157c453ec..ff86338bca9 100644 --- a/tools/modelwriter.h +++ b/tools/modelwriter.h @@ -99,6 +99,7 @@ #include "layer/reorg.h" #include "layer/requantize.h" #include "layer/reshape.h" +#include "layer/rmsnorm.h" #include "layer/rnn.h" #include "layer/roialign.h" #include "layer/roipooling.h" @@ -2313,6 +2314,17 @@ int ModelWriter::save(const char* parampath, const char* binpath) fprintf_param_value(" 2=%d", c) fprintf_param_value(" 3=%d", permute) } + else if (layer->type == "RMSNorm") + { + ncnn::RMSNorm* op = (ncnn::RMSNorm*)layer; + ncnn::RMSNorm* op_default = (ncnn::RMSNorm*)layer_default; + + fprintf_param_value(" 0=%d", affine_size) + fprintf_param_value(" 1=%e", eps) + fprintf_param_value(" 2=%d", affine) + + fwrite_weight_data(op->gamma_data, bp); + } else if (layer->type == "RNN") { ncnn::RNN* op = (ncnn::RNN*)layer; From 80c78a0e40d2c8843cdbb3917fd00387a0e33ce1 Mon Sep 17 00:00:00 2001 From: nihui Date: Wed, 4 Sep 2024 14:29:09 +0800 Subject: [PATCH 37/38] pnnx fuse t5-layernorm as rmsnorm (#5675) --- tools/pnnx/src/CMakeLists.txt | 1 + tools/pnnx/src/pass_level5.cpp | 2 + tools/pnnx/src/pass_level5/fuse_rmsnorm.cpp | 97 +++++++++++++++++++++ tools/pnnx/src/pass_level5/fuse_rmsnorm.h | 21 +++++ tools/pnnx/tests/CMakeLists.txt | 1 + tools/pnnx/tests/ncnn/test_F_rms_norm.py | 2 +- tools/pnnx/tests/ncnn/test_nn_RMSNorm.py | 2 +- tools/pnnx/tests/test_pnnx_fuse_rmsnorm.py | 77 ++++++++++++++++ 8 files changed, 201 insertions(+), 2 deletions(-) create mode 100644 tools/pnnx/src/pass_level5/fuse_rmsnorm.cpp create mode 100644 tools/pnnx/src/pass_level5/fuse_rmsnorm.h create mode 100644 tools/pnnx/tests/test_pnnx_fuse_rmsnorm.py diff --git a/tools/pnnx/src/CMakeLists.txt b/tools/pnnx/src/CMakeLists.txt index 2c814bd486c..7743a8ae453 100644 --- a/tools/pnnx/src/CMakeLists.txt +++ 
b/tools/pnnx/src/CMakeLists.txt @@ -369,6 +369,7 @@ set(pnnx_pass_level5_SRCS pass_level5/fuse_pixel_unshuffle.cpp pass_level5/fuse_layernorm.cpp pass_level5/fuse_multiheadattention.cpp + pass_level5/fuse_rmsnorm.cpp pass_level5/fuse_scaled_dot_product_attention.cpp pass_level5/fuse_select_to_unbind.cpp pass_level5/fuse_silu.cpp diff --git a/tools/pnnx/src/pass_level5.cpp b/tools/pnnx/src/pass_level5.cpp index 8bb3270aa2c..5f08b80f5ef 100644 --- a/tools/pnnx/src/pass_level5.cpp +++ b/tools/pnnx/src/pass_level5.cpp @@ -44,6 +44,7 @@ #include "pass_level5/fuse_multiheadattention.h" #include "pass_level5/fuse_pad_conv1d.h" #include "pass_level5/fuse_pad_conv2d.h" +#include "pass_level5/fuse_rmsnorm.h" #include "pass_level5/fuse_scaled_dot_product_attention.h" #include "pass_level5/fuse_select_to_unbind.h" #include "pass_level5/fuse_silu.h" @@ -145,6 +146,7 @@ void pass_level5(Graph& g, const std::set& foldable_constants, cons fuse_channel_shuffle(g); fuse_layernorm(g); + fuse_rmsnorm(g); fuse_multiheadattention(g); fuse_scaled_dot_product_attention(g); diff --git a/tools/pnnx/src/pass_level5/fuse_rmsnorm.cpp b/tools/pnnx/src/pass_level5/fuse_rmsnorm.cpp new file mode 100644 index 00000000000..7b99770ed6e --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_rmsnorm.cpp @@ -0,0 +1,97 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
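+// +// Fuse the hand-written T5-style rmsnorm subgraph +//     out = weight * x * rsqrt(mean(x.pow(2), dim=-1, keepdim=True) + eps) +// into a single nn.RMSNorm operator, covering the rsqrt, reciprocal(sqrt) and +// div(1, sqrt) spellings of the normalization. + 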
+ +#include "fuse_rmsnorm.h" + +#include "pass_level2.h" + +#include +#include + +namespace pnnx { + +class fuse_rmsnorm_pass : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +6 5 +pnnx.Input input 0 1 input +pnnx.Attribute op_0 0 1 weight @data #weight=(%c)f32 +pnnx.Expression op_1 1 1 input sq expr=pow(@0,2) +torch.mean op_2 1 1 sq sqmean dim=(-1) keepdim=True +pnnx.Expression op_3 3 1 weight input sqmean out expr=mul(@0,mul(@1,rsqrt(add(@2,%eps)))) +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* replace_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +nn.RMSNorm rmsnorm 1 1 input out elementwise_affine=True eps=%eps normalized_shape=(%c) @weight=%op_0.data +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +class fuse_rmsnorm_pass_1 : public fuse_rmsnorm_pass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +6 5 +pnnx.Input input 0 1 input +pnnx.Attribute op_0 0 1 weight @data #weight=(%c)f32 +pnnx.Expression op_1 1 1 input sq expr=pow(@0,2.000000e+00) +torch.mean op_2 1 1 sq sqmean dim=(-1) keepdim=True +pnnx.Expression op_3 3 1 weight input sqmean out expr=mul(@0,mul(@1,reciprocal(sqrt(add(@2,%eps))))) +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +class fuse_rmsnorm_pass_onnx : public fuse_rmsnorm_pass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +6 5 +pnnx.Input input 0 1 input +pnnx.Attribute op_0 0 1 weight @data #weight=(%c)f32 +pnnx.Expression op_1 1 1 input sq expr=pow(@0,2.000000e+00) +torch.mean op_2 1 1 sq sqmean dim=(-1) keepdim=True +pnnx.Expression op_3 3 1 weight input sqmean out expr=mul(@0,mul(@1,div(1.000000e+00,sqrt(add(@2,%eps))))) +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +void fuse_rmsnorm(Graph& graph) +{ + fuse_rmsnorm_pass a; + fuse_rmsnorm_pass_1 a1; + fuse_rmsnorm_pass_onnx b; + int opindex = 0; + + pnnx_graph_rewrite(graph, &a, opindex); + pnnx_graph_rewrite(graph, &a1, opindex); + pnnx_graph_rewrite(graph, &b, opindex); +} + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_rmsnorm.h b/tools/pnnx/src/pass_level5/fuse_rmsnorm.h new file mode 100644 index 00000000000..0ba18e37f61 --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_rmsnorm.h @@ -0,0 +1,21 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "ir.h" + +namespace pnnx { + +void fuse_rmsnorm(Graph& graph); + +} // namespace pnnx diff --git a/tools/pnnx/tests/CMakeLists.txt b/tools/pnnx/tests/CMakeLists.txt index daf5501e9d8..0dd566c37b5 100644 --- a/tools/pnnx/tests/CMakeLists.txt +++ b/tools/pnnx/tests/CMakeLists.txt @@ -346,6 +346,7 @@ pnnx_add_test(pnnx_fuse_input_unpack) pnnx_add_test(pnnx_fuse_layernorm) pnnx_add_test(pnnx_fuse_linear_batchnorm1d) pnnx_add_test(pnnx_fuse_multiheadattention) +pnnx_add_test(pnnx_fuse_rmsnorm) pnnx_add_test(pnnx_fuse_scaled_dot_product_attention) pnnx_add_test(pnnx_fuse_select_to_unbind) pnnx_add_test(pnnx_fuse_slice_to_tensor_split) diff --git a/tools/pnnx/tests/ncnn/test_F_rms_norm.py b/tools/pnnx/tests/ncnn/test_F_rms_norm.py index 4e60d9314aa..f30f72f9ac4 100644 --- a/tools/pnnx/tests/ncnn/test_F_rms_norm.py +++ b/tools/pnnx/tests/ncnn/test_F_rms_norm.py @@ -57,7 +57,7 @@ def test(): b = test_F_rms_norm_ncnn.test_inference() for a0, b0 in zip(a, b): - if not torch.allclose(a0, b0, 1e-4, 1e-4): + if not torch.allclose(a0, b0, 1e-3, 1e-3): return False return True diff --git a/tools/pnnx/tests/ncnn/test_nn_RMSNorm.py b/tools/pnnx/tests/ncnn/test_nn_RMSNorm.py index 0d5efa211e4..e69ad1220bc 100644 --- a/tools/pnnx/tests/ncnn/test_nn_RMSNorm.py +++ b/tools/pnnx/tests/ncnn/test_nn_RMSNorm.py @@ -57,7 +57,7 @@ def test(): b = test_nn_RMSNorm_ncnn.test_inference() for a0, b0 in zip(a, b): - if not torch.allclose(a0, b0, 1e-4, 1e-4): + if not torch.allclose(a0, b0, 1e-3, 1e-3): return False return True diff --git a/tools/pnnx/tests/test_pnnx_fuse_rmsnorm.py b/tools/pnnx/tests/test_pnnx_fuse_rmsnorm.py new file mode 100644 index 00000000000..b04fa93442f --- /dev/null +++ b/tools/pnnx/tests/test_pnnx_fuse_rmsnorm.py @@ -0,0 +1,77 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class T5LayerNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + super().__init__() + self.weight = nn.Parameter(torch.rand(hidden_size)) + self.variance_epsilon = eps + + def forward(self, x): + variance = x.pow(2).mean(-1, keepdim=True) + x = x * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * x + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.rmsn_0 = T5LayerNorm(26) + self.rmsn_1 = T5LayerNorm(21) + + def forward(self, x, y): + x = self.rmsn_0(x) + y = self.rmsn_1(y) + return x, y + +def test(): + if version.parse(torch.__version__) < version.parse('2.4'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 64, 26) + y = torch.rand(3, 15, 15, 21) + + a0, a1 = net(x, y) + + # export onnx + torch.onnx.export(net, (x,y), "test.onnx") + + # export torchscript + mod = torch.jit.trace(net, (x, y)) + mod.save("test_pnnx_fuse_rmsnorm.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_pnnx_fuse_rmsnorm.pt inputshape=[1,64,26],[3,15,15,21]") + + # pnnx inference + import test_pnnx_fuse_rmsnorm_pnnx + b0, b1 = test_pnnx_fuse_rmsnorm_pnnx.test_inference() + + return torch.allclose(a0, b0, 1e-4, 1e-4) and torch.allclose(a1, b1, 1e-4, 1e-4) + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) From 9b5f6a39b4a4962accaad58caa771487f61f732a Mon Sep 17 00:00:00 2001 From: Ankush Goel Date: Wed, 25 Sep 2024 06:04:07 +0530 Subject: [PATCH 38/38] fix: typo (#5709) --- docs/faq.en.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/faq.en.md b/docs/faq.en.md index 807c4a9e3ee..44d0068263b 100644 --- a/docs/faq.en.md +++ b/docs/faq.en.md @@ -262,7 +262,7 @@ Fully customizable op, first change to one that can export (e.g. concat slice), Set net.opt.use_vulkan_compute = true before load_param / load_model; -- ## How to ececute multiple blob inputs, multiple blob outputs? +- ## How to execute multiple blob inputs, multiple blob outputs? Multiple execute `ex.input()` and `ex.extract()` like following ``` ex.input("data1", in_1);
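For reference, every SIMD path in the rmsnorm kernels above vectorizes the same scalar computation: rms = 1 / sqrt(mean(x^2) + eps), followed by an elementwise multiply with the optional gamma. The sketch below is illustrative only — the function name rmsnorm_ref is hypothetical, and it assumes a flat float buffer (the elempack == 1 case):

#include <math.h>

// scalar reference: rms = 1 / sqrt(mean(x^2) + eps); y = x * rms * gamma
static void rmsnorm_ref(float* ptr, const float* gamma_ptr, float eps, int elemcount)
{
    // accumulate the sum of squares
    float sq_sum = 0.f;
    for (int i = 0; i < elemcount; i++)
        sq_sum += ptr[i] * ptr[i];

    // normalize by element count, stabilize with eps, invert the root
    const float rms = 1.f / sqrtf(sq_sum / elemcount + eps);

    // scale in place, applying gamma when the layer is affine
    for (int i = 0; i < elemcount; i++)
        ptr[i] = gamma_ptr ? ptr[i] * rms * gamma_ptr[i] : ptr[i] * rms;
}

The packed variants differ only in how gamma is broadcast across lanes and in their use of approximate rsqrt instructions, which is consistent with the test tolerances above being relaxed from 1e-4 to 1e-3.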