From e7cae68a227f7bab2e085a9e1f24437d6749ac23 Mon Sep 17 00:00:00 2001
From: nihui
Date: Sat, 13 Jul 2024 23:56:29 +0800
Subject: [PATCH 01/38] pnnx convert onnx logsoftmax/logsigmoid/mish/selu/sigmoid/silu/softmin/softplus/softshrink/softsign/tanh/tanhshrink (#5581)

---
 tools/pnnx/src/pass_level2/F_log_softmax.cpp | 73 +++++++++++++++++++
 tools/pnnx/src/pass_level2/F_logsigmoid.cpp  | 22 ++++++
 tools/pnnx/src/pass_level2/F_mish.cpp        | 23 ++++++
 tools/pnnx/src/pass_level2/F_selu.cpp        | 21 ++++++
 tools/pnnx/src/pass_level2/F_softmin.cpp     | 22 ++++++
 tools/pnnx/src/pass_level2/F_softplus.cpp    | 58 +++++++++++++++
 tools/pnnx/src/pass_level2/F_softshrink.cpp  | 58 +++++++++++++++
 tools/pnnx/src/pass_level2/F_softsign.cpp    | 24 +++++++
 tools/pnnx/src/pass_level2/F_tanhshrink.cpp  | 22 ++++++
 tools/pnnx/tests/onnx/CMakeLists.txt         | 22 ++++++
 tools/pnnx/tests/onnx/test_F_log_softmax.py  | 66 +++++++++++++++++
 tools/pnnx/tests/onnx/test_F_logsigmoid.py   | 66 +++++++++++++++++
 tools/pnnx/tests/onnx/test_F_mish.py         | 76 ++++++++++++++++++++
 tools/pnnx/tests/onnx/test_F_selu.py         | 66 +++++++++++++++++
 tools/pnnx/tests/onnx/test_F_sigmoid.py      |  9 ++-
 tools/pnnx/tests/onnx/test_F_silu.py         | 69 ++++++++++++++++++
 tools/pnnx/tests/onnx/test_F_softmin.py      | 66 +++++++++++++++++
 tools/pnnx/tests/onnx/test_F_softplus.py     | 70 ++++++++++++++++++
 tools/pnnx/tests/onnx/test_F_softshrink.py   | 70 ++++++++++++++++++
 tools/pnnx/tests/onnx/test_F_softsign.py     | 66 +++++++++++++++++
 tools/pnnx/tests/onnx/test_F_tanh.py         | 66 +++++++++++++++++
 tools/pnnx/tests/onnx/test_F_tanhshrink.py   | 66 +++++++++++++++++
 tools/pnnx/tests/onnx/test_nn_LogSigmoid.py  | 68 ++++++++++++++++++
 tools/pnnx/tests/onnx/test_nn_LogSoftmax.py  | 71 ++++++++++++++++++
 tools/pnnx/tests/onnx/test_nn_Mish.py        | 72 +++++++++++++++++++
 tools/pnnx/tests/onnx/test_nn_SELU.py        | 68 ++++++++++++++++++
 tools/pnnx/tests/onnx/test_nn_SiLU.py        | 68 ++++++++++++++++++
 tools/pnnx/tests/onnx/test_nn_Sigmoid.py     |  9 ++-
 tools/pnnx/tests/onnx/test_nn_Softmin.py     | 71 ++++++++++++++++++
 tools/pnnx/tests/onnx/test_nn_Softplus.py    | 73 +++++++++++++++++++
 tools/pnnx/tests/onnx/test_nn_Softshrink.py  | 73 +++++++++++++++++++
 tools/pnnx/tests/onnx/test_nn_Softsign.py    | 68 ++++++++++++++++++
 tools/pnnx/tests/onnx/test_nn_Tanh.py        | 68 ++++++++++++++++++
 tools/pnnx/tests/onnx/test_nn_Tanhshrink.py  | 68 ++++++++++++++++++
 34 files changed, 1872 insertions(+), 6 deletions(-)
 create mode 100644 tools/pnnx/tests/onnx/test_F_log_softmax.py
 create mode 100644 tools/pnnx/tests/onnx/test_F_logsigmoid.py
 create mode 100644 tools/pnnx/tests/onnx/test_F_mish.py
 create mode 100644 tools/pnnx/tests/onnx/test_F_selu.py
 create mode 100644 tools/pnnx/tests/onnx/test_F_silu.py
 create mode 100644 tools/pnnx/tests/onnx/test_F_softmin.py
 create mode 100644 tools/pnnx/tests/onnx/test_F_softplus.py
 create mode 100644 tools/pnnx/tests/onnx/test_F_softshrink.py
 create mode 100644 tools/pnnx/tests/onnx/test_F_softsign.py
 create mode 100644 tools/pnnx/tests/onnx/test_F_tanh.py
 create mode 100644 tools/pnnx/tests/onnx/test_F_tanhshrink.py
 create mode 100644 tools/pnnx/tests/onnx/test_nn_LogSigmoid.py
 create mode 100644 tools/pnnx/tests/onnx/test_nn_LogSoftmax.py
 create mode 100644 tools/pnnx/tests/onnx/test_nn_Mish.py
 create mode 100644 tools/pnnx/tests/onnx/test_nn_SELU.py
 create mode 100644 tools/pnnx/tests/onnx/test_nn_SiLU.py
 create mode 100644 tools/pnnx/tests/onnx/test_nn_Softmin.py
 create mode 100644 tools/pnnx/tests/onnx/test_nn_Softplus.py
 create mode 100644 tools/pnnx/tests/onnx/test_nn_Softshrink.py
 create mode 100644 tools/pnnx/tests/onnx/test_nn_Softsign.py
 create mode 100644 tools/pnnx/tests/onnx/test_nn_Tanh.py
 create mode 100644 tools/pnnx/tests/onnx/test_nn_Tanhshrink.py

diff --git a/tools/pnnx/src/pass_level2/F_log_softmax.cpp b/tools/pnnx/src/pass_level2/F_log_softmax.cpp
index 0264973783b..ad9eba30d1c 100644
--- a/tools/pnnx/src/pass_level2/F_log_softmax.cpp
+++ b/tools/pnnx/src/pass_level2/F_log_softmax.cpp
@@ -39,4 +39,77 @@ pnnx.Output output 1 0 out
 
 REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_log_softmax, 10)
 
+class F_log_softmax_onnx : public GraphRewriterPass
+{
+public:
+    const char* match_pattern_graph() const
+    {
+        return R"PNNXIR(7767517
+3 2
+pnnx.Input input_0 0 1 input
+LogSoftmax op_0 1 1 input out axis=%dim
+pnnx.Output output 1 0 out
+)PNNXIR";
+    }
+
+    const char* type_str() const
+    {
+        return "F.log_softmax";
+    }
+};
+
+REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_log_softmax_onnx, 10)
+
+class F_log_softmax_onnx_1 : public GraphRewriterPass
+{
+public:
+    const char* match_pattern_graph() const
+    {
+        return R"PNNXIR(7767517
+5 4
+pnnx.Input input_0 0 1 input
+Transpose op_0 1 1 input a perm=%perm
+LogSoftmax op_1 1 1 a b axis=%axis
+Transpose op_2 1 1 b out perm=%perm
+pnnx.Output output 1 0 out
+)PNNXIR";
+    }
+
+    const char* type_str() const
+    {
+        return "F.log_softmax";
+    }
+
+    bool match(const std::map<std::string, Parameter>& captured_params) const
+    {
+        const std::vector<int>& perm = captured_params.at("perm").ai;
+        const int axis = captured_params.at("axis").i;
+
+        if (axis >= (int)perm.size())
+            return false;
+
+        int excount = 0;
+        for (int i = 0; i < (int)perm.size(); i++)
+        {
+            if (perm[i] != i)
+                excount++;
+        }
+
+        if (excount != 2)
+            return false;
+
+        return true;
+    }
+
+    void write(Operator* op, const std::map<std::string, Parameter>& captured_params) const
+    {
+        const std::vector<int>& perm = captured_params.at("perm").ai;
+        const int axis = captured_params.at("axis").i;
+
+        op->params["dim"] = perm[axis];
+    }
+};
+
+REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_log_softmax_onnx_1, 9)
+
 } // namespace pnnx
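Note on F_log_softmax_onnx_1: when log_softmax runs on an axis that the exporter cannot express directly, the ONNX graph carries a Transpose -> LogSoftmax -> Transpose sandwich with the same perm on both sides; the match() guard requires perm to swap exactly two axes, and write() then recovers the original dim as perm[axis]. A minimal PyTorch sketch of the equivalence being matched (illustrative only, not part of the patch):

    import torch
    import torch.nn.functional as F

    x = torch.rand(1, 3, 12, 16)
    perm = [0, 3, 2, 1]   # swaps exactly two axes, so the excount == 2 guard holds
    axis = 3              # LogSoftmax axis in the transposed layout
    a = x.permute(perm).log_softmax(axis).permute(perm)
    b = F.log_softmax(x, dim=perm[axis])  # dim = perm[3] = 1
    assert torch.allclose(a, b)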
diff --git a/tools/pnnx/src/pass_level2/F_logsigmoid.cpp b/tools/pnnx/src/pass_level2/F_logsigmoid.cpp
index e35670686a0..e0d4df607f2 100644
--- a/tools/pnnx/src/pass_level2/F_logsigmoid.cpp
+++ b/tools/pnnx/src/pass_level2/F_logsigmoid.cpp
@@ -37,4 +37,26 @@ pnnx.Output output 1 0 out
 
 REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_logsigmoid, 10)
 
+class F_logsigmoid_onnx : public GraphRewriterPass
+{
+public:
+    const char* match_pattern_graph() const
+    {
+        return R"PNNXIR(7767517
+4 3
+pnnx.Input input 0 1 input
+aten::sigmoid op_0 1 1 input a
+aten::log op_1 1 1 a out
+pnnx.Output output 1 0 out
+)PNNXIR";
+    }
+
+    const char* type_str() const
+    {
+        return "F.logsigmoid";
+    }
+};
+
+REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_logsigmoid_onnx, 9)
+
 } // namespace pnnx
diff --git a/tools/pnnx/src/pass_level2/F_mish.cpp b/tools/pnnx/src/pass_level2/F_mish.cpp
index 1a083ba85d9..485a7e3b0b5 100644
--- a/tools/pnnx/src/pass_level2/F_mish.cpp
+++ b/tools/pnnx/src/pass_level2/F_mish.cpp
@@ -62,4 +62,27 @@ pnnx.Output output 1 0 out
 
 REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_mish_1, 9)
 
+class F_mish_onnx : public GraphRewriterPass
+{
+public:
+    const char* match_pattern_graph() const
+    {
+        return R"PNNXIR(7767517
+5 4
+pnnx.Input input 0 1 input
+Softplus op_0 1 1 input a
+aten::tanh op_1 1 1 a b
+aten::mul op_2 2 1 input b out
+pnnx.Output output 1 0 out
+)PNNXIR";
+    }
+
+    const char* type_str() const
+    {
+        return "F.mish";
+    }
+};
+
+REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_mish_onnx, 9)
+
 } // namespace pnnx
diff --git a/tools/pnnx/src/pass_level2/F_selu.cpp b/tools/pnnx/src/pass_level2/F_selu.cpp
index 592c3dd8ed7..9df970b1bbc 100644
--- a/tools/pnnx/src/pass_level2/F_selu.cpp
+++ b/tools/pnnx/src/pass_level2/F_selu.cpp
@@ -37,4 +37,25 @@ pnnx.Output output 1 0 out
 
 REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_selu, 10)
 
+class F_selu_onnx : public GraphRewriterPass
+{
+public:
+    const char* match_pattern_graph() const
+    {
+        return R"PNNXIR(7767517
+3 2
+pnnx.Input input 0 1 input
+Selu op_0 1 1 input out
+pnnx.Output output 1 0 out
+)PNNXIR";
+    }
+
+    const char* type_str() const
+    {
+        return "F.selu";
+    }
+};
+
+REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_selu_onnx, 10)
+
 } // namespace pnnx
diff --git a/tools/pnnx/src/pass_level2/F_softmin.cpp b/tools/pnnx/src/pass_level2/F_softmin.cpp
index bb0768663c5..89e5d9aeaf8 100644
--- a/tools/pnnx/src/pass_level2/F_softmin.cpp
+++ b/tools/pnnx/src/pass_level2/F_softmin.cpp
@@ -40,4 +40,26 @@ pnnx.Output output 1 0 out
 
 REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_softmin, 9)
 
+class F_softmin_onnx : public GraphRewriterPass
+{
+public:
+    const char* match_pattern_graph() const
+    {
+        return R"PNNXIR(7767517
+4 3
+pnnx.Input input 0 1 input
+aten::neg op_0 1 1 input 6
+Softmax op_1 1 1 6 out axis=%dim
+pnnx.Output output 1 0 out
+)PNNXIR";
+    }
+
+    const char* type_str() const
+    {
+        return "F.softmin";
+    }
+};
+
+REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_softmin_onnx, 9)
+
 } // namespace pnnx
diff --git a/tools/pnnx/src/pass_level2/F_softplus.cpp b/tools/pnnx/src/pass_level2/F_softplus.cpp
index c6a5279b414..8d346eb76ed 100644
--- a/tools/pnnx/src/pass_level2/F_softplus.cpp
+++ b/tools/pnnx/src/pass_level2/F_softplus.cpp
@@ -39,4 +39,62 @@ pnnx.Output output 1 0 out
 
 REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_softplus, 10)
 
+class F_softplus_onnx : public GraphRewriterPass
+{
+public:
+    const char* match_pattern_graph() const
+    {
+        return R"PNNXIR(7767517
+3 2
+pnnx.Input input_0 0 1 input
+Softplus op_0 1 1 input out
+pnnx.Output output 1 0 out
+)PNNXIR";
+    }
+
+    const char* type_str() const
+    {
+        return "F.softplus";
+    }
+
+    void write(Operator* op, const std::map<std::string, Parameter>& /*captured_params*/) const
+    {
+        op->params["beta"] = 1.f;
+        op->params["threshold"] = 20.f;
+    }
+};
+
+REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_softplus_onnx, 10)
+
+class F_softplus_onnx_1 : public GraphRewriterPass
+{
+public:
+    const char* match_pattern_graph() const
+    {
+        return R"PNNXIR(7767517
+7 6
+pnnx.Input input_0 0 1 input
+prim::Constant op_0 0 1 beta value=%beta
+aten::mul op_1 2 1 input beta a
+Softplus op_2 1 1 a b
+prim::Constant op_3 0 1 beta2 value=%beta
+aten::div op_4 2 1 b beta2 out
+pnnx.Output output 1 0 out
+)PNNXIR";
+    }
+
+    const char* type_str() const
+    {
+        return "F.softplus";
+    }
+
+    void write(Operator* op, const std::map<std::string, Parameter>& captured_params) const
+    {
+        op->params["beta"] = captured_params.at("beta");
+        op->params["threshold"] = 20.f;
+    }
+};
+
+REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_softplus_onnx_1, 9)
+
 } // namespace pnnx
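Note on F_softplus_onnx_1: ONNX Softplus has no beta attribute, so a non-default beta surfaces in the exported graph as mul -> Softplus -> div with the same constant on both sides, relying on the identity softplus(x, beta) == softplus(x * beta) / beta. A quick numeric check of that identity (illustrative only, not part of the patch):

    import torch
    import torch.nn.functional as F

    x, beta = torch.randn(8), 2.0
    a = F.softplus(x, beta=beta)
    b = F.softplus(x * beta) / beta   # the mul/Softplus/div chain matched above
    assert torch.allclose(a, b)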
diff --git a/tools/pnnx/src/pass_level2/F_softshrink.cpp b/tools/pnnx/src/pass_level2/F_softshrink.cpp
index 286990bf2c5..8d14a8a644b 100644
--- a/tools/pnnx/src/pass_level2/F_softshrink.cpp
+++ b/tools/pnnx/src/pass_level2/F_softshrink.cpp
@@ -38,4 +38,62 @@ pnnx.Output output 1 0 out
 
 REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_softshrink, 10)
 
+static bool NearlyEqual(float a, float b, float epsilon)
+{
+    if (a == b)
+        return true;
+
+    float diff = (float)fabs(a - b);
+    if (diff <= epsilon)
+        return true;
+
+    // relative error
+    return diff < epsilon * std::max(fabs(a), fabs(b));
+}
+
+class F_softshrink_onnx : public GraphRewriterPass
+{
+public:
+    const char* match_pattern_graph() const
+    {
+        return R"PNNXIR(7767517
+15 14
+pnnx.Input input 0 1 input
+prim::Constant op_0 0 1 lambd value=%lambd
+aten::gt op_1 2 1 input lambd 8
+prim::Constant op_2 0 1 lambd2 value=%lambd
+aten::sub op_3 2 1 input lambd2 9
+prim::Constant op_4 0 1 zero value=0
+aten::where op_5 3 1 8 9 zero a
+prim::Constant op_6 0 1 mlambd value=%lambd2
+aten::lt op_7 2 1 input mlambd 11
+prim::Constant op_8 0 1 lambd3 value=%lambd
+aten::add op_9 2 1 input lambd3 12
+prim::Constant op_10 0 1 zero2 value=0
+aten::where op_11 3 1 11 12 zero2 b
+aten::add op_12 2 1 a b out
+pnnx.Output output 1 0 out
+)PNNXIR";
+    }
+
+    const char* type_str() const
+    {
+        return "F.softshrink";
+    }
+
+    bool match(const std::map<std::string, Parameter>& captured_params) const
+    {
+        float lambd = captured_params.at("lambd").f;
+        float lambd2 = captured_params.at("lambd2").f;
+        return NearlyEqual(lambd, -lambd2, 0.001);
+    }
+
+    void write(Operator* op, const std::map<std::string, Parameter>& captured_params) const
+    {
+        op->params["lambd"] = captured_params.at("lambd");
+    }
+};
+
+REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_softshrink_onnx, 10)
+
 } // namespace pnnx
diff --git a/tools/pnnx/src/pass_level2/F_softsign.cpp b/tools/pnnx/src/pass_level2/F_softsign.cpp
index 4ec8ae9e520..ae6005d6337 100644
--- a/tools/pnnx/src/pass_level2/F_softsign.cpp
+++ b/tools/pnnx/src/pass_level2/F_softsign.cpp
@@ -41,4 +41,28 @@ pnnx.Output output 1 0 out
 
 REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_softsign, 10)
 
+class F_softsign_onnx : public GraphRewriterPass
+{
+public:
+    const char* match_pattern_graph() const
+    {
+        return R"PNNXIR(7767517
+6 5
+pnnx.Input input 0 1 input
+aten::abs op_0 1 1 input 6
+prim::Constant op_1 0 1 8 value=1
+aten::add op_2 2 1 6 8 9
+aten::div op_3 2 1 input 9 out
+pnnx.Output output 1 0 out
+)PNNXIR";
+    }
+
+    const char* type_str() const
+    {
+        return "F.softsign";
+    }
+};
+
+REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_softsign_onnx, 10)
+
 } // namespace pnnx
diff --git a/tools/pnnx/src/pass_level2/F_tanhshrink.cpp b/tools/pnnx/src/pass_level2/F_tanhshrink.cpp
index d8d6c311fcd..01e578bf8ad 100644
--- a/tools/pnnx/src/pass_level2/F_tanhshrink.cpp
+++ b/tools/pnnx/src/pass_level2/F_tanhshrink.cpp
@@ -39,4 +39,26 @@ pnnx.Output output 1 0 out
 
 REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_tanhshrink, 9)
 
+class F_tanhshrink_onnx : public GraphRewriterPass
+{
+public:
+    const char* match_pattern_graph() const
+    {
+        return R"PNNXIR(7767517
+4 3
+pnnx.Input input 0 1 input
+aten::tanh op_0 1 1 input 7
+aten::sub op_1 2 1 input 7 out
+pnnx.Output output 1 0 out
+)PNNXIR";
+    }
+
+    const char* type_str() const
+    {
+        return "F.tanhshrink";
+    }
+};
+
+REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_tanhshrink_onnx, 9)
+
 } // namespace pnnx
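The F_softshrink_onnx pass above undoes the where/where/add expansion of softshrink; its match() uses NearlyEqual so the pattern is accepted only when the two captured thresholds are negatives of each other up to float tolerance. The decomposition being folded back, sketched in plain PyTorch (illustrative only, not part of the patch):

    import torch
    import torch.nn.functional as F

    x, lambd = torch.randn(8), 0.5
    pos = torch.where(x > lambd, x - lambd, torch.zeros_like(x))
    neg = torch.where(x < -lambd, x + lambd, torch.zeros_like(x))
    assert torch.allclose(pos + neg, F.softshrink(x, lambd))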
diff --git a/tools/pnnx/tests/onnx/CMakeLists.txt b/tools/pnnx/tests/onnx/CMakeLists.txt
index 0c0a136fbaf..0e283e77d48 100644
--- a/tools/pnnx/tests/onnx/CMakeLists.txt
+++ b/tools/pnnx/tests/onnx/CMakeLists.txt
@@ -29,16 +29,27 @@ pnnx_onnx_add_test(F_layer_norm)
 pnnx_onnx_add_test(F_leaky_relu)
 pnnx_onnx_add_test(F_linear)
 pnnx_onnx_add_test(F_local_response_norm)
+pnnx_onnx_add_test(F_logsigmoid)
+pnnx_onnx_add_test(F_log_softmax)
 pnnx_onnx_add_test(F_max_pool1d)
 pnnx_onnx_add_test(F_max_pool2d)
 pnnx_onnx_add_test(F_max_pool3d)
+pnnx_onnx_add_test(F_mish)
 pnnx_onnx_add_test(F_pad)
 pnnx_onnx_add_test(F_prelu)
 pnnx_onnx_add_test(F_relu)
 pnnx_onnx_add_test(F_relu6)
 pnnx_onnx_add_test(F_scaled_dot_product_attention)
+pnnx_onnx_add_test(F_selu)
 pnnx_onnx_add_test(F_sigmoid)
+pnnx_onnx_add_test(F_silu)
 pnnx_onnx_add_test(F_softmax)
+pnnx_onnx_add_test(F_softmin)
+pnnx_onnx_add_test(F_softplus)
+pnnx_onnx_add_test(F_softshrink)
+pnnx_onnx_add_test(F_softsign)
+pnnx_onnx_add_test(F_tanh)
+pnnx_onnx_add_test(F_tanhshrink)
 pnnx_onnx_add_test(F_upsample_bilinear)
 pnnx_onnx_add_test(F_upsample_nearest)
 pnnx_onnx_add_test(F_upsample)
@@ -74,10 +85,13 @@ pnnx_onnx_add_test(nn_LayerNorm)
 pnnx_onnx_add_test(nn_LeakyReLU)
 pnnx_onnx_add_test(nn_Linear)
 pnnx_onnx_add_test(nn_LocalResponseNorm)
+pnnx_onnx_add_test(nn_LogSigmoid)
+pnnx_onnx_add_test(nn_LogSoftmax)
 pnnx_onnx_add_test(nn_LSTM)
 pnnx_onnx_add_test(nn_MaxPool1d)
 pnnx_onnx_add_test(nn_MaxPool2d)
 pnnx_onnx_add_test(nn_MaxPool3d)
+pnnx_onnx_add_test(nn_Mish)
 pnnx_onnx_add_test(nn_MultiheadAttention)
 pnnx_onnx_add_test(nn_PReLU)
 pnnx_onnx_add_test(nn_ReflectionPad1d)
@@ -88,8 +102,16 @@ pnnx_onnx_add_test(nn_ReplicationPad1d)
 pnnx_onnx_add_test(nn_ReplicationPad2d)
 pnnx_onnx_add_test(nn_ReplicationPad3d)
 pnnx_onnx_add_test(nn_RNN)
+pnnx_onnx_add_test(nn_SELU)
 pnnx_onnx_add_test(nn_Sigmoid)
+pnnx_onnx_add_test(nn_SiLU)
 pnnx_onnx_add_test(nn_Softmax)
+pnnx_onnx_add_test(nn_Softmin)
+pnnx_onnx_add_test(nn_Softplus)
+pnnx_onnx_add_test(nn_Softshrink)
+pnnx_onnx_add_test(nn_Softsign)
+pnnx_onnx_add_test(nn_Tanh)
+pnnx_onnx_add_test(nn_Tanhshrink)
 pnnx_onnx_add_test(nn_Upsample)
 pnnx_onnx_add_test(nn_UpsamplingBilinear2d)
 pnnx_onnx_add_test(nn_UpsamplingNearest2d)
diff --git a/tools/pnnx/tests/onnx/test_F_log_softmax.py b/tools/pnnx/tests/onnx/test_F_log_softmax.py
new file mode 100644
index 00000000000..8bc657c6778
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_F_log_softmax.py
@@ -0,0 +1,66 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = F.log_softmax(x, 1)
+        y = F.log_softmax(y, 0)
+        z = F.log_softmax(z, 2)
+        w = F.log_softmax(w, 3)
+        return x, y, z, w
+
+def test():
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 16)
+    y = torch.rand(12, 2, 16)
+    z = torch.rand(1, 3, 12, 16)
+    w = torch.rand(1, 5, 7, 9, 11)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_F_log_softmax.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_F_log_softmax.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]")
+
+    # pnnx inference
+    import test_F_log_softmax_pnnx
+    b = test_F_log_softmax_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.equal(a0, b0):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
diff --git a/tools/pnnx/tests/onnx/test_F_logsigmoid.py b/tools/pnnx/tests/onnx/test_F_logsigmoid.py
new file mode 100644
index 00000000000..a731936a109
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_F_logsigmoid.py
@@ -0,0 +1,66 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = F.logsigmoid(x)
+        y = F.logsigmoid(y)
+        z = F.logsigmoid(z)
+        w = F.logsigmoid(w)
+        return x, y, z, w
+
+def test():
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 16)
+    y = torch.rand(12, 2, 16)
+    z = torch.rand(1, 3, 12, 16)
+    w = torch.rand(1, 5, 7, 9, 11)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_F_logsigmoid.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_F_logsigmoid.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]")
+
+    # pnnx inference
+    import test_F_logsigmoid_pnnx
+    b = test_F_logsigmoid_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.equal(a0, b0):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
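test_F_logsigmoid.py exercises the aten::sigmoid -> aten::log chain that F_logsigmoid_onnx matches; the underlying identity is simply log(sigmoid(x)) == logsigmoid(x). For reference (illustrative only, not part of the patch):

    import torch
    import torch.nn.functional as F

    x = torch.randn(8)
    assert torch.allclose(torch.log(torch.sigmoid(x)), F.logsigmoid(x))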
diff --git a/tools/pnnx/tests/onnx/test_F_mish.py b/tools/pnnx/tests/onnx/test_F_mish.py
new file mode 100644
index 00000000000..69026d38b2b
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_F_mish.py
@@ -0,0 +1,76 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from packaging import version
+
+def mish_forward_0(x):
+    return x * F.softplus(x).tanh()
+
+def mish_forward_1(x):
+    return x.mul(torch.tanh(F.softplus(x)))
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = F.mish(x)
+        y = F.mish(y)
+        z = mish_forward_0(z)
+        w = mish_forward_1(w)
+        return x, y, z, w
+
+def test():
+    if version.parse(torch.__version__) < version.parse('1.9'):
+        return True
+
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 16)
+    y = torch.rand(12, 2, 16)
+    z = torch.rand(1, 3, 12, 16)
+    w = torch.rand(1, 5, 7, 9, 11)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_F_mish.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_F_mish.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]")
+
+    # pnnx inference
+    import test_F_mish_pnnx
+    b = test_F_mish_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.allclose(a0, b0, 1e-4, 1e-4):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
diff --git a/tools/pnnx/tests/onnx/test_F_selu.py b/tools/pnnx/tests/onnx/test_F_selu.py
new file mode 100644
index 00000000000..e70f9344191
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_F_selu.py
@@ -0,0 +1,66 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = F.selu(x)
+        y = F.selu(y)
+        z = F.selu(z)
+        w = F.selu(w)
+        return x, y, z, w
+
+def test():
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 16)
+    y = torch.rand(12, 2, 16)
+    z = torch.rand(1, 3, 12, 16)
+    w = torch.rand(1, 5, 7, 9, 11)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_F_selu.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_F_selu.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]")
+
+    # pnnx inference
+    import test_F_selu_pnnx
+    b = test_F_selu_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.equal(a0, b0):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
diff --git a/tools/pnnx/tests/onnx/test_F_sigmoid.py b/tools/pnnx/tests/onnx/test_F_sigmoid.py
index 684a7ab48d9..c90e570e005 100644
--- a/tools/pnnx/tests/onnx/test_F_sigmoid.py
+++ b/tools/pnnx/tests/onnx/test_F_sigmoid.py
@@ -41,7 +41,7 @@ def test():
     z = torch.rand(1, 3, 12, 16)
     w = torch.rand(1, 5, 7, 9, 11)
 
-    a0, a1, a2, a3 = net(x, y, z, w)
+    a = net(x, y, z, w)
 
     # export onnx
     torch.onnx.export(net, (x, y, z, w), "test_F_sigmoid.onnx")
@@ -52,9 +52,12 @@ def test():
 
     # pnnx inference
     import test_F_sigmoid_pnnx
-    b0, b1, b2, b3 = test_F_sigmoid_pnnx.test_inference()
+    b = test_F_sigmoid_pnnx.test_inference()
 
-    return torch.equal(a0, b0) and torch.equal(a1, b1) and torch.equal(a2, b2) and torch.equal(a3, b3)
+    for a0, b0 in zip(a, b):
+        if not torch.equal(a0, b0):
+            return False
+    return True
 
 if __name__ == "__main__":
     if test():
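The test_F_sigmoid.py change above replaces tuple unpacking with a zip loop so the comparison no longer hard-codes four outputs; the same loop recurs in every new test in this patch. A shared helper could express it once — a hypothetical sketch (outputs_equal is not part of the patch):

    import torch

    def outputs_equal(a, b, exact=True, tol=1e-4):
        # pairwise-compare two sequences of tensors
        if len(a) != len(b):
            return False
        for a0, b0 in zip(a, b):
            ok = torch.equal(a0, b0) if exact else torch.allclose(a0, b0, tol, tol)
            if not ok:
                return False
        return True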
diff --git a/tools/pnnx/tests/onnx/test_F_silu.py b/tools/pnnx/tests/onnx/test_F_silu.py
new file mode 100644
index 00000000000..d6cc987262e
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_F_silu.py
@@ -0,0 +1,69 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+def silu_forward_0(x):
+    return x * torch.sigmoid(x)
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = F.silu(x)
+        y = F.silu(y)
+        z = F.silu(z)
+        w = silu_forward_0(w)
+        return x, y, z, w
+
+def test():
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 16)
+    y = torch.rand(12, 2, 16)
+    z = torch.rand(1, 3, 12, 16)
+    w = torch.rand(1, 5, 7, 9, 11)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_F_silu.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_F_silu.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]")
+
+    # pnnx inference
+    import test_F_silu_pnnx
+    b = test_F_silu_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.allclose(a0, b0, 1e-4, 1e-4):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
diff --git a/tools/pnnx/tests/onnx/test_F_softmin.py b/tools/pnnx/tests/onnx/test_F_softmin.py
new file mode 100644
index 00000000000..88a82fea00a
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_F_softmin.py
@@ -0,0 +1,66 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = F.softmin(x, 1)
+        y = F.softmin(y, 0)
+        z = F.softmin(z, 2)
+        w = F.softmin(w, 3)
+        return x, y, z, w
+
+def test():
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 16)
+    y = torch.rand(12, 2, 16)
+    z = torch.rand(1, 3, 12, 16)
+    w = torch.rand(1, 5, 7, 9, 11)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_F_softmin.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_F_softmin.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]")
+
+    # pnnx inference
+    import test_F_softmin_pnnx
+    b = test_F_softmin_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.equal(a0, b0):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
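test_F_softmin.py covers the F_softmin_onnx pass: softmin has no dedicated ONNX operator, so the exported graph carries a Neg -> Softmax chain, and the identity softmin(x, dim) == softmax(-x, dim) lets pnnx restore the original call. A quick check (illustrative only, not part of the patch):

    import torch
    import torch.nn.functional as F

    x = torch.randn(2, 3, 4)
    assert torch.allclose(F.softmin(x, dim=1), F.softmax(-x, dim=1))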
diff --git a/tools/pnnx/tests/onnx/test_F_softplus.py b/tools/pnnx/tests/onnx/test_F_softplus.py
new file mode 100644
index 00000000000..c261f58d67c
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_F_softplus.py
@@ -0,0 +1,70 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from packaging import version
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = F.softplus(x)
+        y = F.softplus(y, 2, 5.2)
+        z = F.softplus(z, -0.7, 15)
+        w = F.softplus(w, 0.1, 0.3)
+        return x, y, z, w
+
+def test():
+    if version.parse(torch.__version__) < version.parse('1.11'):
+        return True
+
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 16)
+    y = torch.rand(12, 2, 16)
+    z = torch.rand(1, 3, 12, 16)
+    w = torch.rand(1, 5, 7, 9, 11)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_F_softplus.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_F_softplus.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]")
+
+    # pnnx inference
+    import test_F_softplus_pnnx
+    b = test_F_softplus_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.allclose(a0, b0, 1e-4, 1e-4):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
diff --git a/tools/pnnx/tests/onnx/test_F_softshrink.py b/tools/pnnx/tests/onnx/test_F_softshrink.py
new file mode 100644
index 00000000000..7f1fb883807
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_F_softshrink.py
@@ -0,0 +1,70 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from packaging import version
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = F.softshrink(x)
+        y = F.softshrink(y, 0.1)
+        z = F.softshrink(z, 0.22)
+        w = F.softshrink(w, 0)
+        return x, y, z, w
+
+def test():
+    if version.parse(torch.__version__) < version.parse('1.11'):
+        return True
+
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 16)
+    y = torch.rand(12, 2, 16)
+    z = torch.rand(1, 3, 12, 16)
+    w = torch.rand(1, 5, 7, 9, 11)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_F_softshrink.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_F_softshrink.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]")
+
+    # pnnx inference
+    import test_F_softshrink_pnnx
+    b = test_F_softshrink_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.equal(a0, b0):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
diff --git a/tools/pnnx/tests/onnx/test_F_softsign.py b/tools/pnnx/tests/onnx/test_F_softsign.py
new file mode 100644
index 00000000000..27164f3dfc1
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_F_softsign.py
@@ -0,0 +1,66 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = F.softsign(x)
+        y = F.softsign(y)
+        z = F.softsign(z)
+        w = F.softsign(w)
+        return x, y, z, w
+
+def test():
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 16)
+    y = torch.rand(12, 2, 16)
+    z = torch.rand(1, 3, 12, 16)
+    w = torch.rand(1, 5, 7, 9, 11)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_F_softsign.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_F_softsign.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]")
+
+    # pnnx inference
+    import test_F_softsign_pnnx
+    b = test_F_softsign_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.equal(a0, b0):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
diff --git a/tools/pnnx/tests/onnx/test_F_tanh.py b/tools/pnnx/tests/onnx/test_F_tanh.py
new file mode 100644
index 00000000000..b56d513f655
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_F_tanh.py
@@ -0,0 +1,66 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = F.tanh(x)
+        y = F.tanh(y)
+        z = F.tanh(z)
+        w = F.tanh(w)
+        return x, y, z, w
+
+def test():
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 16)
+    y = torch.rand(12, 2, 16)
+    z = torch.rand(1, 3, 12, 16)
+    w = torch.rand(1, 5, 7, 9, 11)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_F_tanh.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_F_tanh.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]")
+
+    # pnnx inference
+    import test_F_tanh_pnnx
+    b = test_F_tanh_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.allclose(a0, b0, 1e-4, 1e-4):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
diff --git a/tools/pnnx/tests/onnx/test_F_tanhshrink.py b/tools/pnnx/tests/onnx/test_F_tanhshrink.py
new file mode 100644
index 00000000000..7be2bf57cb1
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_F_tanhshrink.py
@@ -0,0 +1,66 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = F.tanhshrink(x)
+        y = F.tanhshrink(y)
+        z = F.tanhshrink(z)
+        w = F.tanhshrink(w)
+        return x, y, z, w
+
+def test():
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 16)
+    y = torch.rand(12, 2, 16)
+    z = torch.rand(1, 3, 12, 16)
+    w = torch.rand(1, 5, 7, 9, 11)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_F_tanhshrink.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_F_tanhshrink.onnx inputshape=[1,16],[12,2,16],[1,3,12,16],[1,5,7,9,11]")
+
+    # pnnx inference
+    import test_F_tanhshrink_pnnx
+    b = test_F_tanhshrink_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.equal(a0, b0):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
diff --git a/tools/pnnx/tests/onnx/test_nn_LogSigmoid.py b/tools/pnnx/tests/onnx/test_nn_LogSigmoid.py
new file mode 100644
index 00000000000..ddb44cbf442
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_nn_LogSigmoid.py
@@ -0,0 +1,68 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+        self.act_0 = nn.LogSigmoid()
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = self.act_0(x)
+        y = self.act_0(y)
+        z = self.act_0(z)
+        w = self.act_0(w)
+        return x, y, z, w
+
+def test():
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 12)
+    y = torch.rand(1, 12, 64)
+    z = torch.rand(1, 12, 24, 64)
+    w = torch.rand(1, 12, 24, 32, 64)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_nn_LogSigmoid.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_nn_LogSigmoid.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]")
+
+    # pnnx inference
+    import test_nn_LogSigmoid_pnnx
+    b = test_nn_LogSigmoid_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.equal(a0, b0):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
diff --git a/tools/pnnx/tests/onnx/test_nn_LogSoftmax.py b/tools/pnnx/tests/onnx/test_nn_LogSoftmax.py
new file mode 100644
index 00000000000..dbe8dc96d82
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_nn_LogSoftmax.py
@@ -0,0 +1,71 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+        self.act_0 = nn.LogSoftmax(dim=1)
+        self.act_1 = nn.LogSoftmax(dim=1)
+        self.act_2 = nn.LogSoftmax(dim=0)
+        self.act_3 = nn.LogSoftmax(dim=2)
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = self.act_0(x)
+        y = self.act_1(y)
+        z = self.act_2(z)
+        w = self.act_3(w)
+        return x, y, z, w
+
+def test():
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 12)
+    y = torch.rand(1, 12, 64)
+    z = torch.rand(1, 12, 24, 64)
+    w = torch.rand(1, 12, 24, 32, 64)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_nn_LogSoftmax.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_nn_LogSoftmax.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]")
+
+    # pnnx inference
+    import test_nn_LogSoftmax_pnnx
+    b = test_nn_LogSoftmax_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.equal(a0, b0):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
diff --git a/tools/pnnx/tests/onnx/test_nn_Mish.py b/tools/pnnx/tests/onnx/test_nn_Mish.py
new file mode 100644
index 00000000000..481ba718111
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_nn_Mish.py
@@ -0,0 +1,72 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from packaging import version
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+        self.act_0 = nn.Mish()
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = self.act_0(x)
+        y = self.act_0(y)
+        z = self.act_0(z)
+        w = self.act_0(w)
+        return x, y, z, w
+
+def test():
+    if version.parse(torch.__version__) < version.parse('1.9'):
+        return True
+
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 12)
+    y = torch.rand(1, 12, 64)
+    z = torch.rand(1, 12, 24, 64)
+    w = torch.rand(1, 12, 24, 32, 64)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_nn_Mish.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_nn_Mish.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]")
+
+    # pnnx inference
+    import test_nn_Mish_pnnx
+    b = test_nn_Mish_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.allclose(a0, b0, 1e-4, 1e-4):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
diff --git a/tools/pnnx/tests/onnx/test_nn_SELU.py b/tools/pnnx/tests/onnx/test_nn_SELU.py
new file mode 100644
index 00000000000..a78c9e2336f
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_nn_SELU.py
@@ -0,0 +1,68 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+        self.act_0 = nn.SELU()
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = self.act_0(x)
+        y = self.act_0(y)
+        z = self.act_0(z)
+        w = self.act_0(w)
+        return x, y, z, w
+
+def test():
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 12)
+    y = torch.rand(1, 12, 64)
+    z = torch.rand(1, 12, 24, 64)
+    w = torch.rand(1, 12, 24, 32, 64)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_nn_SELU.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_nn_SELU.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]")
+
+    # pnnx inference
+    import test_nn_SELU_pnnx
+    b = test_nn_SELU_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.equal(a0, b0):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
diff --git a/tools/pnnx/tests/onnx/test_nn_SiLU.py b/tools/pnnx/tests/onnx/test_nn_SiLU.py
new file mode 100644
index 00000000000..e509ddb6754
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_nn_SiLU.py
@@ -0,0 +1,68 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+        self.act_0 = nn.SiLU()
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = self.act_0(x)
+        y = self.act_0(y)
+        z = self.act_0(z)
+        w = self.act_0(w)
+        return x, y, z, w
+
+def test():
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 12)
+    y = torch.rand(1, 12, 64)
+    z = torch.rand(1, 12, 24, 64)
+    w = torch.rand(1, 12, 24, 32, 64)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_nn_SiLU.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_nn_SiLU.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]")
+
+    # pnnx inference
+    import test_nn_SiLU_pnnx
+    b = test_nn_SiLU_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.allclose(a0, b0, 1e-4, 1e-4):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
diff --git a/tools/pnnx/tests/onnx/test_nn_Sigmoid.py b/tools/pnnx/tests/onnx/test_nn_Sigmoid.py
index 5b9cfc9a2be..72d5d798ef4 100644
--- a/tools/pnnx/tests/onnx/test_nn_Sigmoid.py
+++ b/tools/pnnx/tests/onnx/test_nn_Sigmoid.py
@@ -43,7 +43,7 @@ def test():
     z = torch.rand(1, 12, 24, 64)
     w = torch.rand(1, 12, 24, 32, 64)
 
-    a0, a1, a2, a3 = net(x, y, z, w)
+    a = net(x, y, z, w)
 
     # export onnx
    torch.onnx.export(net, (x, y, z, w), "test_nn_Sigmoid.onnx")
@@ -54,9 +54,12 @@ def test():
 
     # pnnx inference
     import test_nn_Sigmoid_pnnx
-    b0, b1, b2, b3 = test_nn_Sigmoid_pnnx.test_inference()
+    b = test_nn_Sigmoid_pnnx.test_inference()
 
-    return torch.equal(a0, b0) and torch.equal(a1, b1) and torch.equal(a2, b2) and torch.equal(a3, b3)
+    for a0, b0 in zip(a, b):
+        if not torch.equal(a0, b0):
+            return False
+    return True
 
 if __name__ == "__main__":
     if test():
diff --git a/tools/pnnx/tests/onnx/test_nn_Softmin.py b/tools/pnnx/tests/onnx/test_nn_Softmin.py
new file mode 100644
index 00000000000..9cb8417f2f6
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_nn_Softmin.py
@@ -0,0 +1,71 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+        self.act_0 = nn.Softmin(dim=1)
+        self.act_1 = nn.Softmin(dim=1)
+        self.act_2 = nn.Softmin(dim=0)
+        self.act_3 = nn.Softmin(dim=2)
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = self.act_0(x)
+        y = self.act_1(y)
+        z = self.act_2(z)
+        w = self.act_3(w)
+        return x, y, z, w
+
+def test():
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 12)
+    y = torch.rand(1, 12, 64)
+    z = torch.rand(1, 12, 24, 64)
+    w = torch.rand(1, 12, 24, 32, 64)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_nn_Softmin.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_nn_Softmin.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]")
+
+    # pnnx inference
+    import test_nn_Softmin_pnnx
+    b = test_nn_Softmin_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.equal(a0, b0):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
diff --git a/tools/pnnx/tests/onnx/test_nn_Softplus.py b/tools/pnnx/tests/onnx/test_nn_Softplus.py
new file mode 100644
index 00000000000..445c6341b29
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_nn_Softplus.py
@@ -0,0 +1,73 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from packaging import version
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+        self.act_0 = nn.Softplus()
+        self.act_1 = nn.Softplus(beta=0.7, threshold=15)
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = self.act_0(x)
+        y = self.act_0(y)
+        z = self.act_1(z)
+        w = self.act_1(w)
+        return x, y, z, w
+
+def test():
+    if version.parse(torch.__version__) < version.parse('1.11'):
+        return True
+
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 12)
+    y = torch.rand(1, 12, 64)
+    z = torch.rand(1, 12, 24, 64)
+    w = torch.rand(1, 12, 24, 32, 64)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_nn_Softplus.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_nn_Softplus.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]")
+
+    # pnnx inference
+    import test_nn_Softplus_pnnx
+    b = test_nn_Softplus_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.allclose(a0, b0, 1e-4, 1e-4):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
diff --git a/tools/pnnx/tests/onnx/test_nn_Softshrink.py b/tools/pnnx/tests/onnx/test_nn_Softshrink.py
new file mode 100644
index 00000000000..b86e9239c16
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_nn_Softshrink.py
@@ -0,0 +1,73 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from packaging import version
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+        self.act_0 = nn.Softshrink()
+        self.act_1 = nn.Softshrink(lambd=1.3)
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = self.act_0(x)
+        y = self.act_0(y)
+        z = self.act_1(z)
+        w = self.act_1(w)
+        return x, y, z, w
+
+def test():
+    if version.parse(torch.__version__) < version.parse('1.11'):
+        return True
+
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 12)
+    y = torch.rand(1, 12, 64)
+    z = torch.rand(1, 12, 24, 64)
+    w = torch.rand(1, 12, 24, 32, 64)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_nn_Softshrink.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_nn_Softshrink.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]")
+
+    # pnnx inference
+    import test_nn_Softshrink_pnnx
+    b = test_nn_Softshrink_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.equal(a0, b0):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
diff --git a/tools/pnnx/tests/onnx/test_nn_Softsign.py b/tools/pnnx/tests/onnx/test_nn_Softsign.py
new file mode 100644
index 00000000000..da86752ca67
--- /dev/null
+++ b/tools/pnnx/tests/onnx/test_nn_Softsign.py
@@ -0,0 +1,68 @@
+# Tencent is pleased to support the open source community by making ncnn available.
+#
+# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+#
+# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+        self.act_0 = nn.Softsign()
+
+    def forward(self, x, y, z, w):
+        x = x * 2 - 1
+        y = y * 2 - 1
+        z = z * 2 - 1
+        w = w * 2 - 1
+        x = self.act_0(x)
+        y = self.act_0(y)
+        z = self.act_0(z)
+        w = self.act_0(w)
+        return x, y, z, w
+
+def test():
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 12)
+    y = torch.rand(1, 12, 64)
+    z = torch.rand(1, 12, 24, 64)
+    w = torch.rand(1, 12, 24, 32, 64)
+
+    a = net(x, y, z, w)
+
+    # export onnx
+    torch.onnx.export(net, (x, y, z, w), "test_nn_Softsign.onnx")
+
+    # onnx to pnnx
+    import os
+    os.system("../../src/pnnx test_nn_Softsign.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]")
+
+    # pnnx inference
+    import test_nn_Softsign_pnnx
+    b = test_nn_Softsign_pnnx.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.equal(a0, b0):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
+# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.act_0 = nn.Tanh() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = self.act_0(x) + y = self.act_0(y) + z = self.act_0(z) + w = self.act_0(w) + return x, y, z, w + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12) + y = torch.rand(1, 12, 64) + z = torch.rand(1, 12, 24, 64) + w = torch.rand(1, 12, 24, 32, 64) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_nn_Tanh.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_nn_Tanh.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]") + + # pnnx inference + import test_nn_Tanh_pnnx + b = test_nn_Tanh_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_nn_Tanhshrink.py b/tools/pnnx/tests/onnx/test_nn_Tanhshrink.py new file mode 100644 index 00000000000..20cabe2559a --- /dev/null +++ b/tools/pnnx/tests/onnx/test_nn_Tanhshrink.py @@ -0,0 +1,68 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
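+# nn.Tanhshrink computes x - tanh(x). ONNX has no dedicated operator for it,
+# so the exported graph is expected to contain a Tanh node followed by a Sub,
+# which pnnx then pattern-matches back into a single op. The identity itself,
+# as an illustrative one-liner:
+#
+#     def tanhshrink_ref(x):
+#         return x - torch.tanh(x)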
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.act_0 = nn.Tanhshrink() + + def forward(self, x, y, z, w): + x = x * 2 - 1 + y = y * 2 - 1 + z = z * 2 - 1 + w = w * 2 - 1 + x = self.act_0(x) + y = self.act_0(y) + z = self.act_0(z) + w = self.act_0(w) + return x, y, z, w + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12) + y = torch.rand(1, 12, 64) + z = torch.rand(1, 12, 24, 64) + w = torch.rand(1, 12, 24, 32, 64) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_nn_Tanhshrink.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_nn_Tanhshrink.onnx inputshape=[1,12],[1,12,64],[1,12,24,64],[1,12,24,32,64]") + + # pnnx inference + import test_nn_Tanhshrink_pnnx + b = test_nn_Tanhshrink_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) From 569617f212b2878137813b6cb16a5bd6a0076fc7 Mon Sep 17 00:00:00 2001 From: nihui Date: Mon, 15 Jul 2024 16:00:11 +0800 Subject: [PATCH 02/38] pnnx convert onnx expand/permute/repeat/reshape/select/slice/cat/ceil/chunk/flatten/floor/maximum/minimum/split/squeeze/stack/transpose/unbind/unsqueeze (#5583) --- tools/pnnx/src/pass_level2/Tensor_expand.cpp | 48 +++++++++++ tools/pnnx/src/pass_level2/Tensor_reshape.cpp | 77 ++++++------------ tools/pnnx/src/pass_level2/torch_squeeze.cpp | 17 ++-- tools/pnnx/src/pass_level2/torch_tile.cpp | 41 ++++++++++ .../pass_onnx/fuse_constant_as_attribute.cpp | 2 + tools/pnnx/tests/ncnn/test_torch_unbind.py | 3 +- tools/pnnx/tests/onnx/CMakeLists.txt | 21 +++++ tools/pnnx/tests/onnx/test_Tensor_expand.py | 60 ++++++++++++++ tools/pnnx/tests/onnx/test_Tensor_permute.py | 64 +++++++++++++++ tools/pnnx/tests/onnx/test_Tensor_repeat.py | 63 +++++++++++++++ tools/pnnx/tests/onnx/test_Tensor_reshape.py | 63 +++++++++++++++ tools/pnnx/tests/onnx/test_Tensor_select.py | 60 ++++++++++++++ tools/pnnx/tests/onnx/test_Tensor_slice.py | 79 +++++++++++++++++++ tools/pnnx/tests/onnx/test_Tensor_view.py | 63 +++++++++++++++ tools/pnnx/tests/onnx/test_torch_cat.py | 61 ++++++++++++++ tools/pnnx/tests/onnx/test_torch_ceil.py | 60 ++++++++++++++ tools/pnnx/tests/onnx/test_torch_chunk.py | 60 ++++++++++++++ tools/pnnx/tests/onnx/test_torch_flatten.py | 63 +++++++++++++++ tools/pnnx/tests/onnx/test_torch_floor.py | 60 ++++++++++++++ tools/pnnx/tests/onnx/test_torch_maximum.py | 64 +++++++++++++++ tools/pnnx/tests/onnx/test_torch_minimum.py | 64 +++++++++++++++ tools/pnnx/tests/onnx/test_torch_split.py | 60 ++++++++++++++ tools/pnnx/tests/onnx/test_torch_squeeze.py | 60 ++++++++++++++ tools/pnnx/tests/onnx/test_torch_stack.py | 62 +++++++++++++++ tools/pnnx/tests/onnx/test_torch_transpose.py | 60 ++++++++++++++ tools/pnnx/tests/onnx/test_torch_unbind.py | 60 ++++++++++++++ tools/pnnx/tests/onnx/test_torch_unsqueeze.py | 63 +++++++++++++++ tools/pnnx/tests/test_torch_unbind.py | 2 +- 28 files changed, 1399 insertions(+), 61 deletions(-) create mode 100644 tools/pnnx/tests/onnx/test_Tensor_expand.py create mode 100644 tools/pnnx/tests/onnx/test_Tensor_permute.py create mode 100644 tools/pnnx/tests/onnx/test_Tensor_repeat.py create mode 100644 tools/pnnx/tests/onnx/test_Tensor_reshape.py create mode 100644 tools/pnnx/tests/onnx/test_Tensor_select.py create mode 100644 tools/pnnx/tests/onnx/test_Tensor_slice.py create 
mode 100644 tools/pnnx/tests/onnx/test_Tensor_view.py create mode 100644 tools/pnnx/tests/onnx/test_torch_cat.py create mode 100644 tools/pnnx/tests/onnx/test_torch_ceil.py create mode 100644 tools/pnnx/tests/onnx/test_torch_chunk.py create mode 100644 tools/pnnx/tests/onnx/test_torch_flatten.py create mode 100644 tools/pnnx/tests/onnx/test_torch_floor.py create mode 100644 tools/pnnx/tests/onnx/test_torch_maximum.py create mode 100644 tools/pnnx/tests/onnx/test_torch_minimum.py create mode 100644 tools/pnnx/tests/onnx/test_torch_split.py create mode 100644 tools/pnnx/tests/onnx/test_torch_squeeze.py create mode 100644 tools/pnnx/tests/onnx/test_torch_stack.py create mode 100644 tools/pnnx/tests/onnx/test_torch_transpose.py create mode 100644 tools/pnnx/tests/onnx/test_torch_unbind.py create mode 100644 tools/pnnx/tests/onnx/test_torch_unsqueeze.py diff --git a/tools/pnnx/src/pass_level2/Tensor_expand.cpp b/tools/pnnx/src/pass_level2/Tensor_expand.cpp index 23c1af6a863..4c94d7b8e04 100644 --- a/tools/pnnx/src/pass_level2/Tensor_expand.cpp +++ b/tools/pnnx/src/pass_level2/Tensor_expand.cpp @@ -61,4 +61,52 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_expand_1, 20) +class Tensor_expand_onnx : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +Expand op_0 1 1 input out %*=%* +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "Tensor.expand"; + } + + bool match(const std::map& captured_params) const + { + if (captured_params.find("op_0.shape") == captured_params.end()) + return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params) const + { + if (captured_params.at("op_0.shape").type == 5) + { + op->params["shape"] = captured_params.at("op_0.shape"); + } + else // if (captured_params.at("op_0.shape").type == 2) + { + op->params["shape"] = std::vector{captured_params.at("op_0.shape").i}; + } + + // onnx set expand shape 1 for not changing the size of that dimension while torch uses -1 + for (size_t i = 0; i < op->params["shape"].ai.size(); i++) + { + if (op->params["shape"].ai[i] == 1) + op->params["shape"].ai[i] = -1; + } + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_expand_onnx, 20) + } // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/Tensor_reshape.cpp b/tools/pnnx/src/pass_level2/Tensor_reshape.cpp index 1c578a8d633..412e609cc40 100644 --- a/tools/pnnx/src/pass_level2/Tensor_reshape.cpp +++ b/tools/pnnx/src/pass_level2/Tensor_reshape.cpp @@ -48,7 +48,7 @@ class Tensor_reshape_onnx : public GraphRewriterPass pnnx.Input input_0 0 1 input pnnx.Input input_1 0 1 shape aten::cat op_0 1 1 shape cat dim=0 -Reshape op_1 2 1 input cat out allowzero=* +Reshape op_1 2 1 input cat out %*=%* pnnx.Output output 1 0 out )PNNXIR"; } @@ -57,46 +57,15 @@ pnnx.Output output 1 0 out { return "Tensor.reshape"; } -}; - -REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_reshape_onnx, 19) - -class Tensor_reshape_onnx_1 : public Tensor_reshape_onnx -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -5 4 -pnnx.Input input_0 0 1 input -pnnx.Input input_1 0 1 shape -aten::cat op_0 1 1 shape cat dim=0 -Reshape op_1 2 1 input cat out -pnnx.Output output 1 0 out -)PNNXIR"; - } -}; - -REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_reshape_onnx_1, 19) -class Tensor_reshape_onnx_2 : public Tensor_reshape_onnx -{ -public: - const char* match_pattern_graph() const + void 
write(Operator* /*op*/, const std::map& /*captured_params*/) const { - return R"PNNXIR(7767517 -4 3 -pnnx.Input input_0 0 1 input -pnnx.Input input_1 0 1 shape -Reshape op_1 2 1 input shape out allowzero=* -pnnx.Output output 1 0 out -)PNNXIR"; } }; -REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_reshape_onnx_2, 20) +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_reshape_onnx, 19) -class Tensor_reshape_onnx_3 : public Tensor_reshape_onnx +class Tensor_reshape_onnx_1 : public Tensor_reshape_onnx { public: const char* match_pattern_graph() const @@ -105,15 +74,15 @@ class Tensor_reshape_onnx_3 : public Tensor_reshape_onnx 4 3 pnnx.Input input_0 0 1 input pnnx.Input input_1 0 1 shape -Reshape op_1 2 1 input shape out +Reshape op_0 2 1 input shape out %*=%* pnnx.Output output 1 0 out )PNNXIR"; } }; -REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_reshape_onnx_3, 20) +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_reshape_onnx_1, 20) -class Tensor_reshape_onnx_4 : public GraphRewriterPass +class Tensor_reshape_onnx_2 : public GraphRewriterPass { public: const char* match_pattern_graph() const @@ -121,7 +90,7 @@ class Tensor_reshape_onnx_4 : public GraphRewriterPass return R"PNNXIR(7767517 3 2 pnnx.Input input 0 1 input -Reshape op_1 1 1 input out shape=%shape allowzero=* +Reshape op_0 1 1 input out %*=%* pnnx.Output output 1 0 out )PNNXIR"; } @@ -130,24 +99,28 @@ pnnx.Output output 1 0 out { return "Tensor.reshape"; } -}; -REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_reshape_onnx_4, 20) + bool match(const std::map& captured_params) const + { + if (captured_params.find("op_0.shape") == captured_params.end()) + return false; + + return true; + } -class Tensor_reshape_onnx_5 : public Tensor_reshape_onnx_4 -{ -public: - const char* match_pattern_graph() const + void write(Operator* op, const std::map& captured_params) const { - return R"PNNXIR(7767517 -3 2 -pnnx.Input input 0 1 input -Reshape op_1 1 1 input out shape=%shape -pnnx.Output output 1 0 out -)PNNXIR"; + if (captured_params.at("op_0.shape").type == 5) + { + op->params["shape"] = captured_params.at("op_0.shape"); + } + else // if (captured_params.at("op_0.shape").type == 2) + { + op->params["shape"] = std::vector{captured_params.at("op_0.shape").i}; + } } }; -REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_reshape_onnx_5, 20) +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_reshape_onnx_2, 20) } // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/torch_squeeze.cpp b/tools/pnnx/src/pass_level2/torch_squeeze.cpp index d7e157d94b1..dabffebc126 100644 --- a/tools/pnnx/src/pass_level2/torch_squeeze.cpp +++ b/tools/pnnx/src/pass_level2/torch_squeeze.cpp @@ -110,20 +110,23 @@ class torch_squeeze_onnx_1 : public torch_squeeze_onnx return R"PNNXIR(7767517 3 2 pnnx.Input input 0 1 input -Squeeze op_0 1 1 input out axes=%axes +Squeeze op_0 1 1 input out %*=%* pnnx.Output output 1 0 out )PNNXIR"; } void write(Operator* op, const std::map& captured_params) const { - if (captured_params.at("axes").type == 5 && captured_params.at("axes").ai.size() == 1) + if (captured_params.find("op_0.axes") != captured_params.end()) { - op->params["dim"] = captured_params.at("axes").ai[0]; - } - else - { - op->params["dim"] = captured_params.at("axes"); + if (captured_params.at("op_0.axes").type == 5 && captured_params.at("op_0.axes").ai.size() == 1) + { + op->params["dim"] = captured_params.at("op_0.axes").ai[0]; + } + else + { + op->params["dim"] = captured_params.at("op_0.axes"); + } } } }; diff --git a/tools/pnnx/src/pass_level2/torch_tile.cpp 
b/tools/pnnx/src/pass_level2/torch_tile.cpp index d1504bacda8..a2f2780116c 100644 --- a/tools/pnnx/src/pass_level2/torch_tile.cpp +++ b/tools/pnnx/src/pass_level2/torch_tile.cpp @@ -60,4 +60,45 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(torch_tile_onnx, 20) +class torch_tile_onnx_1 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +Tile op_0 1 1 input out %*=%* +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "torch.tile"; + } + + bool match(const std::map& captured_params) const + { + if (captured_params.find("op_0.repeats") == captured_params.end()) + return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params) const + { + if (captured_params.at("op_0.repeats").type == 5) + { + op->params["dims"] = captured_params.at("op_0.repeats"); + } + else // if (captured_params.at("op_0.repeats").type == 2) + { + op->params["dims"] = std::vector{captured_params.at("op_0.repeats").i}; + } + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(torch_tile_onnx_1, 20) + } // namespace pnnx diff --git a/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.cpp b/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.cpp index a3021d33c90..aba88976233 100644 --- a/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.cpp +++ b/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.cpp @@ -32,6 +32,7 @@ struct constant_as_attribute }; static constant_as_attribute caas[] = { + {"Expand", 1, "shape"}, {"Gather", 1, "indices"}, {"If", 0, "cond"}, {"Pad", 1, "pads"}, @@ -49,6 +50,7 @@ static constant_as_attribute caas[] = { {"Slice", 3, "axes"}, {"Slice", 4, "steps"}, {"Squeeze", 1, "axes"}, + {"Tile", 1, "repeats"}, {"Unsqueeze", 1, "axes"}, {"Upsample", 1, "scales"}, }; diff --git a/tools/pnnx/tests/ncnn/test_torch_unbind.py b/tools/pnnx/tests/ncnn/test_torch_unbind.py index 3b8e427010c..8e224612d7e 100644 --- a/tools/pnnx/tests/ncnn/test_torch_unbind.py +++ b/tools/pnnx/tests/ncnn/test_torch_unbind.py @@ -26,6 +26,7 @@ def forward(self, x, y): x0 = F.relu(x0) x1 = F.relu(x1) + x2 = F.relu(x2) y0 = F.relu(y0) y1 = F.relu(y1) y2 = F.relu(y2) @@ -35,7 +36,7 @@ def forward(self, x, y): y6 = F.relu(y6) y7 = F.relu(y7) y8 = F.relu(y8) - return x0, x1, y0, y1, y2, y3, y4, y5, y6, y7, y8 + return x0, x1, x2, y0, y1, y2, y3, y4, y5, y6, y7, y8 def test(): net = Model() diff --git a/tools/pnnx/tests/onnx/CMakeLists.txt b/tools/pnnx/tests/onnx/CMakeLists.txt index 0e283e77d48..f4756740a79 100644 --- a/tools/pnnx/tests/onnx/CMakeLists.txt +++ b/tools/pnnx/tests/onnx/CMakeLists.txt @@ -126,8 +126,29 @@ pnnx_onnx_add_test(squeezenet1_1) pnnx_onnx_add_test(swin_t) pnnx_onnx_add_test(vit_b_32) +pnnx_onnx_add_test(Tensor_expand) +pnnx_onnx_add_test(Tensor_permute) +pnnx_onnx_add_test(Tensor_repeat) +pnnx_onnx_add_test(Tensor_reshape) +pnnx_onnx_add_test(Tensor_select) +pnnx_onnx_add_test(Tensor_slice) +pnnx_onnx_add_test(Tensor_view) + +pnnx_onnx_add_test(torch_cat) +pnnx_onnx_add_test(torch_ceil) +pnnx_onnx_add_test(torch_chunk) +pnnx_onnx_add_test(torch_flatten) +pnnx_onnx_add_test(torch_floor) pnnx_onnx_add_test(torch_max) +pnnx_onnx_add_test(torch_maximum) pnnx_onnx_add_test(torch_mean) pnnx_onnx_add_test(torch_min) +pnnx_onnx_add_test(torch_minimum) pnnx_onnx_add_test(torch_prod) +pnnx_onnx_add_test(torch_split) +pnnx_onnx_add_test(torch_squeeze) +pnnx_onnx_add_test(torch_stack) pnnx_onnx_add_test(torch_sum) 
+pnnx_onnx_add_test(torch_transpose) +pnnx_onnx_add_test(torch_unbind) +pnnx_onnx_add_test(torch_unsqueeze) diff --git a/tools/pnnx/tests/onnx/test_Tensor_expand.py b/tools/pnnx/tests/onnx/test_Tensor_expand.py new file mode 100644 index 00000000000..ceb01dac4c8 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_Tensor_expand.py @@ -0,0 +1,60 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = x.expand(24) + y = y.expand(-1, 11, -1) + z = z.expand(2, 8, 3, -1, 4) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1) + y = torch.rand(3, 1, 1) + z = torch.rand(1, 8, 1, 9, 1) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_Tensor_expand.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_Tensor_expand.onnx inputshape=[1],[3,1,1],[1,8,1,9,1]") + + # pnnx inference + import test_Tensor_expand_pnnx + b = test_Tensor_expand_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_Tensor_permute.py b/tools/pnnx/tests/onnx/test_Tensor_permute.py new file mode 100644 index 00000000000..a36de4c251c --- /dev/null +++ b/tools/pnnx/tests/onnx/test_Tensor_permute.py @@ -0,0 +1,64 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
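+# Tensor.permute reorders dimensions without touching the data and exports to
+# a single ONNX Transpose node whose `perm` attribute lists the new dimension
+# order. For example, assuming an NCHW activation layout:
+#
+#     y = x.permute(0, 2, 3, 1)   # NCHW -> NHWC
+#
+# Pure data movement is exactly representable, so torch.equal is a safe check.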
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = x.permute(1, 0, 2) + x = x.permute(0, 1, 2) + y = y.permute(2, 3, 1, 0) + y = y.permute(3, 1, 0, 2) + z = z.permute(1, 3, 0, 4, 2) + z = z.permute(0, 2, 4, 3, 1) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_Tensor_permute.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_Tensor_permute.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_Tensor_permute_pnnx + b = test_Tensor_permute_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_Tensor_repeat.py b/tools/pnnx/tests/onnx/test_Tensor_repeat.py new file mode 100644 index 00000000000..569ad548bea --- /dev/null +++ b/tools/pnnx/tests/onnx/test_Tensor_repeat.py @@ -0,0 +1,63 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = x.repeat(1, 2, 3) + x = x.repeat(2, 3, 4) + y = y.repeat(1, 2, 1, 4) + y = y.repeat(3, 4, 5, 1) + z = z.repeat(1, 2, 3, 1, 5) + z = z.repeat(2, 3, 3, 1, 1) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_Tensor_repeat.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_Tensor_repeat.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_Tensor_repeat_pnnx + b = test_Tensor_repeat_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_Tensor_reshape.py b/tools/pnnx/tests/onnx/test_Tensor_reshape.py new file mode 100644 index 00000000000..027fb40a07d --- /dev/null +++ b/tools/pnnx/tests/onnx/test_Tensor_reshape.py @@ -0,0 +1,63 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. 
+# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = x.reshape(1, 2, 24) + x = x.reshape(48) + y = y.reshape(1, 11, 5, 9) + y = y.reshape(99, 5) + z = z.reshape(4, 3, 30, 10, 14) + z = z.reshape(15, 2, 10, 7, 8, 3) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_Tensor_reshape.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_Tensor_reshape.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_Tensor_reshape_pnnx + b = test_Tensor_reshape_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_Tensor_select.py b/tools/pnnx/tests/onnx/test_Tensor_select.py new file mode 100644 index 00000000000..4f7488b55a5 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_Tensor_select.py @@ -0,0 +1,60 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
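+# Tensor.select(dim, index) takes one slice along dim and drops that
+# dimension, so it is the method form of integer indexing:
+#
+#     x.select(1, 1)   # same result as x[:, 1, :] on a 3-d tensor
+#
+# On a (1, 3, 16) input this yields shape (1, 16). In ONNX the operation
+# typically lowers to Gather on the chosen axis; the exact exporter pattern
+# varies by torch version, which is what this test pins down.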
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = x.select(1, 1) + y = y.select(2, 4) + z = z.select(0, 10) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_Tensor_select.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_Tensor_select.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_Tensor_select_pnnx + b = test_Tensor_select_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_Tensor_slice.py b/tools/pnnx/tests/onnx/test_Tensor_slice.py new file mode 100644 index 00000000000..7fe32b4af61 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_Tensor_slice.py @@ -0,0 +1,79 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
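+# Python slicing exports as ONNX Slice, whose starts/ends/axes/steps arrive
+# as constant inputs; the fuse_constant_as_attribute table earlier in this
+# patch already lists Slice, so those constants become attributes that the
+# level-2 matcher can capture. Roughly, for one of the cases below:
+#
+#     x[:, :12, 1:14:2]   # -> Slice(starts=[0, 1], ends=[12, 14],
+#                         #          axes=[1, 2], steps=[1, 2]), give or take
+#                         #    how the exporter canonicalizes the axes
+#
+# The version checks keep older exporters on step=1 slices and only use
+# size()-based bounds on torch >= 1.10.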
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + if version.parse(torch.__version__) < version.parse('1.12'): + x = x[:,:12,1:14:1] + else: + x = x[:,:12,1:14:2] + x = x[...,1:] + if version.parse(torch.__version__) >= version.parse('1.10'): + x = x[:,:,:x.size(2)-1] + y = y[0:,1:,5:,3:] + if version.parse(torch.__version__) < version.parse('1.12'): + y = y[:,:,1:13:1,:14] + else: + y = y[:,:,1:13:2,:14] + if version.parse(torch.__version__) >= version.parse('1.10'): + y = y[:1,:y.size(1):,:,:] + z = z[4:] + if version.parse(torch.__version__) < version.parse('1.12'): + z = z[:2,:,:,:,2:-2:1] + else: + z = z[:2,:,:,:,2:-2:3] + if version.parse(torch.__version__) >= version.parse('1.10'): + z = z[:,:,:,z.size(3)-3:,:] + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 13, 26) + y = torch.rand(1, 15, 19, 21) + z = torch.rand(14, 18, 15, 19, 20) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_Tensor_slice.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_Tensor_slice.onnx inputshape=[1,13,26],[1,15,19,21],[14,18,15,19,20]") + + # pnnx inference + import test_Tensor_slice_pnnx + b = test_Tensor_slice_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_Tensor_view.py b/tools/pnnx/tests/onnx/test_Tensor_view.py new file mode 100644 index 00000000000..40df090a07b --- /dev/null +++ b/tools/pnnx/tests/onnx/test_Tensor_view.py @@ -0,0 +1,63 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
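+# Tensor.view is reshape plus a contiguity requirement; by export time both
+# trace to the same ONNX Reshape node, so this test mirrors
+# test_Tensor_reshape.py and checks that the Reshape -> Tensor.view/reshape
+# rewrite keeps the captured shapes intact, e.g.:
+#
+#     x.view(1, 2, 24)   # (1, 3, 16) -> (1, 2, 24), same 48 elements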
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = x.view(1, 2, 24) + x = x.view(48) + y = y.view(1, 11, 5, 9) + y = y.view(99, 5) + z = z.view(4, 3, 30, 10, 14) + z = z.view(15, 2, 10, 7, 8, 3) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_Tensor_view.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_Tensor_view.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_Tensor_view_pnnx + b = test_Tensor_view_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_cat.py b/tools/pnnx/tests/onnx/test_torch_cat.py new file mode 100644 index 00000000000..0d944434d28 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_cat.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z, w): + out0 = torch.cat((x, y), dim=1) + out1 = torch.cat((z, w), dim=3) + out2 = torch.cat((w, w), dim=2) + return out0, out1, out2 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 2, 16) + z = torch.rand(1, 5, 9, 11) + w = torch.rand(1, 5, 9, 3) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_torch_cat.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_cat.onnx inputshape=[1,3,16],[1,2,16],[1,5,9,11],[1,5,9,3]") + + # pnnx inference + import test_torch_cat_pnnx + b = test_torch_cat_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_ceil.py b/tools/pnnx/tests/onnx/test_torch_ceil.py new file mode 100644 index 00000000000..1ff59b37a48 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_ceil.py @@ -0,0 +1,60 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.ceil(x * 10) + y = torch.ceil(y * 10) + z = torch.ceil(z * 10) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_ceil.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_ceil.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_ceil_pnnx + b = test_torch_ceil_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_chunk.py b/tools/pnnx/tests/onnx/test_torch_chunk.py new file mode 100644 index 00000000000..2d1400103b9 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_chunk.py @@ -0,0 +1,60 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
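+# torch.chunk(input, chunks, dim) cuts a tensor into `chunks` pieces along
+# dim; ONNX expresses this with Split, so pnnx has to recover the chunk/split
+# form from the Split node. When the dimension does not divide evenly the
+# last piece is smaller, as with z below:
+#
+#     torch.chunk(torch.rand(14, 8, 5, 9, 10), chunks=5, dim=0)
+#     # -> sizes 3, 3, 3, 3, 2 along dim 0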
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x0, x1 = torch.chunk(x, chunks=2, dim=1) + y0, y1, y2 = torch.chunk(y, chunks=3, dim=2) + z0, z1, z2, z3, z4 = torch.chunk(z, chunks=5, dim=0) + return x0, x1, y0, y1, y2, z0, z1, z2, z3, z4 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_chunk.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_chunk.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_chunk_pnnx + b = test_torch_chunk_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_flatten.py b/tools/pnnx/tests/onnx/test_torch_flatten.py new file mode 100644 index 00000000000..6105b106804 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_flatten.py @@ -0,0 +1,63 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.flatten(x) + y = torch.flatten(y, start_dim=1, end_dim=-1) + z = torch.flatten(z, start_dim=3, end_dim=4) + x = x.relu() + y = y.relu() + z = z.relu() + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_flatten.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_flatten.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_flatten_pnnx + b = test_torch_flatten_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_floor.py b/tools/pnnx/tests/onnx/test_torch_floor.py new file mode 100644 index 00000000000..a046e4c241a --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_floor.py @@ -0,0 +1,60 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.floor(x * 10) + y = torch.floor(y * 10) + z = torch.floor(z * 10) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_floor.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_floor.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_floor_pnnx + b = test_torch_floor_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_maximum.py b/tools/pnnx/tests/onnx/test_torch_maximum.py new file mode 100644 index 00000000000..5e17d5cb2d2 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_maximum.py @@ -0,0 +1,64 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
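+# torch.maximum is the elementwise, broadcasting max of two tensors and maps
+# onto ONNX Max. The torch >= 1.12 gate below matches the guard used by the
+# other binary-op tests in this patch. A quick numeric example:
+#
+#     torch.maximum(torch.tensor([1., 5.]), torch.tensor([3., 2.]))
+#     # -> tensor([3., 5.])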
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + out0 = torch.maximum(x, y) + out1 = torch.maximum(y, y) + out2 = torch.maximum(z, torch.ones_like(z) + 0.1) + return out0, out1, out2 + +def test(): + if version.parse(torch.__version__) < version.parse('1.12'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(3, 16) + z = torch.rand(5, 9, 3) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_maximum.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_maximum.onnx inputshape=[3,16],[3,16],[5,9,3]") + + # pnnx inference + import test_torch_maximum_pnnx + b = test_torch_maximum_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_minimum.py b/tools/pnnx/tests/onnx/test_torch_minimum.py new file mode 100644 index 00000000000..0d8e9a87e50 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_minimum.py @@ -0,0 +1,64 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + out0 = torch.minimum(x, y) + out1 = torch.minimum(y, y) + out2 = torch.minimum(z, torch.ones_like(z) + 0.1) + return out0, out1, out2 + +def test(): + if version.parse(torch.__version__) < version.parse('1.12'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(3, 16) + z = torch.rand(5, 9, 3) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_minimum.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_minimum.onnx inputshape=[3,16],[3,16],[5,9,3]") + + # pnnx inference + import test_torch_minimum_pnnx + b = test_torch_minimum_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_split.py b/tools/pnnx/tests/onnx/test_torch_split.py new file mode 100644 index 00000000000..b13b041cd96 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_split.py @@ -0,0 +1,60 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. 
+# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x0, x1 = torch.split(x, split_size_or_sections=2, dim=1) + y0, y1, y2 = torch.split(y, split_size_or_sections=[1,3,5], dim=2) + z0, z1, z2, z3, z4 = torch.split(z, split_size_or_sections=3, dim=0) + return x0, x1, y0, y1, y2, z0, z1, z2, z3, z4 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_split.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_split.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_split_pnnx + b = test_torch_split_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_squeeze.py b/tools/pnnx/tests/onnx/test_torch_squeeze.py new file mode 100644 index 00000000000..b29e4ba2f9d --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_squeeze.py @@ -0,0 +1,60 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
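+# torch.squeeze drops size-1 dimensions: all of them when called without a
+# dim, exactly one otherwise. That is why the torch_squeeze_onnx_1 pass
+# earlier in this patch treats the ONNX `axes` attribute as optional -- a
+# Squeeze node without axes means "remove every 1-sized dim". The shapes
+# exercised below:
+#
+#     (1, 1, 16)       squeeze(dim=1)  -> (1, 16)
+#     (1, 5, 1, 11)    squeeze()       -> (5, 11)
+#     (14, 8, 5, 9, 1) squeeze(dim=4)  -> (14, 8, 5, 9)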
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.squeeze(x, 1) + y = torch.squeeze(y) + z = torch.squeeze(z, 4) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 1, 16) + y = torch.rand(1, 5, 1, 11) + z = torch.rand(14, 8, 5, 9, 1) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_squeeze.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_squeeze.onnx inputshape=[1,1,16],[1,5,1,11],[14,8,5,9,1]") + + # pnnx inference + import test_torch_squeeze_pnnx + b = test_torch_squeeze_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_stack.py b/tools/pnnx/tests/onnx/test_torch_stack.py new file mode 100644 index 00000000000..7b04ddd307f --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_stack.py @@ -0,0 +1,62 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z, w): + out0 = torch.stack((x, y), dim=0) + out1 = torch.stack((x, y), dim=2) + out2 = torch.stack((z, w), dim=2) + out3 = torch.stack((z, w), dim=-1) + return out0, out1, out2, out3 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(3, 16) + z = torch.rand(5, 9, 3) + w = torch.rand(5, 9, 3) + + a = net(x, y, z, w) + + # export onnx + torch.onnx.export(net, (x, y, z, w), "test_torch_stack.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_stack.onnx inputshape=[3,16],[3,16],[5,9,3],[5,9,3]") + + # pnnx inference + import test_torch_stack_pnnx + b = test_torch_stack_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_transpose.py b/tools/pnnx/tests/onnx/test_torch_transpose.py new file mode 100644 index 00000000000..e6a25c44101 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_transpose.py @@ -0,0 +1,60 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.transpose(x, 1, 2) + y = torch.transpose(y, 2, 3) + z = torch.transpose(z, 1, 3) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_transpose.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_transpose.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_transpose_pnnx + b = test_torch_transpose_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_unbind.py b/tools/pnnx/tests/onnx/test_torch_unbind.py new file mode 100644 index 00000000000..a98fa25c51c --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_unbind.py @@ -0,0 +1,60 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
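+# torch.unbind(dim) returns size(dim) tensors, each with dim removed, so
+# unbinding a (1, 3, 16) input along dim=1 yields three (1, 16) tensors.
+# Unlike split/chunk, every output loses the unbound axis, which in the
+# exported graph presumably shows up as Split plus a per-output Squeeze:
+#
+#     x0, x1, x2 = torch.unbind(torch.rand(1, 3, 16), dim=1)  # each (1, 16)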
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x0, x1, x2 = torch.unbind(x, dim=1) + y0, y1, y2, y3, y4, y5, y6, y7, y8 = torch.unbind(y, dim=2) + z0, z1, z2, z3 = torch.unbind(z, dim=0) + return x0, x1, x2, y0, y1, y2, y3, y4, y5, y6, y7, y8, z0, z1, z2, z3 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(4, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_unbind.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_unbind.onnx inputshape=[1,3,16],[1,5,9,11],[4,8,5,9,10]") + + # pnnx inference + import test_torch_unbind_pnnx + b = test_torch_unbind_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/test_torch_unsqueeze.py b/tools/pnnx/tests/onnx/test_torch_unsqueeze.py new file mode 100644 index 00000000000..01bf84076cf --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_unsqueeze.py @@ -0,0 +1,63 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
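+# torch.unsqueeze inserts a size-1 dimension at the given index; a negative
+# index counts from the end of the *result*, so unsqueeze(-1) on a
+# (1, 5, 9, 11) input appends a trailing axis -> (1, 5, 9, 11, 1). The ONNX
+# Unsqueeze `axes` constant is folded into an attribute by the existing
+# fuse_constant_as_attribute entry, keeping the level-2 match simple:
+#
+#     torch.unsqueeze(torch.rand(3), 0).shape   # torch.Size([1, 3])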
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.unsqueeze(x, 0) + x = torch.unsqueeze(x, 1) + y = torch.unsqueeze(y, 2) + y = torch.unsqueeze(y, -1) + z = torch.unsqueeze(z, -2) + z = torch.unsqueeze(z, 3) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_unsqueeze.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_unsqueeze.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_unsqueeze_pnnx + b = test_torch_unsqueeze_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_unbind.py b/tools/pnnx/tests/test_torch_unbind.py index c92c87b7435..b232f289dab 100644 --- a/tools/pnnx/tests/test_torch_unbind.py +++ b/tools/pnnx/tests/test_torch_unbind.py @@ -24,7 +24,7 @@ def forward(self, x, y, z): x0, x1, x2 = torch.unbind(x, dim=1) y0, y1, y2, y3, y4, y5, y6, y7, y8 = torch.unbind(y, dim=2) z0, z1, z2, z3 = torch.unbind(z, dim=0) - return x0, x1, y0, y1, y2, y3, y4, y5, y6, y7, y8, z0, z1, z2, z3 + return x0, x1, x2, y0, y1, y2, y3, y4, y5, y6, y7, y8, z0, z1, z2, z3 def test(): net = Model() From 081a9c39c8a6d4486f67f43699d97e6a6e4c89c2 Mon Sep 17 00:00:00 2001 From: zhangyang2057 Date: Thu, 18 Jul 2024 14:19:52 +0800 Subject: [PATCH 03/38] Fix tanh typo for rvv. (#5584) * Fix tanh typo for rvv. * Fix tanh for rvv fp16. --- src/layer/riscv/rvv_mathfun.h | 2 +- src/layer/riscv/rvv_mathfun_fp16s.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/layer/riscv/rvv_mathfun.h b/src/layer/riscv/rvv_mathfun.h index 980261a1496..2ec10bae48a 100644 --- a/src/layer/riscv/rvv_mathfun.h +++ b/src/layer/riscv/rvv_mathfun.h @@ -308,7 +308,7 @@ _RVV_FLOAT32_COS_OP(8, 4) \ /* clamp the inputs to the range [-9, 9] since anything outside */ \ /* this range is -/+1.0f in single-precision. */ \ - x2 = vfmin_vf_f32m##LMUL(x, c_tanh_hi, vl); \ + x2 = vfmin_vf_f32m##LMUL(x2, c_tanh_hi, vl); \ \ /* since the polynomials are odd/even, we need x**2. */ \ vfloat32m##LMUL##_t z = vfmul_vv_f32m##LMUL(x2, x2, vl); \ diff --git a/src/layer/riscv/rvv_mathfun_fp16s.h b/src/layer/riscv/rvv_mathfun_fp16s.h index ee5ffe4a304..2cf5d08f4f0 100644 --- a/src/layer/riscv/rvv_mathfun_fp16s.h +++ b/src/layer/riscv/rvv_mathfun_fp16s.h @@ -308,7 +308,7 @@ _RVV_FLOAT16_COS_OP(8, 2) \ /* clamp the inputs to the range [-9, 9] since anything outside */ \ /* this range is -/+1.0f in single-precision. */ \ - x2 = vfmin_vf_f16m##LMUL(x, c_tanh_hi, vl); \ + x2 = vfmin_vf_f16m##LMUL(x2, c_tanh_hi, vl); \ \ /* since the polynomials are odd/even, we need x**2. 
*/ \ vfloat16m##LMUL##_t z = vfmul_vv_f16m##LMUL(x2, x2, vl); \ From 997c8926d706db5f9e4098aec8ed51c49ab9417c Mon Sep 17 00:00:00 2001 From: nihui Date: Thu, 18 Jul 2024 14:20:09 +0800 Subject: [PATCH 04/38] use ruapu detection only on windows arm, enable cpu powerinfo with mingw compiler (#5593) --- src/cpu.cpp | 41 +++++++++++++++++++++-------------------- src/cpu.h | 4 ++-- src/platform.h.in | 8 ++++---- 3 files changed, 27 insertions(+), 26 deletions(-) diff --git a/src/cpu.cpp b/src/cpu.cpp index b1afbba3f65..f9e64a1cc75 100644 --- a/src/cpu.cpp +++ b/src/cpu.cpp @@ -46,10 +46,9 @@ #include #endif -#if defined _WIN32 && !(defined __MINGW32__) +#if defined _WIN32 #define WIN32_LEAN_AND_MEAN #include -#include #endif #if defined __ANDROID__ || defined __linux__ @@ -129,8 +128,10 @@ #include #endif +#if (defined _WIN32 && (__aarch64__ || __arm__)) #define RUAPU_IMPLEMENTATION #include "ruapu.h" +#endif // topology info static int g_cpucount; @@ -596,9 +597,6 @@ static int get_cpu_support_x86_avx2() static int get_cpu_support_x86_avx_vnni() { -#if __APPLE__ - return ruapu_supports("avxvnni"); -#else unsigned int cpu_info[4] = {0}; x86_cpuid(0, cpu_info); @@ -617,13 +615,16 @@ static int get_cpu_support_x86_avx_vnni() x86_cpuid_sublevel(7, 1, cpu_info); return cpu_info[0] & (1u << 4); -#endif } static int get_cpu_support_x86_avx512() { #if __APPLE__ - return ruapu_supports("avx512f") && ruapu_supports("avx512bw") && ruapu_supports("avx512cd") && ruapu_supports("avx512dq") && ruapu_supports("avx512vl"); + return get_hw_capability("hw.optional.avx512f") + && get_hw_capability("hw.optional.avx512bw") + && get_hw_capability("hw.optional.avx512cd") + && get_hw_capability("hw.optional.avx512dq") + && get_hw_capability("hw.optional.avx512vl"); #else unsigned int cpu_info[4] = {0}; x86_cpuid(0, cpu_info); @@ -653,7 +654,7 @@ static int get_cpu_support_x86_avx512() static int get_cpu_support_x86_avx512_vnni() { #if __APPLE__ - return ruapu_supports("avx512vnni"); + return get_hw_capability("hw.optional.avx512vnni"); #else unsigned int cpu_info[4] = {0}; x86_cpuid(0, cpu_info); @@ -683,7 +684,7 @@ static int get_cpu_support_x86_avx512_vnni() static int get_cpu_support_x86_avx512_bf16() { #if __APPLE__ - return ruapu_supports("avx512bf16"); + return get_hw_capability("hw.optional.avx512bf16"); #else unsigned int cpu_info[4] = {0}; x86_cpuid(0, cpu_info); @@ -709,7 +710,7 @@ static int get_cpu_support_x86_avx512_bf16() static int get_cpu_support_x86_avx512_fp16() { #if __APPLE__ - return ruapu_supports("avx512fp16"); + return get_hw_capability("hw.optional.avx512fp16"); #else unsigned int cpu_info[4] = {0}; x86_cpuid(0, cpu_info); @@ -745,7 +746,7 @@ static int get_cpucount() count = emscripten_num_logical_cores(); else count = 1; -#elif (defined _WIN32 && !(defined __MINGW32__)) +#elif defined _WIN32 SYSTEM_INFO system_info; GetSystemInfo(&system_info); count = system_info.dwNumberOfProcessors; @@ -812,7 +813,7 @@ static int get_thread_siblings(int cpuid) static int get_physical_cpucount() { int count = 0; -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation"); if (glpi == NULL) @@ -1050,7 +1051,7 @@ static int get_big_cpu_data_cache_size(int level) static int get_cpu_level2_cachesize() { int size = 0; -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 typedef BOOL(WINAPI * 
LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation"); if (glpi != NULL) @@ -1120,7 +1121,7 @@ static int get_cpu_level2_cachesize() static int get_cpu_level3_cachesize() { int size = 0; -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation"); if (glpi != NULL) @@ -1167,7 +1168,7 @@ static int get_cpu_level3_cachesize() return size; } -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 static ncnn::CpuSet get_smt_cpu_mask() { ncnn::CpuSet smt_cpu_mask; @@ -1261,7 +1262,7 @@ static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask) return 0; } -#endif // (defined _WIN32 && !(defined __MINGW32__)) +#endif // defined _WIN32 #if defined __ANDROID__ || defined __linux__ static int get_max_freq_khz(int cpuid) @@ -1435,7 +1436,7 @@ static void initialize_cpu_thread_affinity_mask(ncnn::CpuSet& mask_all, ncnn::Cp mask_all.enable(i); } -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 // get max freq mhz for all cores int max_freq_mhz_min = INT_MAX; int max_freq_mhz_max = 0; @@ -1867,7 +1868,7 @@ static void initialize_global_cpu_info() g_powersave = 0; initialize_cpu_thread_affinity_mask(g_cpu_affinity_mask_all, g_cpu_affinity_mask_little, g_cpu_affinity_mask_big); -#if (defined _WIN32 && (__aarch64__ || __arm__)) || __APPLE__ +#if (defined _WIN32 && (__aarch64__ || __arm__)) if (!is_being_debugged()) { ruapu_init(); @@ -1944,7 +1945,7 @@ static inline void try_initialize_global_cpu_info() namespace ncnn { -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 CpuSet::CpuSet() { disable_all(); @@ -2685,7 +2686,7 @@ const CpuSet& get_cpu_thread_affinity_mask(int powersave) int set_cpu_thread_affinity(const CpuSet& thread_affinity_mask) { try_initialize_global_cpu_info(); -#if defined __ANDROID__ || defined __linux__ || (defined _WIN32 && !(defined __MINGW32__)) +#if defined __ANDROID__ || defined __linux__ || defined _WIN32 #ifdef _OPENMP int num_threads = thread_affinity_mask.num_enabled(); diff --git a/src/cpu.h b/src/cpu.h index 7d6bfce1108..2ae6b8c3ffe 100644 --- a/src/cpu.h +++ b/src/cpu.h @@ -17,7 +17,7 @@ #include -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 #define WIN32_LEAN_AND_MEAN #include #endif @@ -40,7 +40,7 @@ class NCNN_EXPORT CpuSet int num_enabled() const; public: -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 ULONG_PTR mask; #endif #if defined __ANDROID__ || defined __linux__ diff --git a/src/platform.h.in b/src/platform.h.in index a0f17f39e31..50a9454b7da 100644 --- a/src/platform.h.in +++ b/src/platform.h.in @@ -70,7 +70,7 @@ #ifdef __cplusplus #if NCNN_THREADS -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 #define WIN32_LEAN_AND_MEAN #include #include @@ -86,7 +86,7 @@ namespace ncnn { #if NCNN_THREADS -#if (defined _WIN32 && !(defined __MINGW32__)) +#if defined _WIN32 class NCNN_EXPORT Mutex { public: @@ -141,7 +141,7 @@ public: private: DWORD key; }; -#else // (defined _WIN32 && !(defined __MINGW32__)) +#else // defined _WIN32 class NCNN_EXPORT Mutex { public: @@ -186,7 +186,7 @@ public: private: pthread_key_t key; }; -#endif // (defined _WIN32 && !(defined __MINGW32__)) +#endif // defined _WIN32 #else // NCNN_THREADS class NCNN_EXPORT 
Mutex { From f825d3a23c77cfd51b42ddfcd10343627c2d536d Mon Sep 17 00:00:00 2001 From: nihui Date: Thu, 18 Jul 2024 14:20:30 +0800 Subject: [PATCH 05/38] pnnx fuse onnx sdpa pattern and ncnn qdim mha fusion (#5589) --- tools/pnnx/src/CMakeLists.txt | 1 + .../fuse_scaled_dot_product_attention.cpp | 84 ++++++- .../F_scaled_dot_product_attention.cpp | 223 ++++++++++++++++++ 3 files changed, 306 insertions(+), 2 deletions(-) create mode 100644 tools/pnnx/src/pass_ncnn/F_scaled_dot_product_attention.cpp diff --git a/tools/pnnx/src/CMakeLists.txt b/tools/pnnx/src/CMakeLists.txt index e2fc28da9a9..986f6ebe81e 100644 --- a/tools/pnnx/src/CMakeLists.txt +++ b/tools/pnnx/src/CMakeLists.txt @@ -472,6 +472,7 @@ set(pnnx_pass_ncnn_SRCS pass_ncnn/F_prelu.cpp pass_ncnn/F_relu.cpp pass_ncnn/F_relu6.cpp + pass_ncnn/F_scaled_dot_product_attention.cpp pass_ncnn/F_selu.cpp pass_ncnn/F_sigmoid.cpp pass_ncnn/F_silu.cpp diff --git a/tools/pnnx/src/pass_level5/fuse_scaled_dot_product_attention.cpp b/tools/pnnx/src/pass_level5/fuse_scaled_dot_product_attention.cpp index 8f265f374dc..a6dcbc86db7 100644 --- a/tools/pnnx/src/pass_level5/fuse_scaled_dot_product_attention.cpp +++ b/tools/pnnx/src/pass_level5/fuse_scaled_dot_product_attention.cpp @@ -62,7 +62,7 @@ pnnx.Output output 1 0 out pnnx.Input input_0 0 1 query pnnx.Input input_1 0 1 key pnnx.Input input_2 0 1 value -F.scaled_dot_product_attention op_0 3 1 query key value out attn_mask=None dropout_p=0.0 is_causal=False +F.scaled_dot_product_attention sdpa 3 1 query key value out attn_mask=None dropout_p=0.0 is_causal=False pnnx.Output output 1 0 out )PNNXIR"; } @@ -114,7 +114,7 @@ pnnx.Input input_Rh 0 1 Rh pnnx.Input input_Rw 0 1 Rw pnnx.Expression RhRw 2 1 Rh Rw RhRw expr=add(@0,@1) #RhRw=(%batch,%h,%w,%h,%w)f32 Tensor.reshape attn_mask 1 1 RhRw attn_mask shape=(%batch,%qsize,%qsize) #attn_mask=(%batch,%qsize,%qsize)f32 -F.scaled_dot_product_attention op_0 4 1 query key value attn_mask out dropout_p=0.0 is_causal=False $attn_mask=attn_mask +F.scaled_dot_product_attention sdpa 4 1 query key value attn_mask out dropout_p=0.0 is_causal=False $attn_mask=attn_mask pnnx.Output output 1 0 out )PNNXIR"; } @@ -137,15 +137,95 @@ pnnx.Output output 1 0 out } }; +class fuse_scaled_dot_product_attention_pass_onnx : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +12 11 +pnnx.Input input_0 0 1 query +pnnx.Input input_1 0 1 key +pnnx.Input input_2 0 1 value +pnnx.Input input_3 0 1 attn_mask +Tensor.permute op_0 1 1 query 13 dims=(0,2,1,3) +Tensor.permute op_1 1 1 key 20 dims=(0,2,3,1) +Tensor.permute op_2 1 1 value 19 dims=(0,2,1,3) +torch.matmul op_3 2 1 13 20 21 +pnnx.Expression op_4 2 1 21 attn_mask 23 expr=add(@0,@1) +F.softmax softmax 1 1 23 24 dim=%softmax_dim +torch.matmul op_6 2 1 24 19 out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* replace_pattern_graph() const + { + return R"PNNXIR(7767517 +9 8 +pnnx.Input input_0 0 1 query +pnnx.Input input_1 0 1 key +pnnx.Input input_2 0 1 value +pnnx.Input input_3 0 1 attn_mask +Tensor.permute op_0 1 1 query q dims=(0,2,1,3) +Tensor.permute op_1 1 1 key k dims=(0,2,1,3) +Tensor.permute op_2 1 1 value v dims=(0,2,1,3) +F.scaled_dot_product_attention sdpa 4 1 q k v attn_mask out dropout_p=0.0 is_causal=False $attn_mask=attn_mask +pnnx.Output output 1 0 out +)PNNXIR"; + } + + bool match(const std::map& matched_operators, const std::map& captured_params, const std::map& /*captured_attrs*/) const + { + const int softmax_dim = 
captured_params.at("softmax_dim").i; + + int softmax_input_rank = (int)matched_operators.at("softmax")->inputs[0]->shape.size(); + if (softmax_dim != -1 && softmax_dim != softmax_input_rank - 1) + return false; + + return true; + } + + void write(const std::map& ops, const std::map& /*captured_params*/, const std::map& /*captured_attrs*/) const + { + Operator* op = ops.at("sdpa"); + + op->params["scale"] = 1.f; + + // rewrite qkv shape + { + std::vector q_shape = ops.at("op_0")->inputs[0]->shape; + std::vector k_shape = ops.at("op_1")->inputs[0]->shape; + std::vector v_shape = ops.at("op_2")->inputs[0]->shape; + + if (!q_shape.empty()) + std::swap(q_shape[1], q_shape[2]); + if (!k_shape.empty()) + std::swap(k_shape[1], k_shape[2]); + if (!v_shape.empty()) + std::swap(v_shape[1], v_shape[2]); + + ops.at("op_0")->outputs[0]->shape = q_shape; + ops.at("op_0")->outputs[0]->type = ops.at("op_0")->inputs[0]->type; + ops.at("op_1")->outputs[0]->shape = k_shape; + ops.at("op_1")->outputs[0]->type = ops.at("op_1")->inputs[0]->type; + ops.at("op_2")->outputs[0]->shape = v_shape; + ops.at("op_2")->outputs[0]->type = ops.at("op_2")->inputs[0]->type; + } + } +}; + void fuse_scaled_dot_product_attention(Graph& graph) { #if TORCH_VERSION_MAJOR >= 2 fuse_scaled_dot_product_attention_pass a; fuse_scaled_dot_product_attention_pass_1 b; + fuse_scaled_dot_product_attention_pass_onnx onnx0; int opindex = 0; pnnx_graph_rewrite(graph, &a, opindex); pnnx_graph_rewrite(graph, &b, opindex); + pnnx_graph_rewrite(graph, &onnx0, opindex); #endif } diff --git a/tools/pnnx/src/pass_ncnn/F_scaled_dot_product_attention.cpp b/tools/pnnx/src/pass_ncnn/F_scaled_dot_product_attention.cpp new file mode 100644 index 00000000000..af9f06b3f52 --- /dev/null +++ b/tools/pnnx/src/pass_ncnn/F_scaled_dot_product_attention.cpp @@ -0,0 +1,223 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
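+
+// Editorial summary of the rewrites registered below: each pass matches the
+// q/k/v nn.Linear -> Tensor.reshape -> Tensor.permute chain feeding
+// F.scaled_dot_product_attention, plus the inverse permute/reshape and the
+// out_proj nn.Linear, and collapses the whole pattern into a single ncnn
+// MultiHeadAttention layer; param 5 records whether an attn_mask input is
+// present and param 6 carries the sdpa scale.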
+ +#include "pass_ncnn.h" + +namespace pnnx { + +namespace ncnn { + +class F_scaled_dot_product_attention : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +16 15 +pnnx.Input input_0 0 1 input +pnnx.Input input_1 0 1 attn_mask +nn.Linear op_0 1 1 input q bias=%qbias in_features=%qdim out_features=%embed_dim @bias @weight +nn.Linear op_1 1 1 input k bias=%kbias in_features=%kdim out_features=%embed_dim @bias @weight +nn.Linear op_2 1 1 input v bias=%vbias in_features=%vdim out_features=%embed_dim @bias @weight +Tensor.reshape op_3 1 1 q 10 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.reshape op_4 1 1 k 12 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.reshape op_5 1 1 v 14 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.permute op_6 1 1 10 16 dims=(0,2,1,3) +Tensor.permute op_7 1 1 12 17 dims=(0,2,1,3) +Tensor.permute op_8 1 1 14 18 dims=(0,2,1,3) +F.scaled_dot_product_attention op_9 4 1 16 17 18 attn_mask 19 dropout_p=0.0 is_causal=False scale=%scale +Tensor.permute op_10 1 1 19 20 dims=(0,2,1,3) +Tensor.reshape op_11 1 1 20 21 shape=(%batch,%size,%embed_dim) +nn.Linear out_proj 1 1 21 out bias=%outbias in_features=%embed_dim out_features=%qdim @bias @weight +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "MultiHeadAttention"; + } + + const char* name_str() const + { + return "sdpa_attention"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + op->params["0"] = captured_params.at("embed_dim"); + op->params["1"] = captured_params.at("num_heads"); + + const int embed_dim = captured_params.at("embed_dim").i; + const int qdim = captured_params.at("qdim").i; + const int kdim = captured_params.at("kdim").i; + const int vdim = captured_params.at("vdim").i; + + op->params["2"] = embed_dim * qdim; + op->params["3"] = kdim; + op->params["4"] = vdim; + op->params["5"] = 1; + op->params["6"] = captured_params.at("scale"); + + op->attrs["0"] = Attribute(); + op->attrs["0"].data = {0, 0, 0, 0}; + op->attrs["1"] = captured_attrs.at("op_0.weight"); + if (captured_params.at("qbias").b) + { + op->attrs["2"] = captured_attrs.at("op_0.bias"); + } + else + { + op->attrs["2"] = Attribute({embed_dim}, std::vector(embed_dim, 0.f)); + } + op->attrs["3"] = Attribute(); + op->attrs["3"].data = {0, 0, 0, 0}; + op->attrs["4"] = captured_attrs.at("op_1.weight"); + if (captured_params.at("kbias").b) + { + op->attrs["5"] = captured_attrs.at("op_1.bias"); + } + else + { + op->attrs["5"] = Attribute({embed_dim}, std::vector(embed_dim, 0.f)); + } + op->attrs["6"] = Attribute(); + op->attrs["6"].data = {0, 0, 0, 0}; + op->attrs["7"] = captured_attrs.at("op_2.weight"); + if (captured_params.at("vbias").b) + { + op->attrs["8"] = captured_attrs.at("op_2.bias"); + } + else + { + op->attrs["8"] = Attribute({embed_dim}, std::vector(embed_dim, 0.f)); + } + op->attrs["9"] = Attribute(); + op->attrs["9"].data = {0, 0, 0, 0}; + op->attrs["a"] = captured_attrs.at("out_proj.weight"); + if (captured_params.at("outbias").b) + { + op->attrs["b"] = captured_attrs.at("out_proj.bias"); + } + else + { + op->attrs["b"] = Attribute({qdim}, std::vector(qdim, 0.f)); + } + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_scaled_dot_product_attention, 10) + +class F_scaled_dot_product_attention_1 : public F_scaled_dot_product_attention +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +17 16 +pnnx.Input input_0 0 1 input 
+pnnx.Input input_1 0 1 kv +pnnx.Input input_2 0 1 attn_mask +nn.Linear op_0 1 1 input q bias=%qbias in_features=%qdim out_features=%embed_dim @bias @weight +nn.Linear op_1 1 1 kv k bias=%kbias in_features=%kdim out_features=%embed_dim @bias @weight +nn.Linear op_2 1 1 kv v bias=%vbias in_features=%vdim out_features=%embed_dim @bias @weight +Tensor.reshape op_3 1 1 q 10 shape=(%batch,%qsize,%num_heads,%feat_per_head) +Tensor.reshape op_4 1 1 k 12 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.reshape op_5 1 1 v 14 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.permute op_6 1 1 10 16 dims=(0,2,1,3) +Tensor.permute op_7 1 1 12 17 dims=(0,2,1,3) +Tensor.permute op_8 1 1 14 18 dims=(0,2,1,3) +F.scaled_dot_product_attention op_9 4 1 16 17 18 attn_mask 19 dropout_p=0.0 is_causal=False scale=%scale +Tensor.permute op_10 1 1 19 20 dims=(0,2,1,3) +Tensor.reshape op_11 1 1 20 21 shape=(%batch,%qsize,%embed_dim) +nn.Linear out_proj 1 1 21 out bias=%outbias in_features=%embed_dim out_features=%qdim @bias @weight +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_scaled_dot_product_attention_1, 10) + +class F_scaled_dot_product_attention_2 : public F_scaled_dot_product_attention +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +15 14 +pnnx.Input input 0 1 input +nn.Linear op_0 1 1 input q bias=%qbias in_features=%qdim out_features=%embed_dim @bias @weight +nn.Linear op_1 1 1 input k bias=%kbias in_features=%kdim out_features=%embed_dim @bias @weight +nn.Linear op_2 1 1 input v bias=%vbias in_features=%vdim out_features=%embed_dim @bias @weight +Tensor.reshape op_3 1 1 q 10 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.reshape op_4 1 1 k 12 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.reshape op_5 1 1 v 14 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.permute op_6 1 1 10 16 dims=(0,2,1,3) +Tensor.permute op_7 1 1 12 17 dims=(0,2,1,3) +Tensor.permute op_8 1 1 14 18 dims=(0,2,1,3) +F.scaled_dot_product_attention op_9 3 1 16 17 18 19 dropout_p=0.0 is_causal=False attn_mask=None scale=%scale +Tensor.permute op_10 1 1 19 20 dims=(0,2,1,3) +Tensor.reshape op_11 1 1 20 21 shape=(%batch,%size,%embed_dim) +nn.Linear out_proj 1 1 21 out bias=%outbias in_features=%embed_dim out_features=%qdim @bias @weight +pnnx.Output output 1 0 out +)PNNXIR"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + F_scaled_dot_product_attention::write(op, captured_params, captured_attrs); + op->params["5"] = 0; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_scaled_dot_product_attention_2, 10) + +class F_scaled_dot_product_attention_3 : public F_scaled_dot_product_attention +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +16 15 +pnnx.Input input_0 0 1 input +pnnx.Input input_1 0 1 kv +nn.Linear op_0 1 1 input q bias=%qbias in_features=%qdim out_features=%embed_dim @bias @weight +nn.Linear op_1 1 1 kv k bias=%kbias in_features=%kdim out_features=%embed_dim @bias @weight +nn.Linear op_2 1 1 kv v bias=%vbias in_features=%vdim out_features=%embed_dim @bias @weight +Tensor.reshape op_3 1 1 q 10 shape=(%batch,%qsize,%num_heads,%feat_per_head) +Tensor.reshape op_4 1 1 k 12 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.reshape op_5 1 1 v 14 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.permute op_6 1 1 10 16 dims=(0,2,1,3) +Tensor.permute op_7 1 1 12 17 dims=(0,2,1,3) +Tensor.permute op_8 1 1 14 
18 dims=(0,2,1,3) +F.scaled_dot_product_attention op_9 3 1 16 17 18 19 dropout_p=0.0 is_causal=False attn_mask=None scale=%scale +Tensor.permute op_10 1 1 19 20 dims=(0,2,1,3) +Tensor.reshape op_11 1 1 20 21 shape=(%batch,%qsize,%embed_dim) +nn.Linear out_proj 1 1 21 out bias=%outbias in_features=%embed_dim out_features=%qdim @bias @weight +pnnx.Output output 1 0 out +)PNNXIR"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + F_scaled_dot_product_attention::write(op, captured_params, captured_attrs); + op->params["5"] = 0; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_scaled_dot_product_attention_3, 10) + +} // namespace ncnn + +} // namespace pnnx From e82015878c5d9d67ee4a6d85f769cdc14a6d561f Mon Sep 17 00:00:00 2001 From: nihui Date: Thu, 18 Jul 2024 15:58:28 +0800 Subject: [PATCH 06/38] Update modelwriter.h for mha scale param --- tools/modelwriter.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/modelwriter.h b/tools/modelwriter.h index 88ccb948a9c..4f445cfe2a4 100644 --- a/tools/modelwriter.h +++ b/tools/modelwriter.h @@ -2007,6 +2007,7 @@ int ModelWriter::save(const char* parampath, const char* binpath) fprintf_param_value(" 3=%d", kdim) fprintf_param_value(" 4=%d", vdim) fprintf_param_value(" 5=%d", attn_mask) + fprintf_param_value(" 6=%e", scale) fwrite_weight_tag_data(op->q_weight_data, bp); fwrite_weight_data(op->q_bias_data, bp); From 3ee5c18f84963542ead978afa3c027ebf5526260 Mon Sep 17 00:00:00 2001 From: nihui Date: Sat, 20 Jul 2024 00:16:38 +0800 Subject: [PATCH 07/38] pnnx logaddexp (#5598) --- tools/pnnx/src/ir.cpp | 4 +- .../pnnx/src/pass_level3/fuse_expression.cpp | 3 + .../pnnx/src/pass_level5/eval_expression.cpp | 8 ++- .../pnnx/src/pass_ncnn/expand_expression.cpp | 2 + tools/pnnx/tests/CMakeLists.txt | 1 + tools/pnnx/tests/test_torch_logaddexp.py | 61 +++++++++++++++++++ 6 files changed, 77 insertions(+), 2 deletions(-) create mode 100644 tools/pnnx/tests/test_torch_logaddexp.py diff --git a/tools/pnnx/src/ir.cpp b/tools/pnnx/src/ir.cpp index 07d2bbefefd..cacd84fde79 100644 --- a/tools/pnnx/src/ir.cpp +++ b/tools/pnnx/src/ir.cpp @@ -1091,7 +1091,8 @@ static std::string expand_expression(const Operator* op) || t == "maximum" || t == "min" || t == "minimum" - || t == "pow") + || t == "pow" + || t == "logaddexp") { std::string binaryop; if (t == "atan2") binaryop = "torch.atan2"; @@ -1101,6 +1102,7 @@ static std::string expand_expression(const Operator* op) if (t == "min") binaryop = "torch.min"; if (t == "minimum") binaryop = "torch.minimum"; if (t == "pow") binaryop = "torch.pow"; + if (t == "logaddexp") binaryop = "torch.logaddexp"; std::string a = exprstack.top(); exprstack.pop(); diff --git a/tools/pnnx/src/pass_level3/fuse_expression.cpp b/tools/pnnx/src/pass_level3/fuse_expression.cpp index 708d1a548df..8fc918fed9d 100644 --- a/tools/pnnx/src/pass_level3/fuse_expression.cpp +++ b/tools/pnnx/src/pass_level3/fuse_expression.cpp @@ -154,6 +154,7 @@ static bool operand_maybe_tensor(const Operand* operand) || op->type == "aten::div" || op->type == "aten::floor_divide" || op->type == "aten::fmod" + || op->type == "aten::logaddexp" || op->type == "aten::max" || op->type == "aten::maximum" || op->type == "aten::min" @@ -653,6 +654,7 @@ static void fuse_expression(Graph& graph, Operand* operand, std::string& expr, s else if (op->type == "aten::atan2" || op->type == "aten::floor_divide" || op->type == "aten::fmod" + || op->type == "aten::logaddexp" || op->type == "aten::max" || op->type == 
"aten::maximum" || op->type == "aten::min" @@ -867,6 +869,7 @@ void fuse_expression(Graph& graph, const std::set& foldable_constan || op->type == "aten::fmod" || op->type == "aten::log" || op->type == "aten::log10" + || op->type == "aten::logaddexp" || op->type == "aten::max" || op->type == "aten::maximum" || op->type == "aten::min" diff --git a/tools/pnnx/src/pass_level5/eval_expression.cpp b/tools/pnnx/src/pass_level5/eval_expression.cpp index 44e1f7e3691..c7d5d5d0226 100644 --- a/tools/pnnx/src/pass_level5/eval_expression.cpp +++ b/tools/pnnx/src/pass_level5/eval_expression.cpp @@ -390,7 +390,8 @@ static std::string eval_expression(const Operator* op) || t == "floor_divide" || t == "fmod" || t == "pow" - || t == "remainder") + || t == "remainder" + || t == "logaddexp") { std::string a = exprstack.top(); exprstack.pop(); @@ -459,6 +460,11 @@ static std::string eval_expression(const Operator* op) r += bf; exprstack.push(std::to_string(r)); } + if (t == "logaddexp") + { + float r = log(exp(af) + exp(bf)); + exprstack.push(std::to_string(r)); + } } else { diff --git a/tools/pnnx/src/pass_ncnn/expand_expression.cpp b/tools/pnnx/src/pass_ncnn/expand_expression.cpp index f8f97baa55c..2fdc6d77d62 100644 --- a/tools/pnnx/src/pass_ncnn/expand_expression.cpp +++ b/tools/pnnx/src/pass_ncnn/expand_expression.cpp @@ -185,6 +185,7 @@ static std::string expand_expression(Graph& graph, const Operator* op, int& pnnx || t == "div" || t == "floor_divide" || t == "fmod" + || t == "logaddexp" || t == "max" || t == "maximum" || t == "min" @@ -211,6 +212,7 @@ static std::string expand_expression(Graph& graph, const Operator* op, int& pnnx if (t == "sub") op_binary->params["0"] = 1; if (t == "mul") op_binary->params["0"] = 2; if (t == "div") op_binary->params["0"] = 3; + if (t == "logaddexp") fprintf(stderr, "BinaryOp logaddexp not supported yet\n"); // TODO if (t == "max" || t == "maximum") op_binary->params["0"] = 4; if (t == "min" || t == "minimum") op_binary->params["0"] = 5; if (t == "floor_divide") fprintf(stderr, "BinaryOp floor_divide not supported yet\n"); // TODO diff --git a/tools/pnnx/tests/CMakeLists.txt b/tools/pnnx/tests/CMakeLists.txt index 2046a639256..7bbf1c6ea9c 100644 --- a/tools/pnnx/tests/CMakeLists.txt +++ b/tools/pnnx/tests/CMakeLists.txt @@ -295,6 +295,7 @@ pnnx_add_test(torch_floor) pnnx_add_test(torch_imag) pnnx_add_test(torch_log) pnnx_add_test(torch_log10) +pnnx_add_test(torch_logaddexp) pnnx_add_test(torch_maximum) pnnx_add_test(torch_minimum) pnnx_add_test(torch_neg) diff --git a/tools/pnnx/tests/test_torch_logaddexp.py b/tools/pnnx/tests/test_torch_logaddexp.py new file mode 100644 index 00000000000..6914dbd6213 --- /dev/null +++ b/tools/pnnx/tests/test_torch_logaddexp.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + out0 = torch.logaddexp(x, y) + out1 = torch.logaddexp(y, y) + out2 = torch.logaddexp(z, torch.ones_like(z) + 0.5) + return out0, out1, out2 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(3, 16) + z = torch.rand(5, 9, 3) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_logaddexp.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_logaddexp.pt inputshape=[3,16],[3,16],[5,9,3]") + + # pnnx inference + import test_torch_logaddexp_pnnx + b = test_torch_logaddexp_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) From d355b6dc5bf6daf2fcd00caa24875f6f3fcb862e Mon Sep 17 00:00:00 2001 From: lll143653 <58139948+lll143653@users.noreply.github.com> Date: Tue, 23 Jul 2024 16:54:35 +0800 Subject: [PATCH 08/38] Add warning and recommend to use pnnx (#5588) --- tools/onnx/onnx2ncnn.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/onnx/onnx2ncnn.cpp b/tools/onnx/onnx2ncnn.cpp index e443a28edf1..1b29e34c128 100644 --- a/tools/onnx/onnx2ncnn.cpp +++ b/tools/onnx/onnx2ncnn.cpp @@ -2956,6 +2956,15 @@ static std::string trunc_name(std::string name) int main(int argc, char** argv) { + fprintf(stderr, "onnx2ncnn may not fully meet your needs. For more accurate and elegant\n\ +conversion results, please use PNNX. PyTorch Neural Network eXchange (PNNX) is\n\ +an open standard for PyTorch model interoperability. PNNX provides an open model\n\ +format for PyTorch. It defines computation graph as well as high level operators\n\ +strictly matches PyTorch. You can obtain pnnx through the following ways:\n\ +1. Install via python\n\ + pip3 install pnnx\n\ +2. Get the executable from https://github.com/pnnx/pnnx\n\ +For more information, please refer to https://github.com/pnnx/pnnx\n"); if (!(argc == 2 || argc == 4)) { fprintf(stderr, "Usage: %s [onnxpb] [ncnnparam] [ncnnbin]\n", argv[0]); From 051b04ffb48e2d887bab758252eac55fc92bc028 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=B5=E5=B0=8F=E5=87=A1?= <2672931+whyb@users.noreply.github.com> Date: Wed, 24 Jul 2024 10:40:17 +0800 Subject: [PATCH 09/38] Updated use-ncnn-with-pytorch-or-onnx document (#5557) --- .../use-ncnn-with-pytorch-or-onnx.md | 148 ++++++++++++++++-- 1 file changed, 136 insertions(+), 12 deletions(-) diff --git a/docs/how-to-use-and-FAQ/use-ncnn-with-pytorch-or-onnx.md b/docs/how-to-use-and-FAQ/use-ncnn-with-pytorch-or-onnx.md index 9b0559a8eb8..e0195aa1403 100644 --- a/docs/how-to-use-and-FAQ/use-ncnn-with-pytorch-or-onnx.md +++ b/docs/how-to-use-and-FAQ/use-ncnn-with-pytorch-or-onnx.md @@ -2,8 +2,114 @@ Here is a practical guide for converting pytorch model to ncnn resnet18 is used as the example -## pytorch to onnx - +## pytorch to ncnn, onnx to ncnn + +### What's the pnnx? +PyTorch Neural Network eXchange(PNNX) is an open standard for PyTorch model interoperability. PNNX provides an open model format for PyTorch. It defines computation graph as well as high level operators strictly matches PyTorch. +It is recommended to use the `pnnx` tool to convert your `onnx` or `pytorch` model into a ncnn model now. + +### How to install pnnx? +* A. 
python pip (recommended) + * Windows/Linux/macOS 64bit + * python 3.7 or later + + ```shell + pip3 install pnnx + ``` + +* B. portable binary package (recommended if you hate python) + * Windows/Linux/macOS 64bit + * For Linux, glibc 2.17+ + + Download portable pnnx binary package from https://github.com/pnnx/pnnx/releases and extract it. + +* C. build from source + 1. install pytorch + 2. (optional) install torchvision for pnnx torchvision operator support + 3. (optional) install protobuf for pnnx onnx-zero support + 4. clone https://github.com/Tencent/ncnn.git + 5. build pnnx in ncnn/tools/pnnx with cmake + + You can refer to https://github.com/pnnx/pnnx/blob/main/.github/workflows/release.yml for detailed steps + + ```shell + git clone https://github.com/Tencent/ncnn.git + mkdir ncnn/tools/pnnx/build + cd ncnn/tools/pnnx/build + cmake -DCMAKE_INSTALL_PREFIX=install -DTorch_INSTALL_DIR= -DTorchVision_INSTALL_DIR= .. + cmake --build . --config Release -j 4 + cmake --build . --config Release --target install + ``` + +### How to use pnnx? +* A. python + 1. optimize and export your torch model with pnnx.export() + ```python + import torch + import torchvision.models as models + import pnnx + + model = models.resnet18(pretrained=True) + + x = torch.rand(1, 3, 224, 224) + + opt_model = pnnx.export(model, "resnet18.pt", x) + + # use tuple for model with multiple inputs + # opt_model = pnnx.export(model, "resnet18.pt", (x, y, z)) + ``` + 2. use the optimized module just like the normal one + ```python + result = opt_model(x) + ``` + 3. pick resnet18_pnnx.py for the pnnx-optimized torch model + 4. pick resnet18.ncnn.param and resnet18.ncnn.bin for ncnn inference + +* B. command line + 1. export your torch model to torchscript / onnx + ```python + import torch + import torchvision.models as models + + net = models.resnet18(pretrained=True) + net = net.eval() + + x = torch.rand(1, 3, 224, 224) + + # You could try disabling checking when tracing raises an error + # mod = torch.jit.trace(net, x, check_trace=False) + mod = torch.jit.trace(net, x) + + mod.save("resnet18.pt") + + # You could also try exporting to the good-old onnx + torch.onnx.export(net, x, 'resnet18.onnx') + ``` + + 2. use pnnx to convert the torchscript / onnx model to optimized pnnx and ncnn model files + ```shell + ./pnnx resnet18.pt inputshape=[1,3,224,224] + ./pnnx resnet18.onnx inputshape=[1,3,224,224] + ``` + macOS zsh users may need double quotes to prevent ambiguity + ```shell + ./pnnx resnet18.pt "inputshape=[1,3,224,224]" + ``` + For a model with multiple inputs, use a list + ```shell + ./pnnx resnet18.pt inputshape=[1,3,224,224],[1,32] + ``` + For a model with a non-fp32 input data type, add a type suffix + ```shell + ./pnnx resnet18.pt inputshape=[1,3,224,224]f32,[1,32]i64 + ``` + 3. pick resnet18_pnnx.py for the pnnx-optimized torch model + 4. pick resnet18.ncnn.param and resnet18.ncnn.bin for ncnn inference + +see more pnnx information: https://github.com/pnnx/pnnx + +## pytorch to onnx (deprecated) +
pytorch to onnx The official pytorch tutorial for exporting onnx model https://pytorch.org/tutorials/advanced/super_resolution_with_caffe2.html @@ -22,9 +128,10 @@ x = torch.rand(1, 3, 224, 224) # Export the model torch_out = torch.onnx._export(model, x, "resnet18.onnx", export_params=True) ``` +
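+
+Note that `torch.onnx._export` is a private API. A minimal sketch using the public `torch.onnx.export` entry point instead (the opset number and the input/output names below are illustrative choices, not part of the original guide):
+
+```python
+import torch
+import torchvision.models as models
+
+model = models.resnet18(pretrained=True)
+model = model.eval()
+
+x = torch.rand(1, 3, 224, 224)
+
+# the public exporter; pick an opset supported by your downstream tools
+torch.onnx.export(model, x, "resnet18.onnx",
+                  export_params=True,
+                  opset_version=13,
+                  input_names=["input"],
+                  output_names=["output"])
+```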
-## simplify onnx model - +## simplify onnx model (deprecated) +
simplify onnx model The exported resnet18.onnx model may contain many redundant operators such as Shape, Gather and Unsqueeze that are not supported in ncnn ``` @@ -37,19 +144,36 @@ Unsqueeze not supported yet! # axes 7 ``` -Fortunately, daquexian developed a handy tool to eliminate them. cheers! +### onnxsim -https://github.com/daquexian/onnx-simplifier +Fortunately, [@daquexian](https://github.com/daquexian) developed a handy tool to eliminate them. cheers! +#### How to use onnxsim? +```shell +pip install onnxsim +python -m onnxsim resnet18.onnx resnet18-sim.onnx ``` -python3 -m onnxsim resnet18.onnx resnet18-sim.onnx -``` +more information: https://github.com/daquexian/onnx-simplifier +### onnxslim +Or you can use another powerful model simplification tool developed in pure Python by [@inisis](https://github.com/inisis): +#### How to use onnxslim? +```shell +pip install onnxslim +python -m onnxslim resnet18.onnx resnet18-slim.onnx ``` +more information: https://github.com/inisis/OnnxSlim +
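+
+Both simplifiers can also be driven from Python. A minimal sketch with the onnxsim API (the file names are just examples; see the onnx-simplifier README for the authoritative usage):
+
+```python
+import onnx
+from onnxsim import simplify
+
+model = onnx.load("resnet18.onnx")
+
+# returns the simplified model and a flag telling whether the result validated
+model_simp, check = simplify(model)
+assert check, "simplified ONNX model could not be validated"
+
+onnx.save(model_simp, "resnet18-sim.onnx")
+```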
+ +## onnx2ncnn (deprecated) + +~~The onnx2ncnn tool is no longer maintained. It is recommended to use the PNNX tool instead~~ + +
onnx2ncnn tool + +~~Finally, you can convert the model to ncnn using tools/onnx2ncnn~~ +~~onnx2ncnn resnet18-sim.onnx resnet18.param resnet18.bin~~ +
\ No newline at end of file From 92e0b8253bc9d16b0d77bd17693fe9a72fb64b64 Mon Sep 17 00:00:00 2001 From: quink Date: Tue, 30 Jul 2024 10:47:00 +0800 Subject: [PATCH 10/38] arm/convolution_3x3_pack1to8_fp16s: prefer ldr/str over ld1/st1 (#5603) Depending on the arch, ldr/str can be faster than ld1/st1, especially for loading to one lane form. For example, on Cortex A75, 1. execution latency of 'ldr q0' and 'ldr h0' are 5 2. execution latency of 'ld1 {v0.16b}' is 6 3. execution latency of 'ld1 {v0.h}[0]' is 8 On Cortex X3, 1. execution latency of 'ldr q0' and 'ldr h0' are 6 2. execution latency of 'ld1 {v0.16b}' is 6 3. execution latency of 'ld1 {v0.h}[0]' is 8 Signed-off-by: Zhao Zhili --- .../arm/convolution_3x3_pack1to8_fp16s.h | 68 +++++++++---------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/src/layer/arm/convolution_3x3_pack1to8_fp16s.h b/src/layer/arm/convolution_3x3_pack1to8_fp16s.h index bd03d450b2e..40e276cdedf 100644 --- a/src/layer/arm/convolution_3x3_pack1to8_fp16s.h +++ b/src/layer/arm/convolution_3x3_pack1to8_fp16s.h @@ -68,8 +68,8 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "sub %0, %0, #64 \n" "prfm pldl1keep, [%1, #128] \n" - "ld1 {v0.8h}, [%1], #16 \n" // r0 - "ld1 {v1.4h}, [%1] \n" + "ldr q0, [%1], #16 \n" // r0 + "ldr s1, [%1] \n" "fmla v24.8h, %8.8h, v0.h[0] \n" "fmla v25.8h, %8.8h, v0.h[1] \n" @@ -99,8 +99,8 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %10.8h, v1.h[1] \n" "prfm pldl1keep, [%2, #128] \n" - "ld1 {v2.8h}, [%2], #16 \n" // r1 - "ld1 {v3.4h}, [%2] \n" + "ldr q2, [%2], #16 \n" // r1 + "ldr s3, [%2] \n" "fmla v24.8h, %11.8h, v2.h[0] \n" "fmla v25.8h, %11.8h, v2.h[1] \n" @@ -130,8 +130,8 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %13.8h, v3.h[1] \n" "prfm pldl1keep, [%3, #128] \n" - "ld1 {v4.8h}, [%3], #16 \n" // r2 - "ld1 {v5.4h}, [%3] \n" + "ldr q4, [%3], #16 \n" // r2 + "ldr s5, [%3] \n" "fmla v24.8h, %14.8h, v4.h[0] \n" "fmla v25.8h, %14.8h, v4.h[1] \n" @@ -189,7 +189,7 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [%0] \n" // sum0 sum1 sum2 sum3 "prfm pldl1keep, [%1, #128] \n" - "ld1 {v0.8h}, [%1] \n" // r0 + "ldr q0, [%1] \n" // r0 "fmla v28.8h, %8.8h, v0.h[0] \n" "fmla v29.8h, %8.8h, v0.h[1] \n" @@ -207,7 +207,7 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %10.8h, v0.h[5] \n" "prfm pldl1keep, [%2, #128] \n" - "ld1 {v1.8h}, [%2] \n" // r1 + "ldr q1, [%2] \n" // r1 "fmla v28.8h, %11.8h, v1.h[0] \n" "fmla v29.8h, %11.8h, v1.h[1] \n" @@ -225,7 +225,7 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %13.8h, v1.h[5] \n" "prfm pldl1keep, [%3, #128] \n" - "ld1 {v2.8h}, [%3] \n" // r2 + "ldr q2, [%3] \n" // r2 "fmla v28.8h, %14.8h, v2.h[0] \n" "fmla v29.8h, %14.8h, v2.h[1] \n" @@ -274,7 +274,7 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "ld1 {v30.8h, v31.8h}, [%0] \n" // sum0 sum1 "prfm pldl1keep, [%1, #64] \n" - "ld1 {v0.4h}, [%1] \n" // r0 + "ldr d0, [%1] \n" // r0 "fmla v30.8h, %8.8h, v0.h[0] \n" "fmla v31.8h, %8.8h, v0.h[1] \n" @@ -284,7 +284,7 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %10.8h, v0.h[3] \n" "prfm pldl1keep, [%2, #64] \n" - "ld1 {v1.4h}, [%2] \n" // r1 + "ldr d1, [%2] \n" // r1 "fmla v30.8h, %11.8h, v1.h[0] 
\n" "fmla v31.8h, %11.8h, v1.h[1] \n" @@ -294,7 +294,7 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %13.8h, v1.h[3] \n" "prfm pldl1keep, [%3, #64] \n" - "ld1 {v2.4h}, [%3] \n" // r2 + "ldr d2, [%3] \n" // r2 "fmla v30.8h, %14.8h, v2.h[0] \n" "fmla v31.8h, %14.8h, v2.h[1] \n" @@ -332,24 +332,24 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob { asm volatile( "prfm pldl1keep, [%0, #128] \n" - "ld1 {v30.8h}, [%0] \n" // sum0 + "ldr q30, [%0] \n" // sum0 "prfm pldl1keep, [%1, #64] \n" - "ld1 {v0.4h}, [%1] \n" // r0 + "ldr d0, [%1] \n" // r0 "fmla v30.8h, %8.8h, v0.h[0] \n" "fmla v30.8h, %9.8h, v0.h[1] \n" "fmla v30.8h, %10.8h, v0.h[2] \n" "prfm pldl1keep, [%2, #64] \n" - "ld1 {v1.4h}, [%2] \n" // r1 + "ldr d1, [%2] \n" // r1 "fmla v30.8h, %11.8h, v1.h[0] \n" "fmla v30.8h, %12.8h, v1.h[1] \n" "fmla v30.8h, %13.8h, v1.h[2] \n" "prfm pldl1keep, [%3, #64] \n" - "ld1 {v2.4h}, [%3] \n" // r2 + "ldr d2, [%3] \n" // r2 "fmla v30.8h, %14.8h, v2.h[0] \n" "fmla v30.8h, %15.8h, v2.h[1] \n" @@ -359,7 +359,7 @@ static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "add %2, %2, #2 \n" "add %3, %3, #2 \n" - "st1 {v30.8h}, [%0], #16 \n" + "str q30, [%0], #16 \n" : "=r"(outptr0), // %0 "=r"(r0), // %1 @@ -445,8 +445,8 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [%0] \n" // sum0 sum1 sum2 sum3 "prfm pldl1keep, [%1, #128] \n" - "ld1 {v0.8h}, [%1], #16 \n" // r0 - "ld1 {v1.h}[0], [%1] \n" + "ldr q0, [%1], #16 \n" // r0 + "ldr h1, [%1] \n" "fmla v28.8h, %8.8h, v0.h[0] \n" "fmla v29.8h, %8.8h, v0.h[2] \n" @@ -464,8 +464,8 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %10.8h, v1.h[0] \n" "prfm pldl1keep, [%2, #128] \n" - "ld1 {v2.8h}, [%2], #16 \n" // r1 - "ld1 {v3.h}[0], [%2] \n" + "ldr q2, [%2], #16 \n" // r1 + "ldr h3, [%2] \n" "fmla v28.8h, %11.8h, v2.h[0] \n" "fmla v29.8h, %11.8h, v2.h[2] \n" @@ -483,8 +483,8 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %13.8h, v3.h[0] \n" "prfm pldl1keep, [%3, #128] \n" - "ld1 {v4.8h}, [%3], #16 \n" // r2 - "ld1 {v5.h}[0], [%3] \n" + "ldr q4, [%3], #16 \n" // r2 + "ldr h5, [%3] \n" "fmla v28.8h, %14.8h, v4.h[0] \n" "fmla v29.8h, %14.8h, v4.h[2] \n" @@ -529,8 +529,8 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "ld1 {v30.8h, v31.8h}, [%0] \n" // sum0 sum1 "prfm pldl1keep, [%1, #64] \n" - "ld1 {v0.4h}, [%1], #8 \n" // r0 - "ld1 {v1.h}[0], [%1] \n" + "ldr d0, [%1], #8 \n" // r0 + "ldr h1, [%1] \n" "fmla v30.8h, %8.8h, v0.h[0] \n" "fmla v31.8h, %8.8h, v0.h[2] \n" @@ -540,8 +540,8 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %10.8h, v1.h[0] \n" "prfm pldl1keep, [%2, #64] \n" - "ld1 {v2.4h}, [%2], #8 \n" // r1 - "ld1 {v3.h}[0], [%2] \n" + "ldr d2, [%2], #8 \n" // r1 + "ldr h3, [%2] \n" "fmla v30.8h, %11.8h, v2.h[0] \n" "fmla v31.8h, %11.8h, v2.h[2] \n" @@ -551,8 +551,8 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "fmla v31.8h, %13.8h, v3.h[0] \n" "prfm pldl1keep, [%3, #64] \n" - "ld1 {v4.4h}, [%3], #8 \n" // r2 - "ld1 {v5.h}[0], [%3] \n" + "ldr d4, [%3], #8 \n" // r2 + "ldr h5, [%3] \n" "fmla v30.8h, %14.8h, v4.h[0] \n" "fmla v31.8h, %14.8h, v4.h[2] \n" @@ -586,24 +586,24 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob { asm 
volatile( "prfm pldl1keep, [%0, #128] \n" - "ld1 {v30.8h}, [%0] \n" // sum0 + "ldr q30, [%0] \n" // sum0 "prfm pldl1keep, [%1, #64] \n" - "ld1 {v0.4h}, [%1] \n" // r0 + "ldr d0, [%1] \n" // r0 "fmla v30.8h, %8.8h, v0.h[0] \n" "fmla v30.8h, %9.8h, v0.h[1] \n" "fmla v30.8h, %10.8h, v0.h[2] \n" "prfm pldl1keep, [%2, #64] \n" - "ld1 {v1.4h}, [%2] \n" // r1 + "ldr d1, [%2] \n" // r1 "fmla v30.8h, %11.8h, v1.h[0] \n" "fmla v30.8h, %12.8h, v1.h[1] \n" "fmla v30.8h, %13.8h, v1.h[2] \n" "prfm pldl1keep, [%3, #64] \n" - "ld1 {v2.4h}, [%3] \n" // r2 + "ldr d2, [%3] \n" // r2 "fmla v30.8h, %14.8h, v2.h[0] \n" "fmla v30.8h, %15.8h, v2.h[1] \n" @@ -613,7 +613,7 @@ static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob "add %2, %2, #4 \n" "add %3, %3, #4 \n" - "st1 {v30.8h}, [%0], #16 \n" + "str q30, [%0], #16 \n" : "=r"(outptr0), // %0 "=r"(r0), // %1 From 391152f500cf20bef50da3a0617900acca34c770 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=B0=E9=98=85?= <43716063+Baiyuetribe@users.noreply.github.com> Date: Thu, 1 Aug 2024 21:06:38 +0800 Subject: [PATCH 11/38] c_api surpport set_vulkan_device (#5610) --- src/c_api.cpp | 7 +++++++ src/c_api.h | 4 ++++ 2 files changed, 11 insertions(+) diff --git a/src/c_api.cpp b/src/c_api.cpp index 5662d1b5155..f8146e054c2 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -1240,6 +1240,13 @@ void ncnn_net_set_option(ncnn_net_t net, ncnn_option_t opt) ((Net*)net->pthis)->opt = *((Option*)opt); } +#if NCNN_VULKAN +void ncnn_net_set_vulkan_device(ncnn_net_t net, int device_index) +{ + ((Net*)net->pthis)->set_vulkan_device(device_index); +} +#endif + static ::ncnn::Layer* __Layer_c_api_layer_creator(void* userdata) { ncnn_net_custom_layer_factory_t ud = (ncnn_net_custom_layer_factory_t)userdata; diff --git a/src/c_api.h b/src/c_api.h index d153b2a4ef0..f752bfed663 100644 --- a/src/c_api.h +++ b/src/c_api.h @@ -275,6 +275,10 @@ NCNN_EXPORT void ncnn_net_destroy(ncnn_net_t net); NCNN_EXPORT ncnn_option_t ncnn_net_get_option(ncnn_net_t net); NCNN_EXPORT void ncnn_net_set_option(ncnn_net_t net, ncnn_option_t opt); +#if NCNN_VULKAN +NCNN_EXPORT void ncnn_net_set_vulkan_device(ncnn_net_t net, int device_index); +#endif + #if NCNN_STRING NCNN_EXPORT void ncnn_net_register_custom_layer_by_type(ncnn_net_t net, const char* type, ncnn_layer_creator_t creator, ncnn_layer_destroyer_t destroyer, void* userdata); #endif /* NCNN_STRING */ From 5b5c1fdb8fb80b52bbe63e9d5c8a5fca15ffda7f Mon Sep 17 00:00:00 2001 From: Galasnow <854932917@qq.com> Date: Thu, 8 Aug 2024 11:00:23 +0800 Subject: [PATCH 12/38] Fix build error with NDK r27 (#5615) Enable policy CMP0057 for cmake version >= 3.3 --- CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 309e3b8fbd0..0f32a80c86e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,6 +37,11 @@ if(POLICY CMP0025) cmake_policy(SET CMP0025 NEW) endif() +if(POLICY CMP0057) + # reference from https://cmake.org/cmake/help/latest/policy/CMP0057.html + cmake_policy(SET CMP0057 NEW) +endif() + project(ncnn) if(MSVC AND NOT CMAKE_VERSION VERSION_LESS "3.15") From 03cf161dbd28a24e57adb0cbbc693f98adec8e6a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 8 Aug 2024 11:01:17 +0800 Subject: [PATCH 13/38] Bump pypa/cibuildwheel from 2.17.0 to 2.20.0 (#5613) Bumps [pypa/cibuildwheel](https://github.com/pypa/cibuildwheel) from 2.17.0 to 2.20.0. 
- [Release notes](https://github.com/pypa/cibuildwheel/releases) - [Changelog](https://github.com/pypa/cibuildwheel/blob/main/docs/changelog.md) - [Commits](https://github.com/pypa/cibuildwheel/compare/v2.17.0...v2.20.0) --- updated-dependencies: - dependency-name: pypa/cibuildwheel dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/release-python.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/release-python.yml b/.github/workflows/release-python.yml index d8304c0e33c..82bd5551fcd 100644 --- a/.github/workflows/release-python.yml +++ b/.github/workflows/release-python.yml @@ -87,7 +87,7 @@ jobs: # build wheels for ubuntu-20.04 - name: Build wheels for ubuntu if: matrix.os == 'ubuntu-20.04' - uses: pypa/cibuildwheel@v2.17.0 + uses: pypa/cibuildwheel@v2.20.0 env: CIBW_ARCHS_LINUX: ${{ matrix.arch }} CIBW_BUILD: ${{ matrix.build }} @@ -99,7 +99,7 @@ jobs: # build wheels for windows-2019 - name: Build wheels for windows if: matrix.os == 'windows-2019' && (matrix.arch == 'AMD64' || matrix.arch == 'x86') - uses: pypa/cibuildwheel@v2.17.0 + uses: pypa/cibuildwheel@v2.20.0 env: CIBW_ARCHS_WINDOWS: ${{ matrix.arch }} CIBW_BUILD: ${{ matrix.build }} @@ -112,7 +112,7 @@ jobs: - name: Build wheels for windows ARM64 if: matrix.os == 'windows-2019' && matrix.arch == 'ARM64' - uses: pypa/cibuildwheel@v2.17.0 + uses: pypa/cibuildwheel@v2.20.0 env: CIBW_ARCHS_WINDOWS: ${{ matrix.arch }} CIBW_BUILD: ${{ matrix.build }} @@ -190,7 +190,7 @@ jobs: - name: Build wheels for macos x86_64 if: matrix.os == 'macos-13' && matrix.arch == 'x86_64' - uses: pypa/cibuildwheel@v2.17.0 + uses: pypa/cibuildwheel@v2.20.0 env: CIBW_ARCHS_MACOS: ${{ matrix.arch }} CIBW_BUILD: ${{ matrix.build }} @@ -207,7 +207,7 @@ jobs: - name: Build wheels for macos arm64 if: matrix.os == 'macos-13' && matrix.arch == 'arm64' - uses: pypa/cibuildwheel@v2.17.0 + uses: pypa/cibuildwheel@v2.20.0 env: CIBW_ARCHS_MACOS: ${{ matrix.arch }} CIBW_BUILD: ${{ matrix.build }} @@ -262,7 +262,7 @@ jobs: platforms: all - name: Build wheels for manylinux with qemu - uses: pypa/cibuildwheel@v2.17.0 + uses: pypa/cibuildwheel@v2.20.0 env: CIBW_ARCHS_LINUX: ${{ matrix.arch }} CIBW_BUILD: ${{ matrix.build_cp }}-${{ matrix.build_sub }}* @@ -310,7 +310,7 @@ jobs: platforms: all - name: Build wheels for manylinux with qemu - uses: pypa/cibuildwheel@v2.17.0 + uses: pypa/cibuildwheel@v2.20.0 env: CIBW_ARCHS_LINUX: ${{ matrix.arch }} CIBW_BUILD: ${{ matrix.build_pp }}-* From 60823a8de3defa2e7d642d981ed0af13b5da58f0 Mon Sep 17 00:00:00 2001 From: nihui Date: Thu, 8 Aug 2024 14:07:20 +0800 Subject: [PATCH 14/38] pnnx handles sdpa batch index (#5617) --- tools/pnnx/src/pass_ncnn/solve_batch_index.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp b/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp index 6e53f7aa841..4b1100789fc 100644 --- a/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp +++ b/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp @@ -56,6 +56,7 @@ static bool is_known_operator_with_batch_index_0(const Operator* op) "F.pixel_shuffle", "F.pixel_unshuffle", "F.prelu", + "F.scaled_dot_product_attention", "F.unfold", "F.upsample_bilinear", "F.upsample_nearest", From b9debee8fb92263cd3a087208d3657081a2e4f37 Mon Sep 17 00:00:00 2001 From: nihui Date: Sat, 10 Aug 2024 11:39:32 +0800 Subject: [PATCH 15/38] 
pnnx ci for torch 2.4 (#5618) * update onnx proto --- .ci/pnnx.yml | 13 +- tools/pnnx/src/CMakeLists.txt | 4 +- tools/pnnx/src/load_onnx.cpp | 2 +- tools/pnnx/src/onnx-data.proto | 155 ++++++++++++++++++ tools/pnnx/src/{onnx.proto => onnx-ml.proto} | 72 ++++++-- tools/pnnx/src/onnx-operators-ml.proto | 136 +++++++++++++++ .../pass_level5/fuse_multiheadattention.cpp | 60 +++++++ tools/pnnx/src/pass_onnx.cpp | 2 +- tools/pnnx/src/pass_onnx/canonicalize.h | 2 +- .../src/pass_onnx/dead_code_elimination.h | 2 +- tools/pnnx/src/pass_onnx/eliminate_noop.h | 2 +- tools/pnnx/src/pass_onnx/fold_constants.h | 2 +- .../pass_onnx/fuse_constant_as_attribute.h | 2 +- tools/pnnx/src/pass_onnx/inline_containers.h | 2 +- tools/pnnx/src/pass_onnx/inline_if_graph.h | 2 +- tools/pnnx/src/pass_onnx/model_stat.h | 2 +- .../src/pass_onnx/nn_AdaptiveAvgPool2d.cpp | 2 +- .../src/pass_onnx/nn_AdaptiveAvgPool3d.cpp | 2 +- tools/pnnx/src/pass_onnx/nn_AvgPool2d.cpp | 2 +- tools/pnnx/src/pass_onnx/nn_AvgPool3d.cpp | 2 +- tools/pnnx/src/pass_onnx/nn_BatchNorm2d.cpp | 2 +- tools/pnnx/src/pass_onnx/nn_BatchNorm3d.cpp | 2 +- tools/pnnx/src/pass_onnx/nn_Conv2d.cpp | 2 +- tools/pnnx/src/pass_onnx/nn_Conv3d.cpp | 2 +- tools/pnnx/src/pass_onnx/nn_GELU.cpp | 2 +- tools/pnnx/src/pass_onnx/nn_LayerNorm.cpp | 2 +- tools/pnnx/src/pass_onnx/nn_Linear.cpp | 2 +- tools/pnnx/src/pass_onnx/nn_MaxPool2d.cpp | 2 +- tools/pnnx/src/pass_onnx/nn_MaxPool3d.cpp | 2 +- .../src/pass_onnx/nn_MultiheadAttention.cpp | 2 +- tools/pnnx/src/pass_onnx/shape_inference.h | 2 +- tools/pnnx/src/save_onnx.cpp | 2 +- tools/pnnx/tests/onnx/test_nn_ReLU.py | 2 +- tools/pnnx/tests/onnx/test_squeezenet1_1.py | 2 +- tools/pnnx/tests/onnx/test_swin_t.py | 2 +- tools/pnnx/tests/onnx/test_vit_b_32.py | 2 +- 36 files changed, 453 insertions(+), 47 deletions(-) create mode 100644 tools/pnnx/src/onnx-data.proto rename tools/pnnx/src/{onnx.proto => onnx-ml.proto} (92%) create mode 100644 tools/pnnx/src/onnx-operators-ml.proto diff --git a/.ci/pnnx.yml b/.ci/pnnx.yml index 5c44354aaaa..990690e0c5b 100644 --- a/.ci/pnnx.yml +++ b/.ci/pnnx.yml @@ -17,10 +17,10 @@ concurrency: variables: protobuf_version: 21.12 - libtorch_version: 2.3.0 - libtorchvision_version: 0.18.0 - onnxruntime_version: 1.17.3 - cache_date: 20240504 + libtorch_version: 2.4.0 + libtorchvision_version: 0.19.0 + onnxruntime_version: 1.18.1 + cache_date: 20240804 jobs: ubuntu: @@ -57,6 +57,9 @@ jobs: - torch-version: 2.3.0 torchvision-version: 0.18.0 + - torch-version: 2.4.0 + torchvision-version: 0.19.0 + runs-on: pool-name: docker container: @@ -160,7 +163,7 @@ jobs: - name: setup-pytorch run: | export PYTHONUSERBASE=${{ci.workspace}}/torch-${{matrix.torch-version}} - pip3 install --user torch==${{matrix.torch-version}}+cpu torchvision==${{matrix.torchvision-version}}+cpu -f https://download.pytorch.org/whl/torch_stable.html + pip3 install --user torch==${{matrix.torch-version}}+cpu torchvision==${{matrix.torchvision-version}}+cpu --index-url https://download.pytorch.org/whl/cpu pip3 install --user onnx pip3 install --user onnxscript diff --git a/tools/pnnx/src/CMakeLists.txt b/tools/pnnx/src/CMakeLists.txt index 986f6ebe81e..27dfdef52f8 100644 --- a/tools/pnnx/src/CMakeLists.txt +++ b/tools/pnnx/src/CMakeLists.txt @@ -587,12 +587,12 @@ if(PROTOBUF_FOUND) endif() if(Protobuf_FOUND OR protobuf_MODULE_COMPATIBLE) - protobuf_generate_cpp(ONNX_PROTO_SRCS ONNX_PROTO_HDRS onnx.proto) + protobuf_generate_cpp(ONNX_PROTO_SRCS ONNX_PROTO_HDRS onnx-data.proto onnx-ml.proto onnx-operators-ml.proto) 
add_library(onnxproto STATIC ${ONNX_PROTO_SRCS} ${ONNX_PROTO_HDRS}) target_include_directories(onnxproto PUBLIC ${PROTOBUF_INCLUDE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) target_link_libraries(onnxproto PUBLIC ${PROTOBUF_LIBRARIES}) else() - add_library(onnxproto STATIC onnx.proto) + add_library(onnxproto STATIC onnx-data.proto onnx-ml.proto onnx-operators-ml.proto) target_include_directories(onnxproto PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) protobuf_generate(TARGET onnxproto) target_link_libraries(onnxproto PUBLIC protobuf::libprotobuf) diff --git a/tools/pnnx/src/load_onnx.cpp b/tools/pnnx/src/load_onnx.cpp index 36624d916bd..9adf2b47088 100644 --- a/tools/pnnx/src/load_onnx.cpp +++ b/tools/pnnx/src/load_onnx.cpp @@ -14,7 +14,7 @@ #include "load_onnx.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" #include #include diff --git a/tools/pnnx/src/onnx-data.proto b/tools/pnnx/src/onnx-data.proto new file mode 100644 index 00000000000..d7d925d45d0 --- /dev/null +++ b/tools/pnnx/src/onnx-data.proto @@ -0,0 +1,155 @@ +// +// WARNING: This file is automatically generated! Please edit onnx.in.proto. +// + + +// SPDX-License-Identifier: Apache-2.0 + + +syntax = "proto2"; + +package onnx; +import "onnx-ml.proto"; + +// This file contains the proto definitions for MapProto and +// SequenceProto. These protos are used to represent the data structures +// of maps and sequence for use in test data or ModelProto. + +// Sequences +// +// Defines a dense, ordered, collection of elements that are of homogeneous types. +// Sequences can be made out of tensors, maps, or sequences. +// +// If a sequence is made out of tensors, the tensors must have the same element +// type (i.e. int32). In some cases, the tensors in a sequence can have different +// shapes. Whether the tensors can have different shapes or not depends on the +// type/shape associated with the corresponding "ValueInfo". For example, +// "Sequence" means that all tensors have same shape. However, +// "Sequence" means they can have different +// shapes (all of rank 2), where "omitted" means the corresponding dimension has +// no symbolic/constant value. Finally, "Sequence>" means +// that the different tensors can have different ranks, when the "shape" itself +// is omitted from the tensor-type. For a more complete description, refer to +// https://github.com/onnx/onnx/blob/main/docs/IR.md#static-tensor-shapes. +// +message SequenceProto { + + optional string name = 1; + + enum DataType { + UNDEFINED = 0; + TENSOR = 1; + SPARSE_TENSOR = 2; + SEQUENCE = 3; + MAP = 4; + OPTIONAL = 5; + } + + // The data type of the element. + // This field MUST have a valid SequenceProto.DataType value + optional int32 elem_type = 2; + + // For TensorProto values. + // When this field is present, the elem_type field MUST be TENSOR. + repeated TensorProto tensor_values = 3; + + // For SparseTensorProto values. + // When this field is present, the elem_type field MUST be SPARSE_TENSOR. + repeated SparseTensorProto sparse_tensor_values = 4; + + // For SequenceProto values, allowing sequences to be of themselves. + // When this field is present, the elem_type field MUST be SEQUENCE. + repeated SequenceProto sequence_values = 5; + + // For MapProto values. + // When this field is present, the elem_type field MUST be MAP. + repeated MapProto map_values = 6; + + // For OptionalProto values. + // When this field is present, the elem_type field MUST be Optional. 
+ repeated OptionalProto optional_values = 7; + +} + + +// Maps +// +// Specifies an associative table, defined by keys and values. +// MapProto is formed with a repeated field of keys (of type INT8, INT16, INT32, +// INT64, UINT8, UINT16, UINT32, UINT64, or STRING) and values (of type TENSOR, +// SPARSE_TENSOR, SEQUENCE, or MAP). Key types and value types have to remain +// the same throughout the instantiation of the MapProto. +// +message MapProto { + + optional string name = 1; + + // All MapProto data types must have the same length of keys and values. + + // The data type of the key. + // This field MUST have a valid TensorProto.DataType value of + // INT8, INT16, INT32, INT64, UINT8, UINT16, UINT32, UINT64, or STRING + optional int32 key_type = 2; + + // Every element of keys has to be one of the following data types + // INT8, INT16, INT32, INT64, UINT8, UINT16, UINT32, UINT64, or STRING. + // The integer cases are represented by the repeated int64 field keys below. + repeated int64 keys = 3; + + // If keys are strings, they are represented by the repeated bytes field + // string_keys below. + repeated bytes string_keys = 4; + + // MapProto values are represented in a SequenceProto of the same length as the + // repeated keys field and have to be one of the following data types + // TENSOR, SPARSE_TENSOR, MAP, SEQUENCE. + optional SequenceProto values = 5; +} + +// Optional +// +// +message OptionalProto { + + optional string name = 1; + + enum DataType { + UNDEFINED = 0; + TENSOR = 1; + SPARSE_TENSOR = 2; + SEQUENCE = 3; + MAP = 4; + OPTIONAL = 5; + } + + // The data type of the element, identifies if the OptionalProto value + // is Tensor, Sparse Tensor, Sequence, Map, or Optional. + // The type of the optional value MUST match the elem_type specified. + // This field MUST have a valid OptionalProto.DataType value. + optional int32 elem_type = 2; + + // For TensorProto value. + // When this field is present, the elem_type field MUST be TENSOR. + optional TensorProto tensor_value = 3; + + // For SparseTensorProto value. + // When this field is present, the elem_type field MUST be SPARSE_TENSOR. + optional SparseTensorProto sparse_tensor_value = 4; + + // For SequenceProto value. + // When this field is present, the elem_type field MUST be SEQUENCE. + optional SequenceProto sequence_value = 5; + + // For MapProto value. + // When this field is present, the elem_type field MUST be MAP. + optional MapProto map_value = 6; + + // For OptionalProto value, allowing optional to be of itself (completeness) + // When this field is present, the elem_type field MUST be OPTIONAL. + optional OptionalProto optional_value = 7; + +} + +// For using protobuf-lite +option optimize_for = LITE_RUNTIME; + diff --git a/tools/pnnx/src/onnx.proto b/tools/pnnx/src/onnx-ml.proto similarity index 92% rename from tools/pnnx/src/onnx.proto rename to tools/pnnx/src/onnx-ml.proto index 15012ce65c3..5f4c0f4a4e2 100644 --- a/tools/pnnx/src/onnx.proto +++ b/tools/pnnx/src/onnx-ml.proto @@ -24,6 +24,8 @@ package onnx; // // The normative semantic specification of the ONNX IR is found in docs/IR.md. // Definitions of the built-in neural network operators may be found in docs/Operators.md. +// Definitions of the built-in classical machine learning operators may be found in +// docs/Operators-ml.md. // Notes // @@ -106,7 +108,11 @@ enum Version { // IR VERSION 9 published on May 5, 2023 // Added AttributeProto to FunctionProto so that default attribute values can be set. 
// Added FLOAT8E4M3FN, FLOAT8E4M3FNUZ, FLOAT8E5M2, FLOAT8E5M2FNUZ. - IR_VERSION = 0x0000000000000009; + IR_VERSION_2023_5_5 = 0x0000000000000009; + + // IR VERSION 10 published on TBD + // Added UINT4, INT4. + IR_VERSION = 0x000000000000000A; } // Attributes @@ -190,6 +196,8 @@ message ValueInfoProto { optional TypeProto type = 2; // A human-readable documentation for this value. Markdown is allowed. optional string doc_string = 3; + // Named metadata values; keys should be distinct. + repeated StringStringEntryProto metadata_props = 4; } // Nodes @@ -211,12 +219,17 @@ message NodeProto { optional string op_type = 4; // namespace Operator // The domain of the OperatorSet that specifies the operator named by op_type. optional string domain = 7; // namespace Domain + // Overload identifier, used only to map this to a model-local function. + optional string overload = 8; // Additional named attributes. repeated AttributeProto attribute = 5; // A human-readable documentation for this node. Markdown is allowed. optional string doc_string = 6; + + // Named metadata values; keys should be distinct. + repeated StringStringEntryProto metadata_props = 9; } // Training information @@ -401,7 +414,7 @@ message ModelProto { // A list of function protos local to the model. // - // Name of the function "FunctionProto.name" should be unique within the domain "FunctionProto.domain". + // The (domain, name, overload) tuple must be unique across the function protos in this list. // In case of any conflicts the behavior (whether the model local functions are given higher priority, // or standard operator sets are given higher priotity or this is treated as error) is defined by // the runtimes. @@ -475,6 +488,9 @@ message GraphProto { // which means, tensor 'a_scale' and tensor 'a_zero_point' are scale and zero point of tensor 'a' in the model. repeated TensorAnnotation quantization_annotation = 14; + // Named metadata values; keys should be distinct. + repeated StringStringEntryProto metadata_props = 16; + reserved 3, 4, 6 to 9; reserved "ir_version", "producer_version", "producer_tag", "domain"; } @@ -520,7 +536,11 @@ message TensorProto { FLOAT8E4M3FN = 17; // float 8, mostly used for coefficients, supports nan, not inf FLOAT8E4M3FNUZ = 18; // float 8, mostly used for coefficients, supports nan, not inf, no negative zero FLOAT8E5M2 = 19; // follows IEEE 754, supports nan, inf, mostly used for gradients - FLOAT8E5M2FNUZ = 20; // follows IEEE 754, supports nan, inf, mostly used for gradients, no negative zero + FLOAT8E5M2FNUZ = 20; // follows IEEE 754, supports nan, not inf, mostly used for gradients, no negative zero + + // 4-bit data-types + UINT4 = 21; // Unsigned integer in range [0, 15] + INT4 = 22; // Signed integer in range [-8, 7], using two's-complement representation // Future extensions go here. } @@ -555,11 +575,13 @@ message TensorProto { // When this field is present, the data_type field MUST be FLOAT or COMPLEX64. repeated float float_data = 4 [packed = true]; - // For int32, uint8, int8, uint16, int16, bool, float8, and float16 values + // For int32, uint8, int8, uint16, int16, uint4, int4, bool, float8 and float16 values // float16 and float8 values must be bit-wise converted to an uint16_t prior // to writing to the buffer. + // uint4 and int4 values must be packed to 4bitx2 prior to writing to the buffer, the first element is stored in + // the 4 LSB and the second element is stored in the 4 MSB. 
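The 4bitx2 packing rule just described (first element in the 4 LSB, second in the 4 MSB, two's complement for INT4) is easy to get wrong, so a small numpy sketch of the layout may help; pack_int4 is an illustrative helper name, not code from this patch.

    import numpy as np

    def pack_int4(values):
        # Keep the low nibble of each value (two's complement for negatives),
        # then pair them up: element 0 -> 4 LSB, element 1 -> 4 MSB.
        v = np.asarray(values, dtype=np.int64) & 0xF
        if v.size % 2:
            v = np.append(v, 0)  # pad an odd-length tail with zero
        return (v[0::2] | (v[1::2] << 4)).astype(np.uint8)

    packed = pack_int4([-8, 7, 3, -1])
    assert list(packed) == [0x78, 0xF3]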
// When this field is present, the data_type field MUST be - // INT32, INT16, INT8, UINT16, UINT8, BOOL, FLOAT16, BFLOAT16, FLOAT8E4M3FN, FLOAT8E4M3FNUZ, FLOAT8E5M2, FLOAT8E5M2FNUZ + // INT32, INT16, INT8, INT4, UINT16, UINT8, UINT4, BOOL, FLOAT16, BFLOAT16, FLOAT8E4M3FN, FLOAT8E4M3FNUZ, FLOAT8E5M2, FLOAT8E5M2FNUZ repeated int32 int32_data = 5 [packed = true]; // For strings. @@ -589,6 +611,7 @@ message TensorProto { // Complex64 elements must be written as two consecutive FLOAT values, real component first. // Complex128 elements must be written as two consecutive DOUBLE values, real component first. // Boolean type MUST be written one byte per tensor element (00000001 for true, 00000000 for false). + // uint4 and int4 values must be packed to 4bitx2, the first element is stored in the 4 LSB and the second element is stored in the 4 MSB. // // Note: the advantage of specific field rather than the raw_data field is // that in some cases (e.g. int data), protobuf does a better packing via @@ -631,6 +654,9 @@ message TensorProto { // When this field is present, the data_type field MUST be // UINT32 or UINT64 repeated uint64 uint64_data = 11 [packed = true]; + + // Named metadata values; keys should be distinct. + repeated StringStringEntryProto metadata_props = 16; } // A serialized sparse-tensor value @@ -724,6 +750,17 @@ message TypeProto { } + message Opaque { + // When missing, the domain is the same as the model's. + optional string domain = 1; + // The name is optional but significant when provided. + optional string name = 2; + // parameters that help defining the type + // DEPRECATED do not use. + // repeated TypeProto parameters = 3; + } + + oneof value { // The type of a tensor. Tensor tensor_type = 1; @@ -746,6 +783,9 @@ message TypeProto { // Type of the sparse tensor SparseTensor sparse_tensor_type = 8; + + Opaque opaque_type = 7; + } // An optional denotation can be used to denote the whole @@ -777,9 +817,8 @@ enum OperatorStatus { } message FunctionProto { - // The name of the function, similar usage of op_type in OperatorProto. - // Combined with FunctionProto.domain, this forms the unique identity of - // the FunctionProto. + // The name of the function, similar to op_type in NodeProto. + // This is part of the unique-id (domain, name, overload) of FunctionProtos in a model. optional string name = 1; // Deprecated since IR Version 8 @@ -826,9 +865,22 @@ message FunctionProto { repeated OperatorSetIdProto opset_import = 9; - // The domain which this function belongs to. Combined with FunctionProto.name, this forms the unique identity of - // the FunctionProto. + // The domain which this function belongs to. + // This is part of the unique-id (domain, name, overload) of FunctionProtos in a model. optional string domain = 10; + + // The overload identifier of the function. + // This is part of the unique-id (domain, name, overload) of FunctionProtos in a model. + optional string overload = 13; + + // Information for the values in the function. The ValueInfoProto.name's + // must be distinct and refer to names in the function (including inputs, + // outputs, and intermediate values). It is optional for a value to appear + // in value_info list. + repeated ValueInfoProto value_info = 12; + + // Named metadata values; keys should be distinct. 
+ repeated StringStringEntryProto metadata_props = 14; } // For using protobuf-lite diff --git a/tools/pnnx/src/onnx-operators-ml.proto b/tools/pnnx/src/onnx-operators-ml.proto new file mode 100644 index 00000000000..de62706f5cb --- /dev/null +++ b/tools/pnnx/src/onnx-operators-ml.proto @@ -0,0 +1,136 @@ +// +// WARNING: This file is automatically generated! Please edit onnx.in.proto. +// + + +// Copyright (c) ONNX Project Contributors. +// Licensed under the Apache-2.0 license. + +syntax = "proto2"; + +package onnx; +import "onnx-ml.proto"; + +// +// This file contains the proto definitions for OperatorSetProto and +// OperatorProto. OperatorSetProtos are used to describe a versioned +// set of operators that can be used by a ModelProto. +// +// Like ModelProto, OperatorSetProto is defined as a top-level file/wire +// format, however their usage is different. +// +// ModelProto files are used to describe executable graphs that can be +// executed directly by a framework, runtime, or engine. +// +// OperatorSetProto files are used to describe a set of operators that are +// available in a given environment. The file TBD.TBD is the OperatorSetProto +// that describes the ONNX standard operators. +// + +// An OperatorProto represents the immutable specification of the signature +// and semantics of an operator. +// +// Operators are declared as part of an OperatorSet, which also defines the +// domain name for the set. +// +// Operators are uniquely identified by a three part identifier +// (domain, op_type, since_version) +// where +// *domain* is the domain of an operator set that +// contains this operator specification. +// +// *op_type* is the name of the operator as referenced by a +// NodeProto.op_type +// +// *since_version* is the version of the operator set that +// this operator was initially declared in. +// +message OperatorProto { + // The name of the operator within a domain. + // This field MUST be present in this version of the IR. + optional string op_type = 1; + + // The version of the operator set that first introduced this + // operator. This value MUST be the same value as the + // opset_version of the operator set that first published this operator. + // Subsequent versions of the operator set MUST NOT alter the signature + // or semantics of the operator once published as STABLE. + // This field MUST be present in this version of the IR. + optional int64 since_version = 2; + + // This field indicates whether the syntax, semantics, or presence + // of this operator is in an experimental or stable stage. Once an + // operator is published as STABLE, it's syntax and semantics MUST NOT + // change in subsequent versions of the operator set. + // When an operator is published as EXPERIMENTAL, the syntax and semantics + // of the operator MAY change across operator set versions. + // Operators "become" stable by deprecating the experimental version and + // introducing a new stable operator with the same op_type. + optional OperatorStatus status = 3; + + // Eventually we will declare the signature of the operator here + + // A human-readable documentation for this operator. Markdown is allowed. + optional string doc_string = 10; +} + +// An OperatorSetProto represents an immutable set of immutable operator +// specifications. +// +// The domain of the set (OperatorSetProto.domain) is a reverse-DNS name +// that disambiguates operator sets defined by independent entities. 
+// +// The version of the set (opset_version) is a monotonically increasing +// integer that indicates changes to the membership of the operator set. +// +// +// Operator sets are uniquely identified by a two part identifier (domain, opset_version) +// +// Like ModelProto, OperatorSetProto is intended as a top-level file/wire format, +// and thus has the standard format headers in addition to the operator set information. +// +message OperatorSetProto { + // All OperatorSetProtos start with a distingushed byte sequence to disambiguate + // protobuf files containing OperatorSets from other content. + // This field MUST be "ONNXOPSET" + // This field MUST be present in this version of the IR + optional string magic = 1; + + // All OperatorSetProtos indicate the version of the IR syntax and semantics + // they adhere to. It is always IR_VERSION. + // This field MUST be present in this version of the IR + optional int64 ir_version = 2; + + // The prerelease component of the SemVer of the IR. + // This field MAY be absent in this version of the IR + optional string ir_version_prerelease = 3; + + // The build metadata component of the SemVer of the IR. + // This field MAY be absent in this version of the IR + optional string ir_build_metadata = 7; + + // Domain name of the operator set, in reverse DNS form (e.g., com.acme.dnnops). + optional string domain = 4; + + // The version of the set of operators. This is a simple int value + // that is monotonically increasing as new versions of the operator set + // are published. All operators in this set MUST have since_version + // <= opset_version. + optional int64 opset_version = 5; + + // A human-readable documentation for this set of operators. Markdown is allowed. + optional string doc_string = 6; + + // The operators specified by this operator set. + // The (name, version) MUST be unique across all OperatorProtos in operator + repeated OperatorProto operator = 8; + + // The functions specified by this operator set. 
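For orientation, the two-part (domain, opset_version) identifier described above is the same pair a ModelProto records in its opset_import list to pin the operator sets it depends on. A hedged sketch with the onnx Python helpers; the domain string and version numbers are arbitrary examples.

    import onnx
    from onnx import helper

    # An empty graph is enough to show how a model references operator sets.
    graph = helper.make_graph(nodes=[], name="empty", inputs=[], outputs=[])
    model = helper.make_model(graph, opset_imports=[
        helper.make_opsetid("", 19),                # default ONNX domain
        helper.make_opsetid("com.acme.dnnops", 1),  # custom reverse-DNS domain
    ])
    print(model.opset_import)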
+ // The (name, version) MUST be unique across all OperatorProtos/FunctionProtos in operator/functions + repeated FunctionProto functions = 9; +} + + +// For using protobuf-lite +option optimize_for = LITE_RUNTIME; + diff --git a/tools/pnnx/src/pass_level5/fuse_multiheadattention.cpp b/tools/pnnx/src/pass_level5/fuse_multiheadattention.cpp index 2a9f3b837b1..b6297eb8a92 100644 --- a/tools/pnnx/src/pass_level5/fuse_multiheadattention.cpp +++ b/tools/pnnx/src/pass_level5/fuse_multiheadattention.cpp @@ -1734,6 +1734,64 @@ pnnx.Output output 1 0 out } }; +class fuse_multiheadattention_pass_onnx_1_2 : public fuse_multiheadattention_pass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +21 20 +pnnx.Input input_q 0 1 input +nn.Linear op_0 1 1 input 14 bias=%qkvbias in_features=%embed_dim out_features=%qkv_out_features @bias @weight +Tensor.reshape op_1 1 1 14 15 shape=(%batch,%size,1,3,%embed_dim) +Tensor.permute op_2 1 1 15 16 dims=(3,1,2,0,4) +torch.squeeze op_3 1 1 16 17 dim=3 +torch.unbind op_4 1 3 17 18 19 20 dim=0 +Tensor.reshape op_5 1 1 18 21 shape=(%size,%num_heads,%feat_per_head) +Tensor.reshape op_6 1 1 19 23 shape=(%size,%num_heads,%feat_per_head) +Tensor.reshape op_7 1 1 20 25 shape=(%size,%num_heads,%feat_per_head) +Tensor.permute op_8 1 1 21 22 dims=(1,0,2) +Tensor.permute op_9 1 1 23 24 dims=(1,0,2) +Tensor.permute op_10 1 1 25 26 dims=(1,0,2) +Tensor.reshape op_11 1 1 22 27 shape=(%batch,%num_heads,%size,%feat_per_head) +Tensor.reshape op_12 1 1 24 28 shape=(%batch,%num_heads,%size,%feat_per_head) +Tensor.reshape op_13 1 1 26 29 shape=(%batch,%num_heads,%size,%feat_per_head) +F.scaled_dot_product_attention op_14 3 1 27 28 29 35 dropout_p=0.000000e+00 is_causal=False +Tensor.permute op_15 1 1 35 36 dims=(2,0,1,3) +Tensor.reshape op_16 1 1 36 37 shape=(%size,%embed_dim) +nn.Linear out_proj 1 1 37 38 bias=%outbias in_features=%embed_dim out_features=%embed_dim @bias @weight +Tensor.reshape op_18 1 1 38 out shape=(%size,%batch,%embed_dim) +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* replace_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +nn.MultiheadAttention attention 1 1 input out embed_dim=%embed_dim kdim=%embed_dim vdim=%embed_dim num_heads=%num_heads batch_first=False add_zero_attn=False add_bias_kv=False +pnnx.Output output 1 0 out +)PNNXIR"; + } + + bool match(const std::map<std::string, const Operator*>& matched_operators, const std::map<std::string, Parameter>& captured_params, const std::map<std::string, Attribute>& /*captured_attrs*/) const + { + const int embed_dim = captured_params.at("embed_dim").i; + const int qkv_out_features = captured_params.at("qkv_out_features").i; + const int num_heads = captured_params.at("num_heads").i; + const int feat_per_head = captured_params.at("feat_per_head").i; + + if (qkv_out_features != embed_dim * 3) + return false; + + if (embed_dim != num_heads * feat_per_head) + return false; + + return true; + } +}; + class fuse_multiheadattention_pass_onnx_2 : public fuse_multiheadattention_pass { public: @@ -2048,6 +2106,7 @@ void fuse_multiheadattention(Graph& graph) fuse_multiheadattention_pass_onnx onnx0; fuse_multiheadattention_pass_onnx_1 onnx1; fuse_multiheadattention_pass_onnx_1_1 onnx1a; + fuse_multiheadattention_pass_onnx_1_2 onnx1b; fuse_multiheadattention_pass_onnx_2 onnx2; fuse_multiheadattention_pass_onnx_3 onnx3; fuse_multiheadattention_pass_onnx_4 onnx4; @@ -2087,6 +2146,7 @@ pnnx_graph_rewrite(graph, &onnx0, opindex); pnnx_graph_rewrite(graph, &onnx1, opindex);
pnnx_graph_rewrite(graph, &onnx1a, opindex); + pnnx_graph_rewrite(graph, &onnx1b, opindex); pnnx_graph_rewrite(graph, &onnx2, opindex); pnnx_graph_rewrite(graph, &onnx3, opindex); pnnx_graph_rewrite(graph, &onnx4, opindex); diff --git a/tools/pnnx/src/pass_onnx.cpp b/tools/pnnx/src/pass_onnx.cpp index dd9194111fc..6318dacba25 100644 --- a/tools/pnnx/src/pass_onnx.cpp +++ b/tools/pnnx/src/pass_onnx.cpp @@ -14,7 +14,7 @@ #include "pass_onnx.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" #include #include diff --git a/tools/pnnx/src/pass_onnx/canonicalize.h b/tools/pnnx/src/pass_onnx/canonicalize.h index a24ad86a9fd..6ec55f2d140 100644 --- a/tools/pnnx/src/pass_onnx/canonicalize.h +++ b/tools/pnnx/src/pass_onnx/canonicalize.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/dead_code_elimination.h b/tools/pnnx/src/pass_onnx/dead_code_elimination.h index b890b6a7d7c..7d8b7e0d25d 100644 --- a/tools/pnnx/src/pass_onnx/dead_code_elimination.h +++ b/tools/pnnx/src/pass_onnx/dead_code_elimination.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/eliminate_noop.h b/tools/pnnx/src/pass_onnx/eliminate_noop.h index e465e398c0a..3325ae9cf10 100644 --- a/tools/pnnx/src/pass_onnx/eliminate_noop.h +++ b/tools/pnnx/src/pass_onnx/eliminate_noop.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/fold_constants.h b/tools/pnnx/src/pass_onnx/fold_constants.h index 98d6ef717ab..f165a96e177 100644 --- a/tools/pnnx/src/pass_onnx/fold_constants.h +++ b/tools/pnnx/src/pass_onnx/fold_constants.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.h b/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.h index ad6cf80007c..a90c089fee6 100644 --- a/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.h +++ b/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/inline_containers.h b/tools/pnnx/src/pass_onnx/inline_containers.h index 56b21f47b37..e3051c5e333 100644 --- a/tools/pnnx/src/pass_onnx/inline_containers.h +++ b/tools/pnnx/src/pass_onnx/inline_containers.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/inline_if_graph.h b/tools/pnnx/src/pass_onnx/inline_if_graph.h index c84b5761ac5..e9c1c2f0ee8 100644 --- a/tools/pnnx/src/pass_onnx/inline_if_graph.h +++ b/tools/pnnx/src/pass_onnx/inline_if_graph.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/model_stat.h b/tools/pnnx/src/pass_onnx/model_stat.h index dd62e67a1bc..993630b1b4b 100644 --- a/tools/pnnx/src/pass_onnx/model_stat.h +++ b/tools/pnnx/src/pass_onnx/model_stat.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_AdaptiveAvgPool2d.cpp b/tools/pnnx/src/pass_onnx/nn_AdaptiveAvgPool2d.cpp index 0e8851f05f2..21cf6076d2d 100644 --- a/tools/pnnx/src/pass_onnx/nn_AdaptiveAvgPool2d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_AdaptiveAvgPool2d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_AdaptiveAvgPool3d.cpp b/tools/pnnx/src/pass_onnx/nn_AdaptiveAvgPool3d.cpp index 070981e1d64..a8e3e96be6b 100644 --- a/tools/pnnx/src/pass_onnx/nn_AdaptiveAvgPool3d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_AdaptiveAvgPool3d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_AvgPool2d.cpp b/tools/pnnx/src/pass_onnx/nn_AvgPool2d.cpp index 5a006fe3709..6f5be930e64 100644 --- a/tools/pnnx/src/pass_onnx/nn_AvgPool2d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_AvgPool2d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_AvgPool3d.cpp b/tools/pnnx/src/pass_onnx/nn_AvgPool3d.cpp index ff2a5dd8aad..9fdcfdd72d6 100644 --- a/tools/pnnx/src/pass_onnx/nn_AvgPool3d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_AvgPool3d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_BatchNorm2d.cpp b/tools/pnnx/src/pass_onnx/nn_BatchNorm2d.cpp index c3639904d47..96448c0f25c 100644 --- a/tools/pnnx/src/pass_onnx/nn_BatchNorm2d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_BatchNorm2d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_BatchNorm3d.cpp b/tools/pnnx/src/pass_onnx/nn_BatchNorm3d.cpp index 0f9405f160a..afac686a22a 100644 --- a/tools/pnnx/src/pass_onnx/nn_BatchNorm3d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_BatchNorm3d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_Conv2d.cpp b/tools/pnnx/src/pass_onnx/nn_Conv2d.cpp index c9aeac561ac..2cd6b7dd750 100644 --- a/tools/pnnx/src/pass_onnx/nn_Conv2d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_Conv2d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git 
a/tools/pnnx/src/pass_onnx/nn_Conv3d.cpp b/tools/pnnx/src/pass_onnx/nn_Conv3d.cpp index 6413685fcb5..f90c23cbb6a 100644 --- a/tools/pnnx/src/pass_onnx/nn_Conv3d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_Conv3d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_GELU.cpp b/tools/pnnx/src/pass_onnx/nn_GELU.cpp index f5b7000e017..22d2823673a 100644 --- a/tools/pnnx/src/pass_onnx/nn_GELU.cpp +++ b/tools/pnnx/src/pass_onnx/nn_GELU.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_LayerNorm.cpp b/tools/pnnx/src/pass_onnx/nn_LayerNorm.cpp index f4ecf289557..fece12e2bce 100644 --- a/tools/pnnx/src/pass_onnx/nn_LayerNorm.cpp +++ b/tools/pnnx/src/pass_onnx/nn_LayerNorm.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_Linear.cpp b/tools/pnnx/src/pass_onnx/nn_Linear.cpp index 4dce81908b2..0515a8ea454 100644 --- a/tools/pnnx/src/pass_onnx/nn_Linear.cpp +++ b/tools/pnnx/src/pass_onnx/nn_Linear.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_MaxPool2d.cpp b/tools/pnnx/src/pass_onnx/nn_MaxPool2d.cpp index 47924bd33fc..518abd434b0 100644 --- a/tools/pnnx/src/pass_onnx/nn_MaxPool2d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_MaxPool2d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_MaxPool3d.cpp b/tools/pnnx/src/pass_onnx/nn_MaxPool3d.cpp index c8c467f5ba2..04de8bd104a 100644 --- a/tools/pnnx/src/pass_onnx/nn_MaxPool3d.cpp +++ b/tools/pnnx/src/pass_onnx/nn_MaxPool3d.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/nn_MultiheadAttention.cpp b/tools/pnnx/src/pass_onnx/nn_MultiheadAttention.cpp index a29ec9d9306..df1bd092273 100644 --- a/tools/pnnx/src/pass_onnx/nn_MultiheadAttention.cpp +++ b/tools/pnnx/src/pass_onnx/nn_MultiheadAttention.cpp @@ -15,7 +15,7 @@ #include "pass_onnx.h" #include "ir.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/pass_onnx/shape_inference.h b/tools/pnnx/src/pass_onnx/shape_inference.h index b4cd657bb81..b484d5265ca 100644 --- a/tools/pnnx/src/pass_onnx/shape_inference.h +++ b/tools/pnnx/src/pass_onnx/shape_inference.h @@ -12,7 +12,7 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-#include "onnx.pb.h" +#include "onnx-ml.pb.h" namespace pnnx { diff --git a/tools/pnnx/src/save_onnx.cpp b/tools/pnnx/src/save_onnx.cpp index 3406c730b2d..3ef3a772a2f 100644 --- a/tools/pnnx/src/save_onnx.cpp +++ b/tools/pnnx/src/save_onnx.cpp @@ -14,7 +14,7 @@ #include "save_onnx.h" -#include "onnx.pb.h" +#include "onnx-ml.pb.h" #include #include diff --git a/tools/pnnx/tests/onnx/test_nn_ReLU.py b/tools/pnnx/tests/onnx/test_nn_ReLU.py index d381fb5bc0e..8230e3f4827 100644 --- a/tools/pnnx/tests/onnx/test_nn_ReLU.py +++ b/tools/pnnx/tests/onnx/test_nn_ReLU.py @@ -61,7 +61,7 @@ def test(): if not torch.allclose(a0, b0, 1e-4, 1e-4): return False - if version.parse(torch.__version__) < version.parse('2.4'): + if version.parse(torch.__version__) < version.parse('2.5'): return True # export dynamo onnx diff --git a/tools/pnnx/tests/onnx/test_squeezenet1_1.py b/tools/pnnx/tests/onnx/test_squeezenet1_1.py index f5f5f4a668a..28c7df8fb81 100644 --- a/tools/pnnx/tests/onnx/test_squeezenet1_1.py +++ b/tools/pnnx/tests/onnx/test_squeezenet1_1.py @@ -39,7 +39,7 @@ def test(): if not torch.allclose(a, b, 1e-4, 1e-4): return False - if version.parse(torch.__version__) < version.parse('2.4'): + if version.parse(torch.__version__) < version.parse('2.5'): return True # export dynamo onnx diff --git a/tools/pnnx/tests/onnx/test_swin_t.py b/tools/pnnx/tests/onnx/test_swin_t.py index be25520d0bc..6361d20c911 100644 --- a/tools/pnnx/tests/onnx/test_swin_t.py +++ b/tools/pnnx/tests/onnx/test_swin_t.py @@ -43,7 +43,7 @@ def test(): if not torch.allclose(a, b, 1e-4, 1e-4): return False - if version.parse(torch.__version__) < version.parse('2.4'): + if version.parse(torch.__version__) < version.parse('2.5'): return True # export dynamo onnx diff --git a/tools/pnnx/tests/onnx/test_vit_b_32.py b/tools/pnnx/tests/onnx/test_vit_b_32.py index ecb0bd350f6..3c92a119406 100644 --- a/tools/pnnx/tests/onnx/test_vit_b_32.py +++ b/tools/pnnx/tests/onnx/test_vit_b_32.py @@ -46,7 +46,7 @@ def test(): if not torch.allclose(a, b, 1e-4, 1e-4): return False - if version.parse(torch.__version__) < version.parse('2.4'): + if version.parse(torch.__version__) < version.parse('2.5'): return True # export dynamo onnx From f3cd4c2e917ad264f73c600af9e5c6801af08608 Mon Sep 17 00:00:00 2001 From: nihui Date: Mon, 12 Aug 2024 15:53:19 +0800 Subject: [PATCH 16/38] pnnx2ncnn handle F.maxpool without dilation param (#5622) --- tools/pnnx/src/pass_ncnn/F_max_pool1d.cpp | 16 ++++++++++++++++ tools/pnnx/src/pass_ncnn/F_max_pool2d.cpp | 16 ++++++++++++++++ tools/pnnx/src/pass_ncnn/F_max_pool3d.cpp | 16 ++++++++++++++++ 3 files changed, 48 insertions(+) diff --git a/tools/pnnx/src/pass_ncnn/F_max_pool1d.cpp b/tools/pnnx/src/pass_ncnn/F_max_pool1d.cpp index 1d9ca98e03d..aaef7db2d74 100644 --- a/tools/pnnx/src/pass_ncnn/F_max_pool1d.cpp +++ b/tools/pnnx/src/pass_ncnn/F_max_pool1d.cpp @@ -63,6 +63,22 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_max_pool1d, 20) +class F_max_pool1d_1 : public F_max_pool1d +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +F.max_pool1d op_0 1 1 input out kernel_size=%kernel_size stride=%stride padding=%padding ceil_mode=%ceil_mode return_indices=False +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_max_pool1d_1, 20) + } // namespace ncnn } // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/F_max_pool2d.cpp b/tools/pnnx/src/pass_ncnn/F_max_pool2d.cpp index 
ba5a52f4f7d..3519c8a022b 100644 --- a/tools/pnnx/src/pass_ncnn/F_max_pool2d.cpp +++ b/tools/pnnx/src/pass_ncnn/F_max_pool2d.cpp @@ -66,6 +66,22 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_max_pool2d, 20) +class F_max_pool2d_1 : public F_max_pool2d +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +F.max_pool2d op_0 1 1 input out kernel_size=%kernel_size stride=%stride padding=%padding ceil_mode=%ceil_mode return_indices=False +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_max_pool2d_1, 20) + } // namespace ncnn } // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/F_max_pool3d.cpp b/tools/pnnx/src/pass_ncnn/F_max_pool3d.cpp index 5476907fa88..2caede16a29 100644 --- a/tools/pnnx/src/pass_ncnn/F_max_pool3d.cpp +++ b/tools/pnnx/src/pass_ncnn/F_max_pool3d.cpp @@ -69,6 +69,22 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_max_pool3d, 20) +class F_max_pool3d_1 : public F_max_pool3d +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +F.max_pool3d op_0 1 1 input out kernel_size=%kernel_size stride=%stride padding=%padding ceil_mode=%ceil_mode return_indices=False +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_max_pool3d_1, 20) + } // namespace ncnn } // namespace pnnx From ecfd88a11bdf6480c0496564c6392463997429fc Mon Sep 17 00:00:00 2001 From: nihui Date: Mon, 12 Aug 2024 19:33:39 +0800 Subject: [PATCH 17/38] pnnx2ncnn convert torch.roll with one or two shifts (#5623) --- tools/pnnx/src/CMakeLists.txt | 1 + tools/pnnx/src/pass_ncnn/torch_roll.cpp | 193 +++++++++++++++++++++++ tools/pnnx/tests/CMakeLists.txt | 1 + tools/pnnx/tests/ncnn/CMakeLists.txt | 1 + tools/pnnx/tests/ncnn/test_torch_roll.py | 64 ++++++++ tools/pnnx/tests/onnx/CMakeLists.txt | 1 + tools/pnnx/tests/onnx/test_torch_roll.py | 64 ++++++++ tools/pnnx/tests/test_torch_roll.py | 61 +++++++ 8 files changed, 386 insertions(+) create mode 100644 tools/pnnx/src/pass_ncnn/torch_roll.cpp create mode 100644 tools/pnnx/tests/ncnn/test_torch_roll.py create mode 100644 tools/pnnx/tests/onnx/test_torch_roll.py create mode 100644 tools/pnnx/tests/test_torch_roll.py diff --git a/tools/pnnx/src/CMakeLists.txt b/tools/pnnx/src/CMakeLists.txt index 27dfdef52f8..c5c6228dee7 100644 --- a/tools/pnnx/src/CMakeLists.txt +++ b/tools/pnnx/src/CMakeLists.txt @@ -572,6 +572,7 @@ set(pnnx_pass_ncnn_SRCS pass_ncnn/torch_mm.cpp pass_ncnn/torch_norm.cpp pass_ncnn/torch_prod.cpp + pass_ncnn/torch_roll.cpp pass_ncnn/torch_slice_scatter.cpp pass_ncnn/torch_squeeze.cpp pass_ncnn/torch_sum.cpp diff --git a/tools/pnnx/src/pass_ncnn/torch_roll.cpp b/tools/pnnx/src/pass_ncnn/torch_roll.cpp new file mode 100644 index 00000000000..c7c29593333 --- /dev/null +++ b/tools/pnnx/src/pass_ncnn/torch_roll.cpp @@ -0,0 +1,193 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "pass_ncnn.h" + +namespace pnnx { + +namespace ncnn { + +class torch_roll : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +torch.roll op_0 1 1 input out dims=%dims shifts=%shifts +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* replace_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +Slice slice 1 2 input a b +Concat concat 2 1 b a out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + bool match(const std::map<std::string, Parameter>& captured_params) const + { + if (captured_params.at("dims").type != 5) + return false; + + if (captured_params.at("dims").ai.size() != 1) + return false; + + if (captured_params.at("shifts").type != 5) + return false; + + if (captured_params.at("shifts").ai.size() != 1) + return false; + + return true; + } + + void write(const std::map<std::string, Operator*>& ops, const std::map<std::string, Parameter>& captured_params, const std::map<std::string, Attribute>& captured_attrs) const + { + GraphRewriterPass::write(ops, captured_params, captured_attrs); + + const Operand* in = ops.at("slice")->inputs[0]; + + const int batch_index = in->params.at("__batch_index").i; + + int axis = captured_params.at("dims").ai[0]; + if (axis == batch_index) + { + fprintf(stderr, "roll along batch axis %d is not supported\n", batch_index); + } + + if (axis < 0) + { + int input_rank = in->shape.size(); + axis = input_rank + axis; + } + + if (axis > batch_index) + axis -= 1; + + ops.at("slice")->params["1"] = axis; + + ops.at("concat")->params["0"] = axis; + + const int shift = captured_params.at("shifts").ai[0]; + ops.at("slice")->params["2"] = std::vector<int>{-shift}; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(torch_roll, 20) + +class torch_roll_1 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +torch.roll op_0 1 1 input out dims=%dims shifts=%shifts +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* replace_pattern_graph() const + { + return R"PNNXIR(7767517 +8 7 +pnnx.Input input 0 1 input +Slice slice 1 2 input a b +Slice slice_a 1 2 a a0 a1 +Slice slice_b 1 2 b b0 b1 +Concat concat_a 2 1 a1 a0 a10 +Concat concat_b 2 1 b1 b0 b10 +Concat concat 2 1 b10 a10 out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + bool match(const std::map<std::string, Parameter>& captured_params) const + { + if (captured_params.at("dims").type != 5) + return false; + + if (captured_params.at("dims").ai.size() != 2) + return false; + + if (captured_params.at("shifts").type != 5) + return false; + + if (captured_params.at("shifts").ai.size() != 2) + return false; + + return true; + } + + void write(const std::map<std::string, Operator*>& ops, const std::map<std::string, Parameter>& captured_params, const std::map<std::string, Attribute>& captured_attrs) const + { + GraphRewriterPass::write(ops, captured_params, captured_attrs); + + const Operand* in = ops.at("slice")->inputs[0]; + + const int batch_index = in->params.at("__batch_index").i; + + int axis0 = captured_params.at("dims").ai[0]; + int axis1 = captured_params.at("dims").ai[1]; + if (axis0 == batch_index || axis1 == batch_index) + { + fprintf(stderr, "roll along
batch axis %d is not supported\n", batch_index); + } + + if (axis0 < 0) + { + int input_rank = in->shape.size(); + axis0 = input_rank + axis0; + } + + if (axis0 > batch_index) + axis0 -= 1; + + if (axis1 < 0) + { + int input_rank = in->shape.size(); + axis1 = input_rank + axis1; + } + if (axis1 > batch_index) + axis1 -= 1; + + ops.at("slice")->params["1"] = axis0; + ops.at("slice_a")->params["1"] = axis1; + ops.at("slice_b")->params["1"] = axis1; + + ops.at("concat_a")->params["0"] = axis1; + ops.at("concat_b")->params["0"] = axis1; + ops.at("concat")->params["0"] = axis0; + + const int shift0 = captured_params.at("shifts").ai[0]; + const int shift1 = captured_params.at("shifts").ai[1]; + ops.at("slice")->params["2"] = std::vector<int>{-shift0}; + ops.at("slice_a")->params["2"] = std::vector<int>{-shift1}; + ops.at("slice_b")->params["2"] = std::vector<int>{-shift1}; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(torch_roll_1, 20) + +} // namespace ncnn + +} // namespace pnnx diff --git a/tools/pnnx/tests/CMakeLists.txt b/tools/pnnx/tests/CMakeLists.txt index 7bbf1c6ea9c..a5522a70bb2 100644 --- a/tools/pnnx/tests/CMakeLists.txt +++ b/tools/pnnx/tests/CMakeLists.txt @@ -234,6 +234,7 @@ pnnx_add_test(torch_ones_like) pnnx_add_test(torch_positive) pnnx_add_test(torch_prod) pnnx_add_test(torch_repeat_interleave) +pnnx_add_test(torch_roll) pnnx_add_test(torch_scatter_add) pnnx_add_test(torch_slice_scatter) pnnx_add_test(torch_sum) diff --git a/tools/pnnx/tests/ncnn/CMakeLists.txt b/tools/pnnx/tests/ncnn/CMakeLists.txt index a682e42835b..a60e63eb54b 100644 --- a/tools/pnnx/tests/ncnn/CMakeLists.txt +++ b/tools/pnnx/tests/ncnn/CMakeLists.txt @@ -162,6 +162,7 @@ pnnx_ncnn_add_test(torch_min) pnnx_ncnn_add_test(torch_mm) pnnx_ncnn_add_test(torch_norm) pnnx_ncnn_add_test(torch_prod) +pnnx_ncnn_add_test(torch_roll) pnnx_ncnn_add_test(torch_slice_scatter) pnnx_ncnn_add_test(torch_sum) pnnx_ncnn_add_test(torch_squeeze) diff --git a/tools/pnnx/tests/ncnn/test_torch_roll.py b/tools/pnnx/tests/ncnn/test_torch_roll.py new file mode 100644 index 00000000000..6412ee6ba60 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_roll.py @@ -0,0 +1,64 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License.
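The two torch_roll passes above rewrite torch.roll as an ncnn Slice that splits the tensor at -shift, followed by a Concat of the two pieces in swapped order, applied once per rolled axis. The equivalence is easy to check in PyTorch; this is a verification sketch, not part of the patch, and roll_as_slice_concat is an illustrative name.

    import torch

    def roll_as_slice_concat(x, shift, dim):
        # Split x along dim at n - shift, then concatenate the two pieces in
        # swapped order; this mirrors the Slice(ends=[-shift]) + Concat rewrite.
        n = x.size(dim)
        shift = shift % n
        a = x.narrow(dim, 0, n - shift)
        b = x.narrow(dim, n - shift, shift)
        return torch.cat([b, a], dim=dim)

    x = torch.rand(5, 9, 11)
    assert torch.equal(roll_as_slice_concat(x, 3, 1), torch.roll(x, 3, 1))
    assert torch.equal(roll_as_slice_concat(x, -2, -1), torch.roll(x, -2, -1))

    # A two-axis roll, as handled by torch_roll_1, is just one roll per axis.
    z = torch.rand(8, 5, 9, 10)
    rolled = roll_as_slice_concat(roll_as_slice_concat(z, 2, 0), 1, 1)
    assert torch.equal(rolled, torch.roll(z, shifts=(2, 1), dims=(0, 1)))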
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.roll(x, 3, 1) + y = torch.roll(y, -2, -1) + z = torch.roll(z, shifts=(2,1), dims=(0,1)) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(5, 9, 11) + z = torch.rand(8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_roll.pt") + + # torchscript to ncnn + import os + os.system("../../src/pnnx test_torch_roll.pt inputshape=[3,16],[5,9,11],[8,5,9,10]") + + # ncnn inference + import test_torch_roll_ncnn + b = test_torch_roll_ncnn.test_inference() + + print(x) + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + print(a0) + print(b0) + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/onnx/CMakeLists.txt b/tools/pnnx/tests/onnx/CMakeLists.txt index f4756740a79..673fa0434d9 100644 --- a/tools/pnnx/tests/onnx/CMakeLists.txt +++ b/tools/pnnx/tests/onnx/CMakeLists.txt @@ -145,6 +145,7 @@ pnnx_onnx_add_test(torch_mean) pnnx_onnx_add_test(torch_min) pnnx_onnx_add_test(torch_minimum) pnnx_onnx_add_test(torch_prod) +pnnx_onnx_add_test(torch_roll) pnnx_onnx_add_test(torch_split) pnnx_onnx_add_test(torch_squeeze) pnnx_onnx_add_test(torch_stack) diff --git a/tools/pnnx/tests/onnx/test_torch_roll.py b/tools/pnnx/tests/onnx/test_torch_roll.py new file mode 100644 index 00000000000..06b8d579649 --- /dev/null +++ b/tools/pnnx/tests/onnx/test_torch_roll.py @@ -0,0 +1,64 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.roll(x, 3, -1) + y = torch.roll(y, -2, -1) + z = torch.roll(z, shifts=(2,1), dims=(0,1)) + return x, y, z + +def test(): + if version.parse(torch.__version__) < version.parse('1.10'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export onnx + torch.onnx.export(net, (x, y, z), "test_torch_roll.onnx") + + # onnx to pnnx + import os + os.system("../../src/pnnx test_torch_roll.onnx inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_roll_pnnx + b = test_torch_roll_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_roll.py b/tools/pnnx/tests/test_torch_roll.py new file mode 100644 index 00000000000..32e3bde38e1 --- /dev/null +++ b/tools/pnnx/tests/test_torch_roll.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.roll(x, 3) + y = torch.roll(y, -2, -1) + z = torch.roll(z, shifts=(2,1), dims=(0,1)) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_roll.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_roll.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_roll_pnnx + b = test_torch_roll_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) From ae17e5e177d92138c30daf8bf0b0f3345df49d2f Mon Sep 17 00:00:00 2001 From: nihui Date: Wed, 14 Aug 2024 10:43:36 +0800 Subject: [PATCH 18/38] ci release ubuntu2404, major release yml refactor (#5624) * release ubuntu 24.04 package, major release yml refactor * update macos vulkan sdk * set MACOSX_DEPLOYMENT_TARGET --- .github/workflows/release-python.yml | 18 +- .github/workflows/release.yml | 1762 ++++++-------------------- 2 files changed, 420 insertions(+), 1360 deletions(-) diff --git a/.github/workflows/release-python.yml b/.github/workflows/release-python.yml index 82bd5551fcd..6b6db4f0d2e 100644 --- a/.github/workflows/release-python.yml +++ b/.github/workflows/release-python.yml @@ -184,9 +184,9 @@ jobs: - name: vulkansdk for macos if: matrix.os == 'macos-13' run: | - wget https://sdk.lunarg.com/sdk/download/1.3.236.0/mac/vulkansdk-macos-1.3.236.0.dmg?Human=true -O vulkansdk-macos-1.3.236.0.dmg - hdiutil attach vulkansdk-macos-1.3.236.0.dmg - sudo /Volumes/vulkansdk-macos-1.3.236.0/InstallVulkan.app/Contents/MacOS/InstallVulkan --root $GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0 --accept-licenses --default-answer --confirm-command install + wget https://sdk.lunarg.com/sdk/download/1.3.290.0/mac/vulkansdk-macos-1.3.290.0.dmg?Human=true -O vulkansdk-macos-1.3.290.0.dmg + hdiutil attach vulkansdk-macos-1.3.290.0.dmg + sudo /Volumes/vulkansdk-macos-1.3.290.0/InstallVulkan.app/Contents/MacOS/InstallVulkan --root $GITHUB_WORKSPACE/vulkansdk-macos-1.3.290.0 --accept-licenses --default-answer --confirm-command install - name: Build wheels for macos x86_64 if: matrix.os == 'macos-13' && matrix.arch == 'x86_64' @@ -197,11 +197,12 @@ jobs: CIBW_BUILD_VERBOSITY: 1 CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=3 CMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/toolchains/ios.toolchain.cmake PLATFORM=MAC ARCHS="x86_64" - DEPLOYMENT_TARGET="10.9" ENABLE_BITCODE=OFF ENABLE_ARC=OFF ENABLE_VISIBILITY=OFF + DEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET ENABLE_BITCODE=OFF ENABLE_ARC=OFF ENABLE_VISIBILITY=OFF OpenMP_C_FLAGS="-Xclang -fopenmp" OpenMP_CXX_FLAGS="-Xclang -fopenmp" OpenMP_C_LIB_NAMES="libomp" OpenMP_CXX_LIB_NAMES="libomp" OpenMP_libomp_LIBRARY="libomp.a" - Vulkan_LIBRARY=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/dylib/macOS/libMoltenVK.dylib + Vulkan_LIBRARY=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.290.0/macOS/lib/libMoltenVK.dylib + MACOSX_DEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET with: output-dir: wheelhouse @@ -214,11 +215,12 @@ jobs: CIBW_BUILD_VERBOSITY: 1 CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=3 
CMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/toolchains/ios.toolchain.cmake PLATFORM=MAC_ARM64 ARCHS="arm64" - DEPLOYMENT_TARGET="11.0" ENABLE_BITCODE=OFF ENABLE_ARC=OFF ENABLE_VISIBILITY=OFF + DEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET ENABLE_BITCODE=OFF ENABLE_ARC=OFF ENABLE_VISIBILITY=OFF OpenMP_C_FLAGS="-Xclang -fopenmp" OpenMP_CXX_FLAGS="-Xclang -fopenmp" OpenMP_C_LIB_NAMES="libomp" OpenMP_CXX_LIB_NAMES="libomp" OpenMP_libomp_LIBRARY="libomp.a" - Vulkan_LIBRARY=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.236.0/MoltenVK/dylib/macOS/libMoltenVK.dylib + Vulkan_LIBRARY=$GITHUB_WORKSPACE/vulkansdk-macos-1.3.290.0/macOS/lib/libMoltenVK.dylib + MACOSX_DEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET with: output-dir: wheelhouse @@ -244,7 +246,7 @@ jobs: fail-fast: false matrix: arch: [aarch64, ppc64le, s390x] - build_cp: [cp36, cp37, cp38, cp39, cp310, cp311, cp312] + build_cp: [cp36, cp37, cp38, cp39, cp310, cp311, cp312, cp313] build_sub: [manylinux, musllinux] steps: diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 6309214e08f..2e875fc51e7 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -53,11 +53,20 @@ jobs: name: ${{ env.PACKAGENAME }} path: /tmp/${{ env.PACKAGENAME }}.zip - ubuntu-2004: + ubuntu: needs: [setup] - runs-on: ubuntu-20.04 + strategy: + matrix: + opt: + - { shared-lib: OFF, os: ubuntu-20.04, id: ubuntu-2004 } + - { shared-lib: OFF, os: ubuntu-22.04, id: ubuntu-2204 } + - { shared-lib: OFF, os: ubuntu-24.04, id: ubuntu-2404 } + - { shared-lib: ON, os: ubuntu-20.04, id: ubuntu-2004-shared } + - { shared-lib: ON, os: ubuntu-22.04, id: ubuntu-2204-shared } + - { shared-lib: ON, os: ubuntu-24.04, id: ubuntu-2404-shared } + runs-on: ${{ matrix.opt.os }} env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ubuntu-2004 + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} steps: - uses: actions/checkout@v4 with: @@ -69,71 +78,7 @@ jobs: run: | mkdir build && cd build cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j 2 - cmake --build . --target install/strip - - name: package - run: | - rm -rf ${{ env.PACKAGENAME }} - mkdir -p ${{ env.PACKAGENAME }} - cp -a build/install/* ${{ env.PACKAGENAME }} - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - ubuntu-2004-shared: - needs: [setup] - runs-on: ubuntu-20.04 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ubuntu-2004-shared - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: apt - run: | - sudo apt-get install -y libprotobuf-dev protobuf-compiler - - name: build - run: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . -j $(nproc) - cmake --build . 
--target install/strip - - name: package - run: | - rm -rf ${{ env.PACKAGENAME }} - mkdir -p ${{ env.PACKAGENAME }} - cp -a -P build/install/* ${{ env.PACKAGENAME }} - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - ubuntu-2204: - needs: [setup] - runs-on: ubuntu-22.04 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ubuntu-2204 - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: apt - run: | - sudo apt-get install -y libprotobuf-dev protobuf-compiler - - name: build - run: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. + -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=${{ matrix.opt.shared-lib }} .. cmake --build . -j $(nproc) cmake --build . --target install/strip - name: package @@ -149,38 +94,6 @@ jobs: name: ${{ env.PACKAGENAME }} path: ${{ env.PACKAGENAME }}.zip - ubuntu-2204-shared: - needs: [setup] - runs-on: ubuntu-22.04 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ubuntu-2204-shared - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: apt - run: | - sudo apt-get install -y libprotobuf-dev protobuf-compiler - - name: build - run: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . -j $(nproc) - cmake --build . 
--target install/strip - - name: package - run: | - rm -rf ${{ env.PACKAGENAME }} - mkdir -p ${{ env.PACKAGENAME }} - cp -a -P build/install/* ${{ env.PACKAGENAME }} - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - openmp-macos: runs-on: macos-13 env: @@ -255,85 +168,14 @@ jobs: macos: needs: [setup, openmp-macos] + strategy: + matrix: + opt: + - { vulkan: OFF, id: macos } + - { vulkan: ON, id: macos-vulkan } runs-on: macos-13 env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-macos - NCNN_CMAKE_OPTIONS: | - -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ - -DDEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET \ - -DENABLE_BITCODE=$ENABLE_BITCODE \ - -DENABLE_ARC=$ENABLE_ARC \ - -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ - -DCMAKE_INSTALL_PREFIX=install \ - -DCMAKE_BUILD_TYPE=Release \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="libomp.a" \ - -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_BUILD_TOOLS=OFF \ - -DNCNN_BUILD_EXAMPLES=OFF \ - -DNCNN_BUILD_BENCHMARK=OFF \ - - steps: - - uses: actions/checkout@v4 - - name: download-openmp-macos - uses: actions/download-artifact@v4 - with: - name: openmp-macos - path: openmp-macos - - name: install-openmp - run: | - sudo cp openmp-macos/include/* $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include - sudo cp openmp-macos/lib/libomp.a $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib - - name: build-x86_64 - run: | - mkdir build-x86_64 && cd build-x86_64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=MAC -DARCHS="x86_64" .. - cmake --build . -j 4 - cmake --build . --target install/strip - - name: build-arm64 - run: | - mkdir build-arm64 && cd build-arm64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=MAC_ARM64 -DARCHS="arm64" .. - cmake --build . -j 4 - cmake --build . 
--target install/strip - - name: package-openmp - run: | - rm -rf openmp.framework - mkdir -p openmp.framework/Versions/A/Headers - mkdir -p openmp.framework/Versions/A/Resources - ln -s A openmp.framework/Versions/Current - ln -s Versions/Current/Headers openmp.framework/Headers - ln -s Versions/Current/Resources openmp.framework/Resources - ln -s Versions/Current/openmp openmp.framework/openmp - cp openmp-macos/lib/libomp.a openmp.framework/Versions/A/openmp - cp -a openmp-macos/include/* openmp.framework/Versions/A/Headers/ - sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - - name: package - run: | - rm -rf ncnn.framework - mkdir -p ncnn.framework/Versions/A/Headers - mkdir -p ncnn.framework/Versions/A/Resources - ln -s A ncnn.framework/Versions/Current - ln -s Versions/Current/Headers ncnn.framework/Headers - ln -s Versions/Current/Resources ncnn.framework/Resources - ln -s Versions/Current/ncnn ncnn.framework/ncnn - lipo -create build-x86_64/install/lib/libncnn.a build-arm64/install/lib/libncnn.a -o ncnn.framework/Versions/A/ncnn - cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ - sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - macos-gpu: - needs: [setup, openmp-macos] - runs-on: macos-13 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-macos-vulkan + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET \ @@ -346,10 +188,10 @@ jobs: -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON \ -DNCNN_BUILD_TOOLS=OFF \ -DNCNN_BUILD_EXAMPLES=OFF \ -DNCNN_BUILD_BENCHMARK=OFF \ + -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ steps: - uses: actions/checkout@v4 @@ -389,6 +231,7 @@ jobs: cp -a openmp-macos/include/* openmp.framework/Versions/A/Headers/ sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - name: package-glslang + if: matrix.opt.vulkan == 'ON' run: | rm -rf glslang.framework mkdir -p glslang.framework/Versions/A/Headers @@ -397,12 +240,26 @@ jobs: ln -s Versions/Current/Headers glslang.framework/Headers ln -s Versions/Current/Resources glslang.framework/Resources ln -s Versions/Current/glslang glslang.framework/glslang - libtool -static build-x86_64/install/lib/libglslang.a build-x86_64/install/lib/libMachineIndependent.a build-x86_64/install/lib/libGenericCodeGen.a build-x86_64/install/lib/libSPIRV.a build-x86_64/install/lib/libOGLCompiler.a build-x86_64/install/lib/libOSDependent.a -o build-x86_64/install/lib/libglslang_combined.a - libtool -static build-arm64/install/lib/libglslang.a build-arm64/install/lib/libMachineIndependent.a build-arm64/install/lib/libGenericCodeGen.a build-arm64/install/lib/libSPIRV.a build-arm64/install/lib/libOGLCompiler.a build-arm64/install/lib/libOSDependent.a -o 
build-arm64/install/lib/libglslang_combined.a + libtool -static \ + build-x86_64/install/lib/libglslang.a \ + build-x86_64/install/lib/libMachineIndependent.a \ + build-x86_64/install/lib/libGenericCodeGen.a \ + build-x86_64/install/lib/libSPIRV.a \ + build-x86_64/install/lib/libOGLCompiler.a \ + build-x86_64/install/lib/libOSDependent.a \ + -o build-x86_64/install/lib/libglslang_combined.a + libtool -static \ + build-arm64/install/lib/libglslang.a \ + build-arm64/install/lib/libMachineIndependent.a \ + build-arm64/install/lib/libGenericCodeGen.a \ + build-arm64/install/lib/libSPIRV.a \ + build-arm64/install/lib/libOGLCompiler.a \ + build-arm64/install/lib/libOSDependent.a \ + -o build-arm64/install/lib/libglslang_combined.a lipo -create build-x86_64/install/lib/libglslang_combined.a build-arm64/install/lib/libglslang_combined.a -o glslang.framework/Versions/A/glslang cp -a build-x86_64/install/include/glslang glslang.framework/Versions/A/Headers/ sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist - - name: package + - name: package-ncnn run: | rm -rf ncnn.framework mkdir -p ncnn.framework/Versions/A/Headers @@ -412,8 +269,16 @@ jobs: ln -s Versions/Current/Resources ncnn.framework/Resources ln -s Versions/Current/ncnn ncnn.framework/ncnn lipo -create build-x86_64/install/lib/libncnn.a build-arm64/install/lib/libncnn.a -o ncnn.framework/Versions/A/ncnn - cp -a build-x86_64/install/include/ncnn ncnn.framework/Versions/A/Headers/ + cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist + - name: package + if: matrix.opt.vulkan == 'OFF' + run: | + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework + - name: package + if: matrix.opt.vulkan == 'ON' + run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework - name: upload-zip @@ -485,77 +350,14 @@ jobs: ios: needs: [setup, openmp-ios] + strategy: + matrix: + opt: + - { vulkan: OFF, id: ios } + - { vulkan: ON, id: ios-vulkan } runs-on: macos-13 env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ios - NCNN_CMAKE_OPTIONS: | - -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ - -DDEPLOYMENT_TARGET=$IOS_DEPLOYMENT_TARGET \ - -DENABLE_BITCODE=$ENABLE_BITCODE \ - -DENABLE_ARC=$ENABLE_ARC \ - -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ - -DCMAKE_INSTALL_PREFIX=install \ - -DCMAKE_BUILD_TYPE=Release \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="libomp.a" \ - -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_BUILD_BENCHMARK=OFF \ - - steps: - - uses: actions/checkout@v4 - - name: download-openmp-ios - uses: actions/download-artifact@v4 - with: - name: openmp-ios - path: openmp-ios - - name: install-openmp - run: | - sudo cp openmp-ios/include/* $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/include - sudo cp openmp-ios/lib/libomp.a $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib - - name: build-arm64 - run: | - mkdir build-arm64 && cd build-arm64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=OS64 -DARCHS="arm64" .. - cmake --build . 
-j 4 - cmake --build . --target install/strip - - name: package-openmp - run: | - rm -rf openmp.framework - mkdir -p openmp.framework/Versions/A/Headers - mkdir -p openmp.framework/Versions/A/Resources - ln -s A openmp.framework/Versions/Current - ln -s Versions/Current/Headers openmp.framework/Headers - ln -s Versions/Current/Resources openmp.framework/Resources - ln -s Versions/Current/openmp openmp.framework/openmp - cp openmp-ios/lib/libomp.a openmp.framework/Versions/A/openmp - cp -a openmp-ios/include/* openmp.framework/Versions/A/Headers/ - sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - - name: package - run: | - rm -rf ncnn.framework - mkdir -p ncnn.framework/Versions/A/Headers - mkdir -p ncnn.framework/Versions/A/Resources - ln -s A ncnn.framework/Versions/Current - ln -s Versions/Current/Headers ncnn.framework/Headers - ln -s Versions/Current/Resources ncnn.framework/Resources - ln -s Versions/Current/ncnn ncnn.framework/ncnn - cp build-arm64/install/lib/libncnn.a ncnn.framework/Versions/A/ncnn - cp -a build-arm64/install/include/* ncnn.framework/Versions/A/Headers/ - sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - ios-gpu: - needs: [setup, openmp-ios] - runs-on: macos-13 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ios-vulkan + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$IOS_DEPLOYMENT_TARGET \ @@ -568,8 +370,8 @@ jobs: -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON \ -DNCNN_BUILD_BENCHMARK=OFF \ + -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ steps: - uses: actions/checkout@v4 @@ -603,6 +405,7 @@ jobs: cp -a openmp-ios/include/* openmp.framework/Versions/A/Headers/ sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - name: package-glslang + if: matrix.opt.vulkan == 'ON' run: | rm -rf glslang.framework mkdir -p glslang.framework/Versions/A/Headers @@ -622,7 +425,7 @@ jobs: cp build-arm64/install/lib/libglslang_combined.a glslang.framework/Versions/A/glslang cp -a build-arm64/install/include/glslang glslang.framework/Versions/A/Headers/ sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist - - name: package + - name: package-ncnn run: | rm -rf ncnn.framework mkdir -p ncnn.framework/Versions/A/Headers @@ -632,8 +435,16 @@ jobs: ln -s Versions/Current/Resources ncnn.framework/Resources ln -s Versions/Current/ncnn ncnn.framework/ncnn cp build-arm64/install/lib/libncnn.a ncnn.framework/Versions/A/ncnn - cp -a build-arm64/install/include/ncnn ncnn.framework/Versions/A/Headers/ + cp -a build-arm64/install/include/* ncnn.framework/Versions/A/Headers/ sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist 
> ncnn.framework/Versions/A/Resources/Info.plist + - name: package + if: matrix.opt.vulkan == 'OFF' + run: | + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework + - name: package + if: matrix.opt.vulkan == 'ON' + run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework - name: upload-zip @@ -716,9 +527,14 @@ jobs: ios-simulator: needs: [setup, openmp-ios-simulator] + strategy: + matrix: + opt: + - { vulkan: OFF, id: ios-simulator } + - { vulkan: ON, id: ios-simulator-vulkan } runs-on: macos-13 env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ios-simulator + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$IOS_DEPLOYMENT_TARGET \ @@ -732,89 +548,12 @@ jobs: -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ -DNCNN_BUILD_BENCHMARK=OFF \ + -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ steps: - uses: actions/checkout@v4 - - name: download-openmp-ios-simulator - uses: actions/download-artifact@v4 - with: - name: openmp-ios-simulator - path: openmp-ios-simulator - - name: install-openmp - run: | - sudo cp openmp-ios-simulator/include/* $DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/include - sudo cp openmp-ios-simulator/lib/libomp.a $DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/lib - - name: build-x86_64 - run: | - mkdir build-x86_64 && cd build-x86_64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR64 -DARCHS="x86_64" .. - cmake --build . -j 4 - cmake --build . --target install/strip - - name: build-arm64 - run: | - mkdir build-arm64 && cd build-arm64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATORARM64 -DARCHS="arm64" .. - cmake --build . -j 4 - cmake --build . 
--target install/strip - - name: package-openmp - run: | - rm -rf openmp.framework - mkdir -p openmp.framework/Versions/A/Headers - mkdir -p openmp.framework/Versions/A/Resources - ln -s A openmp.framework/Versions/Current - ln -s Versions/Current/Headers openmp.framework/Headers - ln -s Versions/Current/Resources openmp.framework/Resources - ln -s Versions/Current/openmp openmp.framework/openmp - cp openmp-ios-simulator/lib/libomp.a openmp.framework/Versions/A/openmp - cp -a openmp-ios-simulator/include/* openmp.framework/Versions/A/Headers/ - sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - - name: package - run: | - rm -rf ncnn.framework - mkdir -p ncnn.framework/Versions/A/Headers - mkdir -p ncnn.framework/Versions/A/Resources - ln -s A ncnn.framework/Versions/Current - ln -s Versions/Current/Headers ncnn.framework/Headers - ln -s Versions/Current/Resources ncnn.framework/Resources - ln -s Versions/Current/ncnn ncnn.framework/ncnn - lipo -create \ - build-x86_64/install/lib/libncnn.a \ - build-arm64/install/lib/libncnn.a \ - -o ncnn.framework/Versions/A/ncnn - cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ - sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - ios-simulator-gpu: - needs: [setup, openmp-ios-simulator] - runs-on: macos-13 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ios-simulator-vulkan - NCNN_CMAKE_OPTIONS: | - -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ - -DDEPLOYMENT_TARGET=$IOS_DEPLOYMENT_TARGET \ - -DENABLE_BITCODE=$ENABLE_BITCODE \ - -DENABLE_ARC=$ENABLE_ARC \ - -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ - -DCMAKE_INSTALL_PREFIX=install \ - -DCMAKE_BUILD_TYPE=Release \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="libomp.a" \ - -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON \ - -DNCNN_BUILD_BENCHMARK=OFF \ - - steps: - - uses: actions/checkout@v4 - with: - submodules: true + with: + submodules: true - name: download-openmp-ios-simulator uses: actions/download-artifact@v4 with: @@ -849,6 +588,7 @@ jobs: cp -a openmp-ios-simulator/include/* openmp.framework/Versions/A/Headers/ sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - name: package-glslang + if: matrix.opt.vulkan == 'ON' run: | rm -rf glslang.framework mkdir -p glslang.framework/Versions/A/Headers @@ -879,7 +619,7 @@ jobs: -o glslang.framework/Versions/A/glslang cp -a build-x86_64/install/include/glslang glslang.framework/Versions/A/Headers/ sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist - - name: package + - name: package-ncnn run: | rm -rf ncnn.framework mkdir -p ncnn.framework/Versions/A/Headers @@ -892,8 +632,16 @@ jobs: build-x86_64/install/lib/libncnn.a \ build-arm64/install/lib/libncnn.a \ -o ncnn.framework/Versions/A/ncnn - cp 
-a build-x86_64/install/include/ncnn ncnn.framework/Versions/A/Headers/ + cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist + - name: package + if: matrix.opt.vulkan == 'OFF' + run: | + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework + - name: package + if: matrix.opt.vulkan == 'ON' + run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework - name: upload-zip @@ -976,86 +724,14 @@ jobs: mac-catalyst: needs: [setup, openmp-mac-catalyst] + strategy: + matrix: + opt: + - { vulkan: OFF, id: mac-catalyst } + - { vulkan: ON, id: mac-catalyst-vulkan } runs-on: macos-13 env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-mac-catalyst - NCNN_CMAKE_OPTIONS: | - -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ - -DDEPLOYMENT_TARGET=$MAC_CATALYST_DEPLOYMENT_TARGET \ - -DENABLE_BITCODE=$ENABLE_BITCODE \ - -DENABLE_ARC=$ENABLE_ARC \ - -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ - -DCMAKE_INSTALL_PREFIX=install \ - -DCMAKE_BUILD_TYPE=Release \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="libomp.a" \ - -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_BUILD_BENCHMARK=OFF \ - - steps: - - uses: actions/checkout@v4 - - name: download-openmp-mac-catalyst - uses: actions/download-artifact@v4 - with: - name: openmp-mac-catalyst - path: openmp-mac-catalyst - - name: install-openmp - run: | - sudo cp openmp-mac-catalyst/include/* $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include - sudo cp openmp-mac-catalyst/lib/libomp.a $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib - - name: build-x86_64 - run: | - mkdir build-x86_64 && cd build-x86_64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=MAC_CATALYST -DARCHS="x86_64" .. - cmake --build . -j 4 - cmake --build . --target install/strip - - name: build-arm64 - run: | - mkdir build-arm64 && cd build-arm64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=MAC_CATALYST -DARCHS="arm64" .. - cmake --build . -j 4 - cmake --build . 
--target install/strip - - name: package-openmp - run: | - rm -rf openmp.framework - mkdir -p openmp.framework/Versions/A/Headers - mkdir -p openmp.framework/Versions/A/Resources - ln -s A openmp.framework/Versions/Current - ln -s Versions/Current/Headers openmp.framework/Headers - ln -s Versions/Current/Resources openmp.framework/Resources - ln -s Versions/Current/openmp openmp.framework/openmp - cp openmp-mac-catalyst/lib/libomp.a openmp.framework/Versions/A/openmp - cp -a openmp-mac-catalyst/include/* openmp.framework/Versions/A/Headers/ - sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - - name: package - run: | - rm -rf ncnn.framework - mkdir -p ncnn.framework/Versions/A/Headers - mkdir -p ncnn.framework/Versions/A/Resources - ln -s A ncnn.framework/Versions/Current - ln -s Versions/Current/Headers ncnn.framework/Headers - ln -s Versions/Current/Resources ncnn.framework/Resources - ln -s Versions/Current/ncnn ncnn.framework/ncnn - lipo -create \ - build-x86_64/install/lib/libncnn.a \ - build-arm64/install/lib/libncnn.a \ - -o ncnn.framework/Versions/A/ncnn - cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ - sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - mac-catalyst-gpu: - needs: [setup, openmp-mac-catalyst] - runs-on: macos-13 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-mac-catalyst-vulkan + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$MAC_CATALYST_DEPLOYMENT_TARGET \ @@ -1068,8 +744,8 @@ jobs: -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON \ -DNCNN_BUILD_BENCHMARK=OFF \ + -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ steps: - uses: actions/checkout@v4 @@ -1109,6 +785,7 @@ jobs: cp -a openmp-mac-catalyst/include/* openmp.framework/Versions/A/Headers/ sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - name: package-glslang + if: matrix.opt.vulkan == 'ON' run: | rm -rf glslang.framework mkdir -p glslang.framework/Versions/A/Headers @@ -1139,7 +816,7 @@ jobs: -o glslang.framework/Versions/A/glslang cp -a build-x86_64/install/include/glslang glslang.framework/Versions/A/Headers/ sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist - - name: package + - name: package-ncnn run: | rm -rf ncnn.framework mkdir -p ncnn.framework/Versions/A/Headers @@ -1152,8 +829,16 @@ jobs: build-x86_64/install/lib/libncnn.a \ build-arm64/install/lib/libncnn.a \ -o ncnn.framework/Versions/A/ncnn - cp -a build-x86_64/install/include/ncnn ncnn.framework/Versions/A/Headers/ + cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > 
ncnn.framework/Versions/A/Resources/Info.plist + - name: package + if: matrix.opt.vulkan == 'OFF' + run: | + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework + - name: package + if: matrix.opt.vulkan == 'ON' + run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework - name: upload-zip @@ -1534,86 +1219,14 @@ jobs: tvos: needs: [setup, openmp-tvos] + strategy: + matrix: + opt: + - { vulkan: OFF, id: tvos } + - { vulkan: ON, id: tvos-vulkan } runs-on: macos-13 env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-tvos - NCNN_CMAKE_OPTIONS: | - -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ - -DDEPLOYMENT_TARGET=$TVOS_DEPLOYMENT_TARGET \ - -DENABLE_BITCODE=$ENABLE_BITCODE \ - -DENABLE_ARC=$ENABLE_ARC \ - -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ - -DCMAKE_INSTALL_PREFIX=install \ - -DCMAKE_BUILD_TYPE=Release \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="libomp.a" \ - -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_BUILD_BENCHMARK=OFF \ - - steps: - - uses: actions/checkout@v4 - - name: download-openmp-tvos - uses: actions/download-artifact@v4 - with: - name: openmp-tvos - path: openmp-tvos - - name: install-openmp - run: | - sudo cp openmp-tvos/include/* $DEVELOPER_DIR/Platforms/AppleTVOS.platform/Developer/SDKs/AppleTVOS.sdk/usr/include - sudo cp openmp-tvos/lib/libomp.a $DEVELOPER_DIR/Platforms/AppleTVOS.platform/Developer/SDKs/AppleTVOS.sdk/usr/lib - - name: build-arm64 - run: | - mkdir build-arm64 && cd build-arm64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=TVOS -DARCHS="arm64" .. - cmake --build . -j 4 - cmake --build . --target install/strip - - name: build-arm64e - run: | - mkdir build-arm64e && cd build-arm64e - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=TVOS -DARCHS="arm64e" .. - cmake --build . -j 4 - cmake --build . 
--target install/strip - - name: package-openmp - run: | - rm -rf openmp.framework - mkdir -p openmp.framework/Versions/A/Headers - mkdir -p openmp.framework/Versions/A/Resources - ln -s A openmp.framework/Versions/Current - ln -s Versions/Current/Headers openmp.framework/Headers - ln -s Versions/Current/Resources openmp.framework/Resources - ln -s Versions/Current/openmp openmp.framework/openmp - cp openmp-tvos/lib/libomp.a openmp.framework/Versions/A/openmp - cp -a openmp-tvos/include/* openmp.framework/Versions/A/Headers/ - sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - - name: package - run: | - rm -rf ncnn.framework - mkdir -p ncnn.framework/Versions/A/Headers - mkdir -p ncnn.framework/Versions/A/Resources - ln -s A ncnn.framework/Versions/Current - ln -s Versions/Current/Headers ncnn.framework/Headers - ln -s Versions/Current/Resources ncnn.framework/Resources - ln -s Versions/Current/ncnn ncnn.framework/ncnn - lipo -create \ - build-arm64/install/lib/libncnn.a \ - build-arm64e/install/lib/libncnn.a \ - -o ncnn.framework/Versions/A/ncnn - cp -a build-arm64/install/include/* ncnn.framework/Versions/A/Headers/ - sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - tvos-gpu: - needs: [setup, openmp-tvos] - runs-on: macos-13 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-tvos-vulkan + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$TVOS_DEPLOYMENT_TARGET \ @@ -1626,8 +1239,8 @@ jobs: -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON \ -DNCNN_BUILD_BENCHMARK=OFF \ + -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ steps: - uses: actions/checkout@v4 @@ -1667,6 +1280,7 @@ jobs: cp -a openmp-tvos/include/* openmp.framework/Versions/A/Headers/ sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - name: package-glslang + if: matrix.opt.vulkan == 'ON' run: | rm -rf glslang.framework mkdir -p glslang.framework/Versions/A/Headers @@ -1697,7 +1311,7 @@ jobs: -o glslang.framework/Versions/A/glslang cp -a build-arm64/install/include/glslang glslang.framework/Versions/A/Headers/ sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist - - name: package + - name: package-ncnn run: | rm -rf ncnn.framework mkdir -p ncnn.framework/Versions/A/Headers @@ -1712,6 +1326,14 @@ jobs: -o ncnn.framework/Versions/A/ncnn cp -a build-arm64/install/include/* ncnn.framework/Versions/A/Headers/ sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist + - name: package + if: matrix.opt.vulkan == 'OFF' + run: | + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework 
ncnn.framework + - name: package + if: matrix.opt.vulkan == 'ON' + run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework - name: upload-zip @@ -1794,9 +1416,14 @@ jobs: tvos-simulator: needs: [setup, openmp-tvos-simulator] + strategy: + matrix: + opt: + - { vulkan: OFF, id: tvos-simulator } + - { vulkan: ON, id: tvos-simulator-vulkan } runs-on: macos-13 env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-tvos-simulator + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$TVOS_DEPLOYMENT_TARGET \ @@ -1810,9 +1437,12 @@ jobs: -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ -DNCNN_BUILD_BENCHMARK=OFF \ + -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ steps: - uses: actions/checkout@v4 + with: + submodules: true - name: download-openmp-tvos-simulator uses: actions/download-artifact@v4 with: @@ -1846,87 +1476,8 @@ jobs: cp openmp-tvos-simulator/lib/libomp.a openmp.framework/Versions/A/openmp cp -a openmp-tvos-simulator/include/* openmp.framework/Versions/A/Headers/ sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - - name: package - run: | - rm -rf ncnn.framework - mkdir -p ncnn.framework/Versions/A/Headers - mkdir -p ncnn.framework/Versions/A/Resources - ln -s A ncnn.framework/Versions/Current - ln -s Versions/Current/Headers ncnn.framework/Headers - ln -s Versions/Current/Resources ncnn.framework/Resources - ln -s Versions/Current/ncnn ncnn.framework/ncnn - lipo -create \ - build-x86_64/install/lib/libncnn.a \ - build-arm64/install/lib/libncnn.a \ - -o ncnn.framework/Versions/A/ncnn - cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ - sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - tvos-simulator-gpu: - needs: [setup, openmp-tvos-simulator] - runs-on: macos-13 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-tvos-simulator-vulkan - NCNN_CMAKE_OPTIONS: | - -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ - -DDEPLOYMENT_TARGET=$TVOS_DEPLOYMENT_TARGET \ - -DENABLE_BITCODE=$ENABLE_BITCODE \ - -DENABLE_ARC=$ENABLE_ARC \ - -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \ - -DCMAKE_INSTALL_PREFIX=install \ - -DCMAKE_BUILD_TYPE=Release \ - -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \ - -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \ - -DOpenMP_libomp_LIBRARY="libomp.a" \ - -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_VULKAN=ON \ - -DNCNN_BUILD_BENCHMARK=OFF \ - - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: download-openmp-tvos-simulator - uses: actions/download-artifact@v4 - with: - name: openmp-tvos-simulator - path: openmp-tvos-simulator - - name: install-openmp - run: | - sudo cp openmp-tvos-simulator/include/* $DEVELOPER_DIR/Platforms/AppleTVSimulator.platform/Developer/SDKs/AppleTVSimulator.sdk/usr/include - sudo cp openmp-tvos-simulator/lib/libomp.a 
$DEVELOPER_DIR/Platforms/AppleTVSimulator.platform/Developer/SDKs/AppleTVSimulator.sdk/usr/lib - - name: build-x86_64 - run: | - mkdir build-x86_64 && cd build-x86_64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_TVOS -DARCHS="x86_64" .. - cmake --build . -j 4 - cmake --build . --target install/strip - - name: build-arm64 - run: | - mkdir build-arm64 && cd build-arm64 - cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_TVOS -DARCHS="arm64" .. - cmake --build . -j 4 - cmake --build . --target install/strip - - name: package-openmp - run: | - rm -rf openmp.framework - mkdir -p openmp.framework/Versions/A/Headers - mkdir -p openmp.framework/Versions/A/Resources - ln -s A openmp.framework/Versions/Current - ln -s Versions/Current/Headers openmp.framework/Headers - ln -s Versions/Current/Resources openmp.framework/Resources - ln -s Versions/Current/openmp openmp.framework/openmp - cp openmp-tvos-simulator/lib/libomp.a openmp.framework/Versions/A/openmp - cp -a openmp-tvos-simulator/include/* openmp.framework/Versions/A/Headers/ - sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - - name: package-glslang + - name: package-glslang + if: matrix.opt.vulkan == 'ON' run: | rm -rf glslang.framework mkdir -p glslang.framework/Versions/A/Headers @@ -1957,7 +1508,7 @@ jobs: -o glslang.framework/Versions/A/glslang cp -a build-x86_64/install/include/glslang glslang.framework/Versions/A/Headers/ sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist - - name: package + - name: package-ncnn run: | rm -rf ncnn.framework mkdir -p ncnn.framework/Versions/A/Headers @@ -1972,6 +1523,14 @@ jobs: -o ncnn.framework/Versions/A/ncnn cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist + - name: package + if: matrix.opt.vulkan == 'OFF' + run: | + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework + - name: package + if: matrix.opt.vulkan == 'ON' + run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework - name: upload-zip @@ -2043,9 +1602,14 @@ jobs: visionos: needs: [setup, openmp-visionos] + strategy: + matrix: + opt: + - { vulkan: OFF, id: visionos } + - { vulkan: ON, id: visionos-vulkan } runs-on: macos-13 env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-visionos + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$VISIONOS_DEPLOYMENT_TARGET \ @@ -2059,9 +1623,12 @@ jobs: -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ -DNCNN_BUILD_BENCHMARK=OFF \ + -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ steps: - uses: actions/checkout@v4 + with: + submodules: true - name: download-openmp-visionos uses: actions/download-artifact@v4 with: @@ -2089,7 +1656,28 @@ jobs: cp openmp-visionos/lib/libomp.a openmp.framework/Versions/A/openmp cp -a openmp-visionos/include/* openmp.framework/Versions/A/Headers/ sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > 
openmp.framework/Versions/A/Resources/Info.plist - - name: package + - name: package-glslang + if: matrix.opt.vulkan == 'ON' + run: | + rm -rf glslang.framework + mkdir -p glslang.framework/Versions/A/Headers + mkdir -p glslang.framework/Versions/A/Resources + ln -s A glslang.framework/Versions/Current + ln -s Versions/Current/Headers glslang.framework/Headers + ln -s Versions/Current/Resources glslang.framework/Resources + ln -s Versions/Current/glslang glslang.framework/glslang + libtool -static \ + build-arm64/install/lib/libglslang.a \ + build-arm64/install/lib/libMachineIndependent.a \ + build-arm64/install/lib/libGenericCodeGen.a \ + build-arm64/install/lib/libSPIRV.a \ + build-arm64/install/lib/libOGLCompiler.a \ + build-arm64/install/lib/libOSDependent.a \ + -o build-arm64/install/lib/libglslang_combined.a + cp build-arm64/install/lib/libglslang_combined.a glslang.framework/Versions/A/glslang + cp -a build-arm64/install/include/glslang glslang.framework/Versions/A/Headers/ + sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist + - name: package-ncnn run: | rm -rf ncnn.framework mkdir -p ncnn.framework/Versions/A/Headers @@ -2101,8 +1689,16 @@ jobs: cp build-arm64/install/lib/libncnn.a ncnn.framework/Versions/A/ncnn cp -a build-arm64/install/include/* ncnn.framework/Versions/A/Headers/ sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist + - name: package + if: matrix.opt.vulkan == 'OFF' + run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework + - name: package + if: matrix.opt.vulkan == 'ON' + run: | + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework - name: upload-zip uses: actions/upload-artifact@v4 with: @@ -2183,9 +1779,14 @@ jobs: visionos-simulator: needs: [setup, openmp-visionos-simulator] + strategy: + matrix: + opt: + - { vulkan: OFF, id: visionos-simulator } + - { vulkan: ON, id: visionos-simulator-vulkan } runs-on: macos-13 env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-visionos-simulator + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} NCNN_CMAKE_OPTIONS: | -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \ -DDEPLOYMENT_TARGET=$VISIONOS_DEPLOYMENT_TARGET \ @@ -2199,9 +1800,12 @@ jobs: -DOpenMP_libomp_LIBRARY="libomp.a" \ -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ -DNCNN_BUILD_BENCHMARK=OFF \ + -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ steps: - uses: actions/checkout@v4 + with: + submodules: true - name: download-openmp-visionos-simulator uses: actions/download-artifact@v4 with: @@ -2235,7 +1839,39 @@ jobs: cp openmp-visionos-simulator/lib/libomp.a openmp.framework/Versions/A/openmp cp -a openmp-visionos-simulator/include/* openmp.framework/Versions/A/Headers/ sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist - - name: package + - name: package-glslang + if: matrix.opt.vulkan == 'ON' + run: | + rm -rf glslang.framework + mkdir -p glslang.framework/Versions/A/Headers + mkdir -p glslang.framework/Versions/A/Resources + ln -s A glslang.framework/Versions/Current + ln -s Versions/Current/Headers glslang.framework/Headers + ln -s Versions/Current/Resources 
glslang.framework/Resources + ln -s Versions/Current/glslang glslang.framework/glslang + libtool -static \ + build-x86_64/install/lib/libglslang.a \ + build-x86_64/install/lib/libMachineIndependent.a \ + build-x86_64/install/lib/libGenericCodeGen.a \ + build-x86_64/install/lib/libSPIRV.a \ + build-x86_64/install/lib/libOGLCompiler.a \ + build-x86_64/install/lib/libOSDependent.a \ + -o build-x86_64/install/lib/libglslang_combined.a + libtool -static \ + build-arm64/install/lib/libglslang.a \ + build-arm64/install/lib/libMachineIndependent.a \ + build-arm64/install/lib/libGenericCodeGen.a \ + build-arm64/install/lib/libSPIRV.a \ + build-arm64/install/lib/libOGLCompiler.a \ + build-arm64/install/lib/libOSDependent.a \ + -o build-arm64/install/lib/libglslang_combined.a + lipo -create \ + build-x86_64/install/lib/libglslang_combined.a \ + build-arm64/install/lib/libglslang_combined.a \ + -o glslang.framework/Versions/A/glslang + cp -a build-x86_64/install/include/glslang glslang.framework/Versions/A/Headers/ + sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist + - name: package-ncnn run: | rm -rf ncnn.framework mkdir -p ncnn.framework/Versions/A/Headers @@ -2250,8 +1886,16 @@ jobs: -o ncnn.framework/Versions/A/ncnn cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/ sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist + - name: package + if: matrix.opt.vulkan == 'OFF' + run: | rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework + - name: package + if: matrix.opt.vulkan == 'ON' + run: | + rm -f ${{ env.PACKAGENAME }}.zip + zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework - name: upload-zip uses: actions/upload-artifact@v4 with: @@ -2260,51 +1904,63 @@ jobs: android: needs: [setup] + strategy: + matrix: + opt: + - { vulkan: OFF, shared-lib: OFF, id: android } + - { vulkan: OFF, shared-lib: ON, id: android-shared } + - { vulkan: ON, shared-lib: OFF, id: android-vulkan } + - { vulkan: ON, shared-lib: ON, id: android-vulkan-shared } runs-on: ubuntu-latest env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-android + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }} + NCNN_CMAKE_OPTIONS: | + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake \ + -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX=install \ + -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DNCNN_BUILD_BENCHMARK=OFF \ + -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \ + -DNCNN_SHARED_LIB=${{ matrix.opt.shared-lib }} \ + -DNCNN_AVX512BF16=OFF \ + steps: - uses: actions/checkout@v4 + with: + submodules: true - name: ndk-fix-debug run: sed -i -e '/^ -g$/d' $ANDROID_NDK_LATEST_HOME/build/cmake/android-legacy.toolchain.cmake - - name: build-armv7 + - name: build-armeabi-v7a run: | - mkdir build-armv7 && cd build-armv7 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 \ - -DNCNN_BUILD_BENCHMARK=OFF .. 
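The android hunk here folds the four hand-copied jobs (cpu/vulkan crossed with static/shared) into one matrix job, the same pattern the macos/ios/tvos/visionos conversions above apply with a two-entry matrix. A minimal sketch of that pattern, assuming the surrounding jobs context; the hard-coded version string, the echo bodies and the package-vulkan-extras step name are illustrative only:

  android:
    strategy:
      matrix:
        opt:
          - { vulkan: OFF, shared-lib: OFF, id: android }
          - { vulkan: OFF, shared-lib: ON,  id: android-shared }
          - { vulkan: ON,  shared-lib: OFF, id: android-vulkan }
          - { vulkan: ON,  shared-lib: ON,  id: android-vulkan-shared }
    runs-on: ubuntu-latest
    env:
      # every variant derives its artifact name from its matrix entry
      PACKAGENAME: ncnn-1.0.0-${{ matrix.opt.id }}
    steps:
      # the shared cmake flags are parameterized once instead of copied per job
      - name: build
        run: echo cmake -DNCNN_VULKAN=${{ matrix.opt.vulkan }} -DNCNN_SHARED_LIB=${{ matrix.opt.shared-lib }}
      # variant-specific steps are gated on the matrix values
      - name: package-vulkan-extras
        if: matrix.opt.vulkan == 'ON'
        run: echo zip the extra glslang framework

Each entry under opt expands to one independent job run, so the four previously duplicated job bodies now come from a single definition.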
+ mkdir build-armeabi-v7a && cd build-armeabi-v7a + cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 .. cmake --build . -j $(nproc) cmake --build . --target install/strip - - name: build-aarch64 + - name: build-arm64-v8a run: | - mkdir build-aarch64 && cd build-aarch64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 \ - -DNCNN_BUILD_BENCHMARK=OFF .. + mkdir build-arm64-v8a && cd build-arm64-v8a + cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 .. cmake --build . -j $(nproc) cmake --build . --target install/strip - name: build-x86 run: | mkdir build-x86 && cd build-x86 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 \ - -DNCNN_BUILD_BENCHMARK=OFF .. + cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 .. cmake --build . -j $(nproc) cmake --build . --target install/strip - name: build-x86_64 run: | mkdir build-x86_64 && cd build-x86_64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 \ - -DNCNN_BUILD_BENCHMARK=OFF .. + cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 .. cmake --build . -j $(nproc) cmake --build . 
--target install/strip - name: package run: | rm -rf ${{ env.PACKAGENAME }} mkdir -p ${{ env.PACKAGENAME }} - cp -a build-armv7/install ${{ env.PACKAGENAME }}/armeabi-v7a - cp -a build-aarch64/install ${{ env.PACKAGENAME }}/arm64-v8a + cp -a build-armeabi-v7a/install ${{ env.PACKAGENAME }}/armeabi-v7a + cp -a build-arm64-v8a/install ${{ env.PACKAGENAME }}/arm64-v8a cp -a build-x86/install ${{ env.PACKAGENAME }}/x86 cp -a build-x86_64/install ${{ env.PACKAGENAME }}/x86_64 rm -f ${{ env.PACKAGENAME }}.zip @@ -2315,55 +1971,63 @@ jobs: name: ${{ env.PACKAGENAME }} path: ${{ env.PACKAGENAME }}.zip - android-shared: + webassembly: needs: [setup] runs-on: ubuntu-latest env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-android-shared + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-webassembly steps: - uses: actions/checkout@v4 - - name: ndk-fix-debug - run: sed -i -e '/^ -g$/d' $ANDROID_NDK_LATEST_HOME/build/cmake/android-legacy.toolchain.cmake - - name: build-armv7 + - name: emsdk + run: | + git clone https://github.com/emscripten-core/emsdk.git + cd emsdk + ./emsdk install $EMSCRIPTEN_VERSION + ./emsdk activate $EMSCRIPTEN_VERSION + - name: build run: | - mkdir build-armv7 && cd build-armv7 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 \ - -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. + source emsdk/emsdk_env.sh + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ + -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. cmake --build . -j $(nproc) cmake --build . --target install/strip - - name: build-aarch64 + - name: build-simd run: | - mkdir build-aarch64 && cd build-aarch64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 \ - -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. + source emsdk/emsdk_env.sh + mkdir build-simd && cd build-simd + cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ + -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. cmake --build . -j $(nproc) cmake --build . 
--target install/strip - - name: build-x86 + - name: build-threads run: | - mkdir build-x86 && cd build-x86 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 \ - -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. + source emsdk/emsdk_env.sh + mkdir build-threads && cd build-threads + cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DNCNN_THREADS=ON -DNCNN_OPENMP=ON -DNCNN_SIMPLEOMP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ + -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. cmake --build . -j $(nproc) cmake --build . --target install/strip - - name: build-x86_64 + - name: build-simd-threads run: | - mkdir build-x86_64 && cd build-x86_64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 \ - -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. + source emsdk/emsdk_env.sh + mkdir build-simd-threads && cd build-simd-threads + cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ + -DNCNN_THREADS=ON -DNCNN_OPENMP=ON -DNCNN_SIMPLEOMP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ + -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. cmake --build . -j $(nproc) cmake --build . 
--target install/strip - name: package run: | rm -rf ${{ env.PACKAGENAME }} mkdir -p ${{ env.PACKAGENAME }} - cp -a build-armv7/install ${{ env.PACKAGENAME }}/armeabi-v7a - cp -a build-aarch64/install ${{ env.PACKAGENAME }}/arm64-v8a - cp -a build-x86/install ${{ env.PACKAGENAME }}/x86 - cp -a build-x86_64/install ${{ env.PACKAGENAME }}/x86_64 + cp -a build/install ${{ env.PACKAGENAME }}/basic + cp -a build-simd/install ${{ env.PACKAGENAME }}/simd + cp -a build-threads/install ${{ env.PACKAGENAME }}/threads + cp -a build-simd-threads/install ${{ env.PACKAGENAME }}/simd-threads rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - name: upload-zip @@ -2372,692 +2036,96 @@ jobs: name: ${{ env.PACKAGENAME }} path: ${{ env.PACKAGENAME }}.zip - android-gpu: + windows: needs: [setup] - runs-on: ubuntu-latest + strategy: + matrix: + opt: + - { shared-lib: OFF, os: windows-2019, toolset-version: v140, id: vs2015 } + - { shared-lib: OFF, os: windows-2019, toolset-version: v141, id: vs2017 } + - { shared-lib: OFF, os: windows-2019, toolset-version: v142, id: vs2019 } + - { shared-lib: OFF, os: windows-2022, toolset-version: v143, id: vs2022 } + - { shared-lib: ON, os: windows-2019, toolset-version: v140, id: vs2015-shared } + - { shared-lib: ON, os: windows-2019, toolset-version: v141, id: vs2017-shared } + - { shared-lib: ON, os: windows-2019, toolset-version: v142, id: vs2019-shared } + - { shared-lib: ON, os: windows-2022, toolset-version: v143, id: vs2022-shared } + runs-on: ${{ matrix.opt.os }} env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-android-vulkan + PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-${{ matrix.opt.id }} + UseMultiToolTask: true + NCNN_CMAKE_OPTIONS: | + -T ${{ matrix.opt.toolset-version }},host=x64 ` + -DCMAKE_BUILD_TYPE=Release ` + -DCMAKE_INSTALL_PREFIX=install ` + -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" ` + -DNCNN_BUILD_EXAMPLES=OFF ` + -DNCNN_BUILD_TOOLS=ON ` + -DNCNN_BUILD_BENCHMARK=OFF ` + -DNCNN_VULKAN=ON ` + -DNCNN_SHARED_LIB=${{ matrix.opt.shared-lib }} ` + steps: - uses: actions/checkout@v4 with: submodules: true - - name: ndk-fix-debug - run: sed -i -e '/^ -g$/d' $ANDROID_NDK_LATEST_HOME/build/cmake/android-legacy.toolchain.cmake - - name: build-armv7 - run: | - mkdir build-armv7 && cd build-armv7 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j $(nproc) - cmake --build . --target install/strip - - name: build-aarch64 + - name: cache-protobuf + id: cache-protobuf + uses: actions/cache@v4 + with: + path: "protobuf-install" + key: protobuf-${{ matrix.opt.toolset-version }}-x86-x64-install + - name: protobuf + if: steps.cache-protobuf.outputs.cache-hit != 'true' run: | - mkdir build-aarch64 && cd build-aarch64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j $(nproc) - cmake --build . 
--target install/strip + Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip + 7z x ./protobuf-3.11.2.zip + cd protobuf-3.11.2 + mkdir build-x86; cd build-x86; + cmake -T ${{ matrix.opt.toolset-version }},host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake + cmake --build . --config Release -j 4 + cmake --build . --config Release --target install + cd .. + mkdir build-x64; cd build-x64; + cmake -T ${{ matrix.opt.toolset-version }},host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake + cmake --build . --config Release -j 4 + cmake --build . --config Release --target install - name: build-x86 run: | - mkdir build-x86 && cd build-x86 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j $(nproc) - cmake --build . --target install/strip - - name: build-x86_64 + mkdir build-x86; cd build-x86 + cmake ${{ env.NCNN_CMAKE_OPTIONS }} -A Win32 -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" .. + cmake --build . --config Release -j 4 + cmake --build . --config Release --target install + - name: build-x64 run: | - mkdir build-x86_64 && cd build-x86_64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j $(nproc) - cmake --build . --target install/strip - - name: package - run: | - rm -rf ${{ env.PACKAGENAME }} - mkdir -p ${{ env.PACKAGENAME }} - cp -a build-armv7/install ${{ env.PACKAGENAME }}/armeabi-v7a - cp -a build-aarch64/install ${{ env.PACKAGENAME }}/arm64-v8a - cp -a build-x86/install ${{ env.PACKAGENAME }}/x86 - cp -a build-x86_64/install ${{ env.PACKAGENAME }}/x86_64 - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - android-gpu-shared: - needs: [setup] - runs-on: ubuntu-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-android-vulkan-shared - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: ndk-fix-debug - run: sed -i -e '/^ -g$/d' $ANDROID_NDK_LATEST_HOME/build/cmake/android-legacy.toolchain.cmake - - name: build-armv7 - run: | - mkdir build-armv7 && cd build-armv7 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-14 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . -j $(nproc) - cmake --build . 
--target install/strip - - name: build-aarch64 - run: | - mkdir build-aarch64 && cd build-aarch64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . -j $(nproc) - cmake --build . --target install/strip - - name: build-x86 - run: | - mkdir build-x86 && cd build-x86 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-14 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . -j $(nproc) - cmake --build . --target install/strip - - name: build-x86_64 - run: | - mkdir build-x86_64 && cd build-x86_64 - cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 \ - -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . -j $(nproc) - cmake --build . --target install/strip - - name: package - run: | - rm -rf ${{ env.PACKAGENAME }} - mkdir -p ${{ env.PACKAGENAME }} - cp -a build-armv7/install ${{ env.PACKAGENAME }}/armeabi-v7a - cp -a build-aarch64/install ${{ env.PACKAGENAME }}/arm64-v8a - cp -a build-x86/install ${{ env.PACKAGENAME }}/x86 - cp -a build-x86_64/install ${{ env.PACKAGENAME }}/x86_64 - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - webassembly: - needs: [setup] - runs-on: ubuntu-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-webassembly - steps: - - uses: actions/checkout@v4 - - name: emsdk - run: | - git clone https://github.com/emscripten-core/emsdk.git - cd emsdk - ./emsdk install $EMSCRIPTEN_VERSION - ./emsdk activate $EMSCRIPTEN_VERSION - - name: build - run: | - source emsdk/emsdk_env.sh - mkdir build && cd build - cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ - -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j $(nproc) - cmake --build . 
--target install/strip - - name: build-simd - run: | - source emsdk/emsdk_env.sh - mkdir build-simd && cd build-simd - cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ - -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j $(nproc) - cmake --build . --target install/strip - - name: build-threads - run: | - source emsdk/emsdk_env.sh - mkdir build-threads && cd build-threads - cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_THREADS=ON -DNCNN_OPENMP=ON -DNCNN_SIMPLEOMP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ - -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j $(nproc) - cmake --build . --target install/strip - - name: build-simd-threads - run: | - source emsdk/emsdk_env.sh - mkdir build-simd-threads && cd build-simd-threads - cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \ - -DNCNN_THREADS=ON -DNCNN_OPENMP=ON -DNCNN_SIMPLEOMP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \ - -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . -j $(nproc) - cmake --build . --target install/strip - - name: package - run: | - rm -rf ${{ env.PACKAGENAME }} - mkdir -p ${{ env.PACKAGENAME }} - cp -a build/install ${{ env.PACKAGENAME }}/basic - cp -a build-simd/install ${{ env.PACKAGENAME }}/simd - cp -a build-threads/install ${{ env.PACKAGENAME }}/threads - cp -a build-simd-threads/install ${{ env.PACKAGENAME }}/simd-threads - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2015: - needs: [setup] - runs-on: windows-2019 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2015 - UseMultiToolTask: true - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v4 - with: - path: "protobuf-install" - key: protobuf-vs2015-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v140,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - cd .. 
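The windows-vs20xx jobs being removed around this point are replaced by the windows matrix job defined above, which additionally folds the MSVC toolset and runner image into the matrix; the protobuf cache key is derived from the toolset so every compiler keeps its own cached build. A short sketch under the same assumptions (two of the eight matrix entries shown, the echo body is illustrative, the cache step mirrors the diff):

  windows:
    strategy:
      matrix:
        opt:
          - { shared-lib: OFF, os: windows-2019, toolset-version: v140, id: vs2015 }
          - { shared-lib: ON,  os: windows-2022, toolset-version: v143, id: vs2022-shared }
    runs-on: ${{ matrix.opt.os }}
    steps:
      # one cache per toolset; a hit skips rebuilding protobuf entirely
      - name: cache-protobuf
        id: cache-protobuf
        uses: actions/cache@v4
        with:
          path: "protobuf-install"
          key: protobuf-${{ matrix.opt.toolset-version }}-x86-x64-install
      - name: protobuf
        if: steps.cache-protobuf.outputs.cache-hit != 'true'
        run: echo build protobuf with toolset ${{ matrix.opt.toolset-version }}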
- mkdir build-x64; cd build-x64; - cmake -T v140,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x86 - run: | - mkdir build-x86; cd build-x86 - cmake -T v140,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x64 - run: | - mkdir build-x64; cd build-x64 - cmake -T v140,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: package - run: | - mkdir ${{ env.PACKAGENAME }} - mkdir ${{ env.PACKAGENAME }}/x86 - mkdir ${{ env.PACKAGENAME }}/x64 - Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" - Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" - 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2015-shared: - needs: [setup] - runs-on: windows-2019 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2015-shared - UseMultiToolTask: true - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v4 - with: - path: "protobuf-install" - key: protobuf-vs2015-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v140,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - cd .. - mkdir build-x64; cd build-x64; - cmake -T v140,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x86 - run: | - mkdir build-x86; cd build-x86 - cmake -T v140,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . 
--config Release --target install - - name: build-x64 - run: | - mkdir build-x64; cd build-x64 - cmake -T v140,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: package - run: | - mkdir ${{ env.PACKAGENAME }} - mkdir ${{ env.PACKAGENAME }}/x86 - mkdir ${{ env.PACKAGENAME }}/x64 - Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" - Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" - 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2017: - needs: [setup] - runs-on: windows-2019 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2017 - UseMultiToolTask: true - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v4 - with: - path: "protobuf-install" - key: protobuf-vs2017-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v141,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - cd .. - mkdir build-x64; cd build-x64; - cmake -T v141,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x86 - run: | - mkdir build-x86; cd build-x86 - cmake -T v141,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x64 - run: | - mkdir build-x64; cd build-x64 - cmake -T v141,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . 
--config Release --target install - - name: package - run: | - mkdir ${{ env.PACKAGENAME }} - mkdir ${{ env.PACKAGENAME }}/x86 - mkdir ${{ env.PACKAGENAME }}/x64 - Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" - Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" - 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2017-shared: - needs: [setup] - runs-on: windows-2019 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2017-shared - UseMultiToolTask: true - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v4 - with: - path: "protobuf-install" - key: protobuf-vs2017-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v141,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - cd .. - mkdir build-x64; cd build-x64; - cmake -T v141,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x86 - run: | - mkdir build-x86; cd build-x86 - cmake -T v141,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x64 - run: | - mkdir build-x64; cd build-x64 - cmake -T v141,host=x64 -A x64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . 
--config Release --target install - - name: package - run: | - mkdir ${{ env.PACKAGENAME }} - mkdir ${{ env.PACKAGENAME }}/x86 - mkdir ${{ env.PACKAGENAME }}/x64 - Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" - Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" - 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2019: - needs: [setup] - runs-on: windows-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2019 - UseMultiToolTask: true - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v4 - with: - path: "protobuf-install" - key: protobuf-vs2019-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v142,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - cd .. - mkdir build-x64; cd build-x64; - cmake -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x86 - run: | - mkdir build-x86; cd build-x86 - cmake -T v142,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x64 - run: | - mkdir build-x64; cd build-x64 - cmake -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-arm - run: | - mkdir build-arm; cd build-arm - cmake -T v142,host=x64 -A arm -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-arm64 - run: | - mkdir build-arm64; cd build-arm64 - cmake -T v142,host=x64 -A arm64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . 
--config Release --target install - - name: package - run: | - mkdir ${{ env.PACKAGENAME }} - mkdir ${{ env.PACKAGENAME }}/x86 - mkdir ${{ env.PACKAGENAME }}/x64 - mkdir ${{ env.PACKAGENAME }}/arm - mkdir ${{ env.PACKAGENAME }}/arm64 - Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" - Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" - Copy-Item -Verbose -Recurse -Path "build-arm\install\*" -Destination "${{ env.PACKAGENAME }}\arm" - Copy-Item -Verbose -Recurse -Path "build-arm64\install\*" -Destination "${{ env.PACKAGENAME }}\arm64" - 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2019-shared: - needs: [setup] - runs-on: windows-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2019-shared - UseMultiToolTask: true - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v4 - with: - path: "protobuf-install" - key: protobuf-vs2019-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v142,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - cd .. - mkdir build-x64; cd build-x64; - cmake -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x86 - run: | - mkdir build-x86; cd build-x86 - cmake -T v142,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x64 - run: | - mkdir build-x64; cd build-x64 - cmake -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-arm - run: | - mkdir build-arm; cd build-arm - cmake -T v142,host=x64 -A arm -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . 
--config Release --target install - - name: build-arm64 - run: | - mkdir build-arm64; cd build-arm64 - cmake -T v142,host=x64 -A arm64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: package - run: | - mkdir ${{ env.PACKAGENAME }} - mkdir ${{ env.PACKAGENAME }}/x86 - mkdir ${{ env.PACKAGENAME }}/x64 - mkdir ${{ env.PACKAGENAME }}/arm - mkdir ${{ env.PACKAGENAME }}/arm64 - Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" - Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" - Copy-Item -Verbose -Recurse -Path "build-arm\install\*" -Destination "${{ env.PACKAGENAME }}\arm" - Copy-Item -Verbose -Recurse -Path "build-arm64\install\*" -Destination "${{ env.PACKAGENAME }}\arm64" - 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2022: - needs: [setup] - runs-on: windows-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2022 - UseMultiToolTask: true - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v4 - with: - path: "protobuf-install" - key: protobuf-vs2022-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v143,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - cd .. - mkdir build-x64; cd build-x64; - cmake -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x86 - run: | - mkdir build-x86; cd build-x86 - cmake -T v143,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x64 - run: | - mkdir build-x64; cd build-x64 - cmake -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . 
--config Release --target install - - name: build-arm - run: | - mkdir build-arm; cd build-arm - cmake -T v143,host=x64 -A arm -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-arm64 - run: | - mkdir build-arm64; cd build-arm64 - cmake -T v143,host=x64 -A arm64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install + mkdir build-x64; cd build-x64 + cmake ${{ env.NCNN_CMAKE_OPTIONS }} -A x64 -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" .. + cmake --build . --config Release -j 4 + cmake --build . --config Release --target install + - name: build-arm + if: matrix.opt.toolset-version == 'v142' || matrix.opt.toolset-version == 'v143' + run: | + mkdir build-arm; cd build-arm + cmake ${{ env.NCNN_CMAKE_OPTIONS }} -A arm .. + cmake --build . --config Release -j 4 + cmake --build . --config Release --target install + - name: build-arm64 + if: matrix.opt.toolset-version == 'v142' || matrix.opt.toolset-version == 'v143' + run: | + mkdir build-arm64; cd build-arm64 + cmake ${{ env.NCNN_CMAKE_OPTIONS }} -A arm64 .. + cmake --build . --config Release -j 4 + cmake --build . --config Release --target install - name: package + if: matrix.opt.toolset-version == 'v140' || matrix.opt.toolset-version == 'v141' run: | mkdir ${{ env.PACKAGENAME }} mkdir ${{ env.PACKAGENAME }}/x86 mkdir ${{ env.PACKAGENAME }}/x64 - mkdir ${{ env.PACKAGENAME }}/arm - mkdir ${{ env.PACKAGENAME }}/arm64 Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86" Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64" - Copy-Item -Verbose -Recurse -Path "build-arm\install\*" -Destination "${{ env.PACKAGENAME }}\arm" - Copy-Item -Verbose -Recurse -Path "build-arm64\install\*" -Destination "${{ env.PACKAGENAME }}\arm64" 7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }} - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - windows-vs2022-shared: - needs: [setup] - runs-on: windows-latest - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2022-shared - UseMultiToolTask: true - steps: - - uses: actions/checkout@v4 - with: - submodules: true - - name: cache-protobuf - id: cache-protobuf - uses: actions/cache@v4 - with: - path: "protobuf-install" - key: protobuf-vs2022-x86-x64-install - - name: protobuf - if: steps.cache-protobuf.outputs.cache-hit != 'true' - run: | - Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip - 7z x ./protobuf-3.11.2.zip - cd protobuf-3.11.2 - mkdir build-x86; cd build-x86; - cmake -T v143,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - cd .. 
- mkdir build-x64; cd build-x64; - cmake -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x86 - run: | - mkdir build-x86; cd build-x86 - cmake -T v143,host=x64 -A Win32 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-x64 - run: | - mkdir build-x64; cd build-x64 - cmake -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-arm - run: | - mkdir build-arm; cd build-arm - cmake -T v143,host=x64 -A arm -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . --config Release --target install - - name: build-arm64 - run: | - mkdir build-arm64; cd build-arm64 - cmake -T v143,host=x64 -A arm64 -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=ON .. - cmake --build . --config Release -j 4 - cmake --build . 
--config Release --target install - name: package + if: matrix.opt.toolset-version == 'v142' || matrix.opt.toolset-version == 'v143' run: | mkdir ${{ env.PACKAGENAME }} mkdir ${{ env.PACKAGENAME }}/x86 @@ -3087,30 +2155,49 @@ jobs: with: path: artifacts - - name: create-xcframwork + - name: unzip run: | - mkdir -p ncnn-macos mkdir -p ncnn-ios + mkdir -p ncnn-ios-vulkan mkdir -p ncnn-ios-simulator + mkdir -p ncnn-ios-simulator-vulkan mkdir -p ncnn-mac-catalyst - mkdir -p ncnn-watchos - mkdir -p ncnn-watchos-simulator + mkdir -p ncnn-mac-catalyst-vulkan + mkdir -p ncnn-macos + mkdir -p ncnn-macos-vulkan mkdir -p ncnn-tvos + mkdir -p ncnn-tvos-vulkan mkdir -p ncnn-tvos-simulator + mkdir -p ncnn-tvos-simulator-vulkan mkdir -p ncnn-visionos + mkdir -p ncnn-visionos-vulkan mkdir -p ncnn-visionos-simulator + mkdir -p ncnn-visionos-simulator-vulkan + mkdir -p ncnn-watchos + mkdir -p ncnn-watchos-simulator - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-macos/ncnn-${{ needs.setup.outputs.VERSION }}-macos.zip -d ncnn-macos unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-ios/ncnn-${{ needs.setup.outputs.VERSION }}-ios.zip -d ncnn-ios + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-ios-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-ios-vulkan.zip -d ncnn-ios-vulkan unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-ios-simulator/ncnn-${{ needs.setup.outputs.VERSION }}-ios-simulator.zip -d ncnn-ios-simulator + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-ios-simulator-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-ios-simulator-vulkan.zip -d ncnn-ios-simulator-vulkan unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-mac-catalyst/ncnn-${{ needs.setup.outputs.VERSION }}-mac-catalyst.zip -d ncnn-mac-catalyst - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-watchos/ncnn-${{ needs.setup.outputs.VERSION }}-watchos.zip -d ncnn-watchos - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-watchos-simulator/ncnn-${{ needs.setup.outputs.VERSION }}-watchos-simulator.zip -d ncnn-watchos-simulator + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-mac-catalyst-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-mac-catalyst-vulkan.zip -d ncnn-mac-catalyst-vulkan + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-macos/ncnn-${{ needs.setup.outputs.VERSION }}-macos.zip -d ncnn-macos + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-macos-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-macos-vulkan.zip -d ncnn-macos-vulkan unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-tvos/ncnn-${{ needs.setup.outputs.VERSION }}-tvos.zip -d ncnn-tvos + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-vulkan.zip -d ncnn-tvos-vulkan unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-simulator/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-simulator.zip -d ncnn-tvos-simulator + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-simulator-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-simulator-vulkan.zip -d ncnn-tvos-simulator-vulkan unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-visionos/ncnn-${{ needs.setup.outputs.VERSION }}-visionos.zip -d ncnn-visionos + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-visionos-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-visionos-vulkan.zip -d ncnn-visionos-vulkan unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-visionos-simulator/ncnn-${{ 
needs.setup.outputs.VERSION }}-visionos-simulator.zip -d ncnn-visionos-simulator + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-visionos-simulator-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-visionos-simulator-vulkan.zip -d ncnn-visionos-simulator-vulkan + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-watchos/ncnn-${{ needs.setup.outputs.VERSION }}-watchos.zip -d ncnn-watchos + unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-watchos-simulator/ncnn-${{ needs.setup.outputs.VERSION }}-watchos-simulator.zip -d ncnn-watchos-simulator + - name: create-xcframwork + run: | + rm -rf openmp.xcframework xcodebuild -create-xcframework \ -framework ncnn-macos/openmp.framework \ -framework ncnn-ios/openmp.framework \ @@ -3124,6 +2211,7 @@ jobs: -framework ncnn-visionos-simulator/openmp.framework \ -output openmp.xcframework + rm -rf ncnn.xcframework xcodebuild -create-xcframework \ -framework ncnn-macos/ncnn.framework \ -framework ncnn-ios/ncnn.framework \ @@ -3139,48 +2227,9 @@ jobs: rm -f ${{ env.PACKAGENAME }}.zip zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.xcframework ncnn.xcframework - - name: upload-zip - uses: actions/upload-artifact@v4 - with: - name: ${{ env.PACKAGENAME }} - path: ${{ env.PACKAGENAME }}.zip - - apple-gpu: - needs: [setup, macos-gpu, ios-gpu, ios-simulator-gpu, mac-catalyst-gpu, watchos, watchos-simulator, tvos-gpu, tvos-simulator-gpu, visionos, visionos-simulator] - runs-on: macos-13 - env: - PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-apple-vulkan - steps: - - run: sudo xcode-select --switch /Applications/Xcode_15.2.app - - name: download - uses: actions/download-artifact@v4 - with: - path: artifacts - - - name: create-xcframwork + - name: create-xcframwork-vulkan run: | - mkdir -p ncnn-macos-vulkan - mkdir -p ncnn-ios-vulkan - mkdir -p ncnn-ios-simulator-vulkan - mkdir -p ncnn-mac-catalyst-vulkan - mkdir -p ncnn-watchos - mkdir -p ncnn-watchos-simulator - mkdir -p ncnn-tvos-vulkan - mkdir -p ncnn-tvos-simulator-vulkan - mkdir -p ncnn-visionos - mkdir -p ncnn-visionos-simulator - - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-macos-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-macos-vulkan.zip -d ncnn-macos-vulkan - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-ios-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-ios-vulkan.zip -d ncnn-ios-vulkan - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-ios-simulator-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-ios-simulator-vulkan.zip -d ncnn-ios-simulator-vulkan - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-mac-catalyst-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-mac-catalyst-vulkan.zip -d ncnn-mac-catalyst-vulkan - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-watchos/ncnn-${{ needs.setup.outputs.VERSION }}-watchos.zip -d ncnn-watchos - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-watchos-simulator/ncnn-${{ needs.setup.outputs.VERSION }}-watchos-simulator.zip -d ncnn-watchos-simulator - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-vulkan.zip -d ncnn-tvos-vulkan - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-simulator-vulkan/ncnn-${{ needs.setup.outputs.VERSION }}-tvos-simulator-vulkan.zip -d ncnn-tvos-simulator-vulkan - unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-visionos/ncnn-${{ needs.setup.outputs.VERSION }}-visionos.zip -d ncnn-visionos - unzip -q artifacts/ncnn-${{ 
needs.setup.outputs.VERSION }}-visionos-simulator/ncnn-${{ needs.setup.outputs.VERSION }}-visionos-simulator.zip -d ncnn-visionos-simulator - + rm -rf openmp.xcframework xcodebuild -create-xcframework \ -framework ncnn-macos-vulkan/openmp.framework \ -framework ncnn-ios-vulkan/openmp.framework \ @@ -3194,6 +2243,7 @@ jobs: -framework ncnn-visionos-simulator/openmp.framework \ -output openmp.xcframework + rm -rf glslang.xcframework xcodebuild -create-xcframework \ -framework ncnn-macos-vulkan/glslang.framework \ -framework ncnn-ios-vulkan/glslang.framework \ @@ -3201,8 +2251,11 @@ jobs: -framework ncnn-mac-catalyst-vulkan/glslang.framework \ -framework ncnn-tvos-vulkan/glslang.framework \ -framework ncnn-tvos-simulator-vulkan/glslang.framework \ + -framework ncnn-visionos-vulkan/glslang.framework \ + -framework ncnn-visionos-simulator-vulkan/glslang.framework \ -output glslang.xcframework + rm -rf ncnn.xcframework xcodebuild -create-xcframework \ -framework ncnn-macos-vulkan/ncnn.framework \ -framework ncnn-ios-vulkan/ncnn.framework \ @@ -3212,22 +2265,27 @@ jobs: -framework ncnn-watchos-simulator/ncnn.framework \ -framework ncnn-tvos-vulkan/ncnn.framework \ -framework ncnn-tvos-simulator-vulkan/ncnn.framework \ - -framework ncnn-visionos/ncnn.framework \ - -framework ncnn-visionos-simulator/ncnn.framework \ + -framework ncnn-visionos-vulkan/ncnn.framework \ + -framework ncnn-visionos-simulator-vulkan/ncnn.framework \ -output ncnn.xcframework - rm -f ${{ env.PACKAGENAME }}.zip - zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.xcframework glslang.xcframework ncnn.xcframework + rm -f ${{ env.PACKAGENAME }}-vulkan.zip + zip -9 -y -r ${{ env.PACKAGENAME }}-vulkan.zip openmp.xcframework glslang.xcframework ncnn.xcframework - name: upload-zip uses: actions/upload-artifact@v4 with: name: ${{ env.PACKAGENAME }} path: ${{ env.PACKAGENAME }}.zip + - name: upload-zip-vulkan + uses: actions/upload-artifact@v4 + with: + name: ${{ env.PACKAGENAME }}-vulkan + path: ${{ env.PACKAGENAME }}-vulkan.zip release: permissions: contents: write # for softprops/action-gh-release to create a release - needs: [setup, full-source, ubuntu-2004, ubuntu-2004-shared, ubuntu-2204, ubuntu-2204-shared, macos, macos-gpu, ios, ios-gpu, ios-simulator, ios-simulator-gpu, mac-catalyst, mac-catalyst-gpu, watchos, watchos-simulator, tvos, tvos-simulator, android, android-shared, android-gpu, android-gpu-shared, webassembly, windows-vs2015, windows-vs2015-shared, windows-vs2017, windows-vs2017-shared, windows-vs2019, windows-vs2019-shared, windows-vs2022, windows-vs2022-shared, apple, apple-gpu] + needs: [setup, full-source, ubuntu, macos, ios, ios-simulator, mac-catalyst, watchos, watchos-simulator, tvos, tvos-simulator, android, webassembly, windows, apple] runs-on: ubuntu-latest steps: - name: download From c46278d0bb32914c438af3db86d0671402c87c67 Mon Sep 17 00:00:00 2001 From: nihui Date: Wed, 14 Aug 2024 11:51:39 +0800 Subject: [PATCH 19/38] pnnx convert onnx resize with roi, torch.max torch.min with dim returns tuple (#5627) * pnnx convert onnx resize with roi, torch.max torch.min with dim returns tuple * torch max min only support single dim --- tools/pnnx/src/ir.cpp | 9 + tools/pnnx/src/pass_level2/F_interpolate.cpp | 177 +++++++++--------- tools/pnnx/src/pass_level2/torch_max.cpp | 13 +- tools/pnnx/src/pass_level2/torch_min.cpp | 13 +- tools/pnnx/src/pass_ncnn/torch_max.cpp | 16 ++ tools/pnnx/src/pass_ncnn/torch_min.cpp | 16 ++ .../pass_onnx/fuse_constant_as_attribute.cpp | 1 + 7 files changed, 155 insertions(+), 90 
deletions(-) diff --git a/tools/pnnx/src/ir.cpp b/tools/pnnx/src/ir.cpp index cacd84fde79..8b2b6dfd2d7 100644 --- a/tools/pnnx/src/ir.cpp +++ b/tools/pnnx/src/ir.cpp @@ -2111,6 +2111,15 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath) fprintf(pyfp, ", "); } + if (op->type == "torch.max" || op->type == "torch.min") + { + if (op->has_param("dim") && op->outputs.size() == 1) + { + // torch.max and torch.min with dim returns tuple + fprintf(pyfp, ", _"); + } + } + if (op->type.substr(0, 7) == "Tensor.") { if (op->type == "Tensor.fill") diff --git a/tools/pnnx/src/pass_level2/F_interpolate.cpp b/tools/pnnx/src/pass_level2/F_interpolate.cpp index b93bd2df6c8..119842b1c78 100644 --- a/tools/pnnx/src/pass_level2/F_interpolate.cpp +++ b/tools/pnnx/src/pass_level2/F_interpolate.cpp @@ -1005,7 +1005,7 @@ class F_interpolate_onnx : public GraphRewriterPass return R"PNNXIR(7767517 3 2 pnnx.Input input 0 1 input -Resize op_0 1 1 input out sizes=%sizes coordinate_transformation_mode=%coordinate_transformation_mode mode=%mode nearest_mode=floor cubic_coeff_a=* +Resize op_0 1 1 input out %*=%* pnnx.Output output 1 0 out )PNNXIR"; } @@ -1017,104 +1017,69 @@ pnnx.Output output 1 0 out bool match(const std::map& matched_operators, const std::map& captured_params, const std::map& /*captured_attrs*/) const { - if (captured_params.at("sizes").type != 5) + if (captured_params.find("op_0.coordinate_transformation_mode") == captured_params.end()) return false; - const std::vector& sizes = captured_params.at("sizes").ai; - - if (sizes.size() < 3 || sizes.size() > 5) + if (captured_params.at("op_0.coordinate_transformation_mode").type != 4) return false; - const std::vector& input_shape = matched_operators.at("op_0")->inputs[0]->shape; - if (input_shape.size() < 3 || input_shape.size() > 5) + if (captured_params.find("op_0.mode") == captured_params.end()) return false; - if (input_shape[0] != sizes[0] || input_shape[1] != sizes[1]) + if (captured_params.at("op_0.mode").type != 4) return false; - return true; - } - - void write(Operator* op, const std::map& captured_params) const - { - const std::string& coordinate_transformation_mode = captured_params.at("coordinate_transformation_mode").s; - std::string mode = captured_params.at("mode").s; - const std::vector& sizes = captured_params.at("sizes").ai; - - if (mode == "linear") + if (captured_params.find("op_0.nearest_mode") != captured_params.end()) { - if (coordinate_transformation_mode == "half_pixel") - op->params["align_corners"] = false; - if (coordinate_transformation_mode == "align_corners") - op->params["align_corners"] = true; - - if (sizes.size() == 4) - mode = "bilinear"; - if (sizes.size() == 5) - mode = "trilinear"; + if (captured_params.at("op_0.nearest_mode").type != 4 || captured_params.at("op_0.nearest_mode").s != "floor") + return false; } - if (mode == "cubic") + if (captured_params.find("op_0.roi") != captured_params.end()) { - if (coordinate_transformation_mode == "half_pixel") - op->params["align_corners"] = false; - if (coordinate_transformation_mode == "align_corners") - op->params["align_corners"] = true; - - mode = "bicubic"; + if (captured_params.at("op_0.roi").type != 6 || !captured_params.at("op_0.roi").ai.empty()) + return false; } - op->params["mode"] = mode; - if (sizes.size() == 3) - op->params["size"] = {sizes[2]}; - if (sizes.size() == 4) - op->params["size"] = {sizes[2], sizes[3]}; - if (sizes.size() == 5) - op->params["size"] = {sizes[2], sizes[3], sizes[4]}; - } }; + if
(captured_params.find("op_0.sizes") == captured_params.end() && captured_params.find("op_0.scales") == captured_params.end()) + return false; -REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_interpolate_onnx, 10) + if (captured_params.find("op_0.sizes") != captured_params.end() && captured_params.at("op_0.sizes").type == 5 && !captured_params.at("op_0.sizes").ai.empty()) + { + const std::vector& sizes = captured_params.at("op_0.sizes").ai; -class F_interpolate_onnx_1 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -3 2 -pnnx.Input input 0 1 input -Resize op_0 1 1 input out scales=%scales coordinate_transformation_mode=%coordinate_transformation_mode mode=%mode nearest_mode=floor cubic_coeff_a=* -pnnx.Output output 1 0 out -)PNNXIR"; - } + if (sizes.size() < 3 || sizes.size() > 5) + return false; - const char* type_str() const - { - return "F.interpolate"; - } + const std::vector& input_shape = matched_operators.at("op_0")->inputs[0]->shape; + if (input_shape.size() < 3 || input_shape.size() > 5) + return false; - bool match(const std::map& captured_params) const - { - if (captured_params.at("scales").type != 6) - return false; - - const std::vector& scales = captured_params.at("scales").af; + if (input_shape[0] != sizes[0] || input_shape[1] != sizes[1]) + return false; + } + else if (captured_params.find("op_0.scales") != captured_params.end() && captured_params.at("op_0.scales").type == 6 && !captured_params.at("op_0.scales").af.empty()) + { + const std::vector& scales = captured_params.at("op_0.scales").af; - if (scales.size() < 3 || scales.size() > 5) - return false; + if (scales.size() < 3 || scales.size() > 5) + return false; - if (scales[0] != 1.f || scales[1] != 1.f) + if (scales[0] != 1.f || scales[1] != 1.f) + return false; + } + else + { return false; + } return true; } void write(Operator* op, const std::map& captured_params) const { - const std::string& coordinate_transformation_mode = captured_params.at("coordinate_transformation_mode").s; - std::string mode = captured_params.at("mode").s; - const std::vector& scales = captured_params.at("scales").af; + const std::string& coordinate_transformation_mode = captured_params.at("op_0.coordinate_transformation_mode").s; + std::string mode = captured_params.at("op_0.mode").s; if (mode == "linear") { @@ -1122,11 +1087,6 @@ pnnx.Output output 1 0 out op->params["align_corners"] = false; if (coordinate_transformation_mode == "align_corners") op->params["align_corners"] = true; - - if (scales.size() == 4) - mode = "bilinear"; - if (scales.size() == 5) - mode = "trilinear"; } if (mode == "cubic") @@ -1135,22 +1095,63 @@ pnnx.Output output 1 0 out op->params["align_corners"] = false; if (coordinate_transformation_mode == "align_corners") op->params["align_corners"] = true; - - mode = "bicubic"; } - op->params["mode"] = mode; - op->params["recompute_scale_factor"] = false; - if (scales.size() == 3) - op->params["scale_factor"] = {scales[2]}; - if (scales.size() == 4) - op->params["scale_factor"] = {scales[2], scales[3]}; - if (scales.size() == 5) - op->params["scale_factor"] = {scales[2], scales[3], scales[4]}; + if (captured_params.find("op_0.sizes") != captured_params.end() && captured_params.at("op_0.sizes").type == 5 && !captured_params.at("op_0.sizes").ai.empty()) + { + const std::vector& sizes = captured_params.at("op_0.sizes").ai; + + if (mode == "linear") + { + if (sizes.size() == 4) + mode = "bilinear"; + if (sizes.size() == 5) + mode = "trilinear"; + } + + if (mode == 
"cubic") + { + mode = "bicubic"; + } + + op->params["mode"] = mode; + if (sizes.size() == 3) + op->params["size"] = {sizes[2]}; + if (sizes.size() == 4) + op->params["size"] = {sizes[2], sizes[3]}; + if (sizes.size() == 5) + op->params["size"] = {sizes[2], sizes[3], sizes[4]}; + } + else if (captured_params.find("op_0.scales") != captured_params.end() && captured_params.at("op_0.scales").type == 6 && !captured_params.at("op_0.scales").af.empty()) + { + const std::vector& scales = captured_params.at("op_0.scales").af; + + if (mode == "linear") + { + if (scales.size() == 4) + mode = "bilinear"; + if (scales.size() == 5) + mode = "trilinear"; + } + + if (mode == "cubic") + { + mode = "bicubic"; + } + + op->params["mode"] = mode; + op->params["recompute_scale_factor"] = false; + if (scales.size() == 3) + op->params["scale_factor"] = {scales[2]}; + if (scales.size() == 4) + op->params["scale_factor"] = {scales[2], scales[3]}; + if (scales.size() == 5) + op->params["scale_factor"] = {scales[2], scales[3], scales[4]}; + } } }; -REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_interpolate_onnx_1, 10) +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_interpolate_onnx, 10) class F_interpolate_onnx_2 : public GraphRewriterPass { diff --git a/tools/pnnx/src/pass_level2/torch_max.cpp b/tools/pnnx/src/pass_level2/torch_max.cpp index b606fed066b..eef7f33b4d0 100644 --- a/tools/pnnx/src/pass_level2/torch_max.cpp +++ b/tools/pnnx/src/pass_level2/torch_max.cpp @@ -78,11 +78,22 @@ pnnx.Output output 1 0 out return "torch.max"; } + bool match(const std::map& captured_params) const + { + if (captured_params.find("op_0.axes") != captured_params.end()) + { + if (captured_params.at("op_0.axes").type != 5 || captured_params.at("op_0.axes").ai.size() != 1) + return false; + } + + return true; + } + void write(Operator* op, const std::map& captured_params) const { if (captured_params.find("op_0.axes") != captured_params.end()) { - op->params["dim"] = captured_params.at("op_0.axes"); + op->params["dim"] = captured_params.at("op_0.axes").ai[0]; if (captured_params.find("op_0.keepdims") != captured_params.end()) { diff --git a/tools/pnnx/src/pass_level2/torch_min.cpp b/tools/pnnx/src/pass_level2/torch_min.cpp index 35cc4988a19..509b858c1c1 100644 --- a/tools/pnnx/src/pass_level2/torch_min.cpp +++ b/tools/pnnx/src/pass_level2/torch_min.cpp @@ -78,11 +78,22 @@ pnnx.Output output 1 0 out return "torch.min"; } + bool match(const std::map& captured_params) const + { + if (captured_params.find("op_0.axes") != captured_params.end()) + { + if (captured_params.at("op_0.axes").type != 5 || captured_params.at("op_0.axes").ai.size() != 1) + return false; + } + + return true; + } + void write(Operator* op, const std::map& captured_params) const { if (captured_params.find("op_0.axes") != captured_params.end()) { - op->params["dim"] = captured_params.at("op_0.axes"); + op->params["dim"] = captured_params.at("op_0.axes").ai[0]; if (captured_params.find("op_0.keepdims") != captured_params.end()) { diff --git a/tools/pnnx/src/pass_ncnn/torch_max.cpp b/tools/pnnx/src/pass_ncnn/torch_max.cpp index 76cd33f239b..95987da5162 100644 --- a/tools/pnnx/src/pass_ncnn/torch_max.cpp +++ b/tools/pnnx/src/pass_ncnn/torch_max.cpp @@ -65,6 +65,22 @@ pnnx.Output output 2 0 out indices REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(torch_max, 20) +class torch_max_0 : public torch_max +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +torch.max op_0 1 1 input out dim=%dim keepdim=%keepdim +pnnx.Output 
output 1 0 out +)PNNXIR"; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(torch_max_0, 20) + class torch_max_1 : public GraphRewriterPass { public: diff --git a/tools/pnnx/src/pass_ncnn/torch_min.cpp b/tools/pnnx/src/pass_ncnn/torch_min.cpp index 49851b443db..3ef2ae47da0 100644 --- a/tools/pnnx/src/pass_ncnn/torch_min.cpp +++ b/tools/pnnx/src/pass_ncnn/torch_min.cpp @@ -65,6 +65,22 @@ pnnx.Output output 2 0 out indices REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(torch_min, 20) +class torch_min_0 : public torch_min +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +torch.min op_0 1 1 input out dim=%dim keepdim=%keepdim +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(torch_min_0, 20) + class torch_min_1 : public GraphRewriterPass { public: diff --git a/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.cpp b/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.cpp index aba88976233..39dc8d80882 100644 --- a/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.cpp +++ b/tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.cpp @@ -43,6 +43,7 @@ static constant_as_attribute caas[] = { {"ReduceProd", 1, "axes"}, {"ReduceSum", 1, "axes"}, {"Reshape", 1, "shape"}, + {"Resize", 1, "roi"}, {"Resize", 2, "scales"}, {"Resize", 3, "sizes"}, {"Slice", 1, "starts"}, From eb6e084c2d6c036e7234d746035eb400c20a1756 Mon Sep 17 00:00:00 2001 From: nihui Date: Wed, 14 Aug 2024 15:51:57 +0800 Subject: [PATCH 20/38] pnnx convert nn.RMSNorm F.rms_norm (#5628) --- tools/pnnx/src/CMakeLists.txt | 3 + tools/pnnx/src/pass_level1/nn_RMSNorm.cpp | 51 ++++++++++++ tools/pnnx/src/pass_level2/F_rms_norm.cpp | 43 +++++++++++ tools/pnnx/src/pass_level5.cpp | 2 + .../src/pass_level5/fuse_static_rmsnorm.cpp | 57 ++++++++++++++ .../src/pass_level5/fuse_static_rmsnorm.h | 21 +++++ .../pnnx/src/pass_ncnn/solve_batch_index.cpp | 4 + tools/pnnx/tests/CMakeLists.txt | 2 + tools/pnnx/tests/test_F_rms_norm.py | 77 +++++++++++++++++++ tools/pnnx/tests/test_nn_RMSNorm.py | 71 +++++++++++++++++ 10 files changed, 331 insertions(+) create mode 100644 tools/pnnx/src/pass_level1/nn_RMSNorm.cpp create mode 100644 tools/pnnx/src/pass_level2/F_rms_norm.cpp create mode 100644 tools/pnnx/src/pass_level5/fuse_static_rmsnorm.cpp create mode 100644 tools/pnnx/src/pass_level5/fuse_static_rmsnorm.h create mode 100644 tools/pnnx/tests/test_F_rms_norm.py create mode 100644 tools/pnnx/tests/test_nn_RMSNorm.py diff --git a/tools/pnnx/src/CMakeLists.txt b/tools/pnnx/src/CMakeLists.txt index c5c6228dee7..9834fabe069 100644 --- a/tools/pnnx/src/CMakeLists.txt +++ b/tools/pnnx/src/CMakeLists.txt @@ -77,6 +77,7 @@ set(pnnx_pass_level1_SRCS pass_level1/nn_ReplicationPad1d.cpp pass_level1/nn_ReplicationPad2d.cpp pass_level1/nn_ReplicationPad3d.cpp + pass_level1/nn_RMSNorm.cpp pass_level1/nn_RNN.cpp pass_level1/nn_RReLU.cpp pass_level1/nn_SELU.cpp @@ -163,6 +164,7 @@ set(pnnx_pass_level2_SRCS pass_level2/F_prelu.cpp pass_level2/F_relu.cpp pass_level2/F_relu6.cpp + pass_level2/F_rms_norm.cpp pass_level2/F_rrelu.cpp pass_level2/F_scaled_dot_product_attention.cpp pass_level2/F_selu.cpp @@ -383,6 +385,7 @@ set(pnnx_pass_level5_SRCS pass_level5/fuse_static_layernorm.cpp pass_level5/fuse_static_linear.cpp pass_level5/fuse_static_prelu.cpp + pass_level5/fuse_static_rmsnorm.cpp pass_level5/normalize_einsum_equation.cpp pass_level5/unroll_rnn_op.cpp ) diff --git a/tools/pnnx/src/pass_level1/nn_RMSNorm.cpp b/tools/pnnx/src/pass_level1/nn_RMSNorm.cpp 
new file mode 100644 index 00000000000..4433f598935 --- /dev/null +++ b/tools/pnnx/src/pass_level1/nn_RMSNorm.cpp @@ -0,0 +1,51 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "pass_level1.h" + +#include "../utils.h" + +namespace pnnx { + +class RMSNorm : public FuseModulePass +{ +public: + const char* match_type_str() const + { + return "__torch__.torch.nn.modules.normalization.RMSNorm"; + } + + const char* type_str() const + { + return "nn.RMSNorm"; + } + + void write(Operator* op, const std::shared_ptr& graph, const torch::jit::Module& mod) const + { + const torch::jit::Node* rmsn = find_node_by_kind(graph, "aten::rms_norm"); + + op->params["normalized_shape"] = rmsn->namedInput("normalized_shape"); + op->params["eps"] = rmsn->namedInput("eps"); + op->params["elementwise_affine"] = mod.hasattr("weight") && mod.hasattr("bias"); + + if (mod.hasattr("weight")) + { + op->attrs["weight"] = mod.attr("weight").toTensor(); + } + } +}; + +REGISTER_GLOBAL_PNNX_FUSE_MODULE_PASS(RMSNorm) + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/F_rms_norm.cpp b/tools/pnnx/src/pass_level2/F_rms_norm.cpp new file mode 100644 index 00000000000..aaa1813c563 --- /dev/null +++ b/tools/pnnx/src/pass_level2/F_rms_norm.cpp @@ -0,0 +1,43 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
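+// Matches the aten::rms_norm subgraph produced by torch.jit tracing, with eps
+// captured from its prim::Constant input, and rewrites it to a single
+// F.rms_norm operator.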
+ +#include "pass_level2.h" + +namespace pnnx { + +class F_rms_norm : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +6 5 +pnnx.Input input_0 0 1 input +pnnx.Input input_1 0 1 weight +pnnx.Input input_2 0 1 normalized_shape +prim::Constant op_0 0 1 eps value=%eps +aten::rms_norm op_1 4 1 input normalized_shape weight eps out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "F.rms_norm"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_rms_norm, 10) + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5.cpp b/tools/pnnx/src/pass_level5.cpp index 4903f185117..8bb3270aa2c 100644 --- a/tools/pnnx/src/pass_level5.cpp +++ b/tools/pnnx/src/pass_level5.cpp @@ -60,6 +60,7 @@ #include "pass_level5/fuse_static_layernorm.h" #include "pass_level5/fuse_static_linear.h" #include "pass_level5/fuse_static_prelu.h" +#include "pass_level5/fuse_static_rmsnorm.h" #include "pass_level5/normalize_einsum_equation.h" #include "pass_level4/dead_code_elimination.h" #include "pass_level4/canonicalize.h" @@ -102,6 +103,7 @@ void pass_level5(Graph& g, const std::set& foldable_constants, cons fuse_static_groupnorm(g); fuse_static_instancenorm(g); fuse_static_layernorm(g); + fuse_static_rmsnorm(g); fuse_static_conv(g); fuse_static_convtranspose(g); diff --git a/tools/pnnx/src/pass_level5/fuse_static_rmsnorm.cpp b/tools/pnnx/src/pass_level5/fuse_static_rmsnorm.cpp new file mode 100644 index 00000000000..ed68c026d30 --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_static_rmsnorm.cpp @@ -0,0 +1,57 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
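+// Fuses F.rms_norm whose weight comes from a constant pnnx.Attribute into an
+// nn.RMSNorm module with elementwise_affine=True, carrying over
+// normalized_shape, eps and the captured weight data.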
+ +#include "fuse_static_rmsnorm.h" + +#include "pass_level2.h" + +#include +#include + +namespace pnnx { + +class fuse_static_Frmsnorm_pass : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +pnnx.Attribute op_weight 0 1 weight @data +F.rms_norm op_0 2 1 input weight out normalized_shape=%normalized_shape eps=%eps +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* replace_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +nn.RMSNorm rmsn 1 1 input out normalized_shape=%normalized_shape eps=%eps elementwise_affine=True @weight=%op_weight.data +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +void fuse_static_rmsnorm(Graph& graph) +{ + fuse_static_Frmsnorm_pass a; + int opindex = 0; + + pnnx_graph_rewrite(graph, &a, opindex); +} + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_static_rmsnorm.h b/tools/pnnx/src/pass_level5/fuse_static_rmsnorm.h new file mode 100644 index 00000000000..c88b703cb07 --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_static_rmsnorm.h @@ -0,0 +1,21 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
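+// Entry point of the static RMSNorm fusion, invoked from pass_level5.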
+ +#include "ir.h" + +namespace pnnx { + +void fuse_static_rmsnorm(Graph& graph); + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp b/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp index 4b1100789fc..d4532422b52 100644 --- a/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp +++ b/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp @@ -46,6 +46,7 @@ static bool is_known_operator_with_batch_index_0(const Operator* op) "F.group_norm", "F.instance_norm", "F.interpolate", + "F.layer_norm", "F.linear", "F.local_response_norm", "F.lp_pool1d", @@ -56,6 +57,7 @@ static bool is_known_operator_with_batch_index_0(const Operator* op) "F.pixel_shuffle", "F.pixel_unshuffle", "F.prelu", + "F.rms_norm", "F.scaled_dot_product_attention", "F.unfold", "F.upsample_bilinear", @@ -91,6 +93,7 @@ static bool is_known_operator_with_batch_index_0(const Operator* op) "nn.InstanceNorm2d", "nn.InstanceNorm3d", "nn.LocalResponseNorm", + "nn.LayerNorm", "nn.LPPool1d", "nn.LPPool2d", "nn.MaxPool1d", @@ -104,6 +107,7 @@ static bool is_known_operator_with_batch_index_0(const Operator* op) "nn.ReplicationPad1d", "nn.ReplicationPad2d", "nn.ReplicationPad3d", + "nn.RMSNorm", "nn.Softmax2d", "nn.Unfold", "nn.Upsample", diff --git a/tools/pnnx/tests/CMakeLists.txt b/tools/pnnx/tests/CMakeLists.txt index a5522a70bb2..daf5501e9d8 100644 --- a/tools/pnnx/tests/CMakeLists.txt +++ b/tools/pnnx/tests/CMakeLists.txt @@ -61,6 +61,7 @@ pnnx_add_test(F_pixel_unshuffle) pnnx_add_test(F_prelu) pnnx_add_test(F_relu) pnnx_add_test(F_relu6) +pnnx_add_test(F_rms_norm) pnnx_add_test(F_rrelu) pnnx_add_test(F_scaled_dot_product_attention) pnnx_add_test(F_selu) @@ -145,6 +146,7 @@ pnnx_add_test(nn_ReLU6) pnnx_add_test(nn_ReplicationPad1d) pnnx_add_test(nn_ReplicationPad2d) pnnx_add_test(nn_ReplicationPad3d) +pnnx_add_test(nn_RMSNorm) pnnx_add_test(nn_RNN) pnnx_add_test(nn_RReLU) pnnx_add_test(nn_SELU) diff --git a/tools/pnnx/tests/test_F_rms_norm.py b/tools/pnnx/tests/test_F_rms_norm.py new file mode 100644 index 00000000000..5dd9e699b23 --- /dev/null +++ b/tools/pnnx/tests/test_F_rms_norm.py @@ -0,0 +1,77 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
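+# Rough reference for what F.rms_norm computes over the trailing
+# normalized_shape dims (1-d case shown; a sketch, not executed by this test):
+#   rms = x.pow(2).mean(dim=-1, keepdim=True).add(eps).rsqrt()
+#   out = x * rms if weight is None else x * rms * weight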
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.w3 = nn.Parameter(torch.rand(24)) + self.w4 = nn.Parameter(torch.rand(12, 16)) + self.w5 = nn.Parameter(torch.rand(24)) + + def forward(self, x, y, z, w0, w1, w2): + x = F.rms_norm(x, (24,), w0) + x = F.rms_norm(x, (12,24), None) + x = F.rms_norm(x, (24,), self.w3) + + y = F.rms_norm(y, (16,), None, eps=1e-3) + y = F.rms_norm(y, (12,16), w1) + y = F.rms_norm(y, (12,16), self.w4) + + z = F.rms_norm(z, (24,), w2) + z = F.rms_norm(z, (12,16,24), None, eps=1e-2) + z = F.rms_norm(z, (24,), self.w5) + return x, y, z + +def test(): + if version.parse(torch.__version__) < version.parse('2.4'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12, 24) + y = torch.rand(2, 3, 12, 16) + z = torch.rand(1, 10, 12, 16, 24) + w0 = torch.rand(24) + w1 = torch.rand(12, 16) + w2 = torch.rand(24) + + a0, a1, a2 = net(x, y, z, w0, w1, w2) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z, w0, w1, w2)) + mod.save("test_F_rms_norm.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_F_rms_norm.pt inputshape=[1,12,24],[2,3,12,16],[1,10,12,16,24],[24],[12,16],[24]") + + # pnnx inference + import test_F_rms_norm_pnnx + b0, b1, b2 = test_F_rms_norm_pnnx.test_inference() + + return torch.equal(a0, b0) and torch.equal(a1, b1) and torch.equal(a2, b2) + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_nn_RMSNorm.py b/tools/pnnx/tests/test_nn_RMSNorm.py new file mode 100644 index 00000000000..a9b70cdb266 --- /dev/null +++ b/tools/pnnx/tests/test_nn_RMSNorm.py @@ -0,0 +1,71 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from packaging import version
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+        self.rmsn_0 = nn.RMSNorm(64)
+        self.rmsn_0.weight = nn.Parameter(torch.rand(64))
+        self.rmsn_1 = nn.RMSNorm(normalized_shape=(24,64), eps=1e-2, elementwise_affine=False)
+
+    def forward(self, x, y, z):
+        x = self.rmsn_0(x)
+        x = self.rmsn_1(x)
+
+        y = self.rmsn_0(y)
+        y = self.rmsn_1(y)
+
+        z = self.rmsn_0(z)
+        z = self.rmsn_1(z)
+        return x, y, z
+
+def test():
+    if version.parse(torch.__version__) < version.parse('2.4'):
+        return True
+
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 24, 64)
+    y = torch.rand(1, 12, 24, 64)
+    z = torch.rand(1, 12, 16, 24, 64)
+
+    a0, a1, a2 = net(x, y, z)
+
+    # export torchscript
+    mod = torch.jit.trace(net, (x, y, z))
+    mod.save("test_nn_RMSNorm.pt")
+
+    # torchscript to pnnx
+    import os
+    os.system("../src/pnnx test_nn_RMSNorm.pt inputshape=[1,24,64],[1,12,24,64],[1,12,16,24,64]")
+
+    # pnnx inference
+    import test_nn_RMSNorm_pnnx
+    b0, b1, b2 = test_nn_RMSNorm_pnnx.test_inference()
+
+    return torch.equal(a0, b0) and torch.equal(a1, b1) and torch.equal(a2, b2)
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
From abad90cb1cc9d2b5d47a22576bdcb391613ca209 Mon Sep 17 00:00:00 2001
From: nihui
Date: Wed, 14 Aug 2024 17:23:34 +0800
Subject: [PATCH 21/38] pnnx drop torch.max torch.min indice node if not used
 (#5629)

---
 tools/pnnx/src/pass_level2/torch_max.cpp | 12 ++++++++++++
 tools/pnnx/src/pass_level2/torch_min.cpp | 12 ++++++++++++
 2 files changed, 24 insertions(+)

diff --git a/tools/pnnx/src/pass_level2/torch_max.cpp b/tools/pnnx/src/pass_level2/torch_max.cpp
index eef7f33b4d0..5a993d6f55e 100644
--- a/tools/pnnx/src/pass_level2/torch_max.cpp
+++ b/tools/pnnx/src/pass_level2/torch_max.cpp
@@ -35,6 +35,18 @@ pnnx.Output output 2 0 out indices
     {
         return "torch.max";
     }
+
+    void write(Operator* op, const std::map<std::string, Parameter>& captured_params) const
+    {
+        GraphRewriterPass::write(op, captured_params);
+
+        // drop indices if not used
+        if (op->outputs[1]->consumers.empty())
+        {
+            op->outputs[1]->producer = 0;
+            op->outputs.resize(1);
+        }
+    }
 };
 
 REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(torch_max, 20)
diff --git a/tools/pnnx/src/pass_level2/torch_min.cpp b/tools/pnnx/src/pass_level2/torch_min.cpp
index 509b858c1c1..fa174614e01 100644
--- a/tools/pnnx/src/pass_level2/torch_min.cpp
+++ b/tools/pnnx/src/pass_level2/torch_min.cpp
@@ -35,6 +35,18 @@ pnnx.Output output 2 0 out indices
     {
         return "torch.min";
     }
+
+    void write(Operator* op, const std::map<std::string, Parameter>& captured_params) const
+    {
+        GraphRewriterPass::write(op, captured_params);
+
+        // drop indices if not used
+        if (op->outputs[1]->consumers.empty())
+        {
+            op->outputs[1]->producer = 0;
+            op->outputs.resize(1);
+        }
+    }
 };
 
 REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(torch_min, 20)
From fdf0df3079f779a19ffb0ef8b68e59916fbad1d8 Mon Sep 17 00:00:00 2001
From: nihui
Date: Thu, 15 Aug 2024 16:52:06 +0800
Subject: [PATCH 22/38] RMSNorm (#5630)

---
 docs/developer-guide/operators.md         |  21 +++
 src/CMakeLists.txt                        |   1 +
 src/layer/rmsnorm.cpp                     | 200 +++++++++++++++++++++
 src/layer/rmsnorm.h                       |  43 +++++
 tests/CMakeLists.txt                      |   1 +
 tests/test_rmsnorm.cpp                    | 121 +++++++++++++
 tools/pnnx/src/CMakeLists.txt             |   2 +
 tools/pnnx/src/pass_level1/nn_RMSNorm.cpp |   2 +-
 tools/pnnx/src/pass_ncnn/F_rms_norm.cpp   |  65 +++++++
 tools/pnnx/src/pass_ncnn/nn_RMSNorm.cpp   |  70 ++++++++
 tools/pnnx/tests/ncnn/CMakeLists.txt      |   2 +
 tools/pnnx/tests/ncnn/test_F_layer_norm.py |  6 +-
 tools/pnnx/tests/ncnn/test_F_rms_norm.py   | 68 +++++++
 tools/pnnx/tests/ncnn/test_nn_LayerNorm.py |  6 +-
 tools/pnnx/tests/ncnn/test_nn_RMSNorm.py   | 68 +++++++
 15 files changed, 669 insertions(+), 7 deletions(-)
 create mode 100644 src/layer/rmsnorm.cpp
 create mode 100644 src/layer/rmsnorm.h
 create mode 100644 tests/test_rmsnorm.cpp
 create mode 100644 tools/pnnx/src/pass_ncnn/F_rms_norm.cpp
 create mode 100644 tools/pnnx/src/pass_ncnn/nn_RMSNorm.cpp
 create mode 100644 tools/pnnx/tests/ncnn/test_F_rms_norm.py
 create mode 100644 tools/pnnx/tests/ncnn/test_nn_RMSNorm.py

diff --git a/docs/developer-guide/operators.md b/docs/developer-guide/operators.md
index 05996f8d735..7594c0843ac 100644
--- a/docs/developer-guide/operators.md
+++ b/docs/developer-guide/operators.md
@@ -71,6 +71,7 @@
 * [Reorg](#reorg)
 * [Requantize](#requantize)
 * [Reshape](#reshape)
+* [RMSNorm](#rmsnorm)
 * [RNN](#rnn)
 * [Scale](#scale)
 * [SELU](#selu)
@@ -1670,6 +1671,26 @@ Reshape flag:
 - -1 = remaining
 - -233 = drop this dim(default)
 
+# RMSNorm
+```
+split x along the outermost axis into parts x0, x1 ...
+root mean square normalize each part x0, x1 ...
+y = x * gamma elementwise
+```
+
+* one_blob_only
+* support_inplace
+
+| param id | name | type | default | description |
+| --------- | ------------- | ----- | --------- | ----------------- |
+| 0 | affine_size | int | 0 | |
+| 1 | eps | float | 0.001f | x = x / sqrt(mean(x^2) + eps) |
+| 2 | affine | int | 1 | |
+
+| weight | type | shape |
+| ------------- | ----- | --------------------- |
+| gamma_data | float | [affine_size] |
+
 # RNN
 Apply a single-layer RNN to a feature sequence of `T` timesteps. The input blob shape is `[w=input_size, h=T]` and the output blob shape is `[w=num_output, h=T]`.
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index d3f55ce7790..803c34a780d 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -166,6 +166,7 @@ ncnn_add_layer(Erf)
 ncnn_add_layer(Diag)
 ncnn_add_layer(CELU)
 ncnn_add_layer(Shrink)
+ncnn_add_layer(RMSNorm)
 
 if(NCNN_VULKAN)
     ncnn_add_shader(${CMAKE_CURRENT_SOURCE_DIR}/convert_ycbcr.comp)
diff --git a/src/layer/rmsnorm.cpp b/src/layer/rmsnorm.cpp
new file mode 100644
index 00000000000..77c74c6bccb
--- /dev/null
+++ b/src/layer/rmsnorm.cpp
@@ -0,0 +1,200 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
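+
+// A minimal scalar sketch of what forward_inplace computes for one
+// normalized group x[0..n-1] (gamma applies only when affine != 0):
+//
+//   rms  = sqrt(sum(x[i] * x[i]) / n + eps)
+//   y[i] = x[i] / rms * gamma[i]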
+ +#include "rmsnorm.h" + +namespace ncnn { + +RMSNorm::RMSNorm() +{ + one_blob_only = true; + support_inplace = true; +} + +int RMSNorm::load_param(const ParamDict& pd) +{ + affine_size = pd.get(0, 0); + eps = pd.get(1, 0.001f); + affine = pd.get(2, 1); + + return 0; +} + +int RMSNorm::load_model(const ModelBin& mb) +{ + if (affine == 0) + return 0; + + gamma_data = mb.load(affine_size, 1); + if (gamma_data.empty()) + return -100; + + return 0; +} + +int RMSNorm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + // x = x / sqrt(rms + eps) * gamma + + int dims = bottom_top_blob.dims; + + if (dims == 1) + { + int w = bottom_top_blob.w; + // assert affine_size == w + + float* ptr = bottom_top_blob; + + float sqsum = 0.f; + for (int i = 0; i < w; i++) + { + sqsum += ptr[i] * ptr[i]; + } + float rms = sqrtf(sqsum / w + eps); + + float a = 1.f / rms; + + if (affine) + { + for (int i = 0; i < w; i++) + { + ptr[i] = (ptr[i] * a) * gamma_data[i]; + } + } + else + { + for (int i = 0; i < w; i++) + { + ptr[i] = ptr[i] * a; + } + } + } + + if (dims == 2) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + // assert affine_size == w + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.row(i); + + float sqsum = 0.f; + for (int j = 0; j < w; j++) + { + sqsum += ptr[j] * ptr[j]; + } + float rms = sqrtf(sqsum / w + eps); + + float a = 1.f / rms; + + if (affine) + { + for (int j = 0; j < w; j++) + { + ptr[j] = (ptr[j] * a) * gamma_data[j]; + } + } + else + { + for (int j = 0; j < w; j++) + { + ptr[j] = ptr[j] * a; + } + } + } + } + + if (dims == 3) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int channels = bottom_top_blob.c; + int size = w * h; + + if (affine_size == w) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.channel(q).row(i); + + float sqsum = 0.f; + for (int j = 0; j < w; j++) + { + sqsum += ptr[j] * ptr[j]; + } + float rms = sqrtf(sqsum / w + eps); + + float a = 1.f / rms; + + if (affine) + { + for (int j = 0; j < w; j++) + { + ptr[j] = (ptr[j] * a) * gamma_data[j]; + } + } + else + { + for (int j = 0; j < w; j++) + { + ptr[j] = ptr[j] * a; + } + } + } + } + } + else // if (affine_size == size) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + float sqsum = 0.f; + for (int i = 0; i < size; i++) + { + sqsum += ptr[i] * ptr[i]; + } + float rms = sqrtf(sqsum / size + eps); + + float a = 1.f / rms; + + if (affine) + { + for (int i = 0; i < size; i++) + { + ptr[i] = (ptr[i] * a) * gamma_data[i]; + } + } + else + { + for (int i = 0; i < size; i++) + { + ptr[i] = ptr[i] * a; + } + } + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/rmsnorm.h b/src/layer/rmsnorm.h new file mode 100644 index 00000000000..4a09f2548bd --- /dev/null +++ b/src/layer/rmsnorm.h @@ -0,0 +1,43 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_RMSNORM_H +#define LAYER_RMSNORM_H + +#include "layer.h" + +namespace ncnn { + +class RMSNorm : public Layer +{ +public: + RMSNorm(); + + virtual int load_param(const ParamDict& pd); + + virtual int load_model(const ModelBin& mb); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; + +public: + int affine_size; + float eps; + int affine; + + Mat gamma_data; +}; + +} // namespace ncnn + +#endif // LAYER_RMSNORM_H diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index d30229b870c..6c8939fc7c7 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -141,6 +141,7 @@ ncnn_add_layer_test(ReLU) ncnn_add_layer_test(Reorg) ncnn_add_layer_test(Requantize) ncnn_add_layer_test(Reshape) +ncnn_add_layer_test(RMSNorm) ncnn_add_layer_test(RNN) ncnn_add_layer_test(ROIPooling) ncnn_add_layer_test(ROIAlign) diff --git a/tests/test_rmsnorm.cpp b/tests/test_rmsnorm.cpp new file mode 100644 index 00000000000..2d88c162d8b --- /dev/null +++ b/tests/test_rmsnorm.cpp @@ -0,0 +1,121 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
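+
+// The ParamDict ids below mirror RMSNorm::load_param: 0 = affine_size,
+// 1 = eps, 2 = affine. weights[0] always carries a gamma blob here;
+// when affine == 0 the layer simply skips loading it.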
+ +#include "testutil.h" + +static int test_rmsnorm(const ncnn::Mat& a, int affine_size, float eps, int affine) +{ + ncnn::ParamDict pd; + pd.set(0, affine_size); + pd.set(1, eps); + pd.set(2, affine); + + std::vector weights(1); + weights[0] = RandomMat(affine_size); + + int ret = test_layer("RMSNorm", pd, weights, a); + if (ret != 0) + { + fprintf(stderr, "test_rmsnorm failed a.dims=%d a=(%d %d %d) affine_size=%d eps=%f affine=%d\n", a.dims, a.w, a.h, a.c, affine_size, eps, affine); + } + + return ret; +} + +static int test_rmsnorm_0() +{ + return 0 + || test_rmsnorm(RandomMat(6, 4, 2), 6, 0.01f, 0) + || test_rmsnorm(RandomMat(4, 5, 6), 4, 0.01f, 0) + || test_rmsnorm(RandomMat(3, 3, 8), 3, 0.002f, 0) + || test_rmsnorm(RandomMat(5, 6, 12), 5, 0.02f, 0) + || test_rmsnorm(RandomMat(4, 7, 16), 4, 0.02f, 0) + || test_rmsnorm(RandomMat(6, 7, 24), 6, 0.001f, 0) + || test_rmsnorm(RandomMat(5, 8, 32), 5, 0.001f, 0) + || test_rmsnorm(RandomMat(6, 4, 2), 6, 0.01f, 1) + || test_rmsnorm(RandomMat(4, 5, 6), 4, 0.01f, 1) + || test_rmsnorm(RandomMat(3, 3, 8), 3, 0.002f, 1) + || test_rmsnorm(RandomMat(5, 6, 12), 5, 0.02f, 1) + || test_rmsnorm(RandomMat(4, 7, 16), 4, 0.02f, 1) + || test_rmsnorm(RandomMat(6, 7, 24), 6, 0.001f, 1) + || test_rmsnorm(RandomMat(5, 8, 32), 5, 0.001f, 1); +} + +static int test_rmsnorm_1() +{ + return 0 + || test_rmsnorm(RandomMat(6, 4, 2), 24, 0.01f, 0) + || test_rmsnorm(RandomMat(4, 5, 6), 20, 0.01f, 0) + || test_rmsnorm(RandomMat(3, 3, 8), 9, 0.002f, 0) + || test_rmsnorm(RandomMat(5, 6, 12), 30, 0.02f, 0) + || test_rmsnorm(RandomMat(4, 7, 16), 28, 0.02f, 0) + || test_rmsnorm(RandomMat(6, 7, 24), 42, 0.001f, 0) + || test_rmsnorm(RandomMat(5, 8, 32), 40, 0.001f, 0) + || test_rmsnorm(RandomMat(6, 4, 2), 24, 0.01f, 1) + || test_rmsnorm(RandomMat(4, 5, 6), 20, 0.01f, 1) + || test_rmsnorm(RandomMat(3, 3, 8), 9, 0.002f, 1) + || test_rmsnorm(RandomMat(5, 6, 12), 30, 0.02f, 1) + || test_rmsnorm(RandomMat(4, 7, 16), 28, 0.02f, 1) + || test_rmsnorm(RandomMat(6, 7, 24), 42, 0.001f, 1) + || test_rmsnorm(RandomMat(5, 8, 32), 40, 0.001f, 1); +} + +static int test_rmsnorm_2() +{ + return 0 + || test_rmsnorm(RandomMat(4, 2), 4, 0.01f, 0) + || test_rmsnorm(RandomMat(5, 6), 5, 0.01f, 0) + || test_rmsnorm(RandomMat(3, 8), 3, 0.002f, 0) + || test_rmsnorm(RandomMat(6, 12), 6, 0.02f, 0) + || test_rmsnorm(RandomMat(4, 16), 4, 0.02f, 0) + || test_rmsnorm(RandomMat(7, 24), 7, 0.001f, 0) + || test_rmsnorm(RandomMat(8, 32), 8, 0.001f, 0) + || test_rmsnorm(RandomMat(4, 2), 4, 0.01f, 1) + || test_rmsnorm(RandomMat(5, 6), 5, 0.01f, 1) + || test_rmsnorm(RandomMat(3, 8), 3, 0.002f, 1) + || test_rmsnorm(RandomMat(6, 12), 6, 0.02f, 1) + || test_rmsnorm(RandomMat(4, 16), 4, 0.02f, 1) + || test_rmsnorm(RandomMat(7, 24), 7, 0.001f, 1) + || test_rmsnorm(RandomMat(8, 32), 8, 0.001f, 1); +} + +static int test_rmsnorm_3() +{ + return 0 + || test_rmsnorm(RandomMat(2), 2, 0.01f, 0) + || test_rmsnorm(RandomMat(6), 6, 0.01f, 0) + || test_rmsnorm(RandomMat(8), 8, 0.002f, 0) + || test_rmsnorm(RandomMat(12), 12, 0.02f, 0) + || test_rmsnorm(RandomMat(16), 16, 0.02f, 0) + || test_rmsnorm(RandomMat(24), 24, 0.001f, 0) + || test_rmsnorm(RandomMat(32), 32, 0.001f, 0) + || test_rmsnorm(RandomMat(2), 2, 0.01f, 1) + || test_rmsnorm(RandomMat(6), 6, 0.01f, 1) + || test_rmsnorm(RandomMat(8), 8, 0.002f, 1) + || test_rmsnorm(RandomMat(12), 12, 0.02f, 1) + || test_rmsnorm(RandomMat(16), 16, 0.02f, 1) + || test_rmsnorm(RandomMat(24), 24, 0.001f, 1) + || test_rmsnorm(RandomMat(32), 32, 0.001f, 1); +} + +int main() +{ + SRAND(7767517); + + 
+    return 0
+           || test_rmsnorm_0()
+           || test_rmsnorm_1()
+           || test_rmsnorm_2()
+           || test_rmsnorm_3();
+}
diff --git a/tools/pnnx/src/CMakeLists.txt b/tools/pnnx/src/CMakeLists.txt
index 9834fabe069..2c814bd486c 100644
--- a/tools/pnnx/src/CMakeLists.txt
+++ b/tools/pnnx/src/CMakeLists.txt
@@ -475,6 +475,7 @@ set(pnnx_pass_ncnn_SRCS
     pass_ncnn/F_prelu.cpp
     pass_ncnn/F_relu.cpp
     pass_ncnn/F_relu6.cpp
+    pass_ncnn/F_rms_norm.cpp
     pass_ncnn/F_scaled_dot_product_attention.cpp
     pass_ncnn/F_selu.cpp
     pass_ncnn/F_sigmoid.cpp
@@ -541,6 +542,7 @@ set(pnnx_pass_ncnn_SRCS
     pass_ncnn/nn_ReplicationPad1d.cpp
     pass_ncnn/nn_ReplicationPad2d.cpp
     pass_ncnn/nn_ReplicationPad3d.cpp
+    pass_ncnn/nn_RMSNorm.cpp
     pass_ncnn/nn_RNN.cpp
     pass_ncnn/nn_SELU.cpp
     pass_ncnn/nn_Sigmoid.cpp
diff --git a/tools/pnnx/src/pass_level1/nn_RMSNorm.cpp b/tools/pnnx/src/pass_level1/nn_RMSNorm.cpp
index 4433f598935..498f0453c14 100644
--- a/tools/pnnx/src/pass_level1/nn_RMSNorm.cpp
+++ b/tools/pnnx/src/pass_level1/nn_RMSNorm.cpp
@@ -37,7 +37,7 @@ class RMSNorm : public FuseModulePass
         op->params["normalized_shape"] = rmsn->namedInput("normalized_shape");
         op->params["eps"] = rmsn->namedInput("eps");
-        op->params["elementwise_affine"] = mod.hasattr("weight") && mod.hasattr("bias");
+        op->params["elementwise_affine"] = mod.hasattr("weight");
 
         if (mod.hasattr("weight"))
         {
diff --git a/tools/pnnx/src/pass_ncnn/F_rms_norm.cpp b/tools/pnnx/src/pass_ncnn/F_rms_norm.cpp
new file mode 100644
index 00000000000..8230168312c
--- /dev/null
+++ b/tools/pnnx/src/pass_ncnn/F_rms_norm.cpp
@@ -0,0 +1,65 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "pass_ncnn.h"
+
+namespace pnnx {
+
+namespace ncnn {
+
+class F_rms_norm : public GraphRewriterPass
+{
+public:
+    const char* match_pattern_graph() const
+    {
+        return R"PNNXIR(7767517
+3 2
+pnnx.Input input 0 1 input
+F.rms_norm op_0 1 1 input out weight=None normalized_shape=%normalized_shape eps=%eps
+pnnx.Output output 1 0 out
+)PNNXIR";
+    }
+
+    const char* type_str() const
+    {
+        return "RMSNorm";
+    }
+
+    const char* name_str() const
+    {
+        return "rmsn";
+    }
+
+    void write(Operator* op, const std::map<std::string, Parameter>& captured_params) const
+    {
+        const std::vector<int>& normalized_shape = captured_params.at("normalized_shape").ai;
+        int affine_size = normalized_shape[0];
+        for (size_t i = 1; i < normalized_shape.size(); i++)
+        {
+            affine_size *= normalized_shape[i];
+        }
+
+        const float eps = captured_params.at("eps").type == 0 ? 0.f : captured_params.at("eps").f;
0.f : captured_params.at("eps").f; + + op->params["0"] = affine_size; + op->params["1"] = eps; + op->params["2"] = 0; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_rms_norm, 20) + +} // namespace ncnn + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/nn_RMSNorm.cpp b/tools/pnnx/src/pass_ncnn/nn_RMSNorm.cpp new file mode 100644 index 00000000000..7fda637c5ca --- /dev/null +++ b/tools/pnnx/src/pass_ncnn/nn_RMSNorm.cpp @@ -0,0 +1,70 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "pass_ncnn.h" + +namespace pnnx { + +namespace ncnn { + +class nn_RMSNorm : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +nn.RMSNorm op_0 1 1 input out normalized_shape=%normalized_shape eps=%eps elementwise_affine=%elementwise_affine @weight +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "RMSNorm"; + } + + const char* name_str() const + { + return "rmsn"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + const std::vector& normalized_shape = captured_params.at("normalized_shape").ai; + int affine_size = normalized_shape[0]; + for (size_t i = 1; i < normalized_shape.size(); i++) + { + affine_size *= normalized_shape[i]; + } + + const float eps = captured_params.at("eps").type == 0 ? 0.f : captured_params.at("eps").f; + + op->params["0"] = affine_size; + op->params["1"] = eps; + op->params["2"] = captured_params.at("elementwise_affine").b ? 
1 : 0; + + if (captured_params.at("elementwise_affine").b) + { + op->attrs["0"] = captured_attrs.at("op_0.weight"); + } + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_RMSNorm, 20) + +} // namespace ncnn + +} // namespace pnnx diff --git a/tools/pnnx/tests/ncnn/CMakeLists.txt b/tools/pnnx/tests/ncnn/CMakeLists.txt index a60e63eb54b..49cb063f335 100644 --- a/tools/pnnx/tests/ncnn/CMakeLists.txt +++ b/tools/pnnx/tests/ncnn/CMakeLists.txt @@ -53,6 +53,7 @@ pnnx_ncnn_add_test(F_pixel_unshuffle) pnnx_ncnn_add_test(F_prelu) pnnx_ncnn_add_test(F_relu) pnnx_ncnn_add_test(F_relu6) +pnnx_ncnn_add_test(F_rms_norm) pnnx_ncnn_add_test(F_selu) pnnx_ncnn_add_test(F_sigmoid) pnnx_ncnn_add_test(F_silu) @@ -123,6 +124,7 @@ pnnx_ncnn_add_test(nn_ReLU6) pnnx_ncnn_add_test(nn_ReplicationPad1d) pnnx_ncnn_add_test(nn_ReplicationPad2d) pnnx_ncnn_add_test(nn_ReplicationPad3d) +pnnx_ncnn_add_test(nn_RMSNorm) pnnx_ncnn_add_test(nn_RNN) pnnx_ncnn_add_test(nn_SELU) pnnx_ncnn_add_test(nn_Sigmoid) diff --git a/tools/pnnx/tests/ncnn/test_F_layer_norm.py b/tools/pnnx/tests/ncnn/test_F_layer_norm.py index 92244f17910..9d590aa76dd 100644 --- a/tools/pnnx/tests/ncnn/test_F_layer_norm.py +++ b/tools/pnnx/tests/ncnn/test_F_layer_norm.py @@ -37,8 +37,8 @@ def test(): net.eval() torch.manual_seed(0) - x = torch.rand(12, 24) - y = torch.rand(3, 12, 16) + x = torch.rand(1, 12, 24) + y = torch.rand(1, 3, 12, 16) a = net(x, y) @@ -48,7 +48,7 @@ def test(): # torchscript to pnnx import os - os.system("../../src/pnnx test_F_layer_norm.pt inputshape=[12,24],[3,12,16]") + os.system("../../src/pnnx test_F_layer_norm.pt inputshape=[1,12,24],[1,3,12,16]") # ncnn inference import test_F_layer_norm_ncnn diff --git a/tools/pnnx/tests/ncnn/test_F_rms_norm.py b/tools/pnnx/tests/ncnn/test_F_rms_norm.py new file mode 100644 index 00000000000..4e60d9314aa --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_F_rms_norm.py @@ -0,0 +1,68 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
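+
+# Note: the inputs below carry an explicit leading batch dim of 1 because
+# pass_ncnn's solve_batch_index now lists F.rms_norm as a batch-index-0
+# operator; the layer_norm tests in this patch were reshaped the same way.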
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.w3 = nn.Parameter(torch.rand(24)) + self.w4 = nn.Parameter(torch.rand(12, 16)) + + def forward(self, x, y): + x = F.rms_norm(x, (24,), self.w3) + + y = F.rms_norm(y, (16,), None) + z = F.rms_norm(y, (12,16), self.w4, eps=1e-3) + return x, y, z + +def test(): + if version.parse(torch.__version__) < version.parse('2.4'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12, 24) + y = torch.rand(1, 3, 12, 16) + + a = net(x, y) + + # export torchscript + mod = torch.jit.trace(net, (x, y)) + mod.save("test_F_rms_norm.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_F_rms_norm.pt inputshape=[1,12,24],[1,3,12,16]") + + # ncnn inference + import test_F_rms_norm_ncnn + b = test_F_rms_norm_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_nn_LayerNorm.py b/tools/pnnx/tests/ncnn/test_nn_LayerNorm.py index a45444060d0..d409bdfba3a 100644 --- a/tools/pnnx/tests/ncnn/test_nn_LayerNorm.py +++ b/tools/pnnx/tests/ncnn/test_nn_LayerNorm.py @@ -36,8 +36,8 @@ def test(): net.eval() torch.manual_seed(0) - x = torch.rand(24, 64) - y = torch.rand(12, 24, 64) + x = torch.rand(1, 24, 64) + y = torch.rand(1, 12, 24, 64) a = net(x, y) @@ -47,7 +47,7 @@ def test(): # torchscript to pnnx import os - os.system("../../src/pnnx test_nn_LayerNorm.pt inputshape=[24,64],[12,24,64]") + os.system("../../src/pnnx test_nn_LayerNorm.pt inputshape=[1,24,64],[1,12,24,64]") # ncnn inference import test_nn_LayerNorm_ncnn diff --git a/tools/pnnx/tests/ncnn/test_nn_RMSNorm.py b/tools/pnnx/tests/ncnn/test_nn_RMSNorm.py new file mode 100644 index 00000000000..0d5efa211e4 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_nn_RMSNorm.py @@ -0,0 +1,68 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from packaging import version
+
+class Model(nn.Module):
+    def __init__(self):
+        super(Model, self).__init__()
+
+        self.rmsn_0 = nn.RMSNorm(64)
+        self.rmsn_0.weight = nn.Parameter(torch.rand(64))
+        self.rmsn_1 = nn.RMSNorm(normalized_shape=(24,64), eps=1e-2, elementwise_affine=False)
+
+    def forward(self, x, y):
+        x = self.rmsn_0(x)
+        y = self.rmsn_0(y)
+        z = self.rmsn_1(y)
+        return x, y, z
+
+def test():
+    if version.parse(torch.__version__) < version.parse('2.4'):
+        return True
+
+    net = Model()
+    net.eval()
+
+    torch.manual_seed(0)
+    x = torch.rand(1, 24, 64)
+    y = torch.rand(1, 12, 24, 64)
+
+    a = net(x, y)
+
+    # export torchscript
+    mod = torch.jit.trace(net, (x, y))
+    mod.save("test_nn_RMSNorm.pt")
+
+    # torchscript to pnnx
+    import os
+    os.system("../../src/pnnx test_nn_RMSNorm.pt inputshape=[1,24,64],[1,12,24,64]")
+
+    # ncnn inference
+    import test_nn_RMSNorm_ncnn
+    b = test_nn_RMSNorm_ncnn.test_inference()
+
+    for a0, b0 in zip(a, b):
+        if not torch.allclose(a0, b0, 1e-4, 1e-4):
+            return False
+    return True
+
+if __name__ == "__main__":
+    if test():
+        exit(0)
+    else:
+        exit(1)
From e550419508c28cf3d2b7a918e45b952999d4f0fa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BC=B5=E5=B0=8F=E5=87=A1?= <2672931+whyb@users.noreply.github.com>
Date: Thu, 15 Aug 2024 16:52:33 +0800
Subject: [PATCH 23/38] Add yolov8 ncnn example (#5506)

---
 examples/CMakeLists.txt |   1 +
 examples/yolov8.cpp     | 410 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 411 insertions(+)
 create mode 100644 examples/yolov8.cpp

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index a7739be27e5..bf3017dbe68 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -69,6 +69,7 @@ if(NCNN_PIXEL)
         ncnn_add_example(yolov4)
         ncnn_add_example(rvm)
         ncnn_add_example(p2pnet)
+        ncnn_add_example(yolov8)
     endif()
 else()
     message(WARNING "OpenCV not found and NCNN_SIMPLEOCV disabled, examples won't be built")
 endif()
diff --git a/examples/yolov8.cpp b/examples/yolov8.cpp
new file mode 100644
index 00000000000..5b3926582c8
--- /dev/null
+++ b/examples/yolov8.cpp
@@ -0,0 +1,410 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Copyright (C) 2024 whyb(https://github.com/whyb). All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+// ReadMe
+// Workflow for converting a yolov8 model to an ncnn model:
+//
+// step 1:
+// If you don't want to train the model yourself, download a pretrained model file from the ultralytics website.
+// original pretrained models: https://docs.ultralytics.com/models/yolov8/#supported-tasks-and-modes
+//
+// step 2:
+// run these commands:
+// conda create --name yolov8 python=3.11
+// conda activate yolov8
+// pip install ultralytics onnx numpy protobuf
+//
+// step 3:
+// save the following source code to a file (export_model_to_ncnn.py):
+// from ultralytics import YOLO
+// detection_models = [
+//     ["./Detection-pt/yolov8n.pt", "./Detection-pt/"],
+//     ["./Detection-pt/yolov8s.pt", "./Detection-pt/"],
+//     ["./Detection-pt/yolov8m.pt", "./Detection-pt/"],
+//     ["./Detection-pt/yolov8l.pt", "./Detection-pt/"],
+//     ["./Detection-pt/yolov8x.pt", "./Detection-pt/"]
+// ]
+// for model_dict in detection_models:
+//     model = YOLO(model_dict[0])  # load an official pretrained model
+//     model.export(format="ncnn", dynamic=True, save_dir=model_dict[1], simplify=True)
+//
+// step 4:
+// run the command: python export_model_to_ncnn.py
+
+#include <algorithm>
+#include <math.h>
+#include <vector>
+#include "layer.h"
+#include "net.h"
+
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#include <float.h>
+#include <stdio.h>
+
+#define MAX_STRIDE 32
+
+struct Object
+{
+    cv::Rect_<float> rect;
+    int label;
+    float prob;
+};
+
+static inline float intersection_area(const Object& a, const Object& b)
+{
+    cv::Rect_<float> inter = a.rect & b.rect;
+    return inter.area();
+}
+
+static void qsort_descent_inplace(std::vector<Object>& objects, int left, int right)
+{
+    int i = left;
+    int j = right;
+    float p = objects[(left + right) / 2].prob;
+
+    while (i <= j)
+    {
+        while (objects[i].prob > p)
+            i++;
+
+        while (objects[j].prob < p)
+            j--;
+
+        if (i <= j)
+        {
+            // swap
+            std::swap(objects[i], objects[j]);
+
+            i++;
+            j--;
+        }
+    }
+
+    #pragma omp parallel sections
+    {
+        #pragma omp section
+        {
+            if (left < j) qsort_descent_inplace(objects, left, j);
+        }
+        #pragma omp section
+        {
+            if (i < right) qsort_descent_inplace(objects, i, right);
+        }
+    }
+}
+
+static void qsort_descent_inplace(std::vector<Object>& objects)
+{
+    if (objects.empty())
+        return;
+
+    qsort_descent_inplace(objects, 0, objects.size() - 1);
+}
+
+static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
+{
+    picked.clear();
+
+    const int n = faceobjects.size();
+
+    std::vector<float> areas(n);
+    for (int i = 0; i < n; i++)
+    {
+        areas[i] = faceobjects[i].rect.area();
+    }
+
+    for (int i = 0; i < n; i++)
+    {
+        const Object& a = faceobjects[i];
+
+        int keep = 1;
+        for (int j = 0; j < (int)picked.size(); j++)
+        {
+            const Object& b = faceobjects[picked[j]];
+
+            if (!agnostic && a.label != b.label)
+                continue;
+
+            // intersection over union
+            float inter_area = intersection_area(a, b);
+            float union_area = areas[i] + areas[picked[j]] - inter_area;
+            // float IoU = inter_area / union_area
+            if (inter_area / union_area > nms_threshold)
+                keep = 0;
+        }
+
+        if (keep)
+            picked.push_back(i);
+    }
+}
+
+static inline float sigmoid(float x)
+{
+    return static_cast<float>(1.f / (1.f + exp(-x)));
+}
+
+static inline float clampf(float d, float min, float max)
+{
+    const float t = d < min ? min : d;
+    return t > max ? max : t;
+}
+
+static void parse_yolov8_detections(
+    float* inputs, float confidence_threshold,
+    int num_channels, int num_anchors, int num_labels,
+    int infer_img_width, int infer_img_height,
+    std::vector<Object>& objects)
+{
+    std::vector<Object> detections;
+    cv::Mat output = cv::Mat((int)num_channels, (int)num_anchors, CV_32F, inputs).t();
+
+    for (int i = 0; i < num_anchors; i++)
+    {
+        auto row_ptr = output.row(i).ptr<float>();
+        auto bboxes_ptr = row_ptr;
+        auto scores_ptr = row_ptr + 4;
+        auto max_s_ptr = std::max_element(scores_ptr, scores_ptr + num_labels);
+        float score = *max_s_ptr;
+        if (score > confidence_threshold)
+        {
+            float x = *bboxes_ptr++;
+            float y = *bboxes_ptr++;
+            float w = *bboxes_ptr++;
+            float h = *bboxes_ptr;
+
+            float x0 = clampf((x - 0.5f * w), 0.f, (float)infer_img_width);
+            float y0 = clampf((y - 0.5f * h), 0.f, (float)infer_img_height);
+            float x1 = clampf((x + 0.5f * w), 0.f, (float)infer_img_width);
+            float y1 = clampf((y + 0.5f * h), 0.f, (float)infer_img_height);
+
+            cv::Rect_<float> bbox;
+            bbox.x = x0;
+            bbox.y = y0;
+            bbox.width = x1 - x0;
+            bbox.height = y1 - y0;
+            Object object;
+            object.label = max_s_ptr - scores_ptr;
+            object.prob = score;
+            object.rect = bbox;
+            detections.emplace_back(object);
+        }
+    }
+    objects = detections;
+}
+
+static int detect_yolov8(const cv::Mat& bgr, std::vector<Object>& objects)
+{
+    ncnn::Net yolov8;
+
+    yolov8.opt.use_vulkan_compute = true; // enable this if you want to run detection on GPU hardware
+
+    yolov8.load_param("yolov8n.param");
+    yolov8.load_model("yolov8n.bin");
+
+    const int target_size = 640;
+    const float prob_threshold = 0.25f;
+    const float nms_threshold = 0.45f;
+
+    int img_w = bgr.cols;
+    int img_h = bgr.rows;
+
+    // letterbox pad to multiple of MAX_STRIDE
+    int w = img_w;
+    int h = img_h;
+    float scale = 1.f;
+    if (w > h)
+    {
+        scale = (float)target_size / w;
+        w = target_size;
+        h = h * scale;
+    }
+    else
+    {
+        scale = (float)target_size / h;
+        h = target_size;
+        w = w * scale;
+    }
+
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h);
+
+    int wpad = (target_size + MAX_STRIDE - 1) / MAX_STRIDE * MAX_STRIDE - w;
+    int hpad = (target_size + MAX_STRIDE - 1) / MAX_STRIDE * MAX_STRIDE - h;
+    ncnn::Mat in_pad;
+    ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f);
+
+    const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
+    in_pad.substract_mean_normalize(0, norm_vals);
+
+    ncnn::Extractor ex = yolov8.create_extractor();
+
+    ex.input("in0", in_pad);
+
+    std::vector<Object> proposals;
+
+    // stride 32
+    {
+        ncnn::Mat out;
+        ex.extract("out0", out);
+
+        std::vector<Object> objects32;
+        const int num_labels = 80; // the COCO dataset has 80 object classes
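+        // out0 is laid out as [num_channels = 4 + num_labels, num_anchors]:
+        // four box terms (cx, cy, w, h) followed by per-class scores for each
+        // anchor column; parse_yolov8_detections transposes it before decoding.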
+        parse_yolov8_detections(
+            (float*)out.data, prob_threshold,
+            out.h, out.w, num_labels,
+            in_pad.w, in_pad.h,
+            objects32);
+        proposals.insert(proposals.end(), objects32.begin(), objects32.end());
+    }
+
+    // sort all proposals by score from highest to lowest
+    qsort_descent_inplace(proposals);
+
+    // apply nms with nms_threshold
+    std::vector<int> picked;
+    nms_sorted_bboxes(proposals, picked, nms_threshold);
+
+    int count = picked.size();
+
+    objects.resize(count);
+    for (int i = 0; i < count; i++)
+    {
+        objects[i] = proposals[picked[i]];
+
+        // adjust offset to original unpadded
+        float x0 = (objects[i].rect.x - (wpad / 2)) / scale;
+        float y0 = (objects[i].rect.y - (hpad / 2)) / scale;
+        float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale;
+        float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale;
+
+        // clip
+        x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
+        y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
+        x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
+        y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);
+
+        objects[i].rect.x = x0;
+        objects[i].rect.y = y0;
+        objects[i].rect.width = x1 - x0;
+        objects[i].rect.height = y1 - y0;
+    }
+
+    return 0;
+}
+
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
+{
+    static const char* class_names[] = {
+        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
+        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
+        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
+        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
+        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
+        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
+        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
+        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
+        "hair drier", "toothbrush"
+    };
+
+    static const unsigned char colors[19][3] = {
+        {54, 67, 244},
+        {99, 30, 233},
+        {176, 39, 156},
+        {183, 58, 103},
+        {181, 81, 63},
+        {243, 150, 33},
+        {244, 169, 3},
+        {212, 188, 0},
+        {136, 150, 0},
+        {80, 175, 76},
+        {74, 195, 139},
+        {57, 220, 205},
+        {59, 235, 255},
+        {7, 193, 255},
+        {0, 152, 255},
+        {34, 87, 255},
+        {72, 85, 121},
+        {158, 158, 158},
+        {139, 125, 96}
+    };
+
+    int color_index = 0;
+
+    cv::Mat image = bgr.clone();
+
+    for (size_t i = 0; i < objects.size(); i++)
+    {
+        const Object& obj = objects[i];
+
+        const unsigned char* color = colors[color_index % 19];
+        color_index++;
+
+        cv::Scalar cc(color[0], color[1], color[2]);
+
+        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
+                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
+
+        cv::rectangle(image, obj.rect, cc, 2);
+
+        char text[256];
+        sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100);
+
+        int baseLine = 0;
+        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+
+        int x = obj.rect.x;
+        int y = obj.rect.y - label_size.height - baseLine;
+        if (y < 0)
+            y = 0;
+        if (x + label_size.width > image.cols)
+            x = image.cols - label_size.width;
+
+        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
+                      cc, -1);
+
+        cv::putText(image, text, cv::Point(x, y + label_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(255, 255, 255));
+    }
+
+    cv::imshow("image", image);
+    cv::waitKey(0);
+}
+
+int main(int argc, char** argv)
+{
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* imagepath = argv[1];
+
+    cv::Mat m = cv::imread(imagepath, 1);
+    if (m.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", imagepath);
+        return -1;
+    }
+
+    std::vector<Object> objects;
+    detect_yolov8(m, objects);
+
+    draw_objects(m, objects);
+
+    return 0;
+}
From 07196eee2e48738ac58e5b1a551649d578e3e783 Mon Sep 17 00:00:00 2001
From: Kelun Lei
Date: Fri, 16 Aug 2024 10:50:26 +0800
Subject: [PATCH 24/38] benchmark: add Kunpeng 920 7260 (#5606)

---
 benchmark/README.md | 292 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 292 insertions(+)

diff --git a/benchmark/README.md b/benchmark/README.md
index 1927acf81cd..df9e55de4a8 100644
--- a/benchmark/README.md
+++ b/benchmark/README.md
@@ -5911,6 +5911,298 @@ cooling_down = 0
 FastestDet min = 5.13 max = 5.47 avg = 5.30
 ```
+### HUAWEI Kunpeng 920 7260 (x64 cores)
+test on Ubuntu 20.04 (gcc 9.4.0)
+```
+root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 1 0 -1 0
+loop_count = 300
+num_threads = 1
+powersave = 0
+gpu_device = -1
+cooling_down = 0
+ squeezenet min = 11.64 max = 12.11 avg = 11.71
+ squeezenet_int8 min = 12.22 max = 13.22 avg = 12.37
+ mobilenet min = 20.00 max = 20.79 avg = 20.08
+ mobilenet_int8 min = 17.44 max = 19.09 avg = 17.64
+ mobilenet_v2 min = 13.29 max = 14.25 avg = 13.39
+ mobilenet_v3 min = 11.06 max = 11.84 avg = 11.11
+ shufflenet min = 7.56 max = 7.74 avg = 7.59
+ shufflenet_v2 min = 7.84 max = 8.37 avg = 7.88
+ mnasnet min = 13.07 max = 13.78 avg = 13.14
+ proxylessnasnet min = 15.71 max = 16.31 avg = 15.77
+ efficientnet_b0 min = 34.79 max = 35.98 avg = 34.92
+ efficientnetv2_b0 min = 35.28 max = 36.36 avg = 35.41
+ regnety_400m min = 17.06 max = 17.74 avg = 17.16
+ blazeface min = 2.99 max = 3.04 avg = 3.01
+ googlenet min = 50.76 max = 51.74 avg = 51.00
+ googlenet_int8 min = 50.31 max = 52.27 avg = 50.65
+ resnet18 min = 34.97 max = 37.17 avg = 35.82
+ resnet18_int8 min = 40.47 max = 42.03 avg = 40.78
+ alexnet min = 39.19 max = 39.80 avg = 39.32
+ vgg16 min = 176.62 max = 181.29 avg = 177.07
+ vgg16_int8 min = 352.35 max = 358.38 avg = 355.15
+ resnet50 min = 96.76 max = 98.63 avg = 97.09
+ resnet50_int8 min = 90.00 max = 92.74 avg = 90.81
+ squeezenet_ssd min = 33.23 max = 33.99 avg = 33.39
+ squeezenet_ssd_int8 min = 38.50 max = 41.53 avg = 39.28
+ mobilenet_ssd min = 42.49 max = 44.78 avg = 42.72
+ mobilenet_ssd_int8 min = 37.06 max = 39.97 avg = 37.57
+ mobilenet_yolo min = 96.34 max = 98.91 avg = 96.73
+ mobilenetv2_yolov3 min = 50.88 max = 52.97 avg = 51.15
+ yolov4-tiny min = 65.56 max = 67.13 avg = 65.80
+ nanodet_m min = 19.94 max = 20.82 avg = 20.04
+ yolo-fastest-1.1 min = 7.66 max = 7.81 avg = 7.71
+ yolo-fastestv2 min = 6.82 max = 7.23 avg = 6.87
+ vision_transformer min = 1535.03 max = 1552.84 avg = 1543.73
+ FastestDet min = 7.17 max = 7.50 avg = 7.21
+root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 2 0 -1 0
+loop_count = 300
+num_threads = 2
+powersave = 0
+gpu_device = -1
+cooling_down = 0
+ squeezenet min = 6.35 max = 9.15 avg = 7.33
+ squeezenet_int8 min = 8.06 max = 8.60 avg = 8.14
+ mobilenet min = 10.30 max = 11.86 avg = 11.48
+ mobilenet_int8 min = 8.93 max =
11.87 avg = 10.47 + mobilenet_v2 min = 9.05 max = 11.50 avg = 9.19 + mobilenet_v3 min = 6.32 max = 6.42 avg = 6.36 + shufflenet min = 6.73 max = 8.55 avg = 6.81 + shufflenet_v2 min = 4.94 max = 6.65 avg = 6.32 + mnasnet min = 7.38 max = 10.77 avg = 8.82 + proxylessnasnet min = 8.57 max = 9.72 avg = 8.63 + efficientnet_b0 min = 18.61 max = 22.53 avg = 20.42 + efficientnetv2_b0 min = 18.75 max = 21.93 avg = 20.79 + regnety_400m min = 11.86 max = 15.09 avg = 14.60 + blazeface min = 1.95 max = 3.37 avg = 2.06 + googlenet min = 28.66 max = 32.24 avg = 28.94 + googlenet_int8 min = 27.64 max = 32.15 avg = 30.84 + resnet18 min = 20.33 max = 20.77 avg = 20.47 + resnet18_int8 min = 22.63 max = 23.72 avg = 22.88 + alexnet min = 20.41 max = 29.37 avg = 27.22 + vgg16 min = 101.72 max = 140.33 avg = 103.29 + vgg16_int8 min = 187.56 max = 211.44 avg = 189.92 + resnet50 min = 51.07 max = 59.25 avg = 58.35 + resnet50_int8 min = 46.50 max = 52.55 avg = 48.93 + squeezenet_ssd min = 22.48 max = 28.59 avg = 22.98 + squeezenet_ssd_int8 min = 25.56 max = 26.82 avg = 25.99 + mobilenet_ssd min = 22.81 max = 26.21 avg = 24.88 + mobilenet_ssd_int8 min = 19.31 max = 25.53 avg = 21.74 + mobilenet_yolo min = 59.58 max = 62.04 avg = 59.99 + mobilenetv2_yolov3 min = 33.26 max = 35.74 avg = 33.51 + yolov4-tiny min = 41.14 max = 45.34 avg = 42.46 + nanodet_m min = 12.10 max = 16.69 avg = 15.02 + yolo-fastest-1.1 min = 5.44 max = 7.78 avg = 7.24 + yolo-fastestv2 min = 5.03 max = 8.08 avg = 6.75 + vision_transformer min = 994.46 max = 1090.68 avg = 1045.50 + FastestDet min = 6.76 max = 6.91 avg = 6.83 +root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 4 0 -1 0 +loop_count = 300 +num_threads = 4 +powersave = 0 +gpu_device = -1 +cooling_down = 0 + squeezenet min = 3.79 max = 6.99 avg = 4.55 + squeezenet_int8 min = 5.13 max = 5.68 avg = 5.20 + mobilenet min = 6.25 max = 6.55 avg = 6.30 + mobilenet_int8 min = 5.96 max = 6.10 avg = 6.03 + mobilenet_v2 min = 5.34 max = 7.15 avg = 5.62 + mobilenet_v3 min = 4.05 max = 5.74 avg = 5.01 + shufflenet min = 3.69 max = 5.81 avg = 5.15 + shufflenet_v2 min = 4.31 max = 6.02 avg = 4.56 + mnasnet min = 4.48 max = 6.05 avg = 5.54 + proxylessnasnet min = 5.05 max = 8.08 avg = 6.03 + efficientnet_b0 min = 10.17 max = 12.21 avg = 11.58 + efficientnetv2_b0 min = 10.86 max = 15.78 avg = 12.70 + regnety_400m min = 9.24 max = 14.13 avg = 11.98 + blazeface min = 1.89 max = 1.97 avg = 1.93 + googlenet min = 15.19 max = 20.31 avg = 16.90 + googlenet_int8 min = 17.97 max = 19.40 avg = 18.11 + resnet18 min = 11.18 max = 11.48 avg = 11.29 + resnet18_int8 min = 12.26 max = 12.78 avg = 12.44 + alexnet min = 14.43 max = 16.94 avg = 14.68 + vgg16 min = 62.40 max = 78.42 avg = 64.96 + vgg16_int8 min = 101.52 max = 109.42 avg = 104.46 + resnet50 min = 29.19 max = 39.69 avg = 32.99 + resnet50_int8 min = 26.94 max = 28.82 avg = 27.16 + squeezenet_ssd min = 12.90 max = 16.52 avg = 15.20 + squeezenet_ssd_int8 min = 15.58 max = 18.40 avg = 16.28 + mobilenet_ssd min = 13.68 max = 14.45 avg = 13.87 + mobilenet_ssd_int8 min = 12.20 max = 14.58 avg = 12.84 + mobilenet_yolo min = 34.85 max = 36.54 avg = 35.05 + mobilenetv2_yolov3 min = 18.61 max = 20.93 avg = 19.92 + yolov4-tiny min = 26.09 max = 32.32 avg = 28.03 + nanodet_m min = 7.85 max = 12.48 avg = 11.00 + yolo-fastest-1.1 min = 6.19 max = 6.49 avg = 6.31 + yolo-fastestv2 min = 3.66 max = 6.83 avg = 5.11 + vision_transformer min = 605.95 max = 624.99 avg = 609.79 + FastestDet min = 4.32 max = 5.41 avg = 5.17 
+root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 8 0 -1 0 +loop_count = 300 +num_threads = 8 +powersave = 0 +gpu_device = -1 +cooling_down = 0 + squeezenet min = 2.72 max = 3.74 avg = 3.05 + squeezenet_int8 min = 3.80 max = 4.71 avg = 4.03 + mobilenet min = 3.94 max = 5.15 avg = 4.00 + mobilenet_int8 min = 3.73 max = 3.87 avg = 3.80 + mobilenet_v2 min = 4.51 max = 6.57 avg = 4.68 + mobilenet_v3 min = 4.12 max = 4.38 avg = 4.28 + shufflenet min = 4.60 max = 6.27 avg = 4.88 + shufflenet_v2 min = 4.07 max = 4.20 avg = 4.11 + mnasnet min = 4.26 max = 4.51 avg = 4.36 + proxylessnasnet min = 4.71 max = 7.40 avg = 4.80 + efficientnet_b0 min = 8.49 max = 8.74 avg = 8.56 + efficientnetv2_b0 min = 9.34 max = 9.68 avg = 9.41 + regnety_400m min = 8.00 max = 12.85 avg = 10.64 + blazeface min = 1.76 max = 1.84 avg = 1.80 + googlenet min = 10.89 max = 11.33 avg = 10.98 + googlenet_int8 min = 11.66 max = 14.07 avg = 11.83 + resnet18 min = 6.48 max = 6.61 avg = 6.54 + resnet18_int8 min = 7.30 max = 7.79 avg = 7.51 + alexnet min = 8.33 max = 8.95 avg = 8.62 + vgg16 min = 29.94 max = 47.54 avg = 31.95 + vgg16_int8 min = 54.67 max = 60.76 avg = 56.03 + resnet50 min = 16.13 max = 20.79 avg = 20.03 + resnet50_int8 min = 15.64 max = 20.13 avg = 16.11 + squeezenet_ssd min = 11.58 max = 12.02 avg = 11.77 + squeezenet_ssd_int8 min = 11.14 max = 13.72 avg = 12.10 + mobilenet_ssd min = 8.27 max = 10.77 avg = 8.76 + mobilenet_ssd_int8 min = 8.13 max = 9.09 avg = 8.29 + mobilenet_yolo min = 23.90 max = 24.69 avg = 24.17 + mobilenetv2_yolov3 min = 14.83 max = 15.72 avg = 15.19 + yolov4-tiny min = 19.78 max = 23.66 avg = 20.05 + nanodet_m min = 8.92 max = 10.76 avg = 9.09 + yolo-fastest-1.1 min = 5.49 max = 5.77 avg = 5.63 + yolo-fastestv2 min = 5.04 max = 5.21 avg = 5.10 + vision_transformer min = 318.42 max = 379.40 avg = 363.66 + FastestDet min = 4.18 max = 4.54 avg = 4.38 +root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 16 0 -1 0 +loop_count = 300 +num_threads = 16 +powersave = 0 +gpu_device = -1 +cooling_down = 0 + squeezenet min = 2.70 max = 3.14 avg = 2.81 + squeezenet_int8 min = 3.21 max = 4.22 avg = 3.39 + mobilenet min = 3.13 max = 3.26 avg = 3.20 + mobilenet_int8 min = 3.17 max = 5.05 avg = 3.30 + mobilenet_v2 min = 4.31 max = 6.24 avg = 4.62 + mobilenet_v3 min = 3.57 max = 3.77 avg = 3.68 + shufflenet min = 4.70 max = 6.45 avg = 4.80 + shufflenet_v2 min = 3.73 max = 4.27 avg = 3.87 + mnasnet min = 3.67 max = 3.87 avg = 3.75 + proxylessnasnet min = 4.28 max = 4.81 avg = 4.35 + efficientnet_b0 min = 7.31 max = 7.77 avg = 7.53 + efficientnetv2_b0 min = 9.87 max = 12.33 avg = 10.07 + regnety_400m min = 17.95 max = 18.53 avg = 18.26 + blazeface min = 2.26 max = 2.40 avg = 2.33 + googlenet min = 9.51 max = 9.99 avg = 9.68 + googlenet_int8 min = 10.98 max = 11.36 avg = 11.18 + resnet18 min = 5.59 max = 6.08 avg = 5.71 + resnet18_int8 min = 6.55 max = 7.28 avg = 6.77 + alexnet min = 6.26 max = 6.50 avg = 6.36 + vgg16 min = 23.98 max = 27.37 avg = 24.89 + vgg16_int8 min = 38.07 max = 39.66 avg = 39.02 + resnet50 min = 12.81 max = 14.19 avg = 13.76 + resnet50_int8 min = 12.42 max = 12.84 avg = 12.55 + squeezenet_ssd min = 10.80 max = 11.49 avg = 11.12 + squeezenet_ssd_int8 min = 11.57 max = 12.21 avg = 11.74 + mobilenet_ssd min = 7.46 max = 8.08 avg = 7.84 + mobilenet_ssd_int8 min = 7.47 max = 8.07 avg = 7.63 + mobilenet_yolo min = 21.70 max = 23.43 avg = 21.92 + mobilenetv2_yolov3 min = 12.55 max = 14.56 avg = 12.90 + yolov4-tiny min = 17.68 
max = 19.85 avg = 18.18 + nanodet_m min = 8.35 max = 8.70 avg = 8.45 + yolo-fastest-1.1 min = 5.70 max = 7.11 avg = 6.05 + yolo-fastestv2 min = 4.85 max = 5.70 avg = 5.37 + vision_transformer min = 214.36 max = 259.56 avg = 245.47 + FastestDet min = 5.01 max = 5.42 avg = 5.17 +root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 32 0 -1 0 +loop_count = 300 +num_threads = 32 +powersave = 0 +gpu_device = -1 +cooling_down = 0 + squeezenet min = 2.30 max = 2.94 avg = 2.46 + squeezenet_int8 min = 3.08 max = 4.88 avg = 4.03 + mobilenet min = 2.49 max = 2.76 avg = 2.53 + mobilenet_int8 min = 2.86 max = 3.73 avg = 2.95 + mobilenet_v2 min = 4.51 max = 5.20 avg = 4.74 + mobilenet_v3 min = 5.11 max = 6.91 avg = 6.10 + shufflenet min = 5.57 max = 6.51 avg = 5.78 + shufflenet_v2 min = 4.37 max = 4.66 avg = 4.48 + mnasnet min = 3.72 max = 4.08 avg = 3.90 + proxylessnasnet min = 4.19 max = 6.18 avg = 4.79 + efficientnet_b0 min = 6.80 max = 7.22 avg = 6.89 + efficientnetv2_b0 min = 13.98 max = 17.55 avg = 15.06 + regnety_400m min = 16.10 max = 16.72 avg = 16.26 + blazeface min = 2.12 max = 2.53 avg = 2.17 + googlenet min = 8.63 max = 9.89 avg = 8.77 + googlenet_int8 min = 9.90 max = 11.09 avg = 10.08 + resnet18 min = 6.54 max = 6.99 avg = 6.73 + resnet18_int8 min = 8.34 max = 9.00 avg = 8.67 + alexnet min = 6.64 max = 7.15 avg = 6.93 + vgg16 min = 22.79 max = 23.91 avg = 23.50 + vgg16_int8 min = 32.37 max = 37.51 avg = 33.13 + resnet50 min = 11.19 max = 16.40 avg = 11.47 + resnet50_int8 min = 11.92 max = 12.55 avg = 12.13 + squeezenet_ssd min = 10.75 max = 12.28 avg = 11.12 + squeezenet_ssd_int8 min = 11.31 max = 12.29 avg = 11.57 + mobilenet_ssd min = 10.25 max = 11.26 avg = 10.79 + mobilenet_ssd_int8 min = 11.39 max = 16.99 avg = 11.98 + mobilenet_yolo min = 52.11 max = 60.46 avg = 53.84 + mobilenetv2_yolov3 min = 12.07 max = 12.47 avg = 12.20 + yolov4-tiny min = 17.48 max = 17.79 avg = 17.58 + nanodet_m min = 13.06 max = 14.71 avg = 13.64 + yolo-fastest-1.1 min = 5.70 max = 5.89 avg = 5.79 + yolo-fastestv2 min = 8.89 max = 9.99 avg = 9.21 + vision_transformer min = 158.92 max = 187.40 avg = 168.21 + FastestDet min = 8.70 max = 9.43 avg = 9.00 +root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 64 0 -1 0 +loop_count = 300 +num_threads = 64 +powersave = 0 +gpu_device = -1 +cooling_down = 0 + squeezenet min = 6.85 max = 78.56 avg = 7.81 + squeezenet_int8 min = 8.06 max = 88.91 avg = 9.23 + mobilenet min = 3.02 max = 86.86 avg = 5.89 + mobilenet_int8 min = 3.58 max = 4.55 avg = 3.68 + mobilenet_v2 min = 5.05 max = 150.06 avg = 13.04 + mobilenet_v3 min = 4.85 max = 125.22 avg = 8.34 + shufflenet min = 17.80 max = 220.55 avg = 21.01 + shufflenet_v2 min = 11.23 max = 381.95 avg = 13.71 + mnasnet min = 9.83 max = 128.42 avg = 11.10 + proxylessnasnet min = 10.53 max = 68.52 avg = 12.03 + efficientnet_b0 min = 16.78 max = 968.87 avg = 23.94 + efficientnetv2_b0 min = 26.23 max = 551.18 avg = 31.34 + regnety_400m min = 70.14 max = 407.92 avg = 78.30 + blazeface min = 7.27 max = 191.44 avg = 9.37 + googlenet min = 16.69 max = 820.58 avg = 25.06 + googlenet_int8 min = 20.58 max = 849.09 avg = 29.87 + resnet18 min = 8.67 max = 349.00 avg = 11.33 + resnet18_int8 min = 10.40 max = 128.98 avg = 11.45 + alexnet min = 6.15 max = 196.01 avg = 10.24 + vgg16 min = 21.11 max = 288.66 avg = 29.37 + vgg16_int8 min = 30.72 max = 251.95 avg = 37.68 + resnet50 min = 19.10 max = 114.08 avg = 22.00 + resnet50_int8 min = 18.99 max = 436.89 avg = 24.36 + 
squeezenet_ssd min = 22.22 max = 510.52 avg = 28.76 + squeezenet_ssd_int8 min = 23.42 max = 614.70 avg = 30.82 + mobilenet_ssd min = 7.62 max = 202.66 avg = 14.59 + mobilenet_ssd_int8 min = 7.89 max = 109.82 avg = 8.80 + mobilenet_yolo min = 31.43 max = 742.10 avg = 45.52 + mobilenetv2_yolov3 min = 18.31 max = 273.05 avg = 20.78 + yolov4-tiny min = 21.03 max = 400.05 avg = 33.64 + nanodet_m min = 19.94 max = 114.18 avg = 21.89 + yolo-fastest-1.1 min = 7.20 max = 174.60 avg = 9.13 + yolo-fastestv2 min = 7.50 max = 170.55 avg = 9.01 + vision_transformer min = 126.90 max = 335.71 avg = 157.38 + FastestDet min = 6.59 max = 19.77 avg = 6.77 +``` + ### Intel Atom x5-Z8350 ``` nihui@nihui-ROCK-Pi-X:~/ncnn/build/benchmark$ ./benchncnn 20 4 0 -1 1 From 70310e951e1d863bc860141dab9506c7de2d118c Mon Sep 17 00:00:00 2001 From: nihui Date: Fri, 16 Aug 2024 16:20:10 +0800 Subject: [PATCH 25/38] fix out of range read in convolution im2col aarch64 (#5631) --- src/layer/arm/convolution_im2col_gemm.h | 2 +- src/layer/arm/convolution_im2col_gemm_bf16s.h | 36 +++++++++---------- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/src/layer/arm/convolution_im2col_gemm.h b/src/layer/arm/convolution_im2col_gemm.h index af501efa2f8..25a3e94d781 100644 --- a/src/layer/arm/convolution_im2col_gemm.h +++ b/src/layer/arm/convolution_im2col_gemm.h @@ -3377,7 +3377,7 @@ static void convolution_gemm_transB_packed_tile(const Mat& AT_tile, const Mat& B "cbz %w10, 0f \n" "ld1 {v30.4s, v31.4s}, [%0] \n" - "b 3f \n" + "b 2f \n" "0: \n" // if pC diff --git a/src/layer/arm/convolution_im2col_gemm_bf16s.h b/src/layer/arm/convolution_im2col_gemm_bf16s.h index 82319d05850..95819e2d679 100644 --- a/src/layer/arm/convolution_im2col_gemm_bf16s.h +++ b/src/layer/arm/convolution_im2col_gemm_bf16s.h @@ -3110,7 +3110,7 @@ static void convolution_gemm_transB_packed_tile_bf16s(const Mat& AT_tile, const "cbz %w10, 0f \n" "ld1 {v30.4s, v31.4s}, [%0] \n" - "b 3f \n" + "b 2f \n" "0: \n" // if pC @@ -3125,15 +3125,13 @@ static void convolution_gemm_transB_packed_tile_bf16s(const Mat& AT_tile, const "eor v31.16b, v31.16b, v31.16b \n" "2: \n" - - "3: \n" "lsr w4, %w9, #2 \n" // w4 = max_kk >> 2 "cmp w4, #0 \n" - "beq 5f \n" + "beq 4f \n" "eor v28.16b, v28.16b, v28.16b \n" "eor v29.16b, v29.16b, v29.16b \n" - "4: \n" + "3: \n" "prfm pldl1keep, [%2, #64] \n" "ld1 {v0.4h}, [%2], #8 \n" "shll v0.4s, v0.4h, #16 \n" @@ -3156,16 +3154,16 @@ static void convolution_gemm_transB_packed_tile_bf16s(const Mat& AT_tile, const "subs w4, w4, #1 \n" "fmla v30.4s, v10.4s, v0.s[3] \n" "fmla v31.4s, v11.4s, v0.s[3] \n" - "bne 4b \n" + "bne 3b \n" "fadd v30.4s, v30.4s, v28.4s \n" "fadd v31.4s, v31.4s, v29.4s \n" - "5: \n" + "4: \n" "and w4, %w9, #3 \n" // w4 = remain = max_kk & 3 "cmp w4, #0 \n" - "beq 7f \n" + "beq 6f \n" - "6: \n" + "5: \n" "ld1r {v0.4h}, [%2], #2 \n" "shll v0.4s, v0.4h, #16 \n" "ld1 {v3.8h}, [%1], #16 \n" @@ -3174,26 +3172,26 @@ static void convolution_gemm_transB_packed_tile_bf16s(const Mat& AT_tile, const "subs w4, w4, #1 \n" "fmla v30.4s, v4.4s, v0.4s \n" "fmla v31.4s, v5.4s, v0.4s \n" - "bne 6b \n" + "bne 5b \n" - "7: \n" + "6: \n" "shrn v30.4h, v30.4s, #16 \n" "shrn v31.4h, v31.4s, #16 \n" "tst %w11, #255 \n" - "beq 10f \n" + "beq 9f \n" // if out_elempack == 4 "cmp %w12, #4 \n" - "bne 8f \n" + "bne 7f \n" "lsl w4, %w13, #2 \n" "add x4, %3, w4, sxtw 1 \n" "st1 {v30.4h}, [%3], #8 \n" "st1 {v31.4h}, [x4] \n" - "b 9f \n" + "b 8f \n" // if out_elempack == 1 - "8: \n" + "7: \n" "add x4, %3, %w13, sxtw 1 \n" "st1 {v30.h}[0], [%3], #2 \n" 
"st1 {v30.h}[1], [x4] \n" @@ -3210,14 +3208,14 @@ static void convolution_gemm_transB_packed_tile_bf16s(const Mat& AT_tile, const "add x4, x4, %w13, sxtw 1 \n" "st1 {v31.h}[3], [x4] \n" - "9: \n" + "8: \n" "add %0, %0, #32 \n" - "b 11f \n" + "b 10f \n" - "10: \n" + "9: \n" "st1 {v30.4s, v31.4s}, [%0], #32 \n" - "11: \n" + "10: \n" : "=r"(outptr), // %0 "=r"(pA), // %1 From 789d8686c7fc270e5579f2fad680e2aa1af4e3b4 Mon Sep 17 00:00:00 2001 From: nihui Date: Fri, 16 Aug 2024 18:48:39 +0800 Subject: [PATCH 26/38] pnnx functionize do not create shadow op for identity consumers (#5632) --- tools/pnnx/src/pass_level2.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tools/pnnx/src/pass_level2.cpp b/tools/pnnx/src/pass_level2.cpp index bc7e51b8d5d..de44a355366 100644 --- a/tools/pnnx/src/pass_level2.cpp +++ b/tools/pnnx/src/pass_level2.cpp @@ -1166,6 +1166,18 @@ static void functionize(Graph& graph) if (out0->consumers.size() == 1) continue; + bool all_consumers_are_same = true; + for (size_t j = 1; j < out0->consumers.size(); j++) + { + if (out0->consumers[j] != out0->consumers[0]) + { + all_consumers_are_same = false; + break; + } + } + if (all_consumers_are_same) + continue; + for (int j = (int)out0->consumers.size() - 1; j > 0; j--) { Operator* op1 = out0->consumers[j]; From 4de536951ac618ce705bea519bf1b2afc43f21ab Mon Sep 17 00:00:00 2001 From: nihui Date: Sat, 17 Aug 2024 10:39:47 +0800 Subject: [PATCH 27/38] onnx2pnnx do not fold single constant for gemm weight (#5634) --- tools/pnnx/src/pass_onnx.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/pnnx/src/pass_onnx.cpp b/tools/pnnx/src/pass_onnx.cpp index 6318dacba25..87dd27d27cb 100644 --- a/tools/pnnx/src/pass_onnx.cpp +++ b/tools/pnnx/src/pass_onnx.cpp @@ -820,6 +820,8 @@ void pass_onnx(const onnx::ModelProto& model, Graph& pnnx_graph) is_attr_weight = true; if (sim_op_type == "Gather" && j == 0) is_attr_weight = true; + if (sim_op_type == "Gemm" && (j == 1 || j == 2)) + is_attr_weight = true; if (sim_op_type == "GroupNormalization" && (j == 1 || j == 2)) is_attr_weight = true; if (sim_op_type == "GRU" && (j == 1 || j == 2 || j == 3 || j == 5)) From a0c9e7783d221771457e1d71a8452475c2ba51f5 Mon Sep 17 00:00:00 2001 From: Joey Ballentine <34788790+joeyballentine@users.noreply.github.com> Date: Sat, 17 Aug 2024 00:36:17 -0500 Subject: [PATCH 28/38] Add python binding for loading bin from memory (#5164) --- python/src/main.cpp | 21 +++++++++++++++++++++ python/tests/test_net.py | 26 ++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/python/src/main.cpp b/python/src/main.cpp index a7ed0528c6a..e5b1264264c 100644 --- a/python/src/main.cpp +++ b/python/src/main.cpp @@ -34,6 +34,20 @@ using namespace ncnn; namespace py = pybind11; +class DataReaderFromMemoryCopy : public DataReaderFromMemory +{ +public: + explicit DataReaderFromMemoryCopy(const unsigned char*& mem) + : DataReaderFromMemory(mem) + { + } + + virtual size_t reference(size_t size, const void** buf) const + { + return 0; + } +}; + struct LayerFactory { std::string name; @@ -956,6 +970,13 @@ PYBIND11_MODULE(ncnn, m) #endif // NCNN_STRING .def("load_param_bin", (int (Net::*)(const char*)) & Net::load_param_bin, py::arg("protopath")) .def("load_model", (int (Net::*)(const char*)) & Net::load_model, py::arg("modelpath")) + .def( + "load_model_mem", [](Net& net, const char* mem) { + const unsigned char* _mem = (const unsigned char*)mem; + DataReaderFromMemoryCopy dr(_mem); + net.load_model(dr); + }, + py::arg("mem")) #endif // 
NCNN_STDIO .def("clear", &Net::clear) diff --git a/python/tests/test_net.py b/python/tests/test_net.py index 03271aff462..362cc4791fb 100644 --- a/python/tests/test_net.py +++ b/python/tests/test_net.py @@ -42,6 +42,32 @@ def test_net(): assert len(net.blobs()) == 0 and len(net.layers()) == 0 +def test_net_mem(): + modelbin = bytearray(303940) + modelbin[0:4] = 71,107,48,1 + modelbin[180:184] = 71,107,48,1 + + with ncnn.Net() as net: + ret = net.load_param("tests/test.param") + net.load_model_mem(bytes(modelbin)) + assert ret == 0 and len(net.blobs()) == 3 and len(net.layers()) == 3 + + input_names = net.input_names() + output_names = net.output_names() + assert len(input_names) > 0 and len(output_names) > 0 + + in_mat = ncnn.Mat((227, 227, 3)) + + with net.create_extractor() as ex: + ex.input("data", in_mat) + ret, out_mat = ex.extract("output") + + assert ret == 0 and out_mat.dims == 1 and out_mat.w == 1 + + net.clear() + assert len(net.blobs()) == 0 and len(net.layers()) == 0 + + def test_net_vulkan(): if not hasattr(ncnn, "get_gpu_count"): return From 27f64a1382e72d18e38577f3c922323c1a199ce4 Mon Sep 17 00:00:00 2001 From: nihui Date: Mon, 19 Aug 2024 11:17:53 +0800 Subject: [PATCH 29/38] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a9bb1c116fa..08166ab0766 100644 --- a/README.md +++ b/README.md @@ -560,7 +560,7 @@ https://github.com/Tencent/ncnn/releases/latest **[use netron for ncnn model visualization](https://netron.app)** -**[out-of-the-box web model conversion](https://convertmodel.com/#outputFormat=ncnn)** +**[use ncnn with pytorch or onnx](https://github.com/Tencent/ncnn/wiki/use-ncnn-with-pytorch-or-onnx)** [ncnn low-level operation api](https://github.com/Tencent/ncnn/wiki/low-level-operation-api) From a6d3ef5a0bb59fb496c553c3ef54d141642b4fc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=B5=E5=B0=8F=E5=87=A1?= <2672931+whyb@users.noreply.github.com> Date: Tue, 20 Aug 2024 08:23:56 +0800 Subject: [PATCH 30/38] Fixed bug #5637 (#5640) --- examples/yolov8.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/yolov8.cpp b/examples/yolov8.cpp index 5b3926582c8..e166e6c1d17 100644 --- a/examples/yolov8.cpp +++ b/examples/yolov8.cpp @@ -175,10 +175,10 @@ static void parse_yolov8_detections( for (int i = 0; i < num_anchors; i++) { - auto row_ptr = output.row(i).ptr(); - auto bboxes_ptr = row_ptr; - auto scores_ptr = row_ptr + 4; - auto max_s_ptr = std::max_element(scores_ptr, scores_ptr + num_labels); + const float* row_ptr = output.row(i).ptr(); + const float* bboxes_ptr = row_ptr; + const float* scores_ptr = row_ptr + 4; + const float* max_s_ptr = std::max_element(scores_ptr, scores_ptr + num_labels); float score = *max_s_ptr; if (score > confidence_threshold) { @@ -201,7 +201,7 @@ static void parse_yolov8_detections( object.label = max_s_ptr - scores_ptr; object.prob = score; object.rect = bbox; - detections.emplace_back(object); + detections.push_back(object); } } objects = detections; From 25a22e0c0c032b098153fb47c6199a48aa15ea92 Mon Sep 17 00:00:00 2001 From: nihui Date: Tue, 20 Aug 2024 16:59:17 +0800 Subject: [PATCH 31/38] update release download --- README.md | 89 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 50 insertions(+), 39 deletions(-) diff --git a/README.md b/README.md index 08166ab0766..146b04b1a4e 100644 --- a/README.md +++ b/README.md @@ -77,7 +77,7 @@ https://github.com/Tencent/ncnn/releases/latest Source - 
[](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-full-source.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-full-source.zip) @@ -97,8 +97,8 @@ https://github.com/Tencent/ncnn/releases/latest Android - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-android-vulkan.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-android.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-android-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-android.zip) @@ -111,8 +111,8 @@ https://github.com/Tencent/ncnn/releases/latest Android shared - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-android-vulkan-shared.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-android-shared.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-android-vulkan-shared.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-android-shared.zip) @@ -159,8 +159,8 @@ https://github.com/Tencent/ncnn/releases/latest iOS - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-ios-vulkan.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-ios.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ios-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ios.zip) @@ -173,8 +173,8 @@ https://github.com/Tencent/ncnn/releases/latest iOS-Simulator - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-ios-simulator-vulkan.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-ios-simulator.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ios-simulator-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ios-simulator.zip) @@ -193,8 +193,8 @@ https://github.com/Tencent/ncnn/releases/latest macOS - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-macos-vulkan.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-macos.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-macos-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-macos.zip) @@ -207,8 +207,8 @@ https://github.com/Tencent/ncnn/releases/latest Mac-Catalyst - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-mac-catalyst-vulkan.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-mac-catalyst.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-mac-catalyst-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-mac-catalyst.zip) @@ -221,7 +221,7 @@ https://github.com/Tencent/ncnn/releases/latest watchOS - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-watchos.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-watchos.zip) @@ -234,7 +234,7 @@ https://github.com/Tencent/ncnn/releases/latest watchOS-Simulator - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-watchos-simulator.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-watchos-simulator.zip) @@ -242,8 +242,8 @@ https://github.com/Tencent/ncnn/releases/latest tvOS - 
[](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-tvos-vulkan.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-tvos.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-tvos-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-tvos.zip) @@ -256,8 +256,8 @@ https://github.com/Tencent/ncnn/releases/latest tvOS-Simulator - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-tvos-simulator-vulkan.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-tvos-simulator.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-tvos-simulator-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-tvos-simulator.zip) @@ -265,7 +265,8 @@ https://github.com/Tencent/ncnn/releases/latest visionOS - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-visionos.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-visionos-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-visionos.zip) @@ -278,7 +279,8 @@ https://github.com/Tencent/ncnn/releases/latest visionOS-Simulator - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-visionos-simulator.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-visionos-simulator-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-visionos-simulator.zip) @@ -286,8 +288,8 @@ https://github.com/Tencent/ncnn/releases/latest Apple xcframework - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-apple-vulkan.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-apple.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-apple-vulkan.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-apple.zip) @@ -296,10 +298,10 @@ https://github.com/Tencent/ncnn/releases/latest - + - + - [Build for Linux / NVIDIA Jetson / Raspberry Pi3, Pi4 / POWER](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-linux) @@ -309,11 +311,11 @@ https://github.com/Tencent/ncnn/releases/latest Ubuntu 20.04 - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-ubuntu-2004.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-ubuntu-2004-shared.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ubuntu-2004.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ubuntu-2004-shared.zip) - + [](https://github.com/Tencent/ncnn/actions?query=workflow%3Alinux-x64-gpu-gcc) @@ -323,8 +325,17 @@ https://github.com/Tencent/ncnn/releases/latest Ubuntu 22.04 - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-ubuntu-2204.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-ubuntu-2204-shared.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ubuntu-2204.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ubuntu-2204-shared.zip) + + + + +Ubuntu 24.04 + + + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ubuntu-2404.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-ubuntu-2404-shared.zip) @@ -344,8 +355,8 @@ https://github.com/Tencent/ncnn/releases/latest 
VS2015 - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-windows-vs2015.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-windows-vs2015-shared.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-windows-vs2015.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-windows-vs2015-shared.zip) @@ -358,8 +369,8 @@ https://github.com/Tencent/ncnn/releases/latest VS2017 - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-windows-vs2017.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-windows-vs2017-shared.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-windows-vs2017.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-windows-vs2017-shared.zip) @@ -367,8 +378,8 @@ https://github.com/Tencent/ncnn/releases/latest VS2019 - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-windows-vs2019.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-windows-vs2019-shared.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-windows-vs2019.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-windows-vs2019-shared.zip) @@ -376,8 +387,8 @@ https://github.com/Tencent/ncnn/releases/latest VS2022 - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-windows-vs2022.zip) - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-windows-vs2022-shared.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-windows-vs2022.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-windows-vs2022-shared.zip) @@ -396,7 +407,7 @@ https://github.com/Tencent/ncnn/releases/latest WebAssembly - [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240410-webassembly.zip) + [](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20240820-webassembly.zip) From 5e2d56d025d1f40a2a26b5cc4733547cacd2dd8f Mon Sep 17 00:00:00 2001 From: nihui Date: Fri, 30 Aug 2024 12:27:54 +0800 Subject: [PATCH 32/38] pnnx fuse mobilevit style selfattention, onnx2pnnx handle more general gemm (#5659) --- tools/pnnx/src/pass_level2/F_hardswish.cpp | 26 ++++++++ tools/pnnx/src/pass_level2/F_linear.cpp | 66 ++++++++++++++++++- .../pass_level5/fuse_multiheadattention.cpp | 53 +++++++++++++++ 3 files changed, 143 insertions(+), 2 deletions(-) diff --git a/tools/pnnx/src/pass_level2/F_hardswish.cpp b/tools/pnnx/src/pass_level2/F_hardswish.cpp index caa724f55a7..2ce9e1b420b 100644 --- a/tools/pnnx/src/pass_level2/F_hardswish.cpp +++ b/tools/pnnx/src/pass_level2/F_hardswish.cpp @@ -343,4 +343,30 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_hardswish_onnx_2, 9) +class F_hardswish_onnx_3 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +8 7 +pnnx.Input input 0 1 input +prim::Constant op_0 0 1 v3 value=3 +aten::add op_1 2 1 input v3 a +aten::clamp op_2 1 1 a b max=6 min=0 +aten::mul op_3 2 1 input b c +prim::Constant op_4 0 1 v6 value=6 +aten::div op_5 2 1 c v6 out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "F.hardswish"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_hardswish_onnx_3, 9) + } // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/F_linear.cpp 
b/tools/pnnx/src/pass_level2/F_linear.cpp index 4c454581ec3..62f9d62e505 100644 --- a/tools/pnnx/src/pass_level2/F_linear.cpp +++ b/tools/pnnx/src/pass_level2/F_linear.cpp @@ -129,7 +129,7 @@ class F_linear_onnx : public GraphRewriterPass pnnx.Input input_0 0 1 input pnnx.Input input_1 0 1 weight pnnx.Input input_2 0 1 bias -Gemm op_0 3 1 input weight bias out alpha=1.000000e+00 beta=1.000000e+00 transB=1 +Gemm gemm 3 1 input weight bias out %*=%* pnnx.Output output 1 0 out )PNNXIR"; } @@ -138,6 +138,39 @@ pnnx.Output output 1 0 out { return "F.linear"; } + + bool match(const std::map& matched_operators, const std::map& captured_params, const std::map& /*captured_attrs*/) const + { + if (captured_params.find("gemm.alpha") != captured_params.end()) + { + if (captured_params.at("gemm.alpha").type != 3 || captured_params.at("gemm.alpha").f != 1.f) + return false; + } + + if (captured_params.find("gemm.beta") != captured_params.end()) + { + if (captured_params.at("gemm.beta").type != 3 || captured_params.at("gemm.beta").f != 1.f) + return false; + } + + if (captured_params.find("gemm.transA") != captured_params.end()) + { + if (captured_params.at("gemm.transA").type != 2 || captured_params.at("gemm.transA").i != 0) + return false; + } + + if (captured_params.find("gemm.transB") == captured_params.end()) + return false; + + if (captured_params.at("gemm.transB").type != 2 || captured_params.at("gemm.transB").i != 1) + return false; + + return true; + } + + void write(Operator* op, const std::map& /*captured_params*/) const + { + } }; REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_linear_onnx, 10) @@ -152,7 +185,7 @@ class F_linear_onnx_1 : public GraphRewriterPass pnnx.Input input_0 0 1 input pnnx.Input input_1 0 1 bias pnnx.Attribute weight 0 1 weight @data=(%in_features,%out_features)f32 -Gemm gemm 3 1 input weight bias out alpha=1.000000e+00 beta=1.000000e+00 +Gemm gemm 3 1 input weight bias out %*=%* pnnx.Output output 1 0 out )PNNXIR"; } @@ -169,6 +202,35 @@ pnnx.Output output 1 0 out )PNNXIR"; } + bool match(const std::map& matched_operators, const std::map& captured_params, const std::map& /*captured_attrs*/) const + { + if (captured_params.find("gemm.alpha") != captured_params.end()) + { + if (captured_params.at("gemm.alpha").type != 3 || captured_params.at("gemm.alpha").f != 1.f) + return false; + } + + if (captured_params.find("gemm.beta") != captured_params.end()) + { + if (captured_params.at("gemm.beta").type != 3 || captured_params.at("gemm.beta").f != 1.f) + return false; + } + + if (captured_params.find("gemm.transA") != captured_params.end()) + { + if (captured_params.at("gemm.transA").type != 2 || captured_params.at("gemm.transA").i != 0) + return false; + } + + if (captured_params.find("gemm.transB") != captured_params.end()) + { + if (captured_params.at("gemm.transB").type != 2 || captured_params.at("gemm.transB").i != 0) + return false; + } + + return true; + } + void write(const std::map& ops, const std::map& captured_params, const std::map& captured_attrs) const { const int in_features = captured_params.at("in_features").i; diff --git a/tools/pnnx/src/pass_level5/fuse_multiheadattention.cpp b/tools/pnnx/src/pass_level5/fuse_multiheadattention.cpp index b6297eb8a92..c178788f2a7 100644 --- a/tools/pnnx/src/pass_level5/fuse_multiheadattention.cpp +++ b/tools/pnnx/src/pass_level5/fuse_multiheadattention.cpp @@ -702,6 +702,57 @@ pnnx.Output output 1 0 out } }; +class fuse_multiheadattention_pass_1_1_1 : public fuse_multiheadattention_pass_sameqkv +{ +public: + const char* 
match_pattern_graph() const + { + return R"PNNXIR(7767517 +19 18 +pnnx.Input input 0 1 input +nn.Linear op_0 1 1 input 256 bias=%qbias in_features=%embed_dim out_features=%embed_dim @bias @weight +nn.Linear op_1 1 1 input 257 bias=%kbias in_features=%embed_dim out_features=%embed_dim @bias @weight +nn.Linear op_2 1 1 input 260 bias=%vbias in_features=%embed_dim out_features=%embed_dim @bias @weight +Tensor.view op_3 1 1 256 263 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.view op_4 1 1 257 258 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.view op_5 1 1 260 261 shape=(%batch,%size,%num_heads,%feat_per_head) +Tensor.permute op_6 1 1 263 264 dims=(0,2,1,3) +Tensor.permute op_7 1 1 258 259 dims=(0,2,1,3) +Tensor.permute op_8 1 1 261 262 dims=(0,2,1,3) +torch.transpose op_9 1 1 259 265 dim0=-1 dim1=-2 +torch.matmul op_10 2 1 264 265 266 +pnnx.Expression op_11 1 1 266 267 expr=div(@0,%sqrt_feat_per_head) +F.softmax softmax 1 1 267 268 dim=%softmax_dim +torch.matmul op_13 2 1 268 262 269 +Tensor.permute op_14 1 1 269 270 dims=(0,2,1,3) +Tensor.reshape op_15 1 1 270 271 shape=(%batch,%size,%embed_dim) +nn.Linear out_proj 1 1 271 out bias=%outbias in_features=%embed_dim out_features=%embed_dim @bias @weight +pnnx.Output output 1 0 out +)PNNXIR"; + } + + bool match(const std::map& matched_operators, const std::map& captured_params, const std::map& /*captured_attrs*/) const + { + const int embed_dim = captured_params.at("embed_dim").i; + const int num_heads = captured_params.at("num_heads").i; + const int feat_per_head = captured_params.at("feat_per_head").i; + const float sqrt_feat_per_head = captured_params.at("sqrt_feat_per_head").f; + const int softmax_dim = captured_params.at("softmax_dim").i; + + if (embed_dim != num_heads * feat_per_head) + return false; + + if (!NearlyEqual(sqrt_feat_per_head, sqrt(feat_per_head), 0.001)) + return false; + + int softmax_input_rank = (int)matched_operators.at("softmax")->inputs[0]->shape.size(); + if (softmax_dim != -1 && softmax_dim != softmax_input_rank - 1) + return false; + + return true; + } +}; + class fuse_multiheadattention_pass_1_2 : public fuse_multiheadattention_pass_qkv { public: @@ -2082,6 +2133,7 @@ void fuse_multiheadattention(Graph& graph) fuse_multiheadattention_pass_q_samekv d; fuse_multiheadattention_pass_1 b1; fuse_multiheadattention_pass_1_1 b11; + fuse_multiheadattention_pass_1_1_1 b111; fuse_multiheadattention_pass_1_2 b12; fuse_multiheadattention_pass_2 c1; fuse_multiheadattention_pass_3 d1; @@ -2122,6 +2174,7 @@ void fuse_multiheadattention(Graph& graph) pnnx_graph_rewrite(graph, &d, opindex); pnnx_graph_rewrite(graph, &b1, opindex); pnnx_graph_rewrite(graph, &b11, opindex); + pnnx_graph_rewrite(graph, &b111, opindex); pnnx_graph_rewrite(graph, &b12, opindex); pnnx_graph_rewrite(graph, &c1, opindex); pnnx_graph_rewrite(graph, &d1, opindex); From 5df5413c81312b0106fe18066b47e2917afabd27 Mon Sep 17 00:00:00 2001 From: nihui Date: Mon, 2 Sep 2024 18:48:01 +0800 Subject: [PATCH 33/38] embed int8 quantization and add embed test (#5667) --- .ci/pnnx.yml | 2 + docs/developer-guide/operators.md | 2 + src/layer/embed.cpp | 88 +++++++++++++++++++++--- src/layer/embed.h | 6 ++ tests/CMakeLists.txt | 1 + tests/test_embed.cpp | 108 ++++++++++++++++++++++++++++++ tools/modelwriter.h | 11 +++ tools/quantize/ncnn2int8.cpp | 52 ++++++++++++++ 8 files changed, 261 insertions(+), 9 deletions(-) create mode 100644 tests/test_embed.cpp diff --git a/.ci/pnnx.yml b/.ci/pnnx.yml index 990690e0c5b..d49da39a0af 100644 --- a/.ci/pnnx.yml +++ 
b/.ci/pnnx.yml @@ -4,12 +4,14 @@ on: branches: [master] paths: - '.ci/pnnx.yml' + - 'src/layer/*' - 'tools/pnnx/**' - '!tools/pnnx/README.md' mr: target-branches: [master] paths: - '.ci/pnnx.yml' + - 'src/layer/*' - 'tools/pnnx/**' - '!tools/pnnx/README.md' concurrency: diff --git a/docs/developer-guide/operators.md b/docs/developer-guide/operators.md index 7594c0843ac..de4d6b428e9 100644 --- a/docs/developer-guide/operators.md +++ b/docs/developer-guide/operators.md @@ -837,11 +837,13 @@ y = embedding(x) | 1 | input_dim | int | 0 | | | 2 | bias_term | int | 0 | | | 3 | weight_data_size | int | 0 | | +| 18 | int8_scale_term| int | 0 | | | weight | type | shape | | ------------- | ----- | --------------------- | | weight_data | float | [weight_data_size] | | bias_term | float | [num_output] | +| weight_data_int8_scales| float | [1] | # Exp ``` diff --git a/src/layer/embed.cpp b/src/layer/embed.cpp index ddda6b8bf19..2b9f8a60042 100644 --- a/src/layer/embed.cpp +++ b/src/layer/embed.cpp @@ -30,6 +30,7 @@ int Embed::load_param(const ParamDict& pd) input_dim = pd.get(1, 0); bias_term = pd.get(2, 0); weight_data_size = pd.get(3, 0); + int8_scale_term = pd.get(18, 0); return 0; } @@ -47,18 +48,23 @@ int Embed::load_model(const ModelBin& mb) return -100; } +#if NCNN_INT8 + if (int8_scale_term) + { + weight_data_int8_scale = mb.load(1, 1)[0]; + } +#endif // NCNN_INT8 + return 0; } -int Embed::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +static void embed(const Mat& bottom_blob, const Mat& weight_data, const Mat& bias_data, Mat& top_blob, int input_dim, const Option& opt) { - int words = static_cast(bottom_blob.total()); + const int num_output = top_blob.w; + const int words = top_blob.h; - top_blob.create(num_output, words, 4u, opt.blob_allocator); - if (top_blob.empty()) - return -100; + const float* bias_ptr = bias_data; - // num_output #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < words; q++) { @@ -73,15 +79,79 @@ int Embed::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) con const float* em = (const float*)weight_data + num_output * word_index; - memcpy(outptr, em, num_output * sizeof(float)); + if (bias_ptr) + { + for (int p = 0; p < num_output; p++) + { + outptr[p] = em[p] + bias_ptr[p]; + } + } + else + { + memcpy(outptr, em, num_output * sizeof(float)); + } + } +} + +#if NCNN_INT8 +static void embed_int8(const Mat& bottom_blob, const Mat& weight_data, float weight_data_int8_scale, const Mat& bias_data, Mat& top_blob, int input_dim, const Option& opt) +{ + const int num_output = top_blob.w; + const int words = top_blob.h; + + const float* bias_ptr = bias_data; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < words; q++) + { + float* outptr = top_blob.row(q); + + int word_index = ((const int*)bottom_blob)[q]; - if (bias_term) + if (word_index < 0) + word_index = 0; + if (word_index >= input_dim) + word_index = input_dim - 1; + + const float descale_em = 1.f / weight_data_int8_scale; + + const signed char* em = (const signed char*)weight_data + num_output * word_index; + + if (bias_ptr) { for (int p = 0; p < num_output; p++) { - outptr[p] += bias_data[p]; + outptr[p] = em[p] * descale_em + bias_ptr[p]; } } + else + { + for (int p = 0; p < num_output; p++) + { + outptr[p] = em[p] * descale_em; + } + } + } +} +#endif // NCNN_INT8 + +int Embed::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int words = static_cast(bottom_blob.total()); + + 
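+    // Editorial sketch, not in the original patch: top_blob is always fp32
+    // (4u elemsize), one row of num_output values per input word index; the
+    // int8 branch below dequantizes each selected row with the stored scale.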
top_blob.create(num_output, words, 4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if NCNN_INT8 + if (int8_scale_term) + { + embed_int8(bottom_blob, weight_data, weight_data_int8_scale, bias_data, top_blob, input_dim, opt); + } + else +#endif // NCNN_INT8 + { + embed(bottom_blob, weight_data, bias_data, top_blob, input_dim, opt); } return 0; diff --git a/src/layer/embed.h b/src/layer/embed.h index 8e236656716..b94c2b17bee 100644 --- a/src/layer/embed.h +++ b/src/layer/embed.h @@ -38,9 +38,15 @@ class Embed : public Layer int weight_data_size; + int int8_scale_term; + // model Mat weight_data; Mat bias_data; + +#if NCNN_INT8 + float weight_data_int8_scale; +#endif }; } // namespace ncnn diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 6c8939fc7c7..e2ddc32a00d 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -101,6 +101,7 @@ ncnn_add_layer_test(Dropout) ncnn_add_layer_test(Einsum) ncnn_add_layer_test(Eltwise) ncnn_add_layer_test(ELU) +ncnn_add_layer_test(Embed) ncnn_add_layer_test(Erf) ncnn_add_layer_test(ExpandDims) ncnn_add_layer_test(Flatten) diff --git a/tests/test_embed.cpp b/tests/test_embed.cpp new file mode 100644 index 00000000000..9c007ee5d7e --- /dev/null +++ b/tests/test_embed.cpp @@ -0,0 +1,108 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "testutil.h" + +static int test_embed(int words, int num_output, int input_dim, int bias) +{ + ncnn::ParamDict pd; + pd.set(0, num_output); + pd.set(1, input_dim); + pd.set(2, bias); + pd.set(3, num_output * input_dim); + + std::vector weights(bias ? 2 : 1); + weights[0] = RandomMat(num_output * input_dim); + if (bias) + weights[1] = RandomMat(num_output); + + ncnn::Mat a(words); + RandomizeInt(a, 0, input_dim); + + int ret = test_layer("Embed", pd, weights, a); + if (ret != 0) + { + fprintf(stderr, "test_embed failed words=%d num_output=%d input_dim=%d bias=%d\n", words, num_output, input_dim, bias); + } + + return ret; +} + +static int test_embed_0() +{ + return 0 + || test_embed(128, 128, 128, 0) + || test_embed(128, 128, 128, 1) + || test_embed(127, 127, 127, 0) + || test_embed(127, 127, 127, 1) + || test_embed(124, 124, 124, 0) + || test_embed(124, 124, 124, 1); +} + +#if NCNN_INT8 +static int test_embed_int8(int words, int num_output, int input_dim, int bias) +{ + ncnn::ParamDict pd; + pd.set(0, num_output); + pd.set(1, input_dim); + pd.set(2, bias); + pd.set(3, num_output * input_dim); + pd.set(18, 2); + + std::vector weights(bias ? 
3 : 2); + weights[0] = RandomS8Mat(num_output * input_dim); + if (bias) + { + weights[1] = RandomMat(num_output); + weights[2] = RandomMat(1, 100.f, 200.f); + } + else + { + weights[1] = RandomMat(1, 100.f, 200.f); + } + + ncnn::Mat a(words); + RandomizeInt(a, 0, input_dim); + + int ret = test_layer("Embed", pd, weights, a); + if (ret != 0) + { + fprintf(stderr, "test_embed_int8 failed words=%d num_output=%d input_dim=%d bias=%d\n", words, num_output, input_dim, bias); + } + + return ret; +} + +static int test_embed_1() +{ + return 0 + || test_embed_int8(128, 128, 128, 0) + || test_embed_int8(128, 128, 128, 1) + || test_embed_int8(127, 127, 127, 0) + || test_embed_int8(127, 127, 127, 1) + || test_embed_int8(124, 124, 124, 0) + || test_embed_int8(124, 124, 124, 1); +} +#endif // NCNN_INT8 + +int main() +{ + SRAND(7767517); + +#if NCNN_INT8 + return test_embed_0() || test_embed_1(); +#else + return test_embed_0(); +#endif +} diff --git a/tools/modelwriter.h b/tools/modelwriter.h index 4f445cfe2a4..39157c453ec 100644 --- a/tools/modelwriter.h +++ b/tools/modelwriter.h @@ -1676,9 +1676,20 @@ int ModelWriter::save(const char* parampath, const char* binpath) fprintf_param_value(" 1=%d", input_dim) fprintf_param_value(" 2=%d", bias_term) fprintf_param_value(" 3=%d", weight_data_size) + fprintf_param_value(" 18=%d", int8_scale_term) fwrite_weight_tag_data(op->weight_data, bp); fwrite_weight_data(op->bias_data, bp); + +#if NCNN_INT8 + // write int8_scale data + if (op->int8_scale_term) + { + ncnn::Mat weight_data_int8_scales(1); + weight_data_int8_scales[0] = op->weight_data_int8_scale; + fwrite_weight_data(weight_data_int8_scales, bp, 90, 100); + } +#endif // NCNN_INT8 } else if (layer->type == "Exp") { diff --git a/tools/quantize/ncnn2int8.cpp b/tools/quantize/ncnn2int8.cpp index 4d19ceb6f16..5e92b333aa5 100644 --- a/tools/quantize/ncnn2int8.cpp +++ b/tools/quantize/ncnn2int8.cpp @@ -133,6 +133,8 @@ class NetQuantize : public ModelWriter int quantize_lstm(); int quantize_gru(); + int quantize_embed(); + int fuse_requantize(); }; @@ -562,6 +564,55 @@ int NetQuantize::quantize_gru() return 0; } +int NetQuantize::quantize_embed() +{ + for (size_t i = 0; i < layers.size(); i++) + { + if (layers[i]->type != "Embed") + continue; + + // Embed - quantize weight from fp32 to int8 + ncnn::Embed* embed = (ncnn::Embed*)layers[i]; + + fprintf(stderr, "quantize_embed %s\n", embed->name.c_str()); + + // TODO move to ncnn2table + + const int num_output = embed->num_output; + const int input_dim = embed->input_dim; + + ncnn::Mat weight_data_int8_scales(1); + { + const float* ptr = embed->weight_data; + float absmax = 0.f; + for (int i = 0; i < embed->weight_data.w; i++) + { + absmax = std::max(absmax, (float)fabs(ptr[i])); + } + + weight_data_int8_scales[0] = absmax == 0.f ? 
1.f : 127 / absmax;
+        }
+
+        {
+            ncnn::Mat weight_data_int8;
+
+            ncnn::Option opt_q = opt;
+            opt_q.blob_allocator = embed->weight_data.allocator;
+            opt_q.use_packing_layout = false;
+            ncnn::quantize_to_int8(embed->weight_data, weight_data_int8, weight_data_int8_scales, opt_q);
+            if (weight_data_int8.empty())
+                return -100;
+
+            embed->weight_data = weight_data_int8;
+        }
+
+        embed->int8_scale_term = 2;
+        embed->weight_data_int8_scale = weight_data_int8_scales[0];
+    }
+
+    return 0;
+}
+
 int NetQuantize::fuse_requantize()
 {
     const size_t layer_count = layers.size();

@@ -809,6 +860,7 @@ int main(int argc, char** argv)
     quantizer.quantize_rnn();
     quantizer.quantize_lstm();
     quantizer.quantize_gru();
+    quantizer.quantize_embed();

     quantizer.fuse_requantize();

From 8077d340a905ff4b15f7c266da85c811983e6291 Mon Sep 17 00:00:00 2001
From: nihui
Date: Tue, 3 Sep 2024 17:16:50 +0800
Subject: [PATCH 34/38] arm neon optimization for rmsnorm (#5668)

---
 src/layer/arm/rmsnorm_arm.cpp         | 417 ++++++++++++++++++++++++++
 src/layer/arm/rmsnorm_arm.h           |  40 +++
 src/layer/arm/rmsnorm_arm_asimdhp.cpp | 272 +++++++++++++++++
 3 files changed, 729 insertions(+)
 create mode 100644 src/layer/arm/rmsnorm_arm.cpp
 create mode 100644 src/layer/arm/rmsnorm_arm.h
 create mode 100644 src/layer/arm/rmsnorm_arm_asimdhp.cpp

diff --git a/src/layer/arm/rmsnorm_arm.cpp b/src/layer/arm/rmsnorm_arm.cpp
new file mode 100644
index 00000000000..e19136ca29d
--- /dev/null
+++ b/src/layer/arm/rmsnorm_arm.cpp
@@ -0,0 +1,417 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
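+
+// Editorial aside, not part of the original patch: a minimal scalar sketch of
+// what the NEON kernels in this file compute. RMSNorm rescales each vector x
+// of length n (here n == affine_size) as
+//
+//     float ss = 0.f;
+//     for (int i = 0; i < n; i++) ss += x[i] * x[i];
+//     const float scale = 1.f / sqrtf(ss / n + eps);
+//     for (int i = 0; i < n; i++) y[i] = x[i] * scale * (gamma ? gamma[i] : 1.f);
+//
+// i.e. LayerNorm without the mean subtraction; gamma is the optional affine
+// weight loaded by the base RMSNorm layer.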
+ +#include "rmsnorm_arm.h" + +#if __ARM_NEON +#include +#endif // __ARM_NEON + +#include "arm_usability.h" +#include "cpu.h" + +namespace ncnn { + +RMSNorm_arm::RMSNorm_arm() +{ +#if __ARM_NEON + support_packing = true; +#if NCNN_ARM82 + support_fp16_storage = cpu_support_arm_asimdhp(); +#endif +#endif // __ARM_NEON + +#if NCNN_BF16 + support_bf16_storage = true; +#endif +} + +static void rmsnorm(float* ptr, const float* gamma_ptr, float eps, int elemcount, int elempack) +{ + const int size = elemcount * elempack; + +#if __ARM_NEON + float32x4_t _rms = vdupq_n_f32(0.f); +#endif // __ARM_NEON + float rms = 0.f; + { + const float* ptr0 = ptr; + + int i = 0; +#if __ARM_NEON + for (; i + 3 < size; i += 4) + { + float32x4_t _p = vld1q_f32(ptr0); + _rms = vmlaq_f32(_rms, _p, _p); + ptr0 += 4; + } +#endif // __ARM_NEON + for (; i < size; i++) + { + rms += ptr0[0] * ptr0[0]; + ptr0++; + } + } + +#if __ARM_NEON + if (elempack == 4) + { + float32x4_t _elemcount = vdupq_n_f32(elemcount); + float32x4_t _eps = vdupq_n_f32(eps); + +#if __aarch64__ + _rms = vdivq_f32(_rms, _elemcount); + _rms = vaddq_f32(_rms, _eps); +#else + float32x4_t _inv_elemcount = vrecpeq_f32(_elemcount); + _inv_elemcount = vmulq_f32(vrecpsq_f32(_elemcount, _inv_elemcount), _inv_elemcount); + _inv_elemcount = vmulq_f32(vrecpsq_f32(_elemcount, _inv_elemcount), _inv_elemcount); + _rms = vmlaq_f32(_eps, _rms, _inv_elemcount); +#endif + + float32x4_t _rsqrt_rms = vrsqrteq_f32(_rms); + _rsqrt_rms = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms, _rsqrt_rms), _rsqrt_rms), _rsqrt_rms); + _rms = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms, _rsqrt_rms), _rsqrt_rms), _rsqrt_rms); + } +#endif // __ARM_NEON + if (elempack == 1) + { +#if __ARM_NEON +#if __aarch64__ + rms += vaddvq_f32(_rms); +#else + float32x2_t _s2 = vadd_f32(vget_low_f32(_rms), vget_high_f32(_rms)); + _s2 = vpadd_f32(_s2, _s2); + rms += vget_lane_f32(_s2, 0); +#endif +#endif // __ARM_NEON + + rms = 1.f / sqrtf(rms / elemcount + eps); +#if __ARM_NEON + _rms = vdupq_n_f32(rms); +#endif // __ARM_NEON + } + + if (gamma_ptr) + { + int i = 0; +#if __ARM_NEON + if (elempack == 4) + { + for (; i + 3 < size; i += 4) + { + float32x4_t _p = vld1q_f32(ptr); + float32x4_t _gamma = vdupq_n_f32(gamma_ptr[0]); + _p = vmulq_f32(_p, _rms); + _p = vmulq_f32(_p, _gamma); + vst1q_f32(ptr, _p); + ptr += 4; + gamma_ptr += 1; + } + } + if (elempack == 1) + { + for (; i + 3 < size; i += 4) + { + float32x4_t _p = vld1q_f32(ptr); + float32x4_t _gamma = vld1q_f32(gamma_ptr); + _p = vmulq_f32(_p, _rms); + _p = vmulq_f32(_p, _gamma); + vst1q_f32(ptr, _p); + ptr += 4; + gamma_ptr += 4; + } + } +#endif // __ARM_NEON + for (; i < size; i++) + { + ptr[0] = (ptr[0] * rms) * gamma_ptr[0]; + ptr++; + gamma_ptr++; + } + } + else + { + int i = 0; +#if __ARM_NEON + for (; i + 3 < size; i += 4) + { + float32x4_t _p = vld1q_f32(ptr); + _p = vmulq_f32(_p, _rms); + vst1q_f32(ptr, _p); + ptr += 4; + } +#endif // __ARM_NEON + for (; i < size; i++) + { + ptr[0] = ptr[0] * rms; + ptr++; + } + } +} + +int RMSNorm_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int elembits = bottom_top_blob.elembits(); + +#if NCNN_ARM82 + if (support_fp16_storage && opt.use_fp16_storage && elembits == 16) + return forward_inplace_fp16s(bottom_top_blob, opt); +#endif + +#if NCNN_BF16 + if (opt.use_bf16_storage && elembits == 16) + return forward_inplace_bf16s(bottom_top_blob, opt); +#endif + + const int dims = bottom_top_blob.dims; + const int w = bottom_top_blob.w; + const int h = bottom_top_blob.h; + const int channels = 
bottom_top_blob.c; + const int elempack = bottom_top_blob.elempack; + + if (dims == 1) + { + // assert affine_size == w + + float* ptr = bottom_top_blob; + rmsnorm(ptr, gamma_data, eps, w * elempack, 1); + } + + if (dims == 2) + { + // assert affine_size == w + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.row(i); + rmsnorm(ptr, gamma_data, eps, w, elempack); + } + } + + if (dims == 3) + { + if (affine_size == w) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.channel(q).row(i); + rmsnorm(ptr, gamma_data, eps, w, elempack); + } + } + } + else // if (affine_size == w * h) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + rmsnorm(ptr, gamma_data, eps, w * h, elempack); + } + } + } + + return 0; +} + +#if NCNN_BF16 +static void rmsnorm_bf16s(unsigned short* ptr, const float* gamma_ptr, float eps, int elemcount, int elempack) +{ + const int size = elemcount * elempack; + +#if __ARM_NEON + float32x4_t _rms = vdupq_n_f32(0.f); +#endif // __ARM_NEON + float rms = 0.f; + { + const unsigned short* ptr0 = ptr; + + int i = 0; +#if __ARM_NEON + for (; i + 3 < size; i += 4) + { + float32x4_t _p = bfloat2float(vld1_u16(ptr0)); + _rms = vmlaq_f32(_rms, _p, _p); + ptr0 += 4; + } +#endif // __ARM_NEON + for (; i < size; i++) + { + float v = bfloat16_to_float32(ptr0[0]); + rms += v * v; + ptr0++; + } + } + +#if __ARM_NEON + if (elempack == 4) + { + float32x4_t _elemcount = vdupq_n_f32(elemcount); + float32x4_t _eps = vdupq_n_f32(eps); + +#if __aarch64__ + _rms = vdivq_f32(_rms, _elemcount); + _rms = vaddq_f32(_rms, _eps); +#else + float32x4_t _inv_elemcount = vrecpeq_f32(_elemcount); + _inv_elemcount = vmulq_f32(vrecpsq_f32(_elemcount, _inv_elemcount), _inv_elemcount); + _inv_elemcount = vmulq_f32(vrecpsq_f32(_elemcount, _inv_elemcount), _inv_elemcount); + _rms = vmlaq_f32(_eps, _rms, _inv_elemcount); +#endif + + float32x4_t _rsqrt_rms = vrsqrteq_f32(_rms); + _rsqrt_rms = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms, _rsqrt_rms), _rsqrt_rms), _rsqrt_rms); + _rms = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms, _rsqrt_rms), _rsqrt_rms), _rsqrt_rms); + } +#endif // __ARM_NEON + if (elempack == 1) + { +#if __ARM_NEON +#if __aarch64__ + rms += vaddvq_f32(_rms); +#else + float32x2_t _s2 = vadd_f32(vget_low_f32(_rms), vget_high_f32(_rms)); + _s2 = vpadd_f32(_s2, _s2); + rms += vget_lane_f32(_s2, 0); +#endif +#endif // __ARM_NEON + + rms = 1.f / sqrtf(rms / elemcount + eps); +#if __ARM_NEON + _rms = vdupq_n_f32(rms); +#endif // __ARM_NEON + } + + if (gamma_ptr) + { + int i = 0; +#if __ARM_NEON + if (elempack == 4) + { + for (; i + 3 < size; i += 4) + { + float32x4_t _p = bfloat2float(vld1_u16(ptr)); + float32x4_t _gamma = vdupq_n_f32(gamma_ptr[0]); + _p = vmulq_f32(_p, _rms); + _p = vmulq_f32(_p, _gamma); + vst1_u16(ptr, float2bfloat(_p)); + ptr += 4; + gamma_ptr += 1; + } + } + if (elempack == 1) + { + for (; i + 3 < size; i += 4) + { + float32x4_t _p = bfloat2float(vld1_u16(ptr)); + float32x4_t _gamma = vld1q_f32(gamma_ptr); + _p = vmulq_f32(_p, _rms); + _p = vmulq_f32(_p, _gamma); + vst1_u16(ptr, float2bfloat(_p)); + ptr += 4; + gamma_ptr += 4; + } + } +#endif // __ARM_NEON + for (; i < size; i++) + { + float v = bfloat16_to_float32(ptr[0]); + ptr[0] = float32_to_bfloat16((v * rms) * gamma_ptr[0]); + ptr++; + gamma_ptr++; + } + } + else + { + 
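        // no gamma: only the reciprocal-rms scale is applied, with a
        // bf16 -> fp32 -> bf16 round trip per element (editorial comment,
        // not in the original patch)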
int i = 0; +#if __ARM_NEON + for (; i + 3 < size; i += 4) + { + float32x4_t _p = bfloat2float(vld1_u16(ptr)); + _p = vmulq_f32(_p, _rms); + vst1_u16(ptr, float2bfloat(_p)); + ptr += 4; + } +#endif // __ARM_NEON + for (; i < size; i++) + { + float v = bfloat16_to_float32(ptr[0]); + ptr[0] = float32_to_bfloat16(v * rms); + ptr++; + } + } +} + +int RMSNorm_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const +{ + const int dims = bottom_top_blob.dims; + const int w = bottom_top_blob.w; + const int h = bottom_top_blob.h; + const int channels = bottom_top_blob.c; + const int elempack = bottom_top_blob.elempack; + + if (dims == 1) + { + // assert affine_size == w + + unsigned short* ptr = bottom_top_blob; + rmsnorm_bf16s(ptr, gamma_data, eps, w * elempack, 1); + } + + if (dims == 2) + { + // assert affine_size == w + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + unsigned short* ptr = bottom_top_blob.row(i); + rmsnorm_bf16s(ptr, gamma_data, eps, w, elempack); + } + } + + if (dims == 3) + { + if (affine_size == w) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + for (int i = 0; i < h; i++) + { + unsigned short* ptr = bottom_top_blob.channel(q).row(i); + rmsnorm_bf16s(ptr, gamma_data, eps, w, elempack); + } + } + } + else // if (affine_size == w * h) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + unsigned short* ptr = bottom_top_blob.channel(q); + rmsnorm_bf16s(ptr, gamma_data, eps, w * h, elempack); + } + } + } + + return 0; +} +#endif // NCNN_BF16 + +} // namespace ncnn diff --git a/src/layer/arm/rmsnorm_arm.h b/src/layer/arm/rmsnorm_arm.h new file mode 100644 index 00000000000..44015333371 --- /dev/null +++ b/src/layer/arm/rmsnorm_arm.h @@ -0,0 +1,40 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_RMSNORM_ARM_H +#define LAYER_RMSNORM_ARM_H + +#include "rmsnorm.h" + +namespace ncnn { + +class RMSNorm_arm : public RMSNorm +{ +public: + RMSNorm_arm(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; + +protected: +#if NCNN_ARM82 + int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; +#endif +#if NCNN_BF16 + int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const; +#endif +}; + +} // namespace ncnn + +#endif // LAYER_RMSNORM_ARM_H diff --git a/src/layer/arm/rmsnorm_arm_asimdhp.cpp b/src/layer/arm/rmsnorm_arm_asimdhp.cpp new file mode 100644 index 00000000000..98d8e696487 --- /dev/null +++ b/src/layer/arm/rmsnorm_arm_asimdhp.cpp @@ -0,0 +1,272 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "rmsnorm_arm.h" + +#if __ARM_NEON +#include +#include "arm_usability.h" +#endif // __ARM_NEON + +namespace ncnn { + +#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +static void rmsnorm_fp16s(__fp16* ptr, const float* gamma_ptr, float eps, int elemcount, int elempack) +{ + const int size = elemcount * elempack; + + float32x4_t _rms0 = vdupq_n_f32(0.f); + float32x4_t _rms1 = vdupq_n_f32(0.f); + float rms = 0.f; + { + const __fp16* ptr0 = ptr; + + int i = 0; + for (; i + 7 < size; i += 8) + { + float16x8_t _p = vld1q_f16(ptr0); + float32x4_t _p0 = vcvt_f32_f16(vget_low_f16(_p)); + float32x4_t _p1 = vcvt_f32_f16(vget_high_f16(_p)); + _rms0 = vmlaq_f32(_rms0, _p0, _p0); + _rms1 = vmlaq_f32(_rms1, _p1, _p1); + ptr0 += 8; + } + for (; i + 3 < size; i += 4) + { + float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr0)); + _rms0 = vmlaq_f32(_rms0, _p, _p); + ptr0 += 4; + } + for (; i < size; i++) + { + rms += (float)ptr0[0] * (float)ptr0[0]; + ptr0++; + } + } + + if (elempack == 8) + { + float32x4_t _elemcount = vdupq_n_f32(elemcount); + float32x4_t _eps = vdupq_n_f32(eps); + + _rms0 = vdivq_f32(_rms0, _elemcount); + _rms1 = vdivq_f32(_rms1, _elemcount); + _rms0 = vaddq_f32(_rms0, _eps); + _rms1 = vaddq_f32(_rms1, _eps); + + float32x4_t _rsqrt_rms0 = vrsqrteq_f32(_rms0); + float32x4_t _rsqrt_rms1 = vrsqrteq_f32(_rms1); + _rsqrt_rms0 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms0, _rsqrt_rms0), _rsqrt_rms0), _rsqrt_rms0); + _rsqrt_rms1 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms1, _rsqrt_rms1), _rsqrt_rms1), _rsqrt_rms1); + _rms0 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms0, _rsqrt_rms0), _rsqrt_rms0), _rsqrt_rms0); + _rms1 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms1, _rsqrt_rms1), _rsqrt_rms1), _rsqrt_rms1); + } + if (elempack == 4) + { + _rms0 = vaddq_f32(_rms0, _rms1); + + float32x4_t _elemcount = vdupq_n_f32(elemcount); + float32x4_t _eps = vdupq_n_f32(eps); + + _rms0 = vdivq_f32(_rms0, _elemcount); + _rms0 = vaddq_f32(_rms0, _eps); + + float32x4_t _rsqrt_rms0 = vrsqrteq_f32(_rms0); + _rsqrt_rms0 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms0, _rsqrt_rms0), _rsqrt_rms0), _rsqrt_rms0); + _rms0 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms0, _rsqrt_rms0), _rsqrt_rms0), _rsqrt_rms0); + _rms1 = _rms0; + } + if (elempack == 1) + { + _rms0 = vaddq_f32(_rms0, _rms1); + rms += vaddvq_f32(_rms0); + + rms = 1.f / sqrtf(rms / elemcount + eps); + _rms0 = vdupq_n_f32(rms); + _rms1 = _rms0; + } + + if (gamma_ptr) + { + int i = 0; + if (elempack == 8) + { + for (; i + 7 < size; i += 8) + { + float16x8_t _p = vld1q_f16(ptr); + float32x4_t _p0 = vcvt_f32_f16(vget_low_f16(_p)); + float32x4_t _p1 = vcvt_f32_f16(vget_high_f16(_p)); + float32x4_t _gamma = vdupq_n_f32(gamma_ptr[0]); + _p0 = vmulq_f32(_p0, _rms0); + _p1 = vmulq_f32(_p1, _rms1); + _p0 = vmulq_f32(_p0, _gamma); + _p1 = vmulq_f32(_p1, _gamma); + _p = vcombine_f16(vcvt_f16_f32(_p0), vcvt_f16_f32(_p1)); + vst1q_f16(ptr, _p); + ptr += 8; + gamma_ptr += 1; + } + } + if (elempack == 4) + { + for (; i + 7 < size; i += 8) + { + float16x8_t _p = 
vld1q_f16(ptr); + float32x4_t _p0 = vcvt_f32_f16(vget_low_f16(_p)); + float32x4_t _p1 = vcvt_f32_f16(vget_high_f16(_p)); + float32x4_t _gamma0 = vdupq_n_f32(gamma_ptr[0]); + float32x4_t _gamma1 = vdupq_n_f32(gamma_ptr[1]); + _p0 = vmulq_f32(_p0, _rms0); + _p1 = vmulq_f32(_p1, _rms1); + _p0 = vmulq_f32(_p0, _gamma0); + _p1 = vmulq_f32(_p1, _gamma1); + _p = vcombine_f16(vcvt_f16_f32(_p0), vcvt_f16_f32(_p1)); + vst1q_f16(ptr, _p); + ptr += 8; + gamma_ptr += 2; + } + for (; i + 3 < size; i += 4) + { + float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr)); + float32x4_t _gamma = vdupq_n_f32(gamma_ptr[0]); + _p = vmulq_f32(_p, _rms0); + _p = vmulq_f32(_p, _gamma); + vst1_f16(ptr, vcvt_f16_f32(_p)); + ptr += 4; + gamma_ptr += 1; + } + } + if (elempack == 1) + { + for (; i + 7 < size; i += 8) + { + float16x8_t _p = vld1q_f16(ptr); + float32x4_t _p0 = vcvt_f32_f16(vget_low_f16(_p)); + float32x4_t _p1 = vcvt_f32_f16(vget_high_f16(_p)); + float32x4_t _gamma0 = vld1q_f32(gamma_ptr); + float32x4_t _gamma1 = vld1q_f32(gamma_ptr + 4); + _p0 = vmulq_f32(_p0, _rms0); + _p1 = vmulq_f32(_p1, _rms1); + _p0 = vmulq_f32(_p0, _gamma0); + _p1 = vmulq_f32(_p1, _gamma1); + _p = vcombine_f16(vcvt_f16_f32(_p0), vcvt_f16_f32(_p1)); + vst1q_f16(ptr, _p); + ptr += 8; + gamma_ptr += 8; + } + for (; i + 3 < size; i += 4) + { + float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr)); + float32x4_t _gamma = vld1q_f32(gamma_ptr); + _p = vmulq_f32(_p, _rms0); + _p = vmulq_f32(_p, _gamma); + vst1_f16(ptr, vcvt_f16_f32(_p)); + ptr += 4; + gamma_ptr += 4; + } + } + for (; i < size; i++) + { + ptr[0] = (__fp16)(((float)ptr[0] * rms) * gamma_ptr[0]); + ptr++; + gamma_ptr++; + } + } + else + { + int i = 0; + for (; i + 7 < size; i += 8) + { + float16x8_t _p = vld1q_f16(ptr); + float32x4_t _p0 = vcvt_f32_f16(vget_low_f16(_p)); + float32x4_t _p1 = vcvt_f32_f16(vget_high_f16(_p)); + _p0 = vmulq_f32(_p0, _rms0); + _p1 = vmulq_f32(_p1, _rms1); + _p = vcombine_f16(vcvt_f16_f32(_p0), vcvt_f16_f32(_p1)); + vst1q_f16(ptr, _p); + ptr += 8; + } + for (; i + 3 < size; i += 4) + { + float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr)); + _p = vmulq_f32(_p, _rms0); + vst1_f16(ptr, vcvt_f16_f32(_p)); + ptr += 4; + } + for (; i < size; i++) + { + ptr[0] = (__fp16)((float)ptr[0] * rms); + ptr++; + } + } +} + +int RMSNorm_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const +{ + const int dims = bottom_top_blob.dims; + const int w = bottom_top_blob.w; + const int h = bottom_top_blob.h; + const int channels = bottom_top_blob.c; + const int elempack = bottom_top_blob.elempack; + + if (dims == 1) + { + // assert affine_size == w + + __fp16* ptr = bottom_top_blob; + rmsnorm_fp16s(ptr, gamma_data, eps, w * elempack, 1); + } + + if (dims == 2) + { + // assert affine_size == w + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + __fp16* ptr = bottom_top_blob.row<__fp16>(i); + rmsnorm_fp16s(ptr, gamma_data, eps, w, elempack); + } + } + + if (dims == 3) + { + if (affine_size == w) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + for (int i = 0; i < h; i++) + { + __fp16* ptr = bottom_top_blob.channel(q).row<__fp16>(i); + rmsnorm_fp16s(ptr, gamma_data, eps, w, elempack); + } + } + } + else // if (affine_size == w * h) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + rmsnorm_fp16s(ptr, gamma_data, eps, w * h, elempack); + } + } + } + + return 0; +} +#endif // 
__ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +} // namespace ncnn From 204583ba52cbc1e4b39b4e77ee1b050eeb1734b7 Mon Sep 17 00:00:00 2001 From: nihui Date: Tue, 3 Sep 2024 17:17:03 +0800 Subject: [PATCH 35/38] x86 sse2/avx/avx512 optimization for rmsnorm (#5672) --- src/layer/x86/rmsnorm_x86.cpp | 413 ++++++++++++++++++++++++++++++++++ src/layer/x86/rmsnorm_x86.h | 32 +++ 2 files changed, 445 insertions(+) create mode 100644 src/layer/x86/rmsnorm_x86.cpp create mode 100644 src/layer/x86/rmsnorm_x86.h diff --git a/src/layer/x86/rmsnorm_x86.cpp b/src/layer/x86/rmsnorm_x86.cpp new file mode 100644 index 00000000000..db592c3e381 --- /dev/null +++ b/src/layer/x86/rmsnorm_x86.cpp @@ -0,0 +1,413 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "rmsnorm_x86.h" + +#if __SSE2__ +#include +#if __AVX__ +#include +#endif // __AVX__ +#endif // __SSE2__ + +#include "x86_usability.h" + +namespace ncnn { + +RMSNorm_x86::RMSNorm_x86() +{ +#if __SSE2__ + support_packing = true; +#endif // __SSE2__ +} + +static void rmsnorm(float* ptr, const float* gamma_ptr, float eps, int elemcount, int elempack) +{ + const int size = elemcount * elempack; + +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + __m512 _rms_avx512 = _mm512_set1_ps(0.f); +#endif // __AVX512F__ + __m256 _rms_avx = _mm256_set1_ps(0.f); +#endif // __AVX__ + __m128 _rms = _mm_set1_ps(0.f); +#endif // __SSE2__ + float rms = 0.f; + { + const float* ptr0 = ptr; + + int i = 0; +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + for (; i + 15 < size; i += 16) + { + __m512 _p = _mm512_loadu_ps(ptr0); + _rms_avx512 = _mm512_fmadd_ps(_p, _p, _rms_avx512); + ptr0 += 16; + } +#endif // __AVX512F__ + for (; i + 7 < size; i += 8) + { + __m256 _p = _mm256_loadu_ps(ptr0); + _rms_avx = _mm256_comp_fmadd_ps(_p, _p, _rms_avx); + ptr0 += 8; + } +#endif // __AVX__ + for (; i + 3 < size; i += 4) + { + __m128 _p = _mm_loadu_ps(ptr0); + _rms = _mm_comp_fmadd_ps(_p, _p, _rms); + ptr0 += 4; + } +#endif // __SSE2__ + for (; i < size; i++) + { + rms += ptr0[0] * ptr0[0]; + ptr0++; + } + } + +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + if (elempack == 16) + { + __m512 _elemcount = _mm512_set1_ps((float)elemcount); + __m512 _eps = _mm512_set1_ps(eps); + + _rms_avx512 = _mm512_div_ps(_rms_avx512, _elemcount); + _rms_avx512 = _mm512_add_ps(_rms_avx512, _eps); + + __m256 _rms0 = _mm256_rsqrt_ps(_mm512_extractf32x8_ps(_rms_avx512, 0)); + __m256 _rms1 = _mm256_rsqrt_ps(_mm512_extractf32x8_ps(_rms_avx512, 1)); + _rms_avx512 = _mm512_insertf32x8(_mm512_castps256_ps512(_rms0), _rms1, 1); + } +#endif // __AVX512F__ + if (elempack == 8) + { +#if __AVX512F__ + { + __m256 _rms0 = _mm512_castps512_ps256(_rms_avx512); + __m256 _rms1 = _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(_rms_avx512), 1)); + _rms_avx = _mm256_add_ps(_rms_avx, _rms0); + _rms_avx = _mm256_add_ps(_rms_avx, _rms1); + } +#endif // __AVX512F__ + + 
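+        // Editorial note, not in the original patch: this x86 path applies the
+        // raw _mm256_rsqrt_ps approximation (roughly 12-bit relative accuracy)
+        // without the extra Newton-Raphson refinement the NEON version performs
+        // via vrsqrts, trading a little precision for fewer instructions.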
__m256 _elemcount = _mm256_set1_ps((float)elemcount); + __m256 _eps = _mm256_set1_ps(eps); + + _rms_avx = _mm256_div_ps(_rms_avx, _elemcount); + _rms_avx = _mm256_add_ps(_rms_avx, _eps); + + _rms_avx = _mm256_rsqrt_ps(_rms_avx); +#if __AVX512F__ + _rms_avx512 = _mm512_insertf32x8(_mm512_castps256_ps512(_rms_avx), _rms_avx, 1); +#endif // __AVX512F__ + } +#endif // __AVX__ + if (elempack == 4) + { +#if __AVX__ +#if __AVX512F__ + { + __m256 _rms0 = _mm512_castps512_ps256(_rms_avx512); + __m256 _rms1 = _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(_rms_avx512), 1)); + _rms_avx = _mm256_add_ps(_rms_avx, _rms0); + _rms_avx = _mm256_add_ps(_rms_avx, _rms1); + } +#endif // __AVX512F__ + { + __m128 _rms0 = _mm256_castps256_ps128(_rms_avx); + __m128 _rms1 = _mm256_extractf128_ps(_rms_avx, 1); + _rms = _mm_add_ps(_rms, _rms0); + _rms = _mm_add_ps(_rms, _rms1); + } +#endif // __AVX__ + + __m128 _elemcount = _mm_set1_ps((float)elemcount); + __m128 _eps = _mm_set1_ps(eps); + + _rms = _mm_div_ps(_rms, _elemcount); + _rms = _mm_add_ps(_rms, _eps); + + _rms = _mm_rsqrt_ps(_rms); +#if __AVX__ + _rms_avx = _mm256_insertf128_ps(_mm256_castps128_ps256(_rms), _rms, 1); +#if __AVX512F__ + _rms_avx512 = _mm512_insertf32x8(_mm512_castps256_ps512(_rms_avx), _rms_avx, 1); +#endif // __AVX512F__ +#endif // __AVX__ + } +#endif // __SSE2__ + if (elempack == 1) + { +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + rms += _mm512_comp_reduce_add_ps(_rms_avx512); +#endif // __AVX512F__ + rms += _mm256_reduce_add_ps(_rms_avx); +#endif // __AVX__ + rms += _mm_reduce_add_ps(_rms); +#endif // __SSE2__ + + rms = 1.f / sqrtf(rms / elemcount + eps); +#if __SSE2__ + _rms = _mm_set1_ps(rms); +#if __AVX__ + _rms_avx = _mm256_insertf128_ps(_mm256_castps128_ps256(_rms), _rms, 1); +#if __AVX512F__ + _rms_avx512 = _mm512_insertf32x8(_mm512_castps256_ps512(_rms_avx), _rms_avx, 1); +#endif // __AVX512F__ +#endif // __AVX__ +#endif // __SSE2__ + } + + if (gamma_ptr) + { + int i = 0; +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + if (elempack == 16) + { + for (; i + 15 < size; i += 16) + { + __m512 _p = _mm512_loadu_ps(ptr); + __m512 _gamma = _mm512_set1_ps(gamma_ptr[0]); + _p = _mm512_mul_ps(_p, _rms_avx512); + _p = _mm512_mul_ps(_p, _gamma); + _mm512_storeu_ps(ptr, _p); + ptr += 16; + gamma_ptr += 1; + } + } +#endif // __AVX512F__ + if (elempack == 8) + { +#if __AVX512F__ + for (; i + 15 < size; i += 16) + { + __m512 _p = _mm512_loadu_ps(ptr); + __m256 _gamma0 = _mm256_set1_ps(gamma_ptr[0]); + __m256 _gamma1 = _mm256_set1_ps(gamma_ptr[1]); + __m512 _gamma = _mm512_insertf32x8(_mm512_castps256_ps512(_gamma0), _gamma1, 1); + _p = _mm512_mul_ps(_p, _rms_avx512); + _p = _mm512_mul_ps(_p, _gamma); + _mm512_storeu_ps(ptr, _p); + ptr += 16; + gamma_ptr += 2; + } +#endif // __AVX512F__ + for (; i + 7 < size; i += 8) + { + __m256 _p = _mm256_loadu_ps(ptr); + __m256 _gamma = _mm256_set1_ps(gamma_ptr[0]); + _p = _mm256_mul_ps(_p, _rms_avx); + _p = _mm256_mul_ps(_p, _gamma); + _mm256_storeu_ps(ptr, _p); + ptr += 8; + gamma_ptr += 1; + } + } +#endif // __AVX__ + if (elempack == 4) + { +#if __AVX__ +#if __AVX512F__ + for (; i + 15 < size; i += 16) + { + __m512 _p = _mm512_loadu_ps(ptr); + __m128 _gamma0 = _mm_set1_ps(gamma_ptr[0]); + __m128 _gamma1 = _mm_set1_ps(gamma_ptr[1]); + __m128 _gamma2 = _mm_set1_ps(gamma_ptr[2]); + __m128 _gamma3 = _mm_set1_ps(gamma_ptr[3]); + __m256 _gamma01 = _mm256_insertf128_ps(_mm256_castps128_ps256(_gamma0), _gamma1, 1); + __m256 _gamma23 = _mm256_insertf128_ps(_mm256_castps128_ps256(_gamma2), _gamma3, 1); + 
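// gather the four broadcast per-group gamma values into one 512-bit multiplier + 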
__m512 _gamma = _mm512_insertf32x8(_mm512_castps256_ps512(_gamma01), _gamma23, 1); + _p = _mm512_mul_ps(_p, _rms_avx512); + _p = _mm512_mul_ps(_p, _gamma); + _mm512_storeu_ps(ptr, _p); + ptr += 16; + gamma_ptr += 4; + } +#endif // __AVX512F__ + for (; i + 7 < size; i += 8) + { + __m256 _p = _mm256_loadu_ps(ptr); + __m128 _gamma0 = _mm_set1_ps(gamma_ptr[0]); + __m128 _gamma1 = _mm_set1_ps(gamma_ptr[1]); + __m256 _gamma = _mm256_insertf128_ps(_mm256_castps128_ps256(_gamma0), _gamma1, 1); + _p = _mm256_mul_ps(_p, _rms_avx); + _p = _mm256_mul_ps(_p, _gamma); + _mm256_storeu_ps(ptr, _p); + ptr += 8; + gamma_ptr += 2; + } +#endif // __AVX__ + for (; i + 3 < size; i += 4) + { + __m128 _p = _mm_loadu_ps(ptr); + __m128 _gamma = _mm_set1_ps(gamma_ptr[0]); + _p = _mm_mul_ps(_p, _rms); + _p = _mm_mul_ps(_p, _gamma); + _mm_storeu_ps(ptr, _p); + ptr += 4; + gamma_ptr += 1; + } + } + if (elempack == 1) + { +#if __AVX__ +#if __AVX512F__ + for (; i + 15 < size; i += 16) + { + __m512 _p = _mm512_loadu_ps(ptr); + __m512 _gamma = _mm512_loadu_ps(gamma_ptr); + _p = _mm512_mul_ps(_p, _rms_avx512); + _p = _mm512_mul_ps(_p, _gamma); + _mm512_storeu_ps(ptr, _p); + ptr += 16; + gamma_ptr += 16; + } +#endif // __AVX512F__ + for (; i + 7 < size; i += 8) + { + __m256 _p = _mm256_loadu_ps(ptr); + __m256 _gamma = _mm256_loadu_ps(gamma_ptr); + _p = _mm256_mul_ps(_p, _rms_avx); + _p = _mm256_mul_ps(_p, _gamma); + _mm256_storeu_ps(ptr, _p); + ptr += 8; + gamma_ptr += 8; + } +#endif // __AVX__ + for (; i + 3 < size; i += 4) + { + __m128 _p = _mm_loadu_ps(ptr); + __m128 _gamma = _mm_loadu_ps(gamma_ptr); + _p = _mm_mul_ps(_p, _rms); + _p = _mm_mul_ps(_p, _gamma); + _mm_storeu_ps(ptr, _p); + ptr += 4; + gamma_ptr += 4; + } + } +#endif // __SSE2__ + for (; i < size; i++) + { + ptr[0] = (ptr[0] * rms) * gamma_ptr[0]; + ptr++; + gamma_ptr++; + } + } + else + { + int i = 0; +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + for (; i + 15 < size; i += 16) + { + __m512 _p = _mm512_loadu_ps(ptr); + _p = _mm512_mul_ps(_p, _rms_avx512); + _mm512_storeu_ps(ptr, _p); + ptr += 16; + } +#endif // __AVX512F__ + for (; i + 7 < size; i += 8) + { + __m256 _p = _mm256_loadu_ps(ptr); + _p = _mm256_mul_ps(_p, _rms_avx); + _mm256_storeu_ps(ptr, _p); + ptr += 8; + } +#endif // __AVX__ + for (; i + 3 < size; i += 4) + { + __m128 _p = _mm_loadu_ps(ptr); + _p = _mm_mul_ps(_p, _rms); + _mm_storeu_ps(ptr, _p); + ptr += 4; + } +#endif // __SSE2__ + for (; i < size; i++) + { + ptr[0] = ptr[0] * rms; + ptr++; + } + } +} + +int RMSNorm_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + const int dims = bottom_top_blob.dims; + const int w = bottom_top_blob.w; + const int h = bottom_top_blob.h; + const int channels = bottom_top_blob.c; + const int elempack = bottom_top_blob.elempack; + + if (dims == 1) + { + // assert affine_size == w + + float* ptr = bottom_top_blob; + rmsnorm(ptr, gamma_data, eps, w * elempack, 1); + } + + if (dims == 2) + { + // assert affine_size == w + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.row(i); + rmsnorm(ptr, gamma_data, eps, w, elempack); + } + } + + if (dims == 3) + { + if (affine_size == w) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.channel(q).row(i); + rmsnorm(ptr, gamma_data, eps, w, elempack); + } + } + } + else // if (affine_size == w * h) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int 
q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + rmsnorm(ptr, gamma_data, eps, w * h, elempack); + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/x86/rmsnorm_x86.h b/src/layer/x86/rmsnorm_x86.h new file mode 100644 index 00000000000..2e6296db1c3 --- /dev/null +++ b/src/layer/x86/rmsnorm_x86.h @@ -0,0 +1,32 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_RMSNORM_X86_H +#define LAYER_RMSNORM_X86_H + +#include "rmsnorm.h" + +namespace ncnn { + +class RMSNorm_x86 : public RMSNorm +{ +public: + RMSNorm_x86(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_RMSNORM_X86_H From 21e54d8c7a789884d1c17dc1b40701bede343975 Mon Sep 17 00:00:00 2001 From: nihui Date: Wed, 4 Sep 2024 08:01:53 +0800 Subject: [PATCH 36/38] update modelwriter for rmsnorm (#5676) --- tools/modelwriter.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tools/modelwriter.h b/tools/modelwriter.h index 39157c453ec..ff86338bca9 100644 --- a/tools/modelwriter.h +++ b/tools/modelwriter.h @@ -99,6 +99,7 @@ #include "layer/reorg.h" #include "layer/requantize.h" #include "layer/reshape.h" +#include "layer/rmsnorm.h" #include "layer/rnn.h" #include "layer/roialign.h" #include "layer/roipooling.h" @@ -2313,6 +2314,17 @@ int ModelWriter::save(const char* parampath, const char* binpath) fprintf_param_value(" 2=%d", c) fprintf_param_value(" 3=%d", permute) } + else if (layer->type == "RMSNorm") + { + ncnn::RMSNorm* op = (ncnn::RMSNorm*)layer; + ncnn::RMSNorm* op_default = (ncnn::RMSNorm*)layer_default; + + fprintf_param_value(" 0=%d", affine_size) + fprintf_param_value(" 1=%e", eps) + fprintf_param_value(" 2=%d", affine) + + fwrite_weight_data(op->gamma_data, bp); + } else if (layer->type == "RNN") { ncnn::RNN* op = (ncnn::RNN*)layer; From 80c78a0e40d2c8843cdbb3917fd00387a0e33ce1 Mon Sep 17 00:00:00 2001 From: nihui Date: Wed, 4 Sep 2024 14:29:09 +0800 Subject: [PATCH 37/38] pnnx fuse t5-layernorm as rmsnorm (#5675) --- tools/pnnx/src/CMakeLists.txt | 1 + tools/pnnx/src/pass_level5.cpp | 2 + tools/pnnx/src/pass_level5/fuse_rmsnorm.cpp | 97 +++++++++++++++++++++ tools/pnnx/src/pass_level5/fuse_rmsnorm.h | 21 +++++ tools/pnnx/tests/CMakeLists.txt | 1 + tools/pnnx/tests/ncnn/test_F_rms_norm.py | 2 +- tools/pnnx/tests/ncnn/test_nn_RMSNorm.py | 2 +- tools/pnnx/tests/test_pnnx_fuse_rmsnorm.py | 77 ++++++++++++++++ 8 files changed, 201 insertions(+), 2 deletions(-) create mode 100644 tools/pnnx/src/pass_level5/fuse_rmsnorm.cpp create mode 100644 tools/pnnx/src/pass_level5/fuse_rmsnorm.h create mode 100644 tools/pnnx/tests/test_pnnx_fuse_rmsnorm.py diff --git a/tools/pnnx/src/CMakeLists.txt b/tools/pnnx/src/CMakeLists.txt index 2c814bd486c..7743a8ae453 100644 --- a/tools/pnnx/src/CMakeLists.txt +++ 
b/tools/pnnx/src/CMakeLists.txt @@ -369,6 +369,7 @@ set(pnnx_pass_level5_SRCS pass_level5/fuse_pixel_unshuffle.cpp pass_level5/fuse_layernorm.cpp pass_level5/fuse_multiheadattention.cpp + pass_level5/fuse_rmsnorm.cpp pass_level5/fuse_scaled_dot_product_attention.cpp pass_level5/fuse_select_to_unbind.cpp pass_level5/fuse_silu.cpp diff --git a/tools/pnnx/src/pass_level5.cpp b/tools/pnnx/src/pass_level5.cpp index 8bb3270aa2c..5f08b80f5ef 100644 --- a/tools/pnnx/src/pass_level5.cpp +++ b/tools/pnnx/src/pass_level5.cpp @@ -44,6 +44,7 @@ #include "pass_level5/fuse_multiheadattention.h" #include "pass_level5/fuse_pad_conv1d.h" #include "pass_level5/fuse_pad_conv2d.h" +#include "pass_level5/fuse_rmsnorm.h" #include "pass_level5/fuse_scaled_dot_product_attention.h" #include "pass_level5/fuse_select_to_unbind.h" #include "pass_level5/fuse_silu.h" @@ -145,6 +146,7 @@ void pass_level5(Graph& g, const std::set& foldable_constants, cons fuse_channel_shuffle(g); fuse_layernorm(g); + fuse_rmsnorm(g); fuse_multiheadattention(g); fuse_scaled_dot_product_attention(g); diff --git a/tools/pnnx/src/pass_level5/fuse_rmsnorm.cpp b/tools/pnnx/src/pass_level5/fuse_rmsnorm.cpp new file mode 100644 index 00000000000..7b99770ed6e --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_rmsnorm.cpp @@ -0,0 +1,97 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
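+// +// Fuse the hand-written T5-style rmsnorm subgraph +//     out = weight * x * rsqrt(mean(x.pow(2), dim=-1, keepdim=True) + eps) +// into a single nn.RMSNorm operator, covering the rsqrt, reciprocal(sqrt) and +// div(1, sqrt) spellings of the normalization. + 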
+ +#include "fuse_rmsnorm.h" + +#include "pass_level2.h" + +#include +#include + +namespace pnnx { + +class fuse_rmsnorm_pass : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +6 5 +pnnx.Input input 0 1 input +pnnx.Attribute op_0 0 1 weight @data #weight=(%c)f32 +pnnx.Expression op_1 1 1 input sq expr=pow(@0,2) +torch.mean op_2 1 1 sq sqmean dim=(-1) keepdim=True +pnnx.Expression op_3 3 1 weight input sqmean out expr=mul(@0,mul(@1,rsqrt(add(@2,%eps)))) +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* replace_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +nn.RMSNorm rmsnorm 1 1 input out elementwise_affine=True eps=%eps normalized_shape=(%c) @weight=%op_0.data +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +class fuse_rmsnorm_pass_1 : public fuse_rmsnorm_pass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +6 5 +pnnx.Input input 0 1 input +pnnx.Attribute op_0 0 1 weight @data #weight=(%c)f32 +pnnx.Expression op_1 1 1 input sq expr=pow(@0,2.000000e+00) +torch.mean op_2 1 1 sq sqmean dim=(-1) keepdim=True +pnnx.Expression op_3 3 1 weight input sqmean out expr=mul(@0,mul(@1,reciprocal(sqrt(add(@2,%eps))))) +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +class fuse_rmsnorm_pass_onnx : public fuse_rmsnorm_pass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +6 5 +pnnx.Input input 0 1 input +pnnx.Attribute op_0 0 1 weight @data #weight=(%c)f32 +pnnx.Expression op_1 1 1 input sq expr=pow(@0,2.000000e+00) +torch.mean op_2 1 1 sq sqmean dim=(-1) keepdim=True +pnnx.Expression op_3 3 1 weight input sqmean out expr=mul(@0,mul(@1,div(1.000000e+00,sqrt(add(@2,%eps))))) +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +void fuse_rmsnorm(Graph& graph) +{ + fuse_rmsnorm_pass a; + fuse_rmsnorm_pass_1 a1; + fuse_rmsnorm_pass_onnx b; + int opindex = 0; + + pnnx_graph_rewrite(graph, &a, opindex); + pnnx_graph_rewrite(graph, &a1, opindex); + pnnx_graph_rewrite(graph, &b, opindex); +} + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_rmsnorm.h b/tools/pnnx/src/pass_level5/fuse_rmsnorm.h new file mode 100644 index 00000000000..0ba18e37f61 --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_rmsnorm.h @@ -0,0 +1,21 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "ir.h" + +namespace pnnx { + +void fuse_rmsnorm(Graph& graph); + +} // namespace pnnx diff --git a/tools/pnnx/tests/CMakeLists.txt b/tools/pnnx/tests/CMakeLists.txt index daf5501e9d8..0dd566c37b5 100644 --- a/tools/pnnx/tests/CMakeLists.txt +++ b/tools/pnnx/tests/CMakeLists.txt @@ -346,6 +346,7 @@ pnnx_add_test(pnnx_fuse_input_unpack) pnnx_add_test(pnnx_fuse_layernorm) pnnx_add_test(pnnx_fuse_linear_batchnorm1d) pnnx_add_test(pnnx_fuse_multiheadattention) +pnnx_add_test(pnnx_fuse_rmsnorm) pnnx_add_test(pnnx_fuse_scaled_dot_product_attention) pnnx_add_test(pnnx_fuse_select_to_unbind) pnnx_add_test(pnnx_fuse_slice_to_tensor_split) diff --git a/tools/pnnx/tests/ncnn/test_F_rms_norm.py b/tools/pnnx/tests/ncnn/test_F_rms_norm.py index 4e60d9314aa..f30f72f9ac4 100644 --- a/tools/pnnx/tests/ncnn/test_F_rms_norm.py +++ b/tools/pnnx/tests/ncnn/test_F_rms_norm.py @@ -57,7 +57,7 @@ def test(): b = test_F_rms_norm_ncnn.test_inference() for a0, b0 in zip(a, b): - if not torch.allclose(a0, b0, 1e-4, 1e-4): + if not torch.allclose(a0, b0, 1e-3, 1e-3): return False return True diff --git a/tools/pnnx/tests/ncnn/test_nn_RMSNorm.py b/tools/pnnx/tests/ncnn/test_nn_RMSNorm.py index 0d5efa211e4..e69ad1220bc 100644 --- a/tools/pnnx/tests/ncnn/test_nn_RMSNorm.py +++ b/tools/pnnx/tests/ncnn/test_nn_RMSNorm.py @@ -57,7 +57,7 @@ def test(): b = test_nn_RMSNorm_ncnn.test_inference() for a0, b0 in zip(a, b): - if not torch.allclose(a0, b0, 1e-4, 1e-4): + if not torch.allclose(a0, b0, 1e-3, 1e-3): return False return True diff --git a/tools/pnnx/tests/test_pnnx_fuse_rmsnorm.py b/tools/pnnx/tests/test_pnnx_fuse_rmsnorm.py new file mode 100644 index 00000000000..b04fa93442f --- /dev/null +++ b/tools/pnnx/tests/test_pnnx_fuse_rmsnorm.py @@ -0,0 +1,77 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class T5LayerNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + super().__init__() + self.weight = nn.Parameter(torch.rand(hidden_size)) + self.variance_epsilon = eps + + def forward(self, x): + variance = x.pow(2).mean(-1, keepdim=True) + x = x * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * x + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.rmsn_0 = T5LayerNorm(26) + self.rmsn_1 = T5LayerNorm(21) + + def forward(self, x, y): + x = self.rmsn_0(x) + y = self.rmsn_1(y) + return x, y + +def test(): + if version.parse(torch.__version__) < version.parse('2.4'): + return True + + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 64, 26) + y = torch.rand(3, 15, 15, 21) + + a0, a1 = net(x, y) + + # export onnx + torch.onnx.export(net, (x,y), "test.onnx") + + # export torchscript + mod = torch.jit.trace(net, (x, y)) + mod.save("test_pnnx_fuse_rmsnorm.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_pnnx_fuse_rmsnorm.pt inputshape=[1,64,26],[3,15,15,21]") + + # pnnx inference + import test_pnnx_fuse_rmsnorm_pnnx + b0, b1 = test_pnnx_fuse_rmsnorm_pnnx.test_inference() + + return torch.allclose(a0, b0, 1e-4, 1e-4) and torch.allclose(a1, b1, 1e-4, 1e-4) + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) From 9b5f6a39b4a4962accaad58caa771487f61f732a Mon Sep 17 00:00:00 2001 From: Ankush Goel Date: Wed, 25 Sep 2024 06:04:07 +0530 Subject: [PATCH 38/38] fix: typo (#5709) --- docs/faq.en.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/faq.en.md b/docs/faq.en.md index 807c4a9e3ee..44d0068263b 100644 --- a/docs/faq.en.md +++ b/docs/faq.en.md @@ -262,7 +262,7 @@ Fully customizable op, first change to one that can export (e.g. concat slice), Set net.opt.use_vulkan_compute = true before load_param / load_model; -- ## How to ececute multiple blob inputs, multiple blob outputs? +- ## How to execute multiple blob inputs, multiple blob outputs? Multiple execute `ex.input()` and `ex.extract()` like following ``` ex.input("data1", in_1);
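For reference, every SIMD path in the rmsnorm kernels above vectorizes the same scalar computation: rms = 1 / sqrt(mean(x^2) + eps), followed by an elementwise multiply with the optional gamma. The sketch below is illustrative only — the function name rmsnorm_ref is hypothetical, and it assumes a flat float buffer (the elempack == 1 case):

#include <math.h>

// scalar reference: rms = 1 / sqrt(mean(x^2) + eps); y = x * rms * gamma
static void rmsnorm_ref(float* ptr, const float* gamma_ptr, float eps, int elemcount)
{
    // accumulate the sum of squares
    float sq_sum = 0.f;
    for (int i = 0; i < elemcount; i++)
        sq_sum += ptr[i] * ptr[i];

    // normalize by element count, stabilize with eps, invert the root
    const float rms = 1.f / sqrtf(sq_sum / elemcount + eps);

    // scale in place, applying gamma when the layer is affine
    for (int i = 0; i < elemcount; i++)
        ptr[i] = gamma_ptr ? ptr[i] * rms * gamma_ptr[i] : ptr[i] * rms;
}

The packed variants differ only in how gamma is broadcast across lanes and in their use of approximate rsqrt instructions, which is consistent with the test tolerances above being relaxed from 1e-4 to 1e-3.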