From 7b5c14cd1be8685cad2df1298724e9704ad326ce Mon Sep 17 00:00:00 2001
From: nihuini
Date: Thu, 23 May 2024 15:56:35 +0800
Subject: [PATCH] fix...

---
 cmake/ncnn_add_layer.cmake              | 32 ++++++++++++++++++++++++-
 src/CMakeLists.txt                      |  9 ++++---
 src/layer/arm/arm_activation.h          |  4 ++--
 src/layer/arm/arm_usability.h           |  2 +-
 src/layer/arm/batchnorm_arm_asimdhp.cpp |  4 ++--
 src/layer/arm/neon_mathfun_fp16s.h      | 16 ++++++--
 src/layer/arm/pooling_arm_asimdhp.cpp   |  8 +++----
 src/layer/arm/prelu_arm_asimdhp.cpp     | 14 +++++--
 src/layer/arm/quantize_arm_asimdhp.cpp  |  6 ++---
 9 files changed, 64 insertions(+), 31 deletions(-)

diff --git a/cmake/ncnn_add_layer.cmake b/cmake/ncnn_add_layer.cmake
index 7b6f7ba3789..6ce5feadbf3 100644
--- a/cmake/ncnn_add_layer.cmake
+++ b/cmake/ncnn_add_layer.cmake
@@ -247,7 +247,7 @@ macro(ncnn_add_layer class)
     endif()
 
     if(NCNN_TARGET_ARCH STREQUAL "arm" AND (CMAKE_SIZEOF_VOID_P EQUAL 8 OR NCNN_TARGET_ILP32))
-        if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC"))
+        if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
             if(NCNN_VFPV4)
                 ncnn_add_arch_opt_source(${class} vfpv4 " ")
             endif()
@@ -277,6 +277,36 @@ macro(ncnn_add_layer class)
             endif()
             if(NCNN_ARM86SVEF32MM)
             endif()
+        elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC")
+            if(NCNN_VFPV4)
+                ncnn_add_arch_opt_source(${class} vfpv4 " ")
+            endif()
+            if(NCNN_ARM82)
+                ncnn_add_arch_opt_source(${class} asimdhp "/arch:armv8.2 -march=armv8.2-a+fp16 /D__ARM_FEATURE_FP16_VECTOR_ARITHMETIC")
+            endif()
+            if(NCNN_ARM82DOT)
+                ncnn_add_arch_opt_source(${class} asimddp "/arch:armv8.2 -march=armv8.2-a+fp16+dotprod /D__ARM_FEATURE_FP16_VECTOR_ARITHMETIC /D__ARM_FEATURE_DOTPROD")
+            endif()
+            if(NCNN_ARM82FP16FML)
+                ncnn_add_arch_opt_source(${class} asimdfhm "/arch:armv8.2 -march=armv8.2-a+fp16+fp16fml /D__ARM_FEATURE_FP16_VECTOR_ARITHMETIC /D__ARM_FEATURE_FP16_FML")
+            endif()
+            if(NCNN_ARM84BF16)
+                ncnn_add_arch_opt_source(${class} bf16 "/arch:armv8.4 -march=armv8.4-a+fp16+dotprod+bf16 /D__ARM_FEATURE_FP16_VECTOR_ARITHMETIC /D__ARM_FEATURE_DOTPROD /D__ARM_FEATURE_FP16_FML /D__ARM_FEATURE_BF16_VECTOR_ARITHMETIC")
+            endif()
+            if(NCNN_ARM84I8MM)
+                ncnn_add_arch_opt_source(${class} i8mm "/arch:armv8.4 -march=armv8.4-a+fp16+dotprod+i8mm /D__ARM_FEATURE_FP16_VECTOR_ARITHMETIC /D__ARM_FEATURE_DOTPROD /D__ARM_FEATURE_FP16_FML /D__ARM_FEATURE_MATMUL_INT8")
+            endif()
+            # TODO add support for sve family
+            if(NCNN_ARM86SVE)
+            endif()
+            if(NCNN_ARM86SVE2)
+            endif()
+            if(NCNN_ARM86SVEBF16)
+            endif()
+            if(NCNN_ARM86SVEI8MM)
+            endif()
+            if(NCNN_ARM86SVEF32MM)
+            endif()
         else()
             if(NCNN_VFPV4)
                 ncnn_add_arch_opt_source(${class} vfpv4 " ")
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 2e444fdb629..5c83d56e216 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -545,7 +545,8 @@ if(NCNN_TARGET_ARCH STREQUAL "arm" AND (CMAKE_SIZEOF_VOID_P EQUAL 8 OR NCNN_TARG
             endif()
             if(NCNN_ARM86SVEF32MM)
             endif()
-        else()
+        endif()
+        if(NOT CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
             set(ARM_MARCH_FLAG "-march=armv8.6-a+fp16+dotprod+sve")
             if(NCNN_ARM86SVE2)
                 set(ARM_MARCH_FLAG "${ARM_MARCH_FLAG}+sve2")
@@ -569,7 +570,8 @@ if(NCNN_TARGET_ARCH STREQUAL "arm" AND (CMAKE_SIZEOF_VOID_P EQUAL 8 OR NCNN_TARG
             if(NCNN_ARM84I8MM)
                 target_compile_options(ncnn PRIVATE /D__ARM_FEATURE_MATMUL_INT8)
             endif()
-        else()
+        endif()
+        if(NOT CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
             set(ARM_MARCH_FLAG "-march=armv8.4-a+fp16+dotprod")
             if(NCNN_ARM84BF16)
                 set(ARM_MARCH_FLAG "${ARM_MARCH_FLAG}+bf16")
@@ -587,7 +589,8 @@ if(NCNN_TARGET_ARCH STREQUAL "arm" AND (CMAKE_SIZEOF_VOID_P EQUAL 8 OR NCNN_TARG
             if(NCNN_ARM82FP16FML)
                 target_compile_options(ncnn PRIVATE /D__ARM_FEATURE_FP16_FML)
             endif()
-        else()
+        endif()
+        if(NOT CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
             set(ARM_MARCH_FLAG "-march=armv8.2-a+fp16")
             if(NCNN_ARM82DOT)
                 set(ARM_MARCH_FLAG "${ARM_MARCH_FLAG}+dotprod")
diff --git a/src/layer/arm/arm_activation.h b/src/layer/arm/arm_activation.h
index aca3e57479f..2a3f6595cf8 100644
--- a/src/layer/arm/arm_activation.h
+++ b/src/layer/arm/arm_activation.h
@@ -126,7 +126,7 @@ static inline float16x4_t activation_ps_f16(float16x4_t _v, int activation_type,
     else if (activation_type == 2)
     {
         const float16x4_t _zero = vdup_n_f16(0.f);
-#if _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
         const float16x4_t _slope = vcvt_f16_f32(vdupq_n_f32(activation_params[0]));
 #else
         const float16x4_t _slope = vdup_n_f16((__fp16)activation_params[0]);
@@ -176,7 +176,7 @@ static inline float16x8_t activation_ps_f16(float16x8_t _v, int activation_type,
     else if (activation_type == 2)
     {
         const float16x8_t _zero = vdupq_n_f16(0.f);
-#if _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
         const float16x4_t _slope0 = vcvt_f16_f32(vdupq_n_f32(activation_params[0]));
         const float16x8_t _slope = vcombine_f16(_slope0, _slope0);
 #else
diff --git a/src/layer/arm/arm_usability.h b/src/layer/arm/arm_usability.h
index 5d6afb7b078..be17249a28c 100644
--- a/src/layer/arm/arm_usability.h
+++ b/src/layer/arm/arm_usability.h
@@ -123,7 +123,7 @@ static inline int8x8_t float2int8leakyrelu(float32x4_t _vlow, float32x4_t _vhigh
 }
 
 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#ifdef _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
 struct __fp16
 {
     __fp16()
diff --git a/src/layer/arm/batchnorm_arm_asimdhp.cpp b/src/layer/arm/batchnorm_arm_asimdhp.cpp
index 3bfc1dbafec..565bbe69dfb 100644
--- a/src/layer/arm/batchnorm_arm_asimdhp.cpp
+++ b/src/layer/arm/batchnorm_arm_asimdhp.cpp
@@ -369,7 +369,7 @@ int BatchNorm_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& op
             __fp16 b = (__fp16)b_data[i];
 
             float16x4_t _a = vdup_n_f16(a);
-#if _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
             float16x4_t _b = vcvt_f16_f32(vdupq_n_f32(b_data[i]));
 #else
             float16x4_t _b = vdup_n_f16(b);
@@ -410,7 +410,7 @@ int BatchNorm_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& op
             __fp16 b = (__fp16)b_data[q];
 
             float16x4_t _a = vdup_n_f16(a);
-#if _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
             float16x4_t _b = vcvt_f16_f32(vdupq_n_f32(b_data[q]));
 #else
             float16x4_t _b = vdup_n_f16(b);
diff --git a/src/layer/arm/neon_mathfun_fp16s.h b/src/layer/arm/neon_mathfun_fp16s.h
index 2f4864c13a3..5e3f7a28ead 100644
--- a/src/layer/arm/neon_mathfun_fp16s.h
+++ b/src/layer/arm/neon_mathfun_fp16s.h
@@ -201,7 +201,7 @@ static inline float16x4_t exp_ps_f16(float16x4_t x)
     x = vmax_f16(x, vdup_n_f16(c_exp_lo_f16));
 
     /* express exp(x) as exp(g + n*log(2)) */
-#if _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
     fx = vfma_f16(vdup_n_f16(0.5f), x, vcvt_f16_f32(vdupq_n_f32(c_cephes_LOG2EF)));
 #else
     fx = vfma_f16(vdup_n_f16(0.5f), x, vdup_n_f16(c_cephes_LOG2EF));
@@ -216,7 +216,7 @@ static inline float16x4_t exp_ps_f16(float16x4_t x)
 
     fx = vsub_f16(tmp, (float16x4_t)(mask));
 
-#if _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
     tmp = vmul_f16(fx, vcvt_f16_f32(vdupq_n_f32(c_cephes_exp_C1)));
     float16x4_t z = vmul_f16(fx, vcvt_f16_f32(vdupq_n_f32(c_cephes_exp_C2)));
 #else
@@ -258,7 +258,7 @@ static inline float16x8_t exp_ps_f16(float16x8_t x)
     x = vmaxq_f16(x, vdupq_n_f16(c_exp_lo_f16));
 
     /* express exp(x) as exp(g + n*log(2)) */
-#if _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
     float16x4_t _c_cephes_LOG2EF = vcvt_f16_f32(vdupq_n_f32(c_cephes_LOG2EF));
     fx = vfmaq_f16(vdupq_n_f16(0.5f), x, vcombine_f16(_c_cephes_LOG2EF, _c_cephes_LOG2EF));
 #else
@@ -274,7 +274,7 @@ static inline float16x8_t exp_ps_f16(float16x8_t x)
 
     fx = vsubq_f16(tmp, vreinterpretq_f16_u16(mask));
 
-#if _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
     float16x4_t _c_cephes_exp_C1 = vcvt_f16_f32(vdupq_n_f32(c_cephes_exp_C1));
     tmp = vmulq_f16(fx, vcombine_f16(_c_cephes_exp_C1, _c_cephes_exp_C1));
     float16x4_t _c_cephes_exp_C2 = vcvt_f16_f32(vdupq_n_f32(c_cephes_exp_C2));
@@ -347,7 +347,7 @@ static inline void sincos_ps_f16(float16x4_t x, float16x4_t* ysin, float16x4_t*
     x = vabs_f16(x);
 
     /* scale by 4/Pi */
-#if _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
     float16x4_t _c_cephes_FOPI = vcvt_f16_f32(vdupq_n_f32(c_cephes_FOPI));
     y = vmul_f16(x, _c_cephes_FOPI);
 #else
@@ -371,7 +371,7 @@ static inline void sincos_ps_f16(float16x4_t x, float16x4_t* ysin, float16x4_t*
 
     /* The magic pass: "Extended precision modular arithmetic"
      * x = ((x - y * DP1) - y * DP2) - y * DP3; */
-#if _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
     float16x4_t _c_minus_cephes_DP1 = vcvt_f16_f32(vdupq_n_f32(c_minus_cephes_DP1));
     float16x4_t _c_minus_cephes_DP2 = vcvt_f16_f32(vdupq_n_f32(c_minus_cephes_DP2));
     float16x4_t _c_minus_cephes_DP3 = vcvt_f16_f32(vdupq_n_f32(c_minus_cephes_DP3));
@@ -422,7 +422,7 @@ static inline void sincos_ps_f16(float16x8_t x, float16x8_t* ysin, float16x8_t*
     x = vabsq_f16(x);
 
     /* scale by 4/Pi */
-#if _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
     float16x4_t _c_cephes_FOPI = vcvt_f16_f32(vdupq_n_f32(c_cephes_FOPI));
     y = vmulq_f16(x, vcombine_f16(_c_cephes_FOPI, _c_cephes_FOPI));
 #else
@@ -446,7 +446,7 @@ static inline void sincos_ps_f16(float16x8_t x, float16x8_t* ysin, float16x8_t*
 
     /* The magic pass: "Extended precision modular arithmetic"
      * x = ((x - y * DP1) - y * DP2) - y * DP3; */
-#if _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
     float16x4_t _c_minus_cephes_DP1 = vcvt_f16_f32(vdupq_n_f32(c_minus_cephes_DP1));
     float16x4_t _c_minus_cephes_DP2 = vcvt_f16_f32(vdupq_n_f32(c_minus_cephes_DP2));
     float16x4_t _c_minus_cephes_DP3 = vcvt_f16_f32(vdupq_n_f32(c_minus_cephes_DP3));
diff --git a/src/layer/arm/pooling_arm_asimdhp.cpp b/src/layer/arm/pooling_arm_asimdhp.cpp
index ceabb1fda38..95ce4b1f3be 100644
--- a/src/layer/arm/pooling_arm_asimdhp.cpp
+++ b/src/layer/arm/pooling_arm_asimdhp.cpp
@@ -612,7 +612,7 @@ int Pooling_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Opt
                     }
                 }
 
-#if _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
                 float16x4_t _inv_area0 = vcvt_f16_f32(vdupq_n_f32(1.f / area));
                 float16x8_t _inv_area = vcombine_f16(_inv_area0, _inv_area0);
 #else
@@ -672,7 +672,7 @@ int Pooling_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Opt
                     }
                 }
 
-#if _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
                 float16x4_t _inv_area = vcvt_f16_f32(vdupq_n_f32(1.f / area));
 #else
                 float16x4_t _inv_area = vdup_n_f16((__fp16)(1.f / area));
@@ -750,7 +750,7 @@ int Pooling_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Opt
                 const Mat m = bottom_blob_bordered.channel(q);
                 __fp16* outptr = top_blob.channel(q);
 
-#if _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
                 float16x4_t _inv_maxk0 = vcvt_f16_f32(vdupq_n_f32(1.f / maxk));
                 float16x8_t _inv_maxk = vcombine_f16(_inv_maxk0, _inv_maxk0);
 #else
@@ -788,7 +788,7 @@ int Pooling_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Opt
                 const Mat m = bottom_blob_bordered.channel(q);
                 __fp16* outptr = top_blob.channel(q);
 
-#if _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
                 float16x4_t _inv_maxk = vcvt_f16_f32(vdupq_n_f32(1.f / maxk));
 #else
                 float16x4_t _inv_maxk = vdup_n_f16((__fp16)(1.f / maxk));
diff --git a/src/layer/arm/prelu_arm_asimdhp.cpp b/src/layer/arm/prelu_arm_asimdhp.cpp
index e0efd6f07d9..a6732eae67f 100644
--- a/src/layer/arm/prelu_arm_asimdhp.cpp
+++ b/src/layer/arm/prelu_arm_asimdhp.cpp
@@ -266,7 +266,7 @@ int PReLU_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) c
         }
         else
         {
-#if _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
             float16x4_t _slope0 = vcvt_f16_f32(vdupq_n_f32(slope_data[0]));
             float16x8_t _slope = vcombine_f16(_slope0, _slope0);
 #else
@@ -344,7 +344,7 @@ int PReLU_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) c
     {
         if (dims == 1)
         {
-#if _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
             float16x8_t _zero = vdupq_n_f16(0.f);
 #else
             float16x4_t _zero = vdup_n_f16(0.f);
@@ -363,7 +363,7 @@ int PReLU_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) c
                 float16x4_t _p = vld1_f16(ptr);
                 float16x4_t _slope = vcvt_f16_f32(vld1q_f32(slope + i * 4));
 
-#if _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
                 uint16x4_t _lemask = vcle_f16(_p, vget_low_f16(_zero));
 #else
                 uint16x4_t _lemask = vcle_f16(_p, _zero);
@@ -375,7 +375,7 @@ int PReLU_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) c
         }
         else
         {
-#if _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
             float16x8_t _slope = vdupq_n_f16((__fp16)slope_data[0]);
 #else
             float16x4_t _slope = vdup_n_f16((__fp16)slope_data[0]);
@@ -387,7 +387,7 @@ int PReLU_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) c
                 __fp16* ptr = (__fp16*)bottom_top_blob + i * 4;
                 float16x4_t _p = vld1_f16(ptr);
 
-#if _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
                 uint16x4_t _lemask = vcle_f16(_p, vget_low_f16(_zero));
                 float16x4_t _ps = vmul_f16(_p, vget_low_f16(_slope));
 #else
@@ -500,7 +500,7 @@ int PReLU_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) c
             const float slope = num_slope > 1 ? slope_data[i] : slope_data[0];
 
             float16x4_t _zero = vdup_n_f16(0.f);
-#if _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
             float16x4_t _slope = vcvt_f16_f32(vdupq_n_f32(slope));
 #else
             float16x4_t _slope = vdup_n_f16((__fp16)slope);
@@ -543,7 +543,7 @@ int PReLU_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) c
             const float slope = num_slope > 1 ? slope_data[q] : slope_data[0];
 
             float16x4_t _zero = vdup_n_f16(0.f);
-#if _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
             float16x4_t _slope = vcvt_f16_f32(vdupq_n_f32(slope));
 #else
             float16x4_t _slope = vdup_n_f16((__fp16)slope);
diff --git a/src/layer/arm/quantize_arm_asimdhp.cpp b/src/layer/arm/quantize_arm_asimdhp.cpp
index ab43271ea5f..661f06c19cd 100644
--- a/src/layer/arm/quantize_arm_asimdhp.cpp
+++ b/src/layer/arm/quantize_arm_asimdhp.cpp
@@ -440,7 +440,7 @@ int Quantize_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Op
 
         if (scale_data_size == 1)
         {
-#if _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
            float16x4_t _scale0 = vcvt_f16_f32(vdupq_n_f32(scale_data[0]));
            float16x8_t _scale = vcombine_f16(_scale0, _scale0);
 #else
@@ -485,7 +485,7 @@ int Quantize_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Op
 
        if (scale_data_size == 1)
        {
-#if _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
           float16x4_t _scale0 = vcvt_f16_f32(vdupq_n_f32(scale_data[0]));
           float16x8_t _scale = vcombine_f16(_scale0, _scale0);
 #else
@@ -545,7 +545,7 @@ int Quantize_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Op
 
        if (scale_data_size == 1)
        {
-#if _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
           float16x4_t _scale0 = vcvt_f16_f32(vdupq_n_f32(scale_data[0]));
           float16x8_t _scale = vcombine_f16(_scale0, _scale0);
 #else
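
Note (illustrative sketch, not part of the diff): clang-cl defines both
_MSC_VER and __clang__, but unlike the MSVC ARM64 compiler it provides the
scalar __fp16 type and the NEON fp16 intrinsics, so the old bare
"#if _MSC_VER" guards routed clang-cl through the MSVC float32 emulation
path. Every guarded pair above follows the same idiom; a minimal
self-contained sketch of it (splat_f16 is a hypothetical helper name, not
ncnn API):

    #include <arm_neon.h>

    // Broadcast a scalar into a float16x4_t under either toolchain.
    // Genuine MSVC has no scalar __fp16 type, so the splat is emulated by
    // broadcasting the value as float32 and narrowing the four lanes.
    static inline float16x4_t splat_f16(float value)
    {
    #if defined(_MSC_VER) && !defined(__clang__)
        return vcvt_f16_f32(vdupq_n_f32(value)); // MSVC emulation path
    #else
        return vdup_n_f16((__fp16)value); // gcc, clang, and clang-cl
    #endif
    }

The CMake changes make the matching split at configure time: a plain
CMAKE_CXX_COMPILER_ID match on "MSVC" now catches only genuine cl.exe, the
new elseif() branch detects clang-cl through CMAKE_CXX_SIMULATE_ID and
CMAKE_CXX_COMPILER_FRONTEND_VARIANT and passes both the /arch: and -march=
spellings plus explicit /D__ARM_FEATURE_* defines, and the rewritten
if(NOT CMAKE_CXX_COMPILER_ID MATCHES "MSVC") blocks in src/CMakeLists.txt
let every non-MSVC compiler, clang-cl included, receive ARM_MARCH_FLAG.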