Skip to content

Commit

Permalink
fix: handle MSVC-compatible Clang (clang-cl) on ARM — give clang-cl its own branch in cmake/ncnn_add_layer.cmake and src/CMakeLists.txt, and guard `#if _MSC_VER` fp16 workarounds with `defined(_MSC_VER) && !defined(__clang__)` in the arm layer sources
Browse files Browse the repository at this point in the history
  • Loading branch information
nihui committed May 23, 2024
1 parent 3747a9d commit 7b5c14c
Show file tree
Hide file tree
Showing 9 changed files with 64 additions and 31 deletions.
32 changes: 31 additions & 1 deletion cmake/ncnn_add_layer.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ macro(ncnn_add_layer class)
endif()

if(NCNN_TARGET_ARCH STREQUAL "arm" AND (CMAKE_SIZEOF_VOID_P EQUAL 8 OR NCNN_TARGET_ILP32))
if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC"))
if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
if(NCNN_VFPV4)
ncnn_add_arch_opt_source(${class} vfpv4 " ")
endif()
Expand Down Expand Up @@ -277,6 +277,36 @@ macro(ncnn_add_layer class)
endif()
if(NCNN_ARM86SVEF32MM)
endif()
elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC")
if(NCNN_VFPV4)
ncnn_add_arch_opt_source(${class} vfpv4 " ")
endif()
if(NCNN_ARM82)
ncnn_add_arch_opt_source(${class} asimdhp "/arch:armv8.2 -march=armv8.2-a+fp16 /D__ARM_FEATURE_FP16_VECTOR_ARITHMETIC")
endif()
if(NCNN_ARM82DOT)
ncnn_add_arch_opt_source(${class} asimddp "/arch:armv8.2 -march=armv8.2-a+fp16+dotprod /D__ARM_FEATURE_FP16_VECTOR_ARITHMETIC /D__ARM_FEATURE_DOTPROD")
endif()
if(NCNN_ARM82FP16FML)
ncnn_add_arch_opt_source(${class} asimdfhm "/arch:armv8.2 -march=armv8.2-a+fp16+fp16fml /D__ARM_FEATURE_FP16_VECTOR_ARITHMETIC /D__ARM_FEATURE_FP16_FML")
endif()
if(NCNN_ARM84BF16)
ncnn_add_arch_opt_source(${class} bf16 "/arch:armv8.4 -march=armv8.4-a+fp16+dotprod+bf16 /D__ARM_FEATURE_FP16_VECTOR_ARITHMETIC /D__ARM_FEATURE_DOTPROD /D__ARM_FEATURE_FP16_FML /D__ARM_FEATURE_BF16_VECTOR_ARITHMETIC")
endif()
if(NCNN_ARM84I8MM)
ncnn_add_arch_opt_source(${class} i8mm "/arch:armv8.4 -march=armv8.4-a+fp16+dotprod+i8mm /D__ARM_FEATURE_FP16_VECTOR_ARITHMETIC /D__ARM_FEATURE_DOTPROD /D__ARM_FEATURE_FP16_FML /D__ARM_FEATURE_MATMUL_INT8")
endif()
# TODO add support for sve family
if(NCNN_ARM86SVE)
endif()
if(NCNN_ARM86SVE2)
endif()
if(NCNN_ARM86SVEBF16)
endif()
if(NCNN_ARM86SVEI8MM)
endif()
if(NCNN_ARM86SVEF32MM)
endif()
else()
if(NCNN_VFPV4)
ncnn_add_arch_opt_source(${class} vfpv4 " ")
Expand Down
9 changes: 6 additions & 3 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -545,7 +545,8 @@ if(NCNN_TARGET_ARCH STREQUAL "arm" AND (CMAKE_SIZEOF_VOID_P EQUAL 8 OR NCNN_TARG
endif()
if(NCNN_ARM86SVEF32MM)
endif()
else()
endif()
if(NOT CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
set(ARM_MARCH_FLAG "-march=armv8.6-a+fp16+dotprod+sve")
if(NCNN_ARM86SVE2)
set(ARM_MARCH_FLAG "${ARM_MARCH_FLAG}+sve2")
Expand All @@ -569,7 +570,8 @@ if(NCNN_TARGET_ARCH STREQUAL "arm" AND (CMAKE_SIZEOF_VOID_P EQUAL 8 OR NCNN_TARG
if(NCNN_ARM84I8MM)
target_compile_options(ncnn PRIVATE /D__ARM_FEATURE_MATMUL_INT8)
endif()
else()
endif()
if(NOT CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
set(ARM_MARCH_FLAG "-march=armv8.4-a+fp16+dotprod")
if(NCNN_ARM84BF16)
set(ARM_MARCH_FLAG "${ARM_MARCH_FLAG}+bf16")
Expand All @@ -587,7 +589,8 @@ if(NCNN_TARGET_ARCH STREQUAL "arm" AND (CMAKE_SIZEOF_VOID_P EQUAL 8 OR NCNN_TARG
if(NCNN_ARM82FP16FML)
target_compile_options(ncnn PRIVATE /D__ARM_FEATURE_FP16_FML)
endif()
else()
endif()
if(NOT CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
set(ARM_MARCH_FLAG "-march=armv8.2-a+fp16")
if(NCNN_ARM82DOT)
set(ARM_MARCH_FLAG "${ARM_MARCH_FLAG}+dotprod")
Expand Down
4 changes: 2 additions & 2 deletions src/layer/arm/arm_activation.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ static inline float16x4_t activation_ps_f16(float16x4_t _v, int activation_type,
else if (activation_type == 2)
{
const float16x4_t _zero = vdup_n_f16(0.f);
#if _MSC_VER
#if defined(_MSC_VER) && !defined(__clang__)
const float16x4_t _slope = vcvt_f16_f32(vdupq_n_f32(activation_params[0]));
#else
const float16x4_t _slope = vdup_n_f16((__fp16)activation_params[0]);
Expand Down Expand Up @@ -176,7 +176,7 @@ static inline float16x8_t activation_ps_f16(float16x8_t _v, int activation_type,
else if (activation_type == 2)
{
const float16x8_t _zero = vdupq_n_f16(0.f);
#if _MSC_VER
#if defined(_MSC_VER) && !defined(__clang__)
const float16x4_t _slope0 = vcvt_f16_f32(vdupq_n_f32(activation_params[0]));
const float16x8_t _slope = vcombine_f16(_slope0, _slope0);
#else
Expand Down
2 changes: 1 addition & 1 deletion src/layer/arm/arm_usability.h
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ static inline int8x8_t float2int8leakyrelu(float32x4_t _vlow, float32x4_t _vhigh
}

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#ifdef _MSC_VER
#if defined(_MSC_VER) && !defined(__clang__)
struct __fp16
{
__fp16()
Expand Down
4 changes: 2 additions & 2 deletions src/layer/arm/batchnorm_arm_asimdhp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -369,7 +369,7 @@ int BatchNorm_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& op
__fp16 b = (__fp16)b_data[i];

float16x4_t _a = vdup_n_f16(a);
#if _MSC_VER
#if defined(_MSC_VER) && !defined(__clang__)
float16x4_t _b = vcvt_f16_f32(vdupq_n_f32(b_data[i]));
#else
float16x4_t _b = vdup_n_f16(b);
Expand Down Expand Up @@ -410,7 +410,7 @@ int BatchNorm_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& op
__fp16 b = (__fp16)b_data[q];

float16x4_t _a = vdup_n_f16(a);
#if _MSC_VER
#if defined(_MSC_VER) && !defined(__clang__)
float16x4_t _b = vcvt_f16_f32(vdupq_n_f32(b_data[q]));
#else
float16x4_t _b = vdup_n_f16(b);
Expand Down
16 changes: 8 additions & 8 deletions src/layer/arm/neon_mathfun_fp16s.h
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ static inline float16x4_t exp_ps_f16(float16x4_t x)
x = vmax_f16(x, vdup_n_f16(c_exp_lo_f16));

/* express exp(x) as exp(g + n*log(2)) */
#if _MSC_VER
#if defined(_MSC_VER) && !defined(__clang__)
fx = vfma_f16(vdup_n_f16(0.5f), x, vcvt_f16_f32(vdupq_n_f32(c_cephes_LOG2EF)));
#else
fx = vfma_f16(vdup_n_f16(0.5f), x, vdup_n_f16(c_cephes_LOG2EF));
Expand All @@ -216,7 +216,7 @@ static inline float16x4_t exp_ps_f16(float16x4_t x)

fx = vsub_f16(tmp, (float16x4_t)(mask));

#if _MSC_VER
#if defined(_MSC_VER) && !defined(__clang__)
tmp = vmul_f16(fx, vcvt_f16_f32(vdupq_n_f32(c_cephes_exp_C1)));
float16x4_t z = vmul_f16(fx, vcvt_f16_f32(vdupq_n_f32(c_cephes_exp_C2)));
#else
Expand Down Expand Up @@ -258,7 +258,7 @@ static inline float16x8_t exp_ps_f16(float16x8_t x)
x = vmaxq_f16(x, vdupq_n_f16(c_exp_lo_f16));

/* express exp(x) as exp(g + n*log(2)) */
#if _MSC_VER
#if defined(_MSC_VER) && !defined(__clang__)
float16x4_t _c_cephes_LOG2EF = vcvt_f16_f32(vdupq_n_f32(c_cephes_LOG2EF));
fx = vfmaq_f16(vdupq_n_f16(0.5f), x, vcombine_f16(_c_cephes_LOG2EF, _c_cephes_LOG2EF));
#else
Expand All @@ -274,7 +274,7 @@ static inline float16x8_t exp_ps_f16(float16x8_t x)

fx = vsubq_f16(tmp, vreinterpretq_f16_u16(mask));

#if _MSC_VER
#if defined(_MSC_VER) && !defined(__clang__)
float16x4_t _c_cephes_exp_C1 = vcvt_f16_f32(vdupq_n_f32(c_cephes_exp_C1));
tmp = vmulq_f16(fx, vcombine_f16(_c_cephes_exp_C1, _c_cephes_exp_C1));
float16x4_t _c_cephes_exp_C2 = vcvt_f16_f32(vdupq_n_f32(c_cephes_exp_C2));
Expand Down Expand Up @@ -347,7 +347,7 @@ static inline void sincos_ps_f16(float16x4_t x, float16x4_t* ysin, float16x4_t*
x = vabs_f16(x);

/* scale by 4/Pi */
#if _MSC_VER
#if defined(_MSC_VER) && !defined(__clang__)
float16x4_t _c_cephes_FOPI = vcvt_f16_f32(vdupq_n_f32(c_cephes_FOPI));
y = vmul_f16(x, _c_cephes_FOPI);
#else
Expand All @@ -371,7 +371,7 @@ static inline void sincos_ps_f16(float16x4_t x, float16x4_t* ysin, float16x4_t*

/* The magic pass: "Extended precision modular arithmetic"
* x = ((x - y * DP1) - y * DP2) - y * DP3; */
#if _MSC_VER
#if defined(_MSC_VER) && !defined(__clang__)
float16x4_t _c_minus_cephes_DP1 = vcvt_f16_f32(vdupq_n_f32(c_minus_cephes_DP1));
float16x4_t _c_minus_cephes_DP2 = vcvt_f16_f32(vdupq_n_f32(c_minus_cephes_DP2));
float16x4_t _c_minus_cephes_DP3 = vcvt_f16_f32(vdupq_n_f32(c_minus_cephes_DP3));
Expand Down Expand Up @@ -422,7 +422,7 @@ static inline void sincos_ps_f16(float16x8_t x, float16x8_t* ysin, float16x8_t*
x = vabsq_f16(x);

/* scale by 4/Pi */
#if _MSC_VER
#if defined(_MSC_VER) && !defined(__clang__)
float16x4_t _c_cephes_FOPI = vcvt_f16_f32(vdupq_n_f32(c_cephes_FOPI));
y = vmulq_f16(x, vcombine_f16(_c_cephes_FOPI, _c_cephes_FOPI));
#else
Expand All @@ -446,7 +446,7 @@ static inline void sincos_ps_f16(float16x8_t x, float16x8_t* ysin, float16x8_t*

/* The magic pass: "Extended precision modular arithmetic"
* x = ((x - y * DP1) - y * DP2) - y * DP3; */
#if _MSC_VER
#if defined(_MSC_VER) && !defined(__clang__)
float16x4_t _c_minus_cephes_DP1 = vcvt_f16_f32(vdupq_n_f32(c_minus_cephes_DP1));
float16x4_t _c_minus_cephes_DP2 = vcvt_f16_f32(vdupq_n_f32(c_minus_cephes_DP2));
float16x4_t _c_minus_cephes_DP3 = vcvt_f16_f32(vdupq_n_f32(c_minus_cephes_DP3));
Expand Down
8 changes: 4 additions & 4 deletions src/layer/arm/pooling_arm_asimdhp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -612,7 +612,7 @@ int Pooling_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Opt
}
}

#if _MSC_VER
#if defined(_MSC_VER) && !defined(__clang__)
float16x4_t _inv_area0 = vcvt_f16_f32(vdupq_n_f32(1.f / area));
float16x8_t _inv_area = vcombine_f16(_inv_area0, _inv_area0);
#else
Expand Down Expand Up @@ -672,7 +672,7 @@ int Pooling_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Opt
}
}

#if _MSC_VER
#if defined(_MSC_VER) && !defined(__clang__)
float16x4_t _inv_area = vcvt_f16_f32(vdupq_n_f32(1.f / area));
#else
float16x4_t _inv_area = vdup_n_f16((__fp16)(1.f / area));
Expand Down Expand Up @@ -750,7 +750,7 @@ int Pooling_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Opt
const Mat m = bottom_blob_bordered.channel(q);
__fp16* outptr = top_blob.channel(q);

#if _MSC_VER
#if defined(_MSC_VER) && !defined(__clang__)
float16x4_t _inv_maxk0 = vcvt_f16_f32(vdupq_n_f32(1.f / maxk));
float16x8_t _inv_maxk = vcombine_f16(_inv_maxk0, _inv_maxk0);
#else
Expand Down Expand Up @@ -788,7 +788,7 @@ int Pooling_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Opt
const Mat m = bottom_blob_bordered.channel(q);
__fp16* outptr = top_blob.channel(q);

#if _MSC_VER
#if defined(_MSC_VER) && !defined(__clang__)
float16x4_t _inv_maxk = vcvt_f16_f32(vdupq_n_f32(1.f / maxk));
#else
float16x4_t _inv_maxk = vdup_n_f16((__fp16)(1.f / maxk));
Expand Down
14 changes: 7 additions & 7 deletions src/layer/arm/prelu_arm_asimdhp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,7 @@ int PReLU_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) c
}
else
{
#if _MSC_VER
#if defined(_MSC_VER) && !defined(__clang__)
float16x4_t _slope0 = vcvt_f16_f32(vdupq_n_f32(slope_data[0]));
float16x8_t _slope = vcombine_f16(_slope0, _slope0);
#else
Expand Down Expand Up @@ -344,7 +344,7 @@ int PReLU_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) c
{
if (dims == 1)
{
#if _MSC_VER
#if defined(_MSC_VER) && !defined(__clang__)
float16x8_t _zero = vdupq_n_f16(0.f);
#else
float16x4_t _zero = vdup_n_f16(0.f);
Expand All @@ -363,7 +363,7 @@ int PReLU_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) c

float16x4_t _p = vld1_f16(ptr);
float16x4_t _slope = vcvt_f16_f32(vld1q_f32(slope + i * 4));
#if _MSC_VER
#if defined(_MSC_VER) && !defined(__clang__)
uint16x4_t _lemask = vcle_f16(_p, vget_low_f16(_zero));
#else
uint16x4_t _lemask = vcle_f16(_p, _zero);
Expand All @@ -375,7 +375,7 @@ int PReLU_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) c
}
else
{
#if _MSC_VER
#if defined(_MSC_VER) && !defined(__clang__)
float16x8_t _slope = vdupq_n_f16((__fp16)slope_data[0]);
#else
float16x4_t _slope = vdup_n_f16((__fp16)slope_data[0]);
Expand All @@ -387,7 +387,7 @@ int PReLU_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) c
__fp16* ptr = (__fp16*)bottom_top_blob + i * 4;

float16x4_t _p = vld1_f16(ptr);
#if _MSC_VER
#if defined(_MSC_VER) && !defined(__clang__)
uint16x4_t _lemask = vcle_f16(_p, vget_low_f16(_zero));
float16x4_t _ps = vmul_f16(_p, vget_low_f16(_slope));
#else
Expand Down Expand Up @@ -500,7 +500,7 @@ int PReLU_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) c
const float slope = num_slope > 1 ? slope_data[i] : slope_data[0];

float16x4_t _zero = vdup_n_f16(0.f);
#if _MSC_VER
#if defined(_MSC_VER) && !defined(__clang__)
float16x4_t _slope = vcvt_f16_f32(vdupq_n_f32(slope));
#else
float16x4_t _slope = vdup_n_f16((__fp16)slope);
Expand Down Expand Up @@ -543,7 +543,7 @@ int PReLU_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) c
const float slope = num_slope > 1 ? slope_data[q] : slope_data[0];

float16x4_t _zero = vdup_n_f16(0.f);
#if _MSC_VER
#if defined(_MSC_VER) && !defined(__clang__)
float16x4_t _slope = vcvt_f16_f32(vdupq_n_f32(slope));
#else
float16x4_t _slope = vdup_n_f16((__fp16)slope);
Expand Down
6 changes: 3 additions & 3 deletions src/layer/arm/quantize_arm_asimdhp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -440,7 +440,7 @@ int Quantize_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Op

if (scale_data_size == 1)
{
#if _MSC_VER
#if defined(_MSC_VER) && !defined(__clang__)
float16x4_t _scale0 = vcvt_f16_f32(vdupq_n_f32(scale_data[0]));
float16x8_t _scale = vcombine_f16(_scale0, _scale0);
#else
Expand Down Expand Up @@ -485,7 +485,7 @@ int Quantize_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Op

if (scale_data_size == 1)
{
#if _MSC_VER
#if defined(_MSC_VER) && !defined(__clang__)
float16x4_t _scale0 = vcvt_f16_f32(vdupq_n_f32(scale_data[0]));
float16x8_t _scale = vcombine_f16(_scale0, _scale0);
#else
Expand Down Expand Up @@ -545,7 +545,7 @@ int Quantize_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Op

if (scale_data_size == 1)
{
#if _MSC_VER
#if defined(_MSC_VER) && !defined(__clang__)
float16x4_t _scale0 = vcvt_f16_f32(vdupq_n_f32(scale_data[0]));
float16x8_t _scale = vcombine_f16(_scale0, _scale0);
#else
Expand Down

0 comments on commit 7b5c14c

Please sign in to comment.