From 4c6496c1edd8e88040f61f306c5f0abbe63e8669 Mon Sep 17 00:00:00 2001
From: Yang Hau
Date: Sat, 20 Jan 2024 20:35:55 +0800
Subject: [PATCH] feat: Add _mm_mul*

---
 sse2rvv.h      | 107 +++++++++++--
 tests/impl.cpp | 402 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 291 insertions(+), 218 deletions(-)

diff --git a/sse2rvv.h b/sse2rvv.h
index 252491b..9d016ac 100644
--- a/sse2rvv.h
+++ b/sse2rvv.h
@@ -205,7 +205,7 @@ typedef union ALIGN_STRUCT(16) SIMDVec {
   __riscv_vreinterpret_v_u32m1_u8m1(__riscv_vreinterpret_v_i32m1_u32m1(x))
 #define vreinterpretq_m64_u16(x) \
   __riscv_vreinterpret_v_u32m1_u16m1(__riscv_vreinterpret_v_i32m1_u32m1(x))
-#define vreinterpretq_m64_u32(x) __riscv_vreinterpret_v_f32m1_u32m1(x)
+#define vreinterpretq_m64_u32(x) __riscv_vreinterpret_v_i32m1_u32m1(x)
 #define vreinterpretq_m64_u64(x) \
   __riscv_vreinterpret_v_f64m1_u64m1(__riscv_vreinterpret_v_f32m1_f64m1(x))
 #define vreinterpretq_m64_i8(x) __riscv_vreinterpret_v_i32m1_i8m1(x)
@@ -222,7 +222,7 @@ typedef union ALIGN_STRUCT(16) SIMDVec {
   __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vreinterpret_v_u16m1_u32m1(x))
 #define vreinterpretq_u32_m64(x) __riscv_vreinterpret_v_u32m1_f32m1(x)
 #define vreinterpretq_u64_m64(x) \
-  __riscv_vreinterpret_v_f64m1_f32m1(__riscv_vreinterpret_v_u64m1_f64m1(x))
+  __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vreinterpret_v_u64m1_u32m1(x))
 #define vreinterpretq_i8_m64(x) __riscv_vreinterpret_v_i8m1_i32m1(x)
 #define vreinterpretq_i16_m64(x) __riscv_vreinterpret_v_i16m1_i32m1(x)
 #define vreinterpretq_i32_m64(x) (x)
@@ -2045,33 +2045,108 @@ FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a) {
 // FORCE_INLINE __m128i _mm_mpsadbw_epu8 (__m128i a, __m128i b, const int imm8)
 // {}

-// FORCE_INLINE __m128i _mm_mul_epi32 (__m128i a, __m128i b) {}
+FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b) {
+  vint64m1_t _a = vreinterpretq_m128i_i64(a);
+  vint64m1_t _b = vreinterpretq_m128i_i64(b);
+  vint32mf2_t a_srl = __riscv_vnsra_wx_i32mf2(_a, 0, 2);
+  vint32mf2_t b_srl = __riscv_vnsra_wx_i32mf2(_b, 0, 2);
+  return vreinterpretq_i64_m128i(__riscv_vwmul_vv_i64m1(a_srl, b_srl, 2));
+}

-// FORCE_INLINE __m128i _mm_mul_epu32 (__m128i a, __m128i b) {}
+FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b) {
+  vuint64m1_t _a = vreinterpretq_m128i_u64(a);
+  vuint64m1_t _b = vreinterpretq_m128i_u64(b);
+  vuint32mf2_t a_srl = __riscv_vnsrl_wx_u32mf2(_a, 0, 2);
+  vuint32mf2_t b_srl = __riscv_vnsrl_wx_u32mf2(_b, 0, 2);
+  return vreinterpretq_u64_m128i(__riscv_vwmulu_vv_u64m1(a_srl, b_srl, 2));
+}

-// FORCE_INLINE __m128d _mm_mul_pd (__m128d a, __m128d b) {}
+FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b) {
+  vfloat64m1_t _a = vreinterpretq_m128d_f64(a);
+  vfloat64m1_t _b = vreinterpretq_m128d_f64(b);
+  return vreinterpretq_f64_m128d(__riscv_vfmul_vv_f64m1(_a, _b, 2));
+}

-// FORCE_INLINE __m128 _mm_mul_ps (__m128 a, __m128 b) {}
+FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b) {
+  vfloat32m1_t _a = vreinterpretq_m128_f32(a);
+  vfloat32m1_t _b = vreinterpretq_m128_f32(b);
+  return vreinterpretq_f32_m128(__riscv_vfmul_vv_f32m1(_a, _b, 4));
+}

-// FORCE_INLINE __m128d _mm_mul_sd (__m128d a, __m128d b) {}
+FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b) {
+  vfloat64m1_t _a = vreinterpretq_m128d_f64(a);
+  vfloat64m1_t _b = vreinterpretq_m128d_f64(b);
+  vfloat64m1_t mul = __riscv_vfmul_vv_f64m1(_a, _b, 2);
+  return vreinterpretq_f64_m128d(__riscv_vslideup_vx_f64m1(_a, mul, 0, 1));
+}

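The _mm_mul_sd port above multiplies both lanes but commits only lane 0, using vslideup with offset 0 and vl 1 to merge the product into a while the slide leaves a's other element in place. A minimal scalar model of the semantics being preserved (reference code, not part of the patch):

    /* Scalar model of _mm_mul_sd: lane 0 is the product, lane 1 comes from a. */
    static void mul_sd_ref(const double a[2], const double b[2], double r[2]) {
      r[0] = a[0] * b[0]; /* only the low lane is multiplied */
      r[1] = a[1];        /* the upper lane passes through from a */
    }

The _mm_mul_ss port that follows uses the same merge, just on 32-bit lanes.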
-// FORCE_INLINE __m128 _mm_mul_ss (__m128 a, __m128 b) {}
+FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b) {
+  vfloat32m1_t _a = vreinterpretq_m128_f32(a);
+  vfloat32m1_t _b = vreinterpretq_m128_f32(b);
+  vfloat32m1_t mul = __riscv_vfmul_vv_f32m1(_a, _b, 4);
+  return vreinterpretq_f32_m128(__riscv_vslideup_vx_f32m1(_a, mul, 0, 1));
+}

-// FORCE_INLINE __m64 _mm_mul_su32 (__m64 a, __m64 b) {}
+FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b) {
+  vuint32mf2_t _a =
+      __riscv_vlmul_trunc_v_u32m1_u32mf2(vreinterpretq_m64_u32(a));
+  vuint32mf2_t _b =
+      __riscv_vlmul_trunc_v_u32m1_u32mf2(vreinterpretq_m64_u32(b));
+  return vreinterpretq_u64_m64(__riscv_vwmulu_vv_u64m1(_a, _b, 2));
+}

-// FORCE_INLINE __m128i _mm_mulhi_epi16 (__m128i a, __m128i b) {}
+FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b) {
+  vint16m1_t _a = vreinterpretq_m128i_i16(a);
+  vint16m1_t _b = vreinterpretq_m128i_i16(b);
+  vint32m2_t ab_mul = __riscv_vwmul_vv_i32m2(_a, _b, 8);
+  return vreinterpretq_i16_m128i(__riscv_vnsra_wx_i16m1(ab_mul, 16, 8));
+}

-// FORCE_INLINE __m128i _mm_mulhi_epu16 (__m128i a, __m128i b) {}
+FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b) {
+  vuint16m1_t _a = vreinterpretq_m128i_u16(a);
+  vuint16m1_t _b = vreinterpretq_m128i_u16(b);
+  vuint32m2_t ab_mul = __riscv_vwmulu_vv_u32m2(_a, _b, 8);
+  return vreinterpretq_u16_m128i(__riscv_vnsrl_wx_u16m1(ab_mul, 16, 8));
+}

-// FORCE_INLINE __m64 _mm_mulhi_pu16 (__m64 a, __m64 b) {}
+FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b) {
+  vuint16m1_t _a = vreinterpretq_m64_u16(a);
+  vuint16m1_t _b = vreinterpretq_m64_u16(b);
+  vuint32m2_t ab_mul = __riscv_vwmulu_vv_u32m2(_a, _b, 8);
+  return vreinterpretq_u16_m64(__riscv_vnsrl_wx_u16m1(ab_mul, 16, 8));
+}

-// FORCE_INLINE __m128i _mm_mulhrs_epi16 (__m128i a, __m128i b) {}
+FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b) {
+  vint16m1_t _a = vreinterpretq_m128i_i16(a);
+  vint16m1_t _b = vreinterpretq_m128i_i16(b);
+  vint32m2_t ab_mul = __riscv_vwmul_vv_i32m2(_a, _b, 8);
+  vint32m2_t sra = __riscv_vsra_vx_i32m2(ab_mul, 14, 8);
+  return vreinterpretq_i16_m128i(
+      __riscv_vnclip_wx_i16m1(sra, 1, __RISCV_VXRM_RNU, 8));
+}

-// FORCE_INLINE __m64 _mm_mulhrs_pi16 (__m64 a, __m64 b) {}
+FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b) {
+  vint16m1_t _a = vreinterpretq_m64_i16(a);
+  vint16m1_t _b = vreinterpretq_m64_i16(b);
+  vint32m2_t ab_mul = __riscv_vwmul_vv_i32m2(_a, _b, 8);
+  vint32m2_t sra = __riscv_vsra_vx_i32m2(ab_mul, 14, 8);
+  return vreinterpretq_i16_m64(
+      __riscv_vnclip_wx_i16m1(sra, 1, __RISCV_VXRM_RNU, 8));
+}

-// FORCE_INLINE __m128i _mm_mullo_epi16 (__m128i a, __m128i b) {}
+FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b) {
+  vint16m1_t _a = vreinterpretq_m128i_i16(a);
+  vint16m1_t _b = vreinterpretq_m128i_i16(b);
+  vint32m2_t ab_mul = __riscv_vwmul_vv_i32m2(_a, _b, 8);
+  return vreinterpretq_i16_m128i(__riscv_vnsra_wx_i16m1(ab_mul, 0, 8));
+}

-// FORCE_INLINE __m128i _mm_mullo_epi32 (__m128i a, __m128i b) {}
+FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b) {
+  vint32m1_t _a = vreinterpretq_m128i_i32(a);
+  vint32m1_t _b = vreinterpretq_m128i_i32(b);
+  vint64m2_t ab_mul = __riscv_vwmul_vv_i64m2(_a, _b, 4);
+  return vreinterpretq_i32_m128i(__riscv_vnsra_wx_i32m1(ab_mul, 0, 4));
+}

 // FORCE_INLINE __m128d _mm_or_pd (__m128d a, __m128d b) {}
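All three mulhi ports above follow the same widen-then-narrow shape: vwmul/vwmulu produces the full 32-bit products, and vnsra/vnsrl by 16 keeps only the high halves; the mullo ports narrow by 0 to keep the low halves instead. A scalar model of one signed lane (reference code, not part of the patch):

    #include <stdint.h>

    /* Scalar model of one _mm_mulhi_epi16 lane. */
    static inline int16_t mulhi16_ref(int16_t a, int16_t b) {
      int32_t wide = (int32_t)a * (int32_t)b; /* widening multiply (vwmul) */
      return (int16_t)(wide >> 16);           /* narrowing shift (vnsra by 16) */
    }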
diff --git a/tests/impl.cpp b/tests/impl.cpp
index 06a1098..9ff40f2 100644
--- a/tests/impl.cpp
+++ b/tests/impl.cpp
@@ -2602,59 +2602,59 @@ result_t test_mm_movemask_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
 }

 result_t test_mm_mul_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  // const float *_a = impl.test_cases_float_pointer1;
-  // const float *_b = impl.test_cases_float_pointer2;
-  // float dx = _a[0] * _b[0];
-  // float dy = _a[1] * _b[1];
-  // float dz = _a[2] * _b[2];
-  // float dw = _a[3] * _b[3];
-  //
-  // __m128 a = load_m128(_a);
-  // __m128 b = load_m128(_b);
-  // __m128 c = _mm_mul_ps(a, b);
-  // return validate_float(c, dx, dy, dz, dw);
-  // #else
+#ifdef ENABLE_TEST_ALL
+  const float *_a = impl.test_cases_float_pointer1;
+  const float *_b = impl.test_cases_float_pointer2;
+  float dx = _a[0] * _b[0];
+  float dy = _a[1] * _b[1];
+  float dz = _a[2] * _b[2];
+  float dw = _a[3] * _b[3];
+
+  __m128 a = load_m128(_a);
+  __m128 b = load_m128(_b);
+  __m128 c = _mm_mul_ps(a, b);
+  return validate_float(c, dx, dy, dz, dw);
+#else
   return TEST_UNIMPL;
-  // #endif // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }

 result_t test_mm_mul_ss(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  // const float *_a = impl.test_cases_float_pointer1;
-  // const float *_b = impl.test_cases_float_pointer2;
-  //
-  // float dx = _a[0] * _b[0];
-  // float dy = _a[1];
-  // float dz = _a[2];
-  // float dw = _a[3];
-  //
-  // __m128 a = load_m128(_a);
-  // __m128 b = load_m128(_b);
-  // __m128 c = _mm_mul_ss(a, b);
-  // return validate_float(c, dx, dy, dz, dw);
-  // #else
+#ifdef ENABLE_TEST_ALL
+  const float *_a = impl.test_cases_float_pointer1;
+  const float *_b = impl.test_cases_float_pointer2;
+
+  float dx = _a[0] * _b[0];
+  float dy = _a[1];
+  float dz = _a[2];
+  float dw = _a[3];
+
+  __m128 a = load_m128(_a);
+  __m128 b = load_m128(_b);
+  __m128 c = _mm_mul_ss(a, b);
+  return validate_float(c, dx, dy, dz, dw);
+#else
   return TEST_UNIMPL;
-  // #endif // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }

 result_t test_mm_mulhi_pu16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  // const uint16_t *_a = (const uint16_t *)impl.test_cases_int_pointer1;
-  // const uint16_t *_b = (const uint16_t *)impl.test_cases_int_pointer2;
-  // uint16_t d[4];
-  // for (uint32_t i = 0; i < 4; i++) {
-  // uint32_t m = (uint32_t)_a[i] * (uint32_t)_b[i];
-  // d[i] = (uint16_t)(m >> 16);
-  // }
-  //
-  // __m64 a = load_m64(_a);
-  // __m64 b = load_m64(_b);
-  // __m64 c = _mm_mulhi_pu16(a, b);
-  // return VALIDATE_UINT16_M64(c, d);
-  // #else
+#ifdef ENABLE_TEST_ALL
+  const uint16_t *_a = (const uint16_t *)impl.test_cases_int_pointer1;
+  const uint16_t *_b = (const uint16_t *)impl.test_cases_int_pointer2;
+  uint16_t d[4];
+  for (uint32_t i = 0; i < 4; i++) {
+    uint32_t m = (uint32_t)_a[i] * (uint32_t)_b[i];
+    d[i] = (uint16_t)(m >> 16);
+  }
+
+  __m64 a = load_m64(_a);
+  __m64 b = load_m64(_b);
+  __m64 c = _mm_mulhi_pu16(a, b);
+  return VALIDATE_UINT16_M64(c, d);
+#else
   return TEST_UNIMPL;
-  // #endif // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }

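As a spot check of the high-half semantics this test encodes, one worked lane (illustrative values only):

    uint32_t m = (uint32_t)0xFFFFu * (uint32_t)0xFFFFu; /* 0xFFFE0001 */
    uint16_t hi = (uint16_t)(m >> 16);                  /* 0xFFFE */

So a lane where both inputs are 0xFFFF must come back as 0xFFFE from _mm_mulhi_pu16.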
 result_t test_mm_or_ps(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
@@ -6114,129 +6114,129 @@ result_t test_mm_movpi64_epi64(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
 }

 result_t test_mm_mul_epu32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  // const uint32_t *_a = (const uint32_t *)impl.test_cases_int_pointer1;
-  // const uint32_t *_b = (const uint32_t *)impl.test_cases_int_pointer2;
-  // uint64_t dx = (uint64_t)(_a[0]) * (uint64_t)(_b[0]);
-  // uint64_t dy = (uint64_t)(_a[2]) * (uint64_t)(_b[2]);
-  //
-  // __m128i a = _mm_loadu_si128((const __m128i *)_a);
-  // __m128i b = _mm_loadu_si128((const __m128i *)_b);
-  // __m128i r = _mm_mul_epu32(a, b);
-  // return validate_uint64(r, dx, dy);
-  // #else
+#ifdef ENABLE_TEST_ALL
+  const uint32_t *_a = (const uint32_t *)impl.test_cases_int_pointer1;
+  const uint32_t *_b = (const uint32_t *)impl.test_cases_int_pointer2;
+  uint64_t dx = (uint64_t)(_a[0]) * (uint64_t)(_b[0]);
+  uint64_t dy = (uint64_t)(_a[2]) * (uint64_t)(_b[2]);
+
+  __m128i a = _mm_loadu_si128((const __m128i *)_a);
+  __m128i b = _mm_loadu_si128((const __m128i *)_b);
+  __m128i r = _mm_mul_epu32(a, b);
+  return validate_uint64(r, dx, dy);
+#else
   return TEST_UNIMPL;
-  // #endif // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }

 result_t test_mm_mul_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  // const double *_a = (const double *)impl.test_cases_float_pointer1;
-  // const double *_b = (const double *)impl.test_cases_float_pointer2;
-  // double d0 = _a[0] * _b[0];
-  // double d1 = _a[1] * _b[1];
-  //
-  // __m128d a = _mm_load_pd(_a);
-  // __m128d b = _mm_load_pd(_b);
-  // __m128d c = _mm_mul_pd(a, b);
-  // return validate_double(c, d0, d1);
-  // #else
+#ifdef ENABLE_TEST_ALL
+  const double *_a = (const double *)impl.test_cases_float_pointer1;
+  const double *_b = (const double *)impl.test_cases_float_pointer2;
+  double d0 = _a[0] * _b[0];
+  double d1 = _a[1] * _b[1];
+
+  __m128d a = _mm_load_pd(_a);
+  __m128d b = _mm_load_pd(_b);
+  __m128d c = _mm_mul_pd(a, b);
+  return validate_double(c, d0, d1);
+#else
   return TEST_UNIMPL;
-  // #endif // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }

 result_t test_mm_mul_sd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  // const double *_a = (const double *)impl.test_cases_float_pointer1;
-  // const double *_b = (const double *)impl.test_cases_float_pointer2;
-  // double dx = _a[0] * _b[0];
-  // double dy = _a[1];
-  //
-  // __m128d a = load_m128d(_a);
-  // __m128d b = load_m128d(_b);
-  // __m128d c = _mm_mul_sd(a, b);
-  // return validate_double(c, dx, dy);
-  // #else
+#ifdef ENABLE_TEST_ALL
+  const double *_a = (const double *)impl.test_cases_float_pointer1;
+  const double *_b = (const double *)impl.test_cases_float_pointer2;
+  double dx = _a[0] * _b[0];
+  double dy = _a[1];
+
+  __m128d a = load_m128d(_a);
+  __m128d b = load_m128d(_b);
+  __m128d c = _mm_mul_sd(a, b);
+  return validate_double(c, dx, dy);
+#else
   return TEST_UNIMPL;
-  // #endif // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }

 result_t test_mm_mul_su32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  // const uint32_t *_a = (const uint32_t *)impl.test_cases_int_pointer1;
-  // const uint32_t *_b = (const uint32_t *)impl.test_cases_int_pointer2;
-  //
-  // uint64_t u = (uint64_t)(_a[0]) * (uint64_t)(_b[0]);
-  //
-  // __m64 a = load_m64(_a);
-  // __m64 b = load_m64(_b);
-  // __m64 r = _mm_mul_su32(a, b);
-  //
-  // return validate_uint64(r, u);
-  // #else
+#ifdef ENABLE_TEST_ALL
+  const uint32_t *_a = (const uint32_t *)impl.test_cases_int_pointer1;
+  const uint32_t *_b = (const uint32_t *)impl.test_cases_int_pointer2;
+
+  uint64_t u = (uint64_t)(_a[0]) * (uint64_t)(_b[0]);
+
+  __m64 a = load_m64(_a);
+  __m64 b = load_m64(_b);
+  __m64 r = _mm_mul_su32(a, b);
+
+  return validate_uint64(r, u);
+#else
   return TEST_UNIMPL;
-  // #endif // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }

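_mm_mul_epu32 and _mm_mul_su32 multiply only the even-indexed 32-bit lanes, which is why the tests above validate _a[0] * _b[0] and _a[2] * _b[2] (and, for the __m64 variant, lane 0 alone). A scalar sketch of the packed form (reference code, not part of the patch):

    #include <stdint.h>

    /* Scalar model of _mm_mul_epu32: lanes 0 and 2 widen to two 64-bit products. */
    static void mul_epu32_ref(const uint32_t a[4], const uint32_t b[4],
                              uint64_t r[2]) {
      r[0] = (uint64_t)a[0] * b[0];
      r[1] = (uint64_t)a[2] * b[2];
    }

The header implements this by narrowing each 64-bit lane to its low 32 bits and widening back through vwmulu.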
 result_t test_mm_mulhi_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1;
-  // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2;
-  // int16_t d[8];
-  // for (uint32_t i = 0; i < 8; i++) {
-  // int32_t m = (int32_t)_a[i] * (int32_t)_b[i];
-  // d[i] = (int16_t)(m >> 16);
-  // }
-  //
-  // __m128i a = load_m128i(_a);
-  // __m128i b = load_m128i(_b);
-  // __m128i c = _mm_mulhi_epi16(a, b);
-  // return VALIDATE_INT16_M128(c, d);
-  // #else
+#ifdef ENABLE_TEST_ALL
+  const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1;
+  const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2;
+  int16_t d[8];
+  for (uint32_t i = 0; i < 8; i++) {
+    int32_t m = (int32_t)_a[i] * (int32_t)_b[i];
+    d[i] = (int16_t)(m >> 16);
+  }
+
+  __m128i a = load_m128i(_a);
+  __m128i b = load_m128i(_b);
+  __m128i c = _mm_mulhi_epi16(a, b);
+  return VALIDATE_INT16_M128(c, d);
+#else
   return TEST_UNIMPL;
-  // #endif // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }

 result_t test_mm_mulhi_epu16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  // const uint16_t *_a = (const uint16_t *)impl.test_cases_int_pointer1;
-  // const uint16_t *_b = (const uint16_t *)impl.test_cases_int_pointer2;
-  // uint16_t d[8];
-  // for (uint32_t i = 0; i < 8; i++) {
-  // uint32_t m = (uint32_t)_a[i] * (uint32_t)_b[i];
-  // d[i] = (uint16_t)(m >> 16);
-  // }
-  //
-  // __m128i a = load_m128i(_a);
-  // __m128i b = load_m128i(_b);
-  // __m128i c = _mm_mulhi_epu16(a, b);
-  // return VALIDATE_INT16_M128(c, d);
-  // #else
+#ifdef ENABLE_TEST_ALL
+  const uint16_t *_a = (const uint16_t *)impl.test_cases_int_pointer1;
+  const uint16_t *_b = (const uint16_t *)impl.test_cases_int_pointer2;
+  uint16_t d[8];
+  for (uint32_t i = 0; i < 8; i++) {
+    uint32_t m = (uint32_t)_a[i] * (uint32_t)_b[i];
+    d[i] = (uint16_t)(m >> 16);
+  }
+
+  __m128i a = load_m128i(_a);
+  __m128i b = load_m128i(_b);
+  __m128i c = _mm_mulhi_epu16(a, b);
+  return VALIDATE_INT16_M128(c, d);
+#else
   return TEST_UNIMPL;
-  // #endif // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }

 result_t test_mm_mullo_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1;
-  // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2;
-  // int16_t d[8];
-  // d[0] = _a[0] * _b[0];
-  // d[1] = _a[1] * _b[1];
-  // d[2] = _a[2] * _b[2];
-  // d[3] = _a[3] * _b[3];
-  // d[4] = _a[4] * _b[4];
-  // d[5] = _a[5] * _b[5];
-  // d[6] = _a[6] * _b[6];
-  // d[7] = _a[7] * _b[7];
-  //
-  // __m128i a = load_m128i(_a);
-  // __m128i b = load_m128i(_b);
-  // __m128i c = _mm_mullo_epi16(a, b);
-  // return VALIDATE_INT16_M128(c, d);
-  // #else
+#ifdef ENABLE_TEST_ALL
+  const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1;
+  const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2;
+  int16_t d[8];
+  d[0] = _a[0] * _b[0];
+  d[1] = _a[1] * _b[1];
+  d[2] = _a[2] * _b[2];
+  d[3] = _a[3] * _b[3];
+  d[4] = _a[4] * _b[4];
+  d[5] = _a[5] * _b[5];
+  d[6] = _a[6] * _b[6];
+  d[7] = _a[7] * _b[7];
+
+  __m128i a = load_m128i(_a);
+  __m128i b = load_m128i(_b);
+  __m128i c = _mm_mullo_epi16(a, b);
+  return VALIDATE_INT16_M128(c, d);
+#else
   return TEST_UNIMPL;
-  // #endif // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }

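The mullo tests rely on the implicit wraparound when the widened product is stored back into a 16-bit (or 32-bit) lane, which matches the header's narrow-by-0 shift: both keep only the low half of the product. One worked lane (illustrative values only):

    int32_t p = (int32_t)0x4000 * 4; /* 0x10000: overflows 16 bits */
    int16_t lo = (int16_t)p;         /* 0: only the low 16 bits survive */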
 result_t test_mm_or_pd(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
@@ -8934,43 +8934,41 @@ result_t test_mm_maddubs_pi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
 }

 result_t test_mm_mulhrs_epi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1;
-  // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2;
-  //
-  // __m128i a = load_m128i(_a);
-  // __m128i b = load_m128i(_b);
-  // int32_t _c[8];
-  // for (int i = 0; i < 8; i++) {
-  // _c[i] = (((((int32_t)_a[i] * (int32_t)_b[i]) >> 14) + 1) & 0x1FFFE) >>
-  // 1;
-  // }
-  // __m128i c = _mm_mulhrs_epi16(a, b);
-  //
-  // return VALIDATE_INT16_M128(c, _c);
-  // #else
+#ifdef ENABLE_TEST_ALL
+  const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1;
+  const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2;
+
+  __m128i a = load_m128i(_a);
+  __m128i b = load_m128i(_b);
+  int32_t _c[8];
+  for (int i = 0; i < 8; i++) {
+    _c[i] = (((((int32_t)_a[i] * (int32_t)_b[i]) >> 14) + 1) & 0x1FFFE) >> 1;
+  }
+  __m128i c = _mm_mulhrs_epi16(a, b);
+
+  return VALIDATE_INT16_M128(c, _c);
+#else
   return TEST_UNIMPL;
-  // #endif // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }

 result_t test_mm_mulhrs_pi16(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  // const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1;
-  // const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2;
-  //
-  // __m64 a = load_m64(_a);
-  // __m64 b = load_m64(_b);
-  // int32_t _c[4];
-  // for (int i = 0; i < 4; i++) {
-  // _c[i] = (((((int32_t)_a[i] * (int32_t)_b[i]) >> 14) + 1) & 0x1FFFE) >>
-  // 1;
-  // }
-  // __m64 c = _mm_mulhrs_pi16(a, b);
-  //
-  // return VALIDATE_INT16_M64(c, _c);
-  // #else
+#ifdef ENABLE_TEST_ALL
+  const int16_t *_a = (const int16_t *)impl.test_cases_int_pointer1;
+  const int16_t *_b = (const int16_t *)impl.test_cases_int_pointer2;
+
+  __m64 a = load_m64(_a);
+  __m64 b = load_m64(_b);
+  int32_t _c[4];
+  for (int i = 0; i < 4; i++) {
+    _c[i] = (((((int32_t)_a[i] * (int32_t)_b[i]) >> 14) + 1) & 0x1FFFE) >> 1;
+  }
+  __m64 c = _mm_mulhrs_pi16(a, b);
+
+  return VALIDATE_INT16_M64(c, _c);
+#else
   return TEST_UNIMPL;
-  // #endif // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }

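The expected-value expression in both mulhrs tests computes round-to-nearest of (a * b) >> 15: shift the 32-bit product right by 14, add one rounding bit, then shift once more. On my reading, the & 0x1FFFE mask only clears bits that the final 16-bit truncation discards anyway, so an equivalent scalar model is (reference code, not part of the patch):

    #include <stdint.h>

    /* Scalar model of one _mm_mulhrs_epi16 lane. */
    static inline int16_t mulhrs16_ref(int16_t a, int16_t b) {
      int32_t p = (int32_t)a * (int32_t)b;
      return (int16_t)(((p >> 14) + 1) >> 1); /* add rounding bit, final shift */
    }

This rounding is what lets the header implementation finish with vnclip by 1 under __RISCV_VXRM_RNU (round-to-nearest-up).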
 result_t test_mm_shuffle_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
@@ -10241,39 +10239,39 @@ result_t test_mm_mpsadbw_epu8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
 }

 result_t test_mm_mul_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  // const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1;
-  // const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2;
-  //
-  // int64_t dx = (int64_t)(_a[0]) * (int64_t)(_b[0]);
-  // int64_t dy = (int64_t)(_a[2]) * (int64_t)(_b[2]);
-  //
-  // __m128i a = _mm_loadu_si128((const __m128i *)_a);
-  // __m128i b = _mm_loadu_si128((const __m128i *)_b);
-  // __m128i r = _mm_mul_epi32(a, b);
-  //
-  // return validate_int64(r, dx, dy);
-  // #else
+#ifdef ENABLE_TEST_ALL
+  const int32_t *_a = (const int32_t *)impl.test_cases_int_pointer1;
+  const int32_t *_b = (const int32_t *)impl.test_cases_int_pointer2;
+
+  int64_t dx = (int64_t)(_a[0]) * (int64_t)(_b[0]);
+  int64_t dy = (int64_t)(_a[2]) * (int64_t)(_b[2]);
+
+  __m128i a = _mm_loadu_si128((const __m128i *)_a);
+  __m128i b = _mm_loadu_si128((const __m128i *)_b);
+  __m128i r = _mm_mul_epi32(a, b);
+
+  return validate_int64(r, dx, dy);
+#else
   return TEST_UNIMPL;
-  // #endif // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }

 result_t test_mm_mullo_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
-  // #ifdef ENABLE_TEST_ALL
-  // const int32_t *_a = impl.test_cases_int_pointer1;
-  // const int32_t *_b = impl.test_cases_int_pointer2;
-  // int32_t d[4];
-  //
-  // for (int i = 0; i < 4; i++) {
-  // d[i] = (int32_t)((int64_t)_a[i] * (int64_t)_b[i]);
-  // }
-  // __m128i a = load_m128i(_a);
-  // __m128i b = load_m128i(_b);
-  // __m128i c = _mm_mullo_epi32(a, b);
-  // return VALIDATE_INT32_M128(c, d);
-  // #else
+#ifdef ENABLE_TEST_ALL
+  const int32_t *_a = impl.test_cases_int_pointer1;
+  const int32_t *_b = impl.test_cases_int_pointer2;
+  int32_t d[4];
+
+  for (int i = 0; i < 4; i++) {
+    d[i] = (int32_t)((int64_t)_a[i] * (int64_t)_b[i]);
+  }
+  __m128i a = load_m128i(_a);
+  __m128i b = load_m128i(_b);
+  __m128i c = _mm_mullo_epi32(a, b);
+  return VALIDATE_INT32_M128(c, d);
+#else
   return TEST_UNIMPL;
-  // #endif // ENABLE_TEST_ALL
+#endif // ENABLE_TEST_ALL
 }

 result_t test_mm_packus_epi32(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
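For context, a usage sketch of one of the newly enabled intrinsics (illustrative only; assumes a riscv64 toolchain with the V extension and that sse2rvv.h already provides _mm_loadu_si128/_mm_storeu_si128, which the tests above also rely on):

    #include "sse2rvv.h"
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      int32_t a[4] = {1, -2, 3, -4}, b[4] = {10, 20, 30, 40}, r[4];
      __m128i va = _mm_loadu_si128((const __m128i *)a);
      __m128i vb = _mm_loadu_si128((const __m128i *)b);
      _mm_storeu_si128((__m128i *)r, _mm_mullo_epi32(va, vb));
      printf("%d %d %d %d\n", r[0], r[1], r[2], r[3]); /* 10 -40 90 -160 */
      return 0;
    }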