Skip to content

Commit

Permalink
feat: Add _mm_alignr_[epi8|pi8]
Browse files Browse the repository at this point in the history
  • Loading branch information
howjmay committed Jan 26, 2024
1 parent 0fc091b commit c97e361
Show file tree
Hide file tree
Showing 2 changed files with 97 additions and 92 deletions.
15 changes: 13 additions & 2 deletions sse2rvv.h
Original file line number Diff line number Diff line change
Expand Up @@ -383,9 +383,20 @@ FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b) {
return vreinterpretq_f32_m128(__riscv_vmerge_vvm_f32m1(sub, add, mask, 4));
}

// FORCE_INLINE __m128i _mm_alignr_epi8 (__m128i a, __m128i b, int imm8) {}
// Byte-wise "align right": treats b as the low 16 bytes and a as the high
// 16 bytes of a 32-byte temporary, slides that temporary down by imm8 byte
// lanes and returns its low 16 bytes.
//
// a     supplies lanes 16..31 of the temporary.
// b     supplies lanes 0..15 of the temporary.
// imm8  number of byte lanes to slide down by.
//
// NOTE(review): the hard-coded vl of 32 for an LMUL=2 byte vector presumably
// assumes 128-bit vector registers -- confirm before using on wider VLEN.
FORCE_INLINE __m128i _mm_alignr_epi8(__m128i a, __m128i b, int imm8) {
  // Widen both operands to an LMUL=2 group so a single value can carry all
  // 32 bytes of the concatenation.
  vuint8m2_t low_src = __riscv_vlmul_ext_v_u8m1_u8m2(vreinterpretq_m128i_u8(b));
  vuint8m2_t high_src = __riscv_vlmul_ext_v_u8m1_u8m2(vreinterpretq_m128i_u8(a));

  // Lanes below 16 come from b (the destination operand); a's lanes are
  // written starting at lane 16.
  vuint8m2_t joined = __riscv_vslideup_vx_u8m2(low_src, high_src, 16, 32);

  // Bring lane imm8 down to lane 0, then keep only the low LMUL=1 part.
  vuint8m2_t shifted = __riscv_vslidedown_vx_u8m2(joined, imm8, 32);
  vuint8m1_t low_part = __riscv_vlmul_trunc_v_u8m2_u8m1(shifted);
  return vreinterpretq_u8_m128i(low_part);
}

// FORCE_INLINE __m64 _mm_alignr_pi8 (__m64 a, __m64 b, int imm8) {}
// 64-bit variant of byte-wise "align right": treats b as the low 8 bytes and
// a as the high 8 bytes of a 16-byte temporary, then slides that temporary
// down by imm8 byte lanes.
//
// a     supplies lanes 8..15 of the temporary.
// b     supplies lanes 0..7 of the temporary.
// imm8  number of byte lanes to slide down by.
//
// NOTE(review): all 16 lanes of the slid vector are returned as-is; callers
// presumably consume only the low 8 bytes of an __m64 -- confirm.
FORCE_INLINE __m64 _mm_alignr_pi8(__m64 a, __m64 b, int imm8) {
  vuint8m1_t low_src = vreinterpretq_m64_u8(b);
  vuint8m1_t high_src = vreinterpretq_m64_u8(a);

  // Lanes below 8 come from b (the destination operand); a's lanes are
  // written starting at lane 8.
  vuint8m1_t joined = __riscv_vslideup_vx_u8m1(low_src, high_src, 8, 16);

  // Bring lane imm8 down to lane 0.
  vuint8m1_t shifted = __riscv_vslidedown_vx_u8m1(joined, imm8, 16);
  return vreinterpretq_u8_m64(shifted);
}

FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) {
vint64m1_t _a = vreinterpretq_m128d_i64(a);
Expand Down
174 changes: 84 additions & 90 deletions tests/impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8475,102 +8475,96 @@ result_t test_mm_abs_pi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
}

// Checks _mm_alignr_epi8 against a byte-wise reference.
//
// impl  supplies the two input buffers (test_cases_int_pointer1 is used as
//       `a`, test_cases_int_pointer2 as `b`).
// iter  selects the shift amount: (iter % 5) * 8, i.e. 0, 8, 16, 24 or 32.
//
// Returns TEST_UNIMPL when built with clang; otherwise the result of
// VALIDATE_UINT8_M128 comparing the intrinsic's output with the reference.
result_t test_mm_alignr_epi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
#if defined(__clang__)
  // NOTE(review): the test is compiled out under clang; the reason is not
  // visible in this file section -- confirm before enabling.
  return TEST_UNIMPL;
#else
  const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1;
  const uint8_t *_b = (const uint8_t *)impl.test_cases_int_pointer2;
  unsigned int shift = (iter % 5) << 3;
  uint8_t d[32];

  if (shift >= 32) {
    // Everything is shifted out: the expected result is all zeros.
    memset((void *)d, 0, sizeof(d));
  } else {
    // Reference temporary: b in bytes 0..15, a in bytes 16..31.
    memcpy((void *)d, (const void *)_b, 16);
    memcpy((void *)(d + 16), (const void *)_a, 16);
    // Shift down in place. Reads are always at a higher index than the
    // write, so no byte is consumed after it has been overwritten.
    for (size_t x = 0; x < sizeof(d); x++) {
      if (x + shift >= sizeof(d))
        d[x] = 0;
      else
        d[x] = d[x + shift];
    }
  }

  __m128i a = load_m128i(_a);
  __m128i b = load_m128i(_b);
  __m128i ret;
  // Each shift amount is passed as a literal, one case per value.
  switch (iter % 5) {
  case 0:
    ret = _mm_alignr_epi8(a, b, 0);
    break;
  case 1:
    ret = _mm_alignr_epi8(a, b, 8);
    break;
  case 2:
    ret = _mm_alignr_epi8(a, b, 16);
    break;
  case 3:
    ret = _mm_alignr_epi8(a, b, 24);
    break;
  case 4:
  default:
    // iter % 5 is always 0..4; the default label only guarantees that
    // `ret` is assigned on every path.
    ret = _mm_alignr_epi8(a, b, 32);
    break;
  }

  return VALIDATE_UINT8_M128(ret, d);
#endif
}

result_t test_mm_alignr_pi8(const SSE2RVV_TEST_IMPL &impl, uint32_t iter) {
// #ifdef ENABLE_TEST_ALL
// #if defined(__clang__)
// #else
return TEST_UNIMPL;
// #endif // ENABLE_TEST_ALL
// #else
// const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1;
// const uint8_t *_b = (const uint8_t *)impl.test_cases_int_pointer2;
// unsigned int shift = (iter % 3) << 3;
// uint8_t d[16];
//
// if (shift >= 16) {
// memset((void *)d, 0, sizeof(d));
// } else {
// memcpy((void *)d, (const void *)_b, 8);
// memcpy((void *)(d + 8), (const void *)_a, 8);
// // shifting
// for (size_t x = 0; x < sizeof(d); x++) {
// if (x + shift >= sizeof(d))
// d[x] = 0;
// else
// d[x] = d[x + shift];
// }
// }
//
// __m64 a = load_m64(_a);
// __m64 b = load_m64(_b);
// __m64 ret;
// switch (iter % 3) {
// case 0:
// ret = _mm_alignr_pi8(a, b, 0);
// break;
// case 1:
// ret = _mm_alignr_pi8(a, b, 8);
// break;
// case 2:
// ret = _mm_alignr_pi8(a, b, 16);
// break;
// }
//
// return VALIDATE_UINT8_M64(ret, d);
// #endif
// #ifdef ENABLE_TEST_ALL
#if defined(__clang__)
#else
const uint8_t *_a = (const uint8_t *)impl.test_cases_int_pointer1;
const uint8_t *_b = (const uint8_t *)impl.test_cases_int_pointer2;
unsigned int shift = (iter % 3) << 3;
uint8_t d[16];

if (shift >= 16) {
memset((void *)d, 0, sizeof(d));
} else {
memcpy((void *)d, (const void *)_b, 8);
memcpy((void *)(d + 8), (const void *)_a, 8);
// shifting
for (size_t x = 0; x < sizeof(d); x++) {
if (x + shift >= sizeof(d))
d[x] = 0;
else
d[x] = d[x + shift];
}
}

__m64 a = load_m64(_a);
__m64 b = load_m64(_b);
__m64 ret;
switch (iter % 3) {
case 0:
ret = _mm_alignr_pi8(a, b, 0);
break;
case 1:
ret = _mm_alignr_pi8(a, b, 8);
break;
case 2:
ret = _mm_alignr_pi8(a, b, 16);
break;
}

return VALIDATE_UINT8_M64(ret, d);
#endif
// #else
return TEST_UNIMPL;
// #endif // ENABLE_TEST_ALL
Expand Down

0 comments on commit c97e361

Please sign in to comment.