From 5853cd39d22506fbede37ac282936dd774be7f0b Mon Sep 17 00:00:00 2001 From: Yoan Picchi Date: Tue, 5 Nov 2024 15:59:19 +0000 Subject: [PATCH 1/2] Add double shufti to the benchmark Signed-off-by: Yoan Picchi --- benchmarks/benchmarks.cpp | 25 +++++++++++++++++++++++++ benchmarks/benchmarks.hpp | 1 + 2 files changed, 26 insertions(+) diff --git a/benchmarks/benchmarks.cpp b/benchmarks/benchmarks.cpp index 814029944..4d251bbcf 100644 --- a/benchmarks/benchmarks.cpp +++ b/benchmarks/benchmarks.cpp @@ -175,6 +175,31 @@ int main(){ }); } + for (size_t i = 0; i < std::size(sizes); i++) { + MicroBenchmark bench("Double Shufti", sizes[i]); + run_benchmarks( + sizes[i], MAX_LOOPS / sizes[i], matches[m], false, bench, + [&](MicroBenchmark &b) { + ue2::shuftiBuildMasks(b.chars, reinterpret_cast(&b.lo), reinterpret_cast(&b.hi)); + b.chars.clear(); + ue2::flat_set> pattern; + pattern.insert({'a', 'b'}); + ue2::shuftiBuildDoubleMasks(b.chars, pattern, + reinterpret_cast(&b.truffle_mask_lo), + reinterpret_cast(&b.truffle_mask_hi), + reinterpret_cast(&b.double_shufti_lo2), + reinterpret_cast(&b.double_shufti_hi2)); + memset(b.buf.data(), 'b', b.size); + }, + [&](MicroBenchmark &b) { + return shuftiDoubleExec(b.truffle_mask_lo, + b.truffle_mask_hi, + b.double_shufti_lo2, + b.double_shufti_hi2, + b.buf.data(), b.buf.data() + b.size); + }); + } + for (size_t i = 0; i < std::size(sizes); i++) { MicroBenchmark bench("Truffle", sizes[i]); run_benchmarks( diff --git a/benchmarks/benchmarks.hpp b/benchmarks/benchmarks.hpp index f96a5b0b4..8801d93a3 100644 --- a/benchmarks/benchmarks.hpp +++ b/benchmarks/benchmarks.hpp @@ -61,6 +61,7 @@ class MicroBenchmark { #endif }; }; + m128 double_shufti_lo2, double_shufti_hi2; MicroBenchmark(char const *label_, size_t size_) : label(label_), size(size_), buf(size_){}; From 76949c215ce0f7c165abd6b0c5c14f42d7856f6b Mon Sep 17 00:00:00 2001 From: Yoan Picchi Date: Mon, 28 Oct 2024 18:05:18 +0000 Subject: [PATCH 2/2] Fix shufti false positive on vector 
edge If we look for a pattern like "ab" and the letter 'a' falls at the end of the vector, then a positive match was reported, regardless of the second letter. This patch fixes this false positive, but slows shufti down by 16%. Signed-off-by: Yoan Picchi --- src/nfa/arm/shufti.hpp | 14 +++++++++----- src/nfa/ppc64el/shufti.hpp | 14 +++++++++----- src/nfa/shufti_simd.hpp | 28 +++++++++++++++++----------- src/nfa/shufti_sve.hpp | 29 ++++++++++++++++++----------- src/nfa/x86/shufti.hpp | 14 +++++++++----- 5 files changed, 62 insertions(+), 37 deletions(-) diff --git a/src/nfa/arm/shufti.hpp b/src/nfa/arm/shufti.hpp index e710fd16a..b5190d5c3 100644 --- a/src/nfa/arm/shufti.hpp +++ b/src/nfa/arm/shufti.hpp @@ -46,7 +46,7 @@ const SuperVector blockSingleMask(SuperVector mask_lo, SuperVector mask template static really_inline -SuperVector blockDoubleMask(SuperVector mask1_lo, SuperVector mask1_hi, SuperVector mask2_lo, SuperVector mask2_hi, SuperVector chars) { +SuperVector blockDoubleMask(SuperVector mask1_lo, SuperVector mask1_hi, SuperVector mask2_lo, SuperVector mask2_hi, SuperVector chars, SuperVector offset_char) { const SuperVector low4bits = SuperVector::dup_u8(0xf); SuperVector chars_lo = chars & low4bits; @@ -60,14 +60,18 @@ SuperVector blockDoubleMask(SuperVector mask1_lo, SuperVector mask1_hi, SuperVector t1 = c1_lo | c1_hi; t1.print8("t1"); - SuperVector c2_lo = mask2_lo.template pshufb(chars_lo); + SuperVector chars_lo2 = offset_char & low4bits; + chars_lo.print8("chars_lo2"); + SuperVector chars_hi2 = offset_char.template vshr_64_imm<4>() & low4bits; + chars_hi.print8("chars_hi2"); + + SuperVector c2_lo = mask2_lo.template pshufb(chars_lo2); c2_lo.print8("c2_lo"); - SuperVector c2_hi = mask2_hi.template pshufb(chars_hi); + SuperVector c2_hi = mask2_hi.template pshufb(chars_hi2); c2_hi.print8("c2_hi"); SuperVector t2 = c2_lo | c2_hi; t2.print8("t2"); - t2.template vshr_128_imm<1>().print8("t2.vshr_128(1)"); - SuperVector t = t1 | (t2.template 
vshr_128_imm<1>()); + SuperVector t = t1 | t2; t.print8("t"); return !t.eq(SuperVector::Ones()); diff --git a/src/nfa/ppc64el/shufti.hpp b/src/nfa/ppc64el/shufti.hpp index dedeb52de..0208d29d9 100644 --- a/src/nfa/ppc64el/shufti.hpp +++ b/src/nfa/ppc64el/shufti.hpp @@ -48,7 +48,7 @@ const SuperVector blockSingleMask(SuperVector mask_lo, SuperVector mask template static really_inline -SuperVector blockDoubleMask(SuperVector mask1_lo, SuperVector mask1_hi, SuperVector mask2_lo, SuperVector mask2_hi, SuperVector chars) { +SuperVector blockDoubleMask(SuperVector mask1_lo, SuperVector mask1_hi, SuperVector mask2_lo, SuperVector mask2_hi, SuperVector chars, SuperVector offset_char) { const SuperVector low4bits = SuperVector::dup_u8(0xf); SuperVector chars_lo = chars & low4bits; @@ -62,14 +62,18 @@ SuperVector blockDoubleMask(SuperVector mask1_lo, SuperVector mask1_hi, SuperVector t1 = c1_lo | c1_hi; t1.print8("t1"); - SuperVector c2_lo = mask2_lo.template pshufb(chars_lo); + SuperVector chars_lo2 = offset_char & low4bits; + chars_lo.print8("chars_lo2"); + SuperVector chars_hi2 = offset_char.template vshr_64_imm<4>() & low4bits; + chars_hi.print8("chars_hi2"); + + SuperVector c2_lo = mask2_lo.template pshufb(chars_lo2); c2_lo.print8("c2_lo"); - SuperVector c2_hi = mask2_hi.template pshufb(chars_hi); + SuperVector c2_hi = mask2_hi.template pshufb(chars_hi2); c2_hi.print8("c2_hi"); SuperVector t2 = c2_lo | c2_hi; t2.print8("t2"); - t2.template vshr_128_imm<1>().print8("t2.vshr_128(1)"); - SuperVector t = t1 | (t2.template vshr_128_imm<1>()); + SuperVector t = t1 | t2; t.print8("t"); return t.eq(SuperVector::Ones()); diff --git a/src/nfa/shufti_simd.hpp b/src/nfa/shufti_simd.hpp index bdb0ff9fe..bb1a2d924 100644 --- a/src/nfa/shufti_simd.hpp +++ b/src/nfa/shufti_simd.hpp @@ -50,7 +50,7 @@ static really_inline const SuperVector blockSingleMask(SuperVector mask_lo, SuperVector mask_hi, SuperVector chars); template static really_inline -SuperVector blockDoubleMask(SuperVector 
mask1_lo, SuperVector mask1_hi, SuperVector mask2_lo, SuperVector mask2_hi, SuperVector chars); +SuperVector blockDoubleMask(SuperVector mask1_lo, SuperVector mask1_hi, SuperVector mask2_lo, SuperVector mask2_hi, SuperVector chars, SuperVector offset_chars); #if defined(VS_SIMDE_BACKEND) #include "x86/shufti.hpp" @@ -82,9 +82,9 @@ const u8 *revBlock(SuperVector mask_lo, SuperVector mask_hi, SuperVector static really_inline -const u8 *fwdBlockDouble(SuperVector mask1_lo, SuperVector mask1_hi, SuperVector mask2_lo, SuperVector mask2_hi, SuperVector chars, const u8 *buf) { +const u8 *fwdBlockDouble(SuperVector mask1_lo, SuperVector mask1_hi, SuperVector mask2_lo, SuperVector mask2_hi, SuperVector chars, SuperVector offset_chars, const u8 *buf) { - SuperVector mask = blockDoubleMask(mask1_lo, mask1_hi, mask2_lo, mask2_hi, chars); + SuperVector mask = blockDoubleMask(mask1_lo, mask1_hi, mask2_lo, mask2_hi, chars, offset_chars); return first_zero_match_inverted(buf, mask); } @@ -204,6 +204,8 @@ const u8 *shuftiDoubleExecReal(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128 DEBUG_PRINTF("shufti %p len %zu\n", buf, buf_end - buf); DEBUG_PRINTF("b %s\n", buf); + const u8 *buf_one_off_end = buf_end - 1; + const SuperVector wide_mask1_lo(mask1_lo); const SuperVector wide_mask1_hi(mask1_hi); const SuperVector wide_mask2_lo(mask2_lo); @@ -217,24 +219,26 @@ const u8 *shuftiDoubleExecReal(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128 __builtin_prefetch(d + 3*64); __builtin_prefetch(d + 4*64); DEBUG_PRINTF("start %p end %p \n", d, buf_end); - assert(d < buf_end); - if (d + S <= buf_end) { + assert(d < buf_one_off_end); + if (d + S <= buf_one_off_end) { // peel off first part to cacheline boundary DEBUG_PRINTF("until aligned %p \n", ROUNDUP_PTR(d, S)); if (!ISALIGNED_N(d, S)) { SuperVector chars = SuperVector::loadu(d); - rv = fwdBlockDouble(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, chars, d); + SuperVector offset_char = SuperVector::loadu(d + 1); + rv = 
fwdBlockDouble(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, chars, offset_char, d); DEBUG_PRINTF("rv %p \n", rv); if (rv) return rv; d = ROUNDUP_PTR(d, S); } - while(d + S <= buf_end) { + while(d + S <= buf_one_off_end) { __builtin_prefetch(d + 64); DEBUG_PRINTF("d %p \n", d); SuperVector chars = SuperVector::load(d); - rv = fwdBlockDouble(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, chars, d); + SuperVector offset_char = SuperVector::loadu(d + 1); + rv = fwdBlockDouble(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, chars, offset_char, d); if (rv) return rv; d += S; } @@ -243,17 +247,19 @@ const u8 *shuftiDoubleExecReal(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128 DEBUG_PRINTF("tail d %p e %p \n", d, buf_end); // finish off tail - if (d != buf_end) { + if (d < buf_one_off_end) { SuperVector chars = SuperVector::Zeroes(); + SuperVector offset_char = SuperVector::Zeroes(); const u8 *end_buf; if (buf_end - buf < S) { memcpy(&chars.u, buf, buf_end - buf); end_buf = buf; } else { - chars = SuperVector::loadu(buf_end - S); + chars = SuperVector::loadu(buf_one_off_end - S); + offset_char = SuperVector::loadu(buf_end - S); end_buf = buf_end - S; } - rv = fwdBlockDouble(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, chars, end_buf); + rv = fwdBlockDouble(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, chars, offset_char, end_buf); DEBUG_PRINTF("rv %p \n", rv); if (rv && rv < buf_end) return rv; } diff --git a/src/nfa/shufti_sve.hpp b/src/nfa/shufti_sve.hpp index 76f1e7adb..c3ea12841 100644 --- a/src/nfa/shufti_sve.hpp +++ b/src/nfa/shufti_sve.hpp @@ -155,31 +155,36 @@ svbool_t doubleMatched(svuint8_t mask1_lo, svuint8_t mask1_hi, svuint8_t mask2_lo, svuint8_t mask2_hi, const u8 *buf, const svbool_t pg) { svuint8_t vec = svld1_u8(pg, buf); + svuint8_t vec2 = svld1_u8(pg, buf + 1); svuint8_t chars_lo = svand_x(svptrue_b8(), vec, (uint8_t)0xf); svuint8_t chars_hi = svlsr_x(svptrue_b8(), vec, 4); + 
svuint8_t chars_lo2 = svand_x(svptrue_b8(), vec2, (uint8_t)0xf); + svuint8_t chars_hi2 = svlsr_x(svptrue_b8(), vec2, 4); svuint8_t c1_lo = svtbl(mask1_lo, chars_lo); svuint8_t c1_hi = svtbl(mask1_hi, chars_hi); svuint8_t t1 = svorr_x(svptrue_b8(), c1_lo, c1_hi); - svuint8_t c2_lo = svtbl(mask2_lo, chars_lo); - svuint8_t c2_hi = svtbl(mask2_hi, chars_hi); - svuint8_t t2 = svext(svorr_z(pg, c2_lo, c2_hi), svdup_u8(0), 1); + svuint8_t c2_lo = svtbl(mask2_lo, chars_lo2); + svuint8_t c2_hi = svtbl(mask2_hi, chars_hi2); + svuint8_t t2 = svorr_x(svptrue_b8(), c2_lo, c2_hi); svuint8_t t = svorr_x(svptrue_b8(), t1, t2); - return svnot_z(svptrue_b8(), svcmpeq(svptrue_b8(), t, (uint8_t)0xff)); + return svnot_z(pg, svcmpeq(svptrue_b8(), t, (uint8_t)0xff)); } static really_inline const u8 *dshuftiOnce(svuint8_t mask1_lo, svuint8_t mask1_hi, svuint8_t mask2_lo, svuint8_t mask2_hi, const u8 *buf, const u8 *buf_end) { + const u8 *buf_one_off_end = buf_end - 1; + DEBUG_PRINTF("start %p end %p\n", buf, buf_end); - assert(buf < buf_end); + assert(buf < buf_one_off_end); DEBUG_PRINTF("l = %td\n", buf_end - buf); - svbool_t pg = svwhilelt_b8_s64(0, buf_end - buf); + svbool_t pg = svwhilelt_b8_s64(0, buf_one_off_end - buf); svbool_t matched = doubleMatched(mask1_lo, mask1_hi, mask2_lo, mask2_hi, buf, pg); return accelSearchCheckMatched(buf, matched); @@ -199,9 +204,11 @@ static really_inline const u8 *dshuftiSearch(svuint8_t mask1_lo, svuint8_t mask1_hi, svuint8_t mask2_lo, svuint8_t mask2_hi, const u8 *buf, const u8 *buf_end) { - assert(buf < buf_end); + const u8 *buf_one_off_end = buf_end - 1; + + assert(buf < buf_one_off_end); size_t len = buf_end - buf; - if (len <= svcntb()) { + if (len <= svcntb() + 1) { return dshuftiOnce(mask1_lo, mask1_hi, mask2_lo, mask2_hi, buf, buf_end); } @@ -214,7 +221,7 @@ const u8 *dshuftiSearch(svuint8_t mask1_lo, svuint8_t mask1_hi, if (ptr) return ptr; } buf = aligned_buf; - size_t loops = (buf_end - buf) / svcntb(); + size_t loops = (buf_one_off_end 
- buf) / svcntb(); DEBUG_PRINTF("loops %zu \n", loops); for (size_t i = 0; i < loops; i++, buf += svcntb()) { const u8 *ptr = dshuftiLoopBody(mask1_lo, mask1_hi, @@ -222,9 +229,9 @@ const u8 *dshuftiSearch(svuint8_t mask1_lo, svuint8_t mask1_hi, if (ptr) return ptr; } DEBUG_PRINTF("buf %p buf_end %p \n", buf, buf_end); - return buf == buf_end ? NULL : dshuftiLoopBody(mask1_lo, mask1_hi, + return buf == buf_one_off_end ? NULL : dshuftiLoopBody(mask1_lo, mask1_hi, mask2_lo, mask2_hi, - buf_end - svcntb()); + buf_one_off_end - svcntb()); } const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, diff --git a/src/nfa/x86/shufti.hpp b/src/nfa/x86/shufti.hpp index 6fb34b2f2..8e743b782 100644 --- a/src/nfa/x86/shufti.hpp +++ b/src/nfa/x86/shufti.hpp @@ -46,7 +46,7 @@ const SuperVector blockSingleMask(SuperVector mask_lo, SuperVector mask template static really_inline -SuperVector blockDoubleMask(SuperVector mask1_lo, SuperVector mask1_hi, SuperVector mask2_lo, SuperVector mask2_hi, SuperVector chars) { +SuperVector blockDoubleMask(SuperVector mask1_lo, SuperVector mask1_hi, SuperVector mask2_lo, SuperVector mask2_hi, SuperVector chars, SuperVector offset_char) { const SuperVector low4bits = SuperVector::dup_u8(0xf); SuperVector chars_lo = chars & low4bits; @@ -60,14 +60,18 @@ SuperVector blockDoubleMask(SuperVector mask1_lo, SuperVector mask1_hi, SuperVector c1 = c1_lo | c1_hi; c1.print8("c1"); - SuperVector c2_lo = mask2_lo.pshufb(chars_lo); + SuperVector chars_lo2 = offset_char & low4bits; + chars_lo.print8("chars_lo2"); + SuperVector chars_hi2 = offset_char.template vshr_64_imm<4>() & low4bits; + chars_hi.print8("chars_hi2"); + + SuperVector c2_lo = mask2_lo.pshufb(chars_lo2); c2_lo.print8("c2_lo"); - SuperVector c2_hi = mask2_hi.pshufb(chars_hi); + SuperVector c2_hi = mask2_hi.pshufb(chars_hi2); c2_hi.print8("c2_hi"); SuperVector c2 = c2_lo | c2_hi; c2.print8("c2"); - c2.template vshr_128_imm<1>().print8("c2.vshr_128(1)"); - SuperVector c = c1 | (c2.template 
vshr_128_imm<1>()); + SuperVector c = c1 | c2; c.print8("c"); return c.eq(SuperVector::Ones());