diff --git a/src/nfa/arm/shufti.hpp b/src/nfa/arm/shufti.hpp
index e710fd16..97931f4d 100644
--- a/src/nfa/arm/shufti.hpp
+++ b/src/nfa/arm/shufti.hpp
@@ -46,7 +46,7 @@ const SuperVector<S> blockSingleMask(SuperVector<S> mask_lo, SuperVector<S> mask
 
 template <uint16_t S>
 static really_inline
-SuperVector<S> blockDoubleMask(SuperVector<S> mask1_lo, SuperVector<S> mask1_hi, SuperVector<S> mask2_lo, SuperVector<S> mask2_hi, SuperVector<S> chars) {
+SuperVector<S> blockDoubleMask(SuperVector<S> mask1_lo, SuperVector<S> mask1_hi, SuperVector<S> mask2_lo, SuperVector<S> mask2_hi, SuperVector<S> *inout_t1, SuperVector<S> chars) {
 
     const SuperVector<S> low4bits = SuperVector<S>::dup_u8(0xf);
     SuperVector<S> chars_lo = chars & low4bits;
@@ -57,18 +57,25 @@ SuperVector<S> blockDoubleMask(SuperVector<S> mask1_lo, SuperVector<S> mask1_hi,
     c1_lo.print8("c1_lo");
     SuperVector<S> c1_hi = mask1_hi.template pshufb(chars_hi);
     c1_hi.print8("c1_hi");
-    SuperVector<S> t1 = c1_lo | c1_hi;
-    t1.print8("t1");
+    SuperVector<S> new_t1 = c1_lo | c1_hi;
+    // t1 is the match mask for the first char of the patterns
+    new_t1.print8("t1");
 
     SuperVector<S> c2_lo = mask2_lo.template pshufb(chars_lo);
     c2_lo.print8("c2_lo");
     SuperVector<S> c2_hi = mask2_hi.template pshufb(chars_hi);
     c2_hi.print8("c2_hi");
     SuperVector<S> t2 = c2_lo | c2_hi;
+    // t2 is the match mask for the second char of the patterns
     t2.print8("t2");
-    t2.template vshr_128_imm<1>().print8("t2.vshr_128(1)");
-    SuperVector<S> t = t1 | (t2.template vshr_128_imm<1>());
+
+    // Offset t1 so that it aligns with t2. The hole created by the offset is
+    // filled with the last elements of the previous t1, so no information is
+    // lost. Zero bits lining up indicate a match.
+    SuperVector<S> t = (new_t1.alignr(*inout_t1, S-1)) | t2;
     t.print8("t");
 
+    *inout_t1 = new_t1;
+
     return !t.eq(SuperVector<S>::Ones());
 }
\ No newline at end of file
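The combine above is the core of the patch: instead of shifting t2 back by one inside each block (which loses a pair that straddles a block boundary), t1 is shifted forward through `alignr` and the vacated lane is filled from the previous block's t1. The standalone sketch below models that combine with plain byte arrays under the inverted-mask convention the code uses (0 bits mark matches); the names, the fixed width `N`, and the helper are illustrative only, not part of the patch:

```cpp
#include <cassert>
#include <cstdint>

// Scalar model of the carried double-shufti combine, one "vector" = N bytes.
// 0x00 marks a match lane, 0xff a non-match lane (inverted convention).
// prev_t1_last carries the last first-char lane of the previous block, so a
// pair straddling two blocks is still seen.
constexpr int N = 16;

static void combine_block(const uint8_t t1[N], const uint8_t t2[N],
                          uint8_t *prev_t1_last, uint8_t t[N]) {
    for (int i = 0; i < N; i++) {
        // t1 shifted forward by one lane, the hole filled from the carry
        uint8_t t1_prev_lane = (i == 0) ? *prev_t1_last : t1[i - 1];
        t[i] = t1_prev_lane | t2[i];  // 0x00 only if both lanes matched
    }
    *prev_t1_last = t1[N - 1];
}

int main() {
    // pattern "ab": t1 flags 'a' positions, t2 flags 'b' positions
    const char buf[N + 1] = "xxxaby..........";
    uint8_t t1[N], t2[N], t[N];
    for (int i = 0; i < N; i++) {
        t1[i] = buf[i] == 'a' ? 0x00 : 0xff;
        t2[i] = buf[i] == 'b' ? 0x00 : 0xff;
    }
    uint8_t carry = 0xff;  // like SuperVector<S>::Ones(): no prior match
    combine_block(t1, t2, &carry, t);
    assert(t[4] == 0x00);  // flagged on the second char, the 'b' at index 4
    return 0;
}
```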
diff --git a/src/nfa/ppc64el/shufti.hpp b/src/nfa/ppc64el/shufti.hpp
index dedeb52d..580dbe40 100644
--- a/src/nfa/ppc64el/shufti.hpp
+++ b/src/nfa/ppc64el/shufti.hpp
@@ -48,7 +48,7 @@ const SuperVector<S> blockSingleMask(SuperVector<S> mask_lo, SuperVector<S> mask
 
 template <uint16_t S>
 static really_inline
-SuperVector<S> blockDoubleMask(SuperVector<S> mask1_lo, SuperVector<S> mask1_hi, SuperVector<S> mask2_lo, SuperVector<S> mask2_hi, SuperVector<S> chars) {
+SuperVector<S> blockDoubleMask(SuperVector<S> mask1_lo, SuperVector<S> mask1_hi, SuperVector<S> mask2_lo, SuperVector<S> mask2_hi, SuperVector<S> *inout_t1, SuperVector<S> chars) {
 
     const SuperVector<S> low4bits = SuperVector<S>::dup_u8(0xf);
     SuperVector<S> chars_lo = chars & low4bits;
@@ -59,18 +59,25 @@ SuperVector<S> blockDoubleMask(SuperVector<S> mask1_lo, SuperVector<S> mask1_hi,
     c1_lo.print8("c1_lo");
     SuperVector<S> c1_hi = mask1_hi.template pshufb(chars_hi);
     c1_hi.print8("c1_hi");
-    SuperVector<S> t1 = c1_lo | c1_hi;
-    t1.print8("t1");
+    SuperVector<S> new_t1 = c1_lo | c1_hi;
+    // t1 is the match mask for the first char of the patterns
+    new_t1.print8("t1");
 
     SuperVector<S> c2_lo = mask2_lo.template pshufb(chars_lo);
     c2_lo.print8("c2_lo");
     SuperVector<S> c2_hi = mask2_hi.template pshufb(chars_hi);
     c2_hi.print8("c2_hi");
     SuperVector<S> t2 = c2_lo | c2_hi;
+    // t2 is the match mask for the second char of the patterns
     t2.print8("t2");
-    t2.template vshr_128_imm<1>().print8("t2.vshr_128(1)");
-    SuperVector<S> t = t1 | (t2.template vshr_128_imm<1>());
+
+    // Offset t1 so that it aligns with t2. The hole created by the offset is
+    // filled with the last elements of the previous t1, so no information is
+    // lost. Zero bits lining up indicate a match.
+    SuperVector<S> t = (new_t1.alignr(*inout_t1, S-1)) | t2;
     t.print8("t");
 
+    *inout_t1 = new_t1;
+
     return t.eq(SuperVector<S>::Ones());
 }
diff --git a/src/nfa/shufti_simd.hpp b/src/nfa/shufti_simd.hpp
index 1a00b87b..f03f6354 100644
--- a/src/nfa/shufti_simd.hpp
+++ b/src/nfa/shufti_simd.hpp
@@ -50,7 +50,7 @@ static really_inline
 const SuperVector<S> blockSingleMask(SuperVector<S> mask_lo, SuperVector<S> mask_hi, SuperVector<S> chars);
 template <uint16_t S>
 static really_inline
-SuperVector<S> blockDoubleMask(SuperVector<S> mask1_lo, SuperVector<S> mask1_hi, SuperVector<S> mask2_lo, SuperVector<S> mask2_hi, SuperVector<S> chars);
+SuperVector<S> blockDoubleMask(SuperVector<S> mask1_lo, SuperVector<S> mask1_hi, SuperVector<S> mask2_lo, SuperVector<S> mask2_hi, SuperVector<S> *inout_first_char_mask, SuperVector<S> chars);
 
 #if defined(VS_SIMDE_BACKEND)
 #include "x86/shufti.hpp"
@@ -82,11 +82,13 @@ const u8 *revBlock(SuperVector<S> mask_lo, SuperVector<S> mask_hi, SuperVector<S>
 
 template <uint16_t S>
 static really_inline
-const u8 *fwdBlockDouble(SuperVector<S> mask1_lo, SuperVector<S> mask1_hi, SuperVector<S> mask2_lo, SuperVector<S> mask2_hi, SuperVector<S> chars, const u8 *buf) {
+const u8 *fwdBlockDouble(SuperVector<S> mask1_lo, SuperVector<S> mask1_hi, SuperVector<S> mask2_lo, SuperVector<S> mask2_hi, SuperVector<S> *prev_first_char_mask, SuperVector<S> chars, const u8 *buf) {
 
-    SuperVector<S> mask = blockDoubleMask(mask1_lo, mask1_hi, mask2_lo, mask2_hi, chars);
+    SuperVector<S> mask = blockDoubleMask(mask1_lo, mask1_hi, mask2_lo, mask2_hi, prev_first_char_mask, chars);
 
-    return first_zero_match_inverted<S>(buf, mask);
+    // By shifting first_char_mask instead of the legacy t2 mask, we would
+    // report on the second char instead of the first. We offset buf to
+    // compensate.
+    return first_zero_match_inverted<S>(buf - 1, mask);
 }
 
 template <uint16_t S>
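The `buf - 1` rebase above can be sanity-checked with a toy model: the combined mask now flags the lane of the *second* char of the pair, so rebasing the pointer one byte back restores the old contract of returning a pointer to the first char. The helper below is hypothetical, not the library's API; the pointer arithmetic mirrors the patch's rebase and the returned pointer is never dereferenced out of range:

```cpp
#include <cassert>
#include <cstdint>

// Toy model: position of the first zero lane in an inverted match mask,
// reported relative to a caller-supplied base pointer.
static const uint8_t *first_zero_lane(const uint8_t *base, const uint8_t *mask,
                                      int n) {
    for (int i = 0; i < n; i++) {
        if (mask[i] != 0xff) return base + i;
    }
    return nullptr;
}

int main() {
    const char buf[] = "xxab";
    // combined mask for pair "ab": the zero lane sits on the 'b' (index 3)
    const uint8_t mask[4] = {0xff, 0xff, 0xff, 0x00};
    const uint8_t *base = reinterpret_cast<const uint8_t *>(buf);
    // rebasing by one makes the hit land on the 'a' that starts the pair
    assert(first_zero_lane(base - 1, mask, 4) == base + 2);
    return 0;
}
```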
@@ -196,6 +198,29 @@ const u8 *rshuftiExecReal(m128 mask_lo, m128 mask_hi, const u8 *buf, const u8 *b
     return buf - 1;
 }
 
+// A match on the last char is valid if and only if it matches a single-char
+// pattern, not a char pair. So we manually check the last match against the
+// wildcard patterns.
+template <uint16_t S>
+static really_inline
+const u8 *check_last_byte(SuperVector<S> mask2_lo, SuperVector<S> mask2_hi,
+                          SuperVector<S> mask, uint8_t mask_len, const u8 *buf_end) {
+    uint8_t last_elem = mask.u.u8[mask_len - 1];
+
+    SuperVector<S> reduce = mask2_lo | mask2_hi;
+    for (uint16_t i = S; i > 2; i /= 2) {
+        reduce = reduce | reduce.vshr(i / 2);
+    }
+    uint8_t match_inverted = reduce.u.u8[0] | last_elem;
+
+    // if 0xff, then no match
+    int match = match_inverted != 0xff;
+    if (match) {
+        return buf_end - 1;
+    }
+    return NULL;
+}
+
 template <uint16_t S>
 const u8 *shuftiDoubleExecReal(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128 mask2_hi,
                                const u8 *buf, const u8 *buf_end) {
@@ -216,6 +241,8 @@ const u8 *shuftiDoubleExecReal(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128
     __builtin_prefetch(d +   2*64);
     __builtin_prefetch(d +   3*64);
     __builtin_prefetch(d +   4*64);
+
+    SuperVector<S> first_char_mask = SuperVector<S>::Ones();
     DEBUG_PRINTF("start %p end %p \n", d, buf_end);
     assert(d < buf_end);
     if (d + S <= buf_end) {
@@ -223,33 +250,42 @@ const u8 *shuftiDoubleExecReal(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128
         DEBUG_PRINTF("until aligned %p \n", ROUNDUP_PTR(d, S));
         if (!ISALIGNED_N(d, S)) {
             SuperVector<S> chars = SuperVector<S>::loadu(d);
-            rv = fwdBlockDouble(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, chars, d);
+            rv = fwdBlockDouble(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, &first_char_mask, chars, d);
             DEBUG_PRINTF("rv %p \n", rv);
             if (rv) return rv;
             d = ROUNDUP_PTR(d, S);
+            ptrdiff_t offset = d - buf;
+            first_char_mask.print8("inout_c1");
+            first_char_mask = first_char_mask.vshl(S - offset);
+            first_char_mask.print8("inout_c1 shifted");
         }
 
+        first_char_mask = SuperVector<S>::Ones();
         while (d + S <= buf_end) {
             __builtin_prefetch(d + 64);
             DEBUG_PRINTF("d %p \n", d);
             SuperVector<S> chars = SuperVector<S>::load(d);
-            rv = fwdBlockDouble(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, chars, d);
-            if (rv) return rv;
+            rv = fwdBlockDouble(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, &first_char_mask, chars, d);
+            if (rv && rv < buf_end - 1) return rv;
             d += S;
         }
     }
 
+    ptrdiff_t last_mask_len = S;
     DEBUG_PRINTF("tail d %p e %p \n", d, buf_end);
     // finish off tail
 
     if (d != buf_end) {
         SuperVector<S> chars = SuperVector<S>::loadu(d);
-        rv = fwdBlockDouble(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, chars, d);
+        rv = fwdBlockDouble(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, &first_char_mask, chars, d);
         DEBUG_PRINTF("rv %p \n", rv);
-        if (rv && rv < buf_end) return rv;
+        if (rv && rv < buf_end - 1) return rv;
+        last_mask_len = buf_end - d;
     }
 
+    rv = check_last_byte(wide_mask2_lo, wide_mask2_hi, first_char_mask, last_mask_len, buf_end);
+    if (rv) return rv;
     return buf_end;
 }
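`check_last_byte` works because the shufti masks are inverted: a pattern bit that is 0 in *every* mask2 table entry accepts any second char, i.e. the pattern is really a single-char pattern. The log-step `vshr`/OR loop collapses all lanes of `mask2_lo | mask2_hi` into byte 0; OR-ing that with the first-char lane of the last byte yields 0xff exactly when no valid match ends the buffer. A scalar sketch of the same reduction (names and the 16-entry table size are illustrative):

```cpp
#include <cassert>
#include <cstdint>

// Scalar model of check_last_byte's OR-reduction. Each of the 16 table
// entries is an inverted per-nibble mask2 lookup; a pattern bit that is 0 in
// every entry means that pattern accepts any second char (a wildcard), i.e.
// it is really a single-char pattern.
static bool last_byte_matches(const uint8_t mask2_table[16],
                              uint8_t last_first_char_lane) {
    uint8_t reduce = 0;
    for (int i = 0; i < 16; i++) {
        reduce |= mask2_table[i];  // the log-step vshr/OR in vector form
    }
    // 0xff means no pattern survives: either the first char did not match in
    // the last lane, or no pattern has a wildcard second char.
    return (uint8_t)(reduce | last_first_char_lane) != 0xff;
}

int main() {
    uint8_t table[16];
    // pattern bit 0 is a wildcard second char: 0 in every entry
    for (int i = 0; i < 16; i++) table[i] = 0xfe;
    assert(last_byte_matches(table, 0xfe));   // first char hit on bit 0
    assert(!last_byte_matches(table, 0xff));  // first char did not match
    return 0;
}
```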
diff --git a/src/nfa/shufti_sve.hpp b/src/nfa/shufti_sve.hpp
index 76f1e7ad..3e1bc86c 100644
--- a/src/nfa/shufti_sve.hpp
+++ b/src/nfa/shufti_sve.hpp
@@ -153,7 +153,7 @@ const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf,
 static really_inline
 svbool_t doubleMatched(svuint8_t mask1_lo, svuint8_t mask1_hi,
                        svuint8_t mask2_lo, svuint8_t mask2_hi,
-                       const u8 *buf, const svbool_t pg) {
+                       svuint8_t *inout_t1, const u8 *buf, const svbool_t pg) {
     svuint8_t vec = svld1_u8(pg, buf);
 
     svuint8_t chars_lo = svand_x(svptrue_b8(), vec, (uint8_t)0xf);
@@ -161,38 +161,59 @@ svbool_t doubleMatched(svuint8_t mask1_lo, svuint8_t mask1_hi,
     svuint8_t c1_lo = svtbl(mask1_lo, chars_lo);
     svuint8_t c1_hi = svtbl(mask1_hi, chars_hi);
-    svuint8_t t1 = svorr_x(svptrue_b8(), c1_lo, c1_hi);
+    svuint8_t new_t1 = svorr_z(svptrue_b8(), c1_lo, c1_hi);
 
     svuint8_t c2_lo = svtbl(mask2_lo, chars_lo);
     svuint8_t c2_hi = svtbl(mask2_hi, chars_hi);
-    svuint8_t t2 = svext(svorr_z(pg, c2_lo, c2_hi), svdup_u8(0), 1);
+    svuint8_t t2 = svorr_x(svptrue_b8(), c2_lo, c2_hi);
 
-    svuint8_t t = svorr_x(svptrue_b8(), t1, t2);
+    // Shift t1 up by one element and feed in the last element of the
+    // previous t1, so that t1 lines up with t2.
+    uint8_t last_elem = svlastb(svptrue_b8(), *inout_t1);
+    svuint8_t merged_t1 = svinsr(new_t1, last_elem);
+    svuint8_t t = svorr_x(svptrue_b8(), merged_t1, t2);
+    *inout_t1 = new_t1;
 
     return svnot_z(svptrue_b8(), svcmpeq(svptrue_b8(), t, (uint8_t)0xff));
 }
 
+static really_inline
+const u8 *check_last_byte(svuint8_t mask2_lo, svuint8_t mask2_hi,
+                          uint8_t last_elem, const u8 *buf_end) {
+    uint8_t wild_lo = svorv(svptrue_b8(), mask2_lo);
+    uint8_t wild_hi = svorv(svptrue_b8(), mask2_hi);
+    uint8_t match_inverted = wild_lo | wild_hi | last_elem;
+    int match = match_inverted != 0xff;
+    if (match) {
+        return buf_end - 1;
+    }
+    return NULL;
+}
+
 static really_inline
 const u8 *dshuftiOnce(svuint8_t mask1_lo, svuint8_t mask1_hi,
                       svuint8_t mask2_lo, svuint8_t mask2_hi,
-                      const u8 *buf, const u8 *buf_end) {
+                      svuint8_t *inout_t1, const u8 *buf, const u8 *buf_end) {
     DEBUG_PRINTF("start %p end %p\n", buf, buf_end);
     assert(buf < buf_end);
     DEBUG_PRINTF("l = %td\n", buf_end - buf);
     svbool_t pg = svwhilelt_b8_s64(0, buf_end - buf);
     svbool_t matched = doubleMatched(mask1_lo, mask1_hi, mask2_lo, mask2_hi,
-                                     buf, pg);
-    return accelSearchCheckMatched(buf, matched);
+                                     inout_t1, buf, pg);
+    // doubleMatched returns the match position of the second char, but here
+    // we return the position of the first char, hence the buffer offset.
+    return accelSearchCheckMatched(buf - 1, matched);
 }
 
 static really_inline
 const u8 *dshuftiLoopBody(svuint8_t mask1_lo, svuint8_t mask1_hi,
                           svuint8_t mask2_lo, svuint8_t mask2_hi,
-                          const u8 *buf) {
+                          svuint8_t *inout_t1, const u8 *buf) {
     DEBUG_PRINTF("start %p end %p\n", buf, buf + svcntb());
     svbool_t matched = doubleMatched(mask1_lo, mask1_hi, mask2_lo, mask2_hi,
-                                     buf, svptrue_b8());
-    return accelSearchCheckMatched(buf, matched);
+                                     inout_t1, buf, svptrue_b8());
+    // doubleMatched returns the match position of the second char, but here
+    // we return the position of the first char, hence the buffer offset.
+    return accelSearchCheckMatched(buf - 1, matched);
 }
 
 static really_inline
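SVE has no fixed-width `alignr`, so the carry is done with `svlastb`/`svinsr`: `svinsr` shifts every element of `new_t1` up one lane and inserts the carried element at lane 0. A scalar model of that step (hypothetical helper; the fixed `VL` only stands in for `svcntb()`):

```cpp
#include <cassert>
#include <cstdint>

// Scalar model of svinsr(new_t1, last_elem): shift all elements up by one
// lane and insert the carried element at lane 0 -- the vector-length-agnostic
// equivalent of alignr(prev_t1, S - 1) in the fixed-width backends.
constexpr int VL = 16;  // stand-in for svcntb()

static void insr(const uint8_t in[VL], uint8_t first, uint8_t out[VL]) {
    out[0] = first;
    for (int i = 1; i < VL; i++) {
        out[i] = in[i - 1];
    }
}

int main() {
    uint8_t t1[VL], merged[VL];
    for (int i = 0; i < VL; i++) t1[i] = (uint8_t)i;
    uint8_t last_elem = 0xaa;  // svlastb of the previous block's t1
    insr(t1, last_elem, merged);
    assert(merged[0] == 0xaa && merged[1] == 0 && merged[VL - 1] == VL - 2);
    return 0;
}
```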
@@ -200,31 +221,47 @@ const u8 *dshuftiSearch(svuint8_t mask1_lo, svuint8_t mask1_hi,
                         svuint8_t mask2_lo, svuint8_t mask2_hi,
                         const u8 *buf, const u8 *buf_end) {
     assert(buf < buf_end);
+    svuint8_t inout_t1 = svdup_u8(0xff);
     size_t len = buf_end - buf;
-    if (len <= svcntb()) {
-        return dshuftiOnce(mask1_lo, mask1_hi,
-                           mask2_lo, mask2_hi, buf, buf_end);
-    }
-    // peel off first part to align to the vector size
-    const u8 *aligned_buf = ROUNDUP_PTR(buf, svcntb_pat(SV_POW2));
-    assert(aligned_buf < buf_end);
-    if (buf != aligned_buf) {
-        const u8 *ptr = dshuftiLoopBody(mask1_lo, mask1_hi,
-                                        mask2_lo, mask2_hi, buf);
-        if (ptr) return ptr;
-    }
-    buf = aligned_buf;
-    size_t loops = (buf_end - buf) / svcntb();
-    DEBUG_PRINTF("loops %zu \n", loops);
-    for (size_t i = 0; i < loops; i++, buf += svcntb()) {
-        const u8 *ptr = dshuftiLoopBody(mask1_lo, mask1_hi,
-                                        mask2_lo, mask2_hi, buf);
-        if (ptr) return ptr;
+    if (len > svcntb()) {
+        // peel off first part to align to the vector size
+        const u8 *aligned_buf = ROUNDUP_PTR(buf, svcntb_pat(SV_POW2));
+        assert(aligned_buf < buf_end);
+        if (buf != aligned_buf) {
+            const u8 *ptr = dshuftiLoopBody(mask1_lo, mask1_hi, mask2_lo,
+                                            mask2_hi, &inout_t1, buf);
+            if (ptr) return ptr;
+            // The last element of inout_t1 won't line up with the next round,
+            // as the peeled block overlaps it. We need to reset inout_t1
+            // according to the last uniquely-searched char.
+            size_t offset = aligned_buf - buf;
+            uint8_t last_unique_elem =
+                svlastb(svwhilelt_b8(0UL, offset), inout_t1);
+            inout_t1 = svdup_u8(last_unique_elem);
+        }
+        buf = aligned_buf;
+        size_t loops = (buf_end - buf) / svcntb();
+        DEBUG_PRINTF("loops %zu \n", loops);
+        for (size_t i = 0; i < loops; i++, buf += svcntb()) {
+            const u8 *ptr = dshuftiLoopBody(mask1_lo, mask1_hi, mask2_lo,
+                                            mask2_hi, &inout_t1, buf);
+            if (ptr) return ptr;
+        }
+        if (buf == buf_end) {
+            uint8_t last_elem = svlastb(svptrue_b8(), inout_t1);
+            return check_last_byte(mask2_lo, mask2_hi, last_elem, buf_end);
+        }
     }
     DEBUG_PRINTF("buf %p buf_end %p \n", buf, buf_end);
-    return buf == buf_end ? NULL : dshuftiLoopBody(mask1_lo, mask1_hi,
-                                                   mask2_lo, mask2_hi,
-                                                   buf_end - svcntb());
+
+    len = buf_end - buf;
+    const u8 *ptr = dshuftiOnce(mask1_lo, mask1_hi,
+                                mask2_lo, mask2_hi, &inout_t1, buf, buf_end);
+    if (ptr) return ptr;
+    uint8_t last_elem = svlastb(svwhilelt_b8(0UL, len), inout_t1);
+    return check_last_byte(mask2_lo, mask2_hi, last_elem, buf_end);
+
 }
 
 const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi,
diff --git a/src/nfa/x86/shufti.hpp b/src/nfa/x86/shufti.hpp
index 6fb34b2f..10d1e22b 100644
--- a/src/nfa/x86/shufti.hpp
+++ b/src/nfa/x86/shufti.hpp
@@ -46,7 +46,7 @@ const SuperVector<S> blockSingleMask(SuperVector<S> mask_lo, SuperVector<S> mask
 
 template <uint16_t S>
 static really_inline
-SuperVector<S> blockDoubleMask(SuperVector<S> mask1_lo, SuperVector<S> mask1_hi, SuperVector<S> mask2_lo, SuperVector<S> mask2_hi, SuperVector<S> chars) {
+SuperVector<S> blockDoubleMask(SuperVector<S> mask1_lo, SuperVector<S> mask1_hi, SuperVector<S> mask2_lo, SuperVector<S> mask2_hi, SuperVector<S> *inout_c1, SuperVector<S> chars) {
 
     const SuperVector<S> low4bits = SuperVector<S>::dup_u8(0xf);
     SuperVector<S> chars_lo = chars & low4bits;
@@ -57,18 +57,25 @@ SuperVector<S> blockDoubleMask(SuperVector<S> mask1_lo, SuperVector<S> mask1_hi,
     c1_lo.print8("c1_lo");
     SuperVector<S> c1_hi = mask1_hi.pshufb(chars_hi);
     c1_hi.print8("c1_hi");
-    SuperVector<S> c1 = c1_lo | c1_hi;
-    c1.print8("c1");
+    SuperVector<S> new_c1 = c1_lo | c1_hi;
+    // c1 is the match mask for the first char of the patterns
+    new_c1.print8("c1");
 
     SuperVector<S> c2_lo = mask2_lo.pshufb(chars_lo);
     c2_lo.print8("c2_lo");
     SuperVector<S> c2_hi = mask2_hi.pshufb(chars_hi);
     c2_hi.print8("c2_hi");
     SuperVector<S> c2 = c2_lo | c2_hi;
+    // c2 is the match mask for the second char of the patterns
     c2.print8("c2");
-    c2.template vshr_128_imm<1>().print8("c2.vshr_128(1)");
-    SuperVector<S> c = c1 | (c2.template vshr_128_imm<1>());
+
+    // Offset c1 so that it aligns with c2. The hole created by the offset is
+    // filled with the last elements of the previous c1, so no information is
+    // lost. Zero bits lining up indicate a match.
+    SuperVector<S> c = (new_c1.alignr(*inout_c1, S-1)) | c2;
     c.print8("c");
 
+    *inout_c1 = new_c1;
+
     return c.eq(SuperVector<S>::Ones());
 }
diff --git a/src/util/supervector/arch/ppc64el/impl.cpp b/src/util/supervector/arch/ppc64el/impl.cpp
index b2c69271..56f55b0c 100644
--- a/src/util/supervector/arch/ppc64el/impl.cpp
+++ b/src/util/supervector/arch/ppc64el/impl.cpp
@@ -560,11 +560,6 @@ really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, in
 {
     if (offset == 0) return other;
     if (offset == 16) return *this;
-#if defined(HAVE__BUILTIN_CONSTANT_P)
-    if (__builtin_constant_p(offset)) {
-        return SuperVector<16>(vec_sld(u.s8x16[0], other.u.s8x16[0], offset));
-    }
-#endif
     uint8x16_t sl = vec_splats((uint8_t) (offset << 3));
     uint8x16_t sr = vec_splats((uint8_t) ((16 - offset) << 3));
     uint8x16_t rhs = vec_slo(u.u8x16[0], sr);
diff --git a/src/util/supervector/arch/x86/impl.cpp b/src/util/supervector/arch/x86/impl.cpp
index 3fb54f1e..67d8a9cd 100644
--- a/src/util/supervector/arch/x86/impl.cpp
+++ b/src/util/supervector/arch/x86/impl.cpp
@@ -556,6 +556,7 @@ really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, in
     case 13: return SuperVector<16>(_mm_alignr_epi8(u.v128[0], other.u.v128[0], 13)); break;
     case 14: return SuperVector<16>(_mm_alignr_epi8(u.v128[0], other.u.v128[0], 14)); break;
     case 15: return SuperVector<16>(_mm_alignr_epi8(u.v128[0], other.u.v128[0], 15)); break;
+    case 16: return *this; break;
     default: break;
     }
     return *this;
@@ -877,10 +878,10 @@ template <>
 template <uint16_t N>
 really_inline SuperVector<32> SuperVector<32>::vshr_256_imm() const
 {
-    if (N == 0) return *this;
-    if (N == 16) return {SuperVector<32>(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)))};
-    if (N == 32) return Zeroes();
-    if (N < 16) {
+    if constexpr (N == 0) return *this;
+    if constexpr (N == 16) return {SuperVector<32>(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)))};
+    if constexpr (N == 32) return Zeroes();
+    if constexpr (N < 16) {
         return {SuperVector<32>(_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 16 - N))};
     } else {
         return {SuperVector<32>(_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), N - 16))};
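The switch from plain `if` to `if constexpr` in the `vshr_*_imm` bodies matters because every branch of a template body is otherwise instantiated, including ones whose shift immediates would be invalid for the chosen intrinsic; `if constexpr` discards the untaken branches at compile time. A minimal illustration of the pattern (generic code, not from the patch):

```cpp
#include <cassert>
#include <cstdint>

// Every branch of a plain `if` inside a template is instantiated for each N,
// so an intrinsic (or shift) needing an in-range compile-time immediate would
// be compiled even on the dead path. `if constexpr` discards untaken branches.
template <int N>
uint32_t shr32(uint32_t v) {
    if constexpr (N >= 32) {
        return 0;            // with a plain `if`, `v >> N` below would still
    } else {                 // be instantiated for N >= 32
        return v >> N;
    }
}

int main() {
    assert(shr32<4>(0xf0u) == 0x0fu);
    assert(shr32<40>(0xf0u) == 0);  // fine: the `v >> 40` branch is discarded
    return 0;
}
```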
@@ -1145,52 +1146,15 @@ really_inline SuperVector<32> SuperVector<32>::loadu_maskz(void const *ptr, uint
 template<>
 really_inline SuperVector<32> SuperVector<32>::alignr(SuperVector<32> &other, int8_t offset)
 {
-#if defined(HAVE__BUILTIN_CONSTANT_P) && !(defined(__GNUC__) && ((__GNUC__ == 13) || (__GNUC__ == 14)))
-    if (__builtin_constant_p(offset)) {
-        if (offset == 16) {
-            return *this;
-        } else {
-            return {SuperVector<32>(_mm256_alignr_epi8(u.v256[0], other.u.v256[0], offset))};
-        }
-    }
-#endif
-    // As found here: https://stackoverflow.com/questions/8517970/mm-alignr-epi8-palignr-equivalent-in-avx2#8637458
-    switch (offset){
-    case 0 : return SuperVector<32>(_mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 0), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 0))); break;
-    case 1 : return SuperVector<32>(_mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 1), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 1))); break;
-    case 2 : return SuperVector<32>(_mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 2), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 2))); break;
-    case 3 : return SuperVector<32>(_mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 3), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 3))); break;
-    case 4 : return SuperVector<32>(_mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 4), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 4))); break;
-    case 5 : return SuperVector<32>(_mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 5), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 5))); break;
-    case 6 : return SuperVector<32>(_mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 6), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 6))); break;
-    case 7 : return SuperVector<32>(_mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 7), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 7))); break;
-    case 8 : return SuperVector<32>(_mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 8), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 8))); break;
-    case 9 : return SuperVector<32>(_mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 9), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 9))); break;
-    case 10 : return SuperVector<32>(_mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 10), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 10))); break;
-    case 11 : return SuperVector<32>(_mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 11), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 11))); break;
-    case 12 : return SuperVector<32>(_mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 12), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 12))); break;
-    case 13 : return SuperVector<32>(_mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 13), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 13))); break;
-    case 14 : return SuperVector<32>(_mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 14), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 14))); break;
-    case 15 : return SuperVector<32>(_mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 15), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 15))); break;
-    case 16 : return SuperVector<32>(_mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 0), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 0))); break;
-    case 17 : return SuperVector<32>(_mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 1), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 1))); break;
-    case 18 : return SuperVector<32>(_mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 2), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 2))); break;
-    case 19 : return SuperVector<32>(_mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 3), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 3))); break;
-    case 20 : return SuperVector<32>(_mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 4), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 4))); break;
-    case 21 : return SuperVector<32>(_mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 5), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 5))); break;
-    case 22 : return SuperVector<32>(_mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 6), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 6))); break;
-    case 23 : return SuperVector<32>(_mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 7), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 7))); break;
-    case 24 : return SuperVector<32>(_mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 8), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 8))); break;
-    case 25 : return SuperVector<32>(_mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 9), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 9))); break;
-    case 26 : return SuperVector<32>(_mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 10), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 10))); break;
-    case 27 : return SuperVector<32>(_mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 11), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 11))); break;
-    case 28 : return SuperVector<32>(_mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 12), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 12))); break;
-    case 29 : return SuperVector<32>(_mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 13), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 13))); break;
-    case 30 : return SuperVector<32>(_mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 14), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 14))); break;
-    case 31 : return SuperVector<32>(_mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 15), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 15))); break;
-    default: break;
-    }
-    return *this;
+    if (offset == 0) { return other; }
+    else if (offset < 32) {
+        SuperVector<32> this_shifted = *this << (32 - offset);
+        SuperVector<32> other_shifted = other >> offset;
+        this_shifted.print8("this << (32 - offset)");
+        other_shifted.print8("other >> offset");
+        return this_shifted | other_shifted;
+    } else if (offset == 32) { return *this; }
+    return Zeroes();
 }
 
 template<>
@@ -1532,14 +1496,39 @@ template <>
 template <uint16_t N>
 really_inline SuperVector<64> SuperVector<64>::vshr_256_imm() const
 {
-    return {};
+    if constexpr (N == 0) return *this;
+    if constexpr (N == 16) return {SuperVector<64>(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)))};
+    if constexpr (N == 32) return Zeroes();
+    if constexpr (N < 16) {
+        return {SuperVector<64>(_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 16 - N))};
+    } else {
+        return {SuperVector<64>(_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), N - 16))};
+    }
 }
 
 template <>
 template <uint16_t N>
 really_inline SuperVector<64> SuperVector<64>::vshr_512_imm() const
 {
-    return {};
+    if constexpr (N == 0) return *this;
+    if constexpr (N < 32) {
+        SuperVector<32> lo256 = SuperVector<32>(u.v256[0]);
+        SuperVector<32> hi256 = SuperVector<32>(u.v256[1]);
+        SuperVector<32> carry = hi256 << (32 - N);
+        hi256 = hi256 >> N;
+        lo256 = (lo256 >> N) | carry;
+        return SuperVector<64>(lo256, hi256);
+    }
+    if constexpr (N == 32) {
+        SuperVector<32> hi256 = SuperVector<32>(u.v256[1]);
+        return SuperVector<64>(hi256, SuperVector<32>::Zeroes());
+    }
+    if constexpr (N < 64) {
+        SuperVector<32> hi256 = SuperVector<32>(u.v256[1]);
+        return SuperVector<64>(hi256 >> (N - 32), SuperVector<32>::Zeroes());
+    } else {
+        return Zeroes();
+    }
 }
 
 template <>
@@ -1560,6 +1549,7 @@ template SuperVector<64> SuperVector<64>::vshr_64_imm<1>() const;
 template SuperVector<64> SuperVector<64>::vshr_64_imm<4>() const;
 template SuperVector<64> SuperVector<64>::vshr_128_imm<1>() const;
 template SuperVector<64> SuperVector<64>::vshr_128_imm<4>() const;
+template SuperVector<64> SuperVector<64>::vshr_imm<1>() const;
 #endif
 
 // template <>
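The new `vshr_512_imm` builds the 512-bit shift from its two 256-bit halves, with `hi256 << (32 - N)` supplying the bytes that cross the half boundary. The same identity checked on a 64-bit value split into 32-bit halves (a model, not the patch's code):

```cpp
#include <cassert>
#include <cstdint>

// Scalar analogue of the new vshr_512_imm: a wide right shift built from two
// halves. The carry term hi << (width - n) supplies the bits that cross the
// boundary between the halves.
static uint64_t shr_pair(uint32_t lo, uint32_t hi, int n) {  // 0 < n < 32
    uint32_t carry = hi << (32 - n);
    uint32_t new_hi = hi >> n;
    uint32_t new_lo = (lo >> n) | carry;
    return ((uint64_t)new_hi << 32) | new_lo;
}

int main() {
    uint64_t v = 0x0123456789abcdefULL;
    uint32_t lo = (uint32_t)v, hi = (uint32_t)(v >> 32);
    for (int n = 1; n < 32; n++) {
        assert(shr_pair(lo, hi, n) == v >> n);
    }
    return 0;
}
```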
@@ -1799,36 +1789,17 @@ really_inline SuperVector<64> SuperVector<64>::pshufb_maskz(SuperVector<64> b, u
 }
 
 template<>
-really_inline SuperVector<64> SuperVector<64>::alignr(SuperVector<64> &l, int8_t offset)
+really_inline SuperVector<64> SuperVector<64>::alignr(SuperVector<64> &other, int8_t offset)
 {
-#if defined(HAVE__BUILTIN_CONSTANT_P) && !(defined(__GNUC__) && (__GNUC__ == 14))
-    if (__builtin_constant_p(offset)) {
-        if (offset == 16) {
-            return *this;
-        } else {
-            return {SuperVector<64>(_mm512_alignr_epi8(u.v512[0], l.u.v512[0], offset))};
-        }
-    }
-#endif
-    if(offset == 0) {
-        return *this;
-    } else if (offset < 32){
-        SuperVector<32> lo256 = SuperVector<32>(u.v256[0]);
-        SuperVector<32> hi256 = SuperVector<32>(u.v256[1]);
-        SuperVector<32> o_lo256 = SuperVector<32>(l.u.v256[0]);
-        SuperVector<32> carry1 = SuperVector<32>(hi256.alignr(lo256,offset));
-        SuperVector<32> carry2 = SuperVector<32>(o_lo256.alignr(hi256,offset));
-        return SuperVector<64>(carry1, carry2);
-    } else if (offset <= 64){
-        SuperVector<32> hi256 = SuperVector<32>(u.v256[1]);
-        SuperVector<32> o_lo256 = SuperVector<32>(l.u.v256[0]);
-        SuperVector<32> o_hi256 = SuperVector<32>(l.u.v256[1]);
-        SuperVector<32> carry1 = SuperVector<32>(o_lo256.alignr(hi256, offset - 32));
-        SuperVector<32> carry2 = SuperVector<32>(o_hi256.alignr(o_lo256,offset -32));
-        return SuperVector<64>(carry1, carry2);
-    } else {
-        return *this;
-    }
+    if (offset == 0) { return other; }
+    else if (offset < 64) {
+        SuperVector<64> this_shifted = *this << (64 - offset);
+        SuperVector<64> other_shifted = other >> offset;
+        this_shifted.print8("this << (64 - offset)");
+        other_shifted.print8("other >> offset");
+        return this_shifted | other_shifted;
+    } else if (offset == 64) { return *this; }
+    return Zeroes();
 }
 #endif // HAVE_AVX512
diff --git a/unit/internal/shufti.cpp b/unit/internal/shufti.cpp
index 9adcac8b..0b6f3219 100644
--- a/unit/internal/shufti.cpp
+++ b/unit/internal/shufti.cpp
@@ -899,7 +899,40 @@ TEST(DoubleShufti, ExecMatchMixed3) {
 
         const u8 *rv = shuftiDoubleExec(lo1, hi1, lo2, hi2,
                                         reinterpret_cast<const u8 *>(t2), reinterpret_cast<const u8 *>(t2) + len);
 
-        ASSERT_EQ(reinterpret_cast<const u8 *>(&t2[len - i]), rv);
+        if (i < 2) {
+            // i=0 is "xy" out of buffer. i=1 is "x" in buffer but not "y".
+            ASSERT_EQ(reinterpret_cast<const u8 *>(t2 + len), rv);
+        } else {
+            ASSERT_EQ(reinterpret_cast<const u8 *>(&t2[len - i]), rv);
+        }
     }
 }
+
+// Double shufti used to report matches when the first char of a pair landed
+// at the end of a vector. This test checks for that regression.
+TEST(DoubleShufti, ExecNoMatchVectorEdge) {
+    m128 lo1, hi1, lo2, hi2;
+
+    flat_set<pair<u8, u8>> lits;
+
+    lits.insert(make_pair('a','c'));
+
+    bool ret = shuftiBuildDoubleMasks(CharReach(), lits, reinterpret_cast<u8 *>(&lo1), reinterpret_cast<u8 *>(&hi1),
+                                      reinterpret_cast<u8 *>(&lo2), reinterpret_cast<u8 *>(&hi2));
+    ASSERT_TRUE(ret);
+
+    const int len = 80;
+    char t1[len + 2];
+    memset(t1, 'b', len);
+
+    for (size_t i = 0; i < 70; i++) {
+        t1[len - i] = 'a';
+        t1[len - i + 1] = 'b';
+        DEBUG_PRINTF("i = %ld\n", i);
+        const u8 *rv = shuftiDoubleExec(lo1, hi1, lo2, hi2,
+                                        reinterpret_cast<const u8 *>(t1), reinterpret_cast<const u8 *>(t1) + len);
+
+        ASSERT_EQ(reinterpret_cast<const u8 *>(t1 + len), rv);
+    }
+}
diff --git a/unit/internal/supervector.cpp b/unit/internal/supervector.cpp
index ac3daf2a..f4723fa5 100644
--- a/unit/internal/supervector.cpp
+++ b/unit/internal/supervector.cpp
@@ -697,7 +697,6 @@ TEST(SuperVectorUtilsTest,RShift128_256c){
     }
 }
 
-
 /*Define ALIGNR256 macro*/
 #define TEST_ALIGNR256(v1, v2, buf, l) {                  \
         auto v_aligned = v2.alignr(v1, l);                \
@@ -706,6 +705,7 @@ TEST(SuperVectorUtilsTest,RShift128_256c){
     }                                                     \
 }
 
+
 TEST(SuperVectorUtilsTest,Alignr256c){
     u8 vec[64];
     for (int i=0; i<64; i++) {
@@ -713,7 +713,7 @@ TEST(SuperVectorUtilsTest,Alignr256c){
     }
     auto SP1 = SuperVector<32>::loadu(vec);
     auto SP2 = SuperVector<32>::loadu(vec+32);
-    for(int j=0; j<32; j++) {
+    for(size_t j=0; j<32; j++) {
         TEST_ALIGNR256(SP1, SP2, vec, j);
     }
 }
@@ -1045,10 +1045,9 @@ TEST(SuperVectorUtilsTest,LShift128_512c){
     }
 }
 
-
 /*Define ALIGNR512 macro*/
 #define TEST_ALIGNR512(v1, v2, buf, l) {                  \
-        auto v_aligned = v1.alignr(v2, l);                \
+        auto v_aligned = v2.alignr(v1, l);                \
         for (size_t i=0; i<64; i++) {                     \
             ASSERT_EQ(v_aligned.u.u8[i], vec[i + l]);     \
         }                                                 \
 }
@@ -1061,7 +1060,7 @@ TEST(SuperVectorUtilsTest,Alignr512c){
     }
     auto SP1 = SuperVector<64>::loadu(vec);
     auto SP2 = SuperVector<64>::loadu(vec+64);
-    for(int j=0; j<64; j++){
+    for(size_t j=0; j<64; j++){
         TEST_ALIGNR512(SP1, SP2, vec, j);
     }
 }
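For reference, the contract that the corrected TEST_ALIGNR256/512 macros pin down: with `v1` loaded from `buf` and `v2` from `buf + W`, `v2.alignr(v1, l)` must equal the W-byte window of `buf` starting at offset `l` (`v1` holds the earlier bytes). A scalar model of that contract, with an illustrative width only:

```cpp
#include <cassert>
#include <cstdint>

// Reference semantics of v2.alignr(v1, l): take the window of the
// concatenation v1:v2 that starts l bytes in. Modelled here for W = 8.
constexpr int W = 8;

static void alignr(const uint8_t v2[W], const uint8_t v1[W], int l,
                   uint8_t out[W]) {
    uint8_t cat[2 * W];
    for (int i = 0; i < W; i++) {
        cat[i] = v1[i];      // low half: the earlier bytes
        cat[W + i] = v2[i];  // high half: the later bytes
    }
    for (int i = 0; i < W; i++) out[i] = cat[l + i];
}

int main() {
    uint8_t buf[2 * W], out[W];
    for (int i = 0; i < 2 * W; i++) buf[i] = (uint8_t)i;
    // mirrors the unit test: the result must track the source buffer window
    for (int l = 0; l < W; l++) {
        alignr(buf + W, buf, l, out);
        for (int i = 0; i < W; i++) assert(out[i] == buf[l + i]);
    }
    return 0;
}
```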