diff --git a/src/nfa/x86/shufti.hpp b/src/nfa/x86/shufti.hpp
index 88aa4904..4034acca 100644
--- a/src/nfa/x86/shufti.hpp
+++ b/src/nfa/x86/shufti.hpp
@@ -69,36 +69,11 @@ SuperVector<S> blockDoubleMask(SuperVector<S> mask1_lo, SuperVector<S> mask1_hi,
     // c2 is the match mask for the second char of the patterns
     c2.print8("c2");
 
-    // We want to shift the whole vector left by 1 and insert the last element of inout_c1.
-    // Due to lack of direct instructions to insert, extract and concatenate vectors
-    // we need to to store and load the vector.
-    uint8_t tmp_buf[2*S];
-    SuperVector<S> offset_c1;
-    if constexpr (S == 16) {
-        _mm_storeu_si128(reinterpret_cast<__m128i *>(&tmp_buf[0]), inout_c1->u.v128[0]);
-        _mm_storeu_si128(reinterpret_cast<__m128i *>(&tmp_buf[S]), new_c1.u.v128[0]);
-        offset_c1 = SuperVector<S>(_mm_loadu_si128(reinterpret_cast<const __m128i *>(&tmp_buf[S-1])));
-    }
-#ifdef HAVE_AVX2
-    else if constexpr (S == 32) {
-        _mm256_storeu_si256(reinterpret_cast<__m256i *>(&tmp_buf[0]), inout_c1->u.v256[0]);
-        _mm256_storeu_si256(reinterpret_cast<__m256i *>(&tmp_buf[S]), new_c1.u.v256[0]);
-        offset_c1 = SuperVector<S>(_mm256_loadu_si256(reinterpret_cast<const __m256i *>(&tmp_buf[S-1])));
-    }
-#endif
-#ifdef HAVE_AVX512
-    else if constexpr (S == 64) {
-        _mm512_storeu_si512(reinterpret_cast<__m512i *>(&tmp_buf[0]), inout_c1->u.v512[0]);
-        _mm512_storeu_si512(reinterpret_cast<__m512i *>(&tmp_buf[S]), new_c1.u.v512[0]);
-        offset_c1 = SuperVector<S>(_mm512_load_si512(reinterpret_cast<const __m512i *>(&tmp_buf[S-1])));
-    }
-#endif
-    offset_c1.print8("offset c1");
-
     // offset c1 so it aligns with c2. The hole created by the offset is filled
     // with the last elements of the previous c1 so no info is lost.
     // If bits with value 0 lines up, it indicate a match.
-    SuperVector<S> c = offset_c1 | c2;
+    c2.template vshr_imm<1>().print8("c2.vshr_128(1)");
+    SuperVector<S> c = new_c1 | (c2.template vshr_imm<1>());
     c.print8("c");
 
     *inout_c1 = new_c1;
diff --git a/src/util/supervector/arch/x86/impl.cpp b/src/util/supervector/arch/x86/impl.cpp
index 3fb54f1e..8ec13f66 100644
--- a/src/util/supervector/arch/x86/impl.cpp
+++ b/src/util/supervector/arch/x86/impl.cpp
@@ -877,10 +877,10 @@ template <>
 template <uint8_t N>
 really_inline SuperVector<32> SuperVector<32>::vshr_256_imm() const
 {
-    if (N == 0) return *this;
-    if (N == 16) return {SuperVector<32>(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)))};
-    if (N == 32) return Zeroes();
-    if (N < 16) {
+    if constexpr (N == 0) return *this;
+    if constexpr (N == 16) return {SuperVector<32>(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)))};
+    if constexpr (N == 32) return Zeroes();
+    if constexpr (N < 16) {
         return {SuperVector<32>(_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 16 - N))};
     } else {
         return {SuperVector<32>(_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), N - 16))};
@@ -1532,14 +1532,39 @@ template <>
 template <uint8_t N>
 really_inline SuperVector<64> SuperVector<64>::vshr_256_imm() const
 {
-    return {};
+    if constexpr (N == 0) return *this;
+    if constexpr (N == 16) return {SuperVector<64>(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)))};
+    if constexpr (N == 32) return Zeroes();
+    if constexpr (N < 16) {
+        return {SuperVector<64>(_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 16 - N))};
+    } else {
+        return {SuperVector<64>(_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), N - 16))};
+    }
 }
 
 template <>
 template <uint8_t N>
 really_inline SuperVector<64> SuperVector<64>::vshr_512_imm() const
 {
-    return {};
+    if constexpr (N == 0) return *this;
+    if constexpr (N < 32) {
+        SuperVector<32> lo256 = SuperVector<32>(u.v256[0]);
+        SuperVector<32> hi256 = SuperVector<32>(u.v256[1]);
+        SuperVector<32> carry = hi256 << (32 - N);
+        hi256 = hi256 >> N;
+        lo256 = (lo256 >> N) | carry;
+        return SuperVector<64>(lo256, hi256);
+    }
+    if constexpr (N == 32) {
+        SuperVector<32> hi256 = SuperVector<32>(u.v256[1]);
+        return SuperVector<64>(hi256, SuperVector<32>::Zeroes());
+    }
+    if constexpr (N < 64) {
+        SuperVector<32> hi256 = SuperVector<32>(u.v256[1]);
+        return SuperVector<64>(hi256 >> (N - 32), SuperVector<32>::Zeroes());
+    } else {
+        return Zeroes();
+    }
 }
 
 template <>
@@ -1560,6 +1585,7 @@ template SuperVector<64> SuperVector<64>::vshr_64_imm<1>() const;
 template SuperVector<64> SuperVector<64>::vshr_64_imm<4>() const;
 template SuperVector<64> SuperVector<64>::vshr_128_imm<1>() const;
 template SuperVector<64> SuperVector<64>::vshr_128_imm<4>() const;
+template SuperVector<64> SuperVector<64>::vshr_imm<1>() const;
 #endif
 
 // template <>
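
Note on the shufti.hpp hunk: the patch drops the store/load shuffle and instead pulls the second-char mask back one lane with c2.template vshr_imm<1>() before OR-ing it with the first-char mask. The following is a minimal scalar sketch of that alignment idea, standalone illustration only and not code from the patch; it ignores the cross-block carry that inout_c1/new_c1 handle and simply treats the vacated top lane as "no match".

    #include <array>
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    int main() {
        // Shufti convention: 0x00 in a lane means "this byte matched", 0xff means "no match".
        // c1 marks positions where the first character of the pattern matched,
        // c2 marks positions where the second character matched.
        std::array<uint8_t, 8> c1{0xff, 0x00, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff};
        std::array<uint8_t, 8> c2{0xff, 0xff, 0x00, 0xff, 0xff, 0x00, 0xff, 0xff};

        std::array<uint8_t, 8> c{};
        for (std::size_t i = 0; i < c.size(); ++i) {
            // Lane i of "c2 shifted right by one" holds lane i + 1 of c2; the
            // vacated top lane is treated as "no match" in this sketch (the
            // real code carries state into the next block via inout_c1).
            uint8_t c2_shifted = (i + 1 < c2.size()) ? c2[i + 1] : 0xff;
            // OR keeps a 0 only where c1 matched at i and c2 matched at i + 1,
            // i.e. where a two-character pattern starts.
            c[i] = c1[i] | c2_shifted;
        }

        for (std::size_t i = 0; i < c.size(); ++i) {
            if (c[i] == 0) {
                printf("double-char match starting at lane %zu\n", i);
            }
        }
        return 0;
    }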
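
Note on the new vshr_512_imm body: for N < 32 it shifts each 256-bit half right and ORs the lanes that fall off the high half into the low half as a carry. The sketch below shows the same split-and-carry structure bitwise on a 128-bit value built from two uint64_t halves; it is a hedged analogy, not library code, and the patch itself operates on byte lanes of 256-bit halves rather than bits.

    #include <cstdint>
    #include <cstdio>

    struct u128 {
        uint64_t lo;
        uint64_t hi;
    };

    // Logical right shift of a 128-bit value by n bits, for 0 < n < 64.
    // Mirrors the N < 32 branch above: shift both halves, then OR the bits
    // carried across the half boundary into the low half.
    static u128 shr128(u128 v, unsigned n) {
        u128 r;
        uint64_t carry = v.hi << (64 - n); // bits that cross from hi into lo
        r.hi = v.hi >> n;
        r.lo = (v.lo >> n) | carry;
        return r;
    }

    int main() {
        u128 v{0x0123456789abcdefULL, 0xfedcba9876543210ULL};
        u128 r = shr128(v, 8); // shift right by one byte
        printf("hi=%016llx lo=%016llx\n",
               (unsigned long long)r.hi, (unsigned long long)r.lo);
        // Expected: hi=00fedcba98765432 lo=100123456789abcd
        return 0;
    }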