From e7161fdfec7734cb01434f9e3bc37c85f383083a Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Mon, 20 Sep 2021 23:52:31 +0300 Subject: [PATCH] initial SSE/AVX2 implementation --- src/nfa/shufti_simd.hpp | 14 +- src/nfa/truffle_simd.hpp | 2 +- src/util/supervector/arch/x86/impl.cpp | 918 ++++++++++++++----------- src/util/supervector/supervector.hpp | 79 ++- unit/internal/supervector.cpp | 20 +- 5 files changed, 595 insertions(+), 438 deletions(-) diff --git a/src/nfa/shufti_simd.hpp b/src/nfa/shufti_simd.hpp index 3af3bc9f..3c5a1fbe 100644 --- a/src/nfa/shufti_simd.hpp +++ b/src/nfa/shufti_simd.hpp @@ -51,7 +51,7 @@ typename SuperVector::movemask_type block(SuperVector mask_lo, SuperVector SuperVector c_lo = chars & low4bits; c_lo = mask_lo.pshufb(c_lo); - SuperVector c_hi = mask_hi.pshufb(chars.rshift64(4) & low4bits); + SuperVector c_hi = mask_hi.pshufb(chars.template vshr_64_imm<4>() & low4bits); SuperVector t = c_lo & c_hi; return t.eqmask(SuperVector::Zeroes()); @@ -212,7 +212,7 @@ const u8 *fwdBlockDouble(SuperVector mask1_lo, SuperVector mask1_hi, Super const SuperVector low4bits = SuperVector::dup_u8(0xf); SuperVector chars_lo = chars & low4bits; chars_lo.print8("chars_lo"); - SuperVector chars_hi = chars.rshift64(4) & low4bits; + SuperVector chars_hi = chars.template vshr_64_imm<4>() & low4bits; chars_hi.print8("chars_hi"); SuperVector c1_lo = mask1_lo.pshufb(chars_lo); c1_lo.print8("c1_lo"); @@ -227,8 +227,8 @@ const u8 *fwdBlockDouble(SuperVector mask1_lo, SuperVector mask1_hi, Super c2_hi.print8("c2_hi"); SuperVector t2 = c2_lo | c2_hi; t2.print8("t2"); - t2.rshift128(1).print8("t2.rshift128(1)"); - SuperVector t = t1 | (t2.rshift128(1)); + t2.template vshr_128_imm<1>().print8("t2.rshift128(1)"); + SuperVector t = t1 | (t2.template vshr_128_imm<1>()); t.print8("t"); typename SuperVector::movemask_type z = t.eqmask(SuperVector::Ones()); @@ -250,7 +250,7 @@ static really_inline const u8 *shuftiDoubleMini(SuperVector mask1_lo, SuperVe 
SuperVector chars_lo = chars & low4bits; chars_lo.print8("chars_lo"); - SuperVector chars_hi = chars.rshift64(4) & low4bits; + SuperVector chars_hi = chars.template vshr_64_imm<4>() & low4bits; chars_hi.print8("chars_hi"); SuperVector c1_lo = mask1_lo.pshufb_maskz(chars_lo, len); c1_lo.print8("c1_lo"); @@ -265,8 +265,8 @@ static really_inline const u8 *shuftiDoubleMini(SuperVector mask1_lo, SuperVe c2_hi.print8("c2_hi"); SuperVector t2 = c2_lo | c2_hi; t2.print8("t2"); - t2.rshift128(1).print8("t2.rshift128(1)"); - SuperVector t = t1 | (t2.rshift128(1)); + t2.template vshr_128_imm<1>().print8("t2.rshift128(1)"); + SuperVector t = t1 | (t2.template vshr_128_imm<1>()); t.print8("t"); typename SuperVector::movemask_type z = t.eqmask(SuperVector::Ones()); diff --git a/src/nfa/truffle_simd.hpp b/src/nfa/truffle_simd.hpp index 73017578..c5f85135 100644 --- a/src/nfa/truffle_simd.hpp +++ b/src/nfa/truffle_simd.hpp @@ -53,7 +53,7 @@ typename SuperVector::movemask_type block(SuperVector shuf_mask_lo_highcle SuperVector shuf1 = shuf_mask_lo_highclear.pshufb(v); SuperVector t1 = v ^ highconst; SuperVector shuf2 = shuf_mask_lo_highset.pshufb(t1); - SuperVector t2 = highconst.opandnot(v.rshift64(4)); + SuperVector t2 = highconst.opandnot(v.template vshr_64_imm<4>()); SuperVector shuf3 = shuf_mask_hi.pshufb(t2); SuperVector tmp = (shuf1 | shuf2) & shuf3; diff --git a/src/util/supervector/arch/x86/impl.cpp b/src/util/supervector/arch/x86/impl.cpp index 26e45909..61107d58 100644 --- a/src/util/supervector/arch/x86/impl.cpp +++ b/src/util/supervector/arch/x86/impl.cpp @@ -110,7 +110,7 @@ really_inline SuperVector<16>::SuperVector(uint64_t const other) // Constants template<> -really_inline SuperVector<16> SuperVector<16>::Ones(void) +really_inline SuperVector<16> SuperVector<16>::Ones() { return {_mm_set1_epi8(0xFF)}; } @@ -171,29 +171,208 @@ really_inline typename SuperVector<16>::movemask_type SuperVector<16>::eqmask(Su return eq(b).movemask(); } +// template <> +// template +// 
really_inline SuperVector<16> SuperVector<16>::vshl_8_imm() const +// { +// const uint8_t i = N; +// return {_mm_slli_epi8(u.v128[0], i)}; +// } + template <> -really_inline SuperVector<16> SuperVector<16>::rshift128_var(uint8_t const N) const +template +really_inline SuperVector<16> SuperVector<16>::vshl_16_imm() const { - switch(N) { - case 1: return {_mm_srli_si128(u.v128[0], 1)}; break; - case 2: return {_mm_srli_si128(u.v128[0], 2)}; break; - case 3: return {_mm_srli_si128(u.v128[0], 3)}; break; - case 4: return {_mm_srli_si128(u.v128[0], 4)}; break; - case 5: return {_mm_srli_si128(u.v128[0], 5)}; break; - case 6: return {_mm_srli_si128(u.v128[0], 6)}; break; - case 7: return {_mm_srli_si128(u.v128[0], 7)}; break; - case 8: return {_mm_srli_si128(u.v128[0], 8)}; break; - case 9: return {_mm_srli_si128(u.v128[0], 9)}; break; - case 10: return {_mm_srli_si128(u.v128[0], 10)}; break; - case 11: return {_mm_srli_si128(u.v128[0], 11)}; break; - case 12: return {_mm_srli_si128(u.v128[0], 12)}; break; - case 13: return {_mm_srli_si128(u.v128[0], 13)}; break; - case 14: return {_mm_srli_si128(u.v128[0], 14)}; break; - case 15: return {_mm_srli_si128(u.v128[0], 15)}; break; - case 16: return Zeroes(); break; - default: break; - } - return *this; + return {_mm_slli_epi16(u.v128[0], N)}; +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshl_32_imm() const +{ + return {_mm_slli_epi32(u.v128[0], N)}; +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshl_64_imm() const +{ + return {_mm_slli_epi64(u.v128[0], N)}; +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshl_128_imm() const +{ + return {_mm_slli_si128(u.v128[0], N)}; +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshl_imm() const +{ + return vshl_128_imm(); +} + +// template <> +// template +// really_inline SuperVector<16> SuperVector<16>::vshr_8_imm() const +// { +// return {_mm_srli_epi8(u.v128[0], N)}; 
+// } + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshr_16_imm() const +{ + return {_mm_srli_epi16(u.v128[0], N)}; +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshr_32_imm() const +{ + return {_mm_srli_epi32(u.v128[0], N)}; +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshr_64_imm() const +{ + return {_mm_srli_epi64(u.v128[0], N)}; +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshr_128_imm() const +{ + return {_mm_srli_si128(u.v128[0], N)}; +} + +template <> +template +really_inline SuperVector<16> SuperVector<16>::vshr_imm() const +{ + return vshr_128_imm(); +} + +template SuperVector<16> SuperVector<16>::vshl_16_imm<1>() const; +template SuperVector<16> SuperVector<16>::vshl_64_imm<1>() const; +template SuperVector<16> SuperVector<16>::vshl_64_imm<4>() const; +template SuperVector<16> SuperVector<16>::vshl_128_imm<1>() const; +template SuperVector<16> SuperVector<16>::vshl_128_imm<4>() const; +template SuperVector<16> SuperVector<16>::vshr_16_imm<1>() const; +template SuperVector<16> SuperVector<16>::vshr_64_imm<1>() const; +template SuperVector<16> SuperVector<16>::vshr_64_imm<4>() const; +template SuperVector<16> SuperVector<16>::vshr_128_imm<1>() const; +template SuperVector<16> SuperVector<16>::vshr_128_imm<4>() const; + +// template <> +// really_inline SuperVector<16> SuperVector<16>::vshl_8 (uint8_t const N) const +// { +// Unroller<0, 15>::iterator([&,v=this](int i) { if (N == i) return {_mm_slli_epi8(v->u.v128[0], i)}; }); +// if (N == 16) return Zeroes(); +// } + +template <> +really_inline SuperVector<16> SuperVector<16>::vshl_16 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 16) return Zeroes(); + SuperVector result; + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_slli_epi16(v->u.v128[0], n)}; }); + return result; +} + +template <> +really_inline 
SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 16) return Zeroes(); + SuperVector result; + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_slli_epi32(v->u.v128[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 16) return Zeroes(); + SuperVector result; + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_slli_epi64(v->u.v128[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 16) return Zeroes(); + SuperVector result; + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_slli_si128(v->u.v128[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshl(uint8_t const N) const +{ + return vshl_128(N); +} + +// template <> +// really_inline SuperVector<16> SuperVector<16>::vshr_8 (uint8_t const N) const +// { +// SuperVector<16> result; +// Unroller<0, 15>::iterator([&,v=this](uint8_t const i) { if (N == i) result = {_mm_srli_epi8(v->u.v128[0], i)}; }); +// if (N == 16) result = Zeroes(); +// return result; +// } + +template <> +really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 16) return Zeroes(); + SuperVector result; + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_srli_epi16(v->u.v128[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 16) return Zeroes(); + SuperVector result; + Unroller<1, 
16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_srli_epi32(v->u.v128[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 16) return Zeroes(); + SuperVector result; + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_srli_epi64(v->u.v128[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 16) return Zeroes(); + SuperVector result; + Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm_srli_si128(v->u.v128[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::vshr(uint8_t const N) const +{ + return vshr_128(N); } #ifdef HS_OPTIMIZE @@ -206,35 +385,10 @@ really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const template <> really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const { - return rshift128_var(N); + return vshr_128(N); } #endif -template <> -really_inline SuperVector<16> SuperVector<16>::lshift128_var(uint8_t const N) const -{ - switch(N) { - case 1: return {_mm_slli_si128(u.v128[0], 1)}; break; - case 2: return {_mm_slli_si128(u.v128[0], 2)}; break; - case 3: return {_mm_slli_si128(u.v128[0], 3)}; break; - case 4: return {_mm_slli_si128(u.v128[0], 4)}; break; - case 5: return {_mm_slli_si128(u.v128[0], 5)}; break; - case 6: return {_mm_slli_si128(u.v128[0], 6)}; break; - case 7: return {_mm_slli_si128(u.v128[0], 7)}; break; - case 8: return {_mm_slli_si128(u.v128[0], 8)}; break; - case 9: return {_mm_slli_si128(u.v128[0], 9)}; break; - case 10: return {_mm_slli_si128(u.v128[0], 10)}; break; - case 11: return {_mm_slli_si128(u.v128[0], 11)}; break; - case 12: return 
{_mm_slli_si128(u.v128[0], 12)}; break; - case 13: return {_mm_slli_si128(u.v128[0], 13)}; break; - case 14: return {_mm_slli_si128(u.v128[0], 14)}; break; - case 15: return {_mm_slli_si128(u.v128[0], 15)}; break; - case 16: return Zeroes(); break; - default: break; - } - return *this; -} - #ifdef HS_OPTIMIZE template <> really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const @@ -245,10 +399,24 @@ really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const { - return lshift128_var(N); + return vshl_128(N); } #endif +template<> +really_inline SuperVector<16> SuperVector<16>::Ones_vshr(uint8_t const N) +{ + if (N == 0) return Ones(); + else return Ones().vshr_128(N); +} + +template<> +really_inline SuperVector<16> SuperVector<16>::Ones_vshl(uint8_t const N) +{ + if (N == 0) return Ones(); + else return Ones().vshl_128(N); +} + template <> really_inline SuperVector<16> SuperVector<16>::loadu(void const *ptr) { @@ -266,9 +434,9 @@ really_inline SuperVector<16> SuperVector<16>::load(void const *ptr) template <> really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint8_t const len) { - SuperVector<16> mask = Ones().rshift128_var(16 -len); + SuperVector mask = Ones_vshr(16 -len); mask.print8("mask"); - SuperVector<16> v = _mm_loadu_si128((const m128 *)ptr); + SuperVector v = _mm_loadu_si128((const m128 *)ptr); v.print8("v"); return mask & v; } @@ -315,90 +483,10 @@ really_inline SuperVector<16> SuperVector<16>::pshufb(SuperVector<16> b) template<> really_inline SuperVector<16> SuperVector<16>::pshufb_maskz(SuperVector<16> b, uint8_t const len) { - SuperVector<16> mask = Ones().rshift128_var(16 -len); + SuperVector mask = Ones_vshr(16 -len); return mask & pshufb(b); } -#ifdef HS_OPTIMIZE -template<> -really_inline SuperVector<16> SuperVector<16>::lshift64(uint8_t const N) -{ - return {_mm_slli_epi64(u.v128[0], 
N)}; -} -#else -template<> -really_inline SuperVector<16> SuperVector<16>::lshift64(uint8_t const N) -{ - switch(N) { - case 0: return *this; break; - case 1: return {_mm_slli_epi64(u.v128[0], 1)}; break; - case 2: return {_mm_slli_epi64(u.v128[0], 2)}; break; - case 3: return {_mm_slli_epi64(u.v128[0], 3)}; break; - case 4: return {_mm_slli_epi64(u.v128[0], 4)}; break; - case 5: return {_mm_slli_epi64(u.v128[0], 5)}; break; - case 6: return {_mm_slli_epi64(u.v128[0], 6)}; break; - case 7: return {_mm_slli_epi64(u.v128[0], 7)}; break; - case 8: return {_mm_slli_epi64(u.v128[0], 8)}; break; - case 9: return {_mm_slli_epi64(u.v128[0], 9)}; break; - case 10: return {_mm_slli_epi64(u.v128[0], 10)}; break; - case 11: return {_mm_slli_epi64(u.v128[0], 11)}; break; - case 12: return {_mm_slli_epi64(u.v128[0], 12)}; break; - case 13: return {_mm_slli_epi64(u.v128[0], 13)}; break; - case 14: return {_mm_slli_epi64(u.v128[0], 14)}; break; - case 15: return {_mm_slli_epi64(u.v128[0], 15)}; break; - case 16: return Zeroes(); - default: break; - } - return *this; -} -#endif - -#ifdef HS_OPTIMIZE -template<> -really_inline SuperVector<16> SuperVector<16>::rshift64(uint8_t const N) -{ - return {_mm_srli_epi64(u.v128[0], N)}; -} -#else -template<> -really_inline SuperVector<16> SuperVector<16>::rshift64(uint8_t const N) -{ - switch(N) { - case 0: return {_mm_srli_epi64(u.v128[0], 0)}; break; - case 1: return {_mm_srli_epi64(u.v128[0], 1)}; break; - case 2: return {_mm_srli_epi64(u.v128[0], 2)}; break; - case 3: return {_mm_srli_epi64(u.v128[0], 3)}; break; - case 4: return {_mm_srli_epi64(u.v128[0], 4)}; break; - case 5: return {_mm_srli_epi64(u.v128[0], 5)}; break; - case 6: return {_mm_srli_epi64(u.v128[0], 6)}; break; - case 7: return {_mm_srli_epi64(u.v128[0], 7)}; break; - case 8: return {_mm_srli_epi64(u.v128[0], 8)}; break; - case 9: return {_mm_srli_epi64(u.v128[0], 9)}; break; - case 10: return {_mm_srli_epi64(u.v128[0], 10)}; break; - case 11: return 
{_mm_srli_epi64(u.v128[0], 11)}; break; - case 12: return {_mm_srli_epi64(u.v128[0], 12)}; break; - case 13: return {_mm_srli_epi64(u.v128[0], 13)}; break; - case 14: return {_mm_srli_epi64(u.v128[0], 14)}; break; - case 15: return {_mm_srli_epi64(u.v128[0], 15)}; break; - case 16: return Zeroes(); - default: break; - } - return *this; -} -#endif - -template<> -really_inline SuperVector<16> SuperVector<16>::lshift128(uint8_t const N) -{ - return *this << N; -} - -template<> -really_inline SuperVector<16> SuperVector<16>::rshift128(uint8_t const N) -{ - return *this >> N; -} - // 256-bit AVX2 implementation #if defined(HAVE_AVX2) template<> @@ -420,6 +508,20 @@ really_inline SuperVector<32>::SuperVector(m128 const v) u.v256[0] = _mm256_broadcastsi128_si256(v); }; +template<> +really_inline SuperVector<32>::SuperVector(m128 const lo, m128 const hi) +{ + u.v128[0] = lo; + u.v128[1] = hi; +}; + +template<> +really_inline SuperVector<32>::SuperVector(SuperVector<16> const lo, SuperVector<16> const hi) +{ + u.v128[0] = lo.u.v128[0]; + u.v128[1] = hi.u.v128[0]; +}; + template<> template<> really_inline SuperVector<32>::SuperVector(int8_t const other) @@ -537,45 +639,245 @@ really_inline typename SuperVector<32>::movemask_type SuperVector<32>::eqmask(Su return eq(b).movemask(); } + +// template <> +// template +// really_inline SuperVector<32> SuperVector<32>::vshl_8_imm() const +// { +// const uint8_t i = N; +// return {_mm256_slli_epi8(u.v256[0], i)}; +// } + template <> -really_inline SuperVector<32> SuperVector<32>::rshift128_var(uint8_t const N) const +template +really_inline SuperVector<32> SuperVector<32>::vshl_16_imm() const { - switch(N) { - case 1: return {_mm256_alignr_epi8(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), u.v256[0], 1)}; break; - case 2: return {_mm256_alignr_epi8(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), u.v256[0], 2)}; break; - case 3: return 
{_mm256_alignr_epi8(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), u.v256[0], 3)}; break; - case 4: return {_mm256_alignr_epi8(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), u.v256[0], 4)}; break; - case 5: return {_mm256_alignr_epi8(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), u.v256[0], 5)}; break; - case 6: return {_mm256_alignr_epi8(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), u.v256[0], 6)}; break; - case 7: return {_mm256_alignr_epi8(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), u.v256[0], 7)}; break; - case 8: return {_mm256_alignr_epi8(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), u.v256[0], 8)}; break; - case 9: return {_mm256_alignr_epi8(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), u.v256[0], 9)}; break; - case 10: return {_mm256_alignr_epi8(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), u.v256[0], 10)}; break; - case 11: return {_mm256_alignr_epi8(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), u.v256[0], 11)}; break; - case 12: return {_mm256_alignr_epi8(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), u.v256[0], 12)}; break; - case 13: return {_mm256_alignr_epi8(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), u.v256[0], 13)}; break; - case 14: return {_mm256_alignr_epi8(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), u.v256[0], 14)}; break; - case 15: return {_mm256_alignr_epi8(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), u.v256[0], 15)}; break; - case 16: return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1))}; break; - case 17: return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), 1)}; break; - case 18: return 
{_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), 2)}; break; - case 19: return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), 3)}; break; - case 20: return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), 4)}; break; - case 21: return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), 5)}; break; - case 22: return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), 6)}; break; - case 23: return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), 7)}; break; - case 24: return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), 8)}; break; - case 25: return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), 9)}; break; - case 26: return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), 10)}; break; - case 27: return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), 11)}; break; - case 28: return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), 12)}; break; - case 29: return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), 13)}; break; - case 30: return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), 14)}; break; - case 31: return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), 15)}; break; - case 32: return Zeroes(); break; - default: break; - } - return *this; + return {_mm256_slli_epi16(u.v256[0], N)}; +} + +template <> +template +really_inline SuperVector<32> SuperVector<32>::vshl_32_imm() const +{ + return {_mm256_slli_epi32(u.v256[0], N)}; +} + +template <> +template 
+really_inline SuperVector<32> SuperVector<32>::vshl_64_imm() const +{ + return {_mm256_slli_epi64(u.v256[0], N)}; +} + +template <> +template +really_inline SuperVector<32> SuperVector<32>::vshl_128_imm() const +{ + return {_mm256_slli_si256(u.v256[0], N)}; +} + +template <> +template +really_inline SuperVector<32> SuperVector<32>::vshl_imm() const +{ + return vshl_256_imm(); +} + +// template <> +// template +// really_inline SuperVector<32> SuperVector<32>::vshr_8_imm() const +// { +// return {_mm256_srli_epi8(u.v256[0], N)}; +// } + +template <> +template +really_inline SuperVector<32> SuperVector<32>::vshr_16_imm() const +{ + return {_mm256_srli_epi16(u.v256[0], N)}; +} + +template <> +template +really_inline SuperVector<32> SuperVector<32>::vshr_32_imm() const +{ + return {_mm256_srli_epi32(u.v256[0], N)}; +} + +template <> +template +really_inline SuperVector<32> SuperVector<32>::vshr_64_imm() const +{ + return {_mm256_srli_epi64(u.v256[0], N)}; +} + +template <> +template +really_inline SuperVector<32> SuperVector<32>::vshr_128_imm() const +{ + return {_mm256_srli_si256(u.v256[0], N)}; +} + +template <> +template +really_inline SuperVector<32> SuperVector<32>::vshr_imm() const +{ + return vshr_256_imm(); +} + +template SuperVector<32> SuperVector<32>::vshl_16_imm<1>() const; +template SuperVector<32> SuperVector<32>::vshl_64_imm<1>() const; +template SuperVector<32> SuperVector<32>::vshl_64_imm<4>() const; +template SuperVector<32> SuperVector<32>::vshl_128_imm<1>() const; +template SuperVector<32> SuperVector<32>::vshl_128_imm<4>() const; +template SuperVector<32> SuperVector<32>::vshr_16_imm<1>() const; +template SuperVector<32> SuperVector<32>::vshr_64_imm<1>() const; +template SuperVector<32> SuperVector<32>::vshr_64_imm<4>() const; +template SuperVector<32> SuperVector<32>::vshr_128_imm<1>() const; +template SuperVector<32> SuperVector<32>::vshr_128_imm<4>() const; + +// template <> +// really_inline SuperVector<16> SuperVector<16>::vshl_8 (uint8_t 
const N) const +// { +// Unroller<0, 15>::iterator([&,v=this](int i) { if (N == i) return {_mm256_slli_epi8(v->u.v256[0], i)}; }); +// if (N == 16) return Zeroes(); +// } + +template <> +really_inline SuperVector<32> SuperVector<32>::vshl_16 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 32) return Zeroes(); + SuperVector result; + Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm256_slli_epi16(v->u.v256[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<32> SuperVector<32>::vshl_32 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 32) return Zeroes(); + SuperVector result; + Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm256_slli_epi32(v->u.v256[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<32> SuperVector<32>::vshl_64 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 32) return Zeroes(); + SuperVector result; + Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm256_slli_epi64(v->u.v256[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<32> SuperVector<32>::vshl_128(uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 32) return Zeroes(); + SuperVector result; + Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm256_slli_si256(v->u.v256[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<32> SuperVector<32>::vshl_256(uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 16) return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0))}; + if (N == 32) return Zeroes(); + SuperVector result; + Unroller<1, 16>::iterator([&,v=this](auto const i) { + constexpr uint8_t n = i.value; + if (N == n) result = {_mm256_alignr_epi8(u.v256[0], 
_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 16 - n)};; + }); + Unroller<17, 32>::iterator([&,v=this](auto const i) { + constexpr uint8_t n = i.value; + if (N == n) result = {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), n - 16)}; + }); + return result; +} + +template <> +really_inline SuperVector<32> SuperVector<32>::vshl(uint8_t const N) const +{ + return vshl_256(N); +} + +// template <> +// really_inline SuperVector<16> SuperVector<16>::vshr_8 (uint8_t const N) const +// { +// SuperVector<16> result; +// Unroller<0, 15>::iterator([&,v=this](uint8_t const i) { if (N == i) result = {_mm_srli_epi8(v->u.v128[0], i)}; }); +// if (N == 16) result = Zeroes(); +// return result; +// } + +template <> +really_inline SuperVector<32> SuperVector<32>::vshr_16 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 32) return Zeroes(); + SuperVector result; + Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm256_srli_epi16(v->u.v256[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<32> SuperVector<32>::vshr_32 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 32) return Zeroes(); + SuperVector result; + Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm256_srli_epi32(v->u.v256[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<32> SuperVector<32>::vshr_64 (uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 32) return Zeroes(); + SuperVector result; + Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm256_srli_epi64(v->u.v256[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<32> SuperVector<32>::vshr_128(uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 32) return Zeroes(); + SuperVector result; + 
Unroller<1, 32>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {_mm256_srli_si256(v->u.v256[0], n)}; }); + return result; +} + +template <> +really_inline SuperVector<32> SuperVector<32>::vshr_256(uint8_t const N) const +{ + if (N == 0) return *this; + if (N == 16) return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1))}; + if (N == 32) return Zeroes(); + SuperVector result; + Unroller<1, 16>::iterator([&,v=this](auto const i) { + constexpr uint8_t n = i.value; + if (N == n) result = {_mm256_alignr_epi8(_mm256_permute2x128_si256(v->u.v256[0], v->u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), v->u.v256[0], n)}; + }); + Unroller<17, 32>::iterator([&,v=this](auto const i) { + constexpr uint8_t n = i.value; + if (N == n) result = {_mm256_srli_si256(_mm256_permute2x128_si256(v->u.v256[0], v->u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), n - 16)}; + }); + return result; +} + +template <> +really_inline SuperVector<32> SuperVector<32>::vshr(uint8_t const N) const +{ + return vshr_256(N); } #ifdef HS_OPTIMIZE @@ -595,51 +897,10 @@ really_inline SuperVector<32> SuperVector<32>::operator>>(uint8_t const N) const template <> really_inline SuperVector<32> SuperVector<32>::operator>>(uint8_t const N) const { - return rshift128_var(N); + return vshr_256(N); } #endif -template <> -really_inline SuperVector<32> SuperVector<32>::lshift128_var(uint8_t const N) const -{ - switch(N) { - case 1: return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 15)}; break; - case 2: return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 14)}; break; - case 3: return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 13)}; break; - case 4: return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 12)}; break; - case 5: return 
{_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 11)}; break; - case 6: return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 10)}; break; - case 7: return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 9)}; break; - case 8: return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 8)}; break; - case 9: return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 7)}; break; - case 10: return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 6)}; break; - case 11: return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 5)}; break; - case 12: return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 4)}; break; - case 13: return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 3)}; break; - case 14: return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 2)}; break; - case 15: return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 1)}; break; - case 16: return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0))}; break; - case 17: return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 1)}; break; - case 18: return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 2)}; break; - case 19: return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 3)}; break; - case 20: return 
{_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 4)}; break; - case 21: return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 5)}; break; - case 22: return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 6)}; break; - case 23: return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 7)}; break; - case 24: return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 8)}; break; - case 25: return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 9)}; break; - case 26: return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 10)}; break; - case 27: return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 11)}; break; - case 28: return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 12)}; break; - case 29: return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 13)}; break; - case 30: return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 14)}; break; - case 31: return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 15)}; break; - case 32: return Zeroes(); break; - default: break; - } - return *this; -} - #ifdef HS_OPTIMIZE template <> really_inline SuperVector<32> SuperVector<32>::operator<<(uint8_t const N) const @@ -657,10 +918,30 @@ really_inline SuperVector<32> SuperVector<32>::operator<<(uint8_t const N) const template <> really_inline SuperVector<32> SuperVector<32>::operator<<(uint8_t const N) const { - return lshift128_var(N); + return vshl_256(N); } #endif +template<> +really_inline SuperVector<32> SuperVector<32>::Ones_vshr(uint8_t const N) +{ 
+ if (N == 0) return Ones(); + if (N >= 16) + return {SuperVector<16>::Ones_vshr(N - 16), SuperVector<16>::Zeroes()}; + else + return {SuperVector<16>::Ones(), SuperVector<16>::Ones_vshr(N)}; +} + +template<> +really_inline SuperVector<32> SuperVector<32>::Ones_vshl(uint8_t const N) +{ + if (N == 0) return Ones(); + if (N >= 16) + return {SuperVector<16>::Zeroes(), SuperVector<16>::Ones_vshl(N - 16)}; + else + return {SuperVector<16>::Ones_vshl(N), SuperVector<16>::Ones()}; +} + template <> really_inline SuperVector<32> SuperVector<32>::loadu(void const *ptr) { @@ -678,14 +959,22 @@ really_inline SuperVector<32> SuperVector<32>::load(void const *ptr) template <> really_inline SuperVector<32> SuperVector<32>::loadu_maskz(void const *ptr, uint8_t const len) { - SuperVector<32> mask = Ones().rshift128_var(32 -len); +#ifdef HAVE_AVX512 + u32 mask = (~0ULL) >> (32 - len); + SuperVector<32> v = _mm256_mask_loadu_epi8(Zeroes().u.v256[0], mask, (const m256 *)ptr); + v.print8("v"); + return v; +#else + DEBUG_PRINTF("len = %d", len); + SuperVector<32> mask = Ones_vshr(32 -len); mask.print8("mask"); + (Ones() >> (32 - len)).print8("mask"); SuperVector<32> v = _mm256_loadu_si256((const m256 *)ptr); v.print8("v"); return mask & v; +#endif } - #ifdef HS_OPTIMIZE template<> really_inline SuperVector<32> SuperVector<32>::alignr(SuperVector<32> &other, int8_t offset) @@ -736,7 +1025,6 @@ really_inline SuperVector<32> SuperVector<32>::alignr(SuperVector<32> &other, in } #endif - template<> really_inline SuperVector<32> SuperVector<32>::pshufb(SuperVector<32> b) { @@ -746,208 +1034,10 @@ really_inline SuperVector<32> SuperVector<32>::pshufb(SuperVector<32> b) template<> really_inline SuperVector<32> SuperVector<32>::pshufb_maskz(SuperVector<32> b, uint8_t const len) { - SuperVector<32> mask = Ones().rshift128_var(32 -len); + SuperVector<32> mask = Ones_vshr(32 -len); return mask & pshufb(b); } -#ifdef HS_OPTIMIZE -template<> -really_inline SuperVector<32> 
SuperVector<32>::lshift64(uint8_t const N) -{ - return {_mm256_slli_epi64(u.v256[0], N)}; -} -#else -template<> -really_inline SuperVector<32> SuperVector<32>::lshift64(uint8_t const N) -{ - switch(N) { - case 0: return *this; break; - case 1: return {_mm256_slli_epi64(u.v256[0], 1)}; break; - case 2: return {_mm256_slli_epi64(u.v256[0], 2)}; break; - case 3: return {_mm256_slli_epi64(u.v256[0], 3)}; break; - case 4: return {_mm256_slli_epi64(u.v256[0], 4)}; break; - case 5: return {_mm256_slli_epi64(u.v256[0], 5)}; break; - case 6: return {_mm256_slli_epi64(u.v256[0], 6)}; break; - case 7: return {_mm256_slli_epi64(u.v256[0], 7)}; break; - case 8: return {_mm256_slli_epi64(u.v256[0], 8)}; break; - case 9: return {_mm256_slli_epi64(u.v256[0], 9)}; break; - case 10: return {_mm256_slli_epi64(u.v256[0], 10)}; break; - case 11: return {_mm256_slli_epi64(u.v256[0], 11)}; break; - case 12: return {_mm256_slli_epi64(u.v256[0], 12)}; break; - case 13: return {_mm256_slli_epi64(u.v256[0], 13)}; break; - case 14: return {_mm256_slli_epi64(u.v256[0], 14)}; break; - case 15: return {_mm256_slli_epi64(u.v256[0], 15)}; break; - case 16: return {_mm256_slli_epi64(u.v256[0], 16)}; break; - case 17: return {_mm256_slli_epi64(u.v256[0], 17)}; break; - case 18: return {_mm256_slli_epi64(u.v256[0], 18)}; break; - case 19: return {_mm256_slli_epi64(u.v256[0], 19)}; break; - case 20: return {_mm256_slli_epi64(u.v256[0], 20)}; break; - case 21: return {_mm256_slli_epi64(u.v256[0], 21)}; break; - case 22: return {_mm256_slli_epi64(u.v256[0], 22)}; break; - case 23: return {_mm256_slli_epi64(u.v256[0], 23)}; break; - case 24: return {_mm256_slli_epi64(u.v256[0], 24)}; break; - case 25: return {_mm256_slli_epi64(u.v256[0], 25)}; break; - case 26: return {_mm256_slli_epi64(u.v256[0], 26)}; break; - case 27: return {_mm256_slli_epi64(u.v256[0], 27)}; break; - case 28: return {_mm256_slli_epi64(u.v256[0], 28)}; break; - case 29: return {_mm256_slli_epi64(u.v256[0], 29)}; break; - case 30: 
return {_mm256_slli_epi64(u.v256[0], 30)}; break; - case 31: return {_mm256_slli_epi64(u.v256[0], 31)}; break; - case 32: return Zeroes(); - default: break; - } - return *this; -} -#endif - -#ifdef HS_OPTIMIZE -template<> -really_inline SuperVector<32> SuperVector<32>::rshift64(uint8_t const N) -{ - return {_mm256_srli_epi64(u.v256[0], N)}; -} -#else -template<> -really_inline SuperVector<32> SuperVector<32>::rshift64(uint8_t const N) -{ - switch(N) { - case 0: return *this; break; - case 1: return {_mm256_srli_epi64(u.v256[0], 1)}; break; - case 2: return {_mm256_srli_epi64(u.v256[0], 2)}; break; - case 3: return {_mm256_srli_epi64(u.v256[0], 3)}; break; - case 4: return {_mm256_srli_epi64(u.v256[0], 4)}; break; - case 5: return {_mm256_srli_epi64(u.v256[0], 5)}; break; - case 6: return {_mm256_srli_epi64(u.v256[0], 6)}; break; - case 7: return {_mm256_srli_epi64(u.v256[0], 7)}; break; - case 8: return {_mm256_srli_epi64(u.v256[0], 8)}; break; - case 9: return {_mm256_srli_epi64(u.v256[0], 9)}; break; - case 10: return {_mm256_srli_epi64(u.v256[0], 10)}; break; - case 11: return {_mm256_srli_epi64(u.v256[0], 11)}; break; - case 12: return {_mm256_srli_epi64(u.v256[0], 12)}; break; - case 13: return {_mm256_srli_epi64(u.v256[0], 13)}; break; - case 14: return {_mm256_srli_epi64(u.v256[0], 14)}; break; - case 15: return {_mm256_srli_epi64(u.v256[0], 15)}; break; - case 16: return {_mm256_srli_epi64(u.v256[0], 16)}; break; - case 17: return {_mm256_srli_epi64(u.v256[0], 17)}; break; - case 18: return {_mm256_srli_epi64(u.v256[0], 18)}; break; - case 19: return {_mm256_srli_epi64(u.v256[0], 19)}; break; - case 20: return {_mm256_srli_epi64(u.v256[0], 20)}; break; - case 21: return {_mm256_srli_epi64(u.v256[0], 21)}; break; - case 22: return {_mm256_srli_epi64(u.v256[0], 22)}; break; - case 23: return {_mm256_srli_epi64(u.v256[0], 23)}; break; - case 24: return {_mm256_srli_epi64(u.v256[0], 24)}; break; - case 25: return {_mm256_srli_epi64(u.v256[0], 25)}; break; - 
case 26: return {_mm256_srli_epi64(u.v256[0], 26)}; break; - case 27: return {_mm256_srli_epi64(u.v256[0], 27)}; break; - case 28: return {_mm256_srli_epi64(u.v256[0], 28)}; break; - case 29: return {_mm256_srli_epi64(u.v256[0], 29)}; break; - case 30: return {_mm256_srli_epi64(u.v256[0], 30)}; break; - case 31: return {_mm256_srli_epi64(u.v256[0], 31)}; break; - case 32: return Zeroes(); - default: break; - } - return *this; -} -#endif - -#ifdef HS_OPTIMIZE -template<> -really_inline SuperVector<32> SuperVector<32>::lshift128(uint8_t const N) -{ - return {_mm256_slli_si256(u.v256[0], N)}; -} -#else -template<> -really_inline SuperVector<32> SuperVector<32>::lshift128(uint8_t const N) -{ - switch(N) { - case 0: return *this; break; - case 1: return {_mm256_slli_si256(u.v256[0], 1)}; break; - case 2: return {_mm256_slli_si256(u.v256[0], 2)}; break; - case 3: return {_mm256_slli_si256(u.v256[0], 3)}; break; - case 4: return {_mm256_slli_si256(u.v256[0], 4)}; break; - case 5: return {_mm256_slli_si256(u.v256[0], 5)}; break; - case 6: return {_mm256_slli_si256(u.v256[0], 6)}; break; - case 7: return {_mm256_slli_si256(u.v256[0], 7)}; break; - case 8: return {_mm256_slli_si256(u.v256[0], 8)}; break; - case 9: return {_mm256_slli_si256(u.v256[0], 9)}; break; - case 10: return {_mm256_slli_si256(u.v256[0], 10)}; break; - case 11: return {_mm256_slli_si256(u.v256[0], 11)}; break; - case 12: return {_mm256_slli_si256(u.v256[0], 12)}; break; - case 13: return {_mm256_slli_si256(u.v256[0], 13)}; break; - case 14: return {_mm256_slli_si256(u.v256[0], 14)}; break; - case 15: return {_mm256_slli_si256(u.v256[0], 15)}; break; - case 16: return {_mm256_slli_si256(u.v256[0], 16)}; break; - case 17: return {_mm256_slli_si256(u.v256[0], 17)}; break; - case 18: return {_mm256_slli_si256(u.v256[0], 18)}; break; - case 19: return {_mm256_slli_si256(u.v256[0], 19)}; break; - case 20: return {_mm256_slli_si256(u.v256[0], 20)}; break; - case 21: return {_mm256_slli_si256(u.v256[0], 21)}; 
break; - case 22: return {_mm256_slli_si256(u.v256[0], 22)}; break; - case 23: return {_mm256_slli_si256(u.v256[0], 23)}; break; - case 24: return {_mm256_slli_si256(u.v256[0], 24)}; break; - case 25: return {_mm256_slli_si256(u.v256[0], 25)}; break; - case 26: return {_mm256_slli_si256(u.v256[0], 26)}; break; - case 27: return {_mm256_slli_si256(u.v256[0], 27)}; break; - case 28: return {_mm256_slli_si256(u.v256[0], 28)}; break; - case 29: return {_mm256_slli_si256(u.v256[0], 29)}; break; - case 30: return {_mm256_slli_si256(u.v256[0], 30)}; break; - case 31: return {_mm256_slli_si256(u.v256[0], 31)}; break; - default: break; - } - return *this; -} -#endif - -#ifdef HS_OPTIMIZE -template<> -really_inline SuperVector<32> SuperVector<32>::rshift128(uint8_t const N) -{ - return {_mm256_srli_si256(u.v256[0], N)}; -} -#else -template<> -really_inline SuperVector<32> SuperVector<32>::rshift128(uint8_t const N) -{ - switch(N) { - case 0: return *this; break; - case 1: return {_mm256_srli_si256(u.v256[0], 1)}; break; - case 2: return {_mm256_srli_si256(u.v256[0], 2)}; break; - case 3: return {_mm256_srli_si256(u.v256[0], 3)}; break; - case 4: return {_mm256_srli_si256(u.v256[0], 4)}; break; - case 5: return {_mm256_srli_si256(u.v256[0], 5)}; break; - case 6: return {_mm256_srli_si256(u.v256[0], 6)}; break; - case 7: return {_mm256_srli_si256(u.v256[0], 7)}; break; - case 8: return {_mm256_srli_si256(u.v256[0], 8)}; break; - case 9: return {_mm256_srli_si256(u.v256[0], 9)}; break; - case 10: return {_mm256_srli_si256(u.v256[0], 10)}; break; - case 11: return {_mm256_srli_si256(u.v256[0], 11)}; break; - case 12: return {_mm256_srli_si256(u.v256[0], 12)}; break; - case 13: return {_mm256_srli_si256(u.v256[0], 13)}; break; - case 14: return {_mm256_srli_si256(u.v256[0], 14)}; break; - case 15: return {_mm256_srli_si256(u.v256[0], 15)}; break; - case 16: return {_mm256_srli_si256(u.v256[0], 16)}; break; - case 17: return {_mm256_srli_si256(u.v256[0], 17)}; break; - case 18: 
return {_mm256_srli_si256(u.v256[0], 18)}; break; - case 19: return {_mm256_srli_si256(u.v256[0], 19)}; break; - case 20: return {_mm256_srli_si256(u.v256[0], 20)}; break; - case 21: return {_mm256_srli_si256(u.v256[0], 21)}; break; - case 22: return {_mm256_srli_si256(u.v256[0], 22)}; break; - case 23: return {_mm256_srli_si256(u.v256[0], 23)}; break; - case 24: return {_mm256_srli_si256(u.v256[0], 24)}; break; - case 25: return {_mm256_srli_si256(u.v256[0], 25)}; break; - case 26: return {_mm256_srli_si256(u.v256[0], 26)}; break; - case 27: return {_mm256_srli_si256(u.v256[0], 27)}; break; - case 28: return {_mm256_srli_si256(u.v256[0], 28)}; break; - case 29: return {_mm256_srli_si256(u.v256[0], 29)}; break; - case 30: return {_mm256_srli_si256(u.v256[0], 30)}; break; - case 31: return {_mm256_srli_si256(u.v256[0], 31)}; break; - default: break; - } - return *this; -} -#endif - #endif // HAVE_AVX2 diff --git a/src/util/supervector/supervector.hpp b/src/util/supervector/supervector.hpp index e834fef0..718cd0f6 100644 --- a/src/util/supervector/supervector.hpp +++ b/src/util/supervector/supervector.hpp @@ -217,15 +217,63 @@ public: SuperVector pshufb(SuperVector b); SuperVector pshufb_maskz(SuperVector b, uint8_t const len); - SuperVector lshift64(uint8_t const N); - SuperVector rshift64(uint8_t const N); - SuperVector lshift128(uint8_t const N); - SuperVector rshift128(uint8_t const N); - SuperVector lshift128_var(uint8_t const N) const; - SuperVector rshift128_var(uint8_t const N) const; + + // Shift instructions + template + SuperVector vshl_8_imm() const; + template + SuperVector vshl_16_imm() const; + template + SuperVector vshl_32_imm() const; + template + SuperVector vshl_64_imm() const; + template + SuperVector vshl_128_imm() const; + #if defined(HAVE_SIMD_256_BITS) + template + SuperVector vshl_256_imm() const; + #endif + template + SuperVector vshl_imm() const; + template + SuperVector vshr_8_imm() const; + template + SuperVector vshr_16_imm() const; + 
template + SuperVector vshr_32_imm() const; + template + SuperVector vshr_64_imm() const; + template + SuperVector vshr_128_imm() const; + #if defined(HAVE_SIMD_256_BITS) + template + SuperVector vshr_256_imm() const; + #endif + template + SuperVector vshr_imm() const; + SuperVector vshl_8 (uint8_t const N) const; + SuperVector vshl_16 (uint8_t const N) const; + SuperVector vshl_32 (uint8_t const N) const; + SuperVector vshl_64 (uint8_t const N) const; + SuperVector vshl_128(uint8_t const N) const; + #if defined(HAVE_SIMD_256_BITS) + SuperVector vshl_256(uint8_t const N) const; + #endif + SuperVector vshl (uint8_t const N) const; + SuperVector vshr_8 (uint8_t const N) const; + SuperVector vshr_16 (uint8_t const N) const; + SuperVector vshr_32 (uint8_t const N) const; + SuperVector vshr_64 (uint8_t const N) const; + SuperVector vshr_128(uint8_t const N) const; + #if defined(HAVE_SIMD_256_BITS) + SuperVector vshr_256(uint8_t const N) const; + #endif + SuperVector vshr (uint8_t const N) const; // Constants static SuperVector Ones(); + static SuperVector Ones_vshr(uint8_t const N); + static SuperVector Ones_vshl(uint8_t const N); static SuperVector Zeroes(); #if defined(DEBUG) @@ -264,6 +312,25 @@ public: #endif }; +template +struct Unroller +{ + template + static void iterator(Action &&action) + { + action(std::integral_constant()); + Unroller::iterator(action); + } +}; + +template +struct Unroller +{ + template + static void iterator(Action &&action UNUSED) + {} +}; + #if defined(HS_OPTIMIZE) #if defined(ARCH_IA32) || defined(ARCH_X86_64) #include "util/supervector/arch/x86/impl.cpp" diff --git a/unit/internal/supervector.cpp b/unit/internal/supervector.cpp index 68fda015..8b6830f0 100644 --- a/unit/internal/supervector.cpp +++ b/unit/internal/supervector.cpp @@ -230,7 +230,7 @@ TEST(SuperVectorUtilsTest,LShift64_128c){ u64a vec[2] = {128, 512}; auto SP = SuperVector<16>::loadu(vec); for(int s = 0; s<16; s++) { - auto SP_after_shift = SP.lshift64(s); + auto 
SP_after_shift = SP.vshl_64(s); for (int i=0; i<2; i++) { ASSERT_EQ(SP_after_shift.u.u64[i], vec[i] << s); } @@ -241,7 +241,7 @@ TEST(SuperVectorUtilsTest,RShift64_128c){ u64a vec[2] = {128, 512}; auto SP = SuperVector<16>::loadu(vec); for(int s = 0; s<16; s++) { - auto SP_after_shift = SP.rshift64(s); + auto SP_after_shift = SP.vshr_64(s); for (int i=0; i<2; i++) { ASSERT_EQ(SP_after_shift.u.u64[i], vec[i] >> s); } @@ -293,7 +293,7 @@ TEST(SuperVectorUtilsTest,pshufb128c) { /*Define LSHIFT128_128 macro*/ #define TEST_LSHIFT128_128(buf, vec, v, l) { \ - auto v_shifted = v.lshift128(l); \ + auto v_shifted = v.vshl_128(l); \ for (int i=15; i>= l; --i) { \ buf[i] = vec[i-l]; \ } \ @@ -317,7 +317,7 @@ TEST(SuperVectorUtilsTest,LShift128_128c){ /*Define RSHIFT128_128 macro*/ #define TEST_RSHIFT128_128(buf, vec, v, l) { \ - auto v_shifted = v.rshift128(l); \ + auto v_shifted = v.vshr_128(l); \ for (int i=0; i<16-l; i++) { \ buf[i] = vec[i+l]; \ } \ @@ -578,7 +578,7 @@ TEST(SuperVectorUtilsTest,LShift64_256c){ u64a vec[4] = {128, 512, 256, 1024}; auto SP = SuperVector<32>::loadu(vec); for(int s = 0; s<32; s++) { - auto SP_after_shift = SP.lshift64(s); + auto SP_after_shift = SP.vshl_64(s); for (int i=0; i<4; i++) { ASSERT_EQ(SP_after_shift.u.u64[i], vec[i] << s); } @@ -589,7 +589,7 @@ TEST(SuperVectorUtilsTest,RShift64_256c){ u64a vec[4] = {128, 512, 256, 1024}; auto SP = SuperVector<32>::loadu(vec); for(int s = 0; s<32; s++) { - auto SP_after_shift = SP.rshift64(s); + auto SP_after_shift = SP.vshr_64(s); for (int i=0; i<4; i++) { ASSERT_EQ(SP_after_shift.u.u64[i], vec[i] >> s); } @@ -627,7 +627,7 @@ TEST(SuperVectorUtilsTest,RShift256c){ /*Define LSHIFT128_256 macro*/ #define TEST_LSHIFT128_256(buf, vec, v, l) { \ - auto v_shifted = v.lshift128(l); \ + auto v_shifted = v.vshl_128(l); \ for (int i=15; i>= l; --i) { \ buf[i] = vec[i-l]; \ buf[i+16] = vec[(16+i)-l]; \ @@ -653,7 +653,7 @@ TEST(SuperVectorUtilsTest,LShift128_256c){ /*Define RSHIFT128_128 macro*/ #define 
TEST_RSHIFT128_256(buf, vec, v, l) { \ - auto v_shifted = v.rshift128(l); \ + auto v_shifted = v.vshr_128(l); \ for (int i=0; i<16-l; i++) { \ buf[i] = vec[i+l]; \ buf[i+16] = vec[(i+16)+l]; \ @@ -966,7 +966,7 @@ TEST(SuperVectorUtilsTest,RShift512c){ /*Define RSHIFT128_512 macro*/ #define TEST_RSHIFT128_512(buf, vec, v, l) { \ - auto v_shifted = v.rshift128(l); \ + auto v_shifted = v.vshr_128(l); \ for (int i=0; i<16-l; i++) { \ buf[i] = vec[i+l]; \ buf[i+16] = vec[(i+16)+l]; \ @@ -995,7 +995,7 @@ TEST(SuperVectorUtilsTest,RShift128_512c){ /*Define LSHIFT512 macro*/ #define TEST_LSHIFT128_512(buf, vec, v, l) { \ - auto v_shifted = v.lshift128(l); \ + auto v_shifted = v.vshl_128(l); \ for (int i=15; i>=l; --i) { \ buf[i] = vec[i-l]; \ buf[i+16] = vec[(i+16)-l]; \