From 67fa6d2738f8dbb15e1438ed05a4f73c076f6b84 Mon Sep 17 00:00:00 2001 From: apostolos Date: Wed, 28 Jul 2021 12:55:32 +0300 Subject: [PATCH] alignr methods for avx2 and avx512 added --- src/util/supervector/arch/x86/impl.cpp | 106 +++++++++++++------------ unit/internal/supervector.cpp | 97 ++++------------------ 2 files changed, 71 insertions(+), 132 deletions(-) diff --git a/src/util/supervector/arch/x86/impl.cpp b/src/util/supervector/arch/x86/impl.cpp index 3c305d4b..26e45909 100644 --- a/src/util/supervector/arch/x86/impl.cpp +++ b/src/util/supervector/arch/x86/impl.cpp @@ -685,6 +685,7 @@ really_inline SuperVector<32> SuperVector<32>::loadu_maskz(void const *ptr, uint return mask & v; } + #ifdef HS_OPTIMIZE template<> really_inline SuperVector<32> SuperVector<32>::alignr(SuperVector<32> &other, int8_t offset) @@ -695,45 +696,47 @@ really_inline SuperVector<32> SuperVector<32>::alignr(SuperVector<32> &other, in template<> really_inline SuperVector<32> SuperVector<32>::alignr(SuperVector<32> &other, int8_t offset) { - switch(offset) { - case 0: return other; break; - case 1: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 1)}; break; - case 2: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 2)}; break; - case 3: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 3)}; break; - case 4: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 4)}; break; - case 5: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 5)}; break; - case 6: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 6)}; break; - case 7: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 7)}; break; - case 8: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 8)}; break; - case 9: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 9)}; break; - case 10: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 10)}; break; - case 11: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 11)}; break; - case 12: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 12)}; break; - case 13: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 13)}; break; - case 14: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 14)}; break; - case 15: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 15)}; break; - case 16: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 16)}; break; - case 17: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 17)}; break; - case 18: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 18)}; break; - case 19: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 19)}; break; - case 20: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 20)}; break; - case 21: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 21)}; break; - case 22: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 22)}; break; - case 23: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 23)}; break; - case 24: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 24)}; break; - case 25: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 25)}; break; - case 26: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 26)}; break; - case 27: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 27)}; break; - case 28: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 28)}; break; - case 29: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 39)}; break; - case 30: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 30)}; break; - case 31: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 31)}; break; + // As found here: https://stackoverflow.com/questions/8517970/mm-alignr-epi8-palignr-equivalent-in-avx2#8637458 + switch (offset){ + case 0 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 0), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 0)); break; + case 1 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 1), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 1)); break; + case 2 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 2), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 2)); break; + case 3 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 3), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 3)); break; + case 4 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 4), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 4)); break; + case 5 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 5), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 5)); break; + case 6 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 6), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 6)); break; + case 7 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 7), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 7)); break; + case 8 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 8), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 8)); break; + case 9 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 9), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 9)); break; + case 10 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 10), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 10)); break; + case 11 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 11), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 11)); break; + case 12 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 12), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 12)); break; + case 13 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 13), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 13)); break; + case 14 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 14), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 14)); break; + case 15 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[0], other.u.v128[1], 15), _mm_alignr_epi8(other.u.v128[1], other.u.v128[0], 15)); break; + case 16 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 0), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 0)); break; + case 17 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 1), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 1)); break; + case 18 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 2), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 2)); break; + case 19 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 3), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 3)); break; + case 20 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 4), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 4)); break; + case 21 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 5), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 5)); break; + case 22 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 6), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 6)); break; + case 23 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 7), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 7)); break; + case 24 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 8), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 8)); break; + case 25 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 9), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 9)); break; + case 26 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 10), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 10)); break; + case 27 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 11), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 11)); break; + case 28 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 12), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 12)); break; + case 29 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 13), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 13)); break; + case 30 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 14), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 14)); break; + case 31 : return _mm256_set_m128i(_mm_alignr_epi8(u.v128[1], u.v128[0], 15), _mm_alignr_epi8(u.v128[0], other.u.v128[1], 15)); break; default: break; } return *this; } #endif + template<> really_inline SuperVector<32> SuperVector<32>::pshufb(SuperVector<32> b) { @@ -1208,26 +1211,25 @@ really_inline SuperVector<64> SuperVector<64>::alignr(SuperVector<64> &l, int8_t template<> really_inline SuperVector<64> SuperVector<64>::alignr(SuperVector<64> &l, int8_t offset) { - switch(offset) { - case 0: return {_mm512_alignr_epi8(u.v512[0], l.u.v512[0], 0)};; break; - case 1: return {_mm512_alignr_epi8(u.v512[0], l.u.v512[0], 1)}; break; - case 2: return {_mm512_alignr_epi8(u.v512[0], l.u.v512[0], 2)}; break; - case 3: return {_mm512_alignr_epi8(u.v512[0], l.u.v512[0], 3)}; break; - case 4: return {_mm512_alignr_epi8(u.v512[0], l.u.v512[0], 4)}; break; - case 5: return {_mm512_alignr_epi8(u.v512[0], l.u.v512[0], 5)}; break; - case 6: return {_mm512_alignr_epi8(u.v512[0], l.u.v512[0], 6)}; break; - case 7: return {_mm512_alignr_epi8(u.v512[0], l.u.v512[0], 7)}; break; - case 8: return {_mm512_alignr_epi8(u.v512[0], l.u.v512[0], 8)}; break; - case 9: return {_mm512_alignr_epi8(u.v512[0], l.u.v512[0], 9)}; break; - case 10: return {_mm512_alignr_epi8(u.v512[0], l.u.v512[0], 10)}; break; - case 11: return {_mm512_alignr_epi8(u.v512[0], l.u.v512[0], 11)}; break; - case 12: return {_mm512_alignr_epi8(u.v512[0], l.u.v512[0], 12)}; break; - case 13: return {_mm512_alignr_epi8(u.v512[0], l.u.v512[0], 13)}; break; - case 14: return {_mm512_alignr_epi8(u.v512[0], l.u.v512[0], 14)}; break; - case 15: return {_mm512_alignr_epi8(u.v512[0], l.u.v512[0], 15)}; break; - default: break; + if(offset == 0){ + return *this; + } else if (offset < 32){ + SuperVector<32> lo256 = u.v256[0]; + SuperVector<32> hi256 = u.v256[1]; + SuperVector<32> o_lo256 = l.u.v256[0]; + SuperVector<32> carry1 = hi256.alignr(lo256,offset); + SuperVector<32> carry2 = o_lo256.alignr(hi256,offset); + return SuperVector(carry1, carry2); + } else if (offset <= 64){ + SuperVector<32> hi256 = u.v256[1]; + SuperVector<32> o_lo256 = l.u.v256[0]; + SuperVector<32> o_hi256 = l.u.v256[1]; + SuperVector<32> carry1 = o_lo256.alignr(hi256, offset - 32); + SuperVector<32> carry2 = o_hi256.alignr(o_lo256,offset -32); + return SuperVector(carry1, carry2); + } else { + return *this; } - return *this; } #endif diff --git a/unit/internal/supervector.cpp b/unit/internal/supervector.cpp index e85d815e..8ea30f85 100644 --- a/unit/internal/supervector.cpp +++ b/unit/internal/supervector.cpp @@ -354,23 +354,9 @@ TEST(SuperVectorUtilsTest,Alignr128c){ } auto SP1 = SuperVector<16>::loadu(vec); auto SP2 = SuperVector<16>::loadu(vec+16); - TEST_ALIGNR128(SP1, SP2, vec, 0); - TEST_ALIGNR128(SP1, SP2, vec, 1); - TEST_ALIGNR128(SP1, SP2, vec, 2); - TEST_ALIGNR128(SP1, SP2, vec, 3); - TEST_ALIGNR128(SP1, SP2, vec, 4); - TEST_ALIGNR128(SP1, SP2, vec, 5); - TEST_ALIGNR128(SP1, SP2, vec, 6); - TEST_ALIGNR128(SP1, SP2, vec, 7); - TEST_ALIGNR128(SP1, SP2, vec, 8); - TEST_ALIGNR128(SP1, SP2, vec, 9); - TEST_ALIGNR128(SP1, SP2, vec, 10); - TEST_ALIGNR128(SP1, SP2, vec, 11); - TEST_ALIGNR128(SP1, SP2, vec, 12); - TEST_ALIGNR128(SP1, SP2, vec, 13); - TEST_ALIGNR128(SP1, SP2, vec, 14); - TEST_ALIGNR128(SP1, SP2, vec, 15); - TEST_ALIGNR128(SP1, SP2, vec, 16); + for (int j = 0; j<16; j++){ + TEST_ALIGNR128(SP1, SP2, vec, j); + } } @@ -693,14 +679,11 @@ TEST(SuperVectorUtilsTest,RShift128_256c){ /*Define ALIGNR256 macro*/ -/* -#define TEST_ALIGNR256(v1, v2, buf, l) { \ - auto v_aligned = v2.alignr(v1, l); \ - v_aligned.print8("v_aligned");\ - for (size_t i=0; i<32; i++) { \ - printf("vec[%ld] = %02x\n", i+l, vec[i+l]);\ - ASSERT_EQ(v_aligned.u.u8[i], vec[i + l]); \ - } \ +#define TEST_ALIGNR256(v1, v2, buf, l) { \ + auto v_aligned = v2.alignr(v1, l); \ + for (size_t i=0; i<32; i++) { \ + ASSERT_EQ(v_aligned.u.u8[i], vec[i + l]); \ + } \ } TEST(SuperVectorUtilsTest,Alignr256c){ @@ -710,41 +693,10 @@ TEST(SuperVectorUtilsTest,Alignr256c){ } auto SP1 = SuperVector<32>::loadu(vec); auto SP2 = SuperVector<32>::loadu(vec+32); - TEST_ALIGNR256(SP1, SP2, vec, 0); - TEST_ALIGNR256(SP1, SP2, vec, 1); - TEST_ALIGNR256(SP1, SP2, vec, 2); - TEST_ALIGNR256(SP1, SP2, vec, 3); - TEST_ALIGNR256(SP1, SP2, vec, 4); - TEST_ALIGNR256(SP1, SP2, vec, 5); - TEST_ALIGNR256(SP1, SP2, vec, 6); - TEST_ALIGNR256(SP1, SP2, vec, 7); - TEST_ALIGNR256(SP1, SP2, vec, 8); - TEST_ALIGNR256(SP1, SP2, vec, 9); - TEST_ALIGNR256(SP1, SP2, vec, 10); - TEST_ALIGNR256(SP1, SP2, vec, 11); - TEST_ALIGNR256(SP1, SP2, vec, 12); - TEST_ALIGNR256(SP1, SP2, vec, 13); - TEST_ALIGNR256(SP1, SP2, vec, 14); - TEST_ALIGNR256(SP1, SP2, vec, 15); - TEST_ALIGNR256(SP1, SP2, vec, 16); - TEST_ALIGNR256(SP1, SP2, vec, 17); - TEST_ALIGNR256(SP1, SP2, vec, 18); - TEST_ALIGNR256(SP1, SP2, vec, 19); - TEST_ALIGNR256(SP1, SP2, vec, 20); - TEST_ALIGNR256(SP1, SP2, vec, 21); - TEST_ALIGNR256(SP1, SP2, vec, 22); - TEST_ALIGNR256(SP1, SP2, vec, 23); - TEST_ALIGNR256(SP1, SP2, vec, 24); - TEST_ALIGNR256(SP1, SP2, vec, 25); - TEST_ALIGNR256(SP1, SP2, vec, 26); - TEST_ALIGNR256(SP1, SP2, vec, 27); - TEST_ALIGNR256(SP1, SP2, vec, 28); - TEST_ALIGNR256(SP1, SP2, vec, 29); - TEST_ALIGNR256(SP1, SP2, vec, 30); - TEST_ALIGNR256(SP1, SP2, vec, 31); - TEST_ALIGNR256(SP1, SP2, vec, 32); + for(int j=0; j<32; j++) { + TEST_ALIGNR256(SP1, SP2, vec, j); + } } -*/ #endif // HAVE_AVX2 @@ -1073,9 +1025,8 @@ TEST(SuperVectorUtilsTest,LShift128_512c){ /*Define ALIGNR512 macro*/ -/* #define TEST_ALIGNR512(v1, v2, buf, l) { \ - auto v_aligned = v2.alignr(v1, l); \ + auto v_aligned = v1.alignr(v2, l); \ for (size_t i=0; i<64; i++) { \ ASSERT_EQ(v_aligned.u.u8[i], vec[i + l]); \ } \ @@ -1087,24 +1038,10 @@ TEST(SuperVectorUtilsTest,Alignr512c){ vec[i]=i; } auto SP1 = SuperVector<64>::loadu(vec); - auto SP2 = SuperVector<64>::loadu(vec+32); - TEST_ALIGNR512(SP1, SP2, vec, 0); - TEST_ALIGNR512(SP1, SP2, vec, 1); - TEST_ALIGNR512(SP1, SP2, vec, 2); - TEST_ALIGNR512(SP1, SP2, vec, 3); - TEST_ALIGNR512(SP1, SP2, vec, 4); - TEST_ALIGNR512(SP1, SP2, vec, 5); - TEST_ALIGNR512(SP1, SP2, vec, 6); - TEST_ALIGNR512(SP1, SP2, vec, 7); - TEST_ALIGNR512(SP1, SP2, vec, 8); - TEST_ALIGNR512(SP1, SP2, vec, 9); - TEST_ALIGNR512(SP1, SP2, vec, 10); - TEST_ALIGNR512(SP1, SP2, vec, 11); - TEST_ALIGNR512(SP1, SP2, vec, 12); - TEST_ALIGNR512(SP1, SP2, vec, 13); - TEST_ALIGNR512(SP1, SP2, vec, 14); - TEST_ALIGNR512(SP1, SP2, vec, 15); - TEST_ALIGNR512(SP1, SP2, vec, 16); + auto SP2 = SuperVector<64>::loadu(vec+64); + for(int j=0; j<64; j++){ + TEST_ALIGNR512(SP1, SP2, vec, j); + } } -*/ + #endif // HAVE_AVX512