diff --git a/src/util/supervector/arch/x86/impl.cpp b/src/util/supervector/arch/x86/impl.cpp
index 9aa8002f..daac5f01 100644
--- a/src/util/supervector/arch/x86/impl.cpp
+++ b/src/util/supervector/arch/x86/impl.cpp
@@ -366,6 +366,18 @@ really_inline SuperVector<16> SuperVector<16>::rshift64(uint8_t const N)
 }
 #endif
 
+template<>
+really_inline SuperVector<16> SuperVector<16>::lshift128(uint8_t const N)
+{
+    return *this << N;
+}
+
+template<>
+really_inline SuperVector<16> SuperVector<16>::rshift128(uint8_t const N)
+{
+    return *this >> N;
+}
+
 // 256-bit AVX2 implementation
 #if defined(HAVE_AVX2)
 template<>
@@ -667,6 +679,22 @@ really_inline SuperVector<32> SuperVector<32>::alignr(SuperVector<32> &other, in
     case 13: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 13)}; break;
     case 14: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 14)}; break;
     case 15: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 15)}; break;
+    case 16: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 16)}; break;
+    case 17: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 17)}; break;
+    case 18: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 18)}; break;
+    case 19: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 19)}; break;
+    case 20: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 20)}; break;
+    case 21: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 21)}; break;
+    case 22: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 22)}; break;
+    case 23: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 23)}; break;
+    case 24: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 24)}; break;
+    case 25: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 25)}; break;
+    case 26: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 26)}; break;
+    case 27: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 27)}; break;
+    case 28: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 28)}; break;
+    case 29: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 29)}; break;
+    case 30: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 30)}; break;
+    case 31: return {_mm256_alignr_epi8(u.v256[0], other.u.v256[0], 31)}; break;
     default: break;
     }
     return *this;
@@ -706,6 +734,22 @@ really_inline SuperVector<32> SuperVector<32>::lshift64(uint8_t const N)
     case 13: return {_mm256_slli_epi64(u.v256[0], 13)}; break;
     case 14: return {_mm256_slli_epi64(u.v256[0], 14)}; break;
     case 15: return {_mm256_slli_epi64(u.v256[0], 15)}; break;
+    case 16: return {_mm256_slli_epi64(u.v256[0], 16)}; break;
+    case 17: return {_mm256_slli_epi64(u.v256[0], 17)}; break;
+    case 18: return {_mm256_slli_epi64(u.v256[0], 18)}; break;
+    case 19: return {_mm256_slli_epi64(u.v256[0], 19)}; break;
+    case 20: return {_mm256_slli_epi64(u.v256[0], 20)}; break;
+    case 21: return {_mm256_slli_epi64(u.v256[0], 21)}; break;
+    case 22: return {_mm256_slli_epi64(u.v256[0], 22)}; break;
+    case 23: return {_mm256_slli_epi64(u.v256[0], 23)}; break;
+    case 24: return {_mm256_slli_epi64(u.v256[0], 24)}; break;
+    case 25: return {_mm256_slli_epi64(u.v256[0], 25)}; break;
+    case 26: return {_mm256_slli_epi64(u.v256[0], 26)}; break;
+    case 27: return {_mm256_slli_epi64(u.v256[0], 27)}; break;
+    case 28: return {_mm256_slli_epi64(u.v256[0], 28)}; break;
+    case 29: return {_mm256_slli_epi64(u.v256[0], 29)}; break;
+    case 30: return {_mm256_slli_epi64(u.v256[0], 30)}; break;
+    case 31: return {_mm256_slli_epi64(u.v256[0], 31)}; break;
     default: break;
     }
     return *this;
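Note on the pattern in these hunks: _mm256_alignr_epi8, _mm256_slli_epi64 and _mm256_srli_epi64 encode the count as a compile-time immediate, so when N is only known at runtime the portable path has to enumerate every legal count in a switch; the HS_OPTIMIZE variants rely on really_inline plus constant propagation to make N a literal at each call site. For the 64-bit element shifts specifically, AVX2 also provides a runtime-count form that would avoid the switch entirely. A minimal sketch of that alternative (illustrative helper names, not part of this patch):

#include <immintrin.h>
#include <cstdint>

// Sketch only: _mm256_sll_epi64/_mm256_srl_epi64 take the shift count in the
// low 64 bits of an __m128i rather than as an immediate. Counts >= 64 yield
// zero, matching the immediate forms. Compile with -mavx2.
static inline __m256i lshift64_rt(__m256i v, uint8_t n) {
    return _mm256_sll_epi64(v, _mm_cvtsi32_si128(n));
}

static inline __m256i rshift64_rt(__m256i v, uint8_t n) {
    return _mm256_srl_epi64(v, _mm_cvtsi32_si128(n));
}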
@@ -739,6 +783,120 @@ really_inline SuperVector<32> SuperVector<32>::rshift64(uint8_t const N)
     case 13: return {_mm256_srli_epi64(u.v256[0], 13)}; break;
     case 14: return {_mm256_srli_epi64(u.v256[0], 14)}; break;
     case 15: return {_mm256_srli_epi64(u.v256[0], 15)}; break;
+    case 16: return {_mm256_srli_epi64(u.v256[0], 16)}; break;
+    case 17: return {_mm256_srli_epi64(u.v256[0], 17)}; break;
+    case 18: return {_mm256_srli_epi64(u.v256[0], 18)}; break;
+    case 19: return {_mm256_srli_epi64(u.v256[0], 19)}; break;
+    case 20: return {_mm256_srli_epi64(u.v256[0], 20)}; break;
+    case 21: return {_mm256_srli_epi64(u.v256[0], 21)}; break;
+    case 22: return {_mm256_srli_epi64(u.v256[0], 22)}; break;
+    case 23: return {_mm256_srli_epi64(u.v256[0], 23)}; break;
+    case 24: return {_mm256_srli_epi64(u.v256[0], 24)}; break;
+    case 25: return {_mm256_srli_epi64(u.v256[0], 25)}; break;
+    case 26: return {_mm256_srli_epi64(u.v256[0], 26)}; break;
+    case 27: return {_mm256_srli_epi64(u.v256[0], 27)}; break;
+    case 28: return {_mm256_srli_epi64(u.v256[0], 28)}; break;
+    case 29: return {_mm256_srli_epi64(u.v256[0], 29)}; break;
+    case 30: return {_mm256_srli_epi64(u.v256[0], 30)}; break;
+    case 31: return {_mm256_srli_epi64(u.v256[0], 31)}; break;
+    default: break;
+    }
+    return *this;
+}
+#endif
+
+#ifdef HS_OPTIMIZE
+template<>
+really_inline SuperVector<32> SuperVector<32>::lshift128(uint8_t const N)
+{
+    return {_mm256_slli_si256(u.v256[0], N)};
+}
+#else
+template<>
+really_inline SuperVector<32> SuperVector<32>::lshift128(uint8_t const N)
+{
+    switch(N) {
+    case 0: return *this; break;
+    case 1: return {_mm256_slli_si256(u.v256[0], 1)}; break;
+    case 2: return {_mm256_slli_si256(u.v256[0], 2)}; break;
+    case 3: return {_mm256_slli_si256(u.v256[0], 3)}; break;
+    case 4: return {_mm256_slli_si256(u.v256[0], 4)}; break;
+    case 5: return {_mm256_slli_si256(u.v256[0], 5)}; break;
+    case 6: return {_mm256_slli_si256(u.v256[0], 6)}; break;
+    case 7: return {_mm256_slli_si256(u.v256[0], 7)}; break;
+    case 8: return {_mm256_slli_si256(u.v256[0], 8)}; break;
+    case 9: return {_mm256_slli_si256(u.v256[0], 9)}; break;
+    case 10: return {_mm256_slli_si256(u.v256[0], 10)}; break;
+    case 11: return {_mm256_slli_si256(u.v256[0], 11)}; break;
+    case 12: return {_mm256_slli_si256(u.v256[0], 12)}; break;
+    case 13: return {_mm256_slli_si256(u.v256[0], 13)}; break;
+    case 14: return {_mm256_slli_si256(u.v256[0], 14)}; break;
+    case 15: return {_mm256_slli_si256(u.v256[0], 15)}; break;
+    case 16: return {_mm256_slli_si256(u.v256[0], 16)}; break;
+    case 17: return {_mm256_slli_si256(u.v256[0], 17)}; break;
+    case 18: return {_mm256_slli_si256(u.v256[0], 18)}; break;
+    case 19: return {_mm256_slli_si256(u.v256[0], 19)}; break;
+    case 20: return {_mm256_slli_si256(u.v256[0], 20)}; break;
+    case 21: return {_mm256_slli_si256(u.v256[0], 21)}; break;
+    case 22: return {_mm256_slli_si256(u.v256[0], 22)}; break;
+    case 23: return {_mm256_slli_si256(u.v256[0], 23)}; break;
+    case 24: return {_mm256_slli_si256(u.v256[0], 24)}; break;
+    case 25: return {_mm256_slli_si256(u.v256[0], 25)}; break;
+    case 26: return {_mm256_slli_si256(u.v256[0], 26)}; break;
+    case 27: return {_mm256_slli_si256(u.v256[0], 27)}; break;
+    case 28: return {_mm256_slli_si256(u.v256[0], 28)}; break;
+    case 29: return {_mm256_slli_si256(u.v256[0], 29)}; break;
+    case 30: return {_mm256_slli_si256(u.v256[0], 30)}; break;
+    case 31: return {_mm256_slli_si256(u.v256[0], 31)}; break;
+    default: break;
+    }
+    return *this;
+}
+#endif
+
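A caveat worth flagging for the 256-bit lshift128/rshift128: _mm256_slli_si256 and _mm256_srli_si256 shift bytes within each 128-bit lane independently, and any count of 16 or more clears a lane entirely, so the cases from 16 through 31 all produce a zero vector rather than a full 256-bit shift. A scalar reference model of the per-lane semantics, handy for unit checks (a hypothetical helper, not code from this patch):

#include <cstdint>
#include <cstring>

// Reference for _mm256_slli_si256 semantics: byte i of each 16-byte lane
// becomes byte (i - n) of the same lane, zero-filled; n >= 16 clears the lane.
static void lshift128_ref(const uint8_t in[32], uint8_t out[32], unsigned n) {
    std::memset(out, 0, 32);
    if (n >= 16) return;
    for (int lane = 0; lane < 2; ++lane) {
        for (unsigned i = n; i < 16; ++i) {
            out[lane * 16 + i] = in[lane * 16 + (i - n)];
        }
    }
}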
+#ifdef HS_OPTIMIZE
+template<>
+really_inline SuperVector<32> SuperVector<32>::rshift128(uint8_t const N)
+{
+    return {_mm256_srli_si256(u.v256[0], N)};
+}
+#else
+template<>
+really_inline SuperVector<32> SuperVector<32>::rshift128(uint8_t const N)
+{
+    switch(N) {
+    case 0: return *this; break;
+    case 1: return {_mm256_srli_si256(u.v256[0], 1)}; break;
+    case 2: return {_mm256_srli_si256(u.v256[0], 2)}; break;
+    case 3: return {_mm256_srli_si256(u.v256[0], 3)}; break;
+    case 4: return {_mm256_srli_si256(u.v256[0], 4)}; break;
+    case 5: return {_mm256_srli_si256(u.v256[0], 5)}; break;
+    case 6: return {_mm256_srli_si256(u.v256[0], 6)}; break;
+    case 7: return {_mm256_srli_si256(u.v256[0], 7)}; break;
+    case 8: return {_mm256_srli_si256(u.v256[0], 8)}; break;
+    case 9: return {_mm256_srli_si256(u.v256[0], 9)}; break;
+    case 10: return {_mm256_srli_si256(u.v256[0], 10)}; break;
+    case 11: return {_mm256_srli_si256(u.v256[0], 11)}; break;
+    case 12: return {_mm256_srli_si256(u.v256[0], 12)}; break;
+    case 13: return {_mm256_srli_si256(u.v256[0], 13)}; break;
+    case 14: return {_mm256_srli_si256(u.v256[0], 14)}; break;
+    case 15: return {_mm256_srli_si256(u.v256[0], 15)}; break;
+    case 16: return {_mm256_srli_si256(u.v256[0], 16)}; break;
+    case 17: return {_mm256_srli_si256(u.v256[0], 17)}; break;
+    case 18: return {_mm256_srli_si256(u.v256[0], 18)}; break;
+    case 19: return {_mm256_srli_si256(u.v256[0], 19)}; break;
+    case 20: return {_mm256_srli_si256(u.v256[0], 20)}; break;
+    case 21: return {_mm256_srli_si256(u.v256[0], 21)}; break;
+    case 22: return {_mm256_srli_si256(u.v256[0], 22)}; break;
+    case 23: return {_mm256_srli_si256(u.v256[0], 23)}; break;
+    case 24: return {_mm256_srli_si256(u.v256[0], 24)}; break;
+    case 25: return {_mm256_srli_si256(u.v256[0], 25)}; break;
+    case 26: return {_mm256_srli_si256(u.v256[0], 26)}; break;
+    case 27: return {_mm256_srli_si256(u.v256[0], 27)}; break;
+    case 28: return {_mm256_srli_si256(u.v256[0], 28)}; break;
+    case 29: return {_mm256_srli_si256(u.v256[0], 29)}; break;
+    case 30: return {_mm256_srli_si256(u.v256[0], 30)}; break;
+    case 31: return {_mm256_srli_si256(u.v256[0], 31)}; break;
     default: break;
     }
     return *this;
diff --git a/src/util/supervector/supervector.hpp b/src/util/supervector/supervector.hpp
index 45e2f518..0017592f 100644
--- a/src/util/supervector/supervector.hpp
+++ b/src/util/supervector/supervector.hpp
@@ -198,6 +198,8 @@ public:
     SuperVector pshufb(SuperVector b);
     SuperVector lshift64(uint8_t const N);
     SuperVector rshift64(uint8_t const N);
+    SuperVector lshift128(uint8_t const N);
+    SuperVector rshift128(uint8_t const N);
 
     // Constants
     static SuperVector Ones();
@@ -225,7 +227,7 @@ public:
         printf("\n");
     }
 
-    void printv_u64(const char *label) {
+    void print64(const char *label) {
         printf("%12s: ", label);
         for(s16 i=SIZE/sizeof(u64a)-1; i >= 0; i--)
             printf("%016lx ", u.u64[i]);
@@ -235,7 +237,7 @@ public:
     void print8(const char *label UNUSED) {};
     void print16(const char *label UNUSED) {};
     void print32(const char *label UNUSED) {};
-    void printv_u64(const char *label UNUSED) {};
+    void print64(const char *label UNUSED) {};
 #endif
 };
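The header hunks above are the public surface of the change: two new runtime-count shift methods, plus the debug printer renamed from printv_u64 to print64 to match the print8/print16/print32 family. A hypothetical usage sketch based only on what is declared here (the include path and a build where the non-stub print64 is compiled in are assumptions):

#include "util/supervector/supervector.hpp"

// Sketch: exercise the new shifts and the renamed debug printer.
void demo() {
    SuperVector<32> v = SuperVector<32>::Ones();  // all bits set
    SuperVector<32> l = v.lshift128(3);           // per-lane byte shift left
    SuperVector<32> r = v.rshift128(3);           // per-lane byte shift right
    l.print64("lshift128(3)");                    // stubbed out in non-debug builds
    r.print64("rshift128(3)");
}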
diff --git a/unit/internal/supervector.cpp b/unit/internal/supervector.cpp
index 2133eb3b..3c9ba1a9 100644
--- a/unit/internal/supervector.cpp
+++ b/unit/internal/supervector.cpp
@@ -578,6 +578,22 @@ TEST(SuperVectorUtilsTest,LShift256c){
     TEST_LSHIFT256(buf, vec, SP, 14);
     TEST_LSHIFT256(buf, vec, SP, 15);
     TEST_LSHIFT256(buf, vec, SP, 16);
+    TEST_LSHIFT256(buf, vec, SP, 17);
+    TEST_LSHIFT256(buf, vec, SP, 18);
+    TEST_LSHIFT256(buf, vec, SP, 19);
+    TEST_LSHIFT256(buf, vec, SP, 20);
+    TEST_LSHIFT256(buf, vec, SP, 21);
+    TEST_LSHIFT256(buf, vec, SP, 22);
+    TEST_LSHIFT256(buf, vec, SP, 23);
+    TEST_LSHIFT256(buf, vec, SP, 24);
+    TEST_LSHIFT256(buf, vec, SP, 25);
+    TEST_LSHIFT256(buf, vec, SP, 26);
+    TEST_LSHIFT256(buf, vec, SP, 27);
+    TEST_LSHIFT256(buf, vec, SP, 28);
+    TEST_LSHIFT256(buf, vec, SP, 29);
+    TEST_LSHIFT256(buf, vec, SP, 30);
+    TEST_LSHIFT256(buf, vec, SP, 31);
+    TEST_LSHIFT256(buf, vec, SP, 32);
 }
 
 /*
@@ -640,6 +656,22 @@ TEST(SuperVectorUtilsTest,RShift256c){
     TEST_RSHIFT256(buf, vec, SP, 14);
     TEST_RSHIFT256(buf, vec, SP, 15);
     TEST_RSHIFT256(buf, vec, SP, 16);
+    TEST_RSHIFT256(buf, vec, SP, 17);
+    TEST_RSHIFT256(buf, vec, SP, 18);
+    TEST_RSHIFT256(buf, vec, SP, 19);
+    TEST_RSHIFT256(buf, vec, SP, 20);
+    TEST_RSHIFT256(buf, vec, SP, 21);
+    TEST_RSHIFT256(buf, vec, SP, 22);
+    TEST_RSHIFT256(buf, vec, SP, 23);
+    TEST_RSHIFT256(buf, vec, SP, 24);
+    TEST_RSHIFT256(buf, vec, SP, 25);
+    TEST_RSHIFT256(buf, vec, SP, 26);
+    TEST_RSHIFT256(buf, vec, SP, 27);
+    TEST_RSHIFT256(buf, vec, SP, 28);
+    TEST_RSHIFT256(buf, vec, SP, 29);
+    TEST_RSHIFT256(buf, vec, SP, 30);
+    TEST_RSHIFT256(buf, vec, SP, 31);
+    TEST_RSHIFT256(buf, vec, SP, 32);
 }
 
 
@@ -647,7 +679,9 @@ TEST(SuperVectorUtilsTest,RShift256c){
 /*
 #define TEST_ALIGNR256(v1, v2, buf, l) {                                                  \
     auto v_aligned = v2.alignr(v1, l);                                                    \
+    v_aligned.print8("v_aligned");\
     for (size_t i=0; i<32; i++) {                                                         \
+        printf("vec[%ld] = %02x\n", i+l, vec[i+l]);\
     ASSERT_EQ(v_aligned.u.u8[i], vec[i + l]);                                             \
     } \
 }
@@ -676,6 +710,22 @@ TEST(SuperVectorUtilsTest,Alignr256c){
     TEST_ALIGNR256(SP1, SP2, vec, 14);
     TEST_ALIGNR256(SP1, SP2, vec, 15);
     TEST_ALIGNR256(SP1, SP2, vec, 16);
+    TEST_ALIGNR256(SP1, SP2, vec, 17);
+    TEST_ALIGNR256(SP1, SP2, vec, 18);
+    TEST_ALIGNR256(SP1, SP2, vec, 19);
+    TEST_ALIGNR256(SP1, SP2, vec, 20);
+    TEST_ALIGNR256(SP1, SP2, vec, 21);
+    TEST_ALIGNR256(SP1, SP2, vec, 22);
+    TEST_ALIGNR256(SP1, SP2, vec, 23);
+    TEST_ALIGNR256(SP1, SP2, vec, 24);
+    TEST_ALIGNR256(SP1, SP2, vec, 25);
+    TEST_ALIGNR256(SP1, SP2, vec, 26);
+    TEST_ALIGNR256(SP1, SP2, vec, 27);
+    TEST_ALIGNR256(SP1, SP2, vec, 28);
+    TEST_ALIGNR256(SP1, SP2, vec, 29);
+    TEST_ALIGNR256(SP1, SP2, vec, 30);
+    TEST_ALIGNR256(SP1, SP2, vec, 31);
+    TEST_ALIGNR256(SP1, SP2, vec, 32);
 }
 */
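For reference, the commented-out TEST_ALIGNR256 macro asserts that byte i of v2.alignr(v1, l) equals vec[i + l], i.e. vec is treated as the 64-byte concatenation of the two source vectors and alignr selects a 32-byte window starting at offset l. A scalar model of that expectation (inferred from the macro's assertion, not code from the patch):

#include <cstdint>

// Expected alignr result per the macro: a 32-byte window into the
// concatenated 64-byte buffer, starting at offset l (valid for l <= 32).
static void alignr256_ref(const uint8_t vec[64], uint8_t out[32], unsigned l) {
    for (unsigned i = 0; i < 32; ++i) {
        out[i] = vec[i + l];
    }
}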