mirror of
https://github.com/VectorCamp/vectorscan.git
synced 2025-06-28 16:41:01 +03:00
fix loadu_maskz, add {l,r}shift128_var(), tab fixes
This commit is contained in:
parent
a2e6143ea1
commit
f2d9784979
@ -171,15 +171,8 @@ really_inline typename SuperVector<16>::movemask_type SuperVector<16>::eqmask(Su
|
||||
return eq(b).movemask();
|
||||
}
|
||||
|
||||
#ifdef HS_OPTIMIZE
|
||||
template <>
|
||||
really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
|
||||
{
|
||||
return {_mm_srli_si128(u.v128[0], N)};
|
||||
}
|
||||
#else
|
||||
template <>
|
||||
really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
|
||||
really_inline SuperVector<16> SuperVector<16>::rshift128_var(uint8_t const N) const
|
||||
{
|
||||
switch(N) {
|
||||
case 1: return {_mm_srli_si128(u.v128[0], 1)}; break;
|
||||
@ -202,17 +195,23 @@ really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HS_OPTIMIZE
|
||||
template <>
|
||||
really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
|
||||
really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
|
||||
{
|
||||
return {_mm_slli_si128(u.v128[0], N)};
|
||||
return {_mm_srli_si128(u.v128[0], N)};
|
||||
}
|
||||
#else
|
||||
template <>
|
||||
really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
|
||||
really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
|
||||
{
|
||||
return rshift128_var(N);
|
||||
}
|
||||
#endif
|
||||
|
||||
template <>
|
||||
really_inline SuperVector<16> SuperVector<16>::lshift128_var(uint8_t const N) const
|
||||
{
|
||||
switch(N) {
|
||||
case 1: return {_mm_slli_si128(u.v128[0], 1)}; break;
|
||||
@ -235,6 +234,19 @@ really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Left shift of the 16-byte vector by N (byte granularity, via
// _mm_slli_si128 / lshift128_var). Two build variants are provided.
//
// HS_OPTIMIZE build: N is handed straight to the intrinsic.
// NOTE(review): the intrinsic expects an immediate count, so this presumably
// relies on N being constant-folded after inlining - confirm.
#ifdef HS_OPTIMIZE
|
||||
template <>
|
||||
really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
|
||||
{
|
||||
return {_mm_slli_si128(u.v128[0], N)};
|
||||
}
|
||||
#else
|
||||
// Other builds: delegate to lshift128_var(), which switches on N and calls
// the intrinsic with a literal count per case.
template <>
|
||||
really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
|
||||
{
|
||||
return lshift128_var(N);
|
||||
}
|
||||
#endif
|
||||
|
||||
template <>
|
||||
@ -254,7 +266,7 @@ really_inline SuperVector<16> SuperVector<16>::load(void const *ptr)
|
||||
template <>
|
||||
really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint8_t const len)
|
||||
{
|
||||
SuperVector<16> mask = Ones() >> (16 -len);
|
||||
SuperVector<16> mask = Ones().rshift128_var(16 -len);
|
||||
mask.print8("mask");
|
||||
SuperVector<16> v = _mm_loadu_si128((const m128 *)ptr);
|
||||
v.print8("v");
|
||||
@ -327,6 +339,7 @@ really_inline SuperVector<16> SuperVector<16>::lshift64(uint8_t const N)
|
||||
case 13: return {_mm_slli_epi64(u.v128[0], 13)}; break;
|
||||
case 14: return {_mm_slli_epi64(u.v128[0], 14)}; break;
|
||||
case 15: return {_mm_slli_epi64(u.v128[0], 15)}; break;
|
||||
case 16: return Zeroes();
|
||||
default: break;
|
||||
}
|
||||
return *this;
|
||||
@ -360,6 +373,7 @@ really_inline SuperVector<16> SuperVector<16>::rshift64(uint8_t const N)
|
||||
case 13: return {_mm_srli_epi64(u.v128[0], 13)}; break;
|
||||
case 14: return {_mm_srli_epi64(u.v128[0], 14)}; break;
|
||||
case 15: return {_mm_srli_epi64(u.v128[0], 15)}; break;
|
||||
case 16: return Zeroes();
|
||||
default: break;
|
||||
}
|
||||
return *this;
|
||||
@ -516,22 +530,8 @@ really_inline typename SuperVector<32>::movemask_type SuperVector<32>::eqmask(Su
|
||||
return eq(b).movemask();
|
||||
}
|
||||
|
||||
#ifdef HS_OPTIMIZE
|
||||
template <>
|
||||
really_inline SuperVector<32> SuperVector<32>::operator>>(uint8_t const N) const
|
||||
{
|
||||
// As found here: https://stackoverflow.com/questions/25248766/emulating-shifts-on-32-bytes-with-avx
|
||||
if (N < 16) {
|
||||
return {_mm256_alignr_epi8(_mm256_permute2x128_si256(A, A, _MM_SHUFFLE(2, 0, 0, 1)), A, N)};
|
||||
} else if (N == 16) {
|
||||
return {_mm256_permute2x128_si256(A, A, _MM_SHUFFLE(2, 0, 0, 1))};
|
||||
} else {
|
||||
return {_mm256_srli_si256(_mm256_permute2x128_si256(A, A, _MM_SHUFFLE(2, 0, 0, 1)), N - 16)};
|
||||
}
|
||||
}
|
||||
#else
|
||||
template <>
|
||||
really_inline SuperVector<32> SuperVector<32>::operator>>(uint8_t const N) const
|
||||
really_inline SuperVector<32> SuperVector<32>::rshift128_var(uint8_t const N) const
|
||||
{
|
||||
switch(N) {
|
||||
case 1: return {_mm256_alignr_epi8(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), u.v256[0], 1)}; break;
|
||||
@ -570,24 +570,30 @@ really_inline SuperVector<32> SuperVector<32>::operator>>(uint8_t const N) const
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HS_OPTIMIZE
|
||||
template <>
|
||||
really_inline SuperVector<32> SuperVector<32>::operator<<(uint8_t const N) const
|
||||
really_inline SuperVector<32> SuperVector<32>::operator>>(uint8_t const N) const
|
||||
{
|
||||
// As found here: https://stackoverflow.com/questions/25248766/emulating-shifts-on-32-bytes-with-avx
|
||||
if (N < 16) {
|
||||
return {_mm256_alignr_epi8(A, _mm256_permute2x128_si256(A, A, _MM_SHUFFLE(0, 0, 2, 0)), 16 - N)};
|
||||
return {_mm256_alignr_epi8(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), u.v256[0], N)};
|
||||
} else if (N == 16) {
|
||||
return {_mm256_permute2x128_si256(A, A, _MM_SHUFFLE(0, 0, 2, 0))};
|
||||
return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1))};
|
||||
} else {
|
||||
return {_mm256_slli_si256(_mm256_permute2x128_si256(A, A, _MM_SHUFFLE(0, 0, 2, 0)), N - 16)};
|
||||
return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), N - 16)};
|
||||
}
|
||||
}
|
||||
#else
|
||||
template <>
|
||||
really_inline SuperVector<32> SuperVector<32>::operator<<(uint8_t const N) const
|
||||
really_inline SuperVector<32> SuperVector<32>::operator>>(uint8_t const N) const
|
||||
{
|
||||
return rshift128_var(N);
|
||||
}
|
||||
#endif
|
||||
|
||||
template <>
|
||||
really_inline SuperVector<32> SuperVector<32>::lshift128_var(uint8_t const N) const
|
||||
{
|
||||
switch(N) {
|
||||
case 1: return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 15)}; break;
|
||||
@ -626,6 +632,26 @@ really_inline SuperVector<32> SuperVector<32>::operator<<(uint8_t const N) const
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Left shift of the 32-byte vector by N (byte granularity). Two build
// variants are provided.
//
// HS_OPTIMIZE build: N feeds the intrinsics directly, with three cases
// selected on N (below 16, exactly 16, above 16).
// NOTE(review): the intrinsics expect immediate counts, so this presumably
// relies on N being constant-folded after inlining - confirm.
#ifdef HS_OPTIMIZE
|
||||
template <>
|
||||
really_inline SuperVector<32> SuperVector<32>::operator<<(uint8_t const N) const
|
||||
{
|
||||
// As found here: https://stackoverflow.com/questions/25248766/emulating-shifts-on-32-bytes-with-avx
|
||||
// N < 16: alignr of the original value with its permuted copy, count 16 - N.
if (N < 16) {
|
||||
return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 16 - N)};
|
||||
// N == 16: the permuted copy alone is returned.
} else if (N == 16) {
|
||||
return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0))};
|
||||
// N > 16: the permuted copy is shifted by the remaining N - 16.
} else {
|
||||
return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), N - 16)};
|
||||
}
|
||||
}
|
||||
#else
|
||||
// Other builds: delegate to lshift128_var(), which switches on N and uses a
// literal count per case.
template <>
|
||||
really_inline SuperVector<32> SuperVector<32>::operator<<(uint8_t const N) const
|
||||
{
|
||||
return lshift128_var(N);
|
||||
}
|
||||
#endif
|
||||
|
||||
template <>
|
||||
@ -645,7 +671,7 @@ really_inline SuperVector<32> SuperVector<32>::load(void const *ptr)
|
||||
template <>
|
||||
really_inline SuperVector<32> SuperVector<32>::loadu_maskz(void const *ptr, uint8_t const len)
|
||||
{
|
||||
SuperVector<32> mask = Ones() >> (32 - len);
|
||||
SuperVector<32> mask = Ones().rshift128_var(32 -len);
|
||||
mask.print8("mask");
|
||||
SuperVector<32> v = _mm256_loadu_si256((const m256 *)ptr);
|
||||
v.print8("v");
|
||||
@ -750,6 +776,7 @@ really_inline SuperVector<32> SuperVector<32>::lshift64(uint8_t const N)
|
||||
case 29: return {_mm256_slli_epi64(u.v256[0], 29)}; break;
|
||||
case 30: return {_mm256_slli_epi64(u.v256[0], 30)}; break;
|
||||
case 31: return {_mm256_slli_epi64(u.v256[0], 31)}; break;
|
||||
case 32: return Zeroes();
|
||||
default: break;
|
||||
}
|
||||
return *this;
|
||||
@ -799,6 +826,7 @@ really_inline SuperVector<32> SuperVector<32>::rshift64(uint8_t const N)
|
||||
case 29: return {_mm256_srli_epi64(u.v256[0], 29)}; break;
|
||||
case 30: return {_mm256_srli_epi64(u.v256[0], 30)}; break;
|
||||
case 31: return {_mm256_srli_epi64(u.v256[0], 31)}; break;
|
||||
case 32: return Zeroes();
|
||||
default: break;
|
||||
}
|
||||
return *this;
|
||||
@ -927,6 +955,20 @@ really_inline SuperVector<64>::SuperVector(m256 const v)
|
||||
u.v512[0] = _mm512_broadcast_i64x4(v);
|
||||
};
|
||||
|
||||
// Builds a 64-byte vector from two raw 256-bit values: `lo` is stored in
// u.v256[0] and `hi` in u.v256[1].
// NOTE(review): presumably these are the low and high halves of the 512-bit
// register; the union layout is not visible here - confirm.
template<>
|
||||
really_inline SuperVector<64>::SuperVector(m256 const lo, m256 const hi)
|
||||
{
|
||||
u.v256[0] = lo;
|
||||
u.v256[1] = hi;
|
||||
};
|
||||
|
||||
// Builds a 64-byte vector from two 32-byte SuperVectors: the 256-bit payload
// of `lo` is stored in u.v256[0] and that of `hi` in u.v256[1].
// NOTE(review): presumably the low and high halves of the 512-bit register;
// the union layout is not visible here - confirm.
template<>
|
||||
really_inline SuperVector<64>::SuperVector(SuperVector<32> const lo, SuperVector<32> const hi)
|
||||
{
|
||||
u.v256[0] = lo.u.v256[0];
|
||||
u.v256[1] = hi.u.v256[0];
|
||||
};
|
||||
|
||||
template<>
|
||||
template<>
|
||||
really_inline SuperVector<64>::SuperVector(m128 const v)
|
||||
@ -1052,131 +1094,57 @@ really_inline typename SuperVector<64>::movemask_type SuperVector<64>::movemask(
|
||||
return _mm512_cmpeq_epi8_mask(mask.u.v512[0],msb.u.v512[0]);
|
||||
}
|
||||
|
||||
|
||||
// Compares this vector with `b` using _mm512_cmpeq_epi8_mask (8-bit element
// equality) and returns the resulting mask as movemask_type.
template <>
|
||||
really_inline typename SuperVector<64>::movemask_type SuperVector<64>::eqmask(SuperVector<64> const b) const
|
||||
{
|
||||
return _mm512_cmpeq_epi8_mask(u.v512[0], b.u.v512[0]);
|
||||
}
|
||||
|
||||
|
||||
#ifdef HS_OPTIMIZE
|
||||
template <>
|
||||
really_inline SuperVector<64> SuperVector<64>::operator>>(uint8_t const N) const
|
||||
{
|
||||
// As found here: https://stackoverflow.com/questions/25248766/emulating-shifts-on-32-bytes-with-avx
|
||||
if (N < 16) {
|
||||
return {_mm256_alignr_epi8(_mm256_permute2x128_si256(A, A, _MM_SHUFFLE(2, 0, 0, 1)), A, N)};
|
||||
} else if (N == 16) {
|
||||
return {_mm256_permute2x128_si256(A, A, _MM_SHUFFLE(2, 0, 0, 1))};
|
||||
} else {
|
||||
return {_mm256_srli_si256(_mm256_permute2x128_si256(A, A, _MM_SHUFFLE(2, 0, 0, 1)), N - 16)};
|
||||
}
|
||||
}
|
||||
#else
|
||||
template <>
|
||||
really_inline SuperVector<64> SuperVector<64>::operator>>(uint8_t const N) const
|
||||
{
|
||||
switch(N) {
|
||||
case 1: return {_mm256_alignr_epi8(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), u.v256[0], 1)}; break;
|
||||
case 2: return {_mm256_alignr_epi8(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), u.v256[0], 2)}; break;
|
||||
case 3: return {_mm256_alignr_epi8(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), u.v256[0], 3)}; break;
|
||||
case 4: return {_mm256_alignr_epi8(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), u.v256[0], 4)}; break;
|
||||
case 5: return {_mm256_alignr_epi8(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), u.v256[0], 5)}; break;
|
||||
case 6: return {_mm256_alignr_epi8(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), u.v256[0], 6)}; break;
|
||||
case 7: return {_mm256_alignr_epi8(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), u.v256[0], 7)}; break;
|
||||
case 8: return {_mm256_alignr_epi8(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), u.v256[0], 8)}; break;
|
||||
case 9: return {_mm256_alignr_epi8(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), u.v256[0], 9)}; break;
|
||||
case 10: return {_mm256_alignr_epi8(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), u.v256[0], 10)}; break;
|
||||
case 11: return {_mm256_alignr_epi8(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), u.v256[0], 11)}; break;
|
||||
case 12: return {_mm256_alignr_epi8(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), u.v256[0], 12)}; break;
|
||||
case 13: return {_mm256_alignr_epi8(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), u.v256[0], 13)}; break;
|
||||
case 14: return {_mm256_alignr_epi8(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), u.v256[0], 14)}; break;
|
||||
case 15: return {_mm256_alignr_epi8(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), u.v256[0], 15)}; break;
|
||||
case 16: return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1))}; break;
|
||||
case 17: return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), 1)}; break;
|
||||
case 18: return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), 2)}; break;
|
||||
case 19: return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), 3)}; break;
|
||||
case 20: return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), 4)}; break;
|
||||
case 21: return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), 5)}; break;
|
||||
case 22: return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), 6)}; break;
|
||||
case 23: return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), 7)}; break;
|
||||
case 24: return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), 8)}; break;
|
||||
case 25: return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), 9)}; break;
|
||||
case 26: return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), 10)}; break;
|
||||
case 27: return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), 11)}; break;
|
||||
case 28: return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), 12)}; break;
|
||||
case 29: return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), 13)}; break;
|
||||
case 30: return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), 14)}; break;
|
||||
case 31: return {_mm256_srli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(2, 0, 0, 1)), 15)}; break;
|
||||
case 32: return Zeroes(); break;
|
||||
default: break;
|
||||
}
|
||||
if (N == 0) {
|
||||
return *this;
|
||||
} else if (N < 32) {
|
||||
SuperVector<32> lo256 = u.v256[0];
|
||||
SuperVector<32> hi256 = u.v256[1];
|
||||
SuperVector<32> carry = hi256 << (32 - N);
|
||||
hi256 = hi256 >> N;
|
||||
lo256 = (lo256 >> N) | carry;
|
||||
return SuperVector(lo256, hi256);
|
||||
} else if (N == 32) {
|
||||
SuperVector<32> hi256 = u.v256[1];
|
||||
return SuperVector(hi256, SuperVector<32>::Zeroes());
|
||||
} else if (N < 64) {
|
||||
SuperVector<32> hi256 = u.v256[1];
|
||||
return SuperVector(hi256 >> (N - 32), SuperVector<32>::Zeroes());
|
||||
} else {
|
||||
return Zeroes();
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HS_OPTIMIZE
|
||||
template <>
|
||||
really_inline SuperVector<64> SuperVector<64>::operator<<(uint8_t const N) const
|
||||
{
|
||||
// As found here: https://stackoverflow.com/questions/25248766/emulating-shifts-on-32-bytes-with-avx
|
||||
if (N < 16) {
|
||||
return {_mm256_alignr_epi8(A, _mm256_permute2x128_si256(A, A, _MM_SHUFFLE(0, 0, 2, 0)), 16 - N)};
|
||||
} else if (N == 16) {
|
||||
return {_mm256_permute2x128_si256(A, A, _MM_SHUFFLE(0, 0, 2, 0))};
|
||||
} else {
|
||||
return {_mm256_slli_si256(_mm256_permute2x128_si256(A, A, _MM_SHUFFLE(0, 0, 2, 0)), N - 16)};
|
||||
}
|
||||
}
|
||||
#else
|
||||
template <>
|
||||
really_inline SuperVector<64> SuperVector<64>::operator<<(uint8_t const N) const
|
||||
{
|
||||
switch(N) {
|
||||
case 1: return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 15)}; break;
|
||||
case 2: return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 14)}; break;
|
||||
case 3: return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 13)}; break;
|
||||
case 4: return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 12)}; break;
|
||||
case 5: return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 11)}; break;
|
||||
case 6: return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 10)}; break;
|
||||
case 7: return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 9)}; break;
|
||||
case 8: return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 8)}; break;
|
||||
case 9: return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 7)}; break;
|
||||
case 10: return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 6)}; break;
|
||||
case 11: return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 5)}; break;
|
||||
case 12: return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 4)}; break;
|
||||
case 13: return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 3)}; break;
|
||||
case 14: return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 2)}; break;
|
||||
case 15: return {_mm256_alignr_epi8(u.v256[0], _mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 1)}; break;
|
||||
case 16: return {_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0))}; break;
|
||||
case 17: return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 1)}; break;
|
||||
case 18: return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 2)}; break;
|
||||
case 19: return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 3)}; break;
|
||||
case 20: return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 4)}; break;
|
||||
case 21: return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 5)}; break;
|
||||
case 22: return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 6)}; break;
|
||||
case 23: return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 7)}; break;
|
||||
case 24: return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 8)}; break;
|
||||
case 25: return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 9)}; break;
|
||||
case 26: return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 10)}; break;
|
||||
case 27: return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 11)}; break;
|
||||
case 28: return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 12)}; break;
|
||||
case 29: return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 13)}; break;
|
||||
case 30: return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 14)}; break;
|
||||
case 31: return {_mm256_slli_si256(_mm256_permute2x128_si256(u.v256[0], u.v256[0], _MM_SHUFFLE(0, 0, 2, 0)), 15)}; break;
|
||||
case 32: return Zeroes(); break;
|
||||
default: break;
|
||||
}
|
||||
if (N == 0) {
|
||||
return *this;
|
||||
} else if (N < 32) {
|
||||
SuperVector<32> lo256 = u.v256[0];
|
||||
SuperVector<32> hi256 = u.v256[1];
|
||||
SuperVector<32> carry = lo256 >> (32 - N);
|
||||
hi256 = (hi256 << N) | carry;
|
||||
lo256 = lo256 << N;
|
||||
return SuperVector(lo256, hi256);
|
||||
} else if (N == 32) {
|
||||
SuperVector<32> lo256 = u.v256[0];
|
||||
return SuperVector(SuperVector<32>::Zeroes(), lo256);
|
||||
} else if (N < 64) {
|
||||
SuperVector<32> lo256 = u.v256[0];
|
||||
return SuperVector(SuperVector<32>::Zeroes(), lo256 << (N - 32));
|
||||
} else {
|
||||
return Zeroes();
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// template <>
|
||||
// really_inline SuperVector<64> SuperVector<64>::operator<<(uint8_t const N) const
|
||||
// {
|
||||
// return {_mm512_slli_si512(u.v512[0], N)};
|
||||
// }
|
||||
|
||||
template <>
|
||||
really_inline SuperVector<64> SuperVector<64>::loadu(void const *ptr)
|
||||
@ -1195,14 +1163,13 @@ really_inline SuperVector<64> SuperVector<64>::load(void const *ptr)
|
||||
template <>
|
||||
really_inline SuperVector<64> SuperVector<64>::loadu_maskz(void const *ptr, uint8_t const len)
|
||||
{
|
||||
SuperVector<64> mask = (~0UL) >> (64 - len);
|
||||
mask.print8("mask");
|
||||
SuperVector<64> v = _mm512_loadu_si512((const m512 *)ptr);
|
||||
u64a mask = (~0ULL) >> (64 - len);
|
||||
DEBUG_PRINTF("mask = %016llx\n", mask);
|
||||
SuperVector<64> v = _mm512_mask_loadu_epi8(Zeroes().u.v512[0], mask, (const m512 *)ptr);
|
||||
v.print8("v");
|
||||
return mask & v;
|
||||
return v;
|
||||
}
|
||||
|
||||
|
||||
template<>
|
||||
really_inline SuperVector<64> SuperVector<64>::pshufb(SuperVector<64> b)
|
||||
{
|
||||
@ -1271,6 +1238,55 @@ really_inline SuperVector<64> SuperVector<64>::lshift64(uint8_t const N)
|
||||
case 13: return {_mm512_slli_epi64(u.v512[0], 13)}; break;
|
||||
case 14: return {_mm512_slli_epi64(u.v512[0], 14)}; break;
|
||||
case 15: return {_mm512_slli_epi64(u.v512[0], 15)}; break;
|
||||
case 16: return {_mm512_slli_epi64(u.v512[0], 16)}; break;
|
||||
case 17: return {_mm512_slli_epi64(u.v512[0], 17)}; break;
|
||||
case 18: return {_mm512_slli_epi64(u.v512[0], 18)}; break;
|
||||
case 19: return {_mm512_slli_epi64(u.v512[0], 19)}; break;
|
||||
case 20: return {_mm512_slli_epi64(u.v512[0], 20)}; break;
|
||||
case 21: return {_mm512_slli_epi64(u.v512[0], 21)}; break;
|
||||
case 22: return {_mm512_slli_epi64(u.v512[0], 22)}; break;
|
||||
case 23: return {_mm512_slli_epi64(u.v512[0], 23)}; break;
|
||||
case 24: return {_mm512_slli_epi64(u.v512[0], 24)}; break;
|
||||
case 25: return {_mm512_slli_epi64(u.v512[0], 25)}; break;
|
||||
case 26: return {_mm512_slli_epi64(u.v512[0], 26)}; break;
|
||||
case 27: return {_mm512_slli_epi64(u.v512[0], 27)}; break;
|
||||
case 28: return {_mm512_slli_epi64(u.v512[0], 28)}; break;
|
||||
case 29: return {_mm512_slli_epi64(u.v512[0], 29)}; break;
|
||||
case 30: return {_mm512_slli_epi64(u.v512[0], 30)}; break;
|
||||
case 31: return {_mm512_slli_epi64(u.v512[0], 31)}; break;
|
||||
case 32: return {_mm512_slli_epi64(u.v512[0], 32)}; break;
|
||||
case 33: return {_mm512_slli_epi64(u.v512[0], 33)}; break;
|
||||
case 34: return {_mm512_slli_epi64(u.v512[0], 34)}; break;
|
||||
case 35: return {_mm512_slli_epi64(u.v512[0], 35)}; break;
|
||||
case 36: return {_mm512_slli_epi64(u.v512[0], 36)}; break;
|
||||
case 37: return {_mm512_slli_epi64(u.v512[0], 37)}; break;
|
||||
case 38: return {_mm512_slli_epi64(u.v512[0], 38)}; break;
|
||||
case 39: return {_mm512_slli_epi64(u.v512[0], 39)}; break;
|
||||
case 40: return {_mm512_slli_epi64(u.v512[0], 40)}; break;
|
||||
case 41: return {_mm512_slli_epi64(u.v512[0], 41)}; break;
|
||||
case 42: return {_mm512_slli_epi64(u.v512[0], 42)}; break;
|
||||
case 43: return {_mm512_slli_epi64(u.v512[0], 43)}; break;
|
||||
case 44: return {_mm512_slli_epi64(u.v512[0], 44)}; break;
|
||||
case 45: return {_mm512_slli_epi64(u.v512[0], 45)}; break;
|
||||
case 46: return {_mm512_slli_epi64(u.v512[0], 46)}; break;
|
||||
case 47: return {_mm512_slli_epi64(u.v512[0], 47)}; break;
|
||||
case 48: return {_mm512_slli_epi64(u.v512[0], 48)}; break;
|
||||
case 49: return {_mm512_slli_epi64(u.v512[0], 49)}; break;
|
||||
case 50: return {_mm512_slli_epi64(u.v512[0], 50)}; break;
|
||||
case 51: return {_mm512_slli_epi64(u.v512[0], 51)}; break;
|
||||
case 52: return {_mm512_slli_epi64(u.v512[0], 52)}; break;
|
||||
case 53: return {_mm512_slli_epi64(u.v512[0], 53)}; break;
|
||||
case 54: return {_mm512_slli_epi64(u.v512[0], 54)}; break;
|
||||
case 55: return {_mm512_slli_epi64(u.v512[0], 55)}; break;
|
||||
case 56: return {_mm512_slli_epi64(u.v512[0], 56)}; break;
|
||||
case 57: return {_mm512_slli_epi64(u.v512[0], 57)}; break;
|
||||
case 58: return {_mm512_slli_epi64(u.v512[0], 58)}; break;
|
||||
case 59: return {_mm512_slli_epi64(u.v512[0], 59)}; break;
|
||||
case 60: return {_mm512_slli_epi64(u.v512[0], 60)}; break;
|
||||
case 61: return {_mm512_slli_epi64(u.v512[0], 61)}; break;
|
||||
case 62: return {_mm512_slli_epi64(u.v512[0], 62)}; break;
|
||||
case 63: return {_mm512_slli_epi64(u.v512[0], 63)}; break;
|
||||
case 64: return Zeroes();
|
||||
default: break;
|
||||
}
|
||||
return *this;
|
||||
@ -1304,12 +1320,224 @@ really_inline SuperVector<64> SuperVector<64>::rshift64(uint8_t const N)
|
||||
case 13: return {_mm512_srli_epi64(u.v512[0], 13)}; break;
|
||||
case 14: return {_mm512_srli_epi64(u.v512[0], 14)}; break;
|
||||
case 15: return {_mm512_srli_epi64(u.v512[0], 15)}; break;
|
||||
case 16: return {_mm512_srli_epi64(u.v512[0], 16)}; break;
|
||||
case 17: return {_mm512_srli_epi64(u.v512[0], 17)}; break;
|
||||
case 18: return {_mm512_srli_epi64(u.v512[0], 18)}; break;
|
||||
case 19: return {_mm512_srli_epi64(u.v512[0], 19)}; break;
|
||||
case 20: return {_mm512_srli_epi64(u.v512[0], 20)}; break;
|
||||
case 21: return {_mm512_srli_epi64(u.v512[0], 21)}; break;
|
||||
case 22: return {_mm512_srli_epi64(u.v512[0], 22)}; break;
|
||||
case 23: return {_mm512_srli_epi64(u.v512[0], 23)}; break;
|
||||
case 24: return {_mm512_srli_epi64(u.v512[0], 24)}; break;
|
||||
case 25: return {_mm512_srli_epi64(u.v512[0], 25)}; break;
|
||||
case 26: return {_mm512_srli_epi64(u.v512[0], 26)}; break;
|
||||
case 27: return {_mm512_srli_epi64(u.v512[0], 27)}; break;
|
||||
case 28: return {_mm512_srli_epi64(u.v512[0], 28)}; break;
|
||||
case 29: return {_mm512_srli_epi64(u.v512[0], 29)}; break;
|
||||
case 30: return {_mm512_srli_epi64(u.v512[0], 30)}; break;
|
||||
case 31: return {_mm512_srli_epi64(u.v512[0], 31)}; break;
|
||||
case 32: return {_mm512_srli_epi64(u.v512[0], 32)}; break;
|
||||
case 33: return {_mm512_srli_epi64(u.v512[0], 33)}; break;
|
||||
case 34: return {_mm512_srli_epi64(u.v512[0], 34)}; break;
|
||||
case 35: return {_mm512_srli_epi64(u.v512[0], 35)}; break;
|
||||
case 36: return {_mm512_srli_epi64(u.v512[0], 36)}; break;
|
||||
case 37: return {_mm512_srli_epi64(u.v512[0], 37)}; break;
|
||||
case 38: return {_mm512_srli_epi64(u.v512[0], 38)}; break;
|
||||
case 39: return {_mm512_srli_epi64(u.v512[0], 39)}; break;
|
||||
case 40: return {_mm512_srli_epi64(u.v512[0], 40)}; break;
|
||||
case 41: return {_mm512_srli_epi64(u.v512[0], 41)}; break;
|
||||
case 42: return {_mm512_srli_epi64(u.v512[0], 42)}; break;
|
||||
case 43: return {_mm512_srli_epi64(u.v512[0], 43)}; break;
|
||||
case 44: return {_mm512_srli_epi64(u.v512[0], 44)}; break;
|
||||
case 45: return {_mm512_srli_epi64(u.v512[0], 45)}; break;
|
||||
case 46: return {_mm512_srli_epi64(u.v512[0], 46)}; break;
|
||||
case 47: return {_mm512_srli_epi64(u.v512[0], 47)}; break;
|
||||
case 48: return {_mm512_srli_epi64(u.v512[0], 48)}; break;
|
||||
case 49: return {_mm512_srli_epi64(u.v512[0], 49)}; break;
|
||||
case 50: return {_mm512_srli_epi64(u.v512[0], 50)}; break;
|
||||
case 51: return {_mm512_srli_epi64(u.v512[0], 51)}; break;
|
||||
case 52: return {_mm512_srli_epi64(u.v512[0], 52)}; break;
|
||||
case 53: return {_mm512_srli_epi64(u.v512[0], 53)}; break;
|
||||
case 54: return {_mm512_srli_epi64(u.v512[0], 54)}; break;
|
||||
case 55: return {_mm512_srli_epi64(u.v512[0], 55)}; break;
|
||||
case 56: return {_mm512_srli_epi64(u.v512[0], 56)}; break;
|
||||
case 57: return {_mm512_srli_epi64(u.v512[0], 57)}; break;
|
||||
case 58: return {_mm512_srli_epi64(u.v512[0], 58)}; break;
|
||||
case 59: return {_mm512_srli_epi64(u.v512[0], 59)}; break;
|
||||
case 60: return {_mm512_srli_epi64(u.v512[0], 60)}; break;
|
||||
case 61: return {_mm512_srli_epi64(u.v512[0], 61)}; break;
|
||||
case 62: return {_mm512_srli_epi64(u.v512[0], 62)}; break;
|
||||
case 63: return {_mm512_srli_epi64(u.v512[0], 63)}; break;
|
||||
case 64: return Zeroes();
|
||||
default: break;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HS_OPTIMIZE
|
||||
template<>
|
||||
really_inline SuperVector<64> SuperVector<64>::lshift128(uint8_t const N)
|
||||
{
|
||||
return {_mm512_bslli_epi128(u.v512[0], N)};
|
||||
}
|
||||
#else
|
||||
template<>
|
||||
really_inline SuperVector<64> SuperVector<64>::lshift128(uint8_t const N)
|
||||
{
|
||||
switch(N) {
|
||||
case 0: return *this; break;
|
||||
case 1: return {_mm512_bslli_epi128(u.v512[0], 1)}; break;
|
||||
case 2: return {_mm512_bslli_epi128(u.v512[0], 2)}; break;
|
||||
case 3: return {_mm512_bslli_epi128(u.v512[0], 3)}; break;
|
||||
case 4: return {_mm512_bslli_epi128(u.v512[0], 4)}; break;
|
||||
case 5: return {_mm512_bslli_epi128(u.v512[0], 5)}; break;
|
||||
case 6: return {_mm512_bslli_epi128(u.v512[0], 6)}; break;
|
||||
case 7: return {_mm512_bslli_epi128(u.v512[0], 7)}; break;
|
||||
case 8: return {_mm512_bslli_epi128(u.v512[0], 8)}; break;
|
||||
case 9: return {_mm512_bslli_epi128(u.v512[0], 9)}; break;
|
||||
case 10: return {_mm512_bslli_epi128(u.v512[0], 10)}; break;
|
||||
case 11: return {_mm512_bslli_epi128(u.v512[0], 11)}; break;
|
||||
case 12: return {_mm512_bslli_epi128(u.v512[0], 12)}; break;
|
||||
case 13: return {_mm512_bslli_epi128(u.v512[0], 13)}; break;
|
||||
case 14: return {_mm512_bslli_epi128(u.v512[0], 14)}; break;
|
||||
case 15: return {_mm512_bslli_epi128(u.v512[0], 15)}; break;
|
||||
case 16: return {_mm512_bslli_epi128(u.v512[0], 16)}; break;
|
||||
case 17: return {_mm512_bslli_epi128(u.v512[0], 17)}; break;
|
||||
case 18: return {_mm512_bslli_epi128(u.v512[0], 18)}; break;
|
||||
case 19: return {_mm512_bslli_epi128(u.v512[0], 19)}; break;
|
||||
case 20: return {_mm512_bslli_epi128(u.v512[0], 20)}; break;
|
||||
case 21: return {_mm512_bslli_epi128(u.v512[0], 21)}; break;
|
||||
case 22: return {_mm512_bslli_epi128(u.v512[0], 22)}; break;
|
||||
case 23: return {_mm512_bslli_epi128(u.v512[0], 23)}; break;
|
||||
case 24: return {_mm512_bslli_epi128(u.v512[0], 24)}; break;
|
||||
case 25: return {_mm512_bslli_epi128(u.v512[0], 25)}; break;
|
||||
case 26: return {_mm512_bslli_epi128(u.v512[0], 26)}; break;
|
||||
case 27: return {_mm512_bslli_epi128(u.v512[0], 27)}; break;
|
||||
case 28: return {_mm512_bslli_epi128(u.v512[0], 28)}; break;
|
||||
case 29: return {_mm512_bslli_epi128(u.v512[0], 29)}; break;
|
||||
case 30: return {_mm512_bslli_epi128(u.v512[0], 30)}; break;
|
||||
case 31: return {_mm512_bslli_epi128(u.v512[0], 31)}; break;
|
||||
case 32: return {_mm512_bslli_epi128(u.v512[0], 32)}; break;
|
||||
case 33: return {_mm512_bslli_epi128(u.v512[0], 33)}; break;
|
||||
case 34: return {_mm512_bslli_epi128(u.v512[0], 34)}; break;
|
||||
case 35: return {_mm512_bslli_epi128(u.v512[0], 35)}; break;
|
||||
case 36: return {_mm512_bslli_epi128(u.v512[0], 36)}; break;
|
||||
case 37: return {_mm512_bslli_epi128(u.v512[0], 37)}; break;
|
||||
case 38: return {_mm512_bslli_epi128(u.v512[0], 38)}; break;
|
||||
case 39: return {_mm512_bslli_epi128(u.v512[0], 39)}; break;
|
||||
case 40: return {_mm512_bslli_epi128(u.v512[0], 40)}; break;
|
||||
case 41: return {_mm512_bslli_epi128(u.v512[0], 41)}; break;
|
||||
case 42: return {_mm512_bslli_epi128(u.v512[0], 42)}; break;
|
||||
case 43: return {_mm512_bslli_epi128(u.v512[0], 43)}; break;
|
||||
case 44: return {_mm512_bslli_epi128(u.v512[0], 44)}; break;
|
||||
case 45: return {_mm512_bslli_epi128(u.v512[0], 45)}; break;
|
||||
case 46: return {_mm512_bslli_epi128(u.v512[0], 46)}; break;
|
||||
case 47: return {_mm512_bslli_epi128(u.v512[0], 47)}; break;
|
||||
case 48: return {_mm512_bslli_epi128(u.v512[0], 48)}; break;
|
||||
case 49: return {_mm512_bslli_epi128(u.v512[0], 49)}; break;
|
||||
case 50: return {_mm512_bslli_epi128(u.v512[0], 50)}; break;
|
||||
case 51: return {_mm512_bslli_epi128(u.v512[0], 51)}; break;
|
||||
case 52: return {_mm512_bslli_epi128(u.v512[0], 52)}; break;
|
||||
case 53: return {_mm512_bslli_epi128(u.v512[0], 53)}; break;
|
||||
case 54: return {_mm512_bslli_epi128(u.v512[0], 54)}; break;
|
||||
case 55: return {_mm512_bslli_epi128(u.v512[0], 55)}; break;
|
||||
case 56: return {_mm512_bslli_epi128(u.v512[0], 56)}; break;
|
||||
case 57: return {_mm512_bslli_epi128(u.v512[0], 57)}; break;
|
||||
case 58: return {_mm512_bslli_epi128(u.v512[0], 58)}; break;
|
||||
case 59: return {_mm512_bslli_epi128(u.v512[0], 59)}; break;
|
||||
case 60: return {_mm512_bslli_epi128(u.v512[0], 60)}; break;
|
||||
case 61: return {_mm512_bslli_epi128(u.v512[0], 61)}; break;
|
||||
case 62: return {_mm512_bslli_epi128(u.v512[0], 62)}; break;
|
||||
case 63: return {_mm512_bslli_epi128(u.v512[0], 63)}; break;
|
||||
case 64: return Zeroes();
|
||||
default: break;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HS_OPTIMIZE
|
||||
template<>
|
||||
really_inline SuperVector<64> SuperVector<64>::rshift128(uint8_t const N)
|
||||
{
|
||||
return {_mm512_bsrli_epi128(u.v512[0], N)};
|
||||
}
|
||||
#else
|
||||
template<>
|
||||
really_inline SuperVector<64> SuperVector<64>::rshift128(uint8_t const N)
|
||||
{
|
||||
switch(N) {
|
||||
case 0: return *this; break;
|
||||
case 1: return {_mm512_bsrli_epi128(u.v512[0], 1)}; break;
|
||||
case 2: return {_mm512_bsrli_epi128(u.v512[0], 2)}; break;
|
||||
case 3: return {_mm512_bsrli_epi128(u.v512[0], 3)}; break;
|
||||
case 4: return {_mm512_bsrli_epi128(u.v512[0], 4)}; break;
|
||||
case 5: return {_mm512_bsrli_epi128(u.v512[0], 5)}; break;
|
||||
case 6: return {_mm512_bsrli_epi128(u.v512[0], 6)}; break;
|
||||
case 7: return {_mm512_bsrli_epi128(u.v512[0], 7)}; break;
|
||||
case 8: return {_mm512_bsrli_epi128(u.v512[0], 8)}; break;
|
||||
case 9: return {_mm512_bsrli_epi128(u.v512[0], 9)}; break;
|
||||
case 10: return {_mm512_bsrli_epi128(u.v512[0], 10)}; break;
|
||||
case 11: return {_mm512_bsrli_epi128(u.v512[0], 11)}; break;
|
||||
case 12: return {_mm512_bsrli_epi128(u.v512[0], 12)}; break;
|
||||
case 13: return {_mm512_bsrli_epi128(u.v512[0], 13)}; break;
|
||||
case 14: return {_mm512_bsrli_epi128(u.v512[0], 14)}; break;
|
||||
case 15: return {_mm512_bsrli_epi128(u.v512[0], 15)}; break;
|
||||
case 16: return {_mm512_bsrli_epi128(u.v512[0], 16)}; break;
|
||||
case 17: return {_mm512_bsrli_epi128(u.v512[0], 17)}; break;
|
||||
case 18: return {_mm512_bsrli_epi128(u.v512[0], 18)}; break;
|
||||
case 19: return {_mm512_bsrli_epi128(u.v512[0], 19)}; break;
|
||||
case 20: return {_mm512_bsrli_epi128(u.v512[0], 20)}; break;
|
||||
case 21: return {_mm512_bsrli_epi128(u.v512[0], 21)}; break;
|
||||
case 22: return {_mm512_bsrli_epi128(u.v512[0], 22)}; break;
|
||||
case 23: return {_mm512_bsrli_epi128(u.v512[0], 23)}; break;
|
||||
case 24: return {_mm512_bsrli_epi128(u.v512[0], 24)}; break;
|
||||
case 25: return {_mm512_bsrli_epi128(u.v512[0], 25)}; break;
|
||||
case 26: return {_mm512_bsrli_epi128(u.v512[0], 26)}; break;
|
||||
case 27: return {_mm512_bsrli_epi128(u.v512[0], 27)}; break;
|
||||
case 28: return {_mm512_bsrli_epi128(u.v512[0], 28)}; break;
|
||||
case 29: return {_mm512_bsrli_epi128(u.v512[0], 29)}; break;
|
||||
case 30: return {_mm512_bsrli_epi128(u.v512[0], 30)}; break;
|
||||
case 31: return {_mm512_bsrli_epi128(u.v512[0], 31)}; break;
|
||||
case 32: return {_mm512_bsrli_epi128(u.v512[0], 32)}; break;
|
||||
case 33: return {_mm512_bsrli_epi128(u.v512[0], 33)}; break;
|
||||
case 34: return {_mm512_bsrli_epi128(u.v512[0], 34)}; break;
|
||||
case 35: return {_mm512_bsrli_epi128(u.v512[0], 35)}; break;
|
||||
case 36: return {_mm512_bsrli_epi128(u.v512[0], 36)}; break;
|
||||
case 37: return {_mm512_bsrli_epi128(u.v512[0], 37)}; break;
|
||||
case 38: return {_mm512_bsrli_epi128(u.v512[0], 38)}; break;
|
||||
case 39: return {_mm512_bsrli_epi128(u.v512[0], 39)}; break;
|
||||
case 40: return {_mm512_bsrli_epi128(u.v512[0], 40)}; break;
|
||||
case 41: return {_mm512_bsrli_epi128(u.v512[0], 41)}; break;
|
||||
case 42: return {_mm512_bsrli_epi128(u.v512[0], 42)}; break;
|
||||
case 43: return {_mm512_bsrli_epi128(u.v512[0], 43)}; break;
|
||||
case 44: return {_mm512_bsrli_epi128(u.v512[0], 44)}; break;
|
||||
case 45: return {_mm512_bsrli_epi128(u.v512[0], 45)}; break;
|
||||
case 46: return {_mm512_bsrli_epi128(u.v512[0], 46)}; break;
|
||||
case 47: return {_mm512_bsrli_epi128(u.v512[0], 47)}; break;
|
||||
case 48: return {_mm512_bsrli_epi128(u.v512[0], 48)}; break;
|
||||
case 49: return {_mm512_bsrli_epi128(u.v512[0], 49)}; break;
|
||||
case 50: return {_mm512_bsrli_epi128(u.v512[0], 50)}; break;
|
||||
case 51: return {_mm512_bsrli_epi128(u.v512[0], 51)}; break;
|
||||
case 52: return {_mm512_bsrli_epi128(u.v512[0], 52)}; break;
|
||||
case 53: return {_mm512_bsrli_epi128(u.v512[0], 53)}; break;
|
||||
case 54: return {_mm512_bsrli_epi128(u.v512[0], 54)}; break;
|
||||
case 55: return {_mm512_bsrli_epi128(u.v512[0], 55)}; break;
|
||||
case 56: return {_mm512_bsrli_epi128(u.v512[0], 56)}; break;
|
||||
case 57: return {_mm512_bsrli_epi128(u.v512[0], 57)}; break;
|
||||
case 58: return {_mm512_bsrli_epi128(u.v512[0], 58)}; break;
|
||||
case 59: return {_mm512_bsrli_epi128(u.v512[0], 59)}; break;
|
||||
case 60: return {_mm512_bsrli_epi128(u.v512[0], 60)}; break;
|
||||
case 61: return {_mm512_bsrli_epi128(u.v512[0], 61)}; break;
|
||||
case 62: return {_mm512_bsrli_epi128(u.v512[0], 62)}; break;
|
||||
case 63: return {_mm512_bsrli_epi128(u.v512[0], 63)}; break;
|
||||
case 64: return Zeroes();
|
||||
default: break;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // HAVE_AVX512
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user