diff --git a/src/util/arch/common/simd_utils.h b/src/util/arch/common/simd_utils.h index e682e2d5..56d9dbaf 100644 --- a/src/util/arch/common/simd_utils.h +++ b/src/util/arch/common/simd_utils.h @@ -147,10 +147,12 @@ static really_inline int isnonzero256(m256 a) { } /** - * "Rich" version of diff256(). Takes two vectors a and b and returns an 8-bit + * "Rich" version of diff256(). Takes two vectors a and b and returns a 8-bit * mask indicating which 32-bit words contain differences. */ -static really_inline u32 diffrich256(m256 a, m256 b) { +static really_inline +u32 diffrich256(m256 a, m256 b) { + return diffrich128(a.lo, b.lo) | (diffrich128(a.hi, b.hi) << 8); } /** @@ -311,26 +313,6 @@ m256 pshufb_m256(m256 a, m256 b) { return rv; } -#define cast256to128(a) _mm256_castsi256_si128(a) -#define cast128to256(a) _mm256_castsi128_si256(a) -#define swap128in256(a) _mm256_permute4x64_epi64(a, 0x4E) -#define insert128to256(a, b, imm) _mm256_inserti128_si256(a, b, imm) -#define rshift128_m256(a, count_immed) _mm256_srli_si256(a, count_immed) -#define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed) -#define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2) -#define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4) -#define extractlow64from256(a) _mm_cvtsi128_si64(cast256to128(a)) -#define extractlow32from256(a) movd(cast256to128(a)) -#define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b) -#define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b) -#define vpalignr(r, l, offset) _mm256_alignr_epi8(r, l, offset) - -#define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm) -#define interleave512hi(a, b) _mm512_unpackhi_epi8(a, b) -#define interleave512lo(a, b) _mm512_unpacklo_epi8(a, b) -#define set2x256(a) _mm512_broadcast_i64x4(a) -#define mask_set2x256(src, k, a) _mm512_mask_broadcast_i64x4(src, k, a) - #endif // HAVE_SIMD_256_BITS /**** @@ -402,13 +384,6 @@ static really_inline int isnonzero384(m384 a) { return isnonzero128(or128(or128(a.lo, a.mid), a.hi)); } -/** - * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit - * mask indicating which 32-bit words contain differences. - */ -static really_inline u32 diffrich384(m384 a, m384 b) { -} - /** * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and * returns a 12-bit mask indicating which 64-bit words contain differences. @@ -507,9 +482,6 @@ char testbit384(m384 val, unsigned int n) { ****/ #if !defined(HAVE_SIMD_512_BITS) -#define eq512mask(a, b) _mm512_cmpeq_epi8_mask((a), (b)) -#define masked_eq512mask(k, a, b) _mm512_mask_cmpeq_epi8_mask((k), (a), (b)) -#define vpermq512(idx, a) _mm512_permutexvar_epi64(idx, a) static really_inline m512 zeroes512(void) { @@ -608,12 +580,6 @@ m512 lshift64_m512(m512 a, unsigned b) { return rv; } -#if defined(HAVE_AVX512) -#define rshift64_m512(a, b) _mm512_srli_epi64((a), (b)) -#define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed) -#define lshift128_m512(a, count_immed) _mm512_bslli_epi128(a, count_immed) -#endif - static really_inline int diff512(m512 a, m512 b) { return diff256(a.lo, b.lo) || diff256(a.hi, b.hi); @@ -621,9 +587,9 @@ int diff512(m512 a, m512 b) { static really_inline int isnonzero512(m512 a) { - m128 x = or128(a.lo.lo, a.lo.hi); - m128 y = or128(a.hi.lo, a.hi.hi); - return isnonzero128(or128(x, y)); + m256 x = or256(a.lo, a.lo); + m256 y = or256(a.hi, a.hi); + return isnonzero256(or256(x, y)); } /** diff --git a/src/util/arch/x86/simd_utils.h b/src/util/arch/x86/simd_utils.h index 2d099f56..4a1a691e 100644 --- a/src/util/arch/x86/simd_utils.h +++ b/src/util/arch/x86/simd_utils.h @@ -127,22 +127,8 @@ static really_inline u32 movd(const m128 in) { return _mm_cvtsi128_si32(in); } -#if defined(HAVE_AVX512) -static really_inline u32 movd512(const m512 in) { - // NOTE: seems gcc doesn't support _mm512_cvtsi512_si32(in), - // so we use 2-step convertions to work around. - return _mm_cvtsi128_si32(_mm512_castsi512_si128(in)); -} -#endif - static really_inline u64a movq(const m128 in) { -#if defined(ARCH_X86_64) return _mm_cvtsi128_si64(in); -#else // 32-bit - this is horrific - u32 lo = movd(in); - u32 hi = movd(_mm_srli_epi64(in, 32)); - return (u64a)hi << 32 | lo; -#endif } /* another form of movq */ @@ -281,36 +267,6 @@ m128 pshufb_m128(m128 a, m128 b) { return result; } -static really_inline -m256 pshufb_m256(m256 a, m256 b) { -#if defined(HAVE_AVX2) - return _mm256_shuffle_epi8(a, b); -#else - m256 rv; - rv.lo = pshufb_m128(a.lo, b.lo); - rv.hi = pshufb_m128(a.hi, b.hi); - return rv; -#endif -} - -#if defined(HAVE_AVX512) -static really_inline -m512 pshufb_m512(m512 a, m512 b) { - return _mm512_shuffle_epi8(a, b); -} - -static really_inline -m512 maskz_pshufb_m512(__mmask64 k, m512 a, m512 b) { - return _mm512_maskz_shuffle_epi8(k, a, b); -} - -#if defined(HAVE_AVX512VBMI) -#define vpermb512(idx, a) _mm512_permutexvar_epi8(idx, a) -#define maskz_vpermb512(k, idx, a) _mm512_maskz_permutexvar_epi8(k, idx, a) -#endif - -#endif - static really_inline m128 variable_byte_shift_m128(m128 in, s32 amount) { assert(amount >= -16 && amount <= 16); @@ -352,7 +308,12 @@ m128 set2x64(u64a hi, u64a lo) { **** 256-bit Primitives ****/ -#if defined(HAVE_AVX2) +#if defined(HAVE_SIMD_256_BITS) && defined(HAVE_AVX2) + +static really_inline +m256 pshufb_m256(m256 a, m256 b) { + return _mm256_shuffle_epi8(a, b); +} static really_really_inline m256 lshift64_m256(m256 a, unsigned b) { @@ -379,143 +340,41 @@ m256 set1_2x128(m128 a) { return _mm256_broadcastsi128_si256(a); } -#else - -static really_really_inline -m256 lshift64_m256(m256 a, int b) { - m256 rv = a; - rv.lo = lshift64_m128(rv.lo, b); - rv.hi = lshift64_m128(rv.hi, b); - return rv; -} - -static really_inline -m256 rshift64_m256(m256 a, int b) { - m256 rv = a; - rv.lo = rshift64_m128(rv.lo, b); - rv.hi = rshift64_m128(rv.hi, b); - return rv; -} - -static really_inline -m256 eq256(m256 a, m256 b) { - m256 rv; - rv.lo = eq128(a.lo, b.lo); - rv.hi = eq128(a.hi, b.hi); - return rv; -} - -static really_inline -u32 movemask256(m256 a) { - u32 lo_mask = movemask128(a.lo); - u32 hi_mask = movemask128(a.hi); - return lo_mask | (hi_mask << 16); -} - -static really_inline -m256 set1_2x128(m128 a) { - m256 rv = {a, a}; - return rv; -} -#endif - static really_inline m256 zeroes256(void) { -#if defined(HAVE_AVX2) return _mm256_setzero_si256(); -#else - m256 rv = {zeroes128(), zeroes128()}; - return rv; -#endif } static really_inline m256 ones256(void) { -#if defined(HAVE_AVX2) m256 rv = _mm256_set1_epi8(0xFF); -#else - m256 rv = {ones128(), ones128()}; -#endif return rv; } -#if defined(HAVE_AVX2) static really_inline m256 and256(m256 a, m256 b) { return _mm256_and_si256(a, b); } -#else -static really_inline m256 and256(m256 a, m256 b) { - m256 rv; - rv.lo = and128(a.lo, b.lo); - rv.hi = and128(a.hi, b.hi); - return rv; -} -#endif -#if defined(HAVE_AVX2) static really_inline m256 or256(m256 a, m256 b) { return _mm256_or_si256(a, b); } -#else -static really_inline m256 or256(m256 a, m256 b) { - m256 rv; - rv.lo = or128(a.lo, b.lo); - rv.hi = or128(a.hi, b.hi); - return rv; -} -#endif -#if defined(HAVE_AVX2) static really_inline m256 xor256(m256 a, m256 b) { return _mm256_xor_si256(a, b); } -#else -static really_inline m256 xor256(m256 a, m256 b) { - m256 rv; - rv.lo = xor128(a.lo, b.lo); - rv.hi = xor128(a.hi, b.hi); - return rv; -} -#endif -#if defined(HAVE_AVX2) static really_inline m256 not256(m256 a) { return _mm256_xor_si256(a, ones256()); } -#else -static really_inline m256 not256(m256 a) { - m256 rv; - rv.lo = not128(a.lo); - rv.hi = not128(a.hi); - return rv; -} -#endif -#if defined(HAVE_AVX2) static really_inline m256 andnot256(m256 a, m256 b) { return _mm256_andnot_si256(a, b); } -#else -static really_inline m256 andnot256(m256 a, m256 b) { - m256 rv; - rv.lo = andnot128(a.lo, b.lo); - rv.hi = andnot128(a.hi, b.hi); - return rv; -} -#endif static really_inline int diff256(m256 a, m256 b) { -#if defined(HAVE_AVX2) return !!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) ^ (int)-1); -#else - return diff128(a.lo, b.lo) || diff128(a.hi, b.hi); -#endif } static really_inline int isnonzero256(m256 a) { -#if defined(HAVE_AVX2) return !!diff256(a, zeroes256()); -#else - return isnonzero128(or128(a.lo, a.hi)); -#endif } /** @@ -523,16 +382,8 @@ static really_inline int isnonzero256(m256 a) { * mask indicating which 32-bit words contain differences. */ static really_inline u32 diffrich256(m256 a, m256 b) { -#if defined(HAVE_AVX2) a = _mm256_cmpeq_epi32(a, b); return ~(_mm256_movemask_ps(_mm256_castsi256_ps(a))) & 0xFF; -#else - m128 z = zeroes128(); - a.lo = _mm_cmpeq_epi32(a.lo, b.lo); - a.hi = _mm_cmpeq_epi32(a.hi, b.hi); - m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.hi), z); - return ~(_mm_movemask_epi8(packed)) & 0xff; -#endif } /** @@ -547,24 +398,12 @@ static really_inline u32 diffrich64_256(m256 a, m256 b) { // aligned load static really_inline m256 load256(const void *ptr) { assert(ISALIGNED_N(ptr, alignof(m256))); -#if defined(HAVE_AVX2) return _mm256_load_si256((const m256 *)ptr); -#else - m256 rv = { load128(ptr), load128((const char *)ptr + 16) }; - return rv; -#endif } // aligned load of 128-bit value to low and high part of 256-bit value static really_inline m256 load2x128(const void *ptr) { -#if defined(HAVE_AVX2) return set1_2x128(load128(ptr)); -#else - assert(ISALIGNED_N(ptr, alignof(m128))); - m256 rv; - rv.hi = rv.lo = load128(ptr); - return rv; -#endif } static really_inline m256 loadu2x128(const void *ptr) { @@ -574,32 +413,17 @@ static really_inline m256 loadu2x128(const void *ptr) { // aligned store static really_inline void store256(void *ptr, m256 a) { assert(ISALIGNED_N(ptr, alignof(m256))); -#if defined(HAVE_AVX2) _mm256_store_si256((m256 *)ptr, a); -#else - ptr = assume_aligned(ptr, 16); - *(m256 *)ptr = a; -#endif } // unaligned load static really_inline m256 loadu256(const void *ptr) { -#if defined(HAVE_AVX2) return _mm256_loadu_si256((const m256 *)ptr); -#else - m256 rv = { loadu128(ptr), loadu128((const char *)ptr + 16) }; - return rv; -#endif } // unaligned store static really_inline void storeu256(void *ptr, m256 a) { -#if defined(HAVE_AVX2) _mm256_storeu_si256((m256 *)ptr, a); -#else - storeu128(ptr, a.lo); - storeu128((char *)ptr + 16, a.hi); -#endif } // packed unaligned store of first N bytes @@ -628,101 +452,19 @@ m256 mask1bit256(unsigned int n) { static really_inline m256 set1_32x8(u32 in) { -#if defined(HAVE_AVX2) return _mm256_set1_epi8(in); -#else - m256 rv; - rv.hi = set1_16x8(in); - rv.lo = set1_16x8(in); - return rv; -#endif } static really_inline m256 set8x32(u32 hi_3, u32 hi_2, u32 hi_1, u32 hi_0, u32 lo_3, u32 lo_2, u32 lo_1, u32 lo_0) { -#if defined(HAVE_AVX2) return _mm256_set_epi32(hi_3, hi_2, hi_1, hi_0, lo_3, lo_2, lo_1, lo_0); -#else - m256 rv; - rv.hi = set4x32(hi_3, hi_2, hi_1, hi_0); - rv.lo = set4x32(lo_3, lo_2, lo_1, lo_0); - return rv; -#endif } static really_inline m256 set4x64(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) { -#if defined(HAVE_AVX2) return _mm256_set_epi64x(hi_1, hi_0, lo_1, lo_0); -#else - m256 rv; - rv.hi = set2x64(hi_1, hi_0); - rv.lo = set2x64(lo_1, lo_0); - return rv; -#endif } -#if !defined(HAVE_AVX2) -// switches on bit N in the given vector. -static really_inline -void setbit256(m256 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - m128 *sub; - if (n < 128) { - sub = &ptr->lo; - } else { - sub = &ptr->hi; - n -= 128; - } - setbit128(sub, n); -} - -// switches off bit N in the given vector. -static really_inline -void clearbit256(m256 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - m128 *sub; - if (n < 128) { - sub = &ptr->lo; - } else { - sub = &ptr->hi; - n -= 128; - } - clearbit128(sub, n); -} - -// tests bit N in the given vector. -static really_inline -char testbit256(m256 val, unsigned int n) { - assert(n < sizeof(val) * 8); - m128 sub; - if (n < 128) { - sub = val.lo; - } else { - sub = val.hi; - n -= 128; - } - return testbit128(sub, n); -} - -static really_really_inline -m128 movdq_hi(m256 x) { - return x.hi; -} - -static really_really_inline -m128 movdq_lo(m256 x) { - return x.lo; -} - -static really_inline -m256 combine2x128(m128 hi, m128 lo) { - m256 rv = {lo, hi}; - return rv; -} - -#else // AVX2 - // switches on bit N in the given vector. static really_inline void setbit256(m256 *ptr, unsigned int n) { @@ -775,88 +517,12 @@ m256 combine2x128(m128 hi, m128 lo) { } #endif //AVX2 -#if defined(HAVE_AVX512) -#define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm) -#define interleave512hi(a, b) _mm512_unpackhi_epi8(a, b) -#define interleave512lo(a, b) _mm512_unpacklo_epi8(a, b) -#define set2x256(a) _mm512_broadcast_i64x4(a) -#define mask_set2x256(src, k, a) _mm512_mask_broadcast_i64x4(src, k, a) -#define vpermq512(idx, a) _mm512_permutexvar_epi64(idx, a) -#endif - -/**** - **** 384-bit Primitives - ****/ - -static really_inline m384 and384(m384 a, m384 b) { - m384 rv; - rv.lo = and128(a.lo, b.lo); - rv.mid = and128(a.mid, b.mid); - rv.hi = and128(a.hi, b.hi); - return rv; -} - -static really_inline m384 or384(m384 a, m384 b) { - m384 rv; - rv.lo = or128(a.lo, b.lo); - rv.mid = or128(a.mid, b.mid); - rv.hi = or128(a.hi, b.hi); - return rv; -} - -static really_inline m384 xor384(m384 a, m384 b) { - m384 rv; - rv.lo = xor128(a.lo, b.lo); - rv.mid = xor128(a.mid, b.mid); - rv.hi = xor128(a.hi, b.hi); - return rv; -} -static really_inline m384 not384(m384 a) { - m384 rv; - rv.lo = not128(a.lo); - rv.mid = not128(a.mid); - rv.hi = not128(a.hi); - return rv; -} -static really_inline m384 andnot384(m384 a, m384 b) { - m384 rv; - rv.lo = andnot128(a.lo, b.lo); - rv.mid = andnot128(a.mid, b.mid); - rv.hi = andnot128(a.hi, b.hi); - return rv; -} - -static really_really_inline -m384 lshift64_m384(m384 a, unsigned b) { - m384 rv; - rv.lo = lshift64_m128(a.lo, b); - rv.mid = lshift64_m128(a.mid, b); - rv.hi = lshift64_m128(a.hi, b); - return rv; -} - -static really_inline m384 zeroes384(void) { - m384 rv = {zeroes128(), zeroes128(), zeroes128()}; - return rv; -} - -static really_inline m384 ones384(void) { - m384 rv = {ones128(), ones128(), ones128()}; - return rv; -} - -static really_inline int diff384(m384 a, m384 b) { - return diff128(a.lo, b.lo) || diff128(a.mid, b.mid) || diff128(a.hi, b.hi); -} - -static really_inline int isnonzero384(m384 a) { - return isnonzero128(or128(or128(a.lo, a.mid), a.hi)); -} - +#if defined(HAVE_SIMD_128_BITS) /** * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit * mask indicating which 32-bit words contain differences. */ + static really_inline u32 diffrich384(m384 a, m384 b) { m128 z = zeroes128(); a.lo = _mm_cmpeq_epi32(a.lo, b.lo); @@ -867,102 +533,42 @@ static really_inline u32 diffrich384(m384 a, m384 b) { return ~(_mm_movemask_epi8(packed)) & 0xfff; } -/** - * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and - * returns a 12-bit mask indicating which 64-bit words contain differences. - */ -static really_inline u32 diffrich64_384(m384 a, m384 b) { - u32 d = diffrich384(a, b); - return (d | (d >> 1)) & 0x55555555; -} - -// aligned load -static really_inline m384 load384(const void *ptr) { - assert(ISALIGNED_16(ptr)); - m384 rv = { load128(ptr), load128((const char *)ptr + 16), - load128((const char *)ptr + 32) }; - return rv; -} - -// aligned store -static really_inline void store384(void *ptr, m384 a) { - assert(ISALIGNED_16(ptr)); - ptr = assume_aligned(ptr, 16); - *(m384 *)ptr = a; -} - -// unaligned load -static really_inline m384 loadu384(const void *ptr) { - m384 rv = { loadu128(ptr), loadu128((const char *)ptr + 16), - loadu128((const char *)ptr + 32)}; - return rv; -} - -// packed unaligned store of first N bytes -static really_inline -void storebytes384(void *ptr, m384 a, unsigned int n) { - assert(n <= sizeof(a)); - memcpy(ptr, &a, n); -} - -// packed unaligned load of first N bytes, pad with zero -static really_inline -m384 loadbytes384(const void *ptr, unsigned int n) { - m384 a = zeroes384(); - assert(n <= sizeof(a)); - memcpy(&a, ptr, n); - return a; -} - -// switches on bit N in the given vector. -static really_inline -void setbit384(m384 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - m128 *sub; - if (n < 128) { - sub = &ptr->lo; - } else if (n < 256) { - sub = &ptr->mid; - } else { - sub = &ptr->hi; - } - setbit128(sub, n % 128); -} - -// switches off bit N in the given vector. -static really_inline -void clearbit384(m384 *ptr, unsigned int n) { - assert(n < sizeof(*ptr) * 8); - m128 *sub; - if (n < 128) { - sub = &ptr->lo; - } else if (n < 256) { - sub = &ptr->mid; - } else { - sub = &ptr->hi; - } - clearbit128(sub, n % 128); -} - -// tests bit N in the given vector. -static really_inline -char testbit384(m384 val, unsigned int n) { - assert(n < sizeof(val) * 8); - m128 sub; - if (n < 128) { - sub = val.lo; - } else if (n < 256) { - sub = val.mid; - } else { - sub = val.hi; - } - return testbit128(sub, n % 128); -} +#endif // HAVE_SIMD_128_BITS /**** **** 512-bit Primitives ****/ +#if defined(HAVE_SIMD_512_BITS) + +#define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm) +#define interleave512hi(a, b) _mm512_unpackhi_epi8(a, b) +#define interleave512lo(a, b) _mm512_unpacklo_epi8(a, b) +#define set2x256(a) _mm512_broadcast_i64x4(a) +#define mask_set2x256(src, k, a) _mm512_mask_broadcast_i64x4(src, k, a) +#define vpermq512(idx, a) _mm512_permutexvar_epi64(idx, a) + +static really_inline u32 movd512(const m512 in) { + // NOTE: seems gcc doesn't support _mm512_cvtsi512_si32(in), + // so we use 2-step convertions to work around. + return _mm_cvtsi128_si32(_mm512_castsi512_si128(in)); +} + +static really_inline +m512 pshufb_m512(m512 a, m512 b) { + return _mm512_shuffle_epi8(a, b); +} + +static really_inline +m512 maskz_pshufb_m512(__mmask64 k, m512 a, m512 b) { + return _mm512_maskz_shuffle_epi8(k, a, b); +} + +#if defined(HAVE_AVX512VBMI) +#define vpermb512(idx, a) _mm512_permutexvar_epi8(idx, a) +#define maskz_vpermb512(k, idx, a) _mm512_maskz_permutexvar_epi8(k, idx, a) +#endif + #define eq512mask(a, b) _mm512_cmpeq_epi8_mask((a), (b)) #define masked_eq512mask(k, a, b) _mm512_mask_cmpeq_epi8_mask((k), (a), (b)) @@ -978,16 +584,10 @@ m512 zeroes512(void) { static really_inline m512 ones512(void) { -#if defined(HAVE_AVX512) return _mm512_set1_epi8(0xFF); //return _mm512_xor_si512(_mm512_setzero_si512(), _mm512_setzero_si512()); -#else - m512 rv = {ones256(), ones256()}; - return rv; -#endif } -#if defined(HAVE_AVX512) static really_inline m512 set1_64x8(u8 a) { return _mm512_set1_epi8(a); @@ -1015,69 +615,32 @@ static really_inline m512 set1_4x128(m128 a) { return _mm512_broadcast_i32x4(a); } -#endif static really_inline m512 and512(m512 a, m512 b) { -#if defined(HAVE_AVX512) return _mm512_and_si512(a, b); -#else - m512 rv; - rv.lo = and256(a.lo, b.lo); - rv.hi = and256(a.hi, b.hi); - return rv; -#endif } static really_inline m512 or512(m512 a, m512 b) { -#if defined(HAVE_AVX512) return _mm512_or_si512(a, b); -#else - m512 rv; - rv.lo = or256(a.lo, b.lo); - rv.hi = or256(a.hi, b.hi); - return rv; -#endif } static really_inline m512 xor512(m512 a, m512 b) { -#if defined(HAVE_AVX512) return _mm512_xor_si512(a, b); -#else - m512 rv; - rv.lo = xor256(a.lo, b.lo); - rv.hi = xor256(a.hi, b.hi); - return rv; -#endif } static really_inline m512 not512(m512 a) { -#if defined(HAVE_AVX512) return _mm512_xor_si512(a, ones512()); -#else - m512 rv; - rv.lo = not256(a.lo); - rv.hi = not256(a.hi); - return rv; -#endif } static really_inline m512 andnot512(m512 a, m512 b) { -#if defined(HAVE_AVX512) return _mm512_andnot_si512(a, b); -#else - m512 rv; - rv.lo = andnot256(a.lo, b.lo); - rv.hi = andnot256(a.hi, b.hi); - return rv; -#endif } -#if defined(HAVE_AVX512) static really_really_inline m512 lshift64_m512(m512 a, unsigned b) { #if defined(HAVE__BUILTIN_CONSTANT_P) @@ -1088,21 +651,10 @@ m512 lshift64_m512(m512 a, unsigned b) { m128 x = _mm_cvtsi32_si128(b); return _mm512_sll_epi64(a, x); } -#else -static really_really_inline -m512 lshift64_m512(m512 a, unsigned b) { - m512 rv; - rv.lo = lshift64_m256(a.lo, b); - rv.hi = lshift64_m256(a.hi, b); - return rv; -} -#endif -#if defined(HAVE_AVX512) #define rshift64_m512(a, b) _mm512_srli_epi64((a), (b)) #define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed) #define lshift128_m512(a, count_immed) _mm512_bslli_epi128(a, count_immed) -#endif #if !defined(_MM_CMPINT_NE) #define _MM_CMPINT_NE 0x4 @@ -1110,25 +662,12 @@ m512 lshift64_m512(m512 a, unsigned b) { static really_inline int diff512(m512 a, m512 b) { -#if defined(HAVE_AVX512) return !!_mm512_cmp_epi8_mask(a, b, _MM_CMPINT_NE); -#else - return diff256(a.lo, b.lo) || diff256(a.hi, b.hi); -#endif } static really_inline int isnonzero512(m512 a) { -#if defined(HAVE_AVX512) return diff512(a, zeroes512()); -#elif defined(HAVE_AVX2) - m256 x = or256(a.lo, a.hi); - return !!diff256(x, zeroes256()); -#else - m128 x = or128(a.lo.lo, a.lo.hi); - m128 y = or128(a.hi.lo, a.hi.hi); - return isnonzero128(or128(x, y)); -#endif } /** @@ -1137,19 +676,7 @@ int isnonzero512(m512 a) { */ static really_inline u32 diffrich512(m512 a, m512 b) { -#if defined(HAVE_AVX512) return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_NE); -#elif defined(HAVE_AVX2) - return diffrich256(a.lo, b.lo) | (diffrich256(a.hi, b.hi) << 8); -#else - a.lo.lo = _mm_cmpeq_epi32(a.lo.lo, b.lo.lo); - a.lo.hi = _mm_cmpeq_epi32(a.lo.hi, b.lo.hi); - a.hi.lo = _mm_cmpeq_epi32(a.hi.lo, b.hi.lo); - a.hi.hi = _mm_cmpeq_epi32(a.hi.hi, b.hi.hi); - m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo.lo, a.lo.hi), - _mm_packs_epi32(a.hi.lo, a.hi.hi)); - return ~(_mm_movemask_epi8(packed)) & 0xffff; -#endif } /** @@ -1166,43 +693,22 @@ u32 diffrich64_512(m512 a, m512 b) { // aligned load static really_inline m512 load512(const void *ptr) { -#if defined(HAVE_AVX512) return _mm512_load_si512(ptr); -#else - assert(ISALIGNED_N(ptr, alignof(m256))); - m512 rv = { load256(ptr), load256((const char *)ptr + 32) }; - return rv; -#endif } // aligned store static really_inline void store512(void *ptr, m512 a) { assert(ISALIGNED_N(ptr, alignof(m512))); -#if defined(HAVE_AVX512) return _mm512_store_si512(ptr, a); -#elif defined(HAVE_AVX2) - m512 *x = (m512 *)ptr; - store256(&x->lo, a.lo); - store256(&x->hi, a.hi); -#else - ptr = assume_aligned(ptr, 16); - *(m512 *)ptr = a; -#endif } // unaligned load static really_inline m512 loadu512(const void *ptr) { -#if defined(HAVE_AVX512) return _mm512_loadu_si512(ptr); -#else - m512 rv = { loadu256(ptr), loadu256((const char *)ptr + 32) }; - return rv; -#endif } -#if defined(HAVE_AVX512) static really_inline m512 loadu_maskz_m512(__mmask64 k, const void *ptr) { return _mm512_maskz_loadu_epi8(k, ptr); @@ -1217,7 +723,6 @@ static really_inline m512 set_mask_m512(__mmask64 k) { return _mm512_movm_epi8(k); } -#endif // packed unaligned store of first N bytes static really_inline @@ -1247,91 +752,24 @@ m512 mask1bit512(unsigned int n) { static really_inline void setbit512(m512 *ptr, unsigned int n) { assert(n < sizeof(*ptr) * 8); -#if !defined(HAVE_AVX2) - m128 *sub; - if (n < 128) { - sub = &ptr->lo.lo; - } else if (n < 256) { - sub = &ptr->lo.hi; - } else if (n < 384) { - sub = &ptr->hi.lo; - } else { - sub = &ptr->hi.hi; - } - setbit128(sub, n % 128); -#elif defined(HAVE_AVX512) *ptr = or512(mask1bit512(n), *ptr); -#else - m256 *sub; - if (n < 256) { - sub = &ptr->lo; - } else { - sub = &ptr->hi; - n -= 256; - } - setbit256(sub, n); -#endif } // switches off bit N in the given vector. static really_inline void clearbit512(m512 *ptr, unsigned int n) { assert(n < sizeof(*ptr) * 8); -#if !defined(HAVE_AVX2) - m128 *sub; - if (n < 128) { - sub = &ptr->lo.lo; - } else if (n < 256) { - sub = &ptr->lo.hi; - } else if (n < 384) { - sub = &ptr->hi.lo; - } else { - sub = &ptr->hi.hi; - } - clearbit128(sub, n % 128); -#elif defined(HAVE_AVX512) *ptr = andnot512(mask1bit512(n), *ptr); -#else - m256 *sub; - if (n < 256) { - sub = &ptr->lo; - } else { - sub = &ptr->hi; - n -= 256; - } - clearbit256(sub, n); -#endif } // tests bit N in the given vector. static really_inline char testbit512(m512 val, unsigned int n) { assert(n < sizeof(val) * 8); -#if !defined(HAVE_AVX2) - m128 sub; - if (n < 128) { - sub = val.lo.lo; - } else if (n < 256) { - sub = val.lo.hi; - } else if (n < 384) { - sub = val.hi.lo; - } else { - sub = val.hi.hi; - } - return testbit128(sub, n % 128); -#elif defined(HAVE_AVX512) const m512 mask = mask1bit512(n); return !!_mm512_test_epi8_mask(mask, val); -#else - m256 sub; - if (n < 256) { - sub = val.lo; - } else { - sub = val.hi; - n -= 256; - } - return testbit256(sub, n); -#endif } +#endif // HAVE_SIMD_512_BITS + #endif // ARCH_X86_SIMD_UTILS_H