diff --git a/cmake/arch.cmake b/cmake/arch.cmake
index e98fbf22..69902f57 100644
--- a/cmake/arch.cmake
+++ b/cmake/arch.cmake
@@ -31,5 +31,24 @@ int main(){
     (void)_mm256_xor_si256(z, z);
 }" HAVE_AVX2)
 
+if (NOT HAVE_AVX2)
+    message(STATUS "Building without AVX2 support")
+endif ()
+
+# and now for AVX512
+CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}>
+#if !defined(__AVX512BW__)
+#error no avx512bw
+#endif
+
+int main(){
+    __m512i z = _mm512_setzero_si512();
+    (void)_mm512_abs_epi8(z);
+}" HAVE_AVX512)
+
+if (NOT HAVE_AVX512)
+    message(STATUS "Building without AVX512 support")
+endif ()
+
 unset (CMAKE_REQUIRED_FLAGS)
 unset (INTRIN_INC_H)
diff --git a/cmake/config.h.in b/cmake/config.h.in
index 5434668e..6e23f493 100644
--- a/cmake/config.h.in
+++ b/cmake/config.h.in
@@ -15,6 +15,9 @@
 /* "Define if building for EM64T" */
 #cmakedefine ARCH_X86_64
 
+/* Define if AVX-512BW available */
+#cmakedefine HAVE_AVX512
+
 /* internal build, switch on dump support. */
 #cmakedefine DUMP_SUPPORT
diff --git a/src/nfa/limex_accel.c b/src/nfa/limex_accel.c
index c34216f3..4834b6a5 100644
--- a/src/nfa/limex_accel.c
+++ b/src/nfa/limex_accel.c
@@ -151,18 +151,20 @@ size_t doAccel512(const m512 *state, const struct LimExNFA512 *limex,
     DEBUG_PRINTF("using PSHUFB for 512-bit shuffle\n");
     m512 accelPerm = limex->accelPermute;
     m512 accelComp = limex->accelCompare;
-#if !defined(HAVE_AVX2)
+#if defined(HAVE_AVX512)
+    idx = packedExtract512(s, accelPerm, accelComp);
+#elif defined(HAVE_AVX2)
+    u32 idx1 = packedExtract256(s.lo, accelPerm.lo, accelComp.lo);
+    u32 idx2 = packedExtract256(s.hi, accelPerm.hi, accelComp.hi);
+    assert((idx1 & idx2) == 0); // should be no shared bits
+    idx = idx1 | idx2;
+#else
     u32 idx1 = packedExtract128(s.lo.lo, accelPerm.lo.lo, accelComp.lo.lo);
     u32 idx2 = packedExtract128(s.lo.hi, accelPerm.lo.hi, accelComp.lo.hi);
     u32 idx3 = packedExtract128(s.hi.lo, accelPerm.hi.lo, accelComp.hi.lo);
     u32 idx4 = packedExtract128(s.hi.hi, accelPerm.hi.hi, accelComp.hi.hi);
     assert((idx1 & idx2 & idx3 & idx4) == 0); // should be no shared bits
     idx = idx1 | idx2 | idx3 | idx4;
-#else
-    u32 idx1 = packedExtract256(s.lo, accelPerm.lo, accelComp.lo);
-    u32 idx2 = packedExtract256(s.hi, accelPerm.hi, accelComp.hi);
-    assert((idx1 & idx2) == 0); // should be no shared bits
-    idx = idx1 | idx2;
 #endif
     return accelScanWrapper(accelTable, aux, input, idx, i, end);
 }
diff --git a/src/nfa/limex_shuffle.h b/src/nfa/limex_shuffle.h
index 5d9b3ef8..4c142a34 100644
--- a/src/nfa/limex_shuffle.h
+++ b/src/nfa/limex_shuffle.h
@@ -62,4 +62,17 @@ u32 packedExtract256(m256 s, const m256 permute, const m256 compare) {
 }
 #endif // AVX2
 
+#if defined(HAVE_AVX512)
+static really_inline
+u32 packedExtract512(m512 s, const m512 permute, const m512 compare) {
+    // vpshufb doesn't cross lanes, so this is a bit of a cheat
+    m512 shuffled = pshufb_m512(s, permute);
+    m512 compared = and512(shuffled, compare);
+    u64a rv = ~eq512mask(compared, shuffled);
+    // stitch the lane-wise results back together
+    rv = rv >> 32 | rv;
+    return (u32)(((rv >> 16) | rv) & 0xffffU);
+}
+#endif // AVX512
+
 #endif // LIMEX_SHUFFLE_H
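Aside, not part of the patch: packedExtract512() above gets a 64-bit byte-equality mask back from eq512mask(), one bit per byte, so each 128-bit lane contributes a 16-bit chunk, and the two shift-and-OR steps fold the four chunks into one 16-bit result. A minimal scalar sketch of that fold; the helper name fold_lane_mask and the test values are illustrative only:

#include <assert.h>
#include <stdint.h>

/* Model of the two-step fold in packedExtract512(): OR the upper 32 bits onto
 * the lower 32, then the upper 16 of what remains onto the lower 16. */
static uint32_t fold_lane_mask(uint64_t rv) {
    rv = rv >> 32 | rv;
    return (uint32_t)(((rv >> 16) | rv) & 0xffffU);
}

int main(void) {
    /* A bit reported by lane 3 (bits 48..63) lands in the low 16 bits. */
    assert(fold_lane_mask(1ULL << 50) == (1U << 2));
    /* Bits in the same position of different lanes collapse onto one bit. */
    assert(fold_lane_mask((1ULL << 2) | (1ULL << 18)) == (1U << 2));
    return 0;
}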
diff --git a/src/nfa/nfa_build_util.cpp b/src/nfa/nfa_build_util.cpp
index 3103cd29..9185ccdd 100644
--- a/src/nfa/nfa_build_util.cpp
+++ b/src/nfa/nfa_build_util.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2017, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -401,7 +401,7 @@ const char *NFATraits::name = "Sheng";
 template<> struct NFATraits {
     UNUSED static const char *name;
     static const NFACategory category = NFA_OTHER;
-    static const u32 stateAlign = 32;
+    static const u32 stateAlign = 64;
     static const bool fast = true;
     static const nfa_dispatch_fn has_accel;
     static const nfa_dispatch_fn has_repeats;
diff --git a/src/util/simd_types.h b/src/util/simd_types.h
index 64844dcb..962cad6c 100644
--- a/src/util/simd_types.h
+++ b/src/util/simd_types.h
@@ -46,9 +46,12 @@ typedef __m256i m256;
 typedef struct ALIGN_AVX_DIRECTIVE {m128 lo; m128 hi;} m256;
 #endif
 
-// these should align to 16 and 32 respectively
 typedef struct {m128 lo; m128 mid; m128 hi;} m384;
-typedef struct {m256 lo; m256 hi;} m512;
+#if defined(HAVE_AVX512)
+typedef __m512i m512;
+#else
+typedef struct ALIGN_ATTR(64) {m256 lo; m256 hi;} m512;
+#endif
 
 #endif /* SIMD_TYPES_H */
diff --git a/src/util/simd_utils.c b/src/util/simd_utils.c
index 54b5b4ba..25a81412 100644
--- a/src/util/simd_utils.c
+++ b/src/util/simd_utils.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, Intel Corporation
+ * Copyright (c) 2016-2017, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -49,6 +49,7 @@ ALIGN_CL_DIRECTIVE const char vbs_mask_data[] = {
 
 /** \brief LUT for the mask1bit functions. */
 ALIGN_CL_DIRECTIVE const u8 simd_onebit_masks[] = {
+    ZEROES_32, ZEROES_32,
     ZEROES_31, 0x01, ZEROES_32,
     ZEROES_31, 0x02, ZEROES_32,
     ZEROES_31, 0x04, ZEROES_32,
@@ -57,4 +58,5 @@ ALIGN_CL_DIRECTIVE const u8 simd_onebit_masks[] = {
     ZEROES_31, 0x20, ZEROES_32,
     ZEROES_31, 0x40, ZEROES_32,
     ZEROES_31, 0x80, ZEROES_32,
+    ZEROES_32, ZEROES_32,
 };
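Aside, not part of the patch: with the two new ZEROES_32 rows above, simd_onebit_masks carries 64 bytes of zero padding before and after the eight 64-byte strides, so the mask1bit helpers in simd_utils.h (whose index moves from +31 to +95 further down) can take an unaligned load of up to 64 bytes without running off either end of the table. A standalone sketch of that indexing, with the table layout assumed from the patch:

#include <assert.h>
#include <string.h>

int main(void) {
    /* 64 leading zeros, eight 64-byte strides with one set byte each, 64 trailing zeros. */
    unsigned char masks[64 + 8 * 64 + 64];
    memset(masks, 0, sizeof(masks));
    for (unsigned k = 0; k < 8; k++) {
        masks[95 + 64 * k] = (unsigned char)(1u << k);
    }
    for (unsigned n = 0; n < 512; n++) {
        unsigned mask_idx = ((n % 8) * 64) + 95 - n / 8;
        assert(mask_idx + 64 <= sizeof(masks));              /* a 64-byte window stays in bounds */
        assert(masks[mask_idx + n / 8] == (1u << (n % 8)));  /* byte n/8 of the window carries bit n%8 */
    }
    return 0;
}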
diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h
index 1f884843..5f4fe921 100644
--- a/src/util/simd_utils.h
+++ b/src/util/simd_utils.h
@@ -240,7 +240,7 @@ extern const u8 simd_onebit_masks[];
 static really_inline
 m128 mask1bit128(unsigned int n) {
     assert(n < sizeof(m128) * 8);
-    u32 mask_idx = ((n % 8) * 64) + 31;
+    u32 mask_idx = ((n % 8) * 64) + 95;
     mask_idx -= n / 8;
     return loadu128(&simd_onebit_masks[mask_idx]);
 }
@@ -290,6 +290,18 @@ m256 vpshufb(m256 a, m256 b) {
 #endif
 }
 
+#if defined(HAVE_AVX512)
+static really_inline
+m512 pshufb_m512(m512 a, m512 b) {
+    return _mm512_shuffle_epi8(a, b);
+}
+
+static really_inline
+m512 maskz_pshufb_m512(__mmask64 k, m512 a, m512 b) {
+    return _mm512_maskz_shuffle_epi8(k, a, b);
+}
+#endif
+
 static really_inline
 m128 variable_byte_shift_m128(m128 in, s32 amount) {
     assert(amount >= -16 && amount <= 16);
@@ -592,7 +604,7 @@ m256 loadbytes256(const void *ptr, unsigned int n) {
 static really_inline
 m256 mask1bit256(unsigned int n) {
     assert(n < sizeof(m256) * 8);
-    u32 mask_idx = ((n % 8) * 64) + 31;
+    u32 mask_idx = ((n % 8) * 64) + 95;
     mask_idx -= n / 8;
     return loadu256(&simd_onebit_masks[mask_idx]);
 }
@@ -902,41 +914,110 @@ char testbit384(m384 val, unsigned int n) {
 **** 512-bit Primitives
 ****/
 
-static really_inline m512 and512(m512 a, m512 b) {
+#define eq512mask(a, b) _mm512_cmpeq_epi8_mask((a), (b))
+#define masked_eq512mask(k, a, b) _mm512_mask_cmpeq_epi8_mask((k), (a), (b))
+
+static really_inline
+m512 zeroes512(void) {
+#if defined(HAVE_AVX512)
+    return _mm512_setzero_si512();
+#else
+    m512 rv = {zeroes256(), zeroes256()};
+    return rv;
+#endif
+}
+
+static really_inline
+m512 ones512(void) {
+#if defined(HAVE_AVX512)
+    return _mm512_set1_epi8(0xFF);
+    //return _mm512_xor_si512(_mm512_setzero_si512(), _mm512_setzero_si512());
+#else
+    m512 rv = {ones256(), ones256()};
+    return rv;
+#endif
+}
+
+#if defined(HAVE_AVX512)
+static really_inline
+m512 set64x8(u8 a) {
+    return _mm512_set1_epi8(a);
+}
+
+static really_inline
+m512 set8x64(u64a a) {
+    return _mm512_set1_epi64(a);
+}
+
+static really_inline
+m512 set4x128(m128 a) {
+    return _mm512_broadcast_i32x4(a);
+}
+#endif
+
+static really_inline
+m512 and512(m512 a, m512 b) {
+#if defined(HAVE_AVX512)
+    return _mm512_and_si512(a, b);
+#else
     m512 rv;
     rv.lo = and256(a.lo, b.lo);
     rv.hi = and256(a.hi, b.hi);
     return rv;
+#endif
 }
 
-static really_inline m512 or512(m512 a, m512 b) {
+static really_inline
+m512 or512(m512 a, m512 b) {
+#if defined(HAVE_AVX512)
+    return _mm512_or_si512(a, b);
+#else
     m512 rv;
     rv.lo = or256(a.lo, b.lo);
     rv.hi = or256(a.hi, b.hi);
     return rv;
+#endif
 }
 
-static really_inline m512 xor512(m512 a, m512 b) {
+static really_inline
+m512 xor512(m512 a, m512 b) {
+#if defined(HAVE_AVX512)
+    return _mm512_xor_si512(a, b);
+#else
     m512 rv;
     rv.lo = xor256(a.lo, b.lo);
     rv.hi = xor256(a.hi, b.hi);
     return rv;
+#endif
 }
 
-static really_inline m512 not512(m512 a) {
+static really_inline
+m512 not512(m512 a) {
+#if defined(HAVE_AVX512)
+    return _mm512_xor_si512(a, ones512());
+#else
     m512 rv;
     rv.lo = not256(a.lo);
     rv.hi = not256(a.hi);
     return rv;
+#endif
 }
 
-static really_inline m512 andnot512(m512 a, m512 b) {
+static really_inline
+m512 andnot512(m512 a, m512 b) {
+#if defined(HAVE_AVX512)
+    return _mm512_andnot_si512(a, b);
+#else
     m512 rv;
     rv.lo = andnot256(a.lo, b.lo);
     rv.hi = andnot256(a.hi, b.hi);
     return rv;
+#endif
 }
 
+#if defined(HAVE_AVX512)
+#define lshift64_m512(a, b) _mm512_slli_epi64((a), b)
+#else
 // The shift amount is an immediate
 static really_really_inline
 m512 lshift64_m512(m512 a, unsigned b) {
@@ -945,29 +1026,37 @@ m512 lshift64_m512(m512 a, unsigned b) {
     rv.hi = lshift64_m256(a.hi, b);
     return rv;
 }
+#endif
 
-static really_inline m512 zeroes512(void) {
-    m512 rv = {zeroes256(), zeroes256()};
-    return rv;
-}
+#if defined(HAVE_AVX512)
+#define rshift64_m512(a, b) _mm512_srli_epi64((a), (b))
+#define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed)
+#endif
 
-static really_inline m512 ones512(void) {
-    m512 rv = {ones256(), ones256()};
-    return rv;
-}
+#if !defined(_MM_CMPINT_NE)
+#define _MM_CMPINT_NE 0x4
+#endif
 
-static really_inline int diff512(m512 a, m512 b) {
+static really_inline
+int diff512(m512 a, m512 b) {
+#if defined(HAVE_AVX512)
+    return !!_mm512_cmp_epi8_mask(a, b, _MM_CMPINT_NE);
+#else
     return diff256(a.lo, b.lo) || diff256(a.hi, b.hi);
+#endif
 }
 
-static really_inline int isnonzero512(m512 a) {
-#if !defined(HAVE_AVX2)
+static really_inline
+int isnonzero512(m512 a) {
+#if defined(HAVE_AVX512)
+    return diff512(a, zeroes512());
+#elif defined(HAVE_AVX2)
+    m256 x = or256(a.lo, a.hi);
+    return !!diff256(x, zeroes256());
+#else
     m128 x = or128(a.lo.lo, a.lo.hi);
     m128 y = or128(a.hi.lo, a.hi.hi);
     return isnonzero128(or128(x, y));
-#else
-    m256 x = or256(a.lo, a.hi);
-    return !!diff256(x, zeroes256());
 #endif
 }
@@ -975,8 +1064,11 @@ static really_inline int isnonzero512(m512 a) {
  * "Rich" version of diff512(). Takes two vectors a and b and returns a 16-bit
  * mask indicating which 32-bit words contain differences.
  */
-static really_inline u32 diffrich512(m512 a, m512 b) {
-#if defined(HAVE_AVX2)
+static really_inline
+u32 diffrich512(m512 a, m512 b) {
+#if defined(HAVE_AVX512)
+    return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_NE);
+#elif defined(HAVE_AVX2)
     return diffrich256(a.lo, b.lo) | (diffrich256(a.hi, b.hi) << 8);
 #else
     a.lo.lo = _mm_cmpeq_epi32(a.lo.lo, b.lo.lo);
@@ -993,22 +1085,32 @@ static really_inline u32 diffrich512(m512 a, m512 b) {
  * "Rich" version of diffrich(), 64-bit variant. Takes two vectors a and b and
  * returns a 16-bit mask indicating which 64-bit words contain differences.
  */
-static really_inline u32 diffrich64_512(m512 a, m512 b) {
+static really_inline
+u32 diffrich64_512(m512 a, m512 b) {
+    //TODO: cmp_epi64?
     u32 d = diffrich512(a, b);
     return (d | (d >> 1)) & 0x55555555;
 }
 
 // aligned load
-static really_inline m512 load512(const void *ptr) {
+static really_inline
+m512 load512(const void *ptr) {
+#if defined(HAVE_AVX512)
+    return _mm512_load_si512(ptr);
+#else
     assert(ISALIGNED_N(ptr, alignof(m256)));
     m512 rv = { load256(ptr), load256((const char *)ptr + 32) };
     return rv;
+#endif
 }
 
 // aligned store
-static really_inline void store512(void *ptr, m512 a) {
-    assert(ISALIGNED_N(ptr, alignof(m256)));
-#if defined(HAVE_AVX2)
+static really_inline
+void store512(void *ptr, m512 a) {
+    assert(ISALIGNED_N(ptr, alignof(m512)));
+#if defined(HAVE_AVX512)
+    return _mm512_store_si512(ptr, a);
+#elif defined(HAVE_AVX2)
     m512 *x = (m512 *)ptr;
     store256(&x->lo, a.lo);
     store256(&x->hi, a.hi);
@@ -1019,11 +1121,28 @@ static really_inline void store512(void *ptr, m512 a) {
 }
 
 // unaligned load
-static really_inline m512 loadu512(const void *ptr) {
+static really_inline
+m512 loadu512(const void *ptr) {
+#if defined(HAVE_AVX512)
+    return _mm512_loadu_si512(ptr);
+#else
     m512 rv = { loadu256(ptr), loadu256((const char *)ptr + 32) };
     return rv;
+#endif
 }
 
+#if defined(HAVE_AVX512)
+static really_inline
+m512 loadu_maskz_m512(__mmask64 k, const void *ptr) {
+    return _mm512_maskz_loadu_epi8(k, ptr);
+}
+
+static really_inline
+m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) {
+    return _mm512_mask_loadu_epi8(src, k, ptr);
+}
+#endif
+
 // packed unaligned store of first N bytes
 static really_inline
 void storebytes512(void *ptr, m512 a, unsigned int n) {
@@ -1040,6 +1159,14 @@ m512 loadbytes512(const void *ptr, unsigned int n) {
     return a;
 }
 
+static really_inline
+m512 mask1bit512(unsigned int n) {
+    assert(n < sizeof(m512) * 8);
+    u32 mask_idx = ((n % 8) * 64) + 95;
+    mask_idx -= n / 8;
+    return loadu512(&simd_onebit_masks[mask_idx]);
+}
+
 // switches on bit N in the given vector.
 static really_inline
 void setbit512(m512 *ptr, unsigned int n) {
@@ -1056,6 +1183,8 @@ void setbit512(m512 *ptr, unsigned int n) {
         sub = &ptr->hi.hi;
     }
     setbit128(sub, n % 128);
+#elif defined(HAVE_AVX512)
+    *ptr = or512(mask1bit512(n), *ptr);
 #else
     m256 *sub;
     if (n < 256) {
@@ -1084,6 +1213,8 @@ void clearbit512(m512 *ptr, unsigned int n) {
         sub = &ptr->hi.hi;
     }
     clearbit128(sub, n % 128);
+#elif defined(HAVE_AVX512)
+    *ptr = andnot512(mask1bit512(n), *ptr);
 #else
     m256 *sub;
     if (n < 256) {
@@ -1112,6 +1243,9 @@ char testbit512(m512 val, unsigned int n) {
         sub = val.hi.hi;
    }
     return testbit128(sub, n % 128);
+#elif defined(HAVE_AVX512)
+    const m512 mask = mask1bit512(n);
+    return !!_mm512_test_epi8_mask(mask, val);
 #else
     m256 sub;
     if (n < 256) {
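Aside, not part of the patch: diffrich64_512() above keeps reusing diffrich512()'s one-bit-per-32-bit-word mask and OR-folds each pair of bits onto the even bit of that pair, so a difference anywhere in 64-bit word k is reported through bit 2k. A scalar sketch of that fold; the helper name is illustrative:

#include <assert.h>
#include <stdint.h>

/* Model of the (d | (d >> 1)) & 0x55555555 step in diffrich64_512(). */
static uint32_t fold_to_64bit_words(uint32_t d) {
    return (d | (d >> 1)) & 0x55555555;
}

int main(void) {
    /* A difference in 32-bit word 15 (the top half of 64-bit word 7)... */
    assert(fold_to_64bit_words(1u << 15) == (1u << 14));
    /* ...and one in word 14 (the bottom half) land on the same even bit. */
    assert(fold_to_64bit_words(1u << 14) == (1u << 14));
    return 0;
}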
diff --git a/src/util/state_compress.c b/src/util/state_compress.c
index 87e62429..7238849e 100644
--- a/src/util/state_compress.c
+++ b/src/util/state_compress.c
@@ -547,16 +547,21 @@ m512 loadcompressed512_32bit(const void *ptr, m512 mvec) {
                      expand32(v[14], m[14]), expand32(v[15], m[15]) };
 
     m512 xvec;
-#if !defined(HAVE_AVX2)
-    xvec.lo.lo = _mm_set_epi32(x[3], x[2], x[1], x[0]);
-    xvec.lo.hi = _mm_set_epi32(x[7], x[6], x[5], x[4]);
-    xvec.hi.lo = _mm_set_epi32(x[11], x[10], x[9], x[8]);
-    xvec.hi.hi = _mm_set_epi32(x[15], x[14], x[13], x[12]);
-#else
+#if defined(HAVE_AVX512)
+    xvec = _mm512_set_epi32(x[15], x[14], x[13], x[12],
+                            x[11], x[10], x[9], x[8],
+                            x[7], x[6], x[5], x[4],
+                            x[3], x[2], x[1], x[0]);
+#elif defined(HAVE_AVX2)
     xvec.lo = _mm256_set_epi32(x[7], x[6], x[5], x[4],
                                x[3], x[2], x[1], x[0]);
     xvec.hi = _mm256_set_epi32(x[15], x[14], x[13], x[12],
                                x[11], x[10], x[9], x[8]);
+#else
+    xvec.lo.lo = _mm_set_epi32(x[3], x[2], x[1], x[0]);
+    xvec.lo.hi = _mm_set_epi32(x[7], x[6], x[5], x[4]);
+    xvec.hi.lo = _mm_set_epi32(x[11], x[10], x[9], x[8]);
+    xvec.hi.hi = _mm_set_epi32(x[15], x[14], x[13], x[12]);
 #endif
     return xvec;
 }
@@ -582,14 +587,17 @@ m512 loadcompressed512_64bit(const void *ptr, m512 mvec) {
                      expand64(v[4], m[4]), expand64(v[5], m[5]),
                      expand64(v[6], m[6]), expand64(v[7], m[7]) };
 
-#if !defined(HAVE_AVX2)
+#if defined(HAVE_AVX512)
+    m512 xvec = _mm512_set_epi64(x[7], x[6], x[5], x[4],
+                                 x[3], x[2], x[1], x[0]);
+#elif defined(HAVE_AVX2)
+    m512 xvec = { .lo = _mm256_set_epi64x(x[3], x[2], x[1], x[0]),
+                  .hi = _mm256_set_epi64x(x[7], x[6], x[5], x[4])};
+#else
     m512 xvec = { .lo = { _mm_set_epi64x(x[1], x[0]),
                           _mm_set_epi64x(x[3], x[2]) },
                   .hi = { _mm_set_epi64x(x[5], x[4]),
                           _mm_set_epi64x(x[7], x[6]) } };
-#else
-    m512 xvec = { .lo = _mm256_set_epi64x(x[3], x[2], x[1], x[0]),
-                  .hi = _mm256_set_epi64x(x[7], x[6], x[5], x[4])};
 #endif
     return xvec;
 }
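Aside, not part of the patch: _mm512_set_epi32 and _mm512_set_epi64 list their arguments from the most-significant element down, which is why x[15]..x[0] above builds the same vector as the two _mm256_set_epi32 calls in the AVX2 branch. A quick SSE2 illustration of that argument order; the values are arbitrary:

#include <assert.h>
#include <stdint.h>
#include <emmintrin.h>

int main(void) {
    /* Arguments run from lane 3 down to lane 0. */
    __m128i v = _mm_set_epi32(3, 2, 1, 0);
    uint32_t out[4];
    _mm_storeu_si128((__m128i *)out, v);
    /* Lane 0 in memory gets the last argument. */
    assert(out[0] == 0 && out[3] == 3);
    return 0;
}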
diff --git a/unit/internal/shuffle.cpp b/unit/internal/shuffle.cpp
index fcf337f2..b2316bab 100644
--- a/unit/internal/shuffle.cpp
+++ b/unit/internal/shuffle.cpp
@@ -165,14 +165,15 @@ TEST(Shuffle, PackedExtract64_3) {
 
 template<typename T>
 static void build_pshufb_masks_onebit(unsigned int bit, T *permute, T *compare) {
-    static_assert(sizeof(T) == sizeof(m128) || sizeof(T) == sizeof(m256),
+    static_assert(sizeof(T) == sizeof(m128) || sizeof(T) == sizeof(m256) ||
+                  sizeof(T) == sizeof(m512),
                   "should be valid type");
     // permute mask has 0x80 in all bytes except the one we care about
     memset(permute, 0x80, sizeof(*permute));
     memset(compare, 0, sizeof(*compare));
     char *pmsk = (char *)permute;
     char *cmsk = (char *)compare;
-    u8 off = (bit >= 128) ? 0x10 : 0;
+    u8 off = (bit >= 128) ? (bit >= 256) ? (bit >= 384) ? 0x30 : 0x20 : 0x10 : 0;
     pmsk[off] = bit/8;
     cmsk[off] = ~(1 << (bit % 8));
 }
@@ -214,4 +215,24 @@ TEST(Shuffle, PackedExtract256_1) {
     }
 }
 #endif
+
+#if defined(HAVE_AVX512)
+TEST(Shuffle, PackedExtract512_1) {
+    // Try all possible one-bit masks
+    for (unsigned int i = 0; i < 512; i++) {
+        // shuffle a single 1 bit to the front
+        m512 permute, compare;
+        build_pshufb_masks_onebit(i, &permute, &compare);
+        EXPECT_EQ(1U, packedExtract512(setbit(i), permute, compare));
+        EXPECT_EQ(1U, packedExtract512(ones512(), permute, compare));
+        // we should get zero out of these cases
+        EXPECT_EQ(0U, packedExtract512(zeroes512(), permute, compare));
+        EXPECT_EQ(0U, packedExtract512(not512(setbit(i)), permute, compare));
+        // we should get zero out of all the other bit positions
+        for (unsigned int j = 0; (j != i && j < 512); j++) {
+            EXPECT_EQ(0U, packedExtract512(setbit(j), permute, compare));
+        }
+    }
+}
+#endif
 } // namespace
diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp
index dac3722e..0d3926d6 100644
--- a/unit/internal/simd_utils.cpp
+++ b/unit/internal/simd_utils.cpp
@@ -590,7 +590,7 @@ TEST(SimdUtilsTest, alignment) {
     ASSERT_EQ(16, alignof(m128));
     ASSERT_EQ(32, alignof(m256));
     ASSERT_EQ(16, alignof(m384));
-    ASSERT_EQ(32, alignof(m512));
+    ASSERT_EQ(64, alignof(m512));
 }
 
 TEST(SimdUtilsTest, movq) {
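Aside, not part of the patch: the nested ternary added to build_pshufb_masks_onebit() selects byte offset 0x00, 0x10, 0x20 or 0x30, i.e. the first byte of whichever 128-bit lane holds `bit`. A standalone check that it matches (bit / 128) * 0x10, for illustration only:

#include <assert.h>

int main(void) {
    for (unsigned bit = 0; bit < 512; bit++) {
        unsigned char off =
            (bit >= 128) ? (bit >= 256) ? (bit >= 384) ? 0x30 : 0x20 : 0x10 : 0;
        /* One 16-byte lane per 128 bits. */
        assert(off == (bit / 128) * 0x10);
    }
    return 0;
}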