From 5f930b267c596eb277626d20475107a63cb874f8 Mon Sep 17 00:00:00 2001
From: Wang Xiang W
Date: Fri, 24 Apr 2020 11:51:34 -0400
Subject: [PATCH] Limex: exception handling with AVX512

---
 src/nfa/limex_compile.cpp      | 58 ++++++++++++++++++++++++-
 src/nfa/limex_exceptional.h    | 78 +++++++++++++++++++++++++++++++---
 src/nfa/limex_internal.h       |  6 ++-
 src/util/arch/x86/simd_utils.h | 22 ++++++++++
 src/util/uniform_ops.h         | 14 +++++-
 5 files changed, 169 insertions(+), 9 deletions(-)

diff --git a/src/nfa/limex_compile.cpp b/src/nfa/limex_compile.cpp
index 207597ba..9233ae51 100644
--- a/src/nfa/limex_compile.cpp
+++ b/src/nfa/limex_compile.cpp
@@ -1922,7 +1922,8 @@ struct Factory {
     }
 
     static
-    void writeExceptions(const map<ExceptionProto, vector<u32>> &exceptionMap,
+    void writeExceptions(const build_info &args,
+                         const map<ExceptionProto, vector<u32>> &exceptionMap,
                          const vector<u32> &repeatOffsets, implNFA_t *limex,
                          const u32 exceptionsOffset,
                          const u32 reportListOffset) {
@@ -1974,6 +1975,59 @@ struct Factory {
 
         limex->exceptionOffset = exceptionsOffset;
         limex->exceptionCount = ecount;
+
+        if (args.num_states > 64 && args.cc.target_info.has_avx512vbmi()) {
+            const u8 *exceptionMask = (const u8 *)(&limex->exceptionMask);
+            u8 *shufMask = (u8 *)&limex->exceptionShufMask;
+            u8 *bitMask = (u8 *)&limex->exceptionBitMask;
+            u8 *andMask = (u8 *)&limex->exceptionAndMask;
+
+            u32 tot_cnt = 0;
+            u32 pos = 0;
+            bool valid = true;
+            size_t tot = sizeof(limex->exceptionMask);
+            size_t base = 0;
+
+            // We normally have up to 64 exceptions to handle,
+            // but treat 384 state Limex differently to simplify operations
+            size_t limit = 64;
+            if (args.num_states > 256 && args.num_states <= 384) {
+                limit = 48;
+            }
+
+            for (size_t i = 0; i < tot; i++) {
+                if (!exceptionMask[i]) {
+                    continue;
+                }
+                u32 bit_cnt = popcount32(exceptionMask[i]);
+
+                tot_cnt += bit_cnt;
+                if (tot_cnt > limit) {
+                    valid = false;
+                    break;
+                }
+
+                u32 emsk = exceptionMask[i];
+                while (emsk) {
+                    u32 t = findAndClearLSB_32(&emsk);
+                    bitMask[pos] = 1U << t;
+                    andMask[pos] = 1U << t;
+                    shufMask[pos++] = i + base;
+
+                    if (pos == 32 &&
+                        (args.num_states > 128 && args.num_states <= 256)) {
+                        base += 32;
+                    }
+                }
+            }
+            // Avoid matching unused bytes
+            for (u32 i = pos; i < 64; i++) {
+                bitMask[i] = 0xff;
+            }
+            if (valid) {
+                setLimexFlag(limex, LIMEX_FLAG_EXTRACT_EXP);
+            }
+        }
     }
 
     static
@@ -2299,7 +2353,7 @@ struct Factory {
         writeRepeats(repeats, repeatOffsets, limex, repeatOffsetsOffset,
                      repeatsOffset);
 
-        writeExceptions(exceptionMap, repeatOffsets, limex, exceptionsOffset,
+        writeExceptions(args, exceptionMap, repeatOffsets, limex, exceptionsOffset,
                         reportListOffset);
 
         writeLimexMasks(args, limex);
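The new block in writeExceptions() builds the three lookup tables the AVX512VBMI runtime path consumes: for every set bit in the exception mask, shufMask records which byte of the state vector holds that bit, while andMask and bitMask hold a one-hot byte selecting the bit within it. Leftover lanes get bitMask = 0xff (their andMask stays zero in the zero-initialized structure), so the runtime byte-compare can never fire for them. The standalone sketch below mirrors that construction on a 16-byte mask; build_extract_masks() and the toy sizes are illustrative, not part of the patch.

    #include <stdint.h>
    #include <stdio.h>

    /* Sketch of the writeExceptions() mask construction, shrunk to a
     * 16-byte exception mask. Lanes are filled in state-bit order, so a
     * lane index doubles as an index into the exceptions array. */
    static void build_extract_masks(const uint8_t exceptionMask[16],
                                    uint8_t shufMask[16], uint8_t andMask[16],
                                    uint8_t bitMask[16]) {
        uint32_t pos = 0;
        for (uint32_t i = 0; i < 16; i++) {
            uint8_t emsk = exceptionMask[i];
            while (emsk) {
                uint32_t t = (uint32_t)__builtin_ctz(emsk); /* lowest set bit */
                emsk &= emsk - 1;                           /* clear it */
                bitMask[pos] = (uint8_t)(1U << t); /* expected byte pattern */
                andMask[pos] = (uint8_t)(1U << t); /* isolate this bit */
                shufMask[pos++] = (uint8_t)i;      /* state byte to gather */
            }
        }
        for (uint32_t i = pos; i < 16; i++) {
            bitMask[i] = 0xff; /* avoid matching unused lanes */
        }
    }

    int main(void) {
        uint8_t em[16] = { 0x81, 0x00, 0x06, 0x00 }; /* state bits 0,7,17,18 */
        uint8_t shuf[16] = {0}, andm[16] = {0}, bitm[16] = {0};
        build_extract_masks(em, shuf, andm, bitm);
        for (int i = 0; i < 4; i++) {
            printf("lane %d: state byte %u, bit 0x%02x\n", i,
                   (unsigned)shuf[i], (unsigned)bitm[i]);
        }
        return 0;
    }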
diff --git a/src/nfa/limex_exceptional.h b/src/nfa/limex_exceptional.h
index e770c327..6c7335f1 100644
--- a/src/nfa/limex_exceptional.h
+++ b/src/nfa/limex_exceptional.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2020, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -47,6 +47,8 @@
 #define AND_STATE JOIN(and_, STATE_T)
 #define EQ_STATE(a, b) (!JOIN(noteq_, STATE_T)((a), (b)))
 #define OR_STATE JOIN(or_, STATE_T)
+#define EXPAND_STATE JOIN(expand_, STATE_T)
+#define SHUFFLE_BYTE_STATE JOIN(shuffle_byte_, STATE_T)
 #define TESTBIT_STATE JOIN(testbit_, STATE_T)
 #define EXCEPTION_T JOIN(struct NFAException, SIZE)
 #define CONTEXT_T JOIN(NFAContext, SIZE)
@@ -208,7 +210,7 @@ int RUN_EXCEPTION_FN(const EXCEPTION_T *e, STATE_ARG,
 /** \brief Process all of the exceptions associated with the states in the \a
  * estate. */
 static really_inline
-int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ,
+int PE_FN(STATE_ARG, ESTATE_ARG, UNUSED u32 diffmask, STATE_T *succ,
           const struct IMPL_NFA_T *limex, const EXCEPTION_T *exceptions,
           u64a offset, struct CONTEXT_T *ctx, char in_rev, char flags) {
     assert(diffmask > 0); // guaranteed by caller macro
@@ -233,6 +235,72 @@ int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ,
     ctx->local_succ = ZERO_STATE;
 #endif
 
+    struct proto_cache new_cache = {0, NULL};
+    enum CacheResult cacheable = CACHE_RESULT;
+
+#if defined(HAVE_AVX512VBMI) && SIZE > 64
+    if (likely(limex->flags & LIMEX_FLAG_EXTRACT_EXP)) {
+        m512 emask = EXPAND_STATE(*STATE_ARG_P);
+        emask = SHUFFLE_BYTE_STATE(load_m512(&limex->exceptionShufMask), emask);
+        emask = and512(emask, load_m512(&limex->exceptionAndMask));
+        u64a word = eq512mask(emask, load_m512(&limex->exceptionBitMask));
+
+        do {
+            u32 bit = FIND_AND_CLEAR_FN(&word);
+            const EXCEPTION_T *e = &exceptions[bit];
+
+            if (!RUN_EXCEPTION_FN(e, STATE_ARG_NAME, succ,
+#ifndef BIG_MODEL
+                                  &local_succ,
+#endif
+                                  limex, offset, ctx, &new_cache, &cacheable,
+                                  in_rev, flags)) {
+                return PE_RV_HALT;
+            }
+        } while (word);
+    } else {
+        // A copy of the estate as an array of GPR-sized chunks.
+        CHUNK_T chunks[sizeof(STATE_T) / sizeof(CHUNK_T)];
+        CHUNK_T emask_chunks[sizeof(STATE_T) / sizeof(CHUNK_T)];
+#ifdef ESTATE_ON_STACK
+        memcpy(chunks, &estate, sizeof(STATE_T));
+#else
+        memcpy(chunks, estatep, sizeof(STATE_T));
+#endif
+        memcpy(emask_chunks, &limex->exceptionMask, sizeof(STATE_T));
+
+        u32 base_index[sizeof(STATE_T) / sizeof(CHUNK_T)];
+        base_index[0] = 0;
+        for (s32 i = 0; i < (s32)ARRAY_LENGTH(base_index) - 1; i++) {
+            base_index[i + 1] = base_index[i] + POPCOUNT_FN(emask_chunks[i]);
+        }
+
+        do {
+            u32 t = findAndClearLSB_32(&diffmask);
+#ifdef ARCH_64_BIT
+            t >>= 1; // Due to diffmask64, which leaves holes in the bitmask.
+#endif
+            assert(t < ARRAY_LENGTH(chunks));
+            CHUNK_T word = chunks[t];
+            assert(word != 0);
+            do {
+                u32 bit = FIND_AND_CLEAR_FN(&word);
+                u32 local_index = RANK_IN_MASK_FN(emask_chunks[t], bit);
+                u32 idx = local_index + base_index[t];
+                const EXCEPTION_T *e = &exceptions[idx];
+
+                if (!RUN_EXCEPTION_FN(e, STATE_ARG_NAME, succ,
+#ifndef BIG_MODEL
+                                      &local_succ,
+#endif
+                                      limex, offset, ctx, &new_cache, &cacheable,
+                                      in_rev, flags)) {
+                    return PE_RV_HALT;
+                }
+            } while (word);
+        } while (diffmask);
+    }
+#else
     // A copy of the estate as an array of GPR-sized chunks.
     CHUNK_T chunks[sizeof(STATE_T) / sizeof(CHUNK_T)];
     CHUNK_T emask_chunks[sizeof(STATE_T) / sizeof(CHUNK_T)];
@@ -243,9 +311,6 @@ int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ,
 #endif
     memcpy(emask_chunks, &limex->exceptionMask, sizeof(STATE_T));
 
-    struct proto_cache new_cache = {0, NULL};
-    enum CacheResult cacheable = CACHE_RESULT;
-
     u32 base_index[sizeof(STATE_T) / sizeof(CHUNK_T)];
     base_index[0] = 0;
     for (s32 i = 0; i < (s32)ARRAY_LENGTH(base_index) - 1; i++) {
@@ -276,6 +341,7 @@ int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ,
             }
         } while (word);
     } while (diffmask);
+#endif
 
 #ifndef BIG_MODEL
     *succ = OR_STATE(*succ, local_succ);
@@ -307,6 +373,8 @@ int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ,
 #undef AND_STATE
 #undef EQ_STATE
 #undef OR_STATE
+#undef EXPAND_STATE
+#undef SHUFFLE_BYTE_STATE
 #undef TESTBIT_STATE
 #undef PE_FN
 #undef RUN_EXCEPTION_FN
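With those tables in place, the fast path extracts every pending exception in four data-parallel steps: expand the current state into a 512-bit register, byte-shuffle it so lane i holds the state byte containing exception i's bit, AND with the one-hot mask, and byte-compare against the expected pattern. The result is a 64-bit word with one bit per active exception, which feeds the usual FIND_AND_CLEAR_FN loop with no per-chunk rank arithmetic. A hedged sketch for SIZE == 128 with the uniform-op macros expanded to raw intrinsics follows; exception_word_128() is a hypothetical name, and it needs AVX512VBMI (plus AVX512BW for the byte compare).

    #include <immintrin.h>
    #include <stdint.h>

    static inline uint64_t exception_word_128(__m128i state, __m512i shufMask,
                                              __m512i andMask, __m512i bitMask) {
        __m512i e = _mm512_broadcast_i32x4(state); /* EXPAND_STATE / expand128 */
        e = _mm512_shuffle_epi8(e, shufMask);      /* SHUFFLE_BYTE_STATE */
        e = _mm512_and_si512(e, andMask);          /* one candidate bit per lane */
        /* Lane i compares equal iff exception i's state bit is on; unused
         * lanes were given bitMask 0xff and can never match. */
        return _mm512_cmpeq_epi8_mask(e, bitMask); /* eq512mask */
    }

Each popped bit of the returned word indexes exceptions[] directly; the else branch keeps the original chunk-and-rank scan for NFAs whose exceptions exceeded the 64-lane budget at compile time, i.e. when LIMEX_FLAG_EXTRACT_EXP is unset.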
diff --git a/src/nfa/limex_internal.h b/src/nfa/limex_internal.h
index db703f03..23b1bd97 100644
--- a/src/nfa/limex_internal.h
+++ b/src/nfa/limex_internal.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2017, Intel Corporation
+ * Copyright (c) 2015-2020, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -86,6 +86,7 @@
 #define LIMEX_FLAG_COMPRESS_STATE   1 /**< pack state into stream state */
 #define LIMEX_FLAG_COMPRESS_MASKED  2 /**< use reach mask-based compression */
 #define LIMEX_FLAG_CANNOT_DIE       4 /**< limex cannot have no states on */
+#define LIMEX_FLAG_EXTRACT_EXP      8 /**< use limex exception bit extraction */
 
 enum LimExTrigger {
     LIMEX_TRIGGER_NONE = 0,
@@ -157,6 +158,9 @@ struct LimExNFA##size {                                  \
     u_##size shift[MAX_SHIFT_COUNT];                                    \
     u32 shiftCount; /**< number of shift masks used */                  \
     u8 shiftAmount[MAX_SHIFT_COUNT]; /**< shift amount for each mask */ \
+    m512 exceptionShufMask; /**< exception byte shuffle mask */         \
+    m512 exceptionBitMask; /**< exception bit mask */                   \
+    m512 exceptionAndMask; /**< exception and mask */                   \
 };
 
 CREATE_NFA_LIMEX(32)
diff --git a/src/util/arch/x86/simd_utils.h b/src/util/arch/x86/simd_utils.h
index ca72b71d..fd13d676 100644
--- a/src/util/arch/x86/simd_utils.h
+++ b/src/util/arch/x86/simd_utils.h
@@ -187,6 +187,12 @@ static really_inline m128 or128(m128 a, m128 b) {
     return _mm_or_si128(a,b);
 }
 
+#if defined(HAVE_AVX512VBMI)
+static really_inline m512 expand128(m128 a) {
+    return _mm512_broadcast_i32x4(a);
+}
+#endif
+
 static really_inline m128 andnot128(m128 a, m128 b) {
     return _mm_andnot_si128(a, b);
 }
@@ -374,6 +380,12 @@ static really_inline m256 or256(m256 a, m256 b) {
     return _mm256_or_si256(a, b);
 }
 
+#if defined(HAVE_AVX512VBMI)
+static really_inline m512 expand256(m256 a) {
+    return _mm512_broadcast_i64x4(a);
+}
+#endif
+
 static really_inline m256 xor256(m256 a, m256 b) {
     return _mm256_xor_si256(a, b);
 }
@@ -684,6 +696,16 @@ m512 or512(m512 a, m512 b) {
     return _mm512_or_si512(a, b);
 }
 
+#if defined(HAVE_AVX512VBMI)
+static really_inline m512 expand384(m384 a) {
+    u64a *lo = (u64a*)&a.lo;
+    u64a *mid = (u64a*)&a.mid;
+    u64a *hi = (u64a*)&a.hi;
+    return _mm512_set_epi64(0ULL, 0ULL, hi[1], hi[0], mid[1], mid[0],
+                            lo[1], lo[0]);
+}
+#endif
+
 static really_inline
 m512 xor512(m512 a, m512 b) {
     return _mm512_xor_si512(a, b);
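The expand*() helpers widen each state width to 512 bits so that a single byte shuffle can reach every state byte: m128 and m256 are broadcast with one instruction, while m384 has no native broadcast and is assembled from its three 128-bit lanes. The sketch below shows the same packing with a stand-in type; vec384 and expand384_sketch() are illustrative names. Note that _mm512_set_epi64() takes its eight arguments high to low, which is why the two zero words come first and leave the top 128 bits clear.

    #include <immintrin.h>
    #include <stdint.h>

    /* Stand-in for the library's m384: three 128-bit lanes, lowest first. */
    struct vec384 {
        __m128i lo;
        __m128i mid;
        __m128i hi;
    };

    /* Mirrors expand384(): pack 384 bits of state into the low six 64-bit
     * elements of a zmm register, zeroing the top 128 bits. */
    static inline __m512i expand384_sketch(const struct vec384 *a) {
        uint64_t w[6];
        _mm_storeu_si128((__m128i *)&w[0], a->lo);
        _mm_storeu_si128((__m128i *)&w[2], a->mid);
        _mm_storeu_si128((__m128i *)&w[4], a->hi);
        return _mm512_set_epi64(0, 0, (long long)w[5], (long long)w[4],
                                (long long)w[3], (long long)w[2],
                                (long long)w[1], (long long)w[0]);
    }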
diff --git a/src/util/uniform_ops.h b/src/util/uniform_ops.h
index 3385e441..262104ac 100644
--- a/src/util/uniform_ops.h
+++ b/src/util/uniform_ops.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, Intel Corporation
+ * Copyright (c) 2015-2020, Intel Corporation
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -101,6 +101,18 @@
 #define or_m384(a, b) (or384(a, b))
 #define or_m512(a, b) (or512(a, b))
 
+#if defined(HAVE_AVX512VBMI)
+#define expand_m128(a) (expand128(a))
+#define expand_m256(a) (expand256(a))
+#define expand_m384(a) (expand384(a))
+#define expand_m512(a) (a)
+
+#define shuffle_byte_m128(a, b) (pshufb_m512(b, a))
+#define shuffle_byte_m256(a, b) (vpermb512(a, b))
+#define shuffle_byte_m384(a, b) (vpermb512(a, b))
+#define shuffle_byte_m512(a, b) (vpermb512(a, b))
+#endif
+
 #define and_u8(a, b) ((a) & (b))
 #define and_u32(a, b) ((a) & (b))
 #define and_u64a(a, b) ((a) & (b))
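The macro layer hides an instruction asymmetry: pshufb (_mm512_shuffle_epi8) takes (data, indices) and shuffles within each 128-bit lane, while vpermb (_mm512_permutexvar_epi8) takes (indices, data) and shuffles across the whole register. That is why shuffle_byte_m128 swaps its arguments; an in-lane shuffle suffices there because expand128() broadcasts the same 16 state bytes into all four lanes, whereas the wider states need the cross-lane vpermb, the instruction that makes AVX512VBMI the gating feature. A sketch of the two flavours, assuming pshufb_m512() and vpermb512() wrap those two intrinsics:

    #include <immintrin.h>

    /* In-lane shuffle used for a broadcast 128-bit state: every 16-byte
     * lane of 'data' is identical, so lane-local indices are enough. */
    static inline __m512i shuffle_byte_m128_sketch(__m512i idx, __m512i data) {
        return _mm512_shuffle_epi8(data, idx);     /* pshufb_m512(b, a) */
    }

    /* Cross-lane shuffle for the wider states (needs AVX512VBMI). */
    static inline __m512i shuffle_byte_m512_sketch(__m512i idx, __m512i data) {
        return _mm512_permutexvar_epi8(idx, data); /* vpermb512(a, b) */
    }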