Limex: exception handling with AVX512

This commit is contained in:
Wang Xiang W 2020-04-24 11:51:34 -04:00 committed by Konstantinos Margaritis
parent 001b7824d2
commit 5f930b267c
5 changed files with 169 additions and 9 deletions

View File

@ -1922,7 +1922,8 @@ struct Factory {
}
static
void writeExceptions(const map<ExceptionProto, vector<u32>> &exceptionMap,
void writeExceptions(const build_info &args,
const map<ExceptionProto, vector<u32>> &exceptionMap,
const vector<u32> &repeatOffsets, implNFA_t *limex,
const u32 exceptionsOffset,
const u32 reportListOffset) {
@ -1974,6 +1975,59 @@ struct Factory {
limex->exceptionOffset = exceptionsOffset;
limex->exceptionCount = ecount;
if (args.num_states > 64 && args.cc.target_info.has_avx512vbmi()) {
const u8 *exceptionMask = (const u8 *)(&limex->exceptionMask);
u8 *shufMask = (u8 *)&limex->exceptionShufMask;
u8 *bitMask = (u8 *)&limex->exceptionBitMask;
u8 *andMask = (u8 *)&limex->exceptionAndMask;
u32 tot_cnt = 0;
u32 pos = 0;
bool valid = true;
size_t tot = sizeof(limex->exceptionMask);
size_t base = 0;
// We normally have up to 64 exceptions to handle,
// but treat 384-state Limex differently to simplify operations
size_t limit = 64;
if (args.num_states > 256 && args.num_states <= 384) {
limit = 48;
}
for (size_t i = 0; i < tot; i++) {
if (!exceptionMask[i]) {
continue;
}
u32 bit_cnt = popcount32(exceptionMask[i]);
tot_cnt += bit_cnt;
if (tot_cnt > limit) {
valid = false;
break;
}
u32 emsk = exceptionMask[i];
while (emsk) {
u32 t = findAndClearLSB_32(&emsk);
bitMask[pos] = 1U << t;
andMask[pos] = 1U << t;
shufMask[pos++] = i + base;
if (pos == 32 &&
(args.num_states > 128 && args.num_states <= 256)) {
base += 32;
}
}
}
// Avoid matching unused bytes
for (u32 i = pos; i < 64; i++) {
bitMask[i] = 0xff;
}
if (valid) {
setLimexFlag(limex, LIMEX_FLAG_EXTRACT_EXP);
}
}
}
static
@ -2299,7 +2353,7 @@ struct Factory {
writeRepeats(repeats, repeatOffsets, limex, repeatOffsetsOffset,
repeatsOffset);
writeExceptions(exceptionMap, repeatOffsets, limex, exceptionsOffset,
writeExceptions(args, exceptionMap, repeatOffsets, limex, exceptionsOffset,
reportListOffset);
writeLimexMasks(args, limex);
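
The compile-side change above builds three 64-byte tables (shuffle, AND, bit) so the runtime can test up to 64 exceptions with a single byte-wise compare. Below is a minimal standalone sketch of that construction, not the Hyperscan code itself: u8/u32, popcount32() and findAndClearLSB_32() are replaced with stdint types and GCC builtins, and the limit/base_step parameters stand in for the num_states-dependent logic in writeExceptions().

#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>

/* Sketch of the table construction (illustrative only). exceptionMask is
 * viewed byte-wise; for every exception bit we record:
 *   shufMask[pos] - which state byte holds the bit,
 *   andMask[pos]  - a byte isolating that bit,
 *   bitMask[pos]  - the value the isolated byte must equal.
 * Returns false if there are more exceptions than the 64-lane compare holds. */
static bool build_exception_tables(const uint8_t *exceptionMask, size_t mask_bytes,
                                   size_t limit,      /* 64, or 48 for 384-state */
                                   size_t base_step,  /* 32 for the 256-state case */
                                   uint8_t shufMask[64], uint8_t andMask[64],
                                   uint8_t bitMask[64]) {
    size_t pos = 0, total = 0, base = 0;
    for (size_t i = 0; i < mask_bytes; i++) {
        uint32_t emsk = exceptionMask[i];
        if (!emsk) {
            continue;
        }
        total += (size_t)__builtin_popcount(emsk);
        if (total > limit) {
            return false;               /* too many: fall back to the generic path */
        }
        while (emsk) {
            uint32_t t = (uint32_t)__builtin_ctz(emsk); /* findAndClearLSB_32 */
            emsk &= emsk - 1;
            bitMask[pos] = (uint8_t)(1U << t);
            andMask[pos] = (uint8_t)(1U << t);
            shufMask[pos++] = (uint8_t)(i + base);
            if (pos == 32 && base_step) {
                base += base_step;
            }
        }
    }
    /* Pad unused comparator lanes so they can never match a state byte. */
    for (size_t i = pos; i < 64; i++) {
        bitMask[i] = 0xff;
    }
    return true;
}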

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -47,6 +47,8 @@
#define AND_STATE JOIN(and_, STATE_T)
#define EQ_STATE(a, b) (!JOIN(noteq_, STATE_T)((a), (b)))
#define OR_STATE JOIN(or_, STATE_T)
#define EXPAND_STATE JOIN(expand_, STATE_T)
#define SHUFFLE_BYTE_STATE JOIN(shuffle_byte_, STATE_T)
#define TESTBIT_STATE JOIN(testbit_, STATE_T)
#define EXCEPTION_T JOIN(struct NFAException, SIZE)
#define CONTEXT_T JOIN(NFAContext, SIZE)
@ -208,7 +210,7 @@ int RUN_EXCEPTION_FN(const EXCEPTION_T *e, STATE_ARG,
/** \brief Process all of the exceptions associated with the states in the \a
* estate. */
static really_inline
int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ,
int PE_FN(STATE_ARG, ESTATE_ARG, UNUSED u32 diffmask, STATE_T *succ,
const struct IMPL_NFA_T *limex, const EXCEPTION_T *exceptions,
u64a offset, struct CONTEXT_T *ctx, char in_rev, char flags) {
assert(diffmask > 0); // guaranteed by caller macro
@ -233,6 +235,72 @@ int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ,
ctx->local_succ = ZERO_STATE;
#endif
struct proto_cache new_cache = {0, NULL};
enum CacheResult cacheable = CACHE_RESULT;
#if defined(HAVE_AVX512VBMI) && SIZE > 64
if (likely(limex->flags & LIMEX_FLAG_EXTRACT_EXP)) {
m512 emask = EXPAND_STATE(*STATE_ARG_P);
emask = SHUFFLE_BYTE_STATE(load_m512(&limex->exceptionShufMask), emask);
emask = and512(emask, load_m512(&limex->exceptionAndMask));
u64a word = eq512mask(emask, load_m512(&limex->exceptionBitMask));
do {
u32 bit = FIND_AND_CLEAR_FN(&word);
const EXCEPTION_T *e = &exceptions[bit];
if (!RUN_EXCEPTION_FN(e, STATE_ARG_NAME, succ,
#ifndef BIG_MODEL
&local_succ,
#endif
limex, offset, ctx, &new_cache, &cacheable,
in_rev, flags)) {
return PE_RV_HALT;
}
} while (word);
} else {
// A copy of the estate as an array of GPR-sized chunks.
CHUNK_T chunks[sizeof(STATE_T) / sizeof(CHUNK_T)];
CHUNK_T emask_chunks[sizeof(STATE_T) / sizeof(CHUNK_T)];
#ifdef ESTATE_ON_STACK
memcpy(chunks, &estate, sizeof(STATE_T));
#else
memcpy(chunks, estatep, sizeof(STATE_T));
#endif
memcpy(emask_chunks, &limex->exceptionMask, sizeof(STATE_T));
u32 base_index[sizeof(STATE_T) / sizeof(CHUNK_T)];
base_index[0] = 0;
for (s32 i = 0; i < (s32)ARRAY_LENGTH(base_index) - 1; i++) {
base_index[i + 1] = base_index[i] + POPCOUNT_FN(emask_chunks[i]);
}
do {
u32 t = findAndClearLSB_32(&diffmask);
#ifdef ARCH_64_BIT
t >>= 1; // Due to diffmask64, which leaves holes in the bitmask.
#endif
assert(t < ARRAY_LENGTH(chunks));
CHUNK_T word = chunks[t];
assert(word != 0);
do {
u32 bit = FIND_AND_CLEAR_FN(&word);
u32 local_index = RANK_IN_MASK_FN(emask_chunks[t], bit);
u32 idx = local_index + base_index[t];
const EXCEPTION_T *e = &exceptions[idx];
if (!RUN_EXCEPTION_FN(e, STATE_ARG_NAME, succ,
#ifndef BIG_MODEL
&local_succ,
#endif
limex, offset, ctx, &new_cache, &cacheable,
in_rev, flags)) {
return PE_RV_HALT;
}
} while (word);
} while (diffmask);
}
#else
// A copy of the estate as an array of GPR-sized chunks.
CHUNK_T chunks[sizeof(STATE_T) / sizeof(CHUNK_T)];
CHUNK_T emask_chunks[sizeof(STATE_T) / sizeof(CHUNK_T)];
@ -243,9 +311,6 @@ int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ,
#endif
memcpy(emask_chunks, &limex->exceptionMask, sizeof(STATE_T));
struct proto_cache new_cache = {0, NULL};
enum CacheResult cacheable = CACHE_RESULT;
u32 base_index[sizeof(STATE_T) / sizeof(CHUNK_T)];
base_index[0] = 0;
for (s32 i = 0; i < (s32)ARRAY_LENGTH(base_index) - 1; i++) {
@ -276,6 +341,7 @@ int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ,
}
} while (word);
} while (diffmask);
#endif
#ifndef BIG_MODEL
*succ = OR_STATE(*succ, local_succ);
@ -307,6 +373,8 @@ int PE_FN(STATE_ARG, ESTATE_ARG, u32 diffmask, STATE_T *succ,
#undef AND_STATE
#undef EQ_STATE
#undef OR_STATE
#undef EXPAND_STATE
#undef SHUFFLE_BYTE_STATE
#undef TESTBIT_STATE
#undef PE_FN
#undef RUN_EXCEPTION_FN
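
On the runtime side, when LIMEX_FLAG_EXTRACT_EXP is set the new branch widens the state to 512 bits, gathers the exception-bearing state bytes with a byte shuffle, masks out everything but the trigger bits, and byte-compares against the expected pattern, leaving a 64-bit word with one set bit per live exception. A rough intrinsics-only sketch of that sequence follows; it assumes AVX512VBMI, and the function name and table pointers are illustrative rather than Hyperscan's macro-expanded code.

#include <immintrin.h>
#include <stdint.h>

/* Sketch of the VBMI fast path: returns a 64-bit word whose set bits are the
 * indices of currently active exceptions (illustrative, not the real PE_FN). */
static inline uint64_t active_exception_word(__m512i expanded_state,
                                             const void *shufMask,
                                             const void *andMask,
                                             const void *bitMask) {
    /* Gather, for each exception slot, the state byte that holds its bit. */
    __m512i gathered = _mm512_permutexvar_epi8(_mm512_loadu_si512(shufMask),
                                               expanded_state);
    /* Keep only the trigger bit in each gathered byte... */
    __m512i isolated = _mm512_and_si512(gathered, _mm512_loadu_si512(andMask));
    /* ...and compare byte-wise against the expected value: one result bit per
     * exception slot. Padding lanes hold 0xff and can never compare equal. */
    return (uint64_t)_mm512_cmpeq_epi8_mask(isolated, _mm512_loadu_si512(bitMask));
}

The caller then peels set bits off the resulting word with FIND_AND_CLEAR_FN and indexes straight into the exceptions array (the do/while over `word` in the diff), instead of walking GPR-sized chunks of the estate and rank-counting into the exception table.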

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2017, Intel Corporation
* Copyright (c) 2015-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -86,6 +86,7 @@
#define LIMEX_FLAG_COMPRESS_STATE 1 /**< pack state into stream state */
#define LIMEX_FLAG_COMPRESS_MASKED 2 /**< use reach mask-based compression */
#define LIMEX_FLAG_CANNOT_DIE 4 /**< limex cannot have no states on */
#define LIMEX_FLAG_EXTRACT_EXP 8 /**< use limex exception bit extraction */
enum LimExTrigger {
LIMEX_TRIGGER_NONE = 0,
@ -157,6 +158,9 @@ struct LimExNFA##size { \
u_##size shift[MAX_SHIFT_COUNT]; \
u32 shiftCount; /**< number of shift masks used */ \
u8 shiftAmount[MAX_SHIFT_COUNT]; /**< shift amount for each mask */ \
m512 exceptionShufMask; /**< exception byte shuffle mask */ \
m512 exceptionBitMask; /**< exception bit mask */ \
m512 exceptionAndMask; /**< exception and mask */ \
};
CREATE_NFA_LIMEX(32)

View File

@ -187,6 +187,12 @@ static really_inline m128 or128(m128 a, m128 b) {
return _mm_or_si128(a,b);
}
#if defined(HAVE_AVX512VBMI)
static really_inline m512 expand128(m128 a) {
return _mm512_broadcast_i32x4(a);
}
#endif
static really_inline m128 andnot128(m128 a, m128 b) {
return _mm_andnot_si128(a, b);
}
@ -374,6 +380,12 @@ static really_inline m256 or256(m256 a, m256 b) {
return _mm256_or_si256(a, b);
}
#if defined(HAVE_AVX512VBMI)
static really_inline m512 expand256(m256 a) {
return _mm512_broadcast_i64x4(a);
}
#endif
static really_inline m256 xor256(m256 a, m256 b) {
return _mm256_xor_si256(a, b);
}
@ -684,6 +696,16 @@ m512 or512(m512 a, m512 b) {
return _mm512_or_si512(a, b);
}
#if defined(HAVE_AVX512VBMI)
static really_inline m512 expand384(m384 a) {
u64a *lo = (u64a*)&a.lo;
u64a *mid = (u64a*)&a.mid;
u64a *hi = (u64a*)&a.hi;
return _mm512_set_epi64(0ULL, 0ULL, hi[1], hi[0], mid[1], mid[0],
lo[1], lo[0]);
}
#endif
static really_inline
m512 xor512(m512 a, m512 b) {
return _mm512_xor_si512(a, b);
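
The expand*() helpers above widen a state vector into the m512 operand that the byte shuffle needs: expand128 broadcasts the 128-bit state into all four lanes, expand256 duplicates the 256-bit state into both halves, and expand384 zero-extends it into the top 128 bits. A tiny self-check sketch of what expand256 is expected to produce, assuming Hyperscan's m256/m512 typedefs and an AVX512VBMI build (the check itself is illustrative):

#include <string.h>

/* Illustrative check: both 32-byte halves of expand256(s) should be copies of s. */
static int expand256_duplicates_state(m256 state) {
    m512 widened = expand256(state); /* _mm512_broadcast_i64x4 */
    unsigned char bytes[64];
    memcpy(bytes, &widened, sizeof(bytes));
    return memcmp(bytes, &state, sizeof(state)) == 0 &&
           memcmp(bytes + 32, &state, sizeof(state)) == 0;
}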

View File

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015-2016, Intel Corporation
* Copyright (c) 2015-2020, Intel Corporation
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
@ -101,6 +101,18 @@
#define or_m384(a, b) (or384(a, b))
#define or_m512(a, b) (or512(a, b))
#if defined(HAVE_AVX512VBMI)
#define expand_m128(a) (expand128(a))
#define expand_m256(a) (expand256(a))
#define expand_m384(a) (expand384(a))
#define expand_m512(a) (a)
#define shuffle_byte_m128(a, b) (pshufb_m512(b, a))
#define shuffle_byte_m256(a, b) (vpermb512(a, b))
#define shuffle_byte_m384(a, b) (vpermb512(a, b))
#define shuffle_byte_m512(a, b) (vpermb512(a, b))
#endif
#define and_u8(a, b) ((a) & (b))
#define and_u32(a, b) ((a) & (b))
#define and_u64a(a, b) ((a) & (b))