diff --git a/src/nfa/nfa_api_dispatch.c b/src/nfa/nfa_api_dispatch.c index f4b7552e..6786cbaf 100644 --- a/src/nfa/nfa_api_dispatch.c +++ b/src/nfa/nfa_api_dispatch.c @@ -76,6 +76,7 @@ DISPATCH_CASE(TAMARAMA_NFA, Tamarama, dbnt_func); \ DISPATCH_CASE(MCSHENG_NFA_8, McSheng8, dbnt_func); \ DISPATCH_CASE(MCSHENG_NFA_16, McSheng16, dbnt_func); \ + DISPATCH_CASE(SHENG_NFA_32, Sheng32, dbnt_func); \ default: \ assert(0); \ } diff --git a/src/nfa/nfa_build_util.cpp b/src/nfa/nfa_build_util.cpp index 9185ccdd..0ce6512e 100644 --- a/src/nfa/nfa_build_util.cpp +++ b/src/nfa/nfa_build_util.cpp @@ -446,6 +446,22 @@ const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = const char *NFATraits::name = "Shengy McShengFace 16"; #endif +template<> struct NFATraits { + UNUSED static const char *name; + static const NFACategory category = NFA_OTHER; + static const u32 stateAlign = 1; + static const bool fast = true; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; +}; +const nfa_dispatch_fn NFATraits::has_accel = has_accel_sheng; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; +#if defined(DUMP_SUPPORT) +const char *NFATraits::name = "Sheng 32"; +#endif + } // namespace #if defined(DUMP_SUPPORT) diff --git a/src/nfa/nfa_dump_dispatch.cpp b/src/nfa/nfa_dump_dispatch.cpp index 5607ed27..07dc5347 100644 --- a/src/nfa/nfa_dump_dispatch.cpp +++ b/src/nfa/nfa_dump_dispatch.cpp @@ -81,6 +81,7 @@ namespace ue2 { DISPATCH_CASE(TAMARAMA_NFA, Tamarama, dbnt_func); \ DISPATCH_CASE(MCSHENG_NFA_8, McSheng8, dbnt_func); \ DISPATCH_CASE(MCSHENG_NFA_16, McSheng16, dbnt_func); \ + DISPATCH_CASE(SHENG_NFA_32, Sheng32, dbnt_func); \ default: \ assert(0); \ } diff --git a/src/nfa/nfa_internal.h b/src/nfa/nfa_internal.h index 9d280822..0ec0b9d7 100644 --- a/src/nfa/nfa_internal.h +++ b/src/nfa/nfa_internal.h @@ -72,6 +72,7 @@ enum NFAEngineType { TAMARAMA_NFA, /**< magic nfa container */ MCSHENG_NFA_8, /**< magic pseudo nfa */ MCSHENG_NFA_16, /**< magic pseudo nfa */ + SHENG_NFA_32, /**< magic pseudo nfa */ /** \brief bogus NFA - not used */ INVALID_NFA }; @@ -157,10 +158,26 @@ static really_inline int isGoughType(u8 t) { } /** \brief True if the given type (from NFA::type) is a Sheng DFA. */ -static really_inline int isShengType(u8 t) { +static really_inline int isSheng16Type(u8 t) { return t == SHENG_NFA; } +#if defined(HAVE_AVX512VBMI) +/** \brief True if the given type (from NFA::type) is a Sheng32 DFA. */ +static really_inline int isSheng32Type(u8 t) { + return t == SHENG_NFA_32; +} +#endif + +/** \brief True if the given type (from NFA::type) is a Sheng/Sheng32 DFA. */ +static really_inline int isShengType(u8 t) { +#if defined(HAVE_AVX512VBMI) + return t == SHENG_NFA || t == SHENG_NFA_32; +#else + return t == SHENG_NFA; +#endif +} + /** * \brief True if the given type (from NFA::type) is a McClellan, Gough or * Sheng DFA. diff --git a/src/nfa/sheng.c b/src/nfa/sheng.c index 4f30910b..7d302206 100644 --- a/src/nfa/sheng.c +++ b/src/nfa/sheng.c @@ -154,6 +154,110 @@ char fireReports(const struct sheng *sh, NfaCallback cb, void *ctxt, return MO_CONTINUE_MATCHING; /* continue execution */ } +#if defined(HAVE_AVX512VBMI) +static really_inline +const struct sheng32 *get_sheng32(const struct NFA *n) { + return (const struct sheng32 *)getImplNfa(n); +} + +static really_inline +const struct sstate_aux *get_aux32(const struct sheng32 *sh, u8 id) { + u32 offset = sh->aux_offset - sizeof(struct NFA) + + (id & SHENG32_STATE_MASK) * sizeof(struct sstate_aux); + DEBUG_PRINTF("Getting aux for state %u at offset %llu\n", + id & SHENG32_STATE_MASK, (u64a)offset + sizeof(struct NFA)); + return (const struct sstate_aux *)((const char *) sh + offset); +} + +static really_inline +const union AccelAux *get_accel32(const struct sheng32 *sh, u8 id) { + const struct sstate_aux *saux = get_aux32(sh, id); + DEBUG_PRINTF("Getting accel aux at offset %u\n", saux->accel); + const union AccelAux *aux = (const union AccelAux *) + ((const char *)sh + saux->accel - sizeof(struct NFA)); + return aux; +} + +static really_inline +const struct report_list *get_rl32(const struct sheng32 *sh, + const struct sstate_aux *aux) { + DEBUG_PRINTF("Getting report list at offset %u\n", aux->accept); + return (const struct report_list *) + ((const char *)sh + aux->accept - sizeof(struct NFA)); +} + +static really_inline +const struct report_list *get_eod_rl32(const struct sheng32 *sh, + const struct sstate_aux *aux) { + DEBUG_PRINTF("Getting EOD report list at offset %u\n", aux->accept); + return (const struct report_list *) + ((const char *)sh + aux->accept_eod - sizeof(struct NFA)); +} + +static really_inline +char sheng32HasAccept(const struct sheng32 *sh, const struct sstate_aux *aux, + ReportID report) { + assert(sh && aux); + + const struct report_list *rl = get_rl32(sh, aux); + assert(ISALIGNED_N(rl, 4)); + + DEBUG_PRINTF("report list has %u entries\n", rl->count); + + for (u32 i = 0; i < rl->count; i++) { + if (rl->report[i] == report) { + DEBUG_PRINTF("reporting %u\n", rl->report[i]); + return 1; + } + } + + return 0; +} + +static really_inline +char fireReports32(const struct sheng32 *sh, NfaCallback cb, void *ctxt, + const u8 state, u64a loc, u8 *const cached_accept_state, + ReportID *const cached_accept_id, char eod) { + DEBUG_PRINTF("reporting matches @ %llu\n", loc); + + if (!eod && state == *cached_accept_state) { + DEBUG_PRINTF("reporting %u\n", *cached_accept_id); + if (cb(0, loc, *cached_accept_id, ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + const struct sstate_aux *aux = get_aux32(sh, state); + const struct report_list *rl = eod ? get_eod_rl32(sh, aux) : + get_rl32(sh, aux); + assert(ISALIGNED(rl)); + + DEBUG_PRINTF("report list has %u entries\n", rl->count); + u32 count = rl->count; + + if (!eod && count == 1) { + *cached_accept_state = state; + *cached_accept_id = rl->report[0]; + + DEBUG_PRINTF("reporting %u\n", rl->report[0]); + if (cb(0, loc, rl->report[0], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + + for (u32 i = 0; i < count; i++) { + DEBUG_PRINTF("reporting %u\n", rl->report[i]); + if (cb(0, loc, rl->report[i], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + } + return MO_CONTINUE_MATCHING; /* continue execution */ +} +#endif // end of HAVE_AVX512VBMI + /* include Sheng function definitions */ #include "sheng_defs.h" @@ -671,3 +775,523 @@ char nfaExecSheng_expandState(UNUSED const struct NFA *nfa, void *dest, *(u8 *)dest = *(const u8 *)src; return 0; } + +#if defined(HAVE_AVX512VBMI) +static really_inline +char runSheng32Cb(const struct sheng32 *sh, NfaCallback cb, void *ctxt, + u64a offset, u8 *const cached_accept_state, + ReportID *const cached_accept_id, const u8 *cur_buf, + const u8 *start, const u8 *end, u8 can_die, + u8 has_accel, u8 single, const u8 **scanned, u8 *state) { + DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in callback mode\n", + (u64a)(end - start), offset); + DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf), + (s64a)(end - cur_buf)); + DEBUG_PRINTF("can die: %u has accel: %u single: %u\n", !!can_die, + !!has_accel, !!single); + int rv; + /* scan and report all matches */ + if (can_die) { + if (has_accel) { + rv = sheng32_4_coda(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } else { + rv = sheng32_4_cod(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + rv = sheng32_cod(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + *scanned, end, scanned); + } else { + if (has_accel) { + rv = sheng32_4_coa(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } else { + rv = sheng32_4_co(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + rv = sheng32_co(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + *scanned, end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + return MO_ALIVE; +} + +static really_inline +void runSheng32Nm(const struct sheng32 *sh, NfaCallback cb, void *ctxt, + u64a offset, u8 *const cached_accept_state, + ReportID *const cached_accept_id, const u8 *cur_buf, + const u8 *start, const u8 *end, u8 can_die, u8 has_accel, + u8 single, const u8 **scanned, u8 *state) { + DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in nomatch mode\n", + (u64a)(end - start), offset); + DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf), + (s64a)(end - cur_buf)); + DEBUG_PRINTF("can die: %u has accel: %u single: %u\n", !!can_die, + !!has_accel, !!single); + /* just scan the buffer */ + if (can_die) { + if (has_accel) { + sheng32_4_nmda(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } else { + sheng32_4_nmd(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } + sheng32_nmd(state, cb, ctxt, sh, cached_accept_state, cached_accept_id, + single, offset, cur_buf, *scanned, end, scanned); + } else { + sheng32_4_nm(state, cb, ctxt, sh, cached_accept_state, cached_accept_id, + single, offset, cur_buf, start, end, scanned); + sheng32_nm(state, cb, ctxt, sh, cached_accept_state, cached_accept_id, + single, offset, cur_buf, *scanned, end, scanned); + } +} + +static really_inline +char runSheng32Sam(const struct sheng32 *sh, NfaCallback cb, void *ctxt, + u64a offset, u8 *const cached_accept_state, + ReportID *const cached_accept_id, const u8 *cur_buf, + const u8 *start, const u8 *end, u8 can_die, u8 has_accel, + u8 single, const u8 **scanned, u8 *state) { + DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in stop at match mode\n", + (u64a)(end - start), offset); + DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf), + (s64a)(end - cur_buf)); + DEBUG_PRINTF("can die: %u has accel: %u single: %u\n", !!can_die, + !!has_accel, !!single); + int rv; + /* scan until first match */ + if (can_die) { + if (has_accel) { + rv = sheng32_4_samda(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } else { + rv = sheng32_4_samd(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + /* if we stopped before we expected, we found a match */ + if (rv == MO_MATCHES_PENDING) { + return MO_MATCHES_PENDING; + } + + rv = sheng32_samd(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + *scanned, end, scanned); + } else { + if (has_accel) { + rv = sheng32_4_sama(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } else { + rv = sheng32_4_sam(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + /* if we stopped before we expected, we found a match */ + if (rv == MO_MATCHES_PENDING) { + return MO_MATCHES_PENDING; + } + + rv = sheng32_sam(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + *scanned, end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + /* if we stopped before we expected, we found a match */ + if (rv == MO_MATCHES_PENDING) { + return MO_MATCHES_PENDING; + } + return MO_ALIVE; +} + +static never_inline +char runSheng32(const struct sheng32 *sh, struct mq *q, s64a b_end, + enum MatchMode mode) { + u8 state = *(u8 *)q->state; + u8 can_die = sh->flags & SHENG_FLAG_CAN_DIE; + u8 has_accel = sh->flags & SHENG_FLAG_HAS_ACCEL; + u8 single = sh->flags & SHENG_FLAG_SINGLE_REPORT; + + u8 cached_accept_state = 0; + ReportID cached_accept_id = 0; + + DEBUG_PRINTF("starting Sheng32 execution in state %u\n", + state & SHENG32_STATE_MASK); + + if (q->report_current) { + DEBUG_PRINTF("reporting current pending matches\n"); + assert(sh); + + q->report_current = 0; + + int rv; + if (single) { + rv = fireSingleReport(q->cb, q->context, sh->report, + q_cur_offset(q)); + } else { + rv = fireReports32(sh, q->cb, q->context, state, q_cur_offset(q), + &cached_accept_state, &cached_accept_id, 0); + } + if (rv == MO_HALT_MATCHING) { + DEBUG_PRINTF("exiting in state %u\n", state & SHENG32_STATE_MASK); + return MO_DEAD; + } + + DEBUG_PRINTF("proceeding with matching\n"); + } + + assert(q_cur_type(q) == MQE_START); + s64a start = q_cur_loc(q); + + DEBUG_PRINTF("offset: %lli, location: %lli, mode: %s\n", q->offset, start, + mode == CALLBACK_OUTPUT ? "CALLBACK OUTPUT" : + mode == NO_MATCHES ? "NO MATCHES" : + mode == STOP_AT_MATCH ? "STOP AT MATCH" : "???"); + + DEBUG_PRINTF("processing event @ %lli: %s\n", q->offset + q_cur_loc(q), + q_cur_type(q) == MQE_START ? "START" : + q_cur_type(q) == MQE_TOP ? "TOP" : + q_cur_type(q) == MQE_END ? "END" : "???"); + + const u8* cur_buf; + if (start < 0) { + DEBUG_PRINTF("negative location, scanning history\n"); + DEBUG_PRINTF("min location: %zd\n", -q->hlength); + cur_buf = q->history + q->hlength; + } else { + DEBUG_PRINTF("positive location, scanning buffer\n"); + DEBUG_PRINTF("max location: %lli\n", b_end); + cur_buf = q->buffer; + } + + /* if we our queue event is past our end */ + if (mode != NO_MATCHES && q_cur_loc(q) > b_end) { + DEBUG_PRINTF("current location past buffer end\n"); + DEBUG_PRINTF("setting q location to %llu\n", b_end); + DEBUG_PRINTF("exiting in state %u\n", state & SHENG32_STATE_MASK); + q->items[q->cur].location = b_end; + return MO_ALIVE; + } + + q->cur++; + + s64a cur_start = start; + + while (1) { + DEBUG_PRINTF("processing event @ %lli: %s\n", q->offset + q_cur_loc(q), + q_cur_type(q) == MQE_START ? "START" : + q_cur_type(q) == MQE_TOP ? "TOP" : + q_cur_type(q) == MQE_END ? "END" : "???"); + s64a end = q_cur_loc(q); + if (mode != NO_MATCHES) { + end = MIN(end, b_end); + } + assert(end <= (s64a) q->length); + s64a cur_end = end; + + /* we may cross the border between history and current buffer */ + if (cur_start < 0) { + cur_end = MIN(0, cur_end); + } + + DEBUG_PRINTF("start: %lli end: %lli\n", start, end); + + /* don't scan zero length buffer */ + if (cur_start != cur_end) { + const u8 * scanned = cur_buf; + char rv; + + if (mode == NO_MATCHES) { + runSheng32Nm(sh, q->cb, q->context, q->offset, + &cached_accept_state, &cached_accept_id, cur_buf, + cur_buf + cur_start, cur_buf + cur_end, can_die, + has_accel, single, &scanned, &state); + } else if (mode == CALLBACK_OUTPUT) { + rv = runSheng32Cb(sh, q->cb, q->context, q->offset, + &cached_accept_state, &cached_accept_id, + cur_buf, cur_buf + cur_start, cur_buf + cur_end, + can_die, has_accel, single, &scanned, &state); + if (rv == MO_DEAD) { + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG32_STATE_MASK); + return MO_DEAD; + } + } else if (mode == STOP_AT_MATCH) { + rv = runSheng32Sam(sh, q->cb, q->context, q->offset, + &cached_accept_state, &cached_accept_id, + cur_buf, cur_buf + cur_start, + cur_buf + cur_end, can_die, has_accel, single, + &scanned, &state); + if (rv == MO_DEAD) { + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG32_STATE_MASK); + return rv; + } else if (rv == MO_MATCHES_PENDING) { + assert(q->cur); + DEBUG_PRINTF("found a match, setting q location to %zd\n", + scanned - cur_buf + 1); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = + scanned - cur_buf + 1; /* due to exiting early */ + *(u8 *)q->state = state; + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG32_STATE_MASK); + return rv; + } + } else { + assert(!"invalid scanning mode!"); + } + assert(scanned == cur_buf + cur_end); + + cur_start = cur_end; + } + + /* if we our queue event is past our end */ + if (mode != NO_MATCHES && q_cur_loc(q) > b_end) { + DEBUG_PRINTF("current location past buffer end\n"); + DEBUG_PRINTF("setting q location to %llu\n", b_end); + DEBUG_PRINTF("exiting in state %u\n", state & SHENG32_STATE_MASK); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = b_end; + *(u8 *)q->state = state; + return MO_ALIVE; + } + + /* crossing over into actual buffer */ + if (cur_start == 0) { + DEBUG_PRINTF("positive location, scanning buffer\n"); + DEBUG_PRINTF("max offset: %lli\n", b_end); + cur_buf = q->buffer; + } + + /* continue scanning the same buffer */ + if (end != cur_end) { + continue; + } + + switch (q_cur_type(q)) { + case MQE_END: + *(u8 *)q->state = state; + q->cur++; + DEBUG_PRINTF("exiting in state %u\n", state & SHENG32_STATE_MASK); + if (can_die) { + return (state & SHENG32_STATE_DEAD) ? MO_DEAD : MO_ALIVE; + } + return MO_ALIVE; + case MQE_TOP: + if (q->offset + cur_start == 0) { + DEBUG_PRINTF("Anchored start, going to state %u\n", + sh->anchored); + state = sh->anchored; + } else { + u8 new_state = get_aux32(sh, state)->top; + DEBUG_PRINTF("Top event %u->%u\n", state & SHENG32_STATE_MASK, + new_state & SHENG32_STATE_MASK); + state = new_state; + } + break; + default: + assert(!"invalid queue event"); + break; + } + q->cur++; + } +} + +char nfaExecSheng32_B(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context) { + DEBUG_PRINTF("smallwrite Sheng32\n"); + assert(n->type == SHENG_NFA_32); + const struct sheng32 *sh = getImplNfa(n); + u8 state = sh->anchored; + u8 can_die = sh->flags & SHENG_FLAG_CAN_DIE; + u8 has_accel = sh->flags & SHENG_FLAG_HAS_ACCEL; + u8 single = sh->flags & SHENG_FLAG_SINGLE_REPORT; + u8 cached_accept_state = 0; + ReportID cached_accept_id = 0; + + /* scan and report all matches */ + int rv; + s64a end = length; + const u8 *scanned; + + rv = runSheng32Cb(sh, cb, context, offset, &cached_accept_state, + &cached_accept_id, buffer, buffer, buffer + end, can_die, + has_accel, single, &scanned, &state); + if (rv == MO_DEAD) { + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG32_STATE_MASK); + return MO_DEAD; + } + + DEBUG_PRINTF("%u\n", state & SHENG32_STATE_MASK); + + const struct sstate_aux *aux = get_aux32(sh, state); + + if (aux->accept_eod) { + DEBUG_PRINTF("Reporting EOD matches\n"); + fireReports32(sh, cb, context, state, end + offset, + &cached_accept_state, &cached_accept_id, 1); + } + + return state & SHENG32_STATE_DEAD ? MO_DEAD : MO_ALIVE; +} + +char nfaExecSheng32_Q(const struct NFA *n, struct mq *q, s64a end) { + const struct sheng32 *sh = get_sheng32(n); + char rv = runSheng32(sh, q, end, CALLBACK_OUTPUT); + return rv; +} + +char nfaExecSheng32_Q2(const struct NFA *n, struct mq *q, s64a end) { + const struct sheng32 *sh = get_sheng32(n); + char rv = runSheng32(sh, q, end, STOP_AT_MATCH); + return rv; +} + +char nfaExecSheng32_QR(const struct NFA *n, struct mq *q, ReportID report) { + assert(q_cur_type(q) == MQE_START); + + const struct sheng32 *sh = get_sheng32(n); + char rv = runSheng32(sh, q, 0 /* end */, NO_MATCHES); + + if (rv && nfaExecSheng32_inAccept(n, report, q)) { + return MO_MATCHES_PENDING; + } + return rv; +} + +char nfaExecSheng32_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + assert(n && q); + + const struct sheng32 *sh = get_sheng32(n); + u8 s = *(const u8 *)q->state; + DEBUG_PRINTF("checking accepts for %u\n", (u8)(s & SHENG32_STATE_MASK)); + + const struct sstate_aux *aux = get_aux32(sh, s); + + if (!aux->accept) { + return 0; + } + + return sheng32HasAccept(sh, aux, report); +} + +char nfaExecSheng32_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + + const struct sheng32 *sh = get_sheng32(n); + u8 s = *(const u8 *)q->state; + DEBUG_PRINTF("checking accepts for %u\n", (u8)(s & SHENG32_STATE_MASK)); + + const struct sstate_aux *aux = get_aux32(sh, s); + return !!aux->accept; +} + +char nfaExecSheng32_testEOD(const struct NFA *nfa, const char *state, + UNUSED const char *streamState, u64a offset, + NfaCallback cb, void *ctxt) { + assert(nfa); + + const struct sheng32 *sh = get_sheng32(nfa); + u8 s = *(const u8 *)state; + DEBUG_PRINTF("checking EOD accepts for %u\n", (u8)(s & SHENG32_STATE_MASK)); + + const struct sstate_aux *aux = get_aux32(sh, s); + + if (!aux->accept_eod) { + return MO_CONTINUE_MATCHING; + } + + return fireReports32(sh, cb, ctxt, s, offset, NULL, NULL, 1); +} + +char nfaExecSheng32_reportCurrent(const struct NFA *n, struct mq *q) { + const struct sheng32 *sh = (const struct sheng32 *)getImplNfa(n); + NfaCallback cb = q->cb; + void *ctxt = q->context; + u8 s = *(u8 *)q->state; + const struct sstate_aux *aux = get_aux32(sh, s); + u64a offset = q_cur_offset(q); + u8 cached_state_id = 0; + ReportID cached_report_id = 0; + assert(q_cur_type(q) == MQE_START); + + if (aux->accept) { + if (sh->flags & SHENG_FLAG_SINGLE_REPORT) { + fireSingleReport(cb, ctxt, sh->report, offset); + } else { + fireReports32(sh, cb, ctxt, s, offset, &cached_state_id, + &cached_report_id, 0); + } + } + + return 0; +} + +char nfaExecSheng32_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, UNUSED u8 key) { + const struct sheng32 *sh = get_sheng32(nfa); + u8 *s = (u8 *)state; + *s = offset ? sh->floating: sh->anchored; + return !(*s & SHENG32_STATE_DEAD); +} + +char nfaExecSheng32_queueInitState(const struct NFA *nfa, struct mq *q) { + assert(nfa->scratchStateSize == 1); + + /* starting in floating state */ + const struct sheng32 *sh = get_sheng32(nfa); + *(u8 *)q->state = sh->floating; + DEBUG_PRINTF("starting in floating state\n"); + return 0; +} + +char nfaExecSheng32_queueCompressState(UNUSED const struct NFA *nfa, + const struct mq *q, UNUSED s64a loc) { + void *dest = q->streamState; + const void *src = q->state; + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} + +char nfaExecSheng32_expandState(UNUSED const struct NFA *nfa, void *dest, + const void *src, UNUSED u64a offset, + UNUSED u8 key) { + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} +#endif // end of HAVE_AVX512VBMI diff --git a/src/nfa/sheng.h b/src/nfa/sheng.h index 84a2b6b5..d017bbbc 100644 --- a/src/nfa/sheng.h +++ b/src/nfa/sheng.h @@ -58,4 +58,45 @@ char nfaExecSheng_reportCurrent(const struct NFA *n, struct mq *q); char nfaExecSheng_B(const struct NFA *n, u64a offset, const u8 *buffer, size_t length, NfaCallback cb, void *context); +#if defined(HAVE_AVX512VBMI) +#define nfaExecSheng32_B_Reverse NFA_API_NO_IMPL +#define nfaExecSheng32_zombie_status NFA_API_ZOMBIE_NO_IMPL + +char nfaExecSheng32_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecSheng32_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecSheng32_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecSheng32_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecSheng32_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecSheng32_queueInitState(const struct NFA *nfa, struct mq *q); +char nfaExecSheng32_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecSheng32_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); +char nfaExecSheng32_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, u8 key); +char nfaExecSheng32_testEOD(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecSheng32_reportCurrent(const struct NFA *n, struct mq *q); + +char nfaExecSheng32_B(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context); +#else // !HAVE_AVX512VBMI +#define nfaExecSheng32_B_Reverse NFA_API_NO_IMPL +#define nfaExecSheng32_zombie_status NFA_API_ZOMBIE_NO_IMPL +#define nfaExecSheng32_Q NFA_API_NO_IMPL +#define nfaExecSheng32_Q2 NFA_API_NO_IMPL +#define nfaExecSheng32_QR NFA_API_NO_IMPL +#define nfaExecSheng32_inAccept NFA_API_NO_IMPL +#define nfaExecSheng32_inAnyAccept NFA_API_NO_IMPL +#define nfaExecSheng32_queueInitState NFA_API_NO_IMPL +#define nfaExecSheng32_queueCompressState NFA_API_NO_IMPL +#define nfaExecSheng32_expandState NFA_API_NO_IMPL +#define nfaExecSheng32_initCompressedState NFA_API_NO_IMPL +#define nfaExecSheng32_testEOD NFA_API_NO_IMPL +#define nfaExecSheng32_reportCurrent NFA_API_NO_IMPL +#define nfaExecSheng32_B NFA_API_NO_IMPL +#endif // end of HAVE_AVX512VBMI + #endif /* SHENG_H_ */ diff --git a/src/nfa/sheng_defs.h b/src/nfa/sheng_defs.h index 26bdbcee..ddf76f35 100644 --- a/src/nfa/sheng_defs.h +++ b/src/nfa/sheng_defs.h @@ -37,21 +37,49 @@ u8 isDeadState(const u8 a) { return a & SHENG_STATE_DEAD; } +#if defined(HAVE_AVX512VBMI) +static really_inline +u8 isDeadState32(const u8 a) { + return a & SHENG32_STATE_DEAD; +} +#endif + static really_inline u8 isAcceptState(const u8 a) { return a & SHENG_STATE_ACCEPT; } +#if defined(HAVE_AVX512VBMI) +static really_inline +u8 isAcceptState32(const u8 a) { + return a & SHENG32_STATE_ACCEPT; +} +#endif + static really_inline u8 isAccelState(const u8 a) { return a & SHENG_STATE_ACCEL; } +#if defined(HAVE_AVX512VBMI) +static really_inline +u8 isAccelState32(const u8 a) { + return a & SHENG32_STATE_ACCEL; +} +#endif + static really_inline u8 hasInterestingStates(const u8 a, const u8 b, const u8 c, const u8 d) { return (a | b | c | d) & (SHENG_STATE_FLAG_MASK); } +#if defined(HAVE_AVX512VBMI) +static really_inline +u8 hasInterestingStates32(const u8 a, const u8 b, const u8 c, const u8 d) { + return (a | b | c | d) & (SHENG32_STATE_FLAG_MASK); +} +#endif + /* these functions should be optimized out, used by NO_MATCHES mode */ static really_inline u8 dummyFunc4(UNUSED const u8 a, UNUSED const u8 b, UNUSED const u8 c, @@ -71,66 +99,126 @@ u8 dummyFunc(UNUSED const u8 a) { #define SHENG_IMPL sheng_cod #define DEAD_FUNC isDeadState #define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_cod +#define DEAD_FUNC32 isDeadState32 +#define ACCEPT_FUNC32 isAcceptState32 +#endif #define STOP_AT_MATCH 0 #include "sheng_impl.h" #undef SHENG_IMPL #undef DEAD_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef DEAD_FUNC32 +#undef ACCEPT_FUNC32 +#endif #undef STOP_AT_MATCH /* callback output, can't die */ #define SHENG_IMPL sheng_co #define DEAD_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_co +#define DEAD_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#endif #define STOP_AT_MATCH 0 #include "sheng_impl.h" #undef SHENG_IMPL #undef DEAD_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef DEAD_FUNC32 +#undef ACCEPT_FUNC32 +#endif #undef STOP_AT_MATCH /* stop at match, can die */ #define SHENG_IMPL sheng_samd #define DEAD_FUNC isDeadState #define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_samd +#define DEAD_FUNC32 isDeadState32 +#define ACCEPT_FUNC32 isAcceptState32 +#endif #define STOP_AT_MATCH 1 #include "sheng_impl.h" #undef SHENG_IMPL #undef DEAD_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef DEAD_FUNC32 +#undef ACCEPT_FUNC32 +#endif #undef STOP_AT_MATCH /* stop at match, can't die */ #define SHENG_IMPL sheng_sam #define DEAD_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_sam +#define DEAD_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#endif #define STOP_AT_MATCH 1 #include "sheng_impl.h" #undef SHENG_IMPL #undef DEAD_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef DEAD_FUNC32 +#undef ACCEPT_FUNC32 +#endif #undef STOP_AT_MATCH /* no match, can die */ #define SHENG_IMPL sheng_nmd #define DEAD_FUNC isDeadState #define ACCEPT_FUNC dummyFunc +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_nmd +#define DEAD_FUNC32 isDeadState32 +#define ACCEPT_FUNC32 dummyFunc +#endif #define STOP_AT_MATCH 0 #include "sheng_impl.h" #undef SHENG_IMPL #undef DEAD_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef DEAD_FUNC32 +#undef ACCEPT_FUNC32 +#endif #undef STOP_AT_MATCH /* no match, can't die */ #define SHENG_IMPL sheng_nm #define DEAD_FUNC dummyFunc #define ACCEPT_FUNC dummyFunc +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_nm +#define DEAD_FUNC32 dummyFunc +#define ACCEPT_FUNC32 dummyFunc +#endif #define STOP_AT_MATCH 0 #include "sheng_impl.h" #undef SHENG_IMPL #undef DEAD_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef DEAD_FUNC32 +#undef ACCEPT_FUNC32 +#endif #undef STOP_AT_MATCH /* @@ -144,6 +232,15 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC isAccelState #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_coda +#define INTERESTING_FUNC32 hasInterestingStates32 +#define INNER_DEAD_FUNC32 isDeadState32 +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 isAccelState32 +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#endif #define STOP_AT_MATCH 0 #include "sheng_impl4.h" #undef SHENG_IMPL @@ -153,6 +250,15 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#endif #undef STOP_AT_MATCH /* callback output, can die, not accelerated */ @@ -163,6 +269,15 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_cod +#define INTERESTING_FUNC32 hasInterestingStates32 +#define INNER_DEAD_FUNC32 isDeadState32 +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 dummyFunc +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#endif #define STOP_AT_MATCH 0 #include "sheng_impl4.h" #undef SHENG_IMPL @@ -172,6 +287,15 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#endif #undef STOP_AT_MATCH /* callback output, can't die, accelerated */ @@ -182,6 +306,15 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC isAccelState #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_coa +#define INTERESTING_FUNC32 hasInterestingStates32 +#define INNER_DEAD_FUNC32 dummyFunc +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 isAccelState32 +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#endif #define STOP_AT_MATCH 0 #include "sheng_impl4.h" #undef SHENG_IMPL @@ -191,6 +324,15 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#endif #undef STOP_AT_MATCH /* callback output, can't die, not accelerated */ @@ -201,6 +343,15 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_co +#define INTERESTING_FUNC32 hasInterestingStates32 +#define INNER_DEAD_FUNC32 dummyFunc +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 dummyFunc +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#endif #define STOP_AT_MATCH 0 #include "sheng_impl4.h" #undef SHENG_IMPL @@ -210,6 +361,15 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#endif #undef STOP_AT_MATCH /* stop at match, can die, accelerated */ @@ -220,6 +380,15 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC isAccelState #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_samda +#define INTERESTING_FUNC32 hasInterestingStates32 +#define INNER_DEAD_FUNC32 isDeadState32 +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 isAccelState32 +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#endif #define STOP_AT_MATCH 1 #include "sheng_impl4.h" #undef SHENG_IMPL @@ -229,6 +398,15 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#endif #undef STOP_AT_MATCH /* stop at match, can die, not accelerated */ @@ -239,6 +417,15 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_samd +#define INTERESTING_FUNC32 hasInterestingStates32 +#define INNER_DEAD_FUNC32 isDeadState32 +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 dummyFunc +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#endif #define STOP_AT_MATCH 1 #include "sheng_impl4.h" #undef SHENG_IMPL @@ -248,6 +435,15 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#endif #undef STOP_AT_MATCH /* stop at match, can't die, accelerated */ @@ -258,6 +454,15 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC isAccelState #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_sama +#define INTERESTING_FUNC32 hasInterestingStates32 +#define INNER_DEAD_FUNC32 dummyFunc +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 isAccelState32 +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#endif #define STOP_AT_MATCH 1 #include "sheng_impl4.h" #undef SHENG_IMPL @@ -267,6 +472,15 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#endif #undef STOP_AT_MATCH /* stop at match, can't die, not accelerated */ @@ -277,6 +491,15 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_sam +#define INTERESTING_FUNC32 hasInterestingStates32 +#define INNER_DEAD_FUNC32 dummyFunc +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 dummyFunc +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#endif #define STOP_AT_MATCH 1 #include "sheng_impl4.h" #undef SHENG_IMPL @@ -286,6 +509,15 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#endif #undef STOP_AT_MATCH /* no-match have interesting func as dummy, and die/accel checks are outer */ @@ -298,6 +530,15 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC isAccelState #define ACCEPT_FUNC dummyFunc +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_nmda +#define INTERESTING_FUNC32 dummyFunc4 +#define INNER_DEAD_FUNC32 dummyFunc +#define OUTER_DEAD_FUNC32 isDeadState32 +#define INNER_ACCEL_FUNC32 dummyFunc +#define OUTER_ACCEL_FUNC32 isAccelState32 +#define ACCEPT_FUNC32 dummyFunc +#endif #define STOP_AT_MATCH 0 #include "sheng_impl4.h" #undef SHENG_IMPL @@ -307,6 +548,15 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#endif #undef STOP_AT_MATCH /* no match, can die, not accelerated */ @@ -317,6 +567,15 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC dummyFunc +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_nmd +#define INTERESTING_FUNC32 dummyFunc4 +#define INNER_DEAD_FUNC32 dummyFunc +#define OUTER_DEAD_FUNC32 isDeadState32 +#define INNER_ACCEL_FUNC32 dummyFunc +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 dummyFunc +#endif #define STOP_AT_MATCH 0 #include "sheng_impl4.h" #undef SHENG_IMPL @@ -326,6 +585,15 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#endif #undef STOP_AT_MATCH /* there is no performance benefit in accelerating a no-match case that can't @@ -339,6 +607,15 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC dummyFunc +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_nm +#define INTERESTING_FUNC32 dummyFunc4 +#define INNER_DEAD_FUNC32 dummyFunc +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 dummyFunc +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 dummyFunc +#endif #define STOP_AT_MATCH 0 #include "sheng_impl4.h" #undef SHENG_IMPL @@ -348,6 +625,15 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#endif #undef STOP_AT_MATCH #endif // SHENG_DEFS_H diff --git a/src/nfa/sheng_impl.h b/src/nfa/sheng_impl.h index aa416194..2fc3b023 100644 --- a/src/nfa/sheng_impl.h +++ b/src/nfa/sheng_impl.h @@ -95,3 +95,66 @@ char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s, *scan_end = cur_buf; return MO_CONTINUE_MATCHING; } + +#if defined(HAVE_AVX512VBMI) +static really_inline +char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt, + const struct sheng32 *s, + u8 *const cached_accept_state, + ReportID *const cached_accept_id, + u8 single, u64a base_offset, const u8 *buf, const u8 *start, + const u8 *end, const u8 **scan_end) { + DEBUG_PRINTF("Starting DFA execution in state %u\n", + *state & SHENG32_STATE_MASK); + const u8 *cur_buf = start; + if (DEAD_FUNC32(*state)) { + DEBUG_PRINTF("Dead on arrival\n"); + *scan_end = end; + return MO_CONTINUE_MATCHING; + } + DEBUG_PRINTF("Scanning %lli bytes\n", (s64a)(end - start)); + + m512 cur_state = set64x8(*state); + const m512 *masks = s->succ_masks; + + while (likely(cur_buf != end)) { + const u8 c = *cur_buf; + const m512 succ_mask = masks[c]; + cur_state = vpermb512(cur_state, succ_mask); + const u8 tmp = movd512(cur_state); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c, ourisprint(c) ? c : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", tmp & SHENG32_STATE_MASK, + tmp & SHENG32_STATE_FLAG_MASK); + + if (unlikely(ACCEPT_FUNC32(tmp))) { + DEBUG_PRINTF("Accept state %u reached\n", tmp & SHENG32_STATE_MASK); + u64a match_offset = base_offset + (cur_buf - buf) + 1; + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (u64a)(cur_buf - start)); + *state = tmp; + *scan_end = cur_buf; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports32(s, cb, ctxt, tmp, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + cur_buf++; + } + *state = movd512(cur_state); + *scan_end = cur_buf; + return MO_CONTINUE_MATCHING; +} +#endif diff --git a/src/nfa/sheng_impl4.h b/src/nfa/sheng_impl4.h index c51bcdea..06356912 100644 --- a/src/nfa/sheng_impl4.h +++ b/src/nfa/sheng_impl4.h @@ -282,3 +282,243 @@ char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s, *scan_end = cur_buf; return MO_CONTINUE_MATCHING; } + +#if defined(HAVE_AVX512VBMI) +static really_inline +char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt, + const struct sheng32 *s, + u8 *const cached_accept_state, + ReportID *const cached_accept_id, + u8 single, u64a base_offset, const u8 *buf, const u8 *start, + const u8 *end, const u8 **scan_end) { + DEBUG_PRINTF("Starting DFAx4 execution in state %u\n", + *state & SHENG32_STATE_MASK); + const u8 *cur_buf = start; + const u8 *min_accel_dist = start; + base_offset++; + DEBUG_PRINTF("Scanning %llu bytes\n", (u64a)(end - start)); + + if (INNER_ACCEL_FUNC32(*state) || OUTER_ACCEL_FUNC32(*state)) { + DEBUG_PRINTF("Accel state reached @ 0\n"); + const union AccelAux *aaux = + get_accel32(s, *state & SHENG32_STATE_MASK); + const u8 *new_offset = run_accel(aaux, cur_buf, end); + if (new_offset < cur_buf + BAD_ACCEL_DIST) { + min_accel_dist = new_offset + BIG_ACCEL_PENALTY; + } else { + min_accel_dist = new_offset + SMALL_ACCEL_PENALTY; + } + DEBUG_PRINTF("Next accel chance: %llu\n", + (u64a)(min_accel_dist - start)); + DEBUG_PRINTF("Accel scanned %zu bytes\n", new_offset - cur_buf); + cur_buf = new_offset; + DEBUG_PRINTF("New offset: %lli\n", (s64a)(cur_buf - start)); + } + if (INNER_DEAD_FUNC32(*state) || OUTER_DEAD_FUNC32(*state)) { + DEBUG_PRINTF("Dead on arrival\n"); + *scan_end = end; + return MO_CONTINUE_MATCHING; + } + + m512 cur_state = set64x8(*state); + const m512 *masks = s->succ_masks; + + while (likely(end - cur_buf >= 4)) { + const u8 *b1 = cur_buf; + const u8 *b2 = cur_buf + 1; + const u8 *b3 = cur_buf + 2; + const u8 *b4 = cur_buf + 3; + const u8 c1 = *b1; + const u8 c2 = *b2; + const u8 c3 = *b3; + const u8 c4 = *b4; + + const m512 succ_mask1 = masks[c1]; + cur_state = vpermb512(cur_state, succ_mask1); + const u8 a1 = movd512(cur_state); + + const m512 succ_mask2 = masks[c2]; + cur_state = vpermb512(cur_state, succ_mask2); + const u8 a2 = movd512(cur_state); + + const m512 succ_mask3 = masks[c3]; + cur_state = vpermb512(cur_state, succ_mask3); + const u8 a3 = movd512(cur_state); + + const m512 succ_mask4 = masks[c4]; + cur_state = vpermb512(cur_state, succ_mask4); + const u8 a4 = movd512(cur_state); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c1, ourisprint(c1) ? c1 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a1 & SHENG32_STATE_MASK, + a1 & SHENG32_STATE_FLAG_MASK); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c2, ourisprint(c2) ? c2 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a2 & SHENG32_STATE_MASK, + a2 & SHENG32_STATE_FLAG_MASK); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c3, ourisprint(c3) ? c3 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a3 & SHENG32_STATE_MASK, + a3 & SHENG32_STATE_FLAG_MASK); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c4, ourisprint(c4) ? c4 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a4 & SHENG32_STATE_MASK, + a4 & SHENG32_STATE_FLAG_MASK); + + if (unlikely(INTERESTING_FUNC32(a1, a2, a3, a4))) { + if (ACCEPT_FUNC32(a1)) { + u64a match_offset = base_offset + b1 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a1 & SHENG32_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b1 - start)); + *scan_end = b1; + *state = a1; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports32(s, cb, ctxt, a1, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC32(a2)) { + u64a match_offset = base_offset + b2 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a2 & SHENG32_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b2 - start)); + *scan_end = b2; + *state = a2; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports32(s, cb, ctxt, a2, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC32(a3)) { + u64a match_offset = base_offset + b3 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a3 & SHENG32_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b3 - start)); + *scan_end = b3; + *state = a3; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports32(s, cb, ctxt, a3, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC32(a4)) { + u64a match_offset = base_offset + b4 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a4 & SHENG32_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b4 - start)); + *scan_end = b4; + *state = a4; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports32(s, cb, ctxt, a4, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (INNER_DEAD_FUNC32(a4)) { + DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(b4 - buf)); + *scan_end = end; + *state = a4; + return MO_CONTINUE_MATCHING; + } + if (cur_buf > min_accel_dist && INNER_ACCEL_FUNC32(a4)) { + DEBUG_PRINTF("Accel state reached @ %lli\n", (s64a)(b4 - buf)); + const union AccelAux *aaux = + get_accel32(s, a4 & SHENG32_STATE_MASK); + const u8 *new_offset = run_accel(aaux, cur_buf + 4, end); + if (new_offset < cur_buf + 4 + BAD_ACCEL_DIST) { + min_accel_dist = new_offset + BIG_ACCEL_PENALTY; + } else { + min_accel_dist = new_offset + SMALL_ACCEL_PENALTY; + } + DEBUG_PRINTF("Next accel chance: %llu\n", + (u64a)(min_accel_dist - start)); + DEBUG_PRINTF("Accel scanned %llu bytes\n", + (u64a)(new_offset - cur_buf - 4)); + cur_buf = new_offset; + DEBUG_PRINTF("New offset: %llu\n", (u64a)(cur_buf - buf)); + continue; + } + } + if (OUTER_DEAD_FUNC32(a4)) { + DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(cur_buf - buf)); + *scan_end = end; + *state = a4; + return MO_CONTINUE_MATCHING; + }; + if (cur_buf > min_accel_dist && OUTER_ACCEL_FUNC32(a4)) { + DEBUG_PRINTF("Accel state reached @ %lli\n", (s64a)(b4 - buf)); + const union AccelAux *aaux = + get_accel32(s, a4 & SHENG32_STATE_MASK); + const u8 *new_offset = run_accel(aaux, cur_buf + 4, end); + if (new_offset < cur_buf + 4 + BAD_ACCEL_DIST) { + min_accel_dist = new_offset + BIG_ACCEL_PENALTY; + } else { + min_accel_dist = new_offset + SMALL_ACCEL_PENALTY; + } + DEBUG_PRINTF("Next accel chance: %llu\n", + (u64a)(min_accel_dist - start)); + DEBUG_PRINTF("Accel scanned %llu bytes\n", + (u64a)(new_offset - cur_buf - 4)); + cur_buf = new_offset; + DEBUG_PRINTF("New offset: %llu\n", (u64a)(cur_buf - buf)); + continue; + }; + cur_buf += 4; + } + *state = movd512(cur_state); + *scan_end = cur_buf; + return MO_CONTINUE_MATCHING; +} +#endif diff --git a/src/nfa/sheng_internal.h b/src/nfa/sheng_internal.h index ff843ebe..68a1680f 100644 --- a/src/nfa/sheng_internal.h +++ b/src/nfa/sheng_internal.h @@ -38,6 +38,14 @@ #define SHENG_STATE_MASK 0xF #define SHENG_STATE_FLAG_MASK 0x70 +#if defined (HAVE_AVX512VBMI) +#define SHENG32_STATE_ACCEPT 0x20 +#define SHENG32_STATE_DEAD 0x40 +#define SHENG32_STATE_ACCEL 0x80 +#define SHENG32_STATE_MASK 0x1F +#define SHENG32_STATE_FLAG_MASK 0xE0 +#endif + #define SHENG_FLAG_SINGLE_REPORT 0x1 #define SHENG_FLAG_CAN_DIE 0x2 #define SHENG_FLAG_HAS_ACCEL 0x4 @@ -67,4 +75,19 @@ struct sheng { ReportID report; }; +#if defined (HAVE_AVX512VBMI) +struct sheng32 { + m512 succ_masks[256]; + u32 length; + u32 aux_offset; + u32 report_offset; + u32 accel_offset; + u8 n_states; + u8 anchored; + u8 floating; + u8 flags; + ReportID report; +}; +#endif + #endif /* SHENG_INTERNAL_H_ */ diff --git a/src/nfa/shengcompile.cpp b/src/nfa/shengcompile.cpp index c4094ced..8b939973 100644 --- a/src/nfa/shengcompile.cpp +++ b/src/nfa/shengcompile.cpp @@ -301,6 +301,17 @@ void dumpShuffleMask(const u8 chr, const u8 *buf, unsigned sz) { } DEBUG_PRINTF("chr %3u: %s\n", chr, o.str().c_str()); } + +static really_inline +void dumpShuffleMask32(const u8 chr, const u8 *buf, unsigned sz) { + stringstream o; + + for (unsigned i = 0; i < sz; i++) { + o.width(2); + o << (buf[i] & SHENG32_STATE_MASK) << " "; + } + DEBUG_PRINTF("chr %3u: %s\n", chr, o.str().c_str()); +} #endif static @@ -311,9 +322,16 @@ void fillAccelOut(const map &accel_escape_info, } } +template static u8 getShengState(dstate &state, dfa_info &info, map &accelInfo) { + return 0; +} + +template <> +u8 getShengState(dstate &state, dfa_info &info, + map &accelInfo) { u8 s = state.impl_id; if (!state.reports.empty()) { s |= SHENG_STATE_ACCEPT; @@ -327,11 +345,30 @@ u8 getShengState(dstate &state, dfa_info &info, return s; } +#if defined(HAVE_AVX512VBMI) +template <> +u8 getShengState(dstate &state, dfa_info &info, + map &accelInfo) { + u8 s = state.impl_id; + if (!state.reports.empty()) { + s |= SHENG32_STATE_ACCEPT; + } + if (info.isDead(state)) { + s |= SHENG32_STATE_DEAD; + } + if (accelInfo.find(info.raw_id(state.impl_id)) != accelInfo.end()) { + s |= SHENG32_STATE_ACCEL; + } + return s; +} +#endif + +template static void fillAccelAux(struct NFA *n, dfa_info &info, map &accelInfo) { DEBUG_PRINTF("Filling accel aux structures\n"); - sheng *s = (sheng *)getMutableImplNfa(n); + T *s = (T *)getMutableImplNfa(n); u32 offset = s->accel_offset; for (dstate_id_t i = 0; i < info.size(); i++) { @@ -349,11 +386,20 @@ void fillAccelAux(struct NFA *n, dfa_info &info, } } +template static void populateBasicInfo(struct NFA *n, dfa_info &info, map &accelInfo, u32 aux_offset, u32 report_offset, u32 accel_offset, u32 total_size, u32 dfa_size) { +} + +template <> +void populateBasicInfo(struct NFA *n, dfa_info &info, + map &accelInfo, + u32 aux_offset, u32 report_offset, + u32 accel_offset, u32 total_size, + u32 dfa_size) { n->length = total_size; n->scratchStateSize = 1; n->streamStateSize = 1; @@ -369,14 +415,42 @@ void populateBasicInfo(struct NFA *n, dfa_info &info, s->length = dfa_size; s->flags |= info.can_die ? SHENG_FLAG_CAN_DIE : 0; - s->anchored = getShengState(info.anchored, info, accelInfo); - s->floating = getShengState(info.floating, info, accelInfo); + s->anchored = getShengState(info.anchored, info, accelInfo); + s->floating = getShengState(info.floating, info, accelInfo); } +#if defined(HAVE_AVX512VBMI) +template <> +void populateBasicInfo(struct NFA *n, dfa_info &info, + map &accelInfo, + u32 aux_offset, u32 report_offset, + u32 accel_offset, u32 total_size, + u32 dfa_size) { + n->length = total_size; + n->scratchStateSize = 1; + n->streamStateSize = 1; + n->nPositions = info.size(); + n->type = SHENG_NFA_32; + n->flags |= info.raw.hasEodReports() ? NFA_ACCEPTS_EOD : 0; + + sheng32 *s = (sheng32 *)getMutableImplNfa(n); + s->aux_offset = aux_offset; + s->report_offset = report_offset; + s->accel_offset = accel_offset; + s->n_states = info.size(); + s->length = dfa_size; + s->flags |= info.can_die ? SHENG_FLAG_CAN_DIE : 0; + + s->anchored = getShengState(info.anchored, info, accelInfo); + s->floating = getShengState(info.floating, info, accelInfo); +} +#endif + +template static void fillTops(NFA *n, dfa_info &info, dstate_id_t id, map &accelInfo) { - sheng *s = (sheng *)getMutableImplNfa(n); + T *s = (T *)getMutableImplNfa(n); u32 aux_base = s->aux_offset; DEBUG_PRINTF("Filling tops for state %u\n", id); @@ -393,13 +467,14 @@ void fillTops(NFA *n, dfa_info &info, dstate_id_t id, DEBUG_PRINTF("Top transition for state %u: %u\n", id, top_state.impl_id); - aux->top = getShengState(top_state, info, accelInfo); + aux->top = getShengState(top_state, info, accelInfo); } +template static void fillAux(NFA *n, dfa_info &info, dstate_id_t id, vector &reports, vector &reports_eod, vector &report_offsets) { - sheng *s = (sheng *)getMutableImplNfa(n); + T *s = (T *)getMutableImplNfa(n); u32 aux_base = s->aux_offset; auto raw_id = info.raw_id(id); @@ -419,25 +494,32 @@ void fillAux(NFA *n, dfa_info &info, dstate_id_t id, vector &reports, DEBUG_PRINTF("EOD report list offset: %u\n", aux->accept_eod); } +template static void fillSingleReport(NFA *n, ReportID r_id) { - sheng *s = (sheng *)getMutableImplNfa(n); + T *s = (T *)getMutableImplNfa(n); DEBUG_PRINTF("Single report ID: %u\n", r_id); s->report = r_id; s->flags |= SHENG_FLAG_SINGLE_REPORT; } +template static -void createShuffleMasks(sheng *s, dfa_info &info, +void createShuffleMasks(T *s, dfa_info &info, map &accelInfo) { +} + +template <> +void createShuffleMasks(sheng *s, dfa_info &info, + map &accelInfo) { for (u16 chr = 0; chr < 256; chr++) { u8 buf[16] = {0}; for (dstate_id_t idx = 0; idx < info.size(); idx++) { auto &succ_state = info.next(idx, chr); - buf[idx] = getShengState(succ_state, info, accelInfo); + buf[idx] = getShengState(succ_state, info, accelInfo); } #ifdef DEBUG dumpShuffleMask(chr, buf, sizeof(buf)); @@ -446,33 +528,38 @@ void createShuffleMasks(sheng *s, dfa_info &info, } } +#if defined(HAVE_AVX512VBMI) +template <> +void createShuffleMasks(sheng32 *s, dfa_info &info, + map &accelInfo) { + for (u16 chr = 0; chr < 256; chr++) { + u8 buf[64] = {0}; + + assert(info.size() <= 32); + for (dstate_id_t idx = 0; idx < info.size(); idx++) { + auto &succ_state = info.next(idx, chr); + + buf[idx] = getShengState(succ_state, info, accelInfo); + buf[32 + idx] = buf[idx]; + } +#ifdef DEBUG + dumpShuffleMask32(chr, buf, sizeof(buf)); +#endif + memcpy(&s->succ_masks[chr], buf, sizeof(m512)); + } +} +#endif + bool has_accel_sheng(const NFA *) { return true; /* consider the sheng region as accelerated */ } -bytecode_ptr shengCompile(raw_dfa &raw, const CompileContext &cc, - const ReportManager &rm, bool only_accel_init, - set *accel_states) { - if (!cc.grey.allowSheng) { - DEBUG_PRINTF("Sheng is not allowed!\n"); - return nullptr; - } - - sheng_build_strat strat(raw, rm, only_accel_init); - dfa_info info(strat); - - DEBUG_PRINTF("Trying to compile a %zu state Sheng\n", raw.states.size()); - - DEBUG_PRINTF("Anchored start state id: %u, floating start state id: %u\n", - raw.start_anchored, raw.start_floating); - - DEBUG_PRINTF("This DFA %s die so effective number of states is %zu\n", - info.can_die ? "can" : "cannot", info.size()); - if (info.size() > 16) { - DEBUG_PRINTF("Too many states\n"); - return nullptr; - } - +template +static +bytecode_ptr shengCompile_int(raw_dfa &raw, const CompileContext &cc, + set *accel_states, + sheng_build_strat &strat, + dfa_info &info) { if (!cc.streaming) { /* TODO: work out if we can do the strip in streaming * mode with our semantics */ raw.stripExtraEodReports(); @@ -487,7 +574,7 @@ bytecode_ptr shengCompile(raw_dfa &raw, const CompileContext &cc, DEBUG_PRINTF("Anchored start state: %u, floating start state: %u\n", info.anchored.impl_id, info.floating.impl_id); - u32 nfa_size = ROUNDUP_16(sizeof(NFA) + sizeof(sheng)); + u32 nfa_size = ROUNDUP_16(sizeof(NFA) + sizeof(T)); vector reports, eod_reports, report_offsets; u8 isSingle = 0; ReportID single_report = 0; @@ -509,30 +596,66 @@ bytecode_ptr shengCompile(raw_dfa &raw, const CompileContext &cc, auto nfa = make_zeroed_bytecode_ptr(total_size); - populateBasicInfo(nfa.get(), info, accelInfo, nfa_size, reports_offset, - accel_offset, total_size, total_size - sizeof(NFA)); + populateBasicInfo(nfa.get(), info, accelInfo, nfa_size, + reports_offset, accel_offset, total_size, + total_size - sizeof(NFA)); DEBUG_PRINTF("Setting up aux and report structures\n"); ri->fillReportLists(nfa.get(), reports_offset, report_offsets); for (dstate_id_t idx = 0; idx < info.size(); idx++) { - fillTops(nfa.get(), info, idx, accelInfo); - fillAux(nfa.get(), info, idx, reports, eod_reports, report_offsets); + fillTops(nfa.get(), info, idx, accelInfo); + fillAux(nfa.get(), info, idx, reports, eod_reports, + report_offsets); } if (isSingle) { - fillSingleReport(nfa.get(), single_report); + fillSingleReport(nfa.get(), single_report); } - fillAccelAux(nfa.get(), info, accelInfo); + fillAccelAux(nfa.get(), info, accelInfo); if (accel_states) { fillAccelOut(accelInfo, accel_states); } - createShuffleMasks((sheng *)getMutableImplNfa(nfa.get()), info, accelInfo); + createShuffleMasks((T *)getMutableImplNfa(nfa.get()), info, accelInfo); return nfa; } +bytecode_ptr shengCompile(raw_dfa &raw, const CompileContext &cc, + const ReportManager &rm, bool only_accel_init, + set *accel_states) { + if (!cc.grey.allowSheng) { + DEBUG_PRINTF("Sheng is not allowed!\n"); + return nullptr; + } + + sheng_build_strat strat(raw, rm, only_accel_init); + dfa_info info(strat); + + DEBUG_PRINTF("Trying to compile a %zu state Sheng\n", raw.states.size()); + + DEBUG_PRINTF("Anchored start state id: %u, floating start state id: %u\n", + raw.start_anchored, raw.start_floating); + + DEBUG_PRINTF("This DFA %s die so effective number of states is %zu\n", + info.can_die ? "can" : "cannot", info.size()); + if (info.size() > 16) { +#if defined(HAVE_AVX512VBMI) + if (info.size() > 32) { + DEBUG_PRINTF("Too many states\n"); + return nullptr; + } + return shengCompile_int(raw, cc, accel_states, strat, info); +#else + DEBUG_PRINTF("Too many states\n"); + return nullptr; +#endif + } + + return shengCompile_int(raw, cc, accel_states, strat, info); +} + } // namespace ue2 diff --git a/src/nfa/shengdump.cpp b/src/nfa/shengdump.cpp index 99fda76f..d6cca401 100644 --- a/src/nfa/shengdump.cpp +++ b/src/nfa/shengdump.cpp @@ -51,7 +51,7 @@ namespace ue2 { static const sstate_aux *get_aux(const NFA *n, dstate_id_t i) { - assert(n && isShengType(n->type)); + assert(n && isSheng16Type(n->type)); const sheng *s = (const sheng *)getImplNfa(n); const sstate_aux *aux_base = @@ -64,6 +64,23 @@ const sstate_aux *get_aux(const NFA *n, dstate_id_t i) { return aux; } +#if defined(HAVE_AVX512VBMI) +static +const sstate_aux *get_aux32(const NFA *n, dstate_id_t i) { + assert(n && isSheng32Type(n->type)); + + const sheng32 *s = (const sheng32 *)getImplNfa(n); + const sstate_aux *aux_base = + (const sstate_aux *)((const char *)n + s->aux_offset); + + const sstate_aux *aux = aux_base + i; + + assert((const char *)aux < (const char *)s + s->length); + + return aux; +} +#endif + static void dumpHeader(FILE *f, const sheng *s) { fprintf(f, "number of states: %u, DFA engine size: %u\n", s->n_states, @@ -79,6 +96,23 @@ void dumpHeader(FILE *f, const sheng *s) { !!(s->flags & SHENG_FLAG_SINGLE_REPORT)); } +#if defined(HAVE_AVX512VBMI) +static +void dumpHeader32(FILE *f, const sheng32 *s) { + fprintf(f, "number of states: %u, DFA engine size: %u\n", s->n_states, + s->length); + fprintf(f, "aux base offset: %u, reports base offset: %u, " + "accel offset: %u\n", + s->aux_offset, s->report_offset, s->accel_offset); + fprintf(f, "anchored start state: %u, floating start state: %u\n", + s->anchored & SHENG32_STATE_MASK, s->floating & SHENG32_STATE_MASK); + fprintf(f, "has accel: %u can die: %u single report: %u\n", + !!(s->flags & SHENG_FLAG_HAS_ACCEL), + !!(s->flags & SHENG_FLAG_CAN_DIE), + !!(s->flags & SHENG_FLAG_SINGLE_REPORT)); +} +#endif + static void dumpAux(FILE *f, u32 state, const sstate_aux *aux) { fprintf(f, "state id: %u, reports offset: %u, EOD reports offset: %u, " @@ -87,6 +121,16 @@ void dumpAux(FILE *f, u32 state, const sstate_aux *aux) { aux->top & SHENG_STATE_MASK); } +#if defined(HAVE_AVX512VBMI) +static +void dumpAux32(FILE *f, u32 state, const sstate_aux *aux) { + fprintf(f, "state id: %u, reports offset: %u, EOD reports offset: %u, " + "accel offset: %u, top: %u\n", + state, aux->accept, aux->accept_eod, aux->accel, + aux->top & SHENG32_STATE_MASK); +} +#endif + static void dumpReports(FILE *f, const report_list *rl) { fprintf(f, "reports count: %u\n", rl->count); @@ -115,6 +159,30 @@ void dumpMasks(FILE *f, const sheng *s) { } } +#if defined(HAVE_AVX512VBMI) +static +void dumpMasks32(FILE *f, const sheng32 *s) { + //u8 flags[64]; + //memcpy(flags, &s->flag_mask, sizeof(m512)); + for (u32 chr = 0; chr < 256; chr++) { + u8 buf[64]; + m512 succ_mask = s->succ_masks[chr]; + memcpy(buf, &succ_mask, sizeof(m512)); + + fprintf(f, "%3u: ", chr); + for (u32 pos = 0; pos < 64; pos++) { + u8 c = buf[pos]; + if (c & SHENG32_STATE_FLAG_MASK) { + fprintf(f, "%2u* ", c & SHENG32_STATE_MASK); + } else { + fprintf(f, "%2u ", c & SHENG32_STATE_MASK); + } + } + fprintf(f, "\n"); + } +} +#endif + static void nfaExecSheng_dumpText(const NFA *nfa, FILE *f) { assert(nfa->type == SHENG_NFA); @@ -153,6 +221,46 @@ void nfaExecSheng_dumpText(const NFA *nfa, FILE *f) { fprintf(f, "\n"); } +#if defined(HAVE_AVX512VBMI) +static +void nfaExecSheng32_dumpText(const NFA *nfa, FILE *f) { + assert(nfa->type == SHENG_NFA_32); + const sheng32 *s = (const sheng32 *)getImplNfa(nfa); + + fprintf(f, "sheng32 DFA\n"); + dumpHeader32(f, s); + + for (u32 state = 0; state < s->n_states; state++) { + const sstate_aux *aux = get_aux32(nfa, state); + dumpAux32(f, state, aux); + if (aux->accept) { + fprintf(f, "report list:\n"); + const report_list *rl = + (const report_list *)((const char *)nfa + aux->accept); + dumpReports(f, rl); + } + if (aux->accept_eod) { + fprintf(f, "EOD report list:\n"); + const report_list *rl = + (const report_list *)((const char *)nfa + aux->accept_eod); + dumpReports(f, rl); + } + if (aux->accel) { + fprintf(f, "accel:\n"); + const AccelAux *accel = + (const AccelAux *)((const char *)nfa + aux->accel); + dumpAccelInfo(f, *accel); + } + } + + fprintf(f, "\n"); + + dumpMasks32(f, s); + + fprintf(f, "\n"); +} +#endif + static void dumpDotPreambleDfa(FILE *f) { dumpDotPreamble(f); @@ -163,8 +271,13 @@ void dumpDotPreambleDfa(FILE *f) { fprintf(f, "0 [style=invis];\n"); } +template static -void describeNode(const NFA *n, const sheng *s, u16 i, FILE *f) { +void describeNode(const NFA *n, const T *s, u16 i, FILE *f) { +} + +template <> +void describeNode(const NFA *n, const sheng *s, u16 i, FILE *f) { const sstate_aux *aux = get_aux(n, i); fprintf(f, "%u [ width = 1, fixedsize = true, fontsize = 12, " @@ -193,6 +306,38 @@ void describeNode(const NFA *n, const sheng *s, u16 i, FILE *f) { } } +#if defined(HAVE_AVX512VBMI) +template <> +void describeNode(const NFA *n, const sheng32 *s, u16 i, FILE *f) { + const sstate_aux *aux = get_aux32(n, i); + + fprintf(f, "%u [ width = 1, fixedsize = true, fontsize = 12, " + "label = \"%u\" ]; \n", + i, i); + + if (aux->accept_eod) { + fprintf(f, "%u [ color = darkorchid ];\n", i); + } + + if (aux->accept) { + fprintf(f, "%u [ shape = doublecircle ];\n", i); + } + + if (aux->top && (aux->top & SHENG32_STATE_MASK) != i) { + fprintf(f, "%u -> %u [color = darkgoldenrod weight=0.1 ]\n", i, + aux->top & SHENG32_STATE_MASK); + } + + if (i == (s->anchored & SHENG32_STATE_MASK)) { + fprintf(f, "STARTA -> %u [color = blue ]\n", i); + } + + if (i == (s->floating & SHENG32_STATE_MASK)) { + fprintf(f, "STARTF -> %u [color = red ]\n", i); + } +} +#endif + static void describeEdge(FILE *f, const u16 *t, u16 i) { for (u16 s = 0; s < N_CHARS; s++) { @@ -228,7 +373,7 @@ void describeEdge(FILE *f, const u16 *t, u16 i) { static void shengGetTransitions(const NFA *n, u16 state, u16 *t) { - assert(isShengType(n->type)); + assert(isSheng16Type(n->type)); const sheng *s = (const sheng *)getImplNfa(n); const sstate_aux *aux = get_aux(n, state); @@ -244,6 +389,26 @@ void shengGetTransitions(const NFA *n, u16 state, u16 *t) { t[TOP] = aux->top & SHENG_STATE_MASK; } +#if defined(HAVE_AVX512VBMI) +static +void sheng32GetTransitions(const NFA *n, u16 state, u16 *t) { + assert(isSheng32Type(n->type)); + const sheng32 *s = (const sheng32 *)getImplNfa(n); + const sstate_aux *aux = get_aux32(n, state); + + for (unsigned i = 0; i < N_CHARS; i++) { + u8 buf[64]; + m512 succ_mask = s->succ_masks[i]; + + memcpy(buf, &succ_mask, sizeof(m512)); + + t[i] = buf[state] & SHENG32_STATE_MASK; + } + + t[TOP] = aux->top & SHENG32_STATE_MASK; +} +#endif + static void nfaExecSheng_dumpDot(const NFA *nfa, FILE *f) { assert(nfa->type == SHENG_NFA); @@ -252,7 +417,7 @@ void nfaExecSheng_dumpDot(const NFA *nfa, FILE *f) { dumpDotPreambleDfa(f); for (u16 i = 1; i < s->n_states; i++) { - describeNode(nfa, s, i, f); + describeNode(nfa, s, i, f); u16 t[ALPHABET_SIZE]; @@ -264,10 +429,40 @@ void nfaExecSheng_dumpDot(const NFA *nfa, FILE *f) { fprintf(f, "}\n"); } +#if defined(HAVE_AVX512VBMI) +static +void nfaExecSheng32_dumpDot(const NFA *nfa, FILE *f) { + assert(nfa->type == SHENG_NFA_32); + const sheng32 *s = (const sheng32 *)getImplNfa(nfa); + + dumpDotPreambleDfa(f); + + for (u16 i = 1; i < s->n_states; i++) { + describeNode(nfa, s, i, f); + + u16 t[ALPHABET_SIZE]; + + sheng32GetTransitions(nfa, i, t); + + describeEdge(f, t, i); + } + + fprintf(f, "}\n"); +} +#endif + void nfaExecSheng_dump(const NFA *nfa, const string &base) { assert(nfa->type == SHENG_NFA); nfaExecSheng_dumpText(nfa, StdioFile(base + ".txt", "w")); nfaExecSheng_dumpDot(nfa, StdioFile(base + ".dot", "w")); } +void nfaExecSheng32_dump(UNUSED const NFA *nfa, UNUSED const string &base) { +#if defined(HAVE_AVX512VBMI) + assert(nfa->type == SHENG_NFA_32); + nfaExecSheng32_dumpText(nfa, StdioFile(base + ".txt", "w")); + nfaExecSheng32_dumpDot(nfa, StdioFile(base + ".dot", "w")); +#endif +} + } // namespace ue2 diff --git a/src/nfa/shengdump.h b/src/nfa/shengdump.h index 2bdffeb9..a9a76233 100644 --- a/src/nfa/shengdump.h +++ b/src/nfa/shengdump.h @@ -38,6 +38,7 @@ struct NFA; namespace ue2 { void nfaExecSheng_dump(const struct NFA *nfa, const std::string &base); +void nfaExecSheng32_dump(const struct NFA *nfa, const std::string &base); } // namespace ue2