From 0102f03c9c6514b959220eb292e54db6a96d6bf6 Mon Sep 17 00:00:00 2001 From: "Zhu,Wenjun" Date: Tue, 8 Sep 2020 14:59:33 +0000 Subject: [PATCH] MCSHENG64: extend to 64-state based on mcsheng --- src/nfa/mcsheng.c | 1333 +++++++++++++++++++++++++++++- src/nfa/mcsheng.h | 75 +- src/nfa/mcsheng_compile.cpp | 460 ++++++++++- src/nfa/mcsheng_compile.h | 3 +- src/nfa/mcsheng_data.c | 15 +- src/nfa/mcsheng_dump.cpp | 327 +++++++- src/nfa/mcsheng_dump.h | 5 +- src/nfa/mcsheng_internal.h | 33 +- src/nfa/nfa_api_dispatch.c | 2 + src/nfa/nfa_build_util.cpp | 31 + src/nfa/nfa_dump_dispatch.cpp | 2 + src/nfa/nfa_internal.h | 7 + src/rose/rose_build_bytecode.cpp | 4 + src/util/simd_utils.h | 37 + 14 files changed, 2320 insertions(+), 14 deletions(-) diff --git a/src/nfa/mcsheng.c b/src/nfa/mcsheng.c index 4619ff6f..22cac119 100644 --- a/src/nfa/mcsheng.c +++ b/src/nfa/mcsheng.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -1184,7 +1184,7 @@ char nfaExecMcSheng16_reportCurrent(const struct NFA *n, struct mq *q) { static char mcshengHasAccept(const struct mcsheng *m, const struct mstate_aux *aux, - ReportID report) { + ReportID report) { assert(m && aux); if (!aux->accept) { @@ -1405,3 +1405,1332 @@ char nfaExecMcSheng16_expandState(UNUSED const struct NFA *nfa, void *dest, *(u16 *)dest = unaligned_load_u16(src); return 0; } + +#if defined(HAVE_AVX512VBMI) +static really_inline +const struct mstate_aux *get_aux64(const struct mcsheng64 *m, u32 s) { + const char *nfa = (const char *)m - sizeof(struct NFA); + const struct mstate_aux *aux + = s + (const struct mstate_aux *)(nfa + m->aux_offset); + + assert(ISALIGNED(aux)); + return aux; +} + +static really_inline +u32 mcshengEnableStarts64(const struct mcsheng64 *m, u32 s) { + const struct mstate_aux *aux = get_aux64(m, s); + + DEBUG_PRINTF("enabling starts %u->%hu\n", s, aux->top); + return aux->top; +} + +static really_inline +char doComplexReport64(NfaCallback cb, void *ctxt, const struct mcsheng64 *m, + u32 s, u64a loc, char eod, u32 *cached_accept_state, + u32 *cached_accept_id) { + DEBUG_PRINTF("reporting state = %u, loc=%llu, eod %hhu\n", + s & STATE_MASK, loc, eod); + + if (!eod && s == *cached_accept_state) { + if (cb(0, loc, *cached_accept_id, ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + + const struct mstate_aux *aux = get_aux64(m, s); + size_t offset = eod ? aux->accept_eod : aux->accept; + + assert(offset); + const struct report_list *rl + = (const void *)((const char *)m + offset - sizeof(struct NFA)); + assert(ISALIGNED(rl)); + + DEBUG_PRINTF("report list size %u\n", rl->count); + u32 count = rl->count; + + if (!eod && count == 1) { + *cached_accept_state = s; + *cached_accept_id = rl->report[0]; + + DEBUG_PRINTF("reporting %u\n", rl->report[0]); + if (cb(0, loc, rl->report[0], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + + for (u32 i = 0; i < count; i++) { + DEBUG_PRINTF("reporting %u\n", rl->report[i]); + if (cb(0, loc, rl->report[i], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + } + + return MO_CONTINUE_MATCHING; /* continue execution */ +} + +static really_inline +u32 doSheng64(const struct mcsheng64 *m, const u8 **c_inout, const u8 *soft_c_end, + const u8 *hard_c_end, u32 s_in, char do_accel) { + assert(s_in < m->sheng_end); + assert(s_in); /* should not already be dead */ + assert(soft_c_end <= hard_c_end); + DEBUG_PRINTF("s_in = %u (adjusted %u)\n", s_in, s_in - 1); + m512 s = set64x8(s_in - 1); + const u8 *c = *c_inout; + const u8 *c_end = hard_c_end - SHENG_CHUNK + 1; + if (!do_accel) { + c_end = MIN(soft_c_end, hard_c_end - SHENG_CHUNK + 1); + } + + const m512 *masks = m->sheng_succ_masks; + u8 sheng_limit = m->sheng_end - 1; /* - 1: no dead state */ + u8 sheng_stop_limit = do_accel ? m->sheng_accel_limit : sheng_limit; + + /* When we use movd to get a u32 containing our state, it will have 4 lanes + * all duplicating the state. We can create versions of our limits with 4 + * copies to directly compare against, this prevents us generating code to + * extract a single copy of the state from the u32 for checking. */ + u32 sheng_stop_limit_x4 = sheng_stop_limit * 0x01010101; + +#if defined(HAVE_BMI2) && defined(ARCH_64_BIT) + u32 sheng_limit_x4 = sheng_limit * 0x01010101; + m512 simd_stop_limit = set16x32(sheng_stop_limit_x4); + m512 accel_delta = set64x8(sheng_limit - sheng_stop_limit); + DEBUG_PRINTF("end %hhu, accel %hu --> limit %hhu\n", sheng_limit, + m->sheng_accel_limit, sheng_stop_limit); +#endif + +#define SHENG64_SINGLE_ITER do { \ + m512 succ_mask = masks[*(c++)]; \ + s = vpermb512(s, succ_mask); \ + u32 s_gpr_x4 = movd512(s); /* convert to u8 */ \ + DEBUG_PRINTF("c %hhu (%c) --> s %u\n", c[-1], c[-1], s_gpr_x4); \ + if (s_gpr_x4 >= sheng_stop_limit_x4) { \ + s_gpr = s_gpr_x4; \ + goto exit; \ + } \ + } while (0) + + u8 s_gpr; + while (c < c_end) { +#if defined(HAVE_BMI2) && defined(ARCH_64_BIT) + /* This version uses pext for efficiently bitbashing out scaled + * versions of the bytes to process from a u64a */ + + u64a data_bytes = unaligned_load_u64a(c); + u64a cc0 = pdep64(data_bytes, 0x3fc0); /* extract scaled low byte */ + data_bytes &= ~0xffULL; /* clear low bits for scale space */ + + m512 succ_mask0 = load512((const char *)masks + cc0); + s = vpermb512(s, succ_mask0); + m512 s_max = s; + m512 s_max0 = s_max; + DEBUG_PRINTF("c %02llx --> s %u\n", cc0 >> 6, movd512(s)); + +#define SHENG64_SINGLE_UNROLL_ITER(iter) \ + assert(iter); \ + u64a cc##iter = pext64(data_bytes, mcsheng64_pext_mask[iter]); \ + assert(cc##iter == (u64a)c[iter] << 6); \ + m512 succ_mask##iter = load512((const char *)masks + cc##iter); \ + s = vpermb512(s, succ_mask##iter); \ + if (do_accel && iter == 7) { \ + /* in the final iteration we also have to check against accel */ \ + m512 s_temp = sadd_u8_m512(s, accel_delta); \ + s_max = max_u8_m512(s_max, s_temp); \ + } else { \ + s_max = max_u8_m512(s_max, s); \ + } \ + m512 s_max##iter = s_max; \ + DEBUG_PRINTF("c %02llx --> s %u max %u\n", cc##iter >> 6, \ + movd512(s), movd512(s_max)); + + SHENG64_SINGLE_UNROLL_ITER(1); + SHENG64_SINGLE_UNROLL_ITER(2); + SHENG64_SINGLE_UNROLL_ITER(3); + SHENG64_SINGLE_UNROLL_ITER(4); + SHENG64_SINGLE_UNROLL_ITER(5); + SHENG64_SINGLE_UNROLL_ITER(6); + SHENG64_SINGLE_UNROLL_ITER(7); + + if (movd512(s_max7) >= sheng_limit_x4) { + DEBUG_PRINTF("exit found\n"); + + /* Explicitly check the last byte as it is more likely as it also + * checks for acceleration. */ + if (movd512(s_max6) < sheng_limit_x4) { + c += SHENG_CHUNK; + s_gpr = movq512(s); + assert(s_gpr >= sheng_stop_limit); + goto exit; + } + + /* use shift-xor to create a register containing all of the max + * values */ + m512 blended = rshift64_m512(s_max0, 56); + blended = xor512(blended, rshift64_m512(s_max1, 48)); + blended = xor512(blended, rshift64_m512(s_max2, 40)); + blended = xor512(blended, rshift64_m512(s_max3, 32)); + blended = xor512(blended, rshift64_m512(s_max4, 24)); + blended = xor512(blended, rshift64_m512(s_max5, 16)); + blended = xor512(blended, rshift64_m512(s_max6, 8)); + blended = xor512(blended, s); + blended = xor512(blended, rshift64_m512(blended, 8)); + DEBUG_PRINTF("blended %016llx\n", movq512(blended)); + + m512 final = min_u8_m512(blended, simd_stop_limit); + m512 cmp = sub_u8_m512(final, simd_stop_limit); + m128 tmp = cast512to128(cmp); + u64a stops = ~movemask128(tmp); + assert(stops); + u32 earliest = ctz32(stops); + DEBUG_PRINTF("stops %02llx, earliest %u\n", stops, earliest); + assert(earliest < 8); + c += earliest + 1; + s_gpr = movq512(blended) >> (earliest * 8); + assert(s_gpr >= sheng_stop_limit); + goto exit; + } else { + c += SHENG_CHUNK; + } +#else + SHENG64_SINGLE_ITER; + SHENG64_SINGLE_ITER; + SHENG64_SINGLE_ITER; + SHENG64_SINGLE_ITER; + + SHENG64_SINGLE_ITER; + SHENG64_SINGLE_ITER; + SHENG64_SINGLE_ITER; + SHENG64_SINGLE_ITER; +#endif + } + + assert(c_end - c < SHENG_CHUNK); + if (c < soft_c_end) { + assert(soft_c_end - c < SHENG_CHUNK); + switch (soft_c_end - c) { + case 7: + SHENG64_SINGLE_ITER; // fallthrough + case 6: + SHENG64_SINGLE_ITER; // fallthrough + case 5: + SHENG64_SINGLE_ITER; // fallthrough + case 4: + SHENG64_SINGLE_ITER; // fallthrough + case 3: + SHENG64_SINGLE_ITER; // fallthrough + case 2: + SHENG64_SINGLE_ITER; // fallthrough + case 1: + SHENG64_SINGLE_ITER; // fallthrough + } + } + + assert(c >= soft_c_end); + + s_gpr = movq512(s); +exit: + assert(c <= hard_c_end); + DEBUG_PRINTF("%zu from end; s %hhu\n", c_end - c, s_gpr); + assert(c >= soft_c_end || s_gpr >= sheng_stop_limit); + /* undo state adjustment to match mcclellan view */ + if (s_gpr == sheng_limit) { + s_gpr = 0; + } else if (s_gpr < sheng_limit) { + s_gpr++; + } + + *c_inout = c; + return s_gpr; +} + +static really_inline +const char *findShermanState64(UNUSED const struct mcsheng64 *m, + const char *sherman_base_offset, + u32 sherman_base, u32 s) { + const char *rv + = sherman_base_offset + SHERMAN_FIXED_SIZE * (s - sherman_base); + assert(rv < (const char *)m + m->length - sizeof(struct NFA)); + UNUSED u8 type = *(const u8 *)(rv + SHERMAN_TYPE_OFFSET); + assert(type == SHERMAN_STATE); + return rv; +} + +static really_inline +const u8 *run_mcsheng_accel64(const struct mcsheng64 *m, + const struct mstate_aux *aux, u32 s, + const u8 **min_accel_offset, + const u8 *c, const u8 *c_end) { + DEBUG_PRINTF("skipping\n"); + u32 accel_offset = aux[s].accel_offset; + + assert(aux[s].accel_offset); + assert(accel_offset >= m->aux_offset); + assert(!m->sherman_offset || accel_offset < m->sherman_offset); + + const union AccelAux *aaux = (const void *)((const char *)m + accel_offset); + const u8 *c2 = run_accel(aaux, c, c_end); + + if (c2 < *min_accel_offset + BAD_ACCEL_DIST) { + *min_accel_offset = c2 + BIG_ACCEL_PENALTY; + } else { + *min_accel_offset = c2 + SMALL_ACCEL_PENALTY; + } + + if (*min_accel_offset >= c_end - ACCEL_MIN_LEN) { + *min_accel_offset = c_end; + } + + DEBUG_PRINTF("advanced %zd, next accel chance in %zd/%zd\n", + c2 - c, *min_accel_offset - c2, c_end - c2); + + return c2; +} + +static really_inline +u32 doNormal64_16(const struct mcsheng64 *m, const u8 **c_inout, const u8 *end, + u32 s, char do_accel, enum MatchMode mode) { + const u8 *c = *c_inout; + const u16 *succ_table + = (const u16 *)((const char *)m + sizeof(struct mcsheng64)); + assert(ISALIGNED_N(succ_table, 2)); + u32 sheng_end = m->sheng_end; + u32 sherman_base = m->sherman_limit; + const char *sherman_base_offset + = (const char *)m - sizeof(struct NFA) + m->sherman_offset; + u32 as = m->alphaShift; + + /* Adjust start of succ table so we can index into using state id (rather + * than adjust to normal id). As we will not be processing states with low + * state ids, we will not be accessing data before the succ table. Note: due + * to the size of the sheng tables, the succ_table pointer will still be + * inside the engine.*/ + succ_table -= sheng_end << as; + s &= STATE_MASK; + while (c < end && s >= sheng_end) { + u8 cprime = m->remap[*c]; + DEBUG_PRINTF("c: %02hhx '%c' cp:%02hhx (s=%u)\n", *c, + ourisprint(*c) ? *c : '?', cprime, s); + if (s < sherman_base) { + DEBUG_PRINTF("doing normal\n"); + assert(s < m->state_count); + s = succ_table[(s << as) + cprime]; + } else { + const char *sherman_state + = findShermanState64(m, sherman_base_offset, sherman_base, s); + DEBUG_PRINTF("doing sherman (%u)\n", s); + s = doSherman16(sherman_state, cprime, succ_table, as); + } + + DEBUG_PRINTF("s: %u (%u)\n", s, s & STATE_MASK); + c++; + + if (do_accel && (s & ACCEL_FLAG)) { + break; + } + if (mode != NO_MATCHES && (s & ACCEPT_FLAG)) { + break; + } + + s &= STATE_MASK; + } + + *c_inout = c; + return s; +} + +static really_inline +char mcsheng64Exec16_i(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **c_final, enum MatchMode mode) { + assert(ISALIGNED_N(state, 2)); + if (!len) { + if (mode == STOP_AT_MATCH) { + *c_final = buf; + } + return MO_ALIVE; + } + + u32 s = *state; + const u8 *c = buf; + const u8 *c_end = buf + len; + const u8 sheng_end = m->sheng_end; + const struct mstate_aux *aux + = (const struct mstate_aux *)((const char *)m + m->aux_offset + - sizeof(struct NFA)); + + s &= STATE_MASK; + + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + DEBUG_PRINTF("s: %u, len %zu\n", s, len); + + const u8 *min_accel_offset = c; + if (!m->has_accel || len < ACCEL_MIN_LEN) { + min_accel_offset = c_end; + goto without_accel; + } + + goto with_accel; + +without_accel: + do { + assert(c < min_accel_offset); + int do_accept; + if (!s) { + goto exit; + } else if (s < sheng_end) { + s = doSheng64(m, &c, min_accel_offset, c_end, s, 0); + do_accept = mode != NO_MATCHES && get_aux64(m, s)->accept; + } else { + s = doNormal64_16(m, &c, min_accel_offset, s, 0, mode); + + do_accept = mode != NO_MATCHES && (s & ACCEPT_FLAG); + } + + if (do_accept) { + if (mode == STOP_AT_MATCH) { + *state = s & STATE_MASK; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; /* termination requested */ + } + } else if (doComplexReport64(cb, ctxt, m, s & STATE_MASK, loc, 0, + &cached_accept_state, + &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= c_end); /* sheng is fuzzy for min_accel_offset */ + } while (c < min_accel_offset); + + if (c == c_end) { + goto exit; + } + +with_accel: + do { + assert(c < c_end); + int do_accept; + + if (!s) { + goto exit; + } else if (s < sheng_end) { + if (s > m->sheng_accel_limit) { + c = run_mcsheng_accel64(m, aux, s, &min_accel_offset, c, c_end); + if (c == c_end) { + goto exit; + } else { + goto without_accel; + } + } + s = doSheng64(m, &c, c_end, c_end, s, 1); + do_accept = mode != NO_MATCHES && get_aux64(m, s)->accept; + } else { + if (s & ACCEL_FLAG) { + DEBUG_PRINTF("skipping\n"); + s &= STATE_MASK; + c = run_mcsheng_accel64(m, aux, s, &min_accel_offset, c, c_end); + if (c == c_end) { + goto exit; + } else { + goto without_accel; + } + } + + s = doNormal64_16(m, &c, c_end, s, 1, mode); + do_accept = mode != NO_MATCHES && (s & ACCEPT_FLAG); + } + + if (do_accept) { + if (mode == STOP_AT_MATCH) { + *state = s & STATE_MASK; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; /* termination requested */ + } + } else if (doComplexReport64(cb, ctxt, m, s & STATE_MASK, loc, 0, + &cached_accept_state, + &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= c_end); + } while (c < c_end); + +exit: + s &= STATE_MASK; + + if (mode == STOP_AT_MATCH) { + *c_final = c_end; + } + *state = s; + + return MO_ALIVE; +} + +static never_inline +char mcsheng64Exec16_i_cb(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcsheng64Exec16_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, CALLBACK_OUTPUT); +} + +static never_inline +char mcsheng64Exec16_i_sam(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcsheng64Exec16_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, STOP_AT_MATCH); +} + +static never_inline +char mcsheng64Exec16_i_nm(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcsheng64Exec16_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, NO_MATCHES); +} + +static really_inline +char mcsheng64Exec16_i_ni(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point, + enum MatchMode mode) { + if (mode == CALLBACK_OUTPUT) { + return mcsheng64Exec16_i_cb(m, state, buf, len, offAdj, cb, ctxt, + single, final_point); + } else if (mode == STOP_AT_MATCH) { + return mcsheng64Exec16_i_sam(m, state, buf, len, offAdj, cb, ctxt, + single, final_point); + } else { + assert (mode == NO_MATCHES); + return mcsheng64Exec16_i_nm(m, state, buf, len, offAdj, cb, ctxt, + single, final_point); + } +} + +static really_inline +u32 doNormal64_8(const struct mcsheng64 *m, const u8 **c_inout, const u8 *end, u32 s, + char do_accel, enum MatchMode mode) { + const u8 *c = *c_inout; + u32 sheng_end = m->sheng_end; + u32 accel_limit = m->accel_limit_8; + u32 accept_limit = m->accept_limit_8; + + const u32 as = m->alphaShift; + const u8 *succ_table = (const u8 *)((const char *)m + + sizeof(struct mcsheng64)); + /* Adjust start of succ table so we can index into using state id (rather + * than adjust to normal id). As we will not be processing states with low + * state ids, we will not be accessing data before the succ table. Note: due + * to the size of the sheng tables, the succ_table pointer will still be + * inside the engine.*/ + succ_table -= sheng_end << as; + + assert(s >= sheng_end); + while (c < end && s >= sheng_end) { + u8 cprime = m->remap[*c]; + DEBUG_PRINTF("c: %02hhx '%c' cp:%02hhx\n", *c, + ourisprint(*c) ? *c : '?', cprime); + s = succ_table[(s << as) + cprime]; + + DEBUG_PRINTF("s: %u\n", s); + c++; + if (do_accel) { + if (s >= accel_limit) { + break; + } + } else { + if (mode != NO_MATCHES && s >= accept_limit) { + break; + } + } + } + *c_inout = c; + return s; +} + +static really_inline +char mcsheng64Exec8_i(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **c_final, enum MatchMode mode) { + if (!len) { + *c_final = buf; + return MO_ALIVE; + } + u32 s = *state; + const u8 *c = buf; + const u8 *c_end = buf + len; + const u8 sheng_end = m->sheng_end; + + const struct mstate_aux *aux + = (const struct mstate_aux *)((const char *)m + m->aux_offset + - sizeof(struct NFA)); + u32 accept_limit = m->accept_limit_8; + + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + DEBUG_PRINTF("accel %hu, accept %u\n", m->accel_limit_8, accept_limit); + + DEBUG_PRINTF("s: %u, len %zu\n", s, len); + + const u8 *min_accel_offset = c; + if (!m->has_accel || len < ACCEL_MIN_LEN) { + min_accel_offset = c_end; + goto without_accel; + } + + goto with_accel; + +without_accel: + do { + assert(c < min_accel_offset); + if (!s) { + goto exit; + } else if (s < sheng_end) { + s = doSheng64(m, &c, min_accel_offset, c_end, s, 0); + } else { + s = doNormal64_8(m, &c, min_accel_offset, s, 0, mode); + assert(c <= min_accel_offset); + } + + if (mode != NO_MATCHES && s >= accept_limit) { + if (mode == STOP_AT_MATCH) { + DEBUG_PRINTF("match - pausing\n"); + *state = s; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; + } + } else if (doComplexReport64(cb, ctxt, m, s, loc, 0, + &cached_accept_state, + &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= c_end); /* sheng is fuzzy for min_accel_offset */ + } while (c < min_accel_offset); + + if (c == c_end) { + goto exit; + } + +with_accel: + do { + u32 accel_limit = m->accel_limit_8; + + assert(c < c_end); + if (!s) { + goto exit; + } else if (s < sheng_end) { + if (s > m->sheng_accel_limit) { + c = run_mcsheng_accel64(m, aux, s, &min_accel_offset, c, c_end); + if (c == c_end) { + goto exit; + } else { + goto without_accel; + } + } + s = doSheng64(m, &c, c_end, c_end, s, 1); + } else { + if (s >= accel_limit && aux[s].accel_offset) { + c = run_mcsheng_accel64(m, aux, s, &min_accel_offset, c, c_end); + if (c == c_end) { + goto exit; + } else { + goto without_accel; + } + } + s = doNormal64_8(m, &c, c_end, s, 1, mode); + } + + if (mode != NO_MATCHES && s >= accept_limit) { + if (mode == STOP_AT_MATCH) { + DEBUG_PRINTF("match - pausing\n"); + *state = s; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; + } + } else if (doComplexReport64(cb, ctxt, m, s, loc, 0, + &cached_accept_state, + &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= c_end); + } while (c < c_end); + +exit: + *state = s; + if (mode == STOP_AT_MATCH) { + *c_final = c_end; + } + return MO_ALIVE; +} + +static never_inline +char mcsheng64Exec8_i_cb(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcsheng64Exec8_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, CALLBACK_OUTPUT); +} + +static never_inline +char mcsheng64Exec8_i_sam(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcsheng64Exec8_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, STOP_AT_MATCH); +} + +static never_inline +char mcsheng64Exec8_i_nm(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcsheng64Exec8_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, NO_MATCHES); +} + +static really_inline +char mcsheng64Exec8_i_ni(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point, + enum MatchMode mode) { + if (mode == CALLBACK_OUTPUT) { + return mcsheng64Exec8_i_cb(m, state, buf, len, offAdj, cb, ctxt, single, + final_point); + } else if (mode == STOP_AT_MATCH) { + return mcsheng64Exec8_i_sam(m, state, buf, len, offAdj, cb, ctxt, + single, final_point); + } else { + assert(mode == NO_MATCHES); + return mcsheng64Exec8_i_nm(m, state, buf, len, offAdj, cb, ctxt, single, + final_point); + } +} + +static really_inline +char mcshengCheckEOD64(const struct NFA *nfa, u32 s, u64a offset, + NfaCallback cb, void *ctxt) { + const struct mcsheng64 *m = getImplNfa(nfa); + const struct mstate_aux *aux = get_aux64(m, s); + + if (!aux->accept_eod) { + return MO_CONTINUE_MATCHING; + } + return doComplexReport64(cb, ctxt, m, s, offset, 1, NULL, NULL); +} + +static really_inline +char nfaExecMcSheng64_16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, + const u8 *hend, NfaCallback cb, void *context, + struct mq *q, char single, s64a end, + enum MatchMode mode) { + assert(n->type == MCSHENG_64_NFA_16); + const struct mcsheng64 *m = getImplNfa(n); + s64a sp; + + assert(ISALIGNED_N(q->state, 2)); + u32 s = *(u16 *)q->state; + + if (q->report_current) { + assert(s); + assert(get_aux64(m, s)->accept); + + int rv; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + rv = cb(0, q_cur_offset(q), m->arb_report, context); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + rv = doComplexReport64(cb, context, m, s, q_cur_offset(q), 0, + &cached_accept_state, &cached_accept_id); + } + + q->report_current = 0; + + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + sp = q_cur_loc(q); + q->cur++; + + const u8 *cur_buf = sp < 0 ? hend : buffer; + + assert(q->cur); + if (mode != NO_MATCHES && q->items[q->cur - 1].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u16 *)q->state = s; + return MO_ALIVE; + } + + while (1) { + assert(q->cur < q->end); + s64a ep = q->items[q->cur].location; + if (mode != NO_MATCHES) { + ep = MIN(ep, end); + } + + assert(ep >= sp); + + s64a local_ep = ep; + if (sp < 0) { + local_ep = MIN(0, ep); + } + + /* do main buffer region */ + const u8 *final_look; + char rv = mcsheng64Exec16_i_ni(m, &s, cur_buf + sp, local_ep - sp, + offset + sp, cb, context, single, + &final_look, mode); + if (rv == MO_DEAD) { + *(u16 *)q->state = 0; + return MO_DEAD; + } + if (mode == STOP_AT_MATCH && rv == MO_MATCHES_PENDING) { + DEBUG_PRINTF("this is as far as we go\n"); + DEBUG_PRINTF("state %u final_look %zd\n", s, final_look - cur_buf); + + assert(q->cur); + assert(final_look != cur_buf + local_ep); + + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = final_look - cur_buf + 1; /* due to + * early -1 */ + *(u16 *)q->state = s; + return MO_MATCHES_PENDING; + } + + assert(rv == MO_ALIVE); + assert(q->cur); + if (mode != NO_MATCHES && q->items[q->cur].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u16 *)q->state = s; + return MO_ALIVE; + } + + sp = local_ep; + + if (sp == 0) { + cur_buf = buffer; + } + + if (sp != ep) { + continue; + } + + switch (q->items[q->cur].type) { + case MQE_TOP: + assert(sp + offset || !s); + if (sp + offset == 0) { + s = m->start_anchored; + break; + } + s = mcshengEnableStarts64(m, s); + break; + case MQE_END: + *(u16 *)q->state = s; + q->cur++; + return s ? MO_ALIVE : MO_DEAD; + default: + assert(!"invalid queue event"); + } + + q->cur++; + } +} + +static really_inline +char nfaExecMcSheng64_8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, + const u8 *hend, NfaCallback cb, void *context, + struct mq *q, char single, s64a end, + enum MatchMode mode) { + assert(n->type == MCSHENG_64_NFA_8); + const struct mcsheng64 *m = getImplNfa(n); + s64a sp; + + u32 s = *(u8 *)q->state; + + if (q->report_current) { + assert(s); + assert(s >= m->accept_limit_8); + + int rv; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + + rv = cb(0, q_cur_offset(q), m->arb_report, context); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + rv = doComplexReport64(cb, context, m, s, q_cur_offset(q), 0, + &cached_accept_state, &cached_accept_id); + } + + q->report_current = 0; + + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + sp = q_cur_loc(q); + q->cur++; + + const u8 *cur_buf = sp < 0 ? hend : buffer; + + if (mode != NO_MATCHES && q->items[q->cur - 1].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u8 *)q->state = s; + return MO_ALIVE; + } + + while (1) { + DEBUG_PRINTF("%s @ %llu\n", q->items[q->cur].type == MQE_TOP ? "TOP" : + q->items[q->cur].type == MQE_END ? "END" : "???", + q->items[q->cur].location + offset); + assert(q->cur < q->end); + s64a ep = q->items[q->cur].location; + if (mode != NO_MATCHES) { + ep = MIN(ep, end); + } + + assert(ep >= sp); + + s64a local_ep = ep; + if (sp < 0) { + local_ep = MIN(0, ep); + } + + const u8 *final_look; + char rv = mcsheng64Exec8_i_ni(m, &s, cur_buf + sp, local_ep - sp, + offset + sp, cb, context, single, + &final_look, mode); + if (rv == MO_HALT_MATCHING) { + *(u8 *)q->state = 0; + return MO_DEAD; + } + if (mode == STOP_AT_MATCH && rv == MO_MATCHES_PENDING) { + DEBUG_PRINTF("this is as far as we go\n"); + DEBUG_PRINTF("state %u final_look %zd\n", s, final_look - cur_buf); + + assert(q->cur); + assert(final_look != cur_buf + local_ep); + + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = final_look - cur_buf + 1; /* due to + * early -1 */ + *(u8 *)q->state = s; + return MO_MATCHES_PENDING; + } + + assert(rv == MO_ALIVE); + assert(q->cur); + if (mode != NO_MATCHES && q->items[q->cur].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + assert(q->cur); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u8 *)q->state = s; + return MO_ALIVE; + } + + sp = local_ep; + + if (sp == 0) { + cur_buf = buffer; + } + + if (sp != ep) { + continue; + } + + switch (q->items[q->cur].type) { + case MQE_TOP: + assert(sp + offset || !s); + if (sp + offset == 0) { + s = (u8)m->start_anchored; + break; + } + s = mcshengEnableStarts64(m, s); + break; + case MQE_END: + *(u8 *)q->state = s; + q->cur++; + return s ? MO_ALIVE : MO_DEAD; + default: + assert(!"invalid queue event"); + } + + q->cur++; + } +} + +char nfaExecMcSheng64_8_Q(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_64_NFA_8); + const struct mcsheng64 *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcSheng64_8_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, end, + CALLBACK_OUTPUT); +} + +char nfaExecMcSheng64_16_Q(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_64_NFA_16); + const struct mcsheng64 *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcSheng64_16_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, end, + CALLBACK_OUTPUT); +} + +char nfaExecMcSheng64_8_reportCurrent(const struct NFA *n, struct mq *q) { + const struct mcsheng64 *m = getImplNfa(n); + NfaCallback cb = q->cb; + void *ctxt = q->context; + u32 s = *(u8 *)q->state; + u8 single = m->flags & MCSHENG_FLAG_SINGLE; + u64a offset = q_cur_offset(q); + assert(q_cur_type(q) == MQE_START); + assert(s); + + if (s >= m->accept_limit_8) { + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + cb(0, offset, m->arb_report, ctxt); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + doComplexReport64(cb, ctxt, m, s, offset, 0, &cached_accept_state, + &cached_accept_id); + } + } + + return 0; +} + +char nfaExecMcSheng64_16_reportCurrent(const struct NFA *n, struct mq *q) { + const struct mcsheng64 *m = getImplNfa(n); + NfaCallback cb = q->cb; + void *ctxt = q->context; + u32 s = *(u16 *)q->state; + const struct mstate_aux *aux = get_aux64(m, s); + u8 single = m->flags & MCSHENG_FLAG_SINGLE; + u64a offset = q_cur_offset(q); + assert(q_cur_type(q) == MQE_START); + DEBUG_PRINTF("state %u\n", s); + assert(s); + + if (aux->accept) { + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + cb(0, offset, m->arb_report, ctxt); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + doComplexReport64(cb, ctxt, m, s, offset, 0, &cached_accept_state, + &cached_accept_id); + } + } + + return 0; +} + +static +char mcshengHasAccept64(const struct mcsheng64 *m, const struct mstate_aux *aux, + ReportID report) { + assert(m && aux); + + if (!aux->accept) { + return 0; + } + + const struct report_list *rl = (const struct report_list *) + ((const char *)m + aux->accept - sizeof(struct NFA)); + assert(ISALIGNED_N(rl, 4)); + + DEBUG_PRINTF("report list has %u entries\n", rl->count); + + for (u32 i = 0; i < rl->count; i++) { + if (rl->report[i] == report) { + return 1; + } + } + + return 0; +} + +char nfaExecMcSheng64_8_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + assert(n && q); + + const struct mcsheng64 *m = getImplNfa(n); + u8 s = *(u8 *)q->state; + DEBUG_PRINTF("checking accepts for %hhu\n", s); + + return mcshengHasAccept64(m, get_aux64(m, s), report); +} + +char nfaExecMcSheng64_8_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + + const struct mcsheng64 *m = getImplNfa(n); + u8 s = *(u8 *)q->state; + DEBUG_PRINTF("checking accepts for %hhu\n", s); + + return !!get_aux64(m, s)->accept; +} + +char nfaExecMcSheng64_16_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + assert(n && q); + + const struct mcsheng64 *m = getImplNfa(n); + u16 s = *(u16 *)q->state; + DEBUG_PRINTF("checking accepts for %hu\n", s); + + return mcshengHasAccept64(m, get_aux64(m, s), report); +} + +char nfaExecMcSheng64_16_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + + const struct mcsheng64 *m = getImplNfa(n); + u16 s = *(u16 *)q->state; + DEBUG_PRINTF("checking accepts for %hu\n", s); + + return !!get_aux64(m, s)->accept; +} + +char nfaExecMcSheng64_8_Q2(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_64_NFA_8); + const struct mcsheng64 *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcSheng64_8_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, end, + STOP_AT_MATCH); +} + +char nfaExecMcSheng64_16_Q2(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_64_NFA_16); + const struct mcsheng64 *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcSheng64_16_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, end, + STOP_AT_MATCH); +} + +char nfaExecMcSheng64_8_QR(const struct NFA *n, struct mq *q, ReportID report) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_64_NFA_8); + const struct mcsheng64 *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + char rv = nfaExecMcSheng64_8_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, + 0 /* end */, NO_MATCHES); + if (rv && nfaExecMcSheng64_8_inAccept(n, report, q)) { + return MO_MATCHES_PENDING; + } else { + return rv; + } +} + +char nfaExecMcSheng64_16_QR(const struct NFA *n, struct mq *q, ReportID report) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_64_NFA_16); + const struct mcsheng64 *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + char rv = nfaExecMcSheng64_16_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, + 0 /* end */, NO_MATCHES); + + if (rv && nfaExecMcSheng64_16_inAccept(n, report, q)) { + return MO_MATCHES_PENDING; + } else { + return rv; + } +} + +char nfaExecMcSheng64_8_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, UNUSED u8 key) { + const struct mcsheng64 *m = getImplNfa(nfa); + u8 s = offset ? m->start_floating : m->start_anchored; + if (s) { + *(u8 *)state = s; + return 1; + } + return 0; +} + +char nfaExecMcSheng64_16_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, UNUSED u8 key) { + const struct mcsheng64 *m = getImplNfa(nfa); + u16 s = offset ? m->start_floating : m->start_anchored; + if (s) { + unaligned_store_u16(state, s); + return 1; + } + return 0; +} + +char nfaExecMcSheng64_8_testEOD(const struct NFA *nfa, const char *state, + UNUSED const char *streamState, u64a offset, + NfaCallback callback, void *context) { + return mcshengCheckEOD64(nfa, *(const u8 *)state, offset, callback, + context); +} + +char nfaExecMcSheng64_16_testEOD(const struct NFA *nfa, const char *state, + UNUSED const char *streamState, u64a offset, + NfaCallback callback, void *context) { + assert(ISALIGNED_N(state, 2)); + return mcshengCheckEOD64(nfa, *(const u16 *)state, offset, callback, + context); +} + +char nfaExecMcSheng64_8_queueInitState(UNUSED const struct NFA *nfa, struct mq *q) { + assert(nfa->scratchStateSize == 1); + *(u8 *)q->state = 0; + return 0; +} + +char nfaExecMcSheng64_16_queueInitState(UNUSED const struct NFA *nfa, struct mq *q) { + assert(nfa->scratchStateSize == 2); + assert(ISALIGNED_N(q->state, 2)); + *(u16 *)q->state = 0; + return 0; +} + +char nfaExecMcSheng64_8_queueCompressState(UNUSED const struct NFA *nfa, + const struct mq *q, UNUSED s64a loc) { + void *dest = q->streamState; + const void *src = q->state; + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} + +char nfaExecMcSheng64_8_expandState(UNUSED const struct NFA *nfa, void *dest, + const void *src, UNUSED u64a offset, + UNUSED u8 key) { + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} + +char nfaExecMcSheng64_16_queueCompressState(UNUSED const struct NFA *nfa, + const struct mq *q, + UNUSED s64a loc) { + void *dest = q->streamState; + const void *src = q->state; + assert(nfa->scratchStateSize == 2); + assert(nfa->streamStateSize == 2); + assert(ISALIGNED_N(src, 2)); + unaligned_store_u16(dest, *(const u16 *)(src)); + return 0; +} + +char nfaExecMcSheng64_16_expandState(UNUSED const struct NFA *nfa, void *dest, + const void *src, UNUSED u64a offset, + UNUSED u8 key) { + assert(nfa->scratchStateSize == 2); + assert(nfa->streamStateSize == 2); + assert(ISALIGNED_N(dest, 2)); + *(u16 *)dest = unaligned_load_u16(src); + return 0; +} +#endif diff --git a/src/nfa/mcsheng.h b/src/nfa/mcsheng.h index 19fd6961..7cb808b7 100644 --- a/src/nfa/mcsheng.h +++ b/src/nfa/mcsheng.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -80,5 +80,78 @@ char nfaExecMcSheng16_expandState(const struct NFA *nfa, void *dest, #define nfaExecMcSheng16_B_Reverse NFA_API_NO_IMPL #define nfaExecMcSheng16_zombie_status NFA_API_ZOMBIE_NO_IMPL +#if defined(HAVE_AVX512VBMI) +/* 64-8 bit Sheng-McClellan hybrid */ +char nfaExecMcSheng64_8_testEOD(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecMcSheng64_8_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcSheng64_8_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcSheng64_8_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecMcSheng64_8_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecMcSheng64_8_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecMcSheng64_8_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecMcSheng64_8_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecMcSheng64_8_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char nfaExecMcSheng64_8_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecMcSheng64_8_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); + +#define nfaExecMcSheng64_8_B_Reverse NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_zombie_status NFA_API_ZOMBIE_NO_IMPL + +/* 64-16 bit Sheng-McClellan hybrid */ +char nfaExecMcSheng64_16_testEOD(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecMcSheng64_16_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcSheng64_16_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcSheng64_16_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecMcSheng64_16_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecMcSheng64_16_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecMcSheng64_16_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecMcSheng64_16_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecMcSheng64_16_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char nfaExecMcSheng64_16_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecMcSheng64_16_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); +#define nfaExecMcSheng64_16_B_Reverse NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_zombie_status NFA_API_ZOMBIE_NO_IMPL +#else // !HAVE_AVX512VBMI +#define nfaExecMcSheng64_8_B_Reverse NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_zombie_status NFA_API_ZOMBIE_NO_IMPL +#define nfaExecMcSheng64_8_Q NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_Q2 NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_QR NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_inAccept NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_inAnyAccept NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_queueInitState NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_queueCompressState NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_expandState NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_initCompressedState NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_testEOD NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_reportCurrent NFA_API_NO_IMPL + +#define nfaExecMcSheng64_16_B_Reverse NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_zombie_status NFA_API_ZOMBIE_NO_IMPL +#define nfaExecMcSheng64_16_Q NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_Q2 NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_QR NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_inAccept NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_inAnyAccept NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_queueInitState NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_queueCompressState NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_expandState NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_initCompressedState NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_testEOD NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_reportCurrent NFA_API_NO_IMPL + +#endif //end of HAVE_AVX512VBM #endif diff --git a/src/nfa/mcsheng_compile.cpp b/src/nfa/mcsheng_compile.cpp index 5277c54e..3dca0fd8 100644 --- a/src/nfa/mcsheng_compile.cpp +++ b/src/nfa/mcsheng_compile.cpp @@ -64,7 +64,6 @@ #include #include #include - #include using namespace std; @@ -244,6 +243,108 @@ void populateBasicInfo(size_t state_size, const dfa_info &info, } } +#if defined(HAVE_AVX512VBMI) +static +mstate_aux *getAux64(NFA *n, dstate_id_t i) { + mcsheng64 *m = (mcsheng64 *)getMutableImplNfa(n); + mstate_aux *aux_base = (mstate_aux *)((char *)n + m->aux_offset); + + mstate_aux *aux = aux_base + i; + assert((const char *)aux < (const char *)n + m->length); + return aux; +} + +static +void createShuffleMasks64(mcsheng64 *m, const dfa_info &info, + dstate_id_t sheng_end, + const map &accel_escape_info) { + DEBUG_PRINTF("using first %hu states for a sheng\n", sheng_end); + assert(sheng_end > DEAD_STATE + 1); + assert(sheng_end <= sizeof(m512) + 1); + vector> masks; + masks.resize(info.alpha_size); + /* -1 to avoid wasting a slot as we do not include dead state */ + vector raw_ids; + raw_ids.resize(sheng_end - 1); + for (dstate_id_t s = DEAD_STATE + 1; s < info.states.size(); s++) { + assert(info.implId(s)); /* should not map to DEAD_STATE */ + if (info.is_sheng(s)) { + raw_ids[info.extra[s].sheng_id] = s; + } + } + for (u32 i = 0; i < info.alpha_size; i++) { + if (i == info.alpha_remap[TOP]) { + continue; + } + auto &mask = masks[i]; + assert(sizeof(mask) == sizeof(m512)); + mask.fill(0); + + for (dstate_id_t sheng_id = 0; sheng_id < sheng_end - 1; sheng_id++) { + dstate_id_t raw_id = raw_ids[sheng_id]; + dstate_id_t next_id = info.implId(info.states[raw_id].next[i]); + if (next_id == DEAD_STATE) { + next_id = sheng_end - 1; + } else if (next_id < sheng_end) { + next_id--; + } + DEBUG_PRINTF("%hu: %u->next %hu\n", sheng_id, i, next_id); + mask[sheng_id] = verify_u8(next_id); + } + } + for (u32 i = 0; i < N_CHARS; i++) { + assert(info.alpha_remap[i] != info.alpha_remap[TOP]); + memcpy((u8 *)&m->sheng_succ_masks[i], + (u8 *)masks[info.alpha_remap[i]].data(), sizeof(m512)); + } + m->sheng_end = sheng_end; + m->sheng_accel_limit = sheng_end - 1; + + for (dstate_id_t s : raw_ids) { + if (contains(accel_escape_info, s)) { + LIMIT_TO_AT_MOST(&m->sheng_accel_limit, info.extra[s].sheng_id); + } + } +} + +static +void populateBasicInfo64(size_t state_size, const dfa_info &info, + u32 total_size, u32 aux_offset, u32 accel_offset, + u32 accel_count, ReportID arb, bool single, NFA *nfa) { + assert(state_size == sizeof(u16) || state_size == sizeof(u8)); + + nfa->length = total_size; + nfa->nPositions = info.states.size(); + + nfa->scratchStateSize = verify_u32(state_size); + nfa->streamStateSize = verify_u32(state_size); + + if (state_size == sizeof(u8)) { + nfa->type = MCSHENG_64_NFA_8; + } else { + nfa->type = MCSHENG_64_NFA_16; + } + + mcsheng64 *m = (mcsheng64 *)getMutableImplNfa(nfa); + for (u32 i = 0; i < 256; i++) { + m->remap[i] = verify_u8(info.alpha_remap[i]); + } + m->alphaShift = info.getAlphaShift(); + m->length = total_size; + m->aux_offset = aux_offset; + m->accel_offset = accel_offset; + m->arb_report = arb; + m->state_count = verify_u16(info.size()); + m->start_anchored = info.implId(info.raw.start_anchored); + m->start_floating = info.implId(info.raw.start_floating); + m->has_accel = accel_count ? 1 : 0; + + if (single) { + m->flags |= MCSHENG_FLAG_SINGLE; + } +} +#endif + static size_t calcShermanRegionSize(const dfa_info &info) { size_t rv = 0; @@ -272,7 +373,7 @@ void fillInAux(mstate_aux *aux, dstate_id_t i, const dfa_info &info, /* returns false on error */ static bool allocateImplId16(dfa_info &info, dstate_id_t sheng_end, - dstate_id_t *sherman_base) { + dstate_id_t *sherman_base) { info.states[0].impl_id = 0; /* dead is always 0 */ vector norm; @@ -382,6 +483,7 @@ CharReach get_edge_reach(dstate_id_t u, dstate_id_t v, const dfa_info &info) { } #define MAX_SHENG_STATES 16 +#define MAX_SHENG64_STATES 64 #define MAX_SHENG_LEAKINESS 0.05 using LeakinessCache = ue2_unordered_map, double>; @@ -435,7 +537,8 @@ double leakiness(const RdfaGraph &g, dfa_info &info, static dstate_id_t find_sheng_states(dfa_info &info, - map &accel_escape_info) { + map &accel_escape_info, + size_t max_sheng_states) { RdfaGraph g(info.raw); auto cyclics = find_vertices_in_cycles(g); @@ -470,7 +573,7 @@ dstate_id_t find_sheng_states(dfa_info &info, flat_set considered = { DEAD_STATE }; bool seen_back_edge = false; while (!to_consider.empty() - && sheng_states.size() < MAX_SHENG_STATES) { + && sheng_states.size() < max_sheng_states) { auto v = to_consider.front(); to_consider.pop_front(); if (!considered.insert(g[v].index).second) { @@ -616,6 +719,82 @@ void fill_in_succ_table_16(NFA *nfa, const dfa_info &info, } } +#if defined(HAVE_AVX512VBMI) +static +void fill_in_aux_info64(NFA *nfa, const dfa_info &info, + const map &accel_escape_info, + u32 accel_offset, UNUSED u32 accel_end_offset, + const vector &reports, + const vector &reports_eod, + u32 report_base_offset, + const raw_report_info &ri) { + mcsheng64 *m = (mcsheng64 *)getMutableImplNfa(nfa); + + vector reportOffsets; + + ri.fillReportLists(nfa, report_base_offset, reportOffsets); + + for (u32 i = 0; i < info.size(); i++) { + u16 impl_id = info.implId(i); + mstate_aux *this_aux = getAux64(nfa, impl_id); + + fillInAux(this_aux, i, info, reports, reports_eod, reportOffsets); + if (contains(accel_escape_info, i)) { + this_aux->accel_offset = accel_offset; + accel_offset += info.strat.accelSize(); + assert(accel_offset <= accel_end_offset); + assert(ISALIGNED_N(accel_offset, alignof(union AccelAux))); + info.strat.buildAccel(i, accel_escape_info.at(i), + (void *)((char *)m + this_aux->accel_offset)); + } + } +} + +static +u16 get_edge_flags64(NFA *nfa, dstate_id_t target_impl_id) { + mstate_aux *aux = getAux64(nfa, target_impl_id); + u16 flags = 0; + + if (aux->accept) { + flags |= ACCEPT_FLAG; + } + + if (aux->accel_offset) { + flags |= ACCEL_FLAG; + } + + return flags; +} + +static +void fill_in_succ_table_64_16(NFA *nfa, const dfa_info &info, + dstate_id_t sheng_end, + UNUSED dstate_id_t sherman_base) { + u16 *succ_table = (u16 *)((char *)nfa + sizeof(NFA) + sizeof(mcsheng64)); + + u8 alphaShift = info.getAlphaShift(); + assert(alphaShift <= 8); + + for (size_t i = 0; i < info.size(); i++) { + if (!info.is_normal(i)) { + assert(info.implId(i) < sheng_end || info.is_sherman(i)); + continue; + } + + assert(info.implId(i) < sherman_base); + u16 normal_id = verify_u16(info.implId(i) - sheng_end); + + for (size_t s = 0; s < info.impl_alpha_size; s++) { + dstate_id_t raw_succ = info.states[i].next[s]; + u16 &entry = succ_table[((size_t)normal_id << alphaShift) + s]; + + entry = info.implId(raw_succ); + entry |= get_edge_flags64(nfa, entry); + } + } +} +#endif + #define MAX_SHERMAN_LIST_LEN 8 static @@ -934,6 +1113,162 @@ void fill_in_succ_table_8(NFA *nfa, const dfa_info &info, } } +#if defined(HAVE_AVX512VBMI) +static +void fill_in_sherman64(NFA *nfa, dfa_info &info, UNUSED u16 sherman_limit) { + char *nfa_base = (char *)nfa; + mcsheng64 *m = (mcsheng64 *)getMutableImplNfa(nfa); + char *sherman_table = nfa_base + m->sherman_offset; + + assert(ISALIGNED_16(sherman_table)); + for (size_t i = 0; i < info.size(); i++) { + if (!info.is_sherman(i)) { + continue; + } + u16 fs = verify_u16(info.implId(i)); + DEBUG_PRINTF("building sherman %zu impl %hu\n", i, fs); + + assert(fs >= sherman_limit); + + char *curr_sherman_entry + = sherman_table + (fs - m->sherman_limit) * SHERMAN_FIXED_SIZE; + assert(curr_sherman_entry <= nfa_base + m->length); + + u8 len = verify_u8(info.impl_alpha_size - info.extra[i].daddytaken); + assert(len <= 9); + dstate_id_t d = info.states[i].daddy; + + *(u8 *)(curr_sherman_entry + SHERMAN_TYPE_OFFSET) = SHERMAN_STATE; + *(u8 *)(curr_sherman_entry + SHERMAN_LEN_OFFSET) = len; + *(u16 *)(curr_sherman_entry + SHERMAN_DADDY_OFFSET) = info.implId(d); + u8 *chars = (u8 *)(curr_sherman_entry + SHERMAN_CHARS_OFFSET); + + for (u16 s = 0; s < info.impl_alpha_size; s++) { + if (info.states[i].next[s] != info.states[d].next[s]) { + *(chars++) = (u8)s; + } + } + + u16 *states = (u16 *)(curr_sherman_entry + SHERMAN_STATES_OFFSET(len)); + for (u16 s = 0; s < info.impl_alpha_size; s++) { + if (info.states[i].next[s] != info.states[d].next[s]) { + DEBUG_PRINTF("s overrider %hu dad %hu char next %hu\n", fs, + info.implId(d), + info.implId(info.states[i].next[s])); + u16 entry_val = info.implId(info.states[i].next[s]); + entry_val |= get_edge_flags64(nfa, entry_val); + unaligned_store_u16((u8 *)states++, entry_val); + } + } + } +} + +static +bytecode_ptr mcsheng64Compile16(dfa_info&info, dstate_id_t sheng_end, + const map&accel_escape_info, + const Grey &grey) { + DEBUG_PRINTF("building mcsheng 64-16\n"); + + vector reports; /* index in ri for the appropriate report list */ + vector reports_eod; /* as above */ + ReportID arb; + u8 single; + + assert(info.getAlphaShift() <= 8); + + // Sherman optimization + if (info.impl_alpha_size > 16) { + u16 total_daddy = 0; + for (u32 i = 0; i < info.size(); i++) { + find_better_daddy(info, i, + is_cyclic_near(info.raw, info.raw.start_anchored), + grey); + total_daddy += info.extra[i].daddytaken; + } + + DEBUG_PRINTF("daddy %hu/%zu states=%zu alpha=%hu\n", total_daddy, + info.size() * info.impl_alpha_size, info.size(), + info.impl_alpha_size); + } + + u16 sherman_limit; + if (!allocateImplId16(info, sheng_end, &sherman_limit)) { + DEBUG_PRINTF("failed to allocate state numbers, %zu states total\n", + info.size()); + return nullptr; + } + u16 count_real_states = sherman_limit - sheng_end; + + auto ri = info.strat.gatherReports(reports, reports_eod, &single, &arb); + + size_t tran_size = (1 << info.getAlphaShift()) * sizeof(u16) + * count_real_states; + + size_t aux_size = sizeof(mstate_aux) * info.size(); + + size_t aux_offset = ROUNDUP_16(sizeof(NFA) + sizeof(mcsheng64) + tran_size); + size_t accel_size = info.strat.accelSize() * accel_escape_info.size(); + size_t accel_offset = ROUNDUP_N(aux_offset + aux_size + + ri->getReportListSize(), 32); + size_t sherman_offset = ROUNDUP_16(accel_offset + accel_size); + size_t sherman_size = calcShermanRegionSize(info); + + size_t total_size = sherman_offset + sherman_size; + + accel_offset -= sizeof(NFA); /* adj accel offset to be relative to m */ + assert(ISALIGNED_N(accel_offset, alignof(union AccelAux))); + + auto nfa = make_zeroed_bytecode_ptr(total_size); + mcsheng64 *m = (mcsheng64 *)getMutableImplNfa(nfa.get()); + + populateBasicInfo64(sizeof(u16), info, total_size, aux_offset, accel_offset, + accel_escape_info.size(), arb, single, nfa.get()); + createShuffleMasks64(m, info, sheng_end, accel_escape_info); + + /* copy in the mc header information */ + m->sherman_offset = sherman_offset; + m->sherman_end = total_size; + m->sherman_limit = sherman_limit; + + DEBUG_PRINTF("%hu sheng, %hu norm, %zu total\n", sheng_end, + count_real_states, info.size()); + + fill_in_aux_info64(nfa.get(), info, accel_escape_info, accel_offset, + sherman_offset - sizeof(NFA), reports, reports_eod, + aux_offset + aux_size, *ri); + + fill_in_succ_table_64_16(nfa.get(), info, sheng_end, sherman_limit); + + fill_in_sherman64(nfa.get(), info, sherman_limit); + + return nfa; +} + +static +void fill_in_succ_table_64_8(NFA *nfa, const dfa_info &info, + dstate_id_t sheng_end) { + u8 *succ_table = (u8 *)nfa + sizeof(NFA) + sizeof(mcsheng64); + + u8 alphaShift = info.getAlphaShift(); + assert(alphaShift <= 8); + + for (size_t i = 0; i < info.size(); i++) { + assert(!info.is_sherman(i)); + if (!info.is_normal(i)) { + assert(info.implId(i) < sheng_end); + continue; + } + u8 normal_id = verify_u8(info.implId(i) - sheng_end); + + for (size_t s = 0; s < info.impl_alpha_size; s++) { + dstate_id_t raw_succ = info.states[i].next[s]; + succ_table[((size_t)normal_id << alphaShift) + s] + = info.implId(raw_succ); + } + } +} +#endif + static void allocateImplId8(dfa_info &info, dstate_id_t sheng_end, const map &accel_escape_info, @@ -1031,6 +1366,60 @@ bytecode_ptr mcshengCompile8(dfa_info &info, dstate_id_t sheng_end, return nfa; } +#if defined(HAVE_AVX512VBMI) +static +bytecode_ptr mcsheng64Compile8(dfa_info &info, dstate_id_t sheng_end, + const map &accel_escape_info) { + DEBUG_PRINTF("building mcsheng 64-8\n"); + + vector reports; + vector reports_eod; + ReportID arb; + u8 single; + + auto ri = info.strat.gatherReports(reports, reports_eod, &single, &arb); + + size_t normal_count = info.size() - sheng_end; + + size_t tran_size = sizeof(u8) * (1 << info.getAlphaShift()) * normal_count; + size_t aux_size = sizeof(mstate_aux) * info.size(); + size_t aux_offset = ROUNDUP_16(sizeof(NFA) + sizeof(mcsheng64) + tran_size); + size_t accel_size = info.strat.accelSize() * accel_escape_info.size(); + size_t accel_offset = ROUNDUP_N(aux_offset + aux_size + + ri->getReportListSize(), 32); + size_t total_size = accel_offset + accel_size; + + DEBUG_PRINTF("aux_size %zu\n", aux_size); + DEBUG_PRINTF("aux_offset %zu\n", aux_offset); + DEBUG_PRINTF("rl size %u\n", ri->getReportListSize()); + DEBUG_PRINTF("accel_size %zu\n", accel_size); + DEBUG_PRINTF("accel_offset %zu\n", accel_offset); + DEBUG_PRINTF("total_size %zu\n", total_size); + + accel_offset -= sizeof(NFA); /* adj accel offset to be relative to m */ + assert(ISALIGNED_N(accel_offset, alignof(union AccelAux))); + + auto nfa = make_zeroed_bytecode_ptr(total_size); + mcsheng64 *m = (mcsheng64 *)getMutableImplNfa(nfa.get()); + + allocateImplId8(info, sheng_end, accel_escape_info, &m->accel_limit_8, + &m->accept_limit_8); + + populateBasicInfo64(sizeof(u8), info, total_size, aux_offset, accel_offset, + accel_escape_info.size(), arb, single, nfa.get()); + createShuffleMasks64(m, info, sheng_end, accel_escape_info); + + fill_in_aux_info64(nfa.get(), info, accel_escape_info, accel_offset, + total_size - sizeof(NFA), reports, reports_eod, + aux_offset + aux_size, *ri); + + fill_in_succ_table_64_8(nfa.get(), info, sheng_end); + DEBUG_PRINTF("rl size %zu\n", ri->size()); + + return nfa; +} +#endif + bytecode_ptr mcshengCompile(raw_dfa &raw, const CompileContext &cc, const ReportManager &rm) { if (!cc.grey.allowMcSheng) { @@ -1050,19 +1439,79 @@ bytecode_ptr mcshengCompile(raw_dfa &raw, const CompileContext &cc, map accel_escape_info = info.strat.getAccelInfo(cc.grey); + auto old_states = info.states; + dstate_id_t sheng_end = find_sheng_states(info, accel_escape_info, MAX_SHENG_STATES); - dstate_id_t sheng_end = find_sheng_states(info, accel_escape_info); if (sheng_end <= DEAD_STATE + 1) { + info.states = old_states; return nullptr; } bytecode_ptr nfa; + if (!using8bit) { nfa = mcshengCompile16(info, sheng_end, accel_escape_info, cc.grey); } else { nfa = mcshengCompile8(info, sheng_end, accel_escape_info); } + if (!nfa) { + info.states = old_states; + return nfa; + } + + if (has_eod_reports) { + nfa->flags |= NFA_ACCEPTS_EOD; + } + + DEBUG_PRINTF("compile done\n"); + return nfa; +} + +#if defined(HAVE_AVX512VBMI) +bytecode_ptr mcshengCompile64(raw_dfa &raw, const CompileContext &cc, + const ReportManager &rm) { + if (!cc.grey.allowMcSheng) { + return nullptr; + } + + mcclellan_build_strat mbs(raw, rm, false); + dfa_info info(mbs); + bool using8bit = cc.grey.allowMcClellan8 && info.size() <= 256; + + if (!cc.streaming) { /* TODO: work out if we can do the strip in streaming + * mode with our semantics */ + raw.stripExtraEodReports(); + } + + bool has_eod_reports = raw.hasEodReports(); + + map accel_escape_info + = info.strat.getAccelInfo(cc.grey); + bool using64state = false; /*default flag*/ + dstate_id_t sheng_end64; + sheng_end64 = find_sheng_states(info, accel_escape_info, MAX_SHENG64_STATES); + + if (sheng_end64 <= DEAD_STATE + 1) { + return nullptr; + } else { + using64state = true; + } + + bytecode_ptr nfa; + + if (using64state) { + assert((sheng_end64 > 17) && (sheng_end64 <= 65)); + if (!using8bit) { + nfa = mcsheng64Compile16(info, sheng_end64, accel_escape_info, cc.grey); + } else { + assert(using8bit); + nfa = mcsheng64Compile8(info, sheng_end64, accel_escape_info); + assert(nfa); + assert(nfa->type == MCSHENG_64_NFA_8); + } + } + if (!nfa) { return nfa; } @@ -1074,6 +1523,7 @@ bytecode_ptr mcshengCompile(raw_dfa &raw, const CompileContext &cc, DEBUG_PRINTF("compile done\n"); return nfa; } +#endif bool has_accel_mcsheng(const NFA *) { return true; /* consider the sheng region as accelerated */ diff --git a/src/nfa/mcsheng_compile.h b/src/nfa/mcsheng_compile.h index 487ab45f..3a79b46a 100644 --- a/src/nfa/mcsheng_compile.h +++ b/src/nfa/mcsheng_compile.h @@ -42,7 +42,8 @@ struct raw_dfa; bytecode_ptr mcshengCompile(raw_dfa &raw, const CompileContext &cc, const ReportManager &rm); - +bytecode_ptr mcshengCompile64(raw_dfa &raw, const CompileContext &cc, + const ReportManager &rm); bool has_accel_mcsheng(const NFA *nfa); } // namespace ue2 diff --git a/src/nfa/mcsheng_data.c b/src/nfa/mcsheng_data.c index eaf3cbbb..64aafcbf 100644 --- a/src/nfa/mcsheng_data.c +++ b/src/nfa/mcsheng_data.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -41,3 +41,16 @@ const u64a mcsheng_pext_mask[8] = { 0x00ff00000000000f, 0xff0000000000000f, }; +#if defined(HAVE_AVX512VBMI) +const u64a mcsheng64_pext_mask[8] = { + 0, /* dummy */ + 0x000000000000ff3f, + 0x0000000000ff003f, + 0x00000000ff00003f, + 0x000000ff0000003f, + 0x0000ff000000003f, + 0x00ff00000000003f, + 0xff0000000000003f, +}; +#endif + diff --git a/src/nfa/mcsheng_dump.cpp b/src/nfa/mcsheng_dump.cpp index 2b563079..1659987c 100644 --- a/src/nfa/mcsheng_dump.cpp +++ b/src/nfa/mcsheng_dump.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2017, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -174,6 +174,126 @@ void describeEdge(FILE *f, const mcsheng *m, const u16 *t, u16 i) { } } +#if defined(HAVE_AVX512VBMI) +static +const mstate_aux *getAux64(const NFA *n, dstate_id_t i) { + auto *m = (const mcsheng64 *)getImplNfa(n); + auto *aux_base = (const mstate_aux *)((const char *)n + m->aux_offset); + + const mstate_aux *aux = aux_base + i; + + assert((const char *)aux < (const char *)n + m->length); + return aux; +} + +static +void next_states64(const NFA *n, u16 s, u16 *t) { + const mcsheng64 *m = (const mcsheng64 *)getImplNfa(n); + const mstate_aux *aux = getAux64(n, s); + const u32 as = m->alphaShift; + assert(s != DEAD_STATE); + + if (s < m->sheng_end) { + for (u16 c = 0; c < N_CHARS; c++) { + u8 sheng_s = s - 1; + auto trans_for_c = (const char *)&m->sheng_succ_masks[c]; + assert(sheng_s < sizeof(m512)); + u8 raw_succ = trans_for_c[sheng_s]; + if (raw_succ == m->sheng_end - 1) { + t[c] = DEAD_STATE; + } else if (raw_succ < m->sheng_end) { + t[c] = raw_succ + 1; + } else { + t[c] = raw_succ; + } + } + } else if (n->type == MCSHENG_64_NFA_8) { + const u8 *succ_table = (const u8 *)((const char *)m + sizeof(mcsheng64)); + for (u16 c = 0; c < N_CHARS; c++) { + u32 normal_id = s - m->sheng_end; + t[c] = succ_table[(normal_id << as) + m->remap[c]]; + } + } else { + u16 base_s = s; + const char *winfo_base = (const char *)n + m->sherman_offset; + const char *state_base + = winfo_base + SHERMAN_FIXED_SIZE * (s - m->sherman_limit); + + if (s >= m->sherman_limit) { + base_s = unaligned_load_u16(state_base + SHERMAN_DADDY_OFFSET); + assert(base_s >= m->sheng_end); + } + + const u16 *succ_table = (const u16 *)((const char *)m + + sizeof(mcsheng64)); + for (u16 c = 0; c < N_CHARS; c++) { + u32 normal_id = base_s - m->sheng_end; + t[c] = succ_table[(normal_id << as) + m->remap[c]]; + } + + if (s >= m->sherman_limit) { + UNUSED char type = *(state_base + SHERMAN_TYPE_OFFSET); + assert(type == SHERMAN_STATE); + u8 len = *(const u8 *)(SHERMAN_LEN_OFFSET + state_base); + const char *chars = state_base + SHERMAN_CHARS_OFFSET; + const u16 *states = (const u16 *)(state_base + + SHERMAN_STATES_OFFSET(len)); + + for (u8 i = 0; i < len; i++) { + for (u16 c = 0; c < N_CHARS; c++) { + if (m->remap[c] == chars[i]) { + t[c] = unaligned_load_u16((const u8*)&states[i]); + } + } + } + } + + for (u16 c = 0; c < N_CHARS; c++) { + t[c] &= STATE_MASK; + } + + } + + t[TOP] = aux->top & STATE_MASK; +} + +static +void describeEdge64(FILE *f, const mcsheng64 *m, const u16 *t, u16 i) { + for (u16 s = 0; s < N_CHARS; s++) { + if (!t[s]) { + continue; + } + + u16 ss; + for (ss = 0; ss < s; ss++) { + if (t[s] == t[ss]) { + break; + } + } + + if (ss != s) { + continue; + } + + CharReach reach; + for (ss = s; ss < 256; ss++) { + if (t[s] == t[ss]) { + reach.set(ss); + } + } + + fprintf(f, "%u -> %u [ ", i, t[s]); + if (i < m->sheng_end && t[s] < m->sheng_end) { + fprintf(f, "color = red, fontcolor = red "); + } + fprintf(f, "label = \""); + describeClass(f, reach, 5, CC_OUT_DOT); + + fprintf(f, "\" ];\n"); + } +} +#endif + static void dumpAccelDot(FILE *f, u16 i, const union AccelAux *accel) { switch(accel->accel_type) { @@ -256,6 +376,68 @@ void describeNode(const NFA *n, const mcsheng *m, u16 i, FILE *f) { } +#if defined(HAVE_AVX512VBMI) +static +void describeNode64(const NFA *n, const mcsheng64 *m, u16 i, FILE *f) { + const mstate_aux *aux = getAux64(n, i); + + bool isSherman = m->sherman_limit && i >= m->sherman_limit; + + fprintf(f, "%u [ width = 1, fixedsize = true, fontsize = 12, " + "label = \"%u%s\" ]; \n", i, i, isSherman ? "w":""); + + if (aux->accel_offset) { + dumpAccelDot(f, i, (const union AccelAux *) + ((const char *)m + aux->accel_offset)); + } + + if (i && i < m->sheng_end) { + fprintf(f, "%u [color = red, fontcolor = red]; \n", i); + } + + if (aux->accept_eod) { + fprintf(f, "%u [ color = darkorchid ];\n", i); + } + + if (aux->accept) { + fprintf(f, "%u [ shape = doublecircle ];\n", i); + } + + if (aux->top && aux->top != i) { + fprintf(f, "%u -> %u [color = darkgoldenrod weight=0.1 ]\n", i, + aux->top); + } + + if (i == m->start_anchored) { + fprintf(f, "STARTA -> %u [color = blue ]\n", i); + } + + if (i == m->start_floating) { + fprintf(f, "STARTF -> %u [color = red ]\n", i); + } + + if (isSherman) { + const char *winfo_base = (const char *)n + m->sherman_offset; + const char *state_base + = winfo_base + SHERMAN_FIXED_SIZE * (i - m->sherman_limit); + assert(state_base < (const char *)m + m->length - sizeof(NFA)); + UNUSED u8 type = *(const u8 *)(state_base + SHERMAN_TYPE_OFFSET); + assert(type == SHERMAN_STATE); + fprintf(f, "%u [ fillcolor = lightblue style=filled ];\n", i); + u16 daddy = *(const u16 *)(state_base + SHERMAN_DADDY_OFFSET); + if (daddy) { + fprintf(f, "%u -> %u [ color=royalblue style=dashed weight=0.1]\n", + i, daddy); + } + } + + if (i && i < m->sheng_end) { + fprintf(f, "subgraph cluster_sheng { %u } \n", i); + } + +} +#endif + static void dumpDotPreambleDfa(FILE *f) { dumpDotPreamble(f); @@ -392,6 +574,133 @@ void dump_text_8(const NFA *nfa, FILE *f) { dumpTextReverse(nfa, f); } +#if defined(HAVE_AVX512VBMI) +static +void dump64_dot_16(const NFA *nfa, FILE *f) { + auto *m = (const mcsheng64 *)getImplNfa(nfa); + + dumpDotPreambleDfa(f); + + for (u16 i = 1; i < m->state_count; i++) { + describeNode64(nfa, m, i, f); + + u16 t[ALPHABET_SIZE]; + + next_states64(nfa, i, t); + + describeEdge64(f, m, t, i); + } + + fprintf(f, "}\n"); +} + +static +void dump64_dot_8(const NFA *nfa, FILE *f) { + auto m = (const mcsheng64 *)getImplNfa(nfa); + + dumpDotPreambleDfa(f); + + for (u16 i = 1; i < m->state_count; i++) { + describeNode64(nfa, m, i, f); + + u16 t[ALPHABET_SIZE]; + + next_states64(nfa, i, t); + + describeEdge64(f, m, t, i); + } + + fprintf(f, "}\n"); +} + +static +void dumpAccelMasks64(FILE *f, const mcsheng64 *m, const mstate_aux *aux) { + fprintf(f, "\n"); + fprintf(f, "Acceleration\n"); + fprintf(f, "------------\n"); + + for (u16 i = 0; i < m->state_count; i++) { + if (!aux[i].accel_offset) { + continue; + } + + auto accel = (const AccelAux *)((const char *)m + aux[i].accel_offset); + fprintf(f, "%05hu ", i); + dumpAccelInfo(f, *accel); + } +} + +static +void describeAlphabet64(FILE *f, const mcsheng64 *m) { + map rev; + + for (u16 i = 0; i < N_CHARS; i++) { + rev[m->remap[i]].clear(); + } + + for (u16 i = 0; i < N_CHARS; i++) { + rev[m->remap[i]].set(i); + } + + map::const_iterator it; + fprintf(f, "\nAlphabet\n"); + for (it = rev.begin(); it != rev.end(); ++it) { + fprintf(f, "%3hhu: ", it->first); + describeClass(f, it->second, 10240, CC_OUT_TEXT); + fprintf(f, "\n"); + } + fprintf(f, "\n"); +} + +static +void dumpCommonHeader64(FILE *f, const mcsheng64 *m) { + fprintf(f, "report: %u, states: %u, length: %u\n", m->arb_report, + m->state_count, m->length); + fprintf(f, "astart: %hu, fstart: %hu\n", m->start_anchored, + m->start_floating); + fprintf(f, "single accept: %d, has_accel: %d\n", + !!(int)m->flags & MCSHENG_FLAG_SINGLE, m->has_accel); + fprintf(f, "sheng_end: %hu\n", m->sheng_end); + fprintf(f, "sheng_accel_limit: %hu\n", m->sheng_accel_limit); +} + +static +void dump64_text_8(const NFA *nfa, FILE *f) { + auto m = (const mcsheng64 *)getImplNfa(nfa); + auto aux = (const mstate_aux *)((const char *)nfa + m->aux_offset); + + fprintf(f, "mcsheng 64-8\n"); + dumpCommonHeader64(f, m); + fprintf(f, "accel_limit: %hu, accept_limit %hu\n", m->accel_limit_8, + m->accept_limit_8); + fprintf(f, "\n"); + + describeAlphabet64(f, m); + dumpAccelMasks64(f, m, aux); + + fprintf(f, "\n"); + dumpTextReverse(nfa, f); +} + +static +void dump64_text_16(const NFA *nfa, FILE *f) { + auto *m = (const mcsheng64 *)getImplNfa(nfa); + auto *aux = (const mstate_aux *)((const char *)nfa + m->aux_offset); + + fprintf(f, "mcsheng 64-16\n"); + dumpCommonHeader64(f, m); + fprintf(f, "sherman_limit: %d, sherman_end: %d\n", (int)m->sherman_limit, + (int)m->sherman_end); + fprintf(f, "\n"); + + describeAlphabet64(f, m); + dumpAccelMasks64(f, m, aux); + + fprintf(f, "\n"); + dumpTextReverse(nfa, f); +} +#endif + void nfaExecMcSheng16_dump(const NFA *nfa, const string &base) { assert(nfa->type == MCSHENG_NFA_16); dump_text_16(nfa, StdioFile(base + ".txt", "w")); @@ -404,4 +713,20 @@ void nfaExecMcSheng8_dump(const NFA *nfa, const string &base) { dump_dot_8(nfa, StdioFile(base + ".dot", "w")); } +void nfaExecMcSheng64_16_dump(UNUSED const NFA *nfa, UNUSED const string &base) { +#if defined(HAVE_AVX512VBMI) + assert(nfa->type == MCSHENG_64_NFA_16); + dump64_text_16(nfa, StdioFile(base + ".txt", "w")); + dump64_dot_16(nfa, StdioFile(base + ".dot", "w")); +#endif +} + +void nfaExecMcSheng64_8_dump(UNUSED const NFA *nfa, UNUSED const string &base) { +#if defined(HAVE_AVX512VBMI) + assert(nfa->type == MCSHENG_64_NFA_8); + dump64_text_8(nfa, StdioFile(base + ".txt", "w")); + dump64_dot_8(nfa, StdioFile(base + ".dot", "w")); +#endif +} + } // namespace ue2 diff --git a/src/nfa/mcsheng_dump.h b/src/nfa/mcsheng_dump.h index 1b699367..26e6cfda 100644 --- a/src/nfa/mcsheng_dump.h +++ b/src/nfa/mcsheng_dump.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -42,7 +42,8 @@ namespace ue2 { void nfaExecMcSheng8_dump(const struct NFA *nfa, const std::string &base); void nfaExecMcSheng16_dump(const struct NFA *nfa, const std::string &base); - +void nfaExecMcSheng64_8_dump(const struct NFA *nfa, const std::string &base); +void nfaExecMcSheng64_16_dump(const struct NFA *nfa, const std::string &base); } // namespace ue2 #endif // DUMP_SUPPORT diff --git a/src/nfa/mcsheng_internal.h b/src/nfa/mcsheng_internal.h index bb45ae23..c8b28c13 100644 --- a/src/nfa/mcsheng_internal.h +++ b/src/nfa/mcsheng_internal.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2018, Intel Corporation + * Copyright (c) 2016-2020, Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -92,4 +92,35 @@ struct mcsheng { * representing the data from a u64a. */ extern const u64a mcsheng_pext_mask[8]; +#if defined(HAVE_AVX512VBMI) +struct mcsheng64 { + u16 state_count; /**< total number of states */ + u32 length; /**< length of dfa in bytes */ + u16 start_anchored; /**< anchored start state */ + u16 start_floating; /**< floating start state */ + u32 aux_offset; /**< offset of the aux structures relative to the start of + * the nfa structure */ + u32 sherman_offset; /**< offset of array of sherman state offsets the + * state_info structures relative to the start of the + * nfa structure */ + u32 sherman_end; /**< offset of the end of the state_info structures + * relative to the start of the nfa structure */ + u16 sheng_end; /**< first non-sheng state */ + u16 sheng_accel_limit; /**< first sheng accel state. state given in terms of + * internal sheng ids */ + u16 accel_limit_8; /**< 8 bit, lowest accelerable state */ + u16 accept_limit_8; /**< 8 bit, lowest accept state */ + u16 sherman_limit; /**< lowest sherman state */ + u8 alphaShift; + u8 flags; + u8 has_accel; /**< 1 iff there are any accel plans */ + u8 remap[256]; /**< remaps characters to a smaller alphabet */ + ReportID arb_report; /**< one of the accepts that this dfa may raise */ + u32 accel_offset; /**< offset of accel structures from start of McClellan */ + m512 sheng_succ_masks[N_CHARS]; +}; + +extern const u64a mcsheng64_pext_mask[8]; +#endif + #endif diff --git a/src/nfa/nfa_api_dispatch.c b/src/nfa/nfa_api_dispatch.c index 4b45cf06..75cac4b4 100644 --- a/src/nfa/nfa_api_dispatch.c +++ b/src/nfa/nfa_api_dispatch.c @@ -78,6 +78,8 @@ DISPATCH_CASE(MCSHENG_NFA_16, McSheng16, dbnt_func); \ DISPATCH_CASE(SHENG_NFA_32, Sheng32, dbnt_func); \ DISPATCH_CASE(SHENG_NFA_64, Sheng64, dbnt_func); \ + DISPATCH_CASE(MCSHENG_64_NFA_8, McSheng64_8, dbnt_func); \ + DISPATCH_CASE(MCSHENG_64_NFA_16, McSheng64_16, dbnt_func); \ default: \ assert(0); \ } diff --git a/src/nfa/nfa_build_util.cpp b/src/nfa/nfa_build_util.cpp index df789d7d..fbe13fb5 100644 --- a/src/nfa/nfa_build_util.cpp +++ b/src/nfa/nfa_build_util.cpp @@ -478,6 +478,37 @@ const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = d const char *NFATraits::name = "Sheng 64"; #endif +template<> struct NFATraits { + UNUSED static const char *name; + static const NFACategory category = NFA_OTHER; + static const u32 stateAlign = 1; + static const bool fast = true; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; +}; +const nfa_dispatch_fn NFATraits::has_accel = has_accel_mcsheng; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; +#if defined(DUMP_SUPPORT) +const char *NFATraits::name = "Shengy64 McShengFace 8"; +#endif + +template<> struct NFATraits { + UNUSED static const char *name; + static const NFACategory category = NFA_OTHER; + static const u32 stateAlign = 2; + static const bool fast = true; + static const nfa_dispatch_fn has_accel; + static const nfa_dispatch_fn has_repeats; + static const nfa_dispatch_fn has_repeats_other_than_firsts; +}; +const nfa_dispatch_fn NFATraits::has_accel = has_accel_mcsheng; +const nfa_dispatch_fn NFATraits::has_repeats = dispatch_false; +const nfa_dispatch_fn NFATraits::has_repeats_other_than_firsts = dispatch_false; +#if defined(DUMP_SUPPORT) +const char *NFATraits::name = "Shengy64 McShengFace 16"; +#endif } // namespace #if defined(DUMP_SUPPORT) diff --git a/src/nfa/nfa_dump_dispatch.cpp b/src/nfa/nfa_dump_dispatch.cpp index 09137ccd..bc8c175d 100644 --- a/src/nfa/nfa_dump_dispatch.cpp +++ b/src/nfa/nfa_dump_dispatch.cpp @@ -83,6 +83,8 @@ namespace ue2 { DISPATCH_CASE(MCSHENG_NFA_16, McSheng16, dbnt_func); \ DISPATCH_CASE(SHENG_NFA_32, Sheng32, dbnt_func); \ DISPATCH_CASE(SHENG_NFA_64, Sheng64, dbnt_func); \ + DISPATCH_CASE(MCSHENG_64_NFA_8, McSheng64_8, dbnt_func); \ + DISPATCH_CASE(MCSHENG_64_NFA_16, McSheng64_16, dbnt_func); \ default: \ assert(0); \ } diff --git a/src/nfa/nfa_internal.h b/src/nfa/nfa_internal.h index de43c0b5..864ea900 100644 --- a/src/nfa/nfa_internal.h +++ b/src/nfa/nfa_internal.h @@ -74,6 +74,8 @@ enum NFAEngineType { MCSHENG_NFA_16, /**< magic pseudo nfa */ SHENG_NFA_32, /**< magic pseudo nfa */ SHENG_NFA_64, /**< magic pseudo nfa */ + MCSHENG_64_NFA_8, /**< magic pseudo nfa */ + MCSHENG_64_NFA_16, /**< magic pseudo nfa */ /** \brief bogus NFA - not used */ INVALID_NFA }; @@ -150,7 +152,12 @@ static really_inline int isMcClellanType(u8 t) { /** \brief True if the given type (from NFA::type) is a Sheng-McClellan hybrid * DFA. */ static really_inline int isShengMcClellanType(u8 t) { +#if defined(HAVE_AVX512VBMI) + return t == MCSHENG_64_NFA_8 || t == MCSHENG_64_NFA_16 || t == MCSHENG_NFA_8 || + t == MCSHENG_NFA_16; +#else return t == MCSHENG_NFA_8 || t == MCSHENG_NFA_16; +#endif } /** \brief True if the given type (from NFA::type) is a Gough DFA. */ diff --git a/src/rose/rose_build_bytecode.cpp b/src/rose/rose_build_bytecode.cpp index abd5281d..3b51daa2 100644 --- a/src/rose/rose_build_bytecode.cpp +++ b/src/rose/rose_build_bytecode.cpp @@ -632,6 +632,7 @@ bytecode_ptr getDfa(raw_dfa &rdfa, bool is_transient, * bytecode and that they are usually run on small blocks */ dfa = mcshengCompile(rdfa, cc, rm); } + #if defined(HAVE_AVX512VBMI) if (!dfa) { dfa = sheng32Compile(rdfa, cc, rm, false); @@ -639,6 +640,9 @@ bytecode_ptr getDfa(raw_dfa &rdfa, bool is_transient, if (!dfa) { dfa = sheng64Compile(rdfa, cc, rm, false); } + if (!dfa && !is_transient) { + dfa = mcshengCompile64(rdfa, cc, rm); + } #endif if (!dfa) { // Sheng wasn't successful, so unleash McClellan! diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h index 481eb933..df171345 100644 --- a/src/util/simd_utils.h +++ b/src/util/simd_utils.h @@ -138,6 +138,12 @@ m128 lshift64_m128(m128 a, unsigned b) { #define eq128(a, b) _mm_cmpeq_epi8((a), (b)) #define movemask128(a) ((u32)_mm_movemask_epi8((a))) +#if defined(HAVE_AVX512) +static really_inline m128 cast512to128(const m512 in) { + return _mm512_castsi512_si128(in); +} +#endif + static really_inline m128 set16x8(u8 c) { return _mm_set1_epi8(c); } @@ -156,6 +162,12 @@ static really_inline u32 movd512(const m512 in) { // so we use 2-step convertions to work around. return _mm_cvtsi128_si32(_mm512_castsi512_si128(in)); } + +static really_inline u64a movq512(const m512 in) { + // NOTE: seems AVX512 doesn't support _mm512_cvtsi512_si64(in), + // so we use 2-step convertions to work around. + return _mm_cvtsi128_si64(_mm512_castsi512_si128(in)); +} #endif static really_inline u64a movq(const m128 in) { @@ -1000,6 +1012,11 @@ m512 set8x64(u64a a) { return _mm512_set1_epi64(a); } +static really_inline +m512 set16x32(u32 a) { + return _mm512_set1_epi32(a); +} + static really_inline m512 set512_64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0, u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) { @@ -1017,6 +1034,26 @@ static really_inline m512 set4x128(m128 a) { return _mm512_broadcast_i32x4(a); } + +static really_inline +m512 sadd_u8_m512(m512 a, m512 b) { + return _mm512_adds_epu8(a, b); +} + +static really_inline +m512 max_u8_m512(m512 a, m512 b) { + return _mm512_max_epu8(a, b); +} + +static really_inline +m512 min_u8_m512(m512 a, m512 b) { + return _mm512_min_epu8(a, b); +} + +static really_inline +m512 sub_u8_m512(m512 a, m512 b) { + return _mm512_sub_epi8(a, b); +} #endif static really_inline