diff --git a/src/nfa/sheng.c b/src/nfa/sheng.c index 3f36e218..922e8f80 100644 --- a/src/nfa/sheng.c +++ b/src/nfa/sheng.c @@ -154,7 +154,7 @@ char fireReports(const struct sheng *sh, NfaCallback cb, void *ctxt, return MO_CONTINUE_MATCHING; /* continue execution */ } -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) // Sheng32 static really_inline const struct sheng32 *get_sheng32(const struct NFA *n) { @@ -351,7 +351,7 @@ char fireReports64(const struct sheng64 *sh, NfaCallback cb, void *ctxt, } return MO_CONTINUE_MATCHING; /* continue execution */ } -#endif // end of HAVE_AVX512VBMI +#endif // end of HAVE_AVX512VBMI || HAVE_SVE /* include Sheng function definitions */ #include "sheng_defs.h" @@ -871,7 +871,7 @@ char nfaExecSheng_expandState(UNUSED const struct NFA *nfa, void *dest, return 0; } -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) // Sheng32 static really_inline char runSheng32Cb(const struct sheng32 *sh, NfaCallback cb, void *ctxt, @@ -1874,4 +1874,4 @@ char nfaExecSheng64_expandState(UNUSED const struct NFA *nfa, void *dest, *(u8 *)dest = *(const u8 *)src; return 0; } -#endif // end of HAVE_AVX512VBMI +#endif // end of HAVE_AVX512VBMI || HAVE_SVE diff --git a/src/nfa/sheng.h b/src/nfa/sheng.h index 7b90e303..212bd3a4 100644 --- a/src/nfa/sheng.h +++ b/src/nfa/sheng.h @@ -58,7 +58,7 @@ char nfaExecSheng_reportCurrent(const struct NFA *n, struct mq *q); char nfaExecSheng_B(const struct NFA *n, u64a offset, const u8 *buffer, size_t length, NfaCallback cb, void *context); -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define nfaExecSheng32_B_Reverse NFA_API_NO_IMPL #define nfaExecSheng32_zombie_status NFA_API_ZOMBIE_NO_IMPL @@ -106,8 +106,7 @@ char nfaExecSheng64_reportCurrent(const struct NFA *n, struct mq *q); char nfaExecSheng64_B(const struct NFA *n, u64a offset, const u8 *buffer, size_t length, NfaCallback cb, void *context); - -#else // 
!HAVE_AVX512VBMI +#else // !HAVE_AVX512VBMI && !HAVE_SVE #define nfaExecSheng32_B_Reverse NFA_API_NO_IMPL #define nfaExecSheng32_zombie_status NFA_API_ZOMBIE_NO_IMPL @@ -138,6 +137,7 @@ char nfaExecSheng64_B(const struct NFA *n, u64a offset, const u8 *buffer, #define nfaExecSheng64_testEOD NFA_API_NO_IMPL #define nfaExecSheng64_reportCurrent NFA_API_NO_IMPL #define nfaExecSheng64_B NFA_API_NO_IMPL -#endif // end of HAVE_AVX512VBMI +#endif // end of HAVE_AVX512VBMI || HAVE_SVE + #endif /* SHENG_H_ */ diff --git a/src/nfa/sheng_defs.h b/src/nfa/sheng_defs.h index 390af752..886af28e 100644 --- a/src/nfa/sheng_defs.h +++ b/src/nfa/sheng_defs.h @@ -52,7 +52,7 @@ u8 hasInterestingStates(const u8 a, const u8 b, const u8 c, const u8 d) { return (a | b | c | d) & (SHENG_STATE_FLAG_MASK); } -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) static really_inline u8 isDeadState32(const u8 a) { return a & SHENG32_STATE_DEAD; @@ -108,7 +108,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define SHENG_IMPL sheng_cod #define DEAD_FUNC isDeadState #define ACCEPT_FUNC isAcceptState -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_cod #define DEAD_FUNC32 isDeadState32 #define ACCEPT_FUNC32 isAcceptState32 @@ -121,7 +121,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef SHENG_IMPL #undef DEAD_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef DEAD_FUNC32 #undef ACCEPT_FUNC32 @@ -135,7 +135,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define SHENG_IMPL sheng_co #define DEAD_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_co #define DEAD_FUNC32 dummyFunc #define ACCEPT_FUNC32 isAcceptState32 @@ -148,7 +148,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef SHENG_IMPL #undef DEAD_FUNC #undef ACCEPT_FUNC -#if 
defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef DEAD_FUNC32 #undef ACCEPT_FUNC32 @@ -162,7 +162,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define SHENG_IMPL sheng_samd #define DEAD_FUNC isDeadState #define ACCEPT_FUNC isAcceptState -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_samd #define DEAD_FUNC32 isDeadState32 #define ACCEPT_FUNC32 isAcceptState32 @@ -175,7 +175,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef SHENG_IMPL #undef DEAD_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef DEAD_FUNC32 #undef ACCEPT_FUNC32 @@ -189,7 +189,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define SHENG_IMPL sheng_sam #define DEAD_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_sam #define DEAD_FUNC32 dummyFunc #define ACCEPT_FUNC32 isAcceptState32 @@ -202,7 +202,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef SHENG_IMPL #undef DEAD_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef DEAD_FUNC32 #undef ACCEPT_FUNC32 @@ -216,7 +216,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define SHENG_IMPL sheng_nmd #define DEAD_FUNC isDeadState #define ACCEPT_FUNC dummyFunc -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_nmd #define DEAD_FUNC32 isDeadState32 #define ACCEPT_FUNC32 dummyFunc @@ -229,7 +229,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef SHENG_IMPL #undef DEAD_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef DEAD_FUNC32 #undef ACCEPT_FUNC32 @@ -243,7 +243,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define SHENG_IMPL sheng_nm #define DEAD_FUNC dummyFunc #define ACCEPT_FUNC 
dummyFunc -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_nm #define DEAD_FUNC32 dummyFunc #define ACCEPT_FUNC32 dummyFunc @@ -256,7 +256,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef SHENG_IMPL #undef DEAD_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef DEAD_FUNC32 #undef ACCEPT_FUNC32 @@ -277,7 +277,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC isAccelState #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_4_coda #define INTERESTING_FUNC32 hasInterestingStates32 #define INNER_DEAD_FUNC32 isDeadState32 @@ -296,7 +296,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef INTERESTING_FUNC32 #undef INNER_DEAD_FUNC32 @@ -316,7 +316,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_4_cod #define INTERESTING_FUNC32 hasInterestingStates32 #define INNER_DEAD_FUNC32 isDeadState32 @@ -339,7 +339,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef INTERESTING_FUNC32 #undef INNER_DEAD_FUNC32 @@ -363,7 +363,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC isAccelState #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_4_coa #define 
INTERESTING_FUNC32 hasInterestingStates32 #define INNER_DEAD_FUNC32 dummyFunc @@ -382,7 +382,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef INTERESTING_FUNC32 #undef INNER_DEAD_FUNC32 @@ -402,7 +402,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_4_co #define INTERESTING_FUNC32 hasInterestingStates32 #define INNER_DEAD_FUNC32 dummyFunc @@ -425,7 +425,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef INTERESTING_FUNC32 #undef INNER_DEAD_FUNC32 @@ -449,7 +449,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC isAccelState #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_4_samda #define INTERESTING_FUNC32 hasInterestingStates32 #define INNER_DEAD_FUNC32 isDeadState32 @@ -468,7 +468,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef INTERESTING_FUNC32 #undef INNER_DEAD_FUNC32 @@ -488,7 +488,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_4_samd #define INTERESTING_FUNC32 hasInterestingStates32 #define INNER_DEAD_FUNC32 isDeadState32 @@ -511,7 
+511,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef INTERESTING_FUNC32 #undef INNER_DEAD_FUNC32 @@ -535,7 +535,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC isAccelState #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_4_sama #define INTERESTING_FUNC32 hasInterestingStates32 #define INNER_DEAD_FUNC32 dummyFunc @@ -554,7 +554,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef INTERESTING_FUNC32 #undef INNER_DEAD_FUNC32 @@ -574,7 +574,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_4_sam #define INTERESTING_FUNC32 hasInterestingStates32 #define INNER_DEAD_FUNC32 dummyFunc @@ -597,7 +597,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef INTERESTING_FUNC32 #undef INNER_DEAD_FUNC32 @@ -623,7 +623,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC isAccelState #define ACCEPT_FUNC dummyFunc -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_4_nmda #define INTERESTING_FUNC32 dummyFunc4 #define INNER_DEAD_FUNC32 dummyFunc @@ -642,7 +642,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef 
ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef INTERESTING_FUNC32 #undef INNER_DEAD_FUNC32 @@ -662,7 +662,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC dummyFunc -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_4_nmd #define INTERESTING_FUNC32 dummyFunc4 #define INNER_DEAD_FUNC32 dummyFunc @@ -685,7 +685,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef INTERESTING_FUNC32 #undef INNER_DEAD_FUNC32 @@ -712,7 +712,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC dummyFunc -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_4_nm #define INTERESTING_FUNC32 dummyFunc4 #define INNER_DEAD_FUNC32 dummyFunc @@ -735,7 +735,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef INTERESTING_FUNC32 #undef INNER_DEAD_FUNC32 diff --git a/src/nfa/sheng_impl.h b/src/nfa/sheng_impl.h index 1fa5c831..9634fa65 100644 --- a/src/nfa/sheng_impl.h +++ b/src/nfa/sheng_impl.h @@ -96,6 +96,133 @@ char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s, return MO_CONTINUE_MATCHING; } +#if defined(HAVE_SVE) + +static really_inline +char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt, + const struct sheng32 *s, + u8 *const cached_accept_state, + ReportID *const cached_accept_id, + u8 single, u64a base_offset, const u8 *buf, const u8 *start, + const u8 *end, const u8 **scan_end) { + DEBUG_PRINTF("Starting 
DFA execution in state %u\n", + *state & SHENG32_STATE_MASK); + const u8 *cur_buf = start; + if (DEAD_FUNC32(*state)) { + DEBUG_PRINTF("Dead on arrival\n"); + *scan_end = end; + return MO_CONTINUE_MATCHING; + } + DEBUG_PRINTF("Scanning %lli bytes\n", (s64a)(end - start)); + + const svbool_t lane_pred_32 = svwhilelt_b8(0, 32); + svuint8_t cur_state = svdup_u8(*state); /* broadcast the one-byte state to every lane; state points at a single u8, so a 32-lane vector load from it would overread */ + const m512 *masks = s->succ_masks; + + while (likely(cur_buf != end)) { + const u8 c = *cur_buf; + svuint8_t succ_mask = svld1(lane_pred_32, (const u8*)(masks + c)); + cur_state = svtbl(succ_mask, svand_n_u8_x(svptrue_b8(), cur_state, SHENG32_STATE_MASK)); /* svtbl(data, indices): the successor table is the data, the state id with its flag bits cleared is the index */ + const u8 tmp = svlastb(lane_pred_32, cur_state); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c, ourisprint(c) ? c : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", tmp & SHENG32_STATE_MASK, + tmp & SHENG32_STATE_FLAG_MASK); + + if (unlikely(ACCEPT_FUNC32(tmp))) { + DEBUG_PRINTF("Accept state %u reached\n", tmp & SHENG32_STATE_MASK); + u64a match_offset = base_offset + (cur_buf - buf) + 1; + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (u64a)(cur_buf - start)); + *state = tmp; + *scan_end = cur_buf; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports32(s, cb, ctxt, tmp, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + cur_buf++; + } + *state = svlastb(lane_pred_32, cur_state); + *scan_end = cur_buf; + return MO_CONTINUE_MATCHING; +} + +static really_inline +char SHENG64_IMPL(u8 *state, NfaCallback cb, void *ctxt, + const struct sheng64 *s, + u8 *const cached_accept_state, + ReportID *const cached_accept_id, + u8 single, u64a base_offset, const u8 *buf, const u8 *start, + const u8 *end, const u8 **scan_end) { + DEBUG_PRINTF("Starting DFA execution in state %u\n", + *state & SHENG64_STATE_MASK); + const u8 *cur_buf = 
start; + if (DEAD_FUNC64(*state)) { + DEBUG_PRINTF("Dead on arrival\n"); + *scan_end = end; + return MO_CONTINUE_MATCHING; + } + DEBUG_PRINTF("Scanning %lli bytes\n", (s64a)(end - start)); + + const svbool_t lane_pred_64 = svwhilelt_b8(0, 64); + svuint8_t cur_state = svdup_u8(*state); /* broadcast the one-byte state to every lane; state points at a single u8, so a 64-lane vector load from it would overread */ + const m512 *masks = s->succ_masks; + + while (likely(cur_buf != end)) { + const u8 c = *cur_buf; + svuint8_t succ_mask = svld1(lane_pred_64, (const u8*)(masks + c)); + cur_state = svtbl(succ_mask, svand_n_u8_x(svptrue_b8(), cur_state, SHENG64_STATE_MASK)); /* svtbl(data, indices): the successor table is the data, the state id with its flag bits cleared is the index */ + const u8 tmp = svlastb(lane_pred_64, cur_state); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c, ourisprint(c) ? c : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", tmp & SHENG64_STATE_MASK, + tmp & SHENG64_STATE_FLAG_MASK); + + if (unlikely(ACCEPT_FUNC64(tmp))) { + DEBUG_PRINTF("Accept state %u reached\n", tmp & SHENG64_STATE_MASK); + u64a match_offset = base_offset + (cur_buf - buf) + 1; + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (u64a)(cur_buf - start)); + *state = tmp; + *scan_end = cur_buf; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports64(s, cb, ctxt, tmp, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + cur_buf++; + } + *state = svlastb(lane_pred_64, cur_state); + *scan_end = cur_buf; + return MO_CONTINUE_MATCHING; +} +#endif + #if defined(HAVE_AVX512VBMI) static really_inline char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt, diff --git a/src/nfa/sheng_impl4.h b/src/nfa/sheng_impl4.h index e5d3468f..10ad4ea0 100644 --- a/src/nfa/sheng_impl4.h +++ b/src/nfa/sheng_impl4.h @@ -283,6 +283,434 @@ char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s, return MO_CONTINUE_MATCHING; } +#if defined(HAVE_SVE) +static really_inline +char SHENG32_IMPL(u8 *state, 
NfaCallback cb, void *ctxt, + const struct sheng32 *s, + u8 *const cached_accept_state, + ReportID *const cached_accept_id, + u8 single, u64a base_offset, const u8 *buf, const u8 *start, + const u8 *end, const u8 **scan_end) { + DEBUG_PRINTF("Starting DFAx4 execution in state %u\n", + *state & SHENG32_STATE_MASK); + const u8 *cur_buf = start; + const u8 *min_accel_dist = start; + base_offset++; + DEBUG_PRINTF("Scanning %llu bytes\n", (u64a)(end - start)); + + if (INNER_ACCEL_FUNC32(*state) || OUTER_ACCEL_FUNC32(*state)) { + DEBUG_PRINTF("Accel state reached @ 0\n"); + const union AccelAux *aaux = + get_accel32(s, *state & SHENG32_STATE_MASK); + const u8 *new_offset = run_accel(aaux, cur_buf, end); + if (new_offset < cur_buf + BAD_ACCEL_DIST) { + min_accel_dist = new_offset + BIG_ACCEL_PENALTY; + } else { + min_accel_dist = new_offset + SMALL_ACCEL_PENALTY; + } + DEBUG_PRINTF("Next accel chance: %llu\n", + (u64a)(min_accel_dist - start)); + DEBUG_PRINTF("Accel scanned %zu bytes\n", new_offset - cur_buf); + cur_buf = new_offset; + DEBUG_PRINTF("New offset: %lli\n", (s64a)(cur_buf - start)); + } + if (INNER_DEAD_FUNC32(*state) || OUTER_DEAD_FUNC32(*state)) { + DEBUG_PRINTF("Dead on arrival\n"); + *scan_end = end; + return MO_CONTINUE_MATCHING; + } + + const svbool_t lane_pred_32 = svwhilelt_b8(0, 32); + svuint8_t cur_state = svdup_u8(*state); /* broadcast the one-byte state to every lane; state points at a single u8, so a 32-lane vector load from it would overread */ + const m512 *masks = s->succ_masks; + + while (likely(end - cur_buf >= 4)) { + const u8 *b1 = cur_buf; + const u8 *b2 = cur_buf + 1; + const u8 *b3 = cur_buf + 2; + const u8 *b4 = cur_buf + 3; + const u8 c1 = *b1; + const u8 c2 = *b2; + const u8 c3 = *b3; + const u8 c4 = *b4; + svuint8_t succ_mask1 = svld1(lane_pred_32, (const u8*)(masks+c1)); + cur_state = svtbl(succ_mask1, svand_n_u8_x(svptrue_b8(), cur_state, SHENG32_STATE_MASK)); /* svtbl(data, indices): the successor table is the data, the state id with its flag bits cleared is the index */ + const u8 a1 = svlastb(lane_pred_32, cur_state); + + svuint8_t succ_mask2 = svld1(lane_pred_32, (const u8*)(masks+c2)); + cur_state = svtbl(succ_mask2, svand_n_u8_x(svptrue_b8(), cur_state, SHENG32_STATE_MASK)); + const u8 a2 = svlastb(lane_pred_32, cur_state); + + svuint8_t 
succ_mask3 = svld1(lane_pred_32, (const u8*)(masks+c3)); + cur_state = svtbl(succ_mask3, svand_n_u8_x(svptrue_b8(), cur_state, SHENG32_STATE_MASK)); + const u8 a3 = svlastb(lane_pred_32, cur_state); + + svuint8_t succ_mask4 = svld1(lane_pred_32, (const u8*)(masks+c4)); + cur_state = svtbl(succ_mask4, svand_n_u8_x(svptrue_b8(), cur_state, SHENG32_STATE_MASK)); + const u8 a4 = svlastb(lane_pred_32, cur_state); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c1, ourisprint(c1) ? c1 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a1 & SHENG32_STATE_MASK, + a1 & SHENG32_STATE_FLAG_MASK); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c2, ourisprint(c2) ? c2 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a2 & SHENG32_STATE_MASK, + a2 & SHENG32_STATE_FLAG_MASK); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c3, ourisprint(c3) ? c3 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a3 & SHENG32_STATE_MASK, + a3 & SHENG32_STATE_FLAG_MASK); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c4, ourisprint(c4) ? c4 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a4 & SHENG32_STATE_MASK, + a4 & SHENG32_STATE_FLAG_MASK); + + if (unlikely(INTERESTING_FUNC32(a1, a2, a3, a4))) { + if (ACCEPT_FUNC32(a1)) { + u64a match_offset = base_offset + b1 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a1 & SHENG32_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b1 - start)); + *scan_end = b1; + *state = a1; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports32(s, cb, ctxt, a1, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC32(a2)) { + u64a match_offset = base_offset + b2 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a2 & SHENG32_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b2 - start)); + *scan_end = b2; + *state = a2; + 
return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports32(s, cb, ctxt, a2, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC32(a3)) { + u64a match_offset = base_offset + b3 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a3 & SHENG32_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b3 - start)); + *scan_end = b3; + *state = a3; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports32(s, cb, ctxt, a3, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC32(a4)) { + u64a match_offset = base_offset + b4 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a4 & SHENG32_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b4 - start)); + *scan_end = b4; + *state = a4; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports32(s, cb, ctxt, a4, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (INNER_DEAD_FUNC32(a4)) { + DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(b4 - buf)); + *scan_end = end; + *state = a4; + return MO_CONTINUE_MATCHING; + } + if (cur_buf > min_accel_dist && INNER_ACCEL_FUNC32(a4)) { + DEBUG_PRINTF("Accel state reached @ %lli\n", (s64a)(b4 - buf)); + const union AccelAux *aaux = + get_accel32(s, a4 & SHENG32_STATE_MASK); + 
const u8 *new_offset = run_accel(aaux, cur_buf + 4, end); + if (new_offset < cur_buf + 4 + BAD_ACCEL_DIST) { + min_accel_dist = new_offset + BIG_ACCEL_PENALTY; + } else { + min_accel_dist = new_offset + SMALL_ACCEL_PENALTY; + } + DEBUG_PRINTF("Next accel chance: %llu\n", + (u64a)(min_accel_dist - start)); + DEBUG_PRINTF("Accel scanned %llu bytes\n", + (u64a)(new_offset - cur_buf - 4)); + cur_buf = new_offset; + DEBUG_PRINTF("New offset: %llu\n", (u64a)(cur_buf - buf)); + continue; + } + } + if (OUTER_DEAD_FUNC32(a4)) { + DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(cur_buf - buf)); + *scan_end = end; + *state = a4; + return MO_CONTINUE_MATCHING; + }; + if (cur_buf > min_accel_dist && OUTER_ACCEL_FUNC32(a4)) { + DEBUG_PRINTF("Accel state reached @ %lli\n", (s64a)(b4 - buf)); + const union AccelAux *aaux = + get_accel32(s, a4 & SHENG32_STATE_MASK); + const u8 *new_offset = run_accel(aaux, cur_buf + 4, end); + if (new_offset < cur_buf + 4 + BAD_ACCEL_DIST) { + min_accel_dist = new_offset + BIG_ACCEL_PENALTY; + } else { + min_accel_dist = new_offset + SMALL_ACCEL_PENALTY; + } + DEBUG_PRINTF("Next accel chance: %llu\n", + (u64a)(min_accel_dist - start)); + DEBUG_PRINTF("Accel scanned %llu bytes\n", + (u64a)(new_offset - cur_buf - 4)); + cur_buf = new_offset; + DEBUG_PRINTF("New offset: %llu\n", (u64a)(cur_buf - buf)); + continue; + }; + cur_buf += 4; + } + *state = svlastb(lane_pred_32, cur_state); + *scan_end = cur_buf; + return MO_CONTINUE_MATCHING; +} + +#if !defined(NO_SHENG64_IMPL) +static really_inline +char SHENG64_IMPL(u8 *state, NfaCallback cb, void *ctxt, + const struct sheng64 *s, + u8 *const cached_accept_state, + ReportID *const cached_accept_id, + u8 single, u64a base_offset, const u8 *buf, const u8 *start, + const u8 *end, const u8 **scan_end) { + DEBUG_PRINTF("Starting DFAx4 execution in state %u\n", + *state & SHENG64_STATE_MASK); + const u8 *cur_buf = start; + base_offset++; + DEBUG_PRINTF("Scanning %llu bytes\n", (u64a)(end - start)); + + if 
(INNER_DEAD_FUNC64(*state) || OUTER_DEAD_FUNC64(*state)) { + DEBUG_PRINTF("Dead on arrival\n"); + *scan_end = end; + return MO_CONTINUE_MATCHING; + } + + const svbool_t lane_pred_64 = svwhilelt_b8(0, 64); + svuint8_t cur_state = svdup_u8(*state); /* broadcast the one-byte state to every lane; state points at a single u8, so a 64-lane vector load from it would overread */ + const m512 *masks = s->succ_masks; + + while (likely(end - cur_buf >= 4)) { + const u8 *b1 = cur_buf; + const u8 *b2 = cur_buf + 1; + const u8 *b3 = cur_buf + 2; + const u8 *b4 = cur_buf + 3; + const u8 c1 = *b1; + const u8 c2 = *b2; + const u8 c3 = *b3; + const u8 c4 = *b4; + + svuint8_t succ_mask1 = svld1(lane_pred_64, (const u8*)(masks+c1)); + cur_state = svtbl(succ_mask1, svand_n_u8_x(svptrue_b8(), cur_state, SHENG64_STATE_MASK)); /* svtbl(data, indices): the successor table is the data, the state id with its flag bits cleared is the index */ + const u8 a1 = svlastb(lane_pred_64, cur_state); + + svuint8_t succ_mask2 = svld1(lane_pred_64, (const u8*)(masks+c2)); + cur_state = svtbl(succ_mask2, svand_n_u8_x(svptrue_b8(), cur_state, SHENG64_STATE_MASK)); + const u8 a2 = svlastb(lane_pred_64, cur_state); + + svuint8_t succ_mask3 = svld1(lane_pred_64, (const u8*)(masks+c3)); + cur_state = svtbl(succ_mask3, svand_n_u8_x(svptrue_b8(), cur_state, SHENG64_STATE_MASK)); + const u8 a3 = svlastb(lane_pred_64, cur_state); + + svuint8_t succ_mask4 = svld1(lane_pred_64, (const u8*)(masks+c4)); + cur_state = svtbl(succ_mask4, svand_n_u8_x(svptrue_b8(), cur_state, SHENG64_STATE_MASK)); + const u8 a4 = svlastb(lane_pred_64, cur_state); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c1, ourisprint(c1) ? c1 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a1 & SHENG64_STATE_MASK, + a1 & SHENG64_STATE_FLAG_MASK); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c2, ourisprint(c2) ? c2 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a2 & SHENG64_STATE_MASK, + a2 & SHENG64_STATE_FLAG_MASK); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c3, ourisprint(c3) ? c3 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a3 & SHENG64_STATE_MASK, + a3 & SHENG64_STATE_FLAG_MASK); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c4, ourisprint(c4) ? 
c4 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a4 & SHENG64_STATE_MASK, + a4 & SHENG64_STATE_FLAG_MASK); + + if (unlikely(INTERESTING_FUNC64(a1, a2, a3, a4))) { + if (ACCEPT_FUNC64(a1)) { + u64a match_offset = base_offset + b1 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a1 & SHENG64_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b1 - start)); + *scan_end = b1; + *state = a1; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports64(s, cb, ctxt, a1, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC64(a2)) { + u64a match_offset = base_offset + b2 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a2 & SHENG64_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b2 - start)); + *scan_end = b2; + *state = a2; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports64(s, cb, ctxt, a2, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC64(a3)) { + u64a match_offset = base_offset + b3 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a3 & SHENG64_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b3 - start)); + *scan_end = b3; + *state = a3; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports64(s, cb, ctxt, a3, 
match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC64(a4)) { + u64a match_offset = base_offset + b4 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a4 & SHENG64_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b4 - start)); + *scan_end = b4; + *state = a4; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports64(s, cb, ctxt, a4, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (INNER_DEAD_FUNC64(a4)) { + DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(b4 - buf)); + *scan_end = end; + *state = a4; + return MO_CONTINUE_MATCHING; + } + } + if (OUTER_DEAD_FUNC64(a4)) { + DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(cur_buf - buf)); + *scan_end = end; + *state = a4; + return MO_CONTINUE_MATCHING; + } + cur_buf += 4; + } + *state = svlastb(lane_pred_64, cur_state); + *scan_end = cur_buf; + return MO_CONTINUE_MATCHING; +} +#endif +#endif + #if defined(HAVE_AVX512VBMI) static really_inline char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt, diff --git a/src/nfa/shengcompile.cpp b/src/nfa/shengcompile.cpp index 055e1971..0f93e139 100644 --- a/src/nfa/shengcompile.cpp +++ b/src/nfa/shengcompile.cpp @@ -730,10 +730,17 @@ bytecode_ptr sheng32Compile(raw_dfa &raw, const CompileContext &cc, return nullptr; } +#ifdef HAVE_SVE + if (svcntb()<32) { + DEBUG_PRINTF("Sheng32 failed, SVE width is too small!\n"); + return nullptr; + } +#else if (!cc.target_info.has_avx512vbmi()) { DEBUG_PRINTF("Sheng32 failed, no HS_CPU_FEATURES_AVX512VBMI!\n"); return nullptr; } +#endif sheng_build_strat strat(raw, rm, only_accel_init); dfa_info info(strat); @@ -762,10 +769,17 @@ bytecode_ptr 
sheng64Compile(raw_dfa &raw, const CompileContext &cc, return nullptr; } +#ifdef HAVE_SVE + if (svcntb()<64) { + DEBUG_PRINTF("Sheng64 failed, SVE width is too small!\n"); + return nullptr; + } +#else if (!cc.target_info.has_avx512vbmi()) { DEBUG_PRINTF("Sheng64 failed, no HS_CPU_FEATURES_AVX512VBMI!\n"); return nullptr; } +#endif sheng_build_strat strat(raw, rm, only_accel_init); dfa_info info(strat);