diff --git a/src/nfa/sheng.c b/src/nfa/sheng.c index 3f36e218..922e8f80 100644 --- a/src/nfa/sheng.c +++ b/src/nfa/sheng.c @@ -154,7 +154,7 @@ char fireReports(const struct sheng *sh, NfaCallback cb, void *ctxt, return MO_CONTINUE_MATCHING; /* continue execution */ } -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) // Sheng32 static really_inline const struct sheng32 *get_sheng32(const struct NFA *n) { @@ -351,7 +351,7 @@ char fireReports64(const struct sheng64 *sh, NfaCallback cb, void *ctxt, } return MO_CONTINUE_MATCHING; /* continue execution */ } -#endif // end of HAVE_AVX512VBMI +#endif // end of HAVE_AVX512VBMI || HAVE_SVE /* include Sheng function definitions */ #include "sheng_defs.h" @@ -871,7 +871,7 @@ char nfaExecSheng_expandState(UNUSED const struct NFA *nfa, void *dest, return 0; } -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) // Sheng32 static really_inline char runSheng32Cb(const struct sheng32 *sh, NfaCallback cb, void *ctxt, @@ -1874,4 +1874,4 @@ char nfaExecSheng64_expandState(UNUSED const struct NFA *nfa, void *dest, *(u8 *)dest = *(const u8 *)src; return 0; } -#endif // end of HAVE_AVX512VBMI +#endif // end of HAVE_AVX512VBMI || HAVE_SVE diff --git a/src/nfa/sheng.h b/src/nfa/sheng.h index 7b90e303..212bd3a4 100644 --- a/src/nfa/sheng.h +++ b/src/nfa/sheng.h @@ -58,7 +58,7 @@ char nfaExecSheng_reportCurrent(const struct NFA *n, struct mq *q); char nfaExecSheng_B(const struct NFA *n, u64a offset, const u8 *buffer, size_t length, NfaCallback cb, void *context); -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define nfaExecSheng32_B_Reverse NFA_API_NO_IMPL #define nfaExecSheng32_zombie_status NFA_API_ZOMBIE_NO_IMPL @@ -106,8 +106,7 @@ char nfaExecSheng64_reportCurrent(const struct NFA *n, struct mq *q); char nfaExecSheng64_B(const struct NFA *n, u64a offset, const u8 *buffer, size_t length, NfaCallback cb, void *context); - -#else // 
!HAVE_AVX512VBMI +#else // !HAVE_AVX512VBMI && !HAVE_SVE #define nfaExecSheng32_B_Reverse NFA_API_NO_IMPL #define nfaExecSheng32_zombie_status NFA_API_ZOMBIE_NO_IMPL @@ -138,6 +137,7 @@ char nfaExecSheng64_B(const struct NFA *n, u64a offset, const u8 *buffer, #define nfaExecSheng64_testEOD NFA_API_NO_IMPL #define nfaExecSheng64_reportCurrent NFA_API_NO_IMPL #define nfaExecSheng64_B NFA_API_NO_IMPL -#endif // end of HAVE_AVX512VBMI +#endif // end of HAVE_AVX512VBMI || HAVE_SVE + #endif /* SHENG_H_ */ diff --git a/src/nfa/sheng_defs.h b/src/nfa/sheng_defs.h index 390af752..886af28e 100644 --- a/src/nfa/sheng_defs.h +++ b/src/nfa/sheng_defs.h @@ -52,7 +52,7 @@ u8 hasInterestingStates(const u8 a, const u8 b, const u8 c, const u8 d) { return (a | b | c | d) & (SHENG_STATE_FLAG_MASK); } -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) static really_inline u8 isDeadState32(const u8 a) { return a & SHENG32_STATE_DEAD; @@ -108,7 +108,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define SHENG_IMPL sheng_cod #define DEAD_FUNC isDeadState #define ACCEPT_FUNC isAcceptState -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_cod #define DEAD_FUNC32 isDeadState32 #define ACCEPT_FUNC32 isAcceptState32 @@ -121,7 +121,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef SHENG_IMPL #undef DEAD_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef DEAD_FUNC32 #undef ACCEPT_FUNC32 @@ -135,7 +135,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define SHENG_IMPL sheng_co #define DEAD_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_co #define DEAD_FUNC32 dummyFunc #define ACCEPT_FUNC32 isAcceptState32 @@ -148,7 +148,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef SHENG_IMPL #undef DEAD_FUNC #undef ACCEPT_FUNC -#if 
defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef DEAD_FUNC32 #undef ACCEPT_FUNC32 @@ -162,7 +162,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define SHENG_IMPL sheng_samd #define DEAD_FUNC isDeadState #define ACCEPT_FUNC isAcceptState -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_samd #define DEAD_FUNC32 isDeadState32 #define ACCEPT_FUNC32 isAcceptState32 @@ -175,7 +175,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef SHENG_IMPL #undef DEAD_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef DEAD_FUNC32 #undef ACCEPT_FUNC32 @@ -189,7 +189,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define SHENG_IMPL sheng_sam #define DEAD_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_sam #define DEAD_FUNC32 dummyFunc #define ACCEPT_FUNC32 isAcceptState32 @@ -202,7 +202,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef SHENG_IMPL #undef DEAD_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef DEAD_FUNC32 #undef ACCEPT_FUNC32 @@ -216,7 +216,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define SHENG_IMPL sheng_nmd #define DEAD_FUNC isDeadState #define ACCEPT_FUNC dummyFunc -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_nmd #define DEAD_FUNC32 isDeadState32 #define ACCEPT_FUNC32 dummyFunc @@ -229,7 +229,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef SHENG_IMPL #undef DEAD_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef DEAD_FUNC32 #undef ACCEPT_FUNC32 @@ -243,7 +243,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define SHENG_IMPL sheng_nm #define DEAD_FUNC dummyFunc #define ACCEPT_FUNC 
dummyFunc -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_nm #define DEAD_FUNC32 dummyFunc #define ACCEPT_FUNC32 dummyFunc @@ -256,7 +256,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef SHENG_IMPL #undef DEAD_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef DEAD_FUNC32 #undef ACCEPT_FUNC32 @@ -277,7 +277,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC isAccelState #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_4_coda #define INTERESTING_FUNC32 hasInterestingStates32 #define INNER_DEAD_FUNC32 isDeadState32 @@ -296,7 +296,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef INTERESTING_FUNC32 #undef INNER_DEAD_FUNC32 @@ -316,7 +316,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_4_cod #define INTERESTING_FUNC32 hasInterestingStates32 #define INNER_DEAD_FUNC32 isDeadState32 @@ -339,7 +339,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef INTERESTING_FUNC32 #undef INNER_DEAD_FUNC32 @@ -363,7 +363,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC isAccelState #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_4_coa #define 
INTERESTING_FUNC32 hasInterestingStates32 #define INNER_DEAD_FUNC32 dummyFunc @@ -382,7 +382,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef INTERESTING_FUNC32 #undef INNER_DEAD_FUNC32 @@ -402,7 +402,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_4_co #define INTERESTING_FUNC32 hasInterestingStates32 #define INNER_DEAD_FUNC32 dummyFunc @@ -425,7 +425,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef INTERESTING_FUNC32 #undef INNER_DEAD_FUNC32 @@ -449,7 +449,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC isAccelState #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_4_samda #define INTERESTING_FUNC32 hasInterestingStates32 #define INNER_DEAD_FUNC32 isDeadState32 @@ -468,7 +468,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef INTERESTING_FUNC32 #undef INNER_DEAD_FUNC32 @@ -488,7 +488,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_4_samd #define INTERESTING_FUNC32 hasInterestingStates32 #define INNER_DEAD_FUNC32 isDeadState32 @@ -511,7 
+511,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef INTERESTING_FUNC32 #undef INNER_DEAD_FUNC32 @@ -535,7 +535,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC isAccelState #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_4_sama #define INTERESTING_FUNC32 hasInterestingStates32 #define INNER_DEAD_FUNC32 dummyFunc @@ -554,7 +554,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef INTERESTING_FUNC32 #undef INNER_DEAD_FUNC32 @@ -574,7 +574,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_4_sam #define INTERESTING_FUNC32 hasInterestingStates32 #define INNER_DEAD_FUNC32 dummyFunc @@ -597,7 +597,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef INTERESTING_FUNC32 #undef INNER_DEAD_FUNC32 @@ -623,7 +623,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC isAccelState #define ACCEPT_FUNC dummyFunc -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_4_nmda #define INTERESTING_FUNC32 dummyFunc4 #define INNER_DEAD_FUNC32 dummyFunc @@ -642,7 +642,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef 
ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef INTERESTING_FUNC32 #undef INNER_DEAD_FUNC32 @@ -662,7 +662,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC dummyFunc -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_4_nmd #define INTERESTING_FUNC32 dummyFunc4 #define INNER_DEAD_FUNC32 dummyFunc @@ -685,7 +685,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef INTERESTING_FUNC32 #undef INNER_DEAD_FUNC32 @@ -712,7 +712,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC dummyFunc -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_4_nm #define INTERESTING_FUNC32 dummyFunc4 #define INNER_DEAD_FUNC32 dummyFunc @@ -735,7 +735,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef INTERESTING_FUNC32 #undef INNER_DEAD_FUNC32 diff --git a/src/nfa/sheng_impl.h b/src/nfa/sheng_impl.h index 1fa5c831..2c701446 100644 --- a/src/nfa/sheng_impl.h +++ b/src/nfa/sheng_impl.h @@ -96,7 +96,7 @@ char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s, return MO_CONTINUE_MATCHING; } -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) static really_inline char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng32 *s, @@ -114,14 +114,28 @@ char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt, } DEBUG_PRINTF("Scanning %lli bytes\n", (s64a)(end - start)); +#if 
defined(HAVE_SVE) + const svbool_t lane_pred_32 = svwhilelt_b8(0, 32); + svuint8_t cur_state = svdup_u8(*state); + svuint8_t tbl_mask = svdup_u8((unsigned char)0x1F); + const m512 *masks = s->succ_masks; +#else m512 cur_state = set1_64x8(*state); const m512 *masks = s->succ_masks; +#endif while (likely(cur_buf != end)) { const u8 c = *cur_buf; + +#if defined(HAVE_SVE) + svuint8_t succ_mask = svld1(lane_pred_32, (const u8*)(masks + c)); + cur_state = svtbl(succ_mask, svand_x(svptrue_b8(), tbl_mask, cur_state)); + const u8 tmp = svlastb(lane_pred_32, cur_state); +#else const m512 succ_mask = masks[c]; cur_state = vpermb512(cur_state, succ_mask); const u8 tmp = movd512(cur_state); +#endif DEBUG_PRINTF("c: %02hhx '%c'\n", c, ourisprint(c) ? c : '?'); DEBUG_PRINTF("s: %u (flag: %u)\n", tmp & SHENG32_STATE_MASK, @@ -153,7 +167,11 @@ char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt, } cur_buf++; } +#if defined(HAVE_SVE) + *state = svlastb(lane_pred_32, cur_state); +#else *state = movd512(cur_state); +#endif *scan_end = cur_buf; return MO_CONTINUE_MATCHING; } @@ -175,14 +193,28 @@ char SHENG64_IMPL(u8 *state, NfaCallback cb, void *ctxt, } DEBUG_PRINTF("Scanning %lli bytes\n", (s64a)(end - start)); +#if defined(HAVE_SVE) + const svbool_t lane_pred_64 = svwhilelt_b8(0, 64); + svuint8_t cur_state = svdup_u8(*state); + svuint8_t tbl_mask = svdup_u8((unsigned char)0x3F); + const m512 *masks = s->succ_masks; +#else m512 cur_state = set1_64x8(*state); const m512 *masks = s->succ_masks; +#endif while (likely(cur_buf != end)) { const u8 c = *cur_buf; + +#if defined(HAVE_SVE) + svuint8_t succ_mask = svld1(lane_pred_64, (const u8*)(masks + c)); + cur_state = svtbl(succ_mask, svand_x(svptrue_b8(), tbl_mask, cur_state)); + const u8 tmp = svlastb(lane_pred_64, cur_state); +#else const m512 succ_mask = masks[c]; cur_state = vpermb512(cur_state, succ_mask); const u8 tmp = movd512(cur_state); +#endif DEBUG_PRINTF("c: %02hhx '%c'\n", c, ourisprint(c) ? 
c : '?'); DEBUG_PRINTF("s: %u (flag: %u)\n", tmp & SHENG64_STATE_MASK, @@ -214,7 +246,11 @@ char SHENG64_IMPL(u8 *state, NfaCallback cb, void *ctxt, } cur_buf++; } +#if defined(HAVE_SVE) + *state = svlastb(lane_pred_64, cur_state); +#else *state = movd512(cur_state); +#endif *scan_end = cur_buf; return MO_CONTINUE_MATCHING; } diff --git a/src/nfa/sheng_impl4.h b/src/nfa/sheng_impl4.h index e5d3468f..718c3409 100644 --- a/src/nfa/sheng_impl4.h +++ b/src/nfa/sheng_impl4.h @@ -283,7 +283,7 @@ char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s, return MO_CONTINUE_MATCHING; } -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) static really_inline char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng32 *s, @@ -320,8 +320,15 @@ char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt, return MO_CONTINUE_MATCHING; } +#if defined(HAVE_SVE) + const svbool_t lane_pred_32 = svwhilelt_b8(0, 32); + svuint8_t cur_state = svdup_u8(*state); + svuint8_t tbl_mask = svdup_u8((unsigned char)0x1F); + const m512 *masks = s->succ_masks; +#else m512 cur_state = set1_64x8(*state); const m512 *masks = s->succ_masks; +#endif while (likely(end - cur_buf >= 4)) { const u8 *b1 = cur_buf; @@ -333,6 +340,23 @@ char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt, const u8 c3 = *b3; const u8 c4 = *b4; +#if defined(HAVE_SVE) + svuint8_t succ_mask1 = svld1(lane_pred_32, (const u8*)(masks+c1)); + cur_state = svtbl(succ_mask1, svand_x(svptrue_b8(), tbl_mask, cur_state)); + const u8 a1 = svlastb(lane_pred_32, cur_state); + + svuint8_t succ_mask2 = svld1(lane_pred_32, (const u8*)(masks+c2)); + cur_state = svtbl(succ_mask2, svand_x(svptrue_b8(), tbl_mask, cur_state)); + const u8 a2 = svlastb(lane_pred_32, cur_state); + + svuint8_t succ_mask3 = svld1(lane_pred_32, (const u8*)(masks+c3)); + cur_state = svtbl(succ_mask3, svand_x(svptrue_b8(), tbl_mask, cur_state)); + const u8 a3 = svlastb(lane_pred_32, cur_state); + + svuint8_t 
succ_mask4 = svld1(lane_pred_32, (const u8*)(masks+c4)); + cur_state = svtbl(succ_mask4, svand_x(svptrue_b8(), tbl_mask, cur_state)); + const u8 a4 = svlastb(lane_pred_32, cur_state); +#else const m512 succ_mask1 = masks[c1]; cur_state = vpermb512(cur_state, succ_mask1); const u8 a1 = movd512(cur_state); @@ -348,6 +372,7 @@ char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt, const m512 succ_mask4 = masks[c4]; cur_state = vpermb512(cur_state, succ_mask4); const u8 a4 = movd512(cur_state); +#endif DEBUG_PRINTF("c: %02hhx '%c'\n", c1, ourisprint(c1) ? c1 : '?'); DEBUG_PRINTF("s: %u (flag: %u)\n", a1 & SHENG32_STATE_MASK, @@ -517,7 +542,11 @@ char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt, }; cur_buf += 4; } +#if defined(HAVE_SVE) + *state = svlastb(lane_pred_32, cur_state); +#else *state = movd512(cur_state); +#endif *scan_end = cur_buf; return MO_CONTINUE_MATCHING; } @@ -541,9 +570,15 @@ char SHENG64_IMPL(u8 *state, NfaCallback cb, void *ctxt, *scan_end = end; return MO_CONTINUE_MATCHING; } - +#if defined(HAVE_SVE) + const svbool_t lane_pred_64 = svwhilelt_b8(0, 64); + svuint8_t cur_state = svdup_u8(*state); + svuint8_t tbl_mask = svdup_u8((unsigned char)0x3F); + const m512 *masks = s->succ_masks; +#else m512 cur_state = set1_64x8(*state); const m512 *masks = s->succ_masks; +#endif while (likely(end - cur_buf >= 4)) { const u8 *b1 = cur_buf; @@ -555,6 +590,23 @@ char SHENG64_IMPL(u8 *state, NfaCallback cb, void *ctxt, const u8 c3 = *b3; const u8 c4 = *b4; +#if defined(HAVE_SVE) + svuint8_t succ_mask1 = svld1(lane_pred_64, (const u8*)(masks+c1)); + cur_state = svtbl(succ_mask1, svand_x(svptrue_b8(), tbl_mask, cur_state)); + const u8 a1 = svlastb(lane_pred_64, cur_state); + + svuint8_t succ_mask2 = svld1(lane_pred_64, (const u8*)(masks+c2)); + cur_state = svtbl(succ_mask2, svand_x(svptrue_b8(), tbl_mask, cur_state)); + const u8 a2 = svlastb(lane_pred_64, cur_state); + + svuint8_t succ_mask3 = svld1(lane_pred_64, (const u8*)(masks+c3)); + cur_state = 
svtbl(succ_mask3, svand_x(svptrue_b8(), tbl_mask, cur_state)); + const u8 a3 = svlastb(lane_pred_64, cur_state); + + svuint8_t succ_mask4 = svld1(lane_pred_64, (const u8*)(masks+c4)); + cur_state = svtbl(succ_mask4, svand_x(svptrue_b8(), tbl_mask, cur_state)); + const u8 a4 = svlastb(lane_pred_64, cur_state); +#else const m512 succ_mask1 = masks[c1]; cur_state = vpermb512(cur_state, succ_mask1); const u8 a1 = movd512(cur_state); @@ -570,6 +622,7 @@ char SHENG64_IMPL(u8 *state, NfaCallback cb, void *ctxt, const m512 succ_mask4 = masks[c4]; cur_state = vpermb512(cur_state, succ_mask4); const u8 a4 = movd512(cur_state); +#endif DEBUG_PRINTF("c: %02hhx '%c'\n", c1, ourisprint(c1) ? c1 : '?'); DEBUG_PRINTF("s: %u (flag: %u)\n", a1 & SHENG64_STATE_MASK, @@ -703,7 +756,11 @@ char SHENG64_IMPL(u8 *state, NfaCallback cb, void *ctxt, } cur_buf += 4; } +#if defined(HAVE_SVE) + *state = svlastb(lane_pred_64, cur_state); +#else *state = movd512(cur_state); +#endif *scan_end = cur_buf; return MO_CONTINUE_MATCHING; } diff --git a/src/nfa/shengcompile.cpp b/src/nfa/shengcompile.cpp index 055e1971..0f93e139 100644 --- a/src/nfa/shengcompile.cpp +++ b/src/nfa/shengcompile.cpp @@ -730,10 +730,17 @@ bytecode_ptr sheng32Compile(raw_dfa &raw, const CompileContext &cc, return nullptr; } +#ifdef HAVE_SVE + if (svcntb()<32) { + DEBUG_PRINTF("Sheng32 failed, SVE width is too small!\n"); + return nullptr; + } +#else if (!cc.target_info.has_avx512vbmi()) { DEBUG_PRINTF("Sheng32 failed, no HS_CPU_FEATURES_AVX512VBMI!\n"); return nullptr; } +#endif sheng_build_strat strat(raw, rm, only_accel_init); dfa_info info(strat); @@ -762,10 +769,17 @@ bytecode_ptr sheng64Compile(raw_dfa &raw, const CompileContext &cc, return nullptr; } +#ifdef HAVE_SVE + if (svcntb()<64) { + DEBUG_PRINTF("Sheng64 failed, SVE width is too small!\n"); + return nullptr; + } +#else if (!cc.target_info.has_avx512vbmi()) { DEBUG_PRINTF("Sheng64 failed, no HS_CPU_FEATURES_AVX512VBMI!\n"); return nullptr; } +#endif 
sheng_build_strat strat(raw, rm, only_accel_init); dfa_info info(strat); diff --git a/unit/CMakeLists.txt b/unit/CMakeLists.txt index f5577d40..e2196459 100644 --- a/unit/CMakeLists.txt +++ b/unit/CMakeLists.txt @@ -102,6 +102,7 @@ set(unit_internal_SOURCES internal/rvermicelli.cpp internal/simd_utils.cpp internal/supervector.cpp + internal/sheng.cpp internal/shuffle.cpp internal/shufti.cpp internal/state_compress.cpp diff --git a/unit/internal/sheng.cpp b/unit/internal/sheng.cpp new file mode 100644 index 00000000..e8e45ac5 --- /dev/null +++ b/unit/internal/sheng.cpp @@ -0,0 +1,709 @@ +/* + * Copyright (c) 2024, Arm ltd + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" + +#include "gtest/gtest.h" +#include "nfa/shengcompile.h" +#include "nfa/rdfa.h" +#include "util/bytecode_ptr.h" +#include "util/compile_context.h" +#include "util/report_manager.h" + +extern "C" { + #include "hs_compile.h" + #include "nfa/nfa_api.h" + #include "nfa/nfa_api_queue.h" + #include "nfa/nfa_api_util.h" + #include "nfa/nfa_internal.h" + #include "nfa/rdfa.h" + #include "nfa/sheng.h" + #include "ue2common.h" +} + +namespace { + +struct callback_context { + unsigned int period; + unsigned int match_count; + unsigned int pattern_length; +}; + +int dummy_callback(u64a start, u64a end, ReportID id, void *context) { + (void) context; + printf("callback %llu %llu %u\n", start, end, id); + return 1; /* 0 stops matching, !0 continue */ +} + +int periodic_pattern_callback(u64a start, u64a end, ReportID id, void *raw_context) { + struct callback_context *context = (struct callback_context*) raw_context; + (void) start; + (void) id; + EXPECT_EQ(context->period * context->match_count, end - context->pattern_length); + context->match_count++; + return 1; /* 0 stops matching, !0 continue */ +} + +/** + * @brief Fill the state matrix with a diagonal pattern: accept the Nth character to go to the N+1 state + */ +static void fill_straight_regex_sequence(struct ue2::raw_dfa *dfa, int start_state, int end_state, int state_count) +{ + for (int state = start_state; state < end_state; state++) { + 
dfa->states[state].next.assign(state_count ,1); + dfa->states[state].next[0] = 2; + dfa->states[state].next[1] = 2; + dfa->states[state].next[state] = state+1; + } +} + +static void init_raw_dfa16(struct ue2::raw_dfa *dfa, const ReportID rID) +{ + dfa->start_anchored = 1; + dfa->start_floating = 1; + dfa->alpha_size = 8; + + int nb_state = 8; + for(int i = 0; i < nb_state; i++) { + struct ue2::dstate state(dfa->alpha_size); + state.next = std::vector(nb_state); + state.daddy = 0; + state.impl_id = i; /* id of the state */ + state.reports = ue2::flat_set(); + state.reports_eod = ue2::flat_set(); + dfa->states.push_back(state); + } + + /* add a report to every accept state */ + dfa->states[7].reports.insert(rID); + + /** + * [a,b][c-e]{3}of + * (1) -a,b-> (2) -c,d,e-> (3) -c,d,e-> (4) -c,d,e-> (5) -o-> (6) -f-> ((7)) + * (0) = dead + */ + + for(int i = 0; i < ue2::ALPHABET_SIZE; i++) { + dfa->alpha_remap[i] = 0; + } + + dfa->alpha_remap['a'] = 0; + dfa->alpha_remap['b'] = 1; + dfa->alpha_remap['c'] = 2; + dfa->alpha_remap['d'] = 3; + dfa->alpha_remap['e'] = 4; + dfa->alpha_remap['o'] = 5; + dfa->alpha_remap['f'] = 6; + dfa->alpha_remap[256] = 7; /* for some reason there's a check that run on dfa->alpha_size-1 */ + + /* a b c d e o f */ + dfa->states[0].next = {0,0,0,0,0,0,0}; + dfa->states[1].next = {2,2,1,1,1,1,1}; /* nothing */ + dfa->states[2].next = {2,2,3,3,3,1,1}; /* [a,b] */ + dfa->states[3].next = {2,2,4,4,4,1,1}; /* [a,b][c-e]{1} */ + dfa->states[4].next = {2,2,5,5,5,1,1}; /* [a,b][c-e]{2} */ + fill_straight_regex_sequence(dfa, 5, 7, 7); /* [a,b][c-e]{3}o */ + dfa->states[7].next = {2,2,1,1,1,1,1}; /* [a,b][c-e]{3}of */ +} + +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) +/* We need more than 16 states to run sheng32, so make the graph longer */ +static void init_raw_dfa32(struct ue2::raw_dfa *dfa, const ReportID rID) +{ + dfa->start_anchored = 1; + dfa->start_floating = 1; + dfa->alpha_size = 18; + + int nb_state = 18; + for(int i = 0; i < nb_state; 
i++) { + struct ue2::dstate state(dfa->alpha_size); + state.next = std::vector(nb_state); + state.daddy = 0; + state.impl_id = i; /* id of the state */ + state.reports = ue2::flat_set(); + state.reports_eod = ue2::flat_set(); + dfa->states.push_back(state); + } + + /* add a report to every accept state */ + dfa->states[17].reports.insert(rID); + + /** + * [a,b][c-e]{3}of0123456789 + * (1) -a,b-> (2) -c,d,e-> (3) -c,d,e-> (4) -c,d,e-> (5) -o-> (6) -f-> (7) --> ((17)) + * (0) = dead + */ + + for(int i = 0; i < ue2::ALPHABET_SIZE; i++) { + dfa->alpha_remap[i] = 0; + } + + dfa->alpha_remap['a'] = 0; + dfa->alpha_remap['b'] = 1; + dfa->alpha_remap['c'] = 2; + dfa->alpha_remap['d'] = 3; + dfa->alpha_remap['e'] = 4; + dfa->alpha_remap['o'] = 5; + dfa->alpha_remap['f'] = 6; + // maps 0 to 9 + for (int i = 0; i < 10; i ++) { + dfa->alpha_remap[i + '0'] = i + 7; + } + dfa->alpha_remap[256] = 17; /* for some reason there's a check that run on dfa->alpha_size-1 */ + + /* a b c d e o f 0 1 2 3 4 5 6 7 8 9 */ + dfa->states[0].next = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + dfa->states[1].next = {2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}; /* nothing */ + dfa->states[2].next = {2,2,3,3,3,1,1,1,1,1,1,1,1,1,1,1,1}; /* [a,b] */ + dfa->states[3].next = {2,2,4,4,4,1,1,1,1,1,1,1,1,1,1,1,1}; /* [a,b][c-e]{1} */ + dfa->states[4].next = {2,2,5,5,5,1,1,1,1,1,1,1,1,1,1,1,1}; /* [a,b][c-e]{2} */ + fill_straight_regex_sequence(dfa, 5, 17, 17); /* [a,b][c-e]{3}of012345678 */ + dfa->states[17].next = {2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}; /* [a,b][c-e]{3}of0123456789 */ +} +#endif /* defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) */ + +typedef ue2::bytecode_ptr (*sheng_compile_ptr)(ue2::raw_dfa&, + const ue2::CompileContext&, + const ue2::ReportManager&, + bool, + std::set*); + +typedef void (*init_raw_dfa_ptr)(struct ue2::raw_dfa*, const ReportID); + + +static inline void init_nfa(struct NFA **out_nfa, sheng_compile_ptr compile_function, init_raw_dfa_ptr init_dfa_function) { + ue2::Grey *g = new 
ue2::Grey(); + hs_platform_info plat_info = {0, 0, 0, 0}; + ue2::CompileContext *cc = new ue2::CompileContext(false, false, ue2::target_t(plat_info), *g); + ue2::ReportManager *rm = new ue2::ReportManager(*g); + ue2::Report *report = new ue2::Report(ue2::EXTERNAL_CALLBACK, 0); + ReportID rID = rm->getInternalId(*report); + rm->setProgramOffset(0, 0); + + struct ue2::raw_dfa *dfa = new ue2::raw_dfa(ue2::NFA_OUTFIX); + init_dfa_function(dfa, rID); + + *out_nfa = (compile_function(*dfa, *cc, *rm, false, nullptr)).release(); + ASSERT_NE(nullptr, *out_nfa); + + delete report; + delete rm; + delete cc; + delete g; +} + +static void init_nfa16(struct NFA **out_nfa) { + init_nfa(out_nfa, ue2::shengCompile, init_raw_dfa16); +} + +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) +static void init_nfa32(struct NFA **out_nfa) { + init_nfa(out_nfa, ue2::sheng32Compile, init_raw_dfa32); +} +#endif /* defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) */ + +static char state_buffer; + +static inline void init_sheng_queue(struct mq **out_q, uint8_t *buffer, size_t max_size, void (*init_nfa_func)(struct NFA **out_nfa) ) { + struct NFA* nfa; + init_nfa_func(&nfa); + assert(nfa); + + struct mq *q = new mq(); + + memset(q, 0, sizeof(struct mq)); + q->nfa = nfa; + q->state = &state_buffer; + q->cb = dummy_callback; + q->buffer = buffer; + q->length = max_size; /* setting this as the max length scanable */ + + if (nfa != q->nfa) { + printf("Something went wrong while initializing sheng.\n"); + } + nfaQueueInitState(nfa, q); + pushQueueAt(q, 0, MQE_START, 0); + pushQueueAt(q, 1, MQE_END, q->length ); + + *out_q = q; +} + +static void init_sheng_queue16(struct mq **out_q, uint8_t *buffer ,size_t max_size) { + init_sheng_queue(out_q, buffer, max_size, init_nfa16); +} + +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) +static void init_sheng_queue32(struct mq **out_q, uint8_t *buffer, size_t max_size) { + init_sheng_queue(out_q, buffer, max_size, init_nfa32); +} +#endif /* 
defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) */ + +static +void fill_pattern(u8* buf, size_t buffer_size, unsigned int start_offset, unsigned int period, const char *pattern, unsigned int pattern_length) { + memset(buf, '_', buffer_size); + + for (unsigned int i = 0; i < buffer_size - 8; i+= 8) { + /* filling with some junk, including some character used for a valid state, to prevent the use of shufti */ + memcpy(buf + i, "jgohcxbf", 8); + } + + for (unsigned int i = start_offset; i < buffer_size - pattern_length; i += period) { + memcpy(buf + i, pattern, pattern_length); + } +} + +/* Generate ground truth to compare to */ +struct NFA *get_expected_nfa_header(u8 type, unsigned int length, unsigned int nposition) { + struct NFA *expected_nfa_header = new struct NFA(); + memset(expected_nfa_header, 0, sizeof(struct NFA)); + expected_nfa_header->length = length; + expected_nfa_header->type = type; + expected_nfa_header->nPositions = nposition; + expected_nfa_header->scratchStateSize = 1; + expected_nfa_header->streamStateSize = 1; + return expected_nfa_header; +} + +struct NFA *get_expected_nfa16_header() { + return get_expected_nfa_header(SHENG_NFA, 4736, 8); +} + +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) +struct NFA *get_expected_nfa32_header() { + return get_expected_nfa_header(SHENG_NFA_32, 17216, 18); +} +#endif /* defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) */ + +void test_nfa_equal(const NFA& l, const NFA& r) +{ + EXPECT_EQ(l.flags, r.flags); + EXPECT_EQ(l.length, r.length); + EXPECT_EQ(l.type, r.type); + EXPECT_EQ(l.rAccelType, r.rAccelType); + EXPECT_EQ(l.rAccelOffset, r.rAccelOffset); + EXPECT_EQ(l.maxBiAnchoredWidth, r.maxBiAnchoredWidth); + EXPECT_EQ(l.rAccelData.dc, r.rAccelData.dc); + EXPECT_EQ(l.queueIndex, r.queueIndex); + EXPECT_EQ(l.nPositions, r.nPositions); + EXPECT_EQ(l.scratchStateSize, r.scratchStateSize); + EXPECT_EQ(l.streamStateSize, r.streamStateSize); + EXPECT_EQ(l.maxWidth, r.maxWidth); + EXPECT_EQ(l.minWidth, r.minWidth); + 
EXPECT_EQ(l.maxOffset, r.maxOffset); +} + +/* Start of actual tests */ + +/* + * Runs shengCompile and compares its outputs to previously recorded outputs. + */ +TEST(Sheng16, std_compile_header) { + + ue2::Grey *g = new ue2::Grey(); + hs_platform_info plat_info = {0, 0, 0, 0}; + ue2::CompileContext *cc = new ue2::CompileContext(false, false, ue2::target_t(plat_info), *g); + ue2::ReportManager *rm = new ue2::ReportManager(*g); + ue2::Report *report = new ue2::Report(ue2::EXTERNAL_CALLBACK, 0); + ReportID rID = rm->getInternalId(*report); + rm->setProgramOffset(0, 0); + + struct ue2::raw_dfa *dfa = new ue2::raw_dfa(ue2::NFA_OUTFIX); + init_raw_dfa16(dfa, rID); + + struct NFA *nfa = (shengCompile(*dfa, *cc, *rm, false)).release(); + EXPECT_NE(nullptr, nfa); + + EXPECT_NE(0, nfa->length); + EXPECT_EQ(SHENG_NFA, nfa->type); + + struct NFA *expected_nfa = get_expected_nfa16_header(); + test_nfa_equal(*expected_nfa, *nfa); + + delete expected_nfa; + delete report; + delete rm; + delete cc; + delete g; +} + +/* + * nfaExecSheng_B is the most basic of the sheng variants. It simply calls the core of the algorithm. + * We test it with a buffer having a few matches at fixed intervals and check that it finds them all. 
+ */ +TEST(Sheng16, std_run_B) { + struct mq *q; + unsigned int pattern_length = 6; + unsigned int period = 128; + const size_t buf_size = 200; + unsigned int expected_matches = buf_size/128 + 1; + u8 buf[buf_size]; + struct callback_context context = {period, 0, pattern_length}; + + struct NFA* nfa; + init_nfa16(&nfa); + ASSERT_NE(nullptr, nfa); + fill_pattern(buf, buf_size, 0, period, "acecof", pattern_length); + char ret_val; + unsigned int offset = 0; + unsigned int loop_count = 0; + for (; loop_count < expected_matches + 1; loop_count++) { + ASSERT_LT(offset, buf_size); + ret_val = nfaExecSheng_B(nfa, + offset, + buf + offset, + (s64a) buf_size - offset, + periodic_pattern_callback, + &context); + offset = (context.match_count - 1) * context.period + context.pattern_length; + if(unlikely(ret_val != MO_ALIVE)) { + break; + } + } + + /*check normal return*/ + EXPECT_EQ(MO_ALIVE, ret_val); + + /*check that we don't find additional match nor crash when no match are found*/ + EXPECT_EQ(expected_matches + 1, loop_count); + + /*check that we have all the matches*/ + EXPECT_EQ(expected_matches, context.match_count); +} + +/* + * nfaExecSheng_Q runs like the _B version (callback), but exercises the message queue logic. + * We test it with a buffer having a few matches at fixed intervals and check that it finds them all. 
+ */ +TEST(Sheng16, std_run_Q) { + struct mq *q; + unsigned int pattern_length = 6; + unsigned int period = 128; + const size_t buf_size = 200; + unsigned int expected_matches = buf_size/128 + 1; + u8 buf[buf_size]; + struct callback_context context = {period, 0, pattern_length}; + + init_sheng_queue16(&q, buf, buf_size); + fill_pattern(buf, buf_size, 0, period, "acecof", pattern_length); + q->cur = 0; + q->items[q->cur].location = 0; + q->context = &context; + q->cb = periodic_pattern_callback; + + nfaExecSheng_Q(q->nfa, q, (s64a) buf_size); + /*check that we have all the matches*/ + EXPECT_EQ(expected_matches, context.match_count); + + delete q; +} + +/* + * nfaExecSheng_Q2 uses the message queue, but stops at match instead of using a callback. + * We test it with a buffer having a few matches at fixed intervals and check that it finds them all. + */ +TEST(Sheng16, std_run_Q2) { + struct mq *q; + unsigned int pattern_length = 6; + unsigned int period = 128; + const size_t buf_size = 200; + unsigned int expected_matches = buf_size/128 + 1; + u8 buf[buf_size]; + + init_sheng_queue16(&q, buf, buf_size); + fill_pattern(buf, buf_size, 0, period, "acecof", pattern_length); + q->cur = 0; + q->items[q->cur].location = 0; + + char ret_val; + int location; + unsigned int loop_count = 0; + do { + ret_val = nfaExecSheng_Q2(q->nfa, q, (s64a) buf_size); + location = q->items[q->cur].location; + loop_count++; + } while(likely((ret_val == MO_MATCHES_PENDING) && (location < (int)buf_size) && ((location % period) == pattern_length))); + + /*check if it's a spurious match*/ + EXPECT_EQ(0, (ret_val == MO_MATCHES_PENDING) && ((location % period) != pattern_length)); + + /*check that we have all the matches*/ + EXPECT_EQ(expected_matches, loop_count-1); + + delete q; +} + +/* + * The message queue can also run on the "history" buffer. We test it the same way as the normal + * buffer, expecting the same behavior. 
+ * We test it with a buffer having a few matches at fixed intervals and check that it finds them all. + */ +TEST(Sheng16, history_run_Q2) { + struct mq *q; + unsigned int pattern_length = 6; + unsigned int period = 128; + const size_t buf_size = 200; + unsigned int expected_matches = buf_size/128 + 1; + u8 buf[buf_size]; + + init_sheng_queue16(&q, buf, buf_size); + fill_pattern(buf, buf_size, 0, period, "acecof", pattern_length); + q->history = buf; + q->hlength = buf_size; + q->cur = 0; + q->items[q->cur].location = -200; + + char ret_val; + int location; + unsigned int loop_count = 0; + do { + ret_val = nfaExecSheng_Q2(q->nfa, q, 0); + location = q->items[q->cur].location; + loop_count++; + } while(likely((ret_val == MO_MATCHES_PENDING) && (location > -(int)buf_size) && (location < 0) && (((buf_size + location) % period) == pattern_length))); + + /*check if it's a spurious match*/ + EXPECT_EQ(0, (ret_val == MO_MATCHES_PENDING) && (((buf_size + location) % period) != pattern_length)); + + /*check that we have all the matches*/ + EXPECT_EQ(expected_matches, loop_count-1); + + delete q; +} + +/** + * Those tests only covers the basic paths. More tests can cover: + * - running for history buffer to current buffer in Q2 + * - running while expecting no match + * - nfaExecSheng_QR + * - run sheng when it should call an accelerator and confirm it call them + */ + +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) + +/* + * Runs sheng32Compile and compares its outputs to previously recorded outputs. 
+ */ +TEST(Sheng32, std_compile_header) { +#if defined(HAVE_SVE) + if(svcntb()<32) { + return; + } +#endif + ue2::Grey *g = new ue2::Grey(); + hs_platform_info plat_info = {0, 0, 0, 0}; + ue2::CompileContext *cc = new ue2::CompileContext(false, false, ue2::target_t(plat_info), *g); + ue2::ReportManager *rm = new ue2::ReportManager(*g); + ue2::Report *report = new ue2::Report(ue2::EXTERNAL_CALLBACK, 0); + ReportID rID = rm->getInternalId(*report); + rm->setProgramOffset(0, 0); + + struct ue2::raw_dfa *dfa = new ue2::raw_dfa(ue2::NFA_OUTFIX); + init_raw_dfa32(dfa, rID); + + struct NFA *nfa = (sheng32Compile(*dfa, *cc, *rm, false)).release(); + EXPECT_NE(nullptr, nfa); + + EXPECT_NE(0, nfa->length); + EXPECT_EQ(SHENG_NFA_32, nfa->type); + + struct NFA *expected_nfa = get_expected_nfa32_header(); + test_nfa_equal(*expected_nfa, *nfa); + + delete expected_nfa; + delete report; + delete rm; + delete cc; + delete g; +} + +/* + * nfaExecSheng32_B is the most basic of the sheng variants. It simply calls the core of the algorithm. + * We test it with a buffer having a few matches at fixed intervals and check that it finds them all. 
+ */ +TEST(Sheng32, std_run_B) { +#if defined(HAVE_SVE) + if(svcntb()<32) { + return; + } +#endif + struct mq *q; + unsigned int pattern_length = 16; + unsigned int period = 128; + const size_t buf_size = 200; + unsigned int expected_matches = buf_size/128 + 1; + u8 buf[buf_size]; + struct callback_context context = {period, 0, pattern_length}; + + struct NFA* nfa; + init_nfa32(&nfa); + ASSERT_NE(nullptr, nfa); + fill_pattern(buf, buf_size, 0, period, "acecof0123456789", pattern_length); + char ret_val; + unsigned int offset = 0; + unsigned int loop_count = 0; + for (; loop_count < expected_matches + 1; loop_count++) { + ASSERT_LT(offset, buf_size); + ret_val = nfaExecSheng32_B(nfa, + offset, + buf + offset, + (s64a) buf_size - offset, + periodic_pattern_callback, + &context); + offset = (context.match_count - 1) * context.period + context.pattern_length; + if(unlikely(ret_val != MO_ALIVE)) { + break; + } + } + + /*check normal return*/ + EXPECT_EQ(MO_ALIVE, ret_val); + + /*check that we don't find additional match nor crash when no match are found*/ + EXPECT_EQ(expected_matches + 1, loop_count); + + /*check that we have all the matches*/ + EXPECT_EQ(expected_matches, context.match_count); +} + +/* + * nfaExecSheng32_Q runs like the _B version (callback), but exercises the message queue logic. + * We test it with a buffer having a few matches at fixed intervals and check that it finds them all. 
+ */ +TEST(Sheng32, std_run_Q) { +#if defined(HAVE_SVE) + if(svcntb()<32) { + return; + } +#endif + struct mq *q; + unsigned int pattern_length = 16; + unsigned int period = 128; + const size_t buf_size = 200; + unsigned int expected_matches = buf_size/128 + 1; + u8 buf[buf_size]; + struct callback_context context = {period, 0, pattern_length}; + + init_sheng_queue32(&q, buf, buf_size); + fill_pattern(buf, buf_size, 0, period, "acecof0123456789", pattern_length); + q->cur = 0; + q->items[q->cur].location = 0; + q->context = &context; + q->cb = periodic_pattern_callback; + + nfaExecSheng32_Q(q->nfa, q, (s64a) buf_size); + /*check that we have all the matches*/ + EXPECT_EQ(expected_matches, context.match_count); + + delete q; +} + +/* + * nfaExecSheng32_Q2 uses the message queue, but stops at match instead of using a callback. + * We test it with a buffer having a few matches at fixed intervals and check that it finds them all. + */ +TEST(Sheng32, std_run_Q2) { +#if defined(HAVE_SVE) + if(svcntb()<32) { + return; + } +#endif + struct mq *q; + unsigned int pattern_length = 16; + unsigned int period = 128; + const size_t buf_size = 200; + unsigned int expected_matches = buf_size/128 + 1; + u8 buf[buf_size]; + + init_sheng_queue32(&q, buf, buf_size); + fill_pattern(buf, buf_size, 0, period, "acecof0123456789", pattern_length); + q->cur = 0; + q->items[q->cur].location = 0; + + char ret_val; + int location; + unsigned int loop_count = 0; + do { + ret_val = nfaExecSheng32_Q2(q->nfa, q, (s64a) buf_size); + location = q->items[q->cur].location; + loop_count++; + } while(likely((ret_val == MO_MATCHES_PENDING) && (location < (int)buf_size) && ((location % period) == pattern_length))); + + /*check if it's a spurious match*/ + EXPECT_EQ(0, (ret_val == MO_MATCHES_PENDING) && ((location % period) != pattern_length)); + + /*check that we have all the matches*/ + EXPECT_EQ(expected_matches, loop_count-1); + + delete q; +} + +/* + * The message queue can also runs on the "history" 
buffer. We test it the same way as the normal + * buffer, expecting the same behavior. + * We test it with a buffer having a few matches at fixed intervals and check that it finds them all. + */ +TEST(Sheng32, history_run_Q2) { +#if defined(HAVE_SVE) + if(svcntb()<32) { + return; + } +#endif + struct mq *q; + unsigned int pattern_length = 16; + unsigned int period = 128; + const size_t buf_size = 200; + unsigned int expected_matches = buf_size/128 + 1; + u8 buf[buf_size]; + + init_sheng_queue32(&q, buf, buf_size); + fill_pattern(buf, buf_size, 0, period, "acecof0123456789", pattern_length); + q->history = buf; + q->hlength = buf_size; + q->cur = 0; + q->items[q->cur].location = -200; + + char ret_val; + int location; + unsigned int loop_count = 0; + do { + ret_val = nfaExecSheng32_Q2(q->nfa, q, 0); + location = q->items[q->cur].location; + loop_count++; + } while(likely((ret_val == MO_MATCHES_PENDING) && (location > -(int)buf_size) && (location < 0) && (((buf_size + location) % period) == pattern_length))); + + /*check if it's a spurious match*/ + EXPECT_EQ(0, (ret_val == MO_MATCHES_PENDING) && (((buf_size + location) % period) != pattern_length)); + + /*check that we have all the matches*/ + EXPECT_EQ(expected_matches, loop_count-1); + + delete q; +} +#endif /* defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) */ + +} /* namespace */ \ No newline at end of file