From 49fd4f0047d2d04d6ea6edf2b1406fb1685e1c31 Mon Sep 17 00:00:00 2001 From: Yoan Picchi Date: Thu, 4 Apr 2024 09:46:23 +0000 Subject: [PATCH] Enable sheng32 and sheng64 on Arm Signed-off-by: Yoan Picchi --- src/nfa/sheng.c | 8 ++--- src/nfa/sheng.h | 8 ++--- src/nfa/sheng_defs.h | 70 ++++++++++++++++++++-------------------- src/nfa/sheng_impl.h | 38 +++++++++++++++++++++- src/nfa/sheng_impl4.h | 61 ++++++++++++++++++++++++++++++++-- src/nfa/shengcompile.cpp | 14 ++++++++ 6 files changed, 153 insertions(+), 46 deletions(-) diff --git a/src/nfa/sheng.c b/src/nfa/sheng.c index 3f36e218..922e8f80 100644 --- a/src/nfa/sheng.c +++ b/src/nfa/sheng.c @@ -154,7 +154,7 @@ char fireReports(const struct sheng *sh, NfaCallback cb, void *ctxt, return MO_CONTINUE_MATCHING; /* continue execution */ } -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) // Sheng32 static really_inline const struct sheng32 *get_sheng32(const struct NFA *n) { @@ -351,7 +351,7 @@ char fireReports64(const struct sheng64 *sh, NfaCallback cb, void *ctxt, } return MO_CONTINUE_MATCHING; /* continue execution */ } -#endif // end of HAVE_AVX512VBMI +#endif // end of HAVE_AVX512VBMI || HAVE_SVE /* include Sheng function definitions */ #include "sheng_defs.h" @@ -871,7 +871,7 @@ char nfaExecSheng_expandState(UNUSED const struct NFA *nfa, void *dest, return 0; } -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) // Sheng32 static really_inline char runSheng32Cb(const struct sheng32 *sh, NfaCallback cb, void *ctxt, @@ -1874,4 +1874,4 @@ char nfaExecSheng64_expandState(UNUSED const struct NFA *nfa, void *dest, *(u8 *)dest = *(const u8 *)src; return 0; } -#endif // end of HAVE_AVX512VBMI +#endif // end of HAVE_AVX512VBMI || HAVE_SVE diff --git a/src/nfa/sheng.h b/src/nfa/sheng.h index 7b90e303..212bd3a4 100644 --- a/src/nfa/sheng.h +++ b/src/nfa/sheng.h @@ -58,7 +58,7 @@ char nfaExecSheng_reportCurrent(const struct NFA *n, struct mq *q); char nfaExecSheng_B(const struct NFA *n, u64a offset, const u8 *buffer, size_t length, NfaCallback cb, void *context); -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define nfaExecSheng32_B_Reverse NFA_API_NO_IMPL #define nfaExecSheng32_zombie_status NFA_API_ZOMBIE_NO_IMPL @@ -106,8 +106,7 @@ char nfaExecSheng64_reportCurrent(const struct NFA *n, struct mq *q); char nfaExecSheng64_B(const struct NFA *n, u64a offset, const u8 *buffer, size_t length, NfaCallback cb, void *context); - -#else // !HAVE_AVX512VBMI +#else // !HAVE_AVX512VBMI && !HAVE_SVE #define nfaExecSheng32_B_Reverse NFA_API_NO_IMPL #define nfaExecSheng32_zombie_status NFA_API_ZOMBIE_NO_IMPL @@ -138,6 +137,7 @@ char nfaExecSheng64_B(const struct NFA *n, u64a offset, const u8 *buffer, #define nfaExecSheng64_testEOD NFA_API_NO_IMPL #define nfaExecSheng64_reportCurrent NFA_API_NO_IMPL #define nfaExecSheng64_B NFA_API_NO_IMPL -#endif // end of HAVE_AVX512VBMI +#endif // end of HAVE_AVX512VBMI || defined(HAVE_SVE) + #endif /* SHENG_H_ */ diff --git a/src/nfa/sheng_defs.h b/src/nfa/sheng_defs.h index 390af752..886af28e 100644 --- a/src/nfa/sheng_defs.h +++ b/src/nfa/sheng_defs.h @@ -52,7 +52,7 @@ u8 hasInterestingStates(const u8 a, const u8 b, const u8 c, const u8 d) { return (a | b | c | d) & (SHENG_STATE_FLAG_MASK); } -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) static really_inline u8 isDeadState32(const u8 a) { return a & SHENG32_STATE_DEAD; @@ -108,7 +108,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define SHENG_IMPL 
sheng_cod #define DEAD_FUNC isDeadState #define ACCEPT_FUNC isAcceptState -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_cod #define DEAD_FUNC32 isDeadState32 #define ACCEPT_FUNC32 isAcceptState32 @@ -121,7 +121,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef SHENG_IMPL #undef DEAD_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef DEAD_FUNC32 #undef ACCEPT_FUNC32 @@ -135,7 +135,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define SHENG_IMPL sheng_co #define DEAD_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_co #define DEAD_FUNC32 dummyFunc #define ACCEPT_FUNC32 isAcceptState32 @@ -148,7 +148,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef SHENG_IMPL #undef DEAD_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef DEAD_FUNC32 #undef ACCEPT_FUNC32 @@ -162,7 +162,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define SHENG_IMPL sheng_samd #define DEAD_FUNC isDeadState #define ACCEPT_FUNC isAcceptState -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_samd #define DEAD_FUNC32 isDeadState32 #define ACCEPT_FUNC32 isAcceptState32 @@ -175,7 +175,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef SHENG_IMPL #undef DEAD_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef DEAD_FUNC32 #undef ACCEPT_FUNC32 @@ -189,7 +189,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define SHENG_IMPL sheng_sam #define DEAD_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_sam #define DEAD_FUNC32 dummyFunc #define ACCEPT_FUNC32 isAcceptState32 @@ -202,7 +202,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef SHENG_IMPL #undef DEAD_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef DEAD_FUNC32 #undef ACCEPT_FUNC32 @@ -216,7 +216,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define SHENG_IMPL sheng_nmd #define DEAD_FUNC isDeadState #define ACCEPT_FUNC dummyFunc -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_nmd #define DEAD_FUNC32 isDeadState32 #define ACCEPT_FUNC32 dummyFunc @@ -229,7 +229,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef SHENG_IMPL #undef DEAD_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef DEAD_FUNC32 #undef ACCEPT_FUNC32 @@ -243,7 +243,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define SHENG_IMPL sheng_nm #define DEAD_FUNC dummyFunc #define ACCEPT_FUNC dummyFunc -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_nm #define DEAD_FUNC32 dummyFunc #define ACCEPT_FUNC32 dummyFunc @@ -256,7 +256,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef SHENG_IMPL #undef DEAD_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef DEAD_FUNC32 #undef ACCEPT_FUNC32 @@ -277,7 +277,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC isAccelState #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState -#if 
defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_4_coda #define INTERESTING_FUNC32 hasInterestingStates32 #define INNER_DEAD_FUNC32 isDeadState32 @@ -296,7 +296,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef INTERESTING_FUNC32 #undef INNER_DEAD_FUNC32 @@ -316,7 +316,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_4_cod #define INTERESTING_FUNC32 hasInterestingStates32 #define INNER_DEAD_FUNC32 isDeadState32 @@ -339,7 +339,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef INTERESTING_FUNC32 #undef INNER_DEAD_FUNC32 @@ -363,7 +363,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC isAccelState #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_4_coa #define INTERESTING_FUNC32 hasInterestingStates32 #define INNER_DEAD_FUNC32 dummyFunc @@ -382,7 +382,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef INTERESTING_FUNC32 #undef INNER_DEAD_FUNC32 @@ -402,7 +402,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_4_co #define INTERESTING_FUNC32 hasInterestingStates32 #define INNER_DEAD_FUNC32 dummyFunc @@ -425,7 +425,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef INTERESTING_FUNC32 #undef INNER_DEAD_FUNC32 @@ -449,7 +449,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC isAccelState #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_4_samda #define INTERESTING_FUNC32 hasInterestingStates32 #define INNER_DEAD_FUNC32 isDeadState32 @@ -468,7 +468,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef INTERESTING_FUNC32 #undef INNER_DEAD_FUNC32 @@ -488,7 +488,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_4_samd #define INTERESTING_FUNC32 hasInterestingStates32 #define INNER_DEAD_FUNC32 isDeadState32 @@ -511,7 +511,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if 
defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef INTERESTING_FUNC32 #undef INNER_DEAD_FUNC32 @@ -535,7 +535,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC isAccelState #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_4_sama #define INTERESTING_FUNC32 hasInterestingStates32 #define INNER_DEAD_FUNC32 dummyFunc @@ -554,7 +554,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef INTERESTING_FUNC32 #undef INNER_DEAD_FUNC32 @@ -574,7 +574,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC isAcceptState -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_4_sam #define INTERESTING_FUNC32 hasInterestingStates32 #define INNER_DEAD_FUNC32 dummyFunc @@ -597,7 +597,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef INTERESTING_FUNC32 #undef INNER_DEAD_FUNC32 @@ -623,7 +623,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC isAccelState #define ACCEPT_FUNC dummyFunc -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_4_nmda #define INTERESTING_FUNC32 dummyFunc4 #define INNER_DEAD_FUNC32 dummyFunc @@ -642,7 +642,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef INTERESTING_FUNC32 #undef INNER_DEAD_FUNC32 @@ -662,7 +662,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC dummyFunc -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_4_nmd #define INTERESTING_FUNC32 dummyFunc4 #define INNER_DEAD_FUNC32 dummyFunc @@ -685,7 +685,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef INTERESTING_FUNC32 #undef INNER_DEAD_FUNC32 @@ -712,7 +712,7 @@ u8 dummyFunc(UNUSED const u8 a) { #define INNER_ACCEL_FUNC dummyFunc #define OUTER_ACCEL_FUNC dummyFunc #define ACCEPT_FUNC dummyFunc -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #define SHENG32_IMPL sheng32_4_nm #define INTERESTING_FUNC32 dummyFunc4 #define INNER_DEAD_FUNC32 dummyFunc @@ -735,7 +735,7 @@ u8 dummyFunc(UNUSED const u8 a) { #undef INNER_ACCEL_FUNC #undef OUTER_ACCEL_FUNC #undef ACCEPT_FUNC -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) #undef SHENG32_IMPL #undef INTERESTING_FUNC32 #undef INNER_DEAD_FUNC32 diff --git a/src/nfa/sheng_impl.h b/src/nfa/sheng_impl.h index 1fa5c831..2c701446 100644 --- a/src/nfa/sheng_impl.h +++ b/src/nfa/sheng_impl.h @@ -96,7 +96,7 @@ char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s, return MO_CONTINUE_MATCHING; } -#if defined(HAVE_AVX512VBMI) +#if 
defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) static really_inline char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng32 *s, @@ -114,14 +114,28 @@ char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt, } DEBUG_PRINTF("Scanning %lli bytes\n", (s64a)(end - start)); +#if defined(HAVE_SVE) + const svbool_t lane_pred_32 = svwhilelt_b8(0, 32); + svuint8_t cur_state = svdup_u8(*state); + svuint8_t tbl_mask = svdup_u8((unsigned char)0x1F); + const m512 *masks = s->succ_masks; +#else m512 cur_state = set1_64x8(*state); const m512 *masks = s->succ_masks; +#endif while (likely(cur_buf != end)) { const u8 c = *cur_buf; + +#if defined(HAVE_SVE) + svuint8_t succ_mask = svld1(lane_pred_32, (const u8*)(masks + c)); + cur_state = svtbl(succ_mask, svand_x(svptrue_b8(), tbl_mask, cur_state)); + const u8 tmp = svlastb(lane_pred_32, cur_state); +#else const m512 succ_mask = masks[c]; cur_state = vpermb512(cur_state, succ_mask); const u8 tmp = movd512(cur_state); +#endif DEBUG_PRINTF("c: %02hhx '%c'\n", c, ourisprint(c) ? c : '?'); DEBUG_PRINTF("s: %u (flag: %u)\n", tmp & SHENG32_STATE_MASK, @@ -153,7 +167,11 @@ char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt, } cur_buf++; } +#if defined(HAVE_SVE) + *state = svlastb(lane_pred_32, cur_state); +#else *state = movd512(cur_state); +#endif *scan_end = cur_buf; return MO_CONTINUE_MATCHING; } @@ -175,14 +193,28 @@ char SHENG64_IMPL(u8 *state, NfaCallback cb, void *ctxt, } DEBUG_PRINTF("Scanning %lli bytes\n", (s64a)(end - start)); +#if defined(HAVE_SVE) + const svbool_t lane_pred_64 = svwhilelt_b8(0, 64); + svuint8_t cur_state = svdup_u8(*state); + svuint8_t tbl_mask = svdup_u8((unsigned char)0x3F); + const m512 *masks = s->succ_masks; +#else m512 cur_state = set1_64x8(*state); const m512 *masks = s->succ_masks; +#endif while (likely(cur_buf != end)) { const u8 c = *cur_buf; + +#if defined(HAVE_SVE) + svuint8_t succ_mask = svld1(lane_pred_64, (const u8*)(masks + c)); + cur_state = svtbl(succ_mask, svand_x(svptrue_b8(), tbl_mask, cur_state)); + const u8 tmp = svlastb(lane_pred_64, cur_state); +#else const m512 succ_mask = masks[c]; cur_state = vpermb512(cur_state, succ_mask); const u8 tmp = movd512(cur_state); +#endif DEBUG_PRINTF("c: %02hhx '%c'\n", c, ourisprint(c) ? 
c : '?'); DEBUG_PRINTF("s: %u (flag: %u)\n", tmp & SHENG64_STATE_MASK, @@ -214,7 +246,11 @@ char SHENG64_IMPL(u8 *state, NfaCallback cb, void *ctxt, } cur_buf++; } +#if defined(HAVE_SVE) + *state = svlastb(lane_pred_64, cur_state); +#else *state = movd512(cur_state); +#endif *scan_end = cur_buf; return MO_CONTINUE_MATCHING; } diff --git a/src/nfa/sheng_impl4.h b/src/nfa/sheng_impl4.h index e5d3468f..718c3409 100644 --- a/src/nfa/sheng_impl4.h +++ b/src/nfa/sheng_impl4.h @@ -283,7 +283,7 @@ char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s, return MO_CONTINUE_MATCHING; } -#if defined(HAVE_AVX512VBMI) +#if defined(HAVE_AVX512VBMI) || defined(HAVE_SVE) static really_inline char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng32 *s, @@ -320,8 +320,15 @@ char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt, return MO_CONTINUE_MATCHING; } +#if defined(HAVE_SVE) + const svbool_t lane_pred_32 = svwhilelt_b8(0, 32); + svuint8_t cur_state = svdup_u8(*state); + svuint8_t tbl_mask = svdup_u8((unsigned char)0x1F); + const m512 *masks = s->succ_masks; +#else m512 cur_state = set1_64x8(*state); const m512 *masks = s->succ_masks; +#endif while (likely(end - cur_buf >= 4)) { const u8 *b1 = cur_buf; @@ -333,6 +340,23 @@ char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt, const u8 c3 = *b3; const u8 c4 = *b4; +#if defined(HAVE_SVE) + svuint8_t succ_mask1 = svld1(lane_pred_32, (const u8*)(masks+c1)); + cur_state = svtbl(succ_mask1, svand_x(svptrue_b8(), tbl_mask, cur_state)); + const u8 a1 = svlastb(lane_pred_32, cur_state); + + svuint8_t succ_mask2 = svld1(lane_pred_32, (const u8*)(masks+c2)); + cur_state = svtbl(succ_mask2, svand_x(svptrue_b8(), tbl_mask, cur_state)); + const u8 a2 = svlastb(lane_pred_32, cur_state); + + svuint8_t succ_mask3 = svld1(lane_pred_32, (const u8*)(masks+c3)); + cur_state = svtbl(succ_mask3, svand_x(svptrue_b8(), tbl_mask, cur_state)); + const u8 a3 = svlastb(lane_pred_32, cur_state); + + svuint8_t succ_mask4 = svld1(lane_pred_32, (const u8*)(masks+c4)); + cur_state = svtbl(succ_mask4, svand_x(svptrue_b8(), tbl_mask, cur_state)); + const u8 a4 = svlastb(lane_pred_32, cur_state); +#else const m512 succ_mask1 = masks[c1]; cur_state = vpermb512(cur_state, succ_mask1); const u8 a1 = movd512(cur_state); @@ -348,6 +372,7 @@ char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt, const m512 succ_mask4 = masks[c4]; cur_state = vpermb512(cur_state, succ_mask4); const u8 a4 = movd512(cur_state); +#endif DEBUG_PRINTF("c: %02hhx '%c'\n", c1, ourisprint(c1) ? 
c1 : '?'); DEBUG_PRINTF("s: %u (flag: %u)\n", a1 & SHENG32_STATE_MASK, @@ -517,7 +542,11 @@ char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt, }; cur_buf += 4; } +#if defined(HAVE_SVE) + *state = svlastb(lane_pred_32, cur_state); +#else *state = movd512(cur_state); +#endif *scan_end = cur_buf; return MO_CONTINUE_MATCHING; } @@ -541,9 +570,15 @@ char SHENG64_IMPL(u8 *state, NfaCallback cb, void *ctxt, *scan_end = end; return MO_CONTINUE_MATCHING; } - +#if defined(HAVE_SVE) + const svbool_t lane_pred_64 = svwhilelt_b8(0, 64); + svuint8_t cur_state = svdup_u8(*state); + svuint8_t tbl_mask = svdup_u8((unsigned char)0x3F); + const m512 *masks = s->succ_masks; +#else m512 cur_state = set1_64x8(*state); const m512 *masks = s->succ_masks; +#endif while (likely(end - cur_buf >= 4)) { const u8 *b1 = cur_buf; @@ -555,6 +590,23 @@ char SHENG64_IMPL(u8 *state, NfaCallback cb, void *ctxt, const u8 c3 = *b3; const u8 c4 = *b4; +#if defined(HAVE_SVE) + svuint8_t succ_mask1 = svld1(lane_pred_64, (const u8*)(masks+c1)); + cur_state = svtbl(succ_mask1, svand_x(svptrue_b8(), tbl_mask, cur_state)); + const u8 a1 = svlastb(lane_pred_64, cur_state); + + svuint8_t succ_mask2 = svld1(lane_pred_64, (const u8*)(masks+c2)); + cur_state = svtbl(succ_mask2, svand_x(svptrue_b8(), tbl_mask, cur_state)); + const u8 a2 = svlastb(lane_pred_64, cur_state); + + svuint8_t succ_mask3 = svld1(lane_pred_64, (const u8*)(masks+c3)); + cur_state = svtbl(succ_mask3, svand_x(svptrue_b8(), tbl_mask, cur_state)); + const u8 a3 = svlastb(lane_pred_64, cur_state); + + svuint8_t succ_mask4 = svld1(lane_pred_64, (const u8*)(masks+c4)); + cur_state = svtbl(succ_mask4, svand_x(svptrue_b8(), tbl_mask, cur_state)); + const u8 a4 = svlastb(lane_pred_64, cur_state); +#else const m512 succ_mask1 = masks[c1]; cur_state = vpermb512(cur_state, succ_mask1); const u8 a1 = movd512(cur_state); @@ -570,6 +622,7 @@ char SHENG64_IMPL(u8 *state, NfaCallback cb, void *ctxt, const m512 succ_mask4 = masks[c4]; cur_state = vpermb512(cur_state, succ_mask4); const u8 a4 = movd512(cur_state); +#endif DEBUG_PRINTF("c: %02hhx '%c'\n", c1, ourisprint(c1) ? c1 : '?'); DEBUG_PRINTF("s: %u (flag: %u)\n", a1 & SHENG64_STATE_MASK, @@ -703,7 +756,11 @@ char SHENG64_IMPL(u8 *state, NfaCallback cb, void *ctxt, } cur_buf += 4; } +#if defined(HAVE_SVE) + *state = svlastb(lane_pred_64, cur_state); +#else *state = movd512(cur_state); +#endif *scan_end = cur_buf; return MO_CONTINUE_MATCHING; } diff --git a/src/nfa/shengcompile.cpp b/src/nfa/shengcompile.cpp index 055e1971..0f93e139 100644 --- a/src/nfa/shengcompile.cpp +++ b/src/nfa/shengcompile.cpp @@ -730,10 +730,17 @@ bytecode_ptr sheng32Compile(raw_dfa &raw, const CompileContext &cc, return nullptr; } +#ifdef HAVE_SVE + if (svcntb()<32) { + DEBUG_PRINTF("Sheng32 failed, SVE width is too small!\n"); + return nullptr; + } +#else if (!cc.target_info.has_avx512vbmi()) { DEBUG_PRINTF("Sheng32 failed, no HS_CPU_FEATURES_AVX512VBMI!\n"); return nullptr; } +#endif sheng_build_strat strat(raw, rm, only_accel_init); dfa_info info(strat); @@ -762,10 +769,17 @@ bytecode_ptr sheng64Compile(raw_dfa &raw, const CompileContext &cc, return nullptr; } +#ifdef HAVE_SVE + if (svcntb()<64) { + DEBUG_PRINTF("Sheng64 failed, SVE width is too small!\n"); + return nullptr; + } +#else if (!cc.target_info.has_avx512vbmi()) { DEBUG_PRINTF("Sheng64 failed, no HS_CPU_FEATURES_AVX512VBMI!\n"); return nullptr; } +#endif sheng_build_strat strat(raw, rm, only_accel_init); dfa_info info(strat);
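Note on the SVE path: the hunks above replace the AVX-512 VBMI byte permute (vpermb512) with an SVE predicated load plus table lookup. Below is a minimal, self-contained sketch of a single sheng32 transition written against that idea; it assumes arm_sve.h and an SVE vector length of at least 32 bytes, and the function name sheng32_step_sve and the byte-pointer view of succ_masks are illustrative, not taken from the source.

#include <arm_sve.h>
#include <stddef.h>
#include <stdint.h>

/* One sheng32 transition: find the successor of `state` on input byte `c`.
 * succ_masks is assumed to point at 256 rows of 64 bytes each (the m512
 * layout shared with the x86 path); only the first 32 lanes matter here. */
static inline uint8_t sheng32_step_sve(const uint8_t *succ_masks,
                                       uint8_t state, uint8_t c) {
    const svbool_t lane_pred_32 = svwhilelt_b8(0, 32);   /* first 32 lanes */
    svuint8_t cur = svdup_u8(state);                      /* broadcast state */
    /* Strip the flag bits so only the 5-bit state id indexes the table,
     * mirroring the 0x1F tbl_mask used in the patch. */
    svuint8_t idx = svand_x(svptrue_b8(), svdup_u8(0x1F), cur);
    /* Load the successor row for byte c and permute it by the state id;
     * svtbl() plays the role that vpermb512() plays on AVX-512 VBMI. */
    svuint8_t row = svld1(lane_pred_32, succ_masks + (size_t)c * 64);
    svuint8_t next = svtbl(row, idx);
    /* Every active lane holds the same successor value; extract it. */
    return svlastb(lane_pred_32, next);
}

The shengcompile.cpp hunks gate the engines on the runtime SVE vector length rather than the AVX-512 VBMI feature bit. A hedged sketch of that check, with hypothetical helper names:

#include <arm_sve.h>
#include <stdbool.h>

/* sheng32 needs the whole 32-state table in one register and sheng64 needs
 * 64 bytes; svcntb() reports the SVE vector length in bytes at run time. */
static inline bool sve_fits_sheng32(void) { return svcntb() >= 32; }
static inline bool sve_fits_sheng64(void) { return svcntb() >= 64; }

On a 128-bit SVE implementation svcntb() returns 16, so both checks fail and the compiler falls back to the other DFA engines, which matches the early-return behaviour added in sheng32Compile() and sheng64Compile().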